From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- media/libvpx/LICENSE | 31 + media/libvpx/Makefile.in | 24 + media/libvpx/README_MOZILLA | 18 + media/libvpx/config/generic/vp8_rtcd.h | 167 + media/libvpx/config/generic/vp9_rtcd.h | 92 + media/libvpx/config/generic/vpx_config.asm | 97 + media/libvpx/config/generic/vpx_config.c | 10 + media/libvpx/config/generic/vpx_config.h | 108 + media/libvpx/config/generic/vpx_dsp_rtcd.h | 744 ++ media/libvpx/config/generic/vpx_scale_rtcd.h | 70 + media/libvpx/config/linux/arm/vp8_rtcd.h | 265 + media/libvpx/config/linux/arm/vp9_rtcd.h | 130 + media/libvpx/config/linux/arm/vpx_config.asm | 97 + media/libvpx/config/linux/arm/vpx_config.c | 10 + media/libvpx/config/linux/arm/vpx_config.h | 108 + media/libvpx/config/linux/arm/vpx_dsp_rtcd.h | 1415 ++++ media/libvpx/config/linux/arm/vpx_scale_rtcd.h | 75 + media/libvpx/config/linux/arm64/vp8_rtcd.h | 201 + media/libvpx/config/linux/arm64/vp9_rtcd.h | 106 + media/libvpx/config/linux/arm64/vpx_config.asm | 97 + media/libvpx/config/linux/arm64/vpx_config.c | 10 + media/libvpx/config/linux/arm64/vpx_config.h | 108 + media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h | 1184 ++++ media/libvpx/config/linux/arm64/vpx_scale_rtcd.h | 75 + media/libvpx/config/linux/ia32/vp8_rtcd.h | 323 + media/libvpx/config/linux/ia32/vp9_rtcd.h | 156 + media/libvpx/config/linux/ia32/vpx_config.asm | 93 + media/libvpx/config/linux/ia32/vpx_config.c | 10 + media/libvpx/config/linux/ia32/vpx_config.h | 108 + media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h | 1604 +++++ media/libvpx/config/linux/ia32/vpx_scale_rtcd.h | 73 + media/libvpx/config/linux/x64/vp8_rtcd.h | 248 + media/libvpx/config/linux/x64/vp9_rtcd.h | 135 + media/libvpx/config/linux/x64/vpx_config.asm | 93 + media/libvpx/config/linux/x64/vpx_config.c | 10 + media/libvpx/config/linux/x64/vpx_config.h | 108 + media/libvpx/config/linux/x64/vpx_dsp_rtcd.h | 1296 ++++ media/libvpx/config/linux/x64/vpx_scale_rtcd.h | 73 + media/libvpx/config/mac/ia32/vp8_rtcd.h | 323 + media/libvpx/config/mac/ia32/vp9_rtcd.h | 156 + media/libvpx/config/mac/ia32/vpx_config.asm | 93 + media/libvpx/config/mac/ia32/vpx_config.c | 10 + media/libvpx/config/mac/ia32/vpx_config.h | 108 + media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h | 1604 +++++ media/libvpx/config/mac/ia32/vpx_scale_rtcd.h | 73 + media/libvpx/config/mac/x64/vp8_rtcd.h | 248 + media/libvpx/config/mac/x64/vp9_rtcd.h | 135 + media/libvpx/config/mac/x64/vpx_config.asm | 93 + media/libvpx/config/mac/x64/vpx_config.c | 10 + media/libvpx/config/mac/x64/vpx_config.h | 108 + media/libvpx/config/mac/x64/vpx_dsp_rtcd.h | 1296 ++++ media/libvpx/config/mac/x64/vpx_scale_rtcd.h | 73 + media/libvpx/config/vpx_version.h | 8 + media/libvpx/config/win/aarch64/vp8_rtcd.h | 201 + media/libvpx/config/win/aarch64/vp9_rtcd.h | 106 + media/libvpx/config/win/aarch64/vpx_config.asm | 97 + media/libvpx/config/win/aarch64/vpx_config.c | 10 + media/libvpx/config/win/aarch64/vpx_config.h | 108 + media/libvpx/config/win/aarch64/vpx_dsp_rtcd.h | 1184 ++++ media/libvpx/config/win/aarch64/vpx_scale_rtcd.h | 75 + media/libvpx/config/win/ia32/vp8_rtcd.h | 323 + media/libvpx/config/win/ia32/vp9_rtcd.h | 156 + media/libvpx/config/win/ia32/vpx_config.asm | 93 + media/libvpx/config/win/ia32/vpx_config.c | 10 + media/libvpx/config/win/ia32/vpx_config.h | 108 + media/libvpx/config/win/ia32/vpx_dsp_rtcd.h | 1604 +++++ media/libvpx/config/win/ia32/vpx_scale_rtcd.h | 73 + media/libvpx/config/win/x64/vp8_rtcd.h | 248 + media/libvpx/config/win/x64/vp9_rtcd.h | 135 + media/libvpx/config/win/x64/vpx_config.asm | 93 + media/libvpx/config/win/x64/vpx_config.c | 10 + media/libvpx/config/win/x64/vpx_config.h | 108 + media/libvpx/config/win/x64/vpx_dsp_rtcd.h | 1296 ++++ media/libvpx/config/win/x64/vpx_scale_rtcd.h | 73 + media/libvpx/generate_sources_mozbuild.sh | 293 + media/libvpx/input_frame_validation.patch | 44 + media/libvpx/input_frame_validation_vp9.patch | 36 + media/libvpx/libvpx/.clang-format | 9 + media/libvpx/libvpx/.mailmap | 56 + media/libvpx/libvpx/AUTHORS | 228 + media/libvpx/libvpx/CHANGELOG | 929 +++ media/libvpx/libvpx/CONTRIBUTING.md | 29 + media/libvpx/libvpx/LICENSE | 31 + media/libvpx/libvpx/PATENTS | 23 + media/libvpx/libvpx/README | 189 + media/libvpx/libvpx/args.c | 215 + media/libvpx/libvpx/args.h | 63 + media/libvpx/libvpx/build/make/Android.mk | 217 + media/libvpx/libvpx/build/make/Makefile | 492 ++ media/libvpx/libvpx/build/make/ads2armasm_ms.pl | 39 + media/libvpx/libvpx/build/make/ads2gas.pl | 157 + media/libvpx/libvpx/build/make/ads2gas_apple.pl | 114 + media/libvpx/libvpx/build/make/armlink_adapter.sh | 54 + media/libvpx/libvpx/build/make/configure.sh | 1703 +++++ media/libvpx/libvpx/build/make/gen_asm_deps.sh | 64 + media/libvpx/libvpx/build/make/gen_msvs_def.sh | 83 + media/libvpx/libvpx/build/make/gen_msvs_sln.sh | 255 + media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh | 508 ++ media/libvpx/libvpx/build/make/ios-Info.plist | 37 + media/libvpx/libvpx/build/make/iosbuild.sh | 384 ++ media/libvpx/libvpx/build/make/msvs_common.sh | 124 + media/libvpx/libvpx/build/make/rtcd.pl | 528 ++ media/libvpx/libvpx/build/make/thumb.pm | 60 + media/libvpx/libvpx/build/make/version.sh | 78 + .../non_greedy_mv_test_files/cur_frame_16x16.txt | 2 + .../non_greedy_mv_test_files/estimation_16x16.txt | 2 + .../non_greedy_mv_test_files/exhaust_16x16.txt | 2 + .../ground_truth_16x16.txt | 2 + .../non_greedy_mv_test_files/localVar_16x16.txt | 2 + .../build_debug/non_greedy_mv_test_files/raw_1.png | Bin 0 -> 661279 bytes .../non_greedy_mv_test_files/raw_1_12_12.png | Bin 0 -> 919025 bytes .../non_greedy_mv_test_files/ref_frame_16x16.txt | 2 + media/libvpx/libvpx/codereview.settings | 4 + media/libvpx/libvpx/configure | 831 +++ media/libvpx/libvpx/docs.mk | 48 + media/libvpx/libvpx/examples.mk | 423 ++ media/libvpx/libvpx/examples/decode_to_md5.c | 132 + media/libvpx/libvpx/examples/decode_with_drops.c | 148 + media/libvpx/libvpx/examples/postproc.c | 133 + media/libvpx/libvpx/examples/resize_util.c | 131 + media/libvpx/libvpx/examples/set_maps.c | 243 + media/libvpx/libvpx/examples/simple_decoder.c | 149 + media/libvpx/libvpx/examples/simple_encoder.c | 247 + media/libvpx/libvpx/examples/svc_context.h | 112 + media/libvpx/libvpx/examples/svc_encodeframe.c | 634 ++ media/libvpx/libvpx/examples/twopass_encoder.c | 257 + .../libvpx/examples/vp8_multi_resolution_encoder.c | 666 ++ media/libvpx/libvpx/examples/vp8cx_set_ref.c | 187 + .../libvpx/libvpx/examples/vp9_lossless_encoder.c | 137 + .../libvpx/examples/vp9_spatial_svc_encoder.c | 1216 ++++ media/libvpx/libvpx/examples/vp9cx_set_ref.c | 320 + media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc | 125 + .../libvpx/examples/vpx_temporal_svc_encoder.c | 1069 +++ media/libvpx/libvpx/ivfdec.c | 112 + media/libvpx/libvpx/ivfdec.h | 28 + media/libvpx/libvpx/ivfenc.c | 59 + media/libvpx/libvpx/ivfenc.h | 40 + media/libvpx/libvpx/keywords.dox | 51 + media/libvpx/libvpx/libs.doxy_template | 1260 ++++ media/libvpx/libvpx/libs.mk | 801 +++ media/libvpx/libvpx/mainpage.dox | 55 + media/libvpx/libvpx/md5_utils.c | 237 + media/libvpx/libvpx/md5_utils.h | 49 + media/libvpx/libvpx/rate_hist.c | 292 + media/libvpx/libvpx/rate_hist.h | 40 + media/libvpx/libvpx/solution.mk | 31 + media/libvpx/libvpx/test/acm_random.h | 88 + .../libvpx/libvpx/test/active_map_refresh_test.cc | 128 + media/libvpx/libvpx/test/active_map_test.cc | 93 + media/libvpx/libvpx/test/add_noise_test.cc | 149 + .../libvpx/libvpx/test/alt_ref_aq_segment_test.cc | 157 + media/libvpx/libvpx/test/altref_test.cc | 152 + media/libvpx/libvpx/test/android/Android.mk | 67 + media/libvpx/libvpx/test/android/README | 33 + media/libvpx/libvpx/test/android/get_files.py | 118 + .../libvpx/libvpx/test/android/scrape_gtest_log.py | 57 + media/libvpx/libvpx/test/aq_segment_test.cc | 109 + media/libvpx/libvpx/test/avg_test.cc | 757 +++ media/libvpx/libvpx/test/bench.cc | 38 + media/libvpx/libvpx/test/bench.h | 32 + media/libvpx/libvpx/test/blockiness_test.cc | 222 + media/libvpx/libvpx/test/borders_test.cc | 84 + media/libvpx/libvpx/test/buffer.h | 382 ++ media/libvpx/libvpx/test/byte_alignment_test.cc | 185 + media/libvpx/libvpx/test/clear_system_state.h | 23 + media/libvpx/libvpx/test/codec_factory.h | 266 + media/libvpx/libvpx/test/comp_avg_pred_test.cc | 275 + media/libvpx/libvpx/test/config_test.cc | 62 + media/libvpx/libvpx/test/consistency_test.cc | 215 + media/libvpx/libvpx/test/convolve_test.cc | 1518 +++++ media/libvpx/libvpx/test/cpu_speed_test.cc | 156 + media/libvpx/libvpx/test/cq_test.cc | 131 + media/libvpx/libvpx/test/cx_set_ref.sh | 60 + media/libvpx/libvpx/test/dct16x16_test.cc | 1029 +++ media/libvpx/libvpx/test/dct32x32_test.cc | 605 ++ media/libvpx/libvpx/test/dct_partial_test.cc | 183 + media/libvpx/libvpx/test/dct_test.cc | 790 +++ media/libvpx/libvpx/test/decode_api_test.cc | 215 + media/libvpx/libvpx/test/decode_corrupted.cc | 103 + media/libvpx/libvpx/test/decode_perf_test.cc | 263 + media/libvpx/libvpx/test/decode_svc_test.cc | 124 + media/libvpx/libvpx/test/decode_test_driver.cc | 119 + media/libvpx/libvpx/test/decode_test_driver.h | 162 + media/libvpx/libvpx/test/decode_to_md5.sh | 73 + media/libvpx/libvpx/test/decode_with_drops.sh | 79 + media/libvpx/libvpx/test/encode_api_test.cc | 949 +++ media/libvpx/libvpx/test/encode_perf_test.cc | 188 + media/libvpx/libvpx/test/encode_test_driver.cc | 269 + media/libvpx/libvpx/test/encode_test_driver.h | 302 + media/libvpx/libvpx/test/error_resilience_test.cc | 582 ++ media/libvpx/libvpx/test/examples.sh | 29 + .../libvpx/test/external_frame_buffer_test.cc | 518 ++ media/libvpx/libvpx/test/fdct8x8_test.cc | 791 +++ media/libvpx/libvpx/test/frame_size_tests.cc | 215 + media/libvpx/libvpx/test/hadamard_test.cc | 380 ++ media/libvpx/libvpx/test/i420_video_source.h | 33 + media/libvpx/libvpx/test/idct8x8_test.cc | 87 + media/libvpx/libvpx/test/idct_test.cc | 180 + media/libvpx/libvpx/test/init_vpx_test.cc | 96 + media/libvpx/libvpx/test/init_vpx_test.h | 18 + media/libvpx/libvpx/test/invalid_file_test.cc | 220 + media/libvpx/libvpx/test/ivf_video_source.h | 106 + media/libvpx/libvpx/test/keyframe_test.cc | 256 + media/libvpx/libvpx/test/level_test.cc | 147 + media/libvpx/libvpx/test/lpf_test.cc | 721 ++ media/libvpx/libvpx/test/md5_helper.h | 75 + media/libvpx/libvpx/test/minmax_test.cc | 248 + media/libvpx/libvpx/test/non_greedy_mv_test.cc | 200 + media/libvpx/libvpx/test/partial_idct_test.cc | 973 +++ media/libvpx/libvpx/test/postproc.sh | 63 + media/libvpx/libvpx/test/pp_filter_test.cc | 575 ++ media/libvpx/libvpx/test/predict_test.cc | 414 ++ media/libvpx/libvpx/test/quantize_test.cc | 234 + media/libvpx/libvpx/test/realtime_test.cc | 117 + media/libvpx/libvpx/test/register_state_check.h | 205 + media/libvpx/libvpx/test/resize_test.cc | 783 +++ media/libvpx/libvpx/test/resize_util.sh | 69 + media/libvpx/libvpx/test/sad_test.cc | 2079 ++++++ media/libvpx/libvpx/test/set_maps.sh | 59 + media/libvpx/libvpx/test/set_roi.cc | 166 + media/libvpx/libvpx/test/simple_decoder.sh | 61 + media/libvpx/libvpx/test/simple_encode_test.cc | 574 ++ media/libvpx/libvpx/test/simple_encoder.sh | 59 + media/libvpx/libvpx/test/stress.sh | 183 + media/libvpx/libvpx/test/sum_squares_test.cc | 341 + media/libvpx/libvpx/test/superframe_test.cc | 102 + media/libvpx/libvpx/test/svc_datarate_test.cc | 1796 +++++ media/libvpx/libvpx/test/svc_end_to_end_test.cc | 825 +++ media/libvpx/libvpx/test/svc_test.cc | 135 + media/libvpx/libvpx/test/svc_test.h | 67 + media/libvpx/libvpx/test/test-data.mk | 899 +++ media/libvpx/libvpx/test/test-data.sha1 | 873 +++ media/libvpx/libvpx/test/test.mk | 234 + media/libvpx/libvpx/test/test_intra_pred_speed.cc | 616 ++ media/libvpx/libvpx/test/test_libvpx.cc | 18 + media/libvpx/libvpx/test/test_rc_interface.cc | 6 + media/libvpx/libvpx/test/test_vector_test.cc | 204 + media/libvpx/libvpx/test/test_vectors.cc | 385 ++ media/libvpx/libvpx/test/test_vectors.h | 34 + media/libvpx/libvpx/test/tile_independence_test.cc | 104 + media/libvpx/libvpx/test/timestamp_test.cc | 101 + media/libvpx/libvpx/test/tools_common.sh | 447 ++ media/libvpx/libvpx/test/twopass_encoder.sh | 63 + media/libvpx/libvpx/test/user_priv_test.cc | 100 + media/libvpx/libvpx/test/util.h | 48 + media/libvpx/libvpx/test/variance_test.cc | 1993 ++++++ media/libvpx/libvpx/test/video_source.h | 286 + media/libvpx/libvpx/test/vp8_boolcoder_test.cc | 118 + media/libvpx/libvpx/test/vp8_datarate_test.cc | 438 ++ media/libvpx/libvpx/test/vp8_decrypt_test.cc | 69 + media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc | 119 + media/libvpx/libvpx/test/vp8_fdct4x4_test.cc | 211 + media/libvpx/libvpx/test/vp8_fragments_test.cc | 36 + .../libvpx/test/vp8_multi_resolution_encoder.sh | 87 + media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc | 417 ++ media/libvpx/libvpx/test/vp9_arf_freq_test.cc | 219 + media/libvpx/libvpx/test/vp9_block_error_test.cc | 218 + media/libvpx/libvpx/test/vp9_boolcoder_test.cc | 92 + media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh | 420 ++ media/libvpx/libvpx/test/vp9_datarate_test.cc | 1096 +++ media/libvpx/libvpx/test/vp9_decrypt_test.cc | 69 + media/libvpx/libvpx/test/vp9_denoiser_test.cc | 136 + .../test/vp9_encoder_parms_get_to_decoder.cc | 153 + media/libvpx/libvpx/test/vp9_end_to_end_test.cc | 354 + media/libvpx/libvpx/test/vp9_ethread_test.cc | 429 ++ media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc | 979 +++ media/libvpx/libvpx/test/vp9_intrapred_test.cc | 1207 ++++ media/libvpx/libvpx/test/vp9_lossless_test.cc | 125 + media/libvpx/libvpx/test/vp9_motion_vector_test.cc | 99 + media/libvpx/libvpx/test/vp9_quantize_test.cc | 725 ++ media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc | 672 ++ media/libvpx/libvpx/test/vp9_roi_test.cc | 148 + media/libvpx/libvpx/test/vp9_scale_test.cc | 215 + .../libvpx/libvpx/test/vp9_skip_loopfilter_test.cc | 183 + media/libvpx/libvpx/test/vp9_subtract_test.cc | 321 + media/libvpx/libvpx/test/vp9_thread_test.cc | 295 + media/libvpx/libvpx/test/vpx_scale_test.cc | 101 + media/libvpx/libvpx/test/vpx_scale_test.h | 201 + .../libvpx/libvpx/test/vpx_temporal_svc_encoder.sh | 334 + media/libvpx/libvpx/test/vpxdec.sh | 135 + media/libvpx/libvpx/test/vpxenc.sh | 489 ++ media/libvpx/libvpx/test/webm_video_source.h | 95 + media/libvpx/libvpx/test/y4m_test.cc | 244 + media/libvpx/libvpx/test/y4m_video_source.h | 123 + .../libvpx/libvpx/test/yuv_temporal_filter_test.cc | 726 ++ media/libvpx/libvpx/test/yuv_video_source.h | 128 + .../libvpx/third_party/googletest/README.libvpx | 29 + .../libvpx/libvpx/third_party/googletest/gtest.mk | 1 + .../third_party/googletest/src/.clang-format | 4 + .../libvpx/third_party/googletest/src/CONTRIBUTORS | 65 + .../libvpx/third_party/googletest/src/LICENSE | 28 + .../libvpx/third_party/googletest/src/README.md | 217 + .../src/include/gtest/gtest-assertion-result.h | 237 + .../src/include/gtest/gtest-death-test.h | 345 + .../googletest/src/include/gtest/gtest-matchers.h | 956 +++ .../googletest/src/include/gtest/gtest-message.h | 218 + .../src/include/gtest/gtest-param-test.h | 510 ++ .../googletest/src/include/gtest/gtest-printers.h | 1048 +++ .../googletest/src/include/gtest/gtest-spi.h | 248 + .../googletest/src/include/gtest/gtest-test-part.h | 190 + .../src/include/gtest/gtest-typed-test.h | 331 + .../googletest/src/include/gtest/gtest.h | 2297 +++++++ .../googletest/src/include/gtest/gtest_pred_impl.h | 279 + .../googletest/src/include/gtest/gtest_prod.h | 60 + .../src/include/gtest/internal/custom/README.md | 44 + .../src/include/gtest/internal/custom/gtest-port.h | 68 + .../include/gtest/internal/custom/gtest-printers.h | 42 + .../src/include/gtest/internal/custom/gtest.h | 37 + .../gtest/internal/gtest-death-test-internal.h | 306 + .../src/include/gtest/internal/gtest-filepath.h | 210 + .../src/include/gtest/internal/gtest-internal.h | 1570 +++++ .../src/include/gtest/internal/gtest-param-util.h | 956 +++ .../src/include/gtest/internal/gtest-port-arch.h | 116 + .../src/include/gtest/internal/gtest-port.h | 2413 +++++++ .../src/include/gtest/internal/gtest-string.h | 177 + .../src/include/gtest/internal/gtest-type-util.h | 186 + .../third_party/googletest/src/src/gtest-all.cc | 49 + .../googletest/src/src/gtest-assertion-result.cc | 77 + .../googletest/src/src/gtest-death-test.cc | 1620 +++++ .../googletest/src/src/gtest-filepath.cc | 367 + .../googletest/src/src/gtest-internal-inl.h | 1212 ++++ .../googletest/src/src/gtest-matchers.cc | 98 + .../third_party/googletest/src/src/gtest-port.cc | 1394 ++++ .../googletest/src/src/gtest-printers.cc | 553 ++ .../googletest/src/src/gtest-test-part.cc | 105 + .../googletest/src/src/gtest-typed-test.cc | 104 + .../libvpx/third_party/googletest/src/src/gtest.cc | 6795 +++++++++++++++++++ .../third_party/googletest/src/src/gtest_main.cc | 53 + media/libvpx/libvpx/third_party/libyuv/LICENSE | 29 + .../libvpx/libvpx/third_party/libyuv/README.libvpx | 23 + .../libyuv/include/libyuv/basic_types.h | 65 + .../third_party/libyuv/include/libyuv/compare.h | 111 + .../third_party/libyuv/include/libyuv/convert.h | 406 ++ .../libyuv/include/libyuv/convert_argb.h | 687 ++ .../libyuv/include/libyuv/convert_from.h | 342 + .../libyuv/include/libyuv/convert_from_argb.h | 287 + .../third_party/libyuv/include/libyuv/cpu_id.h | 119 + .../third_party/libyuv/include/libyuv/macros_msa.h | 233 + .../libyuv/include/libyuv/mjpeg_decoder.h | 195 + .../libyuv/include/libyuv/planar_functions.h | 847 +++ .../third_party/libyuv/include/libyuv/rotate.h | 164 + .../libyuv/include/libyuv/rotate_argb.h | 37 + .../third_party/libyuv/include/libyuv/rotate_row.h | 194 + .../libvpx/third_party/libyuv/include/libyuv/row.h | 3471 ++++++++++ .../third_party/libyuv/include/libyuv/scale.h | 131 + .../third_party/libyuv/include/libyuv/scale_argb.h | 76 + .../third_party/libyuv/include/libyuv/scale_row.h | 944 +++ .../third_party/libyuv/include/libyuv/version.h | 16 + .../libyuv/include/libyuv/video_common.h | 188 + .../libvpx/third_party/libyuv/source/compare.cc | 429 ++ .../third_party/libyuv/source/compare_common.cc | 104 + .../third_party/libyuv/source/compare_gcc.cc | 360 + .../third_party/libyuv/source/compare_msa.cc | 97 + .../third_party/libyuv/source/compare_neon.cc | 96 + .../third_party/libyuv/source/compare_neon64.cc | 90 + .../third_party/libyuv/source/compare_win.cc | 241 + .../libvpx/third_party/libyuv/source/convert.cc | 1740 +++++ .../third_party/libyuv/source/convert_argb.cc | 2231 ++++++ .../third_party/libyuv/source/convert_from.cc | 1429 ++++ .../third_party/libyuv/source/convert_from_argb.cc | 1617 +++++ .../third_party/libyuv/source/convert_jpeg.cc | 332 + .../third_party/libyuv/source/convert_to_argb.cc | 291 + .../third_party/libyuv/source/convert_to_i420.cc | 277 + .../libvpx/third_party/libyuv/source/cpu_id.cc | 276 + .../third_party/libyuv/source/mjpeg_decoder.cc | 573 ++ .../third_party/libyuv/source/mjpeg_validate.cc | 70 + .../third_party/libyuv/source/planar_functions.cc | 3587 ++++++++++ .../libvpx/third_party/libyuv/source/rotate.cc | 514 ++ .../libvpx/third_party/libyuv/source/rotate_any.cc | 73 + .../third_party/libyuv/source/rotate_argb.cc | 224 + .../third_party/libyuv/source/rotate_common.cc | 106 + .../libvpx/third_party/libyuv/source/rotate_gcc.cc | 374 ++ .../libvpx/third_party/libyuv/source/rotate_msa.cc | 250 + .../third_party/libyuv/source/rotate_neon.cc | 416 ++ .../third_party/libyuv/source/rotate_neon64.cc | 426 ++ .../libvpx/third_party/libyuv/source/rotate_win.cc | 252 + .../libvpx/third_party/libyuv/source/row_any.cc | 1211 ++++ .../libvpx/third_party/libyuv/source/row_common.cc | 3237 +++++++++ .../libvpx/third_party/libyuv/source/row_gcc.cc | 6677 ++++++++++++++++++ .../libvpx/third_party/libyuv/source/row_msa.cc | 3512 ++++++++++ .../libvpx/third_party/libyuv/source/row_neon.cc | 2693 ++++++++ .../libvpx/third_party/libyuv/source/row_neon64.cc | 2884 ++++++++ .../libvpx/third_party/libyuv/source/row_win.cc | 6234 +++++++++++++++++ .../libvpx/third_party/libyuv/source/scale.cc | 1741 +++++ .../libvpx/third_party/libyuv/source/scale_any.cc | 464 ++ .../libvpx/third_party/libyuv/source/scale_argb.cc | 1010 +++ .../third_party/libyuv/source/scale_common.cc | 1323 ++++ .../libvpx/third_party/libyuv/source/scale_gcc.cc | 1374 ++++ .../libvpx/third_party/libyuv/source/scale_msa.cc | 949 +++ .../libvpx/third_party/libyuv/source/scale_neon.cc | 970 +++ .../third_party/libyuv/source/scale_neon64.cc | 1064 +++ .../libvpx/third_party/libyuv/source/scale_win.cc | 1391 ++++ .../third_party/libyuv/source/video_common.cc | 62 + media/libvpx/libvpx/third_party/x86inc/LICENSE | 18 + .../libvpx/libvpx/third_party/x86inc/README.libvpx | 19 + media/libvpx/libvpx/third_party/x86inc/x86inc.asm | 1923 ++++++ media/libvpx/libvpx/tools.mk | 116 + media/libvpx/libvpx/tools_common.c | 776 +++ media/libvpx/libvpx/tools_common.h | 201 + media/libvpx/libvpx/usage.dox | 136 + media/libvpx/libvpx/usage_cx.dox | 15 + media/libvpx/libvpx/usage_dx.dox | 64 + media/libvpx/libvpx/video_common.h | 23 + media/libvpx/libvpx/video_reader.c | 97 + media/libvpx/libvpx/video_reader.h | 51 + media/libvpx/libvpx/video_writer.c | 80 + media/libvpx/libvpx/video_writer.h | 44 + media/libvpx/libvpx/vp8/common/alloccommon.c | 187 + media/libvpx/libvpx/vp8/common/alloccommon.h | 30 + .../libvpx/libvpx/vp8/common/arm/loopfilter_arm.c | 85 + .../libvpx/libvpx/vp8/common/arm/loopfilter_arm.h | 31 + .../vp8/common/arm/neon/bilinearpredict_neon.c | 764 +++ .../libvpx/vp8/common/arm/neon/copymem_neon.c | 52 + .../vp8/common/arm/neon/dc_only_idct_add_neon.c | 41 + .../libvpx/vp8/common/arm/neon/dequant_idct_neon.c | 141 + .../libvpx/vp8/common/arm/neon/dequantizeb_neon.c | 26 + .../libvpx/vp8/common/arm/neon/idct_blk_neon.c | 295 + .../libvpx/vp8/common/arm/neon/iwalsh_neon.c | 102 + .../arm/neon/loopfiltersimplehorizontaledge_neon.c | 106 + .../arm/neon/loopfiltersimpleverticaledge_neon.c | 274 + .../libvpx/vp8/common/arm/neon/mbloopfilter_neon.c | 613 ++ .../vp8/common/arm/neon/shortidct4x4llm_neon.c | 121 + .../vp8/common/arm/neon/sixtappredict_neon.c | 1729 +++++ .../vp8/common/arm/neon/vp8_loopfilter_neon.c | 538 ++ media/libvpx/libvpx/vp8/common/blockd.c | 19 + media/libvpx/libvpx/vp8/common/blockd.h | 311 + media/libvpx/libvpx/vp8/common/coefupdateprobs.h | 197 + media/libvpx/libvpx/vp8/common/common.h | 48 + media/libvpx/libvpx/vp8/common/context.c | 398 ++ media/libvpx/libvpx/vp8/common/debugmodes.c | 135 + .../libvpx/libvpx/vp8/common/default_coef_probs.h | 160 + media/libvpx/libvpx/vp8/common/dequantize.c | 37 + media/libvpx/libvpx/vp8/common/entropy.c | 147 + media/libvpx/libvpx/vp8/common/entropy.h | 108 + media/libvpx/libvpx/vp8/common/entropymode.c | 104 + media/libvpx/libvpx/vp8/common/entropymode.h | 88 + media/libvpx/libvpx/vp8/common/entropymv.c | 47 + media/libvpx/libvpx/vp8/common/entropymv.h | 49 + media/libvpx/libvpx/vp8/common/extend.c | 167 + media/libvpx/libvpx/vp8/common/extend.h | 32 + media/libvpx/libvpx/vp8/common/filter.c | 381 ++ media/libvpx/libvpx/vp8/common/filter.h | 31 + media/libvpx/libvpx/vp8/common/findnearmv.c | 159 + media/libvpx/libvpx/vp8/common/findnearmv.h | 151 + .../libvpx/vp8/common/generic/systemdependent.c | 111 + media/libvpx/libvpx/vp8/common/header.h | 48 + media/libvpx/libvpx/vp8/common/idct_blk.c | 72 + media/libvpx/libvpx/vp8/common/idctllm.c | 185 + media/libvpx/libvpx/vp8/common/invtrans.h | 57 + .../libvpx/libvpx/vp8/common/loongarch/idct_lsx.c | 322 + .../vp8/common/loongarch/loopfilter_filters_lsx.c | 743 ++ .../vp8/common/loongarch/sixtap_filter_lsx.c | 1904 ++++++ media/libvpx/libvpx/vp8/common/loopfilter.h | 101 + .../libvpx/libvpx/vp8/common/loopfilter_filters.c | 397 ++ media/libvpx/libvpx/vp8/common/mbpitch.c | 57 + media/libvpx/libvpx/vp8/common/mfqe.c | 327 + .../vp8/common/mips/dspr2/dequantize_dspr2.c | 29 + .../libvpx/vp8/common/mips/dspr2/filter_dspr2.c | 2767 ++++++++ .../libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c | 76 + .../libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c | 346 + .../vp8/common/mips/dspr2/reconinter_dspr2.c | 97 + .../mips/dspr2/vp8_loopfilter_filters_dspr2.c | 2401 +++++++ .../libvpx/vp8/common/mips/mmi/copymem_mmi.c | 114 + .../libvpx/vp8/common/mips/mmi/dequantize_mmi.c | 115 + .../libvpx/vp8/common/mips/mmi/idct_blk_mmi.c | 70 + .../libvpx/vp8/common/mips/mmi/idctllm_mmi.c | 335 + .../vp8/common/mips/mmi/loopfilter_filters_mmi.c | 1415 ++++ .../libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c | 427 ++ .../vp8/common/mips/msa/bilinear_filter_msa.c | 797 +++ .../libvpx/vp8/common/mips/msa/copymem_msa.c | 62 + media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c | 406 ++ .../vp8/common/mips/msa/loopfilter_filters_msa.c | 709 ++ media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c | 139 + .../libvpx/vp8/common/mips/msa/sixtap_filter_msa.c | 1738 +++++ .../libvpx/vp8/common/mips/msa/vp8_macros_msa.h | 1762 +++++ media/libvpx/libvpx/vp8/common/modecont.c | 26 + media/libvpx/libvpx/vp8/common/modecont.h | 24 + media/libvpx/libvpx/vp8/common/mv.h | 33 + media/libvpx/libvpx/vp8/common/onyx.h | 285 + media/libvpx/libvpx/vp8/common/onyxc_int.h | 177 + media/libvpx/libvpx/vp8/common/onyxd.h | 61 + media/libvpx/libvpx/vp8/common/postproc.c | 264 + media/libvpx/libvpx/vp8/common/postproc.h | 45 + media/libvpx/libvpx/vp8/common/ppflags.h | 39 + media/libvpx/libvpx/vp8/common/quant_common.c | 130 + media/libvpx/libvpx/vp8/common/quant_common.h | 33 + media/libvpx/libvpx/vp8/common/reconinter.c | 503 ++ media/libvpx/libvpx/vp8/common/reconinter.h | 36 + media/libvpx/libvpx/vp8/common/reconintra.c | 104 + media/libvpx/libvpx/vp8/common/reconintra.h | 35 + media/libvpx/libvpx/vp8/common/reconintra4x4.c | 75 + media/libvpx/libvpx/vp8/common/reconintra4x4.h | 45 + media/libvpx/libvpx/vp8/common/rtcd.c | 15 + media/libvpx/libvpx/vp8/common/rtcd_defs.pl | 244 + media/libvpx/libvpx/vp8/common/setupintrarecon.c | 38 + media/libvpx/libvpx/vp8/common/setupintrarecon.h | 40 + media/libvpx/libvpx/vp8/common/swapyv12buffer.c | 32 + media/libvpx/libvpx/vp8/common/swapyv12buffer.h | 27 + media/libvpx/libvpx/vp8/common/systemdependent.h | 27 + media/libvpx/libvpx/vp8/common/threading.h | 215 + media/libvpx/libvpx/vp8/common/treecoder.c | 102 + media/libvpx/libvpx/vp8/common/treecoder.h | 82 + .../libvpx/libvpx/vp8/common/vp8_entropymodedata.h | 172 + media/libvpx/libvpx/vp8/common/vp8_loopfilter.c | 566 ++ .../libvpx/libvpx/vp8/common/vp8_skin_detection.c | 109 + .../libvpx/libvpx/vp8/common/vp8_skin_detection.h | 47 + .../libvpx/vp8/common/x86/bilinear_filter_sse2.c | 336 + .../libvpx/vp8/common/x86/dequantize_mmx.asm | 259 + media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c | 23 + media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c | 84 + media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm | 296 + .../libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm | 710 ++ media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm | 123 + .../common/x86/loopfilter_block_sse2_x86_64.asm | 817 +++ .../libvpx/vp8/common/x86/loopfilter_sse2.asm | 1642 +++++ .../libvpx/libvpx/vp8/common/x86/loopfilter_x86.c | 129 + media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm | 289 + media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm | 120 + media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm | 118 + .../libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm | 270 + .../libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm | 963 +++ .../libvpx/vp8/common/x86/subpixel_ssse3.asm | 1515 +++++ media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c | 365 + media/libvpx/libvpx/vp8/decoder/dboolhuff.c | 72 + media/libvpx/libvpx/vp8/decoder/dboolhuff.h | 132 + media/libvpx/libvpx/vp8/decoder/decodeframe.c | 1263 ++++ media/libvpx/libvpx/vp8/decoder/decodemv.c | 562 ++ media/libvpx/libvpx/vp8/decoder/decodemv.h | 26 + media/libvpx/libvpx/vp8/decoder/decoderthreading.h | 30 + media/libvpx/libvpx/vp8/decoder/detokenize.c | 210 + media/libvpx/libvpx/vp8/decoder/detokenize.h | 27 + media/libvpx/libvpx/vp8/decoder/ec_types.h | 53 + .../libvpx/libvpx/vp8/decoder/error_concealment.c | 482 ++ .../libvpx/libvpx/vp8/decoder/error_concealment.h | 41 + media/libvpx/libvpx/vp8/decoder/onyxd_if.c | 460 ++ media/libvpx/libvpx/vp8/decoder/onyxd_int.h | 141 + media/libvpx/libvpx/vp8/decoder/threading.c | 907 +++ media/libvpx/libvpx/vp8/decoder/treereader.h | 45 + .../libvpx/vp8/encoder/arm/neon/denoising_neon.c | 460 ++ .../vp8/encoder/arm/neon/fastquantizeb_neon.c | 91 + .../libvpx/vp8/encoder/arm/neon/shortfdct_neon.c | 261 + .../vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c | 121 + media/libvpx/libvpx/vp8/encoder/bitstream.c | 1381 ++++ media/libvpx/libvpx/vp8/encoder/bitstream.h | 32 + media/libvpx/libvpx/vp8/encoder/block.h | 168 + media/libvpx/libvpx/vp8/encoder/boolhuff.c | 63 + media/libvpx/libvpx/vp8/encoder/boolhuff.h | 112 + media/libvpx/libvpx/vp8/encoder/copy_c.c | 27 + media/libvpx/libvpx/vp8/encoder/dct.c | 108 + media/libvpx/libvpx/vp8/encoder/dct_value_cost.h | 344 + media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h | 848 +++ .../libvpx/libvpx/vp8/encoder/defaultcoefcounts.h | 235 + media/libvpx/libvpx/vp8/encoder/denoising.c | 725 ++ media/libvpx/libvpx/vp8/encoder/denoising.h | 103 + media/libvpx/libvpx/vp8/encoder/encodeframe.c | 1306 ++++ media/libvpx/libvpx/vp8/encoder/encodeframe.h | 40 + media/libvpx/libvpx/vp8/encoder/encodeintra.c | 116 + media/libvpx/libvpx/vp8/encoder/encodeintra.h | 28 + media/libvpx/libvpx/vp8/encoder/encodemb.c | 512 ++ media/libvpx/libvpx/vp8/encoder/encodemb.h | 40 + media/libvpx/libvpx/vp8/encoder/encodemv.c | 320 + media/libvpx/libvpx/vp8/encoder/encodemv.h | 29 + media/libvpx/libvpx/vp8/encoder/ethreading.c | 664 ++ media/libvpx/libvpx/vp8/encoder/ethreading.h | 32 + media/libvpx/libvpx/vp8/encoder/firstpass.c | 3090 +++++++++ media/libvpx/libvpx/vp8/encoder/firstpass.h | 31 + media/libvpx/libvpx/vp8/encoder/lookahead.c | 184 + media/libvpx/libvpx/vp8/encoder/lookahead.h | 99 + .../libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c | 161 + .../libvpx/vp8/encoder/loongarch/encodeopt_lsx.c | 82 + .../vp8/encoder/loongarch/vp8_quantize_lsx.c | 145 + media/libvpx/libvpx/vp8/encoder/mcomp.c | 1561 +++++ media/libvpx/libvpx/vp8/encoder/mcomp.h | 75 + media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c | 434 ++ .../libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 263 + media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c | 196 + .../libvpx/vp8/encoder/mips/msa/denoising_msa.c | 568 ++ .../libvpx/vp8/encoder/mips/msa/encodeopt_msa.c | 167 + .../libvpx/vp8/encoder/mips/msa/quantize_msa.c | 211 + .../vp8/encoder/mips/msa/temporal_filter_msa.c | 284 + media/libvpx/libvpx/vp8/encoder/modecosts.c | 48 + media/libvpx/libvpx/vp8/encoder/modecosts.h | 26 + media/libvpx/libvpx/vp8/encoder/mr_dissim.c | 215 + media/libvpx/libvpx/vp8/encoder/mr_dissim.h | 27 + media/libvpx/libvpx/vp8/encoder/onyx_if.c | 5420 +++++++++++++++ media/libvpx/libvpx/vp8/encoder/onyx_int.h | 745 +++ media/libvpx/libvpx/vp8/encoder/pickinter.c | 1347 ++++ media/libvpx/libvpx/vp8/encoder/pickinter.h | 33 + media/libvpx/libvpx/vp8/encoder/picklpf.c | 392 ++ media/libvpx/libvpx/vp8/encoder/picklpf.h | 30 + media/libvpx/libvpx/vp8/encoder/quantize.h | 34 + media/libvpx/libvpx/vp8/encoder/ratectrl.c | 1591 +++++ media/libvpx/libvpx/vp8/encoder/ratectrl.h | 40 + media/libvpx/libvpx/vp8/encoder/rdopt.c | 2394 +++++++ media/libvpx/libvpx/vp8/encoder/rdopt.h | 126 + media/libvpx/libvpx/vp8/encoder/segmentation.c | 55 + media/libvpx/libvpx/vp8/encoder/segmentation.h | 29 + media/libvpx/libvpx/vp8/encoder/temporal_filter.c | 434 ++ media/libvpx/libvpx/vp8/encoder/temporal_filter.h | 26 + media/libvpx/libvpx/vp8/encoder/tokenize.c | 468 ++ media/libvpx/libvpx/vp8/encoder/tokenize.h | 48 + media/libvpx/libvpx/vp8/encoder/treewriter.c | 33 + media/libvpx/libvpx/vp8/encoder/treewriter.h | 106 + media/libvpx/libvpx/vp8/encoder/vp8_quantize.c | 492 ++ .../libvpx/vp8/encoder/x86/block_error_sse2.asm | 188 + media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm | 94 + media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm | 147 + media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm | 434 ++ .../libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c | 372 + .../libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 166 + .../libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c | 141 + .../vp8/encoder/x86/temporal_filter_apply_sse2.asm | 209 + .../libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c | 28 + .../libvpx/vp8/encoder/x86/vp8_quantize_sse2.c | 226 + .../libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c | 93 + media/libvpx/libvpx/vp8/exports_dec | 2 + media/libvpx/libvpx/vp8/exports_enc | 2 + media/libvpx/libvpx/vp8/vp8_common.mk | 149 + media/libvpx/libvpx/vp8/vp8_cx_iface.c | 1398 ++++ media/libvpx/libvpx/vp8/vp8_dx_iface.c | 752 +++ media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 429 ++ media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 64 + media/libvpx/libvpx/vp8/vp8cx.mk | 132 + media/libvpx/libvpx/vp8/vp8dx.mk | 39 + .../common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 446 ++ .../common/arm/neon/vp9_highbd_iht4x4_add_neon.c | 181 + .../common/arm/neon/vp9_highbd_iht8x8_add_neon.c | 345 + .../vp9/common/arm/neon/vp9_iht16x16_add_neon.c | 279 + .../vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 76 + .../vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 68 + .../libvpx/vp9/common/arm/neon/vp9_iht_neon.h | 272 + .../vp9/common/mips/dspr2/vp9_itrans16_dspr2.c | 98 + .../vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 90 + .../vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 84 + .../libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c | 80 + .../libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c | 61 + .../libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c | 79 + .../libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c | 134 + media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c | 116 + media/libvpx/libvpx/vp9/common/vp9_alloccommon.c | 195 + media/libvpx/libvpx/vp9/common/vp9_alloccommon.h | 49 + media/libvpx/libvpx/vp9/common/vp9_blockd.c | 131 + media/libvpx/libvpx/vp9/common/vp9_blockd.h | 322 + media/libvpx/libvpx/vp9/common/vp9_common.h | 59 + media/libvpx/libvpx/vp9/common/vp9_common_data.c | 259 + media/libvpx/libvpx/vp9/common/vp9_common_data.h | 45 + media/libvpx/libvpx/vp9/common/vp9_debugmodes.c | 88 + media/libvpx/libvpx/vp9/common/vp9_entropy.c | 1100 +++ media/libvpx/libvpx/vp9/common/vp9_entropy.h | 197 + media/libvpx/libvpx/vp9/common/vp9_entropymode.c | 469 ++ media/libvpx/libvpx/vp9/common/vp9_entropymode.h | 107 + media/libvpx/libvpx/vp9/common/vp9_entropymv.c | 191 + media/libvpx/libvpx/vp9/common/vp9_entropymv.h | 136 + media/libvpx/libvpx/vp9/common/vp9_enums.h | 145 + media/libvpx/libvpx/vp9/common/vp9_filter.c | 82 + media/libvpx/libvpx/vp9/common/vp9_filter.h | 42 + media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c | 82 + media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h | 53 + media/libvpx/libvpx/vp9/common/vp9_idct.c | 398 ++ media/libvpx/libvpx/vp9/common/vp9_idct.h | 81 + media/libvpx/libvpx/vp9/common/vp9_loopfilter.c | 1633 +++++ media/libvpx/libvpx/vp9/common/vp9_loopfilter.h | 160 + media/libvpx/libvpx/vp9/common/vp9_mfqe.c | 383 ++ media/libvpx/libvpx/vp9/common/vp9_mfqe.h | 31 + media/libvpx/libvpx/vp9/common/vp9_mv.h | 57 + media/libvpx/libvpx/vp9/common/vp9_mvref_common.c | 199 + media/libvpx/libvpx/vp9/common/vp9_mvref_common.h | 323 + media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h | 468 ++ media/libvpx/libvpx/vp9/common/vp9_postproc.c | 435 ++ media/libvpx/libvpx/vp9/common/vp9_postproc.h | 53 + media/libvpx/libvpx/vp9/common/vp9_ppflags.h | 36 + media/libvpx/libvpx/vp9/common/vp9_pred_common.c | 316 + media/libvpx/libvpx/vp9/common/vp9_pred_common.h | 197 + media/libvpx/libvpx/vp9/common/vp9_quant_common.c | 206 + media/libvpx/libvpx/vp9/common/vp9_quant_common.h | 36 + media/libvpx/libvpx/vp9/common/vp9_reconinter.c | 290 + media/libvpx/libvpx/vp9/common/vp9_reconinter.h | 107 + media/libvpx/libvpx/vp9/common/vp9_reconintra.c | 431 ++ media/libvpx/libvpx/vp9/common/vp9_reconintra.h | 31 + media/libvpx/libvpx/vp9/common/vp9_rtcd.c | 15 + media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl | 228 + media/libvpx/libvpx/vp9/common/vp9_scale.c | 171 + media/libvpx/libvpx/vp9/common/vp9_scale.h | 71 + media/libvpx/libvpx/vp9/common/vp9_scan.c | 725 ++ media/libvpx/libvpx/vp9/common/vp9_scan.h | 58 + media/libvpx/libvpx/vp9/common/vp9_seg_common.c | 62 + media/libvpx/libvpx/vp9/common/vp9_seg_common.h | 86 + media/libvpx/libvpx/vp9/common/vp9_thread_common.c | 596 ++ media/libvpx/libvpx/vp9/common/vp9_thread_common.h | 83 + media/libvpx/libvpx/vp9/common/vp9_tile_common.c | 57 + media/libvpx/libvpx/vp9/common/vp9_tile_common.h | 40 + .../vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c | 419 ++ .../vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c | 131 + .../vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c | 255 + .../libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 224 + .../libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 289 + media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c | 3063 +++++++++ media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h | 35 + media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c | 850 +++ media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h | 29 + media/libvpx/libvpx/vp9/decoder/vp9_decoder.c | 585 ++ media/libvpx/libvpx/vp9/decoder/vp9_decoder.h | 189 + media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c | 333 + media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h | 29 + media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.c | 72 + media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h | 26 + media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c | 124 + media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h | 45 + .../libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c | 2173 ++++++ .../vp9/encoder/arm/neon/vp9_denoiser_neon.c | 356 + .../encoder/arm/neon/vp9_diamond_search_sad_neon.c | 296 + .../libvpx/vp9/encoder/arm/neon/vp9_error_neon.c | 102 + .../vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 844 +++ .../vp9/encoder/arm/neon/vp9_highbd_error_neon.c | 49 + .../arm/neon/vp9_highbd_temporal_filter_neon.c | 872 +++ .../vp9/encoder/arm/neon/vp9_quantize_neon.c | 403 ++ .../encoder/arm/neon/vp9_temporal_filter_neon.c | 849 +++ .../libvpx/vp9/encoder/mips/msa/vp9_error_msa.c | 108 + .../vp9/encoder/mips/msa/vp9_fdct16x16_msa.c | 501 ++ .../libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c | 98 + .../libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c | 65 + .../libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h | 116 + .../libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c | 287 + media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c | 63 + media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h | 127 + media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c | 75 + media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h | 27 + .../libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c | 160 + .../libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h | 36 + .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 702 ++ .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 147 + media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c | 247 + media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h | 34 + media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c | 1387 ++++ media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h | 49 + media/libvpx/libvpx/vp9/encoder/vp9_block.h | 225 + media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c | 135 + media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h | 26 + media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c | 161 + media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h | 106 + media/libvpx/libvpx/vp9/encoder/vp9_cost.c | 65 + media/libvpx/libvpx/vp9/encoder/vp9_cost.h | 57 + media/libvpx/libvpx/vp9/encoder/vp9_dct.c | 687 ++ media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c | 839 +++ media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h | 132 + media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c | 6581 ++++++++++++++++++ media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h | 57 + media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c | 1061 +++ media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h | 60 + media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c | 271 + media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h | 38 + media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 7074 ++++++++++++++++++++ media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 1664 +++++ media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 692 ++ media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 77 + media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c | 281 + media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h | 63 + media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 203 + media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 31 + media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 3906 +++++++++++ media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h | 274 + .../libvpx/vp9/encoder/vp9_firstpass_stats.h | 54 + media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c | 136 + media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h | 46 + media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 235 + media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h | 127 + media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c | 388 ++ media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h | 40 + media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c | 3035 +++++++++ media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h | 178 + media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c | 334 + media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h | 41 + .../libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c | 302 + .../libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h | 54 + .../libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c | 536 ++ .../libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h | 129 + .../libvpx/vp9/encoder/vp9_partition_models.h | 975 +++ media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c | 203 + media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h | 29 + media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c | 2992 +++++++++ media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h | 35 + media/libvpx/libvpx/vp9/encoder/vp9_quantize.c | 326 + media/libvpx/libvpx/vp9/encoder/vp9_quantize.h | 59 + media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 3391 ++++++++++ media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 359 + media/libvpx/libvpx/vp9/encoder/vp9_rd.c | 795 +++ media/libvpx/libvpx/vp9/encoder/vp9_rd.h | 235 + media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 4923 ++++++++++++++ media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h | 63 + media/libvpx/libvpx/vp9/encoder/vp9_resize.c | 832 +++ media/libvpx/libvpx/vp9/encoder/vp9_resize.h | 68 + media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c | 325 + media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h | 55 + .../libvpx/libvpx/vp9/encoder/vp9_skin_detection.c | 174 + .../libvpx/libvpx/vp9/encoder/vp9_skin_detection.h | 40 + .../libvpx/libvpx/vp9/encoder/vp9_speed_features.c | 1093 +++ .../libvpx/libvpx/vp9/encoder/vp9_speed_features.h | 674 ++ media/libvpx/libvpx/vp9/encoder/vp9_subexp.c | 196 + media/libvpx/libvpx/vp9/encoder/vp9_subexp.h | 41 + .../libvpx/vp9/encoder/vp9_svc_layercontext.c | 1376 ++++ .../libvpx/vp9/encoder/vp9_svc_layercontext.h | 290 + .../libvpx/vp9/encoder/vp9_temporal_filter.c | 1205 ++++ .../libvpx/vp9/encoder/vp9_temporal_filter.h | 46 + .../vp9/encoder/vp9_temporal_filter_constants.h | 410 ++ media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c | 490 ++ media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h | 130 + media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 1541 +++++ media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 46 + media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c | 58 + media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h | 51 + .../vp9/encoder/x86/highbd_temporal_filter_sse4.c | 893 +++ .../libvpx/vp9/encoder/x86/temporal_filter_sse4.c | 875 +++ .../libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c | 1537 +++++ .../libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm | 69 + .../libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c | 327 + .../libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c | 161 + .../libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 115 + .../libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 907 +++ .../x86/vp9_highbd_block_error_intrin_sse2.c | 72 + .../libvpx/vp9/encoder/x86/vp9_quantize_avx2.c | 439 ++ .../libvpx/vp9/encoder/x86/vp9_quantize_sse2.c | 126 + .../libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c | 252 + media/libvpx/libvpx/vp9/exports_dec | 2 + media/libvpx/libvpx/vp9/exports_enc | 2 + media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 348 + media/libvpx/libvpx/vp9/ratectrl_rtc.h | 115 + media/libvpx/libvpx/vp9/simple_encode.cc | 1332 ++++ media/libvpx/libvpx/vp9/simple_encode.h | 583 ++ media/libvpx/libvpx/vp9/vp9_common.mk | 99 + media/libvpx/libvpx/vp9/vp9_cx_iface.c | 2432 +++++++ media/libvpx/libvpx/vp9/vp9_cx_iface.h | 49 + media/libvpx/libvpx/vp9/vp9_dx_iface.c | 743 ++ media/libvpx/libvpx/vp9/vp9_dx_iface.h | 52 + media/libvpx/libvpx/vp9/vp9_iface_common.c | 136 + media/libvpx/libvpx/vp9/vp9_iface_common.h | 44 + media/libvpx/libvpx/vp9/vp9cx.mk | 178 + media/libvpx/libvpx/vp9/vp9dx.mk | 34 + media/libvpx/libvpx/vpx/exports_com | 19 + media/libvpx/libvpx/vpx/exports_dec | 8 + media/libvpx/libvpx/vpx/exports_enc | 9 + .../libvpx/vpx/internal/vpx_codec_internal.h | 480 ++ .../libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h | 74 + media/libvpx/libvpx/vpx/src/vpx_codec.c | 133 + media/libvpx/libvpx/vpx/src/vpx_decoder.c | 190 + media/libvpx/libvpx/vpx/src/vpx_encoder.c | 382 ++ media/libvpx/libvpx/vpx/src/vpx_image.c | 237 + media/libvpx/libvpx/vpx/src/vpx_tpl.c | 107 + media/libvpx/libvpx/vpx/vp8.h | 136 + media/libvpx/libvpx/vpx/vp8cx.h | 1118 ++++ media/libvpx/libvpx/vpx/vp8dx.h | 228 + media/libvpx/libvpx/vpx/vpx_codec.h | 475 ++ media/libvpx/libvpx/vpx/vpx_codec.mk | 47 + media/libvpx/libvpx/vpx/vpx_decoder.h | 367 + media/libvpx/libvpx/vpx/vpx_encoder.h | 1127 ++++ media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h | 558 ++ media/libvpx/libvpx/vpx/vpx_frame_buffer.h | 83 + media/libvpx/libvpx/vpx/vpx_image.h | 209 + media/libvpx/libvpx/vpx/vpx_integer.h | 40 + media/libvpx/libvpx/vpx/vpx_tpl.h | 102 + media/libvpx/libvpx/vpx_dsp/add_noise.c | 74 + media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c | 238 + media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c | 65 + media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c | 480 ++ media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c | 439 ++ media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h | 318 + media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c | 419 ++ media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h | 2919 ++++++++ media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c | 85 + media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h | 105 + media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c | 143 + media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h | 307 + media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h | 542 ++ .../libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c | 180 + media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c | 158 + media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c | 140 + .../libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c | 64 + .../libvpx/vpx_dsp/arm/highbd_hadamard_neon.c | 215 + .../libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c | 1361 ++++ .../vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c | 640 ++ .../vpx_dsp/arm/highbd_idct32x32_135_add_neon.c | 757 +++ .../vpx_dsp/arm/highbd_idct32x32_34_add_neon.c | 625 ++ .../libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c | 88 + .../libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c | 89 + .../libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c | 371 + media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h | 474 ++ .../libvpx/vpx_dsp/arm/highbd_intrapred_neon.c | 2514 +++++++ .../libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c | 776 +++ .../libvpx/vpx_dsp/arm/highbd_quantize_neon.c | 300 + .../libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c | 273 + media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c | 408 ++ media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c | 238 + .../vpx_dsp/arm/highbd_subpel_variance_neon.c | 586 ++ .../libvpx/vpx_dsp/arm/highbd_variance_neon.c | 436 ++ .../vpx_dsp/arm/highbd_variance_neon_dotprod.c | 96 + .../libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 931 +++ .../vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 183 + .../vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 113 + .../libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 58 + .../libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c | 77 + .../libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 764 +++ .../libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c | 674 ++ .../libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c | 58 + .../libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c | 513 ++ .../libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c | 776 +++ .../libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm | 66 + .../libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 47 + .../libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm | 188 + media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 59 + .../libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c | 65 + media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 59 + media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm | 46 + media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h | 919 +++ media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c | 1942 ++++++ .../libvpx/vpx_dsp/arm/intrapred_neon_asm.asm | 630 ++ .../libvpx/vpx_dsp/arm/loopfilter_16_neon.asm | 666 ++ .../libvpx/vpx_dsp/arm/loopfilter_4_neon.asm | 549 ++ .../libvpx/vpx_dsp/arm/loopfilter_8_neon.asm | 491 ++ media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c | 1107 +++ media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 473 ++ media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c | 286 + media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c | 228 + .../libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c | 176 + media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c | 391 ++ media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c | 247 + media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm | 34 + media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c | 188 + media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c | 197 + .../libvpx/vpx_dsp/arm/subpel_variance_neon.c | 489 ++ media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c | 137 + media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h | 275 + media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c | 100 + media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h | 1546 +++++ media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c | 332 + .../libvpx/vpx_dsp/arm/variance_neon_dotprod.c | 298 + .../vpx_convolve8_avg_horiz_filter_type1_neon.asm | 438 ++ .../vpx_convolve8_avg_horiz_filter_type2_neon.asm | 439 ++ .../vpx_convolve8_avg_vert_filter_type1_neon.asm | 486 ++ .../vpx_convolve8_avg_vert_filter_type2_neon.asm | 487 ++ .../arm/vpx_convolve8_horiz_filter_type1_neon.asm | 415 ++ .../arm/vpx_convolve8_horiz_filter_type2_neon.asm | 415 ++ .../libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 965 +++ .../libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 431 ++ .../libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c | 41 + .../libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h | 29 + .../vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 1117 ++++ .../libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 989 +++ .../arm/vpx_convolve8_vert_filter_type1_neon.asm | 457 ++ .../arm/vpx_convolve8_vert_filter_type2_neon.asm | 455 ++ .../libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c | 139 + .../vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm | 116 + .../libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c | 100 + .../vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm | 84 + .../libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 68 + .../libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 66 + .../libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 66 + .../libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 320 + media/libvpx/libvpx/vpx_dsp/avg.c | 441 ++ media/libvpx/libvpx/vpx_dsp/bitreader.c | 100 + media/libvpx/libvpx/vpx_dsp/bitreader.h | 163 + media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c | 44 + media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h | 47 + media/libvpx/libvpx/vpx_dsp/bitwriter.c | 42 + media/libvpx/libvpx/vpx_dsp/bitwriter.h | 120 + media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c | 43 + media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h | 38 + media/libvpx/libvpx/vpx_dsp/deblock.c | 196 + media/libvpx/libvpx/vpx_dsp/fastssim.c | 498 ++ media/libvpx/libvpx/vpx_dsp/fwd_txfm.c | 809 +++ media/libvpx/libvpx/vpx_dsp/fwd_txfm.h | 25 + media/libvpx/libvpx/vpx_dsp/intrapred.c | 917 +++ media/libvpx/libvpx/vpx_dsp/inv_txfm.c | 2701 ++++++++ media/libvpx/libvpx/vpx_dsp/inv_txfm.h | 125 + media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c | 90 + .../libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c | 83 + .../vpx_dsp/loongarch/bitdepth_conversion_lsx.h | 41 + .../libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 1176 ++++ .../libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c | 350 + .../libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h | 381 ++ .../libvpx/vpx_dsp/loongarch/idct32x32_lsx.c | 834 +++ .../libvpx/vpx_dsp/loongarch/intrapred_lsx.c | 98 + .../libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c | 1320 ++++ .../libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c | 214 + .../libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c | 458 ++ .../libvpx/vpx_dsp/loongarch/loopfilter_lsx.h | 167 + .../libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c | 244 + media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c | 717 ++ .../vpx_dsp/loongarch/sub_pixel_variance_lsx.c | 874 +++ .../libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c | 371 + .../libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h | 48 + .../libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c | 263 + .../libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h | 62 + .../loongarch/vpx_convolve8_avg_horiz_lsx.c | 972 +++ .../vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 737 ++ .../vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c | 918 +++ .../vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c | 814 +++ .../libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c | 697 ++ .../vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c | 825 +++ .../vpx_dsp/loongarch/vpx_convolve_avg_lsx.c | 321 + .../vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 437 ++ .../libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h | 138 + media/libvpx/libvpx/vpx_dsp/loopfilter.c | 743 ++ media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c | 54 + media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c | 731 ++ media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c | 30 + media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h | 48 + .../libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c | 256 + .../vpx_dsp/mips/convolve2_avg_horiz_dspr2.c | 802 +++ media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c | 1029 +++ .../libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c | 681 ++ .../libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c | 237 + .../libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c | 647 ++ .../vpx_dsp/mips/convolve8_avg_horiz_dspr2.c | 998 +++ media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c | 1602 +++++ .../libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c | 878 +++ .../libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c | 360 + .../libvpx/vpx_dsp/mips/convolve_common_dspr2.h | 58 + media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c | 742 ++ .../libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c | 948 +++ media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c | 272 + media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h | 364 + media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c | 486 ++ media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c | 730 ++ media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c | 99 + media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c | 117 + .../libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c | 325 + .../libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c | 225 + .../libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c | 603 ++ media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c | 738 ++ media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h | 75 + media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h | 411 ++ media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c | 1230 ++++ .../libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c | 1119 ++++ media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c | 1218 ++++ media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c | 375 ++ media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c | 690 ++ .../libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c | 1489 ++++ .../libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c | 147 + .../libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c | 333 + .../libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c | 326 + .../libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h | 734 ++ .../libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h | 435 ++ .../libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h | 355 + .../libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c | 588 ++ .../vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c | 732 ++ .../libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c | 756 +++ media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h | 177 + media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h | 1971 ++++++ media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c | 807 +++ media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c | 804 +++ .../libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c | 1789 +++++ media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c | 306 + media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c | 264 + media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c | 129 + media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h | 101 + media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c | 1357 ++++ media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c | 622 ++ .../vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c | 716 ++ .../libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c | 611 ++ .../vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c | 684 ++ .../libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c | 692 ++ .../libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c | 716 ++ .../libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c | 1227 ++++ .../libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c | 699 ++ .../libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c | 234 + .../libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c | 249 + .../libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h | 122 + media/libvpx/libvpx/vpx_dsp/postproc.h | 25 + .../libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h | 47 + media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c | 374 ++ media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c | 553 ++ media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c | 119 + media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c | 767 +++ media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c | 1828 +++++ media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h | 48 + media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c | 301 + media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c | 261 + media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c | 117 + media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h | 133 + media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h | 90 + media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h | 108 + media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c | 271 + media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c | 408 ++ media/libvpx/libvpx/vpx_dsp/prob.c | 47 + media/libvpx/libvpx/vpx_dsp/prob.h | 106 + media/libvpx/libvpx/vpx_dsp/psnr.c | 258 + media/libvpx/libvpx/vpx_dsp/psnr.h | 54 + media/libvpx/libvpx/vpx_dsp/psnrhvs.c | 281 + media/libvpx/libvpx/vpx_dsp/quantize.c | 324 + media/libvpx/libvpx/vpx_dsp/quantize.h | 46 + media/libvpx/libvpx/vpx_dsp/sad.c | 256 + media/libvpx/libvpx/vpx_dsp/skin_detection.c | 79 + media/libvpx/libvpx/vpx_dsp/skin_detection.h | 24 + media/libvpx/libvpx/vpx_dsp/sse.c | 59 + media/libvpx/libvpx/vpx_dsp/ssim.c | 461 ++ media/libvpx/libvpx/vpx_dsp/ssim.h | 87 + media/libvpx/libvpx/vpx_dsp/subtract.c | 54 + media/libvpx/libvpx/vpx_dsp/sum_squares.c | 26 + media/libvpx/libvpx/vpx_dsp/txfm_common.h | 66 + media/libvpx/libvpx/vpx_dsp/variance.c | 566 ++ media/libvpx/libvpx/vpx_dsp/variance.h | 88 + media/libvpx/libvpx/vpx_dsp/vpx_convolve.c | 537 ++ media/libvpx/libvpx/vpx_dsp/vpx_convolve.h | 38 + media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 485 ++ media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h | 89 + media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 15 + media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 1828 +++++ media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 48 + media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm | 88 + media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 519 ++ media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 614 ++ media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c | 111 + media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 69 + .../libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm | 130 + .../libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h | 44 + .../vpx_dsp/x86/bitdepth_conversion_sse2.asm | 90 + .../libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h | 56 + media/libvpx/libvpx/vpx_dsp/x86/convolve.h | 279 + media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h | 161 + media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h | 88 + media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h | 112 + media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm | 432 ++ .../libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2930 ++++++++ .../libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 3130 +++++++++ media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c | 399 ++ .../libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h | 1015 +++ media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c | 272 + media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h | 371 + .../libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 361 + .../libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 1495 +++++ .../libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 355 + .../libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 349 + .../libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 782 +++ .../libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c | 765 +++ .../libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 160 + .../libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 47 + .../libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 213 + .../libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 210 + .../vpx_dsp/x86/highbd_intrapred_intrin_sse2.c | 534 ++ .../vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c | 930 +++ .../libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm | 453 ++ .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 404 ++ .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h | 112 + .../libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c | 1140 ++++ .../vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 254 + .../vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 153 + .../libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 462 ++ .../libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 326 + media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 522 ++ .../libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 416 ++ .../x86/highbd_subpel_variance_impl_sse2.asm | 1021 +++ .../vpx_dsp/x86/highbd_variance_impl_sse2.asm | 315 + .../libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 608 ++ media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm | 860 +++ .../libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm | 871 +++ media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c | 626 ++ media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 1235 ++++ media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 710 ++ media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c | 364 + media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h | 110 + media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm | 103 + media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c | 913 +++ .../libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c | 1779 +++++ media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h | 154 + media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c | 141 + media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c | 254 + media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c | 290 + media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c | 113 + media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h | 126 + media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c | 228 + media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h | 51 + media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c | 184 + media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c | 83 + media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm | 278 + media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c | 208 + media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm | 332 + media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c | 368 + media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c | 312 + .../libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm | 219 + .../libvpx/vpx_dsp/x86/subpel_variance_sse2.asm | 1467 ++++ media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c | 203 + media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm | 128 + media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c | 105 + media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h | 367 + media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h | 32 + media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c | 872 +++ media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c | 565 ++ .../libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 226 + .../vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 964 +++ .../x86/vpx_high_subpixel_bilinear_sse2.asm | 496 ++ .../vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 1161 ++++ .../vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 1374 ++++ .../vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 1087 +++ .../libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm | 989 +++ .../libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 803 +++ .../vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm | 450 ++ .../vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm | 420 ++ .../libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h | 31 + media/libvpx/libvpx/vpx_mem/vpx_mem.c | 86 + media/libvpx/libvpx/vpx_mem/vpx_mem.h | 52 + media/libvpx/libvpx/vpx_mem/vpx_mem.mk | 4 + media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c | 90 + media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c | 199 + media/libvpx/libvpx/vpx_ports/arm.h | 41 + media/libvpx/libvpx/vpx_ports/arm_cpudetect.h | 52 + media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h | 81 + media/libvpx/libvpx/vpx_ports/bitops.h | 94 + .../libvpx/libvpx/vpx_ports/compiler_attributes.h | 69 + media/libvpx/libvpx/vpx_ports/emmintrin_compat.h | 55 + media/libvpx/libvpx/vpx_ports/emms_mmx.asm | 18 + media/libvpx/libvpx/vpx_ports/emms_mmx.c | 15 + .../libvpx/libvpx/vpx_ports/float_control_word.asm | 33 + media/libvpx/libvpx/vpx_ports/loongarch.h | 29 + .../libvpx/libvpx/vpx_ports/loongarch_cpudetect.c | 40 + media/libvpx/libvpx/vpx_ports/mem.h | 44 + media/libvpx/libvpx/vpx_ports/mem_ops.h | 227 + media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h | 171 + media/libvpx/libvpx/vpx_ports/mips.h | 27 + media/libvpx/libvpx/vpx_ports/mips_cpudetect.c | 57 + media/libvpx/libvpx/vpx_ports/msvc.h | 32 + media/libvpx/libvpx/vpx_ports/ppc.h | 29 + media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c | 80 + media/libvpx/libvpx/vpx_ports/static_assert.h | 30 + media/libvpx/libvpx/vpx_ports/system_state.h | 30 + media/libvpx/libvpx/vpx_ports/vpx_once.h | 140 + media/libvpx/libvpx/vpx_ports/vpx_ports.mk | 58 + media/libvpx/libvpx/vpx_ports/vpx_timer.h | 109 + media/libvpx/libvpx/vpx_ports/x86.h | 402 ++ media/libvpx/libvpx/vpx_ports/x86_abi_support.asm | 425 ++ .../libvpx/libvpx/vpx_scale/generic/gen_scalers.c | 228 + media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c | 529 ++ media/libvpx/libvpx/vpx_scale/generic/yv12config.c | 308 + media/libvpx/libvpx/vpx_scale/generic/yv12extend.c | 335 + .../libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c | 138 + media/libvpx/libvpx/vpx_scale/vpx_scale.h | 22 + media/libvpx/libvpx/vpx_scale/vpx_scale.mk | 16 + media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c | 15 + media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl | 44 + media/libvpx/libvpx/vpx_scale/yv12config.h | 103 + media/libvpx/libvpx/vpx_util/endian_inl.h | 118 + media/libvpx/libvpx/vpx_util/loongson_intrinsics.h | 2090 ++++++ media/libvpx/libvpx/vpx_util/vpx_atomics.h | 111 + media/libvpx/libvpx/vpx_util/vpx_debug_util.c | 282 + media/libvpx/libvpx/vpx_util/vpx_debug_util.h | 70 + media/libvpx/libvpx/vpx_util/vpx_thread.c | 181 + media/libvpx/libvpx/vpx_util/vpx_thread.h | 438 ++ media/libvpx/libvpx/vpx_util/vpx_timestamp.h | 49 + media/libvpx/libvpx/vpx_util/vpx_util.mk | 20 + media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c | 46 + media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h | 27 + media/libvpx/libvpx/vpxdec.c | 1146 ++++ media/libvpx/libvpx/vpxenc.c | 2070 ++++++ media/libvpx/libvpx/vpxenc.h | 65 + media/libvpx/libvpx/vpxstats.c | 105 + media/libvpx/libvpx/vpxstats.h | 43 + media/libvpx/libvpx/warnings.c | 109 + media/libvpx/libvpx/warnings.h | 33 + media/libvpx/libvpx/webmdec.cc | 226 + media/libvpx/libvpx/webmdec.h | 69 + media/libvpx/libvpx/webmenc.cc | 95 + media/libvpx/libvpx/webmenc.h | 55 + media/libvpx/libvpx/y4menc.c | 57 + media/libvpx/libvpx/y4menc.h | 33 + media/libvpx/libvpx/y4minput.c | 1170 ++++ media/libvpx/libvpx/y4minput.h | 76 + media/libvpx/lint_config.sh | 112 + media/libvpx/moz.build | 155 + media/libvpx/moz.yaml | 76 + media/libvpx/rename_duplicate_files.patch | 22 + media/libvpx/sources.mozbuild | 1235 ++++ media/libvpx/win64_build_fix.patch | 22 + 1270 files changed, 526733 insertions(+) create mode 100644 media/libvpx/LICENSE create mode 100644 media/libvpx/Makefile.in create mode 100644 media/libvpx/README_MOZILLA create mode 100644 media/libvpx/config/generic/vp8_rtcd.h create mode 100644 media/libvpx/config/generic/vp9_rtcd.h create mode 100644 media/libvpx/config/generic/vpx_config.asm create mode 100644 media/libvpx/config/generic/vpx_config.c create mode 100644 media/libvpx/config/generic/vpx_config.h create mode 100644 media/libvpx/config/generic/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/generic/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/arm/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/arm/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/arm/vpx_config.asm create mode 100644 media/libvpx/config/linux/arm/vpx_config.c create mode 100644 media/libvpx/config/linux/arm/vpx_config.h create mode 100644 media/libvpx/config/linux/arm/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/arm/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vpx_config.asm create mode 100644 media/libvpx/config/linux/arm64/vpx_config.c create mode 100644 media/libvpx/config/linux/arm64/vpx_config.h create mode 100644 media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/ia32/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/ia32/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/ia32/vpx_config.asm create mode 100644 media/libvpx/config/linux/ia32/vpx_config.c create mode 100644 media/libvpx/config/linux/ia32/vpx_config.h create mode 100644 media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/ia32/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/x64/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/x64/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/x64/vpx_config.asm create mode 100644 media/libvpx/config/linux/x64/vpx_config.c create mode 100644 media/libvpx/config/linux/x64/vpx_config.h create mode 100644 media/libvpx/config/linux/x64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/x64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/mac/ia32/vp8_rtcd.h create mode 100644 media/libvpx/config/mac/ia32/vp9_rtcd.h create mode 100644 media/libvpx/config/mac/ia32/vpx_config.asm create mode 100644 media/libvpx/config/mac/ia32/vpx_config.c create mode 100644 media/libvpx/config/mac/ia32/vpx_config.h create mode 100644 media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/mac/ia32/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/mac/x64/vp8_rtcd.h create mode 100644 media/libvpx/config/mac/x64/vp9_rtcd.h create mode 100644 media/libvpx/config/mac/x64/vpx_config.asm create mode 100644 media/libvpx/config/mac/x64/vpx_config.c create mode 100644 media/libvpx/config/mac/x64/vpx_config.h create mode 100644 media/libvpx/config/mac/x64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/mac/x64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/vpx_version.h create mode 100644 media/libvpx/config/win/aarch64/vp8_rtcd.h create mode 100644 media/libvpx/config/win/aarch64/vp9_rtcd.h create mode 100644 media/libvpx/config/win/aarch64/vpx_config.asm create mode 100644 media/libvpx/config/win/aarch64/vpx_config.c create mode 100644 media/libvpx/config/win/aarch64/vpx_config.h create mode 100644 media/libvpx/config/win/aarch64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/win/aarch64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/win/ia32/vp8_rtcd.h create mode 100644 media/libvpx/config/win/ia32/vp9_rtcd.h create mode 100755 media/libvpx/config/win/ia32/vpx_config.asm create mode 100644 media/libvpx/config/win/ia32/vpx_config.c create mode 100644 media/libvpx/config/win/ia32/vpx_config.h create mode 100644 media/libvpx/config/win/ia32/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/win/ia32/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/win/x64/vp8_rtcd.h create mode 100644 media/libvpx/config/win/x64/vp9_rtcd.h create mode 100644 media/libvpx/config/win/x64/vpx_config.asm create mode 100644 media/libvpx/config/win/x64/vpx_config.c create mode 100644 media/libvpx/config/win/x64/vpx_config.h create mode 100644 media/libvpx/config/win/x64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/win/x64/vpx_scale_rtcd.h create mode 100755 media/libvpx/generate_sources_mozbuild.sh create mode 100644 media/libvpx/input_frame_validation.patch create mode 100644 media/libvpx/input_frame_validation_vp9.patch create mode 100644 media/libvpx/libvpx/.clang-format create mode 100644 media/libvpx/libvpx/.mailmap create mode 100644 media/libvpx/libvpx/AUTHORS create mode 100644 media/libvpx/libvpx/CHANGELOG create mode 100644 media/libvpx/libvpx/CONTRIBUTING.md create mode 100644 media/libvpx/libvpx/LICENSE create mode 100644 media/libvpx/libvpx/PATENTS create mode 100644 media/libvpx/libvpx/README create mode 100644 media/libvpx/libvpx/args.c create mode 100644 media/libvpx/libvpx/args.h create mode 100644 media/libvpx/libvpx/build/make/Android.mk create mode 100644 media/libvpx/libvpx/build/make/Makefile create mode 100755 media/libvpx/libvpx/build/make/ads2armasm_ms.pl create mode 100755 media/libvpx/libvpx/build/make/ads2gas.pl create mode 100755 media/libvpx/libvpx/build/make/ads2gas_apple.pl create mode 100755 media/libvpx/libvpx/build/make/armlink_adapter.sh create mode 100644 media/libvpx/libvpx/build/make/configure.sh create mode 100755 media/libvpx/libvpx/build/make/gen_asm_deps.sh create mode 100755 media/libvpx/libvpx/build/make/gen_msvs_def.sh create mode 100755 media/libvpx/libvpx/build/make/gen_msvs_sln.sh create mode 100755 media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh create mode 100644 media/libvpx/libvpx/build/make/ios-Info.plist create mode 100755 media/libvpx/libvpx/build/make/iosbuild.sh create mode 100644 media/libvpx/libvpx/build/make/msvs_common.sh create mode 100755 media/libvpx/libvpx/build/make/rtcd.pl create mode 100644 media/libvpx/libvpx/build/make/thumb.pm create mode 100755 media/libvpx/libvpx/build/make/version.sh create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/cur_frame_16x16.txt create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/estimation_16x16.txt create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/exhaust_16x16.txt create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ground_truth_16x16.txt create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/localVar_16x16.txt create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1.png create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1_12_12.png create mode 100644 media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ref_frame_16x16.txt create mode 100644 media/libvpx/libvpx/codereview.settings create mode 100755 media/libvpx/libvpx/configure create mode 100644 media/libvpx/libvpx/docs.mk create mode 100644 media/libvpx/libvpx/examples.mk create mode 100644 media/libvpx/libvpx/examples/decode_to_md5.c create mode 100644 media/libvpx/libvpx/examples/decode_with_drops.c create mode 100644 media/libvpx/libvpx/examples/postproc.c create mode 100644 media/libvpx/libvpx/examples/resize_util.c create mode 100644 media/libvpx/libvpx/examples/set_maps.c create mode 100644 media/libvpx/libvpx/examples/simple_decoder.c create mode 100644 media/libvpx/libvpx/examples/simple_encoder.c create mode 100644 media/libvpx/libvpx/examples/svc_context.h create mode 100644 media/libvpx/libvpx/examples/svc_encodeframe.c create mode 100644 media/libvpx/libvpx/examples/twopass_encoder.c create mode 100644 media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c create mode 100644 media/libvpx/libvpx/examples/vp8cx_set_ref.c create mode 100644 media/libvpx/libvpx/examples/vp9_lossless_encoder.c create mode 100644 media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c create mode 100644 media/libvpx/libvpx/examples/vp9cx_set_ref.c create mode 100644 media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc create mode 100644 media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c create mode 100644 media/libvpx/libvpx/ivfdec.c create mode 100644 media/libvpx/libvpx/ivfdec.h create mode 100644 media/libvpx/libvpx/ivfenc.c create mode 100644 media/libvpx/libvpx/ivfenc.h create mode 100644 media/libvpx/libvpx/keywords.dox create mode 100644 media/libvpx/libvpx/libs.doxy_template create mode 100644 media/libvpx/libvpx/libs.mk create mode 100644 media/libvpx/libvpx/mainpage.dox create mode 100644 media/libvpx/libvpx/md5_utils.c create mode 100644 media/libvpx/libvpx/md5_utils.h create mode 100644 media/libvpx/libvpx/rate_hist.c create mode 100644 media/libvpx/libvpx/rate_hist.h create mode 100644 media/libvpx/libvpx/solution.mk create mode 100644 media/libvpx/libvpx/test/acm_random.h create mode 100644 media/libvpx/libvpx/test/active_map_refresh_test.cc create mode 100644 media/libvpx/libvpx/test/active_map_test.cc create mode 100644 media/libvpx/libvpx/test/add_noise_test.cc create mode 100644 media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc create mode 100644 media/libvpx/libvpx/test/altref_test.cc create mode 100644 media/libvpx/libvpx/test/android/Android.mk create mode 100644 media/libvpx/libvpx/test/android/README create mode 100644 media/libvpx/libvpx/test/android/get_files.py create mode 100644 media/libvpx/libvpx/test/android/scrape_gtest_log.py create mode 100644 media/libvpx/libvpx/test/aq_segment_test.cc create mode 100644 media/libvpx/libvpx/test/avg_test.cc create mode 100644 media/libvpx/libvpx/test/bench.cc create mode 100644 media/libvpx/libvpx/test/bench.h create mode 100644 media/libvpx/libvpx/test/blockiness_test.cc create mode 100644 media/libvpx/libvpx/test/borders_test.cc create mode 100644 media/libvpx/libvpx/test/buffer.h create mode 100644 media/libvpx/libvpx/test/byte_alignment_test.cc create mode 100644 media/libvpx/libvpx/test/clear_system_state.h create mode 100644 media/libvpx/libvpx/test/codec_factory.h create mode 100644 media/libvpx/libvpx/test/comp_avg_pred_test.cc create mode 100644 media/libvpx/libvpx/test/config_test.cc create mode 100644 media/libvpx/libvpx/test/consistency_test.cc create mode 100644 media/libvpx/libvpx/test/convolve_test.cc create mode 100644 media/libvpx/libvpx/test/cpu_speed_test.cc create mode 100644 media/libvpx/libvpx/test/cq_test.cc create mode 100755 media/libvpx/libvpx/test/cx_set_ref.sh create mode 100644 media/libvpx/libvpx/test/dct16x16_test.cc create mode 100644 media/libvpx/libvpx/test/dct32x32_test.cc create mode 100644 media/libvpx/libvpx/test/dct_partial_test.cc create mode 100644 media/libvpx/libvpx/test/dct_test.cc create mode 100644 media/libvpx/libvpx/test/decode_api_test.cc create mode 100644 media/libvpx/libvpx/test/decode_corrupted.cc create mode 100644 media/libvpx/libvpx/test/decode_perf_test.cc create mode 100644 media/libvpx/libvpx/test/decode_svc_test.cc create mode 100644 media/libvpx/libvpx/test/decode_test_driver.cc create mode 100644 media/libvpx/libvpx/test/decode_test_driver.h create mode 100755 media/libvpx/libvpx/test/decode_to_md5.sh create mode 100755 media/libvpx/libvpx/test/decode_with_drops.sh create mode 100644 media/libvpx/libvpx/test/encode_api_test.cc create mode 100644 media/libvpx/libvpx/test/encode_perf_test.cc create mode 100644 media/libvpx/libvpx/test/encode_test_driver.cc create mode 100644 media/libvpx/libvpx/test/encode_test_driver.h create mode 100644 media/libvpx/libvpx/test/error_resilience_test.cc create mode 100755 media/libvpx/libvpx/test/examples.sh create mode 100644 media/libvpx/libvpx/test/external_frame_buffer_test.cc create mode 100644 media/libvpx/libvpx/test/fdct8x8_test.cc create mode 100644 media/libvpx/libvpx/test/frame_size_tests.cc create mode 100644 media/libvpx/libvpx/test/hadamard_test.cc create mode 100644 media/libvpx/libvpx/test/i420_video_source.h create mode 100644 media/libvpx/libvpx/test/idct8x8_test.cc create mode 100644 media/libvpx/libvpx/test/idct_test.cc create mode 100644 media/libvpx/libvpx/test/init_vpx_test.cc create mode 100644 media/libvpx/libvpx/test/init_vpx_test.h create mode 100644 media/libvpx/libvpx/test/invalid_file_test.cc create mode 100644 media/libvpx/libvpx/test/ivf_video_source.h create mode 100644 media/libvpx/libvpx/test/keyframe_test.cc create mode 100644 media/libvpx/libvpx/test/level_test.cc create mode 100644 media/libvpx/libvpx/test/lpf_test.cc create mode 100644 media/libvpx/libvpx/test/md5_helper.h create mode 100644 media/libvpx/libvpx/test/minmax_test.cc create mode 100644 media/libvpx/libvpx/test/non_greedy_mv_test.cc create mode 100644 media/libvpx/libvpx/test/partial_idct_test.cc create mode 100755 media/libvpx/libvpx/test/postproc.sh create mode 100644 media/libvpx/libvpx/test/pp_filter_test.cc create mode 100644 media/libvpx/libvpx/test/predict_test.cc create mode 100644 media/libvpx/libvpx/test/quantize_test.cc create mode 100644 media/libvpx/libvpx/test/realtime_test.cc create mode 100644 media/libvpx/libvpx/test/register_state_check.h create mode 100644 media/libvpx/libvpx/test/resize_test.cc create mode 100755 media/libvpx/libvpx/test/resize_util.sh create mode 100644 media/libvpx/libvpx/test/sad_test.cc create mode 100755 media/libvpx/libvpx/test/set_maps.sh create mode 100644 media/libvpx/libvpx/test/set_roi.cc create mode 100755 media/libvpx/libvpx/test/simple_decoder.sh create mode 100644 media/libvpx/libvpx/test/simple_encode_test.cc create mode 100755 media/libvpx/libvpx/test/simple_encoder.sh create mode 100755 media/libvpx/libvpx/test/stress.sh create mode 100644 media/libvpx/libvpx/test/sum_squares_test.cc create mode 100644 media/libvpx/libvpx/test/superframe_test.cc create mode 100644 media/libvpx/libvpx/test/svc_datarate_test.cc create mode 100644 media/libvpx/libvpx/test/svc_end_to_end_test.cc create mode 100644 media/libvpx/libvpx/test/svc_test.cc create mode 100644 media/libvpx/libvpx/test/svc_test.h create mode 100644 media/libvpx/libvpx/test/test-data.mk create mode 100644 media/libvpx/libvpx/test/test-data.sha1 create mode 100644 media/libvpx/libvpx/test/test.mk create mode 100644 media/libvpx/libvpx/test/test_intra_pred_speed.cc create mode 100644 media/libvpx/libvpx/test/test_libvpx.cc create mode 100644 media/libvpx/libvpx/test/test_rc_interface.cc create mode 100644 media/libvpx/libvpx/test/test_vector_test.cc create mode 100644 media/libvpx/libvpx/test/test_vectors.cc create mode 100644 media/libvpx/libvpx/test/test_vectors.h create mode 100644 media/libvpx/libvpx/test/tile_independence_test.cc create mode 100644 media/libvpx/libvpx/test/timestamp_test.cc create mode 100755 media/libvpx/libvpx/test/tools_common.sh create mode 100755 media/libvpx/libvpx/test/twopass_encoder.sh create mode 100644 media/libvpx/libvpx/test/user_priv_test.cc create mode 100644 media/libvpx/libvpx/test/util.h create mode 100644 media/libvpx/libvpx/test/variance_test.cc create mode 100644 media/libvpx/libvpx/test/video_source.h create mode 100644 media/libvpx/libvpx/test/vp8_boolcoder_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_datarate_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_decrypt_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_fdct4x4_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_fragments_test.cc create mode 100755 media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh create mode 100644 media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_arf_freq_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_block_error_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_boolcoder_test.cc create mode 100755 media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh create mode 100644 media/libvpx/libvpx/test/vp9_datarate_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_decrypt_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_denoiser_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc create mode 100644 media/libvpx/libvpx/test/vp9_end_to_end_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_ethread_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_intrapred_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_lossless_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_motion_vector_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_quantize_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_roi_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_scale_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_subtract_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_thread_test.cc create mode 100644 media/libvpx/libvpx/test/vpx_scale_test.cc create mode 100644 media/libvpx/libvpx/test/vpx_scale_test.h create mode 100755 media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh create mode 100755 media/libvpx/libvpx/test/vpxdec.sh create mode 100755 media/libvpx/libvpx/test/vpxenc.sh create mode 100644 media/libvpx/libvpx/test/webm_video_source.h create mode 100644 media/libvpx/libvpx/test/y4m_test.cc create mode 100644 media/libvpx/libvpx/test/y4m_video_source.h create mode 100644 media/libvpx/libvpx/test/yuv_temporal_filter_test.cc create mode 100644 media/libvpx/libvpx/test/yuv_video_source.h create mode 100644 media/libvpx/libvpx/third_party/googletest/README.libvpx create mode 100644 media/libvpx/libvpx/third_party/googletest/gtest.mk create mode 100644 media/libvpx/libvpx/third_party/googletest/src/.clang-format create mode 100644 media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS create mode 100644 media/libvpx/libvpx/third_party/googletest/src/LICENSE create mode 100644 media/libvpx/libvpx/third_party/googletest/src/README.md create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/LICENSE create mode 100644 media/libvpx/libvpx/third_party/libyuv/README.libvpx create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_any.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_common.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_win.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/video_common.cc create mode 100644 media/libvpx/libvpx/third_party/x86inc/LICENSE create mode 100644 media/libvpx/libvpx/third_party/x86inc/README.libvpx create mode 100644 media/libvpx/libvpx/third_party/x86inc/x86inc.asm create mode 100644 media/libvpx/libvpx/tools.mk create mode 100644 media/libvpx/libvpx/tools_common.c create mode 100644 media/libvpx/libvpx/tools_common.h create mode 100644 media/libvpx/libvpx/usage.dox create mode 100644 media/libvpx/libvpx/usage_cx.dox create mode 100644 media/libvpx/libvpx/usage_dx.dox create mode 100644 media/libvpx/libvpx/video_common.h create mode 100644 media/libvpx/libvpx/video_reader.c create mode 100644 media/libvpx/libvpx/video_reader.h create mode 100644 media/libvpx/libvpx/video_writer.c create mode 100644 media/libvpx/libvpx/video_writer.h create mode 100644 media/libvpx/libvpx/vp8/common/alloccommon.c create mode 100644 media/libvpx/libvpx/vp8/common/alloccommon.h create mode 100644 media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/blockd.c create mode 100644 media/libvpx/libvpx/vp8/common/blockd.h create mode 100644 media/libvpx/libvpx/vp8/common/coefupdateprobs.h create mode 100644 media/libvpx/libvpx/vp8/common/common.h create mode 100644 media/libvpx/libvpx/vp8/common/context.c create mode 100644 media/libvpx/libvpx/vp8/common/debugmodes.c create mode 100644 media/libvpx/libvpx/vp8/common/default_coef_probs.h create mode 100644 media/libvpx/libvpx/vp8/common/dequantize.c create mode 100644 media/libvpx/libvpx/vp8/common/entropy.c create mode 100644 media/libvpx/libvpx/vp8/common/entropy.h create mode 100644 media/libvpx/libvpx/vp8/common/entropymode.c create mode 100644 media/libvpx/libvpx/vp8/common/entropymode.h create mode 100644 media/libvpx/libvpx/vp8/common/entropymv.c create mode 100644 media/libvpx/libvpx/vp8/common/entropymv.h create mode 100644 media/libvpx/libvpx/vp8/common/extend.c create mode 100644 media/libvpx/libvpx/vp8/common/extend.h create mode 100644 media/libvpx/libvpx/vp8/common/filter.c create mode 100644 media/libvpx/libvpx/vp8/common/filter.h create mode 100644 media/libvpx/libvpx/vp8/common/findnearmv.c create mode 100644 media/libvpx/libvpx/vp8/common/findnearmv.h create mode 100644 media/libvpx/libvpx/vp8/common/generic/systemdependent.c create mode 100644 media/libvpx/libvpx/vp8/common/header.h create mode 100644 media/libvpx/libvpx/vp8/common/idct_blk.c create mode 100644 media/libvpx/libvpx/vp8/common/idctllm.c create mode 100644 media/libvpx/libvpx/vp8/common/invtrans.h create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/loopfilter.h create mode 100644 media/libvpx/libvpx/vp8/common/loopfilter_filters.c create mode 100644 media/libvpx/libvpx/vp8/common/mbpitch.c create mode 100644 media/libvpx/libvpx/vp8/common/mfqe.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h create mode 100644 media/libvpx/libvpx/vp8/common/modecont.c create mode 100644 media/libvpx/libvpx/vp8/common/modecont.h create mode 100644 media/libvpx/libvpx/vp8/common/mv.h create mode 100644 media/libvpx/libvpx/vp8/common/onyx.h create mode 100644 media/libvpx/libvpx/vp8/common/onyxc_int.h create mode 100644 media/libvpx/libvpx/vp8/common/onyxd.h create mode 100644 media/libvpx/libvpx/vp8/common/postproc.c create mode 100644 media/libvpx/libvpx/vp8/common/postproc.h create mode 100644 media/libvpx/libvpx/vp8/common/ppflags.h create mode 100644 media/libvpx/libvpx/vp8/common/quant_common.c create mode 100644 media/libvpx/libvpx/vp8/common/quant_common.h create mode 100644 media/libvpx/libvpx/vp8/common/reconinter.c create mode 100644 media/libvpx/libvpx/vp8/common/reconinter.h create mode 100644 media/libvpx/libvpx/vp8/common/reconintra.c create mode 100644 media/libvpx/libvpx/vp8/common/reconintra.h create mode 100644 media/libvpx/libvpx/vp8/common/reconintra4x4.c create mode 100644 media/libvpx/libvpx/vp8/common/reconintra4x4.h create mode 100644 media/libvpx/libvpx/vp8/common/rtcd.c create mode 100644 media/libvpx/libvpx/vp8/common/rtcd_defs.pl create mode 100644 media/libvpx/libvpx/vp8/common/setupintrarecon.c create mode 100644 media/libvpx/libvpx/vp8/common/setupintrarecon.h create mode 100644 media/libvpx/libvpx/vp8/common/swapyv12buffer.c create mode 100644 media/libvpx/libvpx/vp8/common/swapyv12buffer.h create mode 100644 media/libvpx/libvpx/vp8/common/systemdependent.h create mode 100644 media/libvpx/libvpx/vp8/common/threading.h create mode 100644 media/libvpx/libvpx/vp8/common/treecoder.c create mode 100644 media/libvpx/libvpx/vp8/common/treecoder.h create mode 100644 media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h create mode 100644 media/libvpx/libvpx/vp8/common/vp8_loopfilter.c create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.c create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.h create mode 100644 media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c create mode 100644 media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c create mode 100644 media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c create mode 100644 media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c create mode 100644 media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm create mode 100644 media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c create mode 100644 media/libvpx/libvpx/vp8/decoder/dboolhuff.c create mode 100644 media/libvpx/libvpx/vp8/decoder/dboolhuff.h create mode 100644 media/libvpx/libvpx/vp8/decoder/decodeframe.c create mode 100644 media/libvpx/libvpx/vp8/decoder/decodemv.c create mode 100644 media/libvpx/libvpx/vp8/decoder/decodemv.h create mode 100644 media/libvpx/libvpx/vp8/decoder/decoderthreading.h create mode 100644 media/libvpx/libvpx/vp8/decoder/detokenize.c create mode 100644 media/libvpx/libvpx/vp8/decoder/detokenize.h create mode 100644 media/libvpx/libvpx/vp8/decoder/ec_types.h create mode 100644 media/libvpx/libvpx/vp8/decoder/error_concealment.c create mode 100644 media/libvpx/libvpx/vp8/decoder/error_concealment.h create mode 100644 media/libvpx/libvpx/vp8/decoder/onyxd_if.c create mode 100644 media/libvpx/libvpx/vp8/decoder/onyxd_int.h create mode 100644 media/libvpx/libvpx/vp8/decoder/threading.c create mode 100644 media/libvpx/libvpx/vp8/decoder/treereader.h create mode 100644 media/libvpx/libvpx/vp8/encoder/arm/neon/denoising_neon.c create mode 100644 media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c create mode 100644 media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c create mode 100644 media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c create mode 100644 media/libvpx/libvpx/vp8/encoder/bitstream.c create mode 100644 media/libvpx/libvpx/vp8/encoder/bitstream.h create mode 100644 media/libvpx/libvpx/vp8/encoder/block.h create mode 100644 media/libvpx/libvpx/vp8/encoder/boolhuff.c create mode 100644 media/libvpx/libvpx/vp8/encoder/boolhuff.h create mode 100644 media/libvpx/libvpx/vp8/encoder/copy_c.c create mode 100644 media/libvpx/libvpx/vp8/encoder/dct.c create mode 100644 media/libvpx/libvpx/vp8/encoder/dct_value_cost.h create mode 100644 media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h create mode 100644 media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h create mode 100644 media/libvpx/libvpx/vp8/encoder/denoising.c create mode 100644 media/libvpx/libvpx/vp8/encoder/denoising.h create mode 100644 media/libvpx/libvpx/vp8/encoder/encodeframe.c create mode 100644 media/libvpx/libvpx/vp8/encoder/encodeframe.h create mode 100644 media/libvpx/libvpx/vp8/encoder/encodeintra.c create mode 100644 media/libvpx/libvpx/vp8/encoder/encodeintra.h create mode 100644 media/libvpx/libvpx/vp8/encoder/encodemb.c create mode 100644 media/libvpx/libvpx/vp8/encoder/encodemb.h create mode 100644 media/libvpx/libvpx/vp8/encoder/encodemv.c create mode 100644 media/libvpx/libvpx/vp8/encoder/encodemv.h create mode 100644 media/libvpx/libvpx/vp8/encoder/ethreading.c create mode 100644 media/libvpx/libvpx/vp8/encoder/ethreading.h create mode 100644 media/libvpx/libvpx/vp8/encoder/firstpass.c create mode 100644 media/libvpx/libvpx/vp8/encoder/firstpass.h create mode 100644 media/libvpx/libvpx/vp8/encoder/lookahead.c create mode 100644 media/libvpx/libvpx/vp8/encoder/lookahead.h create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mcomp.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mcomp.h create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c create mode 100644 media/libvpx/libvpx/vp8/encoder/modecosts.c create mode 100644 media/libvpx/libvpx/vp8/encoder/modecosts.h create mode 100644 media/libvpx/libvpx/vp8/encoder/mr_dissim.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mr_dissim.h create mode 100644 media/libvpx/libvpx/vp8/encoder/onyx_if.c create mode 100644 media/libvpx/libvpx/vp8/encoder/onyx_int.h create mode 100644 media/libvpx/libvpx/vp8/encoder/pickinter.c create mode 100644 media/libvpx/libvpx/vp8/encoder/pickinter.h create mode 100644 media/libvpx/libvpx/vp8/encoder/picklpf.c create mode 100644 media/libvpx/libvpx/vp8/encoder/picklpf.h create mode 100644 media/libvpx/libvpx/vp8/encoder/quantize.h create mode 100644 media/libvpx/libvpx/vp8/encoder/ratectrl.c create mode 100644 media/libvpx/libvpx/vp8/encoder/ratectrl.h create mode 100644 media/libvpx/libvpx/vp8/encoder/rdopt.c create mode 100644 media/libvpx/libvpx/vp8/encoder/rdopt.h create mode 100644 media/libvpx/libvpx/vp8/encoder/segmentation.c create mode 100644 media/libvpx/libvpx/vp8/encoder/segmentation.h create mode 100644 media/libvpx/libvpx/vp8/encoder/temporal_filter.c create mode 100644 media/libvpx/libvpx/vp8/encoder/temporal_filter.h create mode 100644 media/libvpx/libvpx/vp8/encoder/tokenize.c create mode 100644 media/libvpx/libvpx/vp8/encoder/tokenize.h create mode 100644 media/libvpx/libvpx/vp8/encoder/treewriter.c create mode 100644 media/libvpx/libvpx/vp8/encoder/treewriter.h create mode 100644 media/libvpx/libvpx/vp8/encoder/vp8_quantize.c create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c create mode 100644 media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c create mode 100644 media/libvpx/libvpx/vp8/exports_dec create mode 100644 media/libvpx/libvpx/vp8/exports_enc create mode 100644 media/libvpx/libvpx/vp8/vp8_common.mk create mode 100644 media/libvpx/libvpx/vp8/vp8_cx_iface.c create mode 100644 media/libvpx/libvpx/vp8/vp8_dx_iface.c create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vp8/vp8cx.mk create mode 100644 media/libvpx/libvpx/vp8/vp8dx.mk create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h create mode 100644 media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c create mode 100644 media/libvpx/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c create mode 100644 media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_alloccommon.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_alloccommon.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_blockd.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_blockd.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_common_data.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_common_data.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_debugmodes.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropy.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropy.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropymode.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropymode.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropymv.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_entropymv.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_enums.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_filter.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_filter.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_idct.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_idct.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_loopfilter.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_loopfilter.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_mfqe.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_mfqe.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_mv.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_mvref_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_mvref_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_postproc.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_postproc.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_ppflags.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_pred_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_pred_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_quant_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_quant_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_reconinter.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_reconinter.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_reconintra.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_reconintra.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_rtcd.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl create mode 100644 media/libvpx/libvpx/vp9/common/vp9_scale.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_scale.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_scan.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_scan.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_seg_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_seg_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_thread_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_thread_common.h create mode 100644 media/libvpx/libvpx/vp9/common/vp9_tile_common.c create mode 100644 media/libvpx/libvpx/vp9/common/vp9_tile_common.h create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decoder.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_decoder.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c create mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c create mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c create mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c create mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h create mode 100644 media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_block.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_cost.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_cost.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_dct.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encoder.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_encoder.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ethread.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ethread.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_extend.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_extend.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_quantize.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_quantize.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_rd.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_rd.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_resize.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_resize.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_subexp.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_subexp.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c create mode 100644 media/libvpx/libvpx/vp9/exports_dec create mode 100644 media/libvpx/libvpx/vp9/exports_enc create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.cc create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vp9/simple_encode.cc create mode 100644 media/libvpx/libvpx/vp9/simple_encode.h create mode 100644 media/libvpx/libvpx/vp9/vp9_common.mk create mode 100644 media/libvpx/libvpx/vp9/vp9_cx_iface.c create mode 100644 media/libvpx/libvpx/vp9/vp9_cx_iface.h create mode 100644 media/libvpx/libvpx/vp9/vp9_dx_iface.c create mode 100644 media/libvpx/libvpx/vp9/vp9_dx_iface.h create mode 100644 media/libvpx/libvpx/vp9/vp9_iface_common.c create mode 100644 media/libvpx/libvpx/vp9/vp9_iface_common.h create mode 100644 media/libvpx/libvpx/vp9/vp9cx.mk create mode 100644 media/libvpx/libvpx/vp9/vp9dx.mk create mode 100644 media/libvpx/libvpx/vpx/exports_com create mode 100644 media/libvpx/libvpx/vpx/exports_dec create mode 100644 media/libvpx/libvpx/vpx/exports_enc create mode 100644 media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h create mode 100644 media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vpx/src/vpx_codec.c create mode 100644 media/libvpx/libvpx/vpx/src/vpx_decoder.c create mode 100644 media/libvpx/libvpx/vpx/src/vpx_encoder.c create mode 100644 media/libvpx/libvpx/vpx/src/vpx_image.c create mode 100644 media/libvpx/libvpx/vpx/src/vpx_tpl.c create mode 100644 media/libvpx/libvpx/vpx/vp8.h create mode 100644 media/libvpx/libvpx/vpx/vp8cx.h create mode 100644 media/libvpx/libvpx/vpx/vp8dx.h create mode 100644 media/libvpx/libvpx/vpx/vpx_codec.h create mode 100644 media/libvpx/libvpx/vpx/vpx_codec.mk create mode 100644 media/libvpx/libvpx/vpx/vpx_decoder.h create mode 100644 media/libvpx/libvpx/vpx/vpx_encoder.h create mode 100644 media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h create mode 100644 media/libvpx/libvpx/vpx/vpx_frame_buffer.h create mode 100644 media/libvpx/libvpx/vpx/vpx_image.h create mode 100644 media/libvpx/libvpx/vpx/vpx_integer.h create mode 100644 media/libvpx/libvpx/vpx/vpx_tpl.h create mode 100644 media/libvpx/libvpx/vpx_dsp/add_noise.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/avg.c create mode 100644 media/libvpx/libvpx/vpx_dsp/bitreader.c create mode 100644 media/libvpx/libvpx/vpx_dsp/bitreader.h create mode 100644 media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c create mode 100644 media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h create mode 100644 media/libvpx/libvpx/vpx_dsp/bitwriter.c create mode 100644 media/libvpx/libvpx/vpx_dsp/bitwriter.h create mode 100644 media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c create mode 100644 media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h create mode 100644 media/libvpx/libvpx/vpx_dsp/deblock.c create mode 100644 media/libvpx/libvpx/vpx_dsp/fastssim.c create mode 100644 media/libvpx/libvpx/vpx_dsp/fwd_txfm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/fwd_txfm.h create mode 100644 media/libvpx/libvpx/vpx_dsp/intrapred.c create mode 100644 media/libvpx/libvpx/vpx_dsp/inv_txfm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/inv_txfm.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loopfilter.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h create mode 100644 media/libvpx/libvpx/vpx_dsp/postproc.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/prob.c create mode 100644 media/libvpx/libvpx/vpx_dsp/prob.h create mode 100644 media/libvpx/libvpx/vpx_dsp/psnr.c create mode 100644 media/libvpx/libvpx/vpx_dsp/psnr.h create mode 100644 media/libvpx/libvpx/vpx_dsp/psnrhvs.c create mode 100644 media/libvpx/libvpx/vpx_dsp/quantize.c create mode 100644 media/libvpx/libvpx/vpx_dsp/quantize.h create mode 100644 media/libvpx/libvpx/vpx_dsp/sad.c create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.c create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.h create mode 100644 media/libvpx/libvpx/vpx_dsp/sse.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ssim.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ssim.h create mode 100644 media/libvpx/libvpx/vpx_dsp/subtract.c create mode 100644 media/libvpx/libvpx/vpx_dsp/sum_squares.c create mode 100644 media/libvpx/libvpx/vpx_dsp/txfm_common.h create mode 100644 media/libvpx/libvpx/vpx_dsp/variance.c create mode 100644 media/libvpx/libvpx/vpx_dsp/variance.h create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_convolve.c create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_convolve.h create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl create mode 100644 media/libvpx/libvpx/vpx_dsp/vpx_filter.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm create mode 100644 media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h create mode 100644 media/libvpx/libvpx/vpx_mem/vpx_mem.c create mode 100644 media/libvpx/libvpx/vpx_mem/vpx_mem.h create mode 100644 media/libvpx/libvpx/vpx_mem/vpx_mem.mk create mode 100644 media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/arm.h create mode 100644 media/libvpx/libvpx/vpx_ports/arm_cpudetect.h create mode 100644 media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h create mode 100644 media/libvpx/libvpx/vpx_ports/bitops.h create mode 100644 media/libvpx/libvpx/vpx_ports/compiler_attributes.h create mode 100644 media/libvpx/libvpx/vpx_ports/emmintrin_compat.h create mode 100644 media/libvpx/libvpx/vpx_ports/emms_mmx.asm create mode 100644 media/libvpx/libvpx/vpx_ports/emms_mmx.c create mode 100644 media/libvpx/libvpx/vpx_ports/float_control_word.asm create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch.h create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/mem.h create mode 100644 media/libvpx/libvpx/vpx_ports/mem_ops.h create mode 100644 media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h create mode 100644 media/libvpx/libvpx/vpx_ports/mips.h create mode 100644 media/libvpx/libvpx/vpx_ports/mips_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/msvc.h create mode 100644 media/libvpx/libvpx/vpx_ports/ppc.h create mode 100644 media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/static_assert.h create mode 100644 media/libvpx/libvpx/vpx_ports/system_state.h create mode 100644 media/libvpx/libvpx/vpx_ports/vpx_once.h create mode 100644 media/libvpx/libvpx/vpx_ports/vpx_ports.mk create mode 100644 media/libvpx/libvpx/vpx_ports/vpx_timer.h create mode 100644 media/libvpx/libvpx/vpx_ports/x86.h create mode 100644 media/libvpx/libvpx/vpx_ports/x86_abi_support.asm create mode 100644 media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c create mode 100644 media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c create mode 100644 media/libvpx/libvpx/vpx_scale/generic/yv12config.c create mode 100644 media/libvpx/libvpx/vpx_scale/generic/yv12extend.c create mode 100644 media/libvpx/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c create mode 100644 media/libvpx/libvpx/vpx_scale/vpx_scale.h create mode 100644 media/libvpx/libvpx/vpx_scale/vpx_scale.mk create mode 100644 media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c create mode 100644 media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl create mode 100644 media/libvpx/libvpx/vpx_scale/yv12config.h create mode 100644 media/libvpx/libvpx/vpx_util/endian_inl.h create mode 100644 media/libvpx/libvpx/vpx_util/loongson_intrinsics.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_atomics.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.c create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_thread.c create mode 100644 media/libvpx/libvpx/vpx_util/vpx_thread.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_timestamp.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_util.mk create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h create mode 100644 media/libvpx/libvpx/vpxdec.c create mode 100644 media/libvpx/libvpx/vpxenc.c create mode 100644 media/libvpx/libvpx/vpxenc.h create mode 100644 media/libvpx/libvpx/vpxstats.c create mode 100644 media/libvpx/libvpx/vpxstats.h create mode 100644 media/libvpx/libvpx/warnings.c create mode 100644 media/libvpx/libvpx/warnings.h create mode 100644 media/libvpx/libvpx/webmdec.cc create mode 100644 media/libvpx/libvpx/webmdec.h create mode 100644 media/libvpx/libvpx/webmenc.cc create mode 100644 media/libvpx/libvpx/webmenc.h create mode 100644 media/libvpx/libvpx/y4menc.c create mode 100644 media/libvpx/libvpx/y4menc.h create mode 100644 media/libvpx/libvpx/y4minput.c create mode 100644 media/libvpx/libvpx/y4minput.h create mode 100755 media/libvpx/lint_config.sh create mode 100644 media/libvpx/moz.build create mode 100644 media/libvpx/moz.yaml create mode 100644 media/libvpx/rename_duplicate_files.patch create mode 100644 media/libvpx/sources.mozbuild create mode 100644 media/libvpx/win64_build_fix.patch (limited to 'media/libvpx') diff --git a/media/libvpx/LICENSE b/media/libvpx/LICENSE new file mode 100644 index 0000000000..1ce44343c4 --- /dev/null +++ b/media/libvpx/LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/media/libvpx/Makefile.in b/media/libvpx/Makefile.in new file mode 100644 index 0000000000..f095da8e62 --- /dev/null +++ b/media/libvpx/Makefile.in @@ -0,0 +1,24 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Set up the libvpx assembler config. + +include $(topsrcdir)/config/rules.mk + +ifeq ($(TARGET_CPU),arm) +ifdef GNU_AS +# The ARM asm is written in ARM RVCT syntax, but we actually build it with +# gas using GNU syntax. Add some rules to perform the conversion. + +# Previously used $(dir $(ASFILES)) to figure out which directories to generate. +# However, .S (as opposed to .s) files are not added to ASFILES. There is only +# one directory with arm assembly currently so enumerate it manually. +GENERATED_DIRS += libvpx/vpx_dsp/arm + +libvpx/vpx_dsp/arm/%.asm.S: $(srcdir)/libvpx/vpx_dsp/arm/%.asm $(call mkdir_deps,libvpx/vpx_dsp/arm) + $(PERL) $(topsrcdir)/media/libvpx/libvpx/build/make/ads2gas.pl < $< > $@ + +$(addsuffix .$(OBJ_SUFFIX), idct4x4_add_neon.asm idct8x8_add_neon.asm idct16x16_add_neon.asm): libvpx/vpx_dsp/arm/idct_neon.asm.S +endif +endif diff --git a/media/libvpx/README_MOZILLA b/media/libvpx/README_MOZILLA new file mode 100644 index 0000000000..16ef9439bb --- /dev/null +++ b/media/libvpx/README_MOZILLA @@ -0,0 +1,18 @@ +The source from this directory was copied from the libvpx +git repository. The only changes made are those in the +included patch files and the addition of moz.build and +Makefile.in build files for the Mozilla build system. + +The libvpx git repository is: + + https://chromium.googlesource.com/webm/libvpx + +See moz.yaml for the current in-tree version. + +To update run + +$ ./mach vendor media/libvpx/moz.yaml --patch-mode=none +$ hg commit -m "Update libvpx" +$ ./mach vendor media/libvpx/moz.yaml --patch-mode=only +$ hg commit -m "Apply local patches to libvpx" + diff --git a/media/libvpx/config/generic/vp8_rtcd.h b/media/libvpx/config/generic/vp8_rtcd.h new file mode 100644 index 0000000000..1f1e691e23 --- /dev/null +++ b/media/libvpx/config/generic/vp8_rtcd.h @@ -0,0 +1,167 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_c + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_c + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_c + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/generic/vp9_rtcd.h b/media/libvpx/config/generic/vp9_rtcd.h new file mode 100644 index 0000000000..01b1e9d073 --- /dev/null +++ b/media/libvpx/config/generic/vp9_rtcd.h @@ -0,0 +1,92 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_c + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm new file mode 100644 index 0000000000..47243ad198 --- /dev/null +++ b/media/libvpx/config/generic/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 0 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 0 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/generic/vpx_config.c b/media/libvpx/config/generic/vpx_config.c new file mode 100644 index 0000000000..d1c3d1acd7 --- /dev/null +++ b/media/libvpx/config/generic/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h new file mode 100644 index 0000000000..774a531ed9 --- /dev/null +++ b/media/libvpx/config/generic/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 0 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/generic/vpx_dsp_rtcd.h b/media/libvpx/config/generic/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..1843a0b421 --- /dev/null +++ b/media/libvpx/config/generic/vpx_dsp_rtcd.h @@ -0,0 +1,744 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_c + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_c + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_c + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_c + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_c + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_c + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_c + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_c + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_c + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_c + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_c + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_c + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_c + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_c + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_c + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_c + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_c + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_c + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_c + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_c + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x32x4d vpx_sad32x32x4d_c + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_c + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_c + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_c + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_c + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_c + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x64x4d vpx_sad64x64x4d_c + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_c + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_c + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_c + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_c + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/generic/vpx_scale_rtcd.h b/media/libvpx/config/generic/vpx_scale_rtcd.h new file mode 100644 index 0000000000..d12f52764e --- /dev/null +++ b/media/libvpx/config/generic/vpx_scale_rtcd.h @@ -0,0 +1,70 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm/vp8_rtcd.h b/media/libvpx/config/linux/arm/vp8_rtcd.h new file mode 100644 index 0000000000..be54a85cde --- /dev/null +++ b/media/libvpx/config/linux/arm/vp8_rtcd.h @@ -0,0 +1,265 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_NEON) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_neon; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_NEON) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_neon; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_NEON) vp8_copy_mem8x4 = vp8_copy_mem8x4_neon; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_NEON) vp8_copy_mem8x8 = vp8_copy_mem8x8_neon; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_NEON) vp8_dc_only_idct_add = vp8_dc_only_idct_add_neon; + vp8_denoiser_filter = vp8_denoiser_filter_c; + if (flags & HAS_NEON) vp8_denoiser_filter = vp8_denoiser_filter_neon; + vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c; + if (flags & HAS_NEON) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_neon; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_NEON) vp8_dequant_idct_add = vp8_dequant_idct_add_neon; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_NEON) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_NEON) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_neon; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + if (flags & HAS_NEON) vp8_fast_quantize_b = vp8_fast_quantize_b_neon; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_NEON) vp8_loop_filter_bv = vp8_loop_filter_bv_neon; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_NEON) vp8_loop_filter_mbh = vp8_loop_filter_mbh_neon; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_NEON) vp8_loop_filter_mbv = vp8_loop_filter_mbv_neon; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_NEON) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_neon; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_NEON) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_neon; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + if (flags & HAS_NEON) vp8_short_fdct4x4 = vp8_short_fdct4x4_neon; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + if (flags & HAS_NEON) vp8_short_fdct8x4 = vp8_short_fdct8x4_neon; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_NEON) vp8_short_idct4x4llm = vp8_short_idct4x4llm_neon; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_NEON) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_neon; + vp8_short_walsh4x4 = vp8_short_walsh4x4_c; + if (flags & HAS_NEON) vp8_short_walsh4x4 = vp8_short_walsh4x4_neon; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_NEON) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_neon; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_NEON) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_neon; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_NEON) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_neon; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm/vp9_rtcd.h b/media/libvpx/config/linux/arm/vp9_rtcd.h new file mode 100644 index 0000000000..46fcf338fb --- /dev/null +++ b/media/libvpx/config/linux/arm/vp9_rtcd.h @@ -0,0 +1,130 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vp9_block_error = vp9_block_error_c; + if (flags & HAS_NEON) vp9_block_error = vp9_block_error_neon; + vp9_block_error_fp = vp9_block_error_fp_c; + if (flags & HAS_NEON) vp9_block_error_fp = vp9_block_error_fp_neon; + vp9_diamond_search_sad = vp9_diamond_search_sad_c; + if (flags & HAS_NEON) vp9_diamond_search_sad = vp9_diamond_search_sad_neon; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_NEON) vp9_fht16x16 = vp9_fht16x16_neon; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_NEON) vp9_fht4x4 = vp9_fht4x4_neon; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_NEON) vp9_fht8x8 = vp9_fht8x8_neon; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_NEON) vp9_iht16x16_256_add = vp9_iht16x16_256_add_neon; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_NEON) vp9_quantize_fp = vp9_quantize_fp_neon; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_NEON) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_neon; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_NEON) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_neon; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm new file mode 100644 index 0000000000..ee43d0f922 --- /dev/null +++ b/media/libvpx/config/linux/arm/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 1 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 1 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm/vpx_config.c b/media/libvpx/config/linux/arm/vpx_config.c new file mode 100644 index 0000000000..c885d910c0 --- /dev/null +++ b/media/libvpx/config/linux/arm/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h new file mode 100644 index 0000000000..bfd2c04e07 --- /dev/null +++ b/media/libvpx/config/linux/arm/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 1 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..fbe85df601 --- /dev/null +++ b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h @@ -0,0 +1,1415 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_4x4)(const uint8_t *, int p); + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +RTCD_EXTERN int (*vpx_vector_var)(const int16_t *ref, const int16_t *src, const int bwl); + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vpx_avg_4x4 = vpx_avg_4x4_c; + if (flags & HAS_NEON) vpx_avg_4x4 = vpx_avg_4x4_neon; + vpx_avg_8x8 = vpx_avg_8x8_c; + if (flags & HAS_NEON) vpx_avg_8x8 = vpx_avg_8x8_neon; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_NEON) vpx_comp_avg_pred = vpx_comp_avg_pred_neon; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_NEON) vpx_convolve8 = vpx_convolve8_neon; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_NEON) vpx_convolve8_avg = vpx_convolve8_avg_neon; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_NEON) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_NEON) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_NEON) vpx_convolve8_horiz = vpx_convolve8_horiz_neon; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_NEON) vpx_convolve8_vert = vpx_convolve8_vert_neon; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon; + vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_neon; + vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_neon; + vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_neon; + vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_neon; + vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon; + vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_neon; + vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon; + vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_neon; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_neon; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_neon; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_neon; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_neon; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_neon; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_neon; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_neon; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_neon; + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon; + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_neon; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_neon; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_neon; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_neon; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_neon; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_NEON) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_neon; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_NEON) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_neon; + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; + if (flags & HAS_NEON) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_neon; + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; + if (flags & HAS_NEON) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_neon; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_NEON) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_neon; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_NEON) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_neon; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_NEON) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_neon; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_NEON) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_neon; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_NEON) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_neon; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_NEON) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_neon; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_NEON) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_neon; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_NEON) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_neon; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_NEON) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_neon; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_NEON) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_neon; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_NEON) vpx_fdct16x16 = vpx_fdct16x16_neon; + vpx_fdct16x16_1 = vpx_fdct16x16_1_c; + if (flags & HAS_NEON) vpx_fdct16x16_1 = vpx_fdct16x16_1_neon; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_NEON) vpx_fdct32x32 = vpx_fdct32x32_neon; + vpx_fdct32x32_1 = vpx_fdct32x32_1_c; + if (flags & HAS_NEON) vpx_fdct32x32_1 = vpx_fdct32x32_1_neon; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_NEON) vpx_fdct32x32_rd = vpx_fdct32x32_rd_neon; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_NEON) vpx_fdct4x4 = vpx_fdct4x4_neon; + vpx_fdct4x4_1 = vpx_fdct4x4_1_c; + if (flags & HAS_NEON) vpx_fdct4x4_1 = vpx_fdct4x4_1_neon; + vpx_fdct8x8 = vpx_fdct8x8_c; + if (flags & HAS_NEON) vpx_fdct8x8 = vpx_fdct8x8_neon; + vpx_fdct8x8_1 = vpx_fdct8x8_1_c; + if (flags & HAS_NEON) vpx_fdct8x8_1 = vpx_fdct8x8_1_neon; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_NEON) vpx_get16x16var = vpx_get16x16var_neon; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_c; + if (flags & HAS_NEON) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_NEON) vpx_get8x8var = vpx_get8x8var_neon; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_NEON) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_neon; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_NEON) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_neon; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_NEON) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_neon; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_NEON) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_neon; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_NEON) vpx_hadamard_16x16 = vpx_hadamard_16x16_neon; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_NEON) vpx_hadamard_32x32 = vpx_hadamard_32x32_neon; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_NEON) vpx_hadamard_8x8 = vpx_hadamard_8x8_neon; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_NEON) vpx_idct16x16_10_add = vpx_idct16x16_10_add_neon; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_NEON) vpx_idct16x16_1_add = vpx_idct16x16_1_add_neon; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_NEON) vpx_idct16x16_256_add = vpx_idct16x16_256_add_neon; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_NEON) vpx_idct16x16_38_add = vpx_idct16x16_38_add_neon; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_135_add_neon; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_NEON) vpx_idct32x32_34_add = vpx_idct32x32_34_add_neon; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_NEON) vpx_idct4x4_16_add = vpx_idct4x4_16_add_neon; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_NEON) vpx_idct4x4_1_add = vpx_idct4x4_1_add_neon; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_NEON) vpx_idct8x8_12_add = vpx_idct8x8_12_add_neon; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_NEON) vpx_idct8x8_1_add = vpx_idct8x8_1_add_neon; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_NEON) vpx_idct8x8_64_add = vpx_idct8x8_64_add_neon; + vpx_int_pro_col = vpx_int_pro_col_c; + if (flags & HAS_NEON) vpx_int_pro_col = vpx_int_pro_col_neon; + vpx_int_pro_row = vpx_int_pro_row_c; + if (flags & HAS_NEON) vpx_int_pro_row = vpx_int_pro_row_neon; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_neon; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_neon; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_neon; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_neon; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_neon; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_NEON) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_neon; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_NEON) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_neon; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_NEON) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_neon; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_NEON) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_neon; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_NEON) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_neon; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon; + vpx_minmax_8x8 = vpx_minmax_8x8_c; + if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_NEON) vpx_mse16x8 = vpx_mse16x8_neon; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_NEON) vpx_mse8x16 = vpx_mse8x16_neon; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_NEON) vpx_mse8x8 = vpx_mse8x8_neon; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_NEON) vpx_quantize_b = vpx_quantize_b_neon; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_NEON) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_neon; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_NEON) vpx_sad16x16_avg = vpx_sad16x16_avg_neon; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_NEON) vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_NEON) vpx_sad16x32 = vpx_sad16x32_neon; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_NEON) vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_NEON) vpx_sad16x32x4d = vpx_sad16x32x4d_neon; + vpx_sad16x8 = vpx_sad16x8_c; + if (flags & HAS_NEON) vpx_sad16x8 = vpx_sad16x8_neon; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_NEON) vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_NEON) vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_NEON) vpx_sad32x16 = vpx_sad32x16_neon; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_NEON) vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_NEON) vpx_sad32x16x4d = vpx_sad32x16x4d_neon; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_NEON) vpx_sad32x32 = vpx_sad32x32_neon; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_NEON) vpx_sad32x32_avg = vpx_sad32x32_avg_neon; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_NEON) vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_NEON) vpx_sad32x64 = vpx_sad32x64_neon; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_NEON) vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_NEON) vpx_sad32x64x4d = vpx_sad32x64x4d_neon; + vpx_sad4x4 = vpx_sad4x4_c; + if (flags & HAS_NEON) vpx_sad4x4 = vpx_sad4x4_neon; + vpx_sad4x4_avg = vpx_sad4x4_avg_c; + if (flags & HAS_NEON) vpx_sad4x4_avg = vpx_sad4x4_avg_neon; + vpx_sad4x4x4d = vpx_sad4x4x4d_c; + if (flags & HAS_NEON) vpx_sad4x4x4d = vpx_sad4x4x4d_neon; + vpx_sad4x8 = vpx_sad4x8_c; + if (flags & HAS_NEON) vpx_sad4x8 = vpx_sad4x8_neon; + vpx_sad4x8_avg = vpx_sad4x8_avg_c; + if (flags & HAS_NEON) vpx_sad4x8_avg = vpx_sad4x8_avg_neon; + vpx_sad4x8x4d = vpx_sad4x8x4d_c; + if (flags & HAS_NEON) vpx_sad4x8x4d = vpx_sad4x8x4d_neon; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_NEON) vpx_sad64x32 = vpx_sad64x32_neon; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_NEON) vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_NEON) vpx_sad64x32x4d = vpx_sad64x32x4d_neon; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_NEON) vpx_sad64x64 = vpx_sad64x64_neon; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_NEON) vpx_sad64x64_avg = vpx_sad64x64_avg_neon; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_NEON) vpx_sad64x64x4d = vpx_sad64x64x4d_neon; + vpx_sad8x16 = vpx_sad8x16_c; + if (flags & HAS_NEON) vpx_sad8x16 = vpx_sad8x16_neon; + vpx_sad8x16_avg = vpx_sad8x16_avg_c; + if (flags & HAS_NEON) vpx_sad8x16_avg = vpx_sad8x16_avg_neon; + vpx_sad8x16x4d = vpx_sad8x16x4d_c; + if (flags & HAS_NEON) vpx_sad8x16x4d = vpx_sad8x16x4d_neon; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_NEON) vpx_sad8x4 = vpx_sad8x4_neon; + vpx_sad8x4_avg = vpx_sad8x4_avg_c; + if (flags & HAS_NEON) vpx_sad8x4_avg = vpx_sad8x4_avg_neon; + vpx_sad8x4x4d = vpx_sad8x4x4d_c; + if (flags & HAS_NEON) vpx_sad8x4x4d = vpx_sad8x4x4d_neon; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_NEON) vpx_sad8x8 = vpx_sad8x8_neon; + vpx_sad8x8_avg = vpx_sad8x8_avg_c; + if (flags & HAS_NEON) vpx_sad8x8_avg = vpx_sad8x8_avg_neon; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_NEON) vpx_sad8x8x4d = vpx_sad8x8x4d_neon; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_NEON) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_NEON) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_NEON) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_NEON) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_NEON) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_NEON) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + vpx_sad_skip_4x4 = vpx_sad_skip_4x4_c; + if (flags & HAS_NEON) vpx_sad_skip_4x4 = vpx_sad_skip_4x4_neon; + vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_neon; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_NEON) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_neon; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_neon; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_NEON) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_NEON) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_NEON) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_neon; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_neon; + vpx_sad_skip_8x4 = vpx_sad_skip_8x4_c; + if (flags & HAS_NEON) vpx_sad_skip_8x4 = vpx_sad_skip_8x4_neon; + vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_neon; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_NEON) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_neon; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_neon; + vpx_satd = vpx_satd_c; + if (flags & HAS_NEON) vpx_satd = vpx_satd_neon; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_NEON) vpx_scaled_2d = vpx_scaled_2d_neon; + vpx_sse = vpx_sse_c; + if (flags & HAS_NEON) vpx_sse = vpx_sse_neon; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_neon; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_neon; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_neon; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_neon; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_neon; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_neon; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_neon; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_neon; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_neon; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_neon; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_neon; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_neon; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_neon; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_neon; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_neon; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_neon; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_neon; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_neon; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_neon; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_neon; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_neon; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_neon; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_neon; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_neon; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_neon; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_NEON) vpx_subtract_block = vpx_subtract_block_neon; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; + if (flags & HAS_NEON) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_NEON) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_neon; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_NEON) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_neon; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_NEON) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_neon; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_NEON) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_neon; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_NEON) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_neon; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_NEON) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_neon; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_NEON) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_neon; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_NEON) vpx_variance16x32 = vpx_variance16x32_neon; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_NEON) vpx_variance32x16 = vpx_variance32x16_neon; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_NEON) vpx_variance32x64 = vpx_variance32x64_neon; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_NEON) vpx_variance4x4 = vpx_variance4x4_neon; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_NEON) vpx_variance4x8 = vpx_variance4x8_neon; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_NEON) vpx_variance64x32 = vpx_variance64x32_neon; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_NEON) vpx_variance8x4 = vpx_variance8x4_neon; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon; + vpx_vector_var = vpx_vector_var_c; + if (flags & HAS_NEON) vpx_vector_var = vpx_vector_var_neon; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h new file mode 100644 index 0000000000..b371368275 --- /dev/null +++ b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h @@ -0,0 +1,75 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm64/vp8_rtcd.h b/media/libvpx/config/linux/arm64/vp8_rtcd.h new file mode 100644 index 0000000000..d204ef751e --- /dev/null +++ b/media/libvpx/config/linux/arm64/vp8_rtcd.h @@ -0,0 +1,201 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_neon + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_neon + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_neon + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h new file mode 100644 index 0000000000..738de4f9f4 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h @@ -0,0 +1,106 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_neon + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_neon + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_neon + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_neon + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_neon + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_neon + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm new file mode 100644 index 0000000000..499c16202c --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 1 +.equ HAVE_NEON_I8MM , 1 +.equ HAVE_SVE , 1 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 1 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c new file mode 100644 index 0000000000..74baa0689c --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h new file mode 100644 index 0000000000..3c5f2e33ca --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 1 +#define HAVE_NEON_I8MM 1 +#define HAVE_SVE 1 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..5a9b05ca14 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h @@ -0,0 +1,1184 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_neon + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_neon + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_neon + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_neon + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_neon + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_neon + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_neon + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_neon + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_neon + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_neon + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_neon + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_neon + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_neon + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_neon + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_neon + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_neon + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_neon + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_neon + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_neon + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_neon + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_neon + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_neon + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_neon + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_neon + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vpx_convolve8 = vpx_convolve8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm; + vpx_convolve8_avg = vpx_convolve8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm; + vpx_convolve8_horiz = vpx_convolve8_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm; + vpx_convolve8_vert = vpx_convolve8_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm; + vpx_get16x16var = vpx_get16x16var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod; + vpx_get8x8var = vpx_get8x8var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod; + vpx_mse16x16 = vpx_mse16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod; + vpx_mse16x8 = vpx_mse16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod; + vpx_mse8x16 = vpx_mse8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod; + vpx_mse8x8 = vpx_mse8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod; + vpx_sad16x16 = vpx_sad16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod; + vpx_sad16x16_avg = vpx_sad16x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod; + vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod; + vpx_sad16x32 = vpx_sad16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod; + vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod; + vpx_sad16x32x4d = vpx_sad16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod; + vpx_sad16x8 = vpx_sad16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod; + vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod; + vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod; + vpx_sad32x16 = vpx_sad32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod; + vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod; + vpx_sad32x16x4d = vpx_sad32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod; + vpx_sad32x32 = vpx_sad32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod; + vpx_sad32x32_avg = vpx_sad32x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod; + vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod; + vpx_sad32x64 = vpx_sad32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod; + vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod; + vpx_sad32x64x4d = vpx_sad32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod; + vpx_sad64x32 = vpx_sad64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod; + vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod; + vpx_sad64x32x4d = vpx_sad64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod; + vpx_sad64x64 = vpx_sad64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod; + vpx_sad64x64_avg = vpx_sad64x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod; + vpx_sad64x64x4d = vpx_sad64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; + vpx_sse = vpx_sse_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_variance16x16 = vpx_variance16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; + vpx_variance16x32 = vpx_variance16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod; + vpx_variance16x8 = vpx_variance16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod; + vpx_variance32x16 = vpx_variance32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod; + vpx_variance32x32 = vpx_variance32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod; + vpx_variance32x64 = vpx_variance32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod; + vpx_variance4x4 = vpx_variance4x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod; + vpx_variance4x8 = vpx_variance4x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod; + vpx_variance64x32 = vpx_variance64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod; + vpx_variance64x64 = vpx_variance64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod; + vpx_variance8x16 = vpx_variance8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod; + vpx_variance8x4 = vpx_variance8x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod; + vpx_variance8x8 = vpx_variance8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..b371368275 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h @@ -0,0 +1,75 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/ia32/vp8_rtcd.h b/media/libvpx/config/linux/ia32/vp8_rtcd.h new file mode 100644 index 0000000000..7a23227e4d --- /dev/null +++ b/media/libvpx/config/linux/ia32/vp8_rtcd.h @@ -0,0 +1,323 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc); + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_block_error = vp8_block_error_c; + if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_sse2; + vp8_copy32xn = vp8_copy32xn_c; + if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_denoiser_filter = vp8_denoiser_filter_c; + if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2; + vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c; + if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_SSE2) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; + vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_mbblock_error = vp8_mbblock_error_c; + if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_sse2; + vp8_mbuverror = vp8_mbuverror_c; + if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; + vp8_refining_search_sad = vp8_refining_search_sad_c; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; + vp8_regular_quantize_b = vp8_regular_quantize_b_c; + if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; + vp8_short_walsh4x4 = vp8_short_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; + vp8_temporal_filter_apply = vp8_temporal_filter_apply_c; + if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/ia32/vp9_rtcd.h b/media/libvpx/config/linux/ia32/vp9_rtcd.h new file mode 100644 index 0000000000..3136db471c --- /dev/null +++ b/media/libvpx/config/linux/ia32/vp9_rtcd.h @@ -0,0 +1,156 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight16x16)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_c; + if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_c; + if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2; + vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; + vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; + vp9_fwht4x4 = vp9_fwht4x4_c; + if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm new file mode 100644 index 0000000000..eaa3950d37 --- /dev/null +++ b/media/libvpx/config/linux/ia32/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 1 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 1 +%define CONFIG_MSVS 0 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/ia32/vpx_config.c b/media/libvpx/config/linux/ia32/vpx_config.c new file mode 100644 index 0000000000..6805ab62a8 --- /dev/null +++ b/media/libvpx/config/linux/ia32/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h new file mode 100644 index 0000000000..69fd63bf02 --- /dev/null +++ b/media/libvpx/config/linux/ia32/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..dd688b035b --- /dev/null +++ b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h @@ -0,0 +1,1604 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_4x4)(const uint8_t *, int p); + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +RTCD_EXTERN void (*vpx_plane_add_noise)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +RTCD_EXTERN int (*vpx_vector_var)(const int16_t *ref, const int16_t *src, const int bwl); + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_avg_4x4 = vpx_avg_4x4_c; + if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; + vpx_avg_8x8 = vpx_avg_8x8_c; + if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_SSE2) vpx_convolve_avg = vpx_convolve_avg_sse2; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_SSE2) vpx_convolve_copy = vpx_convolve_copy_sse2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_sse2; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_sse2; + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_sse2; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_sse2; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_sse2; + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_sse2; + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_sse2; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_sse2; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_sse2; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_sse2; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_sse2; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_sse2; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_sse2; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_sse2; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_sse2; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_sse2; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_sse2; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_sse2; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct16x16_1 = vpx_fdct16x16_1_c; + if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_SSE2) vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_1 = vpx_fdct32x32_1_c; + if (flags & HAS_SSE2) vpx_fdct32x32_1 = vpx_fdct32x32_1_sse2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_SSE2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_SSE2) vpx_fdct4x4 = vpx_fdct4x4_sse2; + vpx_fdct4x4_1 = vpx_fdct4x4_1_c; + if (flags & HAS_SSE2) vpx_fdct4x4_1 = vpx_fdct4x4_1_sse2; + vpx_fdct8x8 = vpx_fdct8x8_c; + if (flags & HAS_SSE2) vpx_fdct8x8 = vpx_fdct8x8_sse2; + vpx_fdct8x8_1 = vpx_fdct8x8_1_c; + if (flags & HAS_SSE2) vpx_fdct8x8_1 = vpx_fdct8x8_1_sse2; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_sse2; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_sse2; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_sse2; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; + vpx_int_pro_col = vpx_int_pro_col_c; + if (flags & HAS_SSE2) vpx_int_pro_col = vpx_int_pro_col_sse2; + vpx_int_pro_row = vpx_int_pro_row_c; + if (flags & HAS_SSE2) vpx_int_pro_row = vpx_int_pro_row_sse2; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_SSE2) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_sse2; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_sse2; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_sse2; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_sse2; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_sse2; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_sse2; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_sse2; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_sse2; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_sse2; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_sse2; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; + vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; + vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; + vpx_minmax_8x8 = vpx_minmax_8x8_c; + if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; + vpx_plane_add_noise = vpx_plane_add_noise_c; + if (flags & HAS_SSE2) vpx_plane_add_noise = vpx_plane_add_noise_sse2; + vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c; + if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_SSE2) vpx_sad16x32_avg = vpx_sad16x32_avg_sse2; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad16x32x4d = vpx_sad16x32x4d_sse2; + vpx_sad16x8 = vpx_sad16x8_c; + if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_SSE2) vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad32x16x4d = vpx_sad32x16x4d_sse2; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_SSE2) vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_SSE2) vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_SSE2) vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_SSE2) vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad32x64x4d = vpx_sad32x64x4d_sse2; + vpx_sad4x4 = vpx_sad4x4_c; + if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; + vpx_sad4x4_avg = vpx_sad4x4_avg_c; + if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; + vpx_sad4x4x4d = vpx_sad4x4x4d_c; + if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; + vpx_sad4x8 = vpx_sad4x8_c; + if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; + vpx_sad4x8_avg = vpx_sad4x8_avg_c; + if (flags & HAS_SSE2) vpx_sad4x8_avg = vpx_sad4x8_avg_sse2; + vpx_sad4x8x4d = vpx_sad4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad4x8x4d = vpx_sad4x8x4d_sse2; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad8x16 = vpx_sad8x16_c; + if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; + vpx_sad8x16_avg = vpx_sad8x16_avg_c; + if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; + vpx_sad8x16x4d = vpx_sad8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; + vpx_sad8x4_avg = vpx_sad8x4_avg_c; + if (flags & HAS_SSE2) vpx_sad8x4_avg = vpx_sad8x4_avg_sse2; + vpx_sad8x4x4d = vpx_sad8x4x4d_c; + if (flags & HAS_SSE2) vpx_sad8x4x4d = vpx_sad8x4x4d_sse2; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; + vpx_sad8x8_avg = vpx_sad8x8_avg_c; + if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; + vpx_satd = vpx_satd_c; + if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; + if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_sse2; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_sse2; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_sse2; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_sse2; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_sse2; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_sse2; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_sse2; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; + vpx_vector_var = vpx_vector_var_c; + if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/x64/vp8_rtcd.h b/media/libvpx/config/linux/x64/vp8_rtcd.h new file mode 100644 index 0000000000..dc850b4fe0 --- /dev/null +++ b/media/libvpx/config/linux/x64/vp8_rtcd.h @@ -0,0 +1,248 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_sse2 + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_sse2 + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_mmx + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sadx4 + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_sse2 + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_sse2 + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2 + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/x64/vp9_rtcd.h b/media/libvpx/config/linux/x64/vp9_rtcd.h new file mode 100644 index 0000000000..8644c3598d --- /dev/null +++ b/media/libvpx/config/linux/x64/vp9_rtcd.h @@ -0,0 +1,135 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_sse2 + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_sse2 + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_sse2 + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight16x16 vp9_filter_by_weight16x16_sse2 + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_sse2 + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm new file mode 100644 index 0000000000..8715768a2e --- /dev/null +++ b/media/libvpx/config/linux/x64/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 1 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 1 +%define CONFIG_MSVS 0 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.c b/media/libvpx/config/linux/x64/vpx_config.c new file mode 100644 index 0000000000..e4dcb394c3 --- /dev/null +++ b/media/libvpx/config/linux/x64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h new file mode 100644 index 0000000000..ab4439aaf4 --- /dev/null +++ b/media/libvpx/config/linux/x64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..755e916dd1 --- /dev/null +++ b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h @@ -0,0 +1,1296 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_sse2 + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_sse2 + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_sse2 + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_sse2 + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_sse2 + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_sse2 + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_sse2 + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_sse2 + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_sse2 + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_sse2 + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_sse2 + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_sse2 + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_sse2 + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_sse2 + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_sse2 + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_sse2 + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_sse2 + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_sse2 + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_sse2 + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +#define vpx_plane_add_noise vpx_plane_add_noise_sse2 + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_sse2 + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_sse2 + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_sse2 + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_sse2 + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_sse2 + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_sse2 + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_sse2 + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_sse2 + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_sse2 + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct8x8 = vpx_fdct8x8_sse2; + if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/linux/x64/vpx_scale_rtcd.h b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/ia32/vp8_rtcd.h b/media/libvpx/config/mac/ia32/vp8_rtcd.h new file mode 100644 index 0000000000..7a23227e4d --- /dev/null +++ b/media/libvpx/config/mac/ia32/vp8_rtcd.h @@ -0,0 +1,323 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc); + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_block_error = vp8_block_error_c; + if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_sse2; + vp8_copy32xn = vp8_copy32xn_c; + if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_denoiser_filter = vp8_denoiser_filter_c; + if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2; + vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c; + if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_SSE2) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; + vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_mbblock_error = vp8_mbblock_error_c; + if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_sse2; + vp8_mbuverror = vp8_mbuverror_c; + if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; + vp8_refining_search_sad = vp8_refining_search_sad_c; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; + vp8_regular_quantize_b = vp8_regular_quantize_b_c; + if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; + vp8_short_walsh4x4 = vp8_short_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; + vp8_temporal_filter_apply = vp8_temporal_filter_apply_c; + if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/ia32/vp9_rtcd.h b/media/libvpx/config/mac/ia32/vp9_rtcd.h new file mode 100644 index 0000000000..3136db471c --- /dev/null +++ b/media/libvpx/config/mac/ia32/vp9_rtcd.h @@ -0,0 +1,156 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight16x16)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_c; + if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_c; + if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2; + vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; + vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; + vp9_fwht4x4 = vp9_fwht4x4_c; + if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm new file mode 100644 index 0000000000..eaa3950d37 --- /dev/null +++ b/media/libvpx/config/mac/ia32/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 1 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 1 +%define CONFIG_MSVS 0 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.c b/media/libvpx/config/mac/ia32/vpx_config.c new file mode 100644 index 0000000000..3e5d3ec0f3 --- /dev/null +++ b/media/libvpx/config/mac/ia32/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h new file mode 100644 index 0000000000..69fd63bf02 --- /dev/null +++ b/media/libvpx/config/mac/ia32/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..dd688b035b --- /dev/null +++ b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h @@ -0,0 +1,1604 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_4x4)(const uint8_t *, int p); + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +RTCD_EXTERN void (*vpx_plane_add_noise)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +RTCD_EXTERN int (*vpx_vector_var)(const int16_t *ref, const int16_t *src, const int bwl); + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_avg_4x4 = vpx_avg_4x4_c; + if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; + vpx_avg_8x8 = vpx_avg_8x8_c; + if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_SSE2) vpx_convolve_avg = vpx_convolve_avg_sse2; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_SSE2) vpx_convolve_copy = vpx_convolve_copy_sse2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_sse2; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_sse2; + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_sse2; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_sse2; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_sse2; + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_sse2; + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_sse2; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_sse2; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_sse2; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_sse2; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_sse2; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_sse2; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_sse2; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_sse2; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_sse2; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_sse2; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_sse2; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_sse2; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct16x16_1 = vpx_fdct16x16_1_c; + if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_SSE2) vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_1 = vpx_fdct32x32_1_c; + if (flags & HAS_SSE2) vpx_fdct32x32_1 = vpx_fdct32x32_1_sse2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_SSE2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_SSE2) vpx_fdct4x4 = vpx_fdct4x4_sse2; + vpx_fdct4x4_1 = vpx_fdct4x4_1_c; + if (flags & HAS_SSE2) vpx_fdct4x4_1 = vpx_fdct4x4_1_sse2; + vpx_fdct8x8 = vpx_fdct8x8_c; + if (flags & HAS_SSE2) vpx_fdct8x8 = vpx_fdct8x8_sse2; + vpx_fdct8x8_1 = vpx_fdct8x8_1_c; + if (flags & HAS_SSE2) vpx_fdct8x8_1 = vpx_fdct8x8_1_sse2; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_sse2; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_sse2; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_sse2; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; + vpx_int_pro_col = vpx_int_pro_col_c; + if (flags & HAS_SSE2) vpx_int_pro_col = vpx_int_pro_col_sse2; + vpx_int_pro_row = vpx_int_pro_row_c; + if (flags & HAS_SSE2) vpx_int_pro_row = vpx_int_pro_row_sse2; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_SSE2) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_sse2; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_sse2; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_sse2; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_sse2; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_sse2; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_sse2; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_sse2; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_sse2; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_sse2; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_sse2; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; + vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; + vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; + vpx_minmax_8x8 = vpx_minmax_8x8_c; + if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; + vpx_plane_add_noise = vpx_plane_add_noise_c; + if (flags & HAS_SSE2) vpx_plane_add_noise = vpx_plane_add_noise_sse2; + vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c; + if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_SSE2) vpx_sad16x32_avg = vpx_sad16x32_avg_sse2; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad16x32x4d = vpx_sad16x32x4d_sse2; + vpx_sad16x8 = vpx_sad16x8_c; + if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_SSE2) vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad32x16x4d = vpx_sad32x16x4d_sse2; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_SSE2) vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_SSE2) vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_SSE2) vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_SSE2) vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad32x64x4d = vpx_sad32x64x4d_sse2; + vpx_sad4x4 = vpx_sad4x4_c; + if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; + vpx_sad4x4_avg = vpx_sad4x4_avg_c; + if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; + vpx_sad4x4x4d = vpx_sad4x4x4d_c; + if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; + vpx_sad4x8 = vpx_sad4x8_c; + if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; + vpx_sad4x8_avg = vpx_sad4x8_avg_c; + if (flags & HAS_SSE2) vpx_sad4x8_avg = vpx_sad4x8_avg_sse2; + vpx_sad4x8x4d = vpx_sad4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad4x8x4d = vpx_sad4x8x4d_sse2; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad8x16 = vpx_sad8x16_c; + if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; + vpx_sad8x16_avg = vpx_sad8x16_avg_c; + if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; + vpx_sad8x16x4d = vpx_sad8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; + vpx_sad8x4_avg = vpx_sad8x4_avg_c; + if (flags & HAS_SSE2) vpx_sad8x4_avg = vpx_sad8x4_avg_sse2; + vpx_sad8x4x4d = vpx_sad8x4x4d_c; + if (flags & HAS_SSE2) vpx_sad8x4x4d = vpx_sad8x4x4d_sse2; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; + vpx_sad8x8_avg = vpx_sad8x8_avg_c; + if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; + vpx_satd = vpx_satd_c; + if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; + if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_sse2; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_sse2; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_sse2; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_sse2; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_sse2; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_sse2; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_sse2; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; + vpx_vector_var = vpx_vector_var_c; + if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/x64/vp8_rtcd.h b/media/libvpx/config/mac/x64/vp8_rtcd.h new file mode 100644 index 0000000000..dc850b4fe0 --- /dev/null +++ b/media/libvpx/config/mac/x64/vp8_rtcd.h @@ -0,0 +1,248 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_sse2 + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_sse2 + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_mmx + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sadx4 + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_sse2 + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_sse2 + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2 + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/x64/vp9_rtcd.h b/media/libvpx/config/mac/x64/vp9_rtcd.h new file mode 100644 index 0000000000..8644c3598d --- /dev/null +++ b/media/libvpx/config/mac/x64/vp9_rtcd.h @@ -0,0 +1,135 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_sse2 + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_sse2 + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_sse2 + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight16x16 vp9_filter_by_weight16x16_sse2 + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_sse2 + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm new file mode 100644 index 0000000000..8715768a2e --- /dev/null +++ b/media/libvpx/config/mac/x64/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 1 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 1 +%define CONFIG_MSVS 0 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.c b/media/libvpx/config/mac/x64/vpx_config.c new file mode 100644 index 0000000000..9a06646fdc --- /dev/null +++ b/media/libvpx/config/mac/x64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h new file mode 100644 index 0000000000..ab4439aaf4 --- /dev/null +++ b/media/libvpx/config/mac/x64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..755e916dd1 --- /dev/null +++ b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h @@ -0,0 +1,1296 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_sse2 + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_sse2 + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_sse2 + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_sse2 + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_sse2 + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_sse2 + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_sse2 + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_sse2 + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_sse2 + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_sse2 + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_sse2 + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_sse2 + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_sse2 + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_sse2 + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_sse2 + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_sse2 + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_sse2 + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_sse2 + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_sse2 + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +#define vpx_plane_add_noise vpx_plane_add_noise_sse2 + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_sse2 + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_sse2 + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_sse2 + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_sse2 + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_sse2 + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_sse2 + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_sse2 + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_sse2 + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_sse2 + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct8x8 = vpx_fdct8x8_sse2; + if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/mac/x64/vpx_scale_rtcd.h b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/vpx_version.h b/media/libvpx/config/vpx_version.h new file mode 100644 index 0000000000..ed1736cc7a --- /dev/null +++ b/media/libvpx/config/vpx_version.h @@ -0,0 +1,8 @@ +// This file is generated. Do not edit. +#define VERSION_MAJOR 1 +#define VERSION_MINOR 14 +#define VERSION_PATCH 0 +#define VERSION_EXTRA "" +#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) +#define VERSION_STRING_NOSP "v1.14.0" +#define VERSION_STRING " v1.14.0" diff --git a/media/libvpx/config/win/aarch64/vp8_rtcd.h b/media/libvpx/config/win/aarch64/vp8_rtcd.h new file mode 100644 index 0000000000..d204ef751e --- /dev/null +++ b/media/libvpx/config/win/aarch64/vp8_rtcd.h @@ -0,0 +1,201 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_neon + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_neon + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_neon + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/aarch64/vp9_rtcd.h b/media/libvpx/config/win/aarch64/vp9_rtcd.h new file mode 100644 index 0000000000..738de4f9f4 --- /dev/null +++ b/media/libvpx/config/win/aarch64/vp9_rtcd.h @@ -0,0 +1,106 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_neon + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_neon + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_neon + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_neon + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_neon + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_neon + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm new file mode 100644 index 0000000000..24eb1a8cba --- /dev/null +++ b/media/libvpx/config/win/aarch64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 1 +.equ HAVE_NEON_I8MM , 1 +.equ HAVE_SVE , 1 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 0 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 0 +.equ CONFIG_MSVS , 1 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 1 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c new file mode 100644 index 0000000000..13cc13a95d --- /dev/null +++ b/media/libvpx/config/win/aarch64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h new file mode 100644 index 0000000000..c3cc860f18 --- /dev/null +++ b/media/libvpx/config/win/aarch64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE __inline +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 1 +#define HAVE_NEON_I8MM 1 +#define HAVE_SVE 1 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 0 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 0 +#define CONFIG_MSVS 1 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/aarch64/vpx_dsp_rtcd.h b/media/libvpx/config/win/aarch64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..5a9b05ca14 --- /dev/null +++ b/media/libvpx/config/win/aarch64/vpx_dsp_rtcd.h @@ -0,0 +1,1184 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_neon + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_neon + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_neon + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_neon + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_neon + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_neon + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_neon + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_neon + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_neon + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_neon + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_neon + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_neon + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_neon + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_neon + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_neon + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_neon + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_neon + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_neon + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_neon + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_neon + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_neon + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_neon + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_neon + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_neon + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vpx_convolve8 = vpx_convolve8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm; + vpx_convolve8_avg = vpx_convolve8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm; + vpx_convolve8_horiz = vpx_convolve8_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm; + vpx_convolve8_vert = vpx_convolve8_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm; + vpx_get16x16var = vpx_get16x16var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod; + vpx_get8x8var = vpx_get8x8var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod; + vpx_mse16x16 = vpx_mse16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod; + vpx_mse16x8 = vpx_mse16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod; + vpx_mse8x16 = vpx_mse8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod; + vpx_mse8x8 = vpx_mse8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod; + vpx_sad16x16 = vpx_sad16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod; + vpx_sad16x16_avg = vpx_sad16x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod; + vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod; + vpx_sad16x32 = vpx_sad16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod; + vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod; + vpx_sad16x32x4d = vpx_sad16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod; + vpx_sad16x8 = vpx_sad16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod; + vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod; + vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod; + vpx_sad32x16 = vpx_sad32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod; + vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod; + vpx_sad32x16x4d = vpx_sad32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod; + vpx_sad32x32 = vpx_sad32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod; + vpx_sad32x32_avg = vpx_sad32x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod; + vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod; + vpx_sad32x64 = vpx_sad32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod; + vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod; + vpx_sad32x64x4d = vpx_sad32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod; + vpx_sad64x32 = vpx_sad64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod; + vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod; + vpx_sad64x32x4d = vpx_sad64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod; + vpx_sad64x64 = vpx_sad64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod; + vpx_sad64x64_avg = vpx_sad64x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod; + vpx_sad64x64x4d = vpx_sad64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; + vpx_sse = vpx_sse_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_variance16x16 = vpx_variance16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; + vpx_variance16x32 = vpx_variance16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod; + vpx_variance16x8 = vpx_variance16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod; + vpx_variance32x16 = vpx_variance32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod; + vpx_variance32x32 = vpx_variance32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod; + vpx_variance32x64 = vpx_variance32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod; + vpx_variance4x4 = vpx_variance4x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod; + vpx_variance4x8 = vpx_variance4x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod; + vpx_variance64x32 = vpx_variance64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod; + vpx_variance64x64 = vpx_variance64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod; + vpx_variance8x16 = vpx_variance8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod; + vpx_variance8x4 = vpx_variance8x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod; + vpx_variance8x8 = vpx_variance8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/aarch64/vpx_scale_rtcd.h b/media/libvpx/config/win/aarch64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..b371368275 --- /dev/null +++ b/media/libvpx/config/win/aarch64/vpx_scale_rtcd.h @@ -0,0 +1,75 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/ia32/vp8_rtcd.h b/media/libvpx/config/win/ia32/vp8_rtcd.h new file mode 100644 index 0000000000..7a23227e4d --- /dev/null +++ b/media/libvpx/config/win/ia32/vp8_rtcd.h @@ -0,0 +1,323 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc); + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_block_error = vp8_block_error_c; + if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_sse2; + vp8_copy32xn = vp8_copy32xn_c; + if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_denoiser_filter = vp8_denoiser_filter_c; + if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2; + vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c; + if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_SSE2) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; + vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_mbblock_error = vp8_mbblock_error_c; + if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_sse2; + vp8_mbuverror = vp8_mbuverror_c; + if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; + vp8_refining_search_sad = vp8_refining_search_sad_c; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; + vp8_regular_quantize_b = vp8_regular_quantize_b_c; + if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; + vp8_short_walsh4x4 = vp8_short_walsh4x4_c; + if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; + vp8_temporal_filter_apply = vp8_temporal_filter_apply_c; + if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/ia32/vp9_rtcd.h b/media/libvpx/config/win/ia32/vp9_rtcd.h new file mode 100644 index 0000000000..3136db471c --- /dev/null +++ b/media/libvpx/config/win/ia32/vp9_rtcd.h @@ -0,0 +1,156 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type); + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight16x16)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_c; + if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_c; + if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2; + vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; + vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; + vp9_fwht4x4 = vp9_fwht4x4_c; + if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm new file mode 100755 index 0000000000..cb1aa7ce6a --- /dev/null +++ b/media/libvpx/config/win/ia32/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 0 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 1 +%define CONFIG_MSVS 0 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.c b/media/libvpx/config/win/ia32/vpx_config.c new file mode 100644 index 0000000000..33c836213b --- /dev/null +++ b/media/libvpx/config/win/ia32/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h new file mode 100644 index 0000000000..9fe256f4ad --- /dev/null +++ b/media/libvpx/config/win/ia32/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 0 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..dd688b035b --- /dev/null +++ b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h @@ -0,0 +1,1604 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_4x4)(const uint8_t *, int p); + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +RTCD_EXTERN void (*vpx_plane_add_noise)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +RTCD_EXTERN int (*vpx_vector_var)(const int16_t *ref, const int16_t *src, const int bwl); + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_avg_4x4 = vpx_avg_4x4_c; + if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; + vpx_avg_8x8 = vpx_avg_8x8_c; + if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_SSE2) vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_SSE2) vpx_convolve_avg = vpx_convolve_avg_sse2; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_SSE2) vpx_convolve_copy = vpx_convolve_copy_sse2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_sse2; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_sse2; + vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_sse2; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_sse2; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_sse2; + vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_sse2; + vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_sse2; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_sse2; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_sse2; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_sse2; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_sse2; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_sse2; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_sse2; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_sse2; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_sse2; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_sse2; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_sse2; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_sse2; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct16x16_1 = vpx_fdct16x16_1_c; + if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_SSE2) vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_1 = vpx_fdct32x32_1_c; + if (flags & HAS_SSE2) vpx_fdct32x32_1 = vpx_fdct32x32_1_sse2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_SSE2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_SSE2) vpx_fdct4x4 = vpx_fdct4x4_sse2; + vpx_fdct4x4_1 = vpx_fdct4x4_1_c; + if (flags & HAS_SSE2) vpx_fdct4x4_1 = vpx_fdct4x4_1_sse2; + vpx_fdct8x8 = vpx_fdct8x8_c; + if (flags & HAS_SSE2) vpx_fdct8x8 = vpx_fdct8x8_sse2; + vpx_fdct8x8_1 = vpx_fdct8x8_1_c; + if (flags & HAS_SSE2) vpx_fdct8x8_1 = vpx_fdct8x8_1_sse2; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_sse2; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_sse2; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_sse2; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; + vpx_int_pro_col = vpx_int_pro_col_c; + if (flags & HAS_SSE2) vpx_int_pro_col = vpx_int_pro_col_sse2; + vpx_int_pro_row = vpx_int_pro_row_c; + if (flags & HAS_SSE2) vpx_int_pro_row = vpx_int_pro_row_sse2; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_SSE2) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_sse2; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_sse2; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_sse2; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_sse2; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_sse2; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_sse2; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_sse2; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_sse2; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_sse2; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_sse2; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; + vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; + vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; + vpx_minmax_8x8 = vpx_minmax_8x8_c; + if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; + vpx_plane_add_noise = vpx_plane_add_noise_c; + if (flags & HAS_SSE2) vpx_plane_add_noise = vpx_plane_add_noise_sse2; + vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c; + if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_SSE2) vpx_sad16x32_avg = vpx_sad16x32_avg_sse2; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad16x32x4d = vpx_sad16x32x4d_sse2; + vpx_sad16x8 = vpx_sad16x8_c; + if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_SSE2) vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad32x16x4d = vpx_sad32x16x4d_sse2; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_SSE2) vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_SSE2) vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_SSE2) vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_SSE2) vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad32x64x4d = vpx_sad32x64x4d_sse2; + vpx_sad4x4 = vpx_sad4x4_c; + if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; + vpx_sad4x4_avg = vpx_sad4x4_avg_c; + if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; + vpx_sad4x4x4d = vpx_sad4x4x4d_c; + if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; + vpx_sad4x8 = vpx_sad4x8_c; + if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; + vpx_sad4x8_avg = vpx_sad4x8_avg_c; + if (flags & HAS_SSE2) vpx_sad4x8_avg = vpx_sad4x8_avg_sse2; + vpx_sad4x8x4d = vpx_sad4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad4x8x4d = vpx_sad4x8x4d_sse2; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad8x16 = vpx_sad8x16_c; + if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; + vpx_sad8x16_avg = vpx_sad8x16_avg_c; + if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; + vpx_sad8x16x4d = vpx_sad8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; + vpx_sad8x4_avg = vpx_sad8x4_avg_c; + if (flags & HAS_SSE2) vpx_sad8x4_avg = vpx_sad8x4_avg_sse2; + vpx_sad8x4x4d = vpx_sad8x4x4d_c; + if (flags & HAS_SSE2) vpx_sad8x4x4d = vpx_sad8x4x4d_sse2; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; + vpx_sad8x8_avg = vpx_sad8x8_avg_c; + if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; + vpx_satd = vpx_satd_c; + if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; + if (flags & HAS_SSE2) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; + if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_sse2; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_sse2; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_sse2; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_sse2; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_sse2; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_sse2; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_sse2; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; + vpx_vector_var = vpx_vector_var_c; + if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/ia32/vpx_scale_rtcd.h b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/x64/vp8_rtcd.h b/media/libvpx/config/win/x64/vp8_rtcd.h new file mode 100644 index 0000000000..dc850b4fe0 --- /dev/null +++ b/media/libvpx/config/win/x64/vp8_rtcd.h @@ -0,0 +1,248 @@ +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_sse2(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_sse2 + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_sse2 + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_mmx + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sadx4 + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_sse2 + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_sse2(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_sse2 + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2 + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; + if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/x64/vp9_rtcd.h b/media/libvpx/config/win/x64/vp9_rtcd.h new file mode 100644 index 0000000000..8644c3598d --- /dev/null +++ b/media/libvpx/config/win/x64/vp9_rtcd.h @@ -0,0 +1,135 @@ +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_sse2 + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_sse2 + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_sse2 + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight16x16 vp9_filter_by_weight16x16_sse2 + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_sse2 + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; + vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; + vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm new file mode 100644 index 0000000000..a1d34d6d37 --- /dev/null +++ b/media/libvpx/config/win/x64/vpx_config.asm @@ -0,0 +1,93 @@ +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 +%define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_MIPS32 0 +%define HAVE_DSPR2 0 +%define HAVE_MSA 0 +%define HAVE_MIPS64 0 +%define HAVE_MMX 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSSE3 1 +%define HAVE_SSE4_1 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_AVX512 0 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 +%define HAVE_VPX_PORTS 1 +%define HAVE_PTHREAD_H 0 +%define CONFIG_DEPENDENCY_TRACKING 1 +%define CONFIG_EXTERNAL_BUILD 1 +%define CONFIG_INSTALL_DOCS 0 +%define CONFIG_INSTALL_BINS 1 +%define CONFIG_INSTALL_LIBS 1 +%define CONFIG_INSTALL_SRCS 0 +%define CONFIG_DEBUG 0 +%define CONFIG_GPROF 0 +%define CONFIG_GCOV 0 +%define CONFIG_RVCT 0 +%define CONFIG_GCC 0 +%define CONFIG_MSVS 1 +%define CONFIG_PIC 1 +%define CONFIG_BIG_ENDIAN 0 +%define CONFIG_CODEC_SRCS 0 +%define CONFIG_DEBUG_LIBS 0 +%define CONFIG_DEQUANT_TOKENS 0 +%define CONFIG_DC_RECON 0 +%define CONFIG_RUNTIME_CPU_DETECT 1 +%define CONFIG_POSTPROC 1 +%define CONFIG_VP9_POSTPROC 1 +%define CONFIG_MULTITHREAD 1 +%define CONFIG_INTERNAL_STATS 0 +%define CONFIG_VP8_ENCODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_ENCODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VP8 1 +%define CONFIG_VP9 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_STATIC_MSVCRT 0 +%define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_REALTIME_ONLY 0 +%define CONFIG_ONTHEFLY_BITPACKING 0 +%define CONFIG_ERROR_CONCEALMENT 0 +%define CONFIG_SHARED 0 +%define CONFIG_STATIC 1 +%define CONFIG_SMALL 0 +%define CONFIG_POSTPROC_VISUALIZER 0 +%define CONFIG_OS_SUPPORT 1 +%define CONFIG_UNIT_TESTS 0 +%define CONFIG_WEBM_IO 1 +%define CONFIG_LIBYUV 1 +%define CONFIG_DECODE_PERF_TESTS 0 +%define CONFIG_ENCODE_PERF_TESTS 0 +%define CONFIG_MULTI_RES_ENCODING 1 +%define CONFIG_TEMPORAL_DENOISING 1 +%define CONFIG_VP9_TEMPORAL_DENOISING 0 +%define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_BETTER_HW_COMPATIBILITY 0 +%define CONFIG_EXPERIMENTAL 0 +%define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 +%define CONFIG_FP_MB_STATS 0 +%define CONFIG_EMULATE_HARDWARE 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c new file mode 100644 index 0000000000..8c04c1a3cf --- /dev/null +++ b/media/libvpx/config/win/x64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h new file mode 100644 index 0000000000..068c6d2a99 --- /dev/null +++ b/media/libvpx/config/win/x64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE __inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 1 +#define HAVE_SSE 1 +#define HAVE_SSE2 1 +#define HAVE_SSE3 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 0 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 0 +#define CONFIG_MSVS 1 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..755e916dd1 --- /dev/null +++ b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h @@ -0,0 +1,1296 @@ +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_sse2(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_sse2 + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_sse2 + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_sse2 + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_sse2 + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_sse2 + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_sse2 + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_sse2 + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_sse2 + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_sse2 + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_sse2 + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_sse2 + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_sse2 + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_sse2 + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_sse2 + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_sse2 + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_sse2 + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_sse2 + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_sse2 + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_sse2 + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +#define vpx_plane_add_noise vpx_plane_add_noise_sse2 + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_sse2 + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_sse2 + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_sse2 + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_sse2 + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_sse2 + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_sse2 + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_sse2 + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_sse2 + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_sse2 + +void vpx_dsp_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; + vpx_convolve8 = vpx_convolve8_sse2; + if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; + if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; + vpx_convolve8_avg = vpx_convolve8_avg_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; + vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; + vpx_convolve8_vert = vpx_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_vert = vpx_convolve8_vert_avx2; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; + vpx_fdct32x32 = vpx_fdct32x32_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vpx_fdct32x32_rd = vpx_fdct32x32_rd_avx2; + vpx_fdct8x8 = vpx_fdct8x8_sse2; + if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; + vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; + if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; + vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; + if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; + vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; + vpx_sad32x16 = vpx_sad32x16_sse2; + if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; + vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x16_avg = vpx_sad32x16_avg_avx2; + vpx_sad32x32 = vpx_sad32x32_sse2; + if (flags & HAS_AVX2) vpx_sad32x32 = vpx_sad32x32_avx2; + vpx_sad32x32_avg = vpx_sad32x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x32_avg = vpx_sad32x32_avg_avx2; + vpx_sad32x32x4d = vpx_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad32x32x4d = vpx_sad32x32x4d_avx2; + vpx_sad32x64 = vpx_sad32x64_sse2; + if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; + vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; + vpx_sad64x32 = vpx_sad64x32_sse2; + if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + vpx_sad64x64 = vpx_sad64x64_sse2; + if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; + if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_ssse3; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_ssse3; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_ssse3; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_avx2; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_ssse3; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_ssse3; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_ssse3; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_ssse3; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_avx2; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_ssse3; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_ssse3; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_ssse3; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_ssse3; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_ssse3; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_ssse3; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_ssse3; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_avx2; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_ssse3; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_ssse3; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_ssse3; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_ssse3; + vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_ssse3; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/config/win/x64/vpx_scale_rtcd.h b/media/libvpx/config/win/x64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..5f09104ea6 --- /dev/null +++ b/media/libvpx/config/win/x64/vpx_scale_rtcd.h @@ -0,0 +1,73 @@ +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh new file mode 100755 index 0000000000..ef9bc696f3 --- /dev/null +++ b/media/libvpx/generate_sources_mozbuild.sh @@ -0,0 +1,293 @@ +#!/bin/bash -e +# +# Copyright (c) 2012 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# Modified from chromium/src/third_party/libvpx/generate_gni.sh + +# This script is used to generate sources.mozbuild and files in the +# config/platform directories needed to build libvpx. +# Every time libvpx source code is updated just run this script. +# +# Usage: +# $ ./generate_sources_mozbuild.sh + +export LC_ALL=C +BASE_DIR=$(pwd) +LIBVPX_SRC_DIR="libvpx" +LIBVPX_CONFIG_DIR="config" +DISABLE_AVX="--disable-avx512" + +# Print license header. +# $1 - Output base name +function write_license { + echo "# This file is generated. Do not edit." >> $1 + echo "" >> $1 +} + +# Search for source files with the same basename in vp8, vp9, and vpx_dsp. The +# build does not support duplicate file names. +function find_duplicates { + local readonly duplicate_file_names=$(find \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp8 \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp9 \ + $BASE_DIR/$LIBVPX_SRC_DIR/vpx_dsp \ + -type f -name \*.c | xargs -I {} basename {} | sort | uniq -d \ + ) + + if [ -n "${duplicate_file_names}" ]; then + echo "ERROR: DUPLICATE FILES FOUND" + for file in ${duplicate_file_names}; do + find \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp8 \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp9 \ + $BASE_DIR/$LIBVPX_SRC_DIR/vpx_dsp \ + -name $file + done + exit 1 + fi +} + +# Generate sources.mozbuild with a list of source files. +# $1 - Array name for file list. This is processed with 'declare' below to +# regenerate the array locally. +# $2 - Variable name. +# $3 - Output file. +function write_sources { + # Convert the first argument back in to an array. + declare -a file_list=("${!1}") + + echo " '$2': [" >> "$3" + for f in $file_list + do + echo " 'libvpx/$f'," >> "$3" + done + echo "]," >> "$3" +} + +# Convert a list of source files into sources.mozbuild. +# $1 - Input file. +# $2 - Output prefix. +function convert_srcs_to_project_files { + # Do the following here: + # 1. Filter .c, .h, .s, .S and .asm files. + # 3. Convert .asm.s to .asm because moz.build will do the conversion. + + local source_list=$(grep -E '(\.c|\.h|\.S|\.s|\.asm)$' $1) + + # Remove vpx_config.c. + # The platform-specific vpx_config.c will be added into in moz.build later. + source_list=$(echo "$source_list" | grep -v 'vpx_config\.c') + + # Remove include-only asm files (no object code emitted) + source_list=$(echo "$source_list" | grep -v 'x86_abi_support\.asm') + source_list=$(echo "$source_list" | grep -v 'config\.asm') + + # The actual ARM files end in .asm. We have rules to translate them to .S + source_list=$(echo "$source_list" | sed s/\.asm\.s$/.asm/) + + # Exports - everything in vpx, vpx_mem, vpx_ports, vpx_scale + local exports_list=$(echo "$source_list" | \ + egrep '^(vpx|vpx_mem|vpx_ports|vpx_scale)/.*h$') + # but not anything in one level down, like 'internal' + exports_list=$(echo "$exports_list" | egrep -v '/(internal|src)/') + # or any of the other internal-ish header files. + exports_list=$(echo "$exports_list" | egrep -v '/(emmintrin_compat.h|mem_.*|msvc.h|vpx_once.h)$') + + # Remove these files from the main list. + source_list=$(comm -23 <(echo "$source_list") <(echo "$exports_list")) + + # Write a single file that includes all source files for all archs. + local c_sources=$(echo "$source_list" | egrep '.(asm|c)$') + local exports_sources=$(echo "$exports_list" | egrep '.h$') + + write_sources exports_sources ${2}_EXPORTS "$BASE_DIR/sources.mozbuild" + write_sources c_sources ${2}_SOURCES "$BASE_DIR/sources.mozbuild" +} + +# Clean files from previous make. +function make_clean { + make clean > /dev/null + rm -f libvpx_srcs.txt +} + +# Print the configuration. +# $1 - Header file directory. +function print_config { + $BASE_DIR/lint_config.sh -p \ + -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \ + -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm +} + +# Generate *_rtcd.h files. +# $1 - Header file directory. +# $2 - Architecture. +# $3 - Optional - any additional arguments to pass through. +function gen_rtcd_header { + echo "Generate $LIBVPX_CONFIG_DIR/$1/*_rtcd.h files." + + rm -rf $BASE_DIR/$TEMP_DIR/libvpx.config + $BASE_DIR/lint_config.sh -p \ + -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \ + -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm \ + -o $BASE_DIR/$TEMP_DIR/libvpx.config + + $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \ + --arch=$2 \ + --sym=vp8_rtcd $DISABLE_AVX $3 \ + --config=$BASE_DIR/$TEMP_DIR/libvpx.config \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp8/common/rtcd_defs.pl \ + > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vp8_rtcd.h + + $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \ + --arch=$2 \ + --sym=vp9_rtcd $DISABLE_AVX $3 \ + --config=$BASE_DIR/$TEMP_DIR/libvpx.config \ + $BASE_DIR/$LIBVPX_SRC_DIR/vp9/common/vp9_rtcd_defs.pl \ + > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vp9_rtcd.h + + $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \ + --arch=$2 \ + --sym=vpx_scale_rtcd $DISABLE_AVX $3 \ + --config=$BASE_DIR/$TEMP_DIR/libvpx.config \ + $BASE_DIR/$LIBVPX_SRC_DIR/vpx_scale/vpx_scale_rtcd.pl \ + > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_scale_rtcd.h + + $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \ + --arch=$2 \ + --sym=vpx_dsp_rtcd $DISABLE_AVX $3 \ + --config=$BASE_DIR/$TEMP_DIR/libvpx.config \ + $BASE_DIR/$LIBVPX_SRC_DIR/vpx_dsp/vpx_dsp_rtcd_defs.pl \ + > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_dsp_rtcd.h + + rm -rf $BASE_DIR/$TEMP_DIR/libvpx.config +} + +# Generate Config files. "--enable-external-build" must be set to skip +# detection of capabilities on specific targets. +# $1 - Header file directory. +# $2 - Config command line. +function gen_config_files { + ./configure $2 > /dev/null + + # Disable HAVE_UNISTD_H. + ( echo '/HAVE_UNISTD_H'; echo 'd' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h + + local ASM_CONV=ads2gas.pl + + # Generate vpx_config.asm. + if [[ "$1" == *x64* ]] || [[ "$1" == *ia32* ]]; then + egrep "#define [A-Z0-9_]+ [01]" vpx_config.h | awk '{print "%define " $2 " " $3}' > vpx_config.asm + else + egrep "#define [A-Z0-9_]+ [01]" vpx_config.h | awk '{print $2 " EQU " $3}' | perl $BASE_DIR/$LIBVPX_SRC_DIR/build/make/$ASM_CONV > vpx_config.asm + fi + + cp vpx_config.* $BASE_DIR/$LIBVPX_CONFIG_DIR/$1 + make_clean + rm -rf vpx_config.* +} + +find_duplicates + +echo "Create temporary directory." +TEMP_DIR="$LIBVPX_SRC_DIR.temp" +rm -rf $TEMP_DIR +cp -R $LIBVPX_SRC_DIR $TEMP_DIR +cd $TEMP_DIR + +echo "Generate config files." +all_platforms="--enable-external-build --disable-examples --disable-install-docs --disable-unit-tests" +all_platforms="${all_platforms} --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic" +all_platforms="${all_platforms} --disable-avx512" +x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm" +arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only" +arm64_platforms="--enable-realtime-only" + +gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}" +gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}" +gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}" +gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}" +gen_config_files win/x64 "--target=x86_64-win64-vs15 ${all_platforms} ${x86_platforms}" +gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}" + +gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}" +gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}" +gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}" + +gen_config_files generic "--target=generic-gnu ${all_platforms}" + +echo "Remove temporary directory." +cd $BASE_DIR +rm -rf $TEMP_DIR + +echo "Create temporary directory." +TEMP_DIR="$LIBVPX_SRC_DIR.temp" +rm -rf $TEMP_DIR +cp -R $LIBVPX_SRC_DIR $TEMP_DIR +cd $TEMP_DIR + +gen_rtcd_header linux/x64 x86_64 +gen_rtcd_header linux/ia32 x86 +gen_rtcd_header mac/x64 x86_64 +gen_rtcd_header mac/ia32 x86 +gen_rtcd_header win/x64 x86_64 +gen_rtcd_header win/ia32 x86 + +gen_rtcd_header linux/arm armv7 +gen_rtcd_header linux/arm64 arm64 +gen_rtcd_header win/aarch64 arm64 + +gen_rtcd_header generic generic + +echo "Prepare Makefile." +./configure --target=generic-gnu > /dev/null +make_clean + +# Remove existing source files. +rm -rf $BASE_DIR/sources.mozbuild +write_license $BASE_DIR/sources.mozbuild +echo "files = {" >> $BASE_DIR/sources.mozbuild + +echo "Generate X86_64 source list." +config=$(print_config linux/x64) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt X64 + +# Copy vpx_version.h once. The file is the same for all platforms. +cp vpx_version.h $BASE_DIR/$LIBVPX_CONFIG_DIR + +echo "Generate IA32 source list." +config=$(print_config linux/ia32) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt IA32 + +echo "Generate ARM source list." +config=$(print_config linux/arm) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt ARM + +echo "Generate ARM64 source list." +config=$(print_config linux/arm64) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt ARM64 + +echo "Generate generic source list." +config=$(print_config generic) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt GENERIC + +echo "}" >> $BASE_DIR/sources.mozbuild + +echo "Remove temporary directory." +cd $BASE_DIR +rm -rf $TEMP_DIR + +cd $BASE_DIR/$LIBVPX_SRC_DIR + +cd $BASE_DIR diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch new file mode 100644 index 0000000000..1cb33e192f --- /dev/null +++ b/media/libvpx/input_frame_validation.patch @@ -0,0 +1,44 @@ +# HG changeset patch +# User Randell Jesup +# Parent 1b77af186da211485fa9c5573d843d96c708a829 +Bug 1263384: validate input frames against configured resolution in vp8 r=rillian + +MozReview-Commit-ID: BxDCnJe0mzs + +diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c +--- a/vp8/vp8_cx_iface.c ++++ b/vp8/vp8_cx_iface.c +@@ -921,20 +921,29 @@ static vpx_codec_err_t vp8e_encode(vpx_c + dst_time_stamp = + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + dst_end_time_stamp = (pts_val + (int64_t)duration) * + ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + + if (img != NULL) { + res = image2yuvconfig(img, &sd); + +- if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, +- dst_time_stamp, dst_end_time_stamp)) { +- VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; +- res = update_error_state(ctx, &cpi->common.error); ++ if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { ++ /* from vpx_encoder.h for g_w/g_h: ++ "Note that the frames passed as input to the encoder must have this ++ resolution" ++ */ ++ ctx->base.err_detail = "Invalid input frame resolution"; ++ res = VPX_CODEC_INVALID_PARAM; ++ } else { ++ if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, ++ &sd, dst_time_stamp, dst_end_time_stamp)) { ++ VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; ++ res = update_error_state(ctx, &cpi->common.error); ++ } + } + + /* reset for next frame */ + ctx->next_frame_flag = 0; + } + + cx_data = ctx->cx_data; + cx_data_sz = ctx->cx_data_sz; diff --git a/media/libvpx/input_frame_validation_vp9.patch b/media/libvpx/input_frame_validation_vp9.patch new file mode 100644 index 0000000000..ad17d495f2 --- /dev/null +++ b/media/libvpx/input_frame_validation_vp9.patch @@ -0,0 +1,36 @@ +# HG changeset patch +# User Randell Jesup +# Parent 87841f3bfc9d99a37e31cd43b2e2d03c325af84f +Bug 1315288: Add input checks for VP9 r=rillian + +diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c +--- a/vp9/vp9_cx_iface.c ++++ b/vp9/vp9_cx_iface.c +@@ -1372,13 +1372,22 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, + timebase_units_to_ticks(timestamp_ratio, pts + duration); + res = image2yuvconfig(img, &sd); + +- // Store the original flags in to the frame buffer. Will extract the +- // key frame flag when we actually encode this frame. +- if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, ++ if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { ++ /* from vpx_encoder.h for g_w/g_h: ++ "Note that the frames passed as input to the encoder must have this ++ resolution" ++ */ ++ ctx->base.err_detail = "Invalid input frame resolution"; ++ res = VPX_CODEC_INVALID_PARAM; ++ } else { ++ // Store the original flags in to the frame buffer. Will extract the ++ // key frame flag when we actually encode this frame. ++ if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { +- res = update_error_state(ctx, &cpi->common.error); ++ res = update_error_state(ctx, &cpi->common.error); ++ } ++ ctx->next_frame_flags = 0; + } +- ctx->next_frame_flags = 0; + } + + cx_data = ctx->cx_data; diff --git a/media/libvpx/libvpx/.clang-format b/media/libvpx/libvpx/.clang-format new file mode 100644 index 0000000000..a8bc4967c3 --- /dev/null +++ b/media/libvpx/libvpx/.clang-format @@ -0,0 +1,9 @@ +--- +Language: Cpp +BasedOnStyle: Google +AllowShortCaseLabelsOnASingleLine: true +ConstructorInitializerAllOnOneLineOrOnePerLine: false +Cpp11BracedListStyle: false +DerivePointerAlignment: false +PointerAlignment: Right +SortIncludes: false diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap new file mode 100644 index 0000000000..bb0ddd95b2 --- /dev/null +++ b/media/libvpx/libvpx/.mailmap @@ -0,0 +1,56 @@ +Adrian Grange +Aℓex Converse +Aℓex Converse +Aℓex Converse +Alexis Ballier +Alpha Lam +Angie Chiang +Chris Cunningham +Chi Yo Tsai +Daniele Castagna +Deb Mukherjee +Elliott Karpilovsky +Erik Niemeyer +Fyodor Kyslov +Gregor Jasny +Gregor Jasny +Guillaume Martres +Hangyu Kuang +Hui Su +Jacky Chen +Jim Bankoski +Johann Koenig +Johann Koenig +Johann Koenig +Johann +John Koleszar +Joshua Litt +Konstantinos Margaritis +Marco Paniconi +Marco Paniconi +Martin Storsjö +Michael Horowitz +Pascal Massimino +Paul Wilkins +Peter Boström +Peter de Rivaz +Peter de Rivaz +Ralph Giles +Ralph Giles +Ronald S. Bultje +Sai Deng +Sami Pietilä +Shiyou Yin +Tamar Levy +Tamar Levy +Tero Rintaluoma +Timothy B. Terriberry +Tom Finegan +Tom Finegan +Urvang Joshi +Yaowu Xu +Yaowu Xu +Yaowu Xu +Venkatarama NG. Avadhani +Vitaly Buka +xiwei gu diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS new file mode 100644 index 0000000000..2db4a113e4 --- /dev/null +++ b/media/libvpx/libvpx/AUTHORS @@ -0,0 +1,228 @@ +# This file is automatically generated from the git commit history +# by tools/gen_authors.sh. + +Aaron Watry +Abo Talib Mahfoodh +Adam B. Goode +Adrian Grange +Ahmad Sharif +Aidan Welch +Aleksey Vasenev +Alexander Potapenko +Alexander Voronov +Alexandra Hájková +Aℓex Converse +Alexis Ballier +Alok Ahuja +Alpha Lam +A.Mahfoodh +Ami Fischman +Andoni Morales Alastruey +Andres Mejia +Andrew Lewis +Andrew Russell +Andrew Salkeld +Angie Chen +Angie Chiang +Anton Venema +Aron Rosenberg +Attila Nagy +Birk Magnussen +Bohan Li +Brian Foley +Brion Vibber +changjun.yang +Charles 'Buck' Krasic +Cheng Chen +Chi Yo Tsai +chm +Chris Cunningham +Christian Duvivier +Chunbo Hua +Clement Courbet +Daniele Castagna +Daniel Kang +Daniel Sommermann +Dan Zhu +Deb Mukherjee +Deepa K G +Dim Temp +Dmitry Kovalev +Dragan Mrdjan +Ed Baker +Ehsan Akhgari +Elliott Karpilovsky +Erik Niemeyer +Fabio Pedretti +Frank Galligan +Fredrik Söderquist +Fritz Koenig +Fyodor Kyslov +Gabriel Marin +Gaute Strokkenes +Geza Lore +Ghislain MARY +Giuseppe Scrivano +Gordana Cmiljanovic +Gregor Jasny +Guillaume Martres +Guillermo Ballester Valor +Hangyu Kuang +Hanno Böck +Han Shen +Hao Chen +Harish Mahendrakar +Henrik Lundin +Hien Ho +Hirokazu Honda +Hui Su +Ilya Kurdyukov +Ivan Krasin +Ivan Maltz +Jacek Caban +Jacky Chen +James Berry +James Touton +James Yu +James Zern +Jan Gerber +Jan Kratochvil +Janne Salonen +Jean-Yves Avenard +Jeff Faust +Jeff Muizelaar +Jeff Petkau +Jeremy Leconte +Jerome Jiang +Jia Jia +Jianhui Dai +Jian Zhou +Jim Bankoski +jinbo +Jin Bo +Jingning Han +Joel Fernandes +Joey Parrish +Johann Koenig +John Koleszar +Johnny Klonaris +John Stark +Jonathan Wright +Jon Kunkee +Jorge E. Moreira +Joshua Bleecher Snyder +Joshua Litt +Julia Robson +Justin Clift +Justin Lebar +Kaustubh Raste +KO Myung-Hun +Konstantinos Margaritis +Kyle Siefring +Lawrence Velázquez +Linfeng Zhang +Liu Peng +Lou Quillio +Luca Barbato +Luc Trudeau +Lu Wang +Makoto Kato +Mans Rullgard +Marco Paniconi +Mark Mentovai +Martin Ettl +Martin Storsjö +Matthew Heaney +Matthias Räncker +Michael Horowitz +Michael Kohler +Mike Frysinger +Mike Hommey +Mikhal Shemer +Mikko Koivisto +Min Chen +Minghai Shang +Min Ye +Mirko Bonadei +Moriyoshi Koizumi +Morton Jonuschat +Nathan E. Egge +Neil Birkbeck +Nico Weber +Niveditha Rau +Parag Salasakar +Pascal Massimino +Patrik Westin +Paul Wilkins +Pavol Rusnak +Paweł Hajdan +Pengchong Jin +Peter Boström +Peter Collingbourne +Peter de Rivaz +Peter Kasting +Philip Jägenstedt +Priit Laes +Rafael Ávila de Espíndola +Rafaël Carré +Rafael de Lucena Valle +Rahul Chaudhry +Ralph Giles +Ranjit Kumar Tulabandu +Raphael Kubo da Costa +Ravi Chaudhary +Ritu Baldwa +Rob Bradford +Ronald S. Bultje +Rui Ueyama +Sai Deng +Salome Thirot +Sami Pietilä +Sam James +Sarah Parker +Sasi Inguva +Scott Graham +Scott LaVarnway +Sean McGovern +Sergey Kolomenkin +Sergey Silkin +Sergey Ulanov +Shimon Doodkin +Shiyou Yin +Shubham Tandle +Shunyao Li +Sreerenj Balachandran +Stefan Holmer +Suman Sunkara +Supradeep T R +Sylvestre Ledru +Taekhyun Kim +Takanori MATSUURA +Tamar Levy +Tao Bai +Tero Rintaluoma +Thijs Vermeir +Tim Kopp +Timothy B. Terriberry +Tom Finegan +Tristan Matthews +Urvang Joshi +Venkatarama NG. Avadhani +Vignesh Venkatasubramanian +Vitaly Buka +Vlad Tsyrklevich +Wan-Teh Chang +Wonkap Jang +xiwei gu +Yaowu Xu +Yi Luo +Yongzhe Wang +yuanhecai +Yue Chen +Yun Liu +Yunqing Wang +Yury Gitman +Zoe Liu +Google Inc. +The Mozilla Foundation +The Xiph.Org Foundation diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG new file mode 100644 index 0000000000..21070785ed --- /dev/null +++ b/media/libvpx/libvpx/CHANGELOG @@ -0,0 +1,929 @@ +20yy-mm-dd v1.14.0 "V Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). + +2023-09-29 v1.13.1 "Ugly Duckling" + This release contains two security related fixes. One each for VP8 and VP9. + + - Upgrading: + This release is ABI compatible with the previous release. + + - Bug fixes: + https://crbug.com/1486441 (CVE-2023-5217) + Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349) + +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. + Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. + +2022-06-17 v1.12.0 "Torrent Duck" + This release adds optimizations for Loongarch, adds support for vp8 in the + real-time rate control library, upgrades GoogleTest to v1.11.0, updates + libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes. + + - Upgrading: + This release is ABI compatible with the previous release. + + vp8 support in the real-time rate control library. + New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added. + + Configure support for darwin21 is added. + + GoogleTest is upgraded to v1.11.0. + + libwebm is updated to libwebm-1.0.0.28-20-g206d268. + + Allow SimpleEncode environment to take target level as input to match + the level conformance in vp9. + + - Enhancement: + Numerous improvements on checking memory allocations. + Optimizations for Loongarch. + Code clean-up. + + - Bug fixes: + Fix to a crash related to {vp8/vp9}_set_roi_map. + Fix to compiling failure with -Wformat-nonliteral. + Fix to integer overflow with vp9 with high resolution content. + Fix to AddNoiseTest failure with ARMv7. + Fix to libvpx Null-dereference READ in vp8. + +2021-09-27 v1.11.0 "Smew Duck" + This maintenance release adds support for VBR mode in VP9 rate control + interface, new codec controls to get quantization parameters and loop filter + levels, and includes several improvements to NEON and numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + New codec control is added to get quantization parameters and loop filter + levels. + + VBR mode is supported in VP9 rate control library. + + - Enhancement: + Numerous improvements for Neon optimizations. + Code clean-up and refactoring. + Calculation of rd multiplier is changed with BDRATE gains. + + - Bug fixes: + Fix to overflow on duration. + Fix to several instances of -Wunused-but-set-variable. + Fix to avoid chroma resampling for 420mpeg2 input. + Fix to overflow in calc_iframe_target_size. + Fix to disallow skipping transform and quantization. + Fix some -Wsign-compare warnings in simple_encode. + Fix input file path in simple_encode_test. + Fix valid range for under/over_shoot pct. + +2021-03-09 v1.10.0 "Ruddy Duck" + This maintenance release adds support for darwin20 and new codec controls, as + well as numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + New codec control is added to disable loopfilter for VP9. + + New encoder control is added to disable feature to increase Q on overshoot + detection for CBR. + + Configure support for darwin20 is added. + + New codec control is added for VP9 rate control. The control ID of this + interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized + external rate control model, users will have to implement each callback + function in vpx_rc_funcs_t and register them using libvpx API + vpx_codec_control_() with the control ID. + + - Enhancement: + Use -std=gnu++11 instead of -std=c++11 for c++ files. + + - Bug fixes: + Override assembler with --as option of configure for MSVS. + Fix several compilation issues with gcc 4.8.5. + Fix to resetting rate control for temporal layers. + Fix to the rate control stats of SVC example encoder when number of spatial + layers is 1. + Fix to reusing motion vectors from the base spatial layer in SVC. + 2 pass related flags removed from SVC example encoder. + +2020-07-29 v1.9.0 "Quacking Duck" + This release adds support for NV12, a separate library for rate control, as + well as incremental improvements. + + - Upgrading: + This release is ABI compatible with the previous release. + NV12 support is added to this release. + A new interface is added for VP9 rate control. The new library libvp9rc.a + must be linked by applications. + Googletest is updated to v1.10.0. + simple_encode.cc is compiled into a new library libsimple_encode.a with + CONFIG_RATE_CTRL. + + - Enhancement: + Various changes to improve VP9 SVC, rate control, quality and speed to real + time encoding. + + - Bug fixes: + Fix key frame update refresh simulcast flexible svc. + Fix to disable_16x16part speed feature for real time encoding. + Fix some signed integer overflows for VP9 rate control. + Fix initialization of delta_q_uv. + Fix condition in regulate_q for cyclic refresh. + Various fixes to dynamic resizing for VP9 SVC. + +2019-12-09 v1.8.2 "Pekin Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + This release is ABI compatible with the previous release. + ARCH_* defines have been removed in favor of VPX_ARCH_*. + +2019-07-15 v1.8.1 "Orpington Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + This release is ABI incompatible with the previous release. + VP8E_SET_CPUUSED now accepts values up to 9 for vp9. + VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). + The --sdk-path option has been removed. If you were using it to build for + Android please read build/make/Android.mk for alternatives. + All PPC optimizations have been disabled: + https://bugs.chromium.org/p/webm/issues/detail?id=1522. + + - Enhancements: + Various changes to improve encoder rate control, quality and speed + for practically every use case. + + - Bug fixes: + vp9-rtc: Fix color artifacts for speed >= 8. + +2019-01-31 v1.8.0 "Northern Shoveler Duck" + This release focused on encoding performance for realtime and VOD use cases. + + - Upgrading: + This release is ABI incompatible with the previous release. This adds and + improves several vp9 controls. Most are related to SVC: + VP9E_SET_SVC_FRAME_DROP_LAYER: + - Frame dropping in SVC. + VP9E_SET_SVC_INTER_LAYER_PRED: + - Inter-layer prediction in SVC. + VP9E_SET_SVC_GF_TEMPORAL_REF: + - Enable long term temporal reference in SVC. + VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG: + - Extend and improve this control for better flexibility in setting SVC + pattern dynamically. + VP9E_SET_POSTENCODE_DROP: + - Allow for post-encode frame dropping (applies to non-SVC too). + VP9E_SET_SVC_SPATIAL_LAYER_SYNC: + - Enable spatial layer sync frames. + VP9E_SET_SVC_LAYER_ID: + - Extend api to specify temporal id for each spatial layers. + VP9E_SET_ROI_MAP: + - Extend Region of Interest functionality to VP9. + + - Enhancements: + 2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6, + we see approximately 8% for VBR and 10% for CQ. When using --auto-alt-ref=1, + the gains are approximately 4% for VBR and 5% for CQ. + + For real-time encoding, speed 7 has improved by ~5-10%. Encodes targeted at + screen sharing have improved when the content changes significantly (slide + sharing) or scrolls. There is a new speed 9 setting for mobile devices which + is about 10-20% faster than speed 8. + + - Bug fixes: + VP9 denoiser issue. + VP9 partition issue for 1080p. + VP9 rate control improvments. + Postprocessing Multi Frame Quality Enhancement (MFQE) issue. + VP8 multithread decoder issues. + A variety of fuzzing issues. + +2018-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + +2017-01-09 v1.6.1 "Long Tailed Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI compatible with 1.6.0. + + - Enhancements: + Faster VP9 encoding and decoding. + High bit depth builds now provide similar speed for 8 bit encode and decode + for x86 targets. Other platforms and higher bit depth improvements are in + progress. + + - Bug Fixes: + A variety of fuzzing issues. + +2016-07-20 v1.6.0 "Khaki Campbell Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum + in vpx_image and some minor changes to the VP8_COMP structure. + + The default key frame interval for VP9 has changed from 128 to 9999. + + - Enhancement: + A core focus has been performance for low end Intel processors. SSSE3 + instructions such as 'pshufb' have been avoided and instructions have been + reordered to better accommodate the more constrained pipelines. + + As a result, devices based on Celeron processors have seen substantial + decoding improvements. From Indian Runner Duck to Javan Whistling Duck, + decoding speed improved between 10 and 30%. Between Javan Whistling Duck + and Khaki Campbell Duck, it improved another 10 to 15%. + + While Celeron benefited most, Core-i5 also improved 5% and 10% between the + respective releases. + + Realtime performance for WebRTC for both speed and quality has received a + lot of attention. + + - Bug Fixes: + A number of fuzzing issues, found variously by Mozilla, Chromium and others, + have been fixed and we strongly recommend updating. + +2015-11-09 v1.5.0 "Javan Whistling Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI incompatible with 1.4.0. It drops deprecated VP8 + controls and adds a variety of VP9 controls for testing. + + The vpxenc utility now prefers VP9 by default. + + - Enhancements: + Faster VP9 encoding and decoding + Smaller library size by combining functions used by VP8 and VP9 + + - Bug Fixes: + A variety of fuzzing issues + +2015-04-03 v1.4.0 "Indian Runner Duck" + This release includes significant improvements to the VP9 codec. + + - Upgrading: + This release is ABI incompatible with 1.3.0. It drops the compatibility + layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec + controls for VP9. + + - Enhancements: + Faster VP9 encoding and decoding + Multithreaded VP9 decoding (tile and frame-based) + Multithreaded VP9 encoding - on by default + YUV 4:2:2 and 4:4:4 support in VP9 + 10 and 12bit support in VP9 + 64bit ARM support by replacing ARM assembly with intrinsics + + - Bug Fixes: + Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0 + files. + + - Known Issues: + Frame Parallel decoding fails for segmented and non-420 files. + +2013-11-15 v1.3.0 "Forest" + This release introduces the VP9 codec in a backward-compatible way. + All existing users of VP8 can continue to use the library without + modification. However, some VP8 options do not map to VP9 in the same manner. + + The VP9 encoder in this release is not feature complete. Users interested in + the encoder are advised to use the git master branch and discuss issues on + libvpx mailing lists. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this document + for that release. + + - Enhancements: + Get rid of bashisms in the main build scripts + Added usage info on command line options + Add lossless compression mode + Dll build of libvpx + Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13) + Add option to disable documentation + configure: add --enable-external-build support + make: support V=1 as short form of verbose=yes + configure: support mingw-w64 + configure: support hardfloat armv7 CHOSTS + configure: add support for android x86 + Add estimated completion time to vpxenc + Don't exit on decode errors in vpxenc + vpxenc: support scaling prior to encoding + vpxdec: support scaling output + vpxenc: improve progress indicators with --skip + msvs: Don't link to winmm.lib + Add a new script for producing vcxproj files + Produce Visual Studio 10 and 11 project files + Produce Windows Phone project files + msvs-build: use msbuild for vs >= 2005 + configure: default configure log to config.log + Add encoding option --static-thresh + + - Speed: + Miscellaneous speed optimizations for VP8 and VP9. + + - Quality: + In general, quality is consistent with the Eider release. + + - Bug Fixes: + This release represents approximately a year of engineering effort, + and contains multiple bug fixes. Please refer to git history for details. + + +2012-12-21 v1.2.0 + This release acts as a checkpoint for a large amount of internal refactoring + and testing. It also contains a number of small bugfixes, so all users are + encouraged to upgrade. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + - Enhancements: + VP8 optimizations for MIPS dspr2 + vpxenc: add -quiet option + + - Speed: + Encoder and decoder speed is consistent with the Eider release. + + - Quality: + In general, quality is consistent with the Eider release. + + Minor tweaks to ARNR filtering + Minor improvements to real time encoding with multiple temporal layers + + - Bug Fixes: + Fixes multithreaded encoder race condition in loopfilter + Fixes multi-resolution threaded encoding + Fix potential encoder dead-lock after picture resize + + +2012-05-09 v1.1.0 "Eider" + This introduces a number of enhancements, mostly focused on real-time + encoding. In addition, it fixes a decoder bug (first introduced in + Duclair) so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + This release introduces a new temporal denoiser, controlled by the + VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not + currently take a strength parameter, so the control is effectively + a boolean - zero (off) or non-zero (on). For compatibility with + existing applications, the values accepted are the same as those + for the spatial denoiser (0-6). The temporal denoiser is enabled + by default, and the older spatial denoiser may be restored by + configuring with --disable-temporal-denoising. The temporal denoiser + is more computationally intensive than the spatial one. + + This release removes support for a legacy, decode only API that was + supported, but deprecated, at the initial release of libvpx + (v0.9.0). This is not expected to have any impact. If you are + impacted, you can apply a reversion to commit 2bf8fb58 locally. + Please update to the latest libvpx API if you are affected. + + - Enhancements: + Adds a motion compensated temporal denoiser to the encoder, which + gives higher quality than the older spatial denoiser. (See above + for notes on upgrading). + + In addition, support for new compilers and platforms were added, + including: + improved support for XCode + Android x86 NDK build + OS/2 support + SunCC support + + Changing resolution with vpx_codec_enc_config_set() is now + supported. Previously, reinitializing the codec was required to + change the input resolution. + + The vpxenc application has initial support for producing multiple + encodes from the same input in one call. Resizing is not yet + supported, but varying other codec parameters is. Use -- to + delineate output streams. Options persist from one stream to the + next. + + Also, the vpxenc application will now use a keyframe interval of + 5 seconds by default. Use the --kf-max-dist option to override. + + - Speed: + Decoder performance improved 2.5% versus Duclair. Encoder speed is + consistent with Duclair for most material. Two pass encoding of + slideshow-like material will see significant improvements. + + Large realtime encoding speed gains at a small quality expense are + possible by configuring the on-the-fly bitpacking experiment with + --enable-onthefly-bitpacking. Realtime encoder can be up to 13% + faster (ARM) depending on the number of threads and bitrate + settings. This technique sees constant gain over the 5-16 speed + range. For VC style input the loss seen is up to 0.2dB. See commit + 52cf4dca for further details. + + - Quality: + On the whole, quality is consistent with the Duclair release. Some + tweaks: + + Reduced blockiness in easy sections by applying a penalty to + intra modes. + + Improved quality of static sections (like slideshows) with + two pass encoding. + + Improved keyframe sizing with multiple temporal layers + + - Bug Fixes: + Corrected alt-ref contribution to frame rate for visible updates + to the alt-ref buffer. This affected applications making manual + usage of the frame reference flags, or temporal layers. + + Additional constraints were added to disable multi-frame quality + enhancement (MFQE) in sections of the frame where there is motion. + (#392) + + Fixed corruption issues when vpx_codec_enc_config_set() was called + with spatial resampling enabled. + + Fixed a decoder error introduced in Duclair where the segmentation + map was not being reinitialized on keyframes (#378) + + +2012-01-27 v1.0.0 "Duclair" + Our fourth named release, focused on performance and features related to + real-time encoding. It also fixes a decoder crash bug introduced in + v0.9.7, so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI incompatible with prior releases of libvpx, so the + "major" version number has been bumped to 1. You must recompile your + applications against the latest version of the libvpx headers. The + API remains compatible, and this should not require code changes in most + applications. + + - Enhancements: + This release introduces several substantial new features to the encoder, + of particular interest to real time streaming applications. + + Temporal scalability allows the encoder to produce a stream that can + be decimated to different frame rates, with independent rate targeting + for each substream. + + Multiframe quality enhancement postprocessing can make visual quality + more consistent in the presence of frames that are substantially + different quality than the surrounding frames, as in the temporal + scalability case and in some forced keyframe scenarios. + + Multiple-resolution encoding support allows the encoding of the + same content at different resolutions faster than encoding them + separately. + + - Speed: + Optimization targets for this release included the decoder and the real- + time modes of the encoder. Decoder speed on x86 has improved 10.5% with + this release. Encoder improvements followed a curve where speeds 1-3 + improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved + 1.5% to 10.5%, respectively. "Best" mode speed is consistent with the + Cayuga release. + + - Quality: + Encoder quality in the single stream case is consistent with the Cayuga + release. + + - Bug Fixes: + This release fixes an OOB read decoder crash bug present in v0.9.7 + related to the clamping of motion vectors in SPLITMV blocks. This + behavior could be triggered by corrupt input or by starting + decoding from a P-frame. + + +2011-08-15 v0.9.7-p1 "Cayuga" patch 1 + This is an incremental bugfix release against Cayuga. All users of that + release are strongly encouraged to upgrade. + + - Fix potential OOB reads (cdae03a) + + An unbounded out of bounds read was discovered when the + decoder was requested to perform error concealment (new in + Cayuga) given a frame with corrupt partition sizes. + + A bounded out of bounds read was discovered affecting all + versions of libvpx. Given an multipartition input frame that + is truncated between the mode/mv partition and the first + residiual paritition (in the block of partition offsets), up + to 3 extra bytes could have been read from the source buffer. + The code will not take any action regardless of the contents + of these undefined bytes, as the truncated buffer is detected + immediately following the read based on the calculated + starting position of the coefficient partition. + + - Fix potential error concealment crash when the very first frame + is missing or corrupt (a609be5) + + - Fix significant artifacts in error concealment (a4c2211, 99d870a) + + - Revert 1-pass CBR rate control changes (e961317) + Further testing showed this change produced undesirable visual + artifacts, rolling back for now. + + +2011-08-02 v0.9.7 "Cayuga" + Our third named release, focused on a faster, higher quality, encoder. + + - Upgrading: + This release is backwards compatible with Aylesbury (v0.9.5) and + Bali (v0.9.6). Users of older releases should refer to the Upgrading + notes in this document for that release. + + - Enhancements: + Stereo 3D format support for vpxenc + Runtime detection of available processor cores. + Allow specifying --end-usage by enum name + vpxdec: test for frame corruption + vpxenc: add quantizer histogram display + vpxenc: add rate histogram display + Set VPX_FRAME_IS_DROPPABLE + update configure for ios sdk 4.3 + Avoid text relocations in ARM vp8 decoder + Generate a vpx.pc file for pkg-config. + New ways of passing encoded data between encoder and decoder. + + - Speed: + This release includes across-the-board speed improvements to the + encoder. On x86, these measure at approximately 11.5% in Best mode, + 21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6). + On ARM Cortex A9 with Neon extensions, real-time encoding of video + telephony content is 35% faster than Bali on single core and 48% + faster on multi-core. On the NVidia Tegra2 platform, real time + encoding is 40% faster than Bali. + + Decoder speed was not a priority for this release, but improved + approximately 8.4% on x86. + + Reduce motion vector search on alt-ref frame. + Encoder loopfilter running in its own thread + Reworked loopfilter to precalculate more parameters + SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}(). + Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3. + Removed redundant checks + Reduced structure sizes + utilize preload in ARMv6 MC/LPF/Copy routines + ARM optimized quantization, dfct, variance, subtract + Increase chrow row alignment to 16 bytes. + disable trellis optimization for first pass + Write SSSE3 sub-pixel filter function + Improve SSE2 half-pixel filter funtions + Add vp8_sub_pixel_variance16x8_ssse3 function + Reduce unnecessary distortion computation + Use diamond search to replace full search + Preload reference area in sub-pixel motion search (real-time mode) + + - Quality: + This release focused primarily on one-pass use cases, including + video conferencing. Low latency data rate control was significantly + improved, improving streamability over bandwidth constrained links. + Added support for error concealment, allowing frames to maintain + visual quality in the presence of substantial packet loss. + + Add rc_max_intra_bitrate_pct control + Limit size of initial keyframe in one-pass. + Improve framerate adaptation + Improved 1-pass CBR rate control + Improved KF insertion after fades to still. + Improved key frame detection. + Improved activity masking (lower PSNR impact for same SSIM boost) + Improved interaction between GF and ARFs + Adding error-concealment to the decoder. + Adding support for independent partitions + Adjusted rate-distortion constants + + + - Bug Fixes: + Removed firstpass motion map + Fix parallel make install + Fix multithreaded encoding for 1 MB wide frame + Fixed iwalsh_neon build problems with RVDS4.1 + Fix semaphore emulation, spin-wait intrinsics on Windows + Fix build with xcode4 and simplify GLOBAL. + Mark ARM asm objects as allowing a non-executable stack. + Fix vpxenc encoding incorrect webm file header on big endian + + +2011-03-07 v0.9.6 "Bali" + Our second named release, focused on a faster, higher quality, encoder. + + - Upgrading: + This release is backwards compatible with Aylesbury (v0.9.5). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + - Enhancements: + vpxenc --psnr shows a summary when encode completes + --tune=ssim option to enable activity masking + improved postproc visualizations for development + updated support for Apple iOS to SDK 4.2 + query decoder to determine which reference frames were updated + implemented error tracking in the decoder + fix pipe support on windows + + - Speed: + Primary focus was on good quality mode, speed 0. Average improvement + on x86 about 40%, up to 100% on user-generated content at that speed. + Best quality mode speed improved 35%, and realtime speed 10-20%. This + release also saw significant improvement in realtime encoding speed + on ARM platforms. + + Improved encoder threading + Dont pick encoder filter level when loopfilter is disabled. + Avoid double copying of key frames into alt and golden buffer + FDCT optimizations. + x86 sse2 temporal filter + SSSE3 version of fast quantizer + vp8_rd_pick_best_mbsegmentation code restructure + Adjusted breakout RD for SPLITMV + Changed segmentation check order + Improved rd_pick_intra4x4block + Adds armv6 optimized variance calculation + ARMv6 optimized sad16x16 + ARMv6 optimized half pixel variance calculations + Full search SAD function optimization in SSE4.1 + Improve MV prediction accuracy to achieve performance gain + Improve MV prediction in vp8_pick_inter_mode() for speed>3 + + - Quality: + Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release + also includes support for "activity masking," which greatly improves + SSIM at the expense of PSNR. For now, this feature is available with + the --tune=ssim option. Further experimentation in this area + is ongoing. This release also introduces a new rate control mode + called "CQ," which changes the allocation of bits within a clip to + the sections where they will have the most visual impact. + + Tuning for the more exact quantizer. + Relax rate control for last few frames + CQ Mode + Limit key frame quantizer for forced key frames. + KF/GF Pulsing + Add simple version of activity masking. + make rdmult adaptive for intra in quantizer RDO + cap the best quantizer for 2nd order DC + change the threshold of DC check for encode breakout + + - Bug Fixes: + Fix crash on Sparc Solaris. + Fix counter of fixed keyframe distance + ARNR filter pointer update bug fix + Fixed use of motion percentage in KF/GF group calc + Changed condition for using RD in Intra Mode + Fix encoder real-time only configuration. + Fix ARM encoder crash with multiple token partitions + Fixed bug first cluster timecode of webm file is wrong. + Fixed various encoder bugs with odd-sized images + vp8e_get_preview fixed when spatial resampling enabled + quantizer: fix assertion in fast quantizer path + Allocate source buffers to be multiples of 16 + Fix for manual Golden frame frequency + Fix drastic undershoot in long form content + + +2010-10-28 v0.9.5 "Aylesbury" + Our first named release, focused on a faster decoder, and a better encoder. + + - Upgrading: + This release incorporates backwards-incompatible changes to the + ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec. + + vpxdec + * the -q (quiet) option has been removed, and replaced with + -v (verbose). the output is quiet by default. Use -v to see + the version number of the binary. + + * The default behavior is now to write output to a single file + instead of individual frames. The -y option has been removed. + Y4M output is the default. + + * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12 + options must be specified. + + $ ivfdec -o OUTPUT INPUT + $ vpxdec --i420 -o OUTPUT INPUT + + * If an output file is not specified, the default is to write + Y4M to stdout. This makes piping more natural. + + $ ivfdec -y -o - INPUT | ... + $ vpxdec INPUT | ... + + * The output file has additional flexibility for formatting the + filename. It supports escape characters for constructing a + filename from the width, height, and sequence number. This + replaces the -p option. To get the equivalent: + + $ ivfdec -p frame INPUT + $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT + + vpxenc + * The output file must be specified with -o, rather than as the + last argument. + + $ ivfenc INPUT OUTPUT + $ vpxenc -o OUTPUT INPUT + + * The output defaults to webm. To get IVF output, use the --ivf + option. + + $ ivfenc INPUT OUTPUT.ivf + $ vpxenc -o OUTPUT.ivf --ivf INPUT + + + - Enhancements: + ivfenc and ivfdec have been renamed to vpxenc, vpxdec. + vpxdec supports .webm input + vpxdec writes .y4m by default + vpxenc writes .webm output by default + vpxenc --psnr now shows the average/overall PSNR at the end + ARM platforms now support runtime cpu detection + vpxdec visualizations added for motion vectors, block modes, references + vpxdec now silent by default + vpxdec --progress shows frame-by-frame timing information + vpxenc supports the distinction between --fps and --timebase + NASM is now a supported assembler + configure: enable PIC for shared libs by default + configure: add --enable-small + configure: support for ppc32-linux-gcc + configure: support for sparc-solaris-gcc + + - Bugs: + Improve handling of invalid frames + Fix valgrind errors in the NEON loop filters. + Fix loopfilter delta zero transitions + Fix valgrind errors in vp8_sixtap_predict8x4_armv6(). + Build fixes for darwin-icc + + - Speed: + 20-40% (average 28%) improvement in libvpx decoder speed, + including: + Rewrite vp8_short_walsh4x4_sse2() + Optimizations on the loopfilters. + Miscellaneous improvements for Atom + Add 4-tap version of 2nd-pass ARMv6 MC filter. + Improved multithread utilization + Better instruction choices on x86 + reorder data to use wider instructions + Update NEON wide idcts + Make block access to frame buffer sequential + Improved subset block search + Bilinear subpixel optimizations for ssse3. + Decrease memory footprint + + Encoder speed improvements (percentage gain not measured): + Skip unnecessary search of identical frames + Add SSE2 subtract functions + Improve bounds checking in vp8_diamond_search_sadx4() + Added vp8_fast_quantize_b_sse2 + + - Quality: + Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality + encoding mode, and up to 60% improvement on very noisy, still + or slow moving source video + + Motion compensated temporal filter for Alt-Ref Noise Reduction + Improved use of trellis quantization on 2nd order Y blocks + Tune effect of motion on KF/GF boost in two pass + Allow coefficient optimization for good quality speed 0. + Improved control of active min quantizer for two pass. + Enable ARFs for non-lagged compress + +2010-09-02 v0.9.2 + - Enhancements: + Disable frame dropping by default + Improved multithreaded performance + Improved Force Key Frame Behaviour + Increased rate control buffer level precision + Fix bug in 1st pass motion compensation + ivfenc: correct fixed kf interval, --disable-kf + - Speed: + Changed above and left context data layout + Rework idct calling structure. + Removed unnecessary MB_MODE_INFO copies + x86: SSSE3 sixtap prediction + Reworked IDCT to include reconstruction (add) step + Swap alt/gold/new/last frame buffer ptrs instead of copying. + Improve SSE2 loopfilter functions + Change bitreader to use a larger window. + Avoid loopfilter reinitialization when possible + - Quality: + Normalize quantizer's zero bin and rounding factors + Add trellis quantization. + Make the quantizer exact. + Updates to ARNR filtering algorithm + Fix breakout thresh computation for golden & AltRef frames + Redo the forward 4x4 dct + Improve the accuracy of forward walsh-hadamard transform + Further adjustment of RD behaviour with Q and Zbin. + - Build System: + Allow linking of libs built with MinGW to MSVC + Fix target auto-detection on mingw32 + Allow --cpu= to work for x86. + configure: pass original arguments through to make dist + Fix builds without runtime CPU detection + msvs: fix install of codec sources + msvs: Change devenv.com command line for better msys support + msvs: Add vs9 targets. + Add x86_64-linux-icc target + - Bugs: + Potential crashes on older MinGW builds + Fix two-pass framrate for Y4M input. + Fixed simple loop filter, other crashes on ARM v6 + arm: fix missing dependency with --enable-shared + configure: support directories containing .o + Replace pinsrw (SSE) with MMX instructions + apple: include proper mach primatives + Fixed rate control bug with long key frame interval. + Fix DSO link errors on x86-64 when not using a version script + Fixed buffer selection for UV in AltRef filtering + + +2010-06-17 v0.9.1 + - Enhancements: + * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O + * Speed optimizations + - Bugfixes: + * Rate control + * Prevent out-of-bounds accesses on invalid data + - Build system updates: + * Detect toolchain to be used automatically for native builds + * Support building shared libraries + * Better autotools emulation (--prefix, --libdir, DESTDIR) + - Updated LICENSE + * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html + + +2010-05-18 v0.9.0 + - Initial open source release. Welcome to WebM and VP8! + diff --git a/media/libvpx/libvpx/CONTRIBUTING.md b/media/libvpx/libvpx/CONTRIBUTING.md new file mode 100644 index 0000000000..7a73a30317 --- /dev/null +++ b/media/libvpx/libvpx/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use a [Gerrit](https://www.gerritcodereview.com) instance hosted at +https://chromium-review.googlesource.com for this purpose. See the +[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/) +for additional details. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). diff --git a/media/libvpx/libvpx/LICENSE b/media/libvpx/libvpx/LICENSE new file mode 100644 index 0000000000..1ce44343c4 --- /dev/null +++ b/media/libvpx/libvpx/LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/media/libvpx/libvpx/PATENTS b/media/libvpx/libvpx/PATENTS new file mode 100644 index 0000000000..caedf607e9 --- /dev/null +++ b/media/libvpx/libvpx/PATENTS @@ -0,0 +1,23 @@ +Additional IP Rights Grant (Patents) +------------------------------------ + +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. + +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitute direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README new file mode 100644 index 0000000000..4c25b15d81 --- /dev/null +++ b/media/libvpx/libvpx/README @@ -0,0 +1,189 @@ +v1.13.1 Ugly Duckling + +Welcome to the WebM VP8/VP9 Codec SDK! + +COMPILING THE APPLICATIONS/LIBRARIES: + The build system used is similar to autotools. Building generally consists of + "configuring" with your desired build options, then using GNU make to build + the application. + + 1. Prerequisites + + * All x86 targets require the Yasm[1] assembler be installed[2]. + * All Windows builds require that Cygwin[3] or MSYS2[4] be installed. + * Building the documentation requires Doxygen[5]. If you do not + have this package, the install-docs option will be disabled. + * Downloading the data for the unit tests requires curl[6] and sha1sum. + sha1sum is provided via the GNU coreutils, installed by default on + many *nix platforms, as well as MinGW and Cygwin. If coreutils is not + available, a compatible version of sha1sum can be built from + source[7]. These requirements are optional if not running the unit + tests. + + [1]: http://www.tortall.net/projects/yasm + [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the + PATH for Visual Studio. For VS2017 it is sufficient to rename + yasm--.exe to yasm.exe and place it in: + Program Files (x86)/Microsoft Visual Studio/2017//Common7/Tools/ + [3]: http://www.cygwin.com + [4]: http://www.msys2.org/ + [5]: http://www.doxygen.org + [6]: http://curl.haxx.se + [7]: http://www.microbrew.org/tools/md5sha1sum/ + + 2. Out-of-tree builds + Out of tree builds are a supported method of building the application. For + an out of tree build, the source tree is kept separate from the object + files produced during compilation. For instance: + + $ mkdir build + $ cd build + $ ../libvpx/configure + $ make + + 3. Configuration options + The 'configure' script supports a number of options. The --help option can be + used to get a list of supported options: + $ ../libvpx/configure --help + + 4. Compiler analyzers + Compilers have added sanitizers which instrument binaries with information + about address calculation, memory usage, threading, undefined behavior, and + other common errors. To simplify building libvpx with some of these features + use tools/set_analyzer_env.sh before running configure. It will set the + compiler and necessary flags for building as well as environment variables + read by the analyzer when testing the binaries. + $ source ../libvpx/tools/set_analyzer_env.sh address + + 5. Cross development + For cross development, the most notable option is the --target option. The + most up-to-date list of supported targets can be found at the bottom of the + --help output of the configure script. As of this writing, the list of + available targets is: + + arm64-android-gcc + arm64-darwin-gcc + arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc + arm64-darwin23-gcc + arm64-linux-gcc + arm64-win64-gcc + arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl + armv7-android-gcc + armv7-darwin-gcc + armv7-linux-rvct + armv7-linux-gcc + armv7-none-rvct + armv7-win32-gcc + armv7-win32-vs14 + armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 + armv7s-darwin-gcc + armv8-linux-gcc + loongarch32-linux-gcc + loongarch64-linux-gcc + mips32-linux-gcc + mips64-linux-gcc + ppc64le-linux-gcc + sparc-solaris-gcc + x86-android-gcc + x86-darwin8-gcc + x86-darwin8-icc + x86-darwin9-gcc + x86-darwin9-icc + x86-darwin10-gcc + x86-darwin11-gcc + x86-darwin12-gcc + x86-darwin13-gcc + x86-darwin14-gcc + x86-darwin15-gcc + x86-darwin16-gcc + x86-darwin17-gcc + x86-iphonesimulator-gcc + x86-linux-gcc + x86-linux-icc + x86-os2-gcc + x86-solaris-gcc + x86-win32-gcc + x86-win32-vs14 + x86-win32-vs15 + x86-win32-vs16 + x86-win32-vs17 + x86_64-android-gcc + x86_64-darwin9-gcc + x86_64-darwin10-gcc + x86_64-darwin11-gcc + x86_64-darwin12-gcc + x86_64-darwin13-gcc + x86_64-darwin14-gcc + x86_64-darwin15-gcc + x86_64-darwin16-gcc + x86_64-darwin17-gcc + x86_64-darwin18-gcc + x86_64-darwin19-gcc + x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc + x86_64-darwin23-gcc + x86_64-iphonesimulator-gcc + x86_64-linux-gcc + x86_64-linux-icc + x86_64-solaris-gcc + x86_64-win64-gcc + x86_64-win64-vs14 + x86_64-win64-vs15 + x86_64-win64-vs16 + x86_64-win64-vs17 + generic-gnu + + The generic-gnu target, in conjunction with the CROSS environment variable, + can be used to cross compile architectures that aren't explicitly listed, if + the toolchain is a cross GNU (gcc/binutils) toolchain. Other POSIX toolchains + will likely work as well. For instance, to build using the mipsel-linux-uclibc + toolchain, the following command could be used (note, POSIX SH syntax, adapt + to your shell as necessary): + + $ CROSS=mipsel-linux-uclibc- ../libvpx/configure + + In addition, the executables to be invoked can be overridden by specifying the + environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be + passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS. + + 6. Configuration errors + If the configuration step fails, the first step is to look in the error log. + This defaults to config.log. This should give a good indication of what went + wrong. If not, contact us for support. + +VP8/VP9 TEST VECTORS: + The test vectors can be downloaded and verified using the build system after + running configure. To specify an alternate directory the + LIBVPX_TEST_DATA_PATH environment variable can be used. + + $ ./configure --enable-unit-tests + $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata + +CODE STYLE: + The coding style used by this project is enforced with clang-format using the + configuration contained in the .clang-format file in the root of the + repository. + + Before pushing changes for review you can format your code with: + # Apply clang-format to modified .c, .h and .cc files + $ clang-format -i --style=file \ + $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') + + Check the .clang-format file for the version used to generate it if there is + any difference between your local formatting and the review system. + + See also: http://clang.llvm.org/docs/ClangFormat.html + +SUPPORT + This library is an open source project supported by its community. Please + email webm-discuss@webmproject.org for help. + diff --git a/media/libvpx/libvpx/args.c b/media/libvpx/libvpx/args.c new file mode 100644 index 0000000000..0a9631e1f4 --- /dev/null +++ b/media/libvpx/libvpx/args.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "args.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" + +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); +#else +extern void die(const char *fmt, ...); +#endif + +struct arg arg_init(char **argv) { + struct arg a; + + a.argv = argv; + a.argv_step = 1; + a.name = NULL; + a.val = NULL; + a.def = NULL; + return a; +} + +int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { + struct arg arg; + + if (!argv[0] || argv[0][0] != '-') return 0; + + arg = arg_init(argv); + + if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 && + !strcmp(arg.argv[0] + 1, def->short_name)) { + arg.name = arg.argv[0] + 1; + arg.val = def->has_val ? arg.argv[1] : NULL; + arg.argv_step = def->has_val ? 2 : 1; + } else if (def->long_name) { + const size_t name_len = strlen(def->long_name); + + if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' && + !strncmp(arg.argv[0] + 2, def->long_name, name_len) && + (arg.argv[0][name_len + 2] == '=' || + arg.argv[0][name_len + 2] == '\0')) { + arg.name = arg.argv[0] + 2; + arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; + arg.argv_step = 1; + } + } + + if (arg.name && !arg.val && def->has_val) + die("Error: option %s requires argument.\n", arg.name); + + if (arg.name && arg.val && !def->has_val) + die("Error: option %s requires no argument.\n", arg.name); + + if (arg.name && (arg.val || !def->has_val)) { + arg.def = def; + *arg_ = arg; + return 1; + } + + return 0; +} + +const char *arg_next(struct arg *arg) { + if (arg->argv[0]) arg->argv += arg->argv_step; + + return *arg->argv; +} + +char **argv_dup(int argc, const char **argv) { + char **new_argv = malloc((argc + 1) * sizeof(*argv)); + if (!new_argv) return NULL; + + memcpy(new_argv, argv, argc * sizeof(*argv)); + new_argv[argc] = NULL; + return new_argv; +} + +void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { + char option_text[40] = { 0 }; + + for (; *defs; defs++) { + const struct arg_def *def = *defs; + char *short_val = def->has_val ? " " : ""; + char *long_val = def->has_val ? "=" : ""; + + if (def->short_name && def->long_name) { + char *comma = def->has_val ? "," : ", "; + + snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, + comma, def->long_name, long_val); + } else if (def->short_name) + snprintf(option_text, 37, "-%s%s", def->short_name, short_val); + else if (def->long_name) + snprintf(option_text, 37, " --%s%s", def->long_name, long_val); + + fprintf(fp, " %-37s\t%s\n", option_text, def->desc); + + if (def->enums) { + const struct arg_enum_list *listptr; + + fprintf(fp, " %-37s\t ", ""); + + for (listptr = def->enums; listptr->name; listptr++) + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n"); + } + } +} + +unsigned int arg_parse_uint(const struct arg *arg) { + uint32_t rawval; + char *endptr; + + rawval = (uint32_t)strtoul(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval <= UINT_MAX) return rawval; + + die("Option %s: Value %ld out of range for unsigned int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); +} + +int arg_parse_int(const struct arg *arg) { + int32_t rawval; + char *endptr; + + rawval = (int32_t)strtol(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; + + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); +} + +struct vpx_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +}; +struct vpx_rational arg_parse_rational(const struct arg *arg) { + long int rawval; + char *endptr; + struct vpx_rational rat; + + /* parse numerator */ + rawval = strtol(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '/') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.num = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Expected / at '%c'\n", arg->name, *endptr); + + /* parse denominator */ + rawval = strtol(endptr + 1, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.den = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + + return rat; +} + +int arg_parse_enum(const struct arg *arg) { + const struct arg_enum_list *listptr; + long int rawval; + char *endptr; + + /* First see if the value can be parsed as a raw value */ + rawval = strtol(arg->val, &endptr, 10); + if (arg->val[0] != '\0' && endptr[0] == '\0') { + /* Got a raw value, make sure it's valid */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (listptr->val == rawval) return (int)rawval; + } + + /* Next see if it can be parsed as a string */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (!strcmp(arg->val, listptr->name)) return listptr->val; + + die("Option %s: Invalid value '%s'\n", arg->name, arg->val); +} + +int arg_parse_enum_or_int(const struct arg *arg) { + if (arg->def->enums) return arg_parse_enum(arg); + return arg_parse_int(arg); +} diff --git a/media/libvpx/libvpx/args.h b/media/libvpx/libvpx/args.h new file mode 100644 index 0000000000..aae8ec06a5 --- /dev/null +++ b/media/libvpx/libvpx/args.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_ARGS_H_ +#define VPX_ARGS_H_ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct arg { + char **argv; + const char *name; + const char *val; + unsigned int argv_step; + const struct arg_def *def; +}; + +struct arg_enum_list { + const char *name; + int val; +}; +#define ARG_ENUM_LIST_END \ + { 0 } + +typedef struct arg_def { + const char *short_name; + const char *long_name; + int has_val; + const char *desc; + const struct arg_enum_list *enums; +} arg_def_t; +#define ARG_DEF(s, l, v, d) \ + { s, l, v, d, NULL } +#define ARG_DEF_ENUM(s, l, v, d, e) \ + { s, l, v, d, e } +#define ARG_DEF_LIST_END \ + { 0 } + +struct arg arg_init(char **argv); +int arg_match(struct arg *arg_, const struct arg_def *def, char **argv); +const char *arg_next(struct arg *arg); +void arg_show_usage(FILE *fp, const struct arg_def *const *defs); +char **argv_dup(int argc, const char **argv); + +unsigned int arg_parse_uint(const struct arg *arg); +int arg_parse_int(const struct arg *arg); +struct vpx_rational arg_parse_rational(const struct arg *arg); +int arg_parse_enum(const struct arg *arg); +int arg_parse_enum_or_int(const struct arg *arg); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_ARGS_H_ diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk new file mode 100644 index 0000000000..ba24f541b1 --- /dev/null +++ b/media/libvpx/libvpx/build/make/Android.mk @@ -0,0 +1,217 @@ +## +## Copyright (c) 2012 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT +# +# This file is to be used for compiling libvpx for Android using the NDK. +# In an Android project place a libvpx checkout in the jni directory. +# Run the configure script from the jni directory. Base libvpx +# encoder/decoder configuration will look similar to: +# ./libvpx/configure --target=armv7-android-gcc --disable-examples \ +# --enable-external-build +# +# When targeting Android, realtime-only is enabled by default. This can +# be overridden by adding the command line flag: +# --disable-realtime-only +# +# This will create .mk files that contain variables that contain the +# source files to compile. +# +# Place an Android.mk file in the jni directory that references the +# Android.mk file in the libvpx directory: +# LOCAL_PATH := $(call my-dir) +# include $(CLEAR_VARS) +# include jni/libvpx/build/make/Android.mk +# +# By default libvpx will use the 'cpufeatures' module from the NDK. This allows +# the library to be built with all available optimizations (SSE2->AVX512 for +# x86, NEON for arm, DSPr2 for mips). This can be disabled with +# --disable-runtime-cpu-detect +# but the resulting library *must* be run on devices supporting all of the +# enabled extensions. They can be disabled individually with +# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} +# --disable-neon[-asm] +# --disable-{dspr2, msa} + +# +# Running ndk-build will build libvpx and include it in your project. +# + +CONFIG_DIR := $(LOCAL_PATH)/ +LIBVPX_PATH := $(LOCAL_PATH)/libvpx +ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas +ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) +ifneq ($(V),1) + qexec := @ +endif + +# Use the makefiles generated by upstream configure to determine which files to +# build. Also set any architecture-specific flags. +ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + include $(CONFIG_DIR)libs-armv7-android-gcc.mk + LOCAL_ARM_MODE := arm +else ifeq ($(TARGET_ARCH_ABI),arm64-v8a) + include $(CONFIG_DIR)libs-arm64-android-gcc.mk + LOCAL_ARM_MODE := arm +else ifeq ($(TARGET_ARCH_ABI),x86) + include $(CONFIG_DIR)libs-x86-android-gcc.mk +else ifeq ($(TARGET_ARCH_ABI),x86_64) + include $(CONFIG_DIR)libs-x86_64-android-gcc.mk +else ifeq ($(TARGET_ARCH_ABI),mips) + include $(CONFIG_DIR)libs-mips-android-gcc.mk +else + $(error Not a supported TARGET_ARCH_ABI: $(TARGET_ARCH_ABI)) +endif + +# Rule that is normally in Makefile created by libvpx +# configure. Used to filter out source files based on configuration. +enabled=$(filter-out $($(1)-no),$($(1)-yes)) + +# Override the relative path that is defined by the libvpx +# configure process +SRC_PATH_BARE := $(LIBVPX_PATH) + +# Include the list of files to be built +include $(LIBVPX_PATH)/libs.mk + +# Optimise the code. May want to revisit this setting in the future. +LOCAL_CFLAGS := -O3 + +# For x86, include the source code in the search path so it will find files +# like x86inc.asm and x86_abi_support.asm +LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) + +.PRECIOUS: %.asm.S +$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ + +# For building *_rtcd.h, which have rules in libs.mk +TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) +target := libs + +LOCAL_SRC_FILES += vpx_config.c + +# Remove duplicate entries +CODEC_SRCS_UNIQUE = $(sort $(CODEC_SRCS)) + +# Pull out C files. vpx_config.c is in the immediate directory and +# so it does not need libvpx/ prefixed like the rest of the source files. +# The neon files with intrinsics need to have .neon appended so the proper +# flags are applied. +CODEC_SRCS_C = $(filter %.c, $(CODEC_SRCS_UNIQUE)) +LOCAL_NEON_SRCS_C = $(filter %_neon.c, $(CODEC_SRCS_C)) +LOCAL_CODEC_SRCS_C = $(filter-out vpx_config.c %_neon.c, $(CODEC_SRCS_C)) + +LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libvpx/$(file)) +ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file).neon) +else # If there are neon sources then we are building for arm64 and do not need to specify .neon + LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file)) +endif + +# Pull out assembly files, splitting NEON from the rest. This is +# done to specify that the NEON assembly files use NEON assembler flags. +# x86 assembly matches %.asm, arm matches %.asm.S + +# x86: + +CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE)) +LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file)) + +# arm: +CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE)) +CODEC_SRCS_ASM_ARM = $(foreach v, \ + $(CODEC_SRCS_ASM_ARM_ALL), \ + $(if $(findstring neon,$(v)),,$(v))) +CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_ARM)) +LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) + +ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S + CODEC_SRCS_ASM_NEON = $(foreach v, \ + $(CODEC_SRCS_ASM_ARM_ALL),\ + $(if $(findstring neon,$(v)),$(v),)) + CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \ + $(CODEC_SRCS_ASM_NEON)) + CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) + LOCAL_SRC_FILES += $(patsubst %.S, \ + %.S.neon, \ + $(CODEC_SRCS_ASM_NEON_ADS2GAS)) + + NEON_ASM_TARGETS = $(patsubst %.S, \ + $(ASM_CNV_PATH)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) +# add a dependency to the full path to the ads2gas output to ensure the +# includes are converted first. +ifneq ($(strip $(NEON_ASM_TARGETS)),) +$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES)) +endif +endif + +LOCAL_CFLAGS += \ + -DHAVE_CONFIG_H=vpx_config.h \ + -I$(LIBVPX_PATH) \ + -I$(ASM_CNV_PATH) \ + -I$(ASM_CNV_PATH)/libvpx + +LOCAL_MODULE := libvpx +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS + +ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) + LOCAL_STATIC_LIBRARIES := cpufeatures +endif + +# Add a dependency to force generation of the RTCD files. +define rtcd_dep_template +rtcd_dep_template_SRCS := $(addprefix $(LOCAL_PATH)/, $(LOCAL_SRC_FILES)) +rtcd_dep_template_SRCS := $$(rtcd_dep_template_SRCS:.neon=) +ifeq ($(CONFIG_VP8), yes) +$$(rtcd_dep_template_SRCS): vp8_rtcd.h +endif +ifeq ($(CONFIG_VP9), yes) +$$(rtcd_dep_template_SRCS): vp9_rtcd.h +endif +$$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h +$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h + +rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a +ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),) +$$(rtcd_dep_template_SRCS): vpx_config.asm +endif +endef + +$(eval $(call rtcd_dep_template)) + +.PHONY: clean +clean: + @echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]" + $(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS) + $(qexec)$(RM) -r $(ASM_CNV_PATH) + $(qexec)$(RM) $(CLEAN-OBJS) + +ifeq ($(ENABLE_SHARED),1) + LOCAL_CFLAGS += -fPIC + include $(BUILD_SHARED_LIBRARY) +else + include $(BUILD_STATIC_LIBRARY) +endif + +ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) +$(call import-module,android/cpufeatures) +endif +endif # NDK_ROOT diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile new file mode 100644 index 0000000000..199ed78058 --- /dev/null +++ b/media/libvpx/libvpx/build/make/Makefile @@ -0,0 +1,492 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +include config.mk +quiet?=true +ifeq ($(target),) +# If a target wasn't specified, invoke for all enabled targets. +.DEFAULT: + @for t in $(ALL_TARGETS); do \ + $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\ + done +all: .DEFAULT +clean:: .DEFAULT +exampletest: .DEFAULT +install:: .DEFAULT +test: .DEFAULT +test-no-data-check: .DEFAULT +testdata: .DEFAULT +utiltest: .DEFAULT +exampletest-no-data-check utiltest-no-data-check: .DEFAULT +test_%: .DEFAULT ; + +# Note: md5sum is not installed on OS X, but openssl is. Openssl may not be +# installed on cygwin, so we need to autodetect here. +md5sum := $(firstword $(wildcard \ + $(foreach e,md5sum openssl,\ + $(foreach p,$(subst :, ,$(PATH)),$(p)/$(e)*))\ + )) +md5sum := $(if $(filter %openssl,$(md5sum)),$(md5sum) dgst -md5,$(md5sum)) + +TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN))) +dist: + @for t in $(ALL_TARGETS); do \ + $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\ + done + # Run configure for the user with the current toolchain. + @if [ -d "$(DIST_DIR)/src" ]; then \ + mkdir -p "$(DIST_DIR)/build"; \ + cd "$(DIST_DIR)/build"; \ + echo "Rerunning configure $(CONFIGURE_ARGS)"; \ + ../src/configure $(CONFIGURE_ARGS); \ + $(if $(filter vs%,$(TGT_CC)),make NO_LAUNCH_DEVENV=1;) \ + fi + @if [ -d "$(DIST_DIR)" ]; then \ + echo " [MD5SUM] $(DIST_DIR)"; \ + cd $(DIST_DIR) && \ + $(md5sum) `find . -name md5sums.txt -prune -o -type f -print` \ + | sed -e 's/MD5(\(.*\))= \([0-9a-f]\{32\}\)/\2 \1/' \ + > md5sums.txt;\ + fi +endif + +# Since we invoke make recursively for multiple targets we need to include the +# .mk file for the correct target, but only when $(target) is non-empty. +ifneq ($(target),) +include $(target)-$(TOOLCHAIN).mk +endif +BUILD_ROOT?=. +VPATH=$(SRC_PATH_BARE) +CFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH) +CXXFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH) +ASFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT)/ -I$(SRC_PATH)/ +DIST_DIR?=dist +HOSTCC?=gcc +TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) +TGT_OS:=$(word 2, $(subst -, ,$(TOOLCHAIN))) +TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN))) +quiet:=$(if $(or $(verbose), $(V)),, yes) +qexec=$(if $(quiet),@) + +# Cancel built-in implicit rules +%: %.o +%.asm: +%.a: +%: %.cc + +# +# Common rules" +# +.PHONY: all +all: + +.PHONY: clean +clean:: + rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S) + rm -f $(CLEAN-OBJS) + +.PHONY: clean +distclean: clean + if [ -z "$(target)" ]; then \ + rm -f Makefile; \ + rm -f config.log config.mk; \ + rm -f vpx_config.[hc] vpx_config.asm; \ + rm -f arm_neon.h; \ + else \ + rm -f $(target)-$(TOOLCHAIN).mk; \ + fi + +.PHONY: dist +dist: +.PHONY: exampletest +exampletest: +.PHONY: install +install:: +.PHONY: test +test: +.PHONY: testdata +testdata: +.PHONY: utiltest +utiltest: +.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check +test-no-data-check: +exampletest-no-data-check utiltest-no-data-check: + +# Force to realign stack always on OS/2 +ifeq ($(TOOLCHAIN), x86-os2-gcc) +CFLAGS += -mstackrealign +endif + +# x86[_64] +$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx +$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx +$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 +$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 +$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 +$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 +$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 +$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 +$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 +$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 +$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx +$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx +$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl + +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve + +# POWER +$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx +$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx + +# MIPS +$(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa +$(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa + +# LOONGARCH +$(BUILD_PFX)%_lsx.c.d: CFLAGS += -mlsx +$(BUILD_PFX)%_lsx.c.o: CFLAGS += -mlsx +$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx +$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx + +$(BUILD_PFX)%.c.d: %.c + $(if $(quiet),@echo " [DEP] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -M $< | $(fmt_deps) > $@ + +$(BUILD_PFX)%.c.o: %.c + $(if $(quiet),@echo " [CC] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $< + +$(BUILD_PFX)%.cc.d: %.cc + $(if $(quiet),@echo " [DEP] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@ + +$(BUILD_PFX)%.cc.o: %.cc + $(if $(quiet),@echo " [CXX] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $< + +$(BUILD_PFX)%.cpp.d: %.cpp + $(if $(quiet),@echo " [DEP] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@ + +$(BUILD_PFX)%.cpp.o: %.cpp + $(if $(quiet),@echo " [CXX] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $< + +$(BUILD_PFX)%.asm.d: %.asm + $(if $(quiet),@echo " [DEP] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \ + --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@ + +$(BUILD_PFX)%.asm.o: %.asm + $(if $(quiet),@echo " [AS] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(AS) $(ASFLAGS) -o $@ $< + +$(BUILD_PFX)%.S.d: %.S + $(if $(quiet),@echo " [DEP] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \ + --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@ + +$(BUILD_PFX)%.S.o: %.S + $(if $(quiet),@echo " [AS] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(AS) $(ASFLAGS) -o $@ $< + +.PRECIOUS: %.c.S +%.c.S: CFLAGS += -DINLINE_ASM +$(BUILD_PFX)%.c.S: %.c + $(if $(quiet),@echo " [GEN] $@") + $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) + $(qexec)$(CC) -S $(CFLAGS) -o $@ $< + +.PRECIOUS: %.asm.S +$(BUILD_PFX)%.asm.S: %.asm + $(if $(quiet),@echo " [ASM CONVERSION] $@") + $(qexec)mkdir -p $(dir $@) + $(qexec)$(ASM_CONVERSION) <$< >$@ + +# If we're in debug mode, pretend we don't have GNU strip, to fall back to +# the copy implementation +HAVE_GNU_STRIP := $(if $(CONFIG_DEBUG),,$(HAVE_GNU_STRIP)) +ifeq ($(HAVE_GNU_STRIP),yes) +# Older binutils strip global symbols not needed for relocation processing +# when given --strip-unneeded. Using nm and awk to identify globals and +# keep them caused command line length issues under mingw and segfaults in +# test_libvpx were observed under OS/2: simply use --strip-debug. +%.a: %_g.a + $(if $(quiet),@echo " [STRIP] $@ < $<") + $(qexec)$(STRIP) --strip-debug \ + -o $@ $< +else +%.a: %_g.a + $(if $(quiet),@echo " [CP] $@ < $<") + $(qexec)cp $< $@ +endif + +# +# Utility functions +# +pairmap=$(if $(strip $(2)),\ + $(call $(1),$(word 1,$(2)),$(word 2,$(2)))\ + $(call pairmap,$(1),$(wordlist 3,$(words $(2)),$(2)))\ +) + +enabled=$(filter-out $($(1)-no),$($(1)-yes)) +cond_enabled=$(if $(filter yes,$($(1))), $(call enabled,$(2))) + +find_file1=$(word 1,$(wildcard $(subst //,/,$(addsuffix /$(1),$(2))))) +find_file=$(foreach f,$(1),$(call find_file1,$(strip $(f)),$(strip $(2))) ) +obj_pats=.c=.c.o $(AS_SFX)=$(AS_SFX).o .cc=.cc.o .cpp=.cpp.o +objs=$(addprefix $(BUILD_PFX),$(foreach p,$(obj_pats),$(filter %.o,$(1:$(p))) )) + +install_map_templates=$(eval $(call install_map_template,$(1),$(2))) + +not=$(subst yes,no,$(1)) + +ifeq ($(CONFIG_MSVS),yes) +lib_file_name=$(1).lib +else +lib_file_name=lib$(1).a +endif +# +# Rule Templates +# +define linker_template +$(1): $(filter-out -%,$(2)) +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(LD) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs)) +endef +define linkerxx_template +$(1): $(filter-out -%,$(2)) +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(CXX) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs)) +endef +# make-3.80 has a bug with expanding large input strings to the eval function, +# which was triggered in some cases by the following component of +# linker_template: +# $(1): $$(call find_file, $(patsubst -l%,lib%.a,$(filter -l%,$(2))),\ +# $$(patsubst -L%,%,$$(filter -L%,$$(LDFLAGS) $(2)))) +# This may be useful to revisit in the future (it tries to locate libraries +# in a search path and add them as prerequisites + +define install_map_template +$(DIST_DIR)/$(1): $(2) + $(if $(quiet),@echo " [INSTALL] $$@") + $(qexec)mkdir -p $$(dir $$@) + $(qexec)cp -p $$< $$@ +endef + +define archive_template +# Not using a pattern rule here because we don't want to generate empty +# archives when they are listed as a dependency in files not responsible +# for creating them. +$(1): + $(if $(quiet),@echo " [AR] $$@") + $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ +endef + +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. +NO_UNDEFINED := -Wl,-z,defs +ifeq ($(findstring clang,$(CC)),clang) + ifneq ($(filter -fsanitize=%,$(LDFLAGS)),) + NO_UNDEFINED := + endif +endif + +define so_template +# Not using a pattern rule here because we don't want to generate empty +# archives when they are listed as a dependency in files not responsible +# for creating them. +# +# This needs further abstraction for dealing with non-GNU linkers. +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(LD) -shared $$(LDFLAGS) \ + $(NO_UNDEFINED) \ + -Wl,-soname,$$(SONAME) \ + -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \ + $$(filter %.o,$$^) $$(extralibs) +endef + +define dl_template +# Not using a pattern rule here because we don't want to generate empty +# archives when they are listed as a dependency in files not responsible +# for creating them. +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(LD) -dynamiclib $$(LDFLAGS) \ + -exported_symbols_list $$(EXPORTS_FILE) \ + -Wl,-headerpad_max_install_names,-compatibility_version,1.0,-current_version,$$(VERSION_MAJOR) \ + -o $$@ \ + $$(filter %.o,$$^) $$(extralibs) +endef + +define dll_template +# Not using a pattern rule here because we don't want to generate empty +# archives when they are listed as a dependency in files not responsible +# for creating them. +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(LD) -Zdll $$(LDFLAGS) \ + -o $$@ \ + $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE) +endef + + +# +# Get current configuration +# +ifneq ($(target),) +include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk +endif + +skip_deps := $(filter %clean,$(MAKECMDGOALS)) +skip_deps += $(findstring testdata,$(MAKECMDGOALS)) +ifeq ($(strip $(skip_deps)),) + ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes) + # Older versions of make don't like -include directives with no arguments + ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),) + -include $(filter %.d,$(OBJS-yes:.o=.d)) + endif + endif +endif + +# +# Configuration dependent rules +# +$(call pairmap,install_map_templates,$(INSTALL_MAPS)) + +DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,DOCS) +.docs: $(DOCS) + @touch $@ + +INSTALL-DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,INSTALL-DOCS) +ifeq ($(MAKECMDGOALS),dist) +INSTALL-DOCS+=$(call cond_enabled,CONFIG_INSTALL_DOCS,DIST-DOCS) +endif +.install-docs: .docs $(addprefix $(DIST_DIR)/,$(INSTALL-DOCS)) + @touch $@ + +clean:: + rm -f .docs .install-docs $(DOCS) + +BINS=$(call enabled,BINS) +.bins: $(BINS) + @touch $@ + +INSTALL-BINS=$(call cond_enabled,CONFIG_INSTALL_BINS,INSTALL-BINS) +ifeq ($(MAKECMDGOALS),dist) +INSTALL-BINS+=$(call cond_enabled,CONFIG_INSTALL_BINS,DIST-BINS) +endif +.install-bins: .bins $(addprefix $(DIST_DIR)/,$(INSTALL-BINS)) + @touch $@ + +clean:: + rm -f .bins .install-bins $(BINS) + +LIBS=$(call enabled,LIBS) +.libs: $(LIBS) + @touch $@ +$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib)))) +$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib)))) +$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib)))) +$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib)))) + +INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS) +ifeq ($(MAKECMDGOALS),dist) +INSTALL-LIBS+=$(call cond_enabled,CONFIG_INSTALL_LIBS,DIST-LIBS) +endif +.install-libs: .libs $(addprefix $(DIST_DIR)/,$(INSTALL-LIBS)) + @touch $@ + +clean:: + rm -f .libs .install-libs $(LIBS) + +ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +PROJECTS=$(call enabled,PROJECTS) +.projects: $(PROJECTS) + @touch $@ + +INSTALL-PROJECTS=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,INSTALL-PROJECTS) +ifeq ($(MAKECMDGOALS),dist) +INSTALL-PROJECTS+=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,DIST-PROJECTS) +endif +.install-projects: .projects $(addprefix $(DIST_DIR)/,$(INSTALL-PROJECTS)) + @touch $@ + +clean:: + rm -f .projects .install-projects $(PROJECTS) +endif + +# If there are any source files to be distributed, then include the build +# system too. +ifneq ($(call enabled,DIST-SRCS),) + DIST-SRCS-yes += configure + DIST-SRCS-yes += build/make/configure.sh + DIST-SRCS-yes += build/make/gen_asm_deps.sh + DIST-SRCS-yes += build/make/Makefile + DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_def.sh + DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh + DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_vcxproj.sh + DIST-SRCS-$(CONFIG_MSVS) += build/make/msvs_common.sh + DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas_apple.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2armasm_ms.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/thumb.pm + DIST-SRCS-yes += $(target:-$(TOOLCHAIN)=).mk +endif +INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS) +ifeq ($(MAKECMDGOALS),dist) +INSTALL-SRCS += $(call cond_enabled,CONFIG_INSTALL_SRCS,DIST-SRCS) +endif +.install-srcs: $(addprefix $(DIST_DIR)/src/,$(INSTALL-SRCS)) + @touch $@ + +clean:: + rm -f .install-srcs + +ifeq ($(CONFIG_EXTERNAL_BUILD),yes) + BUILD_TARGETS += .projects + INSTALL_TARGETS += .install-projects +endif +BUILD_TARGETS += .docs .libs .bins +INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins +all: $(BUILD_TARGETS) +install:: $(INSTALL_TARGETS) +dist: $(INSTALL_TARGETS) +test: + +.SUFFIXES: # Delete default suffix rules diff --git a/media/libvpx/libvpx/build/make/ads2armasm_ms.pl b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl new file mode 100755 index 0000000000..dd4e0318c4 --- /dev/null +++ b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2013 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +use FindBin; +use lib $FindBin::Bin; +use thumb; + +print "; This file was created from a .asm file\n"; +print "; using the ads2armasm_ms.pl script.\n"; + +while () +{ + undef $comment; + undef $line; + + s/REQUIRE8//; + s/PRESERVE8//; + s/^\s*ARM\s*$//; + s/AREA\s+\|\|(.*)\|\|/AREA |$1|/; + s/qsubaddx/qsax/i; + s/qaddsubx/qasx/i; + + thumb::FixThumbInstructions($_); + + s/ldrneb/ldrbne/i; + s/ldrneh/ldrhne/i; + s/^(\s*)ENDP.*/$&\n$1ALIGN 4/; + + print; +} + diff --git a/media/libvpx/libvpx/build/make/ads2gas.pl b/media/libvpx/libvpx/build/make/ads2gas.pl new file mode 100755 index 0000000000..c301b7f829 --- /dev/null +++ b/media/libvpx/libvpx/build/make/ads2gas.pl @@ -0,0 +1,157 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +# ads2gas.pl +# Author: Eric Fung (efung (at) acm.org) +# +# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format +# +# Usage: cat inputfile | perl ads2gas.pl > outputfile +# + +use FindBin; +use lib $FindBin::Bin; +use thumb; + +my $thumb = 0; +my $elf = 1; + +foreach my $arg (@ARGV) { + $thumb = 1 if ($arg eq "-thumb"); + $elf = 0 if ($arg eq "-noelf"); +} + +print "@ This file was created from a .asm file\n"; +print "@ using the ads2gas.pl script.\n"; +print ".syntax unified\n"; +if ($thumb) { + print "\t.thumb\n"; +} + +# Stack of procedure names. +@proc_stack = (); + +while () +{ + # Load and store alignment + s/@/,:/g; + + # Comment character + s/;/@/; + + # Convert ELSE to .else + s/\bELSE\b/.else/g; + + # Convert ENDIF to .endif + s/\bENDIF\b/.endif/g; + + # Convert IF to .if + if (s/\bIF\b/.if/g) { + s/=+/==/g; + } + + # Convert INCLUDE to .INCLUDE "file" + s/INCLUDE\s?(.*)$/.include \"$1\"/; + + # No AREA required + # But ALIGNs in AREA must be obeyed + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; + # If no ALIGN, strip the AREA and align to 4 bytes + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; + + # Make function visible to linker. + if ($elf) { + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/; + } else { + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/; + } + + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; + + # Labels need trailing colon + s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; + + # ALIGN directive + s/\bALIGN\b/.balign/g; + + if ($thumb) { + # ARM code - we force everything to thumb with the declaration in the + # header + s/\bARM\b//g; + } else { + # ARM code + s/\bARM\b/.arm/g; + } + + # push/pop + s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g; + s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g; + + if ($thumb) { + thumb::FixThumbInstructions($_); + } + + # eabi_attributes numerical equivalents can be found in the + # "ARM IHI 0045C" document. + + if ($elf) { + # REQUIRE8 Stack is required to be 8-byte aligned + s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + + # PRESERVE8 Stack 8-byte align is preserved + s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + } else { + s/\bREQUIRE8\b//; + s/\bPRESERVE8\b//; + } + + # Use PROC and ENDP to give the symbols a .size directive. + # This makes them show up properly in debugging tools like gdb and valgrind. + if (/\bPROC\b/) { + my $proc; + # Match the function name so it can be stored in $proc + /^([\.0-9A-Z_a-z]\w+)\b/; + $proc = $1; + push(@proc_stack, $proc) if ($proc); + s/\bPROC\b/@ $&/; + } + + if (/\bENDP\b/) { + my $proc; + s/\bENDP\b/@ $&/; + $proc = pop(@proc_stack); + $_ = ".size $proc, .-$proc".$_ if ($proc and $elf); + } + + # EQU directive + s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/; + + # Begin macro definition + if (/\bMACRO\b/) { + # Process next line down, which will be the macro definition + $_ = ; + s/^/.macro/; + s/\$//g; # Remove $ from the variables in the declaration + } + + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition + + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + next if /^\s*END\s*$/; + s/[ \t]+$//; + print; +} + +# Mark that this object doesn't need an executable stack. +printf (" .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/media/libvpx/libvpx/build/make/ads2gas_apple.pl b/media/libvpx/libvpx/build/make/ads2gas_apple.pl new file mode 100755 index 0000000000..62491c1918 --- /dev/null +++ b/media/libvpx/libvpx/build/make/ads2gas_apple.pl @@ -0,0 +1,114 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +# ads2gas_apple.pl +# Author: Eric Fung (efung (at) acm.org) +# +# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format +# +# Usage: cat inputfile | perl ads2gas_apple.pl > outputfile +# + +print "@ This file was created from a .asm file\n"; +print "@ using the ads2gas_apple.pl script.\n\n"; +print ".syntax unified\n"; + +my %macro_aliases; + +my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9"); + +my @incoming_array; + +# Perl trim function to remove whitespace from the start and end of the string +sub trim($) +{ + my $string = shift; + $string =~ s/^\s+//; + $string =~ s/\s+$//; + return $string; +} + +while () +{ + # Load and store alignment + s/@/,:/g; + + # Comment character + s/;/@/; + + # Convert ELSE to .else + s/\bELSE\b/.else/g; + + # Convert ENDIF to .endif + s/\bENDIF\b/.endif/g; + + # Convert IF to .if + if (s/\bIF\b/.if/g) { + s/=+/==/g; + } + + # Convert INCLUDE to .INCLUDE "file" + s/INCLUDE\s?(.*)$/.include \"$1\"/; + + # No AREA required + # But ALIGNs in AREA must be obeyed + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; + # If no ALIGN, strip the AREA and align to 4 bytes + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; + + # Make function visible to linker. + s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; + + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; + + # Labels and functions need a leading underscore and trailing colon + s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/; + + # Branches need to call the correct, underscored, function + s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/; + + # ALIGN directive + s/\bALIGN\b/.balign/g; + + # Strip ARM + s/\s+ARM//; + + # Strip REQUIRE8 + s/\s+REQUIRE8//; + + # Strip PRESERVE8 + s/\s+PRESERVE8//; + + # Strip PROC and ENDPROC + s/\bPROC\b//g; + s/\bENDP\b//g; + + # EQU directive + s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/; + + # Begin macro definition + if (/\bMACRO\b/) { + # Process next line down, which will be the macro definition + $_ = ; + s/^/.macro/; + s/\$//g; # Remove $ from the variables in the declaration + } + + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition + + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + next if /^\s*END\s*$/; + s/[ \t]+$//; + print; +} diff --git a/media/libvpx/libvpx/build/make/armlink_adapter.sh b/media/libvpx/libvpx/build/make/armlink_adapter.sh new file mode 100755 index 0000000000..75c342e97c --- /dev/null +++ b/media/libvpx/libvpx/build/make/armlink_adapter.sh @@ -0,0 +1,54 @@ +#!/bin/sh +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +verbose=0 +set -- $* +for i; do + if [ "$i" = "-o" ]; then + on_of=1 + elif [ "$i" = "-v" ]; then + verbose=1 + elif [ "$i" = "-g" ]; then + args="${args} --debug" + elif [ "$on_of" = "1" ]; then + outfile=$i + on_of=0 + elif [ -f "$i" ]; then + infiles="$infiles $i" + elif [ "${i#-l}" != "$i" ]; then + libs="$libs ${i#-l}" + elif [ "${i#-L}" != "$i" ]; then + libpaths="${libpaths} ${i#-L}" + else + args="${args} ${i}" + fi + shift +done + +# Absolutize library file names +for f in $libs; do + found=0 + for d in $libpaths; do + [ -f "$d/$f" ] && infiles="$infiles $d/$f" && found=1 && break + [ -f "$d/lib${f}.so" ] && infiles="$infiles $d/lib${f}.so" && found=1 && break + [ -f "$d/lib${f}.a" ] && infiles="$infiles $d/lib${f}.a" && found=1 && break + done + [ $found -eq 0 ] && infiles="$infiles $f" +done +for d in $libpaths; do + [ -n "$libsearchpath" ] && libsearchpath="${libsearchpath}," + libsearchpath="${libsearchpath}$d" +done + +cmd="armlink $args --userlibpath=$libsearchpath --output=$outfile $infiles" +[ $verbose -eq 1 ] && echo $cmd +$cmd diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh new file mode 100644 index 0000000000..869793a296 --- /dev/null +++ b/media/libvpx/libvpx/build/make/configure.sh @@ -0,0 +1,1703 @@ +#!/bin/sh +## +## configure.sh +## +## This script is sourced by the main configure script and contains +## utility functions and other common bits that aren't strictly libvpx +## related. +## +## This build system is based in part on the FFmpeg configure script. +## + + +# +# Logging / Output Functions +# +die_unknown(){ + echo "Unknown option \"$1\"." + echo "See $0 --help for available options." + clean_temp_files + exit 1 +} + +die() { + echo "$@" + echo + echo "Configuration failed. This could reflect a misconfiguration of your" + echo "toolchains, improper options selected, or another problem. If you" + echo "don't see any useful error messages above, the next step is to look" + echo "at the configure error log file ($logfile) to determine what" + echo "configure was trying to do when it died." + clean_temp_files + exit 1 +} + +log(){ + echo "$@" >>$logfile +} + +log_file(){ + log BEGIN $1 + cat -n $1 >>$logfile + log END $1 +} + +log_echo() { + echo "$@" + log "$@" +} + +fwrite () { + outfile=$1 + shift + echo "$@" >> ${outfile} +} + +show_help_pre(){ + for opt in ${CMDLINE_SELECT}; do + opt2=`echo $opt | sed -e 's;_;-;g'` + if enabled $opt; then + eval "toggle_${opt}=\"--disable-${opt2}\"" + else + eval "toggle_${opt}=\"--enable-${opt2} \"" + fi + done + + cat <>${logfile} 2>&1 +} + +check_cc() { + log check_cc "$@" + cat >${TMP_C} + log_file ${TMP_C} + check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C} +} + +check_cxx() { + log check_cxx "$@" + cat >${TMP_CC} + log_file ${TMP_CC} + check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC} +} + +check_cpp() { + log check_cpp "$@" + cat > ${TMP_C} + log_file ${TMP_C} + check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C} +} + +check_ld() { + log check_ld "$@" + check_cc $@ \ + && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs} +} + +check_lib() { + log check_lib "$@" + check_cc $@ \ + && check_cmd ${LD} ${LDFLAGS} -o ${TMP_X} ${TMP_O} "$@" ${extralibs} +} + +check_header(){ + log check_header "$@" + header=$1 + shift + var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` + disable_feature $var + check_cpp "$@" <${TMP_ASM} <${TMP_X} + log_file ${TMP_X} + if ! grep -q '\.rodata .* 16$' ${TMP_X}; then + die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)" + fi +} + +# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used. +check_gcc_machine_option() { + opt="$1" + feature="$2" + [ -n "$feature" ] || feature="$opt" + + if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature " + else + soft_enable "$feature" + fi +} + +# tests for -m$2, -m$3, -m$4... toggling the feature given in $1. +check_gcc_machine_options() { + feature="$1" + shift + flags="-m$1" + shift + for opt in $*; do + flags="$flags -m$opt" + done + + if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature " + else + soft_enable "$feature" + fi +} + +check_gcc_avx512_compiles() { + if disabled gcc; then + return + fi + + check_cc -mavx512f < +void f(void) { + __m512i x = _mm512_set1_epi16(0); + (void)x; +} +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling avx512: not supported by compiler" + disable_feature avx512 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + fi +} + +check_inline_asm() { + log check_inline_asm "$@" + name="$1" + code="$2" + shift 2 + disable_feature $name + check_cc "$@" <> config.mk + echo "TOOLCHAIN := ${toolchain}" >> config.mk + + case ${toolchain} in + *-linux-rvct) + echo "ALT_LIBC := ${alt_libc}" >> config.mk + ;; + esac +} + +write_common_config_targets() { + for t in ${all_targets}; do + if enabled ${t}; then + if enabled child; then + fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}" + else + fwrite config.mk "ALL_TARGETS += ${t}" + fi + fi + true; + done + true +} + +write_common_target_config_mk() { + saved_CC="${CC}" + saved_CXX="${CXX}" + enabled ccache && CC="ccache ${CC}" + enabled ccache && CXX="ccache ${CXX}" + print_webm_license $1 "##" "" + + cat >> $1 << EOF +# This file automatically generated by configure. Do not edit! +SRC_PATH="$source_path_mk" +SRC_PATH_BARE=$source_path_mk +BUILD_PFX=${BUILD_PFX} +TOOLCHAIN=${toolchain} +ASM_CONVERSION=${asm_conversion_cmd:-${source_path_mk}/build/make/ads2gas.pl} +GEN_VCPROJ=${gen_vcproj_cmd} +MSVS_ARCH_DIR=${msvs_arch_dir} + +CC=${CC} +CXX=${CXX} +AR=${AR} +LD=${LD} +AS=${AS} +STRIP=${STRIP} +NM=${NM} + +CFLAGS = ${CFLAGS} +CXXFLAGS = ${CXXFLAGS} +ARFLAGS = -crs\$(if \$(quiet),,v) +LDFLAGS = ${LDFLAGS} +ASFLAGS = ${ASFLAGS} +extralibs = ${extralibs} +AS_SFX = ${AS_SFX:-.asm} +EXE_SFX = ${EXE_SFX} +VCPROJ_SFX = ${VCPROJ_SFX} +RTCD_OPTIONS = ${RTCD_OPTIONS} +LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS} +LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS} +EOF + + if enabled rvct; then cat >> $1 << EOF +fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide +EOF + else cat >> $1 << EOF +fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;' +EOF + fi + + print_config_mk VPX_ARCH "${1}" ${ARCH_LIST} + print_config_mk HAVE "${1}" ${HAVE_LIST} + print_config_mk CONFIG "${1}" ${CONFIG_LIST} + print_config_mk HAVE "${1}" gnu_strip + + enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}" + + CC="${saved_CC}" + CXX="${saved_CXX}" +} + +write_common_target_config_h() { + print_webm_license ${TMP_H} "/*" " */" + cat >> ${TMP_H} << EOF +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT ${RESTRICT} +#define INLINE ${INLINE} +EOF + print_config_h VPX_ARCH "${TMP_H}" ${ARCH_LIST} + print_config_h HAVE "${TMP_H}" ${HAVE_LIST} + print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST} + print_config_vars_h "${TMP_H}" ${VAR_LIST} + echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H} + mkdir -p `dirname "$1"` + cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" +} + +write_win_arm64_neon_h_workaround() { + print_webm_license ${TMP_H} "/*" " */" + cat >> ${TMP_H} << EOF +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_WIN_ARM_NEON_H_WORKAROUND +#define VPX_WIN_ARM_NEON_H_WORKAROUND +/* The Windows SDK has arm_neon.h, but unlike on other platforms it is + * ARM32-only. ARM64 NEON support is provided by arm64_neon.h, a proper + * superset of arm_neon.h. Work around this by providing a more local + * arm_neon.h that simply #includes arm64_neon.h. + */ +#include +#endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */ +EOF + mkdir -p `dirname "$1"` + cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" +} + +process_common_cmdline() { + for opt in "$@"; do + optval="${opt#*=}" + case "$opt" in + --child) + enable_feature child + ;; + --log*) + logging="$optval" + if ! disabled logging ; then + enabled logging || logfile="$logging" + else + logfile=/dev/null + fi + ;; + --target=*) + toolchain="${toolchain:-${optval}}" + ;; + --force-target=*) + toolchain="${toolchain:-${optval}}" + enable_feature force_toolchain + ;; + --cpu=*) + tune_cpu="$optval" + ;; + --extra-cflags=*) + extra_cflags="${optval}" + ;; + --extra-cxxflags=*) + extra_cxxflags="${optval}" + ;; + --enable-?*|--disable-?*) + eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` + if is_in ${option} ${ARCH_EXT_LIST}; then + [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} " + elif [ $action = "disable" ] && ! disabled $option ; then + is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt + log_echo " disabling $option" + elif [ $action = "enable" ] && ! enabled $option ; then + is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt + log_echo " enabling $option" + fi + ${action}_feature $option + ;; + --require-?*) + eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` + if is_in ${option} ${ARCH_EXT_LIST}; then + RTCD_OPTIONS="${RTCD_OPTIONS}${opt} " + else + die_unknown $opt + fi + ;; + --force-enable-?*|--force-disable-?*) + eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'` + ${action}_feature $option + ;; + --libc=*) + [ -d "${optval}" ] || die "Not a directory: ${optval}" + disable_feature builtin_libc + alt_libc="${optval}" + ;; + --as=*) + [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \ + || [ "${optval}" = auto ] \ + || die "Must be yasm, nasm or auto: ${optval}" + alt_as="${optval}" + ;; + --size-limit=*) + w="${optval%%x*}" + h="${optval##*x}" + VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}" + [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small." + [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \ + || die "Invalid size-limit: too big." + enable_feature size_limit + ;; + --prefix=*) + prefix="${optval}" + ;; + --libdir=*) + libdir="${optval}" + ;; + --libc|--as|--prefix|--libdir) + die "Option ${opt} requires argument" + ;; + --help|-h) + show_help + ;; + *) + die_unknown $opt + ;; + esac + done +} + +process_cmdline() { + for opt do + optval="${opt#*=}" + case "$opt" in + *) + process_common_cmdline $opt + ;; + esac + done +} + +post_process_common_cmdline() { + prefix="${prefix:-/usr/local}" + prefix="${prefix%/}" + libdir="${libdir:-${prefix}/lib}" + libdir="${libdir%/}" + if [ "${libdir#${prefix}}" = "${libdir}" ]; then + die "Libdir ${libdir} must be a subdirectory of ${prefix}" + fi +} + +post_process_cmdline() { + true; +} + +setup_gnu_toolchain() { + CC=${CC:-${CROSS}gcc} + CXX=${CXX:-${CROSS}g++} + AR=${AR:-${CROSS}ar} + LD=${LD:-${CROSS}${link_with_cc:-ld}} + AS=${AS:-${CROSS}as} + STRIP=${STRIP:-${CROSS}strip} + NM=${NM:-${CROSS}nm} + AS_SFX=.S + EXE_SFX= +} + +# Reliably find the newest available Darwin SDKs. (Older versions of +# xcrun don't support --show-sdk-path.) +show_darwin_sdk_path() { + xcrun --sdk $1 --show-sdk-path 2>/dev/null || + xcodebuild -sdk $1 -version Path 2>/dev/null +} + +# Print the major version number of the Darwin SDK specified by $1. +show_darwin_sdk_major_version() { + xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1 +} + +# Print the Xcode version. +show_xcode_version() { + xcodebuild -version | head -n1 | cut -d' ' -f2 +} + +# Fails when Xcode version is less than 6.3. +check_xcode_minimum_version() { + xcode_major=$(show_xcode_version | cut -f1 -d.) + xcode_minor=$(show_xcode_version | cut -f2 -d.) + xcode_min_major=6 + xcode_min_minor=3 + if [ ${xcode_major} -lt ${xcode_min_major} ]; then + return 1 + fi + if [ ${xcode_major} -eq ${xcode_min_major} ] \ + && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then + return 1 + fi +} + +process_common_toolchain() { + if [ -z "$toolchain" ]; then + gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" + # detect tgt_isa + case "$gcctarget" in + aarch64*) + tgt_isa=arm64 + ;; + armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf) + tgt_isa=armv7 + float_abi=hard + ;; + armv7*) + tgt_isa=armv7 + float_abi=softfp + ;; + *x86_64*|*amd64*) + tgt_isa=x86_64 + ;; + *i[3456]86*) + tgt_isa=x86 + ;; + *sparc*) + tgt_isa=sparc + ;; + power*64le*-*) + tgt_isa=ppc64le + ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; + loongarch32*) + tgt_isa=loongarch32 + ;; + loongarch64*) + tgt_isa=loongarch64 + ;; + esac + + # detect tgt_os + case "$gcctarget" in + *darwin1[0-9]*) + tgt_isa=x86_64 + tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` + ;; + *darwin2[0-3]*) + tgt_isa=`uname -m` + tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` + ;; + x86_64*mingw32*) + tgt_os=win64 + ;; + x86_64*cygwin*) + tgt_os=win64 + ;; + *mingw32*|*cygwin*) + [ -z "$tgt_isa" ] && tgt_isa=x86 + tgt_os=win32 + ;; + *linux*|*bsd*) + tgt_os=linux + ;; + *solaris2.10) + tgt_os=solaris + ;; + *os2*) + tgt_os=os2 + ;; + esac + + if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then + toolchain=${tgt_isa}-${tgt_os}-gcc + fi + fi + + toolchain=${toolchain:-generic-gnu} + + is_in ${toolchain} ${all_platforms} || enabled force_toolchain \ + || die "Unrecognized toolchain '${toolchain}'" + + enabled child || log_echo "Configuring for target '${toolchain}'" + + # + # Set up toolchain variables + # + tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}') + tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}') + tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}') + + # Mark the specific ISA requested as enabled + soft_enable ${tgt_isa} + enable_feature ${tgt_os} + enable_feature ${tgt_cc} + + # Enable the architecture family + case ${tgt_isa} in + arm64 | armv8) + enable_feature arm + enable_feature aarch64 + ;; + arm*) + enable_feature arm + ;; + mips*) + enable_feature mips + ;; + ppc*) + enable_feature ppc + ;; + loongarch*) + soft_enable lsx + soft_enable lasx + enable_feature loongarch + ;; + esac + + # Position independent code (PIC) is probably what we want when building + # shared libs or position independent executable (PIE) targets. + enabled shared && soft_enable pic + check_cpp << EOF || soft_enable pic +#if !(__pie__ || __PIE__) +#error Neither __pie__ or __PIE__ are set +#endif +EOF + + # Minimum iOS version for all target platforms (darwin and iphonesimulator). + # Shared library framework builds are only possible on iOS 8 and later. + if enabled shared; then + IOS_VERSION_OPTIONS="--enable-shared" + IOS_VERSION_MIN="8.0" + else + IOS_VERSION_OPTIONS="" + IOS_VERSION_MIN="7.0" + fi + + # Handle darwin variants. Newer SDKs allow targeting older + # platforms, so use the newest one available. + case ${toolchain} in + arm*-darwin-*) + add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" + iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)" + if [ -d "${iphoneos_sdk_dir}" ]; then + add_cflags "-isysroot ${iphoneos_sdk_dir}" + add_ldflags "-isysroot ${iphoneos_sdk_dir}" + fi + ;; + *-darwin*) + osx_sdk_dir="$(show_darwin_sdk_path macosx)" + if [ -d "${osx_sdk_dir}" ]; then + add_cflags "-isysroot ${osx_sdk_dir}" + add_ldflags "-isysroot ${osx_sdk_dir}" + fi + ;; + esac + + case ${toolchain} in + *-darwin8-*) + add_cflags "-mmacosx-version-min=10.4" + add_ldflags "-mmacosx-version-min=10.4" + ;; + *-darwin9-*) + add_cflags "-mmacosx-version-min=10.5" + add_ldflags "-mmacosx-version-min=10.5" + ;; + *-darwin10-*) + add_cflags "-mmacosx-version-min=10.6" + add_ldflags "-mmacosx-version-min=10.6" + ;; + *-darwin11-*) + add_cflags "-mmacosx-version-min=10.7" + add_ldflags "-mmacosx-version-min=10.7" + ;; + *-darwin12-*) + add_cflags "-mmacosx-version-min=10.8" + add_ldflags "-mmacosx-version-min=10.8" + ;; + *-darwin13-*) + add_cflags "-mmacosx-version-min=10.9" + add_ldflags "-mmacosx-version-min=10.9" + ;; + *-darwin14-*) + add_cflags "-mmacosx-version-min=10.10" + add_ldflags "-mmacosx-version-min=10.10" + ;; + *-darwin15-*) + add_cflags "-mmacosx-version-min=10.11" + add_ldflags "-mmacosx-version-min=10.11" + ;; + *-darwin16-*) + add_cflags "-mmacosx-version-min=10.12" + add_ldflags "-mmacosx-version-min=10.12" + ;; + *-darwin17-*) + add_cflags "-mmacosx-version-min=10.13" + add_ldflags "-mmacosx-version-min=10.13" + ;; + *-darwin18-*) + add_cflags "-mmacosx-version-min=10.14" + add_ldflags "-mmacosx-version-min=10.14" + ;; + *-darwin19-*) + add_cflags "-mmacosx-version-min=10.15" + add_ldflags "-mmacosx-version-min=10.15" + ;; + *-darwin2[0-2]-*) + add_cflags "-arch ${toolchain%%-*}" + add_ldflags "-arch ${toolchain%%-*}" + ;; + *-iphonesimulator-*) + add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" + add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" + iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)" + if [ -d "${iossim_sdk_dir}" ]; then + add_cflags "-isysroot ${iossim_sdk_dir}" + add_ldflags "-isysroot ${iossim_sdk_dir}" + fi + ;; + esac + + # Handle Solaris variants. Solaris 10 needs -lposix4 + case ${toolchain} in + sparc-solaris-*) + add_extralibs -lposix4 + ;; + *-solaris-*) + add_extralibs -lposix4 + ;; + esac + + # Process architecture variants + case ${toolchain} in + arm*) + soft_enable runtime_cpu_detect + # Arm ISA extensions are treated as supersets. + case ${tgt_isa} in + arm64|armv8) + for ext in ${ARCH_EXT_LIST_AARCH64}; do + # Disable higher order extensions to simplify dependencies. + if [ "$disable_exts" = "yes" ]; then + if ! disabled $ext; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + disable_feature $ext + fi + elif disabled $ext; then + disable_exts="yes" + else + soft_enable $ext + fi + done + ;; + armv7|armv7s) + soft_enable neon + # Only enable neon_asm when neon is also enabled. + enabled neon && soft_enable neon_asm + # If someone tries to force it through, die. + if disabled neon && enabled neon_asm; then + die "Disabling neon while keeping neon-asm is not supported" + fi + ;; + esac + + asm_conversion_cmd="cat" + + case ${tgt_cc} in + gcc) + link_with_cc=gcc + setup_gnu_toolchain + arch_int=${tgt_isa##armv} + arch_int=${arch_int%%te} + tune_cflags="-mtune=" + if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then + if [ -z "${float_abi}" ]; then + check_cpp <&- || \ + die "Couldn't find CodeSourcery GCC from PATH" + + # Use armcc as a linker to enable translation of + # some gcc specific options such as -lm and -lpthread. + LD="armcc --translate_gcc" + + # create configuration file (uses path to CodeSourcery GCC) + armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg + + add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg + add_asflags --no_hide_all --apcs=/interwork + add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg + enabled pic && add_cflags --apcs=/fpic + enabled pic && add_asflags --apcs=/fpic + enabled shared && add_cflags --shared + fi + ;; + esac + ;; + mips*) + link_with_cc=gcc + setup_gnu_toolchain + tune_cflags="-mtune=" + if enabled dspr2; then + check_add_cflags -mips32r2 -mdspr2 + fi + + if enabled runtime_cpu_detect; then + disable_feature runtime_cpu_detect + fi + + if [ -n "${tune_cpu}" ]; then + case ${tune_cpu} in + p5600) + check_add_cflags -mips32r5 -mload-store-pairs + check_add_cflags -msched-weight -mhard-float -mfp64 + check_add_asflags -mips32r5 -mhard-float -mfp64 + check_add_ldflags -mfp64 + ;; + i6400|p6600) + check_add_cflags -mips64r6 -mabi=64 -msched-weight + check_add_cflags -mload-store-pairs -mhard-float -mfp64 + check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64 + check_add_ldflags -mips64r6 -mabi=64 -mfp64 + ;; + loongson3*) + check_cflags -march=loongson3a && soft_enable mmi \ + || disable_feature mmi + check_cflags -mmsa && soft_enable msa \ + || disable_feature msa + tgt_isa=loongson3a + ;; + esac + + if enabled mmi || enabled msa; then + soft_enable runtime_cpu_detect + fi + + if enabled msa; then + # TODO(libyuv:793) + # The new mips functions in libyuv do not build + # with the toolchains we currently use for testing. + soft_disable libyuv + fi + fi + + check_add_cflags -march=${tgt_isa} + check_add_asflags -march=${tgt_isa} + check_add_asflags -KPIC + ;; + ppc64le*) + link_with_cc=gcc + setup_gnu_toolchain + # Do not enable vsx by default. + # https://bugs.chromium.org/p/webm/issues/detail?id=1522 + enabled vsx || RTCD_OPTIONS="${RTCD_OPTIONS}--disable-vsx " + if [ -n "${tune_cpu}" ]; then + case ${tune_cpu} in + power?) + tune_cflags="-mcpu=" + ;; + esac + fi + ;; + x86*) + case ${tgt_os} in + android) + soft_enable realtime_only + ;; + win*) + enabled gcc && add_cflags -fno-common + ;; + solaris*) + CC=${CC:-${CROSS}gcc} + CXX=${CXX:-${CROSS}g++} + LD=${LD:-${CROSS}gcc} + CROSS=${CROSS-g} + ;; + os2) + disable_feature pic + AS=${AS:-nasm} + add_ldflags -Zhigh-mem + ;; + esac + + AS="${alt_as:-${AS:-auto}}" + case ${tgt_cc} in + icc*) + CC=${CC:-icc} + LD=${LD:-icc} + setup_gnu_toolchain + add_cflags -use-msasm # remove -use-msasm too? + # add -no-intel-extensions to suppress warning #10237 + # refer to http://software.intel.com/en-us/forums/topic/280199 + add_ldflags -i-static -no-intel-extensions + enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div + enabled x86_64 && AR=xiar + case ${tune_cpu} in + atom*) + tune_cflags="-x" + tune_cpu="SSE3_ATOM" + ;; + *) + tune_cflags="-march=" + ;; + esac + ;; + gcc*) + link_with_cc=gcc + tune_cflags="-march=" + setup_gnu_toolchain + #for 32 bit x86 builds, -O3 did not turn on this flag + enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer + ;; + vs*) + msvs_arch_dir=x86-msvs + case ${tgt_cc##vs} in + 14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 + ;; + esac + ;; + esac + + bits=32 + enabled x86_64 && bits=64 + check_cpp < sse4 + check_gcc_machine_option ${ext%_*} $ext + fi + fi + done + + if enabled external_build; then + log_echo " skipping assembler detection" + else + case "${AS}" in + auto|"") + which nasm >/dev/null 2>&1 && AS=nasm + which yasm >/dev/null 2>&1 && AS=yasm + if [ "${AS}" = nasm ] ; then + # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit + # this check if they start shipping a compatible version. + apple=`nasm -v | grep "Apple"` + [ -n "${apple}" ] \ + && echo "Unsupported version of nasm: ${apple}" \ + && AS="" + fi + [ "${AS}" = auto ] || [ -z "${AS}" ] \ + && die "Neither yasm nor nasm have been found." \ + "See the prerequisites section in the README for more info." + ;; + esac + log_echo " using $AS" + fi + AS_SFX=.asm + case ${tgt_os} in + win32) + add_asflags -f win32 + enabled debug && add_asflags -g cv8 + EXE_SFX=.exe + ;; + win64) + add_asflags -f win64 + enabled debug && add_asflags -g cv8 + EXE_SFX=.exe + ;; + linux*|solaris*|android*) + add_asflags -f elf${bits} + enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2 + enabled debug && [ "${AS}" = nasm ] && add_asflags -g + [ "${AS##*/}" = nasm ] && check_asm_align + ;; + darwin*) + add_asflags -f macho${bits} + enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64" + add_cflags ${darwin_arch} + add_ldflags ${darwin_arch} + # -mdynamic-no-pic is still a bit of voodoo -- it was required at + # one time, but does not seem to be now, and it breaks some of the + # code that still relies on inline assembly. + # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic + enabled icc && ! enabled pic && add_cflags -fno-pic + ;; + iphonesimulator) + add_asflags -f macho${bits} + enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64" + add_cflags ${sim_arch} + add_ldflags ${sim_arch} + + if [ "$(disabled external_build)" ] && + [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then + # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it + # on is pointless (unless building a C-only lib). Warn the user, but + # do nothing here. + log "Warning: Bitcode embed disabled for simulator targets." + fi + ;; + os2) + add_asflags -f aout + enabled debug && add_asflags -g + EXE_SFX=.exe + ;; + *) + log "Warning: Unknown os $tgt_os while setting up $AS flags" + ;; + esac + ;; + loongarch*) + link_with_cc=gcc + setup_gnu_toolchain + + enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"' + enabled lsx && soft_enable runtime_cpu_detect + enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"' + enabled lasx && soft_enable runtime_cpu_detect + ;; + *-gcc|generic-gnu) + link_with_cc=gcc + enable_feature gcc + setup_gnu_toolchain + ;; + esac + + # Try to enable CPU specific tuning + if [ -n "${tune_cpu}" ]; then + if [ -n "${tune_cflags}" ]; then + check_add_cflags ${tune_cflags}${tune_cpu} || \ + die "Requested CPU '${tune_cpu}' not supported by compiler" + fi + if [ -n "${tune_asflags}" ]; then + check_add_asflags ${tune_asflags}${tune_cpu} || \ + die "Requested CPU '${tune_cpu}' not supported by assembler" + fi + if [ -z "${tune_cflags}${tune_asflags}" ]; then + log_echo "Warning: CPU tuning not supported by this toolchain" + fi + fi + + if enabled debug; then + check_add_cflags -g && check_add_ldflags -g + else + check_add_cflags -DNDEBUG + fi + + enabled gprof && check_add_cflags -pg && check_add_ldflags -pg + enabled gcov && + check_add_cflags -fprofile-arcs -ftest-coverage && + check_add_ldflags -fprofile-arcs -ftest-coverage + + if enabled optimizations; then + if enabled rvct; then + enabled small && check_add_cflags -Ospace || check_add_cflags -Otime + else + enabled small && check_add_cflags -O2 || check_add_cflags -O3 + fi + fi + + # Position Independent Code (PIC) support, for building relocatable + # shared objects + enabled gcc && enabled pic && check_add_cflags -fPIC + + # Work around longjmp interception on glibc >= 2.11, to improve binary + # compatibility. See http://code.google.com/p/webm/issues/detail?id=166 + enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 + + # Check for strip utility variant + ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip + + # Try to determine target endianness + check_cc </dev/null 2>&1 && enable_feature big_endian + + # Try to find which inline keywords are supported + check_cc < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF + ;; + esac + fi + + # only for MIPS platforms + case ${toolchain} in + mips*) + if enabled big_endian; then + if enabled dspr2; then + echo "dspr2 optimizations are available only for little endian platforms" + disable_feature dspr2 + fi + if enabled msa; then + echo "msa optimizations are available only for little endian platforms" + disable_feature msa + fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi + fi + ;; + esac + + # only for LOONGARCH platforms + case ${toolchain} in + loongarch*) + if enabled big_endian; then + if enabled lsx; then + echo "lsx optimizations are available only for little endian platforms" + disable_feature lsx + fi + if enabled lasx; then + echo "lasx optimizations are available only for little endian platforms" + disable_feature lasx + fi + fi + ;; + esac + + # glibc needs these + if enabled linux; then + add_cflags -D_LARGEFILE_SOURCE + add_cflags -D_FILE_OFFSET_BITS=64 + fi +} + +process_toolchain() { + process_common_toolchain +} + +print_config_mk() { + saved_prefix="${prefix}" + prefix=$1 + makefile=$2 + shift 2 + for cfg; do + if enabled $cfg; then + upname="`toupper $cfg`" + echo "${prefix}_${upname}=yes" >> $makefile + fi + done + prefix="${saved_prefix}" +} + +print_config_h() { + saved_prefix="${prefix}" + prefix=$1 + header=$2 + shift 2 + for cfg; do + upname="`toupper $cfg`" + if enabled $cfg; then + echo "#define ${prefix}_${upname} 1" >> $header + else + echo "#define ${prefix}_${upname} 0" >> $header + fi + done + prefix="${saved_prefix}" +} + +print_config_vars_h() { + header=$1 + shift + while [ $# -gt 0 ]; do + upname="`toupper $1`" + echo "#define ${upname} $2" >> $header + shift 2 + done +} + +print_webm_license() { + saved_prefix="${prefix}" + destination=$1 + prefix="$2" + suffix="$3" + shift 3 + cat < ${destination} +${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix} +${prefix} ${suffix} +${prefix} Use of this source code is governed by a BSD-style license${suffix} +${prefix} that can be found in the LICENSE file in the root of the source${suffix} +${prefix} tree. An additional intellectual property rights grant can be found${suffix} +${prefix} in the file PATENTS. All contributing project authors may${suffix} +${prefix} be found in the AUTHORS file in the root of the source tree.${suffix} +EOF + prefix="${saved_prefix}" +} + +process_targets() { + true; +} + +process_detect() { + true; +} + +enable_feature logging +logfile="config.log" +self=$0 +process() { + cmdline_args="$@" + process_cmdline "$@" + if enabled child; then + echo "# ${self} $@" >> ${logfile} + else + echo "# ${self} $@" > ${logfile} + fi + post_process_common_cmdline + post_process_cmdline + process_toolchain + process_detect + process_targets + + OOT_INSTALLS="${OOT_INSTALLS}" + if enabled source_path_used; then + # Prepare the PWD for building. + for f in ${OOT_INSTALLS}; do + install -D "${source_path}/$f" "$f" + done + fi + cp "${source_path}/build/make/Makefile" . + + clean_temp_files + true +} diff --git a/media/libvpx/libvpx/build/make/gen_asm_deps.sh b/media/libvpx/libvpx/build/make/gen_asm_deps.sh new file mode 100755 index 0000000000..3bd4d125f1 --- /dev/null +++ b/media/libvpx/libvpx/build/make/gen_asm_deps.sh @@ -0,0 +1,64 @@ +#!/bin/sh +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +self=$0 +show_help() { + echo "usage: $self [options] " + echo + echo "Generate Makefile dependency information from assembly code source" + echo + exit 1 +} +die_unknown(){ + echo "Unknown option \"$1\"." + echo "See $0 --help for available options." + exit 1 +} +for opt do + optval="${opt#*=}" + case "$opt" in + --build-pfx=*) pfx="${optval}" + ;; + --depfile=*) out="${optval}" + ;; + -I*) raw_inc_paths="${raw_inc_paths} ${opt}" + inc_path="${inc_path} ${opt#-I}" + ;; + -h|--help) show_help + ;; + *) [ -f "$opt" ] && srcfile="$opt" + ;; + esac +done + +[ -n "$srcfile" ] || show_help +sfx=${sfx:-asm} +includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | + perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;") +#" restore editor state +for inc in ${includes}; do + found_inc_path= + for idir in ${inc_path}; do + [ -f "${idir}/${inc}" ] && found_inc_path="${idir}" && break + done + if [ -f `dirname $srcfile`/$inc ]; then + # Handle include files in the same directory as the source + $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} `dirname $srcfile`/$inc + elif [ -n "${found_inc_path}" ]; then + # Handle include files on the include path + $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} "${found_inc_path}/$inc" + else + # Handle generated includes in the build root (which may not exist yet) + echo ${out} ${out%d}o: "${pfx}${inc}" + fi +done +echo ${out} ${out%d}o: $srcfile diff --git a/media/libvpx/libvpx/build/make/gen_msvs_def.sh b/media/libvpx/libvpx/build/make/gen_msvs_def.sh new file mode 100755 index 0000000000..4defcc2e7c --- /dev/null +++ b/media/libvpx/libvpx/build/make/gen_msvs_def.sh @@ -0,0 +1,83 @@ +#!/bin/bash +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +self=$0 +self_basename=${self##*/} +EOL=$'\n' + +show_help() { + cat < symbol1 [symbol2, symbol3, ...] + +where is either 'text' or 'data' + + +Options: + --help Print this message + --out=filename Write output to a file [stdout] + --name=project_name Name of the library (required) +EOF + exit 1 +} + +die() { + echo "${self_basename}: $@" + exit 1 +} + +die_unknown(){ + echo "Unknown option \"$1\"." + echo "See ${self_basename} --help for available options." + exit 1 +} + +text() { + for sym in "$@"; do + echo " $sym" >> ${outfile} + done +} + +data() { + for sym in "$@"; do + printf " %-40s DATA\n" "$sym" >> ${outfile} + done +} + +# Process command line +for opt in "$@"; do + optval="${opt#*=}" + case "$opt" in + --help|-h) show_help + ;; + --out=*) outfile="$optval" + ;; + --name=*) name="${optval}" + ;; + -*) die_unknown $opt + ;; + *) file_list[${#file_list[@]}]="$opt" + esac +done +outfile=${outfile:-/dev/stdout} +[ -n "$name" ] || die "Library name (--name) must be specified!" + +echo "LIBRARY ${name}" > ${outfile} +echo "EXPORTS" >> ${outfile} +for f in "${file_list[@]}"; do + . $f +done diff --git a/media/libvpx/libvpx/build/make/gen_msvs_sln.sh b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh new file mode 100755 index 0000000000..0b312850fe --- /dev/null +++ b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh @@ -0,0 +1,255 @@ +#!/bin/bash +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +self=$0 +self_basename=${self##*/} +EOL=$'\n' +EOLDOS=$'\r' + +show_help() { + cat <&2 + [ -f "${outfile}" ] && rm -f ${outfile}{,.mk} + exit 1 +} + +die_unknown(){ + echo "Unknown option \"$1\"." >&2 + echo "See ${self_basename} --help for available options." >&2 + [ -f "${outfile}" ] && rm -f ${outfile}{,.mk} + exit 1 +} + +indent1=$'\t' +indent="" +indent_push() { + indent="${indent}${indent1}" +} +indent_pop() { + indent="${indent%${indent1}}" +} + +parse_project() { + local file=$1 + local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\).*,\1,'` + local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\).*,\1,'` + + # save the project GUID to a varaible, normalizing to the basename of the + # vcxproj file without the extension + local var + var=${file##*/} + var=${var%%.${sfx}} + eval "${var}_file=\"$1\"" + eval "${var}_name=$name" + eval "${var}_guid=$guid" + + cur_config_list=`grep -B1 'Label="Configuration"' $file | + grep Condition | cut -d\' -f4` + new_config_list=$(for i in $config_list $cur_config_list; do + echo $i + done | sort | uniq) + if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then + mixed_platforms=1 + fi + config_list="$new_config_list" + eval "${var}_config_list=\"$cur_config_list\"" + proj_list="${proj_list} ${var}" +} + +process_project() { + eval "local file=\${$1_file}" + eval "local name=\${$1_name}" + eval "local guid=\${$1_guid}" + + # save the project GUID to a varaible, normalizing to the basename of the + # vcproj file without the extension + local var + var=${file##*/} + var=${var%%.${sfx}} + eval "${var}_guid=$guid" + + echo "Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"$name\", \"$file\", \"$guid\"" + echo "EndProject" +} + +process_global() { + echo "Global" + indent_push + + # + # Solution Configuration Platforms + # + echo "${indent}GlobalSection(SolutionConfigurationPlatforms) = preSolution" + indent_push + IFS_bak=${IFS} + IFS=$'\r'$'\n' + if [ "$mixed_platforms" != "" ]; then + config_list=" +Release|Mixed Platforms +Debug|Mixed Platforms" + fi + for config in ${config_list}; do + echo "${indent}$config = $config" + done + IFS=${IFS_bak} + indent_pop + echo "${indent}EndGlobalSection" + + # + # Project Configuration Platforms + # + echo "${indent}GlobalSection(ProjectConfigurationPlatforms) = postSolution" + indent_push + for proj in ${proj_list}; do + eval "local proj_guid=\${${proj}_guid}" + eval "local proj_config_list=\${${proj}_config_list}" + IFS=$'\r'$'\n' + for config in ${proj_config_list}; do + if [ "$mixed_platforms" != "" ]; then + local c=${config%%|*} + echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}" + echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}" + else + echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}" + echo "${indent}${proj_guid}.${config}.Build.0 = ${config}" + fi + + done + IFS=${IFS_bak} + done + indent_pop + echo "${indent}EndGlobalSection" + + # + # Solution Properties + # + echo "${indent}GlobalSection(SolutionProperties) = preSolution" + indent_push + echo "${indent}HideSolutionNode = FALSE" + indent_pop + echo "${indent}EndGlobalSection" + + indent_pop + echo "EndGlobal" +} + +process_makefile() { + IFS_bak=${IFS} + IFS=$'\r'$'\n' + local TAB=$'\t' + cat </dev/null 2>&1 && echo yes) +.nodevenv.once: +${TAB}@echo " * \$(MSBUILD_TOOL) not found in path." +${TAB}@echo " * " +${TAB}@echo " * You will have to build all configurations manually using the" +${TAB}@echo " * Visual Studio IDE. To allow make to build them automatically," +${TAB}@echo " * add the Common7/IDE directory of your Visual Studio" +${TAB}@echo " * installation to your path, eg:" +${TAB}@echo " * C:\Program Files\Microsoft Visual Studio 10.0\Common7\IDE" +${TAB}@echo " * " +${TAB}@touch \$@ +CLEAN-OBJS += \$(if \$(found_devenv),,.nodevenv.once) + +EOF + + for sln_config in ${config_list}; do + local config=${sln_config%%|*} + local platform=${sln_config##*|} + local nows_sln_config=`echo $sln_config | sed -e 's/[^a-zA-Z0-9]/_/g'` + cat <${outfile} <>${outfile} +done +process_global >>${outfile} +process_makefile >${mkoutfile} diff --git a/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh new file mode 100755 index 0000000000..1e1db05bb2 --- /dev/null +++ b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh @@ -0,0 +1,508 @@ +#!/bin/bash +## +## Copyright (c) 2013 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +self=$0 +self_basename=${self##*/} +self_dirname=$(dirname "$0") + +. "$self_dirname/msvs_common.sh"|| exit 127 + +show_help() { + cat <${content}" + indent_pop + else + echo "${indent}<${tag}>${content}" + fi +} + +generate_filter() { + local name=$1 + local pats=$2 + local file_list_sz + local i + local f + local saveIFS="$IFS" + local pack + echo "generating filter '$name' from ${#file_list[@]} files" >&2 + IFS=* + + file_list_sz=${#file_list[@]} + for i in ${!file_list[@]}; do + f=${file_list[i]} + for pat in ${pats//;/$IFS}; do + if [ "${f##*.}" == "$pat" ]; then + unset file_list[i] + + objf=$(echo ${f%.*}.obj \ + | sed -e "s,$src_path_bare,," \ + -e 's/^[\./]\+//g' -e 's,[:/ ],_,g') + + if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $uses_asm; then + # Avoid object file name collisions, i.e. vpx_config.c and + # vpx_config.asm produce the same object file without + # this additional suffix. + objf=${objf%.obj}_asm.obj + open_tag CustomBuild \ + Include="$f" + for plat in "${platforms[@]}"; do + for cfg in Debug Release; do + tag_content Message "Assembling %(Filename)%(Extension)" \ + Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'" + tag_content Command "$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)$objf" \ + Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'" + tag_content Outputs "\$(IntDir)$objf" \ + Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'" + done + done + close_tag CustomBuild + elif [ "$pat" == "c" ] || \ + [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then + open_tag ClCompile \ + Include="$f" + # Separate file names with Condition? + tag_content ObjectFileName "\$(IntDir)$objf" + # Check for AVX and turn it on to avoid warnings. + if [[ $f =~ avx.?\.c$ ]]; then + tag_content AdditionalOptions "/arch:AVX" + fi + close_tag ClCompile + elif [ "$pat" == "h" ] ; then + tag ClInclude \ + Include="$f" + elif [ "$pat" == "vcxproj" ] ; then + open_tag ProjectReference \ + Include="$f" + depguid=`grep ProjectGuid "$f" | sed 's,.*<.*>\(.*\).*,\1,'` + tag_content Project "$depguid" + tag_content ReferenceOutputAssembly false + close_tag ProjectReference + else + tag None \ + Include="$f" + fi + + break + fi + done + done + + IFS="$saveIFS" +} + +# Process command line +unset target +for opt in "$@"; do + optval="${opt#*=}" + case "$opt" in + --help|-h) show_help + ;; + --target=*) + target="${optval}" + platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}') + case "$platform_toolset" in + clangcl) platform_toolset="ClangCl" + ;; + "") + ;; + *) die Unrecognized Visual Studio Platform Toolset in $opt + ;; + esac + ;; + --out=*) outfile="$optval" + ;; + --name=*) name="${optval}" + ;; + --proj-guid=*) guid="${optval}" + ;; + --module-def=*) module_def="${optval}" + ;; + --exe) proj_kind="exe" + ;; + --dll) proj_kind="dll" + ;; + --lib) proj_kind="lib" + ;; + --as=*) as="${optval}" + ;; + --src-path-bare=*) + src_path_bare=$(fix_path "$optval") + src_path_bare=${src_path_bare%/} + ;; + --static-crt) use_static_runtime=true + ;; + --enable-werror) werror=true + ;; + --ver=*) + vs_ver="$optval" + case "$optval" in + 1[4-7]) + ;; + *) die Unrecognized Visual Studio Version in $opt + ;; + esac + ;; + -I*) + opt=${opt##-I} + opt=$(fix_path "$opt") + opt="${opt%/}" + incs="${incs}${incs:+;}"${opt}"" + yasmincs="${yasmincs} -I"${opt}"" + ;; + -D*) defines="${defines}${defines:+;}${opt##-D}" + ;; + -L*) # fudge . to $(OutDir) + if [ "${opt##-L}" == "." ]; then + libdirs="${libdirs}${libdirs:+;}"\$(OutDir)"" + else + # Also try directories for this platform/configuration + opt=${opt##-L} + opt=$(fix_path "$opt") + libdirs="${libdirs}${libdirs:+;}"${opt}"" + libdirs="${libdirs}${libdirs:+;}"${opt}/\$(PlatformName)/\$(Configuration)"" + libdirs="${libdirs}${libdirs:+;}"${opt}/\$(PlatformName)"" + fi + ;; + -l*) libs="${libs}${libs:+ }${opt##-l}.lib" + ;; + -*) die_unknown $opt + ;; + *) + # The paths in file_list are fixed outside of the loop. + file_list[${#file_list[@]}]="$opt" + case "$opt" in + *.asm|*.[Ss]) uses_asm=true + ;; + esac + ;; + esac +done + +# Make one call to fix_path for file_list to improve performance. +fix_file_list file_list + +outfile=${outfile:-/dev/stdout} +guid=${guid:-`generate_uuid`} +uses_asm=${uses_asm:-false} + +[ -n "$name" ] || die "Project name (--name) must be specified!" +[ -n "$target" ] || die "Target (--target) must be specified!" + +if ${use_static_runtime:-false}; then + release_runtime=MultiThreaded + debug_runtime=MultiThreadedDebug + lib_sfx=mt +else + release_runtime=MultiThreadedDLL + debug_runtime=MultiThreadedDebugDLL + lib_sfx=md +fi + +# Calculate debug lib names: If a lib ends in ${lib_sfx}.lib, then rename +# it to ${lib_sfx}d.lib. This precludes linking to release libs from a +# debug exe, so this may need to be refactored later. +for lib in ${libs}; do + if [ "$lib" != "${lib%${lib_sfx}.lib}" ]; then + lib=${lib%.lib}d.lib + fi + debug_libs="${debug_libs}${debug_libs:+ }${lib}" +done +debug_libs=${debug_libs// /;} +libs=${libs// /;} + + +# List of all platforms supported for this target +case "$target" in + x86_64*) + platforms[0]="x64" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} "%(FullPath)"" + ;; + x86*) + platforms[0]="Win32" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} "%(FullPath)"" + ;; + arm64*) + platforms[0]="ARM64" + # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC. + if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then + platforms[1]="ARM64EC" + fi + asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)"" + asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)"" + ;; + arm*) + platforms[0]="ARM" + asm_Debug_cmdline="armasm -nologo -oldit "%(FullPath)"" + asm_Release_cmdline="armasm -nologo -oldit "%(FullPath)"" + ;; + *) die "Unsupported target $target!" + ;; +esac + +generate_vcxproj() { + echo "" + open_tag Project \ + DefaultTargets="Build" \ + ToolsVersion="4.0" \ + xmlns="http://schemas.microsoft.com/developer/msbuild/2003" \ + + open_tag ItemGroup \ + Label="ProjectConfigurations" + for plat in "${platforms[@]}"; do + for config in Debug Release; do + open_tag ProjectConfiguration \ + Include="$config|$plat" + tag_content Configuration $config + tag_content Platform $plat + close_tag ProjectConfiguration + done + done + close_tag ItemGroup + + open_tag PropertyGroup \ + Label="Globals" + tag_content ProjectGuid "{${guid}}" + tag_content RootNamespace ${name} + tag_content Keyword ManagedCProj + if [ $vs_ver -ge 12 ] && [ "${platforms[0]}" = "ARM" ]; then + tag_content AppContainerApplication true + # The application type can be one of "Windows Store", + # "Windows Phone" or "Windows Phone Silverlight". The + # actual value doesn't matter from the libvpx point of view, + # since a static library built for one works on the others. + # The PlatformToolset field needs to be set in sync with this; + # for Windows Store and Windows Phone Silverlight it should be + # v120 while it should be v120_wp81 if the type is Windows Phone. + tag_content ApplicationType "Windows Store" + tag_content ApplicationTypeRevision 8.1 + fi + if [ "${platforms[0]}" = "ARM64" ]; then + # Require the first Visual Studio version to have ARM64 support. + tag_content MinimumVisualStudioVersion 15.9 + fi + if [ $vs_ver -eq 15 ] && [ "${platforms[0]}" = "ARM64" ]; then + # Since VS 15 does not have a 'use latest SDK version' facility, + # specifically require the contemporaneous SDK with official ARM64 + # support. + tag_content WindowsTargetPlatformVersion 10.0.17763.0 + fi + close_tag PropertyGroup + + tag Import \ + Project="\$(VCTargetsPath)\\Microsoft.Cpp.Default.props" + + for plat in "${platforms[@]}"; do + for config in Release Debug; do + open_tag PropertyGroup \ + Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'" \ + Label="Configuration" + if [ "$proj_kind" = "exe" ]; then + tag_content ConfigurationType Application + elif [ "$proj_kind" = "dll" ]; then + tag_content ConfigurationType DynamicLibrary + else + tag_content ConfigurationType StaticLibrary + fi + if [ -n "$platform_toolset" ]; then + tag_content PlatformToolset "$platform_toolset" + else + if [ "$vs_ver" = "14" ]; then + tag_content PlatformToolset v140 + fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi + if [ "$vs_ver" = "16" ]; then + tag_content PlatformToolset v142 + fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 + fi + fi + tag_content CharacterSet Unicode + if [ "$config" = "Release" ]; then + tag_content WholeProgramOptimization true + fi + close_tag PropertyGroup + done + done + + tag Import \ + Project="\$(VCTargetsPath)\\Microsoft.Cpp.props" + + open_tag ImportGroup \ + Label="PropertySheets" + tag Import \ + Project="\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props" \ + Condition="exists('\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props')" \ + Label="LocalAppDataPlatform" + close_tag ImportGroup + + tag PropertyGroup \ + Label="UserMacros" + + for plat in "${platforms[@]}"; do + plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'` + for config in Debug Release; do + open_tag PropertyGroup \ + Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'" + tag_content OutDir "\$(SolutionDir)$plat_no_ws\\\$(Configuration)\\" + tag_content IntDir "$plat_no_ws\\\$(Configuration)\\${name}\\" + if [ "$proj_kind" == "lib" ]; then + if [ "$config" == "Debug" ]; then + config_suffix=d + else + config_suffix="" + fi + tag_content TargetName "${name}${lib_sfx}${config_suffix}" + fi + close_tag PropertyGroup + done + done + + for plat in "${platforms[@]}"; do + for config in Debug Release; do + open_tag ItemDefinitionGroup \ + Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'" + if [ "$name" == "vpx" ]; then + hostplat=$plat + if [ "$hostplat" == "ARM" ]; then + hostplat=Win32 + fi + fi + open_tag ClCompile + if [ "$config" = "Debug" ]; then + opt=Disabled + runtime=$debug_runtime + curlibs=$debug_libs + debug=_DEBUG + else + opt=MaxSpeed + runtime=$release_runtime + curlibs=$libs + tag_content FavorSizeOrSpeed Speed + debug=NDEBUG + fi + extradefines=";$defines" + tag_content Optimization $opt + tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)" + tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)" + tag_content RuntimeLibrary $runtime + tag_content WarningLevel Level3 + if ${werror:-false}; then + tag_content TreatWarningAsError true + fi + if [ $vs_ver -ge 11 ]; then + # We need to override the defaults for these settings + # if AppContainerApplication is set. + tag_content CompileAsWinRT false + tag_content PrecompiledHeader NotUsing + tag_content SDLCheck false + fi + close_tag ClCompile + case "$proj_kind" in + exe) + open_tag Link + tag_content GenerateDebugInformation true + # Console is the default normally, but if + # AppContainerApplication is set, we need to override it. + tag_content SubSystem Console + close_tag Link + ;; + dll) + open_tag Link + tag_content GenerateDebugInformation true + tag_content ModuleDefinitionFile $module_def + close_tag Link + ;; + lib) + ;; + esac + close_tag ItemDefinitionGroup + done + + done + + open_tag ItemGroup + generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S" + close_tag ItemGroup + open_tag ItemGroup + generate_filter "Header Files" "h;hm;inl;inc;xsd" + close_tag ItemGroup + open_tag ItemGroup + generate_filter "Build Files" "mk" + close_tag ItemGroup + open_tag ItemGroup + generate_filter "References" "vcxproj" + close_tag ItemGroup + + tag Import \ + Project="\$(VCTargetsPath)\\Microsoft.Cpp.targets" + + open_tag ImportGroup \ + Label="ExtensionTargets" + close_tag ImportGroup + + close_tag Project + + # This must be done from within the {} subshell + echo "Ignored files list (${#file_list[@]} items) is:" >&2 + for f in "${file_list[@]}"; do + echo " $f" >&2 + done +} + +# This regexp doesn't catch most of the strings in the vcxproj format, +# since they're like path instead of +# as previously. It still seems to work ok despite this. +generate_vcxproj | + sed -e '/"/s;\([^ "]\)/;\1\\;g' | + sed -e '/xmlns/s;\\;/;g' > ${outfile} + +exit diff --git a/media/libvpx/libvpx/build/make/ios-Info.plist b/media/libvpx/libvpx/build/make/ios-Info.plist new file mode 100644 index 0000000000..d157b11a0d --- /dev/null +++ b/media/libvpx/libvpx/build/make/ios-Info.plist @@ -0,0 +1,37 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + VPX + CFBundleIdentifier + org.webmproject.VPX + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + VPX + CFBundlePackageType + FMWK + CFBundleShortVersionString + ${VERSION} + CFBundleSignature + ???? + CFBundleSupportedPlatforms + + iPhoneOS + + CFBundleVersion + ${VERSION} + MinimumOSVersion + ${IOS_VERSION_MIN} + UIDeviceFamily + + 1 + 2 + + VPXFullVersion + ${FULLVERSION} + + diff --git a/media/libvpx/libvpx/build/make/iosbuild.sh b/media/libvpx/libvpx/build/make/iosbuild.sh new file mode 100755 index 0000000000..978ffbbb98 --- /dev/null +++ b/media/libvpx/libvpx/build/make/iosbuild.sh @@ -0,0 +1,384 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## +## This script generates 'VPX.framework'. An iOS app can encode and decode VPx +## video by including 'VPX.framework'. +## +## Run iosbuild.sh to create 'VPX.framework' in the current directory. +## +set -e +devnull='> /dev/null 2>&1' + +BUILD_ROOT="_iosbuild" +CONFIGURE_ARGS="--disable-docs + --disable-examples + --disable-libyuv + --disable-unit-tests" +DIST_DIR="_dist" +FRAMEWORK_DIR="VPX.framework" +FRAMEWORK_LIB="VPX.framework/VPX" +HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) +LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) +ORIG_PWD="$(pwd)" +ARM_TARGETS="arm64-darwin-gcc + armv7-darwin-gcc + armv7s-darwin-gcc" +SIM_TARGETS="x86-iphonesimulator-gcc + x86_64-iphonesimulator-gcc" +OSX_TARGETS="x86-darwin16-gcc + x86_64-darwin16-gcc" +TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" + +# Configures for the target specified by $1, and invokes make with the dist +# target using $DIST_DIR as the distribution output directory. +build_target() { + local target="$1" + local old_pwd="$(pwd)" + local target_specific_flags="" + + vlog "***Building target: ${target}***" + + case "${target}" in + x86-*) + target_specific_flags="--enable-pic" + vlog "Enabled PIC for ${target}" + ;; + esac + + mkdir "${target}" + cd "${target}" + eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \ + ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \ + ${devnull} + export DIST_DIR + eval make dist ${devnull} + cd "${old_pwd}" + + vlog "***Done building target: ${target}***" +} + +# Returns the preprocessor symbol for the target specified by $1. +target_to_preproc_symbol() { + target="$1" + case "${target}" in + arm64-*) + echo "__aarch64__" + ;; + armv7-*) + echo "__ARM_ARCH_7A__" + ;; + armv7s-*) + echo "__ARM_ARCH_7S__" + ;; + x86-*) + echo "__i386__" + ;; + x86_64-*) + echo "__x86_64__" + ;; + *) + echo "#error ${target} unknown/unsupported" + return 1 + ;; + esac +} + +# Create a vpx_config.h shim that, based on preprocessor settings for the +# current target CPU, includes the real vpx_config.h for the current target. +# $1 is the list of targets. +create_vpx_framework_config_shim() { + local targets="$1" + local config_file="${HEADER_DIR}/vpx_config.h" + local preproc_symbol="" + local target="" + local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_" + + local file_header="/* + * Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* GENERATED FILE: DO NOT EDIT! */ + +#ifndef ${include_guard} +#define ${include_guard} + +#if defined" + + printf "%s" "${file_header}" > "${config_file}" + for target in ${targets}; do + preproc_symbol=$(target_to_preproc_symbol "${target}") + printf " ${preproc_symbol}\n" >> "${config_file}" + printf "#define VPX_FRAMEWORK_TARGET \"${target}\"\n" >> "${config_file}" + printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}" + printf "#elif defined" >> "${config_file}" + mkdir "${HEADER_DIR}/${target}" + cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}" + done + + # Consume the last line of output from the loop: We don't want it. + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" + + printf "#endif\n\n" >> "${config_file}" + printf "#endif // ${include_guard}" >> "${config_file}" +} + +# Verifies that $FRAMEWORK_LIB fat library contains requested builds. +verify_framework_targets() { + local requested_cpus="" + local cpu="" + + # Extract CPU from full target name. + for target; do + cpu="${target%%-*}" + if [ "${cpu}" = "x86" ]; then + # lipo -info outputs i386 for libvpx x86 targets. + cpu="i386" + fi + requested_cpus="${requested_cpus}${cpu} " + done + + # Get target CPUs present in framework library. + local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB}) + + # $LIPO -info outputs a string like the following: + # Architectures in the fat file: $FRAMEWORK_LIB + # Capture only the architecture strings. + targets_built=${targets_built##*: } + + # Sort CPU strings to make the next step a simple string compare. + local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ") + local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ") + + vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}" + vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}" + + if [ "${requested}" != "${actual}" ]; then + elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list." + elog " Requested target CPUs: ${requested}" + elog " Actual target CPUs: ${actual}" + return 1 + fi +} + +# Configures and builds each target specified by $1, and then builds +# VPX.framework. +build_framework() { + local lib_list="" + local targets="$1" + local target="" + local target_dist_dir="" + + # Clean up from previous build(s). + rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}" + + # Create output dirs. + mkdir -p "${BUILD_ROOT}" + mkdir -p "${HEADER_DIR}" + + cd "${BUILD_ROOT}" + + for target in ${targets}; do + build_target "${target}" + target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}" + if [ "${ENABLE_SHARED}" = "yes" ]; then + local suffix="dylib" + else + local suffix="a" + fi + lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.${suffix}" + done + + cd "${ORIG_PWD}" + + # The basic libvpx API includes are all the same; just grab the most recent + # set. + cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}" + + # Build the fat library. + ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX + + # Create the vpx_config.h shim that allows usage of vpx_config.h from + # within VPX.framework. + create_vpx_framework_config_shim "${targets}" + + # Copy in vpx_version.h. + cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}" + + if [ "${ENABLE_SHARED}" = "yes" ]; then + # Adjust the dylib's name so dynamic linking in apps works as expected. + install_name_tool -id '@rpath/VPX.framework/VPX' ${FRAMEWORK_DIR}/VPX + + # Copy in Info.plist. + cat "${SCRIPT_DIR}/ios-Info.plist" \ + | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \ + | sed "s/\${VERSION}/${VERSION}/g" \ + | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \ + > "${FRAMEWORK_DIR}/Info.plist" + fi + + # Confirm VPX.framework/VPX contains the targets requested. + verify_framework_targets ${targets} + + vlog "Created fat library ${FRAMEWORK_LIB} containing:" + for lib in ${lib_list}; do + vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')" + done +} + +# Trap function. Cleans up the subtree used to build all targets contained in +# $TARGETS. +cleanup() { + local res=$? + cd "${ORIG_PWD}" + + if [ $res -ne 0 ]; then + elog "build exited with error ($res)" + fi + + if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then + rm -rf "${BUILD_ROOT}" + fi +} + +print_list() { + local indent="$1" + shift + local list="$@" + for entry in ${list}; do + echo "${indent}${entry}" + done +} + +iosbuild_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --help: Display this message and exit. + --enable-shared: Build a dynamic framework for use on iOS 8 or later. + --extra-configure-args : Extra args to pass when configuring libvpx. + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 + and x86_64. Allows linking to framework when builds target MacOSX + instead of iOS. + --preserve-build-output: Do not delete the build directory. + --show-build-output: Show output from each library build. + --targets : Override default target list. Defaults: +$(print_list " " ${TARGETS}) + --test-link: Confirms all targets can be linked. Functionally identical to + passing --enable-examples via --extra-configure-args. + --verbose: Output information about the environment and each stage of the + build. +EOF +} + +elog() { + echo "${0##*/} failed because: $@" 1>&2 +} + +vlog() { + if [ "${VERBOSE}" = "yes" ]; then + echo "$@" + fi +} + +trap cleanup EXIT + +# Parse the command line. +while [ -n "$1" ]; do + case "$1" in + --extra-configure-args) + EXTRA_CONFIGURE_ARGS="$2" + shift + ;; + --help) + iosbuild_usage + exit + ;; + --enable-shared) + ENABLE_SHARED=yes + ;; + --preserve-build-output) + PRESERVE_BUILD_OUTPUT=yes + ;; + --show-build-output) + devnull= + ;; + --test-link) + EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples" + ;; + --targets) + TARGETS="$2" + shift + ;; + --macosx) + TARGETS="${ARM_TARGETS} ${OSX_TARGETS}" + ;; + --verbose) + VERBOSE=yes + ;; + *) + iosbuild_usage + exit 1 + ;; + esac + shift +done + +if [ "${ENABLE_SHARED}" = "yes" ]; then + CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}" +fi + +FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}") +VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/') + +if [ "$ENABLE_SHARED" = "yes" ]; then + IOS_VERSION_OPTIONS="--enable-shared" + IOS_VERSION_MIN="8.0" +else + IOS_VERSION_OPTIONS="" + IOS_VERSION_MIN="7.0" +fi + +if [ "${VERBOSE}" = "yes" ]; then +cat << EOF + BUILD_ROOT=${BUILD_ROOT} + DIST_DIR=${DIST_DIR} + CONFIGURE_ARGS=${CONFIGURE_ARGS} + EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS} + FRAMEWORK_DIR=${FRAMEWORK_DIR} + FRAMEWORK_LIB=${FRAMEWORK_LIB} + HEADER_DIR=${HEADER_DIR} + LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR} + LIPO=${LIPO} + MAKEFLAGS=${MAKEFLAGS} + ORIG_PWD=${ORIG_PWD} + PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT} + TARGETS="$(print_list "" ${TARGETS})" + ENABLE_SHARED=${ENABLE_SHARED} + OSX_TARGETS="${OSX_TARGETS}" + SIM_TARGETS="${SIM_TARGETS}" + SCRIPT_DIR="${SCRIPT_DIR}" + FULLVERSION="${FULLVERSION}" + VERSION="${VERSION}" + IOS_VERSION_MIN="${IOS_VERSION_MIN}" +EOF +fi + +build_framework "${TARGETS}" +echo "Successfully built '${FRAMEWORK_DIR}' for:" +print_list "" ${TARGETS} diff --git a/media/libvpx/libvpx/build/make/msvs_common.sh b/media/libvpx/libvpx/build/make/msvs_common.sh new file mode 100644 index 0000000000..3989fec0d5 --- /dev/null +++ b/media/libvpx/libvpx/build/make/msvs_common.sh @@ -0,0 +1,124 @@ +#!/bin/bash +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ + && cygpath --help >/dev/null 2>&1; then + FIXPATH='cygpath -m' +else + FIXPATH='echo_path' +fi + +die() { + echo "${self_basename}: $@" >&2 + exit 1 +} + +die_unknown(){ + echo "Unknown option \"$1\"." >&2 + echo "See ${self_basename} --help for available options." >&2 + exit 1 +} + +echo_path() { + for path; do + echo "$path" + done +} + +# Output one, possibly changed based on the system, path per line. +fix_path() { + $FIXPATH "$@" +} + +# Corrects the paths in file_list in one pass for efficiency. +# $1 is the name of the array to be modified. +fix_file_list() { + if [ "${FIXPATH}" = "echo_path" ] ; then + # When used with echo_path, fix_file_list is a no-op. Avoid warning about + # unsupported 'declare -n' when it is not important. + return 0 + elif [ "${BASH_VERSINFO}" -lt 4 ] ; then + echo "Cygwin path conversion has failed. Please use a version of bash" + echo "which supports nameref (-n), introduced in bash 4.3" + return 1 + fi + declare -n array_ref=$1 + files=$(fix_path "${array_ref[@]}") + local IFS=$'\n' + array_ref=($files) +} + +generate_uuid() { + local hex="0123456789ABCDEF" + local i + local uuid="" + local j + #93995380-89BD-4b04-88EB-625FBE52EBFB + for ((i=0; i<32; i++)); do + (( j = $RANDOM % 16 )) + uuid="${uuid}${hex:$j:1}" + done + echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}" +} + +indent1=" " +indent="" +indent_push() { + indent="${indent}${indent1}" +} +indent_pop() { + indent="${indent%${indent1}}" +} + +tag_attributes() { + for opt in "$@"; do + optval="${opt#*=}" + [ -n "${optval}" ] || + die "Missing attribute value in '$opt' while generating $tag tag" + echo "${indent}${opt%%=*}=\"${optval}\"" + done +} + +open_tag() { + local tag=$1 + shift + if [ $# -ne 0 ]; then + echo "${indent}<${tag}" + indent_push + tag_attributes "$@" + echo "${indent}>" + else + echo "${indent}<${tag}>" + indent_push + fi +} + +close_tag() { + local tag=$1 + indent_pop + echo "${indent}" +} + +tag() { + local tag=$1 + shift + if [ $# -ne 0 ]; then + echo "${indent}<${tag}" + indent_push + tag_attributes "$@" + indent_pop + echo "${indent}/>" + else + echo "${indent}<${tag}/>" + fi +} + diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl new file mode 100755 index 0000000000..0b9e16738e --- /dev/null +++ b/media/libvpx/libvpx/build/make/rtcd.pl @@ -0,0 +1,528 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +no strict 'refs'; +use warnings; +use Getopt::Long; +Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32; + +my %ALL_FUNCS = (); +my @ALL_ARCHS; +my @ALL_FORWARD_DECLS; +my @REQUIRES; + +my %opts = (); +my %disabled = (); +my %required = (); + +my @argv; +foreach (@ARGV) { + $disabled{$1} = 1, next if /--disable-(.*)/; + $required{$1} = 1, next if /--require-(.*)/; + push @argv, $_; +} + +# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility. +@ARGV = @argv; +GetOptions( + \%opts, + 'arch=s', + 'sym=s', + 'config=s', +); + +foreach my $opt (qw/arch config/) { + if (!defined($opts{$opt})) { + warn "--$opt is required!\n"; + Getopt::Long::HelpMessage('-exit' => 1); + } +} + +foreach my $defs_file (@ARGV) { + if (!-f $defs_file) { + warn "$defs_file: $!\n"; + Getopt::Long::HelpMessage('-exit' => 1); + } +} + +open CONFIG_FILE, $opts{config} or + die "Error opening config file '$opts{config}': $!\n"; + +my %config = (); +while () { + next if !/^(?:CONFIG_|HAVE_)/; + chomp; + my @pair = split /=/; + $config{$pair[0]} = $pair[1]; +} +close CONFIG_FILE; + +# +# Routines for the RTCD DSL to call +# +sub vpx_config($) { + return (defined $config{$_[0]}) ? $config{$_[0]} : ""; +} + +sub specialize { + my $fn=$_[0]; + shift; + foreach my $opt (@_) { + eval "\$${fn}_${opt}=${fn}_${opt}"; + } +} + +sub add_proto { + my $fn = splice(@_, -2, 1); + $ALL_FUNCS{$fn} = \@_; + specialize $fn, "c"; +} + +sub require { + foreach my $fn (keys %ALL_FUNCS) { + foreach my $opt (@_) { + my $ofn = eval "\$${fn}_${opt}"; + next if !$ofn; + + # if we already have a default, then we can disable it, as we know + # we can do better. + my $best = eval "\$${fn}_default"; + if ($best) { + my $best_ofn = eval "\$${best}"; + if ($best_ofn && "$best_ofn" ne "$ofn") { + eval "\$${best}_link = 'false'"; + } + } + eval "\$${fn}_default=${fn}_${opt}"; + eval "\$${fn}_${opt}_link='true'"; + } + } +} + +sub forward_decls { + push @ALL_FORWARD_DECLS, @_; +} + +# +# Include the user's directives +# +foreach my $f (@ARGV) { + open FILE, "<", $f or die "cannot open $f: $!\n"; + my $contents = join('', ); + close FILE; + eval $contents or warn "eval failed: $@\n"; +} + +# +# Process the directives according to the command line +# +sub process_forward_decls() { + foreach (@ALL_FORWARD_DECLS) { + $_->(); + } +} + +sub determine_indirection { + vpx_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS); + foreach my $fn (keys %ALL_FUNCS) { + my $n = ""; + my @val = @{$ALL_FUNCS{$fn}}; + my $args = pop @val; + my $rtyp = "@val"; + my $dfn = eval "\$${fn}_default"; + $dfn = eval "\$${dfn}"; + foreach my $opt (@_) { + my $ofn = eval "\$${fn}_${opt}"; + next if !$ofn; + my $link = eval "\$${fn}_${opt}_link"; + next if $link && $link eq "false"; + $n .= "x"; + } + if ($n eq "x") { + eval "\$${fn}_indirect = 'false'"; + } else { + eval "\$${fn}_indirect = 'true'"; + } + } +} + +sub declare_function_pointers { + foreach my $fn (sort keys %ALL_FUNCS) { + my @val = @{$ALL_FUNCS{$fn}}; + my $args = pop @val; + my $rtyp = "@val"; + my $dfn = eval "\$${fn}_default"; + $dfn = eval "\$${dfn}"; + foreach my $opt (@_) { + my $ofn = eval "\$${fn}_${opt}"; + next if !$ofn; + print "$rtyp ${ofn}($args);\n"; + } + if (eval "\$${fn}_indirect" eq "false") { + print "#define ${fn} ${dfn}\n"; + } else { + print "RTCD_EXTERN $rtyp (*${fn})($args);\n"; + } + print "\n"; + } +} + +sub set_function_pointers { + foreach my $fn (sort keys %ALL_FUNCS) { + my @val = @{$ALL_FUNCS{$fn}}; + my $args = pop @val; + my $rtyp = "@val"; + my $dfn = eval "\$${fn}_default"; + $dfn = eval "\$${dfn}"; + if (eval "\$${fn}_indirect" eq "true") { + print " $fn = $dfn;\n"; + foreach my $opt (@_) { + my $ofn = eval "\$${fn}_${opt}"; + next if !$ofn; + next if "$ofn" eq "$dfn"; + my $link = eval "\$${fn}_${opt}_link"; + next if $link && $link eq "false"; + my $cond = eval "\$have_${opt}"; + print " if (${cond}) $fn = $ofn;\n" + } + } + } +} + +sub filter { + my @filtered; + foreach (@_) { push @filtered, $_ unless $disabled{$_}; } + return @filtered; +} + +# +# Helper functions for generating the arch specific RTCD files +# +sub common_top() { + my $include_guard = uc($opts{sym})."_H_"; + print <) { + if (/HAVE_DSPR2=yes/) { + $have_dspr2 = 1; + } + if (/HAVE_MSA=yes/) { + $have_msa = 1; + } + if (/HAVE_MMI=yes/) { + $have_mmi = 1; + } + } + close CONFIG_FILE; + if ($have_dspr2 == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); + } elsif ($have_msa == 1 && $have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi msa/); + } elsif ($have_msa == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/msa/); + } elsif ($have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); + } else { + unoptimized; + } + mips; +} elsif ($opts{arch} =~ /armv7\w?/) { + @ALL_ARCHS = filter(qw/neon_asm neon/); + arm; +} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/); + @REQUIRES = filter(qw/neon/); + &require(@REQUIRES); + arm; +} elsif ($opts{arch} =~ /^ppc/ ) { + @ALL_ARCHS = filter(qw/vsx/); + ppc; +} elsif ($opts{arch} =~ /loongarch/ ) { + @ALL_ARCHS = filter(qw/lsx lasx/); + loongarch; +} else { + unoptimized; +} + +__END__ + +=head1 NAME + +rtcd - + +=head1 SYNOPSIS + +Usage: rtcd.pl [options] FILE + +See 'perldoc rtcd.pl' for more details. + +=head1 DESCRIPTION + +Reads the Run Time CPU Detections definitions from FILE and generates a +C header file on stdout. + +=head1 OPTIONS + +Options: + --arch=ARCH Architecture to generate defs for (required) + --disable-EXT Disable support for EXT extensions + --require-EXT Require support for EXT extensions + --sym=SYMBOL Unique symbol to use for RTCD initialization function + --config=FILE File with CONFIG_FOO=yes lines to parse diff --git a/media/libvpx/libvpx/build/make/thumb.pm b/media/libvpx/libvpx/build/make/thumb.pm new file mode 100644 index 0000000000..ef4b316771 --- /dev/null +++ b/media/libvpx/libvpx/build/make/thumb.pm @@ -0,0 +1,60 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2013 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +package thumb; + +sub FixThumbInstructions($) +{ + # Write additions with shifts, such as "add r10, r11, lsl #8", + # in three operand form, "add r10, r10, r11, lsl #8". + s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g; + + # Convert additions with a non-constant shift into a sequence + # with left shift, addition and a right shift (to restore the + # register to the original value). Currently the right shift + # isn't necessary in the code base since the values in these + # registers aren't used, but doing the shift for consistency. + # This converts instructions such as "add r12, r12, r5, lsl r4" + # into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4". + s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g; + + # Convert loads with right shifts in the indexing into a + # sequence of an add, load and sub. This converts + # "ldrb r4, [r9, lr, asr #1]" into "add r9, r9, lr, asr #1", + # "ldrb r9, [r9]", "sub r9, r9, lr, asr #1". + s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+),\s*(asr #\d+)\]/$1add $3$5, $5, $6, $7\n$1$2$3$4, [$5]\n$1sub $3$5, $5, $6, $7/g; + + # Convert register indexing with writeback into a separate add + # instruction. This converts "ldrb r12, [r1, r2]!" into + # "ldrb r12, [r1, r2]", "add r1, r1, r2". + s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+)\]!/$1$2$3$4, [$5, $6]\n$1add $3$5, $6/g; + + # Convert negative register indexing into separate sub/add instructions. + # This converts "ldrne r4, [src, -pstep, lsl #1]" into + # "subne src, src, pstep, lsl #1", "ldrne r4, [src]", + # "addne src, src, pstep, lsl #1". In a couple of cases where + # this is used, it's used for two subsequent load instructions, + # where a hand-written version of it could merge two subsequent + # add and sub instructions. + s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g; + + # Convert register post indexing to a separate add instruction. + # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]", + # "addne r0, r0, r2". + s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g; + + # Convert "mov pc, lr" into "bx lr", since the former only works + # for switching from arm to thumb (and only in armv7), but not + # from thumb to arm. + s/mov(\s*)pc\s*,\s*lr/bx$1lr/g; +} + +1; diff --git a/media/libvpx/libvpx/build/make/version.sh b/media/libvpx/libvpx/build/make/version.sh new file mode 100755 index 0000000000..f36ede10f2 --- /dev/null +++ b/media/libvpx/libvpx/build/make/version.sh @@ -0,0 +1,78 @@ +#!/bin/sh +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + + +for opt in "$@"; do + optval="${opt#*=}" + case "$opt" in + --bare) bare=true ;; + *) break ;; + esac + shift +done +source_path=${1:-.} +out_file=${2} +id=${3:-VERSION_STRING} + +git_version_id="" +if [ -e "${source_path}/.git" ]; then + # Source Path is a git working copy. Check for local modifications. + # Note that git submodules may have a file as .git, not a directory. + export GIT_DIR="${source_path}/.git" + git_version_id=`git describe --match=v[0-9]* 2>/dev/null` +fi + +changelog_version="" +for p in "${source_path}" "${source_path}/.."; do + if [ -z "$git_version_id" -a -f "${p}/CHANGELOG" ]; then + changelog_version=`head -n1 "${p}/CHANGELOG" | awk '{print $2}'` + changelog_version="${changelog_version}" + break + fi +done +version_str="${changelog_version}${git_version_id}" +bare_version=${version_str#v} +major_version=${bare_version%%.*} +bare_version=${bare_version#*.} +minor_version=${bare_version%%.*} +bare_version=${bare_version#*.} +patch_version=${bare_version%%-*} +bare_version=${bare_version#${patch_version}} +extra_version=${bare_version##-} + +#since they'll be used as integers below make sure they are or force to 0 +for v in major_version minor_version patch_version; do + if eval echo \$$v |grep -E -q '[^[:digit:]]'; then + eval $v=0 + fi +done + +if [ ${bare} ]; then + echo "${changelog_version}${git_version_id}" > $$.tmp +else + cat<$$.tmp +// This file is generated. Do not edit. +#define VERSION_MAJOR $major_version +#define VERSION_MINOR $minor_version +#define VERSION_PATCH $patch_version +#define VERSION_EXTRA "$extra_version" +#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) +#define ${id}_NOSP "${version_str}" +#define ${id} " ${version_str}" +EOF +fi +if [ -n "$out_file" ]; then +diff $$.tmp ${out_file} >/dev/null 2>&1 || cat $$.tmp > ${out_file} +else +cat $$.tmp +fi +rm $$.tmp diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/cur_frame_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/cur_frame_16x16.txt new file mode 100644 index 0000000000..c26463937a --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/cur_frame_16x16.txt @@ -0,0 +1,2 @@ +486,720 +230,207,226,208,198,205,214,224,228,181,208,205,211,218,218,221,213,193,213,233,219,206,226,199,199,189,211,190,204,231,229,236,218,227,194,229,222,227,210,219,237,219,225,227,212,207,197,203,207,216,238,208,233,222,213,212,213,220,221,222,191,215,237,211,226,234,208,214,239,210,223,224,236,248,233,216,237,211,198,227,231,233,236,238,239,224,235,208,219,229,212,241,243,218,233,225,231,230,224,221,248,209,241,215,237,216,244,241,218,221,240,211,239,231,237,231,221,219,240,219,232,222,223,255,229,245,243,242,245,246,217,234,243,222,213,239,230,231,230,242,248,225,238,236,223,214,229,216,240,223,212,228,236,219,254,240,222,217,246,228,215,230,255,226,230,248,250,254,234,235,232,243,237,255,253,239,239,251,251,252,238,241,240,251,240,243,223,246,246,249,235,233,228,246,232,236,234,255,253,221,244,237,252,245,253,252,221,251,255,255,233,243,243,246,221,234,238,252,252,215,242,255,229,243,255,251,236,231,241,246,237,225,244,229,242,239,234,235,251,237,253,245,251,230,220,239,224,255,244,249,249,249,255,249,236,240,218,248,247,246,235,250,250,230,239,228,238,250,242,241,232,237,238,246,255,243,232,251,252,244,237,249,247,242,246,254,246,255,247,230,235,241,236,252,245,240,244,228,241,247,231,242,242,242,255,246,236,240,245,246,235,208,219,251,251,252,251,244,240,243,230,250,227,254,224,228,249,232,255,245,248,247,241,241,245,234,194,186,231,214,243,230,243,249,248,190,175,174,161,130,37,204,246,253,250,230,253,239,247,181,213,245,228,251,255,253,233,249,246,252,228,228,219,241,235,245,211,228,228,248,254,247,239,251,232,231,250,246,227,237,238,251,251,242,237,240,219,155,137,131,88,97,19,109,243,243,237,242,242,254,244,245,243,237,242,225,237,219,226,242,248,244,247,253,249,229,255,251,245,255,189,192,240,228,249,248,247,237,241,240,247,247,220,250,251,255,231,251,193,103,31,14,31,31,9,13,44,26,50,35,74,113,61,72,15,35,22,59,25,24,47,38,115,112,50,34,26,59,48,65,78,66,62,38,15,54,212,243,249,236,253,254,254,244,247,239,240,243,239,249,237,240,244,252,205,186,243,200,193,210,244,243,212,235,232,189,243,236,244,234,244,235,238,237,254,249,227,231,241,246,255,236,236,241,249,241,246,253,229,243,143,99,67,6,70,201,221,250,245,233,248,209,224,198,211,232,225,217,209,255,214,121,184,220,235,232,185,230,236,237,227,213,240,222,241,226,224,215,240,238,252,243,224,222,233,236,237,213,214,214,232,229,227,224,203,222,234,209,225,226,216,202,235,208,207,233,234,230,226,215,214,214,214,241,237,214,226,214,227,205,221,222,224,193,209,209,226,213,235,212,250,180,147,118,85,99,73,79,70,109,95,116,65,100,94,108,104,79,91,75,73,126,199,206,241,239,219,246,226,216,229,213,221,214,227,222,214,207,220,226,207,213,234,199,206,213,221,203,190,198,181,183,190,196,219,207,190,170,196,190,184,208,189,220,217,106,17,8,20,16,21,8,10,0,29,18,9,14,0,11,0,15,39,2,13,4,4,41,28,7,235,225,220,231,197,218,197,225,229,195,220,207,235,222,213,217,219,210,219,207,225,192,207,211,203,205,214,204,203,219,207,207,199,206,235,223,221,198,231,232,220,211,238,234,236,216,228,226,196,202,235,234,214,231,223,213,185,220,242,252,233,221,218,229,212,206,210,197,235,222,197,223,213,209,242,197,226,211,212,216,205,229,229,231,213,235,209,237,222,230,214,241,222,235,242,228,230,229,249,227,212,207,217,227,226,237,232,223,214,208,231,240,223,230,225,239,240,221,237,228,245,241,237,224,239,249,237,230,250,228,244,239,214,247,225,226,234,243,239,225,250,248,249,226,232,237,251,226,231,236,233,248,232,240,233,249,245,236,234,227,211,245,233,234,229,237,240,218,243,237,237,216,250,253,238,249,237,245,248,247,241,247,223,231,246,249,245,244,238,223,247,230,247,252,249,210,250,243,247,253,230,255,248,243,241,246,225,254,248,242,255,239,241,233,248,255,254,255,238,249,244,250,236,253,247,252,240,248,240,238,232,252,246,251,237,231,241,248,231,252,254,243,241,245,251,242,250,212,255,245,248,235,252,243,238,250,242,253,251,251,254,238,233,254,254,243,249,252,254,239,250,241,239,243,245,238,247,243,240,242,243,239,252,238,254,226,233,252,238,245,255,234,249,253,238,226,240,253,255,249,230,232,242,251,239,240,209,246,236,235,239,244,251,241,236,236,240,255,242,228,250,239,252,231,243,253,253,226,235,237,240,237,252,239,223,252,216,231,253,235,214,213,238,230,245,210,152,148,132,97,40,203,250,242,249,234,220,240,230,182,195,242,255,250,247,223,255,252,255,235,251,213,210,245,255,253,226,244,241,252,254,236,250,254,235,226,249,255,237,218,230,239,237,252,252,242,221,120,118,110,105,120,44,129,241,242,247,243,244,233,249,232,242,222,237,241,243,237,251,248,255,245,254,248,242,255,246,232,221,246,171,164,239,248,218,252,245,252,250,229,250,248,216,239,250,212,203,218,148,100,35,8,47,38,32,60,48,38,43,58,46,103,90,69,19,33,11,55,55,29,34,71,91,63,65,69,23,25,50,82,80,41,42,43,1,142,218,231,233,242,239,249,233,252,228,234,222,246,244,243,216,225,240,252,216,156,221,221,204,212,249,236,219,190,198,172,224,253,244,252,252,237,233,249,249,252,238,253,244,250,242,249,244,255,229,227,217,234,242,245,116,88,54,14,59,229,246,234,254,249,233,233,231,213,210,219,243,231,228,242,191,71,184,223,223,235,234,236,237,236,238,228,232,219,226,221,208,228,245,213,228,238,230,230,227,217,227,229,224,214,215,221,217,213,217,236,238,224,213,225,194,248,230,234,226,243,214,214,238,226,237,220,231,236,224,222,233,230,219,229,196,208,232,214,217,202,235,247,235,233,206,186,127,128,96,139,132,132,120,114,79,94,87,109,98,90,106,70,58,59,83,74,76,117,204,208,247,231,238,209,214,190,230,236,208,188,225,211,192,208,185,212,213,201,208,215,228,198,210,217,185,197,224,206,206,219,206,198,214,184,194,186,203,208,203,114,26,0,35,5,8,16,3,1,3,1,6,5,10,20,4,6,4,4,0,15,30,7,22,12,216,211,220,193,216,215,233,232,208,205,214,202,231,203,211,224,215,222,216,199,217,210,228,243,237,216,214,213,231,213,203,217,232,218,225,200,200,205,199,235,215,218,215,211,214,194,230,221,240,193,221,212,234,209,193,207,243,216,217,223,231,238,226,193,226,209,236,220,232,240,219,228,214,209,221,218,244,221,217,206,214,235,227,229,226,205,202,238,205,224,222,241,236,229,207,228,222,215,234,231,216,217,237,235,229,252,230,225,220,229,235,241,250,206,227,222,217,238,242,247,228,226,238,210,248,241,255,218,244,209,246,231,201,241,236,236,226,216,210,218,232,199,238,222,246,235,249,244,232,238,240,244,237,252,241,246,234,250,227,225,252,252,237,243,246,239,243,236,235,225,240,241,245,249,230,244,237,206,237,242,237,214,237,241,231,207,244,244,234,231,232,255,252,250,242,252,254,242,255,228,254,239,233,240,250,255,245,229,234,254,249,250,248,242,249,246,252,250,246,225,246,245,231,254,247,230,252,250,252,254,241,240,237,252,246,228,223,251,252,240,240,246,243,253,227,246,242,252,250,242,243,249,246,255,233,248,235,227,255,247,236,246,239,234,245,232,236,217,245,252,253,255,255,251,254,247,249,252,255,237,214,232,237,247,232,212,230,251,238,246,245,233,254,241,238,235,241,249,247,227,255,254,255,243,250,247,249,247,234,248,242,252,241,255,254,243,232,232,239,229,246,255,254,251,249,248,223,240,245,252,239,248,252,242,242,219,238,243,226,197,211,234,253,253,244,234,157,156,140,95,43,196,234,228,235,206,247,247,238,185,196,237,237,234,249,232,230,254,254,221,249,200,180,219,250,241,239,255,219,241,244,238,248,251,248,253,249,250,241,228,225,249,255,250,250,250,206,135,99,113,83,117,24,88,243,237,248,240,229,236,226,250,249,249,244,252,247,251,246,255,239,246,253,252,254,246,247,253,219,234,225,158,224,246,253,247,240,253,247,242,218,219,247,248,233,215,223,240,242,206,118,103,87,103,70,104,52,45,47,75,65,108,83,63,58,36,38,39,50,31,53,100,89,62,54,30,25,38,49,53,96,76,67,41,16,205,241,249,244,239,255,252,237,254,238,248,235,237,253,244,234,232,243,251,227,195,234,254,197,206,229,220,202,201,184,178,231,251,226,233,244,253,238,244,249,251,242,245,240,235,246,249,236,210,242,242,250,247,240,243,125,92,45,0,81,212,243,251,247,244,222,238,221,199,184,218,229,194,211,237,159,82,167,233,220,225,213,224,223,249,224,229,239,232,231,216,245,224,216,236,235,213,215,218,220,235,217,249,234,241,228,231,227,218,229,224,217,200,231,244,222,230,213,199,243,238,209,233,216,226,216,216,203,209,200,221,218,216,233,232,221,231,195,204,230,216,254,243,224,169,163,154,129,146,89,141,139,112,107,109,93,40,71,40,30,58,31,53,53,49,55,55,63,10,28,88,157,193,228,214,203,200,195,205,218,195,196,197,181,243,188,222,212,203,210,212,206,211,213,220,206,211,202,199,182,201,226,199,203,232,210,196,207,203,216,110,2,0,23,15,31,10,25,14,29,28,10,8,29,1,14,1,16,9,16,13,2,31,10,9,207,201,221,200,221,224,225,217,213,211,228,210,216,210,208,221,212,230,197,196,214,187,234,211,233,220,215,232,213,214,215,215,228,224,225,229,215,212,202,219,222,209,212,213,215,214,210,238,230,236,231,216,220,237,237,237,218,224,242,223,221,213,232,210,217,226,222,223,207,234,226,215,240,225,222,228,231,238,222,203,213,222,243,204,224,214,230,214,243,224,208,227,219,225,237,233,220,216,228,227,216,222,212,217,210,243,227,238,240,229,236,244,235,227,227,220,211,239,244,225,240,247,223,213,249,210,231,232,234,236,228,249,217,223,241,248,221,202,221,239,223,236,219,245,224,223,222,226,251,228,211,211,227,223,237,239,235,232,223,245,250,226,250,251,255,249,212,236,242,249,223,250,215,244,243,244,250,245,250,249,247,237,243,252,226,221,236,238,252,255,246,232,230,244,237,254,255,252,247,242,255,235,249,241,252,247,235,223,250,254,244,227,245,252,252,253,251,241,245,252,235,239,252,242,239,254,239,255,250,254,244,246,254,251,248,253,245,236,213,240,243,252,254,229,240,248,243,254,247,246,249,243,254,231,237,244,237,234,225,252,232,240,249,235,251,250,252,248,238,254,249,238,248,244,229,239,253,229,232,234,247,251,244,246,237,255,233,255,242,248,228,249,253,230,254,237,245,244,226,233,201,248,219,246,239,245,235,239,245,244,248,234,253,250,237,229,253,248,220,254,217,240,243,252,244,249,255,226,249,233,249,252,230,237,244,239,186,190,211,214,229,253,255,249,248,203,167,193,163,81,30,182,254,238,249,253,236,245,248,191,153,227,255,229,231,248,218,228,250,219,215,213,231,246,254,242,247,237,212,189,229,238,246,244,226,239,255,252,237,244,223,240,252,242,255,250,199,126,111,115,109,125,32,119,254,241,248,241,253,252,218,245,254,247,255,236,224,241,244,255,251,239,241,246,246,246,255,229,239,220,241,187,203,250,244,220,247,254,236,253,222,227,253,242,234,230,238,247,251,247,178,157,138,85,67,107,69,86,67,46,66,57,61,48,45,11,36,29,28,55,89,74,68,51,55,25,42,53,64,81,99,79,73,32,38,223,245,222,240,241,250,248,235,237,250,232,239,234,235,243,240,251,255,248,243,214,224,232,204,216,163,183,233,194,195,211,211,214,231,249,237,245,232,237,231,241,226,255,219,244,239,226,240,236,255,242,242,228,253,217,132,89,45,6,100,213,242,239,248,248,240,223,244,221,201,228,231,226,236,235,153,71,136,234,230,230,229,236,247,236,251,226,231,225,225,245,237,220,241,228,214,233,242,214,207,207,207,227,238,210,245,212,223,241,231,201,219,220,227,215,221,200,230,246,218,195,214,239,213,221,217,216,217,242,218,226,216,239,202,216,214,223,216,177,212,222,232,152,147,119,107,118,76,116,73,53,64,61,47,108,73,76,40,54,33,44,24,44,13,22,28,44,78,62,34,76,30,28,99,176,213,220,215,245,193,217,203,206,222,230,209,197,215,224,208,197,221,205,197,197,218,229,229,212,209,217,204,186,197,206,220,202,196,207,179,116,0,7,3,10,6,13,0,3,22,2,12,14,3,1,19,9,17,25,19,9,5,1,0,3,211,205,197,203,223,210,233,216,223,214,215,203,213,232,215,226,223,219,219,239,197,227,214,223,233,208,203,235,202,204,227,207,206,201,202,216,230,223,227,224,207,225,233,236,214,231,207,229,225,227,209,223,204,223,218,209,224,232,216,232,233,224,224,214,221,209,222,231,212,216,216,224,183,208,223,219,221,197,240,227,195,216,222,242,223,222,210,222,233,205,224,218,208,233,234,231,219,230,210,228,196,220,231,231,238,239,249,248,233,252,243,230,237,246,231,215,235,229,240,222,228,230,243,236,241,223,247,218,247,246,236,232,229,219,230,217,229,223,228,255,234,224,237,232,231,220,236,239,241,241,231,233,230,236,223,225,250,218,228,240,252,235,217,243,240,235,252,235,255,247,211,232,219,245,247,250,255,248,239,252,247,243,226,243,247,249,251,247,249,234,233,242,248,241,252,252,255,233,241,254,242,242,254,242,218,238,250,229,253,238,230,248,236,249,239,244,251,254,247,248,251,252,252,255,252,255,239,235,245,255,225,233,255,240,243,237,244,239,249,240,244,229,247,242,225,241,254,253,239,252,255,246,235,236,255,254,234,237,254,231,240,244,250,250,236,253,251,243,250,231,227,249,236,234,249,233,229,250,228,250,231,253,254,241,238,248,251,252,253,254,249,245,253,249,224,246,249,233,213,248,248,252,252,242,244,243,235,244,233,241,242,238,245,236,249,248,254,242,245,255,231,245,254,251,236,253,239,251,224,238,250,242,253,238,244,222,206,200,226,238,250,245,244,231,249,197,163,166,150,111,64,209,248,237,250,221,222,247,243,200,173,230,250,251,241,254,229,246,251,207,225,230,240,255,248,237,234,241,203,255,250,249,251,246,228,239,251,246,242,239,232,249,245,255,246,241,209,158,114,123,109,112,24,108,253,232,247,245,238,255,254,243,232,255,230,224,253,250,255,219,249,244,236,233,254,254,249,240,231,237,247,187,232,254,245,241,230,217,246,243,249,226,252,254,230,252,248,250,255,252,173,90,88,52,51,89,113,75,89,169,140,97,37,60,91,60,51,60,44,82,118,79,70,43,47,47,43,55,51,80,79,49,68,21,29,187,246,253,249,249,225,250,225,241,246,255,240,231,221,232,254,225,237,241,255,228,206,232,199,149,135,165,246,232,175,226,222,236,249,217,255,245,209,246,245,255,244,235,249,232,225,232,244,248,242,247,247,233,253,221,125,75,71,26,94,233,246,249,239,244,229,226,233,201,179,227,227,236,253,235,127,57,138,221,223,226,229,242,245,235,225,240,218,227,228,227,225,197,244,220,244,219,219,245,222,248,229,227,224,219,204,237,212,219,216,209,229,230,218,226,208,209,209,219,212,222,210,231,221,225,218,202,223,218,246,238,214,204,212,212,183,226,197,234,217,183,155,59,95,71,39,43,11,47,37,61,25,20,83,85,58,40,50,20,37,12,29,35,26,28,26,75,92,71,64,44,16,11,79,161,170,200,209,232,220,203,196,213,190,218,207,200,226,206,185,215,228,195,214,205,197,217,201,207,215,211,189,204,202,204,200,209,191,196,211,102,13,2,1,0,14,15,17,1,29,11,25,21,3,8,10,21,9,15,1,24,35,11,10,34,224,217,235,216,223,230,206,222,229,214,200,236,203,200,215,221,233,213,228,205,185,206,202,204,235,210,232,214,235,226,214,239,215,233,213,206,197,203,201,217,219,205,229,195,237,226,229,199,219,203,202,222,211,222,245,228,240,220,218,224,195,207,226,202,197,228,227,219,202,203,233,227,200,210,225,205,218,219,211,221,230,219,221,228,222,221,203,241,226,216,219,223,205,208,224,224,222,238,215,223,209,219,214,230,238,236,227,243,228,240,211,239,221,233,223,236,218,226,232,221,222,227,233,220,241,242,228,229,245,238,241,238,214,246,215,222,240,234,244,238,238,235,240,235,213,221,228,252,236,206,248,239,238,252,227,248,243,252,243,231,224,229,244,236,230,236,238,237,234,242,255,234,226,251,231,212,241,240,248,226,201,241,227,235,246,240,255,248,243,236,228,229,255,252,242,243,255,253,249,242,252,225,250,255,236,252,253,231,230,255,247,254,254,243,250,248,239,236,254,229,239,234,250,223,233,247,245,252,235,252,245,230,242,216,251,251,230,255,252,247,248,239,224,238,246,235,255,243,249,224,235,241,245,242,229,251,253,247,248,244,252,248,231,228,232,237,251,250,255,237,230,248,243,243,222,241,252,236,244,251,233,240,252,246,222,251,248,244,243,233,238,248,252,244,233,251,246,246,217,252,247,251,243,241,248,243,229,236,242,250,252,239,252,212,241,239,255,237,242,240,242,255,245,253,235,242,248,255,247,223,252,242,254,237,242,237,207,235,247,221,242,249,251,247,254,223,161,151,118,92,42,204,239,252,245,227,253,233,239,232,194,242,251,241,233,253,237,245,232,229,255,243,254,255,212,206,243,250,239,255,250,229,215,248,249,241,252,233,240,222,232,251,239,243,242,254,218,139,100,107,111,104,29,117,236,255,251,249,236,247,221,254,223,246,237,249,248,247,234,250,230,234,240,253,246,231,247,238,247,242,217,183,230,237,250,246,234,219,237,248,242,251,234,255,218,254,249,250,240,154,74,63,131,57,26,75,112,79,99,193,180,135,75,176,138,49,24,80,126,113,135,99,67,39,43,34,21,21,78,96,91,50,75,39,50,214,244,250,249,254,255,243,243,213,249,248,249,246,236,252,254,224,228,235,249,238,171,232,220,119,121,163,228,210,172,231,229,229,232,255,238,250,250,228,239,241,244,239,235,244,249,233,231,245,232,252,241,243,254,216,113,75,42,0,131,226,240,246,252,245,237,219,219,204,225,215,220,245,247,207,122,78,118,232,219,237,226,245,217,244,241,254,223,220,252,206,239,223,218,214,229,223,212,244,213,230,214,227,236,223,225,200,243,239,229,224,211,211,209,227,244,236,200,203,230,223,223,216,206,233,221,213,208,228,227,223,206,226,219,202,191,217,205,230,156,110,51,46,50,29,20,33,44,28,35,68,44,44,24,53,88,58,60,37,65,43,47,46,29,43,80,90,93,66,33,54,0,56,159,190,228,182,194,210,182,218,202,186,191,196,204,186,194,202,221,208,235,203,206,219,244,222,214,207,198,195,203,198,210,188,194,207,204,190,197,115,5,16,0,1,5,3,18,6,12,14,6,13,7,12,18,13,15,21,17,17,27,11,16,9,217,220,221,223,223,208,221,228,218,197,228,222,220,222,231,203,210,206,229,217,204,213,200,216,203,216,225,237,242,222,238,217,217,216,200,225,200,223,221,215,213,222,229,223,225,217,237,194,247,212,224,180,241,239,222,220,217,199,217,220,213,247,234,243,211,224,216,210,198,214,189,210,242,223,231,237,212,227,224,213,237,245,205,212,210,234,223,222,230,224,224,221,222,216,216,215,221,229,224,238,217,225,219,207,221,220,237,253,223,225,232,222,230,221,235,230,211,216,229,221,240,236,229,222,244,222,247,232,217,218,238,217,220,247,205,225,220,231,198,238,232,236,235,235,249,239,221,238,237,242,240,238,249,243,244,235,245,235,239,219,234,238,231,221,227,216,240,230,225,231,228,225,228,220,254,255,226,248,249,235,224,234,218,239,246,235,239,251,250,224,254,241,244,223,255,237,251,254,253,230,245,230,234,230,234,246,250,250,210,247,249,251,238,247,252,237,252,249,243,248,255,241,250,244,243,245,243,241,255,254,240,243,250,251,233,239,250,245,242,248,234,242,225,247,239,232,254,214,254,241,219,249,249,234,252,254,253,250,234,231,252,234,223,241,247,248,243,252,255,231,251,252,239,255,245,238,241,247,227,237,247,236,229,226,249,221,239,252,243,249,250,240,247,245,244,243,244,237,226,246,246,230,232,232,246,246,237,237,237,248,240,248,224,250,252,253,247,255,255,249,251,254,240,242,250,249,254,249,226,248,245,237,237,233,248,240,243,248,201,237,235,248,239,238,245,214,148,144,127,75,35,218,246,248,240,222,246,238,232,213,172,235,255,227,251,252,247,244,242,252,237,167,218,247,238,253,250,241,233,243,242,225,248,244,241,241,211,232,255,229,241,250,236,233,254,244,214,112,86,109,107,130,27,132,245,249,223,219,244,242,254,245,250,250,246,216,237,248,242,242,240,253,254,252,246,253,255,242,255,246,247,175,186,248,252,244,247,251,252,255,243,235,236,233,223,250,229,249,185,80,87,68,136,132,82,49,55,37,52,192,151,108,78,149,114,30,53,77,95,97,138,130,82,19,57,61,37,54,74,97,88,63,70,57,59,237,235,222,243,249,240,247,246,255,234,252,248,232,239,238,239,253,240,208,254,250,182,220,250,193,158,174,230,243,209,216,245,208,226,243,237,252,247,249,238,231,254,240,235,249,253,251,232,245,249,255,232,219,248,211,105,73,4,0,104,240,218,227,249,242,224,231,230,208,214,205,211,234,240,239,126,55,146,221,211,229,234,240,227,235,234,225,196,235,218,250,222,216,246,218,222,228,219,224,222,234,211,238,243,236,205,231,218,213,205,219,220,230,236,220,230,208,211,215,220,234,208,220,212,209,194,208,214,226,221,220,219,213,247,218,204,236,199,205,180,73,66,9,11,24,56,73,68,35,63,31,45,35,47,69,86,58,48,47,45,21,10,47,38,51,67,73,70,62,56,41,70,169,224,229,201,211,200,227,210,194,215,216,218,205,187,221,201,207,218,197,214,195,213,208,177,226,191,213,204,191,198,211,201,215,220,219,202,193,219,101,2,2,24,15,24,20,0,2,4,13,7,1,8,6,0,7,8,30,11,29,13,1,13,8,226,226,234,219,226,215,237,221,228,204,220,222,223,192,219,204,198,216,212,211,221,202,205,197,220,212,207,199,199,228,206,221,209,210,199,205,236,199,219,232,220,205,213,231,224,209,238,212,227,223,187,224,234,219,225,217,190,216,231,210,231,211,207,226,199,210,230,229,208,235,219,213,216,230,231,229,231,212,216,210,208,233,195,202,213,227,220,231,235,232,189,226,199,216,197,209,231,226,216,229,216,238,217,241,218,211,242,227,228,244,218,225,241,229,252,229,239,244,214,185,197,234,219,240,239,241,237,234,241,205,238,216,243,229,231,228,244,227,208,235,238,235,234,235,223,221,226,216,221,227,225,241,217,242,236,233,247,229,218,243,247,234,228,237,255,250,255,245,252,231,250,246,240,231,254,236,238,250,251,231,241,244,243,251,232,246,242,234,239,250,235,218,247,255,234,241,252,250,235,243,242,227,240,250,253,237,252,208,254,246,250,232,251,234,252,254,232,255,243,241,240,250,246,240,249,249,255,241,236,235,232,241,245,254,246,254,255,213,239,235,229,235,251,247,231,227,210,251,242,235,241,254,244,248,251,255,255,240,245,251,233,237,246,234,230,243,224,254,216,236,255,236,227,244,241,237,243,248,237,233,246,241,223,249,237,254,255,232,253,236,231,250,241,235,235,251,214,240,232,223,235,231,245,239,254,233,237,234,235,231,252,248,229,235,242,234,245,248,253,246,231,252,229,253,253,237,253,255,224,243,250,255,242,230,239,208,174,189,226,224,253,242,252,239,242,196,178,184,170,91,44,188,242,240,253,243,242,239,251,219,151,220,246,253,229,245,253,253,205,225,183,220,244,248,253,250,246,235,213,219,246,227,250,230,249,227,243,251,229,227,209,248,254,248,242,246,208,129,114,122,115,103,32,103,238,240,239,233,255,237,250,254,227,242,195,193,221,254,254,255,251,233,239,238,240,237,234,238,225,238,243,149,145,240,253,239,250,243,226,226,242,242,246,210,240,210,133,194,163,145,155,66,142,107,57,51,60,59,71,157,111,102,80,163,113,53,75,19,64,102,88,138,75,18,33,56,26,67,107,74,68,41,65,40,70,217,240,233,242,255,254,253,236,254,231,242,248,242,230,244,247,229,237,236,254,236,206,189,250,207,168,157,224,236,246,193,224,227,196,237,238,245,240,241,236,226,242,250,255,242,242,251,255,253,255,250,222,243,251,194,104,92,38,0,143,225,236,247,237,215,223,213,226,214,225,227,239,232,229,222,116,54,122,238,202,226,228,218,239,214,246,221,236,207,222,223,232,238,216,239,211,208,227,238,244,203,231,180,238,238,245,243,216,229,224,210,231,231,209,219,217,231,236,222,216,223,212,247,202,204,229,220,227,191,205,219,196,235,207,188,206,213,242,229,174,141,118,78,71,65,56,67,55,58,43,23,69,46,53,92,74,58,69,33,62,36,13,33,41,65,91,70,68,68,53,45,157,210,209,229,223,218,226,223,194,202,225,194,192,212,212,201,200,214,201,210,199,214,208,185,222,222,206,216,210,205,221,229,179,204,183,200,207,198,200,117,7,0,17,13,6,38,14,3,11,22,9,25,18,1,0,14,7,12,37,23,10,12,1,12,222,216,234,240,215,188,209,211,212,221,215,203,214,217,204,202,220,211,213,235,194,211,211,228,231,199,214,208,221,210,217,231,232,202,197,223,235,227,214,199,205,244,227,210,217,230,221,238,220,207,243,216,224,214,230,220,202,214,204,206,194,225,221,220,231,235,186,208,212,231,212,224,228,227,208,210,201,224,217,199,200,229,222,237,212,210,227,204,209,230,219,228,222,231,229,222,217,222,210,223,226,228,229,236,217,230,222,241,225,221,219,207,242,240,220,234,227,219,244,224,236,214,234,246,232,246,222,227,231,216,219,239,224,234,229,249,226,236,234,228,222,211,223,220,216,214,217,214,226,223,224,248,247,243,233,250,234,239,234,235,232,223,240,243,230,210,233,230,231,244,228,208,246,241,251,255,250,237,250,249,251,249,233,243,242,254,254,250,227,251,248,232,233,242,252,226,244,242,234,241,253,232,255,249,230,241,234,237,220,225,253,240,246,231,232,244,252,252,235,236,253,252,248,229,240,251,229,248,251,255,247,254,251,245,243,244,240,254,250,246,228,216,248,251,248,243,251,228,223,253,243,255,222,245,238,237,255,255,254,251,235,249,244,231,255,250,246,251,255,246,251,246,254,233,251,242,237,229,246,234,251,232,250,254,254,222,220,225,239,253,241,248,255,230,253,222,254,241,231,242,240,249,251,252,252,251,245,251,242,249,255,246,230,235,249,251,249,251,238,223,236,250,228,236,250,248,254,248,253,242,251,241,228,244,243,183,221,235,229,224,221,245,250,247,248,211,208,167,154,101,46,200,255,247,243,236,237,253,232,220,152,200,249,251,255,242,249,248,226,219,195,215,249,254,223,227,239,217,198,224,238,248,251,248,227,247,252,231,229,230,242,255,248,223,253,249,243,152,135,141,106,105,20,135,239,249,241,252,246,253,239,244,246,245,187,207,212,240,247,238,247,249,214,225,253,253,249,247,233,252,230,194,154,225,250,236,243,218,245,233,243,247,233,203,223,198,194,253,188,179,172,60,147,101,92,69,70,52,85,186,115,87,70,135,112,76,66,72,105,101,111,109,82,64,22,22,42,54,92,66,73,68,66,33,37,220,247,248,242,239,239,241,252,255,227,244,250,229,232,223,246,230,231,224,245,248,199,205,251,253,174,158,186,236,237,204,250,199,168,230,207,237,253,253,242,213,251,254,240,254,242,226,240,222,245,251,247,251,233,218,120,142,14,6,147,222,249,234,242,239,227,230,224,228,233,214,209,227,249,231,122,46,156,241,243,220,241,254,229,232,227,221,249,238,221,254,220,203,228,236,234,233,221,219,246,211,214,225,220,207,228,231,239,216,240,213,220,227,222,206,213,222,227,212,217,225,226,200,216,224,225,196,216,232,208,209,219,193,211,240,204,219,216,217,192,186,137,148,132,99,51,81,65,92,72,80,51,78,103,56,57,86,44,54,38,41,47,41,61,56,106,70,64,54,37,46,78,184,182,203,220,183,199,212,214,205,219,187,228,207,216,204,197,223,213,222,230,203,207,210,220,196,230,200,187,204,213,202,186,196,188,215,213,216,215,117,7,1,3,27,12,3,16,10,8,33,10,17,11,6,6,20,26,8,26,9,0,7,6,21,241,213,213,199,204,228,213,211,239,226,208,210,224,224,219,234,217,214,214,228,216,204,200,199,210,210,236,224,231,211,198,223,223,239,222,220,220,220,213,236,239,231,192,233,223,241,217,237,217,227,220,233,211,219,246,225,236,210,231,226,217,207,222,220,206,231,212,230,239,211,232,198,225,215,207,189,221,206,229,219,207,239,204,223,244,214,227,214,217,205,206,232,234,231,222,235,220,215,219,241,238,228,233,213,242,232,231,195,237,218,222,219,229,227,233,220,218,221,230,204,226,209,236,231,236,232,231,202,238,243,225,238,218,219,230,237,229,227,231,224,236,219,233,245,235,231,251,212,225,243,244,237,241,224,238,230,232,242,240,227,227,229,242,243,225,225,238,225,252,204,248,254,253,234,236,241,243,249,234,252,238,247,226,255,248,253,249,236,248,240,220,227,231,229,230,227,247,223,245,247,246,243,233,247,216,239,252,255,253,252,233,253,242,244,249,253,249,237,252,245,232,241,246,245,237,240,255,245,236,234,243,243,250,236,235,236,245,228,247,255,254,240,254,235,253,234,244,199,240,248,217,234,248,240,253,241,247,252,249,247,248,246,229,239,234,237,250,244,243,236,246,249,252,244,233,243,251,245,233,247,253,249,255,243,252,253,254,228,247,240,241,248,244,255,249,241,239,238,242,236,245,248,234,238,235,235,247,242,223,230,249,247,235,224,245,249,249,234,250,241,255,252,249,251,238,230,244,242,238,236,239,250,236,241,242,226,209,251,208,194,227,254,226,247,252,194,169,164,128,86,54,168,247,247,252,223,248,243,247,248,154,198,245,245,254,255,252,240,229,231,249,243,231,255,220,173,229,230,247,241,249,247,236,246,247,223,238,250,251,234,255,255,254,252,236,227,218,159,149,116,84,95,20,138,226,252,246,255,249,245,240,251,248,255,238,244,249,243,247,253,251,226,227,253,245,251,246,254,237,250,241,180,178,228,246,253,242,249,229,230,225,246,250,216,225,221,235,251,172,225,189,102,138,99,97,47,63,53,59,195,162,84,46,155,136,58,22,59,116,102,70,43,36,14,35,38,38,50,101,104,90,62,80,34,29,221,243,245,221,240,235,244,249,240,231,230,253,241,239,255,244,237,231,240,245,244,213,181,234,250,196,169,185,220,210,230,237,217,180,192,226,249,247,241,252,249,255,240,245,243,245,251,246,228,251,236,250,251,254,222,140,108,40,3,177,247,238,255,245,240,239,206,226,206,212,218,211,229,247,215,128,72,149,230,179,239,226,217,228,241,235,232,224,228,235,214,228,222,197,228,211,218,218,208,230,204,225,212,232,219,213,209,231,213,214,205,230,209,217,234,219,226,225,216,208,227,193,209,219,203,229,222,207,229,208,210,221,217,207,227,199,213,209,232,204,158,152,95,103,71,61,43,81,71,82,96,98,127,111,65,62,73,45,48,82,81,109,79,137,113,52,58,74,31,25,40,134,171,192,212,217,209,209,186,226,226,213,207,224,224,195,205,186,218,208,193,204,182,234,216,202,232,205,217,213,220,213,224,196,200,204,213,190,217,197,119,17,12,9,19,4,7,12,13,4,29,8,7,1,14,14,17,2,6,16,14,20,11,40,14,227,224,232,199,213,228,223,233,200,234,211,189,208,223,216,225,220,206,224,194,220,223,195,210,207,214,235,205,202,222,222,231,206,220,219,211,226,227,210,221,223,203,192,223,217,242,231,207,218,212,211,215,218,208,236,215,211,225,208,209,225,225,222,211,201,219,213,198,244,227,209,208,201,232,226,231,211,224,221,211,221,191,216,223,221,222,207,235,238,218,236,241,222,235,227,208,217,218,230,200,216,221,242,245,210,243,226,237,220,216,221,222,246,252,215,246,230,239,208,225,251,207,248,229,242,230,232,246,238,212,242,219,223,236,247,245,216,229,215,249,230,206,240,221,232,228,233,239,236,227,239,216,215,247,224,246,215,245,229,235,234,232,238,235,245,238,224,244,231,246,241,241,236,229,235,244,249,228,232,249,242,226,245,234,252,249,248,229,251,246,255,240,246,242,215,242,252,220,242,250,247,233,247,249,237,236,255,245,255,244,247,249,241,245,237,242,243,253,238,253,245,240,215,232,236,223,255,240,245,252,251,243,228,241,255,243,246,243,246,250,247,248,255,240,247,243,255,254,230,251,248,255,254,253,250,222,251,226,229,248,250,254,234,255,212,251,236,251,254,254,231,244,234,251,232,248,246,246,238,237,253,253,254,249,247,234,255,255,239,254,240,230,240,238,241,252,234,228,248,253,251,243,251,231,253,180,159,185,222,243,240,243,240,234,247,238,243,253,232,245,255,230,240,242,233,248,241,245,252,252,247,255,233,246,246,190,208,206,236,201,245,225,243,239,253,199,151,145,122,81,44,190,248,242,224,230,232,246,254,233,210,173,224,255,252,238,246,255,231,249,194,225,250,227,221,227,251,245,252,252,232,253,236,250,241,236,248,245,236,227,204,237,241,239,251,230,142,106,109,131,108,116,57,134,238,248,243,251,244,253,243,255,244,245,211,236,243,243,235,252,251,248,250,255,246,238,244,254,244,239,247,182,135,254,253,249,242,247,247,242,244,251,243,226,255,251,237,248,211,255,194,96,68,36,41,11,34,33,48,223,159,68,43,158,117,27,51,27,108,85,76,44,38,69,51,38,34,62,74,90,59,56,91,30,24,220,253,255,249,246,253,243,252,251,241,244,254,249,248,250,254,254,221,251,253,243,250,194,205,244,231,187,181,134,203,240,220,191,199,192,237,234,254,232,233,246,242,249,236,249,247,241,246,250,254,246,246,252,234,189,104,71,26,44,185,236,205,218,200,241,232,229,234,204,176,210,238,216,227,249,116,50,132,207,216,198,212,208,242,228,249,233,234,214,220,234,226,242,241,219,205,218,202,200,198,223,215,218,213,214,225,207,222,238,215,233,214,216,216,212,206,220,207,210,200,211,215,212,200,221,220,221,201,210,192,212,168,192,235,232,212,210,226,186,181,102,119,110,138,91,90,68,85,80,129,161,162,126,100,121,95,81,57,64,128,136,133,116,131,119,103,77,74,37,53,144,223,225,224,214,210,188,200,202,211,198,227,217,212,205,207,240,193,200,223,193,226,231,200,197,214,188,218,219,220,207,217,186,199,211,181,206,200,211,216,110,0,13,5,8,17,10,4,23,9,27,1,5,2,7,4,4,4,24,8,0,27,15,11,27,210,206,210,229,216,215,188,212,226,227,192,217,213,218,219,215,237,224,229,219,211,234,189,198,226,225,217,216,223,208,246,233,234,218,204,209,232,213,215,231,221,231,226,201,232,220,211,225,203,218,226,188,241,225,221,189,210,226,234,225,201,232,200,220,224,205,217,220,225,216,203,233,212,236,223,215,227,220,209,227,225,203,198,225,215,233,218,214,205,225,237,243,229,220,199,222,209,204,241,207,203,211,245,233,213,241,213,210,224,241,228,219,210,221,238,198,211,239,236,212,237,249,222,227,225,219,227,226,252,241,235,223,203,240,223,226,241,223,243,212,234,225,227,236,230,229,226,238,236,228,231,214,229,249,231,244,226,245,231,238,240,243,242,222,243,240,221,247,240,240,243,255,242,249,221,240,244,247,242,237,243,248,251,251,246,245,243,236,229,243,238,237,238,247,255,248,247,248,251,226,252,254,239,248,238,224,235,238,242,250,244,251,255,243,234,250,251,243,242,251,242,255,237,238,230,250,251,228,245,255,245,253,255,253,253,244,248,250,238,248,246,248,251,253,236,248,224,247,251,250,245,246,253,234,232,242,239,255,242,240,228,215,254,226,234,252,236,252,253,252,241,225,232,237,244,236,249,229,248,252,248,238,244,255,240,250,241,255,245,243,253,254,243,245,240,252,247,238,236,249,238,243,251,249,197,189,193,242,253,242,247,247,243,219,238,238,249,226,246,245,254,251,225,234,255,251,253,230,244,255,245,238,243,250,219,172,190,232,236,233,248,250,254,213,239,206,164,139,139,116,47,215,235,249,233,235,247,244,238,249,185,183,226,247,245,230,247,255,237,178,159,207,246,247,231,222,253,246,229,249,249,238,249,253,250,237,248,255,251,248,210,156,89,85,113,72,99,143,117,158,153,168,123,136,207,188,198,186,188,199,181,153,178,138,183,230,255,255,252,249,246,242,234,218,235,250,255,238,255,225,236,220,182,221,222,234,250,253,246,230,247,255,238,246,248,253,253,199,206,229,135,40,41,18,43,40,70,41,42,196,184,79,51,128,118,61,73,27,124,94,60,81,51,31,75,26,62,45,52,57,74,49,74,18,61,202,250,252,248,249,251,229,232,228,254,240,238,238,246,254,254,255,232,219,231,250,252,204,192,240,228,134,130,144,190,234,192,183,252,248,225,250,237,237,244,238,237,218,245,246,233,255,243,191,128,91,80,110,114,83,65,38,49,52,61,141,145,157,194,232,225,231,203,217,204,214,222,205,220,236,132,41,111,241,202,214,250,225,244,216,246,212,220,228,212,226,236,210,243,220,223,207,217,242,214,221,227,226,215,212,231,219,222,221,216,237,217,215,245,237,210,213,202,190,203,210,228,216,215,237,211,216,200,223,212,227,238,222,214,209,215,225,217,218,194,131,104,92,116,104,92,81,64,56,76,150,127,145,125,115,85,74,51,49,99,137,116,81,129,101,99,86,85,47,15,170,217,227,213,202,225,191,224,224,213,222,209,197,207,220,204,219,205,231,196,209,231,201,220,210,217,227,213,211,206,207,217,203,210,202,210,193,225,191,216,111,0,8,17,15,35,13,6,14,8,8,14,27,9,5,1,0,27,0,13,0,3,4,0,15,236,216,213,217,210,209,232,207,215,229,202,219,217,236,220,222,193,210,236,209,214,207,228,209,207,212,207,245,225,206,213,234,238,201,211,190,205,194,230,203,221,222,213,240,219,234,226,196,216,211,236,214,231,230,217,225,243,221,191,220,221,206,205,213,205,203,200,217,216,206,215,220,215,218,211,210,223,196,208,190,228,231,198,200,204,215,226,210,219,234,222,220,223,217,214,233,202,217,222,233,208,237,221,217,229,200,207,248,230,229,249,225,231,226,249,220,247,246,219,218,226,224,238,229,246,245,197,234,235,235,242,242,229,208,216,231,228,245,233,228,232,214,227,222,240,232,233,221,236,211,240,225,237,245,247,244,235,243,241,242,223,236,233,233,226,241,241,207,245,246,247,245,214,224,250,252,245,253,243,243,247,230,236,248,234,244,254,245,243,239,228,254,236,232,244,244,230,248,234,241,253,245,252,242,248,250,236,252,237,255,249,235,237,245,234,241,253,241,247,246,244,237,244,244,227,243,251,254,250,248,246,252,235,253,248,234,245,255,245,250,233,220,250,243,237,235,247,251,241,254,241,230,238,232,246,246,245,238,254,238,237,249,252,251,252,250,249,233,250,236,255,249,250,227,239,249,215,225,254,247,249,253,247,249,252,248,255,242,252,244,246,244,248,255,217,246,228,252,247,231,236,238,246,249,237,224,249,240,242,233,247,247,253,242,244,237,239,250,232,249,252,245,253,255,235,244,228,253,243,254,247,245,231,250,239,188,246,251,243,190,218,245,252,225,241,218,163,185,152,83,76,189,250,250,241,250,231,243,252,243,184,198,218,230,229,255,253,234,219,191,211,245,250,255,242,249,255,157,143,232,227,252,247,232,255,206,244,244,255,246,181,84,44,48,72,109,109,133,124,170,127,147,146,112,103,89,98,114,90,59,100,100,67,40,158,242,255,239,235,252,252,239,254,246,242,240,229,247,244,249,247,223,157,217,254,245,252,237,242,228,240,247,242,249,239,239,212,125,186,177,95,70,62,61,40,12,83,49,78,176,166,75,26,149,102,62,74,48,102,65,57,64,30,29,55,44,38,19,35,73,65,55,74,37,41,239,244,244,230,236,255,254,245,243,246,250,249,253,231,244,250,255,244,228,242,249,230,224,175,241,241,154,108,159,206,243,202,214,234,227,216,216,248,254,255,243,254,250,241,236,255,247,162,86,62,32,2,7,15,20,25,59,46,44,62,31,52,46,159,203,198,209,200,221,206,220,222,232,251,216,149,65,141,228,211,210,230,224,232,218,228,220,231,220,217,235,242,229,231,218,231,223,208,227,242,212,234,231,205,228,234,204,218,231,211,228,217,205,210,218,200,220,216,200,213,222,233,233,217,205,216,215,211,209,206,228,205,209,200,225,182,196,206,221,225,171,120,100,107,144,86,70,61,33,68,89,132,156,119,109,98,43,32,47,129,121,98,83,102,98,86,97,41,28,50,159,206,184,223,230,216,193,215,218,214,206,220,179,207,229,209,204,205,221,219,207,219,233,199,208,213,205,219,213,212,191,198,186,214,230,197,201,205,207,221,99,11,2,13,14,4,7,7,24,32,4,0,15,24,20,16,0,10,14,17,10,3,6,30,13,201,220,220,202,214,195,227,238,213,236,200,218,227,201,209,229,230,188,222,216,230,233,220,202,235,209,244,209,213,210,225,219,207,209,230,216,201,205,201,235,214,222,190,222,229,236,214,225,226,230,199,208,210,226,201,221,224,236,222,217,211,200,223,232,227,241,207,230,204,193,229,218,228,204,228,240,232,199,193,224,235,213,209,234,224,232,244,229,228,198,246,222,236,198,231,229,229,229,223,216,221,219,225,210,201,214,219,227,232,227,217,239,253,229,200,221,242,236,218,230,241,231,231,229,227,211,213,225,211,239,247,233,252,243,246,217,234,245,227,240,207,229,245,240,214,220,242,223,221,224,248,230,234,232,247,220,236,246,214,238,243,226,246,221,249,243,229,240,249,234,233,237,255,250,231,223,250,242,241,236,241,250,255,250,232,239,234,232,249,246,248,222,255,248,246,236,255,239,255,216,235,228,230,243,255,252,243,252,251,245,245,254,249,248,239,247,251,252,255,245,222,246,247,253,246,255,217,247,236,253,255,229,242,234,243,253,235,243,237,236,236,255,247,242,234,236,233,235,249,255,249,254,232,231,234,245,242,251,252,232,235,243,244,223,239,245,252,255,255,254,247,252,255,243,255,224,247,215,253,250,253,248,244,248,249,247,247,255,242,237,222,249,249,233,241,232,243,248,243,247,253,240,252,243,249,239,254,224,210,255,251,252,255,244,252,249,250,236,249,248,255,254,246,244,246,254,250,231,245,230,252,225,233,255,243,212,231,185,168,199,239,244,255,242,222,205,195,148,130,82,34,203,255,252,241,243,236,234,242,232,198,189,218,241,252,230,254,231,231,208,228,234,237,243,209,195,239,241,207,252,219,247,247,234,236,247,245,254,240,242,202,103,99,83,70,102,69,76,72,69,85,46,49,88,91,73,76,101,78,78,80,76,66,62,173,243,239,239,249,254,248,236,225,232,252,240,231,237,253,255,253,249,152,207,244,244,248,232,236,241,255,224,240,247,238,236,154,112,212,236,164,69,63,97,62,86,85,50,87,171,153,68,33,134,84,52,56,90,128,82,54,64,34,59,40,38,55,23,40,61,72,35,50,22,34,216,255,240,235,219,245,240,250,229,229,240,255,246,252,235,245,252,242,253,234,234,243,215,168,215,233,241,179,217,236,248,249,217,249,239,193,207,241,249,251,242,253,235,213,247,228,203,124,41,76,54,50,24,42,52,46,58,53,77,47,41,15,0,116,198,197,214,207,198,216,204,216,238,250,234,137,71,165,228,219,239,249,235,239,235,239,230,218,231,229,202,235,228,236,242,215,223,221,223,239,226,228,222,213,237,233,244,212,229,219,233,231,206,208,227,204,223,208,225,198,210,223,204,223,196,236,206,192,205,225,231,202,192,193,226,209,198,216,228,185,173,126,109,87,108,82,80,26,53,52,99,122,72,106,83,89,64,37,50,130,87,99,94,95,77,82,54,67,35,67,171,184,185,236,225,201,199,205,194,216,228,216,226,211,210,223,202,201,210,210,206,211,197,221,216,223,239,241,222,206,219,206,210,204,226,192,200,196,203,221,92,2,3,2,24,0,17,3,28,18,6,21,28,9,7,4,3,10,6,20,6,17,21,26,1,209,203,237,192,221,214,214,221,211,237,223,199,220,210,229,199,216,203,226,219,220,212,202,234,198,207,221,197,223,234,237,208,209,230,191,210,220,217,238,234,215,224,221,235,224,233,197,193,217,227,217,216,215,212,212,215,221,230,216,214,208,198,231,224,214,216,226,209,193,228,221,222,203,203,209,218,191,206,226,201,236,203,212,216,219,226,222,237,219,229,197,211,243,229,215,225,210,223,236,221,242,239,242,211,238,222,218,222,227,247,232,234,244,222,235,224,207,233,230,233,227,215,217,207,247,240,227,235,240,230,237,238,220,238,236,227,222,212,241,236,246,242,233,220,228,220,225,251,223,245,225,218,246,241,216,233,222,229,244,235,228,219,224,223,235,255,243,232,234,235,226,245,240,246,247,251,237,231,245,221,241,237,238,250,215,235,235,228,232,242,232,245,223,255,240,233,254,245,253,244,240,246,243,247,253,249,241,242,245,232,243,249,243,237,250,247,229,243,252,235,251,231,227,246,247,247,250,248,227,254,236,240,252,248,250,247,243,241,252,228,246,250,224,250,253,235,234,254,245,245,245,244,255,251,235,236,246,230,238,251,251,227,246,247,255,255,229,255,236,245,251,253,235,244,229,253,233,232,241,238,230,254,232,220,241,224,240,241,251,251,224,240,250,255,223,251,247,252,238,237,253,239,240,238,232,242,251,247,245,249,247,249,243,247,233,231,241,235,238,247,254,250,223,243,244,242,236,255,239,247,248,244,245,225,210,179,202,211,230,221,255,244,241,225,245,200,160,170,136,72,33,214,254,241,253,232,252,239,245,213,154,194,229,231,231,252,251,253,232,191,193,234,226,237,205,234,238,220,235,246,230,249,245,234,255,234,238,242,247,242,190,122,104,73,57,36,72,63,49,57,66,20,46,29,33,44,34,40,81,33,64,57,46,51,66,116,231,243,223,242,254,249,243,232,240,228,253,246,237,249,231,237,130,157,253,251,255,237,238,242,226,204,226,178,144,200,172,226,251,243,166,109,69,93,54,29,83,60,66,172,127,47,36,130,63,36,60,55,120,84,55,39,37,49,60,43,42,39,29,70,60,59,41,29,64,211,249,252,243,239,250,232,252,236,252,248,255,244,242,233,242,251,255,230,244,231,239,253,161,216,251,247,164,184,231,240,254,195,217,234,163,183,187,240,255,235,254,255,253,243,247,149,82,54,76,68,63,72,72,69,58,76,64,53,70,73,32,38,160,218,239,218,181,168,176,191,228,234,255,222,150,50,155,229,202,235,246,228,248,238,231,238,236,213,249,226,221,231,225,220,214,221,250,200,218,231,216,222,220,237,227,225,210,226,217,217,230,231,215,233,201,213,198,207,222,219,207,220,204,200,215,207,205,201,212,231,222,229,209,189,200,191,204,218,211,149,120,60,67,41,42,45,71,56,71,131,119,113,90,68,31,66,39,58,123,118,101,89,71,86,51,42,49,14,88,196,220,207,205,200,225,215,215,225,206,207,209,207,207,223,201,216,214,202,215,225,210,213,216,239,205,208,199,197,202,217,202,199,196,199,213,192,196,196,220,97,12,7,6,6,8,30,19,2,8,26,27,9,4,22,3,6,11,21,16,11,18,11,28,10,213,220,198,189,220,211,214,213,201,223,217,189,205,228,234,218,199,210,227,207,206,223,228,199,228,229,188,219,219,208,212,237,218,211,219,213,224,212,226,245,227,224,235,238,237,228,199,244,216,223,230,191,241,230,221,223,214,210,217,205,201,210,232,211,224,232,230,224,236,199,223,215,212,221,235,240,244,227,216,219,211,224,218,236,222,200,234,207,228,238,235,213,219,213,213,228,240,220,238,233,192,229,209,229,229,207,205,243,218,210,226,237,218,233,238,217,245,203,219,238,222,234,231,224,252,231,238,233,232,212,223,205,246,240,223,205,226,219,232,216,207,229,223,227,244,242,222,241,215,236,212,236,240,235,223,204,232,229,243,233,247,238,228,240,226,232,229,232,224,242,241,236,233,223,242,245,242,236,249,243,235,231,254,246,239,248,223,242,228,251,250,253,241,252,248,240,212,249,240,227,232,248,243,248,243,253,244,226,247,218,248,243,242,241,245,252,245,253,252,245,243,253,251,235,252,221,254,249,229,255,225,243,255,242,248,236,255,252,254,247,255,252,239,249,234,249,229,252,232,247,245,245,247,242,226,237,238,235,245,221,246,254,252,253,253,255,243,243,240,240,233,239,246,250,231,239,248,232,240,236,226,252,242,255,255,247,243,235,239,240,233,237,243,229,236,252,243,242,237,233,232,249,226,241,247,240,251,247,247,251,229,238,245,246,242,254,231,237,245,240,255,251,255,240,235,242,250,240,247,240,221,236,245,244,176,195,247,237,213,211,229,253,255,240,255,173,150,154,136,71,50,218,234,230,237,214,249,244,249,242,205,190,234,223,240,249,246,235,230,185,155,249,253,255,245,232,248,244,225,236,238,237,238,232,254,248,255,239,244,254,154,108,111,64,65,54,66,60,64,76,65,36,66,39,43,54,49,50,66,49,74,55,72,12,45,152,241,239,245,241,232,246,246,242,255,250,233,248,252,243,253,220,154,194,253,254,232,246,235,242,231,180,230,214,222,217,216,232,255,166,154,152,119,140,53,59,102,101,66,154,158,97,46,109,98,77,56,49,105,58,62,56,53,58,58,65,40,55,51,63,63,36,64,21,61,219,254,239,244,224,234,239,221,251,243,242,243,249,253,233,235,252,246,233,239,212,241,252,201,188,249,237,184,163,232,215,243,193,229,201,198,189,199,253,254,227,250,222,240,250,225,140,99,67,81,84,80,99,62,58,53,61,41,63,58,58,38,42,225,216,215,230,201,182,199,198,198,191,237,255,158,67,168,205,200,232,236,225,209,207,231,235,227,225,222,212,225,239,202,227,210,231,231,217,201,223,235,220,208,221,225,234,214,215,219,216,230,227,213,229,225,222,215,229,233,191,222,219,215,237,209,215,212,202,228,214,192,219,206,192,219,218,240,231,203,125,111,68,84,66,60,78,29,34,104,155,104,114,74,59,30,44,48,77,104,89,126,93,68,75,64,46,28,18,97,201,198,218,221,199,206,227,219,201,207,217,197,210,228,215,222,229,217,212,212,207,216,221,219,217,209,190,209,209,213,227,230,233,221,199,205,205,211,206,202,85,21,6,7,14,5,24,3,1,7,27,5,21,3,13,14,3,12,26,35,5,1,24,17,7,207,236,219,230,191,216,214,216,220,221,212,225,225,228,209,233,200,219,223,226,226,207,249,231,232,227,212,230,219,214,234,234,232,187,222,230,216,238,238,211,217,235,209,215,240,235,211,230,219,209,222,223,225,217,200,218,236,209,208,212,220,228,227,239,218,233,218,243,234,247,200,216,234,210,223,239,224,234,215,206,223,219,238,234,213,240,205,214,214,230,186,188,232,218,234,217,218,222,197,245,225,230,193,231,200,217,205,228,206,219,247,219,218,229,206,221,234,219,210,241,246,210,216,230,234,209,250,235,230,234,213,233,208,222,231,218,218,214,234,230,237,247,244,227,232,252,230,250,242,241,239,248,225,227,237,235,240,224,240,241,247,233,235,247,231,237,230,241,224,246,223,229,236,249,221,249,239,232,227,243,233,247,253,255,243,239,243,223,230,238,244,250,242,243,215,247,252,250,239,249,233,224,228,255,249,240,253,245,247,230,251,250,236,245,244,251,247,248,241,243,236,231,248,253,232,234,255,255,250,242,249,255,240,245,242,235,248,252,238,241,228,247,252,251,248,253,251,240,250,255,230,225,251,234,252,248,248,240,235,253,247,218,240,250,248,235,255,225,226,247,225,240,228,241,243,233,238,247,247,213,246,229,249,250,247,252,222,243,240,232,249,247,254,253,233,250,245,255,251,238,229,253,249,251,241,223,248,225,244,234,234,254,233,253,223,255,254,236,247,251,239,240,243,234,253,244,230,242,242,254,248,218,245,247,242,237,253,246,239,183,231,239,227,224,244,189,182,182,142,87,72,243,229,251,247,222,237,243,232,247,224,183,204,245,253,246,252,217,210,205,193,238,250,244,196,243,251,242,229,243,246,251,247,243,245,247,245,244,244,249,153,128,103,69,48,61,94,74,70,49,66,98,84,71,46,79,30,91,91,47,107,88,73,33,122,220,239,243,241,230,244,249,248,225,242,231,241,245,253,247,211,220,156,144,235,250,247,250,218,253,236,234,252,220,253,247,249,214,149,112,166,243,186,134,100,73,123,78,96,145,183,96,128,130,132,122,68,68,105,49,49,75,45,55,43,51,42,19,61,64,76,69,74,14,48,226,247,246,250,244,252,252,252,233,227,225,245,230,254,234,253,251,227,246,230,243,237,251,203,179,242,250,174,164,221,208,196,231,209,232,234,207,213,229,249,254,231,250,242,246,230,120,92,97,66,89,89,70,29,37,59,46,65,49,51,64,31,47,180,230,241,218,230,222,211,209,192,245,244,222,122,84,145,213,218,245,206,244,220,212,230,219,228,215,217,218,230,217,235,222,210,213,214,206,221,231,218,231,202,224,224,233,188,202,238,210,243,223,224,215,234,223,239,192,190,215,201,217,211,233,219,212,221,224,208,182,242,235,199,197,248,208,218,226,190,148,96,76,91,98,45,38,60,50,134,162,112,121,66,66,35,44,18,102,130,107,85,83,61,54,52,54,56,38,169,205,208,199,217,197,217,222,209,209,207,224,213,234,212,220,228,217,216,202,203,225,222,223,206,203,220,213,206,216,215,226,212,200,209,205,202,225,207,198,189,109,2,0,7,2,7,22,14,18,4,7,18,29,0,2,19,17,17,4,10,21,10,40,5,1,209,216,198,201,191,212,205,198,209,226,218,208,197,217,212,236,204,208,196,220,201,214,215,218,228,198,181,196,208,214,233,214,193,215,235,222,230,225,229,201,242,192,208,215,221,224,216,228,216,209,215,215,219,220,212,211,187,219,225,221,226,221,244,225,225,245,217,219,242,242,219,222,213,221,229,225,242,210,222,227,214,235,240,219,219,207,238,202,237,238,229,228,246,204,206,221,219,226,216,213,237,224,200,223,226,234,238,225,240,218,235,230,221,221,214,250,232,247,220,245,230,226,237,218,240,206,220,227,211,207,243,228,215,226,242,249,227,236,200,222,230,238,247,222,228,238,234,250,231,236,233,247,219,236,233,232,226,245,214,248,212,210,246,225,227,231,232,226,249,252,231,238,216,237,246,220,255,224,247,225,227,229,241,240,235,237,247,244,227,234,246,253,244,242,245,244,238,255,249,248,249,251,250,241,252,242,254,233,247,255,249,255,251,252,234,255,218,238,218,247,250,238,243,245,252,249,244,247,250,237,244,252,229,248,241,252,223,254,252,252,255,244,234,245,249,252,250,250,241,242,243,238,252,249,227,253,246,244,247,253,255,251,242,240,238,242,242,231,250,255,220,244,248,228,243,240,244,243,235,234,231,246,237,247,233,235,240,246,253,243,236,254,254,255,245,239,251,250,252,233,248,253,237,249,237,228,251,241,230,238,232,249,238,239,250,251,235,251,247,233,245,246,249,246,245,240,244,229,233,251,242,222,230,230,199,243,240,229,253,206,234,251,236,242,246,192,169,155,111,49,86,236,253,250,232,245,240,239,253,244,246,179,190,242,234,253,242,237,215,185,235,247,255,236,246,251,229,186,192,238,238,251,237,233,244,238,229,241,230,253,170,155,126,51,65,52,40,36,30,48,71,60,71,35,52,51,63,85,97,66,44,49,71,15,176,240,255,253,238,242,241,247,239,252,236,255,248,231,236,255,232,241,176,121,233,245,239,242,221,244,250,252,229,255,232,248,208,181,101,92,202,246,218,190,131,96,101,109,49,139,188,125,83,85,130,122,72,107,137,106,102,84,31,34,48,52,58,7,54,79,79,47,60,27,51,200,246,244,254,243,231,253,242,235,246,228,254,240,244,244,238,251,235,248,225,239,246,248,217,179,240,249,232,183,195,155,167,223,235,205,230,225,174,230,240,253,249,254,235,245,206,106,62,87,75,100,76,46,16,53,49,32,28,60,45,52,24,43,198,223,233,250,229,252,225,240,243,240,248,228,113,70,176,222,185,237,216,255,217,235,231,214,222,218,204,231,223,238,219,238,217,219,217,220,214,223,214,217,237,237,217,236,200,208,212,230,212,237,197,219,234,194,214,187,209,225,211,222,232,228,225,193,234,212,237,219,206,222,222,202,210,191,217,212,222,143,73,71,76,93,81,50,94,74,136,145,109,116,72,68,54,41,70,128,120,95,136,104,102,57,51,13,32,113,187,234,226,223,208,214,222,219,238,209,227,232,210,202,214,205,204,231,220,220,197,192,184,221,224,189,201,206,224,211,170,196,192,202,222,216,216,194,227,189,205,106,8,13,4,27,2,6,21,3,11,12,10,17,13,17,15,30,25,2,7,20,7,38,13,15,217,241,220,199,217,209,217,197,214,205,219,227,204,225,211,211,219,210,205,217,214,225,219,234,179,225,210,234,225,201,223,223,217,224,222,228,220,229,231,202,217,223,242,206,223,214,217,237,233,214,225,212,219,235,219,215,200,221,240,243,229,225,243,205,215,235,209,212,201,212,216,212,236,212,202,230,210,220,234,221,222,231,209,206,210,235,226,206,191,241,228,233,228,217,229,222,228,208,209,232,231,188,211,215,209,234,211,221,227,200,228,211,234,235,205,249,235,222,232,235,243,233,221,202,231,245,231,233,245,237,214,228,221,229,211,244,244,230,238,243,237,234,243,239,225,210,233,248,235,240,236,229,218,235,219,236,252,253,237,228,204,238,232,236,245,242,224,245,237,243,250,235,239,239,245,235,236,211,241,251,226,245,237,249,253,242,242,241,247,250,229,251,230,255,253,225,254,232,232,236,255,223,245,228,244,245,237,203,253,220,251,252,246,226,248,224,254,234,253,240,243,242,245,247,239,217,232,252,237,255,252,249,252,246,227,249,221,230,252,237,249,255,250,255,227,251,255,255,231,235,217,239,241,226,249,250,255,248,237,238,233,237,242,242,241,245,239,250,236,240,255,242,253,247,224,242,235,230,251,243,253,231,251,252,219,247,255,245,249,230,242,233,220,248,252,241,228,253,235,224,246,226,227,240,243,255,226,229,237,235,247,254,240,242,236,239,254,241,243,248,246,234,246,248,236,232,243,238,252,250,237,248,238,232,184,202,234,229,228,232,237,235,203,223,247,174,158,147,125,62,83,235,251,250,255,232,237,224,255,246,219,199,200,242,221,255,244,221,214,222,242,255,238,221,227,224,195,176,220,233,229,246,250,253,227,249,228,225,242,245,172,127,125,112,49,66,67,63,70,43,44,61,46,51,42,48,45,95,77,55,75,53,48,31,165,236,254,241,236,255,221,255,246,234,238,236,240,242,251,252,226,248,201,113,244,240,250,233,223,241,225,238,249,234,223,178,172,177,149,144,199,248,228,117,87,69,78,87,62,138,159,89,80,88,95,111,70,108,174,145,92,62,50,64,47,74,44,33,67,89,65,66,55,37,57,186,236,243,239,250,252,234,224,236,247,234,242,243,221,255,231,244,250,250,213,245,247,248,215,190,217,248,217,104,103,167,223,239,230,164,209,224,213,229,238,248,255,250,224,124,115,84,105,82,86,103,79,74,49,53,36,34,69,35,29,53,31,46,189,223,214,241,240,245,231,240,251,251,239,207,102,93,191,214,211,239,205,210,230,226,222,198,221,218,190,197,200,237,219,214,214,241,231,211,191,223,218,216,239,204,217,204,208,231,219,206,217,222,191,220,235,202,220,233,216,211,213,230,197,231,218,217,206,207,201,198,185,214,212,217,200,204,191,235,190,177,140,78,75,72,83,36,69,65,105,157,156,135,109,66,48,48,73,146,134,136,133,87,68,36,54,14,93,201,240,226,215,194,208,186,202,205,202,215,220,211,214,208,215,213,220,189,197,197,217,231,227,201,189,193,213,208,208,198,224,226,216,201,187,217,216,232,217,208,227,82,1,1,32,7,21,14,8,6,13,7,11,13,2,0,26,7,3,16,7,5,3,7,13,14,205,220,213,216,211,226,221,230,207,203,228,224,225,238,204,223,225,185,208,212,199,212,240,214,228,226,211,228,230,246,231,219,207,214,216,211,226,206,232,225,217,242,228,219,194,214,218,209,204,222,228,225,229,220,233,212,197,227,216,232,226,218,203,211,225,225,234,220,226,234,204,214,230,226,194,219,233,203,212,243,214,213,232,227,224,223,201,229,202,197,209,222,221,225,221,243,237,224,217,242,233,235,215,225,212,225,215,228,200,215,235,226,237,216,206,222,227,217,225,231,230,231,229,220,215,222,232,226,218,217,222,234,236,236,224,221,246,233,210,223,207,233,233,240,254,240,240,242,250,228,224,247,232,232,253,230,245,249,223,228,218,236,249,229,244,231,212,239,234,235,235,215,251,241,249,255,247,233,251,252,234,238,223,236,253,243,216,242,246,245,237,234,245,245,220,235,250,249,255,245,245,235,243,228,253,254,252,238,241,246,247,228,233,249,250,248,251,253,227,247,245,254,255,240,253,248,237,250,252,250,249,251,243,234,241,252,247,243,252,248,252,253,253,242,250,254,242,240,236,251,249,255,249,242,251,234,249,236,252,236,234,247,243,248,247,235,250,248,253,255,232,249,246,237,248,251,245,236,219,251,244,251,224,254,250,242,241,245,231,251,241,251,239,250,251,253,253,243,231,237,250,243,238,247,249,253,224,244,246,240,249,250,230,252,218,231,246,233,252,254,253,231,253,252,225,246,250,249,252,255,249,208,234,234,199,241,249,253,242,214,228,235,203,206,241,212,149,161,117,54,74,237,255,252,232,231,255,250,246,241,225,194,207,246,246,245,237,203,210,201,208,210,232,190,185,223,219,224,227,237,236,237,240,250,248,235,246,235,245,180,122,115,119,91,33,52,49,27,27,38,40,49,61,31,39,48,55,74,55,57,63,65,36,52,170,232,244,255,245,253,252,245,237,255,232,249,224,252,234,250,241,222,233,155,224,243,235,247,242,249,249,238,212,187,111,149,204,230,173,174,238,245,118,94,64,52,58,85,67,141,145,102,98,76,119,79,93,133,145,118,62,45,55,91,88,82,46,49,74,57,73,71,57,32,37,234,243,249,246,240,255,242,239,246,235,246,249,255,248,253,224,251,248,234,217,210,241,248,230,172,193,235,217,174,152,205,240,249,238,173,194,246,209,208,234,228,244,238,121,82,82,51,80,80,91,89,78,60,50,51,36,55,46,24,31,37,15,47,226,236,235,224,226,201,216,228,233,246,239,210,90,119,192,202,207,212,219,231,233,225,239,230,224,210,225,218,237,212,213,214,210,223,227,234,241,211,239,212,203,230,201,241,211,220,214,191,240,195,223,209,207,235,226,200,214,210,198,225,215,214,213,210,229,214,226,225,199,194,216,214,241,206,224,195,219,209,136,100,116,75,43,71,93,56,139,178,149,125,85,67,56,37,107,125,117,133,109,69,52,62,31,51,167,215,227,201,213,218,205,196,197,173,209,216,221,201,203,211,225,210,196,222,197,209,210,215,209,223,223,221,233,228,207,219,191,213,215,206,227,207,205,211,199,200,212,109,14,6,7,0,7,18,24,3,37,10,5,48,7,0,8,9,7,2,13,24,27,20,17,3,197,223,234,233,227,204,223,223,214,225,220,203,215,198,209,216,196,199,207,204,236,201,203,217,213,215,213,218,219,224,218,245,192,213,204,222,233,226,197,217,222,234,211,211,220,217,226,216,227,212,213,214,226,222,242,227,225,210,204,221,222,200,212,201,205,207,242,237,207,201,217,231,223,223,239,222,202,209,224,233,227,215,227,248,225,226,218,205,224,205,218,208,227,226,216,202,215,211,231,219,216,223,216,223,203,230,216,208,241,232,218,228,224,214,233,217,204,201,244,221,199,227,227,234,224,233,223,207,227,223,235,201,216,219,226,237,212,227,226,239,217,234,222,221,241,233,234,234,216,235,243,242,254,225,230,249,242,238,227,233,235,212,208,215,249,233,229,221,235,238,228,223,234,207,246,217,243,227,235,234,245,216,224,245,240,232,250,241,232,236,249,252,245,237,237,241,250,255,237,233,243,231,250,218,249,236,242,254,245,239,246,245,236,250,225,228,243,250,242,249,200,230,245,241,240,240,249,251,230,252,255,235,255,240,245,230,246,247,252,239,249,251,241,255,253,229,244,251,239,249,249,255,244,237,253,246,242,230,255,211,242,255,225,219,228,254,239,230,250,247,239,235,250,221,248,213,242,255,247,246,246,252,253,254,227,241,233,249,245,247,253,252,214,225,239,242,237,226,255,250,245,239,240,240,248,243,239,230,241,242,238,254,242,241,229,242,226,254,248,251,231,231,236,255,226,249,244,243,229,246,233,221,248,233,217,250,219,198,217,222,252,228,219,199,243,197,179,169,137,79,100,235,242,251,251,242,252,251,233,253,235,219,208,244,226,249,244,211,193,145,205,222,226,222,234,243,236,223,233,242,231,246,236,246,231,226,249,243,190,144,98,110,78,24,16,29,27,28,40,21,19,9,29,37,18,35,51,81,67,60,72,82,40,110,244,244,227,241,224,219,253,244,247,254,250,250,247,245,235,219,246,230,211,138,215,250,255,250,208,234,241,186,160,194,193,231,200,193,198,183,243,229,169,134,126,77,63,77,81,132,196,135,81,70,130,105,107,130,138,115,63,55,66,62,88,65,50,28,64,89,48,45,61,40,34,222,245,242,246,245,243,223,231,226,239,219,243,255,244,253,242,237,222,224,251,224,230,253,252,230,187,239,249,233,150,220,217,227,245,218,205,239,201,184,217,233,252,236,124,52,57,41,105,94,94,82,80,64,36,30,35,40,31,54,28,74,48,75,237,233,236,248,226,220,217,213,238,224,221,182,86,132,208,207,202,228,233,243,238,222,225,244,216,222,225,228,227,226,220,253,222,222,237,216,228,219,193,229,210,207,210,227,201,217,231,214,199,176,227,180,193,214,216,224,206,205,201,194,200,222,210,228,204,222,216,212,202,204,215,213,182,222,207,219,243,220,148,106,116,119,73,79,66,46,123,161,117,124,89,54,38,35,124,151,94,113,99,70,64,33,24,138,212,218,213,199,214,187,234,223,227,191,215,236,207,207,212,227,226,216,230,211,195,224,200,229,207,212,210,213,238,202,212,222,227,206,213,216,213,225,189,200,236,187,198,102,18,1,26,23,31,3,14,18,7,28,22,2,18,15,5,6,5,0,20,5,15,13,5,17,215,214,209,209,212,223,230,228,211,209,197,192,206,224,240,244,203,218,230,238,221,217,212,215,215,209,202,215,194,233,228,203,245,229,227,235,208,232,229,239,208,198,243,247,214,228,217,210,216,241,221,213,238,203,207,217,219,233,210,214,220,215,213,226,233,217,225,236,223,237,206,215,207,234,249,226,233,229,222,224,246,235,222,204,228,213,223,209,236,210,201,204,236,222,227,221,201,219,229,233,217,214,215,208,224,201,225,232,220,233,231,228,216,238,219,235,223,252,233,226,209,210,238,221,246,237,247,230,226,244,237,219,216,241,225,202,244,211,216,211,218,238,224,245,243,247,227,223,228,235,245,244,230,235,239,226,224,249,243,240,236,255,224,231,223,218,232,234,230,243,228,231,229,212,251,226,231,239,249,220,244,219,201,240,216,240,240,226,223,249,244,246,245,225,235,250,251,225,243,245,249,240,240,236,242,249,238,239,249,248,241,238,255,246,237,251,235,237,255,250,239,253,233,245,234,255,227,228,250,243,240,225,253,221,238,250,238,253,239,250,243,251,238,252,246,246,243,235,248,240,242,247,253,241,235,252,252,253,255,231,233,254,245,247,252,243,255,250,255,242,237,226,233,246,234,213,251,242,248,236,241,249,255,255,253,234,235,231,245,245,253,250,237,235,232,228,236,239,248,221,252,254,254,253,246,223,242,218,241,244,251,255,239,247,252,248,227,251,249,228,249,254,255,237,248,251,215,219,224,247,219,228,253,224,203,215,203,236,230,248,224,233,211,227,255,192,168,155,126,90,85,229,247,242,230,234,244,211,222,241,226,198,173,219,216,255,247,203,149,173,251,249,247,211,241,253,206,203,225,242,247,249,237,255,244,219,222,233,188,145,104,94,57,18,23,34,56,30,26,26,14,44,56,43,23,26,51,76,75,56,83,60,91,214,254,250,254,245,244,232,213,255,220,240,255,225,253,251,236,248,215,199,173,116,181,245,237,225,201,207,210,167,220,234,242,233,205,207,219,223,254,249,199,115,107,74,71,65,58,106,174,144,93,107,153,131,106,140,134,108,79,64,64,75,73,75,71,68,40,39,48,67,63,6,78,233,241,240,219,252,243,219,222,228,242,239,247,244,246,241,241,243,252,254,249,233,251,243,239,182,174,231,255,226,174,173,232,198,247,196,206,234,208,172,212,235,248,239,132,101,57,63,87,106,119,115,55,46,53,41,39,43,48,22,17,68,33,81,251,240,250,252,212,221,230,239,234,250,239,163,68,154,198,191,229,224,225,227,206,226,208,221,215,242,207,226,236,225,230,222,208,206,235,192,221,222,200,203,212,213,222,233,222,225,204,210,210,214,217,213,213,203,203,197,208,200,214,215,210,213,234,209,215,212,204,203,216,192,213,214,202,201,183,239,246,197,160,110,88,112,94,91,73,69,145,136,130,132,91,51,63,61,123,129,143,134,89,90,57,46,41,192,230,222,225,217,222,223,197,205,193,209,188,230,207,216,217,213,220,218,232,193,217,214,202,214,210,219,195,201,213,211,232,199,217,204,206,216,233,216,221,216,206,210,203,105,7,7,23,14,11,13,8,42,8,9,18,36,9,7,0,37,19,16,24,0,16,22,9,15,203,220,205,211,230,207,241,207,207,196,214,205,223,213,224,219,229,211,206,216,207,230,205,217,199,221,210,218,202,211,235,240,238,227,236,223,209,229,201,203,229,221,210,216,225,225,219,216,229,220,211,214,216,230,229,215,219,231,232,223,217,221,218,202,235,216,203,218,231,212,198,231,206,214,202,220,236,221,243,235,227,219,220,223,229,232,235,203,218,202,214,213,231,208,226,230,244,220,234,215,226,236,242,212,208,217,230,242,229,223,233,249,221,229,217,223,227,220,221,231,224,209,216,233,218,230,228,233,224,239,226,235,241,243,225,240,230,233,206,208,202,222,215,237,245,222,239,235,211,230,226,241,240,233,219,214,233,227,219,238,221,237,236,249,238,239,215,229,228,239,221,245,241,220,245,233,236,224,230,228,214,248,234,240,234,247,240,226,246,246,212,249,214,248,247,248,251,241,243,243,228,252,246,248,240,228,249,242,239,237,228,224,244,253,227,251,251,237,252,216,241,250,245,238,245,244,232,243,246,230,233,251,221,204,251,245,244,251,242,241,255,251,253,242,242,230,253,245,247,233,255,255,249,244,226,249,209,244,234,246,214,252,252,253,247,220,245,245,240,231,244,244,242,254,219,253,234,244,250,254,250,234,254,255,251,238,248,245,222,250,254,252,241,243,254,242,244,224,230,235,236,235,224,241,242,241,255,223,225,244,233,234,240,232,238,235,214,249,249,237,239,244,253,245,255,235,254,232,224,251,232,234,248,202,208,234,222,254,239,202,237,221,211,215,242,146,132,132,120,85,98,230,252,246,236,227,244,240,226,241,220,196,184,224,251,234,215,168,195,201,251,247,245,206,237,255,208,212,233,250,239,253,243,241,252,233,233,234,194,131,101,56,24,10,30,30,17,20,48,21,27,32,33,14,7,24,58,81,85,75,43,38,155,240,250,255,224,242,236,247,251,243,233,255,231,254,218,237,242,252,246,227,214,116,178,238,255,234,204,198,244,216,252,251,253,224,182,225,208,202,236,250,188,158,134,111,51,92,58,114,202,172,90,82,131,109,74,80,147,102,108,64,49,63,60,79,91,82,68,78,87,54,81,36,52,224,253,240,254,225,245,239,225,243,246,247,250,252,239,250,246,217,235,234,242,208,231,234,252,228,157,206,219,224,188,143,201,230,209,233,183,209,224,187,198,225,252,215,123,113,71,108,106,111,108,67,60,67,59,41,19,37,46,31,15,52,11,79,230,236,238,242,228,212,236,231,226,225,206,146,64,145,221,197,219,222,235,245,229,241,233,240,227,210,232,218,201,208,205,226,216,206,238,196,209,212,227,215,224,220,225,219,195,209,225,216,202,221,180,193,219,203,198,196,205,221,195,217,194,176,224,191,216,205,204,225,216,183,198,217,205,238,192,223,223,207,188,147,125,90,99,105,89,81,135,144,130,141,73,55,78,99,129,103,138,122,105,104,77,29,105,225,199,203,203,210,214,221,214,192,207,239,203,229,206,220,186,218,198,218,214,197,215,201,225,213,235,238,198,210,225,201,215,200,226,226,185,203,208,215,204,198,194,222,221,116,7,2,10,19,10,11,10,4,17,23,25,19,21,2,0,5,36,28,14,8,0,2,13,4,233,231,207,200,226,193,215,229,209,218,215,195,215,221,211,205,218,220,219,212,215,202,225,227,233,224,214,236,235,208,237,211,211,213,232,210,241,193,223,201,226,216,228,218,208,225,220,204,225,214,238,233,226,242,226,219,225,231,214,220,208,231,217,210,229,218,237,206,227,231,203,229,236,208,223,215,241,241,190,208,227,235,222,238,209,227,246,234,217,222,238,229,235,199,229,226,219,235,217,225,219,215,219,217,231,221,218,189,240,221,226,224,230,232,225,216,243,238,238,239,229,229,224,206,212,215,221,236,229,215,237,246,228,233,216,222,224,228,218,236,237,225,225,224,221,245,246,230,245,244,243,221,245,246,231,246,243,219,238,251,236,226,218,230,246,239,232,246,235,249,230,253,239,237,245,249,245,253,244,254,245,239,242,249,237,245,252,246,236,252,243,242,251,246,246,245,251,246,253,219,244,245,243,254,245,236,248,247,252,253,234,255,254,248,246,255,240,240,249,252,239,250,255,240,252,228,254,241,246,237,232,234,245,250,254,248,223,228,242,223,252,254,232,251,250,235,244,216,242,235,232,245,229,228,243,250,255,239,220,243,244,246,247,214,239,237,253,250,233,239,252,255,248,229,214,246,239,244,243,226,231,248,254,239,252,216,240,246,255,242,255,241,254,238,243,252,246,242,237,216,252,243,233,254,227,244,247,238,236,247,250,243,253,249,239,245,249,254,236,234,237,243,236,244,238,237,248,241,236,245,243,219,236,220,221,236,243,244,205,216,238,252,213,207,237,178,137,153,139,81,120,217,247,230,248,230,243,244,232,253,242,210,170,235,254,245,203,180,212,234,241,254,243,200,240,249,225,177,249,249,226,249,248,255,228,246,219,248,170,99,67,32,8,26,17,37,30,20,25,31,26,48,47,54,45,49,49,67,51,42,61,33,175,235,247,232,223,241,236,251,246,228,235,239,234,236,255,245,237,231,255,244,235,175,152,252,251,201,202,249,243,245,255,249,242,163,74,151,170,220,240,254,202,118,129,91,52,89,122,149,146,112,57,47,70,72,36,97,119,146,99,68,46,80,65,62,83,86,62,89,93,35,52,3,60,217,237,238,248,251,245,254,242,213,243,250,243,253,241,242,243,247,205,233,241,247,234,241,255,220,184,198,242,242,208,169,182,219,168,219,210,218,200,217,187,210,252,245,99,123,55,66,72,71,43,55,10,42,35,34,26,37,30,18,29,47,15,87,230,235,235,254,227,242,254,238,237,236,254,147,118,179,223,188,229,212,236,241,251,229,227,218,201,206,239,230,227,231,224,232,232,216,203,202,198,226,216,186,214,219,215,208,222,234,220,219,223,212,213,235,221,215,232,199,234,219,212,206,194,233,200,196,219,203,203,203,208,186,218,224,183,194,212,222,235,213,218,119,124,106,99,92,74,75,135,124,123,107,65,69,97,101,128,128,124,129,120,122,95,22,51,160,180,228,226,209,238,199,221,228,214,206,220,219,221,196,201,201,200,201,210,217,220,201,239,235,196,189,207,196,198,194,199,202,196,220,213,200,223,229,239,221,206,195,195,108,13,5,22,3,2,7,16,3,15,3,0,0,1,19,27,2,0,14,20,3,12,9,22,11,206,232,204,235,218,218,217,184,209,227,220,217,208,218,232,232,215,204,205,206,224,207,201,220,224,218,206,214,234,235,238,213,230,220,205,217,248,206,215,232,229,212,219,215,234,211,227,217,211,204,222,212,221,231,213,205,208,234,223,233,219,220,213,202,219,221,227,232,241,213,221,210,208,214,243,239,228,209,196,237,208,231,196,215,240,215,211,221,198,232,225,248,229,230,230,218,199,233,210,227,237,218,216,218,206,231,247,213,222,224,225,227,213,225,243,229,223,215,233,223,229,217,216,225,225,238,228,214,221,223,227,241,231,223,238,241,210,225,234,224,231,238,231,221,244,245,234,235,237,216,241,238,242,238,219,226,226,219,250,241,227,246,219,234,234,249,227,223,242,250,228,230,230,227,244,216,245,218,237,225,230,250,244,235,241,242,235,239,250,218,254,244,232,230,231,239,240,236,246,236,235,245,230,243,238,231,241,239,244,226,240,255,255,225,246,239,223,251,235,218,234,248,240,242,241,226,255,237,247,253,220,236,252,255,239,255,243,247,227,242,255,236,255,234,243,250,223,247,239,242,255,252,255,252,236,237,251,245,240,249,254,251,246,229,245,248,241,255,233,249,242,221,247,230,226,251,253,253,232,250,246,245,239,241,253,252,254,255,255,242,221,220,247,247,236,250,252,238,238,240,238,238,243,224,246,248,235,249,233,253,222,236,247,251,222,243,253,250,231,241,255,228,241,243,252,255,251,237,237,246,204,253,244,219,235,231,179,223,220,207,250,237,224,214,249,206,150,152,134,83,88,221,230,250,241,255,255,225,240,251,227,223,209,221,249,237,215,198,206,241,252,255,229,188,231,249,238,234,255,254,243,251,247,201,236,251,217,249,167,121,74,68,66,11,3,3,40,25,14,36,41,74,71,49,27,44,45,10,23,55,100,134,239,237,254,242,250,251,232,245,221,223,237,255,235,255,249,248,245,252,243,226,255,174,185,227,228,198,230,247,255,230,252,250,233,155,96,158,184,221,253,246,147,113,57,51,21,105,145,167,147,69,31,34,28,68,66,68,120,98,59,68,72,48,75,68,91,49,82,117,102,53,91,16,56,226,240,240,230,244,250,240,252,222,251,255,214,252,246,234,241,249,246,219,253,236,204,241,251,239,184,192,235,231,201,181,187,232,184,189,252,182,212,249,234,213,246,193,93,56,11,3,13,8,12,9,5,28,63,29,46,33,42,34,30,54,28,92,237,225,254,212,244,242,255,226,247,234,235,140,115,213,197,198,239,221,223,206,243,198,223,231,224,230,227,202,233,234,208,216,227,233,233,180,223,217,219,225,224,227,222,231,227,208,197,225,186,204,190,229,215,214,230,204,199,214,192,224,199,226,216,218,241,187,204,220,219,196,205,206,207,198,199,194,204,207,137,161,99,103,79,90,78,77,121,117,142,114,75,61,86,116,144,98,116,131,121,99,77,44,40,106,163,224,220,211,209,211,232,231,212,213,210,226,204,215,222,236,226,207,219,228,226,194,231,211,238,225,199,207,193,208,232,185,206,212,232,207,220,203,222,214,205,214,203,113,10,0,12,42,7,5,17,2,15,27,16,16,10,0,8,4,6,6,13,27,18,26,21,2,237,228,231,188,236,221,242,227,217,202,232,234,217,235,223,183,204,194,218,217,223,196,201,220,199,205,232,196,230,227,200,219,217,225,231,223,218,228,214,228,221,241,214,213,216,221,242,239,213,198,189,227,212,223,226,213,225,232,225,229,195,209,220,234,209,223,210,210,233,214,216,216,211,233,212,201,225,233,204,225,215,207,227,218,196,201,222,231,227,225,236,210,218,249,230,238,195,214,230,225,216,218,220,222,217,242,214,219,230,226,220,236,205,208,216,227,216,226,231,212,232,228,222,222,212,229,214,210,195,224,216,217,228,217,232,219,216,209,242,232,228,218,249,227,243,232,234,208,230,237,230,237,236,243,231,239,236,214,214,246,217,245,219,242,225,224,236,233,240,229,245,242,233,248,230,239,247,252,252,237,233,238,240,232,241,239,246,246,250,252,235,215,249,244,247,237,242,238,253,255,234,240,233,247,248,252,248,233,237,246,235,252,222,254,248,247,245,246,228,228,244,246,236,228,253,254,249,255,242,251,248,233,255,242,227,249,246,245,241,248,253,241,231,251,251,251,253,236,238,245,242,234,239,236,244,250,249,246,238,255,251,252,236,255,235,230,231,246,240,255,253,255,248,235,229,255,230,249,252,251,252,247,251,251,251,247,249,253,254,241,246,238,243,246,231,240,235,248,235,248,254,230,244,239,237,250,228,252,229,247,231,211,255,255,251,240,255,239,234,228,225,240,252,233,233,222,237,252,233,252,239,231,241,207,213,215,241,245,251,240,255,231,231,202,243,192,176,145,146,66,108,235,246,249,255,246,238,243,233,246,219,199,169,184,245,251,232,205,186,236,237,250,217,208,227,243,244,208,226,255,250,238,255,240,246,237,242,209,220,189,168,169,216,220,232,179,87,30,52,40,45,55,92,41,40,75,85,115,81,182,205,186,246,251,241,238,230,255,232,235,249,233,235,242,244,250,248,243,246,245,232,223,248,184,140,217,184,192,243,242,223,245,245,248,179,81,176,216,188,245,253,254,181,133,128,92,99,165,199,200,212,140,51,75,107,108,108,119,126,70,76,73,51,66,49,76,92,38,86,92,49,69,65,7,70,225,252,229,255,248,243,234,252,235,240,237,250,246,246,246,255,222,219,250,254,244,240,228,252,254,179,164,251,225,238,203,162,152,148,210,232,189,160,232,222,199,229,198,102,71,72,74,66,60,46,38,37,53,38,32,33,52,41,50,28,42,8,127,239,243,253,232,229,245,232,255,247,243,254,148,122,221,209,202,224,222,234,225,209,219,224,226,234,209,230,222,217,236,226,216,205,243,229,210,207,232,228,219,229,199,230,204,198,234,214,226,232,231,211,213,205,213,200,212,226,205,186,209,220,192,194,199,211,192,206,199,211,195,211,204,221,205,223,201,216,180,146,130,143,121,86,86,76,85,122,120,128,120,81,98,95,113,115,99,94,122,99,90,57,28,82,211,190,208,219,219,218,219,223,191,204,222,206,198,204,200,215,212,213,228,215,204,224,224,237,204,202,214,201,194,215,200,219,218,194,215,205,202,227,192,186,203,215,198,179,115,10,16,11,13,18,34,4,5,10,10,25,4,11,14,7,24,4,8,10,43,17,15,7,39,205,219,234,211,223,220,210,235,204,193,224,214,228,214,212,244,225,211,221,218,203,226,228,214,230,219,202,229,241,219,197,204,212,228,240,249,241,228,232,227,215,217,224,216,201,219,231,232,213,189,224,212,206,212,230,200,208,216,220,230,210,212,179,204,208,229,211,221,214,202,222,215,208,217,235,212,238,200,218,227,232,220,241,189,228,227,216,219,234,217,228,216,231,223,208,203,226,212,223,243,228,203,206,216,210,251,208,232,207,236,228,236,221,251,205,200,211,230,219,242,228,207,242,217,235,205,225,226,212,242,212,210,243,215,217,235,241,226,232,212,234,245,220,242,230,231,227,231,211,255,223,245,220,218,221,243,227,231,227,234,244,233,235,233,219,238,224,230,235,231,254,238,242,216,242,228,237,245,251,228,228,220,242,236,241,245,239,244,252,233,237,239,228,219,246,233,243,232,225,232,245,249,233,240,242,255,236,232,232,232,249,240,232,225,240,231,224,239,242,235,225,242,243,235,242,246,225,234,239,248,250,250,255,252,245,247,249,249,255,237,241,230,228,255,253,246,251,252,220,249,250,253,235,238,255,250,245,239,252,231,255,254,254,244,241,237,238,235,231,238,224,249,243,244,233,249,253,228,240,232,245,253,216,237,235,232,236,254,245,212,231,218,251,229,227,234,253,231,246,244,239,246,232,253,231,254,254,254,247,229,241,213,223,239,249,247,244,255,253,236,249,252,228,243,243,248,255,232,241,244,231,208,252,215,238,239,255,253,194,218,221,229,216,237,245,174,134,122,118,84,94,223,254,253,243,242,253,226,249,243,253,217,170,209,223,248,201,184,229,244,242,236,228,238,241,255,227,134,172,239,213,238,248,253,253,255,255,248,247,218,254,237,252,255,254,199,111,11,84,108,58,51,74,74,65,197,238,224,144,221,164,146,250,245,229,224,222,246,246,250,254,241,250,251,247,241,242,233,247,242,251,233,252,196,156,183,159,197,238,245,254,226,169,218,173,128,225,212,212,230,255,247,251,194,142,139,170,191,217,202,226,136,96,114,149,159,148,150,151,120,95,71,67,73,13,69,72,65,47,99,87,40,41,19,61,227,231,240,245,253,242,235,250,246,228,244,232,224,234,255,246,244,245,235,240,249,238,200,255,246,179,168,198,225,215,191,80,144,202,196,219,171,180,248,215,130,197,206,242,219,245,244,233,252,246,255,253,234,145,96,99,85,54,31,22,11,34,132,247,212,239,232,251,250,239,254,231,209,222,109,165,240,202,215,233,228,223,231,243,245,199,226,211,199,215,230,205,230,224,246,229,223,211,214,228,215,205,198,195,218,195,198,218,213,201,205,197,229,188,193,209,223,217,197,208,230,193,201,240,209,213,179,183,209,200,228,195,213,187,208,189,192,202,204,229,214,137,121,120,116,89,72,99,80,118,144,126,130,102,121,146,135,130,99,133,104,125,98,79,18,102,229,213,244,214,218,224,225,214,232,187,228,203,224,227,211,199,236,232,215,205,223,207,198,225,209,185,198,211,206,202,214,224,207,196,197,197,201,191,217,195,207,242,215,198,105,9,1,7,8,3,1,12,17,20,12,24,10,8,9,9,7,17,14,15,18,7,36,18,11,213,216,181,233,236,211,210,226,213,240,203,214,217,202,212,216,225,211,212,221,221,237,224,218,214,237,231,228,207,218,202,222,197,223,225,243,221,226,211,222,194,224,224,233,202,231,208,221,217,228,246,240,226,226,214,224,205,198,247,232,211,219,223,235,203,225,207,219,225,222,210,223,208,218,231,214,226,207,227,211,239,208,227,223,213,217,213,182,210,219,205,226,218,232,224,221,235,204,222,231,218,221,214,220,216,217,225,229,207,219,226,215,217,234,230,223,235,228,205,213,234,244,227,222,221,239,234,214,226,218,229,232,206,233,189,229,234,237,225,244,245,217,227,241,234,244,238,248,217,248,215,229,233,237,224,218,243,249,240,213,232,239,247,246,244,250,214,244,211,219,232,239,237,232,240,252,246,247,226,243,255,252,222,231,214,228,228,243,224,238,242,228,229,244,253,251,242,240,247,253,220,226,252,233,251,244,222,239,246,254,228,230,235,252,255,242,249,245,225,221,255,235,241,226,250,244,237,249,253,241,252,250,252,254,246,251,253,252,236,240,250,243,219,239,253,246,249,245,235,252,239,253,253,228,223,253,250,242,225,253,238,249,248,255,248,254,254,242,255,249,224,249,252,243,255,255,208,251,246,255,242,225,248,249,252,238,248,239,239,231,224,250,249,233,252,238,225,240,249,235,235,228,239,235,247,250,246,241,250,255,250,223,251,234,231,242,229,231,234,234,247,234,251,250,252,254,246,248,242,227,232,254,255,242,237,226,173,171,184,232,224,213,196,226,216,175,135,149,163,66,102,242,247,247,240,231,235,211,244,234,245,245,210,228,243,239,202,174,228,248,218,238,223,169,252,239,187,89,72,210,236,242,247,227,233,254,227,246,248,234,226,211,251,243,249,169,86,23,67,121,84,66,64,56,67,234,255,255,178,101,13,73,221,236,255,203,221,238,232,248,246,238,241,232,225,255,246,235,250,247,239,215,206,194,164,186,160,219,246,237,235,128,166,222,136,158,237,203,254,249,252,253,250,175,115,111,122,85,88,79,94,87,81,61,45,56,50,76,50,64,69,87,75,77,36,43,78,59,71,99,93,55,38,2,50,220,239,247,230,252,245,240,240,237,227,248,246,250,244,237,253,227,243,207,236,238,232,217,255,252,209,177,228,241,217,204,143,186,218,236,233,238,217,252,218,150,219,236,215,245,250,252,255,250,252,252,250,255,187,118,132,88,48,19,28,33,25,133,230,204,233,235,242,248,241,248,225,162,176,112,164,227,214,205,204,224,214,227,215,213,243,207,220,207,216,226,199,219,207,223,207,227,228,213,208,196,215,211,214,180,219,194,229,185,205,188,212,212,204,206,212,214,222,207,193,205,224,224,216,198,233,216,198,209,223,225,205,195,222,211,199,209,214,218,224,177,158,152,130,136,94,82,69,93,138,167,157,141,114,106,102,111,102,90,104,154,122,110,42,30,197,208,218,225,237,230,213,212,221,198,223,221,198,215,214,211,235,217,200,204,233,188,201,220,202,200,204,224,249,192,207,198,232,216,235,220,219,189,201,205,204,206,221,207,208,118,6,7,0,4,2,7,19,0,7,6,16,16,24,7,7,1,5,4,21,15,14,30,37,14,207,221,219,228,235,220,240,214,218,206,201,208,217,248,212,211,225,214,226,214,205,206,219,204,210,213,208,223,200,231,214,230,188,207,199,249,217,205,237,221,217,208,234,213,197,203,207,222,223,230,187,228,237,236,223,227,196,198,213,214,233,214,217,214,234,225,244,210,221,225,224,204,225,207,211,196,219,216,218,210,206,206,203,191,211,228,208,221,215,249,220,242,239,235,240,215,250,211,230,227,230,220,231,196,219,222,217,215,213,213,222,234,237,201,212,224,227,217,249,226,220,227,219,216,218,209,224,206,200,222,204,230,243,193,234,226,214,224,233,213,190,206,244,232,232,234,228,247,215,229,230,253,221,222,250,237,240,234,216,227,227,239,219,249,236,251,217,232,236,248,243,239,236,218,252,232,245,250,236,240,252,231,252,220,255,236,251,233,224,249,251,230,253,236,241,245,221,243,247,253,235,253,246,246,253,239,240,254,218,250,240,241,249,223,248,245,236,236,241,228,233,252,242,213,243,249,240,254,224,248,246,248,254,228,247,251,228,255,255,243,253,241,251,238,248,238,252,244,251,249,255,249,241,244,223,252,238,243,254,216,228,251,224,254,243,244,228,229,255,233,247,255,225,240,245,255,235,252,247,244,252,229,245,246,235,234,231,224,252,234,222,255,244,249,248,248,246,243,219,225,237,248,227,239,229,228,226,246,248,244,240,223,232,245,250,244,252,254,248,245,253,235,255,249,255,238,233,248,236,245,231,233,252,235,216,176,209,223,198,231,244,233,213,231,247,205,153,155,148,117,103,252,244,247,243,214,238,246,223,249,255,230,213,193,251,231,200,230,217,246,219,252,186,112,195,200,164,50,43,168,245,239,249,252,227,232,239,241,254,213,223,188,220,250,214,146,99,26,50,123,63,68,73,24,22,216,252,217,66,22,24,91,203,236,235,198,242,228,229,251,232,228,235,239,249,235,255,239,241,247,255,208,229,211,202,155,123,213,218,136,172,190,255,255,160,191,228,232,250,249,243,223,248,140,119,80,36,7,3,0,59,114,97,47,30,40,22,44,70,49,55,28,68,62,56,73,101,65,52,93,111,68,52,21,59,217,246,240,234,242,255,255,243,224,211,215,247,242,226,242,255,254,239,231,248,244,238,222,222,254,211,181,195,244,241,242,218,180,220,206,241,218,161,242,213,212,225,212,246,248,246,252,214,235,247,247,246,254,175,128,132,83,67,13,45,11,27,201,250,241,238,220,249,250,254,239,217,192,204,130,187,217,190,202,220,242,202,218,230,218,218,216,216,228,196,237,219,200,229,225,210,203,220,223,189,228,199,219,211,185,221,234,209,225,196,199,214,217,206,198,185,168,217,218,194,217,217,193,213,207,214,208,206,191,212,217,190,202,207,221,219,223,218,213,205,211,163,131,107,109,97,75,90,110,148,119,122,120,91,73,64,64,68,100,133,139,102,107,69,39,163,209,218,205,222,206,218,215,202,204,231,235,198,216,204,189,199,211,227,205,210,217,239,207,204,228,183,239,226,193,224,232,196,207,214,222,205,191,207,196,216,206,191,220,208,133,5,20,17,9,6,27,14,0,19,8,12,12,7,23,22,9,5,30,18,11,14,15,20,7,233,217,214,211,228,221,224,233,219,216,206,230,235,228,201,204,246,222,225,221,218,199,205,224,223,194,203,223,217,201,192,232,237,225,216,204,220,220,224,221,218,236,233,226,231,191,224,216,207,214,211,229,208,226,211,220,242,206,210,216,206,208,225,219,203,201,212,234,210,231,231,213,220,213,212,209,207,205,202,217,221,235,234,238,219,233,210,206,224,204,209,237,209,214,198,233,226,219,207,224,193,206,229,238,240,235,230,219,220,215,240,225,198,217,230,212,234,215,240,231,225,219,228,228,209,217,206,241,237,233,224,208,229,219,199,218,225,224,220,212,233,216,231,213,227,237,226,242,241,236,229,223,252,243,196,230,223,243,242,240,240,236,217,229,223,230,247,220,240,244,233,224,244,251,244,242,250,245,241,243,230,255,233,227,237,245,236,243,225,248,243,205,243,254,248,236,252,230,231,240,250,240,251,218,241,243,252,255,221,231,219,237,239,244,229,249,255,247,240,239,254,244,236,248,249,255,253,240,229,251,240,237,242,246,255,251,255,249,245,246,245,245,229,255,243,234,246,248,250,252,222,240,253,253,217,244,255,244,238,246,240,227,224,235,231,245,244,236,225,251,252,238,231,217,248,252,247,255,252,236,242,252,235,238,231,245,234,254,238,242,247,237,230,245,233,245,249,226,227,245,241,241,234,240,254,219,250,250,236,245,231,239,254,238,239,250,253,252,242,225,244,245,238,245,247,245,238,221,239,227,199,235,228,190,221,222,244,231,221,235,248,228,213,233,253,157,165,142,126,119,154,233,246,255,249,247,247,241,246,240,245,241,217,218,230,221,238,195,233,235,240,253,121,35,174,196,193,96,15,107,209,230,254,242,242,254,232,218,245,198,209,173,192,233,163,133,93,0,63,78,71,47,52,47,20,200,249,135,34,2,28,147,248,250,239,194,244,255,254,246,231,227,242,234,236,238,230,230,246,247,253,196,146,165,192,205,145,200,200,199,235,222,255,210,81,209,237,178,233,250,253,254,242,233,94,90,7,37,30,22,24,71,34,48,49,44,48,62,92,50,30,47,54,64,30,92,70,68,39,111,106,53,42,55,78,211,252,246,237,255,231,241,250,233,220,243,236,248,235,242,240,242,247,223,228,253,214,233,217,252,243,158,183,211,238,227,215,159,209,223,244,221,176,206,214,224,218,210,253,255,245,242,241,236,238,235,255,243,143,86,109,85,34,52,54,20,33,200,240,225,255,226,226,250,235,221,179,192,171,152,224,206,203,198,205,229,207,251,211,232,196,213,235,217,211,228,216,200,202,237,223,225,217,207,210,210,204,220,196,183,205,198,215,193,215,220,209,203,201,200,201,202,210,192,200,197,224,208,210,203,231,181,201,199,200,213,206,217,212,186,202,206,194,206,220,173,140,133,75,89,89,79,115,126,131,147,97,66,59,64,49,61,66,87,84,107,82,67,43,4,138,214,218,231,230,207,204,211,226,218,229,217,198,223,226,194,216,222,206,216,208,191,198,214,206,200,197,211,197,208,207,224,205,224,210,228,234,177,239,209,223,232,211,187,206,113,2,2,8,13,18,0,6,1,31,18,14,10,6,9,9,17,23,0,12,8,5,0,14,7,226,220,233,210,224,224,211,207,198,209,222,217,233,220,189,217,226,203,224,188,214,214,205,231,223,207,216,212,209,227,203,212,192,220,227,236,210,213,210,240,221,223,231,224,224,212,220,214,201,219,198,236,214,226,214,211,231,225,241,224,216,225,236,222,221,232,227,224,217,222,196,212,226,215,229,212,218,185,212,219,208,215,207,228,212,218,212,209,218,212,228,220,219,223,235,215,212,244,202,227,218,240,237,230,247,202,228,206,203,211,181,234,231,227,224,198,193,230,208,249,222,228,219,228,234,238,208,208,219,233,209,216,242,244,205,236,248,239,212,223,246,227,224,225,249,243,219,231,239,224,229,244,229,220,241,212,244,233,216,243,246,250,248,220,229,239,234,233,236,236,247,251,254,247,238,208,238,230,245,248,231,235,231,246,251,245,234,254,244,232,233,221,247,246,222,243,220,224,244,247,253,253,247,249,236,243,234,229,241,250,239,250,248,250,232,231,240,255,232,252,228,231,250,254,246,236,248,239,245,242,225,237,247,249,237,246,221,214,247,234,227,242,249,248,247,252,246,248,248,243,255,242,240,249,255,233,251,247,253,240,229,234,225,248,230,254,236,242,235,233,237,246,227,227,250,250,248,227,238,250,252,245,245,243,239,246,255,232,228,249,233,252,243,245,238,248,249,252,239,245,248,224,223,233,247,255,253,243,241,229,250,240,237,234,233,237,251,244,237,224,248,244,253,248,255,238,233,252,248,209,189,230,227,215,216,237,236,232,183,217,222,200,206,239,238,167,161,124,97,114,101,143,218,233,235,237,250,233,242,252,245,244,211,224,241,201,173,135,188,235,236,224,122,59,116,193,202,152,74,151,236,246,255,235,230,249,250,208,242,220,188,150,224,238,152,135,79,5,77,79,80,90,76,24,21,198,250,141,39,72,206,233,239,249,241,167,246,255,254,253,231,242,251,240,247,251,230,226,249,237,238,160,76,68,160,138,201,252,243,253,226,230,246,152,102,179,188,211,251,247,240,249,232,230,157,81,50,51,32,26,92,76,86,66,33,38,65,80,80,33,59,41,37,45,46,82,62,91,70,108,95,48,80,16,51,202,227,255,232,237,253,242,235,254,215,238,243,245,249,243,220,237,249,233,228,229,236,233,239,251,252,186,169,212,232,241,210,174,192,242,228,201,209,180,207,207,219,193,229,255,246,223,235,228,233,232,251,249,149,100,90,86,59,35,38,19,50,216,243,230,241,255,254,237,195,226,207,210,178,179,208,224,212,196,218,219,214,225,222,215,208,208,224,184,217,233,234,220,237,227,206,226,211,211,193,215,219,209,219,215,187,204,207,215,190,200,185,201,197,181,187,202,210,213,193,223,206,202,161,217,192,226,213,224,186,209,198,200,224,197,219,199,203,206,220,175,115,114,111,100,75,61,123,159,136,147,145,101,67,64,83,66,90,92,116,95,66,45,35,164,225,209,216,213,222,184,214,184,213,208,240,229,231,229,203,214,223,219,219,227,218,217,231,228,225,227,226,221,197,217,210,219,213,199,217,208,212,208,194,210,208,200,228,220,228,110,21,0,8,0,3,2,11,24,17,25,33,2,7,12,1,12,18,14,26,14,0,12,2,31,211,224,203,218,246,194,222,213,202,225,220,213,218,218,204,207,215,231,178,226,213,231,216,225,207,206,210,215,204,211,236,218,206,206,217,213,198,215,219,209,213,205,232,201,208,204,213,205,206,233,222,230,228,217,200,249,204,192,238,209,213,219,192,231,238,235,235,219,230,208,215,218,216,243,196,195,198,206,230,202,212,246,211,194,230,220,222,220,236,227,223,215,211,229,231,221,224,224,234,227,225,219,236,227,202,216,219,234,190,218,199,202,205,225,207,226,236,224,207,225,229,217,218,214,220,229,228,236,230,223,206,222,219,243,250,223,219,218,218,225,230,235,232,215,227,215,220,222,224,236,239,232,235,236,236,203,223,234,220,231,220,251,227,236,232,239,241,232,251,246,236,240,250,237,234,237,247,223,253,248,233,251,232,254,245,229,240,255,249,245,252,240,225,247,244,249,237,236,239,247,237,243,252,228,242,245,250,246,242,247,253,236,237,255,245,239,253,253,249,241,229,236,250,244,255,231,255,239,237,241,240,240,238,253,216,235,238,237,242,235,252,249,253,250,246,230,241,252,252,255,241,224,249,239,238,246,242,255,252,233,232,237,249,254,237,249,246,246,243,234,249,224,210,252,231,236,230,250,239,199,234,254,249,248,255,232,234,244,254,237,242,255,241,230,241,251,235,255,227,232,249,246,252,236,253,251,252,253,246,246,255,237,245,235,233,237,228,244,224,240,247,252,242,240,254,249,245,241,252,241,216,205,230,246,249,198,219,152,197,232,219,204,209,242,252,171,128,155,124,73,60,67,73,149,248,253,245,251,253,255,233,252,231,201,112,116,119,43,114,203,228,233,176,48,72,154,228,194,150,205,236,241,246,250,245,254,225,243,227,174,179,196,240,247,184,154,67,14,89,84,64,71,70,41,29,223,247,224,72,196,240,230,238,250,231,175,247,246,245,255,238,230,236,225,238,249,242,230,255,239,246,205,36,16,46,99,232,237,240,223,120,198,230,131,177,232,221,249,246,253,242,238,255,228,207,141,114,44,36,89,143,92,75,53,31,41,50,71,54,42,70,47,51,9,65,84,62,85,53,115,105,95,85,50,62,218,240,235,250,255,255,243,230,250,226,227,252,243,241,236,251,255,253,234,213,233,244,226,211,246,246,196,154,197,241,238,218,176,163,239,211,182,231,195,160,212,230,197,176,214,225,232,240,239,249,239,252,233,127,129,72,76,61,19,26,36,84,244,255,232,245,224,224,215,189,219,213,200,187,184,228,224,200,198,205,219,206,235,206,218,213,220,221,221,209,230,221,217,207,207,215,212,214,189,220,217,220,211,217,208,224,220,217,224,208,232,189,191,210,209,181,178,185,204,208,195,206,198,211,220,192,200,227,202,231,189,224,208,192,191,230,221,228,204,215,199,176,127,119,120,99,132,181,168,166,160,161,119,105,103,69,84,80,136,110,82,88,64,110,215,227,233,207,233,196,215,215,215,238,221,214,210,225,224,223,215,225,225,203,185,207,215,198,179,193,218,219,211,224,234,213,209,188,215,189,207,219,211,204,210,232,242,212,198,192,113,5,2,3,30,6,22,21,13,23,12,22,20,4,8,0,15,7,0,8,6,44,25,13,13,230,208,236,210,203,220,220,242,231,228,195,219,210,213,238,209,230,205,203,206,208,210,231,211,208,207,210,222,234,215,206,214,205,214,236,218,240,218,216,215,219,199,213,236,218,240,218,216,213,212,250,210,232,210,213,210,208,249,204,221,234,217,215,223,207,208,206,220,221,215,250,237,206,239,210,208,206,212,212,233,235,252,219,216,208,218,225,228,225,217,201,218,221,220,228,234,207,215,216,241,205,214,227,215,200,226,228,237,213,200,236,218,224,202,200,241,234,224,219,229,214,222,228,215,210,238,230,231,240,201,219,233,245,240,248,240,235,218,214,196,239,229,229,229,222,206,231,253,231,235,226,243,228,211,239,230,240,228,222,247,246,231,226,228,240,242,221,255,253,241,240,238,224,215,233,246,223,228,243,235,249,238,255,244,233,253,238,232,235,218,250,248,244,246,242,241,250,240,232,242,236,219,227,247,237,237,244,239,227,247,252,247,235,227,234,245,219,255,255,237,224,252,223,247,244,226,250,246,224,229,247,254,240,250,250,242,238,246,220,231,255,253,247,246,241,230,237,246,248,236,243,230,246,246,245,236,221,236,239,234,239,246,244,222,253,232,231,255,252,243,243,239,248,239,255,234,255,247,247,255,227,241,254,227,237,231,247,246,245,236,232,249,252,246,243,236,231,232,219,252,229,249,255,247,243,240,247,238,245,249,248,255,236,235,242,243,233,252,236,239,254,242,246,239,220,253,255,223,250,216,212,246,209,248,210,169,244,206,224,225,242,206,212,252,253,198,170,178,115,74,61,43,24,66,193,231,249,245,234,236,249,253,231,202,60,108,104,1,54,213,255,238,165,130,145,197,232,220,173,212,244,241,253,226,217,238,241,236,231,210,180,170,222,244,163,129,53,1,91,118,109,55,52,38,52,217,248,185,164,246,247,243,236,240,226,184,244,245,218,225,227,252,239,233,229,247,234,226,243,242,241,160,58,66,40,82,234,248,231,214,166,244,228,174,232,238,197,250,254,233,251,232,242,241,238,233,84,57,13,54,107,75,48,46,78,34,57,78,38,57,79,66,18,1,24,101,78,57,34,109,115,68,61,47,57,184,250,251,249,239,255,242,230,244,238,212,233,238,255,244,230,251,250,254,214,234,243,247,227,250,246,215,150,197,244,245,212,164,125,166,174,150,242,207,142,188,237,172,193,200,239,247,239,228,246,253,255,224,145,91,82,87,54,54,28,22,114,237,222,220,239,247,232,181,230,219,216,207,141,205,232,220,225,197,198,229,214,216,207,200,248,204,232,195,203,208,215,221,216,217,227,196,227,203,203,200,211,186,202,217,224,221,180,220,211,206,208,203,211,218,187,186,215,215,209,212,219,223,201,215,202,220,218,193,193,201,219,231,207,183,207,200,203,212,238,204,146,161,122,112,84,89,133,105,97,96,96,78,66,44,32,59,61,57,94,60,38,46,90,219,229,238,253,214,211,223,213,227,234,236,215,191,221,207,191,213,203,229,218,218,216,228,227,222,204,206,216,226,217,195,213,233,198,199,194,202,215,187,205,222,215,216,203,214,203,134,2,14,19,21,7,9,5,2,23,4,10,17,17,2,11,17,27,10,18,12,23,23,8,33,219,216,227,209,198,224,247,233,203,226,221,224,232,227,232,226,207,208,226,223,225,215,216,227,218,213,201,222,216,218,217,222,196,219,242,231,224,222,223,234,227,236,233,226,207,225,230,210,232,245,208,225,216,211,239,216,219,226,215,191,234,216,196,227,213,220,227,217,221,224,226,219,214,231,217,220,238,221,200,221,234,223,233,229,209,231,221,225,206,205,203,227,219,220,233,217,214,247,213,226,232,225,226,224,240,223,240,227,223,228,232,234,222,216,231,234,228,211,230,195,208,241,175,238,221,227,241,238,232,223,241,209,230,229,205,239,237,233,225,235,209,220,242,229,212,226,226,240,255,222,219,235,230,250,237,247,241,227,240,246,239,233,222,225,241,244,237,245,247,228,223,237,252,239,226,234,251,242,247,241,224,253,239,245,227,253,226,251,247,252,251,239,251,228,245,223,255,247,239,244,252,233,230,235,239,226,244,222,242,225,238,247,226,229,231,254,251,255,232,230,229,249,246,247,244,250,252,236,245,220,251,223,244,235,230,237,244,247,237,255,252,232,233,243,236,242,255,238,233,239,234,237,251,254,233,248,238,244,242,237,245,232,216,254,241,247,224,243,219,253,254,234,229,253,253,244,234,228,242,246,232,255,252,227,247,236,229,234,231,248,243,249,253,245,240,239,250,226,240,249,246,227,238,247,239,246,234,243,245,237,253,246,250,248,242,238,243,237,229,254,253,244,254,230,233,242,225,238,240,240,250,234,200,227,209,222,249,200,235,246,229,209,195,241,245,177,142,131,155,69,70,109,54,128,209,240,208,252,250,244,253,224,221,185,43,142,181,12,47,195,248,255,232,194,205,203,251,233,191,215,231,243,237,240,224,232,247,238,237,212,200,206,242,244,135,147,70,0,81,79,73,33,39,19,47,213,253,187,147,243,253,226,244,214,200,203,244,249,251,255,241,254,249,242,236,241,219,238,236,229,236,213,174,228,133,52,117,176,186,188,215,234,204,169,241,234,229,249,246,249,241,249,249,244,239,190,129,77,54,12,12,6,49,13,25,45,67,71,39,58,78,71,47,11,64,111,73,100,61,120,110,58,64,14,19,179,236,251,242,226,245,249,234,246,242,244,227,232,237,253,228,250,243,223,231,237,222,228,208,204,239,237,150,200,216,237,221,232,86,92,168,180,251,249,161,200,221,155,157,184,244,237,252,237,240,212,230,241,119,131,72,60,26,25,8,2,139,245,250,251,252,228,197,219,252,235,228,159,135,229,208,244,216,200,231,231,215,230,205,219,194,239,203,233,216,211,233,208,223,199,194,211,214,220,226,209,204,208,196,225,237,221,210,195,200,215,236,184,197,198,199,199,215,225,209,203,214,206,193,215,198,196,219,217,213,194,189,195,198,175,217,221,221,219,234,172,130,98,76,67,66,59,31,28,42,52,68,98,57,73,25,37,20,27,41,77,19,38,163,217,229,218,219,203,210,213,216,202,227,227,216,231,217,208,224,209,205,222,219,208,221,211,222,197,231,223,210,232,217,215,209,220,220,206,206,233,241,207,218,213,201,194,216,217,237,88,12,12,16,0,16,13,15,11,2,26,9,19,14,14,9,22,4,6,9,32,33,20,7,0,223,210,204,209,211,222,221,207,232,217,226,212,220,233,223,242,222,227,227,224,206,216,232,206,219,214,220,233,213,222,241,202,184,189,231,196,235,231,214,211,213,210,209,238,219,220,226,196,240,214,213,209,208,219,229,208,208,225,205,215,224,213,207,209,235,207,222,213,202,232,198,204,238,231,213,242,192,225,226,228,238,216,222,222,243,224,244,235,214,194,227,219,223,234,212,220,223,218,231,206,205,225,221,235,210,224,236,229,219,209,232,213,220,244,237,246,202,217,202,230,226,220,216,234,226,225,244,238,237,235,220,222,226,234,231,239,245,224,218,240,214,225,227,228,237,216,249,234,250,229,223,230,227,242,247,250,239,220,238,224,209,243,238,220,244,233,240,227,224,251,224,243,252,243,223,218,224,242,249,248,235,252,238,247,229,228,251,225,235,235,220,224,222,241,248,246,251,243,243,250,252,230,250,250,223,251,243,242,244,255,254,222,243,233,251,251,245,255,252,241,253,248,225,246,252,250,245,224,224,224,248,238,238,242,232,243,250,253,226,247,247,235,248,210,205,239,219,237,241,253,255,255,242,246,247,243,240,254,235,244,232,243,243,238,255,246,255,236,217,238,247,240,238,227,232,241,235,235,235,230,244,227,246,235,251,241,235,239,249,246,228,223,235,245,233,254,247,230,233,246,252,248,238,246,252,254,246,237,241,234,253,234,252,239,230,246,216,217,233,253,255,246,214,243,230,238,235,255,255,213,241,236,197,221,243,243,236,194,207,224,225,214,205,254,237,162,153,125,148,39,148,224,230,228,224,234,241,247,236,245,242,231,212,163,71,191,175,57,117,225,254,248,231,217,209,190,245,197,203,184,224,239,246,252,243,247,245,240,249,202,202,191,228,224,142,135,56,0,88,85,54,55,41,40,59,221,247,124,151,247,220,218,214,238,206,219,239,236,242,240,227,235,228,213,240,245,254,249,243,253,242,246,247,246,180,67,75,67,212,235,234,244,142,165,245,228,224,238,240,244,246,245,243,231,220,222,236,209,140,140,109,46,44,7,24,59,58,46,64,117,79,47,18,26,42,109,73,74,30,92,98,76,76,38,55,161,225,245,250,255,235,238,250,243,253,229,225,243,221,232,242,243,244,239,223,235,255,244,240,206,253,247,182,160,178,247,230,214,150,152,207,215,247,236,199,223,220,191,176,218,213,235,243,213,230,236,252,193,97,114,85,84,31,40,14,32,177,232,253,245,236,208,205,234,246,196,176,127,157,193,242,227,203,194,213,221,232,210,231,220,228,191,220,232,220,209,217,193,193,226,209,194,210,210,195,211,183,233,223,220,224,219,240,238,204,188,216,200,216,228,212,199,182,193,190,211,203,216,219,187,235,214,219,218,203,213,204,208,192,212,217,186,207,215,200,127,65,53,34,63,54,19,54,24,36,40,59,78,54,57,44,59,49,48,63,55,35,78,204,216,241,232,209,203,223,214,213,221,210,210,205,238,217,201,237,197,209,194,214,205,209,222,207,231,234,217,202,227,218,200,210,232,200,192,209,215,193,233,231,194,208,211,206,194,221,116,0,14,6,8,16,3,12,0,4,8,16,25,2,6,10,26,3,7,13,18,3,3,0,13,213,239,203,207,207,206,191,223,214,221,175,200,226,222,247,214,178,230,222,218,232,229,206,213,211,241,230,217,223,228,207,224,196,222,225,238,214,212,215,232,189,233,194,220,224,201,206,212,222,217,208,225,223,237,230,209,211,232,235,228,207,243,243,228,237,224,214,219,223,195,237,221,234,202,204,232,188,215,231,213,200,224,246,222,234,217,226,224,230,232,209,216,224,215,215,227,214,247,207,222,208,219,225,228,215,241,220,224,238,220,229,214,203,229,215,223,236,203,235,241,246,233,196,214,202,207,236,233,241,217,226,193,241,198,237,221,239,240,214,225,237,223,228,242,228,220,237,219,226,215,228,245,227,229,234,234,225,245,247,229,216,216,249,241,253,239,241,248,229,245,237,238,248,236,229,240,247,230,226,241,218,227,233,235,253,247,229,232,247,254,253,249,239,246,241,239,254,253,250,226,239,255,247,238,226,248,246,233,245,242,242,252,232,232,253,224,244,254,216,253,234,225,223,235,248,250,244,235,233,241,253,251,243,249,245,243,244,226,243,224,227,247,241,233,247,239,252,239,251,236,239,235,249,240,216,243,253,232,233,247,239,255,241,220,252,241,252,250,237,242,249,242,248,255,245,253,237,239,231,243,250,254,241,244,240,237,234,244,234,235,238,234,250,246,253,247,241,218,239,240,235,241,227,246,232,224,230,236,236,247,253,242,250,225,248,255,222,243,245,241,246,247,234,229,236,249,247,244,252,227,253,243,217,241,215,179,204,191,225,244,233,208,216,247,225,189,145,148,115,50,136,237,254,247,205,199,216,236,219,244,230,245,244,213,85,134,210,160,218,250,221,252,169,220,187,189,207,216,219,207,245,253,236,240,248,251,237,227,251,208,176,189,248,248,120,145,28,9,90,86,60,54,47,39,23,245,250,116,176,250,236,212,200,203,177,195,228,248,231,242,228,249,240,221,218,249,238,233,231,231,255,249,246,228,171,144,83,68,221,225,253,231,126,175,251,201,205,237,242,209,251,251,234,255,212,225,254,255,246,244,240,210,85,6,44,164,82,38,37,67,102,62,71,49,56,89,82,57,63,100,99,31,49,22,47,176,245,255,241,252,239,244,236,234,229,243,226,216,247,251,235,249,247,252,227,245,252,228,216,218,244,245,210,174,213,243,224,229,156,145,237,248,217,253,212,226,194,201,218,222,211,231,234,224,241,234,254,231,114,111,89,78,28,28,3,79,243,245,254,248,218,187,217,238,245,152,74,79,180,222,218,241,186,203,221,189,209,231,215,193,206,208,232,213,211,217,196,203,191,223,233,215,224,218,220,219,215,192,212,213,186,216,198,202,206,217,218,213,214,207,209,191,226,184,201,204,199,216,206,216,209,212,172,194,217,191,203,232,179,220,210,210,211,219,180,147,109,87,66,53,81,51,67,58,46,62,52,45,40,48,41,30,31,52,72,51,22,111,211,222,215,211,229,228,237,226,212,205,220,220,217,219,196,226,234,191,227,219,206,207,205,212,209,191,243,223,211,197,226,220,199,222,184,214,223,206,217,202,218,208,207,225,200,240,206,87,1,0,23,10,2,0,5,13,15,40,6,24,14,7,14,16,10,8,2,11,7,7,3,7,242,212,235,212,229,203,210,236,214,222,227,196,219,169,222,212,216,234,217,234,236,221,219,228,215,227,217,223,235,229,226,224,211,229,229,236,215,225,210,208,210,213,233,226,225,234,193,225,219,212,219,234,200,222,222,210,232,199,213,212,214,205,234,226,210,223,235,238,223,238,238,224,214,230,227,228,251,216,206,215,210,227,208,231,191,211,236,220,212,217,211,242,222,219,209,214,222,253,239,225,231,209,218,244,223,224,223,241,248,219,224,227,224,226,219,229,220,239,236,235,228,204,227,235,225,242,229,200,247,217,227,219,225,235,217,245,237,224,239,225,209,228,239,219,241,218,224,235,213,239,218,241,228,225,225,223,233,244,244,230,240,240,235,229,245,236,243,247,210,243,229,230,231,246,247,235,232,246,235,227,236,247,240,229,227,236,240,244,247,249,252,239,238,197,244,230,247,255,254,236,242,253,255,225,252,248,231,254,240,242,233,252,232,220,226,229,224,238,253,252,241,243,236,226,254,236,242,238,240,216,245,252,230,244,240,232,244,252,248,246,242,238,240,245,232,240,240,244,234,254,245,248,252,247,254,228,239,220,230,249,255,238,244,242,242,237,246,236,253,232,237,235,228,242,233,248,236,237,255,249,252,248,225,247,251,252,242,247,215,231,245,231,239,253,216,233,246,237,252,223,241,251,244,235,226,250,240,255,242,225,240,239,234,249,239,253,244,246,246,250,234,236,234,247,248,255,234,218,233,225,248,232,177,203,206,239,242,231,247,254,227,210,220,253,247,163,160,145,116,55,108,221,246,250,206,173,242,248,219,219,222,238,216,195,145,184,183,215,251,254,251,254,210,217,213,206,211,234,248,240,232,252,252,253,242,237,221,216,239,197,186,190,236,202,126,151,57,9,56,82,84,38,44,36,47,230,251,117,178,226,252,227,204,213,211,205,246,244,231,251,238,236,251,248,240,232,218,235,228,229,243,235,248,213,140,153,82,50,213,237,213,185,64,221,230,211,251,235,254,253,246,244,234,236,218,232,251,253,255,245,244,213,131,32,85,175,68,38,33,72,97,51,25,37,63,93,66,68,44,113,130,50,65,35,66,213,236,255,236,246,246,245,236,227,239,224,224,235,239,230,247,250,244,209,203,241,249,248,217,222,234,239,211,153,193,243,252,203,184,171,174,240,209,222,195,165,164,201,245,200,204,231,230,233,245,237,252,205,114,110,85,83,50,19,20,152,229,246,244,243,201,209,228,246,207,133,44,60,216,232,237,214,193,229,226,204,226,210,217,198,199,230,216,187,212,210,224,206,226,191,203,201,221,193,203,233,196,203,232,209,199,196,207,190,243,197,228,210,209,219,214,224,246,228,195,220,190,222,179,212,207,214,199,192,203,209,199,201,198,231,207,234,197,225,200,171,167,183,119,125,69,63,99,48,101,110,77,64,83,65,46,34,61,60,50,32,140,218,242,206,217,230,213,242,212,236,218,222,232,215,220,238,233,228,226,216,212,222,223,220,213,219,211,206,212,218,189,210,189,242,225,201,211,202,200,194,208,201,198,218,218,210,216,190,232,105,7,0,6,18,18,0,14,16,16,14,26,2,36,9,1,36,16,4,12,2,6,8,10,17,201,188,183,205,216,215,227,244,226,218,229,233,222,213,240,222,210,215,208,221,217,211,230,244,213,246,232,234,225,217,206,226,222,227,226,230,190,213,220,201,220,204,243,231,197,208,203,217,213,223,195,229,238,236,205,221,238,210,229,227,237,217,232,232,237,222,214,227,228,237,218,233,226,226,193,238,226,250,244,208,232,241,208,233,245,218,229,223,210,223,219,238,234,240,220,238,214,223,213,214,213,236,204,234,212,231,225,230,246,236,241,240,215,237,219,238,241,233,221,231,214,214,231,217,206,234,239,221,215,217,211,239,243,237,235,241,202,218,222,219,236,228,237,236,226,221,238,228,241,239,241,248,249,217,220,226,214,236,210,241,249,237,239,230,223,225,250,237,242,229,222,235,252,251,232,244,219,247,239,242,237,224,209,247,242,239,239,254,247,252,254,249,249,247,246,246,239,255,252,245,238,243,241,255,240,240,237,235,251,237,238,240,245,243,249,243,252,246,237,251,241,254,255,235,218,246,241,249,252,229,223,239,243,223,228,232,220,254,247,241,252,244,249,230,244,231,238,234,238,220,229,236,231,231,226,236,236,236,243,247,252,242,245,238,255,255,255,243,248,238,237,251,246,249,206,249,245,243,239,253,241,212,224,252,253,235,233,240,251,226,248,255,234,249,236,243,229,254,245,245,232,237,237,235,239,233,245,228,243,244,243,238,216,224,213,238,249,228,247,218,231,246,246,253,249,235,240,250,239,209,229,176,183,233,243,220,254,209,220,241,210,200,210,254,241,187,155,137,137,67,108,238,252,250,255,199,169,209,246,245,248,222,233,216,167,199,176,215,251,247,231,224,205,217,223,190,249,173,232,239,203,252,222,238,233,249,235,229,206,200,194,186,242,171,122,147,56,9,59,76,78,31,69,36,38,238,178,93,151,197,249,227,221,231,236,211,224,254,242,250,241,247,213,230,223,244,242,241,247,248,227,242,236,177,165,196,109,32,146,209,215,133,142,243,207,190,230,237,247,250,234,230,241,215,196,253,230,254,230,245,252,164,133,41,81,174,81,30,35,76,80,23,45,32,72,92,80,60,31,84,127,65,38,18,29,212,237,248,253,250,252,252,230,250,246,222,231,229,253,251,239,247,218,230,236,233,226,253,238,207,242,236,226,180,190,219,225,226,204,173,140,228,226,236,210,178,131,185,217,230,221,214,236,220,244,235,241,206,107,122,95,34,19,15,76,233,250,243,242,228,214,209,251,242,173,93,31,71,211,208,218,234,193,204,214,216,212,214,199,241,231,189,204,211,226,199,225,223,179,211,210,228,205,199,239,203,222,237,205,204,205,212,208,214,224,218,183,214,208,218,198,223,209,182,189,190,207,198,198,231,220,184,206,202,208,218,204,209,214,186,203,229,208,202,185,211,222,195,201,146,109,58,47,31,94,141,121,94,81,41,44,36,31,24,38,150,240,210,243,232,216,233,225,212,209,214,199,213,209,220,219,225,200,225,196,208,220,220,203,180,219,213,223,217,192,217,246,218,205,209,216,236,214,224,194,227,229,231,215,200,200,215,206,205,211,103,7,7,9,23,21,6,13,9,19,18,8,23,11,0,0,8,15,5,1,15,6,10,31,20,226,231,205,221,210,238,217,202,224,227,233,216,216,216,209,209,213,211,226,229,236,212,232,196,218,212,236,221,208,229,217,224,218,246,232,227,230,218,228,233,207,240,224,217,199,219,231,230,227,224,211,226,215,226,213,221,218,219,206,188,226,236,237,223,217,235,221,238,219,231,240,236,227,236,217,233,217,202,225,233,217,238,235,224,210,231,222,206,223,208,207,239,234,215,226,231,229,233,227,235,215,232,238,233,226,219,233,211,218,219,229,214,210,227,208,230,212,235,226,225,228,202,214,234,241,229,217,252,225,227,232,220,226,224,248,228,233,238,236,217,230,237,230,219,245,231,233,226,223,232,250,220,206,220,240,248,228,240,224,237,245,254,237,247,233,231,244,239,234,242,244,250,243,251,235,217,239,253,235,229,229,243,222,228,255,212,229,226,218,240,223,239,251,230,219,253,232,249,244,238,253,250,250,234,248,242,246,226,236,233,236,250,248,241,237,239,243,236,251,242,230,255,247,239,248,221,233,222,228,230,242,233,229,234,241,250,237,255,235,237,255,255,236,245,242,241,232,244,253,252,245,229,249,242,232,246,248,236,233,239,249,227,255,217,251,246,255,231,235,246,232,249,236,245,245,238,247,243,254,244,223,233,240,254,224,246,237,248,246,250,252,233,239,228,253,230,232,241,229,235,244,232,253,232,253,247,243,244,248,232,232,252,223,247,242,249,245,233,252,222,240,241,233,227,255,243,247,249,214,205,215,198,234,247,239,229,219,213,216,229,221,193,237,246,250,165,128,150,121,45,127,241,246,252,249,206,188,198,225,224,254,237,213,171,198,210,235,242,248,232,247,237,153,204,203,201,166,99,159,197,223,224,237,249,248,249,234,227,232,216,176,192,255,147,138,157,34,33,55,48,56,40,30,71,30,47,97,140,198,192,205,193,187,213,223,184,229,248,228,253,247,237,228,215,237,241,246,240,250,229,246,233,221,169,146,214,144,50,103,182,240,171,223,206,206,239,222,250,241,246,236,228,235,233,225,241,248,245,252,238,237,162,115,42,106,166,27,30,20,62,65,27,35,21,59,80,81,111,40,98,119,70,61,27,26,120,241,245,238,221,248,242,246,249,241,255,239,206,251,233,248,241,246,234,228,239,238,250,250,238,231,252,209,175,185,211,238,245,218,149,158,205,225,188,202,215,85,150,226,190,179,163,220,221,228,230,250,185,106,100,68,45,44,111,185,235,244,226,246,211,240,215,251,213,145,80,9,106,224,205,214,219,189,223,214,212,209,208,210,205,203,227,223,212,210,230,205,240,218,217,181,224,228,211,202,233,240,182,218,208,188,202,213,211,203,200,228,218,202,209,193,202,222,178,217,204,201,207,192,196,220,211,211,201,203,188,191,176,198,215,219,205,212,210,166,191,209,220,210,136,109,56,31,19,16,67,78,41,64,55,27,45,10,18,113,229,245,228,223,228,213,205,199,187,243,216,212,210,225,207,206,238,225,226,200,208,226,210,235,217,217,232,218,245,222,211,195,208,165,213,198,205,221,199,219,226,220,209,194,207,207,234,228,229,204,105,12,14,18,18,2,16,12,6,4,0,33,3,19,12,10,22,11,13,14,22,1,1,8,1,213,229,237,228,241,234,226,226,211,245,230,223,218,218,217,224,206,242,205,215,243,212,219,231,249,217,224,217,222,214,204,184,227,200,232,220,216,207,212,242,242,227,220,233,222,205,218,242,214,192,223,229,207,217,205,211,210,227,227,241,201,216,225,223,222,221,203,220,232,237,223,243,215,229,234,249,219,235,228,231,238,252,239,225,244,223,222,213,244,240,218,252,237,203,215,210,233,231,242,232,234,233,241,230,208,235,239,222,243,210,246,235,229,230,229,199,233,237,221,226,220,212,237,230,224,232,230,213,227,246,242,229,235,237,231,231,230,229,237,229,209,228,241,221,234,221,239,229,220,218,253,218,219,236,243,233,241,224,237,252,239,234,222,255,228,235,235,246,228,241,233,247,223,246,232,221,249,243,220,242,218,213,243,251,250,247,251,231,239,247,246,252,249,234,255,244,249,237,237,232,237,244,228,219,252,241,223,230,230,248,235,230,249,248,239,253,238,247,236,255,234,227,236,251,248,236,253,241,245,235,238,247,252,249,243,233,237,249,254,226,253,245,249,211,242,242,231,251,229,247,241,251,253,239,249,252,254,227,249,250,246,245,242,250,235,254,235,246,253,235,234,216,242,236,250,244,239,234,252,244,246,250,236,227,248,242,222,253,229,240,222,251,238,243,254,251,239,236,251,228,225,253,224,246,246,253,252,227,230,224,238,243,221,238,227,242,227,255,253,255,252,229,225,239,252,242,237,246,250,215,213,204,236,216,234,223,219,211,238,247,211,199,210,242,244,153,137,127,127,36,109,241,248,240,252,228,229,187,175,249,229,249,230,176,217,187,224,232,247,243,234,159,101,236,210,151,127,14,83,181,238,226,234,235,227,254,245,244,230,203,156,199,252,180,131,128,67,50,66,44,71,51,76,61,84,25,107,191,211,236,237,192,182,234,237,213,237,252,244,253,229,236,250,219,240,233,251,241,253,233,252,200,200,117,149,219,118,87,230,245,228,167,225,255,183,230,236,253,238,226,255,255,255,229,209,249,240,247,248,246,240,158,104,23,64,168,26,46,39,68,51,35,38,12,70,72,64,88,46,102,109,75,53,47,12,62,214,249,251,248,249,237,252,255,242,244,224,223,240,253,223,245,233,250,243,220,238,253,249,231,224,242,223,197,174,228,235,231,194,171,121,156,181,164,187,232,157,162,216,241,156,138,213,221,254,247,251,159,94,67,77,23,79,218,226,252,251,212,210,227,226,253,245,168,139,98,9,154,237,211,243,204,205,214,216,207,216,218,205,223,198,201,227,208,207,224,185,219,219,189,201,227,195,213,213,220,232,209,228,220,224,223,222,208,202,211,204,197,194,228,210,218,185,227,204,196,203,207,224,182,196,210,203,208,209,211,193,217,195,214,225,206,212,182,171,216,206,213,220,185,156,146,133,79,59,50,48,26,39,12,31,88,152,215,235,223,229,231,221,215,236,186,219,204,199,199,221,208,209,227,195,225,205,229,222,238,223,203,233,219,201,223,213,231,228,184,210,200,232,208,210,211,196,207,239,224,217,201,190,216,209,220,220,210,201,106,0,17,6,7,7,15,3,1,29,0,23,19,10,0,19,2,24,15,11,4,0,11,9,7,210,231,216,236,242,225,222,189,219,221,226,236,212,212,203,215,199,230,238,223,219,237,209,211,227,242,211,215,223,220,206,237,233,211,208,233,239,220,232,206,221,212,226,224,198,213,227,234,199,217,223,207,217,219,207,197,228,190,194,227,230,209,229,203,219,221,236,241,243,224,244,240,229,219,219,224,223,233,246,233,244,220,205,224,246,233,221,233,216,247,233,231,235,233,214,231,226,241,216,231,218,217,236,215,225,232,234,237,250,234,201,233,220,237,232,222,218,221,247,240,222,216,228,241,242,220,206,231,225,242,222,217,221,240,221,217,225,234,237,238,242,243,252,214,237,221,231,209,237,226,233,238,201,243,240,227,230,237,226,237,237,208,235,215,239,222,230,219,237,228,242,252,224,239,222,238,234,244,235,207,247,214,244,247,232,226,255,253,218,245,248,248,246,235,246,241,223,236,246,230,228,244,255,255,255,245,252,233,244,251,235,240,231,245,243,243,249,231,252,244,246,247,236,254,241,241,235,232,225,241,238,245,239,247,234,216,211,251,244,255,251,249,226,246,245,255,255,235,255,243,245,241,243,255,245,252,253,247,250,221,253,225,234,236,233,236,253,246,233,246,231,241,254,242,242,243,248,236,255,245,224,232,237,245,241,245,249,221,249,241,235,233,250,244,234,255,211,248,229,243,233,248,247,253,231,231,246,226,242,234,227,231,241,224,249,252,224,244,239,244,242,255,247,240,250,245,228,226,218,230,234,183,220,193,208,227,235,248,251,207,224,204,222,230,246,170,148,133,137,41,103,237,255,247,226,234,243,201,179,201,232,243,208,196,249,186,215,216,224,230,182,120,55,141,167,94,51,5,112,187,252,208,234,236,252,243,249,239,245,242,138,208,255,211,193,139,85,78,74,74,57,33,31,75,34,54,209,251,226,253,230,218,205,228,247,239,252,254,243,249,253,245,240,253,217,238,244,230,234,239,228,205,139,62,146,105,112,173,242,253,195,186,243,206,203,217,241,236,226,246,249,244,250,209,236,252,241,252,245,241,222,172,96,31,119,181,18,48,26,86,51,17,61,15,66,97,80,89,64,87,105,66,23,60,25,105,212,234,252,246,245,229,245,252,244,247,246,225,223,232,240,237,239,236,250,237,214,235,251,241,196,247,247,220,177,208,243,240,238,162,105,87,135,185,241,218,187,186,227,207,190,165,211,226,234,251,216,127,83,75,26,86,193,202,232,251,241,250,242,236,246,229,209,127,157,87,54,169,222,227,233,205,211,236,230,218,211,201,215,214,215,181,230,249,221,204,205,198,217,212,214,208,184,223,218,219,211,199,211,212,207,202,224,219,233,206,218,193,230,214,217,213,222,215,214,202,220,228,216,220,211,217,222,208,202,195,220,209,213,215,211,234,210,205,204,199,221,217,239,249,249,236,227,227,195,137,90,29,90,213,239,234,239,235,248,239,225,237,217,212,200,197,229,202,205,216,221,223,222,208,203,213,225,207,232,214,222,205,224,220,235,208,211,225,232,194,220,212,204,221,212,204,221,206,217,232,207,205,227,184,215,226,228,222,206,118,2,0,0,2,4,17,9,16,22,12,41,28,2,6,1,18,5,4,0,25,4,19,2,17,237,192,203,223,239,231,228,219,217,218,220,211,206,239,215,198,229,217,206,195,212,233,236,225,243,212,224,214,231,217,206,226,240,225,222,220,241,243,232,214,236,214,237,204,222,234,214,215,246,235,236,216,216,236,207,244,222,219,233,232,203,237,224,245,239,213,225,225,228,233,237,216,227,227,211,237,239,238,229,239,244,241,238,241,243,214,229,212,224,229,230,231,235,221,239,186,230,235,227,199,247,211,234,238,227,228,217,227,237,226,236,235,215,204,225,241,232,231,220,246,210,235,231,223,206,243,244,248,231,237,245,229,247,247,238,214,204,237,213,240,240,239,242,232,235,232,234,224,229,236,252,233,218,233,231,223,232,237,204,228,254,240,249,243,245,224,225,248,244,227,228,254,248,232,230,231,228,228,240,244,240,221,216,239,245,238,250,231,232,246,230,245,222,245,228,236,241,204,233,251,223,255,254,235,250,255,255,246,242,243,247,240,243,253,243,240,237,226,249,246,255,230,219,241,241,249,254,244,255,245,233,255,234,213,250,253,255,234,254,250,247,243,244,244,254,241,229,246,242,244,247,244,248,247,237,255,237,234,232,241,248,222,227,236,221,237,242,224,235,250,244,247,233,249,250,237,255,242,235,251,242,248,237,247,217,228,243,244,248,250,237,252,249,245,237,241,225,245,254,255,230,254,241,247,245,249,246,240,213,229,225,247,230,226,243,218,247,249,229,239,254,236,235,252,230,250,239,252,191,240,193,178,222,222,245,246,248,204,217,236,202,233,214,222,245,185,175,147,144,24,88,247,253,241,243,232,235,235,187,235,228,250,217,176,217,157,189,205,221,224,132,13,16,127,122,74,49,83,227,249,250,242,239,247,250,239,251,255,250,229,227,238,247,243,182,138,63,35,73,76,68,45,29,64,41,92,218,248,251,255,236,198,255,251,240,239,241,247,243,254,254,251,231,234,232,250,234,247,236,241,252,220,136,65,85,30,59,168,239,238,105,194,246,224,201,239,246,244,253,239,250,249,247,224,228,248,235,255,255,238,230,159,89,9,81,136,19,16,97,102,46,29,21,30,38,83,106,52,53,81,117,117,63,50,7,102,248,234,245,255,230,230,237,242,240,241,249,222,211,225,250,255,238,230,237,235,223,235,249,243,210,238,252,233,157,162,231,240,222,205,138,140,178,212,227,232,171,138,188,215,216,207,181,219,240,234,147,121,143,55,107,199,251,227,250,253,224,238,239,237,252,255,153,89,112,103,116,191,201,228,230,204,201,207,202,209,220,222,228,230,203,230,208,227,224,243,201,207,206,207,201,206,210,210,209,196,222,237,216,201,210,193,218,209,200,210,223,236,215,214,196,180,223,208,205,224,207,198,204,195,199,192,207,203,211,195,198,199,191,196,205,211,187,168,204,207,226,184,231,224,251,254,248,242,238,173,111,92,193,240,245,227,253,239,224,231,212,217,200,203,191,203,218,228,203,203,224,241,234,223,211,217,231,216,221,217,242,201,228,226,206,206,233,225,217,216,240,213,228,210,242,217,218,184,228,202,209,201,220,205,228,215,211,233,212,107,7,4,11,21,2,14,14,0,23,7,3,22,5,2,0,17,15,14,11,33,28,4,9,19,220,233,212,223,227,206,219,218,203,222,240,229,204,185,226,220,226,223,222,232,224,216,236,195,209,206,241,237,236,241,207,195,195,237,222,202,206,225,205,225,238,225,216,211,243,242,243,236,242,233,193,235,226,231,214,236,222,237,213,236,230,241,223,231,211,234,225,234,224,216,225,236,235,243,241,241,217,239,232,217,221,249,234,228,237,228,216,214,246,235,222,217,207,225,225,252,236,241,233,220,241,243,222,244,238,233,209,231,238,233,233,219,235,214,235,210,240,229,232,250,243,228,224,236,218,239,194,226,227,227,208,218,210,243,228,230,252,227,231,239,219,250,234,231,228,239,234,238,228,219,212,251,237,209,228,218,228,236,241,243,253,232,235,252,220,238,225,243,235,235,233,241,246,249,229,246,198,242,232,239,242,249,254,248,234,239,235,233,237,233,247,253,247,246,233,247,254,233,250,241,255,224,239,243,236,247,242,242,250,255,248,241,234,224,232,229,246,238,251,241,249,232,246,236,233,240,254,250,247,253,255,253,244,240,238,235,252,224,247,243,233,239,244,255,221,238,246,242,252,247,237,250,227,246,251,245,252,242,251,243,230,242,240,252,225,237,228,253,246,224,230,245,254,251,223,249,240,231,228,233,242,241,253,246,224,237,240,242,254,229,246,231,237,255,230,210,235,227,222,233,249,246,243,229,237,240,243,240,255,235,253,239,245,232,254,227,239,239,231,216,234,255,227,229,223,237,229,246,214,252,228,230,240,249,241,237,194,212,244,242,217,220,237,244,219,182,173,162,118,52,116,239,227,232,252,226,252,242,199,213,206,243,190,113,195,90,113,208,207,197,75,19,124,224,217,203,183,252,251,254,246,245,255,253,255,249,240,218,179,180,132,121,195,130,136,73,14,77,38,35,32,37,38,54,48,48,173,158,144,200,197,172,221,232,243,228,252,245,249,232,245,240,253,255,253,232,246,244,244,244,225,237,155,97,55,8,62,186,246,171,61,176,235,212,212,242,251,247,248,253,248,255,247,173,247,245,244,249,232,236,236,171,67,16,21,57,41,28,57,121,39,28,12,25,62,65,74,46,49,90,110,75,35,66,27,103,232,233,245,239,230,226,231,255,243,236,210,239,218,237,252,229,250,222,254,231,227,201,242,233,222,214,240,217,172,167,201,231,238,205,206,172,186,238,220,205,177,171,151,204,212,232,174,197,236,177,156,121,103,86,173,243,237,255,253,217,232,241,225,226,255,206,112,80,97,91,104,204,204,241,212,202,203,216,206,238,204,225,215,226,215,207,203,214,213,214,213,198,218,198,240,192,237,208,227,211,232,215,205,218,220,198,201,191,205,203,204,205,211,193,193,209,226,219,218,209,186,229,229,219,213,221,220,222,207,211,218,193,192,217,200,220,176,184,215,212,234,218,215,217,225,217,240,240,185,159,114,65,199,218,230,249,225,230,224,205,232,220,216,227,235,208,238,192,218,234,214,206,221,207,203,212,209,208,214,223,223,207,207,232,237,205,221,224,228,227,227,207,224,201,215,205,213,198,194,225,218,230,207,223,199,196,212,194,194,105,13,19,6,0,22,23,16,7,6,28,9,14,7,0,2,12,10,1,9,26,6,19,8,4,224,211,232,243,232,216,225,236,210,220,226,189,220,224,204,196,201,230,215,210,226,204,235,228,215,197,202,210,228,218,235,221,228,237,228,238,236,211,249,212,231,240,217,238,215,224,219,212,244,230,235,241,226,237,224,203,223,214,222,226,236,215,225,243,238,234,220,225,227,224,227,222,234,244,245,224,241,240,248,243,227,254,248,246,233,215,236,229,231,224,247,230,247,242,255,246,219,231,241,239,244,217,233,224,225,247,237,251,227,205,233,215,242,241,227,223,224,238,233,253,226,246,246,234,251,231,233,215,251,229,219,217,243,230,231,229,214,207,239,209,228,207,236,246,230,229,242,218,228,245,247,230,228,233,219,239,210,241,237,236,227,242,229,214,222,246,226,228,233,240,222,235,226,249,226,241,240,225,250,246,212,225,234,244,247,250,249,234,243,255,238,237,222,241,253,229,253,237,216,236,248,247,252,247,253,239,227,243,240,221,238,213,217,246,247,251,234,255,231,243,235,230,252,237,234,237,241,222,237,242,243,224,233,236,240,242,237,226,223,255,254,220,242,249,236,235,241,230,252,248,232,252,229,255,246,248,240,239,250,239,244,232,245,253,250,252,245,244,224,230,231,242,246,235,233,237,245,254,228,227,234,252,248,247,231,253,226,239,218,249,246,228,230,250,241,253,231,240,238,251,255,217,249,235,232,234,235,249,246,215,248,233,240,242,226,226,242,255,233,246,241,255,234,222,249,248,255,242,223,249,228,223,236,226,184,224,212,244,226,236,207,206,245,246,221,132,154,159,166,98,142,234,242,247,227,227,248,226,216,239,175,226,145,62,141,24,95,202,189,248,204,226,252,240,255,255,251,229,218,221,222,160,179,113,121,113,96,84,61,53,22,15,21,67,43,20,16,22,42,56,71,59,39,51,34,27,57,24,11,3,20,12,15,25,27,38,80,91,143,134,188,227,233,249,245,242,246,245,235,245,253,238,222,141,93,19,51,217,227,160,131,236,237,215,251,255,255,247,222,233,232,240,219,190,255,217,243,236,233,248,239,195,140,7,2,16,37,78,113,64,34,39,21,43,57,73,76,53,47,85,114,77,47,70,5,60,226,244,241,228,234,242,233,255,249,243,236,210,195,225,229,233,252,227,243,233,215,227,232,239,231,211,248,236,202,144,193,236,215,219,211,168,165,207,222,160,166,202,151,156,217,221,223,219,240,186,72,90,71,123,228,250,255,246,236,252,226,246,255,249,235,194,61,90,97,76,145,223,211,238,223,216,201,228,218,219,226,202,229,216,233,220,226,210,206,223,239,230,224,240,191,203,208,224,194,220,200,229,216,212,214,200,208,224,217,214,203,202,207,208,206,222,212,192,195,204,217,203,214,204,211,209,207,218,200,217,204,198,184,207,217,198,188,210,211,223,242,216,198,200,230,245,227,228,200,158,86,57,200,219,200,207,202,218,245,208,217,202,213,232,205,221,226,222,234,223,219,209,228,234,216,217,214,212,236,231,222,207,204,197,222,208,205,212,210,240,205,222,217,211,211,211,209,220,222,217,228,227,203,231,189,224,181,208,201,101,0,11,1,0,19,9,33,11,2,28,13,0,0,0,35,2,27,8,0,7,29,12,2,19,202,204,222,217,222,243,209,219,236,204,216,223,217,219,220,214,230,215,228,216,212,217,227,234,220,211,192,230,230,230,220,242,243,219,244,215,237,231,229,227,220,214,246,232,223,221,233,231,236,249,244,227,234,251,229,241,211,224,226,223,235,223,225,212,236,247,227,234,211,246,241,237,220,233,233,247,226,232,230,241,255,244,238,247,241,242,225,235,236,225,223,203,244,221,219,246,227,203,247,243,228,246,211,205,237,236,242,227,233,221,246,225,217,239,226,226,251,232,227,244,239,237,226,231,232,236,225,225,231,214,237,205,255,237,243,223,212,232,233,252,213,249,229,228,231,247,223,198,252,222,231,240,223,234,234,217,224,235,238,242,236,224,241,238,249,222,241,241,234,248,251,220,222,240,245,245,235,218,234,236,220,246,252,245,248,240,235,239,238,246,237,248,234,250,220,231,229,254,236,242,218,237,223,243,227,253,252,233,232,252,224,248,251,252,223,255,239,244,240,249,239,239,236,231,227,240,249,231,233,228,253,234,240,248,239,238,227,241,238,249,239,255,254,252,246,226,235,249,240,251,240,236,222,229,228,243,235,239,225,243,215,243,240,247,252,244,231,255,234,223,246,240,246,255,249,239,238,250,249,208,245,232,243,225,249,245,254,242,246,250,234,246,244,253,224,242,246,225,242,247,253,243,237,251,229,246,232,248,242,240,236,236,229,246,248,219,222,247,248,238,228,246,255,243,229,247,248,211,222,239,190,195,216,198,197,239,233,242,217,230,204,209,246,246,242,139,128,133,148,144,167,255,242,233,251,243,236,229,222,228,233,236,111,57,77,2,95,212,251,251,247,249,242,240,202,185,120,164,108,84,93,86,52,94,66,66,64,59,11,36,19,17,7,47,65,30,58,56,41,59,41,58,58,33,48,47,45,48,26,35,12,6,15,15,9,17,14,8,25,15,2,15,8,68,88,154,178,217,246,249,226,255,234,193,128,67,47,118,144,139,203,236,233,212,249,235,246,239,248,249,241,230,221,210,245,240,255,232,251,253,202,202,172,34,35,12,22,73,142,66,43,30,29,62,84,95,57,47,25,39,76,82,68,56,9,78,232,243,237,231,241,241,236,231,226,236,231,251,226,204,237,246,238,243,247,245,228,192,233,246,206,225,227,241,218,168,194,205,213,219,217,205,137,220,217,152,196,209,156,198,236,227,233,235,164,106,78,39,86,150,245,246,234,251,255,253,231,211,224,228,239,163,56,70,107,97,141,223,241,239,248,242,240,251,243,222,208,197,194,206,201,200,220,212,245,233,221,184,214,184,203,195,202,226,212,214,218,226,202,203,201,222,207,221,227,208,201,211,220,213,216,231,184,201,223,209,227,201,200,191,212,220,217,207,247,196,215,217,198,215,236,196,156,207,220,211,201,221,228,226,189,212,239,250,195,134,81,65,202,204,238,239,194,225,194,221,220,210,214,218,221,219,236,212,211,201,211,209,215,226,225,187,216,219,211,222,226,229,217,216,190,213,220,201,185,232,226,202,216,208,225,225,195,208,201,210,227,242,201,214,212,212,212,222,224,116,0,0,9,38,21,6,0,12,6,10,15,0,10,1,1,16,6,20,24,10,24,35,34,21,219,216,240,219,226,237,215,224,237,221,228,230,217,227,216,210,216,205,238,214,211,238,209,212,229,231,220,238,223,194,223,204,223,216,248,230,236,218,228,220,219,225,235,220,216,253,233,232,193,234,244,222,238,240,238,243,239,227,238,247,229,238,238,206,218,239,230,250,209,236,224,207,238,231,233,230,206,225,236,233,225,229,235,241,240,236,213,210,232,219,224,221,254,220,220,220,246,242,213,221,223,236,239,237,218,241,230,221,229,228,244,226,224,236,228,246,221,236,251,228,244,238,231,239,235,242,222,243,220,228,247,240,236,244,224,249,232,230,242,233,240,233,222,245,213,226,226,249,233,243,249,239,241,224,239,230,219,249,246,232,228,236,245,234,236,236,213,249,236,213,236,236,224,226,233,247,246,227,225,225,249,238,242,240,236,243,255,250,238,239,242,248,240,242,254,223,250,242,240,232,245,247,230,248,236,248,249,247,223,237,247,251,253,244,253,233,247,246,246,234,227,237,243,238,243,237,245,220,244,236,240,229,221,242,234,249,241,250,246,252,245,234,253,251,249,249,252,238,250,234,242,240,237,225,255,236,235,230,245,229,242,249,255,238,243,234,245,240,238,215,251,248,231,236,224,233,235,254,250,232,238,246,239,218,243,237,239,245,252,227,245,212,241,210,244,235,248,231,247,240,254,234,220,209,223,255,234,222,246,244,239,245,244,223,245,247,253,240,255,254,242,227,246,207,246,222,255,221,203,212,178,182,223,236,249,253,199,208,236,244,219,229,245,247,239,154,138,120,102,80,156,251,217,239,219,235,237,212,222,246,234,238,192,108,110,59,154,227,249,251,161,151,127,130,87,67,84,56,62,84,68,100,106,126,106,125,93,104,93,105,98,122,46,61,125,58,39,58,72,43,29,27,48,66,75,68,77,77,41,39,105,93,81,70,41,34,22,30,11,18,4,4,12,4,20,6,6,29,39,37,123,116,233,178,136,123,61,64,44,111,239,242,233,214,230,248,248,252,249,247,248,247,195,240,239,230,248,244,244,253,185,203,132,39,38,19,17,59,96,71,48,37,16,29,68,67,80,66,52,40,78,91,59,44,17,58,233,230,253,235,238,247,226,232,241,251,244,245,224,212,233,245,241,249,243,236,228,203,237,235,225,213,239,237,205,172,173,210,231,211,217,206,152,183,245,205,191,213,180,163,215,243,253,164,115,86,69,52,128,240,237,252,237,242,248,245,224,239,233,244,214,120,49,49,103,103,182,207,223,235,242,198,220,216,229,215,224,216,217,198,206,200,207,209,213,201,203,196,194,205,199,217,186,225,229,208,222,223,209,233,202,229,194,222,215,206,212,234,219,224,233,213,222,214,212,237,227,215,209,224,188,221,218,231,211,205,208,255,215,235,217,177,184,213,224,234,220,221,218,236,239,241,251,244,186,115,66,81,181,194,226,237,209,217,202,194,215,228,216,227,213,224,202,204,210,213,227,228,233,197,209,194,225,224,224,208,224,214,196,215,194,210,206,213,212,206,218,215,208,198,230,220,236,230,211,204,232,223,208,227,196,218,227,205,219,126,0,4,3,10,11,3,9,19,13,3,16,30,5,15,18,8,6,14,4,0,7,16,17,7,209,216,211,214,229,237,194,234,227,216,220,218,204,223,218,229,225,225,226,239,232,205,219,223,232,238,216,229,238,223,222,223,226,214,245,219,211,207,242,217,240,210,239,242,233,236,248,216,228,241,231,231,218,227,222,207,234,252,236,229,225,210,217,212,233,253,220,245,213,220,241,236,246,242,226,242,239,233,222,201,197,211,214,233,201,221,229,219,237,229,234,239,229,223,234,238,246,227,225,219,237,238,217,227,226,216,218,235,244,216,234,234,241,237,225,241,249,232,250,254,220,246,225,249,237,249,244,246,235,234,227,244,238,244,247,253,207,243,240,242,246,247,247,213,248,220,229,217,224,243,238,234,234,213,229,227,226,233,239,240,244,234,252,242,222,217,239,228,255,237,241,244,230,240,238,213,224,225,230,247,225,246,248,220,238,221,245,234,243,245,240,222,242,251,229,221,243,249,242,218,251,244,250,254,237,248,235,239,230,237,225,232,221,232,229,216,218,254,246,237,239,242,239,251,235,248,244,238,245,232,246,252,245,245,253,241,217,220,237,242,243,255,241,217,234,222,220,236,228,251,208,215,246,243,244,224,226,246,248,228,246,234,255,218,237,251,238,239,200,214,230,249,222,237,238,231,251,250,229,236,251,237,222,236,227,237,226,230,247,253,241,234,226,216,247,241,245,227,244,234,242,228,218,235,220,228,242,237,241,241,245,217,232,226,246,214,219,227,255,238,227,237,212,229,232,244,244,208,214,218,176,237,224,223,217,219,207,237,225,239,223,235,250,247,251,154,132,109,130,18,72,212,195,238,210,207,227,243,218,234,255,247,218,207,177,92,152,174,112,109,27,14,39,61,61,53,80,75,36,60,76,92,204,199,233,238,251,251,226,253,239,173,69,48,123,137,69,23,65,65,73,53,158,196,143,162,191,148,99,201,230,242,255,240,249,254,244,231,228,170,106,73,12,9,4,0,17,4,23,3,24,15,76,93,81,114,91,99,18,106,239,237,223,221,241,243,244,242,235,244,255,237,191,239,241,254,252,236,249,228,219,193,84,10,90,69,28,40,65,61,21,5,22,41,51,54,81,58,24,53,83,74,59,55,0,55,244,242,249,243,243,240,223,248,243,238,252,255,242,216,225,241,246,232,233,252,247,203,238,243,228,233,210,235,197,163,175,212,234,209,215,191,130,142,195,174,239,239,170,135,186,242,241,147,79,88,57,89,205,252,247,241,247,243,248,214,238,209,238,255,215,152,61,88,114,135,153,213,253,202,93,55,45,150,198,193,216,197,220,208,205,206,212,223,214,206,227,232,203,201,236,192,184,180,190,211,222,225,195,198,223,220,204,198,194,216,199,221,211,205,201,208,208,198,211,243,222,223,223,224,199,221,186,225,188,226,245,208,210,235,207,156,204,214,212,216,224,246,229,242,235,236,227,225,106,99,27,70,185,220,211,212,221,219,220,224,228,212,195,209,233,221,205,212,207,224,218,236,213,219,218,215,200,233,213,200,218,208,201,195,209,195,209,242,201,229,213,185,220,214,210,200,227,237,191,229,223,226,204,198,225,196,186,216,214,122,27,7,0,18,0,0,10,2,17,14,1,9,2,9,0,5,29,5,2,8,9,3,12,38,236,236,206,220,224,238,226,217,222,224,224,223,226,237,227,220,218,243,207,230,213,244,220,231,241,224,223,214,230,208,238,244,236,212,242,221,246,230,217,219,225,235,234,234,221,236,222,238,214,226,239,246,225,236,236,237,250,246,209,249,255,242,222,233,237,222,245,234,228,222,246,229,226,222,245,239,228,213,219,243,227,231,228,207,210,234,236,221,241,238,217,224,228,229,236,207,238,227,230,233,237,206,239,223,247,220,218,255,248,223,232,240,252,251,217,222,215,209,225,219,236,241,230,223,233,230,246,236,249,220,245,234,222,246,238,235,220,241,238,235,246,249,218,207,235,243,240,229,239,236,234,231,254,234,245,227,227,241,231,220,246,230,242,238,236,234,244,242,216,243,239,207,227,230,225,249,245,239,236,238,233,236,243,240,226,247,223,240,241,246,238,240,248,235,251,235,237,246,248,247,252,218,227,226,234,240,229,239,240,235,236,241,240,239,243,254,244,225,219,226,245,230,236,252,232,236,235,252,245,214,246,233,231,237,247,238,236,239,226,240,236,239,230,247,246,242,242,234,240,248,231,241,244,232,247,225,247,227,245,233,246,236,244,231,222,231,246,241,204,230,247,232,227,250,245,218,251,233,239,243,232,231,237,243,241,240,224,249,231,247,255,243,240,225,246,250,236,255,240,243,227,235,242,245,242,243,248,234,201,240,240,240,223,238,253,244,241,216,251,253,243,233,250,211,247,230,241,227,228,222,223,206,200,199,210,245,230,237,238,232,214,224,227,245,243,147,114,109,112,39,111,244,249,245,223,209,225,234,221,248,247,255,221,199,174,103,89,84,62,65,72,78,98,82,102,144,181,145,80,32,37,130,245,255,251,252,255,252,253,230,237,191,81,79,129,142,105,83,76,109,61,83,178,177,160,139,152,130,92,182,247,216,236,239,248,248,245,255,249,176,68,87,40,161,232,217,192,161,147,54,28,36,111,53,45,39,47,87,27,146,247,250,205,231,243,244,255,247,255,218,232,216,184,222,240,255,231,236,236,241,222,188,106,1,128,112,13,44,10,59,39,46,33,64,91,34,61,35,40,60,69,94,20,64,39,99,236,250,235,247,240,242,249,228,227,217,243,228,253,220,221,250,228,230,242,233,240,223,216,233,227,235,215,251,227,183,168,200,216,227,215,192,158,114,111,139,132,152,85,70,101,183,232,188,135,81,70,158,248,251,228,230,252,216,253,240,254,242,248,238,213,170,94,117,129,117,151,232,223,160,34,8,5,15,151,205,226,224,210,209,207,219,200,200,232,226,238,210,224,222,195,215,223,217,217,231,220,200,234,219,215,215,207,197,209,207,203,206,204,196,193,210,210,195,234,231,217,200,212,222,244,208,213,221,207,221,212,230,215,226,187,173,209,209,219,229,232,245,238,239,255,190,129,106,43,19,28,128,220,208,213,222,222,242,216,219,216,199,234,204,230,207,222,197,223,244,216,226,224,238,203,221,226,210,204,208,203,225,202,217,208,232,221,232,211,214,209,229,212,202,207,189,220,198,222,205,206,231,203,195,214,219,192,208,227,115,5,20,6,22,11,16,1,0,4,15,4,7,0,0,0,10,4,22,26,18,2,25,31,2,211,227,224,240,236,218,235,227,229,226,225,213,224,234,220,206,222,219,240,214,241,230,224,240,223,232,240,231,214,229,234,235,234,229,241,213,245,225,240,245,226,233,236,235,217,226,207,241,215,227,205,229,237,243,222,229,209,241,234,206,242,244,232,249,243,223,212,228,229,234,239,249,245,214,237,230,224,228,243,240,219,236,238,234,223,226,223,235,251,236,245,216,235,229,233,239,247,230,218,246,225,224,217,229,244,238,221,235,253,202,226,230,231,222,235,230,213,237,242,207,247,223,233,234,222,247,244,248,202,233,237,246,236,250,227,234,233,222,235,242,241,223,229,227,249,240,248,240,235,254,223,239,249,243,252,244,235,234,249,250,243,239,239,227,243,234,234,246,235,246,239,242,241,238,244,243,247,214,233,248,250,210,249,240,220,237,235,235,219,225,239,229,233,246,244,223,232,238,240,243,243,245,246,249,231,238,255,246,242,233,243,245,222,250,247,226,224,221,247,230,241,245,241,227,249,255,236,252,224,245,248,234,222,244,227,239,238,243,255,241,236,229,236,222,223,243,249,222,235,242,255,234,242,252,247,246,252,251,226,250,247,248,253,251,240,237,241,218,219,243,241,230,233,244,235,234,253,219,248,254,215,230,250,251,247,238,246,251,229,240,244,236,228,241,232,254,243,236,226,245,233,242,224,217,248,225,247,243,219,249,251,250,231,235,235,235,223,232,239,234,227,208,246,207,205,236,243,181,230,220,158,206,223,210,238,229,249,233,217,250,189,194,227,251,248,134,133,145,117,23,138,250,217,252,244,204,211,185,207,250,235,228,105,91,109,138,98,36,23,83,80,147,233,237,255,251,253,238,202,112,73,72,205,233,237,238,210,197,167,130,135,118,70,67,125,68,51,68,49,31,22,41,30,44,81,58,39,33,52,80,79,78,80,151,154,171,202,144,164,64,60,40,77,237,242,232,252,242,254,218,167,138,100,29,43,71,27,57,17,180,220,222,204,238,243,245,219,242,245,249,249,212,224,246,247,238,251,251,253,229,209,198,101,18,138,90,18,46,23,26,60,66,54,60,75,32,37,29,23,24,90,103,38,51,28,152,251,248,255,249,250,240,239,251,237,244,246,229,234,215,182,253,236,255,233,229,244,205,228,247,231,217,245,242,234,192,156,183,209,217,193,222,207,170,78,80,37,16,37,114,91,191,246,143,141,58,117,228,239,237,232,246,255,228,253,232,224,222,248,249,179,173,150,137,129,118,141,123,188,114,62,25,5,64,140,183,227,222,211,250,214,226,226,211,220,208,223,212,217,204,211,221,222,206,206,240,214,216,212,223,209,226,207,206,217,222,196,228,234,206,195,232,229,198,226,213,219,194,217,224,226,228,220,218,219,202,201,212,228,199,154,212,200,217,207,218,246,249,254,233,201,120,76,37,17,87,196,224,247,227,220,221,224,197,242,226,228,227,206,221,218,221,217,220,194,232,207,239,213,213,223,238,229,219,204,205,203,212,209,218,214,202,212,224,213,225,195,224,201,214,226,202,192,214,213,211,206,221,227,205,210,208,221,182,221,131,27,6,1,8,22,1,17,3,10,21,27,4,1,1,7,18,0,11,22,0,8,24,11,7,224,211,219,231,225,229,224,230,248,219,241,216,251,228,237,212,238,236,239,219,240,243,239,245,235,236,244,214,221,239,221,233,238,242,226,233,228,226,232,237,229,228,247,242,244,242,231,231,237,240,241,231,223,214,232,240,242,227,242,217,244,238,240,238,243,239,231,244,231,238,241,216,233,231,249,247,250,223,238,218,255,225,225,199,232,224,228,238,219,237,247,235,238,224,238,248,223,235,221,230,237,232,251,236,247,213,236,251,232,229,237,239,239,236,252,247,227,244,226,233,223,237,242,205,244,240,241,230,246,241,230,223,234,226,240,237,237,247,235,248,236,221,239,245,224,252,247,230,250,242,240,246,254,225,235,243,208,235,221,250,227,253,245,242,238,229,242,246,239,231,242,240,246,236,245,227,231,248,229,250,229,235,235,240,250,238,240,216,246,215,222,226,234,252,231,231,242,224,216,238,244,234,239,253,250,222,242,227,216,247,228,234,246,242,238,239,244,212,244,218,245,215,220,235,234,232,237,251,237,247,253,225,248,234,255,241,223,247,245,233,242,243,233,248,238,250,232,248,239,208,240,237,221,220,235,224,231,231,239,229,245,252,230,246,231,230,236,208,245,255,212,245,232,236,230,252,223,225,242,247,237,246,246,227,233,249,234,239,241,247,248,221,227,217,246,240,250,245,231,230,242,238,244,250,254,231,245,222,235,236,252,249,250,242,230,234,232,249,236,253,237,197,239,235,235,239,241,201,209,206,200,233,250,206,241,198,191,243,239,233,183,189,246,240,253,155,117,138,120,47,141,243,235,254,217,229,229,250,230,241,170,76,64,30,101,133,122,44,30,0,51,188,255,247,223,237,248,227,209,177,155,173,60,44,61,61,21,40,54,18,12,29,28,15,89,43,25,9,21,28,6,10,39,47,41,15,15,40,36,23,16,69,13,19,44,37,29,38,30,56,45,47,37,154,240,230,255,255,251,203,134,137,127,68,39,83,62,104,63,162,255,197,221,239,242,254,254,244,244,238,237,192,198,236,218,254,239,228,235,251,206,163,42,32,137,68,19,50,25,28,48,57,43,44,29,24,50,41,14,18,38,91,78,28,30,71,229,232,227,252,234,248,236,234,230,233,235,255,228,217,219,216,236,234,222,225,239,217,224,244,245,205,212,204,237,219,154,155,226,227,224,226,217,219,136,104,72,25,18,26,74,196,234,149,89,68,177,245,239,238,232,255,244,249,242,219,210,226,249,238,173,168,142,156,133,137,140,35,96,83,31,33,25,181,231,234,212,195,222,212,209,192,197,195,210,227,227,207,217,225,243,209,206,217,207,213,205,217,219,219,216,226,207,211,217,209,203,197,230,211,219,213,215,215,210,228,214,218,230,213,227,232,207,206,249,234,214,212,232,182,151,223,222,234,234,242,254,235,199,130,99,49,6,29,163,225,232,240,250,232,239,232,215,209,239,209,190,221,228,225,209,194,225,210,202,196,230,211,227,225,209,210,221,232,224,219,211,240,209,219,213,215,231,218,224,219,214,210,208,205,224,227,202,200,231,198,234,234,200,214,206,206,204,208,196,110,13,7,4,9,21,17,7,9,27,4,25,2,3,21,0,20,6,7,9,3,26,28,32,22,237,239,237,238,228,223,246,205,244,193,244,216,207,213,242,234,238,251,223,246,242,226,243,238,252,223,233,238,223,221,231,232,241,235,228,224,240,221,243,227,240,237,227,234,213,242,226,249,252,245,228,229,223,243,233,206,237,236,230,236,249,228,226,228,235,244,215,237,249,255,228,252,210,241,242,218,229,208,247,232,239,254,230,220,215,233,227,227,246,221,230,209,222,246,239,240,240,219,196,223,252,252,208,249,237,233,217,245,224,226,245,243,225,251,216,250,235,234,235,251,208,229,232,238,235,235,245,244,241,237,213,246,249,254,243,209,243,226,243,253,234,251,248,237,255,244,235,250,229,239,223,238,245,251,240,252,243,249,251,249,233,235,234,254,241,224,245,232,229,225,244,252,214,231,255,236,218,250,242,240,236,221,238,228,232,229,236,217,238,227,253,240,249,221,255,241,248,235,229,238,225,237,251,243,244,234,248,239,238,233,228,244,252,224,232,216,228,243,223,232,241,242,251,236,242,246,233,244,241,232,238,241,227,212,235,232,245,214,205,232,243,236,250,241,245,253,251,244,244,245,252,247,237,246,242,239,244,243,244,242,249,218,228,239,235,238,230,205,233,235,253,199,222,238,241,226,211,221,247,244,229,232,244,232,250,247,225,220,223,196,219,237,250,251,220,238,247,236,220,232,211,248,216,242,238,243,234,226,248,246,234,230,251,227,226,237,245,239,226,244,244,235,247,227,252,239,236,198,236,227,193,236,212,171,225,228,215,246,229,247,188,231,255,250,239,178,145,139,119,46,152,223,248,249,246,212,234,245,222,117,72,36,44,62,101,170,182,125,71,27,25,118,182,168,113,119,52,56,52,100,135,137,96,11,22,10,34,40,75,34,12,18,23,51,48,43,18,12,5,4,16,33,26,12,7,13,23,22,27,22,25,17,0,31,28,47,35,19,44,46,18,23,42,36,25,29,62,138,52,52,78,162,132,84,164,205,155,107,91,81,135,204,205,251,228,235,236,216,242,245,233,198,207,252,227,254,242,222,245,246,181,160,51,8,114,56,21,115,31,32,40,35,60,38,35,12,23,60,36,29,45,86,22,48,30,73,235,243,252,247,230,216,213,238,239,244,218,251,245,238,207,211,245,222,235,234,252,208,232,230,224,235,206,234,242,204,176,179,203,209,211,224,186,127,101,129,189,111,32,24,32,157,236,142,85,102,228,252,246,222,244,252,233,253,250,228,236,234,253,217,171,155,165,161,152,157,118,44,129,122,83,65,78,204,229,255,217,191,209,221,201,193,221,206,207,218,188,200,199,220,223,235,205,230,227,197,242,188,236,217,193,216,233,187,245,218,195,202,193,210,184,218,217,220,223,235,226,203,180,193,234,203,231,242,218,206,219,222,226,184,176,212,230,242,237,245,223,185,107,90,30,11,94,203,246,241,246,225,216,206,228,217,206,232,218,199,220,219,199,217,211,214,212,223,216,218,229,202,204,220,214,216,207,223,219,232,201,201,222,226,220,237,220,234,219,191,211,192,206,199,211,211,217,191,210,219,208,191,212,216,211,184,211,226,214,91,4,20,0,23,11,27,6,35,21,24,26,27,0,16,5,15,16,20,20,4,33,37,50,25,219,220,217,248,228,219,224,233,228,221,211,213,236,234,221,199,235,238,230,241,234,221,238,234,232,227,231,220,219,215,216,220,236,246,255,240,234,248,255,225,239,247,250,221,248,249,232,233,236,237,229,243,251,234,227,247,213,239,230,230,233,232,243,241,230,240,230,239,233,238,235,224,220,211,235,249,245,224,229,243,231,236,212,249,231,233,249,223,226,240,225,228,236,243,228,244,222,224,244,243,251,234,234,243,215,244,237,229,239,240,240,209,237,235,222,233,231,242,240,231,250,231,223,253,235,235,241,244,232,249,239,247,248,250,249,243,250,246,253,239,247,233,244,227,253,246,244,226,232,247,247,252,255,231,246,247,225,231,245,250,227,237,226,229,245,243,235,229,253,253,249,234,240,248,242,243,253,254,234,246,237,242,241,250,234,239,228,211,216,242,216,227,246,246,230,251,249,239,245,231,249,228,254,243,239,229,231,238,239,245,217,235,245,244,244,240,222,241,238,229,232,245,246,232,239,229,227,242,231,236,228,247,244,244,239,237,206,224,244,234,235,239,234,249,247,251,245,250,239,243,236,249,246,238,238,232,229,240,228,211,240,220,245,207,226,253,203,227,253,241,245,204,233,246,240,239,233,237,246,237,244,236,247,235,217,244,240,233,223,238,200,240,236,241,247,244,227,241,203,221,223,245,221,244,243,245,223,243,222,251,248,229,235,232,245,239,242,197,228,245,232,216,241,222,255,246,245,193,248,202,189,199,204,180,238,248,245,250,246,247,158,216,236,255,252,153,121,98,101,60,147,231,241,252,240,223,245,245,178,101,65,52,33,78,166,234,228,190,165,113,52,52,15,46,9,12,52,24,38,55,2,79,124,88,54,29,25,28,98,31,32,45,21,16,62,69,29,27,8,42,5,28,40,11,25,29,25,34,26,11,10,12,14,7,8,23,21,14,40,18,20,29,14,8,34,24,42,15,12,34,48,105,75,92,178,243,190,96,65,38,82,152,209,229,246,243,235,239,250,252,242,156,223,230,239,246,249,241,237,250,173,153,36,2,125,54,51,120,32,60,38,34,25,60,47,26,57,37,26,41,73,104,30,46,25,45,230,248,255,233,245,243,231,220,242,243,241,234,243,229,232,224,226,225,226,237,249,228,196,241,232,227,201,215,223,226,178,168,190,213,217,218,170,103,38,11,106,144,107,77,68,92,171,140,101,192,255,255,240,242,253,249,245,236,217,245,242,245,249,196,142,144,154,137,157,181,128,70,170,183,109,77,31,101,183,214,208,221,201,217,222,213,212,225,225,213,214,211,229,231,207,217,207,206,224,228,210,189,221,202,201,212,196,215,200,217,242,205,219,218,210,227,214,216,217,214,235,191,204,188,202,202,237,202,216,198,207,207,188,170,220,230,243,241,242,189,138,105,51,14,43,167,235,230,232,240,235,222,233,229,230,231,216,201,216,230,210,233,226,240,223,238,201,231,196,212,220,235,233,217,225,209,210,198,227,238,200,196,203,214,229,220,203,214,229,225,209,230,210,218,220,212,184,230,228,217,201,225,215,213,206,203,213,217,181,132,0,14,13,5,4,14,24,7,11,1,31,0,2,7,9,15,25,4,22,14,9,34,18,5,236,226,229,240,207,208,219,231,230,234,231,217,222,247,240,221,250,225,242,242,225,247,242,241,206,235,241,247,223,228,227,250,220,228,243,215,231,222,250,232,236,242,230,219,232,220,234,227,253,231,246,225,217,242,255,252,249,248,244,237,232,239,242,253,243,234,225,243,231,240,243,240,243,242,246,251,231,250,242,247,245,223,243,249,222,242,223,225,230,235,243,194,245,226,228,237,236,211,250,211,246,235,251,216,229,232,245,246,240,234,236,243,234,225,242,244,231,242,247,218,251,239,232,244,243,249,244,231,245,229,240,244,245,227,232,251,224,244,247,236,236,252,255,238,243,252,238,247,231,245,230,244,253,242,242,231,243,246,249,247,248,224,240,231,228,218,224,234,233,223,254,241,229,247,237,249,219,251,231,218,236,238,230,234,231,249,228,214,216,218,243,253,254,239,239,242,224,242,237,236,225,231,231,251,219,245,245,247,251,240,249,252,232,226,238,246,234,200,246,236,207,223,236,245,226,241,223,204,222,219,247,242,244,252,245,225,247,232,238,253,232,244,220,230,252,251,255,231,243,253,243,223,246,246,235,236,240,250,230,214,240,243,248,240,238,228,244,226,215,236,234,244,226,238,233,242,221,234,239,247,246,218,250,249,251,229,230,244,241,243,249,228,239,230,242,246,249,234,222,239,226,229,230,245,224,205,218,248,233,239,207,220,254,228,199,247,224,215,232,242,238,254,231,220,236,249,204,192,234,183,204,239,245,244,251,250,229,240,243,225,186,231,239,250,228,123,106,106,69,21,124,243,229,237,217,225,247,217,194,74,59,19,5,22,89,185,167,141,159,137,62,21,37,59,60,22,60,23,16,33,11,12,40,135,120,92,54,43,87,55,24,4,19,48,67,47,26,19,36,3,6,33,25,25,34,34,2,20,16,24,35,38,16,18,12,2,23,7,8,45,16,26,26,26,18,21,44,29,26,19,78,107,85,36,40,81,80,88,92,13,67,189,242,243,249,239,222,239,249,236,217,184,253,248,247,231,249,241,253,252,124,102,50,15,144,47,46,111,36,62,7,26,24,20,63,45,49,56,58,114,134,122,50,48,16,56,206,233,247,248,220,244,250,221,239,239,242,235,239,242,229,194,205,229,244,243,236,239,210,244,242,201,221,209,255,213,162,180,173,228,226,204,227,198,148,44,36,82,155,137,139,94,121,86,148,220,254,249,252,229,251,255,241,212,227,235,227,226,239,159,159,134,109,127,125,149,96,89,176,165,116,64,11,2,67,179,193,221,233,228,198,222,213,201,217,214,215,208,236,225,231,215,205,230,193,233,213,219,203,218,230,221,219,216,233,206,238,212,217,207,217,208,243,229,208,237,202,226,210,218,217,210,229,233,203,218,234,219,204,203,213,229,253,244,149,104,94,2,30,42,146,230,229,237,235,207,233,238,196,225,218,212,228,195,215,215,219,217,230,228,200,220,208,224,231,221,199,224,212,220,222,228,209,199,228,199,204,202,238,213,223,216,214,208,209,215,236,210,229,196,230,224,210,214,214,198,207,216,209,211,200,225,228,211,221,111,21,19,13,9,4,10,8,17,18,36,19,17,7,0,24,21,18,0,28,24,10,14,11,29,231,221,225,230,215,236,219,244,229,220,233,228,229,239,239,250,235,214,248,217,242,220,226,228,243,239,247,234,247,246,245,211,233,232,236,217,210,240,252,240,237,241,233,231,243,240,245,244,237,240,253,255,246,252,233,222,230,236,242,236,223,254,246,241,243,241,255,219,234,254,234,250,233,248,248,224,230,240,224,228,245,230,246,242,223,249,252,237,230,233,216,231,238,233,233,206,198,237,240,239,246,236,223,226,246,213,244,233,224,240,242,245,248,234,252,225,226,244,221,232,250,248,236,233,248,255,246,246,231,233,230,236,244,244,244,226,239,248,255,242,244,253,225,237,251,236,244,218,234,230,240,247,223,254,246,239,228,253,230,247,245,239,226,247,225,240,250,242,240,227,237,249,244,236,244,245,207,216,240,221,225,228,212,228,227,214,230,220,230,213,219,227,235,234,233,238,239,244,248,240,240,206,238,234,230,237,242,255,218,236,232,249,244,236,250,224,233,236,222,241,245,248,250,232,218,250,242,236,252,234,240,250,230,222,242,238,246,252,214,229,252,234,232,243,242,255,252,239,243,225,253,238,236,236,203,245,249,247,243,237,233,221,233,229,246,239,220,232,246,248,252,236,237,238,237,245,211,237,247,246,215,246,219,211,226,247,235,226,216,255,248,232,245,244,235,225,242,219,239,233,242,242,244,246,234,239,240,237,238,251,242,231,232,243,253,238,223,231,246,227,255,236,224,242,249,255,216,214,224,189,252,245,246,226,227,217,231,241,255,229,164,238,236,251,227,101,113,98,83,12,95,240,238,255,248,212,254,255,208,88,27,24,36,80,74,43,32,54,136,160,107,33,20,47,36,34,79,13,33,14,39,48,16,30,88,143,117,105,113,38,15,2,34,2,34,67,36,12,19,11,37,23,35,17,8,6,18,19,14,10,30,22,24,14,13,35,7,22,36,17,13,10,35,35,24,4,28,17,21,59,58,68,57,63,53,33,35,60,97,30,22,189,244,249,253,217,243,241,233,248,193,194,252,238,238,228,243,223,247,231,93,114,35,8,140,43,68,65,11,23,24,33,40,37,43,52,62,86,147,170,164,148,98,35,32,45,188,241,250,228,232,215,245,238,241,227,217,235,229,225,239,195,218,247,222,245,213,241,251,224,229,248,200,221,239,219,196,158,143,234,198,237,224,216,200,132,102,154,154,149,116,52,15,50,192,239,250,254,253,242,249,242,235,227,227,244,240,253,204,140,152,105,82,134,112,147,95,97,192,148,68,61,3,45,173,203,193,223,229,215,196,218,201,196,214,226,213,199,208,219,239,232,198,210,183,232,216,194,234,229,215,213,226,229,225,213,204,203,211,212,237,215,201,234,243,200,227,219,225,216,236,207,197,224,192,244,204,225,179,226,252,242,157,127,71,13,10,3,82,174,238,239,238,213,242,204,221,244,216,218,210,223,249,237,208,238,235,223,227,230,233,215,212,228,207,243,224,221,231,238,201,206,229,233,224,204,210,219,215,214,217,236,222,213,217,190,210,217,190,205,210,237,235,233,209,228,211,218,224,222,194,199,234,206,223,133,1,14,1,19,14,6,18,4,7,12,20,13,5,0,3,15,6,7,23,12,3,7,4,15,219,218,239,228,237,211,218,251,236,253,227,227,225,210,223,239,227,237,193,230,232,243,231,245,229,249,238,246,246,245,237,220,243,236,234,238,229,236,238,224,244,232,250,244,226,226,226,228,236,234,252,236,239,240,246,251,236,239,230,229,223,246,222,254,240,238,233,239,240,228,254,231,231,239,231,245,235,251,240,255,225,255,207,236,228,239,246,228,246,228,233,234,237,235,221,227,235,225,251,225,239,210,253,224,243,228,241,236,240,228,248,237,242,243,247,239,251,224,238,210,239,238,241,221,235,247,206,239,237,241,247,238,244,233,233,248,229,243,237,242,251,247,245,248,227,224,251,233,232,249,245,249,245,233,241,220,226,243,223,249,255,255,233,223,246,238,238,246,211,221,226,232,228,254,234,243,230,239,231,233,236,248,231,243,231,235,243,196,242,251,234,238,241,233,237,255,249,241,231,214,222,249,238,217,248,217,252,227,247,239,224,209,241,225,227,218,229,244,241,233,245,241,234,228,236,230,236,246,239,249,239,230,241,252,226,232,236,233,246,238,234,238,221,244,239,254,242,245,223,238,239,246,231,237,237,229,216,236,254,239,248,232,246,247,236,243,213,238,237,222,246,231,243,246,235,208,234,251,250,233,236,238,239,247,203,229,215,247,217,245,224,224,231,255,217,226,226,228,234,233,250,230,252,220,233,239,243,249,238,240,237,250,233,232,245,232,245,208,253,227,227,226,211,250,231,246,208,213,249,210,239,206,182,181,208,222,235,246,250,255,173,239,232,255,200,80,94,76,68,10,97,237,249,247,233,229,253,252,184,84,53,81,120,99,46,49,12,42,66,153,181,128,63,76,43,64,76,56,53,31,43,46,29,31,7,33,134,169,152,48,5,21,14,12,42,81,23,21,31,29,19,21,12,25,7,7,19,14,16,24,8,12,29,21,27,39,13,8,34,20,22,39,15,34,37,13,30,18,40,63,98,85,64,40,17,32,42,51,61,27,51,210,238,255,250,205,234,236,248,254,185,223,239,219,227,243,226,246,247,179,89,129,47,35,142,8,86,76,18,15,35,16,26,24,18,53,81,119,128,133,102,95,60,50,4,28,200,225,255,214,210,250,222,255,247,238,246,233,207,252,233,202,201,255,245,254,247,250,255,245,243,233,191,214,205,210,201,151,177,176,206,208,227,245,224,134,137,184,169,144,131,61,52,138,241,245,242,249,249,234,242,230,212,238,248,237,236,228,183,155,132,129,129,129,107,147,119,139,217,125,71,64,4,114,222,242,249,204,216,217,210,217,216,221,213,214,220,224,243,217,231,232,232,223,223,239,216,239,212,232,219,220,212,226,221,209,206,224,204,220,239,206,210,208,216,236,235,211,231,220,232,226,226,222,222,212,228,217,200,209,208,135,123,39,48,55,136,164,197,227,241,240,219,212,208,206,206,221,226,224,231,223,232,239,204,220,219,241,217,225,200,221,200,213,214,216,211,208,218,232,212,237,204,202,216,213,208,201,223,216,234,214,209,223,207,222,215,194,233,217,221,214,206,222,183,211,194,206,203,217,200,240,196,211,205,117,20,3,18,13,7,2,15,0,30,20,10,25,0,5,5,2,6,24,9,0,23,10,9,17,211,229,219,228,241,220,238,238,232,241,246,242,228,255,209,225,215,240,222,232,240,236,245,231,237,237,241,245,244,243,225,202,246,255,224,223,232,243,232,240,243,217,228,234,238,225,253,223,245,253,243,246,226,228,227,249,233,233,229,244,241,252,244,231,235,229,229,237,224,249,244,235,223,229,234,241,234,235,221,228,239,230,250,238,219,233,201,254,201,240,237,237,242,253,228,232,253,220,225,226,227,237,222,231,241,222,207,232,227,220,224,240,251,240,219,236,243,212,226,217,215,220,234,253,217,227,249,245,247,254,234,248,236,253,237,249,241,220,239,252,255,250,221,237,245,228,243,237,240,232,232,253,238,229,246,232,229,255,247,250,228,236,222,249,253,231,242,240,229,244,240,222,216,247,213,237,251,240,241,225,237,237,214,218,237,251,228,218,239,217,222,238,242,243,222,236,231,230,229,241,232,219,243,243,230,233,215,241,255,253,233,222,247,229,200,194,236,244,241,245,230,239,229,242,229,222,250,242,231,230,226,236,228,241,247,238,236,254,242,239,231,213,232,252,233,231,226,232,229,235,250,243,246,232,230,233,247,247,222,220,248,237,239,247,227,214,230,242,243,241,219,236,228,251,216,221,255,242,221,225,237,229,230,246,244,233,227,250,250,247,218,231,206,242,246,235,237,239,253,223,236,237,226,203,227,220,243,236,225,243,243,229,237,207,240,228,244,247,219,242,235,213,239,239,234,247,211,244,210,162,203,222,229,247,241,250,245,245,248,246,184,253,245,250,188,94,69,66,54,12,109,230,237,247,234,242,232,255,224,80,65,39,65,44,44,67,44,46,55,73,189,171,131,111,41,55,69,68,58,38,43,50,27,48,42,40,14,65,142,90,85,29,14,18,52,76,32,26,32,3,22,1,25,13,2,23,5,27,42,17,9,33,30,39,18,28,12,46,11,18,25,24,6,40,1,6,12,31,58,93,93,53,29,56,34,35,73,48,55,23,80,219,236,254,255,240,235,234,243,231,192,231,251,243,249,235,238,253,244,189,77,97,69,68,165,13,61,73,25,35,9,30,29,9,76,50,66,101,114,88,80,43,52,92,20,100,207,230,248,255,229,233,230,223,246,244,238,240,253,230,251,243,218,243,252,249,255,254,254,244,233,253,210,190,225,210,215,172,168,207,224,223,191,201,187,185,184,199,157,141,136,82,124,211,242,225,255,233,240,218,237,210,250,243,235,228,249,243,162,161,159,145,163,127,145,133,166,169,133,68,60,34,12,144,244,229,252,214,222,222,212,224,221,186,232,236,200,210,224,225,252,255,241,231,229,239,252,203,236,239,198,225,233,216,221,231,213,213,202,222,223,236,228,241,239,241,251,237,250,250,239,224,234,229,232,252,248,215,196,162,132,80,30,19,88,191,234,216,232,226,238,222,221,216,226,196,215,229,224,209,210,216,209,222,210,237,193,224,220,197,214,232,228,217,227,239,237,241,198,219,197,205,203,221,211,222,222,206,210,243,199,213,231,224,192,206,191,220,243,193,212,215,206,211,209,205,220,212,202,232,200,203,220,186,226,120,7,15,3,10,5,13,14,13,16,20,3,13,3,11,11,3,19,9,32,25,18,14,13,1,241,226,211,238,204,203,208,225,235,218,226,229,244,224,236,239,243,218,237,236,227,242,237,253,219,238,244,237,228,236,250,235,225,232,222,254,220,234,243,236,246,224,232,246,226,246,228,243,231,234,243,235,232,224,248,230,254,248,243,238,213,228,238,239,247,246,238,235,250,244,251,237,224,219,243,249,231,249,215,240,240,244,252,232,235,238,222,224,227,235,240,227,243,235,227,212,241,236,234,223,231,246,243,240,222,250,228,254,233,226,231,209,242,246,231,223,237,232,233,240,251,207,246,226,241,235,228,247,235,240,236,249,247,215,225,219,225,240,218,241,229,231,240,234,247,243,230,241,221,224,215,243,237,205,242,239,242,252,225,245,245,248,248,243,249,235,244,241,232,245,241,237,224,223,240,214,241,227,230,239,238,254,251,227,252,223,243,233,228,249,209,239,207,210,242,243,243,242,222,231,231,247,234,250,230,232,254,232,239,230,245,220,240,229,226,227,238,245,237,232,238,227,231,239,239,239,234,232,234,249,235,242,239,254,244,246,215,255,217,240,246,230,223,250,255,218,250,203,233,255,246,244,244,235,254,247,218,253,236,249,217,241,231,233,249,233,216,223,245,216,233,242,238,229,220,247,219,230,231,221,236,246,236,235,231,232,227,208,233,226,231,234,230,231,242,249,225,254,237,243,228,234,243,252,242,239,254,225,215,241,249,239,232,222,247,252,241,224,237,230,234,228,247,246,250,244,210,193,185,181,233,233,242,229,249,221,204,212,251,227,198,229,245,240,216,76,82,106,89,3,97,231,233,248,210,228,229,241,234,84,17,23,5,40,68,55,31,35,34,68,113,171,167,115,14,22,45,47,53,41,64,56,14,15,18,47,38,25,58,110,167,114,30,29,16,78,45,11,23,16,4,32,14,22,6,40,18,11,7,0,17,20,39,54,22,25,17,42,16,42,12,25,43,47,39,44,9,36,80,83,99,60,65,72,35,55,111,74,68,15,103,227,253,242,253,243,252,220,250,237,166,235,247,234,228,231,234,224,252,186,81,83,38,50,107,12,102,64,21,18,1,19,7,48,15,27,30,58,92,59,42,37,47,28,18,117,246,248,241,236,224,249,237,242,251,227,255,242,246,243,254,232,198,216,217,245,138,171,162,153,240,254,233,215,203,246,216,186,174,190,225,181,100,88,86,162,135,119,120,91,66,111,205,227,243,249,250,255,255,250,239,225,241,228,254,231,222,176,157,151,180,143,144,142,153,193,148,114,84,41,69,9,39,213,249,228,252,236,226,206,212,226,206,190,211,215,209,243,228,246,250,239,255,249,245,243,250,254,231,235,207,204,209,225,223,235,218,225,234,220,230,214,231,236,246,233,236,235,229,227,231,231,235,231,252,245,236,146,156,96,43,27,37,124,246,241,234,209,218,242,242,238,213,202,219,211,206,213,209,216,225,184,226,222,227,239,209,233,206,208,198,217,230,220,214,216,210,209,209,231,233,219,208,217,217,205,233,228,224,204,225,216,212,199,207,229,214,221,229,183,234,246,226,212,220,221,219,230,208,195,190,202,205,213,208,109,15,3,7,23,18,22,11,10,17,2,13,8,18,11,9,19,12,14,19,1,18,9,9,9,233,231,217,229,216,195,229,246,219,230,235,236,220,217,248,225,232,245,249,236,235,237,243,231,240,246,236,224,248,241,242,252,248,226,234,230,249,247,227,235,239,229,238,225,248,246,226,241,240,223,242,251,225,240,207,222,246,249,246,249,241,250,239,238,239,217,240,239,250,239,217,235,227,242,232,234,226,231,210,255,236,251,251,240,241,247,234,218,228,246,244,223,235,214,233,234,232,222,233,214,232,243,255,243,246,242,228,213,235,244,240,242,245,243,225,248,239,247,238,230,242,230,248,206,224,237,232,236,251,240,224,240,247,236,243,237,232,229,254,237,249,237,253,211,224,247,250,238,224,216,222,236,211,237,241,240,255,236,254,251,224,229,240,229,229,236,254,234,222,247,238,234,247,230,237,230,238,233,237,241,211,226,207,239,213,238,234,252,216,235,211,226,241,232,241,246,252,248,250,233,246,215,233,237,248,241,237,231,242,215,246,219,240,238,223,239,234,228,230,252,232,225,211,235,226,248,244,242,231,229,229,234,233,205,223,253,230,241,238,248,254,246,247,244,230,253,241,247,224,255,243,248,229,251,229,240,239,249,222,241,234,246,224,252,212,218,232,236,235,241,220,224,240,212,208,222,243,244,232,228,244,240,237,245,225,239,230,214,248,211,222,246,228,222,224,235,246,240,242,247,247,223,243,223,246,246,229,247,245,252,237,236,225,245,239,212,250,207,238,237,243,244,232,242,241,239,209,218,214,216,246,241,208,251,215,193,233,246,233,201,211,242,234,251,192,161,148,128,130,53,160,234,234,238,243,227,247,249,242,132,46,1,1,49,75,33,47,46,37,17,50,93,193,184,60,40,13,33,53,44,30,9,5,23,24,17,2,44,209,231,242,238,207,136,135,130,23,16,2,0,29,14,9,19,24,33,19,32,14,16,21,49,24,36,43,44,25,35,35,35,24,26,48,62,47,32,60,54,89,89,89,61,55,50,53,61,79,110,40,58,162,245,248,239,237,229,237,228,254,190,173,234,244,213,217,249,239,248,242,164,69,68,73,83,88,37,83,47,8,38,6,20,28,10,31,25,45,46,84,102,77,50,92,42,33,140,221,236,250,251,237,247,254,253,252,250,240,235,230,239,255,192,128,61,14,85,30,31,89,81,189,244,250,247,244,252,242,175,184,181,185,225,97,18,24,17,58,44,33,38,31,107,239,232,255,249,252,248,255,246,233,234,245,255,238,247,231,168,139,169,171,183,113,152,136,157,145,49,25,35,39,17,63,233,249,227,239,221,230,226,217,211,230,212,217,243,212,218,204,154,144,155,207,231,245,252,242,255,215,219,242,228,212,214,226,205,211,235,228,223,206,214,218,248,229,189,103,97,104,199,223,214,249,248,252,236,179,119,107,41,2,70,220,241,252,208,173,222,238,243,225,244,217,217,200,214,205,215,218,230,233,221,234,206,220,230,218,236,211,229,233,235,241,220,210,207,231,219,209,231,212,202,207,215,213,229,204,194,211,202,206,226,234,203,207,218,218,248,231,202,215,220,222,189,209,212,218,204,229,214,200,203,212,214,215,130,26,9,9,0,10,25,16,20,35,37,17,22,11,5,1,26,18,7,5,8,13,23,28,9,236,238,224,239,224,243,240,224,232,239,242,234,244,213,222,246,225,229,246,246,236,208,248,242,242,238,236,237,246,234,223,237,225,231,241,245,228,233,241,249,224,253,238,247,226,226,230,229,251,234,220,236,248,232,220,255,231,228,225,215,224,242,238,224,240,247,243,214,234,235,242,217,230,224,227,229,218,233,240,241,240,237,227,240,251,237,227,209,221,238,215,245,240,239,217,233,244,238,230,238,222,244,227,218,226,224,219,226,245,245,225,225,235,217,250,244,224,253,229,242,233,249,247,255,228,252,232,253,225,242,235,245,221,243,242,250,234,243,236,237,212,241,242,209,235,215,242,233,247,232,222,219,245,240,215,238,252,250,241,255,249,252,229,254,252,245,247,236,242,243,244,241,219,237,202,233,236,229,246,202,209,239,216,233,234,229,240,213,222,237,222,236,238,215,241,229,242,248,243,237,229,254,239,253,230,235,230,253,242,241,213,240,228,213,238,237,224,241,214,239,224,211,223,219,228,215,226,241,249,240,238,218,239,240,225,245,232,238,239,230,231,233,252,251,253,240,245,234,232,219,234,231,250,233,237,245,246,230,241,242,251,230,238,239,228,242,216,227,230,243,248,245,218,204,228,237,211,237,221,243,233,241,223,241,242,225,201,227,230,246,249,232,234,246,215,240,241,242,247,253,233,235,230,244,245,233,250,248,233,243,222,219,250,223,238,245,236,230,234,229,228,223,247,247,242,210,211,226,231,223,217,213,188,212,199,197,230,253,232,231,188,252,251,247,230,149,161,138,104,50,168,240,224,249,226,229,252,250,239,122,48,4,17,70,75,66,50,50,37,22,53,51,119,185,114,62,27,45,39,44,39,16,13,27,14,38,28,187,240,246,239,150,132,151,164,184,63,14,19,5,22,21,34,8,35,13,16,5,25,28,41,47,42,63,87,68,64,66,60,61,53,48,43,45,46,15,77,60,62,93,75,80,89,75,45,58,90,53,18,67,162,248,237,255,237,250,229,229,250,219,173,228,232,250,241,231,229,255,231,170,78,58,52,41,81,25,85,35,9,24,8,46,36,20,11,36,30,89,173,182,141,75,91,88,30,128,208,233,247,244,240,236,213,233,246,254,247,222,237,242,228,185,99,25,0,4,9,73,92,6,139,253,247,238,250,250,243,198,178,153,216,230,162,66,56,47,44,45,50,30,20,72,149,229,242,251,255,223,255,226,253,234,254,249,223,228,201,153,156,156,172,123,135,146,146,146,96,27,27,9,25,22,106,224,243,234,241,229,229,220,227,209,207,219,211,208,225,246,173,86,80,43,11,122,109,144,209,229,219,230,226,219,232,247,218,205,216,236,219,220,207,204,217,234,189,93,45,22,0,75,198,238,253,239,215,171,92,78,30,16,153,236,250,252,232,207,208,226,224,240,212,215,208,206,231,212,207,205,209,225,224,193,233,199,220,215,204,193,221,220,223,216,235,205,189,216,225,226,223,230,206,201,215,230,227,219,229,230,224,211,217,221,228,233,198,205,231,209,205,209,208,208,233,235,204,212,228,224,208,214,230,207,210,235,239,105,22,19,9,4,3,8,23,7,16,23,4,3,8,6,0,9,5,10,13,19,16,23,7,5,231,236,232,232,230,244,219,221,240,242,231,250,239,215,238,225,241,236,232,229,237,240,214,247,213,232,253,246,223,226,233,220,233,235,246,235,223,233,236,244,250,233,247,247,221,239,240,234,219,228,228,232,239,234,243,254,249,237,229,242,218,233,240,228,237,226,248,217,222,235,236,214,223,230,225,229,210,238,217,251,237,242,216,238,230,244,220,230,219,245,243,237,243,229,202,236,230,227,230,240,243,247,229,215,234,238,230,245,228,229,223,223,238,237,249,241,233,247,247,238,246,255,253,252,242,231,251,233,244,231,252,244,243,241,222,245,253,248,243,235,232,234,238,243,235,237,222,227,240,242,208,240,227,223,226,238,244,238,241,251,231,234,224,226,232,246,248,231,229,250,249,233,241,245,240,241,246,233,215,242,240,246,206,233,233,241,233,243,239,238,238,222,239,247,247,231,243,220,238,247,245,249,249,255,235,246,245,255,233,248,233,227,236,210,215,226,248,232,235,224,232,239,242,220,226,218,237,246,213,237,234,232,228,232,235,219,210,246,243,242,251,244,227,198,224,245,246,245,237,255,245,231,235,230,214,241,254,245,235,250,239,242,255,238,225,215,227,236,234,243,226,240,252,207,227,252,222,251,226,225,241,217,229,238,255,247,225,248,252,229,252,250,247,253,229,230,250,226,233,243,243,243,252,234,248,207,245,234,215,242,247,245,237,240,233,233,254,211,232,221,254,251,219,244,242,248,234,228,189,176,232,216,217,224,248,231,251,244,250,182,171,247,229,247,204,171,120,111,90,30,186,236,239,245,215,196,242,243,243,172,131,46,15,25,57,47,45,44,44,21,52,25,77,116,164,144,45,48,38,32,27,11,20,34,25,168,255,253,233,237,224,92,26,5,131,197,129,75,29,14,23,5,14,26,5,32,10,26,17,63,134,90,22,9,57,60,91,81,65,51,39,29,27,25,33,57,54,60,57,91,61,68,67,46,67,93,82,64,11,49,223,238,241,251,221,238,214,236,246,173,182,229,248,254,245,246,247,234,254,244,86,49,69,86,60,54,72,29,15,16,19,25,36,41,28,25,45,95,161,147,119,50,92,122,110,138,193,225,250,255,204,131,65,33,44,166,235,246,249,249,245,156,65,11,1,55,16,31,22,10,127,184,156,139,170,231,254,255,188,165,218,229,203,100,69,54,52,47,32,23,32,6,90,229,251,235,244,255,251,222,247,253,238,252,230,252,146,107,143,161,151,159,149,139,140,127,60,11,5,45,28,24,5,12,119,221,216,248,232,200,222,215,207,223,225,227,244,253,161,69,99,54,5,21,21,5,39,165,180,210,215,202,215,223,223,232,243,208,217,192,234,239,241,250,163,89,81,26,4,78,219,225,237,152,113,69,32,25,92,228,241,232,250,214,173,179,228,222,213,232,242,215,220,226,242,223,211,218,211,203,216,214,211,241,200,207,228,229,224,211,216,211,218,220,210,198,211,223,221,216,226,219,222,209,208,219,189,241,212,216,227,223,208,207,210,220,215,216,211,216,214,235,211,204,215,231,217,219,227,219,218,206,232,186,211,79,15,9,0,19,1,30,18,0,16,28,26,17,7,0,5,2,7,1,1,12,7,7,21,22,235,222,249,229,224,250,230,228,218,236,229,198,236,229,228,228,234,230,204,234,245,232,244,242,233,239,235,224,224,237,218,226,229,235,242,238,241,249,235,242,232,230,248,250,224,238,220,234,223,246,219,241,226,230,236,217,243,240,245,240,235,232,244,228,239,247,227,217,228,222,253,239,230,236,225,216,230,245,219,236,227,230,239,246,236,252,229,213,239,226,237,243,230,244,229,228,222,245,217,247,238,245,212,240,226,217,237,215,251,249,242,237,243,231,216,233,219,233,238,249,209,227,224,240,237,239,240,229,233,233,231,239,239,240,225,242,249,231,216,236,232,225,216,228,230,242,235,243,230,236,218,218,232,237,218,242,238,234,241,253,235,225,233,223,213,230,222,237,229,238,255,226,239,235,255,231,216,234,206,219,222,213,249,224,229,225,242,236,208,223,228,250,218,253,241,247,236,242,234,245,238,242,240,245,249,252,227,245,221,245,235,236,239,216,220,234,232,248,238,231,228,202,231,232,246,226,242,236,226,237,225,219,214,221,253,227,226,253,253,222,252,243,238,233,246,252,227,235,247,233,242,237,228,238,249,232,230,239,232,228,244,224,249,243,235,208,242,231,246,235,225,225,245,209,236,245,227,238,234,219,241,252,225,253,235,249,240,246,253,240,236,243,242,242,235,221,234,237,227,252,241,247,242,232,211,214,226,243,229,246,236,222,239,207,240,255,244,242,234,240,248,245,247,250,243,222,223,220,162,220,226,189,224,241,233,213,244,216,233,183,173,231,240,246,215,135,154,135,114,37,182,226,219,243,236,226,246,243,237,255,221,122,45,3,4,31,18,31,26,25,34,49,49,34,112,215,122,102,47,17,2,10,8,10,95,238,248,227,209,148,97,43,34,25,25,98,141,81,129,45,41,8,29,19,27,15,17,8,29,151,249,139,36,10,68,99,95,60,51,33,21,13,27,12,47,56,64,72,90,87,68,65,29,38,19,72,57,13,94,152,185,237,240,245,246,234,235,236,242,177,244,255,245,252,250,246,243,252,246,197,119,16,58,108,15,30,72,28,22,23,102,133,64,29,4,24,88,126,166,95,73,29,33,124,133,141,234,242,238,237,174,68,23,11,23,128,241,250,254,255,163,68,24,8,24,2,10,116,156,86,56,37,1,39,2,70,186,234,218,149,199,232,212,120,72,71,23,16,34,36,108,72,125,234,247,245,252,238,252,247,252,247,237,222,239,181,152,150,158,146,171,169,137,134,121,122,94,100,36,51,46,39,28,36,10,105,218,231,255,221,230,212,207,241,229,251,255,250,160,57,29,54,225,161,56,20,27,154,242,237,234,223,212,235,252,237,244,250,232,249,222,222,222,244,225,176,137,68,62,200,254,202,174,107,71,28,38,158,232,234,250,255,221,170,170,243,217,227,206,210,212,227,212,223,212,227,234,226,217,236,220,186,220,228,211,227,216,235,208,222,202,243,231,202,207,220,209,226,213,228,214,198,203,230,215,223,244,224,209,222,222,235,233,233,211,201,223,227,216,216,209,205,203,225,209,214,228,207,239,210,222,196,233,223,206,119,10,10,14,19,12,20,0,14,15,9,8,17,0,5,4,13,11,6,22,32,0,16,29,14,234,226,232,229,250,224,247,222,211,219,239,233,223,224,204,247,232,231,245,253,240,219,227,219,237,243,245,216,222,227,221,243,252,244,235,236,225,218,232,255,239,233,247,244,233,230,244,245,230,245,223,216,245,234,243,244,245,254,244,247,223,227,231,227,242,247,237,229,237,237,223,219,229,245,247,253,239,218,249,224,239,241,232,235,236,242,232,239,227,229,231,239,243,215,222,238,226,234,235,236,215,221,225,231,241,216,223,235,227,241,216,237,238,222,227,205,218,221,236,230,236,240,241,232,235,231,244,250,254,242,236,222,230,227,241,247,253,244,225,249,233,237,240,229,250,224,235,247,241,234,229,247,231,240,217,247,223,235,222,246,228,234,201,216,220,230,241,233,241,223,241,224,213,243,245,234,253,241,233,245,237,212,228,254,245,229,252,243,215,237,239,211,247,245,231,243,242,238,244,234,243,244,233,219,230,246,227,235,239,215,217,243,224,218,220,217,219,240,222,226,217,233,246,234,218,223,233,223,237,236,220,245,234,244,207,238,230,240,255,253,251,252,235,228,246,248,250,253,228,247,243,239,234,245,236,225,237,230,231,248,242,240,236,236,231,232,245,232,230,249,236,235,229,232,255,249,242,240,225,251,238,221,245,246,255,233,247,253,227,225,237,228,231,253,228,249,243,250,234,248,224,246,230,228,225,249,253,230,243,254,244,242,224,233,237,243,249,249,238,244,242,248,249,255,255,183,194,220,224,230,244,241,231,242,206,198,225,234,251,176,174,253,230,251,207,134,140,122,118,64,188,244,245,240,245,236,240,242,237,240,253,155,143,72,15,16,18,3,12,6,9,35,52,45,28,127,192,139,60,17,11,29,15,58,20,48,74,36,76,55,36,46,25,36,15,61,81,64,166,136,105,41,16,10,8,7,1,17,10,131,228,108,33,7,56,103,73,30,30,0,21,40,25,32,21,36,64,88,102,88,52,49,23,20,36,53,99,119,166,121,161,220,224,236,232,225,250,254,252,237,251,240,254,226,156,142,163,232,253,189,109,43,77,97,0,43,53,41,27,14,187,191,46,25,3,39,97,167,213,136,108,84,65,105,94,116,237,240,229,245,178,67,52,6,21,190,229,253,249,192,62,25,3,38,3,41,210,245,219,80,14,5,8,13,26,29,63,169,223,173,210,243,237,163,148,73,7,23,12,86,222,174,138,253,222,251,250,229,244,234,238,233,215,231,224,161,125,135,152,182,160,151,138,121,137,180,176,237,180,63,36,31,32,30,41,11,48,201,249,236,232,234,237,236,233,237,199,122,92,53,31,160,231,233,193,43,35,159,219,246,239,250,238,234,224,237,217,249,246,240,226,241,245,247,243,223,150,104,3,104,174,144,112,49,34,84,175,228,240,250,252,235,195,165,173,206,241,212,220,231,218,231,218,202,233,246,211,215,237,218,208,240,227,215,227,250,229,228,249,197,207,217,202,199,221,215,231,213,208,218,207,216,217,217,224,230,232,214,211,197,207,220,197,220,209,225,221,211,227,212,207,214,235,199,215,214,222,223,233,209,214,228,191,198,209,108,11,30,8,8,7,3,15,25,11,23,10,23,20,0,5,5,4,10,24,3,4,24,34,15,238,249,240,211,247,226,212,238,230,229,230,208,226,228,229,228,231,233,239,219,234,220,242,229,231,241,221,230,233,226,237,241,237,198,208,236,209,252,219,218,229,229,236,228,242,250,226,242,251,232,248,230,224,230,230,246,229,226,246,232,235,243,226,246,232,226,242,241,248,225,237,223,227,224,239,230,231,202,243,222,223,216,228,235,241,234,236,241,246,219,242,223,204,247,219,223,230,240,236,221,229,217,213,227,230,209,243,242,215,241,240,224,231,217,231,210,234,239,220,238,228,229,235,224,244,239,247,233,243,230,227,249,242,223,238,222,231,242,228,223,242,227,228,238,212,221,234,232,247,218,224,218,238,219,222,254,229,235,211,242,233,220,227,233,248,226,245,219,225,246,232,213,234,228,215,214,226,238,227,239,235,246,225,221,225,222,239,204,238,233,235,242,225,214,245,249,229,223,218,236,230,254,251,233,231,236,237,246,218,218,244,238,230,240,236,219,214,237,223,215,240,206,223,229,226,216,250,229,218,246,241,210,232,230,241,240,249,245,231,222,254,239,232,226,249,244,246,243,220,252,236,240,236,255,221,227,232,243,243,249,252,254,221,244,236,254,243,237,230,236,249,218,200,216,233,235,233,229,226,239,248,238,255,244,243,249,252,255,242,238,231,226,232,238,240,241,229,240,247,233,243,222,255,255,242,242,217,247,240,230,254,255,218,245,221,250,255,245,217,250,236,224,231,245,244,225,232,237,226,221,220,190,196,205,222,208,246,223,253,215,201,234,229,222,188,155,130,128,102,70,183,252,237,250,237,233,212,224,209,254,236,192,226,155,145,93,7,9,15,11,18,47,75,81,11,73,140,185,81,18,6,3,58,49,29,33,50,42,69,60,34,33,29,52,12,67,63,23,42,111,203,130,89,60,45,37,13,11,45,35,52,46,26,36,16,78,63,26,25,10,16,23,9,17,19,44,66,72,106,34,48,19,26,49,105,213,205,168,116,136,219,228,251,238,239,226,248,255,250,214,247,238,250,128,100,97,63,131,148,154,110,17,90,94,16,64,48,7,52,11,52,115,50,34,12,64,149,204,155,105,87,83,31,59,52,124,242,252,239,219,99,20,12,8,95,219,237,241,160,80,15,1,38,69,106,234,239,251,144,43,39,79,80,74,10,4,9,133,200,184,205,225,219,155,137,53,31,25,39,188,215,237,159,198,255,239,241,253,244,254,233,228,234,222,187,156,151,138,178,165,177,146,99,133,160,213,220,219,217,98,88,139,85,51,34,15,23,91,229,238,223,221,238,249,188,116,26,13,9,14,7,35,159,227,229,107,6,59,193,228,229,242,223,139,106,90,108,199,238,237,240,221,214,232,207,197,186,118,55,48,30,34,51,21,78,180,245,249,240,241,247,208,181,164,239,224,219,216,201,209,216,209,238,235,218,225,223,213,224,236,249,224,203,202,208,208,212,200,226,220,235,204,228,222,216,221,191,207,218,221,219,216,219,213,217,202,195,226,219,212,207,233,204,205,215,220,237,231,215,201,221,206,236,209,210,177,234,208,199,216,215,206,229,206,209,121,9,11,3,4,1,29,17,5,14,36,9,15,5,2,9,4,1,13,9,7,7,23,27,52,230,217,236,239,225,232,222,206,224,227,213,248,223,245,224,208,207,242,221,216,240,246,219,240,203,229,215,242,236,232,241,232,203,223,221,226,240,239,208,249,231,232,239,220,230,243,248,247,214,228,255,235,232,232,243,242,218,210,248,216,223,232,240,234,245,243,238,245,242,246,246,232,243,223,244,237,224,232,244,231,215,245,238,231,231,234,210,228,211,227,240,213,240,215,232,233,234,211,241,219,216,220,217,245,226,215,237,221,228,233,237,215,222,234,241,242,215,248,249,217,241,244,224,240,248,224,228,231,219,236,227,236,239,230,210,219,215,244,234,230,239,224,231,234,254,234,224,253,252,226,244,237,243,235,212,213,224,224,234,244,247,207,230,229,208,235,230,233,233,228,229,251,235,240,239,212,216,243,220,225,229,220,240,219,237,233,230,236,242,249,224,241,234,238,232,241,246,218,227,225,223,247,237,238,249,216,235,204,227,220,228,228,242,224,252,216,245,234,232,224,233,237,247,233,237,232,234,220,245,222,229,235,248,201,253,230,231,227,241,235,253,227,219,252,232,240,240,242,240,248,251,220,240,255,224,239,245,232,252,248,237,238,250,229,250,250,255,238,212,250,234,217,231,231,240,224,207,228,244,238,247,226,226,244,247,252,248,245,253,251,232,248,236,247,222,255,247,231,237,229,230,245,252,251,222,246,247,234,237,243,248,245,244,251,242,229,224,241,220,245,236,238,243,255,220,210,219,236,204,195,204,166,213,248,244,233,254,230,252,191,189,233,244,239,160,151,86,107,91,32,194,236,246,250,234,239,231,241,247,239,237,202,228,252,243,151,74,22,0,18,19,48,65,39,49,11,17,90,129,60,19,25,64,52,31,14,48,36,114,106,23,25,13,37,29,51,95,34,12,27,93,135,155,124,72,63,63,48,21,38,39,20,64,11,19,39,24,35,21,8,16,11,38,42,32,57,40,32,69,74,142,141,162,157,153,215,176,116,109,155,210,248,216,233,246,244,208,180,104,56,133,206,145,71,139,120,15,21,9,37,38,11,84,106,69,74,48,42,24,4,65,107,44,37,29,75,171,202,122,64,39,133,131,60,39,53,224,225,199,109,29,9,15,10,44,188,192,161,91,8,14,88,252,240,231,252,253,98,60,116,240,235,226,211,101,13,28,145,194,207,211,236,178,95,36,28,1,25,103,223,220,240,124,183,254,241,252,235,251,232,230,235,216,209,133,158,156,155,153,180,153,138,135,123,188,238,179,123,112,81,226,222,207,105,54,29,5,56,158,240,251,229,243,214,116,43,22,0,9,6,32,14,52,158,211,148,54,22,91,204,245,246,185,111,51,23,14,72,188,176,244,140,75,129,115,112,127,125,123,99,52,17,26,42,145,217,255,250,246,244,219,186,154,203,212,214,209,226,210,202,228,237,215,232,184,216,229,249,245,238,245,248,231,239,210,210,237,195,230,233,226,225,213,213,214,228,216,219,229,230,210,213,217,222,215,223,226,204,206,220,237,240,223,183,226,211,234,221,214,215,206,226,209,209,211,218,212,231,212,213,217,223,193,203,203,119,34,6,4,28,4,5,15,30,15,15,28,19,19,3,6,6,13,4,17,29,12,8,0,8,240,239,234,249,238,245,235,226,226,218,229,218,231,242,208,216,239,241,230,242,231,236,215,240,251,241,232,230,238,245,228,208,233,212,233,221,220,225,239,234,238,247,227,235,223,238,229,253,253,240,231,245,239,236,232,230,227,246,220,241,225,235,236,241,232,244,242,232,228,230,218,215,236,249,232,215,239,240,216,235,226,240,224,248,237,215,215,239,230,231,221,226,237,227,224,222,243,224,223,225,226,228,227,239,228,231,223,238,245,220,239,230,225,240,221,224,240,192,252,222,243,217,214,216,206,244,220,245,226,208,242,217,227,221,236,225,248,230,242,233,234,238,230,236,223,242,233,237,231,231,228,230,240,222,239,239,237,216,217,246,208,250,232,218,206,252,232,245,231,236,223,228,248,233,236,247,233,236,236,244,235,229,246,227,234,248,243,237,250,248,226,226,246,227,243,247,245,238,242,235,249,249,252,231,224,243,227,231,239,200,238,235,223,226,239,227,218,235,230,200,227,247,237,248,213,204,234,212,220,227,230,211,230,231,248,252,191,248,241,255,229,214,208,240,231,230,224,232,247,248,239,245,231,252,242,252,238,254,246,209,240,254,251,219,252,230,232,225,213,249,251,222,236,239,218,222,247,250,251,244,235,226,237,234,231,251,237,228,255,241,249,247,250,235,249,224,249,248,234,223,242,248,246,239,222,234,231,237,247,251,252,238,249,221,247,219,240,247,251,236,254,237,249,222,208,208,216,175,188,202,233,216,240,252,231,212,242,237,221,185,195,247,238,254,169,164,128,109,121,70,179,239,243,247,219,218,239,245,225,223,239,221,251,247,237,216,149,150,99,25,9,13,7,8,16,0,26,13,96,126,27,20,13,31,44,20,29,34,71,105,21,21,18,23,39,66,95,45,42,29,71,18,64,155,176,112,46,53,29,37,34,26,14,35,20,40,25,21,32,16,9,24,30,18,27,33,10,29,109,124,184,252,243,238,203,246,160,122,99,173,236,224,254,243,145,86,19,2,7,18,21,15,32,15,27,46,35,47,19,27,59,28,54,99,60,69,37,27,14,0,182,217,61,20,54,97,128,110,107,20,34,183,129,51,29,5,45,69,76,33,11,111,148,54,8,10,20,41,23,30,167,240,247,253,255,253,117,24,81,243,230,239,247,195,91,32,22,165,214,220,235,174,95,63,51,44,2,123,217,200,247,171,95,112,225,235,232,243,238,255,208,232,194,190,107,128,139,156,185,168,170,135,138,146,201,245,165,44,28,55,235,234,242,171,86,24,17,3,91,221,239,245,246,161,76,32,16,23,160,210,139,80,19,12,89,123,68,27,29,128,235,224,212,136,109,43,11,22,119,206,240,187,61,27,0,32,58,49,82,97,106,138,101,134,173,248,237,243,235,210,183,173,184,222,237,216,214,220,198,230,209,223,226,252,236,239,236,250,252,250,222,242,251,241,207,234,224,212,250,201,237,237,205,210,210,221,225,227,215,216,234,208,225,211,230,203,227,206,201,210,233,199,215,217,209,189,221,232,208,231,208,221,220,226,221,203,211,203,210,213,213,224,227,210,219,104,8,14,18,22,12,9,7,6,12,11,25,12,0,11,5,4,25,2,4,7,15,2,12,8,238,225,228,237,209,244,209,215,246,231,222,218,220,242,249,209,231,241,214,237,229,219,232,240,207,231,243,244,221,211,220,217,242,218,216,203,230,239,226,234,232,239,227,230,238,213,248,229,231,242,227,243,223,248,219,246,255,243,229,231,233,203,222,234,236,222,219,231,241,241,232,236,227,246,223,249,223,204,238,238,244,240,216,222,245,239,250,248,234,231,225,246,233,232,239,214,241,217,249,226,222,248,220,232,231,234,255,211,231,245,233,224,231,250,238,252,239,216,209,237,216,226,234,223,245,240,230,239,241,251,233,228,213,246,246,244,242,226,233,234,201,241,246,219,236,229,224,237,248,221,248,234,239,247,232,234,250,246,240,221,242,205,229,245,243,226,233,233,234,203,222,230,247,240,218,219,236,227,237,221,212,227,236,227,240,209,229,242,238,227,233,242,253,203,222,241,216,224,235,241,227,252,251,240,226,243,232,232,217,217,228,229,217,215,228,231,232,226,230,229,235,227,247,252,210,224,225,247,221,215,238,218,225,242,244,236,238,250,232,255,253,244,252,234,229,253,234,237,245,224,222,251,230,254,233,227,236,246,231,249,241,255,237,245,235,252,247,249,247,254,252,243,244,238,216,249,239,242,255,251,235,232,250,249,247,249,252,247,231,228,242,220,247,255,249,233,240,254,248,235,243,232,225,246,239,253,235,239,237,244,253,229,232,237,251,234,239,245,255,219,236,247,236,221,234,190,199,208,206,237,228,211,207,222,203,218,226,243,231,187,219,255,254,247,191,162,123,120,106,47,196,239,222,231,236,224,250,230,215,238,240,227,201,215,232,230,239,254,216,234,248,207,212,199,208,184,141,147,176,190,162,99,65,55,19,25,29,10,77,69,34,46,18,77,59,76,77,46,47,44,40,25,29,38,96,184,147,132,97,68,8,12,31,28,38,17,12,26,41,32,13,19,18,33,6,92,151,179,228,167,236,255,192,169,230,226,150,97,79,209,239,255,252,165,57,1,0,26,6,30,25,19,48,0,156,199,144,43,18,21,37,54,83,151,111,85,35,9,22,56,236,217,92,32,25,103,135,82,66,68,54,72,80,41,60,17,17,22,35,90,209,244,242,178,88,29,14,25,25,153,247,244,246,252,207,128,20,55,233,236,246,192,65,45,9,1,79,249,252,237,181,119,54,14,46,28,48,196,252,193,142,62,24,61,227,246,254,238,242,243,227,237,182,143,136,134,162,154,173,155,167,129,157,127,191,234,173,124,9,19,111,221,229,187,103,50,11,12,33,136,233,236,229,171,88,42,40,167,238,242,250,176,101,14,8,97,104,64,3,38,192,234,249,145,20,22,6,61,145,211,241,203,132,58,45,34,33,22,35,132,165,196,207,173,115,153,234,226,232,198,144,169,233,220,206,221,229,210,214,224,215,227,204,241,244,239,231,186,139,130,133,192,217,214,226,216,221,204,238,222,209,223,229,248,239,225,219,242,228,228,223,198,215,212,214,218,218,233,221,218,214,210,213,217,223,225,224,225,225,208,243,194,223,226,216,210,208,217,215,229,219,231,222,217,190,96,19,0,1,4,20,12,9,20,26,17,32,25,19,13,7,6,24,24,5,10,27,2,23,47,242,215,231,217,236,201,235,214,230,245,241,225,243,232,246,224,214,224,251,236,213,232,231,244,240,246,214,224,204,221,244,220,231,241,234,211,222,225,234,234,218,223,211,240,223,243,228,206,223,229,215,238,233,238,199,236,227,234,208,228,228,211,231,219,206,247,240,230,214,216,226,227,240,200,226,247,249,230,217,235,238,226,224,228,240,235,227,241,224,237,242,208,221,240,227,247,240,228,208,228,244,227,225,212,223,219,237,213,241,238,239,237,253,235,245,222,228,212,239,216,242,252,228,242,239,241,245,244,229,221,240,215,217,229,241,219,230,228,231,253,230,240,229,230,226,233,207,190,247,236,235,226,252,210,235,212,201,211,232,243,218,241,221,241,250,218,212,232,232,243,221,230,214,234,241,227,218,231,231,228,219,230,215,218,236,211,246,224,217,240,240,231,239,238,234,222,241,238,245,234,216,228,232,234,250,242,229,253,215,242,199,225,227,207,212,227,219,234,231,214,221,222,204,231,243,243,235,222,210,223,231,237,232,248,224,248,239,254,246,255,246,252,242,250,253,243,246,231,255,250,238,239,236,246,247,233,254,246,241,222,242,238,250,234,242,250,238,233,231,233,224,244,249,233,241,254,228,233,241,228,242,253,244,246,240,244,234,227,232,250,235,220,255,243,221,249,231,232,246,215,237,251,228,242,246,252,231,250,216,248,236,242,252,238,243,224,252,212,245,247,234,251,222,248,236,198,238,215,219,243,220,170,181,233,232,209,245,237,218,163,217,239,234,242,173,177,98,120,93,42,192,237,247,246,246,233,250,238,251,179,210,231,228,232,216,229,230,240,238,244,226,255,254,247,251,241,250,253,253,252,241,238,255,249,216,135,83,22,67,46,26,49,86,91,61,68,95,55,46,27,50,35,29,42,13,45,90,162,133,98,18,10,50,43,48,42,14,28,24,22,60,8,22,83,216,249,244,235,188,174,246,237,102,91,222,237,157,68,107,233,241,255,172,47,13,21,15,53,23,32,31,25,41,11,86,227,224,141,51,50,83,62,98,151,127,107,29,15,37,97,250,218,56,32,13,69,49,52,31,66,44,96,69,41,31,22,42,20,32,164,245,254,252,220,206,67,67,28,58,194,234,244,220,152,74,26,13,137,239,212,232,188,82,42,66,135,241,238,244,180,96,13,26,36,57,4,113,237,236,165,106,47,12,23,151,241,225,243,254,232,211,193,152,162,130,157,141,149,162,155,152,138,125,159,159,132,153,76,9,76,147,221,255,224,91,37,40,23,36,25,99,213,255,253,120,42,4,18,99,177,225,237,164,80,19,108,165,125,64,25,49,163,127,108,50,26,13,112,221,230,234,225,156,120,70,62,31,51,67,139,123,127,117,65,12,20,171,228,250,183,162,187,234,223,234,226,225,239,224,229,239,236,230,208,142,137,115,112,94,105,66,110,235,226,255,243,237,247,229,218,223,201,198,239,222,218,216,220,229,214,222,223,225,203,210,220,228,227,214,210,234,180,207,224,235,221,214,212,221,224,233,237,217,214,197,225,215,238,213,231,197,229,202,236,231,119,13,11,19,8,23,19,45,19,14,22,25,30,27,16,0,11,14,0,7,26,12,19,1,19,214,234,216,235,225,234,226,227,231,234,210,224,255,243,237,222,245,232,241,229,244,227,214,244,222,227,239,225,224,230,233,249,227,226,205,247,244,245,213,227,235,237,239,218,243,242,231,245,215,242,238,221,205,214,231,220,222,232,239,232,234,217,236,232,228,219,227,221,237,233,247,222,209,236,230,202,236,222,220,231,236,237,235,236,208,249,226,250,238,238,219,212,221,239,232,226,232,228,229,216,229,246,238,222,229,237,229,223,230,201,236,237,213,233,235,222,243,231,245,213,213,234,234,228,219,244,238,252,215,238,232,243,235,234,217,229,215,222,250,236,224,232,237,238,218,224,243,214,228,240,238,244,221,222,239,237,225,219,223,224,233,211,243,234,226,225,230,242,243,228,242,224,233,235,239,212,238,227,230,234,248,205,214,226,229,242,234,222,226,231,231,226,242,241,231,244,235,236,245,232,244,217,238,240,221,228,247,236,220,225,245,216,228,231,221,235,199,237,213,234,217,223,218,222,234,253,226,252,229,243,239,220,237,228,235,226,246,240,230,255,238,255,239,247,229,252,247,253,236,232,239,248,234,247,237,243,252,248,251,236,253,238,215,234,229,251,250,238,255,246,246,242,254,235,240,236,234,252,250,254,250,251,255,229,250,237,229,249,255,232,238,247,252,249,238,241,239,233,245,244,236,238,240,237,237,252,248,249,242,247,250,236,239,243,251,240,239,230,235,245,222,235,237,228,208,211,222,196,189,220,209,201,207,239,242,188,237,248,200,188,217,231,252,248,190,141,127,100,66,36,206,247,249,249,227,229,239,241,220,186,244,243,242,253,219,239,206,228,228,240,249,244,234,253,246,252,252,244,249,252,253,247,255,253,242,201,108,31,46,119,155,137,127,109,73,61,111,56,38,28,51,37,53,69,39,41,57,15,147,239,208,234,219,194,74,6,86,179,191,209,198,210,234,253,249,219,186,145,163,223,243,195,51,88,193,223,160,77,126,232,255,216,94,8,23,5,21,47,23,32,30,43,38,39,25,189,255,238,158,61,94,76,89,120,139,89,21,100,81,73,169,107,58,28,37,34,22,54,22,62,107,80,46,67,50,62,66,15,26,175,229,231,248,242,233,121,35,31,33,98,112,57,19,26,42,10,3,131,249,254,254,241,252,251,252,251,218,197,99,48,0,22,46,51,34,33,208,230,241,208,148,85,56,2,129,218,246,231,220,238,225,182,149,146,139,148,147,149,186,152,142,143,119,150,117,23,4,18,96,223,248,249,243,239,140,56,30,14,15,39,19,62,192,216,202,115,3,17,34,172,199,233,249,120,57,2,70,125,83,61,43,57,47,48,15,41,164,232,251,248,242,251,234,180,104,77,27,44,46,43,47,68,82,85,29,26,121,213,199,173,197,231,232,248,248,237,222,223,225,236,223,251,212,126,96,74,22,47,36,75,75,124,226,230,246,254,242,246,242,235,234,231,209,205,210,220,216,241,215,222,207,221,221,230,228,215,217,193,213,227,218,228,232,196,194,232,215,243,210,220,234,217,209,248,204,226,219,216,212,206,182,209,243,221,234,109,13,23,4,25,0,22,22,6,14,5,10,7,15,15,12,7,25,6,2,17,22,5,10,11,225,236,231,220,232,237,222,241,197,222,199,253,245,228,223,215,227,233,249,214,231,236,226,227,210,224,220,189,214,219,202,216,220,243,218,229,223,234,227,208,240,231,234,212,240,233,238,222,242,220,216,225,230,227,232,214,236,241,216,206,226,245,209,228,210,218,231,245,240,253,224,210,240,238,233,205,235,222,242,233,239,226,226,233,230,245,237,232,190,243,232,228,239,238,244,237,229,236,252,223,222,223,216,220,203,223,236,229,206,230,231,250,225,250,245,234,249,232,245,231,207,236,218,231,221,235,241,240,234,231,239,228,216,238,210,254,226,235,232,239,228,238,227,233,205,204,235,214,243,240,235,235,222,243,238,228,227,243,220,228,228,213,203,216,215,230,214,216,225,229,243,235,213,235,217,236,248,223,238,225,238,206,245,249,241,237,237,229,246,225,244,239,238,229,235,231,244,248,236,241,241,245,253,252,229,241,245,226,247,239,223,222,226,210,209,225,219,229,205,216,207,226,234,221,213,236,241,248,238,235,221,238,217,236,236,229,231,244,240,239,243,235,219,232,239,244,253,246,235,234,239,249,245,241,250,228,252,240,237,245,255,228,226,241,244,234,252,234,235,241,250,236,245,240,231,246,235,253,254,240,252,244,249,229,249,244,230,230,239,233,243,232,210,242,255,233,240,246,253,254,234,225,250,221,238,239,255,246,231,246,253,244,232,242,251,255,233,255,228,234,237,255,233,242,207,238,205,179,215,228,210,218,251,249,194,198,231,214,228,195,214,252,252,231,170,136,112,104,94,35,186,233,249,241,214,231,244,250,217,195,245,250,246,228,250,235,208,215,183,222,236,144,148,223,231,250,232,198,246,247,253,255,224,248,219,179,57,5,25,112,219,171,146,126,71,68,112,57,10,24,59,29,44,54,25,14,52,10,82,242,248,235,253,210,157,27,133,229,253,242,243,235,252,244,244,215,120,157,140,244,254,132,115,101,163,206,176,106,166,251,248,180,40,31,27,20,35,46,29,10,76,47,34,19,47,123,237,234,184,80,36,64,80,134,143,35,19,138,79,72,89,22,91,29,8,16,4,39,32,100,126,78,66,66,52,54,74,35,37,192,252,252,255,191,143,87,18,5,27,18,42,49,50,62,29,27,27,86,215,226,226,242,247,251,240,253,150,47,2,8,29,27,46,17,18,191,236,249,241,220,171,161,171,95,140,231,252,231,243,243,212,164,171,157,139,126,165,153,155,130,126,138,146,152,165,135,201,237,210,239,252,246,246,254,203,123,31,38,25,5,14,38,45,106,210,219,230,233,238,236,253,241,240,188,87,27,22,13,39,65,14,30,29,24,185,234,246,247,241,241,238,247,210,162,123,61,23,39,35,17,36,18,134,229,143,39,72,128,155,190,222,241,242,247,244,245,251,242,233,248,217,233,212,129,104,30,1,18,11,41,25,90,160,153,189,220,235,244,253,244,230,213,227,234,241,218,219,231,230,227,223,224,236,226,212,238,227,230,189,212,214,218,243,216,216,201,192,221,228,206,221,211,204,198,212,214,213,196,198,198,212,203,225,221,210,104,1,2,5,0,13,20,0,7,17,13,20,26,12,10,1,22,0,14,3,42,34,8,14,20,252,240,228,211,203,239,227,225,239,235,236,243,234,247,215,235,219,206,232,241,226,239,242,237,232,235,225,215,231,234,222,220,241,228,219,214,235,225,231,230,232,239,197,237,243,236,220,226,227,237,205,242,243,238,222,230,240,197,229,230,248,228,240,234,212,251,227,226,230,244,247,238,217,229,234,244,242,240,218,236,224,224,251,213,227,227,224,238,203,219,242,223,229,232,251,209,225,240,206,211,246,235,208,221,233,221,242,227,246,234,223,228,245,245,220,227,219,254,218,242,222,207,236,236,240,232,220,227,204,250,219,239,235,216,209,212,237,247,248,218,224,221,240,253,224,217,228,213,240,233,219,224,239,210,217,228,204,227,237,213,226,234,216,212,224,219,244,242,234,231,220,235,239,229,228,230,210,235,213,206,236,233,221,237,229,219,214,227,232,207,224,219,223,228,245,217,252,236,215,236,232,243,248,245,222,234,221,245,254,221,201,236,211,220,237,219,237,209,217,223,236,230,216,238,220,232,236,240,213,229,238,238,243,254,226,239,232,243,255,255,247,240,255,247,252,226,251,234,234,239,210,255,239,226,225,251,243,237,234,246,229,228,231,245,249,239,230,236,212,254,232,250,243,254,244,251,247,241,227,224,227,228,247,236,217,251,237,240,242,230,237,230,242,240,230,226,226,236,222,223,246,240,249,235,243,246,250,227,249,253,226,249,247,213,246,250,235,244,246,230,205,242,229,249,216,207,219,241,240,246,247,196,173,194,213,218,249,244,199,210,224,252,218,247,197,158,109,122,92,32,186,243,251,254,222,239,229,240,229,180,229,246,244,253,207,229,201,150,122,231,224,160,212,221,244,243,234,223,242,255,244,216,191,211,144,132,149,65,72,156,221,163,126,85,42,58,86,57,33,39,70,18,37,53,64,43,73,32,98,230,201,187,244,221,183,51,96,238,240,246,234,238,250,207,210,191,150,176,188,254,202,150,149,191,197,195,169,131,211,249,228,152,30,18,21,9,35,18,134,212,197,172,108,54,7,128,245,252,191,94,18,27,71,91,73,27,31,97,67,81,65,45,34,44,3,48,60,64,30,48,25,61,48,90,27,44,13,34,212,255,239,187,84,34,5,9,59,11,32,25,54,47,30,45,51,29,54,28,91,200,239,230,237,204,165,70,34,9,27,52,28,61,35,60,169,246,244,239,237,253,174,191,237,172,138,240,240,243,233,227,196,161,154,149,149,160,175,171,161,154,135,110,121,179,238,218,251,248,249,241,236,246,249,253,249,219,105,36,22,21,14,24,27,10,72,208,240,239,230,244,250,250,246,166,130,24,23,17,23,25,24,30,14,119,229,228,233,221,255,250,250,196,151,115,44,16,5,20,16,40,37,16,150,212,231,189,99,89,150,214,234,236,248,242,248,247,232,228,227,221,225,248,186,134,66,43,16,10,77,102,58,36,5,13,0,16,78,187,193,218,230,241,225,212,195,220,223,212,215,233,230,212,234,242,240,212,200,201,230,201,228,204,213,195,228,205,215,230,235,210,218,215,225,230,225,223,225,211,231,220,237,222,219,207,226,81,15,10,0,14,11,0,10,18,10,22,8,11,10,22,7,16,6,3,0,14,8,10,11,5,228,214,223,243,196,212,205,241,243,212,249,223,226,239,243,226,229,236,200,220,229,217,232,223,231,239,194,229,217,194,229,233,196,233,210,218,232,247,211,202,213,225,222,220,215,182,241,214,243,229,227,234,226,220,238,202,232,237,211,239,245,213,225,228,241,237,233,238,246,233,225,221,207,227,231,249,226,253,228,210,215,239,215,227,227,227,220,242,225,230,249,215,224,241,235,216,228,215,224,214,215,240,231,192,219,237,230,233,248,244,247,223,206,216,218,227,229,234,217,228,238,245,249,245,235,246,228,221,204,232,241,223,234,214,248,211,230,231,220,229,223,240,230,205,223,220,219,220,248,229,210,231,202,233,233,216,238,235,225,222,214,233,226,219,203,216,222,208,221,213,233,212,225,208,228,201,191,237,228,222,215,216,202,211,212,233,209,220,221,227,235,248,239,242,234,225,236,219,235,253,240,231,211,255,235,225,224,237,223,207,236,236,232,229,216,208,225,231,187,208,223,243,226,231,231,233,202,228,234,215,225,236,230,238,226,249,247,232,246,248,222,236,245,247,248,228,236,240,236,249,245,254,221,240,223,245,230,252,252,246,243,235,211,246,247,249,229,245,233,238,246,229,246,240,253,215,232,225,248,221,253,238,247,255,240,231,246,223,231,242,237,218,232,252,234,255,225,241,250,246,240,246,235,236,241,255,255,238,219,235,242,246,225,249,244,241,231,248,246,227,245,206,251,232,226,209,220,234,228,209,187,183,212,253,221,245,232,227,202,207,208,237,220,219,225,162,116,113,95,45,194,244,244,233,219,237,239,249,201,188,255,233,231,204,182,206,198,177,163,187,250,191,225,251,250,254,247,205,203,188,208,223,151,156,161,189,202,125,169,157,197,137,168,110,49,72,87,94,41,49,46,46,38,59,37,20,33,17,78,190,149,154,205,183,190,133,92,191,227,252,212,255,219,216,253,210,181,210,200,210,177,182,197,191,214,204,157,171,243,251,235,119,24,6,4,41,19,154,249,237,241,252,151,42,10,81,255,233,149,91,31,10,42,58,76,49,47,93,65,64,65,33,9,41,52,64,36,47,27,19,20,44,66,51,55,44,96,227,236,237,143,71,9,10,29,31,22,16,30,11,51,51,13,35,27,47,53,37,32,13,80,114,66,61,64,29,62,77,99,94,90,87,132,123,208,198,191,152,162,142,113,98,134,97,90,155,233,242,244,190,166,161,145,145,124,145,138,159,155,145,153,123,152,178,175,127,212,230,206,193,237,233,242,247,246,238,141,28,2,22,40,43,52,36,15,3,160,209,246,255,251,233,196,119,79,21,3,43,27,41,23,9,27,51,142,191,224,222,244,255,239,158,83,65,42,12,15,28,73,57,40,15,73,167,228,207,165,193,215,225,237,242,203,165,129,125,150,210,242,232,238,206,134,115,41,38,11,29,99,79,92,61,36,33,45,20,22,20,110,213,238,236,223,217,232,223,216,203,211,228,230,219,225,218,238,225,226,218,238,210,213,220,212,231,227,207,205,212,208,206,246,231,183,204,221,216,222,220,213,244,226,220,237,213,206,110,0,7,1,25,5,20,17,34,17,5,33,0,2,4,0,4,18,8,13,39,11,23,18,16,214,228,237,225,224,218,225,204,230,239,226,239,223,208,203,204,229,223,226,202,235,216,214,213,235,208,200,208,216,228,214,206,214,222,242,210,221,229,217,221,210,226,231,209,208,217,232,248,239,229,206,194,250,227,214,228,219,184,244,221,245,240,227,222,215,214,234,215,236,225,218,232,233,223,245,225,232,202,211,219,218,212,253,238,242,233,223,220,219,226,226,207,225,222,229,239,226,237,232,240,224,246,217,223,236,242,214,228,243,228,250,234,215,232,250,235,235,233,242,234,247,243,226,240,223,225,242,239,226,246,209,227,247,255,246,248,238,239,233,240,236,231,216,223,220,226,243,218,220,218,219,236,215,195,224,206,238,231,217,237,206,201,213,238,222,223,221,228,202,232,214,230,225,232,209,231,218,221,207,227,221,242,220,233,212,210,233,230,235,224,214,212,229,212,247,246,246,232,197,238,232,241,240,227,243,236,218,215,217,223,226,223,216,231,217,225,219,244,222,235,221,244,224,229,216,231,234,224,227,215,220,231,230,229,237,246,246,252,242,250,248,248,249,242,244,255,231,252,243,237,253,241,244,215,246,242,245,243,241,243,250,242,249,251,240,249,247,241,235,248,245,229,236,235,248,239,250,234,242,249,243,235,228,252,249,255,241,239,253,247,242,239,249,244,220,243,243,241,250,241,229,224,254,217,244,255,252,255,249,243,246,223,233,236,250,237,243,249,236,246,238,236,241,216,225,214,216,184,197,229,210,192,252,251,222,229,253,233,222,203,223,249,230,244,203,173,124,135,78,70,193,239,253,244,228,233,253,254,207,200,240,233,242,214,198,216,204,197,176,199,253,219,246,240,216,241,219,159,145,126,180,183,207,219,205,229,187,180,159,137,195,132,142,137,62,89,96,92,17,39,56,17,58,44,53,38,27,3,135,241,170,168,219,175,163,128,105,158,219,255,221,249,203,236,239,239,201,128,156,255,115,188,198,184,228,154,147,193,221,235,220,115,29,5,24,30,135,244,241,235,229,139,80,39,8,51,250,218,159,118,10,17,64,106,54,67,69,95,103,69,38,38,35,46,37,56,22,34,49,37,38,52,44,58,39,120,229,241,249,111,13,7,25,30,34,29,40,28,64,61,72,91,90,92,82,78,97,70,84,72,33,37,23,38,28,35,53,38,54,44,49,75,61,71,31,16,13,17,25,25,34,28,30,58,3,93,244,234,220,191,126,151,133,138,128,138,119,117,141,154,135,148,144,128,71,17,14,4,16,13,27,9,74,93,133,155,126,67,56,73,52,54,47,66,26,30,27,40,148,211,208,185,107,49,21,14,5,30,30,22,11,46,44,35,50,14,18,54,195,213,249,181,85,36,0,5,98,223,190,136,49,30,39,24,18,19,100,215,242,237,249,214,139,72,45,30,56,155,208,239,245,183,97,88,20,40,91,72,89,88,54,58,27,36,72,33,46,58,27,62,190,226,221,226,206,212,217,203,225,221,221,217,234,207,198,229,228,221,215,239,215,228,210,233,237,209,237,219,199,200,230,235,221,220,228,219,202,210,215,207,221,230,226,197,215,116,0,3,18,21,9,16,9,12,12,8,23,0,10,8,15,4,25,14,12,0,11,3,27,27,217,235,226,231,241,222,221,229,210,230,242,233,240,231,241,234,249,230,225,245,227,199,220,241,237,201,228,227,204,238,221,212,223,231,241,217,233,222,213,222,204,198,247,197,230,207,225,219,225,208,204,227,235,227,195,246,218,180,225,228,232,237,235,229,242,207,222,219,240,213,216,245,241,229,232,225,249,229,237,217,240,224,241,236,232,226,249,227,209,251,232,219,226,228,229,232,223,225,214,228,223,224,238,234,218,239,229,223,226,236,210,210,217,219,221,235,225,222,233,241,237,238,237,231,215,244,247,246,245,224,247,238,226,232,246,220,212,223,226,211,231,228,201,220,216,223,222,225,237,234,208,222,223,216,217,218,223,212,221,207,225,235,235,243,231,220,221,210,217,203,221,226,227,215,234,220,226,229,217,197,213,235,213,217,236,214,237,224,242,224,211,228,208,231,217,218,240,228,231,233,233,218,225,240,233,226,223,247,209,223,218,235,224,216,219,235,237,218,227,228,235,239,238,235,215,238,246,223,212,228,225,233,219,237,243,253,225,220,235,251,250,232,253,246,249,244,223,235,253,251,244,245,255,231,245,237,228,246,223,236,228,251,253,252,236,252,228,246,226,237,251,212,219,240,225,244,255,239,230,225,238,246,242,239,242,238,251,243,236,255,226,251,219,243,246,237,237,249,244,247,237,251,222,255,234,253,247,247,244,255,239,252,242,234,227,240,229,234,237,235,228,203,233,227,200,198,211,200,249,211,222,226,241,222,172,209,244,236,207,215,225,244,247,238,183,156,127,92,79,57,203,247,250,245,229,228,254,248,202,205,255,233,254,234,198,217,240,228,182,222,252,232,221,200,202,221,134,132,139,199,230,232,226,244,236,192,150,166,123,136,166,86,190,131,69,81,92,71,17,44,63,57,54,22,50,37,46,27,162,243,223,246,233,187,196,138,78,84,186,221,251,193,178,244,242,206,158,131,215,209,124,166,193,173,203,149,138,183,248,246,242,146,17,0,33,3,141,245,245,231,85,0,0,14,21,127,239,232,171,76,8,32,37,52,57,32,65,51,51,23,20,25,41,46,48,10,38,49,40,24,35,22,38,49,32,152,198,196,70,17,14,38,47,53,45,64,115,135,113,130,127,104,126,97,105,82,33,83,66,74,37,60,52,42,67,53,36,71,65,67,62,75,63,81,67,57,52,55,57,86,64,63,101,66,38,119,224,222,198,174,141,152,147,170,124,134,161,133,155,126,138,129,148,142,77,9,10,36,65,29,23,17,28,15,38,49,44,33,22,30,10,47,53,56,75,38,72,31,32,57,26,37,36,69,64,42,32,26,43,40,18,37,17,24,38,15,27,49,43,121,227,180,153,43,31,28,55,121,209,195,140,70,39,31,28,8,63,158,246,226,236,185,133,72,49,27,20,160,242,238,222,121,103,22,39,123,168,127,67,32,40,18,19,16,46,11,29,24,44,19,148,196,228,222,233,215,252,210,231,223,222,221,210,219,212,226,223,219,234,227,222,211,217,206,223,225,199,242,212,223,217,216,240,205,224,223,224,222,212,228,225,212,213,200,225,120,5,3,7,3,26,20,21,20,2,11,13,8,21,5,5,15,1,14,8,9,25,17,29,9,217,240,229,196,254,219,218,224,212,225,231,219,226,233,224,221,233,226,214,197,242,233,246,216,178,218,206,214,212,236,216,189,202,239,227,240,199,215,185,218,215,185,226,235,231,215,221,232,197,249,239,229,216,208,225,207,238,218,221,221,231,232,220,216,242,211,226,225,242,236,222,205,243,214,208,217,231,228,227,228,229,229,241,223,248,242,210,226,245,232,235,219,234,235,225,223,239,224,206,235,220,219,224,222,232,233,230,225,245,236,229,226,244,243,233,221,227,234,217,239,247,220,232,207,218,232,251,247,230,239,240,233,239,239,212,203,238,231,216,249,220,207,227,221,241,229,242,231,223,232,221,219,232,218,210,200,231,232,245,233,242,213,220,213,208,244,202,232,228,208,230,209,219,216,180,223,214,219,218,206,233,199,214,229,202,224,219,237,207,235,224,230,213,230,227,237,228,230,231,230,227,230,209,233,213,252,232,245,243,229,234,253,228,227,238,215,233,224,215,241,250,243,206,233,222,226,220,218,209,246,244,232,222,237,235,240,226,236,237,240,243,222,220,250,255,234,224,237,245,255,247,236,237,251,252,253,236,246,250,244,231,236,232,241,223,240,221,241,237,250,242,231,252,244,231,235,254,255,230,243,254,236,231,249,253,243,253,240,252,255,227,253,249,243,245,245,249,242,253,226,254,251,221,254,247,240,255,248,236,247,242,249,235,236,239,229,217,242,231,233,216,212,221,212,206,213,217,223,246,212,187,194,222,230,190,193,239,225,230,235,239,225,223,251,190,153,114,98,39,69,184,235,255,243,236,249,235,246,193,216,245,224,242,253,233,226,249,255,211,170,223,183,147,101,172,200,210,232,197,212,251,243,233,238,237,165,103,164,136,171,192,114,197,120,97,67,69,113,21,50,55,39,53,48,41,34,33,31,154,250,200,213,248,228,217,230,142,39,104,187,214,174,147,237,218,169,125,165,255,214,80,190,157,149,200,125,118,205,230,249,253,158,36,7,20,0,119,233,225,209,66,10,17,2,117,243,255,248,165,75,42,8,37,102,80,20,68,56,70,45,18,55,37,56,48,45,47,40,25,13,24,29,80,43,60,107,95,90,55,77,105,111,104,88,104,107,119,81,75,83,80,43,64,49,61,63,71,62,68,93,67,88,117,101,139,147,124,151,166,150,135,123,130,156,148,142,143,135,138,106,113,80,83,79,79,99,201,207,144,124,135,139,137,136,148,146,125,153,162,141,130,117,155,123,95,29,54,49,53,66,61,73,30,41,60,50,61,42,33,54,24,65,37,29,57,41,47,46,25,43,74,42,57,63,34,47,84,54,49,53,59,52,38,49,45,21,32,52,24,6,54,173,158,160,40,31,36,44,185,232,140,90,36,14,57,36,48,195,222,242,250,232,154,82,46,21,101,239,248,229,170,94,50,109,195,178,157,78,59,26,31,7,15,34,27,2,54,65,45,22,61,201,205,226,212,233,234,234,230,217,208,209,227,225,236,228,229,211,214,230,203,197,218,234,233,206,239,231,198,227,220,217,216,202,226,221,222,241,238,203,211,220,230,228,226,103,7,4,18,8,21,11,14,4,7,6,8,12,6,10,6,26,14,15,15,21,3,9,13,20,195,216,220,233,227,217,233,209,243,242,218,208,246,229,239,224,209,245,216,216,248,232,235,216,217,219,232,240,219,219,208,238,236,216,238,221,198,227,222,220,227,229,235,226,227,217,226,237,220,183,202,214,211,200,238,203,243,221,242,216,188,200,218,222,225,244,249,207,224,207,222,244,206,237,229,217,241,229,237,197,207,228,203,227,220,220,249,205,230,250,232,227,228,243,233,228,225,235,234,226,197,218,240,213,229,234,219,216,219,221,224,201,223,213,224,217,218,217,240,223,221,209,237,210,225,230,248,237,243,233,240,245,225,243,230,215,206,217,226,225,227,210,241,228,225,243,218,213,228,222,243,238,211,216,224,223,227,220,215,227,228,209,223,199,216,227,211,203,220,205,232,229,211,229,218,215,198,213,208,212,191,228,230,214,210,226,220,234,233,223,222,211,223,241,243,210,214,252,236,220,206,234,226,211,243,235,236,248,231,238,251,222,217,246,247,227,253,222,224,226,246,250,233,236,219,223,242,244,214,225,247,228,227,237,246,251,221,255,236,231,241,232,240,240,244,246,247,226,230,240,229,246,225,248,249,247,234,241,227,231,234,238,240,236,243,253,224,249,255,250,228,204,249,247,249,247,254,234,230,243,230,251,255,237,236,252,246,239,253,242,232,242,255,235,230,255,249,249,246,243,223,227,255,253,234,235,250,252,253,239,232,255,238,249,249,255,207,219,229,243,232,201,213,210,215,220,178,197,238,212,195,248,234,255,242,225,238,211,230,214,222,224,207,235,192,157,149,117,82,53,196,248,222,229,206,225,255,222,202,220,241,224,239,250,199,226,228,248,133,123,155,149,185,192,238,247,221,247,224,252,249,252,254,255,221,110,129,170,178,189,213,142,154,70,38,59,81,89,13,37,60,34,29,73,67,72,14,14,163,248,181,201,235,227,237,252,120,57,106,208,228,130,147,160,121,122,148,208,244,148,111,216,192,180,173,96,172,206,252,255,237,178,73,5,11,9,34,212,236,253,207,156,179,232,254,244,235,174,102,25,12,41,38,70,68,36,38,73,62,42,31,28,41,28,40,20,29,43,56,44,68,70,77,107,115,92,89,105,115,109,78,46,66,60,44,69,109,69,64,83,69,100,108,127,128,134,140,126,141,145,141,135,144,139,97,100,76,50,63,52,35,82,31,37,39,46,32,51,49,27,51,15,6,31,53,59,133,155,163,116,122,144,143,145,176,161,139,150,142,141,157,150,156,148,90,18,40,22,17,42,44,36,38,43,37,53,26,78,46,60,33,90,54,59,28,53,42,55,58,56,37,29,47,48,57,51,36,95,73,74,87,70,63,29,59,67,26,46,35,42,33,17,142,142,81,40,53,196,242,230,160,116,49,26,37,30,132,229,247,253,249,223,130,109,80,27,35,131,188,154,108,41,85,220,223,194,108,86,39,35,113,150,113,96,45,64,42,29,21,1,97,179,200,232,236,218,225,229,217,237,204,224,228,231,212,231,200,208,212,212,206,215,227,216,234,206,233,227,218,238,216,225,246,234,238,217,206,191,231,233,213,205,241,217,206,125,21,8,4,25,14,3,17,26,31,21,0,13,6,5,22,7,29,1,11,4,11,12,25,10,243,214,239,241,225,239,230,224,225,219,227,229,225,222,241,237,238,231,223,229,244,222,244,222,210,229,216,223,211,212,213,216,229,220,213,191,199,224,228,227,209,214,210,206,214,209,239,196,223,226,219,218,195,231,219,204,189,226,220,209,243,234,212,219,225,216,220,214,226,235,232,232,240,249,237,219,225,206,241,236,227,230,237,240,224,231,232,220,239,224,206,230,246,218,206,227,245,239,227,226,231,223,235,223,220,234,210,232,239,228,226,229,196,226,237,241,223,219,243,233,240,229,243,231,205,212,241,231,220,234,240,231,228,243,249,234,217,209,221,218,220,219,245,228,216,212,235,238,226,234,223,228,199,210,224,229,223,212,199,196,215,217,212,225,214,223,204,215,231,208,216,216,222,192,223,237,221,216,221,223,232,204,216,217,211,227,226,226,221,220,209,217,227,235,221,225,217,234,228,225,230,240,243,225,232,232,247,238,229,224,219,243,226,228,238,244,219,224,222,240,218,251,210,234,233,241,204,224,217,240,229,221,193,234,242,234,231,221,253,225,242,247,255,242,246,246,248,234,243,249,253,237,232,249,243,247,241,236,236,246,243,245,242,232,250,249,243,213,228,234,226,246,249,250,237,252,237,240,246,247,213,252,230,253,238,248,243,236,246,226,248,242,216,234,229,248,239,243,239,252,255,240,247,244,239,236,252,247,232,249,236,243,246,241,232,233,240,226,218,234,237,226,229,197,212,209,166,200,242,221,222,240,236,246,202,211,242,229,228,208,215,243,226,251,212,166,154,113,76,42,197,234,246,255,196,225,238,241,178,208,252,240,244,253,204,150,169,151,104,90,149,216,226,225,247,251,225,232,185,237,237,215,249,229,205,108,131,178,167,190,187,154,140,68,51,33,21,98,29,58,52,32,26,64,33,43,15,94,207,247,191,210,242,209,203,238,135,11,100,197,216,142,132,127,119,143,176,207,174,111,170,213,222,240,188,79,179,246,251,239,240,226,141,3,0,12,4,85,232,240,252,255,255,243,228,235,96,45,9,5,18,48,22,22,43,45,45,37,59,49,34,21,49,50,61,67,70,70,87,100,119,106,100,73,41,71,32,52,57,53,40,55,75,104,91,116,134,127,150,131,145,143,118,120,96,51,62,36,58,51,45,33,28,21,43,25,47,39,31,49,13,14,43,32,31,9,46,55,20,31,37,17,21,1,19,4,19,75,129,154,144,146,160,165,161,174,159,174,147,152,157,180,173,111,46,28,46,15,2,26,24,29,8,14,11,26,26,22,16,15,31,49,33,51,32,44,69,65,88,43,50,54,51,73,58,75,50,52,68,46,49,30,58,61,87,84,73,85,84,85,62,52,40,79,44,55,179,239,239,246,128,127,70,21,38,24,63,199,238,236,229,142,93,89,77,18,26,13,18,5,32,15,104,217,232,142,105,38,71,199,229,236,229,173,121,85,61,11,30,8,64,176,224,218,236,226,216,241,227,223,221,225,237,233,228,217,211,218,237,226,230,227,230,238,201,222,229,204,207,246,218,223,231,231,205,241,226,230,218,234,218,238,232,230,215,114,12,18,5,6,20,0,38,23,25,16,2,1,16,5,4,19,43,24,10,20,17,19,14,0,231,208,217,219,223,217,235,250,220,239,224,235,227,226,227,233,235,235,211,225,219,208,246,246,235,205,219,226,223,221,240,240,220,217,199,196,187,210,201,207,214,220,221,223,242,217,190,219,215,232,234,214,233,201,212,209,217,221,233,230,232,211,178,207,209,221,235,227,219,211,224,211,206,234,219,231,217,221,219,217,228,235,229,241,227,246,242,227,218,231,207,227,234,222,215,212,239,226,221,243,226,243,242,206,231,230,220,243,220,211,204,241,221,242,212,217,226,210,244,236,231,232,222,235,240,219,224,199,235,220,238,238,230,208,208,225,226,228,227,227,220,214,210,239,219,216,248,231,247,235,233,241,218,235,221,209,226,221,220,204,235,235,217,210,212,219,201,224,224,203,201,212,218,225,190,220,201,195,213,228,227,215,227,216,241,232,230,216,237,212,208,233,191,224,231,223,244,229,225,215,222,214,231,234,241,236,232,225,241,237,245,242,240,234,243,230,231,247,244,245,219,228,227,235,248,237,246,215,226,228,216,222,239,243,226,241,193,237,235,241,243,232,238,244,255,222,214,255,245,251,249,240,226,252,228,247,253,233,241,226,216,241,219,235,246,237,228,216,237,236,241,218,239,255,231,255,235,239,249,248,243,236,254,253,230,249,243,253,246,225,242,245,246,238,250,236,229,235,241,254,230,240,239,227,234,255,252,250,233,246,219,242,248,255,233,243,231,246,239,232,212,233,208,237,213,194,228,227,255,177,225,228,228,201,211,240,255,204,199,207,210,245,227,255,206,160,153,97,53,54,229,246,228,241,212,242,226,236,176,227,254,247,253,250,173,119,171,192,177,147,169,249,243,234,233,252,223,228,189,202,228,239,229,224,180,129,144,174,191,188,210,167,150,142,83,41,30,120,54,84,68,45,40,27,43,41,149,217,227,225,191,192,224,227,224,243,99,28,58,140,196,193,235,185,164,202,190,207,163,117,188,226,201,244,169,82,171,247,254,239,237,243,191,129,70,9,17,25,45,140,176,221,210,158,136,46,39,6,49,78,51,61,31,62,79,79,50,81,65,78,67,94,100,88,85,95,65,89,63,76,59,47,43,76,45,70,58,110,122,114,147,167,127,133,128,107,67,75,47,17,40,37,15,14,26,33,17,20,26,22,43,22,10,32,18,44,35,15,36,22,23,18,14,53,45,17,20,43,17,51,22,49,28,27,18,33,96,122,141,149,151,157,139,150,146,154,137,163,138,159,127,165,154,128,76,12,49,46,39,31,31,17,7,43,14,29,51,38,21,19,33,34,22,14,10,41,10,33,31,64,31,44,53,87,90,86,79,102,83,61,71,77,60,55,55,52,51,63,70,65,54,78,57,61,54,59,172,235,240,191,147,55,50,30,36,44,14,33,137,194,148,104,88,57,92,72,55,41,30,0,35,116,217,255,227,134,90,57,27,76,190,228,226,226,145,111,68,34,41,15,66,198,225,215,204,218,229,236,233,213,228,212,229,221,211,209,213,229,244,231,230,227,245,185,223,215,222,230,204,230,214,242,209,206,244,218,231,228,194,222,209,237,205,227,223,119,0,0,30,13,17,8,7,15,14,2,24,0,8,8,0,24,15,6,18,2,7,26,3,6,205,204,216,226,228,245,228,243,211,216,223,234,216,230,213,219,231,223,227,232,226,227,241,229,217,228,213,210,239,197,238,214,225,211,215,227,218,222,192,229,224,192,211,206,234,228,191,217,231,194,212,234,208,236,221,239,220,215,207,228,215,205,220,212,214,236,229,206,202,199,215,230,227,210,245,229,219,223,242,213,204,224,218,244,193,232,230,213,211,244,233,225,237,213,225,241,215,219,236,239,222,236,227,214,251,212,234,213,218,243,207,220,201,211,218,204,212,229,213,229,241,230,240,228,238,236,204,239,236,237,255,244,214,229,230,206,233,218,215,218,219,204,239,235,226,225,235,235,232,229,215,224,220,206,212,220,217,227,203,215,236,230,209,231,209,215,229,218,228,227,195,216,203,213,204,224,237,232,224,217,203,230,226,223,224,232,228,220,235,225,208,219,235,218,219,223,243,229,227,231,217,230,218,226,227,230,232,229,233,219,245,244,232,231,235,226,226,250,238,205,225,225,226,226,218,212,225,208,203,223,237,226,196,236,216,241,234,230,238,234,214,240,239,251,230,239,247,246,242,249,237,239,229,239,235,236,251,240,229,219,237,238,236,245,255,240,209,250,247,247,233,238,239,247,230,239,226,246,239,247,250,251,246,231,217,241,246,253,254,254,247,251,226,245,253,243,233,221,223,246,240,246,240,224,254,242,234,254,244,247,246,245,239,246,248,240,230,245,217,230,245,239,203,222,219,222,206,251,174,79,129,207,233,245,199,230,233,212,210,240,218,218,196,241,181,152,135,81,53,53,229,255,234,235,215,240,245,230,166,223,212,184,154,194,198,191,233,247,243,185,157,245,239,241,231,253,207,189,162,188,240,245,251,215,198,180,165,166,198,144,209,120,157,173,146,68,51,130,58,98,68,27,34,33,32,77,202,251,219,219,175,203,255,219,246,243,126,72,56,90,182,201,251,206,172,132,166,231,188,156,192,188,153,224,113,69,198,208,250,247,239,241,218,235,203,163,85,113,53,2,11,14,7,9,5,21,50,87,55,111,111,118,126,110,120,113,103,117,105,84,63,74,62,75,49,79,57,54,53,37,49,104,113,110,119,160,130,127,115,98,87,52,49,29,34,27,49,30,22,36,17,1,9,20,24,19,16,9,17,45,27,15,20,22,42,27,32,46,42,38,38,23,50,39,49,17,13,9,18,14,30,43,35,41,59,85,214,172,162,175,161,148,151,160,160,149,154,145,136,128,142,146,151,66,43,26,30,48,74,61,31,22,23,25,25,40,47,44,5,18,39,11,45,49,12,7,22,7,11,14,41,19,7,19,16,40,73,79,63,61,74,100,96,77,87,80,93,76,53,54,42,62,54,56,88,47,164,193,191,171,85,56,51,27,29,8,30,36,17,45,50,115,156,80,88,99,101,165,204,214,251,251,243,240,247,179,107,32,20,18,29,61,194,223,188,117,89,35,45,24,94,216,195,230,223,221,205,213,229,219,202,209,229,230,217,216,240,251,231,215,241,226,234,220,230,238,209,235,235,234,206,227,229,201,224,225,210,221,229,216,210,206,216,233,202,101,25,30,0,0,5,7,5,0,18,13,20,19,14,21,14,7,9,25,17,1,26,12,4,9,219,206,230,226,219,206,216,212,219,213,206,232,230,211,204,208,230,220,246,229,209,211,232,218,219,223,201,219,217,215,215,215,220,209,216,189,218,224,215,208,226,219,200,215,211,207,210,205,217,212,218,217,224,239,213,204,205,210,210,190,209,208,212,225,224,218,197,201,217,217,207,225,218,213,204,204,214,239,237,235,205,237,240,221,214,227,233,208,214,223,244,222,227,214,240,236,232,227,228,212,236,221,238,205,219,216,196,224,229,218,237,233,232,207,237,221,228,211,239,221,225,227,235,213,221,215,205,241,220,225,235,245,250,244,232,235,211,213,237,221,232,229,220,245,210,211,214,209,237,210,220,221,221,213,210,232,218,210,233,227,195,234,226,242,218,199,209,234,209,225,231,229,230,232,190,239,231,244,227,222,232,238,226,208,222,220,201,228,220,228,224,215,213,211,223,207,239,222,213,221,203,234,220,223,239,215,228,214,226,203,225,224,237,228,252,210,218,224,237,241,240,233,251,233,210,217,229,232,220,224,229,238,232,234,235,241,243,239,236,249,254,233,215,213,221,236,198,245,234,231,255,240,227,254,245,246,228,243,218,217,235,251,228,219,255,219,227,246,241,239,209,226,242,249,248,252,226,221,246,255,250,233,254,240,247,250,224,244,229,233,249,252,239,253,253,255,245,245,228,239,234,253,248,243,249,249,238,221,240,209,255,228,251,246,230,248,204,226,243,228,217,255,236,207,213,240,231,191,107,22,98,228,214,232,207,237,254,201,186,241,218,234,223,255,191,144,152,114,51,30,227,239,240,244,243,245,242,210,170,186,188,181,208,238,251,204,233,232,251,186,163,214,243,242,235,252,187,165,218,202,227,235,248,182,167,201,168,169,204,157,191,137,181,177,139,85,31,132,71,90,81,58,71,37,20,71,230,231,204,196,164,224,243,206,220,242,155,157,99,76,155,225,249,160,104,97,190,170,96,144,211,185,139,181,69,75,221,219,237,213,238,229,241,252,253,222,179,195,100,56,54,17,8,15,44,36,46,57,30,13,46,41,25,68,44,50,61,60,97,61,79,68,51,78,79,73,112,119,118,127,150,131,131,111,89,49,49,35,29,26,48,23,4,26,14,24,28,28,35,7,15,13,16,27,14,18,15,32,31,34,41,23,32,43,34,39,39,34,35,38,45,44,61,80,33,19,12,3,23,31,34,32,58,82,82,169,211,159,161,159,141,181,151,126,166,151,161,169,107,134,125,190,132,49,38,30,12,35,66,61,65,41,12,45,42,18,26,7,20,16,19,17,23,29,14,13,0,16,18,38,34,26,23,28,26,5,7,30,9,16,48,47,29,52,65,75,96,104,93,110,109,77,67,54,73,68,40,48,49,78,86,93,42,54,44,61,48,75,40,27,54,63,87,70,63,38,143,222,234,239,245,234,245,250,246,253,135,46,18,14,20,107,217,229,213,133,91,38,53,17,69,214,220,239,249,231,216,239,238,220,219,238,216,232,215,223,238,237,226,224,215,230,236,226,216,240,216,230,247,223,231,214,207,210,219,207,224,223,217,211,210,208,229,228,247,121,18,1,6,1,9,9,7,13,11,12,13,4,4,0,3,18,11,7,8,12,9,16,8,24,240,221,222,224,214,219,224,220,232,223,222,224,235,201,235,210,225,233,247,209,219,227,234,221,204,185,226,228,226,226,197,224,209,220,202,240,224,214,219,214,231,187,219,223,197,202,215,242,220,219,198,231,220,210,209,228,224,222,219,197,220,212,218,223,209,202,211,230,241,225,218,217,205,209,215,232,219,203,218,211,219,238,220,226,239,235,238,244,244,231,214,217,217,216,224,218,218,211,214,226,220,218,232,213,251,228,232,220,229,237,216,220,233,228,229,239,201,204,202,211,222,233,199,216,223,215,247,242,214,206,226,234,243,230,225,213,214,227,199,233,221,222,224,240,222,246,226,240,239,216,233,225,193,215,229,235,239,243,240,231,224,215,234,226,199,233,215,226,220,211,214,214,235,230,201,227,209,222,212,230,202,210,243,218,214,220,215,188,225,220,217,213,234,222,204,210,218,227,229,205,220,207,241,242,235,238,218,229,220,240,224,241,210,208,224,243,221,240,236,210,227,245,206,216,247,223,242,234,219,176,240,248,193,227,226,228,231,241,211,224,240,211,243,227,255,247,235,245,249,237,242,235,247,240,245,235,209,245,229,213,246,247,246,225,217,246,213,246,255,224,226,233,241,239,245,221,255,225,233,238,241,229,246,249,223,242,253,250,248,250,251,245,238,214,240,243,240,220,251,221,218,231,247,254,249,236,254,229,253,214,253,239,234,232,244,236,246,226,238,223,224,237,224,186,212,229,168,219,139,10,146,233,223,228,208,217,247,217,208,187,215,236,214,255,171,178,130,109,61,44,210,241,255,253,210,243,196,155,147,213,228,235,251,242,255,223,184,217,234,188,146,232,252,187,166,201,184,212,248,226,246,244,196,146,182,160,162,196,217,205,213,150,179,160,128,74,12,152,106,78,58,44,73,46,19,86,243,247,208,190,189,231,202,244,233,230,167,208,183,99,142,235,189,190,98,156,244,171,62,143,235,183,171,185,107,128,177,179,181,209,212,234,224,253,255,241,183,208,116,65,76,53,95,76,82,98,80,91,69,78,85,85,80,69,45,94,84,58,101,119,129,131,133,128,122,100,89,59,84,49,42,49,37,32,46,21,18,13,22,18,55,28,5,8,44,19,20,18,36,32,23,9,21,18,6,5,39,41,34,13,33,20,28,57,54,56,45,44,5,25,61,61,37,49,37,23,24,61,42,65,44,71,74,144,177,238,230,157,188,165,152,147,143,156,157,179,171,170,142,128,144,153,142,46,45,16,31,20,64,68,52,43,35,46,9,20,22,8,15,37,16,33,28,36,28,28,18,46,69,18,38,42,30,27,20,14,7,31,4,13,16,29,20,13,11,29,33,59,51,81,77,88,126,122,122,82,59,64,33,77,37,85,43,96,55,52,75,83,58,72,36,62,54,47,46,51,23,66,113,88,106,98,97,164,223,217,190,167,178,172,227,241,248,227,161,157,123,34,14,24,167,223,224,220,234,223,231,241,230,196,217,202,239,243,212,227,217,220,247,228,217,201,231,215,225,223,206,236,241,235,226,227,221,231,214,243,221,224,232,226,231,211,213,235,234,144,0,12,14,0,21,10,22,21,5,11,9,6,0,0,8,2,12,10,1,0,23,14,23,21,216,216,221,241,231,230,222,236,227,203,235,219,220,234,235,213,233,235,202,231,227,215,228,224,230,233,234,221,195,225,196,211,205,213,223,196,228,217,227,215,202,217,224,216,188,207,184,202,206,228,199,202,204,218,247,216,220,191,236,220,202,222,212,189,224,234,226,208,208,194,227,227,205,220,205,238,224,235,220,224,227,241,225,208,218,202,206,234,247,239,223,226,228,200,238,238,215,204,226,209,220,205,228,210,216,237,235,228,221,230,214,214,219,231,210,216,199,209,225,233,233,229,213,226,218,212,236,245,205,225,209,223,225,226,236,210,206,211,236,224,220,224,237,233,232,223,214,207,226,225,219,232,214,238,218,230,214,215,210,209,241,212,204,216,221,237,230,225,239,193,226,226,238,212,211,218,217,231,228,195,217,199,218,218,228,241,223,234,219,200,212,222,211,226,208,214,236,205,212,239,220,232,200,224,222,225,231,238,216,229,238,233,223,236,217,236,236,219,234,217,229,224,221,215,212,218,221,216,230,238,206,224,231,244,201,247,231,232,237,213,236,219,239,248,244,236,242,230,211,249,236,223,252,244,233,254,240,207,201,236,233,246,248,238,227,230,240,252,239,251,234,211,242,233,223,245,251,233,235,245,247,213,248,252,249,233,234,221,242,247,239,255,235,253,230,247,242,237,233,242,211,229,241,255,254,227,223,217,238,246,243,244,224,222,231,243,234,251,236,235,233,241,219,220,207,226,236,253,154,51,160,224,216,218,219,217,249,216,200,193,224,184,200,246,207,153,120,117,63,58,241,243,235,208,161,187,191,205,195,197,218,226,253,245,255,189,174,210,206,130,129,211,209,190,196,235,206,215,249,222,252,240,170,179,173,148,171,190,195,225,249,144,173,151,152,91,0,119,69,75,61,23,58,46,21,74,226,247,236,170,179,239,242,227,217,244,179,244,210,107,148,227,188,152,121,197,243,189,100,176,253,194,232,217,109,194,223,215,204,199,190,168,171,231,235,181,122,79,34,25,26,17,4,2,12,17,67,48,65,88,85,64,77,85,86,78,48,76,60,75,60,67,87,39,32,47,48,71,93,22,7,20,31,33,13,1,3,9,13,36,37,29,35,26,27,16,16,51,23,56,33,6,43,13,25,6,31,53,22,13,44,47,43,36,55,53,10,38,56,55,41,61,59,60,43,22,26,43,65,70,60,44,132,156,171,202,214,177,150,137,177,158,178,175,139,185,175,162,161,143,155,187,103,64,49,50,20,18,30,54,61,58,56,55,34,27,22,37,26,12,34,10,37,37,33,29,0,12,49,20,45,35,18,29,56,32,48,37,35,22,16,29,24,6,21,25,36,47,27,29,28,40,46,75,49,102,77,100,92,90,56,76,49,51,57,63,88,76,80,83,82,107,79,60,43,47,64,64,32,52,35,39,46,38,30,99,204,227,237,239,246,248,230,177,115,103,58,33,31,55,218,224,228,205,210,249,233,219,233,200,221,208,229,222,233,237,239,234,240,239,216,219,239,221,228,216,223,193,242,228,229,251,231,212,245,227,236,241,224,231,209,239,197,210,239,106,14,4,1,16,26,7,1,3,11,12,0,43,16,12,3,9,27,1,37,4,21,12,11,4,204,227,230,225,238,205,193,241,210,220,218,216,235,205,227,225,214,234,240,214,207,214,233,233,236,221,227,234,208,224,248,213,226,207,215,211,222,216,199,212,200,227,206,215,201,199,235,206,199,193,230,250,241,209,227,211,212,215,237,219,220,210,221,223,223,210,229,203,221,187,227,208,234,223,234,231,230,234,222,209,217,204,201,197,247,239,238,224,221,237,235,224,240,234,189,205,226,229,241,205,230,214,203,230,221,224,239,229,210,243,223,211,218,209,200,230,215,231,223,220,237,215,224,222,231,235,236,225,207,219,229,206,223,219,210,239,199,242,222,211,247,194,206,242,232,213,229,233,215,208,228,222,242,234,232,221,226,252,233,225,214,208,223,215,206,222,213,195,217,205,229,251,233,215,208,221,195,240,211,212,235,213,213,224,239,200,236,220,220,217,215,209,223,211,215,235,211,210,246,231,174,227,247,222,231,225,242,205,228,192,231,239,214,237,195,219,212,222,224,221,233,236,235,215,232,243,219,239,213,237,231,233,239,243,244,233,228,232,225,232,235,242,249,228,237,193,247,221,239,243,245,247,229,204,244,242,236,227,198,250,241,240,235,238,234,247,241,234,247,255,242,223,220,239,229,236,238,225,247,234,233,236,249,255,255,222,240,227,248,224,242,238,246,221,252,244,231,234,244,247,254,255,237,240,249,244,248,249,250,245,239,219,252,234,240,238,246,230,230,236,245,244,217,211,214,231,223,253,113,60,201,249,238,236,229,247,243,210,220,218,214,217,226,252,198,179,111,104,63,71,170,195,175,177,188,238,234,221,209,221,242,235,231,244,226,242,181,234,232,164,114,196,243,254,242,250,203,209,235,184,188,229,166,173,161,181,204,186,202,218,222,116,151,216,157,109,36,80,66,86,74,49,32,42,19,52,242,247,220,152,160,216,238,209,216,247,156,186,222,112,118,177,127,196,170,220,245,114,147,226,248,172,213,202,134,198,211,192,233,174,186,194,188,198,196,124,95,65,67,92,59,40,39,18,15,8,33,47,23,25,24,19,35,33,6,17,18,38,13,16,34,47,44,33,45,42,110,138,73,34,36,11,0,7,10,21,8,38,9,18,47,51,46,21,7,34,22,41,54,58,59,49,54,35,7,12,39,22,18,19,41,47,38,55,43,27,25,27,53,46,58,46,67,29,26,26,26,76,77,63,57,84,138,129,224,253,191,156,143,139,143,154,155,152,177,195,169,146,146,157,191,166,113,82,72,87,59,34,55,56,60,71,47,29,25,40,34,51,18,15,33,20,29,32,43,65,10,11,29,25,24,30,21,44,27,17,53,38,32,42,44,21,14,33,15,2,45,47,36,30,33,37,22,2,38,19,49,29,42,52,100,120,96,89,85,83,48,67,40,44,61,67,71,72,76,68,54,81,68,48,53,24,66,32,43,35,4,104,208,219,248,207,186,96,100,58,13,17,66,219,234,226,246,241,224,229,240,228,238,218,235,222,207,240,224,215,251,226,189,224,229,241,229,244,224,214,223,247,235,222,223,238,237,223,234,221,219,232,237,215,217,209,236,218,214,148,12,3,11,6,4,3,6,8,8,11,18,2,1,16,0,12,22,8,15,3,6,11,7,0,193,206,196,214,214,212,245,226,205,204,217,216,205,229,199,220,206,217,223,211,232,222,236,226,223,220,219,225,228,224,242,229,218,236,205,197,223,194,199,203,217,210,212,204,237,196,232,207,196,187,204,219,210,221,217,205,220,215,188,218,220,211,202,213,223,212,216,227,204,216,194,228,221,221,232,234,207,219,216,196,231,219,197,213,223,212,236,226,229,227,216,211,232,216,204,241,233,247,200,234,216,212,230,230,221,246,246,220,233,192,217,213,241,241,232,239,207,214,241,218,222,201,229,235,218,228,216,226,214,232,243,220,227,218,232,204,228,236,248,202,225,216,197,222,209,213,225,223,212,204,220,216,214,216,215,229,211,215,229,206,229,230,198,224,229,239,234,222,220,242,236,232,231,224,236,215,210,220,219,218,222,223,235,220,224,226,194,220,215,211,223,230,196,214,243,222,203,215,236,204,220,223,229,199,228,218,204,251,200,220,210,212,209,240,223,215,220,229,235,219,219,210,228,241,231,214,223,238,226,220,231,235,214,227,231,225,227,254,242,217,231,216,224,234,238,217,232,219,250,233,237,236,217,234,220,224,244,214,182,219,224,220,240,236,214,248,251,248,237,230,221,247,232,237,227,234,232,230,245,223,247,242,235,234,232,241,227,250,251,228,249,229,246,231,254,242,238,217,240,247,217,244,225,232,235,234,250,246,241,241,255,252,235,235,232,242,242,244,221,222,254,221,198,201,211,225,193,226,104,78,236,239,245,229,224,223,244,213,239,206,208,201,244,250,173,159,159,123,124,54,165,212,224,235,212,238,237,196,235,209,240,235,238,208,255,192,169,200,252,227,141,241,234,235,231,167,138,200,203,184,201,212,172,174,210,185,218,206,172,201,206,109,174,176,174,138,50,101,53,109,66,77,56,54,4,71,224,240,200,113,180,193,238,209,209,238,189,179,212,124,112,88,106,158,211,233,161,106,206,253,214,145,199,167,118,221,204,209,202,221,229,194,209,228,158,159,166,195,212,237,238,227,238,240,247,215,165,74,61,30,63,37,41,42,40,35,23,39,44,26,68,58,37,16,50,124,159,151,81,12,20,16,30,2,12,15,38,63,35,27,15,30,37,37,24,17,12,13,32,71,42,38,43,56,54,23,42,7,25,30,60,41,45,19,21,11,33,71,63,37,52,45,37,25,5,32,27,46,33,58,57,60,98,170,245,242,152,150,139,157,172,158,159,151,204,174,175,132,166,152,173,131,72,48,96,77,58,19,30,88,88,48,74,37,21,19,64,33,19,8,37,17,34,18,38,43,42,38,26,10,31,26,40,38,63,26,27,38,31,27,12,6,20,15,28,53,44,25,24,37,39,15,25,32,20,14,25,9,30,45,59,35,23,54,98,98,96,119,96,82,82,72,76,76,63,43,63,90,37,48,84,58,90,81,70,64,14,23,4,11,54,70,52,56,55,30,56,175,233,239,227,253,246,230,251,222,234,237,227,240,238,216,199,213,240,225,234,215,227,243,248,250,237,204,232,222,218,221,195,241,220,211,239,234,221,230,217,233,235,223,245,214,214,227,228,103,20,7,24,3,17,5,16,3,15,0,10,1,18,17,1,5,16,19,14,10,4,14,8,0,221,185,201,221,219,223,221,234,229,225,190,207,223,215,239,223,219,223,229,223,217,211,216,214,236,221,226,226,225,197,210,211,228,207,224,199,204,221,226,224,215,209,201,203,215,222,192,230,218,202,185,210,204,209,207,218,186,210,219,201,209,218,205,219,204,195,195,225,213,231,212,185,189,197,230,214,221,246,220,211,215,225,210,220,214,211,233,220,237,225,222,224,200,213,222,215,223,213,211,203,218,231,221,217,209,204,224,215,198,195,221,219,196,211,223,197,217,225,222,210,206,228,226,234,225,221,212,211,219,240,196,215,218,241,213,230,219,206,243,246,236,231,225,231,228,199,239,211,196,224,220,213,210,210,210,212,215,215,217,224,240,188,212,236,200,228,212,234,224,232,218,243,229,231,235,235,207,236,220,215,190,210,239,204,200,225,208,210,211,223,215,230,233,218,235,209,214,217,240,222,203,214,215,230,220,202,211,212,218,218,200,224,217,204,216,224,228,228,236,223,211,226,210,223,239,195,230,223,255,234,215,245,226,249,233,232,224,248,223,231,239,217,224,221,225,249,232,212,237,244,215,230,209,236,228,224,229,230,201,219,241,225,231,220,247,243,224,244,232,232,232,220,233,251,230,252,209,247,239,238,247,234,226,244,224,243,241,232,230,242,234,253,254,246,241,254,233,241,207,207,243,250,244,231,229,226,252,253,245,237,239,246,240,238,250,234,227,228,221,215,244,236,204,187,171,188,216,241,96,128,235,253,252,187,189,240,241,204,211,223,232,217,247,251,170,127,120,116,81,61,220,250,242,227,228,246,196,208,218,218,223,233,246,215,236,224,178,170,225,185,126,204,234,204,165,157,184,223,236,199,203,199,170,196,198,178,198,183,140,172,182,151,175,171,102,143,140,141,115,111,84,31,62,56,33,84,195,243,205,139,184,225,222,221,223,239,168,151,205,131,114,54,112,195,206,172,130,137,218,236,215,158,176,109,115,219,169,194,224,205,204,231,224,224,172,137,242,231,254,227,244,228,250,253,242,216,153,134,138,137,120,112,116,98,97,74,104,96,71,82,100,65,42,20,73,157,160,148,68,37,23,6,37,4,11,50,82,47,53,32,22,38,44,54,50,24,42,21,53,30,62,39,54,50,45,49,49,33,35,69,74,64,41,40,51,68,66,31,40,35,39,22,40,30,24,51,62,61,35,61,49,33,66,162,239,220,139,132,125,172,157,156,159,163,173,176,133,129,148,142,193,119,50,16,50,57,94,46,33,71,90,94,51,26,23,33,24,30,55,18,30,21,31,30,52,39,39,45,14,37,38,13,56,22,21,17,10,20,24,26,28,19,46,45,35,45,49,63,41,27,5,43,13,14,20,23,64,39,35,24,23,49,32,22,41,51,49,70,93,98,142,152,168,164,141,118,102,84,85,54,71,87,68,68,58,69,72,82,57,48,40,68,73,65,45,10,102,211,251,247,217,232,217,233,235,232,217,222,234,251,238,223,240,235,227,225,210,234,220,222,235,229,213,239,220,202,245,245,242,249,232,236,242,229,234,202,232,234,230,243,230,220,227,227,199,121,12,4,21,1,2,24,1,5,21,20,28,30,7,15,10,19,1,1,38,7,12,3,0,16,241,189,234,211,206,200,214,211,219,220,230,223,206,207,235,213,217,213,226,207,228,220,213,230,216,215,211,220,218,237,225,212,219,208,229,218,200,228,203,196,228,237,217,219,224,210,211,225,199,213,224,219,233,220,218,207,221,218,208,203,217,232,213,227,210,206,200,228,233,226,218,203,215,222,203,208,237,199,236,221,228,202,212,217,177,203,244,194,211,219,208,222,211,230,200,196,207,206,201,221,222,226,214,213,203,242,204,200,201,237,200,209,209,220,221,205,222,221,200,194,244,207,240,221,231,207,177,239,219,232,222,220,223,211,235,208,235,239,201,223,225,230,220,227,221,197,211,224,236,230,212,223,223,233,235,231,209,190,213,236,227,229,216,225,238,194,232,238,196,215,245,214,236,193,220,221,227,228,235,238,205,226,197,216,230,216,217,217,241,237,200,217,215,206,220,221,208,209,215,188,224,238,237,224,234,217,228,242,223,221,244,239,227,228,230,189,221,233,238,208,234,232,220,214,227,237,233,219,215,223,223,219,239,222,233,239,238,221,214,245,215,215,213,206,223,236,228,239,237,237,214,241,227,229,207,229,225,222,215,222,243,226,235,213,246,236,250,211,226,232,231,241,239,231,244,250,242,226,255,236,248,234,214,246,240,238,241,245,230,251,230,233,249,238,235,244,228,234,246,248,226,241,247,236,238,243,239,255,237,245,236,222,235,239,255,229,247,239,231,237,254,234,216,179,165,214,238,233,96,137,217,224,241,203,206,253,229,192,234,236,198,225,221,224,146,117,105,79,71,63,218,221,252,226,205,249,182,212,200,193,225,231,247,246,242,232,200,166,176,152,121,141,190,228,225,156,198,249,242,240,189,190,184,215,197,121,198,190,158,170,178,134,170,125,75,203,201,153,136,103,70,32,31,53,5,67,243,242,196,105,170,216,234,204,210,234,199,151,196,229,169,85,112,155,195,212,107,142,223,251,215,172,198,108,173,240,237,192,239,213,212,217,218,227,150,214,236,238,238,234,227,226,208,148,115,83,102,123,124,118,116,122,114,127,121,116,153,93,111,155,140,64,14,36,112,139,135,142,53,34,29,18,7,15,6,32,68,41,49,30,20,15,24,87,67,39,25,34,54,42,75,61,54,42,57,47,53,37,32,64,68,46,48,48,50,64,31,61,44,45,26,21,41,56,32,67,73,80,56,71,63,44,95,212,204,197,135,127,156,155,148,169,162,175,206,147,122,139,172,152,172,117,41,53,37,18,37,26,43,58,58,52,50,6,23,36,22,40,38,30,15,9,60,33,6,16,32,29,27,25,28,43,32,41,22,27,25,16,13,36,38,56,43,36,56,76,65,55,26,22,31,12,23,33,43,32,36,32,33,7,4,18,22,20,36,28,21,36,97,70,74,86,116,133,141,125,168,170,171,148,124,140,129,129,99,106,86,86,95,67,66,41,33,11,18,19,111,229,240,240,237,235,233,217,236,239,231,233,249,215,242,220,252,233,231,217,225,222,241,197,219,232,239,248,235,228,246,232,238,234,202,226,234,228,217,249,225,225,194,205,234,221,225,210,206,119,8,0,12,8,9,20,0,1,52,9,18,12,7,0,14,19,4,10,16,41,10,22,6,3,219,187,205,230,225,209,236,211,238,223,241,209,227,211,207,231,217,199,222,211,183,202,208,224,242,227,204,181,228,233,217,205,219,208,204,205,204,226,237,196,215,200,204,184,215,217,211,189,227,223,216,203,207,222,228,195,201,219,231,220,227,229,201,198,234,191,222,217,212,207,232,218,213,207,205,212,209,220,223,212,228,197,198,223,198,200,189,191,211,227,212,204,198,205,242,229,230,217,222,229,224,230,197,205,205,222,231,215,223,221,208,211,199,225,218,218,212,236,219,219,215,230,182,236,201,193,224,238,212,242,224,214,217,217,216,238,223,222,218,219,229,242,234,238,220,206,214,207,203,225,229,234,208,210,239,205,227,235,228,248,235,219,213,218,220,237,243,226,233,205,225,225,231,220,234,233,229,231,221,231,203,230,196,224,210,212,226,227,234,221,227,238,225,235,229,232,206,221,206,215,228,210,213,225,220,226,217,222,225,221,215,226,213,201,210,205,223,228,217,221,228,240,215,228,226,210,228,213,227,212,210,248,240,212,222,223,217,239,237,227,239,232,245,232,247,240,219,246,216,232,249,234,227,229,254,217,236,198,222,240,226,252,209,205,252,236,243,229,252,234,239,250,244,245,238,222,238,222,231,240,231,228,238,245,245,234,232,233,240,237,247,247,249,240,246,243,241,241,227,255,244,243,241,237,248,242,245,215,217,228,240,220,220,221,240,239,247,236,247,241,230,220,230,207,207,242,234,254,101,89,190,228,250,209,242,227,233,214,220,198,164,168,182,194,180,122,144,87,89,60,206,233,250,236,206,245,200,210,225,210,238,231,227,250,239,243,191,182,194,184,108,150,224,230,232,189,162,232,249,222,185,203,164,222,186,182,233,231,200,159,194,135,171,103,19,198,190,143,172,101,83,25,28,40,57,79,200,250,167,60,133,213,227,212,215,249,200,152,177,237,201,93,64,121,225,166,93,181,229,242,215,211,213,145,186,243,238,211,228,225,231,222,213,204,161,225,238,218,240,230,220,185,106,86,51,69,75,91,101,43,38,29,47,61,93,116,98,28,10,95,110,81,44,39,111,126,151,99,33,20,32,33,57,13,25,15,48,66,61,40,26,11,13,73,72,46,37,14,16,49,37,54,53,68,31,25,68,76,48,77,66,69,90,89,70,58,83,111,80,79,82,140,101,104,121,113,93,128,123,143,136,68,141,253,253,194,160,132,135,147,162,184,163,169,195,139,157,162,172,166,135,85,69,60,41,54,69,70,53,73,80,52,21,61,25,32,44,28,38,21,18,6,43,21,4,13,21,35,39,25,44,34,35,54,50,24,27,30,29,39,68,74,53,25,56,66,54,39,41,38,19,57,54,44,57,56,31,28,37,22,40,27,3,0,25,27,34,123,122,95,89,43,32,15,40,28,16,49,58,76,60,79,96,105,130,105,100,105,73,59,33,35,8,14,32,97,214,210,229,222,251,231,244,206,229,233,214,251,239,243,222,238,235,222,219,218,221,243,240,220,237,224,216,232,218,229,244,234,219,227,223,216,233,206,225,224,243,219,231,224,208,212,218,208,217,145,7,18,16,30,7,0,4,24,4,17,37,13,19,6,8,25,5,10,11,36,8,20,15,8,189,220,205,240,221,210,213,244,214,187,223,207,218,208,214,220,216,206,213,211,199,231,215,214,232,241,220,211,217,197,212,217,209,182,219,200,210,217,241,214,209,216,214,197,214,214,215,188,189,190,199,201,195,200,234,219,228,220,214,221,209,201,215,191,219,207,230,228,227,206,211,230,174,230,204,209,235,209,220,233,227,213,211,199,206,225,227,218,214,222,214,238,212,209,214,217,223,211,194,216,227,225,239,209,201,225,186,217,212,238,239,208,227,233,211,235,202,217,201,217,208,194,210,229,229,235,223,228,226,200,221,240,222,235,203,204,230,233,212,212,209,226,225,191,225,218,218,237,227,207,223,219,215,219,224,211,226,222,213,212,221,199,243,222,217,216,218,242,204,224,187,219,231,210,219,221,186,223,211,217,222,221,226,242,224,241,200,217,230,194,206,207,237,230,211,245,213,180,240,208,221,224,236,235,209,227,229,217,219,206,229,205,218,230,227,223,236,226,238,221,232,207,229,215,201,230,223,229,242,210,219,222,235,221,219,207,231,226,222,229,223,230,233,219,243,242,240,225,214,240,226,236,248,204,252,244,213,200,204,215,248,238,211,241,232,241,233,220,236,215,246,244,226,232,241,221,249,243,245,221,229,232,239,224,243,224,224,222,245,249,232,243,227,232,244,243,247,239,232,240,230,243,235,240,255,254,243,233,215,222,250,216,247,234,238,248,243,243,254,221,225,228,238,196,204,180,210,197,90,132,216,227,248,219,222,238,243,201,232,194,171,207,216,250,185,168,137,110,75,61,214,239,245,225,212,227,197,206,218,233,224,216,241,203,223,234,180,189,167,200,175,172,223,236,255,148,188,248,218,188,125,143,168,193,184,191,227,207,213,106,177,143,140,68,47,242,202,135,175,132,113,20,17,59,21,89,242,230,195,85,162,201,226,215,202,232,195,128,204,236,205,126,91,193,218,145,62,181,241,189,201,193,168,119,210,224,234,217,215,220,224,213,241,222,136,212,214,212,206,223,210,131,93,73,67,67,77,44,28,26,35,50,46,20,76,93,86,58,1,58,106,132,84,55,130,109,154,113,40,36,26,84,56,50,46,58,57,45,69,20,0,15,14,52,46,13,32,19,31,41,46,58,72,89,81,100,131,131,126,109,145,114,120,105,111,101,114,120,89,122,95,93,103,112,114,141,125,111,106,142,122,78,191,237,229,182,143,154,144,147,151,182,198,194,174,154,144,138,157,214,136,96,61,52,57,47,46,61,81,57,55,73,62,62,72,52,67,42,59,49,50,52,35,53,27,48,31,66,55,51,48,52,15,21,26,48,52,20,51,36,33,51,62,61,25,21,41,45,38,35,27,49,33,43,16,34,39,25,31,50,10,3,2,19,15,37,86,133,115,127,96,58,46,14,54,39,32,26,15,51,18,16,30,51,31,40,74,45,31,72,78,110,133,189,238,245,251,245,239,223,212,218,237,230,234,215,236,220,246,217,248,219,213,240,246,251,225,241,239,219,240,230,221,215,221,217,232,225,242,238,231,227,231,255,235,230,225,244,245,230,230,235,225,233,238,102,30,18,10,0,10,5,13,0,6,18,22,11,0,11,5,7,13,13,7,8,11,12,33,27,216,220,195,198,224,198,237,205,209,208,200,205,212,207,201,213,222,196,223,227,205,203,203,216,246,221,205,212,218,194,206,206,225,209,202,213,211,214,209,192,226,202,183,226,207,184,212,196,198,212,206,208,206,190,203,216,213,206,237,194,195,218,196,223,215,201,205,208,221,218,219,210,209,215,204,209,228,222,213,208,206,220,228,215,196,229,213,213,207,225,219,218,238,218,230,237,216,215,213,213,235,205,212,209,211,216,234,203,218,200,196,195,207,203,213,213,218,215,209,209,201,212,211,211,214,229,166,221,227,212,214,225,204,211,190,221,213,219,211,219,214,212,208,231,205,199,209,219,203,220,210,233,226,224,220,231,236,236,212,207,215,238,203,214,219,217,208,195,195,215,215,211,209,221,215,195,212,214,223,202,226,211,218,232,201,201,187,210,216,204,204,231,242,222,214,244,232,222,210,230,221,222,209,212,206,207,220,213,231,241,232,222,227,224,209,227,222,200,217,232,220,238,243,207,220,210,228,217,211,206,214,242,237,219,248,225,231,255,227,233,248,227,242,243,234,234,236,254,238,217,237,223,233,236,218,198,194,216,232,238,246,225,226,244,250,254,216,237,250,249,233,241,247,250,243,238,249,227,226,218,244,247,214,232,220,239,247,232,209,222,231,221,220,255,243,229,254,233,236,236,243,242,232,241,235,243,233,231,233,240,234,240,246,228,254,252,208,249,236,241,238,212,204,155,200,175,240,234,111,144,248,228,238,199,195,218,196,174,220,215,212,221,248,234,172,187,167,117,90,60,211,247,232,215,221,217,182,217,249,204,235,211,218,218,223,233,200,195,156,157,205,202,218,239,225,146,165,218,181,148,112,203,211,219,160,196,200,188,217,143,179,141,135,55,65,248,172,66,136,148,137,62,35,36,18,69,243,238,188,120,184,203,245,216,215,247,192,123,195,210,142,81,108,185,173,121,152,190,223,196,180,169,115,157,204,223,235,220,204,227,244,234,218,194,170,228,220,208,205,223,180,96,111,64,69,93,43,30,33,62,97,95,108,88,55,115,82,47,23,36,94,125,86,102,167,127,144,57,9,27,6,31,52,64,60,54,60,46,41,24,11,11,28,52,82,55,80,110,111,124,129,139,142,123,133,142,135,132,133,103,108,110,84,90,121,88,109,114,92,103,114,126,115,120,119,121,90,120,74,98,50,103,247,240,253,168,138,150,148,166,158,163,179,183,146,142,144,170,173,174,100,65,32,42,53,38,40,41,74,50,63,67,53,58,72,38,53,28,56,44,33,43,51,63,53,73,37,72,63,54,73,40,46,51,47,46,37,39,37,34,55,44,50,54,77,66,31,31,34,58,68,62,19,14,41,16,45,46,58,22,38,33,21,17,21,22,50,98,121,103,65,60,21,21,75,77,63,56,71,71,69,52,47,60,69,37,52,90,159,210,245,239,238,250,234,218,251,231,230,229,236,229,233,236,235,231,250,210,225,249,242,220,243,233,245,228,234,237,205,202,224,221,241,238,244,210,247,222,205,231,225,237,225,228,210,248,246,236,221,227,212,220,220,211,225,110,1,8,15,4,22,3,9,9,1,20,1,4,0,12,5,9,10,4,5,29,6,18,9,13,212,207,215,199,212,216,209,220,215,233,232,200,200,234,202,208,210,225,205,227,211,191,226,203,208,213,212,238,238,231,225,216,203,209,196,194,200,199,225,195,201,222,213,212,217,206,182,227,209,222,200,240,201,203,203,242,202,219,218,212,194,216,218,204,205,208,202,236,208,209,208,216,223,193,200,231,211,225,210,235,205,207,205,192,206,240,217,219,210,235,210,228,215,219,217,195,223,232,217,238,211,229,203,222,202,215,230,240,199,218,219,203,211,194,226,200,221,191,223,208,224,216,241,189,230,207,220,230,218,236,226,221,214,200,223,231,211,230,208,220,223,204,211,236,222,199,232,238,180,215,218,204,212,215,214,206,206,196,202,233,239,218,223,191,217,218,204,214,219,203,210,212,213,211,188,209,215,200,211,211,222,201,212,213,203,195,206,183,217,227,206,235,217,217,248,200,223,221,207,220,214,228,200,239,221,201,205,223,191,208,217,228,220,224,211,218,223,214,212,219,229,209,223,218,229,243,211,231,219,221,227,209,207,239,222,211,213,222,216,221,248,213,224,251,218,227,234,229,225,234,210,234,211,229,234,213,224,240,238,238,229,229,215,223,231,241,223,234,246,238,236,219,250,238,229,219,248,227,235,241,232,247,249,232,238,229,233,232,223,236,220,227,240,251,236,237,220,239,242,234,227,244,239,246,240,255,249,204,230,238,236,247,245,230,221,214,237,234,220,236,242,230,242,196,199,224,252,228,89,141,241,196,206,190,190,221,241,218,232,214,181,228,245,252,176,166,135,119,65,70,220,247,247,225,239,201,189,255,220,222,243,249,246,243,234,211,203,244,179,128,192,170,199,244,215,114,170,213,188,206,211,193,194,217,177,163,203,201,186,184,228,164,94,63,128,237,159,48,113,128,199,143,41,56,4,49,202,246,169,151,171,202,238,202,199,236,170,130,218,153,135,117,91,175,180,137,142,234,205,189,199,164,118,149,248,237,222,208,231,235,200,205,206,167,188,219,208,245,235,253,158,91,119,84,60,66,57,47,45,125,110,129,119,98,102,117,104,89,40,33,109,146,84,89,119,100,129,61,2,18,15,16,45,55,53,66,55,60,88,84,115,93,165,155,126,152,119,104,137,137,125,108,116,117,135,106,127,109,113,90,81,77,88,121,86,63,61,46,61,59,62,14,39,52,64,22,33,73,26,33,9,123,250,239,235,145,138,155,131,153,167,185,207,200,149,139,151,166,170,174,84,41,20,19,28,44,25,19,31,48,17,40,28,32,44,62,28,24,64,49,40,65,51,58,41,58,38,65,48,56,58,57,57,63,74,53,74,96,63,57,51,58,41,60,28,30,18,41,19,46,37,56,20,19,12,32,52,63,54,52,21,10,16,35,65,48,39,111,130,125,85,40,15,26,115,124,154,145,151,143,104,120,129,120,170,162,154,141,168,250,241,229,235,236,221,239,219,232,214,222,231,230,243,219,232,225,250,231,230,226,216,219,242,239,242,231,228,245,227,204,237,239,218,228,224,218,239,215,229,230,223,225,210,214,232,226,240,234,224,251,236,222,213,210,242,120,16,2,16,11,2,18,21,15,9,31,31,16,21,12,17,9,17,11,10,20,6,18,24,8,208,212,204,226,214,219,216,201,221,209,206,227,202,205,197,213,213,219,190,207,214,216,186,213,215,220,216,200,203,224,215,206,212,203,203,227,191,187,214,189,205,213,204,211,200,228,203,226,193,202,204,210,189,203,189,220,234,218,204,213,212,201,190,213,206,227,234,206,208,212,205,204,217,206,203,224,195,214,202,199,213,200,209,217,194,194,226,212,227,209,212,216,215,228,209,215,196,219,222,230,186,205,223,221,213,219,213,222,225,201,231,228,219,211,221,207,208,192,225,217,211,202,233,231,235,194,216,207,208,227,214,216,231,214,214,225,225,224,221,232,223,237,194,213,239,212,211,205,231,204,214,219,209,204,189,213,229,222,211,208,219,223,219,227,214,221,205,222,207,210,229,217,237,196,213,215,202,226,221,189,218,208,205,220,209,205,220,210,239,206,219,236,196,223,207,214,223,208,242,232,216,199,207,216,216,233,234,210,237,219,207,216,240,239,205,215,240,238,233,207,209,230,197,211,188,231,227,243,216,207,218,220,218,214,239,235,216,221,221,216,214,203,233,228,227,222,229,220,216,234,224,237,231,252,224,218,211,222,210,228,237,225,248,245,245,238,230,240,230,206,239,213,242,220,247,230,245,240,240,200,213,223,254,228,233,232,219,226,245,252,239,234,214,230,249,242,246,241,228,222,223,228,240,243,249,243,232,233,218,220,207,249,242,231,231,243,230,216,221,243,247,217,206,196,232,229,250,193,89,151,216,226,234,204,240,251,238,220,227,197,186,225,236,239,165,154,126,108,74,65,204,246,232,250,222,199,169,248,234,245,216,238,232,241,234,244,216,206,162,149,221,129,139,197,199,181,193,209,200,218,176,181,235,175,177,181,194,219,216,206,224,157,84,113,198,245,151,48,48,78,129,190,142,57,1,4,162,240,161,111,214,193,221,231,215,252,158,154,189,150,183,154,104,107,177,154,216,247,187,187,197,158,101,179,241,231,249,218,222,196,236,205,243,168,184,231,226,211,221,249,141,82,112,44,105,64,30,29,123,134,76,66,18,34,80,135,138,65,41,40,83,137,108,139,107,119,113,31,40,70,64,71,80,136,130,119,152,116,117,113,122,123,112,106,120,128,126,121,141,110,145,109,79,85,66,84,54,55,62,62,36,49,41,24,39,37,96,53,47,52,47,29,47,50,39,32,6,13,12,40,15,174,243,241,237,149,137,140,159,179,191,163,161,155,143,143,150,172,186,133,50,28,25,13,17,24,11,33,46,30,18,30,9,14,16,25,17,18,30,21,4,35,34,46,37,47,63,52,45,65,58,47,74,57,60,79,44,87,70,119,64,96,73,50,88,79,54,39,23,38,37,53,9,18,1,8,47,68,26,29,24,11,31,18,25,53,37,110,100,108,114,64,44,95,166,149,105,97,99,132,146,145,120,120,106,132,119,138,114,99,190,233,233,249,244,227,226,241,209,228,238,247,234,237,228,246,240,237,241,210,215,207,237,234,228,231,236,227,212,239,229,216,245,218,226,246,222,232,223,232,239,221,217,213,230,215,235,235,220,217,204,225,224,218,234,132,5,0,18,10,25,24,23,32,28,17,16,12,22,9,13,22,30,13,6,3,22,12,32,27,212,212,201,195,212,215,232,204,193,193,218,206,214,203,236,197,202,210,186,203,198,207,197,205,213,206,213,212,202,214,198,201,206,198,229,226,189,192,205,198,203,197,198,202,193,203,196,173,210,199,201,208,196,213,177,206,223,218,209,195,207,227,199,211,203,200,234,191,228,210,227,218,222,205,214,196,210,206,211,210,204,213,201,220,201,190,202,212,233,219,197,218,196,192,201,213,244,213,228,211,204,225,231,204,214,234,181,207,217,222,191,226,212,221,222,197,235,210,228,243,219,219,243,220,193,208,215,218,210,200,198,204,191,228,221,219,218,215,217,231,243,213,194,230,206,217,219,225,196,205,220,186,187,210,224,221,195,212,201,217,199,216,219,223,220,208,237,237,225,221,221,200,223,210,230,210,195,214,221,226,204,219,227,228,220,194,244,220,232,238,243,209,228,209,220,215,208,207,229,208,236,241,226,226,219,208,214,200,220,225,242,230,194,217,207,202,224,219,238,201,237,200,210,226,237,190,215,227,225,205,205,232,212,240,229,236,208,246,230,231,217,214,228,223,214,226,223,242,230,233,212,198,233,232,231,195,211,225,230,229,211,203,248,246,219,190,231,242,244,223,248,234,252,229,242,245,251,239,229,240,241,215,245,217,225,237,222,246,239,235,234,232,229,237,225,220,255,249,224,225,253,235,227,222,235,245,232,242,244,236,240,218,222,250,219,235,240,216,210,241,247,249,242,232,201,163,221,128,124,200,207,239,237,185,248,255,211,228,221,202,166,225,251,253,176,154,127,114,72,66,218,236,235,215,203,177,204,255,203,214,205,237,225,209,227,223,196,219,141,144,218,156,150,222,214,194,181,205,202,202,142,202,231,216,205,139,199,207,200,196,188,139,45,164,234,232,151,62,64,80,102,154,161,117,48,4,163,238,179,130,234,206,224,226,221,208,116,209,239,195,158,125,105,112,126,197,206,238,219,200,197,142,121,194,238,213,214,207,187,236,237,241,212,162,173,228,231,224,206,231,159,76,84,96,107,52,20,43,120,124,64,97,44,55,48,140,137,79,28,59,102,95,88,122,97,100,158,93,147,166,183,169,144,112,108,128,106,109,117,124,114,93,123,116,44,87,78,76,61,51,73,39,34,60,42,32,21,39,49,71,34,35,12,24,82,96,83,101,74,36,35,32,40,30,84,24,44,32,22,2,62,219,241,255,208,141,168,146,167,172,192,167,166,130,139,139,181,157,174,111,16,38,55,23,19,27,56,39,27,40,33,22,26,17,36,7,9,18,13,20,4,20,14,16,19,3,46,12,16,31,42,60,49,78,37,81,70,52,85,62,41,84,81,60,103,95,91,53,73,83,77,51,24,24,9,17,27,58,81,36,63,52,52,50,43,39,72,106,120,146,103,55,92,154,152,76,48,28,57,112,88,94,58,59,57,51,59,69,112,97,117,175,238,241,222,237,227,231,254,247,225,240,202,252,240,221,204,241,240,246,241,249,224,240,230,229,233,220,229,235,237,236,235,235,248,239,240,227,212,189,224,234,227,221,223,216,232,219,247,241,246,243,235,216,223,123,2,7,4,5,0,13,6,25,18,13,30,0,17,5,7,13,15,9,9,8,14,14,16,2,203,207,194,204,199,233,205,205,206,223,232,218,218,200,223,222,227,221,214,199,206,200,210,222,216,218,193,197,199,223,187,209,205,200,226,205,239,204,204,207,187,193,209,214,201,184,213,209,202,211,193,199,225,199,225,208,196,232,196,207,217,201,198,227,202,207,215,199,225,192,228,200,226,225,218,214,197,223,206,202,241,210,204,228,232,205,186,211,216,213,205,210,206,209,205,193,218,206,215,198,209,195,200,193,210,217,223,226,199,218,208,216,198,202,230,215,193,228,213,221,223,223,218,200,187,245,234,225,208,232,227,212,217,240,200,217,223,229,214,233,214,195,227,213,202,226,211,207,225,219,222,203,192,206,209,220,213,192,205,237,223,222,206,214,236,217,224,215,193,190,233,230,222,226,198,226,225,205,228,202,211,214,219,207,210,229,209,236,218,213,219,217,218,206,230,209,203,216,213,213,207,210,198,202,200,214,217,202,235,219,214,212,219,216,223,209,185,214,239,213,208,224,214,202,209,213,214,215,212,222,240,200,211,219,194,213,203,224,243,212,203,244,231,234,216,246,207,234,226,222,225,227,242,247,216,216,205,217,253,226,208,243,227,220,232,216,232,230,225,227,233,247,235,243,226,215,228,244,223,233,227,237,234,240,248,231,242,241,244,234,246,245,236,224,219,246,229,252,236,237,233,237,226,248,244,232,234,242,229,231,244,245,229,237,227,243,238,246,221,255,221,208,171,195,161,140,232,172,135,200,224,219,218,203,227,229,226,209,229,180,210,246,251,251,190,148,151,94,59,81,203,244,251,215,203,187,192,241,211,213,241,215,238,221,229,234,215,216,156,176,223,174,222,206,195,197,170,198,196,206,174,195,192,186,215,159,166,217,208,196,162,119,67,206,249,237,157,57,78,112,46,102,125,179,101,51,196,247,162,84,204,218,207,204,202,210,154,194,245,149,123,165,196,125,86,191,234,244,196,222,172,121,114,176,232,221,234,230,196,209,210,230,220,158,227,246,225,231,240,253,118,78,59,61,118,43,14,35,141,116,81,101,100,133,141,136,118,41,37,63,151,115,81,137,98,153,119,102,149,150,135,134,154,122,122,102,114,97,71,88,57,51,43,57,17,13,6,63,77,37,68,24,13,68,91,78,56,17,55,74,68,53,59,80,66,54,71,88,129,81,39,43,46,38,24,39,34,30,29,13,127,239,254,253,190,128,139,159,178,185,154,151,152,124,165,155,167,157,146,55,58,24,18,28,24,47,48,40,42,6,30,21,19,27,37,6,18,10,11,34,19,31,25,37,44,11,2,15,0,13,48,65,56,27,35,26,12,36,54,52,98,101,86,94,74,57,65,67,96,105,77,87,77,69,65,65,32,26,40,48,39,55,47,39,36,23,46,104,147,117,118,28,78,145,115,66,37,44,97,94,66,31,94,76,65,44,22,22,26,51,104,160,237,232,233,245,238,248,255,244,251,226,243,233,251,228,246,228,247,253,235,222,206,232,238,225,216,251,233,221,226,248,230,223,235,249,210,200,231,219,223,192,216,219,222,221,232,242,237,196,232,221,234,236,235,105,0,2,4,11,24,12,17,8,12,1,3,18,9,13,17,27,10,8,18,0,7,5,20,17,182,195,192,220,199,200,214,208,199,218,198,214,202,181,214,208,208,212,204,210,207,202,206,189,208,200,192,220,177,176,213,229,234,226,203,207,231,208,202,184,208,205,209,203,215,217,210,190,192,215,180,196,197,189,190,184,214,207,201,182,195,193,222,198,206,220,216,234,189,202,226,215,187,188,230,223,214,229,234,216,207,214,209,218,202,203,203,215,202,211,205,213,235,216,222,213,211,205,209,236,208,233,195,244,219,213,215,202,203,219,224,198,218,198,216,200,216,201,218,213,218,213,203,183,214,192,224,209,225,217,219,215,208,213,222,190,210,223,221,232,191,196,209,203,224,208,200,232,233,205,202,219,213,219,175,193,221,222,194,203,204,196,209,229,210,206,230,204,197,223,233,222,227,198,212,202,213,210,219,231,225,213,192,197,210,192,197,188,217,208,193,204,210,220,227,207,193,201,224,218,224,213,215,211,217,213,248,195,208,198,219,196,238,214,214,208,210,208,205,228,205,207,216,213,220,219,243,219,200,225,217,236,220,208,231,199,200,225,230,212,210,237,240,224,223,235,238,203,233,213,235,225,250,229,239,206,206,251,236,241,229,205,231,241,207,197,228,222,237,240,248,236,231,212,240,237,215,219,226,242,250,249,235,249,228,229,227,249,234,235,241,249,248,219,240,229,252,245,247,238,246,229,220,249,238,211,235,229,242,236,240,246,215,249,255,248,233,235,249,218,102,75,77,127,177,188,236,143,90,176,236,225,239,222,229,239,218,209,225,202,219,244,239,230,151,164,115,113,61,58,221,236,225,236,206,189,227,229,196,224,186,237,226,217,224,243,230,209,178,188,230,182,193,221,237,207,128,229,202,154,138,136,185,186,211,164,206,193,234,181,135,95,85,236,253,250,150,84,122,105,98,38,70,151,166,168,206,254,175,69,187,184,227,236,204,197,169,214,233,167,144,173,196,66,44,187,240,242,178,202,161,109,110,219,219,230,247,230,208,205,220,255,186,151,224,249,220,247,243,237,133,98,111,107,115,46,18,23,105,127,82,74,95,99,90,70,32,30,47,112,121,99,115,132,116,118,115,150,171,169,122,105,87,85,63,38,20,13,24,10,27,14,39,69,56,12,13,33,80,83,82,12,43,113,108,147,93,63,95,106,97,41,98,127,73,42,13,88,120,112,52,40,33,42,40,26,30,49,15,66,220,238,251,227,149,141,152,132,152,146,153,151,119,133,144,128,128,165,118,65,18,15,31,46,28,23,48,19,11,36,20,18,24,32,30,33,37,38,20,29,0,33,15,22,26,42,33,27,14,5,56,53,51,41,32,0,5,35,12,41,36,24,67,75,71,93,73,99,106,82,79,50,68,77,60,85,65,74,47,69,39,28,10,14,33,72,63,112,109,122,102,89,80,131,94,58,50,72,98,102,107,135,147,153,98,140,62,56,29,31,61,147,221,243,238,243,228,230,230,230,242,233,222,242,239,234,234,237,237,232,220,252,199,249,255,225,237,222,235,233,217,241,232,225,212,219,236,212,239,223,231,222,245,234,203,248,243,215,213,240,226,218,234,232,220,124,7,2,10,1,3,8,15,2,51,3,4,1,19,7,4,10,2,11,16,8,13,48,5,16,196,210,192,196,228,214,205,210,195,205,212,183,229,204,211,218,209,222,197,217,185,189,217,209,206,212,185,206,206,210,229,184,207,201,214,201,178,212,205,198,223,203,227,208,204,213,203,191,207,209,195,214,200,196,195,192,198,203,214,187,213,190,197,228,203,204,188,193,199,193,208,222,162,211,218,223,197,216,209,215,197,211,195,181,179,217,207,212,215,234,188,227,225,211,220,209,210,239,224,217,203,204,216,223,216,204,203,193,219,213,213,196,197,227,215,206,226,212,197,214,230,219,210,211,221,212,212,218,213,233,219,223,215,198,201,223,217,198,236,207,223,201,210,211,221,213,203,197,239,241,212,206,208,215,199,235,222,201,194,214,208,200,220,210,224,219,207,212,220,192,177,202,229,210,209,209,213,227,214,195,225,208,200,212,242,193,236,203,223,212,212,231,225,209,239,242,207,212,192,226,206,246,217,234,206,206,217,221,205,206,231,223,202,221,206,219,216,217,225,226,227,210,221,190,228,238,219,245,215,228,213,197,217,210,215,202,235,182,220,225,226,221,240,226,221,234,198,212,211,219,214,235,247,234,211,207,215,232,217,212,227,232,219,220,221,228,247,199,239,246,228,229,237,237,206,243,212,224,219,226,246,216,225,236,221,230,239,237,236,243,241,227,241,239,225,237,239,238,249,234,240,236,239,227,238,233,227,228,220,234,226,236,232,234,241,239,215,215,166,167,99,58,44,150,236,198,223,132,111,235,219,228,239,205,213,214,205,207,218,193,206,234,244,251,173,130,135,95,50,84,232,225,246,225,217,230,243,255,239,200,229,230,234,214,234,250,180,191,184,194,223,184,193,189,236,195,104,191,141,153,175,160,160,209,227,175,209,192,231,180,187,102,83,229,209,253,169,59,82,113,81,47,4,79,117,196,241,222,190,67,204,203,236,209,178,189,183,189,183,150,141,240,186,66,119,154,234,223,210,214,159,93,167,224,244,254,231,245,214,202,202,228,175,189,243,240,219,234,246,252,172,78,83,94,140,63,41,22,69,106,113,61,18,47,62,39,47,71,127,150,133,73,69,117,97,116,90,86,63,19,42,12,18,22,24,63,96,53,37,15,13,20,72,95,82,99,87,56,122,110,93,9,73,103,123,109,118,159,112,105,128,36,45,140,90,63,36,98,120,66,36,19,18,25,42,29,20,47,39,82,239,255,247,252,191,155,149,168,176,148,154,155,126,133,171,175,173,175,105,44,51,25,24,31,36,33,5,30,13,51,35,17,27,25,40,31,22,12,15,10,30,30,37,39,13,8,29,20,16,9,45,61,61,38,23,43,41,73,82,48,67,64,51,38,14,15,24,58,60,77,96,105,83,76,82,75,79,80,70,69,96,57,43,72,49,75,41,91,120,130,101,89,110,92,76,58,48,102,98,115,113,83,71,48,107,145,86,61,54,37,68,107,217,235,244,249,247,244,213,237,239,229,235,227,229,210,241,240,235,249,251,230,223,251,242,233,230,220,236,226,239,253,215,226,235,225,226,243,215,230,200,200,226,226,221,237,233,233,243,231,227,232,231,209,214,112,8,25,6,19,12,22,13,17,22,3,34,12,16,7,15,10,4,12,2,9,3,30,19,2,218,206,211,181,234,194,211,206,209,207,202,208,199,209,196,197,210,212,198,210,192,193,199,206,194,205,204,221,213,230,227,191,205,172,223,210,190,184,207,197,213,219,181,184,180,221,209,211,228,215,180,195,182,218,223,193,189,209,222,200,198,192,218,189,201,203,186,197,226,227,199,213,213,206,192,228,209,214,178,207,205,213,215,193,217,202,214,216,178,203,220,212,222,197,183,203,227,218,209,214,193,210,215,241,200,204,191,215,233,218,228,219,201,207,211,207,227,203,216,205,216,195,212,207,210,212,209,221,213,212,223,203,200,225,210,223,214,206,214,228,212,212,208,241,221,215,244,223,218,186,207,194,208,189,213,207,191,210,216,237,214,230,209,220,205,207,213,208,215,223,221,234,215,205,215,231,233,207,229,234,230,197,241,221,219,210,236,208,208,215,210,197,203,201,200,235,214,231,212,211,214,221,194,220,204,220,231,230,222,216,231,215,202,222,241,216,228,216,223,212,249,231,214,217,218,221,216,214,214,216,236,223,197,239,237,213,246,236,226,219,209,204,204,225,236,243,208,203,211,213,249,247,231,224,229,174,222,246,209,208,217,228,229,191,228,213,225,230,238,250,214,224,232,220,228,211,238,213,220,241,221,235,245,226,242,236,235,247,232,241,215,246,236,239,209,236,250,237,235,237,236,242,222,246,227,223,251,236,234,255,242,240,244,235,213,193,176,182,205,219,161,98,118,224,203,187,223,129,125,212,249,223,205,204,228,223,193,181,236,164,206,251,230,240,202,162,137,104,49,79,218,232,226,228,178,222,255,243,233,214,202,216,193,223,218,228,204,190,132,148,214,191,176,177,189,149,149,177,190,185,228,182,187,192,240,185,182,200,222,181,174,107,70,242,227,255,188,51,87,132,127,38,24,63,48,144,223,236,143,84,199,205,238,217,142,182,205,160,118,158,156,226,156,78,112,151,239,217,206,249,174,88,151,254,225,219,229,214,207,216,236,216,185,185,224,215,253,238,250,255,205,97,99,96,136,118,84,13,43,92,132,80,65,34,42,64,92,108,83,141,56,28,125,123,109,114,51,31,19,24,23,21,34,4,49,96,84,65,55,20,27,16,74,126,114,112,124,101,105,99,32,35,70,117,99,82,89,122,135,129,104,49,29,63,91,58,103,108,85,75,47,47,17,51,47,42,52,44,25,164,250,245,248,233,180,149,173,152,163,185,188,150,169,157,153,158,170,154,92,15,36,37,17,36,30,17,13,45,31,27,32,30,19,17,9,27,32,2,8,16,36,21,41,34,43,30,26,24,19,25,67,81,41,56,27,26,74,73,48,54,54,54,37,4,55,29,22,45,36,42,20,45,41,48,62,92,64,83,79,57,41,62,93,87,99,89,85,123,132,121,113,45,85,134,89,62,42,74,149,135,90,56,55,25,46,113,128,98,43,76,52,79,219,230,226,240,205,229,252,245,242,241,248,236,227,233,215,219,223,239,214,240,229,226,225,236,224,233,241,233,245,233,234,220,218,229,217,211,226,234,220,222,229,248,248,223,207,227,218,228,234,233,234,231,203,100,1,3,0,10,22,16,38,18,7,12,23,17,4,1,0,16,10,14,7,2,13,24,3,11,201,209,179,211,199,183,197,177,209,233,200,204,182,195,207,217,212,198,179,201,201,198,199,216,217,202,204,204,212,216,200,208,211,214,184,189,203,214,227,191,205,206,188,207,174,200,195,219,202,227,215,233,195,197,212,198,215,227,207,221,207,210,178,185,219,201,209,219,212,185,208,203,187,214,220,196,208,230,191,175,212,204,216,200,211,207,208,189,212,209,203,194,209,203,224,194,206,209,204,195,218,176,214,213,200,205,208,232,209,220,199,217,222,229,206,215,196,212,201,196,209,206,223,210,205,232,223,226,230,220,247,208,212,217,204,222,211,224,211,215,212,221,199,221,242,193,196,215,203,212,177,200,213,207,206,222,205,196,215,181,216,212,213,226,224,229,205,217,227,201,180,202,189,216,211,213,222,225,228,214,218,209,232,196,233,238,216,220,213,204,214,205,211,231,224,204,218,205,220,193,233,177,212,236,199,191,230,214,192,221,212,207,236,233,211,237,221,222,224,237,217,222,213,228,199,197,225,221,218,180,227,217,226,224,245,231,212,223,238,249,216,213,223,207,198,230,225,220,218,230,218,222,214,203,193,218,225,216,215,234,193,226,244,215,219,231,238,222,242,223,226,237,226,231,244,230,240,228,232,225,247,250,232,210,245,234,247,234,238,248,224,224,238,233,239,239,235,230,234,241,222,250,212,245,239,245,239,244,240,241,234,196,176,170,176,204,206,211,244,249,204,108,175,232,207,247,221,116,107,212,228,238,193,187,236,253,225,186,228,193,222,237,198,249,208,187,145,112,57,57,200,228,247,207,147,165,248,238,231,203,219,229,210,214,222,233,220,218,173,131,172,174,190,179,151,156,195,193,182,207,209,194,178,229,219,177,190,181,197,163,183,83,89,234,220,242,163,49,71,126,99,53,41,39,11,102,196,245,137,54,191,201,243,175,111,176,184,84,133,187,209,192,100,93,122,127,213,179,215,230,167,128,168,228,241,239,241,239,234,219,209,240,169,221,222,239,229,242,254,238,188,74,46,108,135,132,84,46,24,51,114,148,144,152,119,120,152,161,152,119,34,31,111,122,105,131,51,9,15,19,6,18,49,35,106,133,121,137,67,41,25,22,49,106,101,102,89,126,88,108,78,37,63,122,101,36,44,59,87,102,80,74,36,18,89,129,115,98,75,64,34,48,44,77,68,48,94,49,88,250,249,231,235,247,202,153,139,154,171,220,199,151,127,149,166,160,152,119,84,36,44,43,23,38,36,26,18,22,36,38,11,11,35,14,46,46,29,15,16,4,21,26,14,19,35,23,16,14,9,37,84,80,46,6,30,38,27,60,87,45,73,49,36,23,44,19,53,79,107,65,70,54,46,29,17,47,47,60,60,76,82,73,58,91,91,81,108,101,105,147,96,95,91,122,80,45,57,48,111,123,115,72,64,113,95,102,140,90,52,52,40,77,200,235,223,239,233,237,234,213,239,246,249,241,219,237,230,240,233,208,227,219,249,225,245,234,217,224,218,206,230,232,227,208,239,206,229,210,226,224,228,227,199,220,222,244,232,212,234,203,222,226,238,236,204,118,5,18,26,22,16,11,6,11,36,7,19,21,1,6,31,1,8,21,1,6,7,1,27,26,198,195,214,187,223,206,192,201,201,202,198,207,210,207,210,214,209,195,217,231,225,221,195,194,189,185,209,205,224,213,205,217,190,185,216,199,223,217,205,200,200,201,220,198,222,204,203,216,201,206,211,208,205,203,212,212,194,190,192,207,184,197,201,209,203,184,206,199,229,214,224,207,198,204,190,214,222,227,194,217,194,199,194,203,236,215,204,214,215,232,200,212,207,215,214,207,207,210,194,209,203,211,215,220,206,208,211,188,202,201,221,210,195,211,214,215,196,214,223,209,212,187,219,185,204,218,225,217,209,202,229,214,205,199,221,199,229,192,215,208,201,222,210,187,233,220,195,211,214,198,188,209,196,190,198,196,200,215,218,206,186,222,208,221,203,222,198,226,203,208,204,228,241,221,222,213,211,221,224,214,184,206,194,189,212,222,229,196,209,205,227,242,224,210,225,205,190,227,208,175,205,214,209,213,222,199,192,206,200,245,215,226,218,222,230,226,222,226,195,200,242,230,220,226,229,207,225,236,204,211,212,217,239,196,216,218,208,239,238,200,221,242,211,225,210,200,207,199,224,202,196,197,215,220,195,210,239,211,215,228,235,224,219,208,245,224,216,243,215,225,221,225,224,222,225,244,225,236,230,241,245,225,229,231,209,225,221,235,233,232,228,209,232,232,247,226,246,240,238,237,230,231,239,248,243,247,230,246,224,200,196,196,187,197,193,218,237,251,230,247,177,85,159,237,217,233,183,89,115,235,222,240,208,205,245,246,206,205,243,186,227,248,253,254,182,142,122,102,62,73,216,237,251,122,34,59,135,243,229,213,235,224,206,232,227,219,226,184,153,148,230,175,176,225,162,167,210,180,192,211,195,150,110,219,186,190,168,211,226,166,196,94,81,240,243,254,174,48,79,106,111,51,57,39,17,47,164,229,153,117,201,205,243,160,146,236,175,171,179,206,220,138,98,177,191,129,147,177,230,217,137,99,167,210,238,245,229,216,218,217,240,213,187,222,227,221,247,253,251,254,116,55,35,47,127,153,144,77,41,36,47,109,137,148,138,119,122,143,120,76,5,37,118,92,136,86,33,38,18,20,24,27,43,78,114,101,100,120,102,89,32,35,75,100,73,38,1,53,90,95,64,49,35,106,85,73,30,15,55,120,129,112,62,59,122,140,129,95,105,96,104,96,153,156,136,145,138,66,150,224,241,253,251,228,187,138,151,203,212,245,205,155,135,132,160,190,170,116,96,77,78,93,63,84,49,59,47,66,66,63,45,54,76,70,47,23,40,43,26,4,29,49,17,33,27,37,47,17,18,29,56,66,31,30,33,54,75,43,28,74,58,53,49,34,35,49,78,152,103,98,95,69,46,32,12,34,99,55,61,55,42,37,77,77,102,76,98,81,108,150,101,83,98,79,101,74,49,24,86,126,180,159,148,92,78,132,121,104,60,34,53,91,205,215,220,236,233,233,242,226,245,221,235,242,216,237,231,219,211,239,244,239,229,244,218,236,237,203,226,219,228,243,221,235,218,226,225,243,234,228,236,238,215,211,248,205,212,227,227,202,212,228,220,223,228,122,26,12,0,17,6,9,16,17,7,30,12,14,12,0,0,18,21,0,7,12,15,30,17,3,192,189,210,199,209,176,211,210,215,194,213,230,203,212,216,180,196,194,205,185,187,229,200,200,220,197,223,204,198,192,181,215,218,206,198,206,197,191,205,208,220,210,196,193,204,203,164,205,209,207,192,214,217,213,236,209,213,194,211,187,219,184,214,224,210,209,187,214,213,183,189,217,190,185,214,189,211,190,194,191,208,196,183,195,194,185,195,189,211,214,231,198,201,206,185,211,193,227,227,191,203,207,190,207,228,218,201,231,223,181,224,185,225,217,197,201,227,194,217,213,194,210,214,234,217,188,226,209,212,222,190,218,208,221,180,197,202,224,205,200,213,203,214,206,214,216,195,198,207,212,212,197,226,207,207,208,226,211,217,189,230,205,204,194,194,190,188,219,203,221,211,209,239,219,211,199,203,218,197,218,206,224,215,211,188,199,233,221,216,182,189,213,212,211,209,225,240,220,199,236,214,224,180,231,198,193,204,216,214,229,203,215,204,232,233,213,205,185,201,205,214,206,207,221,242,205,216,190,222,239,244,220,199,227,210,215,217,225,218,225,237,220,215,213,209,240,227,217,231,215,209,232,214,214,177,217,228,219,219,202,238,209,223,235,225,227,244,231,212,208,217,242,230,234,218,213,221,231,228,211,224,223,223,217,227,249,205,221,237,207,252,246,223,238,235,225,244,235,211,237,216,249,233,255,247,191,185,165,155,185,183,207,225,243,238,240,228,241,224,250,159,73,175,227,176,167,146,87,151,239,244,244,248,218,248,251,204,225,246,203,213,204,196,189,142,155,154,128,85,115,205,237,197,44,5,4,132,224,225,210,223,209,218,228,238,230,192,167,155,177,182,203,190,197,198,187,216,156,167,193,186,158,140,228,183,172,204,238,238,194,188,88,125,244,239,235,187,67,70,124,132,41,47,84,22,65,206,237,176,157,223,215,240,189,160,246,229,173,141,147,209,147,92,228,235,147,126,181,253,197,83,110,197,240,219,253,226,248,238,228,243,195,164,224,243,250,244,249,239,128,41,1,9,38,79,113,166,139,69,65,35,33,80,140,100,53,54,136,138,86,27,70,120,110,140,100,35,24,15,23,57,33,87,113,99,49,75,105,128,97,43,23,64,103,64,59,0,30,64,83,69,84,110,138,156,120,134,123,152,143,177,138,133,165,132,126,102,126,131,113,125,117,127,140,116,88,112,78,182,249,245,252,251,221,179,150,163,200,253,226,177,104,110,131,171,224,152,128,92,96,65,31,58,55,59,41,52,58,68,57,52,64,67,58,67,46,34,67,47,42,72,69,34,64,59,35,38,29,47,71,61,69,37,11,40,82,67,65,80,50,58,52,6,53,67,54,72,104,92,77,40,20,24,41,35,79,105,84,43,20,17,15,34,20,59,67,64,92,104,112,93,84,51,109,146,120,69,58,21,69,70,84,85,51,95,139,129,67,74,49,47,102,208,227,218,248,238,238,235,247,237,245,219,219,248,201,238,231,238,218,233,221,221,229,222,235,247,232,243,226,244,220,244,214,223,241,202,217,248,231,208,211,223,227,217,211,227,214,213,218,205,215,225,232,223,131,4,9,26,0,1,11,19,16,0,10,16,11,9,7,14,30,8,30,0,2,4,36,0,15,209,185,200,185,219,193,222,198,191,195,195,182,205,197,188,195,202,221,222,201,230,213,183,191,224,200,207,201,197,206,179,199,208,216,204,221,199,185,197,198,218,180,214,202,215,230,234,170,193,200,222,213,204,206,208,199,215,197,223,197,228,193,213,182,195,198,216,211,191,229,189,212,211,185,218,221,205,200,193,193,226,182,211,190,173,224,192,202,189,214,219,198,198,188,206,183,191,229,208,189,221,204,200,169,211,199,194,227,210,232,183,205,189,201,199,200,201,214,220,224,217,193,223,211,185,235,209,207,197,176,205,207,212,195,218,214,205,209,206,206,218,207,172,232,212,204,213,213,210,215,197,205,200,186,170,186,229,198,200,215,207,207,205,231,190,198,206,197,215,194,218,215,186,203,211,197,193,199,208,213,219,230,213,196,229,203,194,210,191,201,212,208,201,216,205,218,210,235,180,226,208,200,211,207,225,224,215,205,204,194,196,220,219,183,210,220,217,197,225,201,214,213,233,214,233,211,192,201,217,234,221,204,213,204,208,210,217,221,210,220,217,220,202,198,209,198,224,196,212,204,204,212,202,194,207,217,228,210,203,232,232,209,219,211,224,207,230,230,224,228,221,230,233,214,213,228,231,213,226,228,233,223,226,211,230,198,239,235,245,227,211,243,228,228,233,245,228,238,231,227,234,219,197,207,183,178,182,199,194,216,221,238,248,242,230,238,208,250,243,213,183,84,159,225,177,160,98,90,131,207,200,239,182,172,223,208,193,207,226,191,132,97,141,121,139,154,161,187,147,148,170,160,139,65,12,14,106,232,254,213,239,228,244,232,230,205,224,197,181,178,182,202,214,204,147,147,162,108,188,197,208,190,188,251,210,228,243,248,240,202,183,59,94,235,247,253,176,59,105,122,108,67,41,43,43,100,200,252,201,156,226,231,254,192,202,247,231,179,85,170,255,120,49,198,255,188,128,159,248,198,53,137,214,241,243,241,210,229,239,246,229,171,190,247,240,246,252,242,160,44,5,28,13,35,32,70,116,109,151,104,87,22,13,76,107,46,9,41,92,80,18,80,131,127,160,75,20,41,24,49,43,55,103,112,120,73,51,83,38,115,88,92,115,97,73,57,91,128,133,145,154,157,172,180,150,141,148,145,153,123,108,141,115,131,112,96,137,117,114,102,104,106,101,95,87,93,63,80,213,241,244,245,250,213,144,188,210,237,244,201,164,113,133,120,184,233,147,81,81,75,65,77,54,79,46,77,61,78,73,85,54,78,69,59,68,73,61,61,48,51,48,65,69,60,53,53,77,59,67,76,80,75,62,41,31,45,85,68,109,71,53,24,7,49,69,79,56,98,55,31,7,27,15,31,28,65,70,72,63,25,37,1,12,45,40,10,25,34,43,96,109,71,74,48,106,140,115,71,61,37,50,60,49,110,107,147,117,67,45,68,78,119,234,234,244,223,241,238,238,232,211,247,240,220,187,223,220,241,234,224,228,216,234,236,233,244,208,245,223,233,217,212,216,240,223,238,222,229,228,240,225,230,230,204,207,225,216,217,225,229,247,221,227,229,244,112,11,14,41,2,13,7,17,6,6,10,24,22,14,7,6,0,7,8,9,5,14,0,8,1,209,192,199,194,199,213,224,207,179,183,211,216,191,212,226,185,222,198,207,190,197,205,177,194,192,209,190,210,202,212,204,194,220,223,205,209,225,197,217,204,199,186,177,197,203,196,205,203,220,229,206,211,201,186,184,215,207,201,205,195,195,193,193,212,194,214,189,212,205,183,222,194,204,188,189,198,225,185,225,212,213,209,211,229,211,202,190,203,205,214,198,180,204,234,191,186,200,211,219,197,219,204,229,208,201,210,206,205,186,205,209,215,208,190,194,227,198,217,198,201,211,198,225,207,231,207,203,207,190,203,209,215,185,196,193,227,209,184,222,225,196,209,201,228,207,212,203,227,211,204,184,196,196,215,231,177,189,203,199,204,210,187,200,191,200,184,195,194,214,203,210,188,202,210,206,188,212,199,210,211,220,180,201,200,224,214,227,203,206,201,217,211,215,220,207,196,210,223,228,224,223,223,210,194,208,212,211,239,203,204,233,231,217,204,222,215,215,220,217,205,238,211,223,234,213,222,207,208,227,202,235,213,222,213,210,200,204,231,216,204,196,218,213,215,182,233,194,225,234,195,217,218,219,205,197,223,205,201,224,215,212,208,202,223,236,225,238,221,241,234,220,222,228,219,214,223,235,205,237,227,231,237,228,251,233,200,240,218,248,232,242,209,218,246,207,227,234,227,232,214,208,164,170,186,204,234,213,229,255,236,248,215,223,229,238,240,209,220,222,230,174,93,148,235,202,170,127,95,111,146,129,149,129,125,164,143,151,196,200,128,89,97,131,124,154,172,158,172,169,164,147,175,149,93,34,46,159,241,255,220,255,210,234,221,238,222,225,211,178,218,176,177,149,157,156,134,194,153,206,253,225,247,199,235,194,197,167,154,127,90,107,19,53,194,255,234,146,69,70,128,89,72,42,74,10,77,238,241,240,161,181,253,226,185,191,248,216,131,142,232,238,129,91,236,235,233,136,97,153,172,91,140,233,225,238,249,233,230,232,248,218,136,182,228,243,239,254,187,61,0,9,6,19,68,57,20,40,105,118,158,94,41,45,55,104,102,34,27,41,54,45,112,134,109,127,49,41,48,19,27,58,57,108,97,91,52,38,49,77,134,124,157,120,120,165,159,162,152,139,148,132,102,164,140,122,119,111,103,104,53,83,33,56,81,70,66,53,46,62,71,33,19,62,55,46,47,20,50,240,248,247,248,251,223,168,192,243,255,247,165,138,140,145,163,219,211,109,68,15,23,19,52,20,24,37,21,45,54,64,58,48,55,56,58,82,74,95,42,78,67,62,50,67,31,63,76,50,53,83,72,66,82,52,95,67,65,82,80,67,58,30,26,31,46,93,70,108,98,78,53,51,21,27,35,46,92,83,62,41,28,28,35,16,15,5,17,38,4,61,88,105,88,49,34,68,127,136,122,110,108,76,72,116,140,104,98,56,32,48,62,92,207,230,241,240,225,207,226,229,213,223,236,236,248,209,239,228,249,229,247,247,235,222,211,226,225,226,224,230,239,208,238,240,221,231,239,229,219,204,209,225,224,226,214,233,226,215,211,220,235,213,199,208,231,228,123,18,18,1,11,28,3,5,32,17,8,24,9,15,3,2,0,0,18,5,11,10,27,8,16,177,208,201,164,186,172,195,202,196,211,203,214,171,173,186,176,200,190,202,208,196,218,194,190,205,204,187,204,213,157,201,182,192,207,196,202,179,192,207,211,205,199,197,208,221,198,180,223,189,176,196,185,218,204,197,170,202,202,192,198,197,180,206,205,196,210,214,183,208,205,199,195,188,178,210,210,192,202,216,198,204,204,190,207,196,192,226,204,193,188,203,160,216,215,216,185,186,196,199,229,208,199,194,189,219,215,201,194,196,219,207,187,179,215,181,193,199,204,214,204,198,196,233,232,200,212,237,212,221,216,211,203,199,216,236,197,225,201,205,220,226,205,215,211,215,209,208,192,204,202,198,217,220,207,193,227,207,199,232,217,187,200,214,191,202,199,183,206,221,164,199,221,213,219,194,213,231,213,213,197,199,211,203,193,192,210,238,215,205,197,200,205,226,231,193,224,201,207,214,223,210,209,242,215,184,229,208,223,203,215,207,239,231,221,214,221,193,198,210,229,198,213,198,203,225,210,222,208,197,218,228,220,214,235,208,198,198,224,248,213,214,219,210,198,239,224,205,215,197,202,217,219,189,214,215,237,193,204,210,234,225,215,197,219,218,207,229,209,226,228,222,228,213,233,219,226,239,229,235,210,202,229,249,221,236,237,235,231,208,215,232,237,248,253,239,214,210,207,194,171,183,196,217,205,231,231,243,243,226,249,227,237,216,215,195,231,230,221,234,223,180,69,160,233,220,199,63,91,115,176,155,170,134,120,133,156,119,137,164,86,118,103,99,92,131,127,134,162,124,154,161,121,145,88,95,94,131,197,223,214,246,246,229,218,212,191,232,153,137,212,158,218,156,219,226,180,222,234,238,252,225,238,152,147,98,84,41,17,3,18,39,36,24,89,128,124,75,52,36,105,97,79,47,44,25,73,157,223,136,67,127,206,178,173,142,134,110,68,125,224,197,70,94,195,225,192,156,43,83,143,123,231,241,219,241,220,225,226,241,231,243,158,229,246,241,252,219,124,1,0,19,9,40,46,33,19,23,49,95,101,137,104,40,18,94,99,89,38,55,42,11,123,125,108,142,53,23,9,24,26,53,109,138,125,108,112,116,117,153,134,139,128,151,118,159,143,114,147,118,105,121,121,109,114,76,79,53,64,53,28,35,26,21,19,56,49,42,31,43,12,16,53,29,49,48,55,8,92,212,250,255,253,252,165,144,211,238,202,192,137,114,120,122,156,223,168,74,6,11,22,44,11,51,38,17,42,47,37,34,16,4,38,19,22,14,40,45,15,43,59,42,56,39,52,44,50,46,66,36,57,55,54,71,53,82,83,61,80,69,74,67,122,44,83,121,104,112,77,38,44,12,37,16,37,64,108,96,55,8,43,33,29,41,19,50,18,40,17,33,74,102,89,73,65,41,41,112,122,180,167,166,152,147,116,96,82,37,36,72,67,162,234,243,247,234,228,235,239,237,233,214,205,229,213,242,233,230,225,215,244,221,223,221,239,237,231,232,238,217,226,233,231,212,221,248,236,245,230,238,222,239,212,224,234,212,235,239,215,192,244,224,222,238,202,216,132,0,16,6,0,7,15,16,15,5,41,29,19,5,0,8,34,1,20,38,13,22,0,6,12,197,192,189,183,208,166,202,189,180,187,192,201,184,190,194,196,202,204,194,212,192,204,198,190,193,190,188,182,207,188,200,191,209,199,196,209,212,202,185,191,217,195,170,207,231,204,205,191,155,189,198,226,187,210,212,202,191,199,199,237,201,195,168,204,180,201,211,209,200,189,184,217,188,233,204,214,203,210,201,200,223,205,209,183,188,230,174,220,206,214,206,202,209,197,195,171,217,208,208,189,220,194,209,216,217,212,204,216,208,224,219,210,192,195,206,186,186,193,188,220,180,194,212,189,199,217,213,191,194,198,221,195,188,203,216,205,214,187,223,212,208,204,216,214,199,193,225,222,228,227,206,213,206,206,205,215,202,180,205,215,210,220,193,208,203,190,218,235,211,205,217,194,218,202,217,204,178,214,187,226,181,205,213,229,196,226,197,226,212,203,220,192,222,238,217,184,230,216,207,221,216,203,192,224,226,201,226,207,213,198,211,198,194,218,219,206,182,209,224,224,204,200,220,226,212,198,192,200,212,224,212,209,210,195,226,208,209,211,226,196,231,216,204,233,219,227,208,228,206,221,206,214,202,200,196,205,200,219,240,225,177,216,186,231,237,222,208,181,224,215,228,242,209,222,235,210,213,233,227,231,226,206,219,217,205,212,222,234,230,217,231,225,234,217,195,188,156,199,199,167,223,229,246,228,222,224,226,220,232,233,213,225,230,206,217,220,214,227,222,235,200,96,109,224,190,172,36,65,101,162,181,194,162,162,163,146,125,161,147,104,94,96,64,70,54,85,75,93,90,92,101,93,78,81,84,71,97,188,160,200,220,213,251,240,226,187,237,163,163,196,144,212,200,248,247,203,191,150,122,116,93,93,47,25,26,52,108,131,109,147,119,86,143,162,169,183,103,57,52,91,97,62,70,50,22,21,110,155,146,83,82,148,151,126,123,30,76,62,130,159,115,93,126,137,179,183,110,48,39,69,126,228,246,216,240,223,243,226,239,247,206,153,237,252,234,240,140,32,5,25,26,32,54,34,24,13,35,65,62,91,73,114,34,40,36,91,152,120,103,72,83,170,118,117,124,30,59,27,85,127,147,159,124,118,98,112,128,129,119,116,135,153,129,155,143,124,96,88,83,40,64,36,22,8,55,35,37,69,14,46,24,22,58,43,62,53,37,40,15,38,20,63,44,60,92,89,7,182,245,237,241,225,154,104,156,233,249,229,134,130,104,109,122,205,209,125,39,21,29,15,54,34,13,17,39,47,51,59,49,26,22,51,41,35,32,51,67,25,29,24,34,25,30,24,26,44,25,39,26,43,60,63,36,84,62,79,63,56,62,80,94,122,104,107,104,102,90,72,56,48,52,32,41,36,77,84,84,76,27,38,34,45,44,46,31,32,18,24,64,97,93,127,89,31,32,120,141,101,95,81,137,142,62,75,55,64,37,73,56,156,234,237,247,247,220,234,238,209,221,228,223,246,235,234,231,221,236,214,219,245,220,234,216,234,244,246,234,240,219,217,247,241,229,222,215,200,233,246,221,225,217,208,229,196,225,230,212,231,212,209,237,231,223,217,228,115,15,8,26,41,3,25,37,28,2,22,23,6,9,0,3,5,27,21,21,12,29,16,8,40,193,177,172,192,178,210,185,170,191,206,192,182,217,210,198,216,204,198,200,194,189,198,221,203,197,192,195,178,202,192,196,187,204,185,221,182,200,189,190,204,191,202,209,187,213,190,202,205,191,193,202,193,194,197,181,209,198,185,212,188,182,198,199,193,233,197,198,212,202,192,185,198,194,210,192,222,199,199,184,205,204,210,209,212,191,195,187,207,188,190,221,202,206,202,206,212,188,198,210,185,212,191,208,194,203,197,182,187,223,207,197,187,222,217,190,201,218,189,196,201,237,208,186,207,204,197,190,177,197,232,200,202,188,225,195,206,184,221,205,196,209,190,218,172,169,215,173,216,197,219,206,193,191,219,188,198,202,213,211,193,214,169,215,193,214,199,210,192,189,203,228,204,200,197,213,191,216,235,207,208,215,211,201,204,190,232,197,214,203,206,209,206,191,190,228,186,210,197,196,202,216,176,214,209,198,223,187,201,199,206,231,217,206,189,216,190,212,203,211,210,224,186,223,225,205,200,207,201,211,200,225,204,207,241,202,232,202,218,210,196,221,219,188,224,196,243,224,204,204,230,187,213,188,210,225,206,193,224,205,222,211,228,229,223,212,214,218,198,222,221,215,224,229,233,220,234,207,243,227,215,211,210,218,238,237,226,213,207,224,209,192,184,182,184,172,193,214,201,220,227,242,236,224,229,229,238,237,209,222,232,224,206,216,237,222,224,214,215,226,233,200,58,123,184,233,154,54,102,109,190,163,174,166,156,201,180,144,168,172,91,120,96,84,86,71,55,34,63,69,48,59,55,89,76,75,40,72,151,176,137,185,196,226,219,218,219,242,172,189,235,143,151,168,209,175,57,40,30,28,20,4,38,44,55,67,76,118,135,190,170,143,156,151,179,223,211,111,58,62,107,98,40,28,34,28,80,204,245,181,123,188,206,174,197,177,149,170,162,218,185,125,192,211,204,202,192,129,74,67,53,99,231,248,213,247,242,228,235,230,251,198,147,243,248,241,171,22,12,37,25,16,54,35,36,33,25,53,34,94,97,97,109,98,32,41,33,86,114,156,158,157,160,107,113,135,121,171,178,171,194,137,130,112,134,140,146,137,120,151,108,90,91,67,59,38,30,26,42,17,51,35,47,35,30,17,45,62,92,66,65,74,29,41,59,89,27,28,49,24,41,41,55,60,33,69,70,99,252,246,240,242,194,149,156,208,250,247,201,141,136,130,116,149,230,206,98,13,9,30,30,37,44,35,30,17,31,43,14,42,34,25,70,15,30,47,75,22,28,33,31,52,17,3,19,21,10,12,20,24,35,29,26,31,14,53,34,63,61,80,69,69,84,64,83,97,92,93,93,103,105,81,78,90,142,110,107,87,62,42,15,20,42,37,35,25,32,29,27,69,76,99,107,70,24,82,135,77,63,18,44,76,99,77,42,47,58,11,53,148,235,227,251,231,233,203,239,239,237,238,223,242,228,203,221,216,238,232,230,245,255,218,223,237,226,229,219,222,223,229,212,215,220,218,209,240,215,229,232,197,238,211,237,239,240,178,253,240,229,211,223,212,213,213,209,208,131,4,3,17,7,12,23,5,7,28,19,5,7,2,0,14,15,32,37,21,0,23,0,21,28,204,210,166,195,207,192,188,185,200,193,202,190,194,193,211,221,227,199,188,180,200,192,201,212,201,193,205,220,173,211,220,205,199,208,215,192,228,189,228,201,201,226,203,177,211,180,198,197,191,190,212,194,184,201,211,187,216,183,205,218,191,205,220,213,222,204,200,214,181,192,184,187,205,213,192,200,208,197,207,194,203,166,196,213,195,224,213,196,224,191,191,187,197,177,217,201,208,201,215,201,217,210,187,186,208,199,216,175,217,227,222,206,185,186,202,189,187,224,207,215,199,226,216,225,214,188,217,202,174,201,224,194,207,216,199,236,206,201,231,197,207,179,195,214,182,198,205,196,203,201,221,196,218,209,213,196,211,207,203,197,196,190,204,173,210,208,208,180,202,227,205,228,221,206,203,203,201,229,213,205,202,197,217,211,203,203,210,216,210,218,226,221,206,199,208,202,232,207,190,209,209,202,192,196,193,202,218,216,221,195,220,200,201,217,193,188,208,211,199,195,208,221,204,201,191,206,223,224,207,198,201,201,207,199,221,199,242,200,197,214,222,194,220,214,220,200,206,212,215,205,234,203,218,207,203,209,199,214,220,195,206,238,224,193,184,226,218,209,218,232,217,206,232,221,216,209,221,211,234,229,218,238,219,228,218,247,224,176,189,204,153,180,203,186,210,206,230,235,234,228,225,210,224,218,224,225,213,208,226,236,206,231,220,224,195,222,221,244,210,220,209,40,109,228,225,218,122,156,149,160,160,150,132,192,199,153,145,122,128,116,122,96,73,63,59,66,52,63,69,66,63,62,75,97,85,51,85,161,141,83,82,121,182,224,215,229,243,166,204,241,125,69,14,31,55,0,24,55,118,124,146,161,212,203,203,170,159,118,130,133,113,110,87,118,150,169,117,58,49,131,147,61,47,61,41,67,138,194,111,50,134,148,133,168,128,160,170,116,126,123,89,133,102,117,150,117,153,115,66,66,146,234,239,251,247,253,214,243,245,254,189,178,236,246,209,39,34,1,9,16,59,40,5,25,20,45,59,45,81,122,99,106,93,57,65,34,27,45,53,73,110,154,87,143,153,119,161,145,142,139,116,158,118,120,97,73,49,46,64,37,18,37,57,48,25,17,17,12,24,53,34,57,83,63,54,42,74,113,46,78,79,83,58,72,66,49,25,33,41,64,66,51,63,58,64,48,154,237,247,254,241,188,163,149,241,246,254,133,123,96,131,129,169,235,151,86,91,67,82,64,48,56,27,44,39,14,59,68,55,39,68,57,30,42,42,42,32,50,69,62,45,13,18,27,15,17,10,46,36,55,62,22,28,9,26,26,37,23,42,18,33,54,54,57,97,97,91,58,58,103,86,93,100,111,116,93,84,68,43,59,57,26,10,41,51,33,31,30,63,81,78,105,59,55,41,71,60,41,50,96,91,64,66,79,60,21,112,234,251,245,250,247,238,230,227,230,224,226,216,235,235,246,229,236,225,237,221,239,245,233,218,234,224,234,228,216,230,224,229,201,237,209,209,251,201,209,229,229,216,233,233,220,233,219,234,232,234,238,222,240,226,222,217,221,222,129,6,1,8,3,12,7,0,4,27,5,19,11,9,0,25,34,2,19,15,22,14,5,19,23,199,182,207,200,180,177,155,197,185,187,186,208,216,205,203,188,199,222,189,198,189,216,206,179,192,197,208,215,199,201,177,204,193,189,205,206,207,189,205,220,179,191,176,226,195,216,183,198,215,199,198,208,231,199,184,205,196,198,216,200,178,217,199,227,205,208,198,200,186,197,189,201,196,219,203,195,205,193,183,207,208,209,204,204,215,218,217,217,205,205,204,198,193,214,178,178,219,198,195,176,211,195,189,194,211,176,199,218,198,214,213,204,198,197,204,201,215,172,199,207,198,201,182,197,204,204,201,206,195,207,193,202,205,194,209,167,176,194,194,182,188,208,219,187,205,229,187,217,196,215,222,207,217,208,226,206,210,207,217,198,236,203,203,213,212,177,218,188,215,174,202,186,192,193,211,215,206,213,192,231,201,214,214,199,198,190,214,199,216,200,231,209,229,220,202,202,189,215,189,211,209,227,204,194,193,175,217,191,186,222,206,209,208,208,197,187,208,187,216,230,225,227,212,197,215,209,212,204,207,217,207,187,202,211,230,199,205,184,241,186,212,178,198,223,200,227,217,219,220,221,228,205,182,246,202,203,196,230,214,209,197,208,235,214,210,208,208,213,220,223,200,206,222,215,224,214,201,207,218,232,237,246,235,245,222,222,187,203,152,196,221,213,239,224,226,240,233,228,223,210,217,199,212,224,238,231,233,226,229,227,226,228,220,223,211,221,219,227,226,231,180,50,95,200,238,224,151,148,152,170,166,183,165,161,169,113,102,126,151,95,138,94,90,84,53,61,63,62,49,73,67,69,62,45,72,51,56,107,104,123,143,199,216,212,215,207,238,146,207,243,160,87,118,108,67,96,202,219,216,253,226,255,228,233,240,197,233,226,201,98,99,148,117,184,185,200,165,26,48,72,117,53,32,50,34,58,104,153,132,84,70,67,125,171,146,112,99,93,104,61,53,130,111,115,133,121,167,165,84,30,118,238,223,231,218,239,234,218,226,229,190,234,253,174,78,5,10,18,3,45,41,40,5,13,44,57,50,34,34,62,130,98,88,95,100,63,56,67,62,22,50,94,90,148,111,126,135,116,111,103,66,41,48,53,46,46,17,50,21,25,18,55,48,52,32,39,28,31,52,40,40,46,102,71,54,54,88,87,58,40,54,112,151,151,90,79,107,96,123,130,129,97,115,109,144,122,84,190,248,252,241,212,173,116,188,233,209,112,123,130,135,145,226,235,137,57,126,131,142,50,101,74,64,72,62,60,54,46,64,78,67,51,23,38,79,60,63,46,48,24,18,33,12,25,37,24,28,26,44,50,62,37,11,24,57,47,52,57,56,39,17,2,15,37,48,44,43,75,71,96,82,90,74,50,71,70,107,103,94,104,68,66,45,55,36,14,21,20,47,92,121,102,71,49,39,33,46,85,97,92,79,64,49,52,70,194,228,240,255,232,238,243,224,223,213,218,234,221,225,214,225,240,245,234,214,238,240,227,222,226,235,220,230,242,243,220,224,216,225,224,209,233,220,231,235,230,232,220,207,254,233,230,223,211,228,240,226,238,217,216,240,236,216,207,228,101,19,3,4,25,0,9,20,11,9,1,21,15,5,1,14,2,10,14,7,17,3,28,14,17,200,226,205,198,194,161,187,180,197,192,179,198,194,187,181,173,186,193,194,200,194,179,186,200,197,204,189,208,181,194,208,193,166,197,209,207,200,188,198,202,189,211,188,216,197,200,214,214,206,218,228,201,195,208,226,174,206,219,180,182,192,182,184,201,193,217,204,199,193,179,214,187,186,204,182,164,198,208,195,176,195,204,179,179,188,179,205,199,202,187,215,194,208,204,219,227,207,207,202,200,194,195,203,221,238,197,163,203,192,177,201,205,223,207,201,188,184,198,218,198,191,192,187,201,235,185,208,203,193,192,201,201,209,210,204,199,215,197,206,186,199,223,225,224,204,213,187,191,188,197,234,199,196,202,205,210,199,193,231,201,211,200,205,225,204,200,192,198,209,192,219,220,207,195,217,204,207,203,212,194,200,188,218,197,182,214,187,219,220,204,200,194,200,220,179,214,191,190,211,183,195,191,214,214,188,208,214,213,207,206,208,227,198,192,197,215,188,241,222,200,216,211,184,204,219,205,230,202,203,200,188,179,208,215,222,215,241,178,215,169,195,212,212,188,217,184,193,183,222,232,227,218,217,249,211,199,231,218,213,201,211,211,222,221,227,199,215,216,221,226,219,208,208,202,220,233,233,222,224,234,248,233,182,171,153,178,187,209,192,224,222,244,223,237,219,231,218,224,248,222,217,209,205,206,225,231,241,223,209,232,210,219,223,213,219,236,222,226,203,222,171,53,89,172,248,213,156,164,143,190,176,165,157,147,162,130,136,169,134,116,82,9,24,31,56,26,11,42,48,78,27,50,82,87,62,28,26,54,106,191,221,247,244,234,210,216,226,144,206,230,196,193,229,234,213,186,243,239,239,255,243,238,240,226,215,213,180,216,189,102,115,170,182,240,242,252,198,83,63,90,113,41,29,47,42,91,213,244,228,147,112,199,252,254,200,160,139,148,221,101,88,230,227,239,217,203,237,190,154,75,102,227,225,235,230,214,229,237,221,225,196,228,228,106,42,5,21,5,29,58,18,23,36,65,48,50,54,21,78,113,119,110,102,120,123,124,98,75,63,26,83,128,87,152,121,70,53,44,6,57,40,23,11,48,67,63,49,75,46,40,55,43,69,59,29,30,37,55,49,60,36,46,68,119,154,197,168,147,135,146,176,145,196,164,131,150,121,141,141,120,104,61,99,45,59,39,66,20,133,210,255,219,155,88,172,249,188,142,147,134,122,165,226,196,123,35,11,35,29,31,60,16,52,61,18,27,53,34,42,70,68,68,85,71,82,67,58,67,68,47,48,46,20,46,65,11,34,34,34,74,44,38,28,59,54,42,52,67,35,19,4,35,62,33,25,28,36,9,23,37,27,39,69,49,87,72,100,76,86,101,105,101,107,94,63,75,68,38,89,65,105,106,102,44,49,83,117,125,115,80,78,21,42,83,208,251,231,255,241,221,240,219,217,220,219,230,234,215,238,221,249,229,224,243,238,219,216,226,207,207,251,238,207,219,225,226,235,224,227,213,214,244,210,220,234,241,226,232,216,235,232,244,225,223,211,235,233,222,231,215,241,220,209,236,231,139,2,14,2,2,20,16,5,15,25,17,19,10,11,5,0,8,14,7,18,0,6,14,27,5,207,186,179,189,225,221,172,223,223,208,186,208,161,193,187,185,185,162,179,191,187,199,205,213,161,188,209,186,201,206,203,220,197,195,186,192,185,215,196,200,187,201,183,212,186,201,181,213,184,204,188,194,192,180,203,214,224,189,202,199,186,208,216,219,213,187,189,202,187,208,206,189,195,207,175,180,203,189,197,206,199,186,196,197,197,191,205,220,200,197,214,211,203,195,194,199,192,194,208,200,196,202,206,216,202,196,212,192,193,194,212,188,194,181,197,193,208,208,190,207,175,200,212,187,171,186,214,213,206,205,190,214,190,202,169,203,199,187,180,184,195,222,199,202,188,200,201,200,215,215,195,198,221,194,203,225,231,235,213,194,217,200,189,213,214,209,197,216,219,228,227,219,203,213,197,206,226,207,197,206,206,176,199,224,192,220,188,196,189,194,206,185,211,206,213,207,201,198,197,231,191,210,185,209,188,207,194,201,206,182,200,205,211,218,215,189,188,194,187,197,177,215,202,212,207,218,211,204,201,200,202,192,203,203,214,215,203,195,187,208,206,201,211,211,190,185,190,215,218,184,203,198,142,191,175,191,238,214,205,196,212,212,213,211,205,204,223,214,207,224,208,190,232,215,227,224,242,235,226,218,188,173,164,160,185,220,205,215,225,233,201,228,207,240,212,219,233,226,246,234,210,203,219,231,218,210,229,201,214,214,238,209,230,238,202,218,200,230,212,231,141,51,68,178,238,217,174,131,168,195,193,177,153,152,187,167,154,177,137,85,5,5,10,19,17,16,10,34,34,19,6,45,52,59,12,15,110,117,179,178,234,233,227,205,191,219,239,148,218,214,172,215,250,203,196,215,249,232,227,225,218,185,190,195,140,143,153,214,155,112,158,160,178,206,232,247,156,73,66,92,122,48,28,26,18,78,232,246,246,165,133,225,254,233,172,176,199,224,235,143,148,242,233,222,188,197,241,224,189,96,63,199,199,238,233,227,244,206,244,213,192,237,123,12,34,1,28,27,56,19,30,28,26,40,48,69,22,83,202,166,137,81,51,89,105,109,107,130,110,77,116,125,102,151,85,36,28,43,15,32,54,49,24,58,72,76,56,60,58,63,45,75,74,62,30,38,58,95,140,158,162,196,131,197,194,208,198,174,149,128,126,106,69,61,20,52,46,55,23,41,17,12,48,17,28,35,35,31,7,94,146,183,139,109,176,241,142,130,150,133,139,181,241,171,60,20,1,22,24,19,19,32,7,17,21,26,22,23,5,38,28,27,42,15,37,61,49,45,57,74,67,50,67,66,78,60,69,64,41,60,53,75,67,29,43,28,60,73,29,39,55,98,38,37,49,40,15,34,26,7,18,25,50,56,41,48,61,84,69,63,73,96,90,88,101,91,86,103,92,90,103,94,91,133,145,178,151,125,45,26,54,34,89,220,241,252,242,251,236,228,233,209,240,205,219,240,218,234,230,237,241,233,234,240,241,236,243,222,231,241,198,209,203,226,222,213,239,246,215,216,217,214,227,239,214,223,231,233,222,233,227,230,219,249,220,228,207,230,223,238,234,219,213,241,243,131,3,0,8,14,2,15,17,11,2,5,7,14,7,5,10,0,12,8,13,9,14,23,15,27,219,205,167,196,200,171,195,181,201,187,202,191,214,200,212,187,190,199,189,191,206,198,210,178,192,209,191,177,197,175,194,196,193,192,177,197,191,182,190,211,194,208,206,210,192,186,174,171,201,168,172,209,205,201,193,198,210,205,218,177,197,206,185,186,192,198,209,190,191,200,204,189,201,188,187,173,192,203,175,201,209,193,173,190,194,212,172,201,199,188,212,218,197,197,197,199,180,203,217,207,196,231,190,210,188,195,196,208,205,190,196,196,211,199,212,202,217,208,225,191,206,202,224,194,194,210,193,209,169,218,210,212,180,196,196,211,187,207,219,203,217,194,188,180,199,220,194,208,207,193,189,214,209,201,180,208,215,207,209,204,209,202,177,209,199,217,229,209,211,196,189,225,214,193,210,208,206,174,216,186,199,209,180,184,169,212,216,197,220,193,181,206,225,170,190,199,203,202,201,195,209,187,221,213,192,183,202,195,229,236,196,203,203,186,195,205,183,192,182,203,199,200,196,212,186,183,204,194,190,194,199,206,198,190,196,198,183,226,205,212,208,225,234,232,186,211,226,211,218,198,208,128,41,71,151,212,197,218,184,224,212,222,204,205,204,199,197,226,219,231,243,207,229,226,237,205,193,203,173,173,205,189,216,210,216,241,217,240,239,247,221,215,215,195,229,225,236,200,221,199,206,211,222,227,241,220,224,199,215,212,211,215,238,214,230,237,216,225,220,215,136,49,141,227,249,230,166,148,160,178,162,177,172,174,205,175,174,173,141,48,15,8,28,33,47,18,20,29,57,27,27,63,76,55,28,53,130,224,221,208,228,250,237,216,209,234,243,148,197,208,206,218,191,156,138,203,214,197,169,166,127,151,174,162,128,153,211,247,114,120,165,159,174,192,229,247,171,68,42,83,111,53,25,47,7,72,199,234,210,152,118,228,244,183,99,187,200,174,162,164,204,242,225,240,161,198,228,211,199,129,88,210,214,221,251,239,230,228,244,212,196,124,35,31,24,18,42,62,18,11,23,38,48,60,54,21,66,196,254,233,169,83,57,66,52,80,72,81,81,74,122,130,141,94,54,2,10,3,12,50,75,76,16,51,87,97,21,22,58,87,133,161,128,160,180,185,191,195,203,210,174,186,140,120,121,61,39,49,35,24,6,19,33,36,14,17,15,35,21,24,19,20,30,9,29,20,7,21,35,0,43,143,143,126,191,209,127,133,140,123,150,200,232,129,28,19,2,29,18,4,32,27,10,24,44,12,33,24,28,32,7,3,17,11,21,1,39,4,12,44,21,44,60,47,34,52,44,39,51,92,78,76,62,66,68,53,68,92,57,57,61,64,25,44,26,51,15,43,49,33,43,66,79,66,36,16,21,28,41,44,47,86,70,85,93,85,97,97,108,99,84,105,105,96,114,84,67,58,65,29,59,77,191,246,252,235,221,229,217,243,232,239,214,221,236,244,235,231,238,246,239,234,233,231,232,207,235,208,216,232,199,224,239,240,223,228,239,229,222,235,250,239,247,240,245,219,226,229,204,218,233,219,222,233,234,235,223,217,217,215,223,235,239,220,233,119,3,9,5,22,9,4,0,2,5,0,0,19,18,2,3,6,0,7,2,26,15,5,5,0,195,181,194,186,207,195,214,180,190,192,195,189,178,189,201,176,232,199,207,193,184,171,185,207,196,175,195,209,198,216,199,196,190,195,195,192,218,209,184,151,217,182,190,202,187,207,198,195,169,215,192,195,190,203,199,187,197,191,196,204,203,198,181,194,181,199,199,195,211,175,194,217,198,188,197,162,216,203,174,190,196,184,195,194,190,229,196,186,205,198,225,218,187,222,200,199,177,212,196,193,200,185,188,229,202,184,216,195,198,209,172,200,183,178,185,218,181,202,193,189,203,186,226,182,198,212,200,180,184,182,213,181,213,184,181,242,184,222,199,178,210,195,207,203,199,195,196,198,203,189,193,210,220,184,191,212,216,203,196,215,210,201,216,193,189,185,193,191,190,200,186,193,191,191,214,196,218,203,186,186,215,193,199,201,199,189,200,216,209,207,190,210,206,200,191,177,207,230,203,210,198,183,172,191,191,199,188,193,204,199,197,199,208,214,190,190,205,204,207,201,182,223,199,196,198,228,209,179,200,213,195,214,204,210,213,196,204,191,229,208,209,196,179,191,201,196,162,205,195,213,183,111,30,74,182,210,204,199,179,223,217,226,189,217,211,219,224,216,216,218,217,217,191,212,174,202,182,187,200,202,216,202,240,213,202,228,223,208,214,245,234,196,245,245,196,206,228,212,246,222,199,210,215,235,217,212,234,221,208,198,211,236,236,216,227,207,226,198,215,215,112,60,141,225,254,248,155,157,158,172,161,149,184,184,181,129,139,156,165,159,98,127,154,99,29,20,52,43,46,58,32,87,51,60,83,79,189,236,234,215,227,233,193,203,199,209,228,132,182,180,176,211,178,96,101,191,157,166,171,167,199,195,225,220,242,214,245,227,39,178,182,180,172,198,223,242,199,65,37,76,113,80,28,48,35,78,234,228,212,163,90,212,202,129,117,179,187,153,154,152,227,249,229,235,174,189,210,219,197,186,66,117,222,215,209,220,219,214,251,233,120,80,0,9,34,35,66,36,21,13,38,46,44,72,20,23,188,243,255,177,132,93,59,91,72,76,95,70,30,57,126,136,137,106,31,24,1,27,15,29,66,99,94,119,100,66,57,99,149,171,196,183,204,197,205,169,127,103,81,57,23,24,64,25,4,25,4,0,34,9,9,9,20,30,55,14,16,24,27,34,15,27,23,33,36,2,12,29,17,20,18,62,119,168,234,171,125,151,145,144,144,208,222,76,77,14,18,13,18,32,7,10,22,32,26,42,13,19,11,1,27,23,20,29,9,24,12,10,41,5,9,5,29,5,24,17,8,25,44,52,37,35,58,60,44,89,79,69,100,73,78,39,26,16,53,36,36,33,41,21,78,78,87,51,32,26,82,67,72,53,82,52,34,47,66,72,110,107,92,113,92,92,64,54,63,37,51,52,52,96,71,117,199,247,241,233,243,235,235,234,252,210,228,217,216,242,225,225,226,228,248,225,218,229,233,222,206,242,242,242,227,243,233,222,223,205,213,231,220,213,250,233,238,233,225,240,250,237,240,205,233,229,220,244,239,235,230,218,201,231,238,207,243,205,225,113,4,17,3,6,2,21,6,22,7,28,8,1,11,2,12,6,9,19,16,19,20,3,29,16,179,182,171,174,182,193,189,182,207,194,195,184,202,210,195,192,199,194,199,200,189,165,180,173,220,190,195,197,195,201,185,204,179,177,175,204,201,194,204,212,171,182,188,189,216,204,228,185,197,177,186,192,184,193,196,171,177,205,203,172,178,162,186,186,185,187,184,211,180,183,181,190,193,191,199,184,173,188,190,185,187,192,175,194,202,200,188,178,195,188,199,212,186,189,189,211,205,204,199,201,179,205,182,187,208,198,208,206,217,227,189,186,179,206,173,206,192,222,198,218,199,167,189,204,200,208,198,224,207,206,204,232,197,189,205,184,209,189,176,224,196,190,185,202,179,183,211,181,199,225,188,214,221,218,194,202,195,205,206,198,191,185,212,196,231,196,216,202,184,203,208,203,199,199,186,207,200,220,206,192,200,197,204,192,208,198,208,232,185,209,204,177,187,215,209,190,194,206,201,206,231,199,203,183,218,223,207,195,194,194,193,209,203,200,182,212,214,194,221,192,201,203,210,213,189,206,188,216,194,178,194,215,193,174,190,195,202,207,204,190,233,191,211,234,205,179,214,235,223,187,210,161,136,180,215,247,194,195,222,195,202,218,214,223,205,216,201,209,215,193,219,184,183,204,179,170,179,202,207,233,190,218,208,228,221,216,208,221,218,211,223,220,221,225,227,208,226,231,206,216,217,227,234,243,194,216,221,210,233,197,211,235,223,222,220,202,229,232,199,179,123,48,146,225,231,221,200,185,152,189,154,147,193,121,172,121,152,178,173,205,203,209,226,146,63,36,60,41,39,29,45,99,91,132,109,137,207,249,231,190,197,240,211,203,210,240,220,155,193,192,199,173,153,144,121,171,174,186,229,206,248,226,241,200,196,179,216,137,38,189,198,176,161,199,237,242,170,54,20,63,109,67,42,19,3,74,223,239,212,159,138,203,200,165,168,174,202,160,127,160,199,225,246,224,149,184,208,203,224,243,134,57,133,162,197,225,242,244,252,188,56,27,18,12,11,23,29,11,47,11,23,45,69,59,25,167,252,225,225,212,160,113,52,70,54,103,73,67,75,84,126,116,144,109,42,32,34,70,46,58,73,62,143,150,169,134,136,155,150,120,102,80,57,51,59,33,22,18,25,17,11,16,25,15,22,11,28,19,38,35,33,17,25,40,49,18,41,58,23,32,38,25,13,20,3,20,18,23,32,13,26,26,71,171,251,223,129,135,142,171,167,250,200,73,38,12,21,7,10,32,39,8,16,23,30,5,16,31,20,10,13,9,14,24,28,17,42,20,17,37,22,10,23,9,53,18,31,39,10,19,4,28,2,34,51,41,44,47,53,76,67,73,75,78,72,96,68,53,42,51,55,82,70,43,58,67,72,71,111,83,68,65,37,21,57,42,43,38,70,92,101,121,73,40,60,49,72,93,120,113,91,152,234,250,244,255,247,217,240,223,219,224,239,254,227,228,224,249,229,225,227,247,227,218,233,232,205,227,241,237,214,239,230,223,235,225,251,229,235,216,212,237,237,195,234,239,233,225,228,243,234,239,223,243,218,212,223,216,228,237,226,243,251,241,214,125,21,3,3,2,17,19,10,11,32,9,27,21,19,8,26,9,21,19,8,10,12,28,2,17,194,181,183,184,188,185,179,218,194,166,183,191,207,199,171,200,210,183,182,179,189,199,181,181,183,173,193,195,204,179,188,211,216,192,207,190,189,226,202,195,207,197,187,204,190,206,187,168,196,167,198,169,199,200,174,173,175,195,178,166,212,164,167,197,207,210,207,201,192,157,164,218,197,205,218,191,199,212,184,194,209,189,197,180,216,172,206,196,207,204,200,205,205,205,186,198,212,192,204,200,210,199,202,188,192,181,180,201,201,195,209,183,211,215,186,207,185,185,200,194,189,210,177,213,204,187,229,202,189,194,204,210,198,186,200,198,217,204,189,200,221,199,197,207,212,197,195,188,195,197,194,204,200,222,196,188,209,180,182,166,198,207,206,219,199,194,204,205,203,201,227,180,192,193,199,204,196,186,216,210,213,210,184,189,186,213,188,218,231,192,198,176,201,210,220,192,235,190,201,203,201,199,184,215,208,194,209,187,207,203,202,189,188,178,203,219,204,197,210,191,207,205,201,197,212,213,222,176,197,206,209,217,226,199,210,212,241,210,216,210,195,217,207,189,195,215,204,177,210,205,214,192,210,229,213,232,215,193,211,220,208,225,224,230,243,215,188,200,182,171,182,164,178,207,203,199,201,232,215,190,218,227,230,204,208,218,225,225,209,197,207,211,224,223,226,200,204,238,222,229,206,233,235,202,231,214,204,204,227,212,207,228,187,236,223,231,223,219,219,186,119,97,152,234,247,209,162,167,156,206,172,135,97,84,140,158,155,170,196,200,163,186,182,162,30,61,74,24,44,36,71,113,40,158,190,178,232,246,240,197,209,226,197,193,231,215,233,165,147,204,249,147,208,144,149,212,195,228,240,201,239,235,175,192,161,162,252,161,98,239,207,199,176,217,250,237,175,40,52,36,71,96,22,4,24,58,227,251,214,146,166,255,221,146,176,205,239,163,95,196,253,241,246,221,179,237,197,202,253,254,211,83,57,126,197,231,219,229,238,91,34,0,1,25,55,46,25,17,9,23,53,54,44,37,138,233,244,251,255,234,214,136,27,69,26,75,37,45,42,80,125,124,161,66,41,64,85,161,169,192,180,142,140,133,126,76,45,24,48,25,23,27,11,31,25,11,37,8,10,25,21,11,32,31,0,20,10,5,24,37,18,72,54,56,42,70,53,52,54,41,40,18,22,19,4,22,46,5,24,11,46,29,47,182,240,179,143,170,155,155,206,217,149,26,32,5,22,21,33,28,25,34,30,37,34,11,24,19,30,34,14,10,27,20,21,27,27,21,44,9,3,15,18,13,26,49,12,12,20,4,10,25,14,20,22,16,38,6,15,17,16,29,41,88,67,82,92,126,91,80,89,86,103,78,100,74,46,42,93,89,70,43,18,72,74,49,10,40,13,55,76,104,137,95,125,125,118,133,152,137,118,184,241,232,243,219,224,235,230,217,234,223,230,242,216,227,222,246,240,232,228,215,227,221,203,220,232,241,239,229,213,215,233,247,212,218,247,239,237,238,253,234,224,241,218,224,204,227,240,239,209,237,233,224,236,233,222,235,214,217,223,220,221,224,228,122,8,11,5,2,32,11,15,8,8,43,25,26,27,4,1,10,20,25,16,0,11,37,10,13,198,185,196,192,190,203,190,200,187,198,185,192,188,184,212,210,190,188,208,198,185,175,194,208,168,166,197,188,196,187,179,196,184,192,170,163,200,197,195,186,186,190,214,198,198,172,170,190,179,185,177,200,189,186,208,214,189,184,182,191,186,187,166,187,179,176,179,208,195,201,199,195,222,193,182,190,194,175,192,209,188,217,208,172,197,184,206,208,224,188,171,205,198,193,192,185,172,197,194,199,187,194,177,198,198,193,186,195,192,186,180,201,208,190,206,207,185,187,213,196,205,177,208,190,198,194,202,202,185,206,192,210,200,194,192,182,191,181,195,196,188,197,181,193,195,196,195,194,193,196,192,201,208,189,216,197,213,178,210,188,214,211,202,178,196,206,214,193,220,191,189,197,211,211,203,206,188,211,172,185,204,190,208,204,202,228,196,189,223,181,183,198,192,231,181,191,192,178,210,194,196,200,195,208,205,212,208,178,206,177,200,207,182,189,186,209,184,197,199,195,193,214,198,188,197,185,184,194,188,210,206,190,182,182,210,198,202,197,205,176,232,168,190,181,225,179,196,195,199,206,203,203,231,234,229,208,194,202,223,229,223,242,239,204,213,205,188,174,184,189,174,197,215,209,211,215,231,207,213,228,192,206,227,209,198,213,216,193,209,203,230,208,201,230,237,234,226,225,232,215,221,239,205,219,217,230,202,226,201,197,216,240,240,226,224,227,227,206,226,179,125,142,190,230,235,185,140,104,159,190,158,170,161,79,109,168,189,171,200,162,134,135,144,140,59,59,79,18,50,15,67,44,63,202,248,254,235,213,246,211,249,246,230,245,241,254,251,139,157,240,230,237,208,170,185,237,255,253,236,202,232,227,190,248,223,225,241,158,211,234,253,229,204,245,234,244,187,59,56,42,120,85,11,46,29,44,193,247,249,143,214,229,243,181,196,246,252,136,129,235,251,251,233,223,222,245,218,244,247,232,250,160,86,139,221,217,248,253,147,38,0,25,24,20,43,18,12,5,56,57,77,40,26,126,222,251,252,232,237,249,208,98,63,56,49,53,54,52,33,100,143,114,152,138,145,147,171,173,180,135,92,55,30,35,20,14,8,26,15,29,16,49,19,44,32,20,14,18,38,30,28,25,6,17,27,25,24,50,33,51,48,36,28,50,29,59,37,33,40,31,10,0,28,21,21,6,25,43,29,26,39,32,21,205,233,163,142,156,158,187,186,198,95,13,27,14,14,31,16,38,27,25,57,20,23,20,12,31,15,24,23,9,8,6,4,30,36,56,48,12,36,19,15,30,31,34,36,28,26,9,8,25,17,25,25,9,3,6,18,21,23,23,53,23,32,35,41,51,80,100,95,87,131,119,118,102,79,46,57,61,88,40,93,98,63,31,42,63,33,53,126,105,98,88,113,144,135,141,118,120,101,237,245,243,244,240,232,243,229,214,239,253,241,250,248,225,216,224,214,224,248,214,229,242,232,221,235,231,216,241,228,206,245,233,229,228,232,222,235,241,238,233,232,215,236,210,226,235,234,214,235,223,228,234,229,216,238,240,208,234,230,233,213,208,233,128,6,12,5,30,22,10,20,8,3,5,11,23,4,0,6,27,2,13,10,21,10,9,18,4,201,181,199,213,194,191,205,191,174,184,192,180,200,192,211,187,177,186,183,167,192,207,175,174,186,194,198,200,206,166,186,203,176,178,207,181,194,164,195,199,204,197,188,182,195,188,198,214,171,214,183,194,211,196,187,203,200,199,179,168,189,190,184,205,193,153,193,213,210,207,197,161,212,179,203,167,209,182,204,190,202,204,229,196,214,204,201,189,219,202,193,195,184,187,188,206,177,168,201,167,210,204,207,192,216,186,202,184,197,225,198,193,213,190,181,194,199,198,187,195,197,181,179,175,209,215,191,200,208,208,179,238,209,188,186,200,216,209,203,191,206,205,187,189,197,163,212,203,194,197,207,188,184,200,214,226,198,215,210,192,216,181,212,213,177,201,214,200,177,193,189,221,200,183,212,206,179,193,214,196,186,206,212,203,174,201,205,189,192,194,203,187,201,189,194,179,186,174,225,219,196,198,218,196,208,188,195,165,184,203,193,180,199,198,200,194,199,174,168,185,175,208,198,182,206,173,210,210,220,191,186,193,209,202,200,191,189,207,207,224,201,199,195,183,180,219,195,200,183,214,185,189,229,212,225,220,194,198,247,217,185,146,162,183,179,189,185,201,214,212,212,217,233,245,198,216,225,225,206,226,193,227,214,211,233,231,216,221,233,242,245,250,209,238,222,227,216,245,253,247,255,247,240,234,237,248,248,251,242,223,246,250,254,219,243,253,246,242,222,187,192,150,175,228,249,220,145,116,156,215,219,241,232,182,170,179,203,177,156,132,95,147,155,124,76,65,69,17,38,34,40,30,21,193,255,237,248,228,232,242,242,242,249,231,252,233,220,86,91,195,232,161,216,170,188,206,206,194,176,170,231,215,218,236,191,230,228,142,204,246,225,188,137,195,233,238,162,45,43,74,95,100,45,25,50,37,171,216,153,92,150,216,158,124,177,173,159,93,124,222,216,215,188,204,175,170,146,165,219,220,214,112,67,129,183,199,195,149,63,8,0,24,37,29,52,4,21,46,63,69,62,28,73,224,227,241,250,252,238,218,150,88,41,92,37,78,55,74,65,107,153,124,188,150,141,124,67,52,36,35,8,28,3,10,22,8,22,20,13,25,44,28,36,46,34,9,52,70,40,57,33,25,22,16,31,21,26,11,54,44,47,34,67,40,79,63,66,50,25,17,32,29,17,20,9,13,31,18,40,21,62,29,54,223,217,134,152,151,184,165,190,173,64,34,22,29,42,8,9,51,38,30,26,11,43,12,15,28,8,35,32,26,16,6,25,16,48,7,20,39,13,32,13,22,59,40,47,37,55,27,46,23,8,39,34,29,20,18,6,24,31,18,18,5,6,12,19,19,32,40,57,70,60,60,90,108,99,87,97,101,91,64,72,57,31,61,33,28,16,74,111,121,122,94,127,136,132,142,118,84,139,244,231,239,244,231,240,232,222,217,229,207,241,232,249,235,242,222,234,239,235,219,237,231,236,229,229,225,236,230,210,204,225,240,223,218,230,225,212,220,212,240,241,206,237,238,212,245,242,222,216,239,203,210,244,245,217,243,230,203,223,225,253,218,217,129,2,11,28,7,24,1,14,12,28,3,6,31,2,17,4,2,11,21,18,18,26,16,16,8,190,225,195,193,191,207,203,174,211,204,187,177,173,185,182,219,217,202,214,185,186,180,211,176,185,186,202,200,192,203,185,196,199,185,201,198,177,202,157,186,189,186,192,193,214,200,176,202,193,197,173,201,201,206,215,192,172,182,204,194,188,168,203,193,196,199,193,206,194,188,209,223,181,189,220,200,199,204,201,191,211,199,190,195,192,202,196,205,180,187,213,186,170,200,203,184,211,192,196,192,183,195,203,184,206,207,202,188,205,194,203,198,200,187,171,196,225,207,204,179,215,224,222,222,192,203,218,199,180,218,172,178,197,202,199,205,202,215,195,208,217,191,209,208,199,219,212,199,202,194,197,187,194,215,182,191,227,185,201,226,203,202,181,180,204,198,199,217,182,213,189,204,222,214,196,196,201,196,197,205,205,200,178,185,194,216,193,187,202,176,206,192,184,210,204,197,203,177,213,196,220,202,192,170,181,207,211,213,192,184,182,196,192,206,168,174,207,209,191,197,189,189,198,184,191,199,178,178,196,223,185,177,175,213,173,204,201,204,190,185,211,187,221,220,204,197,193,210,208,199,179,186,208,219,236,192,217,223,215,109,14,0,77,173,215,221,240,246,232,250,239,240,241,220,249,214,252,228,242,232,255,250,251,250,253,251,252,254,251,235,255,254,246,252,243,230,244,253,228,247,255,252,255,254,241,255,249,239,251,254,252,246,242,220,253,253,255,239,250,241,197,149,150,225,251,215,203,124,147,203,219,235,226,206,221,166,183,185,151,129,89,117,135,105,84,103,82,105,86,87,83,54,77,178,191,171,166,169,163,161,168,143,155,185,153,179,162,63,67,166,118,92,77,98,95,86,71,56,94,96,123,88,107,124,93,153,95,56,107,71,113,98,57,31,67,86,115,77,38,58,92,87,31,36,42,40,36,52,67,10,17,43,33,43,46,77,53,59,68,52,14,12,41,99,133,122,88,63,11,48,77,39,43,62,83,69,101,30,49,44,3,24,34,45,17,14,37,63,71,41,18,92,198,227,247,254,239,247,219,192,137,88,86,101,92,125,116,89,108,125,131,114,112,30,13,16,14,18,13,58,29,11,31,33,5,24,43,27,27,34,21,49,35,56,20,57,24,43,22,56,51,67,37,44,30,48,27,37,39,46,54,83,65,107,160,94,79,45,60,12,22,10,29,5,25,30,39,30,29,33,58,37,69,224,174,140,154,121,204,189,147,155,96,70,92,66,55,34,35,4,14,17,17,32,10,28,12,20,24,8,5,53,5,13,13,0,14,49,6,30,5,15,25,31,43,35,36,58,34,23,48,26,42,41,35,21,8,10,13,9,26,29,24,37,32,27,17,22,11,27,18,12,29,39,14,60,55,66,91,106,109,98,95,58,25,28,50,39,26,59,87,122,101,96,97,108,111,104,105,58,100,206,226,235,255,244,230,236,230,236,219,243,227,238,240,241,210,238,213,229,226,242,224,236,231,234,222,205,229,199,232,240,215,225,238,233,239,249,216,231,207,215,232,248,229,217,233,243,243,233,216,231,235,243,245,209,228,229,248,226,215,225,206,235,234,124,7,0,0,14,12,13,12,7,19,4,0,15,10,7,16,0,12,31,12,23,23,14,36,0,185,163,202,190,203,212,205,205,213,210,200,202,207,192,204,190,171,184,210,212,198,191,191,187,191,173,186,175,215,197,207,170,210,182,189,170,204,190,181,183,199,180,189,179,199,160,187,191,189,190,213,187,187,203,192,189,189,184,179,207,181,206,216,201,202,202,162,196,209,176,195,180,188,181,199,188,197,179,223,183,208,206,206,190,202,168,170,203,194,200,166,168,204,178,210,225,164,205,189,207,203,206,204,202,194,207,209,182,198,200,187,218,177,207,198,180,199,198,180,190,197,203,193,182,202,185,204,205,180,174,186,192,194,192,231,206,224,207,208,187,170,211,181,197,191,186,206,215,186,201,209,214,182,199,197,217,193,203,205,211,204,213,203,207,226,201,199,179,197,219,191,178,206,217,200,205,198,182,208,202,204,201,184,207,206,196,223,165,185,196,201,210,185,192,188,204,161,187,188,191,183,203,186,206,172,203,221,190,189,172,224,196,192,194,201,200,197,206,180,176,180,205,178,204,175,208,191,191,228,207,227,217,209,198,213,202,209,213,218,219,217,225,233,223,230,221,222,215,207,215,199,226,245,250,252,234,234,229,188,126,74,107,194,230,250,238,253,231,248,247,241,248,234,236,251,253,247,255,238,238,247,246,246,234,238,235,252,254,236,245,253,240,251,253,243,247,248,223,240,220,206,232,189,188,237,212,214,192,182,182,178,170,158,171,178,228,188,162,155,137,182,107,103,162,154,188,174,90,76,141,152,186,165,138,142,113,120,120,105,117,101,76,76,127,116,99,108,126,138,105,143,115,128,98,91,77,47,69,92,59,56,28,18,85,42,36,75,86,74,50,37,70,66,64,53,28,32,7,48,40,74,48,54,15,13,63,32,36,30,54,54,63,29,21,23,2,33,72,63,58,70,90,74,22,45,50,39,26,77,54,43,38,11,35,16,46,23,60,72,48,22,3,32,80,131,146,124,73,24,17,30,30,46,71,95,68,36,44,0,23,24,41,52,27,25,14,75,46,92,36,36,154,242,235,252,250,247,192,143,144,92,84,120,132,137,151,145,115,153,147,110,141,99,18,11,26,5,21,16,4,36,29,34,17,31,18,35,29,26,34,2,48,32,38,21,66,38,62,25,60,30,41,45,36,37,63,38,26,53,22,52,50,18,94,141,93,55,22,18,14,10,34,26,28,43,12,30,43,29,35,52,1,109,229,136,106,154,181,227,158,174,114,104,79,122,99,72,50,38,12,32,37,14,58,15,33,21,38,15,10,2,28,27,26,6,20,9,37,12,25,27,46,24,27,47,39,37,35,51,57,64,54,33,35,5,42,24,16,40,16,20,14,34,17,11,18,17,19,21,39,11,6,4,31,9,53,17,21,42,43,53,78,69,95,101,89,93,92,39,79,96,98,126,86,65,75,87,69,62,22,81,227,231,252,241,233,242,237,224,245,208,236,231,239,239,241,223,229,228,219,221,247,215,241,229,223,222,236,239,223,235,221,232,240,238,200,227,242,236,245,220,244,222,244,235,203,209,229,231,226,248,236,217,209,222,216,220,238,220,233,230,209,225,240,222,122,5,1,20,1,8,0,22,4,9,13,29,2,11,1,35,12,42,28,5,11,13,3,3,29,196,163,185,205,158,179,190,200,192,194,212,194,182,195,194,196,201,183,205,179,221,193,203,195,172,184,191,167,178,200,187,188,182,205,163,207,186,205,166,176,217,166,170,166,211,204,207,194,201,174,191,203,179,183,208,204,211,171,172,194,177,190,218,195,182,180,205,186,203,197,196,179,177,210,215,198,184,211,189,191,201,205,188,180,214,200,192,183,205,205,201,186,205,193,194,199,179,201,205,194,220,200,176,194,200,210,197,181,199,194,184,168,191,178,189,186,190,187,183,204,201,203,200,193,213,206,195,208,194,214,214,187,194,198,204,190,211,203,194,211,215,210,211,196,193,234,210,218,211,176,212,218,204,204,192,214,193,215,204,206,181,222,200,188,200,209,193,216,199,179,197,188,201,186,207,204,203,197,215,209,183,198,191,190,201,175,192,217,200,183,187,211,215,199,192,190,192,179,179,206,181,202,209,188,193,197,189,197,201,210,197,184,184,165,189,196,204,182,189,202,245,220,216,217,211,204,207,205,234,236,216,222,236,216,249,240,246,246,232,245,248,214,234,229,242,240,232,232,230,207,212,229,241,236,210,184,190,219,223,200,177,215,244,235,228,229,225,210,169,190,225,204,192,169,145,167,166,187,174,167,174,161,149,153,150,146,176,147,139,125,141,108,138,107,116,99,97,115,118,99,87,129,98,118,77,108,73,81,78,93,66,58,96,64,38,79,76,72,51,64,82,55,63,59,49,79,78,71,85,57,74,44,71,94,54,81,82,99,77,85,99,98,95,68,113,102,108,67,109,117,137,118,105,81,57,57,79,71,51,58,61,50,18,44,65,53,46,85,112,70,78,68,59,52,51,22,58,76,89,53,87,41,66,27,38,79,58,28,54,54,90,105,58,44,49,20,59,108,63,60,40,91,58,36,71,77,45,46,88,70,88,100,59,51,67,58,50,115,70,65,77,5,67,31,67,135,131,170,88,54,31,16,58,87,98,80,28,43,22,43,62,44,28,17,16,60,42,38,40,40,167,245,245,252,251,252,156,117,90,68,62,76,128,168,163,154,177,141,149,150,109,133,96,29,36,38,46,25,32,31,9,37,10,29,55,33,1,22,8,65,27,9,14,55,69,95,73,56,37,43,87,59,50,70,93,87,67,34,33,31,8,83,56,100,88,63,44,42,6,12,19,15,9,45,29,43,37,40,58,22,62,12,111,192,92,128,162,220,243,169,164,138,112,112,111,76,59,41,9,52,44,37,37,28,45,1,44,39,28,33,41,35,28,16,23,15,14,40,20,6,45,11,25,8,20,26,42,41,59,73,56,29,37,43,47,31,26,18,35,31,34,28,5,4,49,15,42,38,29,15,13,18,26,15,20,0,33,8,14,20,21,55,40,26,58,102,108,127,103,87,114,84,99,62,67,53,70,69,51,11,167,253,247,233,217,243,225,222,247,234,221,209,227,235,218,249,232,224,234,224,232,237,231,241,244,222,241,245,223,236,235,215,214,232,224,218,226,235,237,236,225,240,239,222,227,239,218,240,244,242,239,237,234,241,214,228,218,230,226,231,236,232,248,240,249,121,17,0,12,23,22,4,17,10,22,3,18,49,0,10,8,15,18,33,7,7,33,2,13,2,183,197,198,212,185,191,175,213,175,186,185,192,192,182,172,194,190,201,173,197,169,186,195,204,201,180,156,196,201,173,184,194,197,176,194,190,185,192,198,162,193,194,189,184,153,194,175,189,172,193,198,200,225,170,187,192,177,185,181,183,177,212,182,199,177,183,160,212,179,212,181,185,207,193,201,188,181,202,195,188,226,190,178,222,191,204,208,170,210,200,214,183,182,178,193,173,179,195,187,191,194,212,209,218,207,159,204,199,194,187,204,201,198,179,225,206,196,214,201,196,188,206,208,178,185,179,207,205,199,185,183,174,175,183,175,222,199,219,198,192,232,209,210,201,223,198,214,191,185,206,193,203,203,206,204,209,210,189,193,210,192,203,207,208,194,208,182,186,192,175,194,217,210,177,186,199,189,212,177,198,193,204,193,200,176,181,190,197,193,204,206,192,214,189,201,199,194,191,189,180,187,206,187,185,179,233,206,206,178,204,183,195,202,188,187,203,211,196,191,237,242,243,254,254,229,229,248,231,250,227,244,227,243,223,239,233,239,226,204,207,212,208,222,223,190,199,212,166,177,192,187,150,129,137,137,127,131,142,179,126,145,167,144,130,112,104,77,84,118,75,92,74,78,50,82,96,55,65,92,66,77,64,73,85,34,43,44,46,44,63,67,47,60,14,63,72,41,74,50,34,54,130,97,65,68,68,61,77,84,84,81,87,51,96,77,66,81,49,79,67,82,50,60,77,40,52,51,40,57,90,43,74,49,38,65,73,86,52,70,117,107,70,97,55,75,98,83,88,52,46,93,66,81,81,62,108,85,82,82,70,73,86,52,74,58,105,75,71,100,80,87,90,53,84,95,81,89,59,58,56,77,37,33,69,32,70,62,57,48,71,73,104,60,62,46,33,74,109,63,95,47,117,49,33,49,58,50,36,51,59,70,45,19,45,65,95,58,130,64,21,18,29,84,84,21,28,127,119,179,126,22,20,7,75,89,70,21,12,26,45,77,36,20,18,23,42,69,44,26,121,243,243,239,230,255,168,135,153,150,112,43,82,85,96,121,117,134,119,154,114,77,116,56,16,20,19,13,18,16,45,41,45,36,49,24,29,4,5,7,9,20,12,15,23,46,49,75,43,51,69,45,41,66,50,89,120,101,102,68,90,92,118,125,79,24,32,13,22,27,23,71,80,44,23,30,40,46,61,53,52,85,37,119,164,114,173,190,240,224,173,158,154,97,62,32,18,61,37,16,42,41,52,35,35,68,54,32,24,57,43,40,12,43,50,48,34,12,29,10,25,22,18,7,8,37,62,52,54,101,69,73,54,44,43,19,24,50,34,33,27,45,35,26,56,21,55,39,31,12,19,57,26,27,5,5,28,7,0,16,2,7,22,24,25,27,23,54,83,84,103,79,106,79,69,42,76,51,68,68,71,183,241,255,251,226,227,235,246,229,228,220,237,213,232,251,235,224,231,227,242,226,238,227,228,213,242,230,224,245,230,231,233,233,241,241,235,232,229,230,234,230,240,236,216,229,226,236,247,227,231,215,229,235,224,246,239,238,243,231,236,226,216,229,247,224,124,25,6,7,2,3,4,14,1,18,16,13,5,23,9,16,10,9,4,26,9,43,7,18,18,196,175,183,187,209,197,197,172,208,188,168,184,191,193,184,174,174,179,182,176,205,174,195,202,202,160,177,191,187,184,203,189,198,174,178,197,194,167,177,198,183,177,202,211,175,190,200,195,188,175,196,203,187,181,205,182,192,172,197,211,201,172,200,207,190,192,183,201,185,188,178,201,216,205,201,203,196,195,204,197,198,201,191,199,202,220,209,213,212,205,190,197,206,186,193,203,195,203,216,195,206,190,193,176,225,188,194,198,196,200,198,212,208,195,197,216,212,224,219,187,203,200,173,221,206,196,181,189,213,218,189,179,175,192,199,226,201,180,187,209,203,198,189,197,198,210,224,200,162,174,216,221,180,193,219,209,200,198,171,216,199,203,199,196,203,189,194,191,195,195,181,202,171,214,169,187,201,193,193,185,183,208,207,193,196,183,235,208,182,218,184,191,195,191,199,183,203,201,190,190,199,202,191,201,192,226,201,223,188,199,172,180,172,223,201,192,193,188,195,188,193,200,204,189,204,202,180,163,162,183,148,179,146,152,142,162,127,122,125,97,135,106,109,104,102,106,106,120,96,131,174,73,56,64,71,105,90,71,88,63,45,70,67,59,57,35,69,59,28,71,63,83,47,45,76,85,70,75,80,36,36,49,25,57,55,53,45,62,57,57,64,56,42,62,61,39,53,38,72,48,62,107,102,48,56,62,58,68,67,102,96,100,109,142,86,71,70,41,69,58,89,39,51,84,79,67,29,65,49,53,33,62,63,62,56,39,75,51,51,51,54,49,59,37,62,49,56,44,42,95,55,51,37,80,57,43,42,66,50,71,63,52,44,13,45,70,57,62,30,60,48,52,75,82,65,63,53,47,57,16,56,55,52,42,45,126,64,41,24,56,33,109,77,78,32,3,63,100,72,105,85,86,72,26,64,39,37,37,57,26,63,69,52,31,61,53,56,64,73,21,14,15,80,60,63,24,45,55,84,115,78,36,46,47,46,37,26,33,44,47,20,23,21,57,62,53,73,35,117,206,247,252,253,254,168,163,200,255,252,213,77,13,12,4,15,38,58,87,130,88,89,123,43,20,34,30,27,32,35,28,27,23,30,22,57,45,22,27,18,15,25,24,23,35,14,45,38,61,54,59,53,64,75,86,85,92,109,116,96,110,126,109,92,74,60,8,22,22,23,65,57,115,81,100,64,63,71,59,57,42,62,38,128,138,111,184,235,244,189,131,150,116,50,28,48,59,71,50,32,17,40,44,60,69,36,63,22,48,53,16,44,38,67,18,36,43,29,22,22,38,26,11,32,33,33,50,71,84,71,65,28,13,17,25,51,52,49,37,52,27,28,44,43,40,14,37,48,47,25,27,27,18,19,4,42,25,24,41,44,23,29,14,15,18,41,25,34,20,18,59,65,79,105,110,81,52,68,106,97,100,160,223,244,234,244,229,245,245,197,220,247,244,247,251,237,216,241,247,221,219,231,212,234,228,244,243,235,244,240,243,238,218,229,217,233,229,212,237,219,238,229,236,226,210,251,240,242,222,224,242,224,240,226,230,234,214,221,223,224,231,243,219,241,244,220,112,2,8,5,13,9,0,12,23,27,15,2,5,10,18,2,10,21,2,6,17,2,10,16,6,222,181,194,190,196,195,182,194,186,186,193,170,188,195,158,191,219,192,190,172,191,180,210,188,178,202,201,204,166,182,202,161,193,195,187,192,177,193,200,193,211,182,176,196,182,193,177,193,180,175,196,182,201,194,183,198,207,178,179,185,203,178,198,173,169,213,211,206,199,182,210,190,169,184,182,198,201,199,203,205,200,193,203,193,208,185,197,210,195,227,208,209,213,180,188,186,192,197,199,196,195,188,205,216,214,216,179,182,209,202,196,212,217,210,214,198,192,227,214,215,213,189,194,198,202,189,197,221,187,219,195,205,194,188,182,201,210,205,205,178,202,205,209,222,196,208,193,196,191,179,199,178,203,214,198,205,208,221,201,214,205,181,199,216,201,223,212,195,197,205,185,201,193,180,175,191,181,168,203,207,180,185,200,183,188,187,188,176,189,194,178,195,199,192,213,189,207,194,203,183,176,187,193,172,190,203,195,194,207,185,179,197,187,199,183,207,218,123,75,62,47,42,37,44,35,37,56,28,48,40,27,44,24,31,24,23,21,37,22,52,38,26,46,12,23,47,36,20,53,105,112,84,9,29,53,52,44,43,51,52,61,28,62,38,53,54,78,35,56,29,64,77,46,65,50,45,56,58,57,54,57,60,82,46,79,66,70,39,28,36,41,68,70,71,42,73,30,57,45,55,16,77,42,32,52,43,44,29,52,91,65,38,39,41,78,67,47,41,37,68,58,37,62,45,36,25,31,20,21,35,42,21,28,60,60,32,65,41,31,32,19,19,29,28,24,12,54,39,39,32,48,28,30,43,65,40,7,23,54,21,40,42,11,50,27,33,84,28,63,61,62,56,44,43,44,31,35,44,32,53,49,10,39,13,18,56,86,35,13,28,14,14,48,45,27,2,41,51,73,116,74,104,110,31,66,30,48,47,9,35,68,48,28,1,10,46,50,59,23,40,19,12,56,13,0,28,38,5,42,50,108,70,40,60,17,21,58,65,39,47,18,6,44,59,53,80,21,94,208,245,252,237,245,196,170,204,243,234,254,185,67,13,0,22,26,65,42,25,97,122,84,122,28,28,32,14,58,7,21,31,12,41,58,42,50,65,62,27,7,16,15,14,56,44,17,41,14,42,34,32,62,68,55,73,78,85,71,104,110,117,120,121,143,93,66,22,19,26,63,104,43,118,157,142,81,41,23,37,43,19,63,51,131,163,174,237,223,244,171,109,135,93,48,46,137,95,85,57,35,29,40,10,28,51,70,64,31,59,59,80,35,38,48,52,22,33,43,22,9,15,14,36,35,17,33,39,35,43,52,63,44,17,36,20,32,45,55,60,44,61,45,57,55,49,49,44,61,46,60,42,61,38,43,31,39,28,17,26,29,17,43,20,30,18,17,16,23,17,4,56,73,98,85,85,89,121,146,163,143,115,156,196,232,245,244,241,234,243,235,239,243,238,232,237,234,208,206,228,248,216,233,237,244,239,247,248,240,246,225,205,234,237,232,235,232,219,229,235,208,248,209,217,245,206,249,237,208,237,229,216,228,231,196,221,229,245,227,229,224,222,231,229,221,235,220,96,7,26,26,1,4,17,11,7,6,12,12,52,9,10,21,19,18,17,7,22,8,9,1,19,208,207,213,214,208,168,169,194,196,190,191,197,195,204,185,184,181,183,189,229,187,187,171,194,205,196,170,190,187,221,194,204,169,186,194,177,204,174,180,206,179,181,196,190,185,168,167,190,183,195,171,174,171,199,170,189,189,195,168,192,166,179,197,188,203,194,186,203,204,206,194,224,206,202,185,166,154,196,199,193,199,169,212,193,221,186,207,202,172,181,221,186,203,192,184,171,181,188,211,204,213,217,205,204,209,214,217,228,206,204,199,216,198,206,217,204,201,205,214,201,222,208,195,202,174,184,180,194,200,223,199,192,175,193,175,185,188,174,207,206,184,195,201,227,200,188,201,191,216,232,199,175,206,203,206,191,214,202,197,179,215,183,191,193,198,189,183,180,201,196,190,192,221,208,205,181,208,173,209,209,181,213,168,205,184,197,199,207,183,176,181,187,204,158,188,183,203,199,206,191,196,168,187,176,207,196,195,195,209,186,201,194,181,176,186,179,184,128,18,1,8,13,8,13,10,20,24,14,7,18,10,3,19,7,30,10,23,16,32,4,26,0,19,18,14,5,18,7,18,36,92,48,38,19,2,35,21,38,13,31,23,26,23,13,29,36,43,20,35,34,32,26,53,55,42,36,41,38,20,40,39,43,25,59,36,39,17,57,59,14,27,61,30,27,14,13,70,24,26,26,3,64,68,17,7,31,4,31,57,21,17,24,33,39,10,30,14,10,12,22,24,24,26,41,22,45,49,19,23,33,33,13,41,35,25,25,33,11,31,27,18,14,24,33,68,39,42,42,22,53,34,10,36,18,38,32,39,27,30,24,29,16,37,59,26,23,74,57,41,46,72,36,42,36,75,27,27,29,25,20,19,43,45,39,78,48,15,17,29,62,75,70,14,41,38,26,49,44,77,102,74,111,94,46,75,44,75,74,62,38,100,98,94,82,79,72,124,112,95,131,107,105,145,117,123,118,129,109,60,182,171,140,98,24,25,10,47,47,24,46,6,41,57,60,47,24,79,210,244,245,244,236,195,170,233,242,237,244,252,138,63,68,34,54,114,66,36,29,136,103,122,105,35,40,39,20,32,33,52,23,18,39,13,32,60,39,82,38,18,27,45,46,26,17,23,28,20,43,40,25,30,29,52,59,103,89,91,99,110,149,140,159,152,134,100,29,10,34,104,101,45,128,152,121,66,36,13,4,29,73,83,48,138,201,225,254,245,237,165,138,157,67,49,141,142,104,112,59,33,22,8,26,0,9,47,71,62,77,53,51,45,49,34,50,45,27,22,52,60,42,21,10,21,29,23,14,12,45,68,51,77,73,75,37,70,52,51,60,34,77,94,75,53,63,59,51,50,29,75,56,65,68,63,59,68,60,50,24,20,16,25,42,13,43,29,34,44,23,40,22,92,78,101,96,108,168,144,158,143,117,90,88,135,216,209,221,216,226,231,232,232,242,225,237,226,217,221,244,226,238,222,227,237,233,197,198,239,243,239,236,234,244,228,243,238,217,235,228,240,210,236,215,223,217,251,234,228,221,206,215,227,237,222,239,236,218,208,234,233,241,218,229,234,222,221,118,2,6,0,29,1,4,18,4,35,4,27,6,36,23,25,4,6,26,8,11,5,2,26,27,189,191,190,176,181,190,174,182,203,184,196,191,160,180,154,195,215,208,202,192,170,194,175,186,191,176,207,183,189,198,207,194,208,221,158,178,201,195,170,200,197,206,199,169,210,210,209,184,189,179,170,173,165,178,192,197,179,197,186,196,200,204,197,205,196,183,183,192,189,201,202,178,184,189,219,169,170,199,202,213,199,191,194,186,210,193,210,188,186,203,214,191,206,208,189,203,201,201,156,207,202,189,209,204,209,217,203,206,194,217,174,204,206,195,216,212,213,215,194,210,227,210,186,209,234,189,201,213,208,212,185,198,207,209,209,204,204,212,200,207,225,196,224,192,185,204,163,193,181,203,174,211,214,191,214,223,194,201,198,222,205,184,195,233,214,204,205,192,187,202,218,186,216,214,206,194,217,212,191,197,195,184,197,210,182,213,217,216,199,200,189,194,175,190,200,188,213,185,185,196,200,186,212,208,185,174,173,185,202,190,208,198,160,188,212,207,220,150,86,58,49,80,55,36,52,70,25,33,48,51,27,29,16,22,26,25,2,28,12,14,40,20,25,8,6,6,27,12,6,2,60,43,9,37,3,18,22,15,56,11,18,39,23,14,39,19,30,38,23,40,16,29,39,27,14,53,3,51,16,69,28,14,18,31,24,26,31,19,2,14,7,9,3,24,15,27,39,22,14,43,18,10,3,29,4,7,11,2,2,7,12,10,19,3,27,5,30,14,22,47,35,67,51,66,54,51,30,29,33,38,62,41,11,28,36,21,36,34,24,69,42,61,71,54,27,17,49,28,29,70,25,62,86,41,30,136,39,130,57,128,66,122,106,156,139,120,134,136,64,110,127,110,103,140,168,144,88,137,168,197,192,212,216,215,169,119,159,247,239,240,228,184,135,198,232,242,201,91,66,72,89,84,105,62,46,50,202,247,188,170,191,249,253,196,171,159,210,240,238,253,245,218,245,248,237,236,234,235,246,255,244,155,50,43,16,34,50,42,27,9,28,43,43,51,42,65,213,237,233,249,255,186,159,215,232,230,251,249,208,89,85,79,72,91,65,97,55,89,132,105,147,89,12,21,31,47,48,34,41,12,28,32,40,40,31,34,55,40,24,53,53,39,70,34,26,42,21,45,69,68,73,61,108,152,141,106,114,112,111,124,91,100,101,72,75,52,50,59,169,101,95,95,57,35,58,43,12,24,22,97,81,64,138,227,232,237,244,227,166,132,154,66,53,156,134,111,101,70,13,18,30,10,2,36,4,43,49,74,42,50,50,44,50,50,36,39,31,44,39,52,49,12,51,47,51,30,57,55,42,45,70,73,74,57,52,76,70,67,51,83,72,48,57,66,61,60,53,57,27,40,61,48,36,41,40,26,47,41,9,18,36,26,45,28,50,10,23,11,43,15,86,98,121,89,116,156,165,162,118,105,77,22,80,195,236,235,229,238,226,226,237,219,220,241,222,196,218,217,226,244,226,247,229,248,226,242,225,222,237,232,239,248,234,248,251,235,235,211,239,222,241,231,237,226,225,236,217,236,221,246,238,242,204,243,210,222,242,242,225,217,229,228,246,241,214,115,0,11,11,26,25,3,24,2,22,9,0,4,7,4,3,17,19,30,30,4,3,2,20,13,213,194,206,219,198,190,193,184,170,193,186,207,194,211,197,192,191,187,200,204,208,204,179,179,194,179,180,195,190,209,184,194,202,199,213,196,206,199,198,214,199,204,181,188,201,187,186,190,197,188,186,187,187,201,208,193,219,185,214,181,182,211,206,183,193,189,184,205,200,206,221,216,217,221,197,198,224,192,183,219,182,197,213,207,196,177,240,180,204,223,220,168,205,196,193,215,204,179,201,206,185,222,210,199,220,199,224,204,193,212,225,189,209,182,215,210,217,199,203,189,195,173,186,216,191,212,205,202,201,213,213,171,206,216,225,181,170,208,210,196,235,201,200,196,211,196,230,193,205,178,209,199,182,192,224,235,188,199,193,184,195,214,204,194,212,198,190,191,177,209,208,200,196,215,210,197,191,201,204,210,202,214,197,182,197,190,181,212,210,189,193,199,177,195,183,204,198,209,219,179,193,181,191,197,202,201,214,196,189,182,209,189,193,180,209,216,192,214,190,228,247,210,236,222,221,201,214,226,243,219,239,207,231,212,218,241,207,216,218,208,212,194,189,210,177,174,193,197,154,135,161,175,174,175,187,205,183,181,166,175,182,163,183,147,172,174,155,167,184,172,160,161,172,179,189,186,203,163,175,197,180,186,176,201,194,219,194,239,216,213,221,222,233,227,232,247,238,219,235,226,215,175,214,221,234,252,239,217,230,206,210,223,203,205,195,230,228,240,251,234,130,126,237,238,180,58,64,126,214,248,240,182,157,199,210,241,247,234,243,254,238,253,231,159,74,61,64,14,53,47,53,207,239,236,236,252,222,246,248,252,243,251,246,255,244,233,246,223,131,119,157,171,113,122,237,210,214,236,252,255,250,250,252,253,163,137,231,244,253,243,240,225,248,251,251,254,224,127,63,51,31,74,74,52,39,67,220,236,251,219,202,223,252,227,136,173,231,240,247,248,247,210,239,255,243,243,245,236,240,253,191,81,45,11,9,40,53,44,7,30,40,68,69,79,7,105,238,243,247,246,165,174,212,249,248,246,234,249,135,25,65,61,60,70,81,101,51,101,138,115,160,80,15,26,33,47,40,28,38,13,24,15,15,13,73,46,61,69,78,59,84,52,97,62,112,141,161,117,129,150,158,166,193,154,151,143,93,108,122,66,36,52,45,52,47,58,72,120,125,75,94,52,53,57,40,29,32,39,99,134,86,54,110,222,252,244,248,255,150,164,147,53,89,157,99,127,74,34,19,36,27,22,25,3,22,12,2,14,17,49,61,42,69,52,30,43,48,45,63,38,61,89,61,74,78,71,68,94,66,60,87,100,61,73,73,51,30,50,42,62,53,81,51,53,60,38,56,55,29,5,47,31,21,18,26,27,18,13,26,17,20,26,35,22,58,28,45,43,39,43,79,91,113,95,97,129,127,134,79,80,146,179,203,246,245,240,240,230,241,240,241,241,230,232,255,242,232,253,230,240,217,233,209,246,235,247,237,238,236,251,241,233,245,242,237,238,238,234,241,230,240,237,234,224,230,245,228,222,226,225,241,235,245,229,221,241,236,245,242,225,232,228,222,244,232,94,31,7,26,33,4,17,14,19,30,11,13,0,13,0,7,10,22,3,27,15,36,10,19,28,177,190,184,179,190,166,179,179,193,183,182,195,215,200,182,191,197,206,205,186,203,199,185,183,178,173,181,175,196,195,202,173,167,182,210,189,213,185,185,190,194,184,200,198,185,201,216,187,188,180,215,199,206,191,185,182,208,200,182,203,203,192,193,189,185,188,205,178,187,179,178,185,200,187,195,175,185,198,207,181,179,186,195,204,215,200,196,194,204,220,219,212,193,189,201,220,197,240,224,217,204,199,196,206,207,206,211,179,219,195,195,212,218,186,194,224,200,210,178,203,195,201,183,209,206,190,205,199,193,212,212,206,185,201,206,198,199,185,205,199,214,205,181,229,207,220,192,185,213,215,208,191,183,184,191,183,190,211,218,218,187,219,188,198,199,195,193,208,196,196,214,211,201,182,166,191,192,194,201,193,177,191,203,192,178,192,204,207,189,186,189,212,180,171,209,190,172,209,178,196,204,171,201,206,192,226,203,196,203,180,183,189,211,172,199,165,177,205,196,235,246,241,252,254,254,227,246,249,249,229,236,236,253,242,255,249,247,250,249,255,240,246,244,245,242,254,244,242,255,211,248,251,254,234,233,238,216,251,249,253,248,250,235,242,233,244,254,232,223,255,246,251,229,254,238,233,246,252,241,247,239,249,248,255,244,255,247,234,253,252,254,254,253,243,238,252,249,255,252,255,255,232,226,245,255,253,234,235,253,253,225,255,238,243,249,237,255,238,249,252,134,133,224,247,221,187,203,251,255,248,242,253,235,230,252,255,249,246,252,239,230,248,238,184,90,81,65,10,16,38,70,219,227,237,255,245,250,237,236,239,252,241,242,221,241,249,255,226,91,64,156,174,70,44,226,242,217,243,234,246,226,193,185,175,136,133,239,246,242,224,242,180,200,248,252,245,177,76,58,89,38,78,54,40,48,31,210,235,202,226,137,149,244,185,155,218,242,221,240,219,213,172,241,227,221,232,239,238,248,215,86,14,6,1,39,59,35,9,6,48,55,56,58,39,81,130,233,223,250,155,155,222,244,254,232,238,241,174,82,51,88,66,53,67,91,59,47,112,111,111,133,38,23,23,18,44,35,36,31,13,23,33,49,8,6,44,36,29,71,61,62,91,142,180,157,155,182,155,163,129,135,183,170,128,100,76,59,74,60,46,47,41,12,20,29,48,61,61,50,60,55,49,76,79,15,34,35,92,169,116,59,47,65,188,240,241,229,192,103,106,84,37,35,101,62,62,67,59,6,4,15,22,28,12,50,37,2,13,41,3,23,38,24,41,18,37,24,24,23,30,33,12,15,43,50,46,44,46,56,83,55,64,64,69,75,51,70,58,67,61,59,61,52,47,48,44,33,15,15,15,0,36,16,4,17,23,24,0,24,43,36,16,31,30,34,33,25,50,49,55,101,97,102,84,56,94,72,71,30,112,228,246,230,235,246,235,238,233,246,240,234,220,246,249,230,223,242,224,233,238,239,232,224,221,211,219,234,226,234,240,208,240,210,231,234,241,230,234,243,239,243,209,243,251,224,226,242,238,240,210,243,230,235,237,238,224,231,215,222,213,218,224,229,206,217,128,9,10,1,4,20,3,43,16,16,36,11,16,9,4,11,0,11,27,6,7,13,25,33,29,205,200,181,197,183,187,203,180,199,201,178,197,196,205,216,219,177,180,186,182,196,184,207,173,176,175,175,181,181,202,170,202,174,174,184,177,197,185,203,180,198,194,175,214,196,186,174,195,190,167,189,199,169,216,190,211,164,196,184,193,196,197,208,182,202,178,174,179,182,193,173,190,206,185,183,180,196,157,191,200,206,213,189,190,208,216,195,206,211,170,196,213,210,193,189,202,213,202,206,192,201,224,197,212,196,217,204,190,200,208,205,209,202,195,197,203,207,203,191,198,174,189,210,216,199,170,212,214,188,199,189,215,207,215,208,186,176,209,196,185,216,214,182,203,194,191,194,208,192,209,204,188,205,196,192,161,215,208,207,191,201,204,189,213,191,189,187,194,201,183,175,199,200,199,211,205,196,196,214,176,187,211,178,185,190,180,202,213,188,181,195,180,208,177,174,169,193,186,189,195,169,167,182,195,180,203,177,209,193,188,206,177,184,201,183,175,180,211,183,218,205,208,215,220,196,218,224,204,189,217,229,205,227,209,238,204,229,238,228,231,228,216,216,223,227,219,233,208,233,221,227,236,243,239,245,233,237,235,250,241,227,247,231,250,243,238,253,221,242,231,253,248,229,246,224,250,238,255,233,235,255,254,254,243,234,255,248,220,253,252,242,231,254,234,232,249,229,236,241,235,242,221,243,234,249,224,236,255,253,253,254,248,244,246,245,240,228,144,212,189,54,82,199,248,254,254,251,242,255,250,229,213,206,223,214,203,200,185,216,190,155,183,188,101,43,78,60,44,52,33,24,167,184,212,215,213,237,219,239,252,255,243,228,244,233,229,237,221,197,188,220,231,162,105,207,180,175,205,196,176,150,213,190,200,124,193,249,236,241,227,242,159,164,233,226,255,168,96,88,87,56,50,87,41,50,70,232,242,171,147,97,169,202,122,152,189,235,224,207,246,183,159,199,185,218,222,248,251,229,85,11,7,11,17,36,54,34,14,37,72,44,58,39,91,185,227,240,242,142,122,223,246,251,253,240,249,145,150,149,142,115,63,46,72,92,100,66,123,143,145,94,24,25,28,28,3,30,36,38,16,24,31,13,24,19,55,19,39,63,45,42,78,151,174,176,183,163,163,124,96,103,129,137,78,101,62,41,61,26,50,17,16,19,16,10,25,53,55,41,29,13,55,41,50,33,53,136,197,172,109,61,28,23,126,245,235,177,100,56,20,34,45,32,22,47,63,43,34,29,17,4,12,15,30,34,17,36,10,26,19,32,27,9,23,14,21,26,49,29,5,27,22,25,44,30,61,59,44,60,83,55,52,30,71,95,47,31,37,70,89,59,49,35,17,15,4,24,22,24,23,15,14,24,3,22,20,6,14,13,26,31,66,32,35,39,69,45,43,41,72,101,98,118,103,67,24,72,65,31,89,227,218,246,242,244,240,229,239,225,229,209,243,246,245,225,234,234,231,222,232,232,230,244,225,246,221,248,236,239,226,218,209,248,239,254,229,226,238,220,220,233,192,203,206,246,241,236,244,202,223,234,223,223,243,214,230,205,245,222,221,209,234,233,235,223,113,3,2,10,17,12,12,9,10,35,17,15,41,0,5,2,19,0,14,8,20,1,10,0,32,194,166,200,172,193,181,194,182,181,204,188,206,189,223,197,191,199,163,190,179,188,178,184,187,191,194,171,217,195,195,189,191,196,195,175,190,178,207,191,174,197,190,181,188,202,204,177,195,178,208,200,205,219,188,191,199,195,193,199,192,206,178,223,185,199,215,224,190,178,188,180,196,209,202,188,201,204,176,195,199,173,192,194,175,164,228,205,198,216,223,180,186,216,195,205,194,215,206,184,199,207,201,202,197,200,204,232,219,192,215,217,206,205,195,201,202,203,205,204,218,212,210,223,209,192,195,192,199,200,209,197,184,167,192,192,225,191,193,200,185,196,182,174,195,216,207,195,195,179,208,203,212,209,210,205,210,207,203,200,208,207,210,209,200,174,184,220,200,207,177,192,211,195,187,184,208,177,199,196,200,194,178,207,214,198,201,183,187,195,194,202,207,173,173,179,185,180,200,214,209,192,189,193,206,175,203,171,194,193,201,185,164,193,199,194,179,189,205,175,197,214,205,187,187,184,189,209,194,187,204,189,187,194,193,187,197,178,190,211,204,197,193,207,202,171,205,207,201,211,183,202,188,207,202,182,218,216,215,208,207,224,200,218,228,218,227,202,216,229,210,208,237,192,233,208,199,204,230,214,215,241,214,238,235,225,212,211,227,208,218,224,226,200,244,233,221,222,196,218,202,232,196,216,232,214,229,203,239,232,219,211,194,220,240,202,219,171,92,159,185,86,70,151,227,177,184,161,148,166,165,177,120,125,133,112,91,106,116,135,110,85,96,114,107,107,165,116,170,154,118,85,65,20,25,57,27,38,43,71,59,123,165,174,141,138,118,141,206,201,247,251,224,244,124,125,122,61,108,161,176,190,232,209,215,133,197,218,233,239,217,248,162,162,227,221,252,156,104,63,66,50,56,101,66,36,56,228,241,178,138,107,180,167,61,101,168,198,193,219,214,146,193,202,180,213,233,241,235,154,39,20,26,8,37,39,43,23,9,69,50,61,33,56,211,248,196,242,157,145,222,250,247,249,242,227,143,165,234,201,147,115,78,32,26,56,34,54,127,89,122,97,32,36,30,42,68,56,29,16,17,42,29,13,16,33,14,23,51,32,45,61,47,92,100,135,207,129,97,67,72,89,114,130,80,114,86,58,63,8,38,15,34,18,30,15,9,25,14,22,30,40,9,27,14,14,127,189,184,131,51,61,22,28,80,175,112,125,44,26,93,62,27,45,92,90,61,55,66,42,12,37,18,29,33,23,15,38,25,17,24,30,31,19,23,14,28,26,40,31,11,10,0,34,48,36,49,30,53,67,63,21,34,32,67,79,69,22,10,23,35,33,20,16,28,26,15,9,36,50,22,20,13,21,20,17,44,32,42,29,18,39,62,82,42,51,61,47,17,51,81,73,115,110,111,73,51,91,89,36,59,195,212,216,249,243,243,222,226,229,234,252,217,216,237,236,240,244,223,243,244,250,226,232,220,234,220,244,225,243,233,216,240,244,238,252,240,242,237,231,230,235,233,246,226,217,226,235,233,224,227,247,241,240,238,222,234,224,232,228,246,243,223,224,226,227,115,30,14,11,34,15,15,2,15,28,19,16,31,25,8,0,23,31,20,5,24,18,23,17,18,193,193,184,212,197,167,191,186,152,187,193,181,170,174,209,192,186,195,174,174,199,188,201,195,193,206,181,186,207,211,202,173,203,197,219,177,196,195,218,174,201,182,192,198,192,188,162,196,206,174,207,174,193,195,187,203,202,178,182,204,198,183,193,168,189,215,193,199,207,201,190,197,185,184,205,209,187,199,176,207,215,207,223,191,202,203,191,199,205,191,191,183,189,202,211,190,183,182,207,218,197,204,205,188,214,208,206,189,215,193,204,189,212,188,199,191,188,187,180,207,204,222,158,201,184,189,188,202,220,179,193,218,198,182,209,197,199,210,209,182,194,204,196,201,219,202,196,200,195,211,207,205,188,190,182,216,201,168,191,191,208,214,196,196,193,212,172,204,191,195,189,193,177,191,186,222,188,204,209,190,180,179,185,191,199,188,183,192,199,171,206,197,202,186,189,201,209,176,198,193,180,172,207,202,172,201,192,193,182,184,205,182,180,198,174,213,190,167,190,201,211,215,179,180,191,192,198,205,186,188,195,198,209,192,198,193,196,194,207,185,195,206,176,206,195,192,188,212,196,183,181,194,214,210,199,205,197,194,200,226,225,199,197,206,198,221,198,211,171,203,210,201,197,196,206,196,223,209,195,199,213,207,217,221,205,231,213,218,222,212,224,202,222,204,219,200,213,231,221,224,225,186,193,238,195,247,216,201,207,207,223,209,204,190,209,238,181,107,171,155,104,86,116,119,99,103,76,95,69,81,94,93,101,100,104,133,83,143,174,106,115,137,168,147,161,200,198,192,194,183,130,59,44,22,25,17,22,28,0,1,27,21,23,11,12,18,49,73,69,145,192,182,202,110,120,127,110,170,236,202,255,206,183,168,109,223,221,249,228,213,221,163,162,202,206,252,164,102,60,72,62,34,108,73,34,55,225,234,135,192,194,228,129,107,172,168,245,211,230,208,159,209,195,166,222,253,229,198,52,19,19,2,24,56,28,30,24,52,57,71,23,35,177,239,210,214,173,112,181,250,251,255,250,245,139,138,215,245,244,161,71,72,90,31,43,24,67,99,103,105,68,11,46,42,25,45,23,14,18,32,8,0,25,34,17,47,13,25,6,24,31,48,44,19,108,179,113,82,80,89,85,105,54,36,54,57,58,45,12,29,28,28,44,24,9,11,25,7,17,23,50,37,33,18,25,81,147,98,64,68,25,19,25,30,61,92,49,8,60,140,92,107,211,241,181,133,129,112,55,24,14,45,6,22,4,49,31,22,23,26,27,22,41,24,40,25,15,54,33,47,38,34,36,17,26,24,48,26,57,33,55,20,20,72,77,56,27,19,34,34,19,18,15,20,39,25,54,8,7,31,28,29,38,24,53,133,83,48,43,30,74,85,63,87,59,67,54,18,41,99,107,80,94,100,58,68,66,70,50,68,175,237,249,255,227,231,243,238,239,245,225,230,241,251,252,231,230,234,233,223,226,228,252,242,247,236,229,218,243,230,241,235,240,239,237,241,240,222,219,235,242,242,248,233,236,233,225,236,217,228,233,244,217,250,224,215,217,231,228,240,242,219,220,230,238,119,0,23,21,2,7,17,18,7,1,5,26,19,10,4,8,14,24,12,19,2,7,34,32,9,177,173,191,184,201,204,180,186,192,195,176,193,190,178,204,167,163,204,156,194,218,186,195,196,174,183,179,180,193,203,193,176,194,195,192,168,189,188,205,197,202,186,183,193,187,220,209,210,193,216,195,197,187,192,181,188,187,173,199,190,189,208,204,180,173,213,193,205,210,188,214,198,199,196,203,209,187,216,179,177,210,202,198,206,185,208,192,196,172,201,199,194,191,203,202,201,189,188,194,193,222,188,205,185,228,219,196,215,204,216,216,197,188,222,182,201,194,199,198,177,207,198,191,207,185,204,181,204,199,216,195,205,186,209,205,175,208,180,213,198,213,180,215,205,193,189,184,195,178,187,193,210,216,196,169,210,158,211,212,185,171,200,190,176,193,200,202,184,194,197,183,199,187,186,198,183,170,191,205,179,204,184,211,206,189,192,221,191,214,197,178,191,183,182,200,185,206,194,208,167,188,178,184,193,174,207,198,224,191,169,181,166,177,188,186,196,202,191,178,173,206,192,190,218,185,192,215,193,205,208,200,179,214,182,194,203,187,215,208,180,210,206,212,189,173,199,171,205,225,211,202,219,211,212,214,223,197,197,197,199,233,182,205,194,207,202,195,210,216,194,198,234,217,205,199,215,197,185,202,200,226,192,213,213,185,217,194,227,217,190,221,226,200,223,205,212,201,198,243,198,204,186,203,240,229,219,218,204,211,207,198,212,206,211,242,233,204,140,125,127,82,88,99,94,86,112,105,127,97,72,77,65,82,149,157,206,196,192,204,152,76,137,241,199,159,192,178,125,154,149,98,80,157,217,206,205,181,129,128,141,64,11,7,25,107,77,32,39,22,36,36,32,49,40,106,144,115,244,255,222,254,169,164,154,113,216,240,243,234,193,237,188,180,220,213,255,156,87,73,48,60,22,104,83,42,38,191,187,138,210,206,196,97,143,215,191,249,208,210,173,147,210,190,179,231,248,193,71,3,31,9,12,41,26,27,23,41,57,81,32,24,167,247,245,230,158,145,171,242,246,237,248,245,194,136,217,242,239,193,101,105,109,118,45,38,16,111,135,80,142,56,25,59,51,25,53,34,11,16,19,16,19,28,12,31,24,12,11,48,27,59,24,54,64,141,121,54,70,61,66,55,58,49,42,40,77,34,28,63,27,63,15,27,30,7,30,43,20,14,2,25,16,28,28,39,79,85,45,61,58,15,21,3,5,59,119,59,67,124,155,74,91,183,223,194,90,155,113,71,27,43,45,38,42,11,43,20,35,39,29,20,19,33,57,10,8,45,73,69,57,95,42,45,28,30,49,28,37,49,37,18,19,53,57,82,80,73,51,55,31,26,12,27,28,16,35,5,31,12,14,37,6,15,72,198,172,124,101,60,49,66,73,93,66,77,50,55,45,38,78,103,106,111,112,85,58,78,99,35,41,200,227,215,238,236,238,220,206,245,246,243,222,242,235,251,244,224,232,223,236,248,236,230,251,221,230,217,223,240,225,237,235,232,219,244,230,243,226,238,222,214,225,224,208,240,234,248,219,221,227,196,246,231,234,218,227,249,221,242,217,226,226,241,247,229,111,0,1,4,8,0,11,35,11,1,14,6,7,6,1,2,16,4,9,12,9,56,16,21,20,189,185,174,175,208,197,186,201,207,173,196,185,202,163,196,188,172,177,183,194,192,198,194,205,210,202,181,183,183,173,191,207,176,195,174,202,164,209,203,177,195,195,193,228,180,170,197,209,205,190,208,197,183,198,180,183,215,209,181,196,192,192,198,203,204,202,179,178,210,226,200,192,178,191,182,208,187,188,204,187,180,196,181,189,204,197,218,175,187,189,199,203,189,208,208,168,211,205,216,206,212,215,213,201,219,200,207,191,204,194,209,218,219,210,200,212,215,178,179,230,176,178,221,203,205,193,179,212,187,201,189,208,186,205,176,195,195,198,198,176,186,205,187,169,200,215,212,210,211,197,164,203,188,189,193,183,204,204,178,192,200,181,218,164,203,190,200,204,201,193,207,190,173,200,185,195,186,205,182,184,175,191,210,182,198,183,174,186,185,195,194,182,162,177,186,192,179,209,195,173,196,189,191,217,202,198,213,210,187,197,183,201,187,207,190,206,183,200,197,206,196,193,198,177,187,206,187,200,165,208,216,213,187,202,200,211,200,229,195,201,202,194,191,195,204,202,202,201,207,185,198,223,218,225,211,197,199,214,218,214,229,191,180,206,216,185,212,204,206,212,223,193,217,186,203,219,219,211,207,198,219,206,219,205,222,200,218,205,206,221,213,210,219,209,205,210,205,204,223,211,233,201,198,246,212,200,204,207,210,223,217,207,207,212,233,185,135,96,95,75,70,59,73,47,71,147,211,253,204,94,184,187,219,250,254,239,239,213,221,148,88,129,198,173,103,138,108,125,122,93,82,100,229,252,237,238,236,244,208,194,108,108,118,232,232,216,179,94,54,71,50,35,20,25,44,30,61,166,233,225,178,120,176,154,161,229,232,238,225,180,240,188,155,210,195,251,153,79,51,47,70,47,115,134,69,51,121,163,157,246,206,111,80,142,211,205,252,214,234,147,162,223,196,191,238,240,109,7,17,24,16,32,42,33,29,34,62,77,28,29,138,230,253,249,172,111,206,220,250,246,236,238,192,143,204,246,212,246,115,104,107,92,82,30,89,103,128,153,133,139,74,9,19,10,44,40,25,26,15,41,9,13,12,23,15,32,31,20,38,18,54,25,66,80,112,77,66,36,62,42,29,36,24,33,33,26,14,40,37,67,41,47,21,42,22,39,19,7,38,29,34,17,32,12,74,103,78,34,59,11,32,9,19,31,58,137,115,108,140,140,80,33,52,63,77,43,60,77,77,36,72,50,66,66,32,44,9,26,28,26,32,30,48,43,15,45,38,68,59,20,48,32,38,43,71,45,32,48,50,93,72,67,71,106,87,82,81,88,72,75,43,44,29,58,24,36,34,35,11,18,39,72,86,116,146,109,94,146,132,97,73,68,82,75,50,22,47,11,54,94,81,74,103,94,76,83,66,83,49,19,214,233,252,246,238,242,234,212,251,245,222,227,227,238,248,239,228,238,241,222,246,239,238,236,224,247,210,229,225,220,244,242,238,241,245,234,239,246,231,226,226,217,217,241,212,203,242,227,229,244,216,233,249,240,240,222,208,230,248,218,240,239,232,222,230,111,0,5,3,0,24,8,37,11,6,6,11,8,4,5,6,2,6,9,14,31,6,17,30,0,193,189,194,189,192,195,177,187,187,205,206,164,199,209,186,180,201,182,204,196,173,181,189,190,178,202,191,184,192,185,222,187,191,187,204,178,188,203,203,191,170,175,178,193,207,208,187,177,167,188,197,203,191,205,204,199,167,198,217,179,190,203,194,207,184,206,197,189,196,214,193,178,195,198,197,192,188,194,191,209,216,195,185,216,211,192,196,189,181,197,219,199,175,191,188,211,196,208,205,214,195,219,217,189,203,202,184,187,194,208,215,221,210,221,194,199,223,197,189,188,185,196,197,186,192,213,203,188,210,189,201,194,206,181,190,209,198,196,215,189,174,196,195,182,194,206,182,188,167,187,174,193,192,203,181,196,190,185,190,174,198,198,181,191,196,188,198,201,196,224,166,181,183,219,186,192,196,196,203,195,197,213,192,205,167,175,190,192,192,194,208,197,158,190,180,227,187,194,182,194,191,200,163,186,184,210,210,213,173,211,200,198,207,183,197,208,186,179,190,198,182,185,202,181,172,192,205,199,193,194,212,192,220,219,202,204,216,208,199,196,194,180,209,192,225,197,205,207,202,203,200,198,180,220,206,194,203,233,190,214,205,217,227,190,199,207,188,222,186,198,234,213,231,222,194,206,209,207,195,220,213,201,210,220,210,202,223,205,218,234,218,208,221,225,214,217,205,226,224,209,233,223,228,251,227,248,226,201,216,193,214,221,203,181,229,183,96,83,65,56,45,34,35,35,76,134,201,222,204,197,248,255,238,230,247,239,184,107,123,125,59,58,77,69,77,62,50,36,29,63,41,55,78,100,111,112,156,181,137,88,62,66,90,211,243,226,175,151,143,107,112,130,162,107,54,50,39,67,143,214,194,170,202,152,159,231,227,230,227,221,235,185,180,204,238,252,159,93,87,69,62,30,90,118,93,86,133,122,167,239,179,140,124,221,211,217,229,207,211,176,193,226,204,233,229,138,10,27,26,28,42,54,10,17,35,43,58,43,8,103,242,252,255,219,88,138,250,247,239,227,243,167,126,194,243,238,250,176,67,110,72,32,17,20,51,103,122,138,122,151,62,37,46,35,40,24,60,46,31,25,26,16,32,10,41,13,42,46,16,38,54,22,49,110,76,83,30,78,53,17,34,32,26,16,4,28,30,53,52,38,46,52,58,10,2,32,27,37,27,13,32,38,29,29,86,115,85,40,32,10,25,53,44,50,71,124,161,110,61,109,104,74,51,19,31,71,43,63,65,70,97,94,66,55,41,32,38,24,4,13,41,1,23,32,21,36,58,55,46,28,25,35,46,37,93,71,68,96,87,128,138,102,87,48,61,57,72,68,81,82,108,118,84,96,80,71,41,37,30,53,65,62,131,84,40,44,56,45,67,29,63,33,70,38,57,53,31,33,44,88,89,71,93,90,95,92,74,71,20,46,216,242,250,250,237,211,241,230,238,239,226,239,219,228,231,223,224,238,237,218,246,231,250,224,230,209,223,236,231,207,249,209,247,219,219,227,211,212,241,247,222,213,234,232,201,234,225,220,226,248,232,220,230,244,241,237,226,231,231,206,248,232,249,225,220,119,1,0,2,27,32,13,18,38,17,28,10,1,2,7,5,10,35,6,1,0,2,10,7,17,196,179,180,202,194,190,178,186,198,168,198,184,192,181,181,181,161,193,215,206,180,175,185,200,184,166,176,175,202,195,209,194,212,179,185,175,195,191,221,194,171,175,202,207,186,191,203,189,205,201,200,210,196,182,188,198,205,177,198,196,172,190,208,188,202,196,183,206,200,195,188,201,197,186,206,210,181,205,183,200,176,184,190,176,195,181,184,223,201,196,184,206,208,178,205,191,222,206,204,210,200,204,201,204,206,230,222,214,185,192,205,178,189,190,194,200,182,192,197,199,206,178,220,183,198,169,178,211,200,184,209,194,182,193,181,199,211,188,183,179,182,199,198,190,173,192,196,208,205,213,200,195,189,212,190,178,220,153,184,193,206,203,183,193,213,219,194,187,187,199,219,195,204,203,217,192,165,197,187,191,188,191,174,153,172,179,195,171,172,192,179,165,210,202,198,203,200,202,172,210,200,209,193,189,203,191,192,186,183,201,185,190,166,180,190,217,202,176,167,173,182,188,212,187,206,189,192,208,195,203,167,191,206,159,179,184,192,213,200,194,186,211,204,178,211,183,207,220,185,202,211,219,201,195,223,218,208,201,220,208,205,202,184,199,190,207,181,206,200,233,219,211,194,213,204,209,211,225,201,244,217,228,217,218,217,222,196,215,222,212,193,210,228,191,211,228,222,199,199,200,232,163,138,222,229,225,214,215,232,205,205,197,230,200,231,152,46,55,11,62,111,84,106,97,70,79,65,86,71,66,136,137,140,142,119,57,14,6,18,33,45,34,0,16,39,23,39,12,12,19,20,19,38,10,34,24,15,25,48,33,39,31,86,81,66,90,89,105,132,124,122,142,186,152,117,84,57,50,171,222,190,215,191,143,195,225,230,233,214,206,228,208,177,203,207,255,148,52,60,53,81,13,63,47,40,87,102,124,157,233,192,135,179,216,218,209,226,203,206,161,188,205,221,200,132,37,27,26,23,34,51,35,18,28,71,47,72,33,83,233,244,250,230,144,141,206,215,245,248,237,177,137,174,236,236,246,172,88,106,109,15,58,108,119,46,89,110,104,132,119,38,15,29,39,25,27,48,28,33,4,26,10,32,11,33,59,15,13,35,68,36,61,149,179,157,97,54,70,79,76,42,6,17,21,27,11,19,28,37,77,97,82,57,43,62,24,48,17,29,31,50,24,35,69,84,83,77,32,31,25,30,78,63,59,75,78,160,129,106,92,95,71,91,86,62,29,61,83,68,33,83,90,57,47,45,40,33,12,18,6,19,14,31,33,44,39,41,60,12,10,20,26,58,69,85,106,109,105,59,97,72,48,44,33,44,49,30,76,72,95,70,90,139,105,125,108,82,59,71,127,135,107,66,53,23,28,29,40,25,34,37,44,38,59,37,52,25,35,59,69,99,86,96,96,101,63,24,53,58,137,255,244,240,251,221,243,245,226,243,228,233,227,220,237,250,234,255,213,237,246,228,243,232,234,251,230,247,241,230,248,223,240,254,227,244,215,238,222,226,230,243,228,247,243,221,220,241,232,245,232,205,237,227,227,231,211,213,240,239,249,245,240,227,219,249,89,13,1,18,4,10,2,20,14,21,0,4,3,0,18,8,2,22,12,16,19,4,25,25,8,195,176,199,181,167,165,193,168,179,188,182,195,200,171,173,184,211,183,184,190,165,200,153,152,216,166,195,169,199,198,172,186,187,192,185,185,179,194,201,190,199,203,190,191,189,194,201,182,164,227,193,216,212,188,188,176,190,195,199,206,191,196,185,204,203,227,166,185,185,219,180,184,200,209,183,198,201,187,179,188,202,182,193,202,182,201,194,196,203,215,199,214,187,194,193,197,207,216,213,205,196,189,201,193,202,193,190,190,195,197,208,207,210,225,190,177,206,186,195,209,188,205,189,197,189,176,188,209,203,190,188,172,168,189,191,184,184,192,205,193,211,187,176,181,204,196,159,177,205,195,195,175,200,189,206,205,189,197,179,196,207,174,193,180,185,190,203,217,208,188,190,208,196,216,196,198,185,199,195,160,175,218,192,186,184,192,189,193,199,179,206,180,216,196,196,195,198,181,205,179,195,212,185,182,177,190,180,189,192,203,166,191,199,182,204,194,185,181,194,170,174,199,189,231,219,186,183,170,206,177,182,202,221,185,178,171,176,194,211,187,200,211,197,183,185,192,199,187,202,206,189,223,187,191,201,193,191,228,192,208,193,200,191,240,199,215,194,198,218,241,219,224,213,215,199,223,206,192,218,210,208,219,230,226,200,223,202,217,217,204,230,190,229,203,204,220,204,210,206,209,205,97,34,146,217,200,217,206,212,212,205,206,215,210,191,157,50,48,63,101,98,44,79,70,65,52,54,34,26,42,34,8,29,3,60,50,10,28,27,8,30,3,7,20,16,18,11,21,42,46,19,32,39,16,9,31,15,24,15,41,16,48,50,73,50,60,54,52,88,42,88,65,65,87,81,71,26,50,186,247,182,203,180,143,185,232,233,216,234,201,235,202,162,206,213,254,137,78,103,72,46,20,24,65,21,34,96,133,223,238,172,61,142,202,174,229,255,221,179,150,193,222,223,153,67,0,5,19,29,43,23,13,23,83,66,36,13,68,205,238,251,227,127,178,195,232,233,242,239,185,123,177,207,239,251,189,95,73,108,126,24,89,128,61,86,89,98,128,144,99,25,30,51,37,58,4,31,34,19,31,45,24,15,20,34,11,37,58,70,36,68,171,190,163,153,122,122,106,107,57,6,41,9,10,34,24,32,29,45,43,67,79,56,71,32,41,3,13,15,22,47,21,36,42,51,43,16,21,37,46,33,54,38,21,60,64,106,162,113,115,96,88,103,99,64,64,73,65,83,89,55,31,41,34,38,36,32,27,19,39,14,30,44,33,15,21,46,32,28,26,33,36,54,70,55,62,35,32,21,38,38,32,39,9,41,55,27,57,56,50,69,70,45,103,119,106,125,102,120,128,84,84,67,18,38,23,32,27,22,5,43,43,72,67,44,59,55,28,31,101,119,103,84,115,111,59,19,65,110,181,248,250,245,243,236,246,213,241,217,239,217,232,234,235,246,240,239,233,247,252,237,242,236,225,232,209,219,240,222,251,231,228,241,215,218,240,221,226,239,229,238,218,224,231,222,248,235,237,236,240,231,213,213,232,243,219,221,238,237,239,235,245,249,237,242,135,8,5,37,0,1,10,11,14,8,10,0,24,14,13,9,3,30,37,6,9,12,21,8,13,186,172,191,194,190,179,208,164,169,185,183,199,170,187,178,166,178,199,193,194,218,183,190,217,181,197,214,183,197,184,179,177,172,175,199,172,180,188,182,192,190,183,172,182,196,197,190,221,204,181,184,198,188,160,165,198,156,206,209,185,178,186,195,197,175,187,211,189,199,209,197,195,191,190,206,202,184,191,209,182,199,174,206,181,192,199,189,194,188,187,200,182,191,161,176,211,208,168,213,203,203,201,186,207,191,194,197,226,218,232,199,208,219,215,209,196,196,183,198,187,208,188,184,195,216,173,219,189,214,174,190,205,174,211,208,194,147,193,181,189,213,214,182,197,201,193,198,181,181,204,214,185,196,198,204,178,198,193,193,211,173,185,203,188,202,196,188,179,189,173,184,174,185,211,194,190,164,200,199,171,201,196,182,176,200,179,200,179,214,163,186,176,192,203,186,183,174,185,175,174,216,196,170,164,204,178,176,182,196,193,185,191,176,186,173,208,188,225,173,189,193,187,178,176,182,190,194,202,165,198,191,195,225,213,207,209,178,216,187,186,189,204,202,192,186,212,216,203,202,216,207,189,194,197,201,209,208,201,225,218,188,215,195,225,214,210,177,199,213,221,204,213,216,206,210,208,205,220,228,204,237,228,194,200,242,207,225,206,184,213,222,231,227,222,206,211,214,206,224,214,185,75,28,89,193,205,212,197,203,201,219,198,199,211,211,183,124,75,44,87,32,49,52,49,54,75,56,37,29,23,34,44,60,19,72,45,23,15,12,36,10,19,50,20,10,12,26,18,13,35,26,13,10,39,10,18,26,30,19,14,14,32,24,63,87,82,37,48,62,61,75,25,48,48,63,69,15,76,197,200,138,214,174,170,211,209,248,218,239,200,216,237,167,190,238,250,173,71,81,59,41,11,106,84,31,41,132,209,248,253,146,70,170,211,180,218,238,215,130,158,230,225,203,74,23,11,22,4,42,42,28,31,24,74,71,57,45,160,239,244,224,158,158,235,183,224,255,225,188,135,160,229,252,217,246,137,66,102,95,95,38,63,82,32,87,82,82,103,144,76,26,34,35,28,44,48,25,29,26,10,27,25,39,22,17,33,107,132,148,118,156,205,171,145,129,128,135,90,67,37,33,51,22,33,30,25,43,39,75,64,75,93,94,86,29,25,12,24,44,50,29,37,13,11,41,3,29,26,16,26,17,20,21,52,70,50,65,92,65,126,126,130,84,111,116,120,82,64,79,58,56,55,50,38,18,52,29,29,36,28,28,33,23,17,53,17,15,71,67,45,45,40,14,24,24,25,2,13,40,26,23,18,33,40,27,41,53,49,33,64,18,51,47,84,60,88,63,66,89,67,76,31,42,47,16,17,32,28,33,37,70,84,77,79,63,66,50,52,51,92,132,97,69,101,109,76,56,116,107,90,147,227,223,246,249,255,229,244,247,230,228,223,247,212,238,222,237,218,237,231,251,214,237,236,207,237,231,227,212,228,236,234,220,246,197,229,242,235,237,237,222,214,230,244,214,216,235,249,238,244,246,245,206,227,243,231,230,220,240,237,215,239,240,238,225,134,4,6,1,9,30,19,12,15,2,20,3,39,8,14,9,18,27,21,7,9,19,20,0,29,192,192,172,189,172,182,174,195,187,165,191,170,180,177,170,186,170,192,171,195,212,170,201,158,202,172,179,216,186,191,170,167,201,170,180,175,187,217,195,209,212,193,177,169,173,166,200,199,179,191,206,204,204,191,186,191,186,211,184,205,197,185,194,197,185,194,163,205,189,184,205,181,212,185,203,217,200,181,195,218,211,181,200,175,185,173,197,181,212,198,199,187,184,203,211,190,202,185,200,185,210,209,182,180,210,219,209,197,210,208,182,183,196,172,177,204,181,192,197,196,188,182,192,201,226,202,210,214,193,212,202,196,190,204,204,189,194,177,167,181,193,187,178,201,193,184,175,175,187,196,203,183,215,216,184,188,211,188,194,192,192,184,208,185,203,179,181,186,190,209,209,205,192,210,184,173,181,185,189,205,224,173,191,191,187,180,186,200,194,192,193,198,184,176,161,187,203,168,189,197,193,195,207,194,220,190,204,213,217,189,176,196,215,182,183,154,189,172,191,171,181,169,206,208,218,211,194,216,183,209,210,183,176,231,195,213,231,217,188,194,179,181,197,200,182,174,201,178,190,199,210,202,218,205,205,220,221,208,233,192,222,211,205,179,216,193,186,208,204,210,207,221,217,191,217,197,226,203,215,238,209,206,226,191,209,210,218,235,182,216,206,205,230,218,210,216,204,217,214,230,216,116,78,165,208,218,230,206,220,221,210,189,200,212,206,239,108,23,9,44,59,27,54,26,35,38,54,54,40,25,32,71,47,40,71,42,10,27,19,16,1,20,24,6,22,9,29,60,16,31,1,31,10,38,22,22,41,20,6,21,46,49,37,58,84,64,82,55,62,62,64,49,74,56,72,40,17,120,176,174,193,242,157,163,221,205,241,237,206,209,219,218,174,184,201,246,127,61,109,59,69,67,93,92,28,55,177,243,248,201,145,163,240,213,181,187,229,211,142,167,233,227,132,44,0,17,16,61,36,38,20,43,61,52,55,11,164,252,252,248,157,153,232,230,211,211,242,178,124,192,231,250,255,235,239,96,73,105,86,72,29,52,89,105,92,66,93,129,129,62,54,11,16,27,42,20,34,26,9,53,14,57,91,40,53,127,215,246,194,129,150,195,137,123,90,101,65,45,24,10,9,24,24,34,23,36,34,50,38,74,79,114,102,72,69,45,11,64,41,25,28,49,34,37,25,6,13,21,1,13,33,1,25,30,17,22,23,19,67,104,118,146,111,85,96,91,95,87,94,73,45,42,38,44,9,24,19,24,33,29,14,43,19,51,11,31,18,24,43,44,67,38,43,12,34,3,27,10,7,26,6,41,10,31,41,23,19,43,34,48,21,47,62,31,41,35,34,57,49,32,50,26,8,31,28,2,13,41,49,48,90,95,76,74,52,59,52,45,39,82,115,123,90,76,91,87,83,115,49,26,22,83,221,233,255,251,233,235,220,233,223,232,235,237,215,216,227,219,245,255,238,247,241,235,239,235,232,221,219,212,247,242,232,228,238,222,237,234,230,234,240,243,235,253,244,240,235,243,232,245,231,247,230,215,219,222,239,243,229,216,221,237,232,212,215,90,19,11,18,0,10,29,7,10,17,44,20,1,9,5,7,7,15,15,6,12,26,12,5,4,186,174,185,195,184,197,176,155,188,197,173,178,203,198,198,181,184,178,192,171,201,157,204,192,179,195,196,185,177,188,182,174,203,196,203,181,171,179,188,153,180,183,219,191,162,167,173,190,200,191,173,193,188,179,215,190,179,196,200,188,194,215,193,174,211,198,221,177,199,181,196,195,176,198,204,196,181,188,196,183,227,205,190,184,209,216,208,216,185,217,190,176,198,193,191,199,185,193,188,190,215,185,201,200,198,233,205,183,183,197,195,188,192,177,211,188,199,207,197,204,208,181,181,205,173,179,173,195,182,197,181,194,178,173,188,182,188,191,181,172,201,202,189,183,228,167,200,194,178,186,204,207,192,166,184,196,199,206,212,179,197,186,209,199,210,182,185,186,177,200,202,189,181,199,186,186,174,204,193,208,204,202,195,193,196,204,186,189,195,184,185,188,183,189,200,195,170,204,159,187,197,183,190,197,185,206,214,192,184,179,218,186,196,193,179,198,207,185,185,182,185,210,190,172,199,206,193,235,197,220,200,181,195,208,209,177,222,216,189,229,210,201,188,202,214,210,188,191,206,231,234,195,189,194,196,218,220,204,240,212,181,196,202,190,220,185,195,214,223,235,187,217,204,207,201,197,204,196,202,200,219,215,220,222,213,223,232,207,208,223,221,210,241,222,230,208,235,219,245,227,207,200,179,204,231,222,216,230,216,179,222,195,225,218,200,218,132,69,5,22,0,30,44,60,15,36,49,23,48,51,23,49,47,57,68,49,32,32,31,57,16,12,24,46,7,1,19,18,12,23,28,15,13,45,22,53,28,8,27,15,54,52,42,62,77,69,92,76,52,55,47,17,54,29,82,25,27,133,225,186,209,231,144,239,235,194,221,199,222,198,251,222,204,223,214,255,147,65,107,69,85,22,74,89,40,39,170,231,232,208,125,172,253,210,185,206,253,202,149,214,215,163,70,14,23,9,30,68,18,44,17,74,58,51,36,120,237,244,255,166,163,211,244,246,183,243,215,117,204,229,240,210,237,255,189,48,94,126,75,78,38,31,52,81,48,47,64,143,121,46,26,41,27,44,70,6,43,10,19,11,59,106,150,125,132,195,237,169,118,100,79,76,97,75,73,42,28,13,20,32,35,51,20,17,60,55,45,43,49,37,72,85,77,59,72,22,36,34,21,43,28,34,34,13,15,27,0,24,23,10,20,15,8,19,21,33,36,35,50,108,114,160,101,94,67,65,92,92,90,75,77,69,34,29,34,13,30,17,57,22,29,29,35,34,39,13,10,13,23,32,62,30,15,43,20,40,35,21,31,9,33,31,35,16,20,29,1,51,13,45,55,33,29,48,40,37,19,17,43,50,7,40,19,24,3,31,29,22,46,25,65,85,94,58,44,39,85,57,20,90,97,96,115,84,123,89,124,67,63,65,32,34,153,243,241,244,227,221,208,234,235,221,233,228,235,220,235,248,210,223,210,242,241,244,233,227,219,245,219,245,213,236,234,225,216,220,242,214,232,222,236,233,214,231,240,222,241,213,228,227,236,218,238,231,236,225,237,210,234,233,235,239,229,210,247,125,2,5,10,12,27,1,3,4,24,4,22,7,5,10,14,25,20,39,19,4,16,9,20,1,164,198,187,185,173,180,188,180,173,194,167,193,176,173,195,187,200,191,184,181,195,190,189,183,192,168,200,193,202,186,189,187,184,211,202,188,208,192,189,189,181,173,178,180,168,183,188,197,197,196,205,174,183,195,191,192,203,197,198,184,183,199,191,181,193,186,177,188,173,185,194,173,194,184,195,202,214,179,196,187,205,209,176,183,211,201,187,185,194,192,194,182,196,195,192,189,181,188,205,209,182,192,209,209,178,189,200,213,185,210,184,195,205,207,207,180,201,190,203,207,206,201,184,177,219,180,192,198,205,190,197,197,211,202,184,209,205,193,193,187,205,213,177,199,203,212,193,178,187,181,185,187,218,199,174,189,198,205,171,202,196,211,197,185,182,177,219,194,195,198,200,200,192,210,184,219,196,193,164,203,192,211,195,203,181,205,188,212,184,200,210,189,178,174,185,224,199,169,201,178,182,201,185,214,219,185,206,205,202,205,186,214,182,196,175,195,182,209,197,194,199,204,182,211,183,192,203,204,214,203,174,188,219,195,204,192,220,197,192,230,213,196,205,210,191,184,207,189,197,201,195,203,179,228,213,218,191,195,215,218,229,239,200,166,229,177,200,201,223,196,208,243,191,215,204,204,197,200,195,199,222,211,216,211,219,221,218,201,237,236,216,200,217,196,208,196,241,232,221,202,232,233,232,230,218,233,200,203,206,214,200,189,210,209,212,235,192,136,119,78,3,2,12,10,18,54,42,30,30,17,49,71,59,51,46,32,53,45,39,55,48,39,31,29,24,44,8,22,14,45,34,46,29,14,52,34,38,36,35,45,92,95,98,85,95,74,81,84,57,45,36,61,27,6,45,61,83,201,217,213,238,199,129,225,232,190,234,235,219,216,233,208,178,179,205,254,126,51,87,37,71,63,63,117,43,62,171,211,249,171,59,164,245,211,195,179,245,202,182,209,120,73,15,7,7,9,37,40,39,8,45,71,53,20,82,236,237,238,195,154,198,244,242,229,192,197,140,197,251,233,252,233,245,243,118,6,72,88,50,44,75,72,84,44,38,34,128,152,129,30,33,35,33,40,39,49,33,6,22,12,39,137,112,159,118,107,116,51,47,13,19,48,49,65,14,32,19,28,20,21,26,14,54,75,77,56,50,35,29,48,38,50,23,9,71,65,61,50,46,35,42,43,31,34,22,19,18,20,18,24,27,9,22,22,11,17,32,32,33,28,67,76,84,63,82,79,58,69,87,78,69,61,36,18,17,12,37,25,33,40,31,23,24,22,25,23,35,8,13,15,40,41,4,21,45,26,22,44,26,24,28,8,28,31,21,16,11,23,49,29,42,55,71,65,36,30,30,28,25,18,11,18,22,28,12,14,9,29,25,20,60,78,79,80,57,70,67,41,44,60,106,108,114,86,137,109,111,75,111,115,73,24,108,213,245,244,242,220,231,223,220,234,217,240,234,249,228,240,227,225,210,236,229,235,245,231,251,225,241,211,235,234,231,234,255,225,231,235,211,235,251,235,246,215,215,231,214,235,240,237,221,246,236,226,208,240,249,222,235,230,237,241,240,240,215,140,3,0,18,17,27,2,10,13,26,5,2,21,0,9,5,13,12,9,12,14,23,32,34,18,185,172,179,185,178,207,186,171,184,204,173,218,183,195,209,185,185,182,200,187,169,199,189,207,200,182,197,193,193,176,174,187,180,178,174,201,183,196,211,192,175,185,187,189,181,192,182,201,194,186,175,181,208,195,175,198,201,179,202,188,183,186,191,205,186,219,205,197,227,201,192,207,182,217,190,196,198,193,191,171,192,191,189,213,194,174,194,177,201,179,190,189,185,217,198,189,163,197,187,203,199,186,205,186,203,178,203,189,225,193,200,200,179,218,210,183,193,199,174,194,197,230,203,179,187,207,212,211,186,202,190,207,195,183,209,196,202,193,189,208,173,197,157,207,215,175,192,194,202,190,212,199,195,192,201,208,184,197,187,176,185,208,170,212,178,160,197,179,184,197,208,194,181,186,183,184,193,179,188,195,204,198,190,220,186,170,162,192,179,184,207,189,193,217,218,205,188,206,172,185,182,215,211,211,219,207,188,197,199,178,192,195,198,197,192,207,188,200,203,166,180,191,203,207,195,184,212,190,196,204,193,212,224,211,204,199,171,193,208,216,210,202,216,212,216,195,194,202,216,215,218,206,207,225,224,228,222,195,240,203,208,215,210,211,226,218,214,210,209,206,216,219,181,217,215,222,216,223,193,220,199,210,208,198,216,203,195,228,229,201,206,221,216,194,201,236,218,215,233,234,206,211,210,194,215,198,222,222,218,192,218,206,224,204,207,190,217,130,195,187,74,15,20,28,12,12,56,35,31,19,13,66,45,79,60,45,37,37,34,92,27,39,35,13,24,23,31,28,28,22,30,24,39,32,29,22,20,19,58,91,48,71,71,79,50,13,87,89,56,22,27,75,84,108,182,173,121,219,203,162,224,180,144,230,201,226,247,208,221,201,214,197,186,194,207,229,175,47,121,58,63,91,101,124,61,55,173,247,238,144,37,169,227,231,227,169,236,185,201,220,53,21,21,12,23,26,33,13,16,53,27,56,35,77,220,240,225,185,142,218,240,231,234,230,170,135,190,227,235,249,237,235,237,144,57,43,96,55,50,100,82,92,107,101,49,107,143,129,119,25,15,7,26,42,18,36,41,6,31,38,106,135,54,74,28,19,34,38,14,41,17,7,25,17,8,31,31,15,35,78,55,78,79,107,124,75,69,27,16,45,32,33,31,37,39,36,54,42,27,30,38,38,50,33,13,11,8,21,17,30,13,19,31,23,41,18,32,23,24,44,58,22,33,83,56,75,63,61,35,45,78,53,35,15,1,2,20,10,53,48,31,21,33,19,56,28,37,32,21,30,33,21,56,52,26,21,26,40,13,24,22,26,25,31,33,42,35,44,97,73,49,61,72,62,65,77,51,22,52,8,33,55,36,63,41,31,42,35,39,20,45,83,74,54,60,65,79,71,41,52,102,140,135,105,109,110,100,76,74,89,99,71,71,187,235,229,252,242,238,245,238,223,224,255,244,242,235,224,196,239,223,249,231,249,236,245,255,235,225,253,239,244,239,250,225,243,211,211,243,249,237,241,212,223,248,219,222,238,237,247,229,230,223,246,232,232,226,209,241,241,237,240,227,239,215,122,15,9,4,4,50,3,14,10,2,45,14,1,8,0,13,10,8,24,13,2,8,2,15,0,156,185,198,187,207,190,176,183,190,201,175,202,178,168,198,196,204,194,164,177,155,198,217,184,215,188,190,200,197,182,207,187,180,192,197,168,204,183,197,181,178,177,198,188,190,160,186,214,187,192,202,221,180,178,202,180,209,193,215,188,180,191,194,195,189,177,178,176,188,204,168,175,159,224,187,221,200,179,206,190,194,202,210,204,219,188,164,179,185,205,191,169,218,199,212,180,195,185,195,207,200,220,218,187,209,187,181,190,189,215,201,214,208,203,189,189,211,203,201,196,191,222,176,200,212,194,200,180,184,180,217,184,199,187,195,190,208,199,179,178,199,198,167,184,185,196,168,204,185,192,204,194,206,193,191,185,184,207,206,189,190,216,215,158,197,194,211,193,192,205,206,196,197,210,195,198,188,192,229,192,204,199,202,223,213,193,206,185,220,199,182,178,179,216,201,196,179,222,211,200,205,189,202,210,194,203,225,199,176,191,179,197,166,176,200,184,210,172,218,202,198,200,203,212,174,186,203,203,201,192,218,241,199,191,220,224,224,205,179,222,213,210,209,192,216,202,229,229,204,206,205,208,214,203,221,208,197,191,202,203,227,217,198,207,206,194,188,178,198,202,190,198,208,216,227,192,231,204,206,223,210,196,203,199,231,234,210,223,208,208,208,208,197,215,222,213,211,224,198,223,216,214,222,206,220,210,238,226,217,218,204,209,207,194,199,193,181,137,212,254,127,76,33,11,18,11,50,37,47,40,5,36,36,31,50,19,41,12,7,11,49,41,24,32,39,32,28,7,15,46,6,43,18,36,48,18,41,19,48,96,46,42,62,52,40,51,62,51,35,59,133,244,222,210,232,212,111,135,148,172,235,150,156,245,214,214,229,216,226,233,229,204,194,195,231,240,185,77,157,92,75,56,49,98,72,66,209,239,224,64,40,198,207,243,223,213,240,196,160,116,13,38,0,7,37,48,44,55,45,71,77,47,65,211,251,246,179,142,187,237,247,239,236,170,98,152,244,236,234,219,241,198,134,97,42,82,97,38,69,96,67,39,42,97,115,112,120,159,92,19,24,10,7,33,53,50,51,16,35,96,115,76,33,40,16,38,13,39,0,8,1,7,12,5,32,27,0,34,79,76,119,93,83,92,114,89,85,35,56,62,65,25,40,13,44,31,37,16,7,7,60,14,21,36,45,31,44,31,45,38,12,14,40,14,13,22,4,13,23,27,44,44,31,29,61,67,64,67,59,62,26,56,7,14,27,13,24,33,40,33,30,23,33,39,42,32,23,29,32,31,4,25,46,26,36,31,25,32,44,15,54,33,43,60,44,35,39,57,69,59,45,25,30,51,51,68,54,62,64,34,44,17,35,32,26,22,31,8,32,36,48,86,76,90,53,53,68,74,13,71,93,114,131,129,126,81,90,111,97,79,50,60,56,159,228,238,247,233,233,239,246,244,245,234,239,247,231,226,239,221,221,246,241,234,249,233,220,221,249,223,222,241,233,241,237,233,231,212,241,238,234,230,226,214,248,237,246,224,247,222,224,222,232,241,249,249,247,221,222,237,235,237,236,249,221,105,0,2,13,3,25,40,2,0,8,20,22,11,5,12,5,10,5,15,8,2,13,26,5,17,187,188,149,187,164,181,181,192,195,178,168,192,183,176,173,187,180,191,185,204,166,182,201,181,185,178,188,203,197,181,206,158,218,194,172,221,171,176,214,159,179,175,201,217,203,179,188,194,175,157,196,194,217,186,181,204,181,188,172,200,210,203,204,205,196,213,195,185,199,211,198,198,192,205,219,174,186,205,169,219,224,211,205,201,177,201,187,207,191,194,194,206,198,164,193,207,199,192,194,195,171,204,176,189,154,202,191,185,202,190,189,195,186,211,174,200,200,194,178,204,172,189,184,206,197,208,182,221,177,210,198,193,207,183,190,188,192,180,189,172,191,187,191,199,189,198,212,179,178,188,191,201,189,174,192,176,215,196,193,189,188,178,209,180,189,228,188,176,186,183,211,235,174,184,207,196,200,196,207,182,219,199,221,198,201,215,184,186,199,206,204,166,219,195,205,217,232,184,201,202,206,187,201,196,197,195,178,203,216,208,209,198,183,193,174,203,184,199,198,211,232,190,204,193,206,203,209,185,203,197,203,205,211,228,206,224,223,195,228,179,205,227,199,216,205,202,199,213,210,205,200,213,224,211,226,205,209,213,227,212,213,202,186,225,232,209,231,200,209,200,211,195,211,207,217,232,223,194,203,215,220,223,195,211,224,213,235,227,235,194,212,228,224,221,190,215,213,239,209,206,233,197,216,184,223,221,217,219,210,203,229,226,214,207,201,204,220,174,177,232,132,114,11,31,95,22,49,28,49,46,65,58,43,51,31,10,29,46,34,40,41,39,23,47,22,17,36,25,37,25,25,27,43,27,25,20,32,14,46,81,49,60,33,36,53,15,23,38,133,150,221,244,232,209,235,125,76,116,163,233,220,135,210,244,205,205,224,229,218,203,230,199,164,185,202,249,157,76,158,86,73,13,21,87,55,44,208,250,155,40,95,241,128,134,202,169,235,135,115,26,5,19,9,52,47,40,16,30,47,62,56,55,204,249,247,216,124,193,217,246,216,218,182,106,116,213,220,242,241,239,187,149,218,126,52,69,100,70,76,107,73,90,32,72,95,101,150,130,77,1,53,28,30,43,41,32,49,36,7,58,60,30,16,30,32,15,40,26,5,12,17,34,19,55,1,23,17,52,71,57,90,83,50,107,117,117,105,83,68,37,49,29,50,42,21,47,25,27,19,23,26,41,32,49,24,29,39,19,15,29,24,21,25,15,18,21,25,44,35,49,23,52,57,5,15,31,65,77,70,60,21,51,11,18,20,4,42,73,63,73,40,16,24,13,36,25,22,15,16,39,28,21,41,25,14,37,42,53,44,8,30,118,140,114,116,110,63,70,62,57,91,52,50,9,60,86,86,88,92,56,89,74,48,43,35,59,37,41,20,36,42,67,46,70,64,81,57,66,56,64,89,111,111,96,112,90,66,64,79,95,36,41,36,120,244,221,246,233,223,209,247,248,218,235,240,229,241,249,240,241,247,229,221,229,244,224,223,228,229,229,234,237,252,242,249,245,230,243,239,252,230,213,224,233,226,245,242,240,233,223,225,245,242,238,233,249,217,234,218,244,223,230,231,231,240,108,3,18,0,18,15,15,45,22,8,29,36,7,15,4,13,1,18,1,2,7,31,9,24,26,168,186,207,195,191,200,155,189,194,185,199,165,192,168,195,174,189,190,187,172,197,184,189,191,194,166,206,173,180,180,179,193,201,190,172,194,179,196,202,167,204,175,196,211,186,195,185,184,197,213,182,189,207,186,197,215,195,192,182,184,205,179,213,218,199,181,176,186,216,223,186,195,196,193,208,192,181,185,196,220,169,179,186,191,214,206,196,202,182,189,206,202,189,196,186,201,188,181,207,194,208,196,208,205,205,180,213,199,187,173,215,181,198,205,193,190,173,201,207,209,226,191,193,180,204,190,205,189,189,188,181,190,183,200,210,185,213,197,178,189,201,170,189,195,188,178,189,189,185,178,196,196,174,194,197,195,195,189,179,204,217,196,180,199,204,187,184,192,171,202,198,219,193,183,212,177,211,203,182,191,213,183,187,194,209,218,188,207,206,167,198,208,204,210,196,175,215,209,227,178,213,211,188,200,191,200,178,216,167,208,184,196,179,165,193,198,206,190,201,207,214,196,218,195,219,190,211,202,220,195,222,219,209,221,208,210,208,208,226,223,204,203,211,230,205,201,204,197,229,221,195,215,206,189,206,221,227,229,212,179,213,233,198,213,194,203,191,175,209,203,199,198,220,230,219,222,229,211,213,218,221,221,219,198,196,218,208,233,224,225,213,228,220,224,218,214,212,219,230,204,215,185,185,197,228,242,198,218,199,223,232,213,211,213,166,174,236,174,220,198,107,91,59,171,220,183,212,251,250,255,222,229,202,179,171,159,228,137,63,33,12,48,33,39,17,20,26,35,29,24,26,37,12,32,59,76,92,48,18,42,38,50,35,42,49,73,162,175,239,217,242,239,155,135,188,113,121,224,184,246,231,135,224,242,206,206,219,214,213,171,233,228,181,181,204,246,154,61,159,73,58,48,16,81,50,66,220,238,115,40,86,118,58,160,215,207,206,91,50,13,11,22,38,35,33,24,36,74,54,49,26,142,255,239,253,176,202,222,234,242,201,185,142,154,187,247,249,220,189,175,124,189,249,132,36,32,70,80,69,161,132,143,31,63,77,87,143,130,56,25,26,15,35,61,56,24,26,14,17,32,38,18,17,1,17,32,43,18,16,41,20,13,36,34,7,35,33,93,94,96,122,91,81,83,132,134,43,57,72,51,81,24,35,29,31,33,40,25,38,32,30,41,45,21,57,39,11,50,3,12,18,16,39,41,34,10,25,18,48,29,28,47,28,53,40,40,46,59,69,53,53,26,28,35,5,18,40,24,32,48,31,42,2,33,38,31,19,12,33,42,40,31,2,27,8,28,26,23,19,29,71,154,177,179,177,165,147,115,56,56,89,44,7,11,61,52,62,38,42,71,65,86,123,117,88,66,17,13,42,27,64,92,83,49,77,60,47,74,47,48,88,111,109,103,98,107,59,26,45,86,74,79,14,61,219,220,240,247,227,222,239,249,249,209,243,239,232,250,246,232,230,211,228,233,228,232,230,228,227,240,228,230,204,217,228,239,224,235,213,237,189,238,195,238,251,236,240,223,237,236,240,205,236,228,237,248,246,214,224,245,240,236,242,224,229,112,7,11,11,6,14,2,13,11,32,11,29,20,3,0,35,13,3,2,14,0,33,14,7,0,179,182,167,174,191,191,201,186,169,181,184,193,223,169,201,195,192,203,179,177,199,173,177,187,207,177,195,182,197,165,199,191,194,218,212,191,197,195,200,195,169,169,190,201,172,194,211,167,207,192,209,197,197,186,192,204,204,196,172,225,218,212,185,184,174,202,205,214,199,217,185,180,205,185,179,181,192,211,181,228,218,205,192,166,182,208,222,187,200,196,194,212,191,179,192,200,204,171,178,184,216,196,216,188,194,185,184,189,191,165,199,191,212,194,204,198,191,197,195,195,198,199,192,201,182,195,183,191,203,186,183,182,199,203,184,203,198,218,216,178,216,200,188,184,188,192,212,195,198,201,204,210,184,217,223,213,195,198,174,219,195,183,197,214,185,196,181,186,209,212,191,197,201,221,219,192,183,184,218,211,203,195,206,214,201,206,198,212,195,181,214,199,219,182,212,209,195,196,219,180,206,205,191,191,199,205,233,195,203,187,211,205,207,217,195,206,193,192,209,223,206,219,209,220,203,219,210,189,224,222,202,205,224,190,222,208,192,202,218,199,225,230,193,204,184,212,220,205,218,240,218,227,211,212,204,228,212,222,220,215,198,206,214,185,225,209,208,218,199,207,211,207,188,214,233,193,209,209,228,220,218,221,212,195,220,202,210,208,195,234,204,220,194,209,236,201,212,234,209,214,212,197,212,188,236,216,212,224,224,228,198,217,221,188,198,204,215,189,195,182,117,114,41,161,242,250,246,241,242,242,251,244,243,214,249,247,244,178,51,42,128,133,80,24,26,17,20,55,34,34,21,40,25,98,197,253,250,87,131,170,146,159,177,171,176,241,253,248,237,228,231,181,89,135,201,150,189,177,208,255,186,149,221,204,236,201,230,222,219,207,229,238,144,208,208,252,142,66,101,72,47,37,46,111,42,52,201,201,104,60,34,3,57,214,245,211,192,37,21,28,9,31,51,54,25,24,70,43,64,46,110,238,240,244,177,206,251,248,253,206,176,97,144,197,225,230,245,225,151,143,184,210,240,153,6,58,87,88,37,78,120,104,48,72,72,115,137,125,62,23,42,33,34,22,33,36,36,0,26,8,26,23,20,20,12,31,37,22,34,38,41,31,27,46,46,25,39,96,113,119,116,89,112,128,85,65,64,89,66,52,55,64,36,26,38,30,45,31,49,32,28,52,50,35,17,29,29,20,35,24,3,16,30,11,39,4,3,23,18,50,39,20,37,47,46,64,35,70,54,45,57,32,36,34,29,59,39,10,13,37,10,32,28,8,20,15,23,5,36,19,67,12,30,12,31,12,11,37,28,20,105,151,203,156,145,172,132,156,102,69,89,49,33,25,33,13,37,33,38,40,53,60,86,78,123,72,31,18,65,21,61,70,59,70,85,85,60,74,41,37,88,98,125,99,126,92,49,81,112,129,124,114,63,26,204,229,242,226,247,246,244,238,230,232,222,234,215,225,240,222,244,234,224,241,226,236,245,213,232,228,244,248,246,219,237,231,224,237,231,242,238,242,226,227,225,226,232,238,234,231,243,243,244,249,232,228,238,248,214,233,228,244,247,242,220,114,3,0,5,27,31,8,26,9,10,15,12,31,37,0,16,6,13,13,10,18,9,10,15,40,182,204,187,208,188,193,199,188,184,155,186,188,184,188,201,185,224,161,222,175,183,162,193,187,201,187,197,169,204,174,184,183,172,197,197,183,168,186,207,172,180,207,192,205,194,206,193,182,220,192,211,194,186,206,180,182,211,198,210,202,199,207,209,201,186,205,182,208,196,200,184,187,211,190,197,206,210,205,173,200,214,187,192,201,201,173,182,196,183,180,175,186,187,191,186,209,177,191,196,168,189,189,202,188,204,182,183,192,207,189,192,175,209,192,207,208,199,212,167,172,172,205,192,196,190,184,194,183,179,176,198,196,180,203,189,207,221,171,202,177,209,204,204,205,190,203,203,185,197,191,188,196,190,197,180,199,189,185,176,217,219,194,213,226,199,206,192,217,197,189,197,187,201,185,212,212,203,189,193,203,190,221,192,211,200,206,207,224,210,208,210,221,213,236,199,190,186,206,207,190,215,196,211,207,215,177,234,224,209,198,204,213,191,209,202,200,194,205,211,205,235,201,191,181,204,195,215,235,211,210,209,222,228,203,210,209,215,195,215,206,228,223,216,199,188,221,224,213,237,203,209,204,216,204,187,205,194,236,214,224,215,199,223,202,229,210,230,198,234,214,208,208,180,224,218,206,227,212,206,207,180,216,233,196,188,227,209,215,228,211,216,207,210,215,222,213,218,207,223,203,197,216,220,217,208,232,212,196,196,216,213,193,218,182,191,182,255,221,207,201,107,105,34,180,248,230,244,243,248,240,228,226,220,216,208,234,235,98,56,50,159,155,85,40,26,41,31,41,23,60,26,39,53,29,139,241,255,148,85,185,196,251,237,239,221,255,248,237,200,147,225,208,150,193,185,118,126,134,187,253,146,168,235,242,196,194,210,219,213,202,231,227,154,200,211,255,149,54,117,70,68,44,27,91,76,44,209,204,74,65,8,5,130,244,246,232,72,14,12,8,9,48,44,45,28,39,40,76,34,80,225,249,249,202,224,231,217,250,254,228,126,102,208,245,241,223,242,169,96,178,232,253,253,159,41,47,78,87,48,23,36,57,32,110,108,93,135,79,19,49,32,36,47,68,10,43,54,57,18,29,18,17,49,32,38,37,36,42,45,44,20,29,27,21,29,33,29,68,100,51,86,103,97,82,71,35,65,86,56,48,66,36,34,34,50,41,26,41,42,42,32,42,64,34,37,13,40,31,32,27,7,23,41,34,39,35,16,30,31,27,32,60,25,55,58,69,52,50,54,68,41,65,27,31,39,65,5,17,2,20,8,21,21,45,3,24,28,30,30,35,26,32,32,23,35,18,36,51,74,77,128,153,142,129,125,153,166,145,101,74,89,51,30,44,19,16,20,15,12,33,19,27,64,84,93,61,11,45,34,17,60,44,50,71,89,93,36,60,38,48,71,93,120,101,106,95,93,137,99,71,40,88,96,55,176,231,227,226,239,244,242,234,208,236,218,218,239,233,240,234,234,242,231,230,243,242,221,242,233,233,230,242,221,224,196,221,235,235,243,226,239,241,230,233,230,248,226,246,247,214,232,241,215,207,244,250,231,229,221,250,244,222,249,227,223,123,17,2,5,7,23,19,36,6,16,24,11,16,5,0,4,1,22,28,20,13,8,3,17,6,170,198,184,212,189,164,191,199,200,179,169,188,197,197,189,183,206,216,197,213,193,206,192,186,185,168,195,200,202,203,196,186,199,193,207,162,185,183,205,200,200,183,177,192,187,182,167,185,192,176,192,210,190,191,179,192,185,187,191,215,202,196,171,182,200,181,207,233,191,212,193,189,182,188,213,191,205,180,201,199,180,204,191,195,191,209,183,203,200,195,203,188,190,184,182,209,223,198,184,182,221,206,167,204,194,185,207,194,199,193,199,197,220,169,205,181,217,199,210,206,204,201,183,192,205,166,183,206,191,189,195,211,183,200,195,166,216,181,202,194,210,180,186,206,192,194,186,188,201,211,193,203,194,190,192,218,210,206,179,201,210,199,193,186,198,206,209,187,202,205,216,235,190,204,206,202,206,224,225,230,201,205,215,203,215,209,234,193,191,202,215,220,196,179,218,210,209,216,224,197,206,221,217,221,172,204,216,200,197,209,196,207,215,192,220,184,231,224,196,195,202,220,210,209,225,207,219,208,195,222,227,215,218,217,220,234,215,230,214,209,229,229,226,205,224,223,205,223,196,207,211,213,223,213,224,223,216,211,213,199,199,202,210,222,217,196,212,214,214,186,225,230,182,235,235,221,200,225,231,179,233,218,196,208,223,203,208,209,221,193,217,220,190,215,212,198,209,214,207,206,205,184,224,200,216,209,223,199,222,234,227,211,222,196,197,198,241,232,205,124,65,85,35,121,176,203,220,233,217,211,215,183,151,173,131,184,188,86,48,101,179,153,107,57,14,10,47,36,43,54,10,30,57,31,99,214,211,117,27,60,178,248,232,204,166,212,165,184,170,138,227,218,201,225,187,111,132,166,237,247,161,189,245,206,228,209,224,249,232,189,218,215,191,201,188,246,165,17,86,88,88,67,73,73,85,61,198,188,81,35,9,36,207,231,255,143,45,13,7,7,43,52,34,4,48,60,87,6,79,207,242,234,203,215,237,254,244,241,240,143,153,238,243,240,211,230,225,113,169,226,239,249,240,213,93,39,18,65,89,72,52,67,130,87,107,126,117,85,9,18,33,28,68,41,31,35,24,21,21,18,19,39,23,45,33,67,29,16,38,47,60,19,51,10,18,30,43,73,57,81,92,114,77,69,56,46,70,36,36,59,39,75,23,3,38,23,24,54,40,24,44,33,9,47,33,21,26,29,34,35,28,31,47,34,28,32,34,42,32,12,40,60,37,53,59,33,68,62,70,45,59,48,23,25,30,30,25,20,18,30,33,40,17,14,47,59,27,26,13,34,13,23,17,35,42,39,62,121,142,156,164,153,139,147,135,112,151,147,78,61,76,30,28,19,27,41,35,16,6,25,31,35,53,58,80,100,43,28,36,25,44,81,87,77,99,95,33,66,59,29,70,94,109,104,106,124,121,126,77,74,46,100,127,15,133,233,229,243,242,251,232,227,225,217,234,224,239,246,221,241,230,240,233,235,236,240,226,237,240,247,242,218,213,225,240,253,237,238,237,236,216,227,209,243,231,255,223,232,241,237,241,227,250,242,236,232,252,245,255,249,239,234,222,225,249,117,6,0,13,12,8,23,12,19,2,9,9,20,17,0,3,18,26,19,37,19,14,17,26,29,178,196,198,188,192,180,209,165,185,176,181,196,196,186,198,192,176,181,202,218,189,206,195,216,187,163,176,177,189,204,181,190,173,183,186,196,197,201,188,175,218,167,175,183,221,198,197,209,204,179,196,215,184,214,201,197,207,177,209,198,185,203,205,203,208,206,180,169,206,185,213,207,189,195,224,183,191,194,175,198,198,176,210,194,218,169,198,202,189,195,189,201,202,191,165,200,201,189,198,200,175,195,186,193,182,193,180,218,187,200,200,190,199,189,211,193,187,201,201,183,192,189,175,200,192,194,194,190,219,199,179,189,211,192,214,167,198,205,205,200,197,179,178,208,203,187,184,213,193,191,221,204,216,185,198,220,176,189,192,192,209,208,197,191,240,239,209,193,180,190,205,198,216,215,191,184,195,208,206,196,206,212,172,191,201,199,232,214,204,206,223,215,207,216,184,194,221,224,213,196,217,194,236,212,210,194,190,194,203,201,185,211,223,230,221,229,211,217,202,229,200,195,182,217,220,214,190,194,183,206,216,209,241,181,239,217,231,213,204,204,209,224,192,164,199,219,204,221,223,215,196,222,223,201,216,200,205,218,209,221,212,231,195,221,208,220,201,206,198,198,195,224,229,194,209,187,219,234,219,234,220,192,238,213,213,213,221,197,229,215,218,233,216,237,223,233,217,204,221,221,216,207,215,213,218,226,193,201,218,229,219,214,207,181,194,214,209,147,113,50,20,39,8,44,46,130,203,172,187,184,173,151,115,176,188,204,205,127,113,120,162,174,119,61,13,24,56,50,49,43,33,38,81,146,203,250,230,163,52,7,84,209,236,146,116,195,158,221,204,188,195,183,182,148,171,155,190,228,249,215,128,228,227,246,212,211,226,225,217,197,205,221,145,186,202,255,154,77,65,63,58,50,72,77,81,71,141,91,23,53,5,66,243,250,211,79,8,2,1,39,67,52,6,34,22,61,39,68,190,234,252,183,197,240,239,249,253,251,194,146,220,226,230,215,193,201,155,180,240,227,249,253,240,208,97,27,1,31,77,116,58,111,60,60,134,106,119,74,21,46,27,25,27,31,23,42,45,41,16,26,26,16,20,32,67,32,40,38,35,60,35,36,43,24,41,20,23,82,48,96,127,109,74,92,119,75,59,56,33,33,41,21,15,41,25,12,27,62,37,20,27,31,30,34,50,21,27,50,18,38,44,22,34,23,58,23,25,38,44,49,54,44,63,49,66,32,69,32,42,44,46,27,19,19,27,14,19,2,11,27,31,18,37,33,5,40,28,19,37,18,27,18,18,32,31,99,141,154,165,162,147,149,113,118,110,122,157,137,55,37,51,43,15,41,32,15,14,41,16,40,3,39,27,53,77,91,59,21,44,22,30,89,74,72,95,67,81,52,38,13,97,108,111,76,85,84,108,107,64,96,143,169,142,31,152,242,229,237,242,245,215,211,239,231,250,223,241,222,239,239,242,236,230,248,235,216,236,231,239,239,238,243,234,245,228,237,235,248,231,209,235,251,223,237,219,239,240,251,233,237,249,237,244,245,240,251,208,253,245,220,244,215,237,234,233,130,22,9,7,7,18,7,21,9,14,17,12,11,7,5,9,3,13,16,3,11,25,13,17,6,190,169,171,184,203,185,196,203,173,188,189,179,189,196,181,169,189,182,166,181,190,191,184,184,168,190,204,174,183,173,174,179,195,205,186,214,178,182,193,186,210,192,187,204,180,197,175,181,193,202,188,207,180,186,214,192,195,209,200,207,187,172,198,176,207,196,183,188,210,188,193,193,196,165,173,189,194,201,195,198,191,201,201,198,212,193,183,187,190,213,198,200,175,187,199,206,198,181,199,164,191,188,193,193,194,181,178,183,191,181,191,188,205,198,189,182,178,191,181,203,201,204,222,200,183,205,207,198,190,184,211,196,196,189,178,212,180,186,191,189,168,189,198,182,179,193,195,179,199,193,193,212,222,205,210,171,200,175,185,204,203,216,208,178,202,205,203,191,195,219,197,209,194,207,213,209,191,213,204,219,210,219,195,233,221,227,219,206,200,181,186,213,217,210,213,211,205,225,184,218,210,217,208,200,203,208,203,194,206,206,199,189,215,228,201,212,196,207,220,212,220,206,217,216,218,185,216,225,221,226,211,211,218,213,220,219,235,221,197,211,216,212,205,201,211,209,206,233,202,213,209,229,215,224,246,224,214,241,221,214,213,211,212,219,211,211,200,210,209,226,228,221,199,238,205,210,219,234,227,203,197,202,210,215,222,218,208,216,216,209,230,206,211,216,242,222,236,209,221,214,229,230,213,204,232,198,231,216,226,206,220,224,205,201,182,204,188,70,18,8,19,33,45,50,27,82,105,160,192,176,213,166,149,207,242,248,199,88,152,107,194,194,154,85,15,32,38,59,35,37,41,30,100,182,230,244,239,207,149,30,57,128,146,111,139,181,191,222,186,193,180,134,156,165,180,168,208,196,253,195,149,240,237,235,199,209,235,212,220,203,233,229,169,207,202,255,146,26,106,89,27,73,55,81,106,42,78,39,88,48,26,129,254,217,90,20,32,7,12,31,19,0,24,49,89,30,55,144,249,239,193,163,228,249,247,250,252,193,155,213,251,231,207,239,159,94,157,241,253,224,250,247,224,136,160,112,51,58,48,71,54,66,62,51,153,143,111,83,19,27,5,46,33,55,26,24,47,20,13,10,51,15,15,47,23,34,13,31,52,47,23,29,36,39,35,17,66,99,58,71,88,74,49,41,62,35,53,58,36,78,53,8,31,10,25,39,26,11,37,28,42,52,52,42,30,18,21,27,28,29,26,36,53,27,31,29,27,67,85,65,105,78,44,80,35,52,27,34,46,51,20,17,25,25,39,17,32,18,35,48,12,17,37,37,11,9,24,30,29,13,20,16,49,10,80,139,125,134,108,126,122,123,91,106,89,124,144,103,83,39,31,21,34,38,34,40,39,51,30,45,12,27,38,31,66,55,37,9,13,16,39,71,72,77,76,42,62,57,35,34,68,98,99,102,77,91,122,98,29,78,115,90,90,11,141,219,243,250,250,246,237,249,231,239,247,240,247,241,248,229,239,238,248,242,233,247,216,246,243,253,249,240,225,236,251,248,235,213,215,239,240,236,240,234,239,215,235,237,253,249,245,245,248,233,238,234,242,254,228,240,240,247,246,242,241,113,19,7,10,31,22,8,7,24,16,14,2,15,16,8,3,5,10,36,17,12,6,29,5,20,205,163,162,173,148,189,184,192,206,182,201,207,212,194,189,185,210,187,184,194,191,197,214,184,212,188,181,189,209,171,190,201,184,187,205,206,208,215,190,166,200,202,195,186,175,194,197,198,196,193,207,194,188,187,191,191,175,181,213,200,200,191,160,205,194,211,178,211,195,173,209,186,203,195,192,209,224,210,180,207,186,194,198,190,195,174,186,206,208,205,200,185,171,182,199,201,168,185,195,204,206,191,192,196,188,182,188,179,197,208,201,181,184,196,174,217,208,191,189,162,203,174,196,207,201,173,182,190,179,188,186,185,204,200,186,213,215,185,172,206,209,181,194,157,189,226,199,203,211,204,209,196,205,199,189,212,187,185,218,186,201,191,211,212,192,197,213,181,179,188,232,233,192,201,220,217,191,227,212,199,225,207,218,207,243,212,219,203,203,198,208,209,237,198,199,225,217,208,202,234,195,223,218,189,222,216,205,228,202,239,203,228,235,209,210,247,197,219,220,216,200,221,203,218,190,207,211,221,205,202,200,210,238,214,195,188,217,220,222,216,215,223,198,223,197,192,226,197,212,204,244,198,227,220,214,196,215,189,241,196,238,247,215,182,194,214,202,213,242,231,190,213,213,220,213,215,205,210,218,229,218,223,230,230,213,200,226,213,206,228,214,206,218,207,188,212,242,193,211,219,227,206,194,212,198,192,209,222,191,232,207,217,211,162,213,211,150,100,19,24,27,43,77,82,40,59,164,212,209,231,225,191,142,206,218,229,125,108,122,100,181,176,132,69,5,31,29,45,50,40,52,30,63,140,200,234,164,205,164,19,56,145,241,168,192,181,182,160,157,116,139,195,137,190,163,184,172,193,253,146,169,236,230,224,218,200,222,206,217,176,234,183,156,149,213,252,119,33,116,85,53,75,60,62,92,53,66,130,218,79,13,187,231,140,35,24,12,15,52,22,0,10,43,48,56,50,147,214,243,180,130,205,241,254,229,228,185,139,191,253,238,215,200,222,72,83,210,228,252,246,254,220,103,68,133,158,85,32,43,45,25,71,26,83,161,134,124,41,10,27,24,22,43,30,17,50,21,32,32,44,15,16,49,33,49,28,20,15,10,25,55,36,11,40,16,61,46,85,67,65,57,60,37,18,46,38,24,12,75,60,14,26,20,23,4,15,32,19,23,9,28,15,18,50,27,25,37,18,30,32,35,42,26,28,8,39,14,159,159,156,91,134,87,42,32,5,27,49,42,33,42,25,29,12,13,38,25,38,70,28,4,30,27,25,31,50,59,27,8,33,30,17,28,3,108,135,128,107,112,112,91,90,102,115,107,108,118,96,26,31,35,23,25,29,29,41,18,51,54,27,45,14,37,29,45,32,34,24,59,40,39,70,72,44,68,47,80,18,63,10,34,115,117,91,104,65,82,102,35,60,56,31,44,11,139,213,236,252,221,254,230,244,238,235,222,238,247,254,255,240,239,240,248,238,235,237,244,242,229,255,224,235,247,253,231,251,238,246,242,234,243,243,250,240,243,245,246,252,253,245,241,238,248,222,253,239,241,241,242,237,206,247,250,240,227,100,7,18,0,1,2,5,33,6,20,17,9,14,2,0,20,22,20,7,9,7,30,10,19,5,190,191,206,208,186,177,165,201,181,202,188,164,206,198,189,172,189,179,199,189,191,196,185,186,204,185,189,185,196,188,188,208,167,203,191,194,196,186,184,197,176,171,194,198,199,209,194,192,193,211,221,170,193,201,191,190,224,210,194,187,186,192,202,192,189,187,205,191,185,191,185,174,195,202,196,191,174,193,174,211,212,198,190,200,202,190,189,212,205,168,202,210,194,165,172,197,181,198,194,195,188,192,184,200,191,182,202,177,197,184,211,183,188,203,179,200,204,226,191,198,177,193,185,192,174,187,177,207,207,181,192,179,198,178,200,187,203,204,169,175,210,200,176,185,202,205,220,200,204,210,211,190,153,200,180,188,189,198,229,207,208,222,174,199,171,205,202,221,203,183,219,195,183,208,206,207,200,228,200,187,200,211,210,208,216,194,218,201,220,200,224,194,206,205,209,211,203,212,212,190,234,211,218,216,202,216,237,195,199,216,210,205,229,206,219,215,226,222,225,213,232,236,246,225,220,226,214,200,220,227,224,220,207,221,203,197,213,231,206,223,195,187,197,211,216,209,202,230,232,232,214,211,192,196,204,222,202,214,207,196,231,200,216,210,201,198,238,210,207,202,219,209,230,230,194,222,207,170,183,211,219,202,212,239,188,185,232,227,213,204,226,231,214,220,226,241,231,229,209,214,202,202,183,202,205,206,216,212,227,216,208,223,206,192,220,222,209,120,96,95,95,134,99,79,59,101,222,189,193,211,242,177,154,226,224,182,110,109,174,108,172,163,42,52,39,31,43,31,65,39,60,13,66,131,199,225,150,211,193,78,51,211,237,222,230,197,162,136,147,124,164,186,131,124,131,138,214,224,250,147,202,238,225,239,217,201,227,212,229,193,231,229,181,170,203,242,149,69,80,98,75,59,38,68,97,94,132,178,221,152,84,209,119,47,11,2,26,24,39,31,45,39,67,67,21,120,233,223,220,135,196,249,253,246,206,177,157,211,217,253,242,192,172,111,87,198,232,229,246,253,233,66,8,64,106,95,93,23,40,51,46,57,38,86,172,139,111,14,32,34,33,38,36,22,26,27,39,30,10,26,22,19,28,43,42,43,40,39,47,37,53,39,21,68,23,54,43,57,71,40,54,74,39,46,28,33,31,37,26,53,11,37,12,26,29,41,43,44,32,49,41,40,38,38,16,27,33,18,38,7,37,48,61,35,51,42,66,140,130,103,86,91,76,44,40,10,37,15,28,22,18,40,30,45,55,50,61,62,16,30,30,18,27,3,17,18,26,42,24,38,29,28,7,34,74,149,126,113,100,98,115,118,124,147,114,78,120,84,52,29,19,42,30,58,40,14,25,45,48,28,19,29,7,15,52,18,19,21,15,14,40,52,78,50,75,70,86,47,58,36,52,150,108,120,106,105,74,85,114,84,43,40,22,108,248,253,242,251,248,251,233,223,223,252,238,242,233,252,243,242,245,242,243,227,253,234,244,237,254,243,248,241,224,237,254,244,250,249,255,251,227,247,214,240,255,240,226,242,236,255,241,232,247,251,243,255,252,236,232,248,248,248,243,236,249,123,11,1,2,18,13,15,0,10,3,8,8,1,11,0,5,8,16,18,30,18,25,4,28,28,150,188,209,172,159,180,187,186,204,171,209,176,180,212,195,228,198,187,177,185,174,164,197,183,185,200,192,199,178,183,202,198,196,185,216,193,191,195,218,170,179,175,209,196,203,167,169,179,199,186,198,215,185,197,192,211,205,200,218,210,212,221,186,165,190,179,165,190,201,199,175,198,198,201,191,218,183,176,193,218,193,187,188,205,193,187,197,190,210,202,204,176,198,199,182,208,176,206,202,177,206,164,208,185,197,204,191,196,192,221,215,186,191,177,174,192,193,193,183,187,204,191,176,193,197,164,203,182,195,192,174,191,207,202,204,211,208,185,214,183,198,203,216,205,206,212,225,202,210,208,183,228,182,193,178,195,187,219,199,222,220,211,203,192,198,194,182,200,211,176,185,204,201,205,211,198,205,237,210,206,212,227,196,226,205,213,212,223,181,216,214,206,192,207,203,202,234,203,216,184,216,209,227,236,197,211,228,240,212,209,201,218,227,224,224,208,220,204,225,219,221,231,221,199,217,206,218,214,234,234,206,219,211,222,179,210,213,210,221,212,223,196,199,213,217,210,217,219,203,224,199,227,222,211,234,201,220,201,199,201,211,211,210,195,199,205,229,222,209,218,223,198,221,232,215,199,227,207,198,210,221,237,240,213,221,236,206,221,218,225,232,222,217,205,238,208,230,190,235,230,237,210,192,187,172,219,229,218,235,232,206,241,198,239,178,223,236,178,193,179,141,132,111,120,118,115,213,211,227,223,243,184,162,226,201,130,103,110,158,125,187,211,40,16,8,19,62,36,79,27,36,41,119,162,197,217,176,221,190,103,90,196,219,167,152,153,167,211,217,199,207,168,64,117,109,149,156,247,183,149,216,249,222,247,200,225,220,217,229,191,228,220,191,190,209,254,157,59,135,88,53,45,36,45,52,104,131,175,253,191,182,139,44,32,5,7,18,28,43,20,58,45,59,24,103,203,241,214,149,194,189,236,252,226,197,163,210,252,251,220,232,166,89,139,161,237,243,239,242,221,95,26,57,89,108,120,141,91,30,45,35,30,73,142,124,127,103,19,44,38,17,6,34,58,38,39,10,29,39,26,45,37,31,36,39,27,19,24,59,50,15,30,30,14,36,44,38,40,28,11,50,48,40,43,54,32,9,4,51,41,11,24,6,22,25,46,53,40,15,33,13,28,46,27,65,19,18,43,30,38,29,27,27,45,17,38,68,111,127,74,98,65,78,55,51,24,35,3,5,6,33,11,31,36,45,36,24,52,51,28,45,15,13,6,30,18,39,47,30,35,43,13,22,26,76,138,113,117,96,93,100,97,87,72,102,106,99,107,43,24,19,42,41,34,62,15,55,39,19,52,45,36,51,20,14,36,53,21,43,6,34,91,89,84,69,50,71,38,56,55,56,129,102,90,90,98,90,49,90,92,113,104,152,235,236,248,232,241,255,241,235,234,243,225,224,254,254,241,241,245,241,247,234,223,234,225,255,248,247,244,236,235,241,255,242,252,251,238,255,247,253,238,247,251,249,253,249,240,245,245,217,230,232,247,249,234,242,250,247,241,247,239,243,247,238,105,10,0,2,11,26,15,26,18,20,12,13,12,12,27,13,12,17,7,5,14,13,14,1,13,188,189,191,175,184,184,192,172,182,188,199,225,210,178,214,186,185,173,195,212,192,187,178,155,205,190,193,212,182,194,215,211,178,179,195,164,181,197,191,190,209,184,183,209,183,176,197,170,190,188,199,191,196,213,187,194,174,209,198,197,204,196,203,185,209,193,207,190,219,214,185,187,206,179,185,205,185,211,204,194,205,186,199,194,198,209,199,198,193,203,217,188,193,186,189,183,188,204,216,175,184,200,191,172,206,182,183,215,215,186,199,182,185,200,198,224,195,192,203,202,192,185,216,186,202,190,210,218,203,190,219,195,219,186,191,220,186,218,185,205,188,199,217,213,223,197,224,165,198,205,209,209,198,201,203,232,196,203,202,195,201,190,163,217,188,206,225,194,212,196,208,213,229,216,194,225,213,200,204,217,204,232,214,201,194,218,232,197,198,218,203,207,218,191,198,214,221,224,212,222,206,230,213,217,209,190,209,198,232,221,208,217,186,206,199,207,241,207,192,223,206,223,203,220,205,217,199,190,202,219,222,239,239,205,217,202,228,200,216,202,220,205,220,223,213,228,212,212,238,213,210,197,221,230,225,216,195,217,208,202,216,206,221,210,210,214,205,213,217,202,230,230,228,217,213,223,224,214,235,204,208,241,249,252,243,223,229,213,213,207,231,215,231,208,225,213,221,210,208,227,243,206,209,216,195,223,216,236,238,232,225,210,171,141,168,157,145,190,225,185,165,116,105,187,186,127,204,188,238,229,244,153,159,219,179,103,96,117,173,117,198,209,15,13,16,97,82,100,49,15,30,89,216,200,220,204,210,235,210,148,64,89,95,117,192,203,194,245,225,208,219,127,108,146,147,163,176,238,145,134,237,213,246,208,220,205,234,215,212,199,222,205,159,188,196,252,141,79,105,84,83,90,57,67,55,29,91,186,247,254,195,91,28,32,34,8,37,32,7,45,39,61,47,63,177,251,232,152,166,229,252,229,250,198,150,212,244,250,233,244,224,105,76,209,190,238,243,242,218,65,25,38,87,103,123,162,211,144,74,87,89,79,130,129,135,140,39,32,17,54,53,61,64,23,26,31,79,11,15,22,30,36,30,46,37,51,51,29,46,52,24,31,18,36,31,31,29,28,35,28,20,41,24,58,41,61,38,25,7,25,22,20,32,19,43,28,42,23,39,40,75,39,32,17,27,26,25,12,15,40,38,24,45,37,24,28,28,94,143,100,58,99,52,28,22,31,21,32,20,15,39,14,28,38,54,46,26,8,36,41,19,3,35,19,11,30,34,53,10,57,25,22,28,29,66,96,116,119,69,74,62,78,38,18,60,104,108,95,61,47,14,29,29,7,34,16,11,27,38,39,42,16,53,31,35,41,59,15,26,25,22,95,86,64,52,53,74,41,57,68,52,118,83,116,108,133,80,87,77,98,95,142,196,219,226,248,255,224,250,234,239,245,249,236,248,244,250,246,248,252,232,252,249,242,248,223,243,255,253,254,250,254,238,243,238,236,240,255,251,253,237,234,251,241,255,238,239,255,250,255,240,249,255,242,237,255,255,239,243,248,244,250,233,246,250,125,0,10,16,21,37,17,7,0,1,17,6,20,12,12,0,9,0,17,5,12,21,16,30,11,172,178,191,176,196,190,197,200,221,176,206,226,195,210,194,190,180,174,207,191,199,199,199,188,181,178,199,196,201,192,195,189,220,164,196,195,182,212,186,177,166,193,187,204,189,175,165,212,192,179,180,205,176,225,221,208,206,220,196,232,203,177,194,200,181,184,205,197,204,174,187,208,188,200,183,188,187,197,184,198,211,200,213,191,196,191,183,182,174,180,200,173,195,185,212,194,187,182,196,181,192,188,214,217,163,195,170,210,192,195,215,208,186,193,201,166,178,196,195,207,194,195,194,190,214,205,217,197,213,201,209,188,212,182,173,176,214,200,224,214,182,188,190,210,205,210,211,193,210,211,200,212,214,215,201,200,205,196,205,221,208,205,218,213,201,211,195,217,198,205,215,194,205,199,200,204,211,201,225,203,213,220,204,205,200,235,198,198,216,230,210,200,222,211,222,193,228,224,224,230,217,187,210,219,201,222,205,219,196,195,207,218,213,211,190,215,197,223,209,215,203,212,207,211,243,219,236,228,218,204,219,224,213,233,219,227,205,203,235,183,204,216,205,206,209,198,205,231,217,205,204,191,203,204,216,208,221,219,201,207,228,223,204,187,191,202,237,209,210,193,230,205,214,212,233,209,213,225,244,226,198,198,167,223,214,246,204,228,235,226,213,201,223,198,237,241,234,208,213,213,239,222,201,214,200,187,212,195,185,187,187,150,177,159,151,110,112,178,225,185,175,111,74,237,227,82,149,191,180,201,191,162,141,186,172,136,129,137,194,150,193,222,50,27,64,142,140,125,83,40,28,132,229,214,185,226,220,178,173,151,48,22,58,154,225,198,224,229,164,153,191,150,92,162,194,224,230,241,132,130,200,151,204,219,181,200,239,228,223,201,239,237,200,171,200,242,106,78,137,116,78,72,40,38,28,86,133,234,247,231,156,38,24,36,47,38,40,54,16,51,65,48,56,178,251,248,150,163,219,241,248,246,225,142,215,249,242,242,244,240,183,135,184,253,208,250,240,249,112,6,42,82,99,97,120,155,188,129,33,22,83,81,92,138,121,109,48,5,23,42,55,47,42,25,20,27,21,33,25,24,38,24,28,26,34,40,38,41,30,19,39,14,60,39,41,15,22,46,47,25,25,1,54,50,43,71,36,10,28,53,29,25,39,17,42,18,34,20,22,24,15,51,12,17,37,34,23,24,26,18,31,49,28,41,33,51,35,72,107,98,75,32,70,29,6,37,35,32,29,13,8,41,58,47,54,46,22,24,24,11,13,36,16,33,34,54,22,7,27,42,27,31,18,30,34,52,118,119,89,56,75,74,68,23,15,43,98,121,110,63,29,32,30,41,51,34,38,9,61,28,51,36,63,30,27,35,51,55,8,40,40,95,73,68,63,70,83,54,49,53,28,91,110,114,123,125,84,79,62,95,70,25,11,80,216,242,232,253,255,250,255,245,248,244,243,247,239,239,239,235,242,238,251,255,231,239,236,243,236,247,242,254,236,251,249,254,249,249,235,253,245,248,253,248,248,238,245,224,250,250,228,226,246,250,244,251,231,254,239,228,245,249,243,224,220,114,25,31,10,34,7,23,28,6,14,3,5,20,2,3,26,21,31,27,13,12,22,15,5,19,202,180,196,185,168,203,212,188,228,179,174,184,197,176,204,183,213,185,195,178,199,215,176,199,196,195,198,184,211,208,209,200,196,225,202,170,172,188,205,198,183,187,190,188,196,189,175,218,220,181,208,182,204,202,185,172,204,171,222,179,193,188,205,194,203,227,204,200,180,195,211,209,199,197,197,190,197,200,202,201,202,204,180,189,209,187,207,213,191,219,204,211,207,194,202,207,180,203,214,198,209,182,197,181,198,168,183,201,195,221,204,193,184,188,203,187,176,198,212,183,212,191,194,174,205,203,187,192,205,205,206,197,196,199,193,195,204,211,212,195,203,207,225,224,220,205,186,226,213,205,200,193,194,220,208,202,196,195,209,190,185,221,215,196,204,228,209,205,212,204,227,194,233,210,231,200,200,225,230,203,216,218,209,203,218,207,207,202,235,219,239,217,202,216,204,229,205,203,205,235,199,200,208,211,217,223,215,218,209,201,201,226,180,224,219,203,219,201,215,190,217,209,227,217,228,210,206,219,208,211,209,235,227,206,198,205,208,211,205,200,208,206,221,201,215,210,217,235,232,199,197,206,205,214,219,206,212,199,226,202,222,216,223,211,214,221,214,220,209,215,230,219,227,202,207,216,199,217,213,241,191,89,85,128,202,218,223,224,243,215,220,242,236,212,226,194,227,220,219,227,247,221,197,192,201,208,171,171,178,180,176,182,170,200,218,194,205,197,212,127,173,102,52,208,198,110,176,146,146,200,221,148,166,162,129,138,152,166,220,142,191,237,123,56,95,170,127,137,140,52,14,173,244,186,136,212,216,152,155,158,91,69,113,175,231,192,209,193,142,136,208,144,116,122,142,175,244,234,138,213,202,213,202,188,198,196,220,199,209,190,239,230,184,163,223,213,79,44,113,111,76,44,50,36,64,126,205,245,241,162,65,1,35,42,40,28,23,24,22,43,74,48,127,231,243,165,175,213,236,233,250,228,159,159,243,221,247,245,250,176,165,196,238,242,221,233,245,101,38,33,76,102,120,111,93,69,128,103,15,47,36,66,103,119,98,115,27,18,48,45,38,53,50,24,48,29,29,42,10,9,20,40,52,38,28,30,28,53,7,28,56,33,38,54,25,16,40,24,19,26,51,19,42,62,46,65,41,20,14,28,20,30,34,33,32,48,39,43,36,41,28,26,33,41,38,38,47,42,3,51,46,31,33,27,26,84,51,56,126,137,95,62,58,18,43,18,34,44,28,28,28,29,59,42,78,53,17,11,23,17,35,16,28,20,43,16,8,25,33,35,36,44,36,14,38,23,70,113,102,82,107,110,45,79,49,61,94,139,144,108,59,24,36,24,34,40,36,48,41,20,45,58,34,53,53,42,34,43,50,32,35,91,53,42,68,78,72,49,62,70,35,92,114,126,101,115,87,88,45,53,49,18,50,200,255,246,253,215,250,244,224,219,254,233,252,245,249,249,240,248,235,251,243,234,253,242,255,252,253,230,254,251,228,229,246,255,223,243,252,255,215,239,254,216,226,225,242,243,242,228,243,241,238,237,246,252,239,255,236,239,249,244,250,248,201,97,14,0,0,4,9,2,18,18,24,28,11,7,4,0,1,9,21,26,15,5,10,14,37,13,176,200,206,181,182,199,183,192,215,197,203,175,197,225,191,181,174,182,213,206,191,185,191,191,193,203,197,220,194,219,193,180,210,180,219,215,195,209,184,194,177,214,195,194,220,179,202,210,197,193,188,200,207,206,188,205,181,206,217,181,192,172,179,172,204,208,199,174,192,174,197,203,228,208,192,184,201,184,201,196,205,202,197,195,177,171,181,187,187,170,196,223,183,194,195,209,184,197,201,217,203,189,189,180,190,204,196,201,182,214,203,195,192,197,194,198,184,184,209,200,178,221,182,217,189,196,204,202,201,203,201,196,192,199,200,214,201,199,204,200,195,221,197,197,234,170,193,214,203,200,189,200,198,213,206,182,185,215,203,198,192,211,193,199,208,212,181,228,189,215,218,227,180,174,203,216,222,235,222,209,197,214,212,189,213,190,202,214,222,219,200,206,201,236,189,195,217,212,191,188,206,235,181,218,237,196,209,222,197,219,204,192,212,185,208,216,203,203,221,218,212,217,209,210,228,207,240,199,191,188,212,222,223,214,199,177,219,211,195,222,202,218,206,210,211,215,198,217,214,219,225,222,208,219,224,194,200,211,211,196,210,212,212,219,199,211,201,230,202,214,202,225,217,216,228,205,211,215,221,244,183,133,117,124,181,216,245,222,221,223,225,224,220,220,224,191,228,209,209,223,236,199,173,175,149,170,180,205,211,205,214,230,188,220,222,252,210,177,214,135,172,122,57,180,169,85,214,206,171,217,229,172,111,152,142,136,160,179,232,113,191,241,159,50,88,180,152,144,134,81,29,195,246,157,141,233,170,138,199,215,222,77,69,182,187,159,192,228,189,179,160,138,120,164,181,215,246,193,160,214,236,245,236,233,203,178,210,195,210,182,236,206,174,194,218,208,71,74,94,71,79,23,10,16,40,115,139,246,191,56,33,29,25,15,70,24,2,43,32,57,21,53,173,219,181,183,232,241,234,251,203,168,132,222,241,226,223,243,164,158,219,221,245,222,207,254,62,33,58,95,128,123,110,76,41,49,138,79,21,36,124,175,143,121,114,113,18,23,54,38,32,57,58,20,32,41,36,23,42,33,18,24,30,46,43,30,66,41,36,16,56,42,25,16,20,5,25,50,47,31,46,44,29,50,93,82,77,34,15,39,29,46,30,60,34,27,15,18,26,33,45,32,49,33,49,38,13,36,8,33,34,58,15,37,84,90,55,48,59,101,92,69,54,46,40,31,23,14,7,28,30,42,60,81,14,41,34,17,22,26,24,36,19,32,17,68,32,27,35,30,24,28,47,44,20,26,34,51,110,135,107,139,68,62,34,41,94,117,149,158,104,52,18,23,36,44,42,64,43,42,38,51,47,56,53,34,59,33,18,16,42,85,61,79,70,84,58,51,74,45,26,78,122,83,98,113,83,42,81,62,31,106,249,240,254,230,241,255,238,240,247,241,249,249,246,230,253,254,249,244,253,235,240,250,251,239,255,250,251,245,253,244,249,224,254,254,251,239,250,255,248,248,238,255,247,241,246,243,240,232,254,250,244,248,229,255,229,241,253,244,235,229,254,243,227,101,6,0,5,0,8,21,31,16,10,14,11,8,20,14,30,12,9,14,8,5,19,1,13,35,175,197,180,193,199,202,192,206,182,189,203,188,213,226,204,195,198,175,198,195,225,194,201,176,187,205,221,204,205,184,225,185,189,226,196,206,194,179,172,200,175,187,209,201,176,212,190,228,196,210,195,208,176,201,214,180,169,194,204,192,201,190,176,175,216,195,209,203,201,202,204,193,196,204,208,191,226,193,197,188,209,215,185,207,212,220,204,191,197,193,193,197,198,194,205,171,190,196,179,174,200,205,186,194,182,201,201,205,195,207,206,201,192,173,191,202,181,214,171,210,217,195,194,201,196,196,196,215,208,205,207,235,199,202,177,206,213,202,176,211,210,208,188,177,210,215,191,211,208,203,203,194,192,205,222,204,187,212,176,184,214,194,205,190,212,222,202,201,207,222,210,196,197,193,218,234,218,207,217,205,219,225,209,219,199,198,207,224,193,226,193,202,201,213,204,226,203,216,207,219,205,196,230,204,212,209,206,219,206,210,221,202,208,204,221,207,205,202,207,232,215,197,236,188,216,221,198,235,223,216,213,214,191,212,223,198,201,200,210,195,183,231,220,202,225,244,238,226,219,184,213,188,201,236,235,205,222,212,212,223,205,212,191,230,186,215,218,245,237,198,200,225,212,220,218,214,229,220,208,195,225,221,187,164,205,187,215,208,234,209,214,198,237,221,237,235,203,221,202,193,174,194,136,202,173,186,219,225,244,217,243,240,160,208,211,231,195,150,205,172,185,98,98,229,183,118,195,234,197,224,220,142,139,154,130,155,119,214,200,90,177,248,176,34,63,147,150,131,76,42,10,172,243,119,175,244,188,201,225,237,225,112,66,156,200,198,224,233,219,202,145,143,132,177,226,235,246,151,144,221,247,246,240,254,230,210,238,220,220,191,234,198,162,173,172,187,65,73,45,51,40,19,30,22,84,88,114,164,65,14,13,25,11,37,64,47,40,37,78,52,66,84,213,198,149,193,219,251,238,234,161,157,216,237,238,213,209,156,155,227,245,192,241,214,166,134,31,48,80,86,106,125,70,57,45,64,141,92,26,43,146,175,134,135,137,107,15,22,22,57,50,17,58,30,40,38,10,25,32,29,18,45,22,26,34,25,22,47,20,15,36,30,19,48,22,66,19,39,3,36,27,21,30,21,72,86,62,80,20,31,27,33,53,35,43,50,33,40,41,40,3,21,33,36,10,38,38,13,37,33,46,56,47,71,134,147,87,59,74,64,89,89,71,51,41,42,30,5,13,16,17,45,40,53,33,28,23,13,24,22,13,45,29,22,18,15,37,16,41,42,32,18,41,33,32,17,38,30,50,102,80,94,74,45,61,42,69,95,157,198,146,63,24,14,42,46,33,33,43,32,33,45,50,29,26,42,30,23,37,43,62,84,76,36,57,58,73,47,65,71,10,103,107,104,112,92,131,83,92,47,75,234,255,251,237,236,244,254,246,242,255,220,248,248,235,248,235,243,252,253,240,241,246,251,224,246,249,251,248,229,242,235,245,242,239,240,233,252,238,255,244,241,254,245,243,253,237,239,237,255,247,240,229,238,247,246,247,254,231,237,250,248,251,248,212,106,8,18,0,15,15,27,9,10,11,20,22,5,22,14,1,0,22,34,0,10,15,13,34,8,201,202,193,197,195,229,169,188,205,177,191,199,195,180,198,203,219,180,189,186,204,208,190,174,176,226,183,189,178,229,207,182,207,198,184,160,217,180,218,220,182,194,219,212,203,208,178,210,195,183,198,214,195,205,209,238,192,178,191,208,194,208,201,212,209,190,199,196,191,185,205,201,180,193,183,187,206,196,230,180,203,193,215,191,203,197,213,187,201,212,197,196,160,207,188,204,201,197,192,182,204,205,205,200,199,208,179,196,200,189,202,194,196,188,194,209,197,201,200,192,208,196,201,191,166,180,198,204,205,166,211,201,203,208,219,212,211,178,234,201,219,216,212,207,230,206,217,218,209,199,187,213,226,179,222,197,193,214,188,209,207,222,200,188,204,227,208,216,232,218,204,182,217,177,230,213,216,190,229,212,201,215,184,217,202,207,210,213,191,222,205,219,237,197,209,214,218,203,226,226,210,209,217,193,196,188,202,226,198,213,206,207,217,210,213,204,222,197,196,206,210,221,189,194,193,176,222,223,214,191,210,215,202,197,233,203,226,193,183,194,211,205,187,198,216,220,206,227,204,201,226,222,210,214,202,209,212,225,206,238,195,214,220,205,205,216,207,197,211,196,216,215,222,226,210,213,223,226,220,209,220,244,242,231,204,183,202,242,228,214,216,215,202,223,220,213,208,204,185,206,204,227,194,215,220,212,241,234,226,228,227,202,179,226,214,225,140,164,228,201,186,95,121,246,201,96,202,217,198,184,226,133,133,175,160,174,137,223,193,108,164,220,221,85,98,155,144,108,87,47,40,200,246,98,202,242,181,177,203,238,132,94,43,98,171,209,245,205,201,143,164,162,183,203,221,252,229,144,161,248,250,225,231,238,229,204,227,237,208,207,239,232,164,149,176,174,107,35,59,42,74,83,103,165,146,157,111,117,42,7,12,9,46,40,28,14,20,58,49,61,129,195,165,172,190,201,237,240,241,169,154,225,243,239,240,226,199,148,201,246,237,213,244,212,26,31,34,96,128,101,123,83,27,86,52,72,127,77,51,35,22,65,111,139,134,76,31,32,48,29,63,49,27,43,39,20,15,15,31,21,20,27,37,29,57,60,23,35,36,34,28,36,28,20,42,30,19,24,38,50,45,30,25,31,55,48,87,61,24,25,26,48,30,6,36,33,28,40,28,28,60,30,6,26,29,41,23,36,31,52,50,99,89,85,125,187,75,49,38,71,79,110,75,38,53,20,29,47,24,54,20,28,28,39,39,3,19,27,3,34,33,20,12,30,76,24,35,29,46,64,26,28,39,36,30,12,40,21,52,44,56,22,68,43,49,57,47,47,93,184,133,38,36,15,26,30,38,49,58,52,23,14,50,53,46,40,45,22,23,31,32,74,72,71,56,55,98,63,63,96,33,91,145,104,120,101,83,107,77,84,83,214,227,223,252,250,249,244,248,255,240,250,219,233,252,250,238,253,252,250,240,244,247,252,241,226,252,243,246,225,250,222,255,236,251,249,243,232,246,250,242,253,231,225,237,251,236,236,241,252,246,222,247,241,248,248,243,255,252,238,237,239,248,250,242,103,15,12,1,31,27,2,12,1,24,38,8,15,6,0,11,9,10,7,11,18,21,5,19,10,198,213,201,206,212,195,221,206,198,209,197,197,180,190,199,229,210,193,196,191,182,181,192,217,203,197,179,208,187,188,183,151,184,208,191,186,181,219,203,203,197,195,170,209,212,209,212,197,185,185,197,188,170,170,190,190,206,187,189,199,202,210,193,180,215,215,211,212,209,187,207,180,192,191,226,190,193,198,183,178,186,180,219,167,199,192,187,213,184,183,194,177,170,201,218,211,227,219,188,200,199,194,198,202,188,202,185,208,199,196,204,193,211,201,217,212,190,205,190,205,191,198,217,197,209,208,181,201,200,211,193,212,224,187,197,202,221,225,208,203,192,175,204,224,201,178,193,194,195,228,230,224,202,183,197,202,216,217,201,206,222,197,200,205,214,212,203,185,223,209,231,196,204,215,201,218,199,205,184,186,200,196,205,203,225,203,196,200,209,217,220,202,205,220,222,206,208,218,210,215,197,217,225,217,217,220,198,213,209,189,213,208,193,214,227,223,211,209,189,211,205,224,217,198,216,204,219,224,204,211,192,234,218,213,211,205,236,184,192,210,236,214,211,217,201,196,205,215,212,231,222,210,210,198,208,211,187,224,210,221,210,191,225,219,223,219,214,211,207,200,219,229,222,217,226,219,208,213,233,226,222,207,232,242,215,213,216,183,194,211,232,211,192,204,194,210,194,203,212,245,217,237,189,220,208,225,199,191,201,195,202,219,172,212,211,216,141,166,237,190,181,92,112,238,215,100,130,189,182,194,216,172,152,134,179,172,142,247,201,103,188,238,233,130,72,118,125,115,72,42,30,174,232,89,215,250,168,137,180,175,177,138,101,129,160,235,209,166,225,166,160,128,166,176,197,231,226,139,188,232,221,249,223,207,207,205,237,220,231,214,229,248,201,171,150,225,109,14,30,81,105,67,146,168,212,190,48,41,33,5,26,42,46,14,4,42,34,61,24,120,202,181,176,182,237,241,255,252,175,152,194,244,238,213,247,170,143,203,250,241,241,201,246,108,26,64,78,131,106,129,74,29,61,111,57,92,146,57,27,58,105,123,125,82,108,64,21,25,5,29,46,41,40,16,46,28,45,39,21,24,33,33,14,26,26,17,49,23,25,44,32,27,31,32,33,38,19,36,39,34,33,52,23,56,25,37,66,76,28,20,28,45,50,4,28,18,45,37,33,35,12,46,44,37,21,24,35,27,42,71,102,128,112,150,171,129,83,35,23,34,57,57,34,60,26,28,33,22,35,7,38,32,33,11,21,14,6,16,19,11,29,43,31,24,31,17,25,37,18,24,15,43,28,28,35,20,53,37,7,22,41,6,43,70,15,51,7,31,49,29,50,40,53,23,38,45,50,39,49,43,51,40,46,24,31,35,5,38,39,30,50,63,69,67,54,69,85,57,68,78,25,46,124,132,137,110,102,113,58,29,25,122,237,241,220,249,254,246,211,235,251,254,247,226,255,252,234,251,254,249,254,240,243,240,252,245,254,253,249,243,252,237,240,254,248,247,211,255,237,248,252,242,242,248,249,255,244,255,228,253,233,246,244,250,252,241,253,249,242,241,226,250,253,252,242,97,2,12,12,13,14,13,0,36,22,12,18,17,4,3,27,7,23,7,33,1,16,1,27,11,172,193,205,215,189,205,186,178,204,201,202,228,171,190,177,190,197,194,193,202,189,199,213,187,197,211,190,188,184,190,187,193,185,200,192,184,181,213,184,193,210,201,196,189,191,187,177,194,177,195,207,214,197,203,197,198,188,198,190,192,181,186,191,192,207,213,196,196,212,187,185,196,196,181,211,196,234,188,197,194,187,212,186,205,201,205,195,200,226,205,200,181,194,216,201,210,194,201,194,199,204,233,193,201,204,223,210,184,208,198,222,195,205,196,185,208,194,203,217,216,197,222,219,207,211,208,209,214,190,215,209,189,217,198,212,194,218,174,208,224,210,210,208,233,189,189,184,187,207,196,200,170,210,199,222,197,201,206,228,202,183,210,217,192,200,203,207,183,200,204,210,218,211,225,195,214,202,219,209,211,211,193,204,204,225,214,231,221,204,213,197,214,219,179,216,225,204,191,194,196,205,217,214,190,220,207,205,218,201,231,199,224,200,210,207,202,221,189,208,185,209,196,230,181,204,217,221,202,210,199,239,205,232,208,179,212,201,205,181,183,184,218,202,192,221,233,193,218,199,197,236,215,224,213,200,217,227,211,217,210,204,214,254,230,223,192,219,213,212,231,203,179,218,209,237,219,233,224,220,237,223,234,235,246,203,233,184,196,178,183,188,221,215,204,222,211,222,233,218,218,223,236,202,204,203,184,228,216,210,211,226,201,205,216,210,229,135,169,243,173,209,77,72,220,221,130,146,237,195,217,228,142,152,158,157,157,174,231,203,110,185,235,229,160,82,112,115,125,60,28,57,196,195,95,213,174,112,136,218,220,194,210,120,116,145,193,171,216,190,185,162,52,104,182,253,236,191,140,227,231,238,241,213,236,202,215,227,209,219,189,235,238,200,176,165,220,111,26,53,60,33,89,100,123,164,134,38,38,11,24,36,20,22,21,13,42,48,57,111,226,228,121,172,212,245,226,251,216,161,223,250,251,251,247,170,150,208,255,254,248,242,175,98,53,43,95,76,133,124,81,63,56,113,165,35,112,146,37,42,149,224,165,147,113,100,54,19,28,8,42,46,17,42,31,41,44,12,10,25,18,30,32,16,29,20,17,17,48,8,23,27,15,35,32,36,47,37,36,34,42,52,21,30,47,29,35,69,64,36,12,38,22,11,9,16,14,28,16,26,10,48,57,23,19,25,25,26,75,137,172,99,125,162,166,113,68,26,26,22,18,51,64,60,26,43,37,8,20,29,45,31,25,25,15,38,23,49,29,9,5,36,4,30,15,0,52,17,49,51,24,21,49,32,43,44,19,35,44,17,34,11,46,67,36,21,61,36,8,4,15,15,40,26,26,43,9,35,26,59,42,36,33,56,59,47,45,23,60,29,43,43,91,69,37,59,65,68,76,52,83,43,54,143,131,116,122,102,105,86,62,23,84,245,229,238,239,251,248,252,236,248,253,245,229,255,249,255,247,253,247,240,253,255,252,233,252,239,255,255,236,242,254,230,246,237,241,243,251,253,255,252,238,240,251,240,229,222,234,239,252,238,239,246,236,226,244,247,254,253,239,255,248,253,255,230,104,16,26,3,7,0,10,29,0,33,9,32,14,3,12,5,12,9,9,8,4,7,25,16,10,192,201,213,201,222,194,207,215,213,194,211,189,207,188,229,182,197,188,205,217,197,190,205,199,173,226,195,219,193,209,173,198,210,209,201,195,183,169,177,184,203,199,229,230,200,203,184,187,177,194,189,193,178,200,224,190,165,186,182,198,212,193,184,210,184,207,169,196,199,197,204,187,207,212,188,191,201,214,195,206,201,221,189,192,202,195,198,192,196,214,186,198,185,189,192,203,185,199,213,205,209,197,191,221,195,210,198,185,222,209,212,190,212,219,205,193,195,219,209,208,215,204,227,192,206,199,210,174,207,213,219,204,207,194,202,214,212,212,217,209,197,205,178,205,209,221,201,192,205,204,204,199,211,222,192,209,197,205,225,200,200,200,198,202,211,199,205,229,193,190,189,228,199,228,192,219,210,208,207,200,222,187,200,200,208,205,220,219,213,196,192,216,235,215,224,217,222,186,215,229,238,201,180,201,209,216,185,209,202,210,199,211,213,204,209,213,220,201,202,209,221,217,205,210,206,221,180,241,236,218,218,192,197,225,218,203,205,202,203,217,213,211,245,219,210,218,214,223,218,214,226,213,218,220,229,203,214,223,206,199,193,233,210,223,208,222,225,210,222,217,236,220,222,235,243,223,230,208,231,230,231,218,234,188,171,169,202,217,191,176,176,212,230,214,206,241,217,242,199,226,221,216,214,230,214,200,232,213,219,235,225,167,184,235,237,213,146,190,248,215,209,84,113,215,214,147,160,246,192,200,207,121,160,169,178,157,172,246,216,144,197,224,255,239,138,110,140,117,77,39,4,165,138,91,164,136,158,167,234,209,221,207,77,58,145,193,186,243,201,191,108,56,133,217,238,255,155,127,226,217,252,247,219,242,230,208,238,227,216,204,249,231,175,144,122,225,124,73,117,93,20,74,120,92,68,85,30,40,19,30,41,49,23,20,64,74,46,82,221,220,151,157,235,252,225,212,200,166,212,243,252,250,240,192,125,197,247,241,236,253,246,101,42,12,36,118,102,127,108,30,72,95,167,139,29,108,147,48,37,99,186,163,151,142,109,43,8,13,1,2,26,19,23,7,49,21,26,9,17,5,25,23,25,32,30,54,49,45,45,36,25,19,4,28,43,13,39,41,55,48,30,42,39,65,42,50,49,83,41,22,35,59,35,29,31,46,18,30,30,42,36,27,52,30,39,56,41,60,105,151,128,84,122,83,55,43,3,4,34,19,56,69,80,51,41,29,28,31,25,32,31,34,34,40,1,7,22,23,15,19,48,38,29,30,43,28,16,42,50,25,41,52,8,48,38,54,28,9,25,31,17,34,79,42,44,39,3,45,9,23,30,29,29,40,31,44,7,33,26,60,66,49,28,18,35,39,40,31,38,46,41,65,79,37,57,53,84,81,23,84,18,57,99,128,108,119,112,131,61,58,14,55,224,228,255,242,238,253,252,242,247,226,243,233,248,231,253,230,249,243,251,241,226,240,254,249,245,233,230,238,239,241,232,244,254,253,228,251,255,253,227,238,222,237,248,226,237,252,243,234,248,247,250,235,249,239,246,253,241,241,248,255,235,250,243,110,0,0,5,12,14,28,9,46,21,8,27,1,4,13,0,4,29,0,12,15,31,18,2,29,192,179,198,228,198,212,218,188,209,210,209,211,204,200,202,198,198,209,194,216,209,196,185,179,211,216,198,188,227,213,191,203,222,219,194,220,194,175,179,174,186,186,215,191,189,183,202,183,186,192,184,209,220,191,215,189,212,175,216,184,215,214,214,198,178,194,199,216,207,195,187,197,213,194,208,202,205,210,166,189,202,194,232,180,210,203,194,201,190,230,218,213,201,224,199,215,215,203,197,184,203,184,201,207,213,187,176,182,194,200,213,191,210,199,234,231,209,184,210,227,231,216,212,217,195,208,189,206,197,226,218,217,212,195,196,209,222,193,209,228,207,241,195,207,195,196,230,179,217,203,216,222,211,192,212,210,216,213,210,184,205,195,221,189,206,228,194,218,206,214,199,204,184,196,205,220,205,219,218,211,199,205,208,218,210,189,229,214,216,203,208,204,226,219,207,217,229,193,224,196,204,215,214,193,203,207,200,188,199,217,225,195,202,188,213,199,223,210,205,209,206,187,197,217,206,219,210,201,212,183,204,215,205,194,192,197,223,201,204,219,216,197,206,207,204,231,208,200,211,225,214,242,191,230,237,203,213,209,202,214,220,215,239,222,208,212,224,218,228,230,225,233,243,247,223,227,238,246,238,213,198,181,181,181,227,242,238,244,244,227,242,212,211,219,246,230,232,243,214,233,236,247,224,206,213,234,241,234,234,240,220,203,209,225,253,236,130,200,240,189,200,65,120,227,232,157,106,226,199,177,153,98,146,188,192,166,177,229,229,134,206,235,246,236,115,124,103,95,67,18,13,127,148,104,185,175,207,195,231,179,161,145,101,102,145,162,215,235,169,143,152,126,182,243,252,252,137,137,250,221,232,228,236,242,209,188,241,203,238,186,226,217,217,134,197,240,172,57,70,90,44,141,164,76,83,81,53,18,12,12,26,32,49,32,40,54,70,193,234,197,136,194,239,241,215,183,146,195,251,245,235,223,206,136,217,255,250,232,244,250,188,34,35,0,36,106,112,85,45,59,119,138,153,92,49,124,113,40,35,83,150,122,118,138,95,18,29,35,22,20,20,43,48,41,55,26,35,31,18,30,15,50,40,36,33,45,22,10,33,28,32,26,44,59,49,40,35,19,43,34,26,57,29,30,26,33,66,69,64,27,12,33,50,22,22,16,43,55,26,31,36,20,40,27,42,49,22,27,39,103,103,56,24,42,23,12,6,35,7,21,30,58,79,70,37,27,51,30,16,27,21,21,26,25,28,25,19,1,29,25,35,18,18,56,7,16,39,31,40,24,41,53,38,51,53,21,46,19,40,33,39,29,66,30,14,31,21,29,22,61,12,35,33,45,39,39,18,44,25,23,46,54,61,56,31,30,35,30,39,16,12,66,87,53,70,73,93,74,25,52,52,25,96,91,128,128,117,125,66,61,35,59,219,236,224,239,255,250,238,237,247,253,253,253,242,249,237,239,227,237,238,255,255,242,250,240,247,234,248,239,245,249,243,248,230,247,236,233,245,233,247,248,250,255,251,255,241,225,245,219,246,234,249,248,253,248,253,255,244,238,255,239,239,250,254,111,3,6,4,30,24,1,23,3,16,5,8,33,27,2,8,8,10,4,12,6,9,16,27,26,217,238,218,179,211,183,209,207,207,235,215,195,222,224,221,201,207,196,190,213,218,212,203,179,197,203,187,204,230,198,194,207,200,177,225,179,205,198,191,211,224,213,195,191,201,164,186,190,177,210,199,193,200,205,206,212,230,198,197,210,207,199,186,183,194,181,206,220,193,193,209,186,195,196,204,190,183,192,215,204,188,209,189,213,196,224,200,196,195,212,194,200,209,186,217,190,206,209,208,184,211,218,192,204,195,190,199,206,220,204,213,195,183,217,200,199,219,209,224,213,227,184,218,214,200,226,168,208,207,224,224,223,206,219,219,190,202,178,230,205,192,196,208,205,210,209,212,213,214,212,207,206,218,209,196,205,231,199,193,230,192,216,201,208,219,208,200,211,227,211,212,197,223,193,209,191,203,179,188,227,211,199,219,181,193,206,220,186,225,239,185,211,219,223,209,205,185,209,184,212,207,193,222,228,208,187,215,213,208,224,234,216,197,200,197,180,195,200,220,221,214,221,204,203,211,220,226,209,213,187,225,209,223,211,203,210,200,193,224,218,209,199,210,226,205,206,233,192,222,212,203,216,212,212,205,206,215,226,230,236,246,249,236,234,252,240,237,253,236,248,255,235,244,242,230,237,229,226,207,220,225,252,253,255,251,246,247,238,228,234,234,241,230,239,245,249,238,251,255,248,249,254,231,221,246,227,255,246,244,248,255,208,239,246,242,249,176,230,252,187,141,75,88,238,237,173,128,249,240,231,154,122,164,181,194,208,244,229,220,186,236,246,255,236,115,92,76,73,52,33,4,143,173,162,235,191,190,216,183,165,181,164,122,122,103,146,196,187,172,178,147,183,224,245,255,232,144,214,249,226,249,217,213,226,212,215,201,212,221,198,221,239,176,140,206,247,156,47,85,37,65,185,78,36,41,83,41,34,18,32,21,14,35,43,36,37,130,237,205,176,157,218,240,227,177,133,170,214,242,244,242,213,136,179,243,243,245,252,245,178,41,17,53,28,106,104,79,58,65,113,117,134,123,82,76,167,93,63,28,37,96,137,105,164,23,15,25,40,50,38,45,57,28,32,22,27,31,26,40,21,38,29,36,42,29,27,23,41,17,23,43,36,25,34,35,37,41,56,36,20,10,24,42,19,21,50,74,98,40,54,25,20,48,19,39,37,27,26,17,42,35,37,40,28,28,26,35,25,45,109,46,64,51,36,22,3,6,18,18,20,57,45,64,38,57,39,45,15,16,32,13,29,1,2,19,43,22,41,36,18,18,16,32,24,38,35,39,34,22,38,27,47,32,13,29,37,25,22,18,17,13,71,65,43,20,52,30,61,26,32,20,41,23,54,69,9,50,15,34,38,31,27,31,11,32,32,20,48,43,29,38,85,113,26,54,46,55,59,39,62,37,33,84,116,97,99,114,122,86,24,26,3,191,238,238,240,249,242,235,250,243,243,250,250,238,245,251,249,248,249,244,236,244,222,226,228,242,237,226,249,255,233,255,254,255,249,254,254,230,248,252,236,251,239,235,239,232,250,245,241,239,224,240,255,247,234,225,248,240,254,243,234,241,252,249,114,6,13,22,8,0,15,36,25,5,0,7,49,7,9,0,0,14,20,23,31,30,3,14,41,190,219,212,203,200,200,221,201,205,196,197,223,200,188,190,195,216,205,216,214,194,214,174,194,181,201,212,191,195,225,190,209,189,197,200,202,169,213,207,183,169,200,210,210,205,212,207,187,199,193,204,192,187,201,196,167,215,198,212,210,205,192,198,208,205,199,176,192,210,205,200,196,215,204,214,192,214,212,196,197,204,202,204,198,188,194,180,209,199,200,216,187,199,228,204,190,214,197,199,236,183,203,188,206,232,179,197,223,182,220,193,201,208,210,197,164,202,208,186,210,208,188,218,204,227,201,228,215,192,224,201,216,219,207,201,213,204,213,196,174,218,188,205,207,221,205,215,194,205,201,187,185,194,213,228,205,205,215,207,187,221,193,193,191,212,234,194,224,197,206,200,213,222,218,206,224,208,229,212,216,214,204,197,217,216,226,196,222,215,216,199,205,195,215,209,201,208,198,198,216,207,213,213,210,206,210,214,226,217,208,198,205,211,203,198,183,203,196,217,218,228,197,229,199,197,202,212,232,212,214,205,223,221,194,215,233,197,203,221,200,205,209,216,213,224,208,197,206,236,233,203,200,206,213,209,213,188,207,196,216,219,196,209,214,190,181,227,209,179,174,166,177,173,157,171,147,158,131,184,185,209,225,193,220,210,191,203,210,232,202,200,190,188,168,183,183,198,208,184,200,203,190,191,184,202,173,207,199,199,200,190,169,190,185,210,146,133,187,204,140,93,18,55,169,144,143,118,211,195,190,144,131,154,176,197,162,227,255,207,164,229,237,250,229,108,100,90,70,65,44,39,135,207,149,224,194,191,215,176,221,214,231,192,154,132,118,174,169,235,175,174,205,234,234,225,235,148,228,219,246,255,235,236,237,202,243,232,236,225,210,213,234,171,140,202,242,119,39,53,77,121,173,84,12,64,82,74,54,43,24,23,28,59,51,59,121,233,248,167,189,189,234,222,195,151,173,230,244,248,245,238,153,201,241,240,252,240,231,146,39,15,15,88,85,139,106,37,75,117,120,128,102,101,36,67,134,87,38,63,187,152,144,129,129,59,7,35,28,19,52,64,43,49,38,53,22,33,34,21,19,35,33,26,29,25,23,33,12,28,20,31,45,74,81,62,59,21,29,8,17,37,59,19,30,30,10,32,73,54,54,16,23,28,21,35,20,64,29,45,11,9,21,46,44,35,30,21,32,31,44,42,46,55,21,17,43,10,6,5,19,30,48,62,39,56,46,30,15,19,12,37,11,10,19,29,22,17,24,13,6,27,6,30,15,35,39,27,39,34,25,22,33,16,35,60,6,28,22,5,28,47,59,54,14,34,41,31,35,21,22,34,31,26,12,16,41,31,37,38,18,34,37,26,25,52,24,50,35,35,30,52,78,101,77,63,56,35,38,44,51,49,35,80,110,98,141,79,113,98,46,34,30,156,236,237,232,253,243,245,248,252,254,252,239,234,249,255,236,255,253,231,250,225,250,255,239,255,253,254,245,224,233,235,253,251,253,250,254,237,249,251,226,223,251,255,248,247,242,220,255,238,223,252,248,236,237,249,252,250,237,251,228,249,234,251,115,2,14,40,0,23,11,2,1,19,3,32,8,23,2,4,2,16,23,18,9,19,27,15,19,191,185,221,187,211,179,215,205,196,192,195,223,195,201,211,205,217,233,190,182,200,193,212,207,214,211,215,225,177,191,220,206,204,197,200,203,214,211,206,183,204,175,197,205,226,189,222,201,209,159,196,201,193,197,176,211,215,184,203,201,198,180,183,170,206,195,203,197,185,196,209,206,217,167,173,205,193,196,207,186,202,185,204,201,206,188,161,206,190,196,203,200,206,205,213,203,199,213,200,204,209,205,222,225,187,196,191,195,188,202,220,216,216,178,229,215,218,219,213,217,226,200,174,201,198,187,227,238,207,183,231,189,211,202,206,208,238,209,179,225,200,198,180,209,180,218,182,221,186,208,184,201,191,198,182,210,200,176,205,198,222,212,212,201,190,214,200,206,225,201,203,191,225,194,223,193,227,221,205,209,188,203,228,212,206,220,211,187,230,223,225,221,206,199,217,213,202,213,201,218,231,205,207,209,223,190,207,237,226,206,210,203,203,191,220,218,190,202,199,193,186,201,222,207,188,209,213,223,218,220,226,214,199,227,174,208,204,220,222,187,218,203,203,208,211,203,218,214,233,207,215,223,215,213,214,203,175,142,112,112,89,100,100,97,91,68,93,114,84,71,69,50,58,28,19,13,29,24,37,48,32,62,48,49,30,70,61,58,27,34,45,49,59,81,102,93,95,72,91,37,42,48,44,68,72,57,63,15,39,60,70,36,50,50,57,55,18,29,46,64,144,51,10,10,19,31,5,88,63,69,76,108,86,93,97,89,110,125,86,61,92,77,129,119,88,118,133,63,108,34,20,138,108,80,135,123,101,142,155,148,153,151,117,124,49,93,171,136,187,106,94,120,196,241,212,145,119,239,235,237,230,217,224,227,204,235,244,224,214,186,250,208,159,186,187,206,118,61,63,76,198,190,69,33,70,109,78,36,35,14,29,39,74,56,73,228,252,194,211,213,210,191,184,154,175,244,255,239,237,229,185,162,244,242,250,253,248,167,15,28,57,65,101,85,80,32,89,138,141,138,88,47,74,41,81,142,71,50,132,202,206,143,145,124,63,12,40,29,37,33,52,37,41,40,47,20,43,20,20,36,11,18,29,39,39,27,32,13,42,52,100,163,173,152,121,147,67,46,51,65,7,55,28,20,30,44,41,76,80,61,17,30,20,26,48,2,32,32,35,21,24,47,29,26,20,25,23,55,30,21,32,63,34,17,35,13,18,5,5,18,53,13,43,65,58,56,58,22,17,17,11,28,11,20,24,16,18,41,34,26,45,21,6,15,43,19,24,48,37,43,30,55,47,14,32,28,8,28,32,31,42,62,36,29,13,24,23,18,20,10,35,52,60,18,35,28,32,20,39,35,17,44,51,42,43,46,12,39,35,34,33,68,131,74,40,75,71,63,45,31,42,25,58,132,104,121,89,95,113,34,20,15,112,224,251,252,242,217,252,255,239,251,243,250,252,246,245,252,255,251,222,233,250,245,249,245,253,237,244,211,239,252,236,236,245,246,248,245,228,254,236,236,245,253,249,249,255,243,244,246,242,251,232,255,255,252,255,241,250,254,242,250,238,246,246,123,17,5,1,18,0,3,19,17,27,23,23,6,5,4,12,4,2,0,26,23,15,8,12,5,227,187,198,201,200,203,203,193,203,228,190,210,222,196,208,196,224,213,200,196,211,215,205,213,205,219,207,208,193,213,191,195,209,219,202,206,214,197,216,198,185,176,175,201,195,219,207,197,186,199,178,191,205,177,213,222,206,196,206,220,200,202,188,193,191,178,244,204,200,222,208,199,198,185,206,201,179,183,199,211,183,196,215,200,213,214,213,207,222,208,175,200,201,210,215,202,215,219,227,189,211,190,183,209,190,206,227,193,213,203,200,215,208,221,202,194,212,218,179,218,211,200,205,206,200,224,180,214,219,209,200,206,209,215,211,219,195,206,210,202,181,200,212,180,201,196,195,214,192,197,208,214,222,203,194,196,204,197,192,187,207,210,208,207,230,206,212,197,217,225,198,211,225,195,190,215,197,218,193,181,199,225,198,215,204,205,194,201,206,208,217,188,202,199,212,195,226,212,219,216,200,215,181,206,196,211,204,215,209,215,200,204,208,217,208,205,208,254,219,216,208,219,214,223,205,189,233,198,238,216,231,217,239,204,219,210,198,204,203,216,213,220,210,212,207,211,206,202,221,212,198,196,206,219,206,213,198,158,166,135,141,147,127,128,80,82,81,107,112,125,162,118,131,129,124,126,90,116,100,94,97,132,93,80,108,92,107,116,100,118,125,126,109,133,138,152,108,123,98,106,114,109,112,126,116,86,109,98,117,119,92,108,116,85,93,89,55,89,113,122,153,87,59,101,50,46,37,61,91,97,80,117,101,107,113,99,89,71,107,59,61,78,84,83,56,104,109,68,61,45,82,107,51,33,76,55,48,46,47,49,35,41,28,35,36,94,108,91,82,34,53,14,80,90,94,65,50,113,88,121,132,114,135,193,192,199,223,237,245,201,212,203,163,188,181,197,156,106,74,110,243,198,50,32,52,88,90,28,15,25,46,47,58,89,180,245,228,234,242,250,186,195,119,104,213,229,251,223,243,166,162,237,243,234,246,254,192,40,30,59,134,121,128,50,23,79,123,122,137,76,64,84,56,28,101,127,68,26,56,121,145,118,139,122,54,18,54,26,5,11,38,19,48,45,36,31,25,27,36,10,33,17,18,19,38,36,49,54,116,143,139,177,120,185,159,184,144,108,44,39,38,34,8,33,41,37,53,81,75,48,19,25,28,21,18,28,52,11,18,22,10,34,33,20,39,44,37,9,12,8,24,15,37,28,32,10,2,10,17,25,19,35,81,93,79,35,6,21,6,7,35,37,32,33,20,36,8,28,40,33,28,25,38,19,26,40,31,43,17,38,40,47,44,31,48,42,44,38,36,26,66,83,37,19,30,28,18,47,32,25,21,44,53,30,39,13,44,30,38,51,37,30,15,36,33,15,41,28,18,41,38,38,124,66,83,73,65,61,64,64,42,39,72,115,109,125,137,153,105,63,36,18,85,221,247,251,242,253,230,240,251,254,215,247,253,239,235,227,245,245,250,243,225,229,251,249,247,246,243,240,239,242,222,250,253,253,255,245,223,248,240,244,249,246,255,230,255,221,251,241,249,232,248,250,232,241,245,246,250,234,247,248,234,255,230,114,12,0,15,9,11,18,19,6,8,6,19,6,0,5,5,28,12,11,33,8,8,17,25,12,191,175,185,201,185,189,184,219,220,181,189,175,179,206,203,187,194,218,198,196,201,210,193,179,212,228,195,207,188,188,221,188,192,189,191,211,188,218,207,219,194,197,209,200,209,214,191,199,204,190,181,193,196,210,188,211,217,199,217,186,199,188,200,205,217,207,223,175,192,205,192,219,193,186,208,211,189,215,200,220,199,188,208,193,200,195,189,213,185,194,215,208,216,207,210,218,214,206,210,191,227,202,181,199,185,209,187,217,213,197,206,208,205,209,214,196,213,205,230,212,211,202,223,203,211,201,200,224,210,202,202,212,213,183,188,210,199,211,194,178,199,192,193,209,221,213,230,208,212,183,204,190,198,174,183,179,182,214,203,201,200,220,200,203,192,194,210,206,199,203,213,191,210,205,201,205,196,224,188,214,168,191,225,201,193,187,201,198,229,179,190,186,212,210,214,224,210,204,220,239,236,222,216,215,213,179,225,235,203,227,210,220,214,190,190,228,197,187,236,220,210,208,210,211,208,212,229,202,213,208,223,217,209,217,230,196,234,216,242,232,232,173,215,196,241,204,219,209,201,205,235,213,219,216,225,196,206,250,240,240,221,238,203,212,185,172,216,213,236,248,252,237,222,247,245,239,255,211,220,247,226,231,239,215,241,244,229,251,246,246,240,247,234,218,243,238,243,209,223,247,255,246,236,237,241,221,252,237,242,228,219,213,244,236,233,197,197,249,238,212,162,86,131,255,225,194,187,169,244,191,186,215,198,198,175,210,255,212,254,178,209,252,237,253,138,88,98,53,62,49,82,149,213,171,120,201,161,116,97,127,126,113,169,199,125,125,92,139,135,118,138,127,156,150,139,83,105,169,144,180,123,180,161,228,206,221,238,233,211,206,242,157,188,187,179,224,160,87,87,214,243,114,22,139,86,58,36,14,26,8,70,24,23,147,227,220,242,242,245,207,191,140,70,170,222,241,235,243,173,166,191,233,238,244,243,184,54,12,75,113,110,103,89,10,40,93,160,123,93,52,90,124,52,27,106,141,36,48,76,111,130,149,108,110,25,8,12,21,20,21,36,68,29,32,53,57,17,31,34,38,24,31,28,25,29,37,95,182,197,167,134,96,100,164,86,47,67,53,58,24,42,28,7,19,65,62,38,74,46,51,20,29,37,43,21,29,32,43,33,34,34,40,16,29,17,34,10,24,32,29,27,20,34,5,19,25,10,26,60,57,26,45,78,69,62,52,9,51,18,3,24,19,34,17,43,48,23,28,9,41,34,27,29,6,13,44,40,34,61,29,21,30,61,26,43,22,49,24,42,41,91,50,37,43,3,15,14,19,56,58,59,45,58,64,68,84,58,59,23,47,23,28,26,41,36,21,29,31,43,38,31,51,101,48,54,80,68,97,115,39,60,32,69,103,92,110,110,117,118,44,37,11,53,190,245,239,253,250,241,245,232,254,248,253,235,249,246,249,245,228,243,247,234,254,255,238,234,249,239,244,249,245,237,253,247,234,252,239,254,244,253,247,237,229,244,253,222,237,235,243,236,245,243,248,243,235,250,246,252,253,244,253,255,254,246,122,7,1,1,12,29,4,8,21,25,12,25,10,18,2,17,5,8,4,5,14,6,21,17,15,215,220,216,203,209,189,195,216,197,192,196,202,229,202,190,186,183,198,185,209,209,197,213,192,203,216,205,191,199,210,186,198,190,211,207,200,204,208,207,181,227,196,183,206,197,214,205,179,219,191,201,196,193,201,179,182,191,217,217,197,215,202,216,183,187,216,204,220,194,201,210,203,200,201,183,187,207,198,179,185,177,198,192,185,192,227,199,201,192,193,172,198,202,194,186,202,196,207,214,213,191,214,203,245,194,205,220,226,188,197,186,191,213,215,199,210,206,197,209,192,218,171,206,210,208,199,219,212,220,208,173,176,212,226,196,196,227,205,206,192,182,203,200,195,211,237,195,204,203,230,204,208,200,212,180,205,205,216,208,230,201,201,208,194,224,202,185,200,163,223,194,193,202,193,192,213,208,207,228,196,185,192,180,208,201,206,181,184,223,172,225,210,224,212,190,214,240,210,206,210,190,218,223,203,234,211,214,223,210,207,215,206,204,218,195,202,199,190,189,197,227,226,212,215,225,203,210,230,214,225,215,235,163,143,198,170,222,197,213,221,208,232,196,196,198,194,206,214,208,207,207,205,227,190,228,195,206,206,193,205,207,219,202,235,228,215,204,244,252,254,255,249,243,249,234,249,226,249,253,255,239,245,235,234,250,240,251,252,255,247,254,254,237,247,232,240,224,235,211,217,233,245,243,234,245,241,245,226,249,206,208,230,242,247,238,208,241,249,245,203,199,70,152,247,237,211,198,206,228,197,139,137,121,97,132,245,231,253,255,224,255,245,250,255,128,116,135,91,76,55,44,181,251,224,237,255,208,212,190,232,240,208,230,247,208,134,123,206,178,198,215,192,247,250,216,124,185,248,252,242,231,250,207,236,214,224,207,235,228,210,202,169,201,168,148,219,118,34,70,210,251,50,39,179,123,41,41,22,22,28,52,25,131,227,210,206,251,247,242,201,116,78,130,206,247,226,229,149,141,212,248,243,243,241,197,59,32,48,103,136,109,117,67,22,94,137,122,70,35,80,82,67,31,56,128,127,32,27,92,163,149,122,143,82,44,10,25,18,47,48,65,44,35,77,11,35,20,0,24,27,55,14,18,37,52,74,81,115,183,127,88,90,135,155,60,64,92,87,71,32,22,15,21,54,21,26,74,76,59,43,9,19,21,21,38,28,16,33,34,17,17,17,43,33,35,35,43,44,25,42,24,12,25,16,13,36,29,35,14,37,34,31,67,56,54,50,34,11,13,11,39,13,19,12,19,21,47,33,28,6,14,26,28,46,48,30,46,35,38,26,56,22,18,20,51,34,29,41,23,53,68,74,26,8,12,36,22,32,74,71,63,91,92,112,166,104,66,52,47,36,28,40,28,22,15,46,22,58,30,33,35,48,119,63,40,42,54,123,139,78,75,35,59,129,91,107,79,115,83,64,21,57,66,126,251,248,251,247,216,253,243,239,232,250,240,242,246,240,252,241,255,250,241,241,250,248,253,248,246,238,236,243,251,242,229,244,252,238,234,248,244,255,236,239,221,253,243,231,244,246,254,245,248,242,233,244,239,246,251,241,238,235,238,255,242,101,1,3,20,10,20,4,10,13,40,19,9,9,7,15,15,24,32,10,25,7,22,4,12,20,183,202,205,188,192,222,187,194,197,195,182,215,188,203,186,194,183,177,187,189,200,202,206,196,204,192,220,236,210,223,173,192,187,192,198,206,179,198,183,217,190,213,189,189,201,199,221,188,199,174,199,201,198,213,207,208,180,193,188,173,201,213,205,202,199,180,184,163,192,189,192,216,202,202,213,196,223,198,202,190,211,202,201,210,224,208,186,185,197,200,223,207,185,208,187,181,202,216,210,210,204,185,189,211,179,205,195,233,210,187,199,190,200,203,191,202,196,200,214,201,211,209,196,198,195,206,201,219,201,195,203,204,212,218,187,193,201,173,199,213,226,208,195,210,219,192,216,236,205,204,184,188,223,199,227,194,217,209,203,213,196,208,206,204,196,196,174,190,214,208,225,191,210,203,192,215,230,189,226,186,188,230,200,192,203,229,222,224,196,212,194,199,212,206,218,211,205,182,220,196,215,208,190,192,217,210,234,227,214,233,226,218,189,209,204,206,236,224,221,216,230,211,237,209,229,193,209,205,197,235,211,195,80,59,136,199,226,234,210,213,216,199,215,228,220,221,194,208,210,236,239,212,187,220,212,175,189,194,174,180,200,217,202,211,196,195,233,224,244,224,205,215,210,220,217,218,200,218,193,209,216,220,219,199,225,229,247,237,227,237,192,212,213,205,241,211,215,217,226,213,206,218,220,204,190,202,218,229,228,191,192,216,227,233,206,194,230,232,229,233,124,67,146,246,232,207,133,111,107,68,74,45,57,37,21,64,86,114,106,55,148,180,195,165,53,113,110,89,109,45,51,136,204,157,192,213,173,205,215,249,217,160,222,160,216,139,88,172,107,150,235,198,255,238,178,146,231,239,254,239,242,247,216,255,222,194,214,219,212,201,195,199,209,179,164,180,69,33,57,193,170,3,143,237,65,35,24,22,49,27,48,73,182,211,213,221,234,254,227,148,84,180,193,236,255,241,154,154,218,230,253,241,250,202,70,22,51,103,113,114,97,14,53,95,104,147,84,71,113,131,68,60,54,60,147,135,58,23,44,138,158,143,140,65,30,36,25,51,33,39,68,46,40,48,44,16,40,37,42,11,15,60,23,43,51,61,44,89,138,87,115,102,107,149,114,104,55,47,32,28,48,9,16,14,33,40,47,92,50,41,19,18,24,26,22,40,29,68,65,24,55,41,5,21,16,46,87,74,33,38,34,25,40,20,27,27,39,23,37,59,46,50,12,45,54,36,20,35,11,2,14,13,38,28,5,31,23,38,41,70,52,21,33,26,28,19,23,29,44,40,35,31,42,68,19,9,17,54,57,47,88,62,32,15,27,26,34,21,40,54,86,80,111,132,121,82,112,105,69,53,53,20,51,32,32,37,47,13,34,44,21,67,104,74,35,48,48,63,65,70,54,29,57,143,107,104,133,94,133,74,11,47,57,78,211,219,234,254,231,255,238,240,245,222,224,246,253,226,236,232,245,241,252,243,242,243,228,252,237,230,252,250,209,240,234,245,247,236,228,254,243,254,255,233,251,242,243,232,252,251,252,242,252,251,233,255,241,253,255,255,225,250,251,242,243,90,1,4,19,20,17,23,12,11,0,4,15,13,0,6,1,38,9,0,12,10,8,18,29,17,198,167,218,210,160,185,216,196,226,197,204,207,178,188,187,187,186,190,202,195,197,181,215,203,207,206,194,188,198,207,195,216,181,197,214,164,202,199,203,207,220,208,185,193,185,201,182,212,186,189,184,198,203,185,198,192,184,221,178,207,201,187,227,183,199,198,189,198,198,203,197,190,210,211,184,191,191,186,207,207,208,202,230,196,204,207,210,207,226,206,181,226,220,217,205,209,164,208,210,212,204,198,190,195,186,202,192,197,214,208,206,226,222,219,192,179,187,207,225,211,207,210,180,202,208,194,217,207,224,197,230,200,212,218,210,185,197,201,209,220,211,199,195,204,212,200,203,191,205,180,200,207,212,203,213,194,219,194,206,196,191,226,211,176,207,223,240,217,207,232,219,212,202,196,204,187,176,237,185,170,218,190,203,209,192,197,211,204,195,210,203,209,208,212,212,211,213,191,234,198,210,231,200,232,233,217,217,212,218,238,206,212,220,205,209,222,220,221,201,216,202,202,217,226,205,211,208,210,206,208,215,202,112,96,193,220,226,197,221,221,218,214,214,225,196,223,220,207,200,199,201,203,176,192,186,179,201,216,208,233,197,203,202,216,216,211,207,230,217,242,200,241,204,209,210,227,220,211,221,222,219,218,246,203,250,226,232,210,223,219,185,218,229,230,222,235,220,221,190,210,215,213,189,187,224,207,223,226,226,232,166,214,241,227,167,161,227,233,234,177,149,59,121,231,239,207,91,39,47,45,80,104,141,91,99,80,45,86,67,29,55,60,67,75,15,27,34,36,86,71,50,86,96,65,89,116,87,100,129,129,108,45,74,52,114,116,71,38,23,135,243,216,252,249,130,152,216,234,208,230,254,207,220,211,193,177,223,224,234,179,197,196,242,152,127,148,123,90,100,195,105,7,201,194,31,37,38,70,47,45,29,129,199,200,237,243,244,242,209,106,121,222,227,244,235,183,176,211,244,255,248,236,224,83,33,62,100,116,97,123,36,28,94,154,123,107,56,83,109,89,62,47,31,74,161,110,41,25,24,112,148,153,109,35,28,30,40,36,43,35,30,48,45,27,34,9,29,30,31,13,11,23,29,17,33,84,85,49,91,65,75,31,68,77,58,28,15,38,28,5,13,27,7,54,26,31,38,90,59,51,23,30,18,62,27,16,39,21,37,52,83,43,53,48,60,122,167,113,87,47,35,41,38,83,48,44,43,36,42,26,57,73,33,37,30,21,38,9,2,30,23,36,40,42,34,15,20,48,87,49,46,15,27,25,37,34,26,53,39,31,40,32,33,38,21,17,15,27,26,72,87,57,36,22,4,8,18,59,54,101,77,62,99,82,79,126,91,90,88,73,45,41,24,43,20,33,18,63,19,39,45,68,126,82,41,39,38,34,41,33,50,37,33,101,105,99,101,108,111,67,61,26,65,30,132,245,234,244,244,245,249,234,225,232,233,222,255,249,235,226,227,244,240,228,239,226,240,255,251,221,235,248,249,249,251,254,239,229,246,223,240,247,235,249,229,212,241,244,255,250,243,255,246,243,247,252,244,246,241,245,233,252,253,251,249,110,2,20,14,5,7,2,1,19,20,14,17,3,3,9,39,12,7,22,9,11,11,0,48,6,211,182,171,185,210,181,212,201,186,204,201,204,203,200,220,202,205,194,192,198,177,221,207,190,183,216,200,221,191,193,204,213,217,174,212,199,177,181,190,182,193,198,201,205,202,176,186,196,211,187,182,189,189,218,188,210,222,193,194,218,218,189,196,177,174,217,208,190,229,213,193,215,218,206,196,212,178,208,186,201,195,203,199,196,189,197,206,212,203,216,217,204,205,186,219,200,190,211,176,199,181,220,219,219,206,212,213,200,192,219,183,232,204,229,226,194,213,200,198,181,185,197,208,199,224,195,197,205,178,196,217,198,210,213,185,188,194,204,181,220,222,187,198,201,193,207,211,205,226,199,200,208,214,181,207,204,220,199,191,203,197,194,190,226,220,216,191,223,199,195,231,219,215,175,202,221,205,206,226,207,192,182,226,208,204,190,212,200,194,201,197,188,208,206,201,216,206,209,198,231,197,212,224,199,224,236,199,202,199,231,222,232,209,209,244,209,221,205,211,215,249,231,221,198,230,217,207,202,181,196,241,238,155,212,229,218,217,210,233,239,225,222,202,228,212,170,178,211,172,200,192,187,204,202,211,239,225,218,211,233,222,223,216,229,196,210,219,202,226,214,213,228,240,213,204,216,236,230,219,211,229,231,233,193,235,220,208,216,224,236,213,195,221,230,228,223,208,205,241,220,217,211,205,160,214,180,207,228,218,202,189,218,230,232,180,195,244,233,240,215,145,60,136,240,221,235,112,141,141,161,183,183,180,228,234,249,249,233,218,154,169,203,221,211,54,34,11,12,78,90,45,84,166,120,114,165,140,110,140,125,73,107,123,117,158,95,59,53,57,173,250,221,226,254,163,177,237,225,233,251,240,232,227,252,233,223,225,244,244,173,199,251,229,169,170,146,155,129,160,177,69,76,250,138,19,25,16,47,65,24,100,130,200,233,253,255,243,213,162,197,186,225,207,226,182,172,202,243,254,235,247,204,68,38,78,97,97,109,116,36,56,125,131,130,109,65,63,84,96,44,42,70,60,81,147,97,37,14,51,173,94,121,112,25,53,42,15,19,24,14,29,25,26,20,23,27,42,50,32,43,20,47,41,62,64,75,76,43,73,28,67,32,26,38,79,50,56,90,19,8,66,55,30,12,12,17,41,99,50,43,26,7,17,47,34,29,42,24,12,85,109,74,77,133,153,163,130,149,157,113,88,74,175,136,42,27,26,22,38,45,40,54,23,52,45,36,37,25,21,27,12,43,68,16,46,46,67,100,119,77,40,43,23,39,70,42,52,41,54,41,28,32,40,39,53,30,32,28,8,70,90,70,46,21,13,35,40,44,32,56,31,20,21,36,43,67,63,70,81,44,42,43,40,36,32,22,41,38,48,28,34,56,106,74,69,65,36,57,21,21,50,30,37,94,144,87,128,87,102,76,31,30,76,38,35,203,239,244,239,225,233,246,235,238,236,227,242,250,232,242,255,247,227,255,252,249,241,235,241,245,248,242,227,249,222,245,241,241,244,236,242,244,250,223,234,244,251,254,240,244,235,211,230,246,242,253,255,248,247,231,250,254,238,247,240,107,9,0,21,20,7,15,27,17,38,21,29,3,0,6,8,5,2,33,31,30,0,9,33,2,193,183,213,201,203,184,195,222,189,185,201,200,198,189,188,200,206,207,179,215,195,194,213,205,211,186,213,210,191,212,209,218,180,196,184,211,212,196,191,225,189,209,212,199,191,206,186,197,182,204,207,199,203,197,201,198,185,216,207,195,193,199,177,206,196,206,192,183,210,177,183,197,183,216,186,203,194,168,197,180,186,186,203,209,207,211,189,189,202,181,201,198,187,204,180,202,195,198,185,204,196,201,186,209,213,241,209,176,216,205,219,196,205,188,220,213,205,205,187,212,235,198,197,186,206,211,189,173,199,198,211,178,213,217,212,206,197,196,197,176,209,207,207,202,194,202,195,193,192,168,178,201,196,212,215,195,223,186,205,196,205,206,194,201,181,197,205,196,212,221,224,209,187,210,210,192,206,209,178,226,201,223,201,180,206,211,209,219,207,220,192,194,214,189,196,195,228,225,222,196,209,199,213,194,230,210,219,241,218,218,226,225,215,226,235,224,232,225,210,224,212,209,212,242,227,202,227,202,219,209,196,228,196,240,251,243,208,234,230,213,226,210,217,206,187,184,214,207,208,218,221,241,220,220,230,227,224,226,229,225,234,210,226,239,190,208,227,226,233,225,227,239,217,237,240,206,221,233,222,222,222,237,238,159,173,222,245,220,235,241,230,228,244,218,236,246,228,233,255,246,228,225,223,213,236,219,214,213,226,212,202,236,236,250,206,214,252,248,246,181,118,64,159,242,249,237,199,212,220,222,215,231,213,255,226,246,228,227,249,249,252,252,244,237,131,33,36,34,100,47,34,156,239,247,248,224,244,173,232,242,243,238,240,247,222,218,187,122,138,195,250,207,250,252,147,242,249,242,245,245,250,228,247,240,241,241,220,255,249,179,205,246,183,130,140,207,221,128,151,160,51,165,225,24,13,39,51,59,71,138,137,129,210,239,242,247,222,130,175,208,186,219,169,138,144,214,237,239,233,242,216,52,29,53,101,104,129,98,41,80,102,166,163,104,42,57,18,25,66,34,47,38,39,117,169,61,48,8,124,151,123,145,117,17,10,26,9,42,39,30,37,20,12,32,34,31,20,28,34,58,23,53,50,74,88,79,38,36,81,54,84,22,52,67,79,48,67,61,34,56,81,41,48,18,30,23,72,140,78,69,47,28,28,39,29,46,13,35,53,59,81,74,75,112,129,87,71,76,123,164,112,106,151,137,69,9,0,23,33,105,114,57,49,14,47,36,6,34,23,28,28,64,54,55,24,72,110,98,148,168,134,81,76,142,117,61,22,36,48,30,28,20,32,37,19,35,40,14,49,85,83,49,26,15,54,78,81,50,38,58,78,59,44,51,76,54,46,38,66,18,30,20,26,28,45,23,27,36,35,22,28,59,110,74,60,56,37,41,45,65,49,39,19,124,118,94,93,97,102,89,47,28,90,61,20,98,221,240,221,244,234,244,250,255,238,224,245,249,243,238,246,255,244,250,245,248,245,244,250,252,250,248,237,247,230,242,243,245,233,245,244,228,247,229,232,246,255,251,231,239,243,241,253,248,255,246,252,222,247,249,239,254,239,237,243,123,0,31,11,36,4,13,10,2,3,34,7,19,10,7,10,11,8,0,14,22,13,5,11,21,181,205,212,189,213,187,190,201,215,191,192,224,194,213,199,200,194,205,184,222,209,206,197,198,196,176,196,175,194,209,193,199,202,196,216,174,192,202,197,189,184,196,192,189,193,184,204,189,190,190,196,197,197,204,182,182,185,200,205,199,227,193,196,183,223,203,190,199,169,216,194,206,171,204,211,211,195,209,199,208,182,204,202,203,202,208,206,201,204,193,190,169,194,211,170,205,180,220,195,186,191,204,198,206,215,184,204,192,182,180,184,190,189,187,225,204,211,202,179,196,213,190,176,196,180,205,214,212,203,206,202,198,216,221,197,182,188,200,198,170,221,193,202,203,194,196,203,202,214,175,214,212,222,207,217,214,186,223,219,209,216,202,218,215,217,212,196,201,199,196,210,197,201,196,201,220,199,205,217,179,218,211,216,199,190,190,212,217,204,193,185,181,220,220,199,208,226,200,228,200,215,217,210,207,211,213,231,224,223,194,200,226,210,221,211,204,196,227,237,179,224,225,249,208,219,208,243,224,241,216,219,191,193,249,226,240,235,240,204,111,77,154,216,236,198,237,197,250,225,235,237,245,255,243,240,231,253,224,253,240,242,229,245,239,233,243,234,253,250,254,243,240,248,252,246,247,243,248,254,255,237,249,228,99,137,237,238,252,244,225,224,251,252,245,243,255,250,250,232,254,235,255,252,227,214,227,238,205,234,191,196,230,237,221,193,221,228,242,221,131,122,58,127,243,246,233,190,178,172,146,149,158,163,232,245,226,239,229,216,164,200,207,244,232,100,76,69,73,62,12,45,147,169,179,211,228,187,131,189,230,243,207,222,194,172,230,180,140,145,195,172,194,243,182,147,225,241,239,230,222,224,223,236,229,191,213,230,237,245,134,189,202,97,61,90,175,220,123,175,93,61,197,163,11,8,29,23,33,144,216,143,171,242,254,245,237,161,142,229,219,197,198,132,135,184,231,233,239,249,229,105,18,41,72,118,119,102,37,34,77,161,126,72,34,49,16,19,32,50,47,56,43,45,70,109,79,26,10,131,139,95,115,68,25,41,18,19,31,5,13,39,40,1,21,4,10,21,34,1,35,7,32,35,58,101,99,70,47,35,77,93,70,100,150,118,120,109,85,57,50,55,55,28,36,29,55,57,110,113,63,28,35,25,59,14,17,34,20,77,102,44,30,23,35,31,54,29,17,23,16,39,35,27,43,73,23,22,19,116,209,145,52,48,21,29,14,17,14,11,25,19,61,60,53,36,25,70,38,20,92,151,115,101,108,103,59,57,34,27,26,23,17,29,31,27,24,49,19,49,108,62,48,29,19,79,77,65,43,72,101,105,81,65,91,78,65,38,49,29,48,49,41,37,40,27,45,39,41,50,40,21,54,112,106,86,66,40,42,69,99,78,56,55,120,125,104,101,81,142,91,30,16,186,160,78,47,120,234,243,241,255,224,234,247,233,229,248,243,251,254,255,232,252,233,243,247,237,255,248,249,251,249,247,238,242,253,254,249,250,254,245,222,250,249,247,236,240,239,247,235,231,228,242,253,237,255,249,247,245,252,252,249,231,246,233,125,18,1,8,11,33,19,22,7,0,26,9,22,0,9,12,5,5,19,25,2,11,1,9,1,216,175,201,197,172,191,167,181,220,191,202,193,222,208,207,173,197,194,185,205,193,213,185,179,204,188,203,207,190,179,186,202,196,245,214,201,188,202,178,191,197,188,219,189,209,216,217,221,193,193,203,211,193,198,199,190,208,184,189,202,188,187,200,198,196,195,193,200,185,213,200,226,191,187,189,208,189,212,198,195,196,207,198,228,209,207,168,193,197,202,207,207,197,200,202,214,206,202,200,193,186,202,208,217,229,189,205,213,193,201,203,198,191,200,204,196,184,204,193,205,212,196,182,185,201,196,200,206,210,204,212,204,221,194,193,183,182,191,212,206,213,176,189,191,208,203,205,225,188,212,213,209,210,213,209,203,170,207,206,216,198,202,196,213,179,169,197,190,210,199,209,190,205,198,211,199,231,202,197,212,203,225,204,216,171,225,213,209,200,206,207,217,196,183,211,209,207,217,218,176,225,185,216,225,211,229,218,224,219,237,231,247,254,247,251,249,250,240,231,249,255,250,251,255,252,240,254,250,211,246,239,214,220,250,253,247,234,222,199,103,104,186,241,239,240,255,241,255,243,255,237,255,241,253,237,247,235,251,238,247,228,246,255,255,253,212,235,253,208,245,238,242,241,246,205,218,238,224,193,220,225,196,174,100,121,172,214,226,217,221,202,215,201,188,216,190,199,232,200,239,193,214,210,177,171,184,175,168,158,157,157,202,159,147,118,116,113,127,123,89,122,85,98,117,175,145,100,112,85,103,49,111,79,112,137,112,124,132,86,44,22,90,127,117,56,94,111,67,47,35,76,50,39,21,36,63,70,37,41,52,82,61,38,65,50,134,114,122,92,83,102,76,114,74,64,149,105,89,77,77,70,86,95,78,65,84,100,73,147,91,81,115,78,112,116,173,177,119,191,66,85,226,61,14,71,34,15,103,204,204,129,181,247,252,217,148,149,199,239,199,196,187,143,176,228,253,241,231,254,99,21,59,66,119,108,78,65,44,98,141,118,110,71,57,36,22,21,28,34,61,60,75,43,83,118,72,27,14,156,153,101,129,57,24,21,22,46,21,36,34,12,11,14,14,26,55,55,34,15,24,28,8,53,29,70,117,116,53,75,124,117,117,105,161,135,122,99,96,67,57,37,49,36,22,17,37,50,85,88,50,41,29,14,44,36,64,43,49,137,85,45,11,39,19,39,14,7,42,37,35,13,30,31,50,112,92,37,33,137,178,102,23,38,26,27,27,48,13,13,35,67,37,23,24,14,18,21,11,42,19,33,34,37,19,18,63,54,13,46,47,34,37,40,25,35,28,52,46,54,89,67,35,33,45,94,110,68,61,61,168,150,86,118,155,129,87,59,47,10,16,34,33,33,37,43,16,64,21,38,57,50,43,94,96,89,71,84,62,73,79,84,85,44,111,140,135,125,97,110,86,65,33,220,226,178,119,45,172,247,233,240,234,240,253,249,210,250,253,248,247,254,222,245,234,250,248,247,237,219,242,206,227,244,248,239,243,232,244,248,234,237,248,239,225,243,245,237,251,236,255,234,237,235,237,243,249,243,234,237,247,225,237,254,255,239,124,26,8,17,5,16,21,18,25,10,6,11,9,15,7,15,0,18,0,36,1,39,18,13,31,198,181,198,187,213,201,222,199,184,200,190,184,196,190,237,194,200,188,214,180,203,198,206,191,193,214,202,190,192,206,200,194,207,174,193,173,213,201,180,175,205,176,189,191,184,216,198,191,211,179,195,211,191,223,195,208,207,195,214,198,211,183,212,202,205,179,199,211,211,222,191,186,200,201,206,205,209,209,209,190,194,188,208,179,227,177,218,208,213,206,203,188,191,204,213,204,191,183,196,226,193,206,204,196,192,219,220,197,210,218,203,200,220,227,206,223,208,185,222,193,173,227,210,201,199,206,183,217,181,198,201,207,219,203,203,194,189,181,214,211,198,192,190,218,191,212,204,208,204,190,193,190,194,171,201,201,191,215,210,208,201,216,206,202,211,219,193,206,214,213,207,185,213,205,205,223,189,212,205,225,189,209,191,194,202,223,216,197,215,196,216,190,203,210,210,212,223,207,229,222,229,208,207,238,206,234,231,242,241,244,233,247,253,243,255,254,238,243,240,239,246,255,232,241,240,255,243,241,251,224,223,241,190,186,183,177,163,218,187,166,146,185,210,192,204,162,169,186,159,172,184,165,172,151,159,149,165,143,174,159,160,139,160,170,168,152,146,191,153,145,141,141,148,135,123,143,156,165,130,117,132,122,122,119,109,117,120,117,110,131,114,124,129,92,116,112,88,130,104,79,91,83,79,102,100,98,143,100,147,122,104,127,108,82,51,40,50,61,65,89,92,62,75,83,79,77,65,86,93,52,98,78,24,31,19,68,40,16,29,33,20,5,42,55,73,95,115,113,58,86,70,69,44,46,11,44,26,34,53,11,43,13,21,32,23,55,50,79,112,63,64,63,39,19,47,66,34,28,5,17,15,56,65,52,42,34,5,31,110,103,43,80,120,143,129,156,104,129,213,52,154,222,63,8,26,21,69,195,204,169,161,237,248,251,174,135,213,240,248,169,110,184,210,241,249,252,217,247,130,34,47,76,108,112,105,57,47,90,128,157,114,96,70,39,24,24,16,9,40,61,53,94,40,83,127,60,37,54,165,103,103,114,17,20,6,0,14,18,18,31,34,32,40,25,11,3,32,39,12,42,28,43,37,32,51,79,74,32,42,71,97,74,111,139,81,88,45,61,31,33,54,14,17,33,8,48,45,86,97,48,23,57,51,34,28,50,22,33,116,95,35,41,34,16,29,34,37,69,61,21,25,30,20,67,98,90,32,45,44,62,46,17,14,40,8,28,27,38,32,26,55,32,22,24,18,9,20,14,35,14,17,44,41,13,59,85,85,55,32,34,34,17,35,41,48,84,84,58,96,69,49,50,29,83,130,92,59,66,101,147,106,139,105,141,111,80,51,58,48,58,56,35,33,43,49,42,37,34,33,25,27,44,85,32,60,63,50,32,50,57,48,86,51,108,128,141,100,95,127,93,12,47,192,218,225,138,48,40,233,233,233,252,233,248,252,242,255,248,238,239,226,247,249,249,228,246,246,246,253,239,247,233,253,246,222,231,241,239,225,255,238,225,246,243,254,240,241,230,238,249,253,239,248,251,246,240,244,255,247,236,234,255,247,234,226,126,6,5,41,23,15,20,22,0,32,23,14,34,0,22,6,8,3,12,14,13,28,27,9,13,195,179,201,182,194,229,207,198,177,192,193,184,181,200,176,191,193,180,199,198,182,195,187,212,194,178,200,197,191,207,178,178,196,200,172,201,204,198,199,172,170,180,197,201,217,164,192,175,211,196,179,186,193,222,191,200,190,200,187,202,193,192,189,180,196,195,190,205,185,188,200,184,211,200,177,194,196,219,175,200,191,203,221,191,196,199,209,208,196,204,220,195,209,213,204,213,201,183,185,205,184,220,206,200,183,198,188,179,228,199,218,191,204,200,220,234,207,185,210,199,203,191,209,217,174,181,175,199,199,183,196,199,206,217,224,219,173,208,199,220,200,224,226,217,193,193,190,210,196,215,213,198,186,210,201,208,173,212,218,205,188,201,201,221,201,219,198,206,200,189,212,184,200,199,206,215,222,202,216,209,197,206,227,195,211,198,213,188,204,191,218,210,206,217,203,204,224,217,194,216,221,223,221,213,223,209,196,184,168,179,180,174,171,176,198,175,145,160,164,168,150,163,141,149,137,151,138,104,119,146,181,198,95,55,112,73,96,111,84,132,131,120,96,90,96,93,84,77,76,64,72,80,80,33,84,57,67,61,88,94,81,66,52,62,59,46,84,70,72,74,46,61,48,60,73,40,103,108,60,52,56,73,133,87,112,102,105,57,65,53,79,35,41,47,40,55,42,72,37,40,42,46,50,45,59,49,69,112,120,125,83,86,144,86,68,55,23,32,69,62,87,95,62,85,87,74,113,103,122,89,88,82,37,28,23,14,20,35,56,61,46,57,43,46,76,136,114,101,64,30,77,79,68,49,31,53,66,72,46,25,70,67,31,59,60,60,43,57,96,94,120,68,56,26,38,42,19,27,22,20,25,13,49,37,46,52,1,32,80,34,35,111,107,93,177,185,75,157,176,93,215,242,36,12,28,10,138,175,188,215,226,236,238,217,143,187,243,246,240,147,102,155,240,239,249,251,247,159,15,28,70,116,116,86,36,53,116,143,127,106,46,49,43,34,30,11,33,45,85,90,117,98,55,115,137,24,5,88,163,128,122,120,31,23,22,27,31,7,6,35,43,3,36,22,9,15,28,24,37,60,10,22,30,9,35,54,88,86,36,35,36,65,70,60,57,45,50,52,57,25,37,33,44,41,12,34,40,67,66,48,46,81,21,22,49,23,34,29,98,61,23,36,31,27,28,29,25,99,35,32,6,18,23,86,120,76,26,4,47,41,39,32,27,42,23,19,25,23,16,35,55,23,28,37,21,30,48,22,24,24,45,42,31,26,61,126,131,43,31,74,66,36,36,51,62,87,92,104,75,57,53,56,63,86,112,67,66,55,31,50,116,129,135,77,94,66,57,39,27,31,40,16,12,16,22,18,36,36,25,46,44,34,20,31,46,28,33,40,21,23,68,56,46,119,140,129,114,85,91,113,45,31,145,147,173,115,35,29,143,239,254,240,242,249,253,247,247,243,249,235,229,247,234,248,253,252,234,240,238,243,227,249,236,238,219,249,232,236,252,246,239,247,253,230,233,248,242,251,237,251,240,236,252,237,241,243,240,230,252,214,246,245,248,244,244,101,4,8,11,9,10,20,3,13,22,5,18,24,0,13,5,22,20,12,13,11,17,6,6,9,197,224,174,189,211,154,189,202,187,182,197,192,190,168,197,205,189,196,183,167,185,216,202,188,189,156,191,198,208,195,185,207,216,197,185,189,206,188,208,189,201,190,188,201,193,207,195,191,186,216,224,204,187,190,224,184,202,185,216,208,179,224,191,201,185,212,205,189,199,190,203,197,206,213,206,190,177,200,174,194,191,221,212,194,198,186,215,188,197,216,183,200,184,202,203,190,203,202,188,203,190,184,209,209,169,205,195,210,227,209,193,208,189,201,200,193,209,205,198,210,209,208,219,215,209,167,195,192,187,216,210,192,195,184,198,197,199,194,212,207,192,182,188,206,187,205,192,215,188,194,194,185,191,194,203,197,199,205,211,181,180,206,185,189,204,202,207,216,197,190,213,223,199,208,226,181,215,193,194,210,168,202,195,212,207,231,222,210,214,215,214,202,217,233,225,218,227,218,221,213,212,227,240,235,193,206,111,18,20,33,21,16,3,20,13,22,15,5,15,16,15,1,24,1,14,3,30,23,30,8,63,88,33,17,13,11,12,36,1,13,12,19,18,30,6,9,6,1,0,11,36,14,17,6,16,30,18,4,0,19,7,7,1,40,22,23,24,8,10,4,23,12,7,19,12,22,18,16,10,50,22,20,27,12,34,19,5,22,11,25,23,6,14,17,6,16,11,11,28,10,0,13,5,16,23,31,19,4,62,88,67,55,44,76,91,54,22,10,0,16,32,10,22,60,79,72,84,96,92,64,69,36,9,5,4,28,29,33,30,33,21,23,9,30,20,116,132,118,65,58,66,83,49,39,22,42,22,45,34,21,44,37,36,18,1,33,17,40,21,33,90,75,24,17,27,33,39,18,16,43,54,19,14,3,28,43,18,11,44,14,24,95,81,90,166,121,100,205,186,173,251,186,18,61,4,40,166,171,210,250,215,228,200,156,200,250,247,255,196,129,165,180,225,238,254,228,131,38,28,60,94,93,102,48,32,93,112,131,88,80,41,35,39,20,22,23,45,82,112,129,131,116,104,140,130,33,7,97,173,97,116,83,44,30,6,5,30,27,30,20,22,21,14,28,40,13,32,36,44,25,16,20,29,21,12,50,102,60,44,28,37,37,62,56,43,21,30,27,39,35,27,24,15,40,14,17,40,73,53,43,47,63,41,19,26,41,48,26,66,111,32,25,16,40,34,59,29,51,35,26,43,37,11,94,130,47,19,12,36,41,35,33,52,5,41,28,27,45,14,2,50,64,30,10,9,35,9,20,23,25,23,53,28,32,68,151,74,11,29,40,25,41,22,44,76,113,47,86,82,64,37,20,60,61,94,55,24,38,29,47,88,116,72,46,53,51,69,69,36,36,24,36,14,35,33,36,26,51,9,38,51,33,12,19,53,37,24,36,43,29,18,59,20,79,143,128,104,84,106,118,57,33,32,71,42,31,32,28,181,228,245,250,254,255,252,248,248,255,250,231,226,247,239,255,251,248,255,250,241,228,231,232,251,237,250,238,239,237,242,233,251,241,243,246,214,253,222,234,248,244,243,234,232,234,252,234,252,237,255,226,231,246,242,241,242,121,3,4,2,2,3,4,18,19,7,22,15,11,4,3,6,14,0,1,24,1,0,3,12,13,193,200,195,214,193,201,191,185,199,222,178,203,189,203,209,185,193,219,181,177,196,194,202,182,194,190,200,213,202,207,199,205,189,208,189,205,194,210,198,175,188,195,198,217,188,218,196,197,195,188,193,205,193,191,192,182,206,211,191,196,203,202,179,208,177,178,199,197,217,224,194,208,208,219,217,198,214,209,217,196,168,204,167,212,204,217,201,203,211,191,195,222,210,221,201,204,185,204,189,192,187,190,198,226,224,221,194,204,202,201,203,190,201,216,197,188,216,200,215,203,197,193,204,200,218,178,209,221,201,204,174,194,198,210,215,223,194,194,169,218,223,202,206,188,198,226,184,194,200,193,209,198,193,203,202,195,189,216,212,197,219,198,222,215,191,178,235,196,213,208,194,195,199,196,200,224,199,232,220,198,209,203,223,169,213,229,220,213,207,214,202,224,199,217,209,218,201,212,208,229,227,238,241,200,207,219,114,32,35,32,62,19,40,21,30,15,22,35,18,10,16,22,29,25,10,1,20,26,41,12,27,49,25,10,15,0,7,7,4,14,22,10,17,4,21,19,8,25,10,0,10,3,21,11,15,29,11,20,16,17,36,6,18,14,21,19,10,17,4,18,35,17,8,8,25,16,9,23,20,10,27,18,12,4,7,14,14,4,2,9,18,11,10,31,12,18,7,27,29,13,25,13,19,15,34,24,21,12,21,19,44,30,34,61,64,82,43,22,5,6,44,21,60,74,44,106,95,59,73,66,57,0,12,4,5,24,25,15,35,45,17,34,35,12,57,80,115,108,77,62,65,71,52,8,15,23,11,39,32,31,44,37,64,43,35,83,41,33,74,68,84,119,66,63,25,42,118,22,37,1,19,28,23,2,54,52,51,133,88,65,129,97,36,120,155,89,170,199,155,146,142,46,31,139,93,89,137,167,220,242,205,212,130,156,228,247,242,243,161,145,162,139,228,240,239,138,17,41,83,127,110,90,73,78,115,150,116,104,64,51,34,16,25,22,32,35,91,100,110,132,107,112,82,161,115,26,18,112,162,94,109,110,23,21,5,6,8,4,10,26,48,26,49,26,28,16,18,29,18,31,50,42,44,43,26,38,91,76,36,45,25,68,49,47,46,22,5,43,18,41,41,77,50,20,23,39,40,61,57,42,53,83,23,17,25,10,39,27,55,83,13,41,42,31,43,38,50,62,26,17,26,34,28,58,117,48,34,24,37,41,39,37,60,26,44,68,34,32,14,11,41,32,40,31,32,8,16,32,21,37,10,15,37,19,60,109,69,18,23,41,33,15,52,53,83,84,104,104,83,21,30,0,55,56,73,33,73,65,49,63,43,60,51,25,73,43,46,55,15,39,14,39,34,18,29,32,12,33,22,48,29,33,29,63,15,56,41,25,47,38,51,28,37,103,143,126,106,78,94,118,47,36,26,52,22,31,16,110,248,251,245,253,241,243,250,254,252,247,253,235,215,243,254,247,249,250,241,255,235,246,229,246,231,241,247,231,226,245,246,254,239,234,247,238,212,247,227,238,249,234,242,250,231,222,240,240,231,243,231,252,236,223,251,247,241,135,3,32,5,13,32,14,12,37,19,28,10,11,14,15,8,17,33,2,11,18,10,34,13,29,169,191,184,176,191,184,196,192,171,170,213,206,221,197,195,217,189,187,199,190,214,198,190,208,185,204,203,188,195,189,188,201,187,196,202,220,205,198,196,201,214,198,206,191,205,201,191,183,201,189,207,208,216,183,202,189,214,191,174,209,188,181,200,206,208,184,208,181,186,206,186,181,205,195,204,208,201,195,206,205,208,195,180,197,204,209,191,197,186,208,196,198,192,215,203,177,212,197,206,197,212,192,197,195,225,195,199,204,207,205,230,205,202,190,207,189,213,210,201,193,190,196,227,209,183,205,227,206,177,185,214,195,207,221,203,199,196,215,223,203,205,212,211,206,224,202,210,228,210,184,185,198,181,213,199,211,223,182,194,190,208,195,194,212,215,212,206,226,214,199,206,196,202,198,214,186,229,202,214,219,212,208,216,216,210,233,204,207,214,220,222,224,213,238,214,220,214,222,194,226,198,183,242,219,224,226,247,235,231,210,209,207,232,215,229,210,206,217,237,228,203,236,202,198,211,196,181,190,182,160,186,157,155,157,148,182,156,150,140,130,147,130,142,119,120,118,92,129,100,113,135,146,139,126,130,127,110,134,134,120,143,122,132,137,137,135,134,142,165,129,135,114,124,146,144,133,130,102,115,152,126,124,123,97,92,90,112,148,127,155,153,170,169,158,137,143,174,160,181,152,176,152,155,163,154,167,157,168,155,102,119,184,164,107,193,164,142,174,118,102,84,31,62,162,168,140,122,116,108,103,154,158,153,209,185,203,199,187,159,145,190,200,217,149,64,106,89,103,83,62,67,119,154,150,166,186,177,213,156,94,173,171,225,201,155,167,173,211,213,170,130,121,95,190,229,252,254,212,225,247,254,216,214,212,228,244,243,251,243,238,137,57,69,161,141,131,215,139,87,140,83,3,134,227,120,115,158,203,239,233,169,107,164,191,239,253,235,188,55,117,189,191,248,250,171,27,11,65,115,101,102,42,56,128,133,143,101,65,75,42,19,8,18,8,30,79,106,106,110,107,107,96,91,152,111,9,38,135,122,85,116,44,14,27,0,18,28,33,38,10,21,35,17,9,34,13,37,15,46,4,29,33,14,22,23,40,92,100,66,50,50,97,49,42,36,22,22,43,19,29,49,58,64,51,7,31,33,43,32,20,45,59,34,16,21,32,64,29,77,116,47,21,40,3,23,32,14,42,48,27,45,15,51,69,89,27,18,18,32,49,53,85,99,84,91,50,46,13,16,12,25,28,20,6,10,54,26,22,18,33,19,7,32,64,51,122,47,34,31,23,20,61,50,75,99,92,69,56,49,25,31,18,39,79,60,68,75,65,49,76,68,45,26,11,51,47,45,45,52,23,29,27,33,9,33,26,28,59,44,50,42,32,45,26,37,18,29,32,29,44,41,22,42,112,156,133,124,72,90,115,54,18,54,115,144,128,214,253,232,240,249,252,249,249,240,250,248,244,243,255,252,238,252,241,244,252,251,237,241,244,229,252,220,236,238,243,252,253,228,234,244,244,247,247,217,244,235,248,249,245,242,243,226,234,252,252,242,221,234,244,233,234,246,251,250,124,14,6,14,17,4,26,23,4,1,24,6,2,11,1,5,11,11,4,2,7,9,5,14,9,186,178,203,199,195,179,189,192,195,216,191,190,199,194,179,195,208,195,204,189,180,207,210,195,217,195,198,209,206,168,183,176,219,186,198,215,194,195,192,216,181,183,161,185,199,197,191,178,186,169,185,210,176,222,201,184,201,177,201,213,201,199,180,183,190,176,198,193,232,184,208,207,205,192,212,180,189,182,193,213,190,192,205,192,200,210,203,196,208,189,191,207,199,208,190,189,194,229,190,205,205,193,198,200,216,205,202,207,200,210,210,189,208,194,215,221,229,184,187,199,181,193,187,199,187,222,185,200,219,186,202,209,192,187,218,209,210,196,221,204,198,197,188,208,186,202,214,214,217,176,203,197,207,203,208,195,185,185,222,220,205,227,200,212,205,203,218,182,213,185,201,201,217,214,203,220,217,202,196,199,214,222,203,223,218,205,224,205,214,198,214,226,212,229,224,217,216,213,222,218,210,213,198,225,210,221,237,247,242,244,230,224,225,248,252,248,238,252,235,247,242,246,239,244,236,238,255,234,242,255,253,236,224,242,244,240,248,252,247,236,245,248,255,221,255,255,255,234,247,215,254,237,243,244,253,254,253,252,229,244,237,247,252,255,250,245,236,236,250,252,252,254,244,228,246,253,252,248,246,241,245,238,247,240,242,242,242,255,238,234,230,246,252,255,254,255,243,251,246,245,243,247,237,222,236,242,245,252,228,252,246,250,249,176,212,237,243,250,255,232,148,108,151,247,178,185,178,125,139,158,246,252,244,245,236,247,238,241,247,249,255,226,250,169,80,98,116,106,63,76,86,154,245,231,241,255,235,246,255,216,204,233,253,182,196,179,247,254,254,219,150,122,106,237,242,247,236,249,238,248,255,245,248,250,238,238,250,224,255,246,161,30,73,184,118,133,240,111,99,192,106,106,191,243,164,104,178,213,248,198,93,155,234,242,235,234,201,138,120,157,224,209,241,209,47,62,80,76,101,103,49,65,74,107,127,99,79,58,66,18,12,4,27,30,87,125,136,109,124,89,101,73,111,154,82,0,60,147,114,101,93,38,14,12,1,22,17,19,34,21,20,40,13,15,42,45,24,36,38,41,25,33,31,43,29,55,111,87,82,94,73,60,56,62,46,55,37,32,8,39,68,62,58,35,4,25,29,39,24,32,36,24,28,21,5,50,8,20,110,91,45,29,22,9,62,50,20,20,25,47,38,29,19,81,133,50,43,15,38,37,24,116,183,202,182,156,115,67,21,36,58,23,26,16,26,49,7,25,41,2,9,28,34,35,45,108,35,14,41,16,15,34,53,56,69,47,56,14,26,30,15,14,46,110,68,64,86,82,72,53,65,77,56,25,13,27,14,34,39,23,32,56,17,15,29,30,31,20,25,17,44,33,35,33,30,10,32,35,21,34,28,49,35,102,118,130,119,77,89,96,85,19,88,232,249,237,255,247,241,255,249,245,245,249,250,242,222,239,235,235,248,251,233,233,239,254,250,244,238,247,230,246,236,236,241,239,234,251,243,250,232,241,248,225,249,237,248,244,233,234,249,246,211,231,226,236,221,242,227,232,237,239,253,249,226,112,3,11,41,7,17,8,28,37,9,18,12,23,10,1,0,20,17,29,16,8,8,10,18,7,205,184,180,213,182,197,167,192,212,212,202,189,191,206,204,196,184,177,199,197,187,183,218,177,190,185,219,201,217,220,207,186,200,208,216,193,202,169,226,200,204,181,187,198,204,215,193,189,164,213,190,176,180,197,219,189,206,189,188,193,221,203,209,193,198,184,197,194,178,179,177,181,211,175,182,196,192,197,205,175,185,215,200,212,215,194,192,181,208,200,191,207,191,203,180,201,196,184,202,228,184,197,200,192,189,224,202,208,204,192,199,205,190,191,202,201,210,195,193,216,207,196,203,217,212,216,208,194,203,199,205,204,203,215,197,202,172,200,206,193,192,186,206,244,213,201,200,222,209,252,205,194,217,191,203,194,195,190,198,202,204,219,214,218,208,218,188,223,208,206,211,189,230,216,204,233,198,198,223,243,198,222,202,187,216,234,210,199,214,214,220,203,213,222,208,208,198,213,236,229,203,230,226,204,204,218,197,202,208,222,228,248,240,240,218,238,239,242,251,243,227,232,198,236,231,240,206,238,251,242,247,229,247,235,247,251,231,252,255,247,243,238,245,251,250,239,247,255,245,247,250,253,236,254,231,248,242,239,247,228,254,236,253,249,253,246,243,223,252,247,238,250,242,250,248,228,243,223,230,254,231,239,242,241,243,242,239,252,249,237,248,252,244,247,250,240,254,251,246,234,231,228,255,241,236,234,231,228,223,247,249,241,207,199,227,243,219,247,235,254,178,128,205,200,174,165,156,196,177,175,218,249,249,249,254,244,250,245,222,241,230,241,255,140,82,70,117,124,65,95,86,126,230,228,246,157,188,238,248,198,191,191,218,193,183,196,245,245,246,218,122,73,110,161,210,224,241,239,228,233,240,246,241,244,221,243,193,180,238,231,66,16,149,167,68,169,142,49,115,195,175,120,243,229,102,144,210,237,252,142,151,232,254,252,241,223,167,221,183,191,243,238,163,58,36,68,107,88,101,58,47,95,117,129,67,62,47,14,20,13,2,37,57,65,88,109,112,122,118,105,119,114,143,129,74,13,79,146,104,134,109,16,12,2,20,27,3,43,29,35,41,18,41,28,4,25,41,10,34,26,23,25,17,19,28,37,70,59,122,99,65,56,46,47,40,36,27,60,6,36,31,44,41,41,8,22,30,44,59,27,57,34,26,27,14,17,24,50,134,88,32,44,40,33,18,43,53,11,31,27,22,35,37,109,153,62,46,34,35,57,54,237,242,242,233,235,204,150,63,38,58,35,47,14,18,25,26,18,41,18,44,7,36,57,72,116,38,50,20,31,32,43,40,77,52,39,30,30,32,50,25,69,89,125,91,76,81,50,19,18,37,49,62,51,39,24,26,51,44,27,45,20,33,44,27,24,26,32,38,29,45,14,33,55,28,17,16,41,15,42,25,54,24,64,132,144,116,90,91,74,72,7,91,232,246,245,254,252,250,255,243,239,238,231,250,229,235,252,246,254,230,254,240,247,236,218,229,235,249,225,244,222,249,242,246,243,238,245,241,247,246,228,246,247,248,227,253,230,215,247,246,239,242,226,252,254,249,237,243,254,240,246,243,247,244,125,27,13,11,4,13,15,8,18,38,4,0,21,4,11,22,0,3,9,8,14,0,12,20,5,197,199,185,214,181,176,212,196,178,181,193,194,193,182,182,197,193,205,199,201,178,226,198,212,186,209,208,198,171,195,203,208,208,178,199,203,182,207,195,205,202,200,178,192,203,198,194,190,180,205,207,178,183,178,177,183,190,195,219,211,190,193,196,217,191,185,197,186,203,181,185,201,225,192,172,206,179,195,206,210,196,207,225,216,197,202,213,191,194,222,183,212,155,195,194,205,194,191,179,194,196,198,192,210,187,192,181,190,202,195,215,194,194,218,204,191,210,204,184,215,224,198,220,195,194,195,178,210,204,222,219,214,206,201,229,206,217,208,201,186,199,224,195,202,229,221,198,213,202,223,203,184,193,199,218,213,212,222,212,208,188,225,211,206,192,197,217,212,208,209,203,224,220,220,230,211,214,220,212,226,232,206,215,235,223,206,222,208,199,240,228,181,207,220,218,216,213,213,219,208,201,205,218,206,218,225,214,223,213,213,218,219,214,228,237,222,203,222,219,211,223,218,201,203,222,241,214,209,234,223,192,221,213,212,205,229,241,229,214,232,222,210,235,216,238,201,221,233,223,210,221,217,231,218,230,223,228,216,208,222,227,217,220,211,233,237,195,226,225,238,247,228,243,218,236,221,235,225,195,240,211,208,220,218,224,219,231,226,211,229,230,199,223,230,210,228,220,181,225,203,214,211,228,227,179,221,210,199,197,207,238,208,193,171,224,213,214,229,198,193,149,115,195,175,154,203,189,177,158,164,236,223,216,232,229,214,224,216,187,212,214,217,247,126,78,103,96,103,72,87,49,152,213,123,219,182,199,241,203,158,91,197,209,199,185,204,238,241,221,213,156,81,102,141,144,207,235,219,212,203,242,213,237,218,207,222,182,146,195,177,19,67,168,147,110,129,105,68,70,161,141,114,162,194,100,164,203,235,236,152,186,245,242,239,245,147,133,186,236,227,250,169,66,31,57,121,94,101,100,23,88,139,146,100,77,60,35,15,34,2,16,36,102,110,113,120,148,92,81,108,95,116,139,150,48,16,103,155,106,120,105,31,37,31,24,9,38,14,36,17,24,13,35,8,30,22,20,43,14,45,22,22,22,34,24,45,50,52,67,84,52,90,73,50,44,70,45,37,22,44,30,41,65,48,25,33,50,40,26,50,62,47,17,13,16,26,18,80,141,35,18,26,12,30,21,41,23,34,64,51,34,43,38,57,155,51,72,28,78,94,128,192,181,150,159,158,161,98,59,48,48,18,13,21,11,21,15,29,26,35,9,17,54,29,63,115,59,23,34,20,26,39,19,58,46,65,31,70,65,34,25,112,126,158,90,84,67,34,21,47,66,32,37,39,11,30,16,34,41,34,20,26,23,47,16,23,3,25,34,31,57,45,31,31,40,62,28,22,32,46,34,13,38,48,125,144,124,97,123,126,89,46,70,235,241,232,234,242,232,226,252,224,243,251,245,218,222,251,241,249,234,251,244,226,229,243,231,246,231,226,254,247,243,228,251,232,227,239,240,234,249,246,242,233,250,234,236,225,231,241,202,244,252,246,218,248,225,254,220,205,240,240,231,235,245,109,11,16,8,31,14,7,10,15,6,5,2,7,0,29,7,4,0,1,14,13,26,15,0,8,201,202,201,186,204,191,226,187,198,179,176,176,209,183,201,196,176,200,216,208,187,222,190,188,206,207,188,202,201,176,181,228,186,187,198,199,190,221,175,182,180,184,187,183,197,203,184,200,208,189,193,192,214,174,196,197,180,191,182,206,202,199,195,185,194,200,182,182,195,213,201,195,193,203,196,179,216,209,202,198,206,186,209,187,203,214,197,190,196,200,203,171,203,181,207,202,181,198,181,177,199,169,196,203,191,199,197,203,214,184,188,215,194,196,215,219,210,193,203,210,220,193,216,223,209,207,207,224,225,218,214,202,199,212,225,204,216,194,203,207,202,227,198,190,206,188,186,176,187,200,201,205,216,191,201,208,209,206,231,201,235,203,202,207,243,215,201,198,188,196,219,213,233,202,205,215,233,222,216,224,247,214,203,180,210,213,220,195,210,211,190,216,242,203,191,213,209,229,189,211,197,220,240,212,213,205,219,211,214,219,231,213,195,195,218,216,227,227,223,225,209,220,203,212,232,190,218,196,235,183,215,215,199,218,202,202,198,207,222,241,238,206,207,240,211,185,223,181,195,220,206,226,226,174,202,213,211,227,230,165,204,213,216,215,234,218,205,190,223,217,198,223,224,209,217,205,240,182,171,214,245,183,218,219,212,203,208,221,203,172,220,199,210,221,216,199,228,210,211,209,198,213,200,197,194,205,189,165,185,232,228,221,157,153,181,217,190,214,182,192,170,90,160,181,172,183,179,132,103,141,208,217,222,212,207,217,224,217,183,216,202,226,227,121,70,77,38,75,71,103,69,177,224,179,209,169,212,172,153,174,133,239,226,184,143,202,244,198,239,244,172,64,112,134,149,205,233,220,194,192,222,236,220,216,205,213,155,81,195,104,37,135,188,124,101,140,61,78,85,128,142,92,147,184,141,191,241,230,170,167,171,151,131,108,87,38,0,121,180,195,207,78,59,53,122,105,119,79,31,96,116,125,95,56,64,20,25,8,6,2,33,87,113,120,122,106,134,80,85,95,106,118,133,158,56,28,128,130,105,101,58,17,18,11,17,38,16,8,22,6,36,19,28,14,25,42,62,63,37,55,54,7,33,42,28,39,14,20,86,85,81,150,133,87,108,55,39,13,28,6,40,54,66,66,82,12,50,48,50,74,70,30,16,0,9,16,34,81,106,49,35,21,32,41,51,51,31,40,47,37,50,37,45,20,135,115,94,186,251,217,133,61,41,27,43,24,34,32,15,51,25,35,32,15,35,53,36,31,55,16,19,53,46,47,49,117,108,34,18,33,18,64,45,64,98,37,85,115,86,44,85,141,146,128,78,105,83,57,50,73,60,50,80,66,27,49,24,30,35,29,15,30,31,19,11,33,34,31,7,33,60,49,46,29,33,29,39,55,55,50,36,48,15,44,141,147,118,104,123,117,109,30,50,236,253,246,222,231,236,231,255,249,251,246,218,237,224,250,233,236,240,230,237,232,245,244,249,245,247,252,230,241,247,252,242,236,235,245,236,233,249,214,234,217,254,231,222,224,226,249,228,250,232,234,236,236,243,237,251,235,253,234,221,248,234,114,4,11,24,11,15,27,12,14,7,16,23,12,3,8,16,5,0,20,10,22,35,8,19,7,196,205,206,183,165,209,205,198,204,191,203,192,203,188,199,213,198,182,206,188,232,172,183,189,215,197,198,185,181,192,213,185,227,201,200,205,209,196,178,192,204,193,195,181,175,188,192,173,174,195,201,203,172,211,195,182,209,179,187,193,197,192,204,150,196,208,171,206,165,177,198,208,210,183,187,196,199,173,219,206,207,196,216,204,180,211,206,187,194,210,190,188,209,212,204,194,212,213,198,194,194,184,189,194,198,219,208,222,202,196,200,214,197,212,177,211,211,218,204,213,220,147,199,225,204,214,203,180,218,194,209,193,188,172,205,203,212,196,217,214,192,229,204,202,205,197,217,202,206,231,198,182,216,235,211,196,180,195,202,199,191,206,198,207,239,227,219,184,221,206,212,194,204,211,207,228,212,224,206,211,196,197,201,222,211,215,196,228,216,224,191,210,214,221,205,234,208,224,223,230,220,210,229,215,194,232,212,215,209,195,223,240,213,209,231,198,210,215,201,213,197,236,217,211,219,211,209,202,202,208,211,205,211,222,223,216,218,197,216,212,193,217,224,222,202,210,192,218,212,208,209,198,207,217,204,207,220,203,205,217,218,203,200,220,217,231,221,208,213,211,196,232,242,182,217,198,207,204,174,242,237,225,197,200,219,210,206,200,206,204,187,226,216,192,184,214,206,210,203,208,225,224,217,185,195,230,191,169,155,219,197,205,138,142,213,204,198,229,173,186,120,110,157,165,140,166,131,128,91,124,195,210,218,213,214,211,226,217,178,209,206,234,230,83,85,54,85,84,76,89,76,203,229,190,222,172,170,172,214,182,203,228,147,119,153,194,238,215,211,234,118,67,113,122,168,210,225,205,213,243,210,229,225,229,225,184,97,103,159,68,66,202,162,110,166,75,32,48,74,135,213,108,202,217,173,233,215,170,102,44,64,60,18,29,21,15,2,114,185,135,73,1,67,103,110,134,79,41,70,113,144,81,54,79,46,15,35,15,21,34,64,114,131,83,99,88,72,93,102,82,107,99,130,130,16,27,127,125,100,123,59,28,8,24,23,25,7,33,29,26,34,19,40,52,74,82,79,64,51,42,54,44,28,34,35,29,0,36,156,139,117,158,157,95,103,42,37,36,32,48,52,49,74,70,62,18,43,23,52,92,44,18,34,37,14,35,19,69,102,68,49,14,49,24,30,13,26,21,30,15,54,77,29,58,203,139,64,192,224,185,158,124,88,63,69,57,51,40,22,88,21,33,47,20,14,33,21,14,78,30,12,38,37,32,35,115,155,58,32,17,18,27,63,78,53,32,154,138,67,82,125,115,178,147,125,141,62,85,64,66,46,24,49,36,10,31,13,29,16,64,23,25,39,28,29,38,7,37,24,43,37,28,36,18,30,40,45,29,38,35,21,36,43,57,133,132,135,108,95,121,84,23,67,236,232,254,249,234,229,227,242,254,224,241,217,244,240,212,242,233,242,253,247,240,239,255,243,236,241,234,225,234,238,250,244,230,243,239,208,242,234,250,239,244,242,227,221,216,238,230,228,220,231,238,251,222,235,248,250,238,231,245,239,250,230,116,2,15,9,9,7,29,10,11,24,34,18,9,3,6,9,4,8,18,1,24,37,19,4,23,186,191,212,221,192,177,183,194,201,204,191,203,211,182,221,195,188,193,198,190,216,210,212,206,180,205,179,194,183,205,208,197,199,185,208,200,207,205,195,204,184,201,187,195,170,201,182,207,192,179,208,178,190,206,178,214,206,203,174,209,208,202,190,196,145,171,200,193,183,189,175,203,191,186,213,201,178,188,221,180,197,177,202,195,213,192,206,172,207,195,192,186,196,200,194,171,194,186,188,195,199,192,191,191,219,194,193,220,205,204,199,220,197,194,200,207,211,214,192,186,193,199,210,209,203,205,196,223,202,201,208,224,188,230,190,198,193,190,223,202,194,196,198,178,197,192,169,214,218,212,196,217,191,226,204,211,226,222,228,223,215,215,220,198,186,197,217,210,210,207,200,197,177,200,215,208,208,192,190,216,228,178,202,231,193,194,177,222,208,206,218,196,217,205,206,235,222,198,219,212,206,219,226,215,176,216,230,210,224,224,193,215,190,206,214,221,202,206,214,227,208,191,205,202,224,199,212,212,225,218,227,229,203,203,221,200,207,204,212,210,188,193,202,234,190,206,197,196,215,176,204,196,196,203,230,212,188,226,215,225,214,220,163,190,190,208,210,204,220,224,200,218,246,198,203,189,238,186,145,167,220,207,207,217,214,193,204,195,205,207,191,202,207,210,206,195,182,188,207,223,193,220,226,180,187,195,189,165,170,209,213,209,152,173,192,220,190,224,226,156,146,119,161,169,137,184,151,172,147,172,208,183,214,229,196,203,212,206,156,194,211,217,210,128,89,80,102,68,70,77,49,167,187,183,184,169,195,165,223,170,210,177,132,157,189,220,239,206,234,249,116,76,140,112,182,141,158,205,206,227,215,233,210,241,161,135,111,146,143,49,139,179,109,157,111,51,9,35,46,116,171,106,156,144,149,186,128,56,28,30,28,58,53,73,66,37,41,199,188,43,37,36,115,99,106,96,13,30,121,131,115,33,51,33,2,11,48,38,44,96,73,122,94,117,114,105,71,118,109,103,91,91,110,121,11,14,136,93,100,99,66,28,30,37,21,38,11,18,32,18,49,48,101,121,128,116,120,84,75,95,69,48,48,46,55,55,39,57,124,143,87,124,130,94,100,46,50,56,61,40,68,66,60,112,20,25,48,50,80,119,73,47,16,16,5,12,3,41,82,98,57,50,58,59,65,64,52,39,60,44,56,66,44,95,197,105,53,42,92,74,44,137,147,145,92,57,41,37,31,66,42,43,13,14,34,28,22,29,36,26,14,20,41,29,50,116,112,57,30,32,23,51,58,90,68,106,180,121,96,113,123,100,123,151,137,162,74,91,70,47,38,5,42,28,22,41,49,15,36,42,20,44,25,27,20,21,35,73,17,42,24,52,54,35,32,34,31,20,35,32,27,49,37,29,144,140,127,109,107,97,96,22,58,222,244,243,236,238,248,254,247,227,255,245,248,255,242,242,242,213,246,238,231,231,255,239,234,230,222,232,236,244,204,241,252,227,244,230,244,222,244,244,242,238,249,231,222,233,245,238,249,253,237,234,224,250,211,233,233,249,227,241,247,220,221,139,5,5,0,36,16,23,1,14,34,19,11,20,12,7,5,8,25,39,16,15,4,24,4,7,188,218,189,168,203,196,192,190,190,192,199,190,202,198,227,195,192,211,205,189,189,208,179,211,206,207,207,204,167,188,202,205,180,176,188,189,207,190,187,193,190,180,200,195,206,186,202,206,198,160,198,209,211,193,195,188,198,201,204,169,183,193,198,181,180,184,201,178,201,199,208,187,191,210,188,180,181,182,186,190,209,199,197,170,201,190,203,164,192,195,214,205,223,201,197,190,181,201,217,195,187,186,191,200,201,208,221,209,208,206,192,174,195,204,204,217,231,230,205,196,214,215,222,205,217,190,204,190,203,197,184,176,206,195,204,193,197,209,213,189,206,197,191,191,215,199,195,202,205,201,216,197,217,191,190,208,198,212,192,197,194,182,198,212,220,201,205,206,182,196,202,187,216,214,204,215,184,206,221,220,206,230,194,207,209,212,205,221,200,224,220,199,220,211,227,195,210,179,204,209,233,205,216,207,199,213,219,220,203,221,207,238,191,225,225,247,241,221,194,211,201,205,218,222,216,205,219,213,216,221,202,227,230,203,209,225,209,207,219,206,189,184,201,192,223,205,220,209,214,192,193,195,227,191,201,216,202,211,192,210,215,196,207,174,228,198,204,193,202,224,187,163,210,205,206,207,201,167,56,83,190,215,215,193,192,194,204,239,198,198,206,197,213,197,209,198,205,213,220,187,191,211,200,187,199,203,162,153,205,209,207,199,142,178,194,213,217,214,224,148,118,95,185,147,200,200,189,185,159,176,208,207,208,217,207,203,232,205,169,205,231,238,189,132,95,49,103,77,90,91,8,117,175,147,164,218,212,194,193,128,209,206,186,178,184,232,205,205,228,255,131,109,170,140,173,118,147,187,202,241,229,246,227,194,107,118,134,169,86,85,208,127,1,149,156,59,71,69,22,58,110,68,76,96,64,86,40,24,43,24,69,88,87,55,70,90,67,198,117,26,76,84,98,111,71,42,55,131,139,113,59,42,64,27,8,22,34,52,95,92,116,106,102,125,103,97,94,88,105,67,74,130,145,93,48,82,140,126,86,137,22,17,9,6,21,18,3,35,26,26,59,109,139,162,114,107,99,81,106,100,100,102,107,83,59,80,92,114,123,147,58,132,148,180,131,45,122,50,55,70,64,80,61,87,18,46,47,46,47,56,62,53,31,12,12,24,40,14,41,31,55,87,98,109,81,88,111,90,74,119,74,78,94,85,52,34,12,20,58,65,33,86,103,128,120,69,46,11,11,44,31,41,35,42,33,51,22,83,56,34,45,57,84,16,80,97,69,14,37,19,6,39,35,69,67,103,203,88,71,110,100,69,60,69,123,110,57,59,85,68,56,23,63,44,28,47,48,38,9,21,23,41,19,17,25,68,32,59,75,39,63,23,52,42,26,19,39,24,32,20,48,56,58,9,107,136,140,113,104,128,112,39,23,195,255,251,254,242,252,241,226,253,230,234,251,227,244,235,237,244,220,239,238,238,217,249,246,236,228,244,229,222,253,248,219,239,238,250,237,238,238,241,236,251,228,255,239,234,226,240,231,233,241,253,227,245,241,244,233,238,247,227,249,255,233,119,2,0,10,2,9,17,1,3,17,18,4,25,10,13,1,6,1,30,14,16,4,39,6,8,173,215,181,205,199,198,201,200,196,185,183,196,183,222,199,197,176,188,205,202,199,208,177,191,176,174,181,178,201,198,175,194,192,201,185,209,199,200,201,201,202,207,183,169,176,207,198,202,185,172,185,199,205,200,181,196,211,182,208,185,178,180,196,187,203,197,187,194,170,180,208,178,189,212,198,189,219,197,194,208,211,199,188,195,172,209,196,177,191,186,182,172,177,207,170,203,200,200,218,191,180,232,182,213,209,178,215,187,187,208,195,201,188,229,209,218,218,206,211,210,204,200,225,224,207,206,197,185,195,192,213,201,218,162,210,209,204,227,213,199,206,203,215,168,214,211,184,185,189,196,197,196,196,181,204,207,220,223,233,171,211,205,215,213,210,204,190,193,206,182,197,207,232,204,236,198,203,196,206,196,193,224,217,183,212,190,206,214,208,202,216,206,199,199,207,218,229,215,217,202,227,211,206,233,196,223,231,196,189,206,188,219,214,207,198,198,234,204,187,183,211,235,223,223,203,201,192,210,218,239,234,221,224,206,217,180,190,182,203,198,221,202,231,238,171,211,201,211,231,180,212,210,188,181,227,206,225,188,186,219,183,189,195,215,208,218,185,181,161,201,186,134,222,224,217,227,192,195,85,98,151,195,217,180,213,221,203,203,212,197,211,215,188,181,216,198,193,225,217,177,199,190,225,203,173,214,165,167,189,218,224,170,148,190,204,216,197,202,186,136,119,114,173,188,177,147,141,182,172,149,186,196,209,208,216,240,206,206,164,224,225,228,166,135,84,77,124,91,130,77,25,119,154,173,231,226,192,199,151,117,216,215,199,180,209,225,238,192,222,237,136,200,203,185,166,129,149,167,184,221,240,252,214,76,80,142,183,172,64,121,221,84,88,195,115,57,44,77,81,54,58,70,65,50,43,50,45,55,84,43,31,50,24,63,41,32,26,64,54,50,84,82,108,67,45,48,100,147,119,57,80,39,1,26,22,23,71,64,95,101,116,119,90,127,98,109,108,95,95,34,150,168,158,123,90,128,173,125,124,101,44,12,6,17,36,31,41,56,16,50,88,121,132,111,117,114,88,87,127,109,149,134,122,83,94,123,103,99,117,91,120,135,98,152,83,50,79,18,45,79,76,114,50,64,39,26,30,18,39,40,20,35,16,55,22,30,36,15,14,14,85,100,87,74,78,133,176,127,96,125,146,138,119,50,7,16,31,37,47,60,64,147,146,142,120,58,44,25,26,35,28,41,51,63,63,61,53,66,60,61,55,81,69,55,73,83,37,53,30,33,40,41,12,31,25,67,116,58,51,74,98,76,53,80,75,104,69,119,116,66,40,85,50,67,67,65,54,10,16,9,23,27,52,50,40,62,73,97,113,90,89,75,30,36,52,30,40,53,49,27,13,37,58,19,109,162,131,144,86,86,118,40,30,186,239,232,243,243,246,237,224,235,222,230,240,249,242,230,238,235,213,235,223,246,245,241,249,236,231,243,231,251,226,239,207,216,232,246,247,235,231,235,233,225,229,240,242,241,247,216,236,228,219,246,235,245,222,249,204,240,209,244,244,249,239,135,5,10,1,0,21,4,16,18,0,5,14,16,2,13,10,19,2,8,17,1,0,2,9,20,191,189,184,217,195,184,206,215,205,213,180,196,200,192,221,195,192,233,191,198,198,173,205,191,199,190,215,176,189,198,196,206,194,193,198,215,175,192,192,193,190,176,206,198,180,199,196,196,193,174,199,192,194,182,202,192,213,196,197,211,204,195,224,189,183,176,185,204,221,188,189,182,178,193,195,182,194,203,210,181,202,188,210,196,190,187,198,202,176,185,183,208,202,201,202,196,211,190,201,179,212,194,210,189,204,202,212,188,209,176,187,222,196,205,202,200,201,172,205,182,199,212,211,203,207,197,183,185,209,210,215,193,209,202,191,203,192,203,184,217,199,198,206,194,216,193,205,207,198,195,179,213,193,199,209,207,210,200,192,208,204,190,207,198,200,197,209,195,223,188,191,193,206,213,200,224,202,227,213,231,208,212,207,217,217,203,219,218,207,217,187,217,210,205,222,228,221,200,199,193,239,193,205,197,202,192,221,212,193,215,219,219,212,189,210,241,203,228,194,212,199,223,179,212,218,213,194,198,230,206,238,201,227,210,193,211,205,171,219,212,195,191,183,191,218,222,201,181,182,218,193,211,203,202,211,220,214,199,218,170,188,198,197,220,210,217,196,196,194,204,207,163,200,214,185,197,214,216,204,214,197,192,209,196,206,183,177,220,216,190,196,224,214,195,210,222,192,205,229,203,187,183,173,207,188,178,196,170,201,214,233,176,147,191,186,212,199,229,181,167,111,44,118,180,159,174,162,206,171,155,203,195,198,189,211,228,241,208,181,201,213,228,127,131,110,68,100,80,99,74,60,164,220,193,241,173,150,211,153,175,222,192,150,172,218,219,236,198,247,221,154,239,239,162,165,185,157,129,145,210,223,233,146,14,95,162,152,123,142,177,169,68,93,162,79,64,67,65,57,53,55,66,63,73,69,48,82,25,44,24,5,13,21,0,28,2,4,43,41,99,80,71,75,31,43,85,116,107,58,92,33,18,1,8,17,39,112,113,103,135,99,121,93,109,101,105,115,82,91,116,216,215,170,128,145,117,157,84,111,99,24,33,5,31,7,36,49,12,59,99,99,140,124,89,84,101,88,90,76,80,76,99,74,40,42,61,95,85,74,102,82,72,45,72,41,37,78,35,49,77,99,84,54,51,25,16,36,28,20,39,18,1,21,21,31,36,19,14,41,57,69,77,37,58,17,25,41,47,35,32,19,41,49,19,14,27,12,61,77,49,119,202,183,110,74,33,28,40,45,35,25,33,58,40,54,40,21,43,43,68,75,83,124,119,56,66,50,33,20,19,28,36,17,42,21,58,88,55,17,46,121,79,90,94,95,112,116,125,127,61,28,86,74,72,16,14,79,33,46,46,46,40,66,40,78,73,89,92,107,113,81,93,69,56,13,52,36,23,45,38,38,25,25,28,73,113,129,130,96,85,120,78,21,154,239,235,255,227,254,254,228,215,227,226,228,222,225,242,227,229,237,229,235,205,236,225,242,235,222,249,251,245,224,217,250,220,238,220,243,226,237,234,207,233,247,239,248,249,248,244,243,239,228,244,242,224,234,233,226,241,212,241,240,245,222,117,16,23,11,26,8,14,24,6,4,23,13,26,25,7,15,0,5,6,9,14,10,27,15,16,199,207,206,171,208,209,183,189,211,201,187,202,211,182,205,201,202,195,201,167,202,181,214,199,196,197,185,188,198,181,203,190,202,214,180,185,187,165,204,201,194,197,196,214,191,202,173,211,187,193,187,166,217,208,190,196,211,200,187,188,204,188,181,199,192,191,184,187,192,200,213,187,212,184,177,194,186,194,181,212,197,201,211,202,182,186,176,197,189,179,201,199,181,195,185,172,204,191,186,175,200,187,189,222,210,204,202,182,220,191,176,234,194,196,188,195,221,205,210,202,227,199,199,207,205,205,194,193,195,203,202,202,204,201,217,182,220,212,193,180,192,203,174,183,200,202,199,206,218,208,200,187,217,198,211,186,223,191,204,206,210,206,220,195,210,213,199,222,193,209,188,211,191,178,205,214,209,205,178,218,192,216,228,210,210,214,205,195,194,222,239,207,199,199,191,215,191,227,196,212,235,206,205,219,210,221,216,221,198,211,201,191,204,215,210,215,209,192,192,208,221,210,200,223,216,237,215,189,212,207,202,223,230,192,202,225,194,190,202,200,206,200,206,225,227,206,188,205,225,213,193,201,208,219,210,207,191,180,201,192,206,225,206,212,208,218,178,179,171,195,186,202,240,212,199,188,196,205,173,217,218,183,225,201,192,209,191,207,207,179,232,216,190,181,202,209,197,201,195,204,217,218,192,212,175,215,170,173,201,206,199,171,127,211,191,199,201,224,172,121,84,55,155,180,164,158,156,196,165,134,195,185,207,195,196,205,203,195,164,212,179,188,135,167,100,55,108,77,92,52,68,185,243,213,200,110,178,204,198,179,144,154,153,201,233,221,220,237,232,185,160,238,238,186,141,181,174,157,133,182,234,230,102,28,134,143,154,186,224,195,79,75,75,88,41,6,25,17,33,20,32,32,45,50,50,33,39,26,1,15,13,12,3,50,10,11,17,62,88,109,100,51,61,33,77,109,114,65,46,52,4,11,10,34,25,119,106,123,94,86,87,56,120,99,84,76,77,74,112,176,229,192,113,113,141,124,130,97,128,70,14,37,6,33,28,31,31,56,95,192,148,117,92,65,92,108,93,87,33,43,50,63,42,80,45,43,78,77,91,64,79,55,29,48,12,34,42,10,48,70,40,43,51,50,41,26,27,38,9,9,5,24,30,26,28,29,60,49,68,97,56,84,74,83,32,37,41,17,27,20,58,28,39,49,40,33,20,72,89,91,163,189,138,70,75,56,36,52,36,32,25,38,27,37,21,12,28,45,30,23,24,70,104,140,114,87,53,69,28,38,29,38,30,49,26,108,80,51,31,73,118,100,106,81,113,168,101,60,57,35,59,49,49,42,39,60,53,62,81,55,70,66,72,74,83,100,115,114,97,92,80,99,63,42,34,18,24,41,71,20,52,41,40,43,64,126,154,138,101,114,76,56,37,130,245,233,254,230,228,237,229,216,236,232,241,229,242,211,230,235,216,238,248,250,224,221,254,240,251,231,238,236,242,249,243,249,243,226,246,245,255,253,238,243,247,234,214,254,245,228,239,250,243,247,228,242,239,219,233,231,238,235,241,246,220,116,5,15,12,34,5,13,16,9,7,17,22,2,34,4,9,2,33,3,6,4,15,2,8,37,193,198,179,208,201,203,173,191,197,213,167,198,195,211,206,174,190,214,175,223,213,195,196,187,200,209,213,203,192,174,233,217,179,196,188,210,193,205,176,176,170,201,182,194,197,185,202,193,197,198,184,193,204,186,196,199,183,173,192,198,189,185,191,193,202,194,215,220,181,180,208,194,209,217,214,200,199,187,190,216,178,202,204,185,212,202,197,197,245,195,213,212,200,208,206,218,189,220,199,176,183,183,172,194,193,215,201,204,195,193,182,201,213,200,215,209,218,202,205,212,214,198,211,222,184,208,216,212,216,210,240,205,204,196,200,205,206,219,195,189,205,192,203,190,194,207,191,213,191,210,204,198,183,236,216,204,202,231,218,207,214,229,197,200,189,209,182,185,219,202,213,211,212,206,226,193,211,213,191,212,184,186,209,207,175,210,226,208,207,232,232,209,204,214,208,211,208,202,220,175,208,223,188,197,201,210,201,184,231,196,191,193,201,199,215,203,208,211,226,226,233,207,210,215,222,217,238,199,204,189,207,227,212,219,204,192,210,208,215,214,211,222,194,196,235,199,208,223,213,190,199,207,205,223,189,207,219,203,223,197,179,207,190,214,217,208,199,206,186,186,222,212,201,227,215,195,186,202,181,194,213,167,200,197,226,211,209,208,217,202,209,206,215,218,187,208,206,188,216,212,189,207,181,200,190,197,188,170,210,210,206,143,166,225,215,205,199,215,170,119,103,118,190,205,173,164,190,200,150,128,186,173,202,240,189,190,216,195,186,212,225,199,159,189,118,77,80,74,69,17,59,184,169,170,213,163,207,246,176,172,145,171,188,216,224,223,224,237,226,118,170,233,238,184,159,160,200,165,138,191,208,215,87,101,143,123,154,200,244,146,44,49,77,72,19,35,23,25,28,21,20,25,33,16,16,1,13,6,20,5,12,12,12,11,14,20,32,48,85,106,101,35,51,75,98,111,63,53,23,14,5,12,23,54,52,103,106,116,108,123,91,94,116,107,86,110,90,89,162,197,210,125,71,115,106,98,138,97,109,75,33,21,19,11,26,20,22,51,156,205,159,72,120,40,30,39,66,45,34,52,18,56,43,31,47,53,56,44,61,87,67,53,41,57,33,30,5,53,46,81,65,38,64,66,62,78,79,37,24,25,19,0,38,69,87,74,83,113,121,106,69,88,85,46,46,31,14,40,32,44,49,58,80,72,93,105,140,129,162,141,121,142,113,146,113,81,96,92,101,79,75,81,48,45,16,14,17,7,26,17,64,76,80,104,78,119,94,54,67,30,49,38,37,10,73,131,84,52,13,94,133,98,94,80,126,139,60,22,83,57,2,33,69,10,41,33,64,62,80,65,60,88,60,86,102,89,78,121,92,79,65,80,82,82,69,22,45,29,23,20,19,35,38,17,71,143,141,117,97,82,119,56,17,159,239,254,246,229,232,212,232,224,241,241,234,209,236,239,231,238,238,231,244,207,242,229,249,218,236,244,229,249,245,255,231,232,245,237,246,225,222,237,226,243,248,235,231,250,223,239,241,243,239,213,234,245,242,223,241,240,248,233,237,250,241,110,11,8,0,23,18,1,27,9,27,20,12,9,16,5,1,12,19,13,15,22,18,29,28,35,208,207,218,217,204,188,188,197,175,213,180,183,203,199,195,194,206,215,191,206,194,201,200,176,172,189,189,199,180,192,199,188,187,169,197,192,203,182,196,213,176,204,203,195,205,184,207,195,190,190,192,207,197,189,193,179,181,214,192,198,164,213,184,226,179,190,221,214,202,195,205,208,203,196,184,193,204,197,183,210,176,174,182,187,186,192,174,163,192,160,160,206,225,197,182,199,198,187,208,186,185,211,185,222,191,202,198,209,197,197,214,193,184,167,196,215,185,193,217,204,208,190,193,199,205,204,209,196,195,199,218,203,191,194,171,216,219,181,184,200,213,196,199,193,185,178,232,182,180,224,193,219,195,197,206,197,217,179,201,194,190,204,207,196,199,207,217,222,205,204,201,206,191,193,189,198,221,183,202,208,197,206,192,209,218,206,198,233,192,208,206,216,206,202,225,226,212,237,222,213,202,188,217,233,202,222,224,213,216,224,234,209,186,212,214,222,244,217,199,201,212,207,207,210,199,198,215,193,214,205,194,227,215,200,208,211,209,227,207,215,198,232,202,230,210,198,197,188,202,168,198,196,223,224,182,201,194,215,195,204,210,224,201,214,222,199,200,206,210,205,210,199,183,198,214,182,199,211,182,204,198,194,187,200,194,196,211,188,216,212,203,194,195,187,215,184,207,204,179,193,192,219,210,219,210,189,138,157,205,196,189,164,167,191,215,229,197,216,155,120,95,109,171,170,135,174,197,238,159,108,183,226,173,215,204,197,207,164,167,205,184,212,157,172,117,71,103,104,84,60,68,64,153,236,193,176,199,181,169,163,190,197,199,218,223,247,227,232,214,86,169,244,245,210,169,181,180,116,126,190,225,220,159,106,157,173,176,227,226,93,33,69,33,30,5,21,13,32,23,22,39,23,12,16,9,24,11,19,16,4,19,28,46,22,34,5,87,81,84,89,56,49,68,102,66,77,61,57,27,38,24,44,54,87,101,142,126,94,90,96,89,130,75,99,84,78,120,181,202,136,154,114,71,105,72,92,118,79,107,62,24,38,26,14,28,48,23,70,156,166,99,85,50,50,42,28,29,44,66,46,37,49,41,16,35,21,61,56,53,52,77,42,43,51,56,58,38,38,50,52,56,69,66,58,73,77,73,60,48,33,37,27,76,87,103,75,96,80,77,47,23,43,34,49,45,34,56,54,100,118,151,190,213,197,220,173,197,168,148,142,67,54,63,76,103,116,119,131,140,120,134,141,134,131,98,87,69,31,53,25,45,47,66,75,38,42,67,48,53,82,63,54,68,55,97,115,73,50,66,156,111,112,108,63,95,102,52,45,64,33,54,43,33,67,78,47,25,42,59,58,95,121,96,22,36,79,63,73,51,64,69,95,85,43,48,76,32,18,39,33,38,31,42,39,53,154,145,134,124,86,103,86,69,150,247,222,234,252,234,251,248,255,242,250,220,223,238,242,238,207,225,228,223,230,247,243,243,248,236,245,252,238,239,243,251,205,250,238,248,222,247,246,249,223,237,251,250,251,236,236,218,234,251,243,237,247,242,226,238,224,236,241,234,249,226,106,12,21,16,5,2,5,3,23,14,15,18,32,20,3,10,12,15,19,15,4,3,7,10,27,200,186,199,194,211,197,188,207,203,203,212,171,193,170,206,207,199,186,192,191,211,216,170,192,187,188,181,193,192,181,189,180,212,174,202,196,215,204,198,186,184,204,207,200,186,210,210,205,217,188,214,196,193,187,196,200,210,166,194,188,183,181,198,196,194,188,215,193,193,193,208,181,203,179,189,210,181,183,205,212,201,207,202,204,213,189,201,201,200,197,212,200,211,218,226,195,183,202,201,181,220,192,171,204,182,188,198,231,179,191,188,195,209,198,191,213,196,201,202,173,206,218,217,196,197,210,210,208,205,199,203,195,193,232,188,197,197,187,205,214,188,221,197,186,201,212,186,212,196,197,196,210,186,214,193,197,172,177,176,220,191,205,221,195,201,207,192,197,207,197,193,202,178,192,229,199,193,213,208,195,201,201,197,209,204,213,199,211,205,210,212,197,203,197,227,219,199,209,214,211,200,205,205,210,227,210,201,202,191,180,201,191,200,187,219,178,199,218,224,214,215,228,211,214,214,192,193,185,230,219,221,199,219,211,222,211,201,204,203,202,195,218,216,186,197,182,206,207,193,208,184,192,218,210,207,174,175,172,204,196,186,200,185,204,225,221,213,224,190,181,224,177,196,200,205,189,204,208,202,191,200,202,185,189,240,213,181,217,195,189,210,205,205,175,165,185,210,207,207,200,186,193,205,214,212,199,164,176,212,232,199,148,171,209,198,212,207,198,169,95,98,83,154,151,176,205,217,237,205,115,181,192,180,223,212,193,191,178,170,190,165,197,138,87,80,63,137,116,84,88,70,137,207,227,207,181,131,184,189,153,198,181,218,215,201,237,213,238,194,100,184,225,229,225,202,168,118,152,163,189,225,239,197,146,183,244,192,221,189,39,59,49,4,30,22,31,10,32,27,11,30,13,8,18,39,39,19,44,22,18,15,10,14,3,27,45,85,90,51,47,53,64,124,101,67,83,38,32,11,91,69,52,66,96,115,143,118,125,91,108,82,72,107,57,98,93,161,190,119,72,92,62,101,108,89,89,142,121,108,54,10,16,26,46,46,29,21,87,153,135,81,70,40,49,46,49,52,27,53,57,40,55,50,28,54,47,44,42,52,50,63,50,75,41,38,41,16,52,37,27,40,58,51,66,69,56,66,47,80,57,66,102,121,93,62,54,50,79,84,35,25,25,10,42,49,79,122,204,205,221,223,214,155,140,133,53,68,60,74,83,54,23,16,21,56,44,80,91,39,36,51,59,75,110,122,180,153,120,120,100,46,50,31,38,32,39,38,3,33,55,78,55,79,59,68,87,69,118,93,154,104,103,97,37,72,112,33,13,22,23,48,26,35,59,58,57,47,78,64,58,56,90,43,53,28,28,60,73,52,51,45,55,79,54,72,42,43,45,48,44,36,2,28,23,55,139,160,125,121,74,97,89,41,167,204,220,250,254,252,253,219,215,214,227,218,230,243,220,206,221,216,243,228,255,242,254,248,252,236,249,253,219,251,240,242,251,229,244,248,253,236,234,238,205,244,233,243,252,237,253,241,222,255,255,236,214,250,234,222,246,227,224,244,231,231,122,29,6,38,6,12,7,30,7,11,30,8,6,1,1,0,18,8,6,29,1,8,53,23,25,204,194,189,235,188,207,213,196,197,208,175,188,172,182,206,175,194,188,196,210,179,216,200,202,182,190,191,220,209,196,183,218,198,198,207,216,216,185,213,176,188,185,203,199,180,203,177,213,188,198,204,198,189,227,193,186,222,168,188,190,187,189,179,185,208,195,200,208,190,190,180,189,202,191,199,189,162,173,203,195,204,188,188,187,184,203,211,208,223,178,200,178,193,193,194,206,204,178,197,183,212,214,199,214,189,185,188,210,208,190,197,200,204,191,215,182,203,189,177,177,189,191,196,198,204,201,216,206,214,194,191,197,212,209,191,169,209,183,227,214,194,201,196,182,189,227,190,178,194,226,218,192,206,196,175,196,212,173,223,184,226,220,199,219,210,197,200,217,202,194,178,200,192,206,202,206,204,199,199,216,234,229,197,213,217,196,197,215,221,209,193,202,211,196,209,217,204,229,213,235,218,229,194,213,201,205,225,199,208,213,212,232,203,207,178,188,211,215,229,193,193,221,230,193,206,195,217,198,195,193,221,203,205,186,209,190,181,219,197,214,193,194,222,192,208,208,219,233,204,210,209,218,202,218,201,196,203,204,204,183,235,203,200,217,200,187,179,228,182,208,212,210,179,213,200,204,193,207,188,228,223,202,199,181,217,179,214,194,202,184,205,186,231,202,211,194,216,198,194,180,211,187,198,208,223,160,161,183,204,212,182,161,175,192,203,226,192,179,134,93,96,77,136,168,192,198,230,214,165,102,158,203,215,185,195,234,233,182,217,178,161,170,120,99,67,58,115,81,73,72,68,154,243,212,177,159,132,210,176,115,173,162,201,220,230,220,212,242,190,88,210,237,236,251,212,146,133,152,161,178,175,204,183,165,197,233,198,214,170,59,71,41,9,29,35,18,28,39,10,25,9,19,22,28,8,10,20,37,33,2,23,7,56,40,26,43,104,65,23,36,49,88,70,71,78,49,20,37,74,139,177,141,121,129,104,113,79,96,106,118,93,98,95,115,105,88,105,114,72,61,90,107,98,114,98,99,91,139,86,45,28,26,36,52,21,34,2,23,116,69,59,50,48,33,57,21,50,59,62,52,50,35,29,20,19,63,25,37,70,21,38,26,14,28,38,22,26,43,67,62,51,67,62,48,67,50,47,63,59,78,73,104,74,49,38,28,21,52,38,43,25,31,53,128,159,195,162,165,156,94,75,24,46,66,48,77,19,76,68,52,55,66,54,44,66,58,39,56,59,33,49,48,36,30,51,75,85,128,154,130,119,104,75,69,29,46,16,23,22,8,34,55,35,82,68,63,93,95,102,125,99,101,63,52,85,68,61,71,54,27,49,63,85,56,50,60,61,58,27,36,42,46,31,30,16,61,33,33,57,50,52,28,44,60,61,91,45,84,45,35,41,50,26,29,72,147,133,149,89,99,116,111,72,84,119,185,245,244,252,217,234,235,224,224,232,228,228,224,218,236,224,232,236,249,235,240,245,240,220,246,233,255,246,252,227,221,244,242,228,251,255,227,248,244,220,242,249,228,240,217,241,253,224,243,246,230,220,243,239,239,218,241,232,231,241,117,6,29,5,8,10,1,14,25,2,26,35,16,17,3,14,0,20,3,9,15,36,8,6,11,210,189,173,187,191,181,185,213,202,186,178,202,193,192,182,192,183,195,210,212,186,194,190,190,184,209,190,174,192,191,186,197,175,195,211,197,188,203,195,205,211,209,207,213,197,202,215,220,191,181,177,195,174,178,189,219,192,203,214,183,188,182,188,186,192,194,205,198,179,184,183,190,174,190,211,205,186,175,196,204,184,190,210,193,198,197,187,193,178,222,212,190,210,187,211,207,202,185,194,213,198,202,201,207,195,204,214,214,208,208,196,207,203,198,184,195,202,203,199,195,206,194,170,208,216,181,197,206,179,204,214,229,190,184,185,176,169,195,194,219,188,203,207,196,217,222,196,206,186,200,204,199,190,202,210,216,210,191,199,206,211,194,191,174,208,204,193,196,195,205,211,184,212,203,207,191,188,205,211,188,190,213,239,197,215,215,217,222,212,207,184,231,217,188,192,193,211,198,205,176,213,208,203,238,187,226,212,211,226,229,209,218,198,230,208,210,212,207,193,208,207,210,195,190,215,199,213,221,208,226,221,192,198,190,172,222,198,229,203,217,196,218,214,168,180,196,218,211,215,180,194,196,222,210,197,209,197,202,208,184,213,208,218,176,199,210,210,195,182,188,194,205,189,216,217,217,209,204,196,223,208,215,191,201,189,226,199,169,200,190,185,196,216,220,203,172,198,196,216,200,194,198,221,215,213,144,179,187,193,215,205,152,208,208,215,216,212,182,138,105,83,96,180,167,191,220,217,206,154,107,137,225,239,190,210,179,210,178,160,206,188,206,160,162,113,60,108,80,90,92,53,190,240,147,154,149,167,243,147,145,150,181,242,216,233,227,212,215,141,124,240,213,252,239,205,171,189,179,204,187,160,178,167,159,197,193,209,191,157,87,73,7,17,8,17,25,16,12,19,16,15,19,4,16,5,14,18,25,36,29,17,21,7,11,76,82,71,60,33,61,85,104,70,60,32,44,1,43,145,203,216,108,76,85,113,115,122,90,115,84,98,93,99,88,108,90,72,72,83,128,135,80,88,84,77,134,93,163,90,21,30,15,43,46,20,22,26,56,35,65,52,23,20,49,51,59,47,75,56,50,26,13,67,48,35,64,49,60,37,50,36,25,7,28,43,32,43,49,62,29,42,53,68,59,64,51,56,68,67,64,71,58,26,42,42,8,26,32,24,53,93,129,213,211,158,143,70,41,49,49,32,57,35,53,56,55,53,51,40,34,66,60,59,53,50,57,44,67,42,71,38,14,39,26,62,60,59,53,47,81,100,128,146,75,91,50,56,31,43,8,7,17,52,80,64,68,57,69,90,74,68,56,49,41,63,63,53,31,35,36,68,68,58,66,52,47,46,57,39,21,46,45,23,47,24,53,63,57,59,55,35,46,62,56,71,69,42,33,70,11,25,39,59,35,30,107,140,129,99,84,107,91,77,149,68,100,185,218,223,243,233,241,245,228,235,230,229,234,218,244,219,254,247,237,246,236,227,232,237,240,231,228,241,231,238,248,249,241,231,231,222,238,242,230,223,254,233,250,246,240,211,239,249,247,228,227,246,228,227,215,224,234,231,251,224,91,19,0,12,21,10,14,19,6,7,13,22,2,24,1,12,2,3,9,1,22,14,1,30,29,198,189,194,215,215,175,204,197,182,208,197,192,209,207,205,186,197,217,214,215,186,209,216,189,204,215,178,188,166,192,200,196,197,189,195,212,197,184,190,202,188,185,190,190,207,157,189,206,189,185,190,199,215,195,201,202,202,195,208,202,205,207,219,187,193,175,185,174,210,207,206,165,206,201,187,183,189,183,212,193,188,197,225,205,192,204,181,172,204,205,194,194,181,194,212,227,171,230,197,176,181,194,234,199,217,181,206,195,227,221,189,191,166,211,178,213,193,207,190,218,209,196,196,197,213,174,231,194,206,212,198,187,179,199,177,188,185,209,218,199,237,203,206,197,203,191,223,203,221,177,209,186,216,198,200,208,220,194,198,202,188,207,195,180,205,211,215,195,210,213,220,189,213,207,216,203,209,229,212,225,213,217,195,211,175,192,198,218,203,198,222,230,219,194,238,196,195,190,190,208,216,214,206,216,215,206,209,196,209,209,197,250,215,228,201,204,179,221,203,188,207,199,202,182,207,221,207,204,195,179,204,203,203,211,220,196,210,218,206,207,191,207,196,203,189,205,208,216,204,195,187,190,196,191,172,197,199,207,199,209,207,209,198,219,193,196,219,201,188,212,209,210,197,194,208,187,198,220,196,188,197,206,191,173,200,208,195,205,189,205,206,180,193,215,208,205,197,211,203,188,221,217,176,191,214,144,158,179,212,221,160,160,193,201,209,207,195,189,133,117,71,66,141,169,188,180,227,223,160,167,175,206,203,224,201,188,194,152,176,174,184,212,201,181,119,150,171,99,80,56,31,147,205,190,226,166,193,207,148,199,195,240,228,201,232,215,233,209,151,169,214,213,236,229,242,198,172,197,185,171,183,192,184,166,209,238,162,140,87,32,24,9,6,27,14,15,8,3,57,38,25,32,14,14,6,37,10,34,27,24,16,18,21,47,60,76,53,55,62,84,104,64,56,71,44,11,33,44,142,140,173,75,46,105,90,95,102,114,121,108,88,116,102,78,128,91,72,125,195,152,125,102,111,121,104,99,127,120,86,40,31,35,6,25,5,25,10,28,26,27,41,52,51,47,103,44,70,65,25,37,20,49,79,65,66,68,66,34,27,51,29,22,27,62,46,34,44,45,30,22,34,66,39,38,44,39,27,46,56,55,49,28,43,14,17,35,44,57,64,165,216,208,142,115,47,61,30,38,44,32,44,33,54,36,41,26,17,20,8,40,44,79,71,28,63,70,22,43,65,47,52,27,37,26,37,38,47,52,74,49,96,57,82,97,109,89,83,61,28,24,10,28,19,29,45,52,61,57,30,50,55,61,58,55,26,42,48,29,28,47,46,30,52,55,58,77,36,29,55,47,43,28,40,51,47,47,32,63,37,55,27,49,45,36,36,37,47,44,48,29,24,39,29,30,35,110,169,123,103,108,118,82,107,111,107,50,85,116,132,177,195,216,228,243,249,224,244,229,212,203,255,240,240,237,236,248,249,230,251,227,233,243,239,241,250,239,230,250,238,245,233,240,229,229,225,246,225,236,245,222,235,230,252,215,230,225,227,230,237,214,215,230,237,219,224,128,12,0,1,31,7,4,12,7,6,15,7,27,15,13,4,8,9,9,22,3,3,5,12,22,178,191,198,205,199,202,191,172,175,170,198,177,192,208,188,224,180,218,174,195,193,189,194,174,184,181,204,210,195,175,195,194,179,205,215,193,207,197,188,184,207,187,189,208,197,178,187,188,203,190,185,194,201,184,181,199,183,174,193,190,216,191,187,195,203,181,177,210,198,196,219,157,187,195,212,187,199,190,210,192,176,200,195,199,189,191,191,211,202,187,179,197,197,197,180,205,183,213,197,185,195,200,203,206,187,177,193,191,212,216,183,192,192,225,192,217,204,210,185,225,209,217,182,206,212,193,207,208,223,208,195,203,231,218,225,191,214,197,195,198,200,206,212,203,196,218,202,207,197,215,214,181,194,181,209,199,201,207,204,195,215,200,199,185,193,223,216,209,195,198,201,215,188,215,180,210,211,195,192,204,168,207,203,212,227,193,221,213,183,215,196,203,183,208,230,200,221,193,214,224,224,186,206,208,202,219,188,198,206,196,223,206,216,211,205,205,207,210,197,203,199,204,187,217,201,212,196,204,182,199,208,193,231,215,223,210,196,200,178,195,202,223,183,199,216,195,202,186,198,208,217,186,201,204,177,186,188,196,191,199,210,202,199,209,216,204,199,186,206,194,193,223,180,205,197,195,223,188,186,193,211,199,189,186,203,201,211,207,201,198,235,199,184,193,206,205,194,190,202,193,207,206,209,179,214,171,174,214,196,197,159,180,197,215,216,205,187,188,87,125,95,71,160,187,225,217,218,204,173,163,126,197,239,218,219,196,220,177,157,168,240,219,178,126,100,145,164,103,102,71,26,130,198,207,236,156,207,210,156,190,191,209,222,209,222,184,254,188,154,185,202,225,205,239,222,207,217,158,159,153,167,175,164,204,208,232,177,75,46,5,0,11,8,26,22,17,15,56,19,27,18,31,16,29,5,14,13,2,9,27,11,13,39,28,93,56,58,59,80,98,61,51,41,33,35,21,46,99,69,47,107,63,80,125,108,127,149,102,107,95,91,93,79,92,85,65,86,150,181,126,128,105,105,97,89,101,117,134,55,3,13,7,13,38,11,26,15,39,21,43,53,98,58,91,64,58,41,32,28,33,59,117,107,102,82,88,88,29,32,24,41,41,14,30,15,31,15,11,29,12,54,39,48,34,26,24,31,51,35,36,36,29,16,26,17,50,97,215,234,192,132,55,48,43,93,50,47,27,35,4,17,5,12,14,2,22,5,28,22,47,61,45,79,75,103,78,95,47,66,38,40,25,35,16,22,36,26,40,48,55,55,58,107,71,49,69,116,106,129,71,53,32,30,19,26,33,48,43,19,23,34,39,58,29,24,37,31,28,29,39,40,34,58,38,76,40,39,76,45,27,21,31,31,39,51,35,30,64,70,40,51,39,36,59,54,56,28,46,43,54,37,48,53,25,33,100,163,140,116,112,109,75,89,117,112,83,85,56,84,169,214,220,230,231,212,213,219,238,237,235,245,240,223,244,237,215,235,245,232,224,237,241,222,233,237,218,235,227,251,239,251,253,214,227,222,223,234,212,247,235,223,237,211,241,222,235,240,212,241,215,237,221,217,232,222,121,20,11,0,21,4,5,23,16,25,12,6,10,15,7,23,11,5,13,15,25,33,12,28,3,180,177,188,224,196,211,188,214,217,207,193,196,213,196,206,216,220,213,181,176,203,228,202,178,203,193,190,230,209,202,183,198,197,186,215,179,186,178,180,195,204,190,198,189,172,186,184,184,185,196,174,181,189,197,204,198,196,203,173,190,207,198,198,186,181,194,210,179,216,189,179,165,201,180,190,189,194,200,220,208,210,211,208,190,199,187,171,201,200,198,205,197,217,207,214,179,201,222,189,196,194,206,197,219,200,214,223,216,194,208,194,185,187,237,175,211,210,201,199,216,181,225,171,196,214,200,213,198,190,202,217,193,191,209,206,180,201,193,202,208,217,183,188,205,200,186,199,202,207,200,217,213,191,171,187,221,201,204,222,200,226,197,194,204,213,207,212,210,196,188,197,221,189,179,198,212,216,197,206,194,219,205,201,207,202,202,203,207,208,192,221,208,212,205,227,198,213,208,223,200,201,199,224,201,202,213,209,210,198,201,198,215,207,199,220,206,225,203,197,227,207,211,219,199,223,198,159,212,192,188,200,224,202,202,196,200,207,215,206,204,195,222,190,211,199,190,204,187,192,207,193,190,196,195,223,198,206,200,217,191,211,203,200,200,194,205,175,197,214,179,182,206,192,181,200,214,204,193,215,193,200,210,215,187,180,213,194,193,156,185,220,183,240,192,192,191,229,216,204,209,190,197,216,189,209,148,192,205,220,198,156,193,181,190,195,188,139,132,103,128,118,88,147,194,237,172,242,194,162,146,133,191,191,220,218,190,230,181,150,211,226,136,103,126,96,93,142,134,130,76,42,132,212,219,168,158,225,214,120,135,173,221,216,209,199,226,234,157,140,199,178,243,222,221,240,209,221,136,122,108,168,193,185,148,143,169,155,42,27,14,15,5,13,10,21,17,22,39,22,7,18,25,10,8,8,18,36,29,29,13,39,50,53,52,67,54,79,113,76,61,55,48,29,25,20,1,75,119,42,60,99,100,138,145,153,134,128,88,111,99,99,111,114,102,127,118,135,143,127,106,101,89,98,83,79,117,114,105,43,15,37,3,21,22,5,40,27,36,10,38,115,106,78,72,72,53,46,46,70,88,105,135,135,104,48,49,44,26,34,14,45,64,34,27,22,19,9,4,16,24,15,43,33,18,21,55,50,39,48,38,40,49,83,53,118,198,238,197,111,65,40,57,41,31,39,14,29,5,5,5,3,9,21,24,8,35,11,15,28,26,57,42,92,95,120,76,62,55,29,8,6,14,4,18,30,22,13,14,39,39,51,39,74,77,77,108,58,73,92,135,95,65,44,34,40,53,26,42,30,34,27,55,53,26,24,18,33,15,8,43,41,45,49,67,49,56,64,55,55,83,105,79,105,61,36,40,37,37,4,38,80,35,29,30,36,40,21,74,26,38,32,77,36,41,18,76,153,132,126,130,84,91,103,114,118,97,84,197,214,243,255,239,209,236,227,227,202,236,229,245,226,246,241,251,232,239,242,247,250,231,231,232,226,225,224,221,221,238,231,241,249,242,233,238,250,238,209,231,220,207,253,223,225,222,216,238,208,238,208,206,224,218,242,214,219,124,20,10,5,0,10,6,22,0,25,17,17,21,13,15,22,5,22,0,10,25,11,12,11,28,185,211,188,211,199,197,211,202,200,185,187,182,187,202,200,217,182,183,200,194,201,188,178,200,188,178,175,207,186,180,206,189,201,171,194,192,185,191,200,208,202,204,185,203,174,175,192,211,195,206,194,187,197,193,180,183,191,171,205,181,161,168,182,206,187,177,201,202,219,190,164,200,201,189,192,207,210,167,220,195,184,200,194,176,190,188,191,194,190,200,209,217,190,201,208,202,212,187,205,221,205,175,184,194,201,190,213,193,175,189,177,195,191,185,209,208,196,200,212,197,202,194,191,199,196,184,194,219,192,207,196,194,200,193,212,188,199,200,181,167,199,203,205,222,195,208,205,185,188,206,183,190,207,198,197,215,232,203,208,205,220,210,190,217,216,224,213,212,207,207,178,220,205,182,213,187,215,208,219,246,205,185,223,213,213,239,213,207,207,200,226,225,226,193,226,191,211,209,189,211,201,182,196,207,196,189,222,213,216,215,194,201,196,228,179,228,209,213,201,226,214,218,186,209,220,212,210,205,191,182,217,203,212,199,223,205,206,200,209,205,215,199,189,192,183,204,209,224,194,196,207,206,221,189,204,182,204,212,221,201,198,189,197,217,177,176,218,202,202,197,198,177,187,187,202,197,209,197,207,186,224,210,202,187,202,195,205,199,206,204,172,192,207,198,209,217,193,217,216,208,212,212,182,227,197,118,167,211,204,187,151,194,178,232,196,184,168,156,136,144,131,96,188,242,227,210,237,199,168,167,142,195,212,203,192,187,212,200,161,192,172,123,138,124,80,125,129,110,159,108,71,131,207,193,177,189,237,128,83,156,202,224,190,202,218,230,237,141,186,189,224,227,209,206,192,225,224,179,112,136,196,169,121,100,43,62,64,28,24,32,15,19,38,17,22,42,23,21,10,29,24,26,7,11,18,6,36,21,13,25,17,23,51,83,64,81,127,116,76,60,53,32,28,9,12,27,114,104,21,77,108,99,101,108,111,94,74,73,98,134,116,98,85,100,176,185,148,101,98,98,114,111,95,98,83,120,167,112,23,5,11,22,22,27,44,52,47,31,16,33,81,112,85,52,61,51,56,46,60,76,93,65,35,63,44,43,4,19,43,17,53,22,20,54,32,38,20,42,33,74,68,58,71,84,94,120,163,162,154,198,203,161,151,168,175,208,130,121,30,54,56,46,11,18,28,20,18,11,10,17,6,1,19,14,15,18,26,43,10,42,39,61,103,59,93,51,50,28,17,20,21,23,2,13,8,12,6,18,27,28,9,28,21,53,52,44,69,62,63,76,140,117,65,79,75,59,96,31,69,54,53,38,27,68,37,41,10,7,15,26,26,35,48,79,51,44,29,45,86,123,177,135,112,99,49,102,30,25,29,42,38,61,30,34,45,49,44,40,17,7,29,32,63,40,75,68,165,156,148,118,105,76,93,116,105,109,132,240,240,255,249,208,233,219,224,243,233,240,224,245,233,233,201,221,241,225,241,225,239,227,221,231,241,222,223,219,217,225,226,215,214,230,238,227,232,232,247,240,216,230,226,220,214,200,217,234,226,223,221,225,244,243,233,217,211,100,12,2,0,4,3,29,12,0,6,28,10,17,0,6,12,11,11,44,19,13,24,3,10,9,195,215,195,194,207,191,198,201,199,201,214,182,173,210,205,185,199,190,184,194,187,201,219,195,202,184,196,185,173,174,206,201,185,208,186,212,161,185,192,209,186,203,191,183,225,182,220,202,174,185,175,198,200,200,232,199,168,186,211,188,180,192,169,208,182,160,191,175,202,198,217,229,187,175,193,171,180,227,179,192,191,202,195,202,208,198,181,173,190,203,210,181,199,194,201,219,208,211,206,205,201,189,202,182,194,201,180,217,230,201,210,193,187,205,216,206,213,194,215,211,226,211,209,219,197,220,198,229,219,204,191,209,187,195,207,190,213,198,214,211,220,193,174,196,227,198,231,179,171,183,200,210,211,191,197,206,205,205,217,207,213,191,220,186,204,197,202,208,218,188,187,199,215,218,210,205,223,224,215,202,198,204,209,231,230,198,214,220,208,207,209,181,205,228,209,230,197,211,208,206,217,201,221,212,214,215,202,199,196,206,185,212,213,201,219,205,188,219,192,219,199,189,210,210,181,191,203,189,219,208,210,210,226,204,194,188,197,221,197,204,171,204,194,211,188,192,195,187,203,200,201,202,194,215,174,190,196,226,166,178,169,180,228,174,205,193,209,181,201,199,180,214,198,173,178,196,189,188,187,169,171,171,201,190,201,184,209,220,199,195,211,204,205,198,197,218,213,213,201,197,183,188,195,204,167,141,208,197,194,195,163,158,179,208,217,198,164,143,130,139,112,118,233,223,237,210,237,186,186,190,153,186,197,200,221,232,231,177,164,203,239,160,149,165,104,80,98,121,123,73,43,128,181,215,183,221,178,98,103,199,223,227,203,222,238,226,186,140,206,163,223,208,218,219,241,239,226,189,180,161,218,194,81,48,55,19,30,11,20,3,37,23,32,2,21,22,21,27,21,35,39,17,20,14,8,12,21,20,22,19,41,36,60,57,81,78,89,69,79,41,10,33,19,20,26,61,114,70,55,121,113,43,42,34,34,23,36,42,74,102,138,102,73,86,148,163,90,88,80,67,59,91,74,77,99,126,111,90,23,29,3,25,20,51,31,37,22,15,30,10,52,36,15,22,8,39,32,33,47,42,37,54,43,59,53,81,41,54,59,59,74,89,95,146,154,161,193,171,192,224,193,154,167,194,172,197,198,167,168,176,126,123,114,77,63,56,39,40,40,49,21,34,7,14,7,22,2,11,20,2,31,14,33,37,21,31,10,34,13,48,43,75,32,20,62,53,37,23,12,14,22,27,7,15,41,30,18,31,2,28,14,6,21,18,12,35,46,55,93,94,63,117,128,142,145,143,156,136,120,145,113,126,90,76,82,59,50,29,27,27,41,30,22,32,53,23,41,25,62,69,105,116,102,62,59,110,66,67,69,67,45,75,41,34,45,18,9,42,14,8,24,37,42,33,40,66,122,154,103,123,100,94,92,116,103,85,115,142,215,239,223,246,244,225,209,207,242,216,227,226,235,236,250,255,225,250,239,227,229,231,238,245,242,226,244,212,239,254,229,204,221,230,224,216,227,215,226,229,242,216,225,243,223,231,233,209,235,227,221,229,239,232,234,231,220,125,13,29,15,0,24,15,7,21,19,5,22,14,12,7,5,2,8,28,16,15,8,15,3,25,180,209,191,183,223,189,182,213,199,185,202,178,201,200,182,196,193,218,192,197,210,217,191,193,197,171,182,172,200,189,190,199,202,182,184,204,164,217,197,198,216,192,194,216,189,190,181,194,192,174,208,181,221,170,204,187,199,201,189,177,206,198,207,201,199,205,181,165,187,207,218,188,189,186,187,178,181,192,198,182,192,207,176,181,186,213,190,192,195,206,199,177,188,178,200,199,202,223,193,205,199,191,203,208,189,203,222,201,206,201,208,206,219,218,199,193,182,220,199,214,189,198,199,201,195,200,214,196,197,200,192,219,207,213,214,209,215,209,178,208,229,201,185,224,198,196,209,202,183,223,194,182,209,207,207,203,198,186,203,208,193,197,182,205,228,189,203,221,194,211,199,205,187,217,203,227,196,198,208,203,205,219,194,216,191,220,206,214,222,230,207,203,205,229,210,227,217,197,202,225,195,211,211,197,195,223,204,214,215,215,202,207,197,197,202,217,191,211,210,203,201,199,210,202,208,221,209,231,202,203,219,210,222,178,211,190,201,207,224,188,200,191,216,200,208,202,192,197,215,183,216,170,188,199,211,178,176,209,189,200,196,198,206,199,201,181,205,201,203,182,170,172,203,212,194,183,192,212,178,180,191,209,206,193,180,161,217,202,206,219,176,202,177,222,203,207,204,193,170,172,187,217,178,220,195,162,207,191,201,178,148,171,169,181,181,160,122,107,105,131,124,148,190,203,201,213,243,204,204,181,125,189,196,228,208,179,217,167,211,217,210,182,180,178,134,141,106,121,87,75,84,135,198,204,192,188,185,121,164,227,221,218,222,221,216,243,161,148,224,146,207,215,214,207,216,213,219,184,187,194,209,161,52,41,103,42,35,24,19,8,35,11,33,22,16,8,6,15,42,19,9,10,51,17,49,22,40,33,3,40,39,28,63,83,100,102,67,73,49,26,14,0,24,22,50,109,129,71,92,139,66,61,80,61,67,60,40,35,31,36,75,112,97,107,121,109,96,108,73,68,78,75,93,66,70,147,138,60,45,39,33,39,23,39,16,24,47,26,40,60,57,53,61,65,65,64,54,85,92,84,110,118,147,143,135,173,175,184,183,168,98,162,179,181,163,166,100,117,140,130,84,50,66,58,83,39,36,35,81,62,57,97,27,44,46,49,38,46,36,2,20,27,11,12,16,15,19,18,5,12,14,24,17,17,23,4,8,23,14,36,66,54,48,30,46,48,29,3,25,33,10,26,11,13,1,50,45,16,15,18,28,39,10,38,24,33,27,53,72,68,78,61,51,81,56,94,126,92,114,158,116,157,168,158,171,157,148,146,110,102,82,69,68,39,38,37,65,33,26,38,22,88,63,9,41,64,37,41,70,71,53,48,57,83,59,37,27,43,22,28,13,34,37,36,31,43,141,155,146,109,101,97,87,111,79,78,85,97,180,233,249,237,234,235,232,219,251,250,242,249,255,247,252,248,241,248,247,240,244,232,220,242,247,230,199,219,224,231,230,242,242,235,237,223,233,240,246,233,245,237,247,206,221,243,212,218,231,199,241,238,228,218,235,242,217,126,9,7,8,15,14,0,18,20,4,30,42,7,10,5,3,23,29,0,21,17,22,3,23,14,191,196,188,202,203,190,193,202,197,200,218,196,218,184,197,181,204,200,154,199,180,211,195,195,181,209,181,185,196,210,197,195,196,164,177,196,185,200,198,188,190,186,195,212,184,187,188,174,218,182,193,188,210,202,193,199,209,198,199,195,174,185,215,203,189,179,184,186,200,199,205,190,208,192,198,195,207,183,187,204,187,186,191,191,206,180,189,203,200,161,171,181,179,184,213,218,193,198,187,190,205,213,207,222,203,216,219,191,189,187,176,198,183,200,212,214,208,204,221,194,212,195,229,204,215,211,192,190,214,178,191,191,200,200,195,175,212,206,193,190,217,201,213,202,219,208,213,185,200,194,210,217,197,193,183,209,193,207,189,201,211,204,214,221,215,213,207,227,210,211,237,201,214,185,177,230,229,215,214,212,190,219,219,215,226,239,216,201,217,213,226,206,201,215,205,213,207,211,216,212,220,210,213,209,241,203,219,213,211,215,225,206,216,209,190,216,219,206,207,206,179,193,211,184,203,226,244,209,199,183,204,209,182,207,213,208,188,205,214,205,199,212,192,202,199,186,203,207,199,205,200,203,214,186,179,211,205,183,191,182,200,209,205,187,192,209,220,190,197,169,186,209,214,172,211,199,204,199,205,191,201,189,183,206,167,180,220,205,203,196,202,175,209,195,220,176,181,178,216,185,209,204,228,208,185,124,193,205,202,158,190,186,192,189,178,142,101,133,149,118,136,111,137,117,126,139,245,198,175,179,141,183,201,200,219,193,192,149,202,206,174,114,173,214,112,125,129,143,94,36,87,167,228,197,191,169,168,176,191,237,213,245,203,212,198,232,147,170,214,131,209,245,210,222,192,239,221,215,220,166,149,103,49,57,65,15,23,9,28,25,10,28,14,22,20,20,17,29,10,8,40,27,33,29,14,30,14,60,7,50,60,41,46,103,98,71,41,74,38,9,34,13,27,49,97,119,152,87,119,113,49,99,90,40,53,21,41,46,37,47,83,116,154,147,143,141,144,141,118,154,114,125,135,115,97,119,111,66,101,126,120,168,131,167,134,147,168,176,185,151,169,166,171,165,144,163,166,150,149,141,153,151,127,139,136,125,145,134,112,86,87,77,52,77,67,67,54,68,46,67,60,64,70,60,37,61,53,70,60,58,58,50,25,59,45,39,54,29,26,20,12,22,8,1,28,15,16,1,49,16,14,9,17,26,18,21,44,2,27,40,87,87,75,51,54,72,47,13,31,31,27,19,10,16,28,34,31,31,13,39,17,21,9,10,14,18,10,13,38,55,89,67,69,78,85,72,57,69,69,61,61,44,75,53,66,104,131,143,157,134,173,136,159,141,97,102,84,80,71,49,65,52,49,49,27,52,30,39,35,34,60,55,52,74,90,67,33,20,29,39,18,43,44,50,54,16,97,141,123,119,92,120,107,93,79,117,100,120,159,235,244,248,221,248,244,233,251,254,250,255,253,244,251,226,253,255,245,246,218,244,224,219,217,207,215,229,217,208,213,246,214,223,213,212,201,218,244,229,211,210,208,217,209,242,232,234,217,234,212,232,210,220,226,242,249,139,0,2,5,8,19,11,0,11,28,29,14,12,2,16,23,23,7,4,31,0,5,20,11,32,203,195,212,200,180,198,189,193,190,182,207,196,192,197,193,181,190,203,174,187,175,193,199,170,182,179,160,214,189,196,211,199,168,199,201,178,208,195,199,192,195,190,210,182,191,169,185,192,199,214,213,196,211,210,170,169,196,188,181,190,202,200,193,204,172,198,202,190,176,195,197,187,196,182,185,211,207,196,215,199,195,186,202,208,193,214,171,187,213,188,184,196,197,196,201,201,166,194,223,216,211,196,198,208,218,209,181,204,194,198,201,200,206,204,190,216,191,206,235,198,206,194,198,236,204,203,210,209,191,199,195,209,203,190,226,202,211,211,198,203,194,219,212,173,211,200,215,192,208,195,186,195,211,212,185,206,220,224,223,219,215,192,206,200,194,201,215,222,216,187,201,195,203,206,219,222,221,190,214,216,195,221,216,207,219,231,207,209,203,187,228,202,204,208,227,217,209,241,209,227,210,218,194,213,225,212,190,204,204,212,223,210,198,215,214,208,207,212,229,202,218,201,199,193,199,215,218,222,205,218,208,214,226,203,212,194,221,226,212,198,211,188,180,189,211,203,210,192,212,187,226,192,215,225,182,197,199,194,205,184,200,200,199,199,189,209,202,204,184,188,199,186,177,174,187,213,202,209,217,159,187,192,177,195,209,180,201,211,191,200,198,179,204,195,194,184,189,209,179,186,202,200,197,201,171,167,203,213,183,173,160,189,210,177,186,145,107,128,123,116,127,100,112,87,73,153,241,183,183,187,151,154,200,192,217,216,168,179,176,142,155,89,153,202,111,116,89,143,137,94,45,152,181,153,193,171,190,169,168,240,209,238,218,234,221,205,131,190,223,119,238,202,225,236,212,250,195,216,208,86,92,74,84,50,0,40,16,4,23,12,18,21,20,7,38,42,37,31,17,10,32,35,6,25,16,5,3,43,31,61,67,42,86,90,99,91,55,50,17,13,30,49,40,86,143,103,98,95,97,108,71,91,75,89,94,80,37,56,72,53,70,82,132,128,120,132,117,132,118,137,137,136,132,103,117,106,89,88,135,165,148,145,155,122,146,131,152,117,142,164,128,116,141,117,120,99,83,97,43,62,36,73,59,60,56,76,45,54,63,54,53,58,58,67,77,68,70,62,80,74,72,52,57,47,44,57,37,33,36,27,28,44,27,24,25,40,52,15,36,48,59,63,38,10,6,36,22,26,17,25,19,6,32,18,26,27,3,22,42,67,142,142,101,69,66,74,39,26,32,3,32,27,6,6,39,26,6,7,14,38,19,15,28,37,27,75,72,24,43,62,33,43,56,48,37,40,71,43,80,76,87,98,74,70,63,87,49,50,54,55,72,93,91,125,136,99,118,138,138,120,128,99,108,88,79,73,58,45,49,36,34,62,54,83,36,50,26,27,24,38,40,69,90,47,41,17,52,127,113,115,103,84,82,98,77,114,123,118,134,135,132,190,206,217,249,251,249,240,219,212,242,246,254,234,204,225,236,240,218,220,206,223,218,201,200,213,211,216,228,238,216,223,245,207,243,235,221,216,236,230,235,223,219,215,244,248,243,242,228,223,206,250,227,246,236,124,3,3,3,7,13,9,17,0,15,20,10,30,7,14,20,22,14,9,10,3,16,22,31,23,203,183,203,212,187,200,210,193,186,207,203,198,173,196,207,197,178,187,188,192,196,157,191,194,218,215,171,193,187,184,185,194,207,180,190,199,201,200,188,209,224,212,196,180,165,172,172,183,183,176,180,213,169,179,199,210,188,188,198,197,179,187,210,199,200,174,191,199,191,177,177,207,204,173,179,191,183,199,196,209,192,201,172,181,188,188,204,187,192,190,204,195,182,215,191,187,212,196,181,211,202,236,173,210,192,204,204,190,192,187,201,204,211,202,206,214,193,213,210,203,212,212,214,208,210,206,194,218,204,210,210,221,206,226,213,202,190,204,197,210,188,209,195,209,196,206,207,212,206,224,210,230,201,187,207,203,190,199,201,215,196,209,211,207,229,194,215,186,215,176,210,217,188,221,203,226,220,210,220,203,230,229,205,215,230,228,212,223,210,189,210,236,214,205,216,183,223,213,233,204,218,186,194,221,210,207,205,221,203,222,224,184,182,224,208,226,218,197,214,197,206,208,205,210,206,215,227,201,216,220,211,219,209,225,199,212,196,190,204,211,214,203,214,215,210,214,202,193,199,206,214,229,203,204,232,204,191,195,224,189,208,191,209,185,171,187,200,191,181,192,172,177,213,191,208,202,215,218,205,179,207,162,191,190,186,189,187,192,200,211,195,213,226,195,226,212,183,214,194,178,174,197,199,206,134,150,202,196,168,148,185,173,193,192,175,122,66,107,110,101,100,78,49,72,57,129,205,184,183,219,131,177,212,211,229,181,158,149,160,140,177,122,184,167,66,82,77,86,78,110,126,198,192,206,183,126,142,120,165,235,194,228,215,234,209,218,130,213,222,135,210,225,220,212,214,235,226,219,119,55,60,71,53,20,10,3,51,24,1,7,9,15,29,23,10,2,24,41,28,19,16,22,22,13,2,22,31,33,83,44,43,75,124,79,69,31,30,5,12,6,44,78,92,87,135,97,66,58,93,110,123,130,113,124,172,114,101,135,102,129,141,114,85,63,48,49,56,72,63,60,42,67,63,44,74,54,59,47,77,76,53,66,65,54,51,55,48,60,42,60,66,38,79,73,79,72,79,85,45,106,76,57,55,102,71,79,59,42,51,53,52,59,43,50,24,44,35,52,33,22,55,18,22,28,24,38,20,21,7,49,15,12,20,5,12,20,52,56,63,83,80,94,71,35,8,24,16,13,36,11,31,23,20,37,4,28,18,15,16,114,183,127,126,143,131,126,67,27,9,28,24,35,9,22,18,17,8,18,30,11,10,28,35,65,78,86,70,56,62,37,48,47,34,18,28,40,37,34,45,28,49,54,59,31,57,73,52,77,50,73,42,74,60,76,102,88,73,65,55,48,75,81,116,128,126,133,141,112,119,106,93,79,94,95,73,67,77,62,41,41,26,55,43,33,74,24,82,135,126,120,122,112,129,121,94,138,136,59,67,46,63,180,238,248,251,236,174,118,129,104,114,123,143,134,104,154,224,214,213,216,222,237,216,214,209,226,226,228,212,205,196,226,233,225,209,229,204,203,232,228,213,232,221,236,229,231,207,242,207,219,232,228,225,207,210,94,1,5,7,15,2,10,16,28,29,9,19,3,0,11,2,7,19,17,16,6,0,27,17,29,206,197,185,198,191,206,201,197,203,204,191,215,215,207,187,196,185,184,221,217,197,184,191,216,175,220,178,197,182,195,179,190,186,202,191,199,204,187,198,189,184,183,184,205,171,176,195,192,181,193,180,196,185,182,196,186,185,171,196,201,177,207,218,201,189,191,198,189,194,178,182,205,176,187,183,193,181,186,197,192,211,179,213,228,186,210,222,208,195,186,191,199,182,197,190,202,215,210,210,203,206,210,209,189,168,209,225,195,212,176,213,204,177,183,180,200,191,199,192,199,212,222,210,230,223,211,208,204,198,184,207,228,211,196,204,194,214,236,217,199,220,225,215,232,204,191,234,203,198,234,217,210,207,203,185,222,214,202,208,192,210,191,216,207,207,235,201,213,198,202,205,217,200,199,215,206,209,216,179,206,206,213,213,226,212,217,207,229,215,188,197,245,195,203,217,216,213,203,187,197,190,224,240,187,214,214,216,198,206,207,218,226,211,204,221,232,199,234,207,213,209,204,199,204,196,221,218,195,189,223,235,228,191,208,234,201,193,216,199,215,217,215,221,190,222,217,187,200,201,203,197,209,185,199,211,203,211,185,200,199,200,210,181,202,205,217,207,186,193,196,195,183,187,183,217,187,189,190,205,186,190,202,198,194,218,180,185,194,195,192,203,185,187,199,190,193,204,190,200,167,172,196,211,195,151,169,205,201,149,137,178,186,175,152,150,103,98,100,68,127,63,42,35,56,95,150,204,171,197,207,142,155,225,205,227,176,162,179,170,169,185,117,142,137,69,74,106,74,61,56,56,203,164,229,152,151,190,143,189,216,190,193,178,220,219,175,134,236,196,151,201,198,216,199,240,218,208,221,57,49,40,45,17,14,36,22,17,14,15,25,19,30,31,10,51,28,27,36,39,13,25,42,27,45,33,17,48,42,43,36,60,75,90,45,63,42,15,3,16,17,43,68,125,117,89,71,21,16,100,65,95,139,107,129,130,124,119,95,109,164,156,148,66,77,83,59,57,66,99,80,79,90,93,80,91,96,60,56,56,106,99,88,85,79,36,77,95,63,95,48,60,69,65,80,55,65,88,80,74,58,55,31,37,46,47,46,46,24,11,17,26,22,21,21,14,20,25,8,19,38,7,7,15,31,14,27,6,13,19,10,23,27,10,3,23,87,82,112,142,134,138,133,105,40,18,7,12,28,8,19,27,14,31,9,21,17,3,47,36,132,171,131,155,137,179,151,78,22,14,5,11,12,19,38,13,7,22,15,0,44,59,17,29,95,110,146,139,118,99,83,80,13,20,13,36,26,19,11,40,9,29,18,33,36,41,46,15,31,38,53,67,61,59,56,73,87,53,50,66,45,45,50,86,103,55,79,73,75,88,84,95,126,139,123,150,127,135,131,125,128,95,85,75,90,82,63,123,176,156,142,111,115,112,97,103,127,113,107,57,70,73,185,224,250,233,99,43,61,49,64,70,64,49,56,22,82,177,192,228,218,208,202,225,214,222,226,233,244,203,241,211,235,220,216,230,224,203,218,220,252,228,207,227,204,243,240,224,217,216,225,230,220,229,240,223,114,6,1,5,6,29,35,22,29,6,14,1,13,12,9,7,22,34,30,10,2,16,16,18,0,201,199,207,204,202,215,183,213,206,185,186,196,200,187,210,199,188,214,195,172,206,198,194,180,184,187,199,192,201,192,190,223,182,177,208,195,206,187,172,214,204,175,205,187,199,202,185,187,183,173,195,192,191,191,220,182,214,194,188,204,190,155,197,205,215,160,206,196,209,184,188,182,177,189,185,186,205,183,204,187,198,204,184,197,201,209,198,186,193,197,197,209,192,191,180,196,189,202,182,168,183,197,214,204,215,215,187,194,190,218,191,215,219,196,200,204,204,194,205,207,219,201,210,215,207,200,211,196,195,219,223,224,225,221,207,171,224,238,225,222,239,227,236,221,221,224,197,205,215,191,199,216,214,214,183,211,211,201,196,192,210,213,205,208,202,215,217,194,226,212,194,212,200,212,214,235,236,214,217,220,215,216,210,212,213,181,223,228,216,216,228,197,228,214,215,176,209,211,209,193,192,211,214,244,213,234,220,186,215,209,204,186,194,206,217,198,211,215,205,201,211,214,200,217,211,242,207,217,236,229,222,221,222,180,212,212,233,238,204,205,238,221,228,201,200,218,204,210,206,215,195,242,209,207,189,206,190,199,208,196,218,201,199,191,186,180,183,189,193,187,191,215,185,210,193,191,210,205,206,191,192,194,188,207,213,186,210,192,188,195,202,174,172,203,188,183,205,222,216,189,193,187,204,157,154,191,175,214,143,131,172,162,180,168,150,114,113,67,54,67,47,10,25,74,68,169,213,174,211,212,159,150,224,221,195,182,181,188,154,196,194,119,107,130,94,125,100,84,66,39,5,122,161,195,156,200,194,163,218,222,209,206,216,214,221,148,165,245,196,152,233,211,206,232,227,201,233,163,25,51,61,23,25,36,10,17,5,25,20,3,20,22,22,1,16,30,29,22,19,31,18,15,19,41,18,56,11,28,54,66,68,100,61,32,56,38,2,26,19,44,76,104,108,101,141,116,6,56,114,91,79,86,56,90,63,69,47,81,105,122,85,51,34,38,20,58,82,59,42,71,64,66,61,97,122,124,118,72,42,47,41,55,45,40,40,45,49,64,39,61,54,38,75,39,41,27,24,12,18,28,5,19,44,12,43,24,31,18,40,31,26,41,52,18,8,16,6,60,25,7,26,7,6,17,9,12,6,31,24,42,32,22,31,11,63,170,193,179,174,206,199,194,144,62,18,28,48,31,33,29,15,22,15,40,21,3,33,14,69,132,152,180,163,127,176,151,106,39,19,22,20,11,15,33,30,13,31,24,54,19,6,28,52,140,156,147,151,182,155,155,95,32,6,40,19,37,10,3,14,13,17,38,26,33,13,24,25,18,36,20,29,41,20,43,44,38,45,31,42,72,68,73,69,91,76,80,68,48,84,57,66,97,74,51,43,59,103,107,127,123,127,155,132,143,135,139,138,157,153,121,107,91,104,134,119,130,105,99,140,109,204,235,233,251,178,58,11,16,73,73,23,31,44,24,1,78,204,222,214,233,203,201,241,221,219,217,218,220,220,217,191,220,193,206,231,206,205,198,212,205,211,206,192,216,222,237,230,217,233,237,237,243,238,210,216,123,3,23,5,8,11,5,17,0,27,29,31,13,17,8,15,34,20,23,19,3,26,18,4,14,198,204,200,184,191,221,208,228,179,177,193,191,199,205,203,214,213,205,168,202,220,206,187,204,208,200,201,209,188,166,198,187,174,195,201,177,182,190,188,169,186,195,190,192,188,179,201,203,185,189,185,224,205,187,220,205,215,218,210,181,181,202,175,206,170,199,179,207,186,176,194,212,183,186,193,196,202,170,174,191,227,193,219,223,226,186,183,210,214,176,201,207,208,189,202,203,213,205,169,200,198,222,180,200,199,191,189,205,196,214,190,227,219,201,203,198,196,190,224,202,202,195,202,203,199,191,220,185,193,204,214,244,208,190,220,201,193,189,226,211,210,198,219,198,197,214,194,219,214,222,209,195,199,191,209,226,189,211,187,233,167,214,202,228,218,194,208,210,202,187,183,213,211,210,207,186,206,207,204,221,185,192,218,212,209,220,219,227,201,212,214,229,214,230,189,195,216,199,213,199,209,212,216,241,213,213,216,196,213,217,214,209,219,220,202,211,214,211,239,213,219,217,207,223,194,211,230,218,230,205,226,191,198,215,208,184,215,193,210,184,213,212,219,169,203,223,189,217,213,213,200,204,165,200,202,208,194,177,213,222,213,162,196,198,203,181,193,203,205,192,196,179,203,204,172,214,215,206,209,169,201,216,198,195,200,201,182,187,178,170,206,212,188,181,207,193,199,205,244,188,185,194,210,174,187,228,200,190,146,160,204,198,174,160,166,145,123,82,109,93,51,52,59,128,157,240,227,199,198,208,156,145,221,231,193,160,216,215,142,153,183,136,174,176,110,99,103,68,56,56,18,91,182,217,153,199,156,162,232,224,211,195,207,227,196,157,190,236,181,152,233,231,237,239,210,185,144,100,76,66,32,40,29,44,13,13,14,33,26,18,30,21,6,49,18,19,25,3,19,4,56,22,19,41,24,23,37,30,82,67,88,46,61,55,29,36,18,34,59,72,123,103,121,92,99,76,20,83,111,129,90,91,51,124,92,34,77,26,56,89,129,49,10,96,62,49,120,82,27,12,53,28,21,93,96,73,86,59,17,38,34,39,27,20,11,24,29,6,54,18,36,12,27,30,28,22,18,19,1,5,31,34,17,15,18,53,65,58,84,70,66,80,97,47,16,31,16,2,26,15,20,13,19,33,24,26,5,22,7,19,8,11,23,4,81,208,200,186,167,178,182,186,171,70,3,11,15,27,6,25,16,36,11,26,19,33,16,33,60,104,141,168,140,140,144,158,147,51,19,16,28,19,28,20,45,27,16,29,24,29,50,21,55,140,156,163,167,146,176,175,164,49,14,11,10,38,16,25,9,9,14,7,9,11,17,11,23,18,15,19,29,41,32,22,24,36,20,21,29,33,48,34,40,52,53,64,57,45,50,66,92,62,79,64,85,71,59,61,77,64,59,86,70,68,114,86,72,85,72,68,45,87,88,65,96,94,128,106,119,99,140,200,162,172,91,60,29,69,57,86,66,57,62,65,21,152,243,221,219,213,204,226,223,228,199,212,233,208,220,236,198,207,219,212,228,215,206,221,222,206,231,213,206,205,210,230,225,224,223,214,199,216,233,210,236,130,1,0,10,18,11,16,30,0,2,21,10,13,0,0,10,26,16,25,4,20,16,18,13,12,213,197,190,173,228,202,212,189,190,218,195,233,192,181,200,197,189,207,201,174,212,224,210,183,211,184,201,186,199,206,167,191,199,180,193,218,187,177,194,199,217,199,196,191,191,209,209,214,203,194,198,213,182,168,192,200,193,198,184,199,208,181,204,191,194,210,177,184,193,187,182,205,208,179,178,201,186,224,178,196,196,205,207,203,208,194,207,215,215,182,204,206,182,199,207,195,209,212,168,210,231,200,188,189,198,210,212,200,193,221,212,186,198,224,214,193,215,187,203,197,200,207,214,199,216,209,211,214,212,202,241,207,193,191,211,198,207,188,206,225,228,207,226,216,191,206,198,220,204,220,190,232,194,204,195,208,187,179,196,205,202,193,204,178,204,209,220,204,204,223,238,209,195,209,232,194,203,213,204,219,204,206,197,229,208,225,217,191,205,215,221,208,211,219,199,223,207,235,240,235,213,226,187,216,206,224,211,212,220,217,213,194,241,195,225,214,191,209,205,192,220,208,220,220,212,184,199,243,209,192,203,208,213,204,218,209,187,212,199,196,214,187,195,228,224,215,204,232,217,201,183,201,215,229,196,219,216,209,197,205,218,184,188,197,191,226,200,208,182,192,201,179,148,184,179,187,179,200,213,185,212,186,195,187,203,210,231,163,206,215,197,200,184,187,190,194,193,198,227,187,194,175,215,143,191,180,176,219,151,197,198,184,190,150,160,150,161,112,121,56,65,38,101,168,203,236,209,171,164,193,168,158,183,200,170,181,238,151,106,125,204,149,198,157,77,129,88,91,104,64,53,158,250,244,168,157,133,164,202,221,218,247,208,224,182,137,186,205,138,156,147,243,237,230,170,99,60,43,75,44,26,50,21,11,24,18,32,21,20,13,24,45,17,43,34,20,20,44,26,8,8,25,31,26,60,51,35,68,80,88,59,58,60,20,31,46,42,66,61,121,84,135,126,90,108,79,44,96,130,87,117,117,123,130,61,61,61,44,97,150,121,59,80,129,135,71,111,129,47,23,21,24,66,127,95,92,113,38,22,32,24,40,40,33,29,33,4,23,34,5,14,36,16,11,3,22,10,20,51,17,21,29,14,21,8,78,121,148,139,168,167,160,199,74,20,34,24,29,13,25,14,19,26,32,11,2,10,22,19,20,17,30,12,16,95,166,151,161,110,96,125,119,105,24,28,42,10,31,8,43,28,17,18,20,52,34,35,17,63,68,138,158,132,136,135,165,182,94,7,25,22,34,28,37,37,37,20,31,23,46,45,22,68,144,170,116,167,163,210,175,190,110,29,9,18,17,24,22,38,33,20,24,19,15,22,3,29,17,27,3,8,38,49,59,63,32,15,16,49,50,17,17,41,40,16,45,55,54,27,27,66,41,26,68,67,92,70,90,76,80,85,61,75,61,94,70,73,58,49,71,77,66,78,58,45,33,55,58,70,55,43,39,30,45,42,13,18,56,45,93,46,50,63,35,40,218,244,232,245,217,220,212,217,214,204,219,233,208,208,196,239,224,206,201,221,221,218,191,233,230,225,230,224,230,210,217,221,204,197,223,233,233,224,249,222,119,15,12,23,18,17,32,22,14,31,4,13,14,6,7,12,3,17,8,6,0,19,20,11,11,224,218,231,216,195,212,228,190,193,196,192,202,206,201,191,181,222,205,211,184,204,166,190,178,179,192,188,181,199,192,184,189,190,192,183,189,188,202,204,202,184,203,213,200,197,206,220,198,187,178,202,202,191,183,177,192,196,203,193,197,210,200,198,190,192,228,210,181,181,196,192,180,183,209,193,194,197,199,171,179,190,198,196,216,180,229,191,179,195,202,209,203,209,213,190,210,205,187,210,200,214,183,210,240,211,207,214,206,222,195,199,202,206,201,207,210,211,209,202,206,181,192,191,176,219,191,216,220,226,203,192,226,217,221,202,194,212,211,196,208,217,204,211,196,216,217,224,223,204,198,212,211,206,212,203,220,204,199,228,218,218,202,211,209,207,195,219,220,218,194,225,213,229,206,226,213,193,202,190,196,205,177,210,207,222,217,236,209,211,216,206,210,205,206,208,206,229,232,207,224,211,202,232,215,243,215,195,223,231,198,213,206,227,232,208,212,218,224,208,223,234,201,202,199,228,203,194,215,211,214,214,207,232,199,197,211,196,200,202,213,210,213,202,230,229,202,220,210,213,218,224,226,199,188,235,226,188,210,220,200,208,198,201,216,206,210,178,185,222,185,206,163,175,208,228,205,207,195,203,204,206,190,203,204,212,200,206,202,187,203,209,182,237,187,191,198,191,194,198,179,202,164,189,136,176,177,174,170,151,215,225,164,143,127,144,157,158,142,107,38,63,48,104,203,218,241,201,166,194,228,179,159,194,205,168,213,168,128,149,180,209,155,167,97,80,109,92,87,81,20,51,221,225,204,140,152,172,160,217,206,196,216,227,241,145,158,202,77,70,90,125,197,241,209,104,65,58,31,65,33,22,7,27,10,15,51,27,24,18,11,27,15,15,23,29,17,54,22,7,26,35,21,27,57,34,45,63,61,91,71,83,56,48,12,65,95,77,96,82,100,115,130,91,88,126,123,118,146,119,74,117,113,119,96,57,77,87,74,97,83,120,136,71,159,148,88,82,161,117,44,35,48,138,175,93,108,94,49,29,59,26,48,52,57,21,20,19,14,10,16,12,22,13,19,11,35,5,11,5,8,29,23,30,20,25,189,187,210,228,223,180,190,214,105,11,23,15,8,14,22,16,26,17,24,20,16,19,32,23,14,8,23,0,31,92,110,67,68,35,38,60,71,87,31,15,6,36,26,40,17,12,66,14,33,47,32,19,29,31,91,147,190,139,145,166,133,188,71,11,17,10,18,29,43,24,21,38,19,19,15,26,20,89,123,113,128,109,116,160,140,174,99,29,21,35,24,12,9,42,23,4,33,14,17,40,19,19,8,25,28,27,21,91,72,54,95,73,45,59,59,37,14,1,31,18,19,18,9,3,27,24,19,39,37,58,14,29,50,47,59,78,56,77,57,41,83,64,70,98,115,140,142,110,90,36,22,8,40,63,60,64,53,55,32,39,14,63,72,55,76,73,59,68,15,133,228,219,232,247,231,202,209,221,198,204,221,229,217,220,226,218,230,208,221,229,211,243,223,248,206,228,203,228,229,227,204,216,235,227,230,202,223,210,226,230,134,1,15,2,12,17,14,14,13,4,20,39,39,2,1,19,1,0,12,11,5,19,3,25,7,207,196,219,195,199,196,192,196,184,200,190,185,190,204,196,191,187,212,225,209,203,216,204,205,178,200,219,194,219,218,196,200,213,180,178,200,194,181,211,194,201,193,202,195,205,195,207,207,205,220,193,206,191,211,209,225,194,194,189,176,200,194,199,192,193,204,202,212,230,189,171,211,210,204,199,216,199,186,223,206,207,191,192,215,195,204,191,223,213,212,201,218,189,191,210,201,204,209,197,185,206,205,207,216,196,212,201,199,196,190,199,185,199,208,192,214,211,221,180,201,186,189,210,209,197,214,206,213,221,230,201,222,205,215,229,233,205,200,223,199,185,173,208,197,205,205,221,206,214,216,214,214,207,199,204,207,197,202,220,216,214,205,213,209,177,216,229,207,189,228,198,209,210,213,196,206,242,208,192,210,237,221,211,202,207,200,217,203,227,224,210,212,215,224,219,208,221,212,211,202,222,227,207,222,200,214,205,192,204,222,206,229,178,220,205,203,216,211,219,201,200,212,226,206,220,216,206,222,186,208,208,199,217,208,193,202,215,211,218,221,208,210,222,220,193,239,236,208,212,207,204,218,197,220,225,218,204,195,213,193,190,202,203,208,206,190,187,193,205,214,210,205,180,203,219,213,197,205,205,222,210,211,201,218,197,210,199,198,207,220,193,225,197,182,199,191,201,202,190,172,198,198,173,152,183,192,186,213,143,207,198,150,152,137,145,194,182,139,86,72,41,49,91,211,211,245,219,163,226,167,105,113,173,163,187,218,165,174,185,230,161,109,170,125,87,129,64,101,62,51,53,172,203,134,152,187,168,189,232,204,191,199,178,219,146,176,189,50,45,74,83,148,240,202,46,37,62,69,8,15,35,22,21,24,23,17,23,38,20,10,1,14,23,7,11,35,26,17,2,32,20,30,18,45,38,51,61,73,67,77,53,28,32,62,144,203,132,113,100,103,137,111,119,119,140,195,135,113,38,84,111,103,77,110,57,96,75,67,88,47,91,120,158,176,196,81,101,138,129,68,45,39,144,132,93,106,71,37,76,69,97,84,142,109,27,41,15,12,31,26,24,31,3,9,16,31,46,11,27,6,31,32,23,33,37,170,189,190,190,171,178,176,192,128,6,20,9,13,21,39,11,9,16,20,34,17,20,31,30,25,10,21,24,31,98,112,45,84,41,51,29,38,30,27,44,11,48,11,31,17,25,36,28,62,11,53,14,16,36,82,118,148,105,148,135,156,169,74,31,22,37,31,18,19,11,7,25,14,36,43,34,45,113,129,92,71,48,61,121,88,117,92,17,15,13,27,18,19,24,17,31,8,54,15,8,28,36,17,33,11,14,40,137,169,152,141,135,131,129,98,40,25,3,38,18,18,15,37,10,13,32,10,15,24,19,18,16,33,39,42,36,43,25,42,41,25,47,47,37,51,115,128,106,96,110,137,147,163,105,117,75,68,70,70,89,87,112,106,116,120,127,101,81,75,157,227,229,207,230,205,214,220,222,216,209,214,221,201,215,197,240,232,195,221,235,202,230,226,225,219,193,232,205,239,186,218,213,205,230,251,245,199,235,220,237,124,7,12,10,8,13,15,32,0,4,21,18,2,22,1,4,14,0,5,1,20,20,22,10,37,190,204,225,205,180,197,205,211,187,212,194,168,174,186,203,203,224,196,191,213,185,223,202,204,199,193,189,201,223,209,180,202,207,199,197,195,199,182,190,222,198,194,207,202,194,210,197,205,198,192,199,220,197,201,187,197,212,194,219,206,220,219,199,193,206,169,192,194,191,203,197,206,196,174,216,202,212,199,198,203,225,203,181,181,209,193,207,200,215,209,198,200,198,210,208,228,198,205,198,198,203,157,216,225,204,221,182,176,202,218,217,210,195,203,214,211,201,204,207,196,204,222,229,205,217,203,214,195,200,187,211,201,220,179,174,202,216,214,197,203,212,220,200,207,222,202,218,220,208,212,202,179,226,197,201,216,214,210,222,210,204,203,214,221,215,190,212,195,199,196,212,212,235,217,211,208,227,219,206,195,206,195,208,195,215,220,183,192,229,199,220,224,195,215,204,231,217,221,201,208,209,218,190,224,211,222,203,187,189,209,216,197,207,206,207,205,215,231,204,196,192,213,215,219,234,198,207,221,225,194,228,206,225,198,191,213,232,216,235,205,221,194,237,169,136,179,205,199,210,207,210,210,219,193,198,191,207,229,196,182,188,196,222,212,218,183,186,220,204,210,187,184,192,193,195,237,192,199,222,190,221,218,195,205,202,219,198,230,222,197,207,210,195,205,170,213,201,191,210,199,193,221,205,148,197,217,164,190,138,147,113,95,166,166,161,178,200,155,114,42,34,31,90,167,191,244,194,173,195,169,99,137,80,162,223,198,154,182,213,209,129,88,202,177,143,105,88,78,52,68,84,173,199,176,175,221,179,197,242,211,223,209,228,185,115,188,151,48,124,96,96,148,220,145,36,62,67,42,26,11,16,32,31,12,27,18,37,23,21,34,52,13,33,37,14,28,15,29,33,12,34,46,29,65,61,65,67,67,49,76,38,7,54,144,171,170,108,106,130,128,104,113,80,125,150,179,156,116,47,62,113,129,95,126,109,118,84,82,81,23,74,106,173,206,152,60,56,164,144,94,0,33,151,116,99,99,101,78,129,143,165,179,191,188,27,0,12,28,24,16,10,31,27,32,35,16,32,24,23,30,14,4,23,45,58,173,132,106,97,78,79,103,111,58,15,17,16,13,3,22,7,29,27,1,24,26,21,39,33,38,36,41,24,60,84,133,57,42,53,23,58,43,61,11,42,24,30,29,38,20,48,47,63,17,38,37,16,23,42,74,80,104,93,107,96,68,102,51,42,19,27,31,26,32,51,48,35,58,22,17,24,48,148,127,77,62,71,36,85,45,51,34,24,30,51,31,23,28,36,17,19,22,11,49,0,41,31,63,45,32,41,25,171,183,162,194,189,184,178,141,86,27,24,48,5,0,22,18,30,33,27,15,22,18,9,10,3,4,41,18,27,15,45,45,24,57,34,10,29,48,83,108,103,159,252,247,237,158,93,59,4,21,54,85,99,108,90,117,94,109,124,135,124,45,144,199,218,239,216,226,231,215,242,218,201,204,209,230,231,208,204,234,200,218,215,190,241,210,189,218,218,211,222,227,196,245,226,215,221,216,232,224,226,241,225,97,15,20,5,16,10,6,3,14,13,17,9,12,4,8,24,5,6,2,6,12,23,29,14,30,223,197,193,208,194,198,187,205,203,200,213,192,204,198,179,197,190,208,199,192,216,193,181,201,206,195,222,184,192,206,213,214,208,180,201,198,193,218,194,195,189,188,201,218,212,198,173,202,200,207,210,202,194,184,199,203,184,187,194,213,198,191,188,203,186,213,216,198,211,196,183,181,193,217,197,201,210,177,214,204,200,190,203,183,206,182,188,211,191,207,205,197,176,216,201,203,200,203,166,204,189,213,194,207,201,169,194,217,220,192,193,195,203,200,204,216,196,205,218,212,207,202,225,195,211,208,216,206,218,199,226,204,212,204,206,220,235,216,210,222,212,234,179,207,202,193,206,207,216,190,192,181,167,209,207,194,210,195,206,206,204,209,238,231,198,210,210,181,188,191,214,209,201,225,209,202,208,209,214,210,225,203,213,221,207,218,213,201,214,217,210,211,199,206,219,226,227,202,221,217,213,210,194,194,217,185,244,208,176,218,222,202,219,226,206,201,199,195,196,192,198,187,211,202,216,203,207,189,216,218,195,190,210,214,201,196,189,211,192,221,201,223,210,172,118,162,186,181,220,209,207,208,202,223,212,218,187,203,211,201,193,194,202,230,184,223,193,205,221,212,220,211,193,196,213,206,228,212,207,198,197,210,226,216,182,224,230,194,204,196,195,210,215,197,193,215,206,189,191,220,205,237,188,141,203,203,163,164,97,133,90,48,146,130,156,183,134,128,86,57,35,7,96,180,198,214,213,196,192,179,153,158,85,109,239,194,176,134,173,202,128,139,212,163,121,106,114,106,59,56,61,179,180,182,194,236,182,201,222,196,222,219,224,181,130,218,134,71,108,75,89,180,229,168,73,92,26,5,20,10,1,28,23,22,22,30,47,35,25,21,45,33,19,14,31,43,34,16,23,26,55,25,39,56,57,69,43,31,80,51,23,1,92,106,94,121,49,59,133,99,92,89,127,93,87,100,191,153,77,99,94,107,121,138,143,128,104,52,43,26,51,49,82,101,47,15,70,179,107,54,27,79,162,94,117,112,110,150,166,183,184,179,189,163,30,23,5,34,35,14,19,33,7,17,34,33,22,15,19,20,28,26,15,33,74,103,56,56,48,45,57,57,86,21,9,5,4,26,12,20,21,53,30,31,42,19,23,21,17,39,30,25,33,40,99,81,66,69,53,80,56,93,60,39,48,42,24,42,36,37,40,70,62,34,54,42,44,38,64,60,94,62,46,57,20,30,47,34,13,24,15,29,19,42,40,39,22,34,13,46,21,58,123,99,54,38,51,63,48,31,52,34,5,32,29,18,37,49,26,8,22,1,25,11,18,41,12,6,15,2,27,48,140,142,157,166,194,176,183,193,160,41,15,14,24,4,35,27,17,37,32,25,25,4,30,35,21,23,12,19,30,41,67,72,76,73,67,68,35,32,101,114,134,184,251,244,216,122,55,53,5,69,107,83,80,41,28,54,106,78,114,105,120,67,70,206,228,229,235,225,231,226,217,219,228,208,232,218,198,242,208,201,214,200,208,215,232,217,216,217,212,234,233,213,222,191,209,207,208,190,197,232,217,223,215,131,14,11,7,13,29,7,4,17,7,11,17,14,0,1,0,1,12,2,17,0,21,10,6,14,192,198,175,213,213,196,206,198,204,197,205,183,216,228,194,186,180,208,195,212,201,187,165,199,201,202,238,209,199,210,208,189,194,175,196,204,209,184,188,203,206,195,211,186,199,226,185,197,197,210,202,210,209,207,202,182,200,196,188,202,206,170,196,235,204,197,189,201,215,212,191,206,197,205,233,224,219,224,193,199,193,191,197,192,214,212,200,197,213,190,193,227,184,201,221,212,206,171,198,190,192,214,211,199,209,199,193,216,211,225,216,194,217,223,217,200,193,213,197,221,179,208,202,206,215,212,219,215,187,206,200,195,205,188,193,214,201,215,204,199,230,206,213,205,202,210,219,206,224,180,230,216,199,205,208,195,227,199,217,199,208,203,199,213,201,178,214,218,199,213,208,219,184,216,219,212,194,215,213,189,211,192,201,208,212,204,224,211,209,226,207,195,192,207,196,199,230,185,211,224,216,228,212,198,233,205,233,209,209,217,198,233,192,203,215,211,211,235,199,206,214,196,200,191,193,219,217,199,184,201,219,197,204,208,197,202,204,198,207,213,204,201,221,184,197,201,212,185,228,210,205,220,184,209,173,214,212,196,195,200,196,217,208,205,199,196,216,220,220,216,188,210,222,238,201,207,181,212,243,202,190,227,223,204,191,217,195,215,201,203,206,219,205,226,214,235,193,181,214,209,212,206,177,145,199,229,190,177,114,160,120,104,151,125,156,168,155,125,80,51,39,53,138,203,201,219,193,166,212,219,213,202,115,134,194,209,141,142,182,218,156,119,188,151,112,129,86,87,62,48,51,170,237,182,160,241,184,205,216,194,217,203,239,176,139,226,189,110,136,72,89,241,253,161,75,26,34,24,12,14,5,12,28,37,41,21,22,14,15,56,2,25,44,20,10,16,29,31,24,37,53,36,71,68,88,85,64,52,49,15,26,32,87,85,64,96,49,102,57,98,95,78,108,106,74,71,154,190,154,87,72,95,72,102,120,90,95,39,22,62,21,42,3,26,38,45,118,129,59,23,7,76,139,81,65,68,99,137,121,124,134,131,150,111,28,29,0,8,19,11,5,30,32,19,33,41,9,44,50,52,30,34,20,38,100,129,76,45,42,40,47,38,44,26,34,36,30,8,7,11,28,25,39,49,15,57,30,60,62,39,75,62,66,77,164,174,114,144,121,173,142,158,133,152,115,157,158,138,165,173,177,180,163,197,156,190,157,180,166,171,146,118,52,49,34,19,36,59,26,50,59,62,58,48,50,41,37,26,54,38,35,62,110,90,102,58,44,52,76,45,69,28,27,20,3,36,20,17,39,32,13,21,23,4,7,28,31,8,30,30,28,50,122,125,104,139,107,147,126,184,123,54,15,36,0,19,14,25,30,16,42,11,20,12,36,20,56,21,35,10,8,74,156,185,177,110,110,131,76,137,121,132,100,178,235,217,108,35,45,29,73,130,135,71,57,64,46,79,92,79,80,79,77,74,108,207,240,244,224,219,220,227,230,199,212,204,212,216,203,204,211,199,223,224,222,212,207,199,227,200,215,204,228,205,202,223,217,219,211,234,212,242,206,220,229,130,6,5,8,23,4,17,16,17,1,16,19,16,11,12,4,2,2,0,18,1,10,4,24,10,204,191,207,185,213,183,224,183,171,185,191,230,221,206,199,197,188,193,197,181,194,203,204,192,192,210,194,198,194,214,216,190,182,185,204,190,206,188,207,178,205,193,217,209,207,196,211,205,198,213,199,176,179,186,205,184,213,223,207,202,168,199,223,210,205,200,193,176,191,193,202,184,199,210,205,201,207,211,197,192,170,214,220,210,211,216,193,207,200,194,184,201,193,212,223,230,213,206,205,197,168,219,226,200,179,202,209,211,225,218,202,206,201,213,204,198,208,212,209,195,207,200,199,195,208,221,234,207,221,209,224,203,209,230,183,204,227,212,228,186,211,195,209,192,232,209,214,212,210,201,218,197,218,216,215,215,202,215,191,234,231,188,188,202,204,210,213,187,221,211,201,197,203,204,224,222,225,215,198,209,206,198,231,226,192,212,214,216,202,193,217,197,205,207,208,199,196,211,213,211,211,228,224,218,242,211,220,221,198,224,195,212,203,208,192,213,209,191,234,200,189,217,222,193,208,207,217,185,198,210,195,209,208,208,204,198,201,231,213,194,204,221,231,216,233,216,228,223,219,214,229,205,184,199,184,237,185,213,217,200,212,207,208,182,213,199,215,194,213,217,193,220,202,208,204,191,205,209,201,215,193,210,217,211,205,196,203,204,211,199,200,190,226,222,196,202,214,217,197,221,222,218,172,193,230,187,149,200,151,211,161,123,205,192,207,203,197,148,80,77,68,61,134,211,186,212,176,192,203,207,225,210,103,133,142,143,200,180,216,208,146,145,242,145,99,117,99,109,55,16,25,102,166,131,175,212,163,185,228,213,224,212,223,144,189,243,175,114,115,132,158,254,228,131,52,30,10,12,12,12,11,21,23,25,12,22,9,8,23,25,0,33,14,12,11,40,42,3,36,48,63,50,77,75,47,74,54,50,11,20,12,52,120,41,65,119,69,101,80,100,99,108,114,123,101,78,142,211,195,144,80,70,66,95,90,88,86,62,31,19,39,6,28,10,37,128,142,69,16,38,10,52,125,79,100,72,54,69,27,56,35,38,73,22,26,32,11,13,32,49,25,34,29,41,43,10,38,56,31,25,31,24,24,23,104,93,51,70,61,38,39,54,65,23,44,39,41,23,39,77,67,80,64,109,105,137,144,188,166,194,215,218,214,195,199,180,195,174,153,173,137,151,169,162,201,206,169,188,175,187,158,164,176,186,137,178,167,183,156,149,138,106,94,87,83,110,108,84,100,112,182,163,183,145,129,103,120,89,70,60,58,91,148,189,140,132,138,113,122,118,79,37,24,26,24,19,39,37,18,56,36,11,33,13,25,42,39,1,23,27,9,47,138,128,63,50,60,54,108,95,68,52,14,13,20,32,27,39,9,27,24,39,13,16,22,11,29,16,15,30,55,88,193,205,228,204,226,212,167,190,156,99,82,161,198,75,24,27,25,46,132,156,61,62,118,122,89,89,65,86,65,53,88,76,97,228,229,229,255,231,229,236,216,208,219,192,227,218,219,205,213,211,214,229,235,222,228,222,205,209,214,203,219,220,203,194,216,222,214,208,243,210,219,213,218,113,11,35,6,12,24,22,13,12,30,27,12,26,14,13,7,27,4,1,24,6,10,46,4,3,189,202,190,195,209,202,187,195,186,208,202,198,168,190,194,212,219,207,203,192,184,195,190,217,201,191,195,187,197,173,200,216,196,205,207,195,188,184,177,204,185,178,195,230,222,205,193,195,199,201,208,185,218,221,199,187,189,220,200,176,189,198,202,210,187,198,191,194,183,221,202,224,197,201,205,219,203,221,217,184,203,207,189,198,194,198,203,181,218,176,215,208,207,209,197,196,206,184,195,193,194,203,195,205,212,193,196,214,208,189,190,204,191,185,214,207,169,193,210,204,197,211,200,208,179,200,183,191,195,210,179,196,185,206,211,230,205,215,218,192,204,202,213,221,206,203,180,228,213,193,182,213,202,217,190,216,221,209,182,221,208,215,198,221,215,202,200,205,200,204,201,222,205,211,199,203,214,202,204,204,195,213,207,225,215,200,181,213,196,208,221,231,211,215,220,201,203,216,213,218,198,182,212,204,203,197,205,229,216,201,228,208,224,210,196,207,201,209,214,211,237,177,216,216,193,202,221,210,203,194,223,208,230,215,203,198,213,195,207,220,193,230,209,214,222,203,232,200,196,199,189,200,207,204,218,213,209,225,222,201,205,195,192,224,189,205,219,179,185,209,227,220,216,203,198,200,220,220,195,224,201,225,209,216,220,182,185,191,220,218,195,199,217,200,219,196,246,197,203,215,217,210,165,183,218,217,173,207,192,207,157,156,181,178,204,167,214,195,93,42,38,29,152,252,172,227,177,202,200,210,242,179,130,139,161,157,168,207,159,163,153,165,238,131,105,75,72,88,80,74,7,111,192,174,188,223,168,217,233,216,216,227,197,139,173,231,200,127,111,69,150,212,135,75,39,7,13,45,30,20,60,5,2,6,10,28,17,8,18,12,9,21,10,22,28,20,27,35,48,62,26,54,55,97,68,65,27,41,23,23,32,80,80,15,83,127,84,81,113,129,108,114,107,99,129,110,122,148,180,148,154,88,76,89,95,60,98,43,27,19,64,33,60,65,103,164,105,30,12,25,0,88,112,89,110,48,53,67,56,46,35,59,59,22,17,23,36,12,26,60,37,41,37,49,38,41,23,50,53,35,38,41,30,62,124,144,83,126,53,80,118,105,91,69,91,134,100,168,192,202,202,224,218,211,198,177,202,202,210,193,174,188,191,141,131,77,123,85,58,70,72,59,61,62,71,96,100,65,67,59,49,79,44,63,90,72,62,58,71,72,38,47,96,68,77,76,106,86,120,128,143,144,153,155,185,185,195,200,186,160,175,119,203,186,182,163,145,135,142,166,139,54,36,58,33,34,16,35,28,11,39,38,40,20,40,40,19,47,25,42,21,102,93,97,45,44,42,71,46,64,63,34,12,27,16,24,30,25,22,24,21,61,27,38,39,28,13,27,25,24,18,79,151,182,190,191,203,179,190,194,135,121,87,153,140,21,40,39,40,76,134,130,46,65,140,105,129,123,84,103,31,41,63,38,43,171,209,205,227,235,236,233,206,175,220,226,215,182,181,217,217,205,215,200,214,217,213,208,219,220,197,197,214,206,223,216,232,221,222,229,221,182,230,227,216,119,19,0,9,14,15,6,12,21,43,2,6,13,13,14,24,7,7,19,10,1,1,10,24,6,204,187,235,198,191,201,185,188,213,192,199,219,202,192,196,187,204,208,174,173,196,229,196,202,206,203,200,221,202,206,226,215,169,194,192,209,210,196,179,178,192,208,202,190,198,199,207,185,206,195,197,213,210,192,190,194,196,211,210,215,204,188,212,191,194,202,213,181,197,187,204,216,192,203,211,184,212,210,210,193,220,211,213,226,205,185,210,202,208,186,200,198,182,194,224,203,200,183,200,215,189,199,190,198,193,201,216,213,221,199,197,216,197,223,213,164,194,210,197,204,198,197,220,209,206,200,177,202,235,196,212,206,220,209,213,211,205,201,218,191,188,201,201,208,193,207,206,201,188,221,203,196,197,229,220,180,197,191,223,192,187,189,199,203,225,220,187,208,229,218,206,207,223,216,213,200,202,194,222,229,224,203,196,235,225,208,187,221,213,181,225,197,201,196,215,204,216,204,218,217,212,204,216,223,219,209,198,203,208,174,190,198,184,204,176,203,221,218,209,223,200,237,208,224,209,217,210,211,202,220,231,219,221,196,241,204,188,211,223,211,221,176,207,193,219,215,232,180,217,222,210,213,199,211,185,192,187,216,198,195,204,190,236,195,195,218,201,225,204,211,204,198,181,213,212,218,214,200,233,199,200,198,239,210,212,181,213,187,213,231,203,188,206,178,218,200,214,199,206,214,199,209,172,192,229,203,191,200,191,163,145,123,146,125,151,173,217,171,102,75,25,45,125,241,218,195,170,198,207,192,181,202,196,160,182,207,172,187,142,186,135,177,236,152,98,60,93,113,99,46,50,188,213,206,205,208,154,223,223,202,202,214,153,125,163,213,175,114,100,69,66,90,67,36,8,19,17,17,41,34,17,25,41,13,26,27,8,16,13,39,15,5,18,29,26,18,18,71,17,28,66,85,64,70,75,49,36,19,53,78,71,155,103,63,122,130,111,114,103,133,119,130,98,119,110,112,110,143,132,147,109,72,68,74,66,37,60,54,46,33,34,41,101,140,114,44,7,23,33,14,30,85,116,74,120,50,73,41,38,34,50,66,76,57,12,16,35,20,25,33,32,54,51,50,20,67,84,64,84,72,68,82,96,144,179,172,172,201,168,200,173,179,183,161,176,182,176,196,199,169,163,149,152,121,113,113,69,63,91,67,56,64,68,71,54,72,60,41,60,75,44,63,77,105,79,90,70,79,73,72,98,82,87,83,41,84,75,54,72,65,66,74,94,62,52,72,56,44,60,58,65,59,83,70,63,96,82,92,95,123,127,127,122,123,92,100,105,111,124,158,154,110,117,138,143,86,67,82,59,58,58,41,44,18,68,44,36,40,34,36,22,113,122,104,63,50,47,42,54,30,57,41,37,17,26,6,18,25,27,6,25,39,37,6,14,5,20,35,47,16,10,46,134,143,123,136,121,154,167,121,121,105,85,127,102,33,40,32,3,80,151,70,58,55,70,79,113,109,56,68,66,65,79,40,20,117,112,138,138,164,202,237,226,195,196,214,213,227,215,213,191,195,235,218,220,221,199,209,210,220,197,208,237,223,237,222,215,206,220,189,223,214,221,220,229,119,30,6,1,9,27,10,16,5,54,22,0,25,9,2,10,11,27,13,25,4,13,6,2,13,218,217,191,197,204,185,208,226,174,217,204,205,185,199,205,189,197,180,186,202,199,195,206,198,205,207,191,190,204,203,215,213,212,235,213,184,205,192,219,189,211,191,233,215,180,210,195,225,194,233,220,200,209,202,188,199,226,201,189,176,204,172,210,205,219,202,226,203,187,193,210,206,220,196,219,195,192,220,211,192,207,196,189,217,179,212,196,191,202,215,196,181,202,190,209,184,175,195,202,202,202,213,195,196,206,207,225,192,187,199,199,202,212,180,197,202,214,206,197,214,207,215,182,226,195,170,192,204,212,200,205,221,198,201,201,192,190,219,182,199,187,211,205,251,209,186,219,235,214,198,192,214,205,210,207,209,202,205,216,199,228,201,189,192,184,184,163,198,212,203,191,193,198,220,235,209,212,194,202,211,203,190,208,192,207,223,226,218,209,197,230,211,206,218,214,200,206,203,210,192,195,213,217,206,210,197,194,196,218,185,238,207,202,207,205,198,215,200,232,199,220,197,208,175,211,193,225,205,213,206,212,185,205,177,206,186,202,219,231,188,221,195,195,207,209,202,195,209,204,188,214,203,211,192,206,205,211,200,193,229,189,227,197,175,194,205,231,233,196,200,202,207,207,196,166,211,235,205,210,195,165,214,190,210,217,191,209,181,200,190,196,203,207,208,206,206,212,213,196,203,199,192,163,202,220,182,149,169,144,116,102,131,139,151,183,182,235,201,99,31,7,27,140,198,216,208,194,225,199,214,194,227,233,175,185,194,170,174,181,182,138,141,192,126,108,88,84,88,72,66,56,160,212,184,232,198,155,197,186,220,186,216,115,136,223,228,164,84,61,60,100,60,24,14,8,12,47,30,28,15,41,16,32,25,23,25,21,21,20,36,10,11,23,13,32,34,36,43,40,54,71,55,83,64,78,36,47,68,116,141,149,135,140,129,126,112,119,101,81,118,121,130,115,119,77,56,88,64,86,98,73,41,32,42,39,18,14,13,21,87,89,150,143,55,52,5,5,43,15,5,40,100,85,92,135,48,52,34,58,78,111,108,97,29,47,48,65,44,72,71,136,126,156,145,191,178,208,219,193,187,204,178,168,160,171,157,128,137,133,116,128,142,120,95,51,76,117,91,35,57,68,66,67,80,64,73,72,67,48,52,97,68,90,37,52,55,67,58,44,68,24,38,70,66,63,60,34,56,38,53,35,31,48,51,7,29,35,33,40,57,66,21,44,64,51,42,53,68,54,38,46,50,78,43,52,69,65,62,56,52,29,40,63,73,82,60,49,64,79,106,99,94,148,195,171,166,143,148,145,149,135,132,74,77,75,93,67,57,66,48,39,98,161,143,109,93,85,84,54,86,70,28,26,10,39,52,21,45,35,29,38,31,32,42,47,38,35,34,11,32,31,67,107,74,35,46,58,58,49,59,65,114,94,110,106,27,30,14,20,77,159,130,82,68,59,27,112,60,57,64,45,72,87,83,54,79,69,23,57,66,173,223,207,227,211,194,220,226,222,191,207,202,205,220,221,218,208,230,182,235,198,197,209,217,213,234,200,209,227,181,231,214,203,223,226,130,1,12,15,1,8,34,15,36,12,22,12,32,1,2,16,4,47,7,24,20,2,16,32,5,204,215,186,196,171,202,193,178,200,194,199,195,197,187,207,210,201,205,220,209,202,193,198,194,202,202,189,176,207,204,205,206,193,205,208,202,191,189,213,210,213,169,207,194,194,207,177,186,217,201,231,216,192,190,196,201,188,196,222,194,183,227,198,193,176,215,214,203,177,196,210,172,204,210,184,190,194,191,188,178,207,224,194,184,194,202,198,205,188,201,216,198,180,205,193,209,204,211,223,207,208,210,206,190,200,193,222,187,190,213,232,180,184,199,179,196,206,195,203,195,227,215,189,202,195,192,201,198,230,201,203,201,206,203,204,192,205,204,216,219,207,202,207,210,185,219,204,226,217,202,199,196,213,174,211,200,200,198,193,213,201,222,210,191,221,181,181,221,197,202,218,189,219,212,195,225,209,221,190,207,207,198,195,202,192,189,205,206,228,198,197,213,210,181,220,213,234,183,190,220,203,199,202,203,179,190,227,190,225,203,214,196,215,209,204,205,217,191,205,230,205,219,194,206,211,195,204,206,194,206,209,219,189,207,204,194,202,213,200,211,208,204,199,208,218,198,205,204,186,223,223,199,188,206,211,214,220,215,198,195,223,175,209,176,213,190,202,213,201,223,220,187,200,200,203,198,203,218,202,208,198,204,212,204,219,226,207,200,202,205,188,210,180,191,248,185,200,203,210,196,204,189,154,223,185,125,115,121,171,117,87,173,196,180,205,241,246,187,107,52,57,50,143,246,247,204,203,220,250,247,174,121,153,111,128,148,181,189,177,193,69,83,162,145,119,102,98,81,42,43,82,137,181,191,224,183,157,231,190,199,201,212,131,197,215,213,226,88,68,51,57,40,6,9,13,7,26,10,31,31,29,17,14,34,22,27,28,23,22,11,20,11,13,33,50,42,27,43,66,85,76,79,117,145,138,121,176,127,169,159,136,142,130,115,88,91,85,89,65,62,95,126,119,100,38,42,50,17,49,70,60,32,41,18,25,17,52,46,54,68,87,78,11,23,3,30,33,12,48,15,61,132,91,90,108,80,95,117,162,160,199,191,188,143,180,194,218,215,199,192,215,171,184,167,188,184,178,162,154,100,86,97,72,57,92,76,62,63,58,27,61,70,63,87,59,81,101,56,87,90,86,75,56,73,66,48,29,57,41,47,69,45,62,35,39,24,8,30,52,30,17,13,14,11,31,23,28,3,40,16,11,18,9,9,16,14,46,24,20,35,27,23,11,19,43,9,25,9,23,37,22,44,37,24,25,48,36,67,68,79,59,72,68,55,90,80,72,66,41,76,63,57,61,63,85,60,83,114,119,147,170,138,141,152,162,160,139,127,128,109,132,165,188,199,177,145,151,161,153,147,95,24,40,35,27,13,40,20,51,47,51,13,51,34,19,20,52,38,36,34,12,71,102,46,48,31,44,44,59,41,36,53,82,124,94,49,36,24,24,59,107,115,99,77,35,19,26,41,58,54,62,60,77,68,59,66,54,43,52,47,106,203,236,231,224,227,227,215,242,199,209,208,216,222,216,205,233,209,224,203,212,220,206,214,233,194,206,217,211,218,242,211,230,210,240,116,12,2,5,21,3,33,5,3,22,18,23,25,4,9,4,8,12,0,4,12,7,8,10,11,199,187,203,202,193,172,185,191,181,184,236,194,202,188,235,186,183,221,200,185,186,186,195,183,202,218,199,184,178,209,189,213,217,191,195,204,192,191,208,195,214,190,185,198,206,195,191,215,192,188,192,186,182,201,221,198,193,205,224,194,189,197,205,214,185,213,209,205,201,187,206,211,204,189,216,199,199,198,223,178,201,166,212,190,218,210,204,193,191,178,214,190,208,184,198,224,234,211,200,220,203,198,188,197,221,206,209,197,180,196,199,224,195,183,229,220,221,209,219,189,211,210,206,193,193,192,222,210,209,188,197,202,214,206,205,209,190,192,225,211,215,199,202,197,225,186,186,189,188,203,219,225,215,210,214,220,195,179,203,182,197,202,212,201,214,210,220,208,209,212,208,231,208,184,191,216,227,193,204,208,216,192,203,210,206,176,215,195,197,198,203,213,185,206,177,195,193,202,234,210,224,204,206,210,192,196,184,246,204,184,178,218,206,228,200,179,183,207,196,197,193,212,178,212,207,220,196,202,203,195,195,192,206,171,215,196,222,218,198,209,182,178,211,223,218,201,213,201,208,214,215,208,198,202,217,211,198,200,215,198,207,192,179,207,221,243,208,199,200,195,196,222,225,187,189,214,217,176,206,190,198,208,216,196,231,205,205,194,215,190,213,207,168,219,207,215,200,205,218,211,211,154,157,211,207,104,94,153,168,97,109,170,175,166,179,194,173,138,90,74,109,72,81,192,201,141,147,199,222,213,109,65,61,51,105,199,239,207,173,141,111,103,213,147,115,117,73,71,49,84,49,93,177,225,244,172,178,215,185,185,230,188,135,201,215,216,210,114,66,27,23,4,10,32,27,6,14,6,32,30,20,36,34,24,21,42,38,22,6,5,20,3,35,44,53,30,42,79,96,67,80,148,176,172,164,169,145,121,129,129,104,107,67,71,58,88,80,58,63,56,79,79,83,79,34,39,66,35,81,61,73,45,21,13,47,25,53,71,77,57,5,63,17,16,7,18,8,5,15,9,56,93,105,114,115,150,186,163,209,180,183,157,183,152,182,157,180,141,108,121,111,125,68,49,86,50,44,76,47,63,59,60,40,59,56,65,54,67,67,71,83,93,76,84,41,67,63,47,47,62,53,33,24,27,32,30,29,3,18,17,14,8,32,26,23,15,9,5,5,9,8,42,43,20,36,15,14,32,24,16,16,8,29,39,35,24,7,23,30,24,15,13,18,20,10,6,3,8,21,28,26,17,34,34,11,22,37,20,33,20,37,27,49,22,50,46,49,50,35,49,53,64,60,88,82,77,88,71,81,28,60,75,70,96,125,109,126,120,135,156,177,144,140,128,124,144,167,144,165,152,131,66,56,53,47,69,70,38,56,48,16,40,23,59,29,16,51,24,18,34,38,108,85,86,94,55,62,45,68,40,55,108,99,92,109,50,23,21,11,20,68,105,118,83,48,54,44,23,58,35,73,36,50,62,61,70,72,59,61,55,100,232,246,233,252,232,243,217,229,222,230,200,203,239,212,203,214,229,207,224,208,199,206,243,204,208,199,205,215,187,218,208,235,215,215,120,14,15,9,7,26,9,15,9,19,8,26,29,4,22,25,21,8,13,0,9,10,12,27,24,192,186,212,215,208,200,184,206,190,182,203,180,182,160,221,196,208,204,215,205,201,222,196,181,203,203,197,201,176,183,186,197,187,203,190,205,179,219,223,182,178,196,189,185,198,212,188,196,197,168,176,200,200,206,203,202,203,202,199,199,189,204,208,200,212,189,219,230,188,191,193,183,202,193,234,200,179,205,185,211,221,212,211,202,189,195,202,192,232,193,211,181,178,234,202,186,212,210,200,206,196,181,193,218,201,193,202,187,209,194,197,218,200,220,196,187,176,197,192,200,192,215,194,209,193,181,213,219,189,213,208,190,167,221,185,180,217,207,189,209,216,200,175,231,233,207,203,205,197,212,209,202,197,215,202,203,222,200,189,218,211,181,198,189,193,204,209,204,212,210,229,188,197,203,189,220,217,196,209,202,188,184,196,203,189,187,197,196,196,215,199,217,204,197,194,205,228,203,196,204,200,204,232,197,216,204,200,187,198,213,206,219,190,191,197,193,202,197,205,207,200,201,211,199,192,197,213,208,198,204,212,193,194,211,214,213,182,198,193,212,213,175,176,184,205,234,193,185,210,200,209,197,234,190,197,189,199,193,190,206,213,188,205,210,211,203,234,226,219,233,208,224,191,208,172,202,203,232,217,214,174,200,209,194,203,211,228,216,218,238,199,205,189,189,204,237,221,211,203,214,194,190,178,191,161,98,91,130,126,103,113,134,159,128,121,144,109,86,72,90,135,83,51,63,93,25,51,40,75,83,71,49,28,43,87,189,213,177,151,141,119,157,248,127,76,89,105,97,91,66,36,138,215,226,227,160,163,196,183,206,232,157,140,232,215,222,225,114,87,3,26,0,13,22,17,46,61,30,10,12,16,28,21,25,12,20,11,42,33,21,29,34,27,40,55,31,41,70,58,37,87,165,183,171,163,150,74,88,132,78,69,36,50,57,46,78,81,69,70,68,57,57,73,76,69,75,30,35,136,124,64,25,11,42,41,36,31,50,40,27,12,13,19,28,15,21,16,18,18,5,97,105,84,102,129,129,132,73,76,92,76,55,52,62,60,32,40,54,59,44,74,60,53,60,56,57,35,55,57,49,45,50,35,46,33,6,11,24,58,37,41,30,42,48,24,30,33,26,28,4,29,23,6,17,23,42,40,26,21,34,8,21,19,32,50,16,35,1,31,3,24,39,25,16,16,31,1,61,21,38,43,48,21,31,27,2,15,35,29,27,17,44,49,30,41,18,16,15,16,20,47,27,41,8,3,28,21,20,19,28,19,9,14,36,6,25,12,4,45,30,45,39,39,59,35,50,54,73,80,82,49,62,68,71,68,83,66,71,65,82,73,76,63,88,55,86,91,91,93,115,132,140,124,144,154,145,137,168,137,128,120,107,84,79,79,75,70,48,86,38,29,96,148,133,143,123,100,99,76,90,119,120,112,120,61,61,59,21,27,24,18,76,97,115,90,74,70,33,51,22,52,65,63,58,85,53,79,71,99,87,141,198,224,217,237,255,252,246,241,203,228,231,209,215,214,221,214,217,217,225,211,235,209,215,194,221,214,219,217,229,223,196,208,210,231,121,4,1,9,6,22,24,29,3,35,5,10,22,10,1,30,7,30,17,16,20,15,12,41,4,194,196,193,213,206,174,185,187,194,179,186,227,207,211,192,211,219,213,201,174,186,203,216,211,196,203,201,218,192,211,184,208,202,206,206,229,186,202,209,194,154,189,188,211,203,192,193,211,207,209,181,187,188,196,189,196,189,198,203,193,178,198,202,172,178,213,194,197,197,196,189,190,217,191,214,188,221,207,199,225,208,209,189,226,204,224,188,220,218,198,177,193,219,192,188,228,193,181,211,196,179,184,183,196,194,212,226,206,193,196,217,208,216,209,221,195,208,198,212,213,196,194,215,205,216,204,216,200,198,205,189,213,182,213,209,198,202,201,239,221,188,210,197,223,205,211,208,223,205,198,200,224,218,199,195,192,192,205,208,218,195,200,203,206,215,200,206,192,180,203,217,210,211,225,223,197,185,192,202,188,209,194,209,200,214,205,213,220,203,193,215,224,199,217,221,202,219,210,228,230,212,195,191,228,225,224,183,202,219,232,208,219,209,203,217,203,212,172,221,210,211,215,205,194,226,236,225,187,217,200,191,188,175,200,174,195,202,200,224,214,218,201,211,240,201,212,234,195,217,217,197,188,193,204,225,214,209,210,192,187,196,197,219,203,194,200,206,218,186,212,213,203,211,201,204,227,200,201,195,196,210,217,189,234,212,207,213,193,217,218,209,195,192,178,210,197,204,213,203,184,198,174,178,138,70,55,128,89,65,108,120,133,117,97,125,114,61,82,73,90,132,97,51,114,104,97,39,5,0,60,69,50,32,35,59,73,115,177,193,178,147,192,236,127,105,116,97,87,82,71,39,139,236,228,227,184,168,195,212,197,204,114,164,224,208,236,182,95,31,15,57,68,46,40,28,73,30,61,96,47,5,46,21,30,24,42,33,14,32,17,35,33,33,39,31,38,81,107,152,144,110,120,131,142,142,123,67,79,64,61,40,57,36,35,25,37,29,54,29,54,39,45,54,88,70,57,81,159,210,189,127,58,18,39,43,30,12,2,29,12,11,8,21,22,29,29,10,18,6,17,125,118,106,149,90,44,64,66,25,38,37,70,99,38,40,45,50,51,57,70,59,62,37,41,58,47,29,16,37,41,15,26,19,9,35,10,30,35,35,24,32,22,17,29,23,28,37,30,18,18,31,21,31,34,21,2,46,31,31,33,16,29,54,46,44,40,26,27,11,20,18,41,30,7,13,15,14,34,27,26,15,42,42,33,5,5,17,31,26,33,70,71,112,100,86,35,7,46,19,19,19,25,46,15,16,23,19,23,39,30,26,6,22,28,14,23,25,25,24,25,8,14,36,43,32,22,28,43,48,27,40,50,24,43,46,45,68,60,58,66,64,91,41,66,79,57,42,72,71,75,74,68,106,115,155,149,141,149,170,159,165,165,174,178,182,194,126,181,145,151,126,177,187,177,168,176,166,172,160,152,148,164,103,145,133,65,40,51,15,14,7,41,64,74,112,135,95,64,42,27,61,53,52,56,84,68,69,108,85,117,100,97,84,131,161,230,239,243,247,253,240,240,225,249,241,225,213,207,194,185,219,195,175,195,222,201,209,208,217,199,218,207,203,214,195,105,2,28,6,1,10,8,33,0,33,20,11,8,1,13,24,4,0,23,23,12,32,6,19,40,184,195,207,222,194,185,209,199,229,177,198,193,186,204,189,197,191,211,216,199,215,202,228,184,190,210,202,228,229,196,200,204,188,184,210,188,202,202,205,211,187,211,216,205,206,190,209,221,223,195,203,200,228,212,204,209,196,214,234,194,197,204,207,191,222,193,189,200,196,194,188,213,193,214,185,197,208,184,208,214,214,202,188,216,195,206,210,190,231,203,205,212,234,206,174,204,217,213,198,191,215,187,182,196,171,199,201,219,192,201,186,207,207,178,190,202,204,194,195,205,227,188,218,199,213,209,206,164,195,228,201,176,221,188,210,186,181,215,200,203,205,204,202,206,222,209,200,214,225,193,230,211,214,194,211,212,213,202,226,210,196,196,203,214,213,217,216,201,188,203,190,200,205,211,185,202,199,208,185,212,189,197,202,196,200,204,208,203,219,198,217,193,172,211,203,210,214,207,192,220,218,215,196,210,199,202,194,208,211,195,208,198,201,215,213,211,201,208,174,180,217,202,195,197,193,179,225,205,219,181,196,223,201,205,206,203,207,213,164,215,177,189,203,207,229,191,208,206,224,201,206,210,191,225,227,194,202,199,180,217,201,203,204,198,183,218,207,193,204,210,197,184,197,201,206,159,207,203,177,208,186,177,188,202,202,216,222,170,207,190,214,198,202,207,213,214,185,206,197,209,195,157,197,80,17,45,75,87,90,116,156,141,123,114,132,123,70,131,94,98,121,95,103,188,192,158,132,48,32,65,111,105,68,71,50,62,95,161,207,195,140,226,192,109,135,139,114,124,67,84,44,159,225,205,198,141,181,232,182,213,196,96,167,209,226,200,145,62,49,35,118,200,95,70,48,22,70,211,211,37,1,34,17,5,10,3,35,7,45,35,18,33,36,28,42,90,80,125,163,176,126,118,77,100,92,126,84,18,41,48,33,59,71,46,54,41,46,24,53,17,48,20,33,18,41,107,167,189,203,164,89,25,19,1,2,22,0,16,31,9,15,44,52,21,46,14,3,23,10,61,125,100,106,136,66,58,44,52,26,36,32,34,54,60,17,46,55,28,34,7,28,23,42,19,21,26,15,6,25,31,13,19,25,8,15,13,17,25,13,18,4,27,13,30,29,27,29,15,6,34,20,28,30,22,49,40,17,20,27,26,40,51,67,48,56,43,21,19,31,45,8,15,40,22,4,19,24,19,26,23,25,12,45,47,19,13,3,11,50,49,112,139,184,133,95,36,22,15,26,20,13,14,27,29,19,22,1,15,12,11,43,46,40,22,17,16,16,12,32,25,5,23,30,22,22,7,17,31,15,36,31,13,19,25,35,22,41,38,49,24,58,29,58,68,70,67,63,81,60,72,65,57,93,88,65,84,92,101,72,96,102,116,137,124,122,176,158,160,151,147,141,146,147,158,143,135,135,154,145,181,154,124,113,129,131,70,29,16,32,3,30,21,41,43,117,145,116,96,69,79,50,37,33,27,47,71,73,59,74,78,49,60,30,20,72,113,151,157,221,237,243,241,242,249,255,241,216,216,214,201,222,194,201,190,223,197,210,195,220,194,202,207,214,228,219,88,6,10,20,24,20,13,20,2,9,1,1,4,9,0,9,16,10,16,23,1,24,16,0,27,187,209,193,187,210,189,173,189,201,198,215,188,210,194,205,230,193,185,222,202,187,199,211,212,237,196,214,210,181,193,188,201,198,202,194,214,177,184,210,228,166,189,212,195,221,196,203,191,210,198,226,206,194,212,214,197,209,179,198,199,180,192,189,214,195,196,199,197,189,208,166,180,207,199,216,214,193,198,206,221,214,203,201,198,196,201,210,182,196,186,212,183,221,215,189,218,205,180,199,186,188,216,218,205,201,218,217,188,223,198,214,221,203,222,203,186,187,205,183,197,208,199,190,219,196,219,190,195,190,208,192,205,202,207,192,222,200,233,188,182,215,219,218,204,213,204,233,208,207,208,195,188,202,211,177,192,218,206,212,227,188,214,197,196,202,218,203,200,197,211,182,213,216,199,212,188,197,180,191,182,195,217,179,219,204,217,203,189,200,214,214,209,199,189,231,183,200,180,193,209,221,186,224,200,206,202,202,210,217,207,182,208,199,219,202,202,202,217,207,197,204,199,184,176,196,199,198,195,198,224,215,170,193,211,181,223,200,212,227,198,222,222,201,170,199,208,187,181,195,190,185,195,219,222,205,193,213,209,201,219,184,191,181,208,212,219,197,200,183,217,177,187,197,210,194,184,208,220,198,212,202,199,181,205,208,221,192,196,189,217,209,211,221,190,196,207,205,202,187,209,210,178,221,117,30,32,76,124,102,126,137,118,89,111,102,60,76,42,59,77,107,112,60,93,130,96,76,58,69,15,38,73,104,73,90,92,114,155,213,174,103,215,201,114,103,66,99,112,58,60,42,170,235,208,211,159,197,218,197,253,153,95,208,183,231,207,113,52,45,70,206,244,89,43,61,64,164,249,248,76,9,15,12,35,7,34,22,31,18,11,71,38,18,48,87,54,101,62,83,106,138,161,88,90,70,84,47,39,12,43,65,65,78,42,58,57,46,42,50,33,40,29,29,64,51,135,154,132,78,13,32,8,27,10,14,2,18,28,26,54,53,59,51,61,38,44,32,16,18,79,148,100,158,126,70,25,45,37,12,17,18,46,12,55,24,10,19,24,21,19,30,20,15,1,28,16,27,29,12,17,18,17,25,22,21,17,13,6,11,12,9,41,6,7,25,38,29,10,19,4,14,25,34,38,30,9,17,29,32,90,95,116,106,83,89,41,41,22,16,48,28,42,39,43,26,33,25,8,21,38,12,22,10,26,51,28,42,19,44,155,149,127,96,94,77,37,24,16,22,34,11,36,18,9,33,8,2,39,42,9,16,49,14,33,27,11,11,16,19,22,15,21,50,40,6,22,37,19,3,18,27,17,23,33,19,41,25,10,28,18,6,36,26,31,40,36,48,57,42,66,48,66,35,52,94,102,71,110,92,67,62,71,57,58,80,51,87,65,75,77,70,72,67,77,69,123,93,89,93,84,106,84,76,135,137,83,25,9,12,2,20,7,1,29,27,85,96,98,113,88,61,54,39,55,38,56,65,42,68,42,74,74,68,48,28,39,45,69,84,134,160,178,178,168,217,215,252,245,232,233,219,220,230,174,239,224,229,204,204,198,221,226,209,219,223,109,0,5,21,33,21,17,26,11,38,16,5,11,7,9,3,30,3,2,8,4,28,2,7,37,225,218,180,195,202,198,217,183,186,201,205,181,158,206,195,190,204,185,204,213,207,204,198,191,186,200,194,183,190,182,199,184,200,189,199,211,194,204,204,230,178,216,228,190,222,200,197,190,222,188,199,226,210,216,209,197,213,175,205,206,198,221,195,187,199,199,202,207,202,197,210,194,193,200,181,198,221,197,213,197,208,185,163,210,200,200,193,182,192,219,200,205,205,200,191,208,184,195,205,185,209,202,211,197,209,215,209,187,209,203,214,220,210,191,207,185,217,219,181,220,211,200,203,208,208,203,189,196,205,209,217,205,200,199,204,183,245,207,187,209,204,196,209,207,169,192,206,220,214,199,211,173,217,194,209,210,204,203,210,203,205,213,207,202,181,203,210,214,184,189,203,204,207,224,201,200,206,188,213,219,193,225,192,183,208,205,183,210,208,186,226,191,209,172,209,233,224,216,204,185,201,211,197,206,193,195,217,196,213,198,208,200,218,203,220,191,187,224,202,197,188,216,206,197,229,195,198,207,185,183,207,189,201,206,208,198,205,200,190,222,190,197,170,203,203,193,186,184,206,200,210,194,200,212,211,211,199,224,203,198,202,222,204,206,196,192,202,204,210,202,201,201,215,209,208,205,185,178,215,205,196,224,193,204,196,204,215,197,220,214,212,233,176,203,203,181,194,192,215,217,168,202,253,132,82,72,110,137,121,86,108,110,68,31,6,11,43,62,45,32,131,120,7,62,75,133,111,48,39,2,48,73,57,76,28,44,111,114,198,92,123,220,187,122,90,67,84,85,59,77,30,125,237,209,199,148,190,192,189,227,107,93,207,221,238,149,76,49,23,40,123,140,37,43,45,131,230,251,247,92,62,29,17,27,2,47,22,47,22,19,32,63,57,62,79,107,111,55,49,58,114,142,77,68,78,80,100,62,41,46,43,50,68,39,51,43,30,50,43,27,50,33,59,143,146,162,136,66,34,12,19,7,5,32,28,11,58,48,64,36,73,38,57,43,12,14,26,32,32,105,128,99,137,121,43,15,28,16,14,28,40,16,28,26,16,25,21,16,32,49,18,33,17,20,14,10,38,19,14,30,19,28,39,19,30,13,14,24,9,48,48,8,46,14,26,0,1,23,30,4,18,13,33,49,50,36,20,21,31,135,158,181,130,107,116,79,52,18,31,17,30,42,20,5,27,48,30,32,8,8,15,21,42,17,43,33,36,17,81,157,155,158,118,103,95,70,68,39,19,22,11,20,15,11,23,19,1,0,20,5,2,11,28,14,56,35,18,11,47,61,40,52,77,58,33,42,37,20,6,9,25,44,40,35,28,47,27,40,35,12,26,14,36,25,29,47,5,18,28,45,22,34,3,48,49,34,48,76,68,52,33,60,70,70,75,42,86,57,57,75,51,94,67,78,63,69,63,84,60,73,71,43,92,96,113,93,39,31,28,8,31,15,42,24,33,35,62,60,120,130,82,53,76,53,61,23,59,51,40,52,15,40,50,54,48,70,74,46,47,50,67,85,84,117,138,128,220,228,244,232,224,234,219,205,217,224,219,208,216,220,230,227,231,208,245,106,15,4,26,25,33,16,21,21,2,5,12,5,12,0,8,13,11,16,14,2,33,17,26,12,219,158,192,210,190,208,219,188,173,203,204,202,188,201,206,212,211,167,176,204,218,210,175,188,231,209,177,207,195,207,201,207,202,195,196,201,226,197,200,201,223,202,205,203,224,194,211,214,205,220,225,219,197,215,187,207,192,178,226,208,211,211,191,226,207,211,194,198,202,197,183,197,220,200,187,189,179,197,213,197,199,201,194,199,200,200,197,189,206,212,209,208,212,206,213,207,198,219,187,216,196,187,205,170,214,211,203,195,189,202,215,195,194,201,214,200,206,186,200,214,199,205,177,203,195,203,180,196,192,196,215,187,212,192,191,223,223,198,212,197,209,156,198,192,202,210,180,207,195,208,207,222,210,207,209,212,202,209,190,219,202,199,230,197,223,205,192,213,178,203,189,204,203,175,221,196,190,212,198,205,192,199,204,199,212,196,210,203,193,201,211,185,192,197,219,199,203,212,180,196,220,230,191,218,189,198,190,206,210,207,184,203,207,166,192,208,203,215,205,214,191,192,192,192,199,210,213,188,203,209,204,180,207,185,193,199,200,205,193,204,197,192,205,205,209,208,213,213,205,195,198,191,188,198,199,182,194,210,187,191,181,216,202,200,206,207,195,219,169,180,199,202,195,203,196,216,192,200,182,197,168,185,210,192,184,189,207,217,191,185,192,219,209,206,202,171,201,210,209,188,156,194,235,182,117,94,56,92,60,85,78,104,41,36,40,38,24,18,43,61,106,88,59,81,136,148,114,52,29,16,23,59,36,14,37,46,105,168,192,119,131,162,161,115,96,62,89,68,40,52,10,146,242,237,186,124,214,238,214,206,66,107,216,211,167,65,47,41,16,40,42,48,35,47,142,209,247,218,163,189,224,192,116,23,3,29,28,21,34,57,42,60,79,94,79,78,79,92,84,103,76,137,107,56,32,46,55,56,72,41,41,46,41,40,27,53,70,41,37,43,35,114,158,185,122,84,81,26,18,13,13,5,39,35,38,76,75,65,30,47,75,63,56,22,15,6,5,15,43,137,131,104,130,118,57,40,15,20,14,20,0,16,34,5,4,18,14,7,21,30,7,25,9,56,48,6,31,10,28,3,40,46,14,23,12,27,16,17,43,10,9,10,25,14,31,3,3,14,25,8,37,28,18,33,82,43,25,15,37,153,174,140,120,106,111,59,47,30,41,31,16,9,11,41,38,38,37,13,36,5,32,46,53,41,24,34,33,18,69,112,120,120,106,41,81,52,15,34,16,6,36,24,6,11,36,16,32,21,12,29,5,10,26,40,67,24,12,30,46,107,96,116,108,96,95,38,44,9,40,43,49,43,27,7,35,41,22,27,29,20,23,11,26,30,10,19,18,17,16,28,20,38,21,23,28,12,19,30,29,26,14,54,3,28,28,35,46,48,63,46,59,49,56,70,44,75,80,68,73,73,60,87,114,107,97,102,54,11,7,18,40,16,3,10,23,7,42,39,47,83,91,62,55,56,54,43,22,43,54,52,44,33,39,61,63,72,98,82,85,70,56,62,65,52,62,69,96,139,234,226,246,227,223,234,212,225,226,218,211,222,226,229,241,238,228,123,12,7,5,21,4,39,24,34,18,6,12,11,8,0,3,0,20,4,8,30,14,7,5,22,202,189,192,197,204,197,203,193,201,194,197,187,219,190,218,197,202,204,198,202,234,231,204,203,218,201,207,197,170,212,212,194,207,198,202,199,220,215,203,216,180,175,209,201,212,226,198,212,202,191,201,187,207,197,211,155,194,181,242,217,214,212,228,218,207,200,213,205,187,197,210,213,197,214,190,224,190,220,199,205,212,228,198,218,220,208,217,194,228,217,189,225,196,182,216,208,215,184,223,225,184,212,200,185,191,215,213,209,218,191,175,175,199,197,207,186,166,200,198,221,201,192,196,183,213,186,222,201,213,206,203,207,188,187,196,184,200,210,203,204,206,191,215,199,203,188,189,183,172,207,208,195,200,185,203,193,205,183,221,186,205,207,211,221,204,196,216,213,201,194,206,202,185,208,216,209,218,183,209,186,205,191,220,198,196,191,177,183,193,195,203,207,196,199,202,196,193,168,185,205,176,213,207,205,180,203,213,213,197,182,195,217,215,211,195,189,212,187,192,206,210,183,186,208,206,204,208,191,217,196,200,202,193,199,211,208,201,216,195,190,183,206,195,205,199,219,196,182,193,216,209,215,188,166,212,194,203,207,196,187,180,211,188,221,194,225,199,205,197,212,218,230,183,201,205,185,164,198,182,202,184,185,199,219,206,188,189,187,175,217,209,201,191,190,204,189,213,184,203,202,174,173,218,193,157,103,65,86,53,95,61,48,17,15,23,19,29,15,52,50,94,89,44,90,163,194,128,75,59,10,92,53,18,37,93,152,161,172,167,127,125,139,160,164,94,95,55,76,78,66,30,150,233,189,211,147,232,188,219,205,78,159,246,162,88,21,5,23,37,21,22,31,18,148,234,245,222,126,176,248,240,248,222,61,0,16,39,58,62,31,40,68,95,76,79,84,104,89,115,94,76,124,105,35,41,37,44,47,68,41,27,39,45,58,44,46,48,25,27,49,114,174,188,128,39,54,33,7,15,4,9,23,37,68,47,38,29,52,57,43,81,54,66,12,46,15,27,21,56,175,105,92,154,82,29,16,8,4,39,8,51,13,49,14,19,21,15,32,11,22,9,7,17,6,4,19,28,29,23,41,44,58,62,69,55,30,37,13,22,24,26,43,27,19,16,28,19,16,30,10,13,13,20,45,32,62,10,0,39,145,110,129,87,117,67,57,58,32,20,27,41,21,36,34,22,45,47,40,18,25,9,37,54,16,27,70,41,12,70,104,73,128,92,59,69,15,43,2,29,31,14,13,15,33,26,32,10,8,17,15,2,16,31,28,41,38,32,12,123,168,148,140,108,109,87,58,44,34,37,23,33,30,49,10,18,11,29,42,22,31,33,27,37,28,28,22,17,29,34,19,41,22,20,25,4,18,13,25,5,33,21,23,41,26,33,26,8,30,12,26,25,39,19,36,23,59,21,36,41,52,51,65,120,119,115,80,12,14,17,42,54,33,54,29,22,23,32,23,16,12,51,62,52,71,27,50,19,59,26,7,14,41,44,25,63,27,58,88,55,87,72,71,54,43,59,35,87,60,133,180,224,240,219,209,185,202,246,208,227,217,213,209,229,221,221,126,15,9,14,23,11,2,13,38,6,19,5,8,1,13,10,0,8,21,13,5,32,8,6,22,209,183,189,219,154,199,190,213,205,186,188,195,207,215,197,194,209,215,202,216,196,182,195,219,208,215,200,206,210,210,197,159,184,170,193,185,214,198,196,200,185,201,234,203,207,211,203,204,218,215,209,183,221,195,217,219,223,200,220,205,203,208,197,215,220,209,190,234,211,204,200,185,172,236,199,213,215,193,196,219,214,217,226,207,211,171,201,203,192,207,220,235,193,209,204,210,200,208,181,205,187,185,196,204,212,192,216,199,199,201,208,186,203,174,186,203,195,217,194,216,198,215,212,204,190,191,193,194,196,193,183,188,212,197,216,201,184,209,204,202,220,204,191,211,185,191,203,194,227,184,197,215,203,197,216,190,195,217,182,179,204,216,195,168,178,194,213,197,192,213,193,206,230,214,206,199,197,200,196,195,186,194,197,203,197,205,185,190,196,204,164,198,215,198,207,186,189,190,230,191,212,194,187,194,235,205,230,205,204,201,186,184,208,189,204,210,182,215,217,198,213,207,205,189,206,175,189,210,203,212,192,202,205,180,185,205,200,187,223,197,194,198,195,204,218,225,219,200,207,198,205,194,197,192,196,194,213,220,209,209,199,211,190,184,209,204,215,196,202,186,191,202,213,191,209,187,191,209,194,185,199,196,188,208,209,189,220,196,199,207,225,165,188,205,190,177,209,212,202,210,184,223,238,210,182,135,100,103,88,65,68,48,2,26,52,74,30,17,64,43,70,74,0,80,127,135,96,79,38,16,66,38,73,55,173,242,211,198,163,112,157,170,227,182,76,133,90,90,85,51,4,173,232,208,182,150,205,187,234,153,61,218,184,73,9,9,28,69,21,9,46,62,103,193,237,224,119,136,214,250,237,245,146,59,69,129,115,111,49,38,53,74,83,74,113,96,106,102,98,111,88,100,110,78,72,60,81,61,62,53,58,32,21,49,43,39,27,35,51,104,146,155,81,27,18,15,16,8,13,57,42,70,63,44,55,80,60,73,43,66,53,45,34,11,12,17,23,18,48,138,107,94,152,85,19,8,26,29,20,49,24,31,41,37,31,35,10,18,30,25,16,33,36,19,26,37,26,2,22,14,80,57,88,79,101,81,52,31,35,30,27,35,28,27,13,11,24,29,38,28,9,20,24,33,53,51,13,15,64,109,92,86,86,96,86,36,43,36,49,41,46,30,16,28,14,28,22,34,19,32,22,22,19,25,31,63,45,14,41,92,92,103,125,87,74,46,28,17,19,7,7,9,23,29,39,24,15,21,25,25,21,29,38,38,34,63,47,77,151,173,158,136,83,121,94,34,53,28,35,16,27,56,8,22,16,17,41,57,31,23,15,20,17,37,26,15,26,54,21,40,23,31,31,28,44,5,28,27,14,31,14,30,27,31,52,16,8,45,25,20,31,16,49,21,20,53,35,47,14,21,15,36,92,105,97,115,73,42,25,5,27,28,74,66,32,22,27,21,16,12,37,30,14,20,41,51,53,34,31,28,70,13,20,45,48,47,81,64,79,59,82,45,54,39,92,64,74,35,48,51,101,184,197,223,224,219,201,245,235,217,212,215,212,224,228,98,10,5,6,26,6,10,10,23,19,13,23,30,7,0,9,4,26,5,47,31,29,10,22,28,191,200,186,195,191,185,167,195,214,216,171,199,211,214,204,176,212,196,193,202,198,174,205,202,179,227,217,209,190,206,196,208,216,204,211,188,194,200,207,202,198,200,196,205,187,206,173,200,202,197,199,216,211,197,192,214,206,227,227,187,203,218,204,214,210,194,215,201,223,209,199,202,222,204,202,189,188,224,210,196,225,205,214,205,206,218,198,193,215,214,203,199,227,200,193,185,189,206,203,228,225,199,191,199,218,194,207,210,213,183,202,192,192,204,193,184,208,204,194,205,187,184,193,234,181,199,199,207,225,200,211,217,214,181,200,163,218,198,193,180,218,202,201,207,175,212,190,208,199,209,193,197,197,207,198,185,204,201,214,210,207,203,210,207,200,234,202,207,188,188,179,201,172,171,208,197,193,192,180,187,202,184,191,193,205,199,199,204,192,177,203,173,194,168,200,201,209,197,205,214,207,216,200,181,194,204,194,196,182,193,188,212,217,201,194,206,200,195,166,212,185,194,195,206,202,201,212,192,230,199,201,195,197,200,198,195,183,185,184,183,191,185,203,210,208,183,202,220,165,206,212,195,156,209,208,208,208,174,210,198,212,200,194,217,188,178,209,202,200,177,200,202,222,213,203,174,176,203,205,197,201,197,229,221,193,196,192,206,193,187,220,203,211,195,200,200,211,189,191,201,179,199,191,173,144,117,173,131,152,126,143,95,53,83,93,126,95,66,76,51,68,59,18,63,152,138,100,136,82,40,67,124,175,171,246,190,185,132,155,146,139,196,220,137,86,137,98,98,69,42,9,164,188,214,184,149,199,184,247,137,63,187,85,17,8,85,177,112,66,40,38,125,204,233,218,156,161,208,249,223,214,155,75,125,132,113,128,102,44,88,124,67,18,58,115,105,102,101,104,119,82,100,64,73,46,46,48,67,32,36,29,22,9,11,12,18,24,104,157,153,123,89,26,9,15,6,10,19,17,45,47,67,65,52,68,61,60,75,75,70,32,22,15,33,20,13,38,2,95,147,105,126,128,59,24,28,5,38,36,16,30,23,14,27,12,16,7,48,24,24,19,30,19,22,16,13,23,55,22,62,100,79,82,112,67,54,27,34,34,26,25,4,32,22,20,17,42,8,20,10,30,61,40,48,32,41,46,10,41,74,57,86,110,158,77,15,49,27,32,14,36,29,33,42,3,10,24,19,34,38,22,26,1,27,15,60,50,10,49,59,50,91,80,86,61,44,33,28,24,20,26,50,36,11,33,12,14,22,21,30,48,19,18,12,68,52,31,99,160,136,142,124,114,73,65,41,39,28,17,14,31,4,23,31,42,32,18,9,22,36,11,15,50,41,32,5,23,67,42,48,41,28,16,12,31,9,20,24,32,29,27,33,35,18,34,32,23,13,24,29,22,39,50,3,38,49,12,5,32,21,54,56,120,121,104,115,67,33,14,41,28,36,72,65,74,71,48,37,9,24,28,36,33,22,20,30,24,41,65,64,55,41,51,31,39,38,34,79,19,73,26,50,48,49,63,55,64,3,49,47,33,177,196,236,227,233,225,238,225,231,220,232,209,217,224,126,13,20,24,5,6,1,31,0,19,40,6,17,2,8,24,13,7,2,22,19,0,5,21,3,199,184,174,206,204,198,203,206,193,223,223,192,201,186,179,211,177,212,197,203,185,179,207,179,194,197,203,205,228,180,171,182,187,206,186,212,195,166,195,215,199,179,223,189,222,209,201,211,227,195,198,208,222,202,222,186,221,183,207,209,184,198,209,215,199,210,198,208,207,195,191,198,210,230,197,207,192,215,200,220,193,187,208,203,205,190,200,215,202,207,195,195,214,196,180,206,193,216,199,190,213,212,199,191,217,193,194,193,215,203,200,192,200,204,191,195,212,223,205,201,206,201,204,216,196,199,191,196,204,178,208,208,187,193,193,202,191,210,202,213,198,207,221,197,187,209,213,192,199,199,193,200,181,210,200,196,201,198,188,202,190,211,208,197,211,204,188,183,194,187,206,212,193,201,199,180,187,211,213,211,186,200,206,195,181,198,201,186,202,173,188,187,190,204,196,200,196,191,197,192,205,189,205,196,217,200,195,192,180,183,194,203,221,214,200,203,205,171,205,165,225,196,180,195,197,199,219,199,184,203,199,187,209,197,198,186,195,192,202,192,203,198,188,221,190,191,204,203,193,192,177,211,200,205,196,222,203,189,193,178,217,223,204,198,212,186,198,217,168,190,201,194,214,197,206,184,204,186,212,167,170,195,186,200,204,199,190,198,198,195,171,190,192,186,177,187,192,189,224,198,181,218,199,157,110,156,180,197,216,186,165,205,137,107,186,186,173,93,119,36,144,92,5,114,172,138,173,243,163,67,57,185,236,234,211,187,140,136,188,155,128,177,179,100,95,121,99,86,60,23,22,136,236,214,186,132,233,223,239,127,61,138,24,20,29,125,230,155,106,39,97,214,242,246,165,179,218,245,252,175,120,47,72,146,89,119,84,61,72,134,88,47,49,83,90,72,70,63,70,100,85,83,95,75,72,77,70,48,34,35,23,25,21,34,11,18,60,135,154,159,107,25,25,7,33,15,41,38,34,49,60,58,80,72,49,63,73,63,86,89,80,71,29,47,23,31,18,42,142,167,105,103,139,63,34,37,40,64,20,16,55,9,19,21,30,30,45,19,13,15,24,6,31,23,51,39,39,36,33,131,135,142,108,65,58,54,56,20,29,0,10,19,28,33,40,23,53,24,35,12,29,37,32,30,5,59,61,57,37,66,80,83,170,108,34,41,48,7,4,37,28,19,35,31,35,6,15,31,54,28,49,4,26,19,33,92,52,40,59,59,53,42,68,52,13,47,14,19,33,1,3,51,58,45,4,24,15,42,29,32,1,20,11,31,59,81,37,91,141,116,107,93,81,96,72,23,7,12,23,48,15,31,2,5,40,34,44,29,23,22,33,17,25,36,69,28,15,38,70,50,75,68,49,66,68,36,47,44,16,29,24,29,25,17,27,24,20,39,12,30,39,33,7,20,22,7,29,30,11,27,31,84,125,112,106,118,49,28,9,16,15,49,60,68,83,59,65,65,24,41,22,31,36,28,0,18,6,38,46,53,47,79,75,46,23,27,28,66,70,49,54,64,50,50,22,40,47,44,73,49,78,151,159,224,223,221,228,226,224,236,226,228,233,210,200,111,16,0,19,18,20,29,23,15,19,10,15,12,15,5,0,18,23,26,17,8,24,21,2,5,214,214,202,218,211,205,227,210,211,210,207,194,192,194,208,186,197,213,215,195,193,214,193,195,195,201,206,224,226,208,205,181,222,195,182,197,216,187,192,219,207,194,180,208,212,216,205,204,208,221,209,206,201,219,208,202,205,227,196,213,200,213,227,214,214,201,190,225,201,198,206,194,181,186,192,205,198,194,181,197,216,224,199,224,234,199,203,212,185,224,217,204,193,178,208,187,223,192,208,207,192,191,206,218,180,192,213,191,199,217,204,221,218,207,210,222,189,191,207,180,203,205,172,202,201,185,205,190,188,207,215,196,221,204,186,182,207,209,205,198,204,193,215,198,194,179,197,178,200,215,185,211,198,199,205,194,213,208,234,187,205,189,188,187,193,182,204,194,223,199,198,199,185,211,208,200,197,208,192,187,208,179,192,222,189,197,208,192,202,182,221,203,203,198,229,202,206,203,195,202,205,190,198,196,215,178,222,202,212,187,174,179,199,209,196,194,219,203,185,217,194,218,199,190,181,194,202,185,163,194,209,206,189,197,216,215,189,203,209,208,165,174,204,203,205,204,202,202,198,193,220,209,202,204,193,196,198,188,192,192,203,165,196,170,202,193,183,175,189,175,196,187,195,210,202,192,171,178,195,201,188,186,187,187,214,185,214,175,196,196,187,195,176,193,212,207,186,199,212,159,188,193,170,160,155,176,187,180,175,147,191,208,149,85,141,241,210,131,134,36,115,136,43,170,172,159,180,211,119,107,123,158,166,182,150,123,174,171,195,170,107,189,172,126,101,101,73,110,58,64,3,143,217,215,176,179,253,190,196,157,50,37,16,8,47,157,132,77,76,62,191,253,246,207,160,218,254,249,246,116,29,60,118,131,138,68,35,71,121,153,77,88,95,99,98,34,117,124,101,76,94,87,117,104,96,78,68,58,52,39,51,28,23,11,13,50,107,157,134,86,46,45,8,35,22,13,9,37,65,77,79,53,62,52,64,69,7,26,66,60,77,93,75,30,30,6,22,39,160,139,84,129,102,35,33,34,97,71,47,43,18,43,25,11,33,5,40,22,53,23,5,11,5,19,43,35,44,38,47,96,146,125,133,82,100,59,36,40,1,18,15,26,31,16,30,19,12,22,26,36,22,16,6,17,28,52,48,32,30,61,75,74,56,62,53,36,39,41,28,25,48,45,7,45,31,43,20,41,17,37,13,16,20,47,48,84,74,46,60,57,35,34,33,30,36,40,35,50,25,12,42,10,34,20,12,21,30,66,26,8,24,21,14,18,64,122,37,92,102,116,73,60,147,121,43,36,29,26,28,7,8,20,44,36,22,22,18,12,0,53,49,15,35,53,53,42,37,46,89,105,122,103,166,91,34,45,16,42,38,8,36,11,19,17,41,10,44,33,38,17,24,52,21,19,46,27,10,44,18,48,9,90,104,99,110,108,65,42,33,22,33,53,96,94,95,81,79,83,61,45,28,20,43,25,36,29,15,22,51,27,26,53,83,61,73,18,27,44,57,31,46,55,33,40,58,56,67,75,45,43,91,195,218,201,205,228,238,213,224,230,227,231,232,201,223,95,16,7,8,1,1,20,0,27,2,11,12,26,8,2,14,2,2,1,12,5,26,7,5,16,195,201,196,197,170,182,193,197,225,216,208,205,210,202,199,201,220,187,181,200,181,203,223,210,195,180,195,191,192,185,213,201,196,195,200,201,194,192,199,211,199,193,189,199,216,194,196,209,196,215,217,190,181,190,209,199,219,204,208,215,226,199,217,222,219,212,187,206,215,208,208,196,220,201,203,196,219,224,219,210,193,189,207,214,208,209,208,210,218,197,201,228,189,176,202,218,216,211,192,210,189,164,216,196,202,210,184,199,208,223,183,195,202,203,209,207,199,186,201,203,203,206,180,202,182,216,191,208,200,203,204,197,216,207,206,205,207,172,209,188,187,215,210,196,184,192,184,202,169,202,203,217,211,211,184,212,203,187,190,183,203,198,184,235,192,179,206,193,209,204,210,199,200,223,201,209,167,184,187,184,196,183,185,205,199,188,193,199,202,173,225,195,201,183,212,222,234,197,193,185,193,189,192,193,198,198,198,169,175,175,187,197,202,185,204,217,203,191,194,179,200,184,199,200,209,223,212,173,196,183,216,211,188,212,194,185,214,207,184,195,207,197,201,185,179,212,170,202,191,176,187,180,202,183,232,186,179,177,206,186,204,220,212,198,220,192,236,221,181,199,186,190,195,191,209,209,169,182,212,170,177,172,188,189,174,179,192,188,207,203,189,195,210,189,210,177,180,196,193,176,188,186,181,209,187,192,164,149,165,151,195,228,212,122,73,168,166,136,94,67,139,87,10,103,187,130,137,126,92,148,201,190,140,131,143,163,176,165,210,106,168,208,194,132,72,91,80,95,56,45,36,160,224,200,188,138,183,99,133,104,56,11,4,21,36,47,37,52,92,176,239,212,218,180,205,248,241,228,189,42,41,75,149,165,90,13,67,115,128,92,46,70,70,157,77,122,165,170,128,112,104,109,108,99,107,95,102,89,63,56,43,49,14,34,30,92,159,134,67,21,20,47,16,23,20,22,27,44,75,96,66,86,58,74,86,93,54,26,30,38,39,40,111,78,28,45,5,60,164,127,113,141,79,52,10,99,107,40,59,38,41,12,25,24,40,5,23,15,32,29,30,32,31,32,40,45,41,36,49,151,137,123,123,94,107,49,49,39,32,25,13,7,46,39,25,8,17,28,16,22,55,38,12,18,23,67,95,36,35,70,31,44,68,26,12,29,47,46,22,27,25,35,8,16,40,28,29,23,11,4,19,36,45,34,58,102,54,17,73,36,45,63,8,28,36,22,53,9,42,26,26,24,30,9,11,39,12,22,44,18,30,24,29,38,83,99,29,58,78,104,89,137,187,91,29,58,26,9,14,20,35,31,18,51,43,17,17,18,47,21,31,38,44,49,57,46,43,74,100,158,130,102,119,85,39,29,38,13,21,21,38,27,20,21,47,53,1,18,48,8,22,46,18,24,34,24,17,11,38,61,21,93,123,112,125,108,92,32,12,10,21,29,51,74,111,87,88,80,81,82,80,42,38,47,19,34,11,14,20,38,51,44,84,83,67,49,37,42,31,22,30,42,62,61,81,93,44,49,62,49,156,232,211,233,219,214,212,223,238,232,227,232,231,198,232,105,12,0,7,2,9,11,28,18,10,31,21,41,36,4,5,15,13,14,3,20,13,24,9,12,212,189,218,195,218,212,177,207,205,199,221,179,211,207,177,218,206,200,216,205,184,193,213,213,201,196,208,217,197,206,220,204,190,210,191,209,205,197,175,206,190,193,223,188,212,205,207,197,209,210,198,195,204,185,231,181,199,187,217,173,180,208,220,208,225,230,208,197,233,195,216,197,215,182,184,209,192,219,163,221,195,211,199,191,205,204,215,199,197,183,191,185,217,201,206,186,204,215,202,202,194,201,194,201,187,197,210,198,182,210,209,201,207,212,193,193,198,197,185,199,199,203,187,208,214,203,200,195,190,202,190,194,181,185,219,210,203,182,189,213,198,197,187,182,200,210,173,195,196,205,192,198,188,202,200,211,200,207,211,189,235,195,192,215,199,219,212,195,218,212,201,190,210,214,213,207,205,197,221,189,221,183,171,211,209,210,181,180,199,178,181,200,200,186,195,188,208,178,204,196,162,206,218,194,177,200,199,181,192,188,204,201,199,185,185,204,191,185,185,186,189,198,206,184,213,243,196,195,212,208,186,207,191,172,211,185,204,185,187,207,190,208,206,199,210,200,198,180,184,196,194,197,191,204,212,196,199,214,185,183,197,224,190,184,170,217,199,204,201,177,182,201,193,214,201,205,189,194,195,199,177,158,199,197,189,165,194,177,195,200,211,209,198,194,193,197,198,202,206,178,192,181,127,114,145,150,147,147,155,150,212,220,195,190,132,34,100,118,102,73,99,82,8,86,162,138,80,80,159,170,215,174,128,174,169,182,157,138,191,144,161,213,193,162,81,103,76,78,57,23,27,145,209,197,178,87,140,115,55,29,8,30,34,62,72,57,35,54,206,243,227,189,179,196,253,238,226,230,65,19,44,123,128,118,54,49,88,109,89,51,49,25,71,117,150,171,222,158,116,119,113,111,105,82,64,82,83,94,56,35,34,16,52,27,73,145,142,101,44,5,34,6,1,45,54,4,35,39,52,90,64,55,58,81,106,56,39,19,30,25,52,59,64,119,54,33,30,98,164,111,96,119,45,42,45,90,87,65,64,32,22,12,7,26,34,37,52,2,10,0,28,21,43,41,14,12,67,63,58,110,108,78,122,95,75,30,55,26,30,16,7,27,20,28,23,24,18,19,35,20,11,46,49,15,11,60,82,55,30,53,18,41,14,44,19,48,46,48,41,33,44,27,17,13,40,66,53,46,30,8,25,47,50,54,83,115,82,19,39,46,41,20,18,28,22,43,34,26,37,39,15,37,21,17,31,29,30,13,4,31,22,78,61,39,115,114,38,57,96,89,89,68,82,43,43,42,31,10,7,20,37,51,23,25,22,20,10,40,29,19,16,7,35,40,85,60,39,94,141,162,169,85,108,104,51,27,39,20,14,26,32,6,31,27,26,20,32,32,31,17,34,41,19,12,27,58,39,25,50,73,26,116,127,101,108,108,96,48,16,10,37,37,25,64,91,111,107,103,65,78,82,73,77,51,12,17,42,17,3,15,25,30,25,65,61,72,62,41,66,19,62,56,58,64,57,69,84,39,52,139,229,242,218,243,247,228,208,228,223,223,234,208,233,226,226,97,4,7,13,19,5,17,24,31,19,20,12,14,5,3,10,11,27,4,3,7,2,10,24,5,208,213,197,197,200,184,208,222,184,204,211,212,224,189,166,195,224,204,197,183,201,207,224,201,226,207,197,210,205,193,198,173,191,196,208,205,201,207,216,190,198,215,202,185,221,224,212,210,220,208,192,228,210,199,212,206,205,222,200,202,227,199,204,205,191,204,227,229,240,201,228,224,233,192,203,209,208,202,208,223,206,222,208,177,191,191,206,177,221,223,188,197,195,191,211,198,201,199,221,192,201,200,191,175,201,197,201,192,187,206,194,184,226,187,184,175,217,199,197,165,216,200,191,177,216,202,191,219,195,199,197,203,203,191,212,195,174,207,185,197,213,179,200,188,211,189,214,218,210,200,231,188,187,184,183,193,204,202,206,219,196,208,201,183,236,190,195,195,192,195,185,192,197,187,191,184,192,191,197,191,195,204,193,174,200,188,201,183,204,207,213,184,219,198,217,190,196,209,191,175,191,200,205,186,198,185,198,210,201,167,187,186,172,199,188,164,206,200,183,202,188,194,185,211,184,192,199,206,196,204,191,196,198,215,204,198,195,194,215,176,188,200,188,205,218,235,173,192,194,211,204,202,179,185,183,187,215,207,192,171,210,194,188,211,200,184,201,190,200,194,195,192,177,197,192,185,196,186,183,185,192,160,188,225,193,215,202,211,228,196,185,206,202,200,189,207,205,176,185,163,195,169,115,103,130,131,127,152,143,195,200,197,233,220,159,99,5,79,100,77,77,79,9,76,102,88,63,99,205,205,196,136,156,202,161,122,170,171,149,114,161,161,191,149,62,119,90,76,46,22,16,164,222,210,185,95,120,119,39,35,0,104,192,72,75,90,61,105,241,233,203,162,212,250,245,246,220,116,42,50,111,86,90,52,84,106,131,73,66,60,50,8,42,97,152,217,216,123,118,108,92,104,73,76,33,72,79,53,66,59,31,23,14,36,73,162,130,85,6,17,32,0,17,47,78,31,39,57,71,67,62,101,101,109,66,25,19,29,11,3,37,34,87,124,59,37,5,123,143,93,115,91,38,37,96,125,101,106,68,27,36,34,17,5,12,10,6,9,5,12,2,26,10,1,14,15,53,68,36,78,71,65,143,111,59,25,35,14,33,3,18,18,22,44,20,7,42,36,15,24,16,39,52,37,21,67,72,39,47,42,22,54,19,16,18,33,53,44,37,18,37,35,15,31,37,35,37,70,43,0,64,46,44,37,76,141,46,30,60,30,34,53,40,40,36,45,28,16,14,4,22,26,14,23,49,20,29,46,31,31,42,51,61,26,94,93,56,65,84,44,23,19,3,28,39,39,17,9,28,19,38,47,29,17,2,17,36,51,13,25,15,27,32,55,80,68,84,124,136,130,128,100,109,114,51,32,31,15,33,11,32,26,63,32,20,37,6,19,32,37,15,38,42,33,44,54,75,81,131,120,93,144,149,107,98,116,111,42,57,48,28,20,43,87,117,121,135,120,81,90,104,80,75,47,69,38,1,27,11,35,42,33,21,32,46,77,61,67,70,41,77,58,77,67,32,43,66,43,70,212,238,234,220,231,233,219,222,225,200,215,218,207,215,217,222,117,22,4,8,32,18,12,41,17,12,18,9,38,3,8,0,16,13,1,24,4,26,17,15,17,200,215,184,191,180,188,205,199,198,177,185,192,218,201,186,218,202,187,204,200,188,207,222,220,202,185,197,216,193,194,198,195,211,215,223,186,212,208,190,182,190,222,230,210,190,193,215,214,209,210,198,234,199,213,212,215,224,201,208,201,194,189,194,207,208,188,208,212,218,193,212,190,195,198,222,209,191,200,223,202,205,201,197,208,219,227,183,178,199,206,191,215,229,200,182,199,187,212,212,206,189,197,214,208,208,198,216,214,192,188,196,223,220,189,221,206,175,200,183,207,214,201,220,200,197,229,207,209,200,196,192,198,169,205,207,200,203,208,207,226,190,175,198,187,210,207,189,201,193,188,217,197,194,212,201,209,184,173,201,191,203,198,202,179,214,180,203,200,210,194,199,194,213,197,196,208,214,196,213,200,207,183,207,202,181,199,184,201,198,212,187,208,182,185,202,202,191,205,186,193,180,198,172,196,186,200,196,182,180,185,170,191,194,217,215,197,188,201,207,180,194,184,196,201,204,180,184,184,193,191,193,198,207,199,208,187,177,200,212,189,206,217,188,200,181,181,220,165,193,202,206,191,218,202,186,197,209,202,200,192,195,182,200,181,205,210,182,188,172,196,205,217,194,186,195,184,165,166,193,193,188,180,193,212,187,191,193,172,196,181,198,194,187,184,165,198,217,202,187,164,168,150,159,160,157,188,169,156,176,197,212,174,216,234,239,135,34,15,90,77,101,82,60,59,74,117,126,148,148,169,208,146,192,184,124,126,164,216,152,106,144,111,191,149,80,108,85,72,54,44,48,167,182,176,146,94,105,88,42,48,8,183,137,84,72,45,150,136,246,205,126,200,241,248,250,219,106,49,91,86,107,47,53,88,129,154,114,80,79,26,36,19,43,101,131,140,141,70,90,109,82,92,88,68,62,67,54,69,68,53,45,37,42,34,119,158,121,45,10,20,4,33,21,67,78,34,28,72,80,108,106,144,147,135,54,31,21,21,12,40,26,70,99,103,48,19,43,121,146,103,100,75,45,45,80,124,76,49,55,40,29,28,24,21,2,44,38,15,7,38,40,9,18,34,18,10,80,75,18,68,42,66,155,105,68,50,51,21,19,38,24,16,39,11,34,51,44,47,12,22,24,28,57,35,22,77,101,29,31,51,34,54,36,26,19,34,43,26,13,52,6,44,52,23,26,5,10,42,29,75,48,23,19,44,100,118,52,38,88,83,83,61,79,61,85,78,63,22,23,5,49,57,47,34,10,6,24,44,22,48,26,30,53,50,120,94,35,49,63,31,19,44,29,29,46,32,34,14,33,60,25,31,20,16,33,35,34,20,29,48,15,43,60,86,111,48,94,144,119,100,78,100,118,78,42,33,16,15,27,22,10,37,33,24,25,14,20,33,36,57,34,34,27,59,77,88,102,129,186,154,130,135,112,131,120,132,121,52,12,14,39,50,50,71,71,113,92,86,95,105,73,90,107,69,69,41,13,39,34,38,31,44,36,19,47,37,69,79,40,71,79,57,47,46,47,23,42,23,100,198,245,235,238,214,243,226,237,224,222,229,245,220,232,221,243,108,7,11,10,21,7,2,12,6,11,10,7,13,6,3,15,19,2,0,16,12,0,8,10,0,190,221,201,216,215,202,195,229,194,207,186,200,195,190,203,201,195,221,189,177,190,197,189,228,196,213,209,217,203,197,182,190,201,211,208,198,204,229,209,192,216,203,204,192,207,193,196,204,194,202,199,202,215,214,195,203,204,201,217,197,212,166,207,222,227,215,218,207,229,203,207,220,196,201,197,194,213,202,230,219,192,181,206,198,175,188,225,200,198,176,212,231,206,221,208,204,188,168,211,181,195,190,211,194,200,223,199,201,219,201,195,208,181,174,194,212,199,198,205,193,193,206,192,193,201,203,189,174,193,178,179,195,181,197,187,201,197,193,184,192,178,180,211,224,193,188,195,209,170,202,192,219,216,200,210,201,209,211,183,175,208,189,216,204,205,201,204,190,189,201,194,190,196,213,210,213,189,192,203,204,188,202,210,191,200,200,177,196,201,209,190,205,196,176,169,185,186,192,194,186,181,192,178,211,204,202,198,204,180,180,200,212,184,187,190,177,193,196,185,206,196,190,167,212,171,190,201,208,164,208,180,175,196,190,190,204,195,217,190,204,189,207,198,182,189,184,179,203,182,209,206,217,193,196,199,174,186,199,192,201,171,193,198,191,182,212,181,207,196,210,199,197,186,198,206,197,196,181,184,204,197,174,176,197,179,191,194,181,211,195,201,183,167,179,194,201,196,210,187,146,159,158,167,161,168,146,177,151,206,187,176,215,182,219,207,212,119,34,46,60,86,112,60,100,116,153,130,138,143,189,165,158,146,147,131,151,184,196,130,97,168,161,214,128,100,131,74,91,70,16,29,133,117,72,149,91,69,50,13,23,20,83,81,79,62,138,217,185,201,202,167,248,242,248,219,90,57,71,117,116,48,49,57,117,147,124,96,97,22,9,3,40,63,89,72,43,35,34,27,78,111,129,121,121,108,66,63,70,83,40,26,22,35,85,129,119,64,23,24,9,25,38,69,41,66,76,56,59,99,117,125,114,84,76,45,14,43,44,41,26,11,75,147,115,16,14,23,142,144,84,121,60,23,28,61,112,82,39,15,29,16,23,15,27,8,57,26,34,41,35,19,10,40,31,26,46,77,86,21,63,50,52,72,25,13,32,32,47,27,6,21,25,28,28,31,14,26,45,35,10,21,55,30,28,38,61,105,38,77,83,40,63,65,12,70,74,75,11,22,25,33,35,35,37,47,28,17,49,51,61,51,27,24,29,113,188,44,36,125,118,167,178,120,190,130,76,38,25,21,45,19,50,60,44,24,34,7,64,32,47,34,14,36,70,131,95,66,51,67,12,22,26,31,13,28,36,30,26,58,8,19,8,20,14,41,46,31,12,21,47,17,64,27,52,91,59,40,76,82,89,82,108,86,66,46,19,9,16,48,53,31,31,24,40,34,14,26,42,43,11,27,39,75,54,99,148,157,144,125,107,65,59,97,110,105,104,109,75,18,24,33,56,38,60,12,67,70,125,123,122,110,118,98,109,70,47,16,13,16,32,17,18,17,28,21,56,56,87,116,84,61,51,17,20,17,33,48,3,141,218,252,253,246,250,235,218,238,248,213,227,221,218,229,215,235,117,11,1,6,18,24,9,45,20,42,30,36,14,18,0,13,10,17,11,24,19,25,3,0,12,217,171,190,202,186,220,215,219,206,208,191,204,189,224,202,225,217,190,212,189,234,200,182,195,191,203,222,184,206,209,198,206,192,200,176,185,213,206,184,186,198,192,207,192,202,171,209,226,187,238,182,187,204,224,211,206,213,209,223,208,203,205,201,232,189,234,226,217,215,221,222,219,227,190,193,219,192,226,197,207,183,206,216,217,218,212,206,203,191,186,188,208,187,218,210,213,208,215,181,192,204,204,218,191,204,192,199,201,194,170,195,212,202,226,210,202,192,178,197,220,200,194,195,213,187,207,175,182,207,165,173,194,177,206,211,203,197,201,196,186,212,206,175,180,185,206,210,183,159,182,182,185,199,174,197,187,210,186,196,197,194,187,186,162,195,194,207,200,191,201,204,190,193,220,199,192,182,208,193,196,204,196,202,179,188,179,206,195,193,196,189,204,186,199,217,224,177,206,188,190,207,198,187,223,189,213,191,195,194,197,192,190,198,170,174,197,195,196,183,197,176,189,187,187,200,217,206,206,172,194,210,185,196,210,207,217,196,208,184,200,211,203,185,225,179,195,186,178,158,206,187,192,194,205,199,198,204,192,186,215,191,203,198,202,187,179,156,181,196,180,181,211,202,184,175,189,194,168,196,196,199,187,162,197,183,188,194,183,184,216,195,193,176,215,192,198,194,204,164,154,159,124,109,125,123,149,126,156,208,214,173,184,181,173,198,215,179,116,119,66,68,75,34,128,139,106,134,125,198,201,186,146,142,179,173,169,211,154,109,121,174,202,224,89,82,122,64,89,55,51,41,117,119,78,103,69,38,30,4,50,73,69,44,24,110,212,255,168,161,179,218,244,251,225,114,65,82,111,117,70,47,82,77,131,111,62,58,52,31,15,18,41,88,122,58,59,60,31,21,59,97,129,115,129,128,102,97,83,46,21,24,29,48,115,146,117,56,10,16,16,35,72,39,48,47,68,127,131,127,129,67,47,33,37,11,21,2,15,34,45,38,121,148,63,27,18,61,170,135,101,115,79,44,27,28,105,107,45,23,43,39,32,45,27,1,30,14,47,64,26,5,26,18,29,81,24,86,80,5,43,22,40,18,25,52,23,44,18,22,12,25,22,16,43,11,38,16,25,38,51,66,37,27,50,29,121,126,31,91,134,121,186,140,114,179,132,50,48,43,11,35,32,34,31,39,25,12,63,42,40,48,47,36,51,170,159,39,26,71,83,155,130,104,148,98,60,33,20,7,46,40,54,28,43,36,23,30,19,49,42,39,3,19,52,139,82,33,68,66,39,39,29,9,48,37,19,29,51,41,20,22,23,36,49,56,19,3,54,14,39,73,35,40,50,82,67,29,59,90,61,161,170,85,30,44,7,13,43,17,64,56,51,6,18,35,46,33,38,35,48,16,64,115,64,132,154,154,139,107,112,59,57,26,112,111,103,92,84,26,52,49,43,27,34,38,37,61,117,131,126,99,93,95,112,78,90,29,45,28,25,27,10,11,20,20,16,43,64,81,87,66,16,10,8,50,60,28,36,180,218,247,252,237,240,243,232,239,212,234,243,206,230,218,235,219,135,12,13,5,18,33,7,14,4,10,11,18,16,2,7,1,0,7,13,15,20,27,5,13,17,207,228,203,216,203,203,211,194,221,212,201,224,204,219,192,201,194,193,198,214,185,175,189,201,207,219,205,172,207,200,209,206,226,208,193,188,183,190,224,202,201,210,209,171,187,221,235,181,194,184,227,206,207,193,191,203,207,218,204,206,204,188,202,213,198,203,216,203,205,218,212,221,201,197,202,216,203,195,198,209,227,211,202,195,208,227,189,202,188,200,215,211,184,204,203,201,189,206,212,210,196,207,183,192,197,226,211,208,198,200,206,186,197,199,205,183,162,197,200,196,184,195,200,207,195,194,209,189,189,191,190,195,201,191,177,205,203,196,204,210,207,193,183,203,190,171,210,202,178,204,192,200,178,188,189,190,216,196,217,199,187,208,217,197,204,203,206,193,202,192,207,162,193,192,182,203,182,178,224,204,201,189,196,189,179,196,187,200,181,177,199,202,198,190,218,179,204,202,177,198,203,191,197,187,192,203,210,187,206,214,188,177,204,194,201,173,186,188,203,187,212,200,192,204,184,193,197,195,180,204,193,195,192,182,205,192,170,209,170,186,197,185,199,195,201,182,203,202,201,175,191,189,186,186,188,183,188,203,195,195,192,214,196,220,200,185,204,190,195,159,188,190,199,175,197,190,184,164,204,190,183,200,184,192,185,200,203,197,178,197,175,193,201,191,171,202,184,179,136,137,148,145,142,164,178,149,153,154,197,191,171,170,193,176,214,203,184,168,159,80,78,102,84,171,173,137,143,122,178,173,141,136,167,184,142,164,155,154,135,130,159,191,203,95,94,105,61,99,66,30,74,140,93,66,94,34,16,7,22,41,54,88,40,103,200,237,243,133,185,249,246,247,197,103,53,60,104,109,66,54,69,125,105,81,104,64,59,13,4,38,58,96,112,157,130,75,54,59,16,46,39,61,94,120,114,121,123,91,101,68,42,29,56,131,176,58,28,11,0,52,78,63,33,20,22,10,68,141,138,95,53,19,20,27,32,37,25,27,19,51,40,76,61,26,56,21,83,179,129,103,132,64,37,20,28,59,39,12,13,34,36,14,6,16,16,4,47,38,35,45,12,16,19,64,32,36,118,85,41,67,28,21,28,19,35,24,52,28,18,39,7,12,69,43,48,14,27,4,34,57,39,15,21,24,33,127,105,48,70,117,109,176,113,134,127,90,18,9,24,55,50,42,21,52,9,30,42,23,33,7,38,61,97,29,104,143,52,41,43,42,17,34,44,43,26,51,41,32,24,40,39,42,0,10,36,29,51,50,39,33,26,26,27,66,146,75,17,83,72,39,63,51,53,38,58,44,27,26,17,31,17,28,27,34,37,40,43,27,60,17,15,45,30,73,116,47,15,46,50,102,105,92,43,51,48,45,23,31,34,14,31,61,44,33,39,42,21,49,30,20,60,118,76,109,123,88,118,92,114,107,92,50,53,117,108,102,90,38,34,42,29,32,31,20,46,35,32,78,122,106,79,90,100,127,121,74,29,49,75,67,40,18,21,15,6,42,38,47,65,81,37,71,41,28,11,29,30,53,202,222,254,210,182,218,231,242,240,221,196,231,222,237,222,214,206,110,34,13,28,30,7,17,6,7,32,1,3,30,5,5,0,49,9,10,25,14,36,31,19,21,183,200,196,184,188,200,179,207,193,201,205,194,196,183,199,187,196,204,207,213,217,204,209,179,203,217,198,173,206,206,212,180,212,220,218,180,206,208,196,208,205,199,171,197,197,217,188,209,195,224,209,234,214,218,189,194,209,200,231,216,225,225,234,194,204,219,216,201,213,216,212,217,177,202,218,216,196,229,188,187,213,222,190,199,195,198,213,207,193,187,194,203,195,216,187,211,198,190,211,202,209,207,196,204,184,189,177,186,191,174,211,216,206,209,213,206,204,192,175,204,194,207,203,200,210,155,192,203,189,191,230,190,198,188,184,207,196,212,195,202,185,207,205,188,199,225,221,175,197,180,171,213,196,204,182,180,206,207,195,214,168,192,188,209,176,197,214,209,211,204,195,202,199,178,199,201,188,179,226,193,161,200,203,207,181,202,199,206,166,211,197,202,197,200,186,221,190,181,197,211,182,183,194,194,206,165,196,184,195,203,200,203,175,219,170,187,206,197,190,226,214,198,190,209,182,195,205,229,197,205,197,183,201,201,193,193,199,193,214,195,175,179,199,198,173,215,189,203,191,193,214,209,205,190,194,209,190,206,196,176,205,193,218,195,196,216,183,208,226,197,191,184,198,205,191,214,199,169,198,215,213,192,167,194,187,180,166,202,195,188,183,203,202,208,180,191,178,210,134,159,173,132,185,199,223,161,197,161,185,183,169,170,196,187,189,199,131,176,164,104,87,109,130,245,150,128,184,161,175,152,120,195,146,151,136,167,152,174,156,118,163,189,161,110,83,107,81,85,52,40,134,152,117,78,69,42,22,41,33,70,49,67,103,196,245,232,182,147,252,255,240,195,77,72,58,88,171,119,52,56,101,128,112,82,72,41,34,12,53,58,101,135,152,160,121,110,103,66,45,35,54,22,48,106,131,142,137,139,154,114,55,62,62,132,122,43,19,4,9,50,59,45,40,23,17,27,38,72,125,108,82,30,31,89,109,54,22,35,28,30,27,11,27,16,30,14,93,181,111,103,126,53,11,36,15,47,42,19,34,34,43,7,24,25,19,19,29,3,39,27,30,54,68,31,17,51,138,72,19,51,28,37,25,24,36,43,37,34,37,11,9,13,17,33,17,20,26,25,35,50,26,22,27,33,69,98,88,53,59,43,35,39,34,12,48,29,45,29,69,58,23,34,31,12,44,62,58,3,24,40,89,76,106,48,53,114,48,48,41,53,57,52,22,29,34,44,41,25,38,16,29,42,29,35,25,69,22,26,4,28,47,63,43,51,127,74,45,164,132,136,175,91,119,156,82,46,14,34,14,33,7,4,10,24,7,25,29,56,56,35,28,19,43,114,89,58,36,74,59,38,28,26,39,38,39,41,54,29,12,22,25,47,68,54,27,25,13,45,29,63,81,119,71,55,129,104,90,129,119,128,84,45,33,65,102,108,128,72,28,61,37,33,29,13,26,51,46,60,98,146,111,95,97,120,105,96,19,71,140,64,45,51,26,8,11,12,43,22,52,80,62,66,22,48,46,48,32,16,117,143,138,123,58,142,164,147,157,142,152,183,228,228,213,214,212,128,0,18,14,20,17,15,19,21,16,8,35,13,17,1,3,27,1,15,3,20,23,14,18,12,213,214,202,205,200,171,206,191,200,218,222,190,208,191,200,194,206,219,202,229,185,203,198,210,200,214,199,218,206,224,196,193,196,205,203,224,197,221,204,207,188,204,218,195,196,195,177,199,218,191,195,189,172,205,189,205,214,208,202,219,242,186,193,204,198,211,199,202,172,208,184,197,192,215,194,204,189,182,205,203,193,209,208,219,199,191,213,198,214,204,209,198,208,190,189,190,183,212,202,193,204,192,178,210,205,198,210,185,200,202,212,197,208,158,195,181,193,203,182,215,186,182,180,185,187,195,211,196,179,204,193,222,194,189,187,172,204,211,187,192,196,182,180,194,204,184,207,227,200,190,202,192,194,205,185,189,194,191,207,195,209,198,201,187,214,184,195,206,186,193,192,215,192,215,183,200,204,189,205,200,197,204,194,197,223,190,195,202,174,220,172,216,187,159,219,187,189,182,179,168,188,184,209,196,200,182,227,204,177,213,204,167,186,210,191,231,193,202,170,195,189,203,196,212,198,179,179,194,211,188,174,198,212,198,193,184,186,206,179,195,202,180,214,216,196,170,172,209,206,192,189,218,196,191,178,208,198,186,179,185,194,182,194,162,181,185,189,182,184,195,196,175,183,192,188,186,182,193,191,214,189,173,189,201,193,203,190,208,195,207,200,172,192,175,180,197,200,204,137,174,187,166,176,180,200,129,215,188,180,207,189,214,195,174,199,177,149,165,177,92,72,73,146,237,137,122,143,93,160,196,176,169,138,138,138,176,166,169,135,116,193,186,214,124,117,107,110,99,82,68,139,83,64,105,37,18,44,71,67,78,56,77,204,255,255,164,199,162,244,240,210,98,42,85,106,113,129,93,67,48,63,118,91,61,27,25,34,15,36,91,106,129,106,125,119,123,124,101,90,66,22,30,43,19,59,139,114,157,162,155,166,113,56,20,26,46,29,15,29,32,56,38,35,40,14,31,6,29,76,128,77,31,118,195,132,64,28,15,38,34,54,18,41,32,30,42,127,148,79,110,130,49,21,22,3,3,8,22,48,49,13,3,6,9,31,36,12,10,16,23,45,59,27,22,12,26,97,81,43,82,28,57,69,42,41,31,39,51,26,41,48,24,10,7,30,47,26,23,28,24,24,63,65,63,77,75,123,23,13,12,65,37,14,33,50,56,85,58,31,23,39,52,32,76,81,50,79,75,75,75,133,122,116,103,112,128,43,104,68,54,61,47,70,77,55,66,81,30,54,78,50,53,35,36,46,46,31,26,35,17,50,44,12,29,129,63,10,135,93,141,164,135,154,159,78,57,7,10,31,0,47,24,25,13,12,1,40,51,49,18,39,36,51,94,91,40,55,67,26,38,37,29,29,30,29,61,23,26,35,19,30,36,68,58,38,34,30,26,44,89,54,117,76,49,101,73,122,136,151,149,53,54,44,93,129,83,114,74,29,54,65,46,18,5,17,42,29,36,126,160,145,136,114,113,109,79,12,117,199,84,49,19,29,23,34,34,22,11,57,56,48,74,41,27,23,62,35,52,41,40,34,49,15,76,133,138,143,113,94,88,155,196,200,186,222,110,6,12,4,10,22,5,17,2,21,5,6,30,12,38,2,24,12,7,34,22,11,15,24,0,229,186,199,180,200,208,213,205,225,194,212,205,237,212,207,209,206,196,192,195,200,201,177,186,210,169,198,214,183,212,201,216,211,181,182,194,195,201,230,201,193,191,196,208,204,211,201,194,198,182,194,201,203,202,208,194,205,207,232,213,204,190,214,200,223,182,195,196,205,200,202,184,175,205,189,171,205,198,191,188,211,222,200,200,195,188,213,208,203,205,204,177,188,193,195,194,183,215,200,178,206,196,171,195,175,207,208,180,194,199,168,202,211,201,189,183,178,193,174,227,217,181,186,200,213,188,169,183,184,214,177,184,218,193,192,186,220,224,193,220,174,199,205,193,187,177,212,206,182,166,192,194,212,153,200,195,202,189,184,221,199,202,206,191,188,195,199,193,205,189,200,206,206,197,209,176,185,200,204,208,188,184,197,196,168,186,193,196,193,166,191,181,217,200,194,193,185,191,201,200,190,197,205,207,159,197,218,206,200,181,187,195,207,199,180,177,214,202,212,171,191,183,193,171,200,175,196,190,215,194,195,195,189,190,186,182,199,179,190,196,177,195,203,205,184,196,176,214,221,187,205,195,184,204,188,171,213,180,198,194,189,201,211,182,192,211,187,188,182,183,176,199,201,165,187,212,193,197,163,154,217,172,189,186,199,188,197,210,174,160,154,162,180,160,209,203,181,195,115,143,129,124,143,152,142,129,206,208,199,187,179,184,182,196,214,195,156,181,182,46,45,65,115,157,79,62,117,147,148,200,170,146,156,142,207,184,145,190,139,171,232,182,178,131,93,145,95,116,57,88,104,40,82,78,59,19,57,76,78,90,63,184,247,242,206,189,246,199,255,228,108,36,63,110,127,107,74,54,60,88,68,74,70,37,25,25,12,40,89,97,115,116,107,106,113,101,124,116,140,102,74,50,48,44,19,51,94,157,182,133,145,149,131,72,43,7,11,36,34,48,47,30,46,21,12,19,6,7,28,121,120,119,161,153,46,31,25,61,88,71,39,34,30,56,48,47,152,136,116,129,127,60,12,7,32,35,7,35,22,39,36,23,17,32,33,38,31,26,16,40,48,24,27,21,9,32,140,84,66,93,57,77,72,64,112,74,60,38,34,54,35,44,10,13,19,59,51,47,24,11,12,37,56,144,189,141,113,55,20,23,37,62,47,35,37,56,65,26,56,52,68,128,99,119,104,97,118,149,182,186,172,199,166,178,198,169,166,207,182,175,175,163,146,155,152,133,132,130,150,162,158,135,112,97,81,47,67,60,40,58,51,41,38,68,140,40,64,42,58,43,38,54,47,55,60,100,37,35,25,21,61,23,30,21,14,49,32,28,36,54,19,28,58,87,99,61,67,50,14,31,59,16,18,14,28,36,20,11,25,18,36,14,23,48,33,4,30,37,56,46,77,141,81,36,59,55,59,116,111,89,39,22,32,83,122,88,120,87,30,43,54,32,32,24,38,17,48,59,70,60,55,96,102,92,88,44,83,155,166,115,71,50,53,22,41,17,55,51,79,70,77,45,73,18,53,44,44,65,83,53,17,28,65,95,101,79,134,79,81,20,73,171,206,220,221,125,1,16,17,46,13,19,12,31,15,29,8,23,15,5,15,21,12,23,12,17,10,0,25,1,190,207,221,211,216,202,225,206,186,203,203,214,193,224,213,197,197,179,191,206,211,185,219,185,187,212,197,196,212,225,217,193,175,195,209,198,194,183,197,188,212,193,198,218,160,189,209,205,194,189,206,223,210,196,187,190,226,202,197,209,188,202,201,190,214,208,197,223,194,204,181,186,198,202,204,203,214,181,214,165,209,209,182,195,175,203,232,201,209,197,214,203,209,192,203,199,201,225,190,233,196,174,186,197,210,174,218,205,179,205,215,194,181,172,187,202,210,201,183,182,176,199,191,182,188,196,200,205,189,201,207,188,164,169,201,200,196,195,177,184,193,173,199,195,215,187,204,188,209,214,197,201,200,195,192,189,200,206,181,210,203,189,209,207,208,191,198,185,185,196,170,205,189,186,213,189,194,177,220,196,214,192,188,164,186,180,213,198,199,226,209,186,187,202,212,206,189,173,170,186,188,201,198,224,154,184,197,192,194,170,203,206,189,208,196,174,216,205,177,192,211,192,205,179,204,182,225,179,202,198,190,204,188,221,181,205,215,208,170,204,196,186,185,195,189,188,185,191,184,183,188,193,201,187,199,178,173,212,177,202,197,191,182,210,197,181,217,188,218,186,184,210,173,188,170,201,157,197,169,195,209,221,170,187,216,172,196,182,177,182,181,182,209,185,189,200,218,149,93,109,153,140,178,149,144,172,207,175,188,178,191,186,213,191,180,216,195,206,183,135,92,126,80,97,17,74,195,180,168,155,184,164,154,185,185,126,176,191,125,155,190,115,175,127,101,137,58,73,66,61,81,6,37,58,42,33,65,56,51,83,152,250,250,215,137,220,252,224,213,117,41,84,96,134,130,77,82,54,110,66,60,65,67,21,6,29,42,83,92,137,131,95,96,122,113,98,94,95,95,125,118,139,65,35,61,57,33,89,115,159,137,142,128,136,106,56,59,23,25,10,18,26,23,37,17,13,41,20,27,65,120,117,178,77,23,6,41,76,115,102,65,40,44,26,22,52,178,131,110,132,98,43,30,15,26,34,29,12,23,61,25,4,38,6,14,10,38,25,43,32,27,10,51,22,57,99,175,53,104,156,131,232,116,161,185,99,68,22,22,27,35,20,6,8,27,42,23,41,14,8,23,56,46,167,105,84,120,47,31,48,35,36,43,52,63,58,128,113,115,129,173,185,217,218,168,206,210,211,206,201,200,204,193,192,187,180,160,197,195,167,181,208,234,223,158,235,211,199,201,209,197,202,152,158,171,147,165,164,145,132,93,43,28,117,139,66,32,54,60,38,37,54,27,40,63,34,15,47,22,14,11,20,26,40,61,36,40,8,13,49,42,70,40,83,112,35,27,61,18,31,44,23,34,30,40,52,32,20,51,35,31,39,23,9,46,68,54,43,36,4,84,137,74,29,55,36,56,87,86,41,44,49,23,95,120,100,124,125,55,13,25,52,33,16,35,12,11,16,19,59,46,78,87,99,102,83,143,167,174,112,101,90,62,32,11,27,42,22,64,58,94,71,63,38,22,43,27,51,66,53,60,84,97,120,119,99,64,74,87,119,169,187,213,201,213,120,15,9,25,26,13,11,2,30,10,8,24,0,19,8,3,1,19,29,25,17,2,3,22,12,165,191,198,212,206,184,203,193,198,222,213,199,188,195,207,211,190,200,192,180,207,197,198,186,197,183,203,207,197,203,225,222,224,209,208,206,187,189,219,207,197,185,207,186,174,188,222,219,183,193,212,183,208,199,208,217,218,196,213,210,194,179,202,204,208,217,207,196,177,215,198,196,182,211,210,189,189,201,178,196,207,198,194,213,195,202,198,208,190,207,202,202,198,209,187,201,184,211,212,220,202,181,211,194,183,205,168,194,201,196,199,213,197,188,195,212,177,199,187,199,189,198,190,187,180,202,206,176,183,214,201,199,195,174,175,177,191,197,200,192,177,202,206,188,197,201,177,184,200,200,197,218,206,219,215,210,171,185,172,191,176,222,190,199,211,180,191,203,213,208,184,205,200,204,231,191,194,196,186,183,186,208,233,209,191,189,208,186,194,185,194,195,197,203,190,174,179,175,204,211,211,210,206,188,183,189,189,214,205,179,189,183,204,206,177,200,182,226,201,191,182,178,174,185,160,181,199,182,191,184,187,190,181,192,194,205,210,175,216,206,193,185,182,181,177,180,170,213,186,188,193,184,177,221,199,199,194,189,196,207,182,181,177,204,176,176,212,192,222,194,180,206,186,191,196,166,188,167,183,178,188,222,177,189,159,177,166,186,177,182,181,169,204,177,198,219,203,144,144,167,171,185,175,177,177,179,214,193,197,166,185,174,190,188,182,215,184,199,213,131,125,146,134,73,38,141,233,199,145,165,167,185,185,187,170,108,183,189,62,167,168,134,199,114,101,108,62,85,61,71,37,15,21,71,63,41,71,56,56,146,233,252,209,173,195,225,239,186,99,57,50,95,119,107,48,34,114,102,80,71,85,33,42,5,33,47,63,101,109,126,118,83,99,132,109,98,110,87,87,84,107,154,60,27,40,36,54,45,53,108,130,179,169,142,130,134,156,91,52,37,12,10,1,23,33,31,1,28,51,42,98,122,142,78,31,7,73,165,139,151,115,64,62,36,43,75,157,103,116,135,111,21,15,20,24,7,29,27,28,38,39,17,24,6,11,8,25,21,59,27,25,6,31,63,65,64,122,55,59,75,55,152,75,62,95,46,35,40,8,27,9,35,21,52,29,27,6,36,18,54,64,51,41,78,60,127,93,29,49,38,47,56,77,134,142,180,208,188,233,215,239,207,199,185,182,190,181,164,173,147,147,124,110,100,131,120,120,118,135,113,150,164,157,154,168,166,174,171,181,150,148,174,186,159,166,176,176,181,197,180,175,161,151,162,152,71,78,51,38,52,36,57,53,15,43,48,13,51,29,35,47,24,28,48,51,29,21,13,18,44,78,65,39,84,100,43,96,74,39,109,56,51,70,52,49,21,14,13,28,43,58,19,18,2,37,49,76,20,15,22,29,131,62,43,50,55,27,39,41,53,17,15,32,84,147,127,117,115,53,23,27,32,18,25,39,25,57,64,47,68,26,64,119,143,132,96,69,46,52,66,68,94,81,45,28,21,18,78,62,66,85,95,60,54,15,43,85,58,41,89,154,134,125,109,108,103,63,115,161,187,217,239,219,225,211,154,8,2,25,14,31,37,23,14,3,12,12,10,17,2,4,10,14,0,7,5,17,6,19,21,199,189,224,203,168,184,209,203,202,206,224,197,205,177,219,205,181,177,203,190,185,216,164,180,197,191,207,213,201,217,203,215,182,197,218,188,200,222,193,203,212,210,214,179,200,198,213,189,190,203,201,214,215,201,212,191,210,178,184,181,208,218,205,209,205,199,207,199,218,192,206,215,199,175,182,205,210,208,189,216,186,210,177,207,203,200,199,189,214,215,186,177,210,177,182,196,194,186,200,220,219,205,206,208,203,183,169,200,205,207,219,202,181,197,206,175,193,192,167,187,190,197,189,188,180,181,189,196,176,175,189,202,215,179,203,209,188,185,194,191,187,176,172,174,204,182,182,197,201,205,187,191,182,186,195,192,185,204,173,171,188,183,173,183,192,220,199,211,196,204,187,185,175,210,176,200,165,205,188,192,196,174,198,200,216,197,177,169,188,182,189,193,191,206,192,192,194,190,205,207,197,195,195,184,206,183,199,199,178,208,183,183,177,191,183,215,188,214,193,216,196,190,200,208,197,195,192,189,201,202,167,178,207,185,180,202,187,193,203,170,204,192,175,189,183,169,191,218,188,178,197,187,191,201,195,205,195,204,194,187,183,185,170,183,190,202,177,209,194,163,185,190,190,183,196,171,177,186,214,192,172,172,167,164,153,184,172,187,197,180,161,179,190,188,187,203,138,96,159,165,158,168,171,157,176,216,207,185,178,197,164,177,189,192,195,163,170,203,198,132,138,213,153,123,94,165,226,170,145,160,176,169,123,191,173,160,215,146,66,154,171,190,226,71,121,123,107,107,87,73,49,8,28,93,88,58,70,38,154,240,251,253,155,161,241,241,239,136,29,47,86,145,109,62,64,86,125,101,53,69,47,41,17,52,39,82,107,102,135,106,109,83,119,148,113,117,96,84,114,102,110,125,70,100,66,29,53,43,25,35,40,96,149,164,129,121,149,143,140,101,54,20,32,17,30,0,23,57,44,20,54,102,147,71,44,22,14,68,2,31,104,64,31,17,29,118,164,114,126,147,82,31,29,17,23,40,10,42,41,70,38,15,55,41,36,18,28,36,38,31,19,28,52,62,66,102,115,65,23,24,26,44,39,51,46,22,87,48,15,20,30,32,74,49,19,10,34,64,71,95,116,93,88,160,87,128,128,146,181,146,195,202,196,209,208,212,224,168,201,170,153,159,141,128,117,139,127,147,135,124,122,152,144,122,148,149,157,171,169,155,146,156,147,163,190,162,204,193,168,164,174,178,148,136,153,123,120,112,158,153,174,160,169,176,174,150,198,164,134,137,76,62,36,31,49,39,41,33,17,40,29,21,37,25,32,29,17,8,33,61,18,58,59,96,102,14,93,108,125,242,108,131,155,62,50,29,38,30,8,36,8,32,25,10,37,60,55,32,32,26,81,131,88,36,64,51,20,38,36,22,25,22,55,88,122,95,117,124,59,49,16,39,40,34,13,47,77,105,61,55,46,112,159,145,81,55,36,34,32,25,36,120,99,63,47,11,41,35,53,23,56,42,34,106,171,144,126,127,121,152,149,98,113,83,72,89,129,211,231,237,224,242,237,211,233,132,11,4,23,8,17,13,17,33,19,16,22,15,22,0,20,29,12,10,10,7,18,25,7,2,221,206,204,211,197,185,200,180,183,214,210,224,219,209,201,205,180,222,199,218,214,185,194,205,198,205,218,198,226,205,171,174,194,214,192,205,207,185,191,181,217,196,208,199,184,198,164,224,183,220,205,210,200,205,195,198,188,184,196,165,205,208,203,199,217,208,221,189,211,223,188,209,216,194,199,209,203,222,194,178,203,214,174,194,218,202,192,196,194,221,203,206,200,188,173,190,175,209,193,205,195,176,194,195,218,222,190,187,218,186,201,179,210,186,180,184,186,202,196,191,209,188,180,193,195,195,192,186,198,178,191,194,186,204,200,189,183,175,218,204,202,218,198,187,160,195,199,187,209,188,214,185,184,180,214,190,166,214,196,213,202,178,202,205,218,193,196,191,191,211,194,203,187,215,193,200,191,201,189,178,204,176,180,162,190,182,187,187,192,178,192,216,195,204,197,189,204,188,179,187,193,209,180,188,188,194,181,180,203,200,189,178,184,169,202,193,177,202,198,194,182,217,205,193,183,207,176,177,191,187,179,200,195,193,196,168,190,204,206,188,191,189,188,183,187,197,200,190,183,187,220,200,190,195,193,175,221,185,183,191,206,187,178,197,195,195,198,164,196,204,187,185,188,186,181,197,190,218,185,179,188,156,162,182,163,193,168,217,190,180,214,192,153,172,212,218,176,107,126,123,136,148,154,150,179,176,198,188,184,197,183,181,197,195,189,179,207,189,221,132,161,148,121,91,85,147,205,197,171,155,173,151,174,177,181,170,172,120,47,164,155,179,169,80,149,150,110,114,101,68,19,35,140,210,129,88,75,83,239,231,236,188,131,231,249,238,170,46,62,88,109,109,67,83,107,122,117,64,73,71,24,0,9,30,81,105,111,123,137,86,103,84,113,156,170,101,99,75,67,89,112,115,86,103,84,60,56,17,39,40,36,22,40,113,110,156,129,171,169,179,158,99,92,48,29,16,12,31,18,4,30,38,89,111,78,49,15,47,21,73,126,74,46,17,43,139,153,92,128,107,64,33,41,73,42,49,92,88,50,53,50,19,30,31,36,18,20,7,36,37,22,70,50,20,29,87,134,49,38,50,54,41,30,58,51,29,40,51,33,13,19,58,39,92,67,82,104,97,131,139,160,162,185,201,182,189,182,216,224,187,192,176,152,180,136,132,140,158,152,140,131,159,148,147,159,144,157,148,141,133,145,139,122,135,109,129,98,124,107,103,89,73,87,95,118,111,114,160,127,150,142,153,110,176,142,134,181,168,171,154,151,144,133,149,146,172,177,185,178,194,156,167,173,120,114,102,70,86,21,32,68,81,60,18,10,41,33,45,27,26,21,35,62,133,93,66,76,87,144,195,102,145,131,65,58,45,28,27,6,6,14,32,22,57,17,8,32,45,70,52,107,175,79,77,70,34,25,27,27,42,26,29,37,86,141,95,133,153,84,18,23,58,58,45,60,49,71,79,140,89,84,164,148,84,33,8,42,15,35,35,79,111,131,85,48,15,37,11,49,60,63,113,156,207,194,179,151,156,141,126,87,80,51,99,134,188,236,236,210,236,219,208,217,223,217,126,11,11,5,4,24,24,8,35,21,9,21,26,7,4,5,15,23,10,17,18,12,3,27,7,206,217,226,201,185,207,213,195,220,217,201,197,244,235,207,206,214,199,208,224,226,220,199,220,190,202,211,217,216,209,179,198,184,214,219,187,186,210,192,204,206,180,195,212,172,208,218,200,188,196,216,222,199,211,186,218,179,186,196,219,198,201,215,207,191,196,183,226,210,202,220,190,203,192,184,201,224,191,190,217,187,188,205,171,203,198,209,182,199,175,190,198,188,217,198,198,202,198,192,206,196,186,203,201,195,218,193,198,207,154,188,178,198,216,208,180,204,193,187,206,193,216,202,210,204,195,207,197,181,192,190,197,177,200,210,180,206,202,196,195,199,189,177,177,191,189,188,202,217,184,200,198,197,176,200,193,179,197,201,200,193,183,192,204,189,194,202,199,198,179,212,190,198,185,186,184,176,211,220,180,192,200,177,181,187,201,197,208,203,202,218,191,192,181,197,193,205,187,199,197,204,214,178,187,195,194,176,192,152,195,195,200,199,176,195,182,195,173,197,185,188,187,203,175,198,209,203,198,195,199,168,179,173,196,199,191,194,181,200,196,190,189,210,193,180,183,189,173,194,192,176,176,180,207,204,181,187,200,182,203,209,201,170,189,190,186,178,214,199,199,192,192,191,201,200,209,187,179,168,156,148,188,186,196,184,191,201,190,205,187,180,186,180,205,191,194,146,102,139,157,152,163,138,138,211,197,224,203,169,202,175,182,193,207,175,178,167,182,198,147,110,86,62,46,45,69,124,162,126,128,137,182,203,216,137,192,165,140,64,99,163,173,143,102,128,104,114,124,99,21,14,61,193,188,131,59,118,200,228,193,138,174,217,240,234,145,52,29,132,115,106,88,58,138,122,131,65,73,40,12,13,33,22,91,119,112,115,81,107,87,99,98,75,136,111,104,109,66,128,131,127,157,79,76,72,39,46,21,34,12,37,45,43,37,61,107,150,171,145,171,154,165,157,120,75,32,17,37,8,14,15,0,28,73,119,79,63,81,98,188,145,69,29,21,52,129,154,95,127,113,59,4,37,60,42,30,47,32,49,47,20,21,21,13,47,38,40,15,10,12,28,55,25,46,1,45,130,24,48,45,80,68,23,57,48,51,52,54,36,55,74,83,119,122,142,157,187,202,190,200,185,211,199,185,185,185,148,152,167,124,120,115,119,132,131,144,129,129,110,140,147,132,144,97,95,74,51,58,36,43,33,30,55,48,34,54,15,36,29,18,36,33,29,28,22,25,52,20,49,24,41,33,61,86,88,110,90,138,113,173,167,145,135,143,146,131,123,136,149,161,147,172,179,190,163,143,162,163,170,148,132,113,104,71,65,21,52,69,31,39,23,27,61,125,109,61,58,35,58,50,48,41,37,9,43,59,51,32,25,16,20,36,70,19,29,11,25,14,91,73,101,155,80,36,95,38,62,69,22,17,37,33,55,110,120,87,95,101,69,39,17,33,56,81,59,48,21,57,115,98,173,188,114,64,21,36,42,24,25,52,144,187,96,47,42,32,17,87,238,200,212,189,160,152,150,147,131,111,101,75,58,94,133,202,237,218,244,234,228,228,235,234,237,223,221,117,0,0,1,24,14,14,10,16,7,18,30,36,8,4,11,8,9,13,24,13,21,10,25,29,211,186,203,188,192,210,200,197,206,171,207,223,203,202,216,221,186,199,221,202,207,189,207,221,202,205,202,189,217,202,192,188,171,201,182,214,213,201,203,205,194,216,206,199,203,204,217,209,189,191,208,201,209,222,194,219,228,222,182,211,188,216,207,190,199,191,219,203,176,210,216,202,203,195,234,200,200,206,202,204,194,191,169,196,201,201,182,187,208,184,223,190,216,195,187,199,215,239,197,178,199,217,206,195,176,217,196,214,179,195,192,204,227,204,204,195,173,196,199,188,209,202,191,213,185,179,196,210,177,181,211,200,195,174,178,197,199,208,194,202,183,189,193,176,168,199,194,197,200,198,174,209,185,204,213,194,187,192,201,189,197,169,195,197,201,192,182,181,173,197,215,198,179,197,199,194,175,191,187,186,212,208,195,189,202,191,195,196,205,218,187,201,185,195,204,176,173,196,172,190,182,221,194,209,209,194,198,202,193,210,183,172,161,193,184,196,202,163,188,220,168,208,197,190,194,186,169,200,193,208,207,206,186,178,177,183,191,210,171,174,189,195,196,168,202,194,189,177,199,188,182,179,188,204,171,191,185,197,211,209,207,202,182,205,186,198,200,187,195,202,190,187,221,202,196,187,143,155,173,159,185,197,184,203,213,185,208,199,188,212,176,180,182,216,189,178,143,152,182,174,191,166,150,209,205,214,203,218,170,223,191,181,185,186,207,206,183,161,175,120,87,67,97,70,61,61,99,127,135,121,159,165,204,146,119,185,185,167,58,71,140,161,198,119,132,112,90,111,99,39,24,21,92,103,50,114,240,220,220,141,177,229,228,245,150,60,62,49,120,133,58,64,88,124,110,95,79,51,4,21,46,39,57,105,128,100,116,85,82,88,135,127,88,71,103,81,86,95,141,137,151,118,81,21,35,64,57,65,44,12,11,13,42,39,37,44,58,102,133,143,166,154,144,160,177,160,116,49,19,34,28,13,24,47,70,90,113,132,176,156,59,40,38,27,66,162,140,110,122,92,35,19,23,23,21,20,11,25,49,37,49,38,27,8,29,31,32,14,20,54,44,34,34,28,43,54,121,78,28,20,24,23,33,27,35,13,78,72,93,106,152,205,203,191,208,210,227,207,213,188,164,156,144,133,134,106,103,90,138,129,133,137,129,146,124,131,116,70,58,39,29,41,59,5,25,29,20,1,2,22,15,29,34,18,12,40,15,27,18,11,13,22,31,32,30,34,32,39,40,12,5,22,44,2,43,24,50,25,56,27,62,81,103,87,126,125,113,130,135,164,160,175,149,144,152,161,161,172,166,151,200,168,179,139,150,120,123,99,71,24,17,42,66,125,144,37,41,39,76,49,24,35,28,32,53,50,35,43,37,13,50,50,7,22,21,32,14,32,61,63,133,157,57,67,94,65,110,91,25,69,76,27,75,119,134,123,104,111,98,22,56,55,60,62,34,60,10,56,154,134,158,114,74,77,11,20,25,38,41,69,131,104,87,16,32,42,27,140,251,242,220,173,130,114,139,106,110,88,75,73,118,195,251,249,227,235,240,237,200,201,234,241,242,226,217,114,10,0,2,9,38,18,26,0,26,11,0,0,20,4,13,14,11,31,17,23,2,15,13,3,203,204,199,197,185,219,207,172,204,184,223,217,196,207,198,233,225,222,218,199,196,227,195,204,219,195,211,205,206,215,214,210,191,197,198,210,218,193,216,187,227,168,183,195,222,184,194,198,207,202,208,212,192,218,215,215,200,203,212,205,208,231,221,192,198,185,221,205,191,199,205,198,210,201,192,195,202,203,202,202,206,222,222,222,210,196,209,188,209,186,200,190,217,204,194,200,183,208,192,197,210,197,184,197,200,200,206,188,202,199,213,204,189,193,191,224,180,184,193,174,207,193,181,177,209,182,189,197,181,194,206,214,183,216,172,173,177,195,191,181,188,207,215,190,172,216,221,187,172,226,150,209,199,192,181,187,193,171,191,200,214,177,176,176,183,182,184,215,199,184,222,198,182,185,205,179,173,194,193,193,200,187,194,214,205,213,202,193,178,197,213,188,202,207,193,196,181,229,181,210,199,196,184,182,193,220,188,196,195,175,189,201,197,211,171,210,177,194,157,192,178,177,201,195,209,195,197,204,205,176,203,201,210,196,190,200,197,184,203,191,197,202,194,182,210,182,204,186,184,211,187,194,172,210,195,174,190,197,172,212,211,180,196,216,182,200,207,190,201,183,163,209,188,174,168,176,186,177,210,209,182,174,198,181,205,188,191,190,208,197,194,195,214,195,176,192,143,165,208,195,212,186,188,215,189,230,214,186,164,201,192,193,206,182,199,209,182,181,159,91,45,91,122,84,87,69,89,154,170,128,186,175,153,139,198,187,211,156,70,116,211,178,177,130,90,131,97,119,84,61,64,90,107,80,59,148,248,227,156,159,230,249,247,151,19,54,101,72,118,80,63,104,141,119,60,62,37,17,15,22,53,55,75,126,135,111,121,80,125,132,133,118,33,39,73,72,106,99,110,94,113,96,60,31,71,69,81,114,46,31,18,22,32,26,33,33,38,40,75,96,169,173,142,136,157,184,178,163,123,99,51,45,22,37,13,38,46,36,64,61,22,25,29,25,84,157,123,106,142,86,10,20,27,32,60,50,39,62,70,53,22,47,38,15,33,15,32,47,60,4,21,23,28,28,26,124,125,57,49,36,39,49,55,63,77,97,115,167,192,203,196,162,200,193,158,161,174,160,128,135,116,132,154,135,138,134,189,151,119,109,108,78,71,33,15,31,22,18,26,40,30,33,11,12,8,21,22,20,34,28,13,22,22,14,36,5,37,44,12,28,22,37,53,20,33,16,54,32,13,13,11,5,40,31,41,26,15,7,15,18,29,17,48,8,25,70,71,85,78,83,122,133,120,157,153,175,156,153,129,154,159,133,147,164,169,177,184,148,148,110,116,124,91,120,57,35,51,64,64,86,89,57,55,43,62,14,42,32,26,60,22,18,22,15,22,38,54,51,17,9,120,163,51,112,168,116,216,147,137,179,65,23,76,163,153,112,116,116,113,32,59,58,61,35,47,51,13,128,175,165,135,95,45,35,11,22,12,35,21,21,38,60,13,15,13,38,40,145,210,161,174,128,111,123,98,126,111,130,154,209,246,248,252,255,230,235,225,250,236,231,225,241,199,211,221,130,0,9,9,6,2,17,17,4,44,27,18,41,15,12,8,43,15,23,3,18,1,25,16,7,216,212,209,181,234,234,213,208,183,207,217,210,190,213,189,200,217,182,177,212,206,203,217,225,225,220,230,193,224,226,208,234,232,210,198,180,188,199,207,231,186,189,205,189,179,186,201,180,208,192,211,181,178,205,234,216,219,200,205,215,207,228,208,200,220,223,209,213,222,217,180,214,215,207,206,208,180,195,194,191,200,219,188,174,192,182,197,215,209,226,210,215,223,192,217,193,187,191,203,200,183,181,174,194,213,201,200,205,197,217,190,206,217,187,195,188,204,205,203,205,189,207,178,164,200,178,189,191,190,185,206,187,208,209,181,209,226,204,212,199,199,200,213,197,187,194,203,219,203,187,179,186,189,207,195,195,204,197,213,170,197,191,206,177,198,202,193,179,205,183,225,185,192,191,205,190,169,201,194,203,179,178,208,203,221,173,180,196,175,179,201,187,198,184,184,195,191,196,190,197,193,200,184,181,196,170,200,204,207,186,180,186,189,208,200,190,179,201,185,186,205,209,178,176,175,199,207,181,210,203,205,201,208,194,168,195,181,190,200,195,193,197,159,179,190,178,183,205,173,193,183,209,191,173,185,227,183,194,209,214,196,184,202,182,190,202,182,189,185,186,191,168,181,181,199,190,196,182,225,206,209,197,160,185,209,197,221,186,174,204,181,188,203,207,154,153,96,133,164,205,160,150,168,206,194,212,210,207,194,210,186,183,204,214,190,202,193,180,143,138,68,78,92,88,71,57,117,156,209,171,158,177,152,199,210,165,170,162,60,207,220,179,181,124,109,138,64,93,46,148,150,198,226,91,70,121,202,143,169,213,245,240,164,82,79,117,106,76,62,58,72,144,114,91,83,39,20,16,38,66,74,79,129,131,112,125,112,110,100,141,145,96,24,52,100,82,99,75,122,121,151,125,61,92,116,108,112,100,31,23,9,13,42,53,52,54,39,16,19,25,70,98,113,146,148,131,140,164,199,152,131,121,67,68,37,27,16,41,3,19,11,39,31,29,86,157,132,103,147,77,33,8,24,38,21,75,105,104,81,43,54,38,13,5,21,38,37,49,63,27,31,43,68,89,111,139,116,43,74,104,107,146,155,175,196,206,200,179,194,152,181,139,137,133,132,129,127,137,116,124,144,126,138,132,120,82,53,53,47,22,20,2,24,50,18,54,31,26,27,23,27,21,11,25,21,29,28,16,35,13,7,1,21,12,22,30,31,14,34,20,30,8,32,33,47,30,49,46,38,18,38,7,6,25,27,28,23,4,8,21,30,47,41,20,18,34,13,22,29,37,34,59,66,69,103,146,126,154,151,161,138,150,132,134,125,143,148,134,163,165,141,154,164,163,99,50,45,26,48,49,43,52,35,42,73,66,37,21,44,56,59,23,7,19,18,56,42,32,9,25,73,134,60,99,83,73,139,115,98,103,62,57,84,158,124,128,112,109,92,33,46,71,14,27,30,28,61,158,169,83,82,45,28,35,13,31,27,7,34,34,39,23,47,28,54,66,106,156,109,108,99,61,102,124,158,142,138,196,234,250,226,235,234,255,226,245,245,208,246,206,211,228,217,238,239,110,4,6,11,10,0,5,8,9,11,19,29,11,7,5,9,11,19,34,11,24,5,13,37,23,228,197,200,188,200,222,193,209,222,185,219,205,207,211,202,206,221,204,211,205,206,215,215,194,210,205,198,205,192,217,221,227,213,186,196,198,206,210,211,219,198,198,189,206,197,196,184,209,198,193,208,197,229,187,210,212,182,211,199,226,178,193,205,211,184,179,193,197,209,218,224,193,182,206,191,195,228,211,191,186,221,209,208,190,229,210,178,206,216,198,184,206,206,217,181,212,219,203,201,197,202,179,211,192,196,182,207,218,190,192,205,224,202,208,192,215,191,204,184,205,191,180,183,181,203,203,180,197,168,184,206,192,195,170,195,187,190,217,188,183,176,190,205,188,220,189,179,211,188,184,200,193,194,201,192,184,207,169,173,178,174,198,186,194,207,157,216,190,183,196,188,187,189,204,173,193,194,195,175,224,190,196,174,183,173,203,194,185,211,204,225,192,181,187,206,190,190,192,166,196,202,197,187,190,192,191,205,192,218,169,184,205,195,189,179,186,195,187,204,176,183,191,178,209,198,184,201,184,209,194,193,181,197,205,194,198,197,207,201,195,198,210,181,188,182,219,187,210,192,175,197,167,208,187,165,189,189,172,185,199,183,189,194,195,190,203,194,172,196,187,182,194,190,195,192,207,204,205,211,180,224,191,173,199,206,205,198,200,187,202,191,194,206,194,169,130,78,112,141,143,117,164,195,229,216,181,208,197,178,208,181,168,206,204,205,193,203,149,164,180,158,88,114,113,107,96,135,226,200,107,160,160,182,214,157,94,220,168,100,170,184,160,208,115,85,125,47,76,57,147,211,227,192,78,72,85,111,102,182,241,238,206,72,60,75,115,99,56,73,76,140,132,97,59,38,21,16,23,73,77,124,127,136,114,126,106,94,116,112,137,136,57,21,94,133,91,115,106,138,124,111,86,81,62,86,65,78,71,49,34,15,22,57,70,77,49,46,33,17,60,20,37,47,92,97,115,129,127,178,131,162,173,158,145,120,76,46,24,15,24,10,30,53,2,107,154,69,117,104,64,21,19,15,22,36,75,111,69,60,56,38,43,40,15,31,39,61,84,96,82,90,136,139,147,153,126,165,161,190,218,210,234,203,206,228,177,150,133,144,145,145,143,140,145,112,155,156,110,112,103,68,45,48,26,26,44,24,41,23,14,14,38,79,50,46,45,3,28,18,10,8,7,14,27,41,23,22,15,12,14,22,23,24,8,34,31,51,36,31,44,42,21,17,17,56,51,64,52,41,47,10,18,9,41,8,33,3,20,25,10,11,18,46,27,44,36,45,42,40,40,46,2,36,27,34,35,53,83,71,63,121,126,130,145,117,158,119,144,137,137,148,160,149,162,141,112,117,77,85,48,32,28,36,41,88,59,30,19,66,63,45,45,18,32,64,47,32,75,35,11,107,135,46,56,42,14,40,30,51,45,17,32,49,130,151,101,112,96,92,56,38,44,45,32,48,72,138,172,74,73,30,27,10,31,17,31,14,41,27,18,45,24,54,128,143,155,198,140,120,89,136,104,110,125,123,111,140,210,249,242,255,249,250,218,215,225,206,223,208,233,226,234,218,235,208,150,20,1,21,19,10,18,10,16,14,26,7,0,18,17,1,17,13,7,7,20,14,12,8,17,204,219,197,196,183,212,186,211,191,193,187,197,200,193,210,210,199,206,215,213,202,206,197,219,214,220,227,205,206,193,176,214,198,197,197,196,196,188,235,189,190,232,203,207,215,210,198,200,208,199,175,192,199,204,210,203,209,206,223,197,206,219,190,205,194,221,209,197,225,205,198,205,198,203,188,227,228,201,188,198,208,227,201,181,209,189,204,190,181,211,194,208,198,193,176,193,194,187,207,196,200,176,179,210,207,184,210,230,190,202,172,201,186,210,203,208,174,204,176,208,194,192,193,185,199,201,213,209,208,205,217,198,192,193,175,184,193,203,185,194,186,197,182,216,207,177,167,175,196,183,206,214,173,198,199,198,174,213,182,194,190,204,197,186,215,189,189,170,199,206,166,200,182,169,214,199,196,192,220,198,196,183,185,202,196,207,198,187,200,184,168,169,193,199,185,183,191,188,213,192,195,189,175,210,199,207,191,205,193,194,209,204,182,199,197,187,194,178,193,227,189,206,205,225,194,177,191,196,198,207,210,204,182,199,192,209,193,184,205,206,195,200,185,216,199,206,191,185,181,192,194,183,213,223,164,180,184,193,169,198,188,192,199,201,205,184,169,176,192,183,197,171,216,211,206,206,192,187,202,195,196,200,183,190,188,205,207,181,195,201,186,185,204,170,189,140,127,175,186,169,173,177,201,219,168,201,201,208,179,218,193,198,198,197,190,218,185,152,162,246,162,134,131,153,129,103,103,177,160,123,180,145,166,178,155,164,229,123,56,136,147,195,237,129,116,114,105,68,43,176,218,231,136,17,26,54,117,158,215,227,201,106,47,97,97,110,70,75,123,124,102,117,101,74,30,34,45,46,115,103,125,109,147,106,97,102,117,114,100,106,108,24,29,148,124,79,109,75,107,91,66,111,75,58,59,81,21,42,30,55,10,56,31,66,63,91,62,64,24,27,41,53,47,23,54,32,99,105,126,134,144,141,145,162,183,209,157,135,101,79,46,54,45,28,123,135,107,128,118,48,4,14,12,20,52,111,119,61,38,44,47,55,72,77,88,135,145,150,153,145,166,172,153,190,175,196,164,176,193,156,173,148,148,110,131,146,120,135,115,127,115,136,126,96,113,72,57,27,21,29,23,19,29,29,10,28,29,21,0,19,31,49,54,52,54,40,11,27,10,13,19,16,20,2,2,68,41,23,56,22,16,25,30,24,62,59,74,70,68,56,56,33,37,62,62,69,65,70,65,60,28,10,5,13,11,5,15,23,2,14,23,5,6,8,5,3,8,64,81,34,47,45,37,29,23,35,41,29,49,41,46,40,60,80,105,132,123,131,145,149,134,132,118,162,139,162,159,150,144,107,95,83,47,68,66,82,72,69,61,39,39,52,71,48,10,12,23,67,45,73,140,150,68,56,64,46,48,24,77,66,25,48,53,95,139,98,105,126,105,38,11,44,39,57,98,103,126,101,55,49,26,38,10,10,26,47,42,60,70,78,128,150,163,155,143,169,145,99,102,149,166,89,105,95,52,44,28,56,133,186,234,229,224,229,220,218,239,237,226,203,215,215,211,200,224,105,22,13,23,8,21,13,26,28,19,12,24,35,17,20,0,17,24,0,23,18,25,29,2,17,188,196,209,209,196,200,186,188,205,192,227,209,211,175,230,213,206,230,206,196,190,204,231,213,227,185,205,193,180,183,182,217,207,201,186,190,184,186,212,209,213,183,191,210,214,188,208,202,217,206,189,205,201,199,203,210,195,201,188,212,193,214,206,206,228,203,198,208,191,214,197,202,206,183,194,210,217,215,231,181,194,211,179,202,201,206,203,209,199,180,212,228,177,194,203,186,190,170,185,208,203,206,216,190,189,182,186,210,167,202,191,191,199,222,183,194,204,182,203,205,186,191,201,229,190,206,191,200,166,206,207,176,187,220,176,189,187,181,194,192,178,219,211,180,199,201,187,180,202,151,180,186,210,207,197,201,215,188,182,207,210,198,203,194,169,191,182,217,186,175,193,204,194,184,217,207,197,174,201,194,200,178,190,187,201,196,210,186,225,178,204,203,196,196,188,189,190,192,176,170,178,203,198,182,181,203,178,205,202,186,199,168,198,198,182,197,174,180,189,190,200,192,189,173,178,187,180,176,206,216,203,175,185,194,195,203,181,179,196,192,199,170,173,184,187,164,204,209,221,197,207,200,190,180,202,206,200,203,182,193,211,216,194,194,155,166,206,187,189,209,210,210,209,207,190,199,190,199,188,181,179,185,189,175,207,224,218,186,198,188,201,187,195,187,176,128,163,203,216,144,131,196,233,194,207,210,228,176,191,203,191,159,196,169,201,234,209,145,201,212,170,98,124,135,90,91,54,142,146,201,176,103,136,207,151,193,235,68,80,126,162,236,244,148,81,123,122,71,16,139,241,253,140,19,22,153,184,148,233,191,100,68,59,109,126,85,49,59,157,124,63,113,65,30,15,27,40,72,105,131,131,139,117,102,125,113,114,93,75,53,51,12,68,148,129,108,101,88,109,103,118,111,102,118,89,90,9,15,6,29,24,59,27,29,33,35,54,50,37,35,17,39,22,16,5,11,28,36,63,79,116,122,123,160,157,163,180,183,200,171,163,149,139,115,164,150,102,123,65,55,41,14,59,45,67,60,72,38,41,76,81,105,142,121,143,198,180,191,194,186,182,190,165,169,186,151,154,118,91,104,113,147,132,161,164,147,132,109,82,92,67,64,27,30,11,41,29,47,16,10,18,3,42,12,34,21,29,32,26,41,36,32,21,54,61,25,13,32,23,20,37,28,20,32,42,48,97,73,57,57,26,22,23,69,75,98,89,85,77,68,76,71,96,104,133,137,83,57,84,80,51,14,17,2,26,70,62,53,24,50,48,11,3,8,6,38,61,105,63,57,39,19,38,22,14,34,33,36,26,54,10,20,6,26,62,67,70,114,124,134,125,148,143,130,123,135,136,135,150,151,160,165,116,122,148,124,143,107,57,52,24,101,94,57,46,22,36,40,82,79,121,155,43,43,44,46,70,58,48,50,43,39,34,104,138,135,108,112,120,52,33,14,40,52,58,71,45,22,22,55,20,46,17,46,57,50,111,165,199,166,172,148,150,114,110,138,140,134,141,108,142,106,52,52,39,16,60,56,19,60,147,217,223,239,224,202,212,227,195,226,226,218,213,238,213,109,1,3,18,14,15,8,17,7,4,10,13,5,9,3,12,21,15,17,11,19,3,16,23,44,184,208,212,191,201,221,212,208,213,212,186,199,202,199,201,201,192,192,208,213,187,197,201,195,240,221,197,206,215,197,208,201,210,200,224,199,201,202,194,202,198,202,195,184,197,208,193,195,217,215,208,192,207,174,200,179,189,182,213,208,200,187,209,186,213,177,210,192,207,230,209,197,201,183,202,209,213,196,197,174,200,192,171,187,199,171,211,192,227,183,207,196,184,221,208,194,213,223,193,201,198,190,184,190,198,222,175,183,182,201,176,199,174,165,202,192,205,217,199,179,196,203,208,187,196,205,199,210,185,170,202,189,188,209,207,194,195,190,198,181,184,184,167,187,163,183,207,194,202,191,212,192,195,210,200,198,197,188,181,214,191,194,199,181,175,184,210,196,166,206,187,186,207,181,185,184,196,229,184,192,194,204,169,182,202,187,198,199,208,202,187,195,196,174,193,185,189,189,204,188,193,203,190,204,204,208,179,190,194,213,166,184,217,203,179,181,183,188,184,196,170,199,195,194,192,206,202,198,213,199,185,197,212,209,167,209,209,196,219,211,216,207,219,200,185,221,205,202,186,204,195,197,197,208,232,225,193,206,213,196,181,179,195,195,160,193,201,208,194,200,203,209,194,192,191,199,228,197,198,160,214,187,190,191,185,207,184,201,202,194,205,217,178,111,144,113,129,159,174,133,142,192,218,169,193,180,182,204,162,202,207,150,215,183,190,201,170,178,186,157,189,106,120,112,124,96,60,94,152,210,159,105,173,170,165,195,191,91,65,116,160,216,232,125,116,123,95,51,13,101,225,255,177,130,183,250,214,231,234,91,56,65,117,150,55,60,99,81,133,117,75,68,20,44,31,53,49,102,111,123,123,123,89,133,105,110,107,93,68,22,35,9,75,149,104,82,95,113,146,139,139,122,98,139,86,68,64,28,58,55,85,55,41,25,36,15,35,42,9,18,8,42,50,40,26,25,39,31,36,27,59,83,103,126,120,142,149,125,117,159,181,195,203,179,146,111,114,130,78,66,67,111,148,162,139,159,180,160,171,167,179,181,187,190,182,174,205,177,158,178,131,130,121,129,102,125,115,117,143,145,166,149,137,89,92,59,53,50,12,38,36,70,25,4,22,6,6,6,4,15,31,32,31,9,3,27,8,17,58,61,43,38,97,79,46,23,13,9,5,20,7,28,10,21,83,87,46,41,80,82,97,33,59,83,97,149,152,129,109,91,96,120,137,143,167,113,143,88,80,128,79,38,19,50,102,148,134,74,54,51,28,29,27,17,36,59,80,67,79,89,51,12,13,23,20,34,50,1,10,16,11,19,37,42,5,12,2,6,25,40,47,70,88,105,124,120,158,108,151,153,143,179,136,157,152,139,162,148,168,119,156,122,139,86,64,41,31,84,73,51,122,142,56,23,39,10,19,54,47,60,64,35,41,86,138,143,93,98,126,96,27,19,9,31,12,43,26,36,39,21,41,73,91,105,185,161,188,185,162,133,125,134,121,156,144,145,136,98,52,51,153,122,49,43,21,24,34,46,39,13,71,173,225,235,222,215,238,217,223,200,230,215,212,224,226,125,4,5,19,15,7,8,13,20,21,31,15,1,19,3,17,21,6,20,17,8,23,7,32,25,194,218,205,215,209,218,237,176,215,207,176,214,211,191,206,211,217,189,194,206,211,188,183,202,200,187,204,205,227,201,186,234,192,227,197,221,203,215,200,184,196,209,222,201,217,194,190,234,207,200,182,220,211,190,198,186,192,210,204,201,209,182,213,208,207,222,182,229,188,223,182,202,199,221,188,181,211,197,190,212,195,205,201,230,235,208,192,191,217,203,184,211,205,195,184,199,231,184,213,192,195,214,190,200,203,195,199,201,215,196,212,193,197,189,199,206,167,207,198,198,200,185,195,208,184,213,221,196,190,194,216,200,188,208,193,208,198,187,198,195,180,186,193,188,197,171,174,186,181,196,203,208,192,217,187,198,194,203,207,195,202,204,186,215,175,187,190,190,205,159,190,211,203,213,176,192,203,200,175,190,187,208,192,198,196,201,193,190,202,186,200,210,204,205,201,205,183,186,181,193,207,188,198,200,212,182,184,181,195,191,206,187,198,188,208,198,190,194,182,210,200,181,207,182,193,213,190,182,202,185,204,190,211,193,208,195,210,196,204,212,167,170,183,193,197,209,213,200,210,213,194,211,204,210,169,214,206,190,188,186,169,197,203,203,211,191,197,187,207,208,196,198,219,180,183,200,195,188,217,215,198,189,178,188,194,215,205,184,206,204,200,189,172,103,114,71,117,155,132,109,164,195,208,212,194,201,192,211,171,223,193,180,191,195,179,180,152,152,156,137,166,124,93,62,86,87,97,139,165,201,142,129,184,181,164,213,147,80,57,80,155,241,204,148,111,92,75,46,9,103,223,245,195,187,251,255,200,183,95,58,116,94,109,79,54,113,123,86,93,78,75,43,4,48,56,110,100,123,114,143,132,100,70,131,135,109,88,92,59,39,23,24,49,130,108,66,120,76,76,47,105,99,98,143,68,63,61,98,114,72,42,52,48,38,11,10,46,24,17,30,11,46,53,49,27,31,37,13,32,10,51,15,35,33,74,113,116,125,133,134,155,146,159,189,128,121,120,115,72,126,187,211,223,210,172,187,202,189,213,158,145,148,172,139,145,163,147,121,90,144,122,125,127,128,129,130,136,125,115,96,45,30,21,26,20,28,36,35,56,64,38,0,13,18,9,8,14,29,0,36,14,9,10,16,33,21,15,41,25,17,35,67,78,78,31,21,4,8,23,14,21,39,19,55,116,84,46,58,74,103,104,72,153,117,121,175,114,43,59,99,136,103,149,149,108,100,140,89,77,160,111,45,87,159,149,104,53,124,140,65,28,24,9,7,48,60,64,102,106,111,57,32,10,18,15,27,25,22,34,6,39,28,8,28,35,9,40,31,28,24,48,28,30,47,17,75,98,100,109,128,135,133,146,142,110,130,128,164,203,192,158,175,143,176,172,113,142,130,125,74,155,127,58,42,51,16,24,36,22,26,48,8,32,83,142,134,94,97,126,70,29,18,25,14,25,74,78,65,73,138,139,160,161,174,182,137,131,123,126,150,148,136,163,130,103,98,61,29,19,115,157,72,69,33,40,29,34,47,50,13,58,189,205,238,241,221,212,191,210,220,214,225,236,223,236,114,19,8,9,26,17,15,26,12,31,24,17,17,3,2,14,5,8,19,10,4,29,24,31,7,187,201,190,216,168,193,202,184,204,211,204,224,198,205,217,200,206,189,217,213,182,208,196,191,203,196,221,209,228,200,198,200,209,201,201,212,196,193,195,222,210,191,210,212,222,212,206,225,217,212,198,198,209,202,170,192,185,200,218,191,222,194,213,179,212,195,211,203,204,199,194,201,223,215,196,211,221,215,189,212,192,197,176,208,221,191,199,217,195,214,201,193,186,210,192,184,194,188,205,167,207,196,194,179,203,200,226,201,195,199,194,222,194,210,200,210,197,183,205,198,191,208,193,189,191,189,217,229,207,204,199,189,202,217,195,207,189,197,192,197,184,186,171,198,206,192,184,181,208,198,193,206,193,217,211,197,197,197,190,184,182,201,205,178,197,233,215,177,191,164,200,184,206,189,206,164,203,207,178,186,170,198,196,182,202,193,191,194,185,207,207,185,185,210,204,203,183,178,208,199,191,187,177,189,187,207,169,192,176,201,216,227,205,194,193,184,199,215,181,189,199,188,184,217,210,205,183,194,204,185,208,184,189,180,210,200,203,182,190,182,136,93,158,209,198,222,201,182,203,198,205,191,177,189,196,211,195,175,200,219,201,177,197,187,227,218,186,198,167,188,208,202,199,218,213,191,186,207,194,201,221,187,169,223,199,189,217,207,189,228,216,197,182,160,174,142,182,193,116,132,152,187,193,223,193,203,174,197,198,220,215,181,206,207,222,165,144,175,162,178,175,141,108,108,101,98,110,104,160,216,171,166,173,155,168,229,130,63,41,102,225,219,232,152,109,76,31,14,31,173,246,251,158,175,244,254,136,83,57,99,100,96,73,38,101,141,101,51,45,54,38,31,18,51,94,129,167,144,131,124,110,113,92,96,89,94,83,124,79,52,28,10,35,179,99,92,88,65,35,52,60,91,104,121,27,30,98,101,86,32,46,37,66,38,33,30,15,37,48,14,26,5,48,36,27,22,65,34,21,33,36,1,27,15,29,23,74,76,124,115,160,144,124,174,153,146,144,114,108,158,163,176,217,169,174,174,184,137,143,108,109,118,100,117,121,140,139,127,140,150,122,125,111,98,89,68,26,26,25,31,19,21,8,5,21,27,6,30,28,26,10,45,3,17,19,15,17,11,9,33,26,11,13,36,6,12,17,24,5,35,46,20,57,26,21,33,19,26,24,17,8,32,14,55,125,130,92,91,74,90,104,111,141,91,76,161,107,110,87,139,131,48,104,121,132,96,161,103,82,146,123,71,155,123,97,69,27,89,169,126,22,9,22,26,44,63,57,74,125,89,38,15,43,21,29,21,22,18,32,28,23,9,12,17,24,5,28,8,0,58,30,51,13,27,40,25,39,48,49,53,56,115,108,121,108,128,125,152,182,172,170,185,159,158,179,193,159,178,162,144,168,130,64,64,63,80,65,69,53,46,47,47,84,106,134,140,109,80,135,81,58,44,69,69,111,141,147,157,188,165,161,177,158,129,126,135,177,138,146,158,138,128,108,48,53,31,57,29,68,202,128,41,42,45,40,14,44,60,49,44,70,204,212,232,226,218,228,220,226,228,222,177,203,215,235,104,8,10,2,15,19,6,11,0,8,12,5,30,4,1,8,0,12,12,14,3,32,2,19,20,218,192,221,219,205,199,211,234,197,193,196,229,196,221,197,224,192,182,190,190,181,185,196,201,209,195,204,202,201,205,177,202,204,179,204,197,213,230,195,197,183,188,200,174,198,208,197,214,190,210,211,206,192,203,218,186,200,215,199,211,199,201,197,189,196,187,218,186,198,204,203,191,203,203,195,198,211,210,220,159,222,201,194,184,199,204,210,201,186,168,218,196,189,202,212,201,220,176,201,185,222,209,199,168,232,199,192,190,203,187,228,178,186,181,196,192,203,192,198,208,205,198,184,196,220,189,201,165,212,196,198,187,188,198,205,186,188,184,188,201,166,175,183,180,185,216,175,210,196,190,188,214,181,207,192,184,191,208,204,185,196,166,198,170,185,187,224,205,178,225,178,199,182,205,196,200,185,183,194,184,169,184,204,201,196,205,194,201,204,200,233,182,191,195,208,208,199,196,207,190,207,196,222,186,196,191,199,183,216,204,182,182,179,196,211,207,203,194,184,200,184,191,190,209,202,186,176,192,190,181,166,180,197,186,195,189,198,199,204,206,148,123,180,211,217,218,189,240,214,204,217,207,194,180,181,198,189,207,185,199,187,215,203,205,192,198,204,200,190,209,203,217,222,172,191,234,194,180,185,203,221,191,174,175,212,189,233,220,224,221,228,153,157,160,153,160,226,222,142,183,166,155,176,204,213,212,176,208,188,214,227,171,219,220,203,194,178,160,161,166,158,199,127,108,106,75,76,75,159,193,152,152,141,157,217,196,123,55,66,151,244,246,244,128,40,46,31,8,114,234,227,219,151,133,203,141,48,37,59,90,135,73,53,72,118,140,84,47,22,37,27,20,42,88,134,137,157,134,121,117,86,69,58,16,64,62,105,95,85,56,52,61,89,162,102,72,59,48,84,19,98,88,62,19,4,67,120,103,53,35,79,60,45,37,19,55,42,37,52,45,34,52,16,23,9,33,3,1,24,14,15,46,13,18,4,47,14,36,43,41,52,72,93,90,114,144,147,154,150,177,178,186,201,175,169,162,131,150,148,152,136,133,131,113,61,133,103,54,88,49,42,22,45,20,30,20,18,9,30,29,19,8,0,11,19,33,14,16,7,10,9,1,9,8,8,11,19,28,18,22,29,29,17,1,32,21,29,11,15,33,48,60,60,56,27,52,26,8,39,2,17,25,26,42,82,132,105,91,53,95,82,110,129,78,86,99,102,103,146,85,36,32,62,77,115,158,127,71,134,139,151,103,175,73,82,147,92,184,216,116,23,0,39,20,39,65,117,88,82,55,19,15,25,47,17,10,18,14,40,21,23,21,6,22,43,33,26,44,14,23,21,60,33,28,51,39,35,53,27,2,26,32,49,64,67,82,103,149,147,167,141,160,144,148,156,160,175,150,151,161,175,152,120,198,158,176,162,143,147,135,140,122,147,179,200,184,143,82,111,96,73,117,138,156,189,175,179,142,157,145,109,145,151,155,174,148,132,120,122,67,72,36,41,18,38,57,22,14,208,246,87,36,51,31,40,56,36,38,38,24,67,205,194,223,237,200,225,202,227,227,200,225,209,206,205,103,12,11,16,4,1,16,60,4,29,23,17,24,17,11,20,9,13,0,8,9,14,27,23,13,208,191,207,189,165,202,211,185,217,205,183,196,215,185,180,175,193,175,205,196,212,179,178,213,188,211,189,182,196,213,205,197,195,191,211,225,190,197,204,211,209,202,181,203,195,205,208,215,192,193,209,226,215,216,230,218,208,198,207,201,199,221,219,181,211,190,185,208,214,195,184,191,220,193,193,198,189,215,176,208,203,167,213,184,193,177,201,172,225,195,220,178,206,229,202,195,195,222,195,205,200,206,195,201,223,208,205,188,205,196,185,213,224,183,182,210,173,202,195,176,205,209,193,182,211,173,205,197,171,189,174,187,199,216,199,196,209,194,193,209,191,207,173,208,197,178,180,202,210,195,198,168,202,186,199,209,213,184,183,171,191,187,179,192,186,181,201,168,205,193,197,216,194,213,213,199,219,202,184,181,196,185,191,186,190,188,193,188,180,185,189,207,207,169,211,195,207,196,173,216,205,191,200,215,176,187,187,196,200,184,210,197,195,185,203,183,188,215,193,207,191,206,190,196,224,185,216,217,211,213,216,209,232,222,212,204,229,221,214,228,190,204,237,239,222,193,211,229,229,220,204,193,205,214,238,217,230,232,226,227,240,230,229,224,228,213,221,219,223,207,226,207,250,226,225,230,217,235,234,213,206,241,236,195,235,245,237,217,209,210,199,168,145,138,154,161,187,162,147,196,195,127,181,229,238,233,226,210,197,227,219,217,220,227,215,184,200,172,190,161,207,229,163,87,91,101,117,66,112,126,110,122,153,228,240,144,65,73,34,141,233,236,150,44,1,13,33,46,151,230,214,230,104,99,157,56,14,58,94,136,83,48,97,127,162,102,81,80,37,23,11,45,77,117,123,130,106,107,108,133,110,93,59,67,57,47,65,96,72,57,92,72,40,142,121,64,58,53,42,12,98,159,67,30,76,115,158,79,49,63,116,72,49,57,26,27,32,52,10,18,14,16,18,17,41,19,4,1,26,18,28,19,9,32,13,54,24,64,32,26,20,23,14,33,41,49,66,58,52,96,100,107,146,117,116,120,105,76,78,95,48,59,29,53,11,29,54,42,21,33,34,21,29,30,22,14,10,24,13,20,34,34,4,8,15,15,4,32,9,8,45,20,3,17,15,17,13,11,24,23,2,40,21,32,18,24,12,23,48,70,97,135,98,78,93,98,55,35,26,19,7,12,7,9,48,62,73,108,65,65,74,80,121,87,104,121,61,46,28,46,33,34,37,54,68,64,83,121,180,163,138,96,124,83,80,176,93,155,155,52,26,2,32,17,16,49,141,83,50,46,36,26,9,17,22,21,19,10,25,11,30,27,35,13,57,39,30,16,21,5,12,27,37,25,17,31,17,21,12,14,20,18,5,17,35,39,31,72,76,119,150,123,106,143,130,137,137,126,127,139,148,140,153,177,154,179,156,209,181,180,178,184,180,161,146,173,124,113,114,126,74,124,131,141,151,126,144,139,128,127,126,139,144,148,126,107,71,37,24,26,55,55,43,28,16,17,15,96,243,237,80,31,43,39,38,28,47,46,53,13,68,183,216,223,218,205,202,227,221,214,209,231,251,221,219,115,9,0,3,33,13,10,10,26,14,34,14,23,11,6,9,17,10,1,16,5,29,16,2,10,192,183,215,177,202,218,181,205,208,211,204,233,203,191,230,193,204,199,199,218,212,168,217,205,210,223,196,187,213,213,171,207,164,206,204,207,222,200,195,228,186,194,216,196,201,190,192,189,179,184,192,211,196,192,216,198,201,204,198,223,231,208,213,219,223,206,187,195,198,186,199,194,202,210,181,204,191,191,197,207,214,200,221,219,200,187,196,191,197,195,196,185,205,181,208,191,196,186,205,182,221,195,188,171,184,186,185,221,201,203,176,191,175,205,222,206,206,185,168,197,210,195,208,187,187,202,200,204,181,204,189,203,197,219,185,217,187,201,179,210,179,164,195,199,209,189,198,198,182,185,204,216,199,200,185,200,215,195,199,197,185,188,178,185,186,178,185,185,203,184,186,187,201,191,196,169,194,195,205,216,202,181,174,167,211,187,173,190,214,197,200,199,189,224,194,202,189,186,202,186,192,213,200,235,205,164,205,194,190,202,212,169,179,205,204,187,191,189,190,206,198,173,205,201,207,222,245,214,242,238,249,213,230,242,234,236,245,227,250,233,219,237,250,238,242,153,99,197,208,246,213,226,242,243,240,238,245,234,222,254,240,234,252,251,247,209,231,242,253,239,233,237,248,251,232,248,225,235,231,250,216,240,216,212,227,202,238,222,215,240,188,157,156,140,122,124,146,120,184,200,192,147,114,194,237,228,208,216,212,212,194,207,216,218,187,188,211,133,175,176,242,207,123,96,111,96,125,92,85,55,85,152,159,240,218,99,59,77,14,49,108,135,62,19,5,46,22,100,217,248,222,234,130,105,126,33,61,109,101,84,68,81,130,144,71,54,50,38,30,34,49,93,122,118,156,127,133,126,151,122,55,47,101,92,73,51,96,87,51,109,72,54,25,60,67,70,43,51,36,49,148,182,80,79,123,163,156,70,62,147,94,72,65,64,65,19,13,27,29,17,24,25,32,16,33,29,31,19,28,17,17,10,22,32,5,13,34,10,30,37,31,12,30,39,33,28,27,23,17,2,9,30,37,30,23,32,21,43,29,26,21,24,43,11,28,18,40,45,19,25,54,26,31,6,8,5,24,23,8,21,55,19,36,27,17,11,14,25,41,10,24,17,4,29,18,20,19,19,29,18,27,22,14,42,30,29,26,8,123,176,199,191,181,175,193,152,62,23,12,2,50,8,19,9,15,62,98,116,50,62,96,97,127,74,97,95,21,36,24,42,58,24,41,33,74,54,59,123,139,116,97,82,93,50,67,129,34,70,39,23,17,47,29,5,74,148,144,113,86,83,39,43,47,6,25,25,10,21,44,46,29,35,60,20,9,12,22,15,13,1,22,25,42,35,18,43,12,12,27,2,37,43,59,42,55,14,36,36,37,27,36,30,83,94,95,90,112,107,128,137,164,127,130,159,131,153,114,125,134,149,144,140,132,154,140,142,130,108,139,122,110,148,133,163,152,120,130,153,158,123,94,107,75,42,48,44,48,31,35,44,43,47,21,23,15,23,35,127,231,171,55,54,34,18,24,25,69,49,64,18,75,211,210,224,239,231,232,213,232,223,231,218,231,223,221,129,5,6,2,12,0,25,11,4,9,12,26,20,0,15,1,14,31,0,20,30,4,2,13,19,211,221,208,211,207,193,196,200,212,197,223,195,177,194,201,178,228,213,210,190,203,201,204,170,213,217,220,225,188,219,187,209,191,195,223,196,198,211,179,186,209,214,207,199,201,209,222,226,192,206,192,199,232,208,221,222,216,216,206,166,193,183,207,207,210,193,220,215,184,196,175,205,188,206,213,188,224,184,206,197,202,185,197,191,207,174,199,214,212,208,200,205,178,164,211,207,206,193,178,196,206,213,194,203,213,214,200,213,207,212,177,216,206,189,185,187,174,194,200,190,198,194,208,170,187,202,211,196,210,205,173,202,174,203,199,172,187,211,211,184,185,203,209,171,179,191,192,166,179,200,197,189,199,187,213,190,210,176,206,204,198,199,212,207,189,200,195,191,208,203,187,194,217,198,206,197,187,224,216,199,198,185,181,183,213,191,200,204,203,190,180,201,192,206,201,202,159,182,168,187,192,176,200,163,169,232,188,197,192,204,210,208,195,198,193,187,212,207,206,206,184,180,194,178,211,212,209,206,210,186,212,190,185,207,204,188,157,188,156,207,206,180,155,153,120,97,51,119,151,185,163,169,164,160,140,141,167,138,160,152,159,134,138,169,187,137,153,154,160,138,154,158,133,146,138,130,139,125,131,126,163,124,148,167,135,109,144,186,151,134,130,140,117,116,127,136,152,118,165,220,208,142,71,111,130,109,134,125,120,153,125,143,119,143,125,156,108,65,100,113,131,124,79,116,141,127,109,105,102,72,79,92,76,152,102,35,50,70,23,17,24,54,44,40,46,43,66,153,246,249,224,250,160,113,89,83,115,98,96,65,91,123,127,72,88,56,20,33,29,12,64,139,142,138,147,131,149,122,129,98,62,50,76,78,63,68,80,73,37,61,31,41,12,84,81,81,38,33,109,107,52,70,106,110,109,109,109,51,34,84,86,62,64,58,60,31,30,7,42,34,49,22,29,34,29,10,29,19,18,12,2,12,11,18,21,16,18,27,7,7,51,38,36,26,32,17,0,21,23,27,9,19,25,15,29,16,20,44,36,31,24,23,9,29,23,34,52,72,36,36,9,13,39,10,32,43,23,16,27,30,14,16,33,32,13,38,7,15,6,27,9,17,37,29,41,14,33,26,16,35,15,18,10,35,13,34,8,11,49,178,181,191,149,172,198,145,77,23,23,23,13,37,32,24,41,93,95,54,40,87,47,137,135,63,52,54,37,47,37,52,41,54,64,98,54,39,33,62,92,149,164,101,111,55,63,116,67,27,10,23,24,5,38,53,125,160,133,155,130,136,153,156,63,19,35,40,12,14,18,31,19,6,29,37,3,10,29,21,20,13,42,2,31,14,11,32,39,12,30,24,22,7,35,26,28,15,18,44,26,13,31,30,42,34,55,54,27,43,54,73,57,105,106,77,115,119,127,98,134,109,133,125,112,152,157,151,113,130,186,155,112,122,126,125,94,103,80,45,31,53,34,29,11,27,28,36,20,36,32,49,39,43,24,18,29,40,28,76,131,94,64,66,40,35,34,29,62,43,66,60,162,211,219,205,225,206,214,237,213,233,237,209,241,197,221,114,4,1,1,7,9,19,36,3,45,15,2,12,2,26,0,23,27,16,17,2,27,20,23,20,203,235,209,194,200,187,200,218,180,207,201,201,191,189,217,224,244,250,219,205,177,211,211,201,186,213,201,185,200,191,209,195,211,185,193,178,186,202,179,212,196,188,195,186,201,200,203,211,230,182,198,223,200,203,207,220,203,187,198,193,183,208,210,209,203,192,204,219,200,198,191,197,199,225,211,187,216,201,182,227,196,196,215,192,171,204,196,194,220,230,197,217,189,214,210,178,208,190,202,206,205,188,226,197,182,175,199,192,177,201,183,204,212,196,190,201,183,188,204,193,216,182,197,200,188,183,176,218,170,185,199,175,174,197,213,200,197,201,192,188,192,195,175,196,198,210,209,206,190,217,188,211,194,185,188,193,210,195,197,200,209,186,187,186,186,176,172,201,191,191,182,192,193,198,196,188,189,214,190,190,191,202,196,178,213,181,174,179,204,216,178,199,188,194,206,202,175,207,185,193,190,188,201,169,192,172,198,205,174,196,185,210,230,186,198,197,197,198,188,196,168,191,175,113,64,74,69,50,74,63,60,78,48,53,42,27,19,10,45,106,76,28,6,14,17,15,8,19,22,29,16,17,14,44,5,7,25,1,4,14,3,2,12,23,57,0,10,4,9,33,12,9,5,42,53,17,10,7,1,7,15,2,46,59,46,9,42,81,43,46,69,59,64,87,98,110,145,136,195,224,173,147,89,55,4,5,19,10,22,19,7,20,3,25,38,37,54,9,44,54,27,36,21,95,134,117,126,106,100,42,50,31,0,43,30,4,65,59,61,44,31,21,29,56,42,45,49,199,237,254,215,225,190,138,126,98,90,84,67,67,111,130,76,100,70,39,24,47,40,98,127,143,129,125,135,124,146,144,118,81,49,55,68,68,69,46,55,14,46,77,44,46,33,60,74,35,50,129,234,126,14,36,55,111,72,51,94,48,40,52,51,76,46,55,28,34,26,30,17,22,22,3,13,25,23,22,7,28,31,21,7,16,28,15,16,17,20,35,31,13,36,11,47,12,47,24,9,28,23,37,14,39,7,7,14,12,14,24,20,55,0,15,34,22,13,48,59,85,47,56,36,12,5,16,18,34,34,21,52,15,26,26,41,3,29,15,31,27,25,22,14,20,27,17,20,17,24,13,17,43,28,9,6,17,25,27,18,35,31,49,95,66,60,20,45,49,56,25,19,10,26,19,36,48,63,138,84,38,44,59,58,108,147,49,39,19,33,19,46,85,87,126,122,44,39,45,72,61,28,135,169,121,99,54,63,98,96,56,24,1,23,9,9,38,74,131,172,141,145,116,160,174,59,21,35,17,29,31,32,42,15,12,14,16,11,15,8,21,45,33,23,32,17,38,2,20,31,34,33,15,12,25,27,31,21,13,26,18,61,29,41,24,9,29,35,39,41,41,36,23,32,19,33,28,45,59,75,77,84,84,101,113,99,72,80,77,77,126,100,64,33,65,49,48,61,24,12,29,20,16,28,25,5,22,4,31,23,29,48,24,40,7,27,27,22,25,50,51,46,47,62,56,19,31,56,47,42,39,6,55,186,226,208,246,207,220,205,220,235,212,224,231,218,225,208,106,3,1,6,0,12,34,20,11,19,13,25,25,16,1,12,4,25,16,45,15,40,36,8,19,211,210,210,196,183,196,195,208,199,206,210,179,198,188,218,177,246,246,194,214,214,192,179,196,191,204,214,189,185,204,215,176,212,219,222,201,211,185,201,209,196,224,210,208,204,227,202,196,210,209,186,200,208,227,209,213,190,204,203,209,210,190,189,211,203,233,212,219,199,230,198,202,207,226,205,197,206,196,188,212,190,181,205,175,195,200,204,182,170,216,213,218,210,176,216,188,194,209,209,192,201,196,202,209,172,209,192,202,213,208,194,187,195,200,196,197,201,214,218,204,208,190,212,191,193,196,177,194,193,191,222,172,189,219,198,194,197,180,182,180,181,193,196,207,215,169,199,182,190,184,219,192,192,216,190,163,216,202,180,202,213,202,194,175,212,207,190,178,188,200,196,191,166,198,215,200,185,191,179,201,196,192,192,209,199,192,207,184,203,200,193,197,205,210,198,189,186,190,202,217,188,176,200,182,194,180,193,214,194,182,198,190,192,199,181,191,210,220,210,200,202,188,181,118,85,73,76,70,32,68,65,77,78,57,77,74,55,65,57,77,95,79,63,63,73,67,57,57,54,27,56,57,36,30,47,44,33,35,34,33,62,57,55,51,104,48,37,39,67,42,41,40,53,34,94,52,21,34,40,18,44,35,87,100,42,5,16,73,56,36,62,82,79,45,77,91,154,167,208,179,137,99,38,44,31,13,15,6,58,34,10,25,15,9,12,24,39,6,25,41,9,19,33,87,109,104,157,131,118,76,49,36,17,61,27,20,66,75,75,47,30,22,36,32,61,38,56,183,184,191,157,160,178,152,122,109,70,38,90,121,106,91,62,64,62,7,34,59,89,128,116,118,149,119,129,118,115,80,108,37,56,74,70,126,59,41,4,28,119,131,46,49,31,43,11,38,81,168,200,100,45,81,80,55,34,77,62,46,68,48,10,32,33,25,50,51,34,7,34,38,64,30,11,24,17,25,33,18,16,39,18,22,16,32,38,17,26,41,22,2,30,13,10,26,18,6,25,33,4,9,12,5,16,17,7,12,42,7,28,16,23,40,28,28,29,29,67,62,56,28,6,13,13,3,25,25,25,31,15,23,4,36,17,24,12,22,23,31,15,23,31,10,28,13,19,2,13,35,17,11,27,19,34,4,28,21,11,14,32,59,46,52,39,45,37,26,19,25,25,41,15,25,39,34,93,122,65,51,50,61,66,73,86,28,52,9,17,22,30,91,123,130,65,42,14,9,5,59,72,90,130,131,101,47,69,62,126,88,37,23,8,2,37,16,48,62,96,80,85,102,52,52,54,30,51,11,19,33,24,25,30,31,18,18,28,16,11,23,21,27,7,19,18,28,13,56,2,20,22,10,15,5,35,37,18,4,5,6,9,35,53,49,28,36,47,48,63,78,59,41,11,32,16,33,24,49,16,50,14,32,36,26,61,50,45,48,20,27,42,25,40,46,43,43,49,36,18,8,32,16,19,29,20,22,33,14,14,35,21,27,18,28,35,23,49,38,12,6,20,43,46,69,34,34,33,43,46,25,38,144,243,236,209,214,216,236,199,237,213,203,219,239,232,209,214,90,11,11,10,18,2,3,25,13,17,41,31,14,2,6,12,8,20,3,3,2,14,17,20,8,210,192,221,194,206,217,206,211,204,227,217,206,194,224,220,203,250,254,217,207,213,205,184,215,202,178,211,198,194,189,203,184,192,197,213,195,196,207,216,205,217,172,198,193,207,207,196,183,185,193,170,208,197,192,199,190,209,200,202,211,219,203,197,203,190,194,228,209,219,196,188,206,181,196,190,208,201,182,210,190,205,210,194,202,160,186,214,201,190,201,206,209,215,177,201,183,176,182,196,180,205,207,198,196,170,217,195,179,211,186,195,196,184,198,196,186,194,197,196,201,188,204,204,206,197,214,213,201,191,176,190,187,213,187,179,200,193,188,187,181,182,178,197,202,217,204,185,196,181,219,191,211,195,196,191,199,183,204,194,203,208,198,210,182,197,195,183,197,208,211,168,228,191,186,167,196,177,220,183,201,213,195,198,174,196,192,221,187,182,206,210,233,200,204,194,188,208,198,196,211,192,202,196,172,214,190,199,200,189,191,179,205,185,194,174,190,182,208,174,211,196,183,201,197,184,207,193,213,177,213,206,216,209,208,211,196,215,185,180,167,194,223,226,211,208,214,203,216,213,187,227,230,208,203,231,217,223,211,186,224,221,224,210,220,221,198,211,241,218,212,209,241,226,214,205,201,211,230,189,227,202,190,205,197,161,161,178,192,173,173,172,136,141,122,105,96,157,204,203,162,69,73,45,73,160,183,187,186,173,140,158,175,103,166,144,147,115,111,180,142,156,138,113,89,72,132,134,114,83,19,92,77,97,95,70,52,59,61,42,12,17,4,36,50,49,45,114,202,116,84,99,60,85,110,133,79,32,71,134,124,83,86,42,10,21,14,62,122,140,117,119,94,127,102,125,107,113,40,42,57,38,34,80,76,49,36,61,132,208,207,125,80,45,16,63,143,160,64,84,90,55,75,63,66,36,70,41,44,82,43,57,8,28,45,50,65,38,23,25,34,44,37,22,11,30,22,35,22,30,22,12,12,26,31,8,8,6,28,4,19,22,18,30,11,24,22,12,8,19,5,19,13,2,29,24,23,25,34,19,14,13,33,39,26,44,70,119,67,46,28,15,16,40,16,5,2,1,27,13,29,16,23,4,17,26,32,27,49,14,21,48,35,13,1,30,14,25,21,47,25,21,17,18,30,12,29,28,34,20,50,46,49,10,50,47,13,20,7,13,4,15,4,41,50,102,121,63,82,68,53,59,85,67,47,26,55,35,40,65,120,154,106,52,17,42,12,25,44,72,88,107,117,68,40,67,94,110,107,57,31,13,28,10,2,38,49,74,43,17,17,35,29,12,24,34,15,22,28,44,30,30,14,22,7,38,25,26,24,22,13,20,28,6,13,23,50,17,7,35,26,28,13,22,33,16,25,24,31,41,23,36,6,21,49,90,131,107,56,62,25,29,28,28,42,34,25,22,36,31,16,15,10,34,19,23,34,28,27,23,47,57,44,48,38,27,55,9,13,52,38,6,4,25,20,25,16,27,24,28,22,30,15,39,15,29,20,44,23,27,50,41,86,49,69,62,55,40,88,165,234,250,210,218,209,222,212,195,216,212,199,218,227,209,222,234,116,12,0,9,27,4,24,40,12,33,17,22,1,5,14,14,8,8,11,23,8,5,12,35,16,199,229,201,188,210,193,221,209,194,203,189,198,203,206,206,192,243,219,214,194,205,184,203,202,208,205,193,204,189,200,214,212,204,183,209,211,218,215,231,192,220,196,213,197,205,218,211,221,193,215,202,221,190,185,212,206,207,179,229,200,187,208,185,199,203,199,199,200,207,208,193,210,192,213,212,191,223,205,198,202,196,190,210,194,236,185,185,194,182,198,219,207,219,206,203,210,206,210,209,195,184,209,189,185,189,191,213,176,195,206,202,196,182,195,218,223,216,199,199,210,201,192,197,176,190,209,213,202,198,172,223,217,197,214,207,183,192,186,172,215,183,170,220,215,200,178,187,207,198,200,185,181,192,202,204,218,172,174,207,190,182,187,194,204,184,185,193,191,171,180,181,210,202,218,190,191,197,235,191,192,196,195,167,178,187,183,203,185,189,204,215,202,184,195,175,195,217,218,210,190,201,194,175,181,188,206,180,211,176,213,192,206,203,191,181,210,184,184,194,214,209,180,223,226,226,212,222,223,247,252,236,255,214,233,245,238,247,224,217,242,187,239,247,245,246,236,247,246,237,248,243,255,245,247,249,232,241,239,237,241,218,234,227,247,249,229,237,252,248,232,235,255,241,247,249,253,239,251,246,241,247,249,246,241,223,218,248,231,202,189,194,180,182,180,130,137,158,173,141,111,64,33,47,186,247,245,243,253,255,211,251,253,216,247,226,220,177,202,246,248,204,178,211,167,123,151,128,86,58,29,111,217,198,160,161,122,74,68,25,33,16,10,34,53,48,57,181,182,84,76,89,57,35,96,84,18,53,132,132,77,78,48,25,29,42,74,104,115,120,106,112,106,102,124,129,146,103,46,33,53,40,19,55,29,39,57,108,174,196,162,92,43,3,9,104,229,101,18,48,73,95,59,91,81,63,34,35,41,64,17,20,54,59,34,104,72,68,40,40,10,55,24,40,37,2,6,28,54,14,32,5,35,22,31,23,11,37,23,27,23,19,51,13,23,16,10,18,16,31,43,23,40,8,9,7,8,4,13,15,15,32,19,59,36,36,78,145,98,52,47,50,27,13,14,5,14,24,47,17,3,30,18,20,34,33,35,8,25,6,38,27,33,17,16,12,2,40,24,43,17,22,21,40,21,21,21,58,36,36,80,56,50,37,53,35,27,58,1,20,14,23,19,28,58,131,124,93,88,90,110,57,52,39,27,18,40,23,19,74,182,186,97,60,62,39,13,35,20,68,112,95,67,46,144,134,71,135,150,56,23,3,19,29,55,36,66,49,20,29,34,15,43,28,13,30,16,35,29,27,27,31,18,9,17,18,25,14,30,21,11,39,10,6,42,27,22,13,12,21,44,25,9,27,10,18,40,20,12,10,19,31,15,26,100,113,100,75,54,56,58,12,20,17,40,42,17,25,65,50,12,22,26,39,13,41,34,29,24,22,50,54,37,55,41,19,15,27,21,9,7,30,21,15,46,21,45,37,40,30,34,20,40,33,41,38,55,97,25,21,23,46,56,79,55,43,89,101,218,238,244,235,230,227,230,194,210,225,197,204,201,212,231,201,224,216,110,10,14,11,17,15,3,27,2,7,20,8,14,17,22,20,41,9,28,11,5,10,14,28,22,191,209,207,201,206,213,181,219,200,153,213,212,208,212,190,187,225,236,188,209,227,190,219,188,207,212,189,200,210,195,243,204,193,208,218,197,204,184,204,200,226,196,211,208,208,204,211,221,207,209,187,189,205,194,190,206,226,196,205,203,197,196,198,206,203,203,212,191,206,202,206,199,209,207,196,189,191,199,204,174,187,208,204,228,196,198,212,213,208,192,186,198,174,200,189,210,196,199,198,204,201,207,197,198,200,177,186,208,198,186,199,202,188,203,203,234,191,212,203,202,180,227,210,212,234,197,196,189,194,179,208,195,213,199,185,196,216,178,185,198,202,192,181,200,209,200,183,207,183,190,171,187,214,204,220,200,187,178,203,196,197,204,203,188,194,187,207,194,196,200,198,179,177,189,187,194,182,186,195,182,174,181,181,199,198,202,190,205,187,214,171,189,195,217,182,182,175,194,198,172,208,194,198,202,188,189,188,174,171,198,207,200,189,169,190,197,194,207,193,211,196,211,205,206,199,224,213,220,211,226,206,214,194,225,201,202,201,185,211,209,198,193,212,211,230,183,219,201,216,209,198,207,221,182,212,226,204,212,224,226,220,208,206,218,210,223,227,217,203,221,210,216,232,228,192,199,191,209,208,198,206,242,224,228,186,206,227,206,176,188,170,196,170,173,116,154,192,197,123,93,35,67,136,186,242,199,222,215,226,184,218,179,198,217,171,152,161,211,245,183,155,182,223,181,132,118,93,78,61,66,125,250,205,224,186,77,45,24,57,88,62,44,48,100,39,155,229,225,111,123,99,30,15,19,73,70,135,107,83,82,47,14,20,45,45,103,133,124,121,140,112,93,149,148,111,147,105,91,43,33,35,39,37,44,19,90,166,198,183,88,47,27,68,32,60,147,95,59,69,61,72,73,63,101,45,42,47,41,52,25,29,45,84,92,71,53,76,76,41,56,54,24,35,4,38,5,34,15,38,51,16,23,32,10,5,31,39,47,18,20,30,11,33,21,32,37,27,22,5,10,10,21,18,11,0,23,2,11,14,37,18,22,13,21,18,59,98,77,15,24,13,2,2,21,9,6,31,30,26,22,32,42,37,22,21,28,19,5,36,18,11,2,40,33,19,14,12,26,34,21,43,37,34,30,9,17,4,43,22,62,108,68,89,84,69,38,19,31,28,23,28,35,43,52,156,98,82,134,119,106,36,30,27,33,20,39,85,94,75,98,42,78,53,80,32,25,44,27,69,71,58,56,101,172,101,65,129,146,92,9,2,46,24,38,56,69,61,47,59,39,24,46,7,26,16,31,7,21,48,19,28,36,28,19,33,26,28,21,18,13,36,16,39,17,13,15,12,9,38,29,17,29,22,32,27,18,35,18,32,18,34,22,43,99,79,63,92,81,60,27,25,18,15,20,12,22,6,2,10,24,21,11,23,18,3,36,27,44,4,29,22,60,17,29,23,21,25,11,11,27,32,32,24,16,36,11,5,20,28,29,15,26,50,32,45,159,109,60,27,30,46,43,95,86,98,75,75,196,230,232,233,219,213,215,242,187,220,191,186,232,213,213,221,219,217,116,19,15,15,22,13,16,2,42,32,52,1,2,0,17,0,31,12,6,12,25,9,0,35,8,202,180,193,184,184,200,209,216,208,199,207,210,190,195,204,177,228,206,156,171,211,206,215,215,230,191,216,230,210,204,204,195,206,220,196,210,206,197,184,206,189,192,203,201,209,211,192,189,181,181,183,176,201,191,207,193,164,209,204,183,195,190,197,202,199,188,191,174,206,197,220,189,192,188,203,187,218,210,169,195,208,202,174,206,205,224,200,221,201,213,197,193,200,213,212,213,209,213,221,201,196,203,207,201,184,190,208,195,192,208,176,178,174,200,166,199,203,196,172,177,174,190,204,192,180,199,179,198,206,196,196,179,193,222,209,189,180,191,201,231,201,173,173,182,181,196,194,178,197,213,184,194,210,184,168,196,200,177,220,174,194,200,186,233,185,186,203,177,202,194,201,204,193,212,185,217,190,204,205,174,197,191,177,181,190,186,199,182,191,174,196,212,192,173,178,182,183,182,189,191,192,180,191,172,175,182,199,193,216,222,190,198,215,178,189,163,189,183,211,194,199,204,193,184,188,204,194,172,202,208,214,207,189,208,205,199,187,202,197,203,186,165,151,207,222,167,200,195,181,204,200,192,180,184,205,176,177,201,194,184,179,212,209,179,212,220,176,190,198,191,179,193,171,199,172,208,213,220,183,192,206,186,220,178,178,191,220,174,122,139,156,209,191,152,125,162,213,192,159,62,19,94,193,243,229,206,204,197,206,190,196,158,160,185,141,147,181,221,170,61,91,190,237,201,138,135,78,62,78,83,118,211,156,135,130,77,42,16,56,156,58,68,59,79,156,242,248,187,69,91,53,6,33,38,70,97,63,65,63,36,46,22,39,66,84,110,125,103,118,124,104,119,107,100,112,103,122,92,64,49,26,31,19,28,93,150,151,149,103,72,109,126,113,86,41,55,72,83,71,46,73,70,39,72,22,37,45,36,38,13,20,44,54,89,69,48,38,81,84,105,109,78,63,20,28,21,34,11,22,23,41,29,33,24,15,44,41,21,30,46,41,35,23,15,25,21,9,25,13,7,45,9,23,8,25,2,20,21,22,25,15,23,28,35,41,69,71,53,24,17,33,4,15,27,44,14,14,21,16,27,10,9,26,34,22,31,38,23,21,6,18,29,11,14,16,9,17,21,19,34,20,8,23,25,31,24,56,47,73,105,127,138,163,116,81,53,23,22,21,24,26,22,23,56,121,138,146,110,152,149,94,44,9,16,28,102,141,99,100,50,54,46,57,64,92,74,52,21,12,82,96,83,103,155,125,107,166,176,80,21,3,20,45,33,52,57,38,54,86,72,52,36,44,23,26,35,28,13,30,24,20,38,38,23,33,8,9,20,43,10,19,26,35,20,26,14,27,26,27,23,40,3,32,34,30,36,24,34,13,17,22,15,88,64,63,124,114,100,55,40,21,28,25,13,20,40,46,46,12,34,20,9,6,21,6,10,16,16,41,44,26,60,22,1,42,8,14,58,10,33,21,35,20,12,17,33,43,50,24,29,27,36,26,32,131,199,80,78,92,66,80,74,79,72,76,48,28,141,231,219,239,192,228,198,182,184,193,198,227,217,220,227,211,213,223,119,13,13,6,16,21,10,9,19,11,6,21,5,29,13,8,20,31,11,30,9,11,0,10,5,198,217,214,196,191,216,208,214,209,188,171,188,215,200,231,215,239,237,194,199,182,203,200,190,185,190,201,164,202,186,211,177,193,216,205,212,201,219,189,216,206,200,195,208,192,189,204,198,217,202,203,204,213,200,189,220,211,195,189,206,223,181,199,204,187,217,210,210,199,188,226,182,205,209,197,210,216,198,203,189,207,203,183,178,208,187,179,207,217,197,226,185,201,191,180,192,206,204,192,195,198,195,188,197,179,197,205,198,186,183,201,185,162,205,205,201,182,184,199,185,203,190,197,197,196,207,201,200,207,198,187,203,220,198,212,229,182,192,224,184,204,190,215,166,185,223,233,208,218,215,206,222,199,190,193,225,228,192,200,155,203,192,197,176,194,195,187,192,203,192,188,226,205,214,192,190,167,202,188,167,188,195,198,167,187,211,192,173,220,209,196,173,193,206,196,200,198,182,187,202,201,180,200,187,195,200,199,191,187,187,199,196,210,212,214,188,201,214,201,172,201,196,180,220,193,202,194,211,170,199,194,197,190,179,195,210,209,184,203,193,204,192,164,179,211,201,185,187,209,190,191,199,181,175,230,198,221,197,188,173,201,200,192,208,193,182,223,198,195,180,177,196,193,189,179,206,182,187,199,199,190,200,172,193,162,184,205,168,100,104,143,168,184,122,135,218,233,187,129,37,91,196,229,218,232,209,209,172,183,174,184,119,190,184,168,186,227,218,108,40,125,176,154,177,159,90,100,102,86,75,133,153,100,68,73,38,19,23,8,74,77,85,62,143,242,237,222,99,53,59,0,40,10,71,113,85,64,47,51,34,35,48,66,89,154,125,106,123,95,108,128,128,107,100,109,123,132,94,119,127,51,37,22,126,131,202,178,103,149,185,156,146,117,55,36,74,46,60,72,67,94,83,52,42,36,26,41,67,56,27,40,37,77,65,52,60,101,96,144,125,142,111,63,23,34,30,7,63,35,50,20,8,17,25,60,35,24,25,42,31,33,34,30,28,31,33,12,3,19,1,12,29,25,32,17,14,12,27,36,16,24,11,18,87,106,121,118,47,85,54,42,26,8,13,33,30,30,10,31,15,29,11,16,13,29,39,29,21,19,34,40,29,22,23,17,22,22,58,43,37,41,24,36,21,30,27,73,59,118,189,220,149,146,132,101,57,32,16,11,9,14,10,30,24,73,104,129,150,129,152,104,52,43,17,6,96,183,149,133,73,44,55,100,116,156,99,20,26,9,72,101,97,128,133,132,134,172,159,30,13,13,12,21,54,45,102,106,129,172,128,86,66,33,24,18,31,27,16,34,47,19,30,11,29,24,35,20,22,34,51,23,34,14,12,20,27,14,27,32,30,8,57,24,16,21,14,28,39,39,5,24,31,54,47,69,116,134,74,40,26,6,8,11,9,33,53,20,40,30,51,19,16,13,21,27,53,20,15,50,43,62,29,6,29,28,17,27,43,2,34,14,37,14,16,25,23,9,26,39,28,21,38,30,96,250,167,97,148,119,68,33,48,52,72,48,60,43,130,207,186,228,212,199,219,226,218,185,237,208,221,231,217,214,209,209,127,6,1,0,8,19,16,25,18,17,11,15,3,1,21,15,27,21,11,20,17,7,23,8,0,215,193,179,200,223,191,199,174,209,205,212,198,180,202,188,195,250,235,174,202,204,202,215,204,205,193,200,201,205,200,211,217,202,227,182,208,203,198,220,190,202,205,192,210,196,201,205,192,208,196,187,176,221,189,219,202,188,198,201,197,211,202,196,213,197,230,199,200,205,211,202,214,190,229,209,212,195,203,199,185,181,202,189,211,219,224,211,203,197,192,165,200,205,200,210,172,185,187,193,180,203,202,211,186,194,197,175,207,196,176,187,189,206,199,182,208,176,195,186,194,197,189,179,203,178,174,185,204,186,189,186,188,218,187,222,197,199,189,212,190,224,201,195,195,189,181,156,174,174,189,179,207,190,214,172,199,203,215,189,215,183,192,204,202,205,194,198,205,211,200,177,209,186,214,192,201,200,179,207,215,188,183,189,190,175,191,184,224,198,209,181,190,178,213,218,192,197,169,187,196,203,191,211,190,182,197,188,202,186,199,208,200,215,208,180,174,206,194,187,209,204,213,234,185,192,182,167,193,196,207,189,196,181,185,181,185,175,193,199,187,209,216,181,173,193,212,198,186,186,190,198,206,176,199,207,215,209,196,198,171,217,197,196,175,171,219,191,200,224,201,222,178,206,195,200,194,198,177,199,204,198,184,196,188,215,209,154,165,134,95,143,159,122,77,180,190,203,187,143,134,186,224,214,214,177,193,187,196,195,174,160,120,189,214,131,163,187,171,81,21,99,106,129,169,144,102,95,95,83,125,122,79,59,37,24,90,41,24,50,38,48,59,150,215,237,193,99,102,90,46,19,32,28,88,83,53,44,57,26,20,22,54,57,116,125,110,130,108,87,89,103,116,114,119,112,114,118,123,133,71,43,37,64,183,195,149,86,72,123,147,102,81,49,49,29,35,40,28,66,69,55,58,24,83,62,24,89,72,55,36,48,47,91,121,127,115,113,119,122,126,115,86,63,46,12,30,35,1,72,39,38,25,28,54,27,32,35,23,25,22,36,22,27,32,26,39,42,67,39,35,18,11,20,21,27,26,8,34,20,8,15,23,145,210,185,179,116,132,139,122,89,27,11,45,24,25,35,22,29,35,8,18,24,26,18,25,13,6,24,28,31,18,6,54,21,29,19,7,5,36,44,17,55,35,27,53,75,68,112,154,155,120,142,114,80,27,44,21,20,15,24,42,22,24,76,82,83,82,110,140,71,57,38,29,22,140,154,97,97,84,43,111,148,130,146,61,36,24,21,64,91,119,108,137,118,134,152,95,39,23,18,12,11,40,139,158,193,167,152,164,140,66,36,41,29,4,7,18,16,17,23,27,28,21,34,47,56,41,46,14,13,24,31,15,47,26,16,18,44,18,48,9,36,34,53,19,27,42,7,49,30,20,39,57,72,92,79,50,23,18,27,2,6,9,24,21,25,27,20,35,8,23,38,31,34,29,35,13,27,32,22,14,6,22,11,42,43,38,16,15,11,45,11,23,30,22,54,7,17,22,67,10,79,238,228,118,98,178,138,107,59,38,48,15,88,55,35,156,223,200,226,226,221,217,229,226,212,211,205,222,206,202,214,176,220,115,0,0,8,8,25,15,13,39,17,10,14,13,17,2,7,20,6,18,28,13,31,5,7,12,198,188,213,203,192,189,186,195,214,160,209,175,201,196,200,208,241,252,230,204,220,201,193,195,201,192,180,187,185,195,184,192,209,201,216,212,206,201,190,189,195,199,185,196,213,194,199,190,207,212,192,199,183,193,202,210,218,175,194,221,204,213,182,229,198,203,193,180,176,186,200,176,202,202,210,174,190,192,193,180,193,199,205,189,194,196,206,210,201,199,203,194,202,207,193,196,219,191,181,189,205,198,177,208,191,178,205,187,181,177,189,210,203,216,189,205,189,181,194,191,181,220,225,211,208,232,208,212,175,212,189,206,203,202,214,192,201,190,213,198,183,210,222,208,210,196,201,180,210,202,196,198,202,186,186,184,204,191,165,183,189,215,174,198,176,181,194,190,181,200,192,201,198,199,193,226,200,207,198,188,188,190,188,194,214,191,200,189,191,192,189,194,220,191,203,179,180,187,189,185,185,193,183,207,197,184,207,184,205,186,198,215,194,182,201,189,166,186,219,176,205,192,198,195,195,196,202,202,202,203,199,184,189,193,196,190,193,202,195,185,184,191,200,186,182,185,186,209,224,213,201,190,205,203,190,212,191,208,224,195,185,170,213,205,193,186,202,195,204,195,198,205,188,192,169,174,179,182,200,201,198,200,204,204,198,174,139,185,130,182,168,195,122,106,167,242,207,205,214,198,199,228,193,216,217,196,171,182,196,160,168,170,161,98,104,166,203,147,85,38,23,49,168,191,176,108,104,69,89,134,90,64,7,49,121,82,53,57,73,63,74,158,217,242,185,90,89,107,132,79,31,87,137,117,90,59,21,32,7,49,49,60,113,100,105,102,104,118,111,112,88,109,110,106,133,95,110,90,79,54,20,33,152,200,149,91,0,62,104,96,99,44,62,86,17,49,30,52,58,34,70,53,27,39,51,36,45,82,62,34,48,108,132,131,135,113,112,109,108,95,85,67,18,37,20,15,26,36,43,42,34,23,51,6,53,39,27,42,24,27,34,25,23,12,38,41,21,39,26,18,28,24,26,21,22,35,15,13,2,8,34,30,112,180,186,205,192,184,194,197,101,22,27,7,30,12,23,26,27,7,17,13,32,45,26,15,19,20,34,38,37,31,44,55,23,20,39,50,4,42,7,30,28,61,24,29,92,69,75,82,143,109,122,100,50,68,38,7,20,12,14,13,5,42,77,89,91,83,119,101,52,44,32,44,42,107,86,49,80,119,52,93,142,68,54,37,30,46,24,54,83,96,109,93,136,71,143,107,42,13,8,2,13,60,108,216,152,145,156,117,104,60,16,37,16,9,5,16,25,18,51,30,19,28,22,32,33,31,30,25,41,16,42,25,24,58,11,34,38,34,31,36,13,7,35,35,22,29,26,5,39,54,55,79,112,88,84,57,34,40,33,20,39,18,26,29,11,22,38,25,39,30,44,59,22,6,36,5,56,24,15,30,28,35,16,39,22,15,25,46,41,20,19,17,41,52,35,60,37,28,20,67,208,238,172,62,67,140,127,53,66,39,17,45,61,29,39,173,217,219,220,219,240,221,225,230,192,208,223,219,223,214,215,207,218,90,13,2,8,12,9,15,33,22,21,7,25,30,31,30,5,3,5,18,7,17,16,8,0,7,186,202,214,201,219,223,161,218,193,180,192,194,203,215,193,174,245,253,249,206,197,233,177,186,180,190,208,189,200,214,194,180,216,185,181,190,207,188,210,219,238,195,182,222,176,189,207,190,194,184,208,203,174,182,180,184,179,202,206,205,158,172,179,228,210,228,202,216,218,209,205,196,199,191,218,203,174,203,202,186,204,184,207,199,169,202,218,200,208,201,168,198,200,190,197,203,219,215,204,210,180,190,207,208,171,208,193,198,191,213,180,205,203,192,169,210,194,195,238,189,190,199,205,210,213,206,187,207,223,204,207,203,181,199,184,191,179,154,197,204,183,186,171,194,213,205,186,198,194,175,189,196,201,179,205,208,195,197,195,190,176,203,201,189,187,179,195,193,176,168,194,171,180,224,199,214,192,192,197,198,210,227,183,190,211,194,197,209,228,198,164,200,197,191,185,176,196,220,181,185,209,173,191,177,196,221,226,191,201,209,197,194,196,205,204,201,197,214,195,197,193,214,203,191,181,187,195,201,212,211,183,238,161,201,194,203,179,205,199,184,206,214,190,199,207,178,197,211,192,204,205,224,207,196,198,176,180,211,195,185,207,198,192,212,205,199,174,212,213,190,197,197,202,207,176,209,209,186,181,212,192,167,187,190,194,160,137,159,162,161,217,202,126,142,199,206,193,199,179,220,222,208,213,215,178,191,178,206,207,145,182,158,123,23,143,232,224,135,51,20,20,117,214,218,142,87,96,43,83,75,49,13,32,209,195,80,27,86,35,117,176,237,248,150,95,79,97,108,70,125,115,143,113,98,84,45,39,34,31,59,97,118,116,122,116,95,97,100,110,84,104,103,116,110,106,119,85,63,73,40,23,46,140,122,60,20,15,69,157,142,73,45,103,75,35,29,56,26,46,44,54,67,34,40,39,14,29,57,79,77,144,161,193,148,115,105,98,69,84,49,51,39,17,21,8,25,48,34,24,38,38,16,43,40,37,39,44,51,20,22,29,50,36,27,21,43,38,45,23,32,41,20,27,23,28,48,38,20,29,18,45,22,61,111,123,134,139,137,143,87,77,17,17,61,9,14,33,26,41,22,45,12,3,42,18,34,9,11,14,24,39,42,26,23,30,11,22,22,34,13,2,20,22,30,57,35,70,48,67,85,94,117,79,72,77,46,35,23,17,25,28,2,17,47,59,88,140,119,100,102,100,33,22,18,46,67,101,91,122,133,126,140,120,72,58,42,22,11,34,76,85,118,118,136,121,164,186,91,14,17,41,29,22,52,117,135,184,124,113,119,108,77,56,54,34,34,32,31,30,32,20,42,22,33,41,51,40,31,33,17,37,20,50,22,51,19,27,36,46,27,9,6,25,29,53,30,27,31,17,23,19,90,119,192,145,118,91,82,71,65,41,8,15,9,26,44,5,24,20,22,10,10,14,32,29,32,19,15,18,22,45,21,27,14,25,41,42,15,16,4,27,17,27,52,21,18,46,20,28,54,65,114,231,222,102,51,46,58,57,67,69,47,51,42,46,15,112,236,218,207,204,219,229,205,228,202,213,223,217,202,224,217,219,195,217,109,5,0,21,11,8,20,21,27,16,33,30,26,10,0,2,4,21,19,7,17,5,2,0,12,218,195,206,199,193,214,204,205,204,180,199,193,179,202,231,212,249,243,189,201,200,187,206,179,205,205,196,184,212,207,212,205,197,202,189,197,192,191,206,210,201,216,194,175,189,200,204,217,202,166,201,207,193,184,202,196,184,181,199,207,207,211,197,187,200,221,198,209,223,182,206,212,177,185,201,198,189,194,209,200,233,202,188,218,212,216,173,188,195,205,219,192,175,195,191,191,208,192,168,190,199,185,189,205,201,165,225,196,180,205,217,202,202,192,202,198,195,201,207,198,195,210,192,208,215,197,175,193,206,213,182,226,208,182,189,190,178,190,188,201,211,210,193,217,180,201,199,201,194,188,197,201,187,195,197,188,193,187,177,195,181,174,189,184,183,197,200,208,190,203,163,181,205,211,184,176,183,177,186,190,184,193,185,173,200,199,185,194,169,200,195,165,197,214,184,184,190,163,205,195,199,189,178,189,178,192,217,185,189,199,203,180,216,195,171,194,217,181,212,195,205,195,201,192,209,174,171,189,217,181,194,187,235,201,208,211,207,215,196,220,195,176,205,209,186,155,205,185,195,232,209,189,182,196,192,201,202,181,186,167,176,198,222,186,176,194,152,169,205,191,178,214,212,185,184,203,206,180,228,203,192,197,166,192,198,153,108,121,160,172,206,166,133,170,226,201,195,190,206,230,188,211,193,199,180,187,207,224,185,149,167,163,105,90,196,228,152,85,40,10,51,174,217,198,117,56,46,40,57,43,25,44,99,203,138,53,49,76,118,234,248,243,118,72,93,109,98,48,103,154,141,143,99,98,46,24,22,53,80,118,107,124,131,109,117,86,121,140,118,106,85,85,88,94,84,74,115,94,37,33,35,100,119,52,17,56,117,122,129,73,88,79,71,61,42,48,27,78,45,66,66,45,46,55,17,1,15,45,109,157,153,191,163,109,112,110,83,18,12,38,38,61,42,26,34,12,13,42,32,28,25,35,33,40,32,37,40,38,32,41,48,10,36,12,43,20,42,28,29,35,37,27,46,30,29,23,31,35,46,25,39,23,47,74,29,28,30,41,38,29,37,8,26,27,19,32,42,25,53,14,25,28,23,30,12,26,17,6,15,19,23,50,20,42,48,17,17,13,37,18,33,31,38,22,14,35,70,77,83,73,109,63,48,66,61,66,34,32,17,20,6,23,67,43,34,80,81,85,106,132,85,71,57,38,19,39,75,108,131,111,88,85,96,71,117,54,39,30,40,117,104,145,85,106,143,146,134,42,29,9,11,11,51,39,102,166,117,116,79,75,59,67,45,36,25,35,12,29,25,25,9,21,25,10,43,41,29,42,38,29,47,36,18,33,43,4,24,30,15,27,15,17,55,10,36,26,23,44,26,36,32,98,188,177,207,158,125,141,165,139,35,7,14,27,28,34,40,20,18,7,37,41,26,20,23,15,8,49,21,25,35,14,53,28,10,24,22,16,26,12,32,22,12,27,40,19,46,29,39,40,83,141,186,131,97,99,38,48,78,63,63,31,38,28,32,124,209,240,233,214,211,217,224,210,202,216,190,219,219,231,221,207,222,196,215,95,5,14,9,27,6,14,29,28,3,14,30,12,5,4,7,22,8,13,20,22,7,13,28,10,206,194,209,186,201,218,205,187,197,198,199,194,211,188,199,175,254,238,151,202,191,187,180,180,228,193,197,208,192,201,202,213,191,199,197,204,202,186,207,202,187,222,185,190,200,198,202,197,158,190,196,182,204,184,217,188,200,192,182,226,195,173,206,202,217,204,183,192,209,191,202,202,211,216,195,217,202,196,181,217,210,177,167,183,186,171,203,193,186,219,220,240,188,197,191,166,216,178,202,202,202,201,182,221,172,221,171,188,207,187,193,207,202,195,207,214,204,184,207,181,214,199,186,192,226,214,203,221,197,210,219,187,201,198,183,195,213,203,212,200,193,186,186,218,193,175,191,207,194,182,171,207,216,192,195,186,183,192,193,197,180,189,196,175,205,200,199,191,181,210,178,209,191,186,188,195,226,195,173,184,204,205,201,177,189,163,184,152,209,206,172,195,177,191,201,187,189,176,205,190,172,197,195,177,172,203,192,178,193,193,194,195,196,202,203,207,221,186,197,190,223,185,208,201,193,178,192,198,202,220,201,183,172,216,190,223,198,225,210,210,181,216,217,186,217,173,182,193,190,218,165,183,178,206,207,194,206,204,163,201,189,192,213,179,200,196,169,203,200,181,212,169,182,198,176,210,206,216,206,178,175,193,204,190,153,146,114,135,173,156,196,141,104,171,202,174,200,174,194,229,203,193,178,206,185,180,218,200,165,162,223,178,94,55,140,118,62,53,90,62,93,241,246,158,113,63,70,31,46,41,11,20,91,88,86,52,34,108,196,253,244,128,79,79,89,63,52,69,91,126,62,84,69,29,40,12,44,78,132,163,130,121,106,122,98,127,98,97,122,115,145,122,90,64,64,102,110,129,44,34,56,112,60,13,46,161,159,97,93,125,105,84,72,113,30,33,32,7,44,17,58,55,39,22,30,29,40,55,56,147,160,130,127,97,91,50,34,50,36,39,61,50,42,28,37,15,28,16,46,18,43,38,21,36,55,33,4,29,12,55,36,35,43,9,17,59,33,30,31,21,35,27,32,19,12,64,38,20,27,13,17,18,42,43,42,35,42,20,39,15,36,3,16,15,22,26,35,29,37,36,2,28,24,19,15,34,16,26,12,17,1,46,27,43,42,21,34,31,43,19,18,11,38,24,34,43,60,56,83,81,63,92,45,46,44,74,45,49,10,19,24,22,44,37,31,76,91,106,134,151,108,60,35,22,8,8,9,35,87,58,30,66,34,60,54,26,39,45,79,135,92,177,104,55,53,83,57,21,22,31,25,24,41,35,109,45,110,70,52,52,41,85,43,32,25,10,28,19,34,43,23,26,18,13,27,40,36,47,22,23,68,39,43,30,19,23,21,31,37,39,26,22,24,12,32,7,43,29,23,33,34,75,127,141,184,170,168,187,176,102,55,35,19,23,35,10,16,19,14,34,32,22,30,40,11,9,49,26,25,21,41,11,11,20,21,21,12,35,19,17,24,30,27,61,31,29,13,47,31,46,107,183,188,134,147,134,97,99,76,55,67,55,23,21,126,229,249,253,225,227,204,231,199,206,209,190,202,215,201,197,209,202,200,200,217,114,7,23,5,12,14,3,3,27,1,16,15,17,12,4,15,15,4,12,16,18,29,25,28,28,203,196,201,197,184,187,203,219,183,204,214,208,191,184,207,194,243,227,170,225,200,227,207,193,183,202,208,199,215,198,203,209,211,199,192,207,226,231,202,190,217,218,191,187,185,187,206,207,192,181,192,195,214,190,207,195,225,184,194,189,205,208,213,196,207,202,176,189,233,202,215,176,210,184,198,197,199,202,215,216,208,200,214,203,200,197,205,202,197,229,202,214,216,199,202,172,211,197,192,216,214,220,193,211,173,191,227,214,202,184,189,218,198,231,193,192,196,211,190,202,201,222,188,192,212,184,216,190,194,190,203,188,199,175,189,193,168,206,185,184,175,191,204,190,207,203,199,201,200,173,215,190,191,182,203,185,203,202,191,194,172,197,171,193,213,190,168,190,208,217,201,198,200,225,174,200,224,180,188,212,192,190,200,199,170,180,206,193,213,165,190,176,191,179,213,192,216,199,189,181,187,189,207,184,174,210,206,196,220,197,178,195,214,200,197,186,208,194,208,213,189,178,184,186,197,218,203,171,185,164,190,189,203,214,196,211,193,199,191,193,194,211,195,198,205,186,174,155,205,212,186,188,206,198,191,204,176,210,208,175,220,188,213,194,193,199,209,199,199,178,189,216,211,224,190,184,201,185,202,180,186,190,220,188,146,179,121,117,163,158,151,119,133,168,192,195,210,202,191,209,167,205,204,221,217,183,205,203,183,174,207,161,43,30,27,64,77,134,182,104,152,228,244,170,115,69,54,21,61,156,41,91,195,137,24,41,132,236,255,246,119,86,79,106,81,32,76,118,144,94,83,51,41,36,7,35,81,119,115,137,111,89,100,103,105,103,88,114,114,126,134,93,88,111,98,116,145,123,108,66,69,35,14,52,131,205,159,77,120,147,146,108,122,88,41,33,34,22,22,53,32,41,25,73,30,47,15,18,35,76,103,95,69,46,48,22,13,36,24,25,42,51,44,46,43,21,29,37,13,42,28,15,14,29,16,33,39,30,19,34,44,37,19,49,31,35,12,64,31,18,41,57,57,42,26,21,24,35,24,38,50,7,42,55,36,21,39,36,34,28,15,4,21,28,36,36,44,29,28,8,36,40,53,15,11,23,40,52,8,27,29,37,25,15,30,45,21,43,28,15,29,23,26,41,37,38,60,91,104,66,111,110,49,69,54,68,76,50,19,8,51,20,18,46,84,145,166,142,183,139,105,65,60,47,30,24,21,62,127,94,79,98,78,39,20,21,18,45,121,100,113,195,149,128,70,76,53,6,7,3,3,13,37,47,59,66,63,76,81,24,58,32,68,35,22,30,14,28,24,33,40,38,38,27,22,43,5,28,29,25,42,45,29,36,12,28,40,14,39,46,52,18,39,16,25,15,56,34,39,56,32,25,69,86,68,102,116,100,56,50,27,19,12,21,27,34,37,14,13,38,18,43,12,28,39,21,16,33,42,21,21,31,15,31,13,16,30,32,31,33,40,34,36,12,24,12,25,51,50,68,104,127,142,132,141,146,149,133,129,112,74,53,15,28,156,208,227,239,220,206,221,216,241,206,235,198,194,212,222,220,203,212,211,225,218,109,16,13,6,6,1,29,6,9,18,29,21,25,6,21,27,35,18,22,32,31,14,14,11,22,187,218,196,196,188,194,192,184,201,202,200,197,200,204,186,190,238,241,176,205,192,220,177,196,203,203,193,204,215,199,214,228,189,215,189,221,206,205,194,212,197,182,210,235,194,196,203,223,180,184,192,200,198,208,226,200,202,178,203,175,190,209,177,208,193,217,186,188,200,187,185,203,196,201,224,194,196,189,193,218,200,209,213,215,210,186,222,186,211,210,214,198,183,192,216,196,198,210,194,204,205,204,207,205,200,203,200,190,193,200,196,200,181,205,180,192,215,184,203,186,189,207,200,197,179,180,179,205,180,224,215,223,203,183,189,210,177,202,188,180,179,181,232,202,205,222,199,194,188,184,196,182,193,188,204,191,176,189,188,177,197,190,203,185,186,193,211,189,189,189,194,215,197,186,175,200,218,157,178,223,179,190,209,177,198,218,188,200,196,215,202,203,185,180,180,200,192,206,189,204,183,191,194,193,193,199,195,211,199,180,192,201,193,196,213,193,220,222,212,194,220,207,205,195,211,186,192,191,164,190,200,193,204,197,185,202,210,185,201,207,185,206,188,192,204,180,226,189,179,208,222,189,187,216,165,182,221,210,235,191,198,182,186,207,203,180,211,181,186,199,184,198,195,165,193,189,202,201,209,189,206,210,201,184,181,172,175,159,196,162,105,110,151,192,188,205,215,215,190,212,212,189,175,155,214,215,203,193,162,223,201,116,56,17,12,115,180,183,224,148,202,223,163,164,95,33,19,7,156,239,84,146,157,103,63,144,242,252,220,89,60,91,103,61,57,77,135,144,101,66,51,20,27,36,59,79,141,110,125,110,89,105,89,109,114,94,104,104,112,98,80,90,86,93,99,87,129,129,113,73,35,8,66,169,103,89,105,35,103,147,96,110,120,61,63,39,26,37,13,4,48,49,20,12,30,54,37,56,28,41,58,40,41,28,13,4,10,32,22,10,17,60,43,55,72,19,7,31,27,50,15,37,48,39,38,34,9,28,34,23,47,33,41,34,60,27,51,33,53,14,53,46,59,45,31,14,26,42,27,40,12,15,51,46,66,24,50,71,51,38,20,3,22,39,22,27,44,41,24,16,49,9,42,18,37,10,50,16,47,31,35,35,59,34,13,45,10,26,35,48,38,46,46,25,40,18,106,150,169,138,162,179,122,153,179,144,178,103,47,2,13,18,48,7,158,203,123,87,58,43,67,47,46,68,27,25,53,97,181,168,177,189,118,24,38,31,38,99,103,84,65,89,131,161,135,144,70,38,54,23,39,25,45,71,106,89,110,102,78,69,59,74,65,75,46,10,9,20,37,58,46,53,9,34,20,37,25,54,18,43,3,25,25,36,22,15,16,38,32,14,20,9,30,25,3,17,19,32,27,20,7,70,47,58,26,45,42,8,14,38,32,23,12,20,25,31,41,53,26,43,29,59,35,50,40,27,35,36,15,37,48,31,28,59,14,34,34,43,32,42,25,40,28,23,66,56,62,104,129,44,59,49,85,100,108,119,148,173,168,162,82,40,5,51,195,219,245,252,236,243,225,251,239,218,223,217,217,240,223,188,201,217,223,209,234,132,12,6,23,34,11,1,6,25,22,17,2,24,5,2,17,0,12,13,1,5,6,19,19,9,200,189,200,182,195,178,169,171,209,204,214,203,215,207,212,206,233,254,138,187,208,198,150,207,216,216,206,199,199,216,195,199,215,197,205,204,204,181,211,184,199,192,214,186,210,209,226,204,212,198,204,222,206,222,190,191,199,205,203,201,233,193,219,215,205,212,199,216,207,196,205,186,200,208,206,196,192,211,215,219,195,182,219,201,197,209,188,204,207,213,193,182,222,196,218,214,185,207,195,188,193,213,217,205,213,191,206,219,220,211,213,205,191,170,197,213,200,221,195,202,190,217,215,182,198,191,188,190,182,188,212,184,176,202,179,179,184,183,225,173,201,203,205,191,182,192,190,176,208,181,189,183,166,208,180,173,202,191,194,211,203,168,187,233,178,175,175,170,171,195,203,175,215,201,210,204,184,191,204,178,158,202,201,225,166,193,202,185,202,202,185,197,208,207,192,181,182,184,213,188,176,192,179,208,189,176,179,182,187,207,186,185,184,228,185,201,163,197,155,199,182,191,211,196,170,172,192,177,194,180,200,189,171,207,188,217,199,187,192,187,210,186,185,175,193,187,221,186,168,196,201,203,212,222,196,185,212,196,208,213,212,210,201,182,212,190,176,207,189,184,168,196,163,173,187,199,178,183,173,210,210,212,207,184,131,141,172,184,194,120,104,143,153,225,178,184,207,216,203,216,183,177,189,188,204,208,223,189,163,199,193,77,83,88,39,103,139,111,149,148,219,195,117,121,44,8,3,27,136,103,63,78,81,52,158,250,244,188,73,119,93,98,57,85,98,111,148,83,131,86,23,11,25,51,90,121,132,122,130,105,113,78,114,137,112,129,102,104,130,103,87,65,98,93,127,124,96,114,56,26,40,29,154,203,51,44,36,80,93,101,101,98,71,49,45,61,44,23,31,38,62,28,40,32,32,39,24,26,24,48,27,48,45,41,14,10,14,27,35,35,40,43,60,74,70,52,48,23,50,60,51,54,56,62,89,40,53,43,56,64,62,58,83,63,77,55,68,60,62,64,41,65,58,29,58,45,73,51,33,48,46,44,70,87,106,64,140,97,60,59,7,3,12,36,48,18,10,32,26,27,27,66,46,28,35,30,41,9,48,48,18,43,36,44,18,15,20,24,45,21,48,42,37,34,38,20,78,158,187,129,162,158,172,151,159,200,145,107,27,29,5,25,46,29,138,168,89,58,49,26,35,73,58,83,69,51,21,134,140,84,129,123,137,37,42,32,58,102,97,107,41,39,47,48,91,127,126,28,32,19,46,26,55,161,157,158,188,185,167,168,181,186,185,114,44,22,10,16,25,44,17,36,70,29,25,35,35,52,18,48,49,23,40,18,12,12,28,27,41,38,25,41,13,24,38,30,11,24,41,25,16,40,64,40,45,46,22,25,6,30,4,44,38,41,34,26,55,74,7,9,31,30,30,57,49,18,44,12,41,33,60,28,27,40,47,25,38,24,30,14,37,23,20,40,33,33,68,194,223,146,107,69,66,86,57,96,128,172,143,127,62,112,136,137,209,209,218,236,213,222,235,232,231,240,221,223,239,188,219,211,215,222,215,222,227,120,15,10,0,11,24,1,19,22,16,19,19,29,2,3,4,0,13,4,14,21,9,17,20,29,202,194,207,173,190,198,210,197,207,216,212,188,194,197,203,187,234,230,110,154,194,168,206,195,195,216,169,182,203,184,185,213,197,209,203,158,213,192,187,191,204,212,200,196,200,178,183,193,197,201,183,219,211,195,228,194,201,190,190,196,221,187,205,216,227,175,169,210,219,178,222,190,199,194,199,189,231,220,206,207,188,206,206,222,206,204,191,217,192,194,231,205,183,197,211,201,193,200,217,195,198,204,209,191,173,219,188,192,183,202,211,201,192,198,199,189,186,186,183,198,202,185,185,167,211,187,194,198,181,208,197,212,180,199,192,183,211,187,199,189,216,182,211,202,191,209,201,191,187,177,223,212,177,195,198,190,170,190,178,205,202,207,194,177,204,207,176,201,182,205,186,200,201,200,210,198,166,233,189,206,213,181,183,202,185,199,186,176,183,197,191,202,190,169,214,194,202,194,184,199,215,184,190,199,180,191,211,183,211,198,200,218,174,191,188,181,181,184,204,196,192,189,229,217,194,191,160,185,190,197,180,177,197,210,198,212,190,191,177,202,194,215,161,186,171,182,188,192,178,174,173,187,186,197,217,169,236,179,187,193,201,202,181,193,201,191,182,192,192,196,183,185,187,190,201,209,189,201,220,174,195,208,201,166,118,116,132,161,150,142,81,94,219,225,171,139,153,222,193,186,198,193,178,190,196,209,191,159,119,148,77,63,171,107,30,61,53,24,64,158,223,160,9,10,14,45,9,67,110,77,83,153,121,195,248,243,177,75,80,83,85,42,62,124,152,145,82,70,67,82,6,19,74,102,124,139,114,120,135,141,122,118,108,106,89,110,103,122,113,83,80,72,128,136,107,109,79,44,38,77,136,94,128,124,34,89,65,92,90,121,140,56,23,31,123,145,40,18,12,50,44,41,40,14,9,30,30,18,16,24,24,28,8,36,8,31,20,20,15,19,29,53,58,72,61,40,83,58,63,154,162,196,210,161,180,195,174,180,144,167,164,176,189,152,161,171,169,172,180,196,187,207,176,165,160,139,108,134,160,142,123,94,135,179,188,186,140,160,139,79,44,28,60,37,46,93,78,60,67,84,84,88,91,86,65,79,75,74,55,77,95,53,72,59,80,39,68,50,65,58,60,64,64,44,85,66,99,170,151,42,39,55,35,55,67,35,44,40,22,9,28,6,20,34,73,91,64,69,70,106,93,103,90,62,65,38,57,117,60,32,14,44,110,100,23,71,72,120,117,97,91,64,54,73,29,96,150,43,17,11,33,26,55,194,190,174,167,166,215,177,163,166,187,137,16,29,47,12,31,10,13,14,41,47,20,25,30,27,28,41,20,33,68,23,41,32,44,26,19,30,23,36,26,10,35,23,29,52,27,24,41,36,50,57,44,26,41,50,21,32,9,46,12,37,23,43,10,33,32,24,67,30,37,53,15,38,10,16,59,21,30,36,41,14,37,11,50,20,27,35,62,28,16,22,50,24,40,163,184,150,155,110,72,94,110,65,50,88,44,40,47,117,195,193,199,152,141,163,171,183,142,170,209,205,217,246,238,222,231,232,202,198,214,225,214,96,8,8,19,0,1,1,40,35,17,34,10,23,33,34,6,24,22,41,12,22,13,31,28,12,186,186,215,198,181,177,206,180,211,196,192,201,218,177,209,202,240,217,93,161,165,199,189,183,201,221,210,196,169,197,217,198,191,188,194,197,176,212,199,205,187,187,192,204,187,208,219,222,194,197,180,198,187,176,211,199,213,197,186,170,208,207,198,218,200,193,184,198,183,191,185,191,203,183,191,194,188,201,228,198,181,186,185,213,215,207,201,190,206,207,186,201,184,182,206,188,199,210,214,183,208,194,221,180,209,208,195,192,193,194,195,206,204,204,210,171,211,211,181,200,204,180,210,197,179,193,210,185,191,217,200,201,207,209,201,187,188,193,223,212,212,171,199,181,188,183,189,201,182,191,174,193,201,180,197,209,210,203,186,186,188,196,204,196,198,194,192,203,209,206,204,214,191,181,202,187,200,205,168,195,169,204,204,208,196,192,211,167,185,181,193,182,182,175,190,196,196,213,214,188,188,176,195,189,193,189,175,199,181,186,190,191,153,204,195,201,182,197,189,181,205,184,198,185,221,182,214,190,184,200,194,190,176,190,211,197,202,185,192,186,188,202,178,214,209,190,201,172,167,200,187,204,195,211,201,185,199,198,206,197,198,190,203,177,183,203,202,194,199,201,191,173,201,197,208,210,232,221,213,198,202,229,181,144,134,87,99,140,136,116,78,90,147,233,203,135,109,194,162,224,182,204,172,190,181,210,192,94,46,42,51,184,212,127,29,75,107,66,97,146,184,82,21,10,100,139,64,189,122,97,180,219,249,252,242,141,99,82,85,102,55,84,150,140,103,69,56,47,48,22,18,59,110,130,119,123,98,107,100,119,105,84,100,103,119,124,120,90,98,112,115,111,125,65,91,76,12,88,140,142,185,95,63,61,44,102,92,47,109,60,87,46,45,78,161,148,84,15,14,7,15,8,27,12,26,5,41,63,44,36,48,30,16,31,30,34,40,18,36,37,27,60,53,92,99,98,62,81,69,124,135,150,145,157,131,155,173,175,150,178,164,222,145,156,159,128,162,142,173,179,174,168,123,172,130,153,119,176,153,158,163,104,148,168,185,169,168,142,118,111,90,95,85,89,82,102,107,105,185,138,164,196,161,174,150,182,200,187,175,165,199,198,174,157,181,171,155,144,157,152,179,184,172,167,195,196,217,166,63,32,37,48,55,61,27,24,58,34,33,14,11,24,35,9,52,82,112,118,137,133,95,109,118,117,110,78,70,36,25,35,18,30,72,74,62,62,92,139,174,163,152,120,99,90,84,163,134,44,15,11,36,6,93,187,157,107,52,60,55,39,55,69,61,46,52,30,25,25,35,19,59,28,39,43,28,50,35,39,43,40,50,39,32,48,33,60,33,61,40,39,59,55,37,32,35,30,43,40,11,11,19,44,62,76,99,57,89,34,51,32,32,29,13,8,21,22,8,26,35,35,33,50,38,53,43,37,17,30,32,38,46,20,22,36,41,54,47,31,32,32,32,42,49,31,11,52,34,155,111,114,90,49,52,112,103,50,23,38,44,32,27,154,137,146,141,114,129,132,132,146,119,126,125,144,176,198,199,226,243,234,236,244,236,207,205,102,7,10,32,1,8,5,29,19,16,37,3,26,11,4,8,36,13,17,1,15,14,3,41,5,163,194,201,214,198,199,208,188,188,202,230,209,209,195,199,194,253,225,122,169,210,200,197,199,200,216,210,188,202,211,191,204,194,194,181,204,201,187,190,217,190,175,179,186,207,194,215,215,195,205,198,201,200,188,169,227,193,197,184,193,206,181,200,205,192,196,201,211,209,206,208,189,190,197,196,189,189,216,194,212,195,192,183,192,200,207,187,190,207,219,211,199,169,187,204,173,190,197,172,202,207,206,200,232,186,207,206,206,185,221,182,203,215,209,195,189,195,190,187,172,208,205,219,217,200,199,192,179,183,210,193,190,176,176,205,195,199,205,199,181,185,200,190,203,211,204,180,187,202,187,205,188,204,165,203,193,199,206,208,197,200,197,197,161,212,216,177,183,206,189,217,196,206,179,196,186,179,207,194,205,188,200,212,183,207,195,222,203,216,187,183,211,170,202,193,183,189,176,177,192,186,199,190,173,195,188,161,175,197,158,208,198,190,183,180,187,193,164,186,200,195,193,189,174,182,197,207,198,180,185,203,212,218,174,227,177,205,223,198,166,191,195,205,188,180,186,170,173,199,179,188,168,180,195,211,181,179,204,170,192,225,213,193,215,190,204,185,199,192,184,202,196,207,235,237,242,160,170,200,192,227,166,149,130,115,113,114,146,146,97,101,62,109,194,230,148,121,213,201,209,207,209,180,195,220,210,110,89,34,29,110,212,220,121,55,102,163,151,89,151,159,153,158,108,234,125,111,191,128,97,104,146,234,193,105,117,81,89,60,96,103,117,141,94,67,48,27,18,22,57,81,101,126,140,111,108,120,117,119,100,124,94,85,97,110,92,103,96,97,85,107,101,93,90,64,4,75,185,173,133,122,94,65,41,36,41,48,71,79,51,22,53,130,159,168,107,33,44,3,26,14,9,28,19,19,20,28,60,51,38,22,16,39,13,33,24,16,1,41,36,32,27,56,69,63,97,46,86,57,43,29,32,35,33,33,68,68,39,39,54,52,61,62,40,60,23,38,45,37,44,35,16,38,28,35,48,33,25,77,53,25,29,93,110,145,140,101,116,87,41,73,32,33,47,24,47,81,64,62,95,76,63,72,82,75,101,122,111,113,104,116,126,92,130,101,135,111,107,101,138,153,141,147,150,156,158,169,97,87,113,64,60,51,70,63,70,57,46,36,21,18,22,17,5,18,101,112,120,93,102,152,165,187,181,168,59,79,104,46,55,49,60,84,42,77,139,157,196,185,167,150,87,121,156,146,149,41,5,40,22,11,38,45,94,87,65,58,27,47,40,42,51,34,61,64,60,45,67,53,66,118,116,137,133,135,142,139,131,114,118,112,89,97,89,105,109,78,87,78,100,75,91,65,75,58,53,70,60,69,72,61,88,146,145,162,171,178,159,49,48,21,33,16,22,30,24,28,37,32,80,98,51,67,50,27,33,43,58,41,48,40,46,46,30,40,55,43,48,22,65,45,49,30,24,52,97,141,155,152,75,53,13,20,89,107,51,17,22,71,66,63,139,140,127,113,131,111,125,120,118,102,119,111,132,150,147,177,171,202,223,184,247,226,187,204,120,0,2,22,12,0,15,32,4,28,17,38,11,8,0,19,3,4,0,10,6,38,39,12,0,204,194,203,180,213,230,203,194,196,185,209,219,195,210,195,218,243,216,123,204,192,195,204,181,191,210,186,211,189,184,199,209,203,191,179,212,201,198,199,186,167,215,198,211,215,188,205,177,192,192,191,204,173,191,217,235,201,182,199,195,211,179,191,217,193,212,197,219,180,217,169,226,183,191,201,166,173,172,204,206,190,215,181,202,218,197,203,192,198,219,202,190,194,201,189,201,186,208,178,197,204,213,197,209,200,202,182,213,192,198,198,186,180,200,179,208,197,190,187,189,198,184,192,197,200,176,190,169,206,201,172,216,194,187,187,193,221,194,187,208,208,200,195,203,181,233,193,202,190,194,174,177,182,196,202,210,197,190,168,171,179,190,193,213,183,209,199,192,193,170,187,173,199,204,172,194,195,190,190,177,209,205,180,180,217,185,190,188,195,199,204,190,224,213,200,176,189,179,200,201,175,171,199,184,205,187,172,167,195,203,180,193,225,164,191,185,186,186,189,201,205,194,209,203,208,185,211,196,195,161,190,187,186,183,205,203,207,219,196,174,182,197,187,191,202,189,202,186,205,213,198,178,210,207,187,178,200,182,179,196,207,194,187,207,192,196,235,196,208,196,206,187,199,223,235,200,109,182,207,171,218,162,122,93,74,102,182,137,97,58,93,44,76,115,134,129,147,211,247,224,224,224,219,247,225,154,149,163,80,103,93,188,221,90,61,26,73,87,75,110,212,238,243,161,125,111,89,112,68,89,68,66,160,114,83,85,93,70,67,100,121,131,120,89,46,42,43,36,55,79,93,176,135,136,107,103,109,107,95,104,108,96,108,80,95,113,78,65,81,88,102,127,106,73,22,6,78,109,129,117,59,37,50,47,56,62,64,36,62,51,56,102,146,138,124,58,32,23,34,21,0,10,21,26,6,34,30,59,38,21,57,30,42,33,34,41,34,13,21,23,45,34,45,50,38,84,77,83,86,59,22,33,19,50,27,50,60,60,39,52,39,59,33,69,77,116,86,82,18,12,33,73,62,43,65,36,73,10,31,37,15,32,88,113,123,64,82,55,73,67,28,23,12,38,67,55,76,79,59,74,58,89,63,75,32,14,65,90,55,9,54,50,76,62,29,28,74,99,42,32,44,55,61,58,38,32,58,37,64,135,109,107,110,122,123,90,53,13,13,19,30,37,37,26,85,143,113,83,127,178,165,125,86,102,134,143,86,72,73,94,87,104,117,151,130,196,237,211,167,100,85,74,70,82,122,130,51,19,12,3,8,23,30,34,92,75,73,70,49,40,54,62,41,48,77,51,48,92,122,126,93,140,126,154,156,154,165,149,134,120,132,152,156,173,121,125,162,164,138,130,153,103,137,146,135,140,172,79,148,175,159,156,168,213,158,167,160,187,108,70,49,55,41,58,49,55,59,88,89,161,129,137,128,126,107,123,124,111,122,123,122,89,104,84,78,133,84,99,90,102,110,60,49,52,116,169,194,146,159,119,72,67,95,136,132,110,106,127,155,157,160,153,140,131,138,144,131,119,114,115,121,124,109,129,120,123,136,124,159,168,191,210,234,235,231,100,12,3,26,6,2,3,22,0,12,8,17,17,0,1,15,5,4,2,29,24,24,25,14,18,200,190,189,183,213,210,215,217,186,202,179,210,179,209,184,197,240,212,159,200,196,205,170,231,200,207,200,176,184,225,196,208,180,183,185,211,200,203,188,198,212,186,202,211,194,189,188,164,197,207,187,189,181,215,179,195,184,190,203,180,200,218,204,200,185,180,195,193,167,182,197,189,216,214,179,175,203,190,201,211,180,212,204,196,193,217,201,204,194,189,200,188,181,194,192,204,192,179,209,198,194,207,193,192,216,219,166,202,213,204,213,195,191,180,194,209,202,189,199,167,191,220,216,188,205,164,188,187,189,206,196,189,176,194,183,202,212,194,203,185,188,189,214,181,209,184,203,176,161,172,198,179,204,177,188,193,188,178,208,165,199,183,216,184,180,174,206,186,212,229,193,196,200,189,190,218,210,195,192,185,176,201,213,212,193,185,188,173,182,194,195,165,201,182,190,208,186,205,178,173,182,182,198,189,170,191,198,190,171,171,159,188,194,197,203,221,197,215,183,170,210,191,201,181,182,184,192,198,173,201,179,214,191,180,192,211,203,200,184,212,198,186,217,189,214,185,185,198,211,178,202,167,184,201,180,202,217,179,178,189,201,170,206,185,198,185,190,186,190,201,195,162,132,193,164,109,137,241,243,193,217,183,113,98,68,120,197,173,77,42,64,55,28,24,55,78,136,203,206,193,159,218,214,206,188,104,169,217,126,94,85,138,131,86,68,36,31,13,6,82,215,244,237,144,72,81,202,157,72,60,7,49,83,93,115,112,37,69,142,149,136,82,58,62,26,31,15,36,84,88,137,133,126,95,136,115,122,101,126,116,118,86,113,114,94,77,88,80,84,127,137,132,73,32,35,22,82,101,124,92,82,106,66,29,25,51,39,38,46,27,41,103,114,94,107,56,37,62,51,23,23,34,6,8,19,23,25,34,47,20,31,30,8,26,16,17,25,47,28,14,38,16,25,42,49,47,84,100,86,90,61,62,22,27,54,16,38,47,64,56,46,43,35,67,122,149,114,72,33,67,118,112,111,77,42,69,53,81,47,50,24,42,88,73,104,100,76,80,46,62,50,17,7,43,54,97,112,141,60,62,117,104,112,139,60,38,91,122,96,13,29,103,109,105,10,46,127,149,57,79,110,89,129,96,44,14,54,45,106,145,114,159,141,125,165,110,20,34,35,9,43,4,22,57,87,145,86,102,156,93,62,43,74,24,26,110,80,74,103,124,169,174,209,154,144,200,117,42,15,3,60,53,97,46,86,156,70,18,21,37,26,19,43,43,103,116,138,90,119,93,116,111,59,26,19,72,31,27,23,32,75,72,61,58,99,45,43,73,50,45,37,65,60,51,57,91,75,75,76,54,74,60,67,56,62,64,88,87,123,124,117,120,143,154,174,159,152,128,87,92,75,79,50,55,56,52,74,112,126,106,166,117,130,146,136,142,173,120,138,143,150,147,157,172,174,171,189,173,156,176,145,86,43,113,139,155,154,128,145,157,159,174,180,187,167,155,160,185,193,163,153,138,126,142,137,138,127,125,132,111,147,130,120,133,139,118,112,118,128,136,145,178,217,228,216,101,19,10,26,10,0,3,0,4,16,50,15,5,16,13,2,7,6,17,7,15,27,11,16,6,214,172,188,190,210,191,198,196,204,193,174,179,183,189,204,211,253,205,191,201,192,219,202,191,184,186,212,201,184,189,201,196,203,215,200,193,207,195,201,191,203,211,220,179,202,190,180,199,174,198,185,205,193,195,199,184,194,178,194,201,178,198,205,214,194,234,196,195,205,202,194,192,192,186,192,211,184,193,208,179,233,190,193,179,189,186,206,224,181,203,166,191,167,193,198,207,181,197,207,187,205,178,194,225,195,187,183,197,190,194,187,205,195,192,219,183,188,200,184,189,174,181,225,198,168,176,196,208,191,185,205,191,172,202,188,186,206,198,201,211,178,206,200,202,184,165,207,206,207,180,210,196,205,202,206,187,208,223,189,187,180,165,175,168,185,193,183,180,192,206,201,196,206,189,170,210,201,180,214,179,188,176,185,180,178,185,175,198,199,170,181,188,196,182,181,176,193,180,185,184,198,159,185,189,211,178,187,188,204,193,198,203,207,207,187,211,167,223,189,188,211,197,186,174,202,181,196,202,207,198,187,169,190,189,210,168,191,194,188,208,214,190,199,194,188,180,186,181,181,217,184,179,194,180,192,210,204,213,177,185,208,186,207,210,196,206,200,178,214,188,191,58,53,95,21,26,68,147,160,120,133,131,123,103,86,111,145,146,98,89,91,109,67,68,59,18,62,84,172,179,168,173,131,150,133,75,119,122,53,66,50,85,123,100,88,25,58,33,26,98,182,244,247,183,115,151,248,157,96,51,0,57,110,101,76,62,79,141,108,119,57,55,18,23,17,12,42,75,108,112,157,103,116,89,90,148,80,105,145,122,107,102,86,102,85,83,99,107,132,112,124,96,23,3,19,12,118,85,71,83,90,133,82,33,54,43,54,55,39,46,48,89,93,62,56,50,38,42,72,34,20,15,19,13,15,29,44,29,26,12,39,24,4,38,36,16,58,11,34,32,30,38,32,56,69,87,103,76,132,90,88,51,63,56,43,28,47,59,46,55,47,40,62,77,98,104,108,33,21,117,106,128,103,85,35,36,47,42,38,25,26,18,44,92,93,73,45,35,64,18,51,33,15,1,54,100,95,128,45,67,125,109,117,152,87,2,95,143,108,49,37,106,100,125,57,62,113,165,61,128,173,134,156,95,48,35,34,14,119,142,127,113,94,128,129,81,12,25,6,19,19,47,14,6,68,155,98,152,158,68,55,50,107,48,26,94,75,67,54,111,172,191,163,79,104,185,63,13,76,58,92,86,100,46,52,145,91,37,40,8,1,19,12,34,127,150,141,132,144,129,126,98,29,9,57,42,82,27,70,68,58,78,100,83,64,60,26,82,98,70,20,45,62,62,94,25,45,69,81,47,67,73,70,72,48,54,85,38,29,32,24,84,97,132,89,118,141,93,86,54,42,36,41,45,45,48,14,66,39,63,53,52,49,78,55,62,70,67,66,103,97,63,66,108,102,93,82,119,106,95,91,52,113,161,142,121,132,124,127,150,198,167,190,180,179,166,128,145,131,101,126,136,121,147,130,126,120,106,106,119,119,126,124,126,133,143,130,127,117,108,110,132,179,218,213,123,12,0,17,27,4,13,19,22,21,16,15,4,12,2,16,4,24,6,15,23,11,12,13,8,180,180,201,189,198,200,212,202,201,185,175,189,208,203,161,179,227,235,208,206,184,192,200,200,198,178,218,201,191,187,172,202,168,181,195,204,195,185,195,215,204,182,195,191,163,211,200,199,168,212,197,195,185,189,204,189,170,188,187,181,197,179,203,222,173,204,196,205,195,184,182,196,177,161,210,199,195,191,201,214,181,190,188,215,201,189,211,197,202,221,205,192,213,183,205,207,186,203,176,208,203,185,193,192,196,217,193,188,178,196,206,209,181,189,181,193,198,211,197,198,192,199,201,190,204,182,176,208,179,185,189,184,172,176,200,174,187,173,173,182,207,192,198,197,203,187,199,219,186,194,194,196,189,196,196,174,199,214,203,177,202,186,174,173,162,171,215,195,179,231,207,195,208,166,198,192,181,168,183,197,187,193,184,180,174,194,190,180,204,195,186,197,184,200,202,199,181,178,180,202,184,195,196,190,172,181,183,211,197,189,170,188,165,212,194,193,182,210,182,192,188,179,219,197,183,185,194,199,214,177,172,176,185,193,190,206,195,200,202,187,201,185,191,183,202,192,192,205,206,195,190,201,189,187,200,176,216,206,197,194,182,187,180,186,211,179,198,215,180,210,191,83,32,86,60,4,5,9,16,42,86,68,104,80,80,94,76,121,116,89,123,92,70,71,35,0,11,73,107,74,97,121,132,158,146,94,90,84,33,33,44,64,118,117,54,19,59,10,91,212,181,137,138,81,86,164,230,71,60,35,47,132,108,53,67,102,129,137,116,77,51,74,62,28,38,53,120,126,104,107,100,122,115,111,86,115,93,118,88,107,121,122,73,89,78,93,107,92,140,109,62,34,11,45,45,93,143,102,81,101,122,103,82,44,49,38,41,64,51,64,38,50,55,66,79,52,41,26,13,49,11,7,13,31,13,33,37,10,33,25,13,24,37,48,12,45,20,29,27,2,3,17,25,30,64,102,85,113,113,79,69,87,68,54,32,28,52,52,44,40,15,60,5,59,102,118,92,30,43,129,144,141,147,132,60,35,48,23,62,24,29,31,120,36,59,78,48,51,38,49,74,27,55,33,39,101,108,90,45,34,118,84,135,133,38,2,88,147,93,41,44,122,126,165,114,119,111,101,5,61,81,84,119,33,37,49,18,29,174,156,82,119,65,118,111,72,9,15,21,15,35,34,36,30,39,137,46,146,148,67,55,63,159,81,23,121,110,73,31,48,62,56,10,63,163,187,24,51,116,42,120,104,107,74,51,130,144,65,19,30,36,24,25,7,106,160,130,117,109,136,132,81,33,61,59,22,71,71,73,84,81,130,140,123,118,39,69,154,131,81,36,34,84,107,60,42,51,126,128,73,64,107,124,73,90,112,113,55,18,41,48,68,101,89,135,93,106,66,84,73,44,44,20,34,35,65,70,71,102,75,46,66,76,60,76,52,78,78,84,74,43,60,37,48,59,43,24,34,42,47,38,100,149,155,137,156,118,144,162,174,188,160,180,178,146,136,154,120,125,130,119,131,125,144,117,119,109,121,127,125,128,133,113,105,140,119,119,123,131,134,121,130,151,172,192,121,7,33,8,1,3,13,15,11,24,20,6,0,26,2,4,5,23,8,17,7,30,27,33,30,209,175,210,180,196,199,210,181,181,206,193,201,224,207,184,176,251,206,192,219,194,218,201,203,202,187,168,164,178,196,203,191,202,211,191,195,198,199,206,211,192,210,189,186,180,182,212,184,209,192,180,213,184,215,198,192,218,185,210,214,180,188,179,203,212,179,193,194,197,179,193,185,188,220,202,192,194,176,201,184,194,187,204,177,207,197,208,202,204,201,214,194,186,196,186,193,194,193,207,204,205,189,190,196,193,221,172,206,193,214,203,183,175,197,205,200,199,208,194,199,202,191,178,201,172,174,189,181,196,180,199,203,204,203,203,187,187,203,189,196,191,203,196,193,195,179,187,199,188,207,181,208,185,200,186,180,178,226,176,179,176,194,185,197,197,224,205,181,182,187,165,197,177,219,174,190,208,203,180,178,191,191,182,188,174,210,204,185,170,186,193,219,200,177,186,193,170,211,187,202,168,177,185,186,188,185,196,187,186,184,201,213,207,186,192,184,200,191,213,204,201,200,212,191,180,205,190,189,190,179,189,189,197,194,185,189,177,178,214,194,201,182,207,190,188,213,196,155,190,193,187,197,192,205,166,188,185,206,194,212,207,190,195,199,222,204,200,199,200,216,232,137,59,119,118,60,27,26,33,17,100,70,84,58,87,112,65,82,157,153,104,53,50,71,53,26,65,66,94,136,119,150,154,147,110,165,147,124,79,89,92,80,139,103,43,56,35,13,72,118,68,109,82,50,30,161,211,32,42,16,31,101,72,117,127,128,133,83,46,79,62,163,83,50,77,121,148,133,101,112,100,117,103,116,75,101,92,113,107,138,121,80,67,81,111,92,113,62,129,54,24,13,16,62,112,112,166,173,113,89,61,123,94,45,14,7,42,46,24,35,36,46,73,44,57,49,45,43,31,51,12,45,28,13,32,16,23,34,10,26,27,26,7,19,40,46,40,32,10,22,38,45,36,38,42,61,90,82,88,71,40,57,58,54,35,20,19,46,24,38,25,51,46,63,105,116,81,36,17,117,97,88,110,136,54,26,47,39,50,47,52,111,145,157,182,140,122,132,138,145,150,115,34,26,21,93,75,44,9,68,132,105,141,93,57,8,95,147,97,29,45,115,143,121,119,131,131,143,15,25,60,85,137,67,7,36,13,42,169,168,91,82,84,87,93,49,14,15,2,26,41,21,12,21,75,112,107,130,107,43,56,156,232,92,2,95,155,75,45,34,86,46,67,192,242,154,0,62,114,41,142,120,98,80,130,184,106,46,34,34,7,39,8,25,126,152,116,76,86,117,101,72,48,39,48,59,74,74,109,48,72,91,88,91,99,47,66,145,97,47,32,24,97,103,80,28,23,116,126,40,63,116,124,123,170,156,94,54,15,42,74,82,59,87,67,40,52,18,44,59,14,31,14,47,31,53,27,65,96,130,111,118,132,59,50,82,105,116,117,48,71,37,51,72,48,30,29,39,15,32,108,131,168,148,118,125,138,159,156,153,156,160,150,151,138,142,152,143,166,160,136,153,152,124,143,124,136,152,147,146,133,127,124,113,136,102,126,125,140,124,130,127,135,151,149,105,7,2,16,22,36,14,15,20,21,18,0,39,3,4,2,29,7,3,17,21,0,17,6,20,195,176,170,184,210,213,179,216,212,189,202,177,206,193,188,151,207,160,182,206,203,201,172,209,206,194,175,172,198,199,192,169,182,182,192,189,190,185,199,226,199,183,191,177,190,185,197,193,207,193,229,200,204,187,185,204,193,204,208,205,216,212,203,210,176,180,196,196,193,208,192,197,211,188,184,191,212,205,206,191,193,202,208,195,169,184,187,197,187,199,176,200,201,188,208,177,182,178,191,196,178,180,181,205,188,189,189,174,191,178,206,188,176,188,204,206,193,202,201,180,187,191,195,190,208,209,218,203,183,192,180,209,210,204,189,188,178,194,202,185,185,173,194,187,192,188,177,178,181,193,179,187,192,191,188,199,192,194,212,145,206,204,177,197,202,221,207,175,189,193,177,185,188,186,198,203,197,203,190,155,193,192,169,179,208,173,206,184,204,181,186,202,208,178,198,188,191,168,192,208,209,185,188,198,188,199,180,211,180,201,193,196,195,196,203,198,196,194,194,184,200,161,203,215,205,210,190,178,160,189,179,190,184,185,190,205,184,210,196,191,185,191,185,207,182,196,192,168,189,197,199,204,187,206,173,169,184,188,203,194,181,198,195,205,184,192,205,195,183,223,252,157,149,205,215,193,162,105,64,34,85,69,95,59,81,154,112,103,113,139,109,28,40,29,21,19,17,50,109,106,97,101,96,86,162,224,196,175,134,148,167,134,167,88,106,214,81,37,79,46,68,150,132,24,69,214,168,4,14,6,102,134,107,128,132,105,96,99,67,132,153,166,84,54,80,98,133,109,97,102,121,102,98,80,69,76,80,111,121,113,84,90,106,85,95,77,102,122,40,8,23,34,71,131,130,129,140,139,116,80,112,105,44,53,14,17,19,32,47,29,47,36,45,33,35,56,61,43,35,69,39,39,17,20,9,8,17,16,22,30,32,28,30,29,38,21,39,29,18,45,44,19,31,51,39,55,54,55,60,51,57,53,28,23,27,51,39,41,28,47,38,30,50,65,95,98,107,34,19,31,59,30,84,49,38,54,39,36,46,67,65,118,232,175,188,224,182,227,223,226,204,150,43,15,16,80,99,40,21,58,111,100,105,119,85,20,79,132,87,16,57,99,147,68,91,140,160,147,15,57,103,93,136,62,10,34,36,65,189,149,92,84,75,88,74,94,4,15,21,30,11,24,30,13,46,90,69,118,86,85,116,230,209,42,13,103,160,153,111,99,139,143,218,244,187,112,7,52,112,113,111,63,54,134,182,201,98,16,36,29,5,33,26,62,155,145,88,90,79,52,82,63,31,14,42,63,80,94,92,69,90,62,31,91,104,51,57,140,62,52,10,24,124,90,69,32,37,102,80,31,75,117,65,105,102,117,57,32,8,17,42,80,103,55,73,72,54,37,44,35,39,25,26,19,22,40,44,78,37,119,113,93,66,24,17,89,108,112,85,33,36,40,55,53,47,9,35,52,68,139,167,158,174,156,137,151,139,124,142,148,129,130,149,144,165,134,165,133,151,142,140,155,149,150,156,144,149,158,175,158,123,130,128,122,139,101,122,130,148,138,116,113,118,122,149,111,37,4,18,16,12,40,11,30,12,13,19,17,22,3,27,14,10,12,29,8,28,4,11,15,187,211,179,213,200,214,200,200,214,208,207,204,199,226,240,170,163,150,152,185,167,194,209,206,197,213,204,191,211,203,178,198,220,208,206,196,199,211,188,197,194,194,200,201,198,191,216,197,192,190,216,205,187,195,198,197,192,168,207,186,196,178,205,186,206,208,194,173,179,184,172,198,205,202,230,191,186,208,202,189,217,189,202,206,209,196,194,196,204,177,207,186,176,210,185,164,178,201,180,171,174,169,193,171,200,221,201,200,207,187,206,200,207,174,178,210,186,214,193,183,200,197,197,206,196,200,186,188,194,174,216,186,168,178,207,178,194,217,213,196,186,194,190,181,209,203,196,191,200,187,213,203,203,201,183,189,189,190,174,195,198,180,200,184,172,187,180,217,191,187,177,177,179,193,206,194,194,183,198,181,203,205,186,193,204,188,178,206,214,195,203,187,188,213,215,216,195,181,197,198,174,195,217,187,191,201,160,187,188,186,203,192,192,216,207,201,183,202,220,178,174,193,178,191,166,173,167,200,205,193,197,205,206,184,179,199,199,176,210,197,209,208,184,199,205,163,177,208,168,190,199,192,192,179,192,183,192,192,189,194,200,191,168,193,161,188,203,180,192,253,235,149,171,218,222,243,251,250,199,147,107,77,103,40,57,133,124,77,48,98,114,72,51,66,27,35,18,23,104,25,22,11,1,66,131,186,129,113,96,106,106,67,74,16,106,180,97,175,185,53,125,209,141,64,144,219,106,27,17,119,217,172,156,148,127,84,84,43,93,124,173,154,68,58,82,78,134,87,99,93,113,99,90,107,57,82,106,95,113,61,85,94,110,82,103,117,66,38,30,14,20,72,118,120,131,134,129,108,80,92,110,72,45,62,34,30,34,42,52,39,23,3,13,35,35,59,39,47,46,85,78,59,63,28,51,28,13,13,12,3,27,31,43,33,16,16,30,24,28,34,33,14,31,72,83,90,80,102,80,106,117,97,81,60,65,88,128,53,76,56,78,87,62,104,80,104,90,58,51,34,76,95,87,58,73,40,29,84,66,63,92,150,165,110,111,65,98,128,93,120,106,81,75,55,50,69,120,57,24,114,121,81,108,99,73,85,101,131,106,25,81,116,115,49,24,109,133,148,58,22,102,99,84,20,24,54,54,119,186,62,57,55,52,95,66,68,42,39,36,19,28,25,15,19,12,47,45,77,94,116,107,176,103,63,20,16,78,166,154,191,127,199,171,168,126,46,0,31,44,87,104,48,101,109,169,126,40,24,8,23,54,26,47,90,165,118,51,56,34,52,57,57,44,27,52,47,117,94,127,101,93,23,13,91,117,11,71,162,68,67,60,78,131,95,65,34,60,130,96,9,99,120,41,31,49,82,65,28,26,22,73,127,128,123,126,105,64,62,92,89,91,56,29,37,49,82,85,84,67,104,80,53,63,33,56,112,101,107,34,28,39,45,80,73,50,25,7,71,157,167,148,101,75,135,139,159,115,168,131,136,143,137,145,180,155,210,173,142,138,138,113,162,136,156,170,141,158,153,152,157,127,135,152,159,129,110,123,128,142,126,129,97,107,134,128,99,16,3,4,9,28,16,18,12,15,4,6,16,2,2,16,22,7,39,10,25,26,8,15,8,183,211,221,223,180,201,205,167,222,193,197,182,222,220,211,149,136,121,143,204,187,230,186,184,194,185,192,191,171,202,204,185,180,191,194,205,171,208,185,196,201,203,176,204,199,197,210,216,201,195,185,214,185,184,193,226,191,195,204,217,177,196,201,214,196,197,194,181,202,190,204,203,199,192,175,207,201,207,214,209,207,188,186,192,185,192,200,179,203,187,181,185,184,191,189,203,193,190,172,204,216,191,205,189,201,201,207,190,196,189,189,186,208,181,190,194,184,178,189,203,193,178,201,186,191,178,219,190,183,186,188,190,208,197,176,194,167,202,184,165,202,200,194,198,187,199,192,210,210,202,207,184,204,181,189,181,186,179,202,219,184,172,175,171,196,191,191,195,198,193,186,159,178,218,175,186,183,205,200,215,195,188,219,181,204,201,190,194,192,199,189,225,174,173,204,208,205,181,190,206,174,165,191,191,200,192,186,191,190,216,207,187,202,208,196,187,186,182,212,199,183,213,170,187,196,202,197,207,193,185,170,192,191,192,176,220,208,213,176,205,178,173,194,186,174,186,177,193,186,200,196,177,198,188,202,201,179,156,194,194,209,202,214,180,199,184,202,215,214,249,217,124,157,182,227,232,243,251,240,117,94,67,81,94,95,115,142,128,59,39,104,130,153,105,34,56,74,113,145,107,101,85,75,103,149,116,105,49,44,93,83,60,90,67,64,110,102,207,117,52,146,171,62,78,137,195,131,66,96,203,201,142,123,84,87,68,18,4,49,51,135,171,91,70,90,82,99,103,91,101,94,118,104,89,90,101,100,83,108,85,109,126,104,104,101,79,28,22,9,31,71,123,121,107,95,118,124,98,130,85,58,19,10,107,95,46,26,9,19,26,39,22,36,40,41,40,27,36,63,77,58,68,40,38,45,51,4,10,7,15,5,56,18,23,20,30,40,55,14,14,43,30,54,116,175,173,173,149,136,165,180,184,150,168,173,202,203,162,166,180,163,170,180,163,146,153,147,140,181,159,172,160,194,196,129,164,167,144,174,165,193,184,143,86,64,67,82,95,91,89,93,131,130,141,172,183,171,158,182,158,149,145,160,162,161,162,169,150,132,141,158,113,135,128,90,115,106,156,114,132,153,123,121,102,128,113,128,196,166,133,130,111,105,108,109,130,62,65,81,64,80,67,86,60,64,70,52,108,137,120,113,69,42,43,66,41,47,97,130,142,140,131,107,114,45,12,53,26,39,43,97,86,84,109,77,53,51,28,35,47,46,49,50,143,176,60,77,43,61,72,75,87,44,29,19,95,70,41,126,96,28,23,54,125,112,66,121,116,105,105,91,91,129,97,112,73,84,124,92,27,82,106,43,39,57,98,62,56,20,34,122,155,181,134,174,158,159,152,206,184,161,88,45,32,52,99,140,94,103,106,75,125,140,101,71,95,96,85,14,35,33,42,99,51,51,61,35,128,182,169,79,38,62,129,149,174,149,138,135,150,113,115,135,144,133,130,154,141,139,123,142,148,130,147,150,140,136,140,141,129,126,139,153,123,142,157,154,141,146,125,120,114,117,151,164,128,21,22,0,4,6,4,16,42,25,7,25,22,6,5,14,18,3,5,6,11,8,14,16,23,209,193,180,191,186,195,193,193,204,206,200,191,183,201,214,175,194,189,171,207,199,171,180,188,196,206,203,217,198,188,223,211,189,195,197,188,190,206,186,193,193,181,191,194,214,217,197,180,184,197,202,196,200,182,204,200,190,194,182,197,221,176,218,176,194,186,188,214,180,183,195,184,192,205,178,185,202,192,186,190,186,205,194,182,182,214,160,183,216,186,175,194,190,200,207,189,202,199,194,181,177,155,203,208,187,181,188,176,164,208,201,224,192,190,180,164,192,193,172,197,193,206,192,171,172,203,174,206,185,188,194,205,210,178,189,197,196,190,188,192,208,168,199,188,186,213,203,174,200,201,180,217,197,179,193,168,205,187,193,171,195,194,179,190,196,172,188,183,189,182,178,189,191,184,218,198,178,198,183,221,192,224,230,219,184,193,215,213,204,228,234,221,146,123,158,196,215,181,217,172,170,200,195,185,184,205,176,206,190,169,190,192,177,190,190,222,164,212,181,194,190,172,183,171,189,181,186,196,194,210,206,155,164,189,196,168,217,184,193,205,174,193,192,174,197,182,202,201,188,205,200,165,203,199,220,198,205,183,208,189,211,208,217,197,213,210,194,199,218,240,184,135,150,180,221,217,251,200,162,112,99,49,74,92,99,121,140,147,121,103,96,164,173,151,51,35,130,163,209,224,219,218,185,155,141,114,105,65,120,106,105,177,147,153,140,79,30,79,49,70,187,100,43,93,188,147,59,75,191,238,176,123,117,72,60,27,13,48,61,64,125,145,37,65,75,97,76,77,92,115,115,113,105,109,79,88,98,82,111,91,120,108,133,113,81,37,14,20,73,77,112,137,151,112,112,112,106,64,85,44,69,10,103,200,130,36,2,19,1,43,9,45,32,14,62,54,33,42,61,75,84,64,61,54,50,31,38,37,37,41,11,30,27,17,13,23,30,48,10,18,48,29,59,126,155,98,74,56,87,89,83,93,82,67,66,69,120,94,98,139,100,89,108,113,62,84,93,119,116,136,117,144,136,103,130,132,113,122,149,152,128,108,83,103,104,109,107,117,96,107,145,162,169,159,162,151,122,168,168,177,136,167,123,132,128,135,131,143,107,150,157,145,178,154,131,152,110,139,143,174,149,167,139,174,200,196,191,172,167,125,146,188,168,175,142,177,178,172,174,168,172,188,178,172,199,162,162,125,144,154,140,142,142,152,175,126,152,158,132,138,151,142,135,116,122,141,124,134,121,133,131,132,129,157,83,79,115,141,129,142,154,149,170,231,182,117,110,102,148,148,150,149,103,81,123,106,57,112,89,86,95,55,113,130,97,95,118,129,101,144,90,111,104,93,101,83,100,130,86,54,107,77,17,32,78,87,62,54,22,62,146,173,140,101,114,100,121,130,124,120,158,78,19,28,48,52,106,110,88,80,54,121,128,133,75,87,81,49,11,57,24,68,80,44,76,42,13,46,145,142,101,91,109,157,138,148,167,126,154,129,127,135,138,114,131,118,100,152,139,160,138,120,142,125,132,106,115,148,150,127,134,147,136,149,132,153,166,145,131,142,146,156,144,147,136,106,18,4,10,4,35,9,3,9,10,16,13,43,8,0,30,6,16,25,2,15,19,4,34,23,202,200,187,215,198,198,187,189,202,193,209,190,187,182,199,212,243,221,200,190,208,205,204,195,205,189,185,191,212,206,210,200,190,190,213,202,184,203,193,206,175,204,190,183,194,187,175,202,203,201,204,192,187,193,204,210,193,175,222,210,178,226,176,209,219,193,220,179,217,207,205,209,200,188,193,183,202,182,182,207,184,175,182,201,188,189,190,187,171,195,198,191,203,196,166,193,188,174,188,203,175,183,189,176,181,193,194,169,196,188,192,200,209,204,225,208,171,196,183,178,179,183,176,184,185,207,176,182,170,197,177,210,213,169,175,184,188,196,188,196,203,205,189,180,191,216,213,218,215,199,177,176,205,162,186,201,192,190,197,171,177,196,208,177,209,184,200,191,218,181,200,187,190,185,206,226,192,208,218,187,194,200,197,211,208,216,228,199,207,208,214,198,127,111,162,195,202,190,190,183,184,192,201,205,185,189,187,173,199,192,190,190,187,175,203,200,183,181,183,183,190,204,181,188,198,185,213,198,220,182,191,198,219,184,200,204,184,160,194,192,194,182,190,187,195,210,186,180,180,180,213,202,216,217,198,176,203,193,203,179,189,217,195,201,190,214,197,174,241,229,134,117,129,140,183,199,199,187,111,77,71,65,80,113,124,98,108,103,137,112,156,161,139,149,69,10,8,12,68,142,177,190,170,174,151,150,121,74,159,147,134,195,151,243,160,66,105,101,48,100,217,84,20,148,176,138,51,145,219,189,110,61,76,37,14,42,19,94,76,70,160,91,56,56,58,109,128,80,120,109,97,112,87,101,82,77,73,54,74,103,140,80,148,91,62,18,21,62,102,110,127,126,139,116,109,97,62,49,82,68,99,150,216,203,80,28,49,9,23,19,25,19,24,30,51,69,34,65,96,102,83,76,42,43,29,48,24,23,10,54,33,3,57,8,59,40,21,16,27,9,14,36,52,112,111,112,91,46,87,55,76,61,86,59,63,84,55,53,81,68,65,61,59,64,39,63,84,59,56,49,53,67,55,75,74,79,70,88,31,62,37,75,43,55,63,43,80,78,63,63,36,61,72,60,58,69,56,43,36,57,59,63,51,39,49,21,73,17,55,52,55,63,37,46,55,69,80,57,39,84,49,78,63,82,66,47,67,68,100,76,59,120,96,94,80,119,92,109,78,99,87,116,118,103,118,107,121,97,92,98,119,119,100,152,155,133,133,163,112,117,107,102,104,113,115,137,149,144,156,133,135,108,98,130,113,105,176,162,159,182,181,162,161,154,168,137,142,151,150,175,172,151,171,182,173,176,149,126,147,113,145,166,167,172,118,167,154,132,158,119,128,127,142,121,153,103,142,168,113,128,177,162,143,138,195,180,118,150,138,161,197,163,123,74,85,65,73,111,77,80,106,102,65,44,58,105,122,143,72,57,78,105,129,96,97,80,27,51,80,62,75,82,84,77,64,58,55,23,70,134,124,128,141,134,128,159,141,148,139,147,128,132,141,148,128,116,124,121,147,132,131,111,134,126,95,138,146,157,157,150,148,154,145,166,136,138,136,128,123,147,188,199,159,143,155,112,30,11,17,6,8,7,35,11,6,8,4,7,3,5,14,11,8,5,5,20,19,28,7,7,195,184,198,203,218,182,191,186,189,212,174,181,190,188,213,207,245,231,206,203,197,212,197,205,202,191,206,205,185,216,209,211,203,214,215,210,200,212,221,210,206,195,208,194,183,191,180,187,203,187,201,167,200,183,197,204,208,187,209,171,217,194,169,208,202,208,202,196,161,189,192,188,204,171,197,202,199,196,184,172,194,207,195,202,207,199,198,201,210,200,194,203,182,179,206,196,195,191,218,210,201,159,218,155,211,186,217,203,200,184,199,209,195,201,185,217,198,181,191,198,200,197,191,180,185,185,181,189,203,180,178,178,194,197,181,214,174,194,198,213,203,184,235,224,214,193,171,203,189,172,193,183,181,204,194,180,188,186,166,186,233,212,202,188,208,209,199,196,221,199,193,222,231,214,196,184,226,216,213,215,184,127,116,161,175,190,195,167,129,105,142,144,112,94,143,183,228,211,204,174,209,217,197,208,182,186,180,201,196,195,188,174,198,200,187,181,204,181,200,189,194,193,199,205,192,187,209,206,182,200,181,182,221,191,187,191,180,182,204,219,203,177,202,190,202,203,210,220,209,217,189,167,188,201,167,183,153,140,156,152,165,152,140,127,139,130,136,197,237,162,131,166,99,107,90,142,141,165,137,59,87,55,74,113,131,131,130,142,84,106,182,132,143,157,106,21,20,48,43,4,2,59,100,107,109,117,103,107,223,155,92,93,98,158,74,114,229,155,90,152,186,94,94,159,190,184,152,153,146,90,85,61,50,15,18,23,73,119,87,87,120,56,54,85,64,111,140,112,101,82,85,94,94,65,95,60,59,113,100,119,135,94,41,24,3,46,30,84,126,134,141,143,121,103,109,60,100,78,126,128,123,155,172,131,53,45,44,14,30,14,18,38,27,24,20,75,69,113,97,98,92,44,21,27,23,18,21,26,25,17,23,22,43,29,0,34,38,32,31,40,40,33,25,117,89,190,168,176,160,159,141,127,124,154,149,150,123,128,146,100,135,116,126,88,92,121,109,111,121,132,128,116,137,135,76,155,136,155,96,150,150,123,135,129,144,149,150,170,131,107,139,118,133,90,92,125,90,95,94,107,91,118,106,98,69,127,83,98,106,77,103,76,98,81,72,80,92,104,75,78,77,74,74,51,113,85,65,82,94,55,43,36,57,55,63,52,73,43,57,63,74,54,72,78,86,70,73,60,54,57,42,42,89,47,76,47,60,48,47,52,42,61,47,44,63,68,95,69,81,54,62,33,50,59,65,56,57,43,27,81,64,53,34,53,23,40,58,64,42,69,75,65,78,111,97,115,113,118,78,96,98,102,77,102,118,103,114,117,80,109,104,103,110,113,110,114,125,126,92,105,137,139,138,172,160,131,134,161,162,167,137,132,118,77,88,77,92,89,113,98,102,98,147,137,106,131,127,123,128,99,130,138,115,113,106,116,108,139,127,131,114,153,103,136,169,113,141,107,115,136,154,124,124,148,145,157,176,189,168,157,126,124,129,109,137,116,117,106,113,119,132,140,117,127,125,117,129,126,142,137,150,149,151,146,145,145,135,156,144,159,188,195,175,157,168,125,0,23,7,42,4,32,29,3,13,10,26,13,8,0,3,25,44,23,2,30,19,14,8,4,189,195,206,184,193,208,198,223,179,188,193,196,181,193,185,231,252,221,193,219,203,224,209,229,219,231,233,233,221,236,211,212,227,193,186,212,206,179,189,185,186,195,200,189,220,201,204,209,216,186,194,216,215,221,183,188,176,182,197,176,199,198,209,195,197,209,216,211,204,199,191,223,183,176,194,200,180,179,198,196,192,210,190,194,192,210,185,203,212,208,195,193,203,212,175,193,187,196,219,173,198,195,176,198,155,205,190,195,200,185,179,182,184,162,166,143,199,195,193,204,214,201,205,208,205,221,195,191,192,195,209,214,212,182,231,192,182,228,214,195,202,195,171,145,147,168,159,172,189,187,203,219,190,174,213,183,217,229,217,213,223,192,220,198,226,232,227,223,227,224,225,197,231,222,196,156,163,182,206,211,158,122,84,81,134,124,113,93,81,66,61,90,94,133,123,175,192,208,239,202,250,216,215,215,198,201,186,187,211,197,182,187,192,211,207,191,203,208,181,219,202,212,182,201,198,211,207,190,180,194,204,193,170,185,200,177,199,187,177,198,192,203,217,181,207,180,184,201,198,200,130,102,79,161,152,139,129,109,123,113,104,108,63,85,54,88,88,176,209,89,127,139,93,108,105,125,131,116,165,104,100,50,74,111,146,158,121,157,99,46,116,142,126,130,118,59,12,31,37,25,34,14,32,4,36,52,125,129,218,169,80,55,62,43,25,183,244,101,133,164,125,153,169,201,151,194,202,140,71,57,56,38,8,15,65,96,140,137,66,83,140,39,54,41,102,128,55,77,128,70,100,78,58,61,73,86,75,103,104,135,89,39,43,18,23,55,89,103,98,127,104,107,119,97,54,119,106,119,118,133,113,96,96,79,42,30,63,40,55,14,21,18,28,26,33,47,67,89,58,54,55,31,31,38,33,27,21,36,41,25,4,51,35,23,17,20,26,22,52,37,33,85,35,57,74,85,53,48,60,68,83,87,66,77,78,45,49,82,88,76,64,80,84,72,77,65,79,84,130,91,96,107,102,133,109,93,103,141,117,117,134,137,108,101,122,135,141,99,102,86,114,112,120,134,131,97,121,128,114,121,93,122,135,156,120,109,114,141,148,126,136,131,90,124,130,127,119,121,126,130,129,177,140,137,109,116,140,110,133,122,111,122,108,104,98,109,122,130,122,125,105,138,137,135,115,127,112,143,147,126,111,101,110,91,103,115,100,72,115,108,110,115,106,106,90,126,99,133,117,87,102,114,115,94,92,114,76,75,59,92,66,66,83,91,47,92,83,56,58,66,65,64,70,66,53,53,81,71,71,64,55,74,78,84,68,49,67,73,52,70,47,57,58,52,56,69,70,56,43,52,59,61,69,80,38,49,54,72,58,54,69,63,69,83,78,50,86,36,52,76,83,83,70,84,65,87,76,84,68,92,85,92,74,92,90,84,99,121,128,117,103,119,124,109,142,133,132,140,134,108,99,107,103,97,113,104,118,141,136,129,102,109,98,100,113,107,102,108,124,104,162,123,127,142,123,139,121,128,137,134,130,138,145,142,164,165,170,168,158,139,163,177,173,151,142,106,14,4,27,25,3,22,10,1,14,24,17,20,0,2,16,20,26,8,16,5,6,12,11,3,186,192,181,210,208,174,204,183,212,189,189,204,213,183,175,215,255,223,153,189,183,181,237,219,228,237,225,231,237,229,227,221,231,199,174,180,183,223,191,197,201,195,213,218,193,218,202,220,162,186,191,203,191,199,193,195,210,180,216,199,192,182,201,206,197,189,195,183,178,198,212,208,201,203,212,208,187,169,190,200,190,174,204,208,207,195,211,219,194,201,198,214,182,201,178,198,162,193,201,209,186,191,159,209,211,189,205,188,182,205,183,137,91,104,87,128,183,208,217,218,213,241,239,213,198,182,208,242,206,206,208,205,190,212,200,203,230,220,205,185,220,203,127,152,112,127,98,149,210,214,211,179,193,191,234,190,170,199,149,166,173,184,172,185,208,194,176,155,173,179,166,169,134,171,144,76,97,114,128,104,124,122,104,91,101,86,92,114,98,97,95,111,144,101,98,141,92,180,184,193,183,186,193,180,193,223,191,204,195,207,191,179,193,174,218,199,200,193,198,191,217,227,203,204,216,183,190,218,213,209,178,201,194,187,206,228,199,193,218,179,189,194,180,197,198,143,137,149,126,133,129,96,85,129,146,132,142,120,106,97,116,89,76,60,102,62,76,194,201,90,89,100,113,112,77,163,153,119,168,152,106,55,85,114,122,146,117,91,95,57,106,113,74,105,142,111,112,63,111,48,67,30,35,19,37,75,66,103,161,194,125,102,164,73,50,161,166,60,110,130,136,160,187,187,164,170,202,138,97,57,49,35,63,83,70,97,129,128,45,76,100,25,64,91,100,104,57,60,86,66,25,68,51,33,57,65,79,103,99,122,57,4,27,41,80,109,123,122,114,116,114,96,97,103,102,132,119,94,114,98,116,88,53,52,69,56,31,49,14,20,14,20,0,36,44,24,21,53,38,41,29,33,30,20,38,20,65,31,14,13,12,49,33,46,43,33,32,34,14,52,52,29,19,51,19,50,53,47,34,61,55,45,24,29,41,23,22,40,40,31,41,25,62,48,37,40,51,45,67,44,44,34,60,43,59,27,15,33,63,67,29,35,35,18,16,29,64,41,55,63,45,58,69,44,29,40,41,27,29,58,72,28,66,56,53,46,33,51,81,37,46,33,47,55,33,42,39,43,58,55,40,37,52,42,60,43,77,54,49,57,36,71,59,65,53,73,54,62,103,67,56,81,98,76,83,100,83,88,72,96,106,89,90,80,96,62,102,80,119,108,108,111,108,129,117,117,111,118,117,127,128,135,151,114,78,137,136,114,133,111,119,132,138,139,133,113,117,103,119,86,136,127,111,90,114,115,135,119,129,130,123,127,117,110,125,103,111,112,67,123,122,122,119,131,90,108,104,116,55,77,113,113,98,114,95,128,118,90,75,114,80,113,88,92,94,102,78,112,102,88,90,67,74,53,110,57,69,56,66,75,60,76,74,96,73,100,107,88,72,88,80,84,78,68,121,82,75,74,42,34,75,85,52,53,47,48,58,59,71,64,68,114,121,105,97,88,76,84,85,98,92,119,140,149,155,143,139,135,119,111,148,136,120,135,132,150,177,171,155,158,171,143,173,153,113,117,26,17,13,2,27,9,7,14,13,5,9,7,0,0,11,7,13,21,10,29,0,25,4,5,172,192,197,193,204,161,207,186,218,191,214,196,219,204,197,244,251,223,160,155,139,95,112,131,119,127,144,124,129,131,153,192,182,211,202,206,201,178,183,211,206,184,205,176,210,207,184,178,195,191,176,183,201,203,204,192,171,210,212,182,185,184,204,189,201,206,184,191,188,211,195,191,186,186,193,180,186,184,194,177,195,184,211,182,177,206,196,198,200,178,207,190,204,190,189,192,198,179,212,197,199,197,209,185,198,196,209,186,206,232,203,121,96,100,93,109,149,197,200,201,221,192,160,166,190,195,193,199,153,151,232,218,169,131,135,140,152,218,199,227,212,211,138,111,130,136,133,149,198,226,205,201,190,216,215,120,82,85,61,123,169,146,112,114,122,103,120,80,78,75,103,95,66,92,99,60,53,68,51,73,103,122,108,99,97,100,105,104,141,134,128,120,118,70,77,84,68,84,101,81,100,114,134,159,170,191,191,154,175,183,178,180,183,191,177,142,170,138,125,151,211,183,158,191,198,165,139,154,179,180,204,195,231,192,214,202,197,237,175,187,139,158,146,146,126,121,95,109,99,117,98,82,109,113,124,144,125,137,100,140,127,104,117,95,78,67,111,236,122,80,93,92,108,140,108,212,175,103,190,141,65,52,62,110,49,76,128,121,108,94,89,89,77,89,119,185,156,130,127,115,85,92,102,97,223,148,135,145,123,123,68,149,195,93,80,164,142,69,137,90,91,172,148,84,111,186,198,193,87,84,69,64,86,54,78,67,65,60,67,87,44,61,76,90,103,87,96,54,72,46,46,46,37,45,71,64,51,61,54,40,27,15,50,102,113,116,126,128,103,95,78,102,84,100,95,111,107,84,104,58,72,56,66,62,73,35,1,17,25,33,24,47,17,36,20,36,36,36,28,33,4,24,46,8,49,26,32,45,29,21,27,29,18,49,48,29,47,21,45,2,37,26,25,29,38,44,71,78,71,53,16,23,6,26,35,40,31,30,48,48,65,45,43,28,36,32,20,47,32,34,42,52,35,36,29,69,49,39,28,47,31,25,46,40,72,29,38,48,19,41,36,36,59,52,60,31,41,37,33,29,47,48,37,58,44,48,38,40,37,53,48,53,59,71,36,45,20,35,32,60,26,24,43,26,31,33,47,28,52,72,46,36,44,41,46,26,64,59,42,35,50,50,43,60,36,18,61,44,22,33,48,65,55,20,52,61,48,41,52,28,38,33,52,59,43,20,26,64,38,35,31,42,29,46,71,52,28,48,37,30,34,23,49,47,70,38,39,50,65,33,57,39,46,57,41,79,76,90,82,82,83,74,92,93,86,93,96,106,98,118,99,106,111,107,91,106,110,111,96,104,98,122,141,106,134,122,139,126,140,112,118,155,113,116,106,104,133,114,133,135,143,102,133,94,101,85,94,98,107,104,124,122,124,130,128,138,121,118,123,114,104,103,115,100,123,128,102,120,87,110,95,70,78,68,70,98,47,57,67,82,97,160,163,165,166,101,95,86,107,70,78,105,143,137,158,129,165,128,121,111,117,100,90,108,124,111,124,144,158,116,149,166,164,94,117,97,16,7,6,18,16,37,27,33,10,16,34,6,0,10,1,0,15,10,25,25,18,32,26,29,183,200,182,182,222,198,198,205,202,190,198,211,196,202,209,226,249,204,168,202,140,1,15,6,4,14,43,36,22,11,43,146,202,176,220,223,206,187,208,200,207,215,166,201,193,203,203,195,189,196,180,171,212,197,206,181,175,202,175,218,185,181,193,193,206,198,210,214,208,213,198,197,184,171,210,168,224,196,203,204,205,187,197,185,184,217,193,199,221,197,204,212,188,194,176,182,183,182,208,199,189,190,191,193,185,221,207,238,205,188,217,140,118,187,137,143,147,173,146,154,155,115,108,133,120,99,117,158,84,110,179,212,126,59,68,38,97,192,212,172,191,166,120,122,94,126,127,172,195,202,195,199,222,208,196,95,59,59,88,163,170,148,105,94,105,101,64,85,78,45,92,111,93,98,121,89,101,104,94,84,102,103,118,113,71,109,88,139,123,114,117,79,74,64,75,68,85,84,63,47,82,74,115,139,157,155,127,121,149,162,174,176,145,163,147,119,155,134,128,142,133,124,154,152,127,136,134,141,164,195,191,184,175,177,170,175,182,211,170,147,149,129,140,92,115,119,109,89,124,144,110,91,84,100,98,97,111,100,114,151,185,165,192,154,126,102,158,254,121,66,108,107,119,172,190,231,186,101,147,176,107,50,59,83,44,56,164,233,174,73,94,120,94,93,79,154,204,174,138,145,136,170,173,155,198,151,146,128,68,40,60,159,142,71,103,194,135,100,147,85,99,166,96,57,25,132,160,163,85,68,49,35,52,59,56,69,60,79,55,78,54,44,54,55,80,82,38,37,98,63,60,76,45,78,71,71,32,45,25,18,47,89,111,101,142,133,137,121,98,111,86,79,83,125,84,102,116,84,32,17,26,56,43,63,38,52,11,31,45,51,42,16,20,11,24,25,10,40,15,25,24,37,22,19,52,43,28,13,6,16,17,20,14,33,40,38,38,28,15,42,30,29,32,34,36,67,66,73,47,38,25,33,25,17,58,66,50,48,59,38,42,84,64,44,52,39,48,43,28,42,48,30,35,49,49,45,51,44,39,12,27,27,36,34,28,32,43,30,40,48,41,35,44,33,49,41,13,41,39,54,63,51,63,63,50,52,40,68,39,40,41,42,52,45,31,19,37,48,36,46,36,52,39,42,24,39,53,41,42,33,39,69,21,55,45,38,37,51,57,30,50,45,21,39,17,52,61,57,60,57,51,55,42,56,48,42,40,26,45,44,45,77,42,53,35,44,37,37,48,45,35,37,32,25,31,44,67,52,15,14,37,47,29,59,7,50,31,7,45,36,21,36,50,40,17,43,48,37,34,51,77,43,41,30,18,59,43,54,46,25,38,43,15,43,58,30,48,57,68,58,25,53,46,49,85,45,47,50,44,38,47,49,57,41,51,49,78,45,72,88,47,60,59,56,75,64,48,108,78,68,88,89,86,90,73,73,83,100,117,101,115,110,79,118,68,113,107,122,111,74,111,108,109,101,104,122,104,94,106,133,115,150,158,151,117,113,116,122,107,90,119,139,154,140,156,144,148,155,121,124,119,109,88,106,100,112,115,123,117,113,124,124,127,86,91,99,27,26,9,10,15,19,24,20,15,19,39,11,4,18,16,29,17,24,6,10,5,33,21,28,206,211,202,210,192,188,203,205,204,189,198,181,182,207,204,254,252,232,176,191,126,22,11,5,12,43,9,15,40,43,40,173,220,238,230,219,215,209,215,184,226,201,216,209,215,223,176,213,213,192,210,195,211,185,211,222,190,218,201,209,183,185,191,219,211,213,206,196,240,208,210,229,202,193,193,213,199,181,171,196,215,220,199,199,196,198,205,189,208,202,243,230,197,228,205,235,207,188,155,185,196,196,218,202,207,209,209,203,162,127,127,120,132,152,95,100,109,109,86,76,72,84,87,78,90,52,85,100,61,78,172,178,53,44,34,22,84,119,146,134,156,121,89,89,79,79,102,163,171,156,147,136,137,183,158,115,77,94,169,196,156,107,95,124,120,104,114,99,105,107,124,116,117,132,101,116,119,122,122,125,166,131,125,123,147,147,118,120,101,114,90,84,120,103,110,121,106,86,94,133,162,130,133,156,155,139,151,180,154,144,176,153,124,158,141,149,171,147,154,167,175,186,162,190,186,168,184,148,165,163,117,98,114,78,79,98,134,146,156,153,162,149,115,102,117,143,89,126,143,214,180,177,109,78,64,80,63,74,166,189,198,191,210,175,115,101,237,224,100,111,87,116,188,228,248,254,199,84,109,149,122,67,91,136,113,111,157,208,161,88,111,103,88,79,32,80,210,227,238,221,166,192,228,103,130,129,97,145,115,43,82,155,82,68,170,149,103,158,189,216,181,131,56,18,0,69,80,69,52,29,26,7,19,63,57,19,58,63,34,91,51,71,82,76,107,79,96,57,74,80,57,58,60,51,64,56,40,16,15,42,70,105,133,139,110,140,111,81,106,73,96,65,81,114,98,67,72,31,19,9,26,32,40,51,59,48,33,35,60,41,31,30,31,1,25,18,20,16,21,35,24,34,28,45,10,26,40,50,36,48,46,32,40,12,57,33,28,43,26,17,33,33,34,66,59,73,77,12,39,28,41,39,21,35,29,43,72,39,87,59,61,65,42,73,55,43,46,56,62,64,59,34,45,63,58,69,49,49,8,37,38,1,4,9,31,19,18,13,30,23,35,58,48,31,61,67,37,69,54,48,62,49,53,66,63,77,66,50,70,62,55,41,44,40,43,42,38,50,49,38,45,30,34,19,25,24,56,25,50,15,2,49,22,34,45,18,44,32,56,28,37,41,47,28,39,51,50,59,53,49,68,34,48,62,53,28,47,52,46,54,49,48,42,58,37,44,46,59,26,36,46,30,43,49,37,39,32,69,43,19,60,29,58,31,32,47,45,32,40,36,42,39,47,49,35,44,47,27,29,45,36,15,53,30,51,29,53,38,52,50,32,44,27,50,23,31,30,53,43,42,27,38,59,52,51,20,39,59,45,49,45,24,10,20,30,52,42,35,48,38,49,48,56,50,35,55,66,37,21,62,52,65,58,61,41,57,74,51,49,52,28,31,20,33,33,26,38,47,74,27,73,99,111,104,67,76,119,111,102,97,101,110,124,133,99,107,134,148,117,125,159,174,155,158,142,174,148,132,142,154,136,102,136,122,102,76,91,90,111,136,99,91,109,74,117,96,15,28,41,2,20,24,16,28,15,26,22,30,2,4,22,14,9,32,18,28,15,25,5,24,166,192,186,191,203,189,175,191,182,174,183,187,187,198,208,253,247,206,166,199,114,36,27,25,43,40,40,35,51,38,73,184,223,232,240,232,200,208,198,224,201,220,231,200,224,205,200,205,210,212,217,195,214,195,209,237,212,208,211,177,217,194,240,244,197,207,213,203,222,230,191,216,211,186,198,211,215,205,206,236,214,216,214,240,240,205,172,212,210,200,170,226,207,202,180,223,202,175,207,182,192,177,204,161,136,126,134,141,75,55,70,68,114,104,76,73,97,107,81,71,66,58,106,77,115,123,120,103,91,101,120,96,65,54,95,84,95,118,98,109,128,111,86,118,155,138,143,115,119,116,125,80,103,103,126,99,141,151,178,189,176,124,151,120,144,109,157,135,134,117,98,137,131,134,108,138,124,126,121,124,136,156,147,121,170,153,127,74,79,106,122,125,153,158,154,138,116,139,126,155,180,113,128,149,116,138,138,184,181,183,176,175,142,168,170,162,174,189,186,187,189,186,187,205,157,158,144,163,160,132,99,47,82,89,78,120,178,192,191,194,183,156,152,144,144,119,129,125,150,190,227,237,238,203,159,163,161,161,206,207,169,202,210,203,84,117,195,166,110,147,112,135,200,208,236,200,192,97,107,137,163,112,72,87,160,180,116,135,122,105,104,97,90,34,12,53,104,168,208,177,148,162,181,113,127,100,71,173,152,86,138,154,90,93,128,115,168,178,175,229,185,104,51,21,12,30,23,57,52,33,40,30,45,41,29,31,25,30,71,71,65,65,36,63,66,65,63,53,46,9,52,55,52,46,44,44,21,39,37,59,110,91,91,99,115,92,89,76,84,97,101,103,118,131,94,85,33,15,21,46,50,88,19,64,66,30,25,57,84,49,72,41,6,3,5,17,5,26,21,25,9,44,8,35,25,56,52,55,80,50,54,39,3,15,33,27,18,17,27,21,66,51,59,78,66,73,58,37,23,52,23,20,34,84,58,61,55,57,48,73,67,75,68,29,80,62,61,66,38,48,75,60,45,73,39,43,53,52,20,5,21,21,6,33,16,59,35,35,53,70,47,56,29,61,39,74,39,53,53,69,44,36,68,47,44,61,37,34,62,83,57,38,53,68,47,48,54,34,31,45,43,52,17,23,21,10,9,29,6,36,37,25,25,36,38,34,38,58,41,26,36,62,75,75,61,57,40,68,58,64,47,68,68,44,35,41,59,75,90,60,52,45,30,53,50,57,54,23,18,45,26,27,38,50,25,60,56,43,26,37,18,18,32,5,18,4,37,54,24,27,39,52,42,38,53,56,40,49,42,33,51,22,61,69,38,27,29,32,37,18,36,53,42,45,47,61,20,32,55,59,26,12,52,34,51,27,42,46,29,61,47,62,54,52,70,65,37,52,59,25,37,40,31,31,71,71,55,40,59,62,62,51,71,61,45,71,57,76,40,25,10,32,26,33,38,39,20,51,60,38,77,51,107,75,109,110,131,116,107,88,57,48,59,81,91,74,124,136,121,125,128,122,146,156,147,147,130,134,168,117,152,155,145,137,136,97,77,113,126,104,108,108,105,109,120,97,3,0,5,15,10,12,49,15,21,17,30,20,27,17,18,5,8,41,2,18,11,11,19,14,177,180,188,167,159,205,199,195,178,193,192,196,204,206,169,229,249,153,166,228,139,46,26,38,26,37,38,37,32,34,94,170,181,127,158,164,148,170,172,172,148,135,128,133,160,180,192,198,199,200,210,220,176,206,209,211,189,212,198,213,192,165,200,219,171,183,177,190,204,197,204,195,202,220,213,216,176,196,191,205,209,191,222,212,211,205,167,154,128,108,159,147,165,145,180,225,220,217,242,255,231,196,166,117,58,61,74,114,113,67,83,121,127,132,128,128,128,90,130,105,85,92,96,147,131,127,137,128,124,83,81,91,73,101,127,124,119,146,114,148,164,89,119,154,201,228,164,194,159,163,133,102,108,123,140,149,135,229,196,191,181,139,131,113,118,102,157,117,104,88,114,110,128,104,109,113,127,128,96,91,104,107,149,157,149,132,111,85,97,110,114,123,132,106,137,132,124,134,136,157,141,137,103,112,92,89,138,190,154,160,146,141,135,165,137,127,149,173,190,148,158,128,118,156,118,87,135,102,131,127,92,97,127,150,184,184,181,205,160,142,170,142,139,130,116,133,106,128,124,145,208,220,254,251,250,251,236,255,228,191,188,157,144,115,95,126,175,130,91,133,111,110,162,193,229,170,161,109,116,133,166,117,106,55,163,180,115,106,115,141,162,107,91,44,11,62,12,25,104,115,126,121,186,159,184,147,115,203,116,104,216,197,46,125,97,97,188,107,73,140,148,56,14,2,31,124,104,77,67,41,79,52,47,69,21,42,58,67,102,69,43,45,27,42,50,37,25,50,49,42,43,32,11,54,29,35,63,82,91,95,126,131,78,99,69,97,87,65,53,83,103,121,136,138,80,24,5,30,58,64,116,103,92,51,59,17,25,75,51,62,51,64,58,41,12,13,31,49,46,37,18,35,14,49,34,48,28,44,47,69,79,50,23,21,6,39,16,31,37,31,57,49,75,61,58,31,24,44,20,27,21,40,39,63,67,55,71,56,48,40,45,55,80,65,55,34,78,60,49,78,58,48,62,61,36,52,27,20,3,18,18,23,31,45,14,13,25,18,43,41,62,39,63,51,63,55,40,47,68,70,52,45,65,55,44,51,79,28,37,68,25,57,56,41,38,50,41,52,72,41,37,26,7,16,26,12,5,49,20,39,5,26,24,48,35,42,40,55,49,43,26,80,79,56,66,74,58,65,42,46,44,57,68,35,73,75,58,26,36,52,61,61,31,27,49,21,44,48,34,31,70,28,37,31,28,53,39,58,20,36,15,21,31,21,31,21,19,10,29,40,27,55,50,29,47,59,51,57,79,51,63,56,63,28,48,29,21,46,63,36,59,21,32,50,70,30,60,27,53,38,14,65,69,36,24,41,15,30,50,44,7,51,29,18,64,65,58,45,60,60,52,53,35,64,36,76,43,57,46,83,67,60,56,61,67,72,51,47,40,30,7,31,27,35,46,37,58,81,48,52,47,30,49,67,44,82,98,56,46,91,19,88,75,83,69,77,112,115,109,137,105,98,132,141,162,94,135,109,77,130,117,122,152,129,111,110,100,145,139,132,113,122,143,113,131,104,28,19,21,24,21,2,2,6,18,23,10,14,25,6,10,11,17,26,9,21,16,36,6,40,124,189,200,186,209,200,212,222,208,220,201,201,209,220,197,237,214,125,143,219,103,59,48,24,3,21,43,63,43,44,76,130,97,31,81,77,91,68,140,124,66,63,77,99,102,145,152,171,124,132,120,148,117,143,160,157,140,120,161,135,112,104,135,148,105,124,122,106,109,130,127,129,163,174,146,101,94,119,152,138,125,143,156,143,148,138,114,113,86,65,89,97,79,100,128,210,244,231,239,250,245,232,162,99,85,117,102,145,101,130,139,152,143,164,146,127,137,114,144,103,143,168,104,115,85,128,99,110,128,90,127,126,151,204,168,197,197,167,148,136,144,80,88,136,180,180,140,172,146,147,138,129,140,127,151,147,193,214,164,138,137,129,92,92,104,65,111,120,101,99,59,104,83,112,71,105,90,101,100,90,92,111,99,120,111,125,111,123,119,110,115,110,106,123,89,114,90,78,114,132,132,83,104,87,85,88,150,167,145,112,95,109,107,107,109,70,72,139,140,85,58,86,124,113,113,109,125,161,150,123,158,121,141,156,151,160,135,107,127,95,108,136,107,94,126,111,140,103,98,126,135,147,149,159,148,160,188,181,206,220,211,194,205,172,94,147,224,132,109,100,117,110,182,214,215,174,149,152,146,98,133,132,104,17,124,212,132,121,124,161,200,255,240,209,174,132,104,110,140,155,189,135,161,144,192,167,157,223,80,126,212,102,75,144,128,136,148,76,22,38,77,67,22,18,89,131,137,101,90,119,154,134,107,70,96,94,71,47,69,76,52,24,27,52,56,18,25,28,56,46,54,55,31,16,12,61,60,123,95,113,132,113,91,75,96,94,68,105,79,102,117,118,109,89,21,35,10,41,74,102,123,113,120,92,82,26,31,54,102,85,110,64,46,52,56,19,14,21,32,17,51,47,48,35,23,22,13,31,42,53,56,46,35,24,34,47,24,18,45,23,45,91,76,41,15,22,12,34,17,21,44,70,51,75,62,74,63,49,35,51,32,71,73,72,49,58,51,41,53,53,57,73,51,56,44,48,42,17,42,24,19,12,13,16,42,19,10,4,38,52,59,65,59,23,23,68,45,66,60,51,38,27,41,52,60,35,61,59,44,65,81,42,45,58,47,30,57,57,60,51,29,16,19,4,19,28,6,14,35,19,26,24,18,52,42,48,25,80,43,45,28,73,52,95,39,64,77,65,47,67,79,64,48,53,46,59,64,58,44,29,48,51,40,57,47,35,42,47,46,32,45,36,35,56,44,61,29,63,19,4,34,31,8,32,5,33,28,8,40,34,36,66,49,46,33,43,50,36,21,44,48,58,33,58,29,36,45,44,29,41,56,62,50,41,63,55,37,54,45,44,56,35,57,30,17,29,15,56,13,52,25,42,12,34,47,37,30,76,49,35,51,35,53,45,53,68,53,63,72,79,77,63,55,69,58,73,51,64,19,25,21,19,44,34,77,91,131,106,63,39,72,75,89,64,37,83,54,35,47,43,33,83,127,110,125,103,133,139,134,122,93,115,121,116,104,111,103,87,104,94,83,129,111,78,75,105,121,124,102,111,113,135,143,100,145,105,1,30,20,14,18,14,33,3,24,9,27,24,48,6,4,8,36,6,51,8,13,5,18,22,179,174,210,194,217,208,196,216,198,194,203,205,218,223,222,255,218,158,106,153,37,22,33,43,27,21,51,36,34,51,77,128,113,54,64,93,72,99,142,119,128,89,105,99,129,135,123,133,94,51,51,75,72,85,110,111,67,60,85,92,97,52,85,94,67,60,64,60,83,78,69,57,67,108,90,56,87,76,103,101,60,77,72,75,119,94,79,64,74,60,80,79,103,99,82,136,181,181,169,182,156,156,118,115,102,115,116,118,140,131,143,133,144,131,103,143,132,127,152,120,111,115,135,102,101,140,143,111,141,132,170,147,196,212,236,217,179,170,160,140,140,77,118,136,138,118,136,144,120,139,114,134,105,99,113,166,208,176,131,115,91,108,50,56,91,87,114,135,105,82,64,56,88,83,100,116,102,128,82,67,83,123,96,116,148,143,142,106,117,95,96,99,109,92,130,136,111,85,49,107,134,90,109,113,118,99,120,124,108,100,84,77,108,125,111,110,131,123,105,90,103,100,91,101,90,132,166,161,160,160,159,160,121,101,94,91,82,84,85,72,80,66,86,114,120,121,96,123,84,79,38,54,56,61,46,32,63,115,148,195,234,224,208,219,112,105,187,155,128,122,122,157,239,235,180,152,174,164,186,91,117,185,160,70,146,244,138,81,90,124,185,248,232,255,237,247,206,238,191,162,217,109,101,120,175,119,181,175,98,169,139,79,124,202,246,193,103,63,23,16,40,52,17,55,48,77,85,61,18,86,96,93,85,83,77,95,63,61,69,39,46,35,31,55,42,43,57,7,60,86,49,61,15,26,46,54,114,126,105,133,118,104,92,87,87,123,100,98,105,103,109,90,65,39,20,22,32,84,103,125,106,136,126,113,75,38,44,94,104,117,106,55,36,24,32,44,13,17,18,32,15,0,34,40,11,18,35,44,45,41,73,45,47,57,44,26,45,34,56,75,59,65,69,26,24,7,21,13,31,58,63,67,66,67,75,56,50,58,20,44,64,69,64,54,67,46,55,36,61,40,50,27,35,59,50,50,71,5,36,44,14,24,19,7,14,17,28,56,52,36,36,40,70,53,68,59,63,32,74,52,64,58,37,43,71,58,60,64,50,73,41,49,47,49,67,66,44,24,63,47,45,38,38,38,2,28,14,34,20,24,22,33,46,22,64,73,34,24,37,20,57,68,61,46,43,56,49,49,64,49,75,59,51,59,53,68,53,30,50,57,40,40,60,46,45,50,24,40,27,38,41,59,41,43,28,36,29,25,43,26,3,29,6,21,32,31,29,39,31,44,52,53,62,45,34,64,58,54,65,43,24,45,15,39,25,32,49,37,30,52,35,59,49,44,75,43,31,74,59,61,44,3,27,28,7,41,18,37,26,34,9,12,65,12,27,40,42,57,35,72,49,58,66,36,31,60,45,58,53,37,58,50,60,69,57,60,32,17,14,43,30,13,14,8,85,129,157,91,99,104,128,121,131,133,90,103,102,75,103,93,108,101,138,141,128,137,119,139,123,110,114,126,134,146,119,126,99,119,89,118,92,106,62,75,83,121,89,94,136,108,95,122,127,105,133,120,24,0,34,22,11,37,0,44,22,35,5,9,15,6,1,17,9,18,29,17,29,22,24,23,177,170,185,165,169,142,118,135,116,149,138,143,134,134,173,253,228,134,59,66,5,35,18,22,27,47,25,29,42,48,101,145,118,83,120,172,132,150,190,164,148,148,148,142,138,118,130,134,118,92,86,86,73,94,92,102,95,61,69,84,67,101,87,103,64,104,98,108,95,94,93,75,84,97,109,98,101,90,94,114,94,100,95,75,95,133,122,109,102,107,107,133,122,118,74,86,82,88,61,79,80,75,85,120,101,99,95,114,123,91,96,109,119,98,95,93,126,119,119,101,102,111,128,122,128,123,109,113,123,137,161,148,185,243,236,233,236,208,209,201,178,117,124,118,116,120,115,96,94,87,109,110,93,98,146,178,191,146,115,140,131,103,89,81,110,89,125,147,115,96,77,101,120,122,135,118,91,100,120,83,144,148,143,136,139,159,115,102,139,135,114,126,100,119,121,140,128,127,114,86,80,102,79,115,122,112,130,84,100,89,117,105,133,143,130,118,134,127,84,118,84,90,94,113,111,151,170,164,152,144,145,163,131,92,74,111,88,82,107,103,89,102,112,97,122,87,115,91,84,63,57,47,65,71,57,72,69,94,110,130,157,136,162,166,116,83,93,94,138,148,153,132,152,126,85,78,63,90,72,72,104,170,171,87,83,133,120,122,59,24,89,223,235,236,232,246,232,177,150,117,186,164,106,157,120,118,185,89,72,213,61,101,188,221,246,145,103,33,20,65,37,53,43,45,42,44,64,18,63,48,46,38,46,46,49,31,36,37,42,53,37,27,13,56,27,28,37,50,38,34,44,19,36,37,104,72,97,99,98,102,100,106,89,117,76,84,76,131,105,127,95,73,15,3,38,74,70,110,98,83,98,134,101,119,106,68,110,103,96,75,41,40,47,25,22,26,11,39,31,41,8,41,50,29,28,24,32,42,11,53,51,64,43,45,26,41,39,40,55,65,40,38,13,15,15,13,12,37,67,53,66,47,83,56,68,68,44,71,55,69,52,45,57,51,61,77,65,75,33,61,45,42,36,48,36,52,45,18,2,28,25,16,9,14,13,29,37,40,69,60,27,62,30,48,57,54,45,57,59,19,37,69,64,59,52,47,88,18,28,65,27,32,62,42,49,56,75,30,70,53,47,9,18,10,21,16,32,50,31,36,12,29,26,39,42,52,11,16,13,39,45,58,53,49,37,65,31,55,67,41,60,60,61,63,42,69,48,38,57,77,75,54,49,36,49,74,30,38,36,28,37,38,39,51,64,27,26,27,28,4,1,22,39,15,5,28,22,12,33,31,47,65,34,58,45,38,68,54,46,46,55,38,71,39,40,42,48,29,58,39,59,48,39,48,30,56,74,43,58,64,44,38,24,22,28,12,5,26,12,46,3,29,8,37,49,57,37,52,52,73,56,49,41,48,45,59,59,39,65,75,50,67,71,34,59,60,53,13,44,55,45,29,29,68,103,130,116,88,117,130,118,130,153,123,91,116,149,148,152,143,154,151,151,156,131,109,107,136,115,99,137,122,146,150,158,114,123,121,135,122,137,148,120,124,110,103,98,88,87,58,65,95,91,109,127,89,6,1,17,12,43,9,19,28,7,17,23,33,14,2,12,17,15,27,18,14,9,19,16,14,143,111,115,117,129,117,96,59,46,34,30,39,73,70,122,241,225,124,66,59,2,3,39,53,29,49,43,27,42,68,115,147,153,167,162,143,156,141,194,205,165,113,128,120,137,118,136,110,128,149,104,137,147,139,143,129,114,110,93,87,110,112,113,124,95,110,95,81,119,109,121,120,88,92,117,121,137,118,129,126,126,143,116,109,129,146,130,92,127,116,135,119,114,114,94,111,98,69,101,83,93,78,96,93,105,91,100,119,96,124,103,102,104,103,87,96,91,104,132,122,114,137,104,104,128,118,127,131,119,154,131,170,230,208,221,239,240,251,255,221,205,143,127,125,127,126,106,85,105,117,122,127,150,187,180,205,187,118,107,152,126,132,133,158,165,169,137,202,164,158,168,162,177,143,145,138,150,157,159,143,153,155,156,146,108,162,127,118,147,114,147,142,129,109,110,122,157,119,125,118,113,102,124,110,99,111,95,94,88,117,92,130,138,143,139,114,101,102,119,109,104,127,152,129,127,158,161,152,155,152,165,167,134,53,80,77,93,83,93,103,96,111,121,100,115,98,109,106,111,148,127,112,110,81,73,104,96,87,107,112,74,58,81,140,145,123,35,55,99,57,53,54,87,64,34,54,42,44,9,8,40,69,162,108,73,19,71,141,103,22,9,10,143,202,236,247,184,126,151,115,207,149,144,148,107,139,140,34,118,169,143,161,158,58,117,115,51,25,14,64,49,38,62,24,35,23,50,13,53,30,11,40,31,27,44,23,1,42,34,43,42,17,20,12,7,57,25,25,27,53,32,49,33,23,79,116,113,97,115,101,108,100,99,114,80,109,122,107,130,87,45,25,37,11,63,109,116,106,129,115,85,109,109,98,89,94,102,81,44,34,46,44,21,12,33,11,7,33,38,14,28,19,21,14,34,16,18,20,36,27,51,84,74,90,44,34,31,81,69,58,24,13,26,25,25,36,35,44,65,55,86,42,69,75,46,46,80,47,27,62,60,52,64,82,37,59,70,61,62,39,60,60,47,30,61,45,27,43,45,25,5,14,12,32,43,10,45,37,41,47,29,42,67,64,56,66,56,14,16,29,48,58,64,59,73,53,60,81,59,62,44,51,51,78,42,69,34,46,51,40,26,25,29,29,12,26,37,6,10,38,21,38,37,45,35,9,34,5,5,50,51,56,72,59,65,90,52,77,54,49,70,62,71,61,52,53,53,82,49,32,28,58,38,43,40,16,59,56,72,70,58,47,54,44,72,50,52,42,35,8,28,19,11,20,33,23,21,25,50,17,32,53,49,44,57,58,69,47,41,62,54,32,59,51,55,24,27,67,63,69,54,69,29,64,78,59,44,43,42,39,52,16,22,40,31,9,29,32,22,41,16,28,33,43,33,29,54,46,51,51,76,39,76,57,53,37,81,46,50,53,70,79,30,42,83,58,41,37,31,30,57,25,56,107,131,115,106,83,89,107,121,127,77,113,94,110,117,115,158,120,140,141,121,108,152,94,115,114,118,137,133,145,150,127,157,133,147,151,127,95,139,177,99,121,112,118,115,130,116,94,87,90,60,79,116,105,31,11,11,11,14,10,9,16,33,11,11,19,9,4,43,12,4,9,13,8,0,42,7,4,96,103,95,117,155,183,182,153,130,105,98,101,80,74,133,244,238,148,83,137,60,33,41,38,44,25,45,17,25,43,83,164,163,182,168,175,188,192,170,142,146,111,140,157,125,123,105,108,129,134,130,189,161,168,158,173,158,131,116,124,126,152,115,119,137,119,120,111,97,139,126,114,113,107,82,120,128,115,135,131,125,117,153,126,97,127,131,82,128,123,118,131,92,96,107,94,112,78,84,105,99,111,123,100,109,108,130,117,131,123,121,125,113,122,105,95,114,111,111,117,128,116,127,130,133,106,140,130,132,117,117,120,143,143,149,168,209,196,210,231,218,208,180,156,146,107,113,110,115,96,122,151,175,211,237,245,173,160,172,173,188,188,233,202,186,206,152,174,177,177,200,202,185,136,142,126,188,184,178,138,112,124,107,91,146,125,100,137,129,110,129,162,128,130,127,112,98,115,128,122,108,123,122,155,137,131,106,130,131,119,137,118,139,132,107,103,117,83,93,118,119,163,217,198,123,165,172,171,147,133,153,123,154,100,85,119,123,138,116,108,126,108,131,108,127,126,150,146,172,242,230,122,96,106,109,142,144,101,88,115,105,100,184,205,193,177,42,7,8,9,21,10,99,165,132,139,163,170,97,28,12,36,134,152,172,128,129,200,125,84,54,32,10,62,77,150,155,158,209,195,211,147,162,130,117,192,104,90,172,196,212,186,89,28,26,49,42,41,61,75,50,30,36,17,34,34,25,15,17,26,10,34,11,16,41,37,12,20,42,39,25,45,11,22,12,26,16,34,10,49,59,77,60,49,75,98,100,108,108,104,105,101,93,101,102,90,116,130,100,41,17,20,35,74,94,100,88,101,85,102,62,105,115,114,96,64,85,71,32,31,37,40,7,21,47,47,29,16,29,3,18,22,30,55,24,32,20,36,22,49,30,45,61,58,57,28,53,48,25,16,18,11,40,39,31,52,50,81,74,59,60,61,38,52,37,75,63,45,61,48,60,64,48,68,61,68,85,70,81,59,65,60,43,41,41,28,21,31,27,49,37,10,23,24,16,27,37,41,18,38,50,40,69,54,49,46,41,45,41,31,53,54,53,48,83,61,41,37,55,38,63,49,45,44,47,68,61,58,52,35,34,12,25,15,5,10,16,8,23,33,30,81,45,46,41,15,21,11,17,39,78,56,77,60,34,65,81,85,73,73,51,56,84,65,51,52,92,84,65,66,58,52,55,44,55,25,52,32,57,62,40,42,46,65,32,47,15,2,27,24,11,19,8,24,36,16,42,58,26,61,44,57,26,40,71,44,55,69,71,68,59,70,80,81,64,40,38,64,70,56,43,36,50,48,35,58,52,45,49,39,47,37,27,12,26,17,24,30,15,29,18,25,21,23,65,37,43,14,67,51,33,49,46,66,60,58,54,55,82,81,82,81,90,72,80,56,37,45,53,28,24,32,62,150,107,110,120,134,106,54,73,76,68,103,62,60,126,129,122,116,133,138,115,124,113,113,159,152,128,121,151,149,144,147,121,129,154,108,134,131,155,149,136,116,116,142,110,118,128,120,99,87,106,111,87,86,5,29,14,16,29,9,16,31,20,46,20,13,3,16,36,6,20,23,18,25,6,43,1,29,87,97,89,152,192,213,212,190,204,174,150,132,127,118,203,253,245,158,148,183,97,29,27,9,10,13,41,23,50,37,63,148,135,179,151,181,166,148,140,98,108,137,138,148,107,134,109,130,143,145,127,141,145,156,127,147,167,134,104,113,110,71,89,114,95,97,115,98,113,128,124,116,115,104,80,96,136,128,121,131,116,121,119,133,123,136,120,130,121,129,110,107,131,129,103,93,146,117,106,111,127,143,115,110,137,144,131,149,135,117,131,93,105,90,121,86,136,98,120,122,116,106,103,116,95,118,122,125,156,136,99,84,92,71,107,71,125,148,185,171,213,221,237,248,208,139,119,116,127,125,118,114,150,249,223,167,167,163,171,178,184,197,189,189,190,193,138,149,117,163,202,191,137,98,133,150,188,170,137,124,90,93,70,95,125,126,87,101,109,69,112,108,116,117,94,98,111,99,112,117,128,157,148,137,124,109,97,139,112,123,106,113,138,97,89,67,70,125,155,176,144,182,245,212,152,157,147,153,150,151,155,172,147,125,49,84,111,138,111,120,145,209,167,131,119,131,136,129,178,216,219,123,102,109,141,164,160,133,66,83,92,113,188,232,220,145,122,104,76,71,22,56,145,201,198,185,191,194,186,110,91,32,99,163,167,153,221,212,122,118,109,71,95,76,55,129,172,155,157,171,201,158,170,103,145,179,123,184,208,217,189,105,99,10,14,32,70,76,56,39,30,36,22,50,22,15,18,16,49,28,29,24,22,37,27,26,16,5,27,34,32,18,34,23,23,22,24,48,35,51,76,89,42,53,90,107,107,105,96,70,83,93,102,118,102,87,117,82,43,34,16,47,68,114,123,112,113,123,112,104,69,76,109,94,124,107,96,95,94,129,107,74,28,44,50,50,14,40,30,17,26,12,35,30,38,36,21,36,12,13,33,58,54,77,98,43,45,28,7,19,5,34,40,24,66,50,78,56,63,63,52,40,40,39,51,53,50,32,66,64,67,51,37,72,65,47,47,58,48,58,64,31,49,38,46,25,32,38,29,8,19,11,26,21,24,43,43,52,35,38,39,56,47,25,56,30,21,51,24,41,30,52,63,37,51,73,75,53,60,59,62,50,64,33,63,50,66,52,60,25,28,34,8,19,24,33,3,27,45,25,52,30,47,27,12,33,32,37,51,92,78,84,60,57,48,52,57,59,49,69,70,59,79,67,64,63,46,67,57,43,58,27,42,27,44,43,62,15,60,42,41,61,30,40,43,31,19,13,35,18,37,47,14,18,35,24,0,37,53,17,33,56,38,47,71,56,60,70,58,66,71,90,28,16,47,40,59,22,47,66,44,49,22,51,46,77,65,58,61,36,45,20,16,38,26,10,36,27,5,7,6,3,28,68,35,81,54,34,54,45,53,73,61,62,68,54,60,64,59,61,47,53,60,76,49,37,56,39,34,55,41,44,128,159,161,149,138,136,109,62,44,95,97,100,67,107,116,121,124,156,133,126,128,120,157,121,139,149,160,118,129,95,142,129,105,129,114,94,126,152,186,150,139,155,132,110,123,135,176,133,122,129,119,108,84,70,12,21,49,13,6,5,5,25,16,29,20,11,6,22,7,15,27,26,26,5,17,23,23,11,81,101,80,86,154,133,121,153,139,181,154,136,144,185,226,254,243,136,179,233,150,81,12,12,21,19,48,38,51,33,70,89,78,109,127,95,110,133,67,82,85,97,95,91,101,98,110,81,92,69,82,103,105,76,104,97,137,111,78,99,72,64,96,91,113,135,103,103,121,130,131,128,102,101,97,145,138,128,125,92,96,118,120,110,139,114,129,130,132,117,112,123,143,129,129,149,128,122,115,123,102,132,116,136,120,168,152,138,167,117,101,111,75,134,94,98,92,82,110,115,108,107,124,133,123,85,103,134,130,125,99,92,95,64,66,77,77,75,118,140,147,176,162,215,219,203,182,150,139,108,86,64,162,175,157,126,112,124,156,159,129,142,147,134,141,152,113,103,91,84,113,111,120,97,105,111,117,156,119,116,102,115,120,149,89,107,111,112,98,59,118,104,93,90,71,79,70,96,89,83,119,146,130,132,84,126,123,126,114,98,124,107,102,110,84,91,163,156,178,188,143,195,239,200,154,160,168,171,170,135,145,140,171,147,91,78,88,111,120,139,197,194,162,171,129,113,120,104,97,184,131,120,124,99,161,178,201,143,92,80,95,102,139,142,172,180,227,151,158,163,131,111,85,172,207,154,127,146,179,131,83,7,45,127,159,136,200,179,93,80,78,81,87,86,85,110,146,137,102,115,179,194,135,82,151,146,161,234,199,238,198,61,34,1,59,64,55,60,40,42,39,7,19,29,33,19,53,14,34,15,61,25,29,29,20,29,0,14,17,11,40,15,40,22,20,29,3,25,40,63,65,86,78,81,110,125,89,75,77,69,74,109,91,116,94,109,92,25,17,19,40,88,107,147,91,90,81,103,90,177,93,82,111,102,114,101,115,125,123,150,159,77,29,57,22,24,16,74,19,2,19,17,25,48,18,19,5,19,22,31,29,60,71,81,96,90,50,34,30,4,16,13,54,49,46,76,52,76,50,52,78,45,50,57,64,64,55,80,59,58,91,55,65,41,59,71,74,25,22,60,34,58,29,38,53,13,7,32,19,15,15,21,46,30,15,12,49,32,44,39,67,34,66,48,55,53,48,41,19,33,27,51,49,71,69,57,31,64,40,60,63,64,69,58,55,29,51,74,31,13,14,34,18,17,5,40,25,28,16,23,68,33,30,30,42,57,38,43,58,77,43,64,64,50,24,71,50,58,59,48,61,38,31,47,67,46,40,61,45,58,76,48,46,61,77,42,74,61,64,73,39,51,45,54,59,46,23,21,8,34,13,9,14,24,27,11,13,19,59,36,44,56,48,45,96,34,46,66,43,41,68,37,53,62,49,59,57,57,35,44,78,49,47,71,47,63,35,57,62,26,28,51,18,15,16,18,16,33,25,33,23,50,43,29,75,66,55,67,48,78,67,44,63,57,47,64,59,91,65,63,85,66,69,60,34,44,50,40,35,32,55,103,130,147,109,122,128,136,118,101,129,121,104,142,141,87,114,126,114,108,129,145,137,133,141,130,117,145,157,110,109,88,117,107,110,112,108,153,132,142,128,127,146,152,147,149,143,120,137,108,117,132,102,121,108,95,7,1,42,20,14,10,15,3,18,13,22,9,6,0,23,8,21,18,32,13,7,14,21,20,101,99,86,74,85,99,56,86,92,108,90,96,149,195,244,247,240,115,99,238,133,54,14,34,7,42,36,76,38,28,85,79,83,90,90,74,77,81,73,68,71,69,93,108,85,98,80,108,78,75,91,83,90,76,63,78,113,68,98,81,91,97,87,83,90,109,98,127,95,89,117,122,126,103,109,118,129,133,74,122,121,117,91,112,119,113,102,130,140,116,114,135,142,91,129,119,148,143,123,143,152,126,120,105,114,112,131,149,142,134,116,138,131,135,138,145,146,97,113,104,94,112,110,120,112,105,67,115,151,165,137,145,137,86,111,90,84,91,85,112,108,64,119,189,204,217,240,232,187,133,86,32,120,143,122,64,95,107,131,90,134,123,100,82,104,103,108,95,86,95,85,132,106,93,121,88,103,144,127,119,138,137,142,137,101,80,62,140,108,84,110,91,86,87,53,76,71,66,90,72,84,94,111,136,128,131,143,108,116,125,132,105,107,145,151,143,173,191,191,182,132,175,147,150,141,159,164,164,156,147,132,137,177,172,118,57,85,85,140,114,160,148,149,134,141,109,119,104,118,136,94,122,152,157,185,150,178,194,130,78,89,144,152,127,90,185,242,192,250,185,167,107,124,181,214,191,159,127,146,67,60,4,57,172,175,136,168,104,51,30,49,72,66,43,73,118,171,86,122,161,182,158,77,101,152,129,177,161,90,159,109,20,18,17,64,33,25,35,42,25,28,31,37,23,30,23,54,34,30,9,28,34,58,15,43,20,11,33,32,35,17,22,27,34,23,31,22,46,76,91,90,52,66,80,99,75,85,69,83,67,84,123,96,88,87,67,24,18,11,78,105,128,126,126,108,77,121,89,100,136,87,104,108,95,142,103,100,111,113,127,141,152,89,50,31,48,37,47,42,27,19,34,24,40,44,30,5,22,17,16,53,44,97,72,91,74,68,63,44,35,56,33,53,44,79,66,65,79,61,65,64,65,46,64,41,54,29,46,42,58,56,57,29,45,36,38,29,53,24,65,45,29,20,41,25,6,13,50,35,12,15,17,9,23,41,44,53,35,43,62,31,67,47,56,52,37,18,27,41,19,53,64,54,57,46,60,49,49,43,46,71,61,56,37,63,39,53,58,43,4,43,23,42,32,21,24,20,14,12,55,58,20,53,44,29,26,43,32,39,75,64,74,69,75,52,52,58,40,38,23,53,69,51,52,51,53,90,53,52,61,85,77,55,48,45,60,69,45,47,46,45,33,63,56,39,11,41,17,21,13,20,38,16,19,14,44,36,40,30,52,59,58,66,60,61,51,50,74,76,71,70,38,42,42,66,87,62,44,45,53,50,57,69,60,47,62,80,65,22,9,31,7,24,45,11,31,31,20,30,14,45,47,56,54,57,65,59,69,31,47,73,44,41,76,76,66,57,57,72,64,67,46,52,40,41,40,35,76,62,84,67,96,134,116,107,122,110,116,124,139,145,146,139,147,133,120,132,109,122,112,142,153,110,120,118,105,124,148,123,122,103,151,114,133,106,111,101,95,99,102,88,110,127,128,155,131,127,82,83,91,115,94,82,89,99,93,28,26,17,1,8,5,46,16,0,9,0,29,9,1,11,0,10,20,35,29,1,33,26,3,128,150,140,164,158,148,109,110,120,127,80,106,103,133,191,247,234,65,88,201,82,25,34,17,26,26,70,76,53,50,72,84,64,63,87,67,72,58,83,72,105,90,64,92,99,90,96,105,77,74,108,72,76,121,82,116,58,100,97,110,127,96,103,119,86,79,89,84,96,101,113,145,127,106,100,119,111,124,103,100,97,101,95,111,109,126,109,126,144,112,130,129,117,101,114,104,140,116,140,117,117,83,113,95,127,139,144,143,136,118,143,114,116,117,142,154,173,135,135,128,118,112,92,123,91,106,120,163,190,175,166,116,105,120,137,137,114,117,120,137,108,152,82,151,205,193,231,240,243,214,126,119,129,128,125,60,55,76,84,116,146,146,123,132,96,107,100,104,92,101,113,98,101,122,114,90,109,109,120,96,105,128,99,86,76,82,113,148,134,103,129,111,91,77,88,53,85,79,72,41,70,86,116,142,114,125,135,109,89,117,135,181,119,129,154,160,176,185,186,156,161,154,178,160,134,186,206,184,152,169,179,132,135,130,113,104,79,100,134,102,117,112,117,120,104,123,134,121,134,116,116,91,130,163,162,166,163,167,132,104,131,125,126,136,141,216,209,182,247,213,159,78,122,220,223,147,129,118,97,86,83,0,70,173,163,138,181,105,35,87,80,111,150,117,119,166,172,74,135,183,137,133,75,168,186,154,142,83,8,32,19,37,65,40,43,23,41,55,11,75,35,45,46,45,39,31,26,17,46,32,36,5,17,10,21,53,26,7,25,29,33,35,41,22,16,21,53,59,81,84,80,75,45,67,81,96,90,89,74,78,103,124,108,83,48,23,8,30,66,88,130,120,100,99,102,77,88,84,94,83,21,62,83,63,81,97,95,112,120,98,126,128,110,50,43,60,66,67,47,55,52,14,35,22,25,38,41,12,34,38,53,50,38,79,75,55,38,66,81,55,47,59,75,54,63,60,68,52,49,71,50,30,63,32,49,84,24,46,57,54,38,49,37,56,70,57,63,82,44,59,57,26,33,34,52,10,37,26,34,31,29,23,26,30,13,43,61,62,59,28,33,87,54,64,47,39,48,34,35,32,40,36,52,60,55,55,54,79,48,73,52,81,65,45,44,33,64,27,11,26,7,13,10,43,21,10,44,32,36,45,59,37,61,41,16,37,44,20,63,93,48,73,45,55,71,61,35,66,40,41,53,56,80,75,48,34,47,47,61,43,60,45,62,44,81,53,45,55,48,76,62,43,38,79,59,29,27,18,33,42,11,20,17,12,23,11,15,68,39,53,57,57,41,58,50,62,67,77,40,61,57,70,63,49,61,45,52,68,25,61,51,68,45,67,56,65,56,55,61,27,36,20,7,22,36,11,37,46,29,25,15,58,54,51,55,75,71,52,48,36,64,43,63,79,75,60,64,80,86,58,41,61,47,52,66,70,89,180,148,131,147,96,96,76,110,93,94,73,87,134,126,125,141,147,97,84,83,104,122,101,111,84,88,120,131,132,145,167,139,151,142,149,128,121,145,117,79,66,111,88,67,98,118,146,113,112,97,75,92,108,83,94,114,76,125,98,29,12,43,24,24,15,6,11,19,17,26,19,0,2,11,17,8,18,24,21,9,12,24,4,167,146,165,215,213,181,155,146,128,133,122,107,117,109,177,238,238,99,54,85,8,26,40,9,21,40,27,33,49,61,70,103,103,114,109,89,114,98,84,80,96,104,112,102,107,79,78,64,94,86,93,103,87,110,101,118,90,89,111,139,151,114,120,111,100,88,121,122,94,122,115,97,82,81,103,76,112,109,99,118,101,115,115,118,137,103,126,97,138,112,141,103,102,84,93,97,114,104,103,131,122,90,117,98,129,140,116,127,98,102,98,104,105,120,122,115,120,133,113,100,105,122,94,102,92,104,97,163,128,123,134,111,118,144,179,131,113,126,129,149,94,100,115,150,164,161,149,159,197,246,223,213,180,160,182,139,146,82,47,102,153,167,150,143,133,125,138,115,161,133,151,115,113,131,123,74,88,116,89,91,113,106,114,54,53,84,125,132,103,110,114,93,119,84,96,100,82,72,73,81,44,58,93,104,119,109,110,116,97,109,123,101,105,154,160,184,195,188,168,167,162,163,171,185,174,177,169,178,157,167,167,81,81,126,132,151,114,143,138,102,114,90,118,87,72,99,109,129,113,97,124,87,101,123,165,133,121,126,130,139,158,125,130,120,166,225,216,203,244,209,186,118,169,221,140,81,129,169,157,24,28,19,75,205,136,157,198,92,38,91,136,183,160,164,143,176,149,95,182,154,131,160,188,228,240,148,85,16,0,11,47,76,86,65,62,15,26,21,33,52,32,29,23,16,38,63,27,28,26,36,60,21,7,26,30,38,42,23,23,22,27,48,10,34,20,30,34,35,99,108,97,95,41,70,74,74,64,89,87,93,125,151,83,53,16,23,24,59,138,126,120,98,90,99,78,102,94,68,148,107,12,41,47,71,91,89,58,54,84,112,105,142,119,101,56,55,72,61,65,53,38,60,35,31,34,36,27,60,56,50,45,48,33,66,54,50,45,62,94,70,23,41,66,85,80,67,62,57,58,64,18,43,36,69,50,55,41,75,48,61,64,38,63,55,65,82,70,40,34,40,62,52,58,49,33,9,24,23,22,32,27,12,15,17,30,36,80,43,61,59,42,38,68,47,61,27,45,34,30,55,41,70,75,46,66,45,56,26,57,60,22,26,49,80,65,17,62,44,17,37,11,30,1,15,16,32,12,14,35,47,71,53,33,58,16,23,39,19,37,44,55,37,56,55,62,41,47,30,57,56,39,21,22,53,30,35,48,39,54,31,63,54,43,42,87,58,63,51,44,74,51,46,66,56,52,34,6,20,9,14,28,13,19,7,30,45,61,33,68,46,52,57,58,60,37,54,70,78,66,73,70,59,58,58,63,37,31,30,35,58,67,61,46,70,56,79,49,44,73,1,24,12,16,3,42,30,24,24,28,36,32,50,54,42,62,72,76,54,64,75,38,52,46,58,48,32,52,70,65,63,43,59,74,42,51,109,177,182,188,162,143,131,69,45,84,72,96,108,95,112,141,99,104,101,88,61,42,75,84,116,110,111,89,95,102,118,141,153,154,135,111,146,118,113,137,157,116,104,131,119,110,116,131,97,123,107,96,127,89,106,89,81,80,107,99,73,7,1,17,28,23,17,14,8,16,41,32,10,23,19,3,29,5,8,31,18,14,5,26,3,120,125,115,152,176,163,140,98,124,145,102,130,142,147,201,251,234,83,57,58,23,54,27,17,42,28,33,45,49,47,89,114,114,116,106,103,99,122,123,126,103,97,82,138,130,123,124,114,112,108,136,113,100,98,108,129,89,125,131,102,135,108,99,106,105,93,109,101,103,99,106,84,94,82,114,146,102,113,121,144,140,142,121,123,99,122,79,88,100,74,87,110,101,91,123,104,120,84,91,81,123,92,123,98,101,104,117,116,97,106,86,113,112,75,90,104,105,96,99,88,88,100,92,93,75,94,98,116,109,106,100,108,115,123,148,131,104,91,83,105,92,69,91,95,130,230,183,149,168,222,208,155,106,98,201,245,216,174,148,169,167,128,142,144,139,146,152,136,127,124,116,137,94,118,89,88,83,100,62,81,120,120,78,78,82,91,134,119,81,95,102,72,75,87,129,122,103,122,87,73,73,86,115,99,86,139,125,119,124,120,101,121,162,151,165,170,172,183,169,177,181,171,160,160,171,169,161,159,172,182,166,134,74,121,115,82,113,125,145,99,122,122,113,107,39,70,92,118,112,76,81,93,86,71,105,105,97,104,126,118,125,114,127,125,196,242,214,246,251,240,166,160,234,205,81,51,185,217,175,70,37,5,127,189,157,196,215,123,66,123,146,125,119,150,164,175,131,141,181,159,174,186,200,149,216,137,59,16,9,59,38,77,39,47,37,32,16,36,22,24,33,33,45,19,54,50,61,24,44,37,24,21,20,9,12,6,24,33,25,11,35,26,49,24,17,36,70,67,84,96,110,88,92,81,60,92,101,91,88,84,142,90,51,31,12,54,85,88,112,90,92,108,106,101,76,96,127,119,134,149,32,25,37,71,86,65,115,73,84,109,116,104,52,86,103,103,52,40,56,45,52,30,39,23,31,23,24,40,59,59,37,15,6,38,87,37,64,87,100,49,82,64,45,49,56,62,52,30,43,63,79,50,49,45,72,68,51,39,61,51,49,72,48,64,74,36,62,34,29,46,39,51,63,34,31,21,14,4,6,22,25,16,35,44,20,51,45,43,76,74,69,64,45,56,50,47,41,19,55,27,40,38,70,44,37,29,41,55,65,47,31,75,54,74,39,50,50,30,11,2,0,24,13,13,31,29,24,19,11,53,40,40,56,32,38,44,15,48,63,55,62,72,61,48,48,55,21,52,49,27,12,46,37,24,10,55,24,52,45,75,57,28,62,70,43,41,75,69,63,41,46,59,69,29,39,9,16,9,17,28,41,23,17,19,30,18,80,88,59,65,63,50,68,49,67,81,66,63,53,76,51,76,33,74,44,47,38,47,76,45,47,49,59,40,75,52,62,69,29,32,19,19,13,26,11,34,3,8,20,46,50,63,59,42,69,56,47,47,21,41,54,71,56,41,65,34,51,47,76,62,77,60,64,42,77,111,154,146,136,108,119,68,63,50,92,114,54,97,127,136,97,123,79,97,131,84,59,79,81,108,94,105,115,132,116,150,135,163,138,118,95,101,132,132,150,187,142,121,76,136,91,102,83,123,116,120,120,148,113,87,64,81,38,69,60,73,24,34,16,34,8,0,13,16,7,0,17,18,23,4,27,14,16,14,25,5,33,4,42,32,139,139,121,110,121,133,104,123,140,105,103,129,167,160,220,227,219,74,62,57,2,60,4,5,38,11,29,30,61,54,80,118,110,78,116,109,141,110,110,99,124,102,104,105,118,133,127,123,121,134,152,117,91,111,132,123,130,149,107,126,101,104,103,126,93,94,129,101,104,107,104,91,115,120,114,125,122,109,120,126,110,128,154,118,128,128,87,85,96,79,104,117,102,113,107,125,86,80,76,99,98,97,99,57,63,93,104,135,104,109,129,103,106,87,112,127,108,139,136,117,124,114,129,96,84,105,114,117,105,132,101,102,110,99,116,101,123,95,51,91,77,75,45,36,95,160,158,137,155,142,103,105,11,59,125,211,230,244,211,199,142,127,111,85,91,119,129,98,125,89,87,83,99,109,119,110,104,158,94,104,86,114,96,108,67,66,108,110,77,112,92,92,86,108,127,130,147,109,96,91,70,68,115,109,139,152,148,146,102,146,121,147,173,171,172,124,157,147,166,148,182,124,158,156,174,137,145,155,160,155,158,148,135,107,112,107,139,159,123,95,103,119,65,66,86,80,107,114,86,100,79,68,92,83,50,101,80,64,86,108,80,109,66,152,237,244,233,230,248,226,134,182,206,148,58,47,100,198,171,50,26,4,125,254,192,162,176,139,119,124,172,140,130,154,184,143,101,170,171,168,169,152,111,44,81,77,31,34,10,56,78,52,40,11,11,31,58,42,32,24,63,31,44,47,32,35,46,43,39,30,16,6,20,36,19,15,22,30,40,28,36,29,36,30,56,69,74,86,93,92,82,40,68,69,98,92,87,104,119,96,76,23,47,25,17,63,118,115,93,94,107,102,117,92,90,114,123,89,167,177,70,60,48,61,104,69,94,106,99,116,121,101,70,90,98,131,113,55,50,82,86,45,24,19,18,39,26,45,51,24,17,14,8,41,38,52,74,82,82,69,60,59,71,65,49,65,65,45,61,39,54,60,23,49,41,55,51,50,45,47,62,69,75,66,30,85,53,40,74,61,53,47,64,33,9,11,33,14,14,32,30,22,37,42,54,61,44,57,28,35,71,38,47,30,53,52,47,33,45,44,58,31,58,45,61,77,77,35,51,40,45,38,50,64,58,32,42,39,29,20,34,7,19,4,35,3,23,35,62,43,55,52,73,27,29,9,25,51,65,32,66,60,48,19,9,17,17,10,20,26,34,18,15,12,22,9,20,41,49,62,62,38,36,81,40,69,53,70,79,67,62,39,52,55,29,22,36,2,6,21,31,33,6,28,18,52,60,49,85,42,60,55,67,69,81,72,82,51,60,43,48,50,48,64,56,41,19,44,41,65,61,64,28,56,64,43,36,39,14,29,12,7,8,7,21,41,30,15,11,55,69,76,54,52,82,49,66,51,48,50,71,60,49,78,57,37,68,60,79,55,93,47,52,63,81,154,137,128,103,77,81,49,52,46,129,117,64,48,95,115,111,106,121,109,109,77,91,99,93,78,111,102,118,135,114,115,112,150,142,88,104,116,104,112,137,164,123,124,83,68,76,86,76,108,119,101,97,101,120,82,105,98,41,33,39,44,18,5,27,32,18,16,42,22,9,11,13,10,29,3,3,35,23,10,14,10,24,44,29,21,147,140,112,98,103,113,125,158,106,114,147,148,117,149,207,253,229,77,105,19,16,65,36,36,41,26,57,30,28,38,94,131,78,108,116,104,115,126,110,83,94,98,116,106,114,104,108,110,83,117,105,124,86,91,106,96,113,112,94,103,98,111,105,122,119,104,115,146,131,100,108,123,102,138,125,133,117,144,120,122,122,122,124,108,122,116,99,111,80,82,102,107,92,69,100,99,80,98,61,75,90,132,102,101,110,122,106,130,150,144,158,146,161,167,142,148,141,116,137,135,99,119,97,104,111,133,107,112,130,128,140,129,106,114,87,111,117,122,80,145,104,68,54,41,48,78,84,72,123,130,124,161,110,59,27,103,149,148,203,201,196,174,146,98,91,88,76,76,89,92,110,109,89,88,154,120,136,138,108,108,71,92,95,105,96,105,102,115,113,152,145,105,102,72,79,89,123,109,135,129,95,111,141,180,164,100,128,136,145,142,148,189,160,152,142,151,126,132,150,143,157,134,178,137,158,158,147,152,157,165,170,169,136,124,108,105,102,126,102,123,97,107,65,66,95,112,127,63,43,59,54,57,53,28,41,71,88,83,88,103,52,91,127,164,230,250,222,215,244,218,169,205,230,133,60,26,3,52,92,38,48,45,167,235,124,79,122,140,121,125,210,178,162,168,174,107,126,215,171,187,163,136,77,19,7,34,28,38,55,47,38,19,20,35,37,23,34,37,41,32,53,49,27,25,1,37,66,39,19,32,19,49,16,57,25,18,36,18,21,25,34,29,34,19,39,43,83,90,104,83,65,56,81,95,86,118,106,131,73,59,18,18,36,96,93,126,118,105,112,95,133,81,83,78,100,91,102,107,135,171,89,103,94,83,71,95,101,82,105,99,109,97,86,101,112,126,89,51,69,60,81,57,59,23,29,60,62,46,16,24,39,7,10,22,45,55,72,80,60,52,89,68,78,57,52,56,71,82,76,30,49,35,71,37,41,65,49,38,64,63,57,57,53,57,67,72,45,37,63,54,41,39,66,34,14,17,12,20,13,17,29,4,35,61,50,33,23,41,66,29,69,19,13,17,8,14,35,35,40,27,50,54,65,77,50,70,46,54,90,58,38,52,36,37,47,41,83,34,22,28,27,12,43,29,25,39,68,26,60,35,52,44,41,49,20,12,43,42,37,55,57,34,31,13,12,31,3,20,20,14,7,15,7,16,10,3,50,73,38,87,32,66,40,80,44,65,49,41,63,52,50,51,22,45,29,17,30,23,14,8,15,20,25,51,36,39,45,53,55,53,54,70,40,69,61,44,61,56,64,37,61,52,58,56,28,17,32,63,35,55,57,41,40,72,45,46,42,46,10,43,17,15,15,33,35,27,22,16,31,77,66,70,59,81,53,45,60,26,36,69,46,60,64,61,70,85,61,58,74,71,61,41,26,57,147,119,100,122,107,61,74,87,111,118,153,136,50,35,59,53,51,98,97,36,52,40,72,98,81,75,57,62,82,91,74,96,99,135,107,118,92,93,62,51,81,136,115,46,56,16,73,55,37,72,100,93,85,104,99,71,87,74,48,41,17,24,20,10,16,10,14,40,11,25,40,13,13,8,27,9,19,4,4,31,25,3,36,19,2,30,133,146,103,119,117,114,123,116,105,113,130,122,125,132,207,249,216,58,74,58,19,60,25,51,28,24,56,52,72,42,93,153,78,90,90,97,100,121,121,91,133,117,102,76,106,99,115,70,105,120,113,118,92,98,74,88,103,118,127,98,125,134,138,109,103,143,140,119,104,89,120,104,101,132,144,147,112,146,109,123,120,113,139,120,96,74,112,127,86,102,96,114,118,97,101,128,100,113,99,108,122,100,113,97,122,105,105,103,129,141,119,138,133,143,117,110,136,107,115,113,92,95,53,76,112,109,104,80,79,100,114,141,138,101,96,80,125,109,99,127,98,97,93,74,93,66,55,102,70,112,160,180,163,81,45,86,110,125,147,143,207,245,230,161,111,85,63,61,82,81,120,118,114,98,90,140,114,146,138,126,115,100,112,104,116,107,129,134,96,139,115,112,120,78,83,82,102,169,149,148,95,144,165,145,109,76,92,93,143,187,176,201,189,177,174,193,170,178,163,170,177,163,148,129,167,159,168,199,190,182,186,157,141,158,129,95,110,94,120,112,109,135,83,81,100,109,118,106,71,73,45,77,36,50,35,46,62,69,96,73,90,97,112,205,242,255,219,204,239,228,194,171,240,193,50,22,6,20,25,44,39,61,132,145,95,49,49,110,144,211,204,175,185,168,162,142,218,238,149,139,158,83,81,14,36,56,61,68,54,49,34,14,46,49,12,44,55,35,40,31,44,57,23,32,40,44,30,30,34,54,25,48,45,33,36,22,19,19,14,31,30,25,29,42,47,69,97,119,76,80,94,93,82,79,103,121,85,111,44,5,39,23,68,98,93,121,103,110,82,104,96,107,95,104,100,79,125,73,118,162,69,77,91,126,100,78,90,82,69,111,104,123,116,106,100,97,84,44,39,54,69,67,64,44,25,47,46,32,31,34,45,27,36,57,75,75,86,58,26,49,97,120,68,100,61,60,75,69,59,60,64,55,55,69,59,56,60,66,72,97,64,59,57,59,77,45,54,49,63,86,51,39,41,16,26,10,36,23,34,13,29,26,25,60,39,56,40,21,8,26,30,25,21,22,22,18,18,27,41,20,50,49,54,54,59,80,53,49,47,39,46,72,54,43,40,51,68,16,11,19,15,7,6,13,20,6,29,14,34,30,39,55,36,36,30,13,36,30,32,42,53,35,28,2,8,28,27,25,33,38,9,28,22,30,40,39,42,45,44,75,47,47,42,83,47,52,61,60,38,26,78,21,45,31,14,42,22,33,13,23,29,29,35,36,53,54,66,32,63,69,40,98,80,54,53,35,44,39,42,58,55,20,39,67,31,39,53,66,53,66,48,68,73,85,52,35,36,34,35,24,5,21,11,19,31,22,32,15,59,53,76,55,63,48,47,32,42,25,68,73,73,66,56,72,110,63,54,59,42,46,37,42,35,112,131,108,127,129,98,66,89,133,150,158,141,115,86,76,72,43,35,72,35,29,41,40,100,165,91,56,90,90,75,78,36,76,109,121,103,108,116,63,58,32,84,94,69,77,91,85,82,65,58,59,52,78,80,69,72,61,83,41,46,79,84,64,12,8,3,0,22,27,30,18,15,35,13,5,27,0,6,17,33,33,17,6,6,3,6,31,130,147,131,100,106,117,130,120,120,125,108,110,97,129,203,251,231,68,102,33,22,37,10,33,30,16,39,48,52,74,107,125,104,84,111,99,128,122,135,129,125,122,102,92,111,114,111,104,112,114,105,120,139,130,115,116,116,121,101,105,104,116,108,98,94,95,123,100,103,111,128,110,119,148,126,111,123,101,131,116,106,135,115,103,96,105,117,136,121,97,98,77,102,109,126,128,115,100,129,109,128,110,125,123,117,135,90,99,111,79,80,107,100,77,74,102,118,132,111,90,77,71,92,117,101,92,75,100,107,145,104,87,98,116,102,85,109,109,75,86,94,100,116,129,118,108,87,101,83,104,156,132,112,81,71,133,130,97,130,115,143,168,172,172,157,167,138,86,103,87,127,136,124,109,128,151,130,107,124,124,115,158,151,110,114,138,111,102,122,122,127,91,84,77,89,116,110,143,145,132,82,106,92,91,71,53,52,106,155,189,192,233,212,230,224,244,193,208,198,219,214,207,180,182,194,188,211,207,200,206,205,171,134,160,111,101,103,90,140,120,127,106,86,109,81,110,125,105,115,92,96,70,67,67,49,49,70,83,94,109,111,85,138,223,255,252,255,218,246,208,176,171,187,179,120,28,32,90,156,54,61,117,106,62,59,27,15,111,154,223,190,185,184,175,164,184,224,184,116,112,95,40,9,2,63,105,57,73,38,36,23,12,25,30,48,23,42,47,19,21,34,29,9,12,52,46,14,11,23,38,47,30,28,23,36,15,28,39,20,36,27,21,36,55,48,83,86,80,79,50,109,119,100,111,106,112,71,36,14,5,76,44,85,83,90,93,99,97,70,88,120,115,122,81,80,71,48,30,117,134,80,101,67,84,102,95,78,101,113,127,103,103,74,109,72,90,115,88,51,56,57,112,64,53,58,30,13,29,14,30,29,24,51,58,64,68,94,62,65,103,105,88,61,70,57,62,84,84,87,79,33,53,62,107,81,89,52,74,79,74,70,79,52,71,65,56,81,63,51,31,52,57,35,50,24,29,14,16,27,19,19,24,45,46,43,51,17,34,20,43,48,55,28,21,35,8,9,48,30,17,37,45,53,28,48,45,66,70,64,73,60,69,64,64,79,67,18,31,7,1,27,27,31,29,34,26,28,35,10,32,39,56,48,28,18,27,21,13,4,15,49,33,23,11,28,13,6,11,33,15,10,13,35,24,16,16,21,38,34,27,44,34,39,38,41,52,37,74,79,37,44,52,24,29,8,15,9,17,21,14,13,38,39,38,65,15,52,67,51,51,73,63,61,27,36,51,23,28,42,25,39,55,47,39,67,38,24,19,18,63,23,24,64,39,32,53,44,4,21,24,16,38,34,27,24,21,46,59,53,49,36,55,50,43,59,66,61,51,79,81,69,99,69,93,94,77,67,47,45,25,33,56,24,86,121,95,147,136,90,136,162,147,135,168,111,142,129,98,123,115,131,167,131,125,105,137,185,183,139,145,128,186,144,136,108,124,117,154,170,133,115,82,102,12,68,133,109,103,80,92,67,56,56,56,46,55,86,95,72,80,59,60,58,73,160,103,8,1,36,4,25,7,26,20,31,21,35,17,26,26,15,15,17,36,2,18,19,13,14,16,157,183,153,147,169,165,164,122,152,152,155,143,133,151,209,254,219,78,104,57,9,44,27,43,29,33,16,48,55,43,131,150,118,118,94,125,88,125,126,97,127,104,121,118,114,150,109,79,114,106,148,118,137,143,105,128,110,123,116,113,104,93,124,93,106,117,101,93,123,105,78,103,95,133,134,142,146,136,141,128,116,115,114,138,114,113,131,114,75,98,81,89,72,95,91,109,86,84,101,93,120,121,119,95,141,123,94,120,116,100,107,100,65,86,74,82,93,95,97,105,77,97,115,95,96,85,63,105,119,97,83,89,75,98,105,80,97,74,92,76,43,100,102,103,146,130,89,104,100,77,83,99,95,58,66,87,88,103,124,90,77,110,148,172,192,245,205,163,156,133,106,109,117,104,108,115,95,108,127,119,108,133,132,118,139,132,106,91,74,86,143,89,113,80,66,68,82,96,112,136,92,73,95,77,60,93,44,75,178,176,172,207,230,226,229,234,163,189,219,212,199,186,193,166,208,199,242,218,173,182,204,135,99,117,107,88,118,118,117,116,130,71,47,62,71,111,105,155,111,67,88,79,51,46,43,53,82,81,103,125,129,122,154,249,252,247,212,228,243,200,172,116,172,202,156,136,136,244,200,74,45,89,32,9,52,34,50,96,184,204,185,178,224,202,193,202,163,130,63,31,47,20,38,66,93,85,23,45,43,10,31,50,44,37,43,43,51,59,28,46,37,37,36,20,50,9,29,5,18,63,42,20,20,14,16,52,14,27,8,33,46,15,66,86,61,89,83,94,132,73,103,73,108,120,108,56,19,22,37,51,100,107,103,85,96,97,85,96,99,93,68,113,104,94,76,49,51,48,101,118,103,78,95,115,93,88,102,80,112,92,98,94,96,90,87,108,110,95,89,56,80,84,92,79,50,14,15,37,5,16,51,86,69,55,98,92,118,102,84,109,91,86,83,42,76,63,81,64,43,14,51,59,55,74,67,62,67,82,74,64,62,64,88,62,68,77,62,56,44,60,58,76,57,35,8,28,33,32,31,8,29,19,29,33,28,26,17,32,58,30,17,43,41,28,41,26,26,34,12,21,21,40,44,27,35,45,64,37,61,84,49,14,68,48,41,12,5,0,13,17,33,11,19,24,13,13,31,21,2,42,7,32,45,37,59,39,10,14,47,37,51,47,19,16,21,2,13,3,25,16,49,12,8,16,27,45,34,41,20,30,33,19,0,16,28,66,61,90,59,40,59,48,62,44,14,18,22,26,22,59,20,40,28,75,29,58,52,64,51,54,81,64,75,45,41,38,38,56,55,77,37,31,29,45,34,36,33,27,40,53,52,50,58,40,18,36,37,26,49,12,25,15,39,30,6,43,66,45,55,59,57,60,61,44,46,39,71,68,78,103,62,80,98,68,62,36,37,36,28,42,40,33,52,134,132,115,121,88,101,125,160,132,124,109,73,92,88,113,140,196,207,197,192,172,181,194,197,184,155,144,168,207,158,157,116,131,117,106,132,157,148,120,114,76,111,157,118,103,100,76,92,116,107,73,83,87,112,128,84,100,115,82,121,159,221,112,65,5,2,9,25,13,30,3,16,14,21,11,37,16,23,9,33,15,14,21,26,14,24,13,147,151,170,158,131,154,163,144,133,169,157,178,135,154,222,230,217,70,135,46,7,55,4,29,15,27,51,44,53,80,124,129,132,122,108,113,79,101,83,108,124,138,139,105,112,132,131,98,122,108,110,111,99,98,115,105,115,129,102,117,139,112,118,79,125,105,130,104,107,76,86,119,108,123,154,158,136,130,135,144,153,152,136,144,143,147,124,135,123,132,109,127,113,141,100,88,89,59,61,82,78,97,94,95,144,106,73,124,98,114,129,89,108,82,111,95,106,86,104,91,113,94,87,83,90,81,66,87,78,78,89,32,65,86,113,81,82,86,84,64,83,93,81,101,103,124,143,123,92,99,114,97,108,95,85,84,95,84,106,112,101,137,93,135,143,194,184,180,227,190,127,112,123,125,73,86,64,78,129,108,92,123,145,111,112,120,86,106,93,69,122,86,105,103,88,54,59,73,117,144,89,86,83,80,62,69,37,45,116,196,156,196,240,217,230,211,166,149,204,183,174,181,176,191,205,202,181,183,155,162,163,110,80,87,82,84,116,105,130,111,121,68,65,54,75,70,103,113,91,91,45,67,54,25,19,62,140,152,117,86,124,126,172,219,246,206,155,186,173,179,120,158,144,99,113,77,114,145,139,45,16,46,56,31,89,81,62,130,165,185,179,211,225,219,134,112,143,65,40,6,27,93,82,94,51,34,20,50,13,21,25,36,77,36,35,24,23,37,31,40,21,43,12,29,13,24,31,38,11,41,31,41,36,22,7,37,24,22,31,27,29,56,63,61,64,70,88,80,100,72,97,104,105,98,28,31,14,43,85,67,114,115,116,89,90,106,96,88,109,102,95,97,125,111,106,97,95,77,149,159,85,92,84,103,93,73,100,99,103,67,76,88,83,65,98,96,137,117,125,89,72,87,95,84,48,34,10,46,58,58,43,62,50,65,64,107,98,110,126,124,114,118,54,30,53,68,52,50,59,41,41,53,54,50,51,52,60,64,55,55,63,55,83,44,56,67,44,36,50,55,36,49,28,22,18,9,11,13,9,6,36,33,24,32,38,24,29,55,63,38,11,11,32,20,21,16,26,50,36,48,37,41,22,37,41,55,53,16,55,57,13,26,31,7,35,42,13,34,32,18,42,27,34,24,33,40,28,3,23,15,37,30,21,53,18,36,8,24,47,16,9,25,21,9,20,30,26,18,24,52,12,20,29,22,13,30,42,19,40,12,8,22,29,20,60,80,48,66,48,70,46,39,16,5,4,24,10,27,4,21,32,27,21,43,45,81,51,71,49,56,38,58,46,42,41,34,29,80,47,27,74,12,17,47,34,25,71,45,51,49,38,53,53,68,20,48,39,19,13,55,26,8,9,31,15,50,46,76,73,62,67,81,62,64,30,51,67,65,42,47,71,75,51,40,39,14,16,25,33,33,21,47,120,128,103,62,44,56,81,90,105,102,73,64,63,82,55,101,141,172,186,176,132,193,166,179,159,132,155,130,148,165,145,140,139,132,122,127,150,158,120,129,144,104,158,187,120,94,72,127,143,132,146,150,158,161,134,152,128,121,153,132,172,186,175,108,15,4,11,20,2,26,4,15,28,30,11,31,14,17,19,6,37,33,23,16,18,26,36,6,143,133,151,148,127,121,151,142,163,146,144,173,167,150,211,237,194,81,97,46,4,57,30,51,16,50,55,47,64,58,127,137,122,103,89,110,95,98,117,114,115,110,98,99,99,111,93,104,131,101,102,111,113,106,102,102,107,119,102,106,137,131,113,109,119,115,120,119,99,121,109,122,153,128,159,171,137,165,164,141,139,169,138,171,152,180,181,165,159,190,166,153,159,161,154,162,135,130,104,147,101,115,145,127,142,141,102,147,157,110,116,136,106,106,96,107,108,74,100,99,83,88,53,75,91,69,88,88,74,78,86,61,34,93,99,133,125,91,113,72,93,98,107,109,92,113,112,135,125,140,118,92,91,88,98,95,58,60,111,95,109,155,132,76,87,123,124,146,223,192,117,126,110,111,80,69,66,80,133,115,122,97,120,119,115,119,119,119,101,125,75,87,120,126,101,100,103,100,81,113,71,63,68,75,88,94,62,80,135,186,143,153,236,247,246,218,147,177,202,215,208,207,185,173,174,193,197,184,150,123,155,67,88,90,87,108,157,122,121,106,111,96,66,63,62,94,104,112,80,69,81,116,128,139,137,151,220,165,171,157,136,166,230,236,195,135,117,133,151,130,147,162,96,99,115,69,29,62,92,16,3,59,141,120,102,109,102,172,186,213,232,201,200,178,113,85,116,64,33,10,60,105,69,34,56,17,21,44,25,64,30,34,67,33,40,26,36,16,45,53,44,47,37,50,48,31,40,30,24,28,49,35,40,31,18,43,27,39,23,29,31,53,48,61,80,70,56,116,93,107,109,128,79,22,30,12,38,101,128,107,119,87,105,78,120,98,91,88,92,91,108,112,124,86,104,116,111,81,173,165,78,90,87,115,98,78,102,87,91,97,127,80,94,106,103,138,110,119,121,85,80,70,77,92,87,60,57,50,51,47,73,74,49,88,77,108,86,110,113,113,140,89,54,45,25,42,67,26,30,39,36,43,23,36,41,41,37,50,27,22,43,35,45,56,25,23,26,40,38,41,38,52,30,21,32,24,27,47,29,19,24,33,30,7,13,28,23,18,8,21,38,35,28,29,4,40,45,16,26,35,49,36,44,42,58,60,33,50,33,42,40,33,29,64,29,16,32,23,28,18,16,8,0,24,5,42,4,37,22,30,44,34,34,45,6,29,20,36,41,32,3,18,20,10,26,25,30,8,36,15,33,14,12,20,32,23,25,37,46,29,61,8,35,70,25,67,36,60,54,42,50,70,27,23,20,15,32,28,15,11,22,6,23,46,32,64,53,70,57,55,57,63,49,49,42,21,42,47,30,32,33,44,38,56,43,35,44,18,27,37,47,63,45,62,26,20,37,32,24,50,31,48,5,12,65,48,62,68,59,49,77,64,43,42,36,60,17,65,66,73,53,84,71,63,15,38,35,28,44,18,10,19,95,129,126,90,63,91,54,53,56,47,40,93,53,84,78,102,117,110,138,120,148,138,134,132,106,104,126,125,87,86,106,137,106,92,75,99,126,116,110,110,138,132,176,195,127,104,111,124,129,132,137,153,183,140,136,116,137,160,146,179,173,179,152,98,17,0,10,7,10,21,9,18,21,20,21,9,12,17,31,4,18,4,6,6,40,41,5,2,127,91,140,129,131,125,117,148,165,141,146,147,116,194,241,243,179,58,81,24,13,38,7,35,17,29,45,54,38,40,138,104,143,127,103,94,131,113,101,98,106,95,105,67,96,127,89,115,106,117,137,107,100,105,90,107,117,109,100,125,95,104,122,89,104,96,89,96,100,107,110,114,104,144,110,148,125,141,127,130,141,137,131,128,105,119,111,151,166,167,160,128,124,146,181,168,168,158,165,156,194,168,166,180,132,125,104,129,106,117,110,121,102,108,77,87,88,62,71,109,70,67,60,71,85,121,76,66,59,66,67,63,64,75,99,129,89,104,69,82,102,111,127,122,117,103,94,88,106,120,87,125,83,80,62,73,92,79,67,101,83,121,125,109,109,102,109,125,160,140,115,115,93,101,61,98,104,116,118,108,109,103,142,182,117,98,89,113,114,127,105,120,154,98,98,104,125,90,99,54,78,94,53,91,123,107,81,89,95,164,135,110,110,222,201,89,106,162,223,211,191,172,178,163,151,145,168,155,151,129,136,78,130,101,88,102,103,95,90,93,134,109,104,104,93,124,102,120,116,56,99,163,219,255,248,248,246,251,255,207,186,221,235,222,152,147,128,173,165,122,159,144,136,182,182,143,115,123,137,57,22,70,180,128,156,80,158,226,196,241,234,145,138,156,54,37,21,23,20,68,81,47,35,30,51,51,54,41,26,50,30,40,31,35,52,29,41,48,23,58,34,30,34,38,29,20,51,39,43,56,28,6,63,39,25,18,14,42,49,43,57,64,79,108,105,89,80,147,104,118,100,36,45,36,5,46,132,122,102,123,85,81,78,94,88,109,77,71,55,107,92,108,119,90,106,106,122,95,205,205,92,98,93,124,93,78,99,61,77,88,104,87,112,91,87,99,147,138,78,34,28,77,91,96,94,59,47,80,73,45,81,74,91,84,102,81,84,61,83,51,60,64,51,58,50,64,46,22,47,18,60,63,30,13,59,36,53,45,44,21,74,41,23,54,40,28,41,22,25,20,66,18,30,35,25,30,10,51,25,10,36,22,27,43,37,51,40,35,51,9,40,43,47,42,20,13,46,12,22,30,34,50,52,32,12,43,36,46,30,34,47,48,29,34,63,26,48,39,13,8,16,53,13,22,51,20,9,45,15,31,25,28,34,35,15,51,14,57,25,30,5,37,21,27,20,32,21,24,30,31,30,29,26,24,21,41,51,38,41,1,45,31,50,40,11,46,40,30,44,38,28,23,28,28,31,29,53,42,25,31,18,25,35,21,17,29,27,40,51,60,60,43,47,50,18,36,48,36,55,50,51,24,54,24,35,17,38,36,25,89,64,37,50,51,39,23,12,22,19,12,14,16,7,30,23,47,64,62,67,18,72,69,46,82,56,61,32,79,56,73,35,65,48,55,21,6,36,9,38,29,13,31,56,58,104,111,99,80,70,71,94,60,44,53,88,105,96,104,64,87,86,105,143,169,161,157,120,107,95,86,73,115,118,120,126,88,65,109,105,94,86,80,92,110,132,155,121,125,113,111,121,88,108,127,121,139,110,115,159,141,156,119,160,137,139,92,30,15,26,14,15,18,8,9,0,24,12,8,25,6,11,8,30,32,23,27,8,12,14,13,116,143,131,146,141,141,133,145,162,149,140,131,138,137,252,253,200,75,104,23,45,52,19,13,11,46,9,61,43,78,109,115,124,136,88,105,111,100,102,71,64,99,125,106,107,133,121,116,115,135,108,117,110,114,106,97,117,114,107,104,105,85,61,110,94,101,99,85,92,80,115,109,101,96,113,76,124,100,85,99,118,99,87,54,39,79,97,84,112,101,123,120,104,103,143,139,158,161,145,163,153,152,154,112,112,52,76,91,88,92,97,106,75,96,63,70,50,45,44,46,62,73,67,80,101,109,117,93,67,99,107,94,148,127,126,96,73,69,108,113,124,127,107,125,105,105,138,108,91,112,94,108,94,85,89,82,52,64,68,94,69,104,131,134,117,94,105,111,123,103,75,101,107,79,90,138,130,96,113,107,77,122,132,175,110,96,127,131,131,154,123,141,151,98,79,98,105,103,94,72,89,57,78,89,100,105,112,116,80,87,101,57,59,67,110,54,63,134,181,170,161,164,157,119,135,75,96,100,94,158,125,122,99,77,95,93,103,87,66,62,94,99,82,69,93,92,103,110,113,125,96,181,213,246,249,251,236,227,228,180,144,223,242,213,187,195,171,169,147,128,136,211,182,171,136,96,105,155,120,37,6,30,147,166,148,153,203,243,246,242,130,90,93,71,19,0,19,8,82,55,59,51,53,33,51,22,34,16,23,43,49,30,37,13,52,26,22,49,31,30,40,28,28,49,56,52,24,2,19,35,37,35,12,36,54,58,12,38,57,67,71,92,97,104,65,101,120,134,137,53,62,34,14,41,70,104,123,123,103,99,81,67,121,99,87,93,92,85,66,84,105,90,103,109,108,56,84,86,189,182,78,89,81,103,85,79,75,57,92,120,91,79,82,106,89,126,117,61,14,12,33,62,94,86,112,67,67,88,57,47,68,74,98,74,88,112,77,24,7,12,46,62,12,26,52,50,45,56,38,64,64,44,57,47,36,58,51,54,55,53,61,53,22,32,41,50,47,31,58,49,31,43,14,14,32,6,37,34,47,14,16,13,38,39,22,39,32,51,25,29,40,60,45,45,34,27,31,60,10,17,28,36,19,50,38,25,28,41,41,48,43,13,24,32,59,26,41,35,24,7,42,44,31,11,26,29,33,27,51,22,27,35,39,35,22,45,41,61,38,42,24,15,48,31,36,21,32,19,37,44,23,35,29,34,30,64,51,29,38,65,25,13,36,60,38,35,19,18,23,29,30,39,48,13,48,0,49,4,23,35,36,27,43,14,42,31,34,66,46,38,51,49,57,13,49,12,28,51,51,56,56,20,23,6,34,48,27,7,20,29,34,71,48,18,39,9,20,27,28,20,20,20,38,25,21,53,23,49,73,70,50,77,55,66,62,57,73,72,69,58,36,52,61,65,46,32,30,38,20,32,44,43,41,56,45,81,64,83,37,39,87,39,47,75,63,63,63,43,24,39,42,29,101,123,126,103,62,83,87,108,93,94,92,82,101,80,90,108,90,81,82,100,113,92,78,124,113,114,128,101,91,103,92,100,107,116,91,106,119,87,84,107,141,157,140,110,13,14,8,24,10,13,18,23,28,12,24,16,8,4,26,7,34,17,19,19,12,10,8,17,147,143,139,143,153,134,161,149,132,169,175,143,130,166,214,255,186,92,148,18,22,36,15,39,16,57,13,43,64,44,111,86,103,117,106,87,109,116,109,63,67,129,106,90,107,118,111,118,134,102,96,92,120,88,97,89,98,81,72,106,115,128,113,93,84,73,121,96,91,100,108,102,81,75,110,98,103,95,100,112,109,94,63,65,52,67,76,67,77,79,110,128,79,121,98,109,98,114,141,122,138,116,79,59,54,42,52,63,41,96,88,83,81,67,64,75,40,64,32,29,75,46,67,97,112,90,100,102,74,99,119,70,145,90,52,67,61,74,94,129,134,102,103,149,122,101,152,80,131,126,76,123,81,91,115,108,117,75,98,115,95,117,143,152,114,124,128,130,92,71,45,89,98,95,111,98,87,33,77,104,89,68,104,86,91,96,94,114,136,146,142,112,138,85,89,113,107,124,114,114,122,93,116,95,118,168,128,154,83,66,54,57,28,73,91,78,85,132,151,161,138,167,155,148,147,113,105,103,83,103,144,100,117,107,89,82,84,74,64,84,100,81,55,74,79,114,74,76,73,48,92,141,138,174,154,131,112,64,118,56,92,163,164,198,251,210,167,144,139,142,208,216,220,168,95,73,75,135,115,35,4,64,168,153,146,204,255,162,176,164,92,51,26,33,8,19,63,66,76,58,26,46,49,39,36,51,28,12,39,35,43,69,49,28,22,53,49,34,32,56,64,8,34,38,37,47,45,45,23,54,12,53,40,34,57,57,8,43,65,107,85,106,99,91,102,119,115,117,71,34,38,13,51,88,107,115,107,107,74,69,79,92,82,103,99,101,104,75,87,102,75,136,95,104,114,105,100,124,210,214,87,79,75,83,94,65,82,63,77,84,104,134,107,89,102,47,22,18,22,20,13,61,94,86,68,51,60,74,56,47,82,62,68,87,104,93,64,44,39,28,38,27,39,76,47,50,67,39,78,50,55,42,42,32,34,48,24,67,59,45,40,59,59,53,64,24,33,46,65,46,45,65,10,15,32,36,20,6,34,18,34,18,14,36,56,32,44,25,24,21,67,57,58,41,32,41,41,52,58,36,30,46,21,71,39,55,65,36,57,28,53,54,36,39,22,46,32,28,13,12,0,34,28,36,50,39,15,26,33,6,52,42,34,19,11,62,32,54,60,15,22,30,39,23,39,22,58,37,60,69,64,29,5,38,15,29,40,7,47,27,17,64,45,43,43,38,21,35,12,45,41,22,41,24,35,43,7,16,38,33,19,18,28,28,33,50,35,23,29,42,48,44,45,35,31,24,33,28,10,55,50,34,38,18,46,30,29,39,31,53,24,35,53,16,26,8,25,16,42,25,27,4,29,59,43,42,45,37,73,65,44,41,45,54,38,54,33,29,64,68,73,49,49,57,29,31,11,29,38,27,63,50,49,35,52,28,56,37,54,46,74,62,51,84,42,43,5,54,46,57,16,51,106,124,123,118,94,102,101,113,115,111,101,110,98,91,96,103,90,97,81,94,121,93,81,107,115,138,148,138,74,98,134,86,107,100,123,91,79,92,92,112,141,102,126,81,31,29,12,28,8,15,30,19,31,10,8,14,23,19,15,3,20,19,11,18,12,21,19,8,126,121,130,126,126,149,157,180,134,154,144,131,126,154,220,238,149,108,135,10,42,36,25,62,35,36,47,27,54,63,84,94,93,103,78,40,72,89,83,72,88,76,72,79,116,95,112,135,107,87,101,117,102,115,113,71,119,112,98,117,145,105,115,97,89,81,69,84,87,118,104,109,106,107,117,117,79,136,126,95,84,62,92,105,88,86,81,98,124,89,97,111,124,116,117,92,89,102,120,156,137,76,63,56,45,33,28,58,63,98,77,97,92,84,43,55,29,80,53,69,64,57,68,62,72,95,107,84,102,102,113,98,105,48,35,68,67,82,59,88,100,110,117,111,104,110,109,77,80,110,94,104,101,81,85,112,142,121,112,111,94,121,133,149,128,133,114,115,87,69,71,104,115,112,88,111,104,75,65,87,127,57,70,62,43,63,63,65,79,102,68,115,108,123,128,186,135,160,154,126,149,171,154,157,190,189,169,145,91,117,81,80,88,77,87,98,114,118,134,164,141,130,157,138,135,107,118,107,94,98,84,104,106,90,102,81,98,100,127,102,56,73,71,94,116,81,101,95,90,68,36,100,50,55,20,67,71,26,83,47,97,214,143,209,220,192,237,218,180,232,185,251,182,132,111,108,118,163,131,105,76,114,154,174,163,237,231,123,142,154,50,45,7,36,53,50,73,38,47,55,31,42,50,13,25,48,60,27,30,30,29,29,33,33,41,36,47,49,55,37,34,5,46,47,36,48,34,39,32,29,44,63,49,34,57,81,51,67,102,117,121,103,102,84,121,114,72,38,14,33,40,69,104,119,99,123,60,96,95,120,96,94,116,97,82,146,107,89,62,77,119,112,56,104,106,89,107,97,166,205,135,94,57,88,64,81,99,99,110,101,113,125,126,91,29,51,47,29,36,15,34,60,82,50,78,43,70,54,48,59,75,74,101,81,113,114,66,21,11,15,26,59,44,43,56,53,51,49,74,44,66,60,44,36,44,45,22,62,57,59,59,50,70,49,68,38,58,35,36,46,53,32,20,23,28,19,21,21,35,16,27,30,29,29,34,49,37,23,44,14,53,65,44,23,45,26,58,58,48,27,49,43,45,45,47,35,70,47,36,42,18,33,26,50,76,38,42,39,31,16,20,12,16,34,25,65,9,27,29,32,39,47,38,38,29,54,49,51,79,48,28,33,48,25,32,41,49,65,58,50,81,35,23,12,13,32,42,17,35,33,37,24,47,44,13,47,45,40,44,22,28,52,21,3,20,29,13,14,1,25,9,40,42,44,25,41,16,24,25,53,22,27,47,20,31,19,9,38,30,39,32,3,15,23,63,25,16,37,27,27,21,36,21,47,15,7,30,16,31,14,23,17,18,25,44,39,43,48,58,43,48,65,38,44,31,49,75,62,46,50,55,38,51,45,33,24,33,38,48,19,40,35,60,36,33,20,45,46,59,88,121,85,90,68,42,45,34,93,99,72,92,98,105,113,116,124,121,111,85,111,99,86,95,110,127,81,127,121,106,88,77,90,104,69,48,77,88,114,136,129,128,105,123,130,111,123,115,100,76,96,82,91,104,111,63,64,17,35,7,11,28,17,10,9,13,10,18,12,36,19,4,43,29,21,10,11,8,27,40,5,118,57,111,77,114,95,110,114,120,149,151,145,143,164,251,237,165,62,73,18,51,51,27,40,12,25,30,75,55,71,88,106,87,61,104,61,74,47,55,59,72,74,52,46,64,90,92,130,114,99,75,102,115,53,105,104,92,116,90,116,118,119,82,101,102,109,111,73,83,79,110,100,121,92,94,106,80,85,89,86,71,64,106,115,138,102,103,86,77,70,54,100,120,130,106,105,122,97,129,148,100,85,76,83,48,81,64,94,61,100,89,103,120,77,96,74,111,101,122,85,63,78,75,97,56,44,54,68,67,98,86,64,74,72,62,68,85,67,35,66,86,79,102,85,129,113,77,60,91,125,127,138,84,84,87,122,143,142,121,90,80,114,131,137,147,143,123,102,114,87,77,96,81,75,88,81,51,50,90,59,66,50,76,76,61,109,68,62,49,69,111,147,138,135,156,163,187,180,151,176,166,133,153,157,153,153,148,106,77,63,82,78,89,67,66,82,70,117,118,93,103,112,129,85,106,96,89,91,92,87,89,76,94,104,106,94,82,101,93,119,98,95,86,83,87,65,94,88,101,37,66,91,67,55,33,114,146,54,138,88,141,224,108,165,191,216,228,231,213,185,160,180,158,139,138,159,158,169,150,139,116,121,151,180,178,186,207,90,105,69,12,17,27,57,60,58,47,39,35,57,21,21,35,11,18,32,48,26,27,40,15,22,25,17,40,13,41,39,24,33,46,42,54,35,21,57,33,19,60,63,58,60,39,55,51,73,62,45,74,115,80,71,68,109,88,54,21,19,18,40,88,120,136,107,103,80,110,96,98,79,85,88,109,107,107,85,115,109,110,101,104,99,116,98,86,90,71,89,86,174,162,56,61,78,82,120,116,120,114,89,99,124,75,37,32,37,36,15,20,53,73,71,64,50,66,42,81,53,68,75,84,103,89,83,84,85,51,33,16,24,21,63,28,39,69,51,36,38,48,28,40,68,43,35,42,23,48,36,20,20,66,43,73,64,63,6,55,60,68,50,20,36,26,10,37,5,27,31,26,23,37,54,16,33,46,53,32,60,31,48,60,51,42,36,29,41,58,40,57,54,47,72,27,54,29,52,45,35,46,7,28,46,38,39,41,23,35,11,45,6,19,2,20,32,34,15,30,50,37,26,34,50,5,49,59,65,48,35,47,31,49,33,31,47,29,39,58,30,21,49,100,87,50,46,28,52,42,32,50,42,28,36,41,45,43,29,39,44,36,34,22,41,27,28,0,44,26,6,33,27,12,28,23,21,28,24,16,35,19,47,46,13,14,13,36,15,38,14,30,43,24,29,53,41,17,43,0,35,22,30,42,27,33,37,36,44,14,14,35,23,27,25,21,29,31,36,50,21,34,49,55,58,33,46,45,36,39,45,33,70,42,46,47,54,48,33,43,69,25,31,37,26,21,58,38,36,39,49,75,144,129,128,97,64,22,90,104,120,88,97,104,103,125,127,121,114,106,114,97,105,70,73,85,121,125,87,116,96,105,47,53,66,96,81,75,102,92,53,106,114,131,122,92,101,117,105,115,105,100,106,92,102,79,100,77,87,29,5,8,23,34,22,24,16,30,15,17,14,17,32,42,21,41,29,22,8,23,1,6,3,82,99,88,77,84,95,70,85,54,135,110,135,150,208,242,240,162,59,55,30,32,49,44,37,20,44,67,55,73,74,120,95,114,105,102,112,86,65,84,93,75,70,45,58,75,83,90,109,94,93,99,124,107,110,75,113,112,102,97,101,135,80,123,110,108,134,106,120,111,145,116,117,119,98,81,107,81,88,60,69,65,84,112,123,120,101,80,79,42,54,62,51,78,75,91,101,122,111,106,84,102,82,74,107,85,67,104,76,98,123,104,105,104,84,72,75,100,104,107,75,101,107,83,105,76,74,51,62,63,93,98,56,82,87,95,77,59,67,36,69,60,40,73,88,130,125,85,50,105,132,119,98,129,96,82,110,151,142,113,79,94,117,167,156,141,142,125,100,121,111,105,52,71,75,107,70,76,58,66,57,66,53,67,56,70,108,84,46,84,64,100,133,170,150,139,157,140,129,179,180,130,88,73,35,44,99,53,59,46,76,54,106,45,98,126,58,86,102,127,130,100,117,107,84,86,92,66,96,78,97,81,90,80,66,89,104,87,67,88,90,80,85,64,55,71,102,79,41,89,81,81,127,88,60,65,155,204,86,120,163,196,169,109,185,205,163,185,187,164,120,71,88,98,114,153,140,152,175,181,167,177,179,191,235,130,101,110,64,22,41,8,55,42,56,65,43,37,39,19,38,59,42,48,27,30,27,61,32,43,45,55,48,12,32,22,36,52,48,46,32,29,24,68,18,25,29,41,57,83,92,43,33,70,74,101,111,77,52,36,75,83,70,73,50,38,15,15,39,66,96,113,119,104,91,133,105,93,51,64,89,86,89,84,89,119,63,109,107,110,89,110,123,110,84,112,97,115,66,99,155,207,108,51,70,83,64,106,92,100,127,104,55,30,29,7,23,12,26,69,82,124,113,95,84,81,63,94,83,58,83,74,104,96,78,88,55,41,32,3,11,46,31,48,18,37,31,35,18,21,26,34,37,51,47,45,51,64,59,42,85,55,52,56,49,38,42,19,44,48,47,48,31,34,43,39,53,27,17,39,34,48,44,36,42,6,55,44,32,38,45,36,50,77,58,18,35,52,38,29,42,58,21,45,45,45,50,39,34,43,13,17,26,19,43,61,7,17,24,41,5,34,43,46,28,45,37,33,37,33,24,38,35,26,33,27,60,51,56,44,56,7,11,51,35,47,39,29,38,43,40,64,66,54,41,44,60,40,22,32,29,48,24,26,25,77,13,16,31,17,31,34,29,18,9,14,6,18,20,15,37,21,17,7,44,6,22,42,31,46,31,39,41,24,20,21,14,43,59,62,45,10,24,31,28,43,35,30,23,27,51,50,51,45,63,31,17,7,26,22,5,18,19,33,12,36,19,32,34,58,59,48,20,67,36,47,47,29,25,42,26,44,36,16,56,52,21,51,53,48,44,46,15,26,45,25,61,64,82,151,155,117,113,142,120,125,105,82,93,108,129,154,131,135,123,118,96,87,110,91,86,82,96,132,113,81,73,78,86,98,88,71,98,105,85,61,91,60,47,94,104,106,121,60,89,103,137,129,101,82,88,73,136,169,165,119,62,9,15,16,19,26,7,43,2,33,14,31,18,22,22,19,12,27,51,28,9,21,18,9,24,86,75,76,50,73,88,74,57,75,113,112,136,120,182,245,239,157,65,76,41,41,27,37,13,6,24,33,52,85,88,105,105,113,110,98,96,105,83,109,95,73,66,65,66,45,76,93,93,90,98,89,91,107,105,80,118,108,77,115,95,90,99,113,113,123,130,130,135,129,82,87,105,63,90,74,66,91,62,80,103,112,85,107,92,97,79,84,48,72,64,71,76,66,87,73,86,102,105,117,78,68,81,91,82,89,100,88,138,99,102,55,50,86,84,82,113,102,85,76,87,62,73,79,109,84,82,69,74,67,90,82,84,101,106,103,90,83,66,61,50,72,101,90,113,68,96,90,85,117,126,137,105,81,91,115,106,122,137,134,123,120,121,152,149,172,115,116,88,91,118,117,85,84,93,90,59,80,62,65,51,77,76,37,54,73,66,65,55,94,113,77,80,106,67,95,65,87,108,147,131,93,36,40,15,44,46,4,15,46,77,84,161,162,53,66,122,124,136,158,135,121,103,57,94,84,100,126,109,81,83,102,81,74,79,50,78,87,86,76,79,98,50,81,70,58,91,51,65,65,92,119,116,57,56,74,148,179,102,144,222,207,135,132,209,196,218,133,81,156,75,17,29,29,101,140,114,138,154,141,168,167,155,178,187,82,64,29,24,26,24,52,76,30,36,52,32,46,41,41,36,38,49,38,28,39,37,41,34,43,50,38,28,24,32,50,13,34,37,34,39,31,32,49,28,39,40,40,91,112,78,70,53,9,55,124,132,124,105,87,80,110,53,22,60,33,42,56,94,99,100,91,99,107,127,134,109,115,87,104,99,88,114,93,82,105,98,110,102,84,107,108,113,103,111,70,100,89,78,79,120,215,144,72,90,87,90,108,116,126,77,59,42,27,12,10,24,43,76,103,119,86,105,123,129,127,71,78,92,134,96,94,77,80,82,115,57,45,10,1,29,34,52,16,47,27,76,34,36,48,36,30,46,39,50,51,79,40,68,55,47,50,62,46,42,48,52,53,48,39,29,54,31,41,29,9,25,25,8,18,33,58,48,36,59,41,59,54,90,54,47,50,52,53,40,56,45,49,60,29,34,45,29,40,42,39,22,41,44,33,20,19,42,30,32,25,15,11,23,44,44,19,33,45,23,5,24,48,47,45,62,40,60,13,38,58,50,51,49,35,46,45,18,30,37,32,28,25,18,47,42,26,60,69,46,20,10,21,53,33,54,27,40,51,36,58,46,10,15,5,25,18,32,4,7,31,17,28,11,33,38,35,30,43,39,45,47,39,26,5,35,36,25,51,17,13,48,44,61,31,14,32,44,40,29,47,40,34,26,28,52,59,30,20,51,64,40,19,20,1,10,20,21,29,38,29,43,30,18,31,75,38,41,63,57,53,44,40,42,44,27,52,30,32,36,16,62,24,57,44,38,34,34,51,19,12,37,76,134,122,123,127,123,147,115,129,122,107,106,114,139,155,119,107,81,141,112,87,95,125,63,91,90,117,108,103,89,99,113,116,126,136,100,56,49,70,79,46,44,52,74,63,71,44,66,107,120,122,87,76,96,74,154,165,166,161,110,25,1,9,18,24,24,12,14,2,11,13,34,45,16,19,21,20,13,26,42,11,6,16,9,86,84,35,40,70,55,88,80,81,94,101,93,73,179,239,247,119,56,100,37,36,24,19,39,35,38,32,41,65,83,92,101,78,61,66,81,105,83,85,92,90,89,84,63,48,59,48,47,73,81,89,88,94,90,70,113,97,113,84,107,100,104,80,99,70,100,69,91,93,103,83,109,75,92,107,78,79,82,74,94,111,91,108,85,51,88,56,111,73,64,115,100,97,103,106,117,80,89,60,68,70,77,98,100,99,97,113,132,119,129,102,88,98,79,87,109,97,103,91,87,100,66,74,91,98,111,75,41,58,94,91,72,72,97,76,115,66,86,81,79,81,107,123,62,67,102,102,134,138,125,173,137,106,93,107,94,82,104,142,135,145,166,159,116,159,157,144,106,76,79,127,94,79,97,63,62,56,76,59,80,73,63,89,59,45,70,81,82,98,93,69,81,53,31,49,55,60,88,113,95,77,81,44,20,35,28,45,31,71,97,90,187,119,66,93,91,152,121,102,125,83,80,109,102,127,93,120,99,93,83,90,63,78,72,66,71,113,55,46,60,99,84,53,39,42,47,54,25,35,70,105,96,52,88,62,88,122,142,214,238,158,94,167,246,226,169,16,78,130,61,24,23,23,83,110,29,75,59,93,91,81,103,111,34,39,35,23,27,38,72,63,66,51,56,34,56,67,43,60,54,80,43,28,17,57,57,37,20,32,44,36,27,15,26,40,48,43,31,26,35,20,31,34,21,49,36,34,69,114,78,69,36,42,31,86,128,140,121,146,140,72,29,21,34,55,50,97,103,102,134,98,86,108,98,108,110,99,97,87,109,86,60,97,101,98,88,77,86,89,94,108,87,94,121,99,88,81,83,67,89,187,179,92,86,92,107,150,123,75,15,37,2,17,12,53,44,91,108,106,91,115,120,124,146,84,74,99,74,81,75,101,104,88,79,83,27,22,9,16,17,40,55,46,43,57,65,56,38,30,47,31,44,53,40,35,30,45,43,54,36,38,51,30,44,41,36,48,9,68,49,45,40,30,30,33,25,36,13,37,54,47,43,62,19,29,27,55,11,34,37,48,22,25,23,28,40,37,60,52,67,30,36,65,32,40,39,20,12,20,21,41,53,53,40,28,9,25,21,13,42,22,31,22,38,24,26,34,47,50,45,48,31,36,51,40,14,57,39,28,22,26,65,49,36,42,41,53,44,22,21,30,26,81,105,37,17,30,28,38,23,47,31,47,31,52,30,20,20,28,23,25,27,10,23,15,31,30,9,7,8,36,17,33,43,25,29,40,28,22,12,60,43,26,34,38,26,42,44,40,38,35,26,9,27,35,23,23,23,24,59,50,46,8,17,21,18,36,13,39,34,24,24,34,65,25,20,34,26,40,74,43,70,69,73,30,46,42,24,49,58,33,53,51,28,17,63,21,12,27,59,15,10,21,12,31,20,33,83,88,98,106,110,88,81,89,99,109,120,95,105,110,95,101,88,82,105,70,68,114,100,65,119,108,104,94,72,115,102,108,118,113,62,71,45,73,73,55,44,27,64,55,75,86,85,103,123,128,118,122,106,98,134,187,161,139,71,19,4,15,2,45,30,23,7,30,27,0,14,13,1,23,0,54,16,7,12,10,28,2,27,78,59,84,70,94,93,96,94,102,116,77,112,98,189,241,251,117,67,71,13,34,33,50,28,48,44,26,75,51,81,100,107,101,60,55,82,91,90,55,71,64,81,113,73,95,67,57,52,69,65,68,90,65,87,107,106,116,98,93,114,121,92,103,79,57,69,67,70,118,96,131,155,85,69,94,74,102,106,86,90,117,104,75,77,93,118,105,85,108,104,102,101,85,99,100,99,95,75,77,74,86,81,41,86,107,112,135,150,131,135,132,140,143,93,90,102,75,82,71,84,117,81,52,70,95,111,59,71,53,72,81,115,90,97,128,103,67,59,68,101,109,102,102,75,99,102,122,112,90,101,111,124,91,74,91,63,54,65,93,136,139,129,136,117,156,166,144,107,77,76,102,125,78,89,75,63,44,75,76,70,76,65,51,47,104,113,70,56,85,75,51,45,100,75,68,34,50,94,112,109,138,92,114,63,82,91,68,113,113,112,73,135,118,71,58,112,124,102,107,78,72,85,92,97,103,96,87,77,66,80,95,57,63,69,44,91,53,92,73,60,94,72,53,49,68,72,66,44,41,19,64,54,45,62,71,90,158,229,245,130,80,102,171,226,190,76,20,25,136,91,96,101,125,121,54,14,6,23,4,34,22,17,12,29,34,29,35,47,74,39,69,45,27,15,31,47,49,81,46,17,83,32,41,30,54,33,26,55,32,6,40,58,11,41,45,51,36,57,42,17,17,40,60,40,58,46,49,69,96,90,70,41,36,44,51,102,172,149,96,62,15,23,50,57,78,96,110,94,93,82,89,78,103,125,103,101,80,93,70,118,98,105,108,92,95,112,90,105,95,92,104,75,105,139,94,91,113,119,95,107,133,207,128,86,114,97,91,54,22,19,23,21,43,36,83,97,88,99,93,86,126,91,112,98,115,92,84,79,76,79,90,66,52,69,50,31,21,8,28,40,29,36,43,61,68,38,30,57,56,63,45,48,42,23,82,29,47,51,26,59,59,20,63,48,22,38,26,50,55,60,52,37,18,39,24,31,13,10,19,40,40,55,45,56,22,36,20,60,18,25,38,33,29,45,13,51,54,57,22,34,37,39,2,62,32,15,10,30,28,16,36,28,20,56,16,9,25,25,13,8,23,6,0,19,25,38,20,35,7,14,28,50,41,52,12,55,58,32,53,28,22,27,32,28,13,29,19,26,47,14,27,11,68,78,69,62,41,40,6,32,26,40,55,42,35,35,23,24,26,46,14,6,15,8,21,26,39,22,33,60,35,48,20,34,38,14,46,11,23,29,31,24,13,12,60,54,54,38,40,26,17,23,15,3,25,6,37,67,45,51,36,37,50,29,0,32,29,20,30,29,21,30,15,37,65,34,54,14,38,33,18,35,57,68,69,50,27,43,57,41,29,44,38,24,16,22,31,44,24,40,16,20,27,10,50,30,70,76,72,93,58,92,74,67,80,82,79,104,90,111,131,68,90,64,95,83,83,91,120,126,98,100,116,77,113,85,58,74,57,104,71,85,74,26,71,35,38,30,9,96,82,98,74,128,98,121,134,106,124,119,108,113,151,157,129,64,29,0,3,21,21,10,19,10,7,28,6,4,6,6,18,1,1,3,11,2,31,15,11,22,71,54,85,71,97,98,87,127,117,125,121,115,107,187,230,243,123,45,67,5,40,14,45,34,23,35,28,81,70,106,82,102,94,44,71,61,90,88,79,42,59,62,61,69,69,71,65,81,62,76,93,92,112,97,93,110,115,125,112,129,94,75,81,72,79,79,78,98,86,121,158,133,110,98,122,100,118,141,108,125,129,67,74,59,124,120,110,100,110,94,137,120,98,76,81,88,89,66,108,105,72,114,71,80,89,121,147,142,124,128,155,138,146,150,132,123,152,117,135,112,87,149,94,78,110,99,96,107,102,85,124,95,90,114,93,163,74,67,87,84,104,75,81,78,65,103,84,100,113,86,113,102,81,88,96,78,100,82,94,93,121,153,133,107,120,123,109,88,79,81,129,140,136,93,66,74,60,52,43,61,88,77,61,80,116,113,122,79,92,109,80,117,129,127,99,83,65,107,127,113,126,135,145,131,93,134,127,130,134,123,85,102,89,105,133,117,132,138,105,89,49,87,74,53,60,54,88,71,84,85,98,79,90,71,78,127,95,92,82,70,53,67,82,123,99,90,54,82,58,54,62,27,77,49,58,131,249,229,144,34,31,43,89,143,124,45,1,38,78,115,120,107,100,117,85,45,78,89,48,55,27,65,94,48,58,63,78,77,48,39,67,44,36,51,42,36,36,43,41,47,58,65,47,47,54,45,49,34,44,27,46,24,43,36,43,45,42,51,43,20,72,55,55,69,39,52,70,84,93,88,78,63,65,31,38,38,97,74,49,29,48,88,74,105,97,83,122,97,101,110,94,88,109,114,97,105,121,76,73,70,98,101,93,74,111,108,106,88,102,86,106,104,124,148,97,90,105,137,142,48,58,176,159,113,62,16,8,19,24,21,18,36,70,81,106,90,125,106,129,85,105,119,107,103,108,49,67,74,78,60,79,111,107,77,68,15,3,12,10,30,43,41,43,37,27,55,28,19,37,21,60,53,2,48,38,95,49,57,35,22,34,18,16,15,48,47,35,35,47,30,15,44,58,66,2,18,14,37,16,46,21,29,27,25,34,32,38,41,51,32,29,41,45,32,53,62,36,68,31,49,29,39,27,13,53,19,28,24,36,37,35,27,42,41,23,1,31,20,15,14,39,16,27,17,27,52,26,33,32,21,32,13,41,60,12,18,14,37,12,38,28,47,18,34,28,32,44,43,25,49,45,49,25,86,80,66,26,31,43,14,6,22,23,48,26,45,19,48,32,36,21,26,30,25,36,33,22,10,26,19,19,17,42,46,13,15,21,36,44,38,51,41,36,49,56,66,48,52,24,30,5,18,46,27,18,13,14,54,33,46,37,63,43,24,24,28,5,9,11,15,23,13,21,46,43,41,33,42,53,43,25,34,44,28,42,59,15,55,50,45,43,63,45,29,52,40,37,57,36,38,31,5,29,21,51,63,52,52,64,58,58,41,56,89,83,49,81,112,129,101,93,58,62,96,92,89,91,125,118,144,110,98,111,99,99,68,39,45,59,89,62,43,38,53,48,46,33,38,101,122,99,108,92,117,110,102,72,106,113,95,82,90,87,85,75,61,23,10,14,18,12,10,17,24,15,15,37,17,21,7,7,16,2,22,8,20,18,26,5,10,59,35,78,84,83,98,96,108,136,126,133,137,159,209,216,225,122,42,81,33,46,36,31,22,35,41,52,68,66,85,89,77,77,59,66,72,82,85,85,82,25,39,53,50,79,56,75,82,77,78,64,94,162,104,87,83,128,100,112,102,94,90,96,82,118,148,118,149,115,135,144,96,96,115,110,107,110,159,123,91,120,94,72,40,108,107,79,98,101,110,144,197,67,54,86,130,118,89,109,114,113,119,77,107,140,153,135,107,125,135,92,109,136,147,137,132,145,124,147,96,152,179,139,114,116,91,87,121,105,121,102,104,106,97,150,181,71,53,92,89,126,93,101,96,63,84,88,133,175,160,114,129,99,165,153,137,138,152,137,150,165,153,157,148,145,146,139,97,94,74,125,162,114,130,98,72,47,56,52,68,54,66,65,118,113,114,81,87,64,107,119,129,136,119,142,139,105,88,116,128,147,107,147,124,114,157,127,121,151,110,77,100,140,102,151,118,91,120,100,113,101,84,114,90,97,96,84,80,64,112,109,79,56,71,92,92,81,66,62,58,75,49,41,110,149,84,53,83,55,102,128,97,77,82,5,100,192,206,170,35,0,70,44,70,43,20,15,45,88,88,105,93,98,85,119,153,96,103,97,88,69,161,142,105,97,81,83,35,27,34,44,43,31,45,45,48,60,48,66,34,31,38,32,41,44,26,27,27,62,34,39,53,43,36,42,26,26,42,12,37,25,43,60,47,36,62,70,86,105,83,58,26,39,35,68,51,52,50,33,53,53,84,64,104,124,80,119,75,102,77,96,83,117,77,96,91,100,87,101,57,87,101,88,109,97,116,107,95,122,105,112,134,117,122,55,39,92,122,65,27,28,111,184,104,30,10,18,16,22,10,27,45,95,101,140,116,106,93,105,116,114,89,95,106,54,83,64,67,75,61,69,67,78,83,73,38,25,2,21,24,41,38,33,52,24,43,29,45,33,29,53,32,53,19,16,36,21,62,30,31,28,54,20,30,53,47,30,32,24,43,8,39,51,40,32,12,13,25,22,13,38,39,21,49,25,46,29,87,17,20,33,35,29,29,32,77,52,41,46,37,44,69,32,40,41,56,40,47,33,25,31,29,54,21,24,46,21,22,37,43,18,29,23,29,61,30,50,42,22,30,32,19,44,29,24,45,11,46,13,9,22,40,17,19,22,31,19,30,19,15,20,43,7,19,59,65,62,20,55,32,34,28,31,56,12,13,25,41,33,27,45,6,7,0,20,10,34,19,26,26,15,38,52,27,33,16,30,44,45,49,46,56,75,41,38,54,44,54,37,3,42,30,22,34,22,47,26,41,55,48,44,63,26,38,16,22,23,14,12,4,36,19,38,52,43,42,47,40,8,64,17,34,31,10,48,38,33,57,44,45,85,57,50,45,53,42,43,46,30,49,51,15,7,25,27,22,38,52,63,58,85,44,55,78,90,87,52,108,118,111,110,59,70,72,108,134,96,98,108,131,118,96,115,83,99,108,125,109,84,126,119,82,89,67,105,87,85,82,108,152,121,146,102,133,99,119,103,81,91,106,91,105,76,66,93,86,29,3,5,27,9,18,24,7,0,24,13,27,24,25,37,3,12,4,20,34,16,5,9,0,65,27,65,45,95,79,93,114,137,124,124,137,129,203,227,243,89,51,59,21,40,26,27,26,12,36,27,53,83,119,85,59,30,48,51,38,81,91,111,121,70,36,37,57,82,81,84,51,39,71,53,79,185,99,66,98,130,111,94,114,94,120,86,99,110,135,133,128,103,103,152,89,85,77,88,100,102,174,105,78,86,84,76,55,83,91,80,67,94,80,180,213,66,63,82,142,129,88,62,72,92,110,104,115,136,110,115,86,69,127,53,76,124,147,146,125,111,112,106,110,95,212,147,94,118,96,93,83,83,84,106,77,69,49,155,182,69,47,100,110,118,88,134,180,160,92,107,171,232,170,145,140,156,207,181,155,158,164,173,185,183,174,199,202,221,153,178,129,119,121,122,115,95,88,81,74,80,74,80,80,39,56,81,99,111,82,56,56,108,86,103,132,106,126,145,149,141,116,107,120,92,130,119,122,112,140,106,125,123,119,92,124,132,100,100,92,104,101,115,121,131,166,143,132,115,125,95,66,48,76,104,107,79,107,79,95,91,113,59,75,95,90,62,67,123,96,70,26,16,53,101,109,110,91,33,15,35,141,187,28,9,68,92,123,47,22,93,93,114,125,85,68,101,73,97,98,110,116,91,109,87,120,92,116,97,57,65,54,68,27,55,50,48,47,44,47,57,73,51,33,23,46,30,44,37,57,37,29,46,44,56,41,31,29,50,42,37,19,48,32,53,55,57,82,59,77,86,98,116,87,67,49,45,56,35,54,51,46,89,98,99,112,110,107,128,92,107,123,100,91,92,87,112,100,93,87,88,73,101,99,99,99,89,105,97,126,78,61,101,115,131,120,79,61,27,58,53,83,41,11,6,55,140,105,58,44,18,24,35,35,66,82,108,111,74,98,131,114,88,81,99,99,73,47,64,54,51,73,73,86,64,72,83,69,53,37,27,15,24,35,59,56,30,36,48,29,35,46,46,25,35,21,8,34,29,43,20,63,27,39,19,49,18,52,62,21,53,12,56,34,34,42,59,47,13,40,4,24,33,39,49,30,44,58,46,29,42,33,17,37,44,23,67,51,36,24,19,47,65,59,61,46,41,49,48,54,43,46,32,45,53,37,31,32,16,13,57,26,30,27,31,24,43,21,22,49,11,10,26,22,36,34,39,7,10,40,44,8,43,23,23,38,8,28,40,8,38,14,33,7,12,45,46,17,54,84,67,49,29,39,28,37,13,28,30,30,25,17,22,11,8,34,25,31,6,43,21,10,18,27,16,18,21,23,17,25,55,52,36,24,42,25,33,38,36,20,60,30,26,25,17,28,57,35,38,34,18,79,40,13,51,45,3,15,28,12,6,22,14,20,10,30,44,55,46,31,38,53,38,31,27,3,3,16,38,35,32,39,14,60,43,44,54,54,44,49,45,23,29,39,0,40,29,32,37,45,43,87,32,40,61,67,76,103,104,82,78,94,94,80,90,56,63,53,91,97,86,71,62,87,97,116,127,160,148,165,183,176,158,141,132,145,164,163,138,149,142,142,146,163,131,129,145,169,124,153,137,149,144,113,131,127,114,128,146,130,5,5,39,18,33,11,8,0,7,15,32,3,28,0,11,1,9,6,42,18,1,32,26,17,94,72,33,34,58,80,81,139,140,120,97,122,139,226,245,238,83,43,44,4,47,16,29,39,27,32,27,63,84,127,130,93,56,41,64,53,70,95,148,142,92,68,35,69,75,88,74,47,34,33,89,99,182,133,84,125,129,140,86,83,75,106,92,91,125,82,74,76,70,118,100,62,57,60,64,63,117,162,113,59,111,102,91,115,111,108,89,82,96,92,157,198,63,50,89,161,127,72,85,71,84,106,92,97,119,100,117,92,102,174,58,64,99,107,99,95,109,98,98,102,115,170,81,98,90,92,98,102,71,107,105,83,114,93,176,193,72,78,89,105,109,73,181,226,215,112,95,174,165,141,146,135,184,156,102,140,130,157,148,166,137,168,169,141,107,88,103,111,81,101,75,82,68,51,34,58,59,68,54,54,52,43,33,54,90,75,66,81,73,64,45,68,90,94,119,91,125,119,116,101,106,116,108,112,125,132,135,101,130,94,98,112,140,147,149,150,163,140,140,136,144,135,129,124,92,125,121,105,97,79,77,84,62,91,92,69,81,73,80,61,71,83,52,122,144,118,116,79,56,81,134,144,95,75,78,88,47,43,122,47,21,99,155,152,93,115,129,118,114,117,104,114,119,120,113,130,125,126,132,105,123,113,90,74,35,44,30,59,73,48,66,48,52,59,45,42,70,46,44,57,65,68,43,32,32,35,58,47,44,19,38,41,37,23,49,37,56,49,54,51,47,56,92,57,64,69,74,77,83,121,114,73,33,33,58,50,41,49,68,124,139,131,119,94,61,69,84,113,67,78,100,123,89,110,99,89,96,117,102,93,99,91,82,95,115,93,83,106,109,92,81,58,33,20,18,68,25,41,51,30,25,78,153,121,109,36,28,44,27,73,78,107,119,88,85,94,69,79,110,109,92,73,87,68,66,69,60,63,71,45,94,93,83,85,37,36,15,31,40,52,38,47,37,25,29,38,25,16,51,48,44,22,23,44,48,43,32,30,31,26,35,36,26,42,19,33,21,5,13,40,35,44,41,23,12,18,21,33,37,55,43,28,70,49,44,14,29,46,50,37,31,53,30,46,27,41,39,46,47,37,59,50,45,43,60,37,52,45,23,31,43,50,39,10,24,20,15,21,8,11,9,27,43,43,5,33,24,41,14,18,41,31,62,8,53,41,27,51,43,52,19,34,19,51,31,15,39,12,30,29,18,25,31,26,6,44,107,60,39,31,33,11,26,30,48,37,12,10,47,23,27,26,4,22,38,21,20,33,31,31,43,16,37,30,28,15,38,32,65,37,32,33,41,44,59,47,31,30,35,18,31,28,17,38,32,37,41,32,47,23,33,40,21,27,23,44,2,12,10,22,14,34,29,36,47,27,34,29,58,56,41,38,37,10,17,40,15,25,41,29,65,46,49,11,71,53,36,48,52,21,46,17,28,24,51,59,52,50,50,44,48,61,62,73,53,70,69,76,55,80,84,60,51,33,52,34,98,13,36,36,53,116,151,162,168,160,199,176,178,208,201,222,173,199,199,180,186,188,180,174,161,183,168,171,168,174,185,177,156,147,168,145,180,156,127,98,14,22,20,33,2,8,24,18,24,33,13,33,22,20,6,14,39,6,21,26,22,31,46,18,172,147,112,62,39,67,91,102,145,139,108,94,108,198,243,224,96,46,41,0,51,47,20,21,22,29,33,63,93,144,229,219,160,115,88,104,146,173,224,231,152,147,116,98,79,103,114,103,62,64,92,105,206,110,101,87,111,100,72,97,94,93,86,95,77,71,77,95,96,129,157,103,73,67,99,78,105,178,154,138,144,192,163,171,160,181,185,173,149,160,212,214,123,84,101,166,149,136,138,161,158,162,166,134,154,146,142,139,212,250,142,68,70,81,91,68,102,74,101,74,126,166,126,169,165,132,183,149,161,168,168,162,206,183,247,251,148,121,112,103,137,100,190,237,182,131,118,124,113,111,92,115,123,87,70,52,90,88,74,61,63,47,49,29,5,1,6,14,13,11,40,13,17,16,27,13,12,29,20,32,18,15,34,19,35,9,28,24,26,34,17,36,22,54,69,111,117,77,114,106,83,93,127,131,127,120,70,97,120,106,141,118,104,171,154,151,169,140,146,118,120,131,154,103,118,110,142,106,111,69,67,66,71,64,78,85,52,68,9,44,74,83,58,123,125,98,103,92,125,139,143,110,78,101,128,122,60,82,154,98,39,102,141,109,120,124,126,110,147,115,104,129,122,113,132,120,104,124,96,129,102,118,49,37,111,72,73,75,90,45,69,33,50,52,33,59,52,59,51,72,54,70,37,35,63,38,30,59,36,44,39,47,32,24,48,40,42,33,67,54,68,100,101,88,94,88,67,86,123,167,148,62,49,58,56,64,48,31,20,77,132,115,95,93,104,81,88,98,104,82,100,120,92,90,78,75,109,77,85,100,84,87,124,95,102,103,104,104,79,29,33,35,32,41,34,17,11,13,45,46,15,142,242,130,111,54,15,74,75,101,119,91,83,67,68,39,65,75,69,62,68,79,76,73,71,70,67,74,91,70,97,93,73,55,22,14,13,26,43,38,66,51,28,52,59,57,41,24,74,43,31,39,22,59,39,47,14,51,20,57,22,40,24,28,48,55,17,39,36,35,60,40,25,25,36,12,11,15,31,46,18,81,39,30,28,69,43,43,59,25,18,45,39,59,47,26,14,30,36,46,60,72,45,42,25,94,38,49,37,52,33,66,54,14,52,24,13,45,11,6,32,16,26,22,29,29,51,51,30,40,55,9,50,26,48,36,21,22,40,6,36,47,37,11,16,16,19,30,16,6,4,26,34,24,21,22,69,89,66,27,18,27,45,23,27,47,34,59,41,26,25,46,15,21,19,24,22,39,33,22,56,14,40,35,7,14,30,43,41,38,38,14,50,33,49,37,41,26,57,39,32,31,18,45,23,21,40,27,51,33,37,9,31,41,45,30,9,21,37,21,23,35,35,33,45,50,47,50,66,51,33,62,48,30,31,19,24,38,36,38,34,37,42,34,41,40,38,51,32,35,19,20,39,36,58,83,88,64,53,32,70,30,68,71,56,9,7,43,65,55,60,56,46,55,53,106,113,107,35,10,34,145,205,169,215,199,213,217,239,226,240,207,219,215,203,214,216,179,179,193,206,203,230,203,194,185,193,196,201,178,202,196,191,195,220,119,9,3,14,31,13,24,35,16,11,21,18,2,18,0,15,35,26,30,1,24,0,21,23,25,232,228,223,110,95,95,109,92,144,134,112,110,105,202,244,236,94,108,84,24,40,36,61,55,15,51,27,45,75,165,229,237,213,127,125,135,206,242,252,254,183,169,145,150,149,138,176,155,133,146,130,179,245,183,123,99,137,137,136,129,145,125,145,155,155,155,148,155,157,203,222,159,104,173,201,158,157,208,174,209,195,193,195,186,199,179,212,204,208,200,204,193,159,157,139,179,190,211,210,210,185,203,213,194,217,175,211,190,183,195,153,129,81,80,70,73,79,45,89,81,119,164,204,206,213,182,186,200,180,211,189,185,217,204,192,199,173,150,107,135,100,62,111,131,155,127,79,83,78,8,17,7,58,4,0,14,15,22,48,11,61,145,106,64,35,24,25,7,29,46,20,14,17,10,35,2,9,16,0,18,7,26,26,2,5,3,61,18,15,32,26,2,0,25,44,41,83,95,94,106,137,135,141,124,134,98,95,101,127,119,128,156,113,147,146,146,139,123,100,76,125,96,108,137,148,142,124,107,89,119,90,71,62,113,69,91,72,42,38,31,80,70,39,84,120,112,100,98,98,161,168,116,53,75,98,89,80,47,143,149,69,98,103,67,58,66,94,121,84,94,79,71,80,132,89,82,87,63,104,104,104,75,71,41,101,100,131,83,101,77,28,41,44,51,53,26,64,48,33,59,54,45,42,59,54,43,44,30,60,55,39,39,43,22,33,41,27,36,61,91,138,166,119,123,81,127,116,119,135,96,77,13,30,46,46,54,83,40,52,70,44,107,110,90,69,75,85,79,108,75,91,113,95,107,100,93,97,81,101,92,101,117,89,90,70,109,116,65,98,19,31,24,15,34,39,19,25,36,33,61,44,150,185,122,158,52,60,75,92,132,99,53,76,80,70,69,65,77,79,56,65,84,72,89,71,50,110,107,94,89,120,65,55,23,6,20,11,5,51,16,52,31,36,49,35,52,41,37,42,31,19,28,30,57,14,24,57,31,26,12,16,29,23,31,26,19,32,30,34,27,35,39,24,3,3,10,9,18,36,30,34,20,40,27,28,31,42,28,35,29,47,54,32,26,59,28,38,30,51,50,48,36,59,48,33,46,46,67,24,60,56,4,26,27,9,27,8,23,27,27,25,44,52,54,38,23,40,25,55,50,61,17,37,33,34,50,51,32,32,19,31,23,37,41,30,37,20,10,29,44,13,41,7,8,14,21,54,105,68,56,15,14,7,39,32,45,34,29,29,2,34,41,31,20,31,14,43,34,33,50,30,20,8,48,27,28,29,20,50,43,41,54,40,34,31,46,53,26,50,16,20,33,35,60,29,49,16,41,48,38,42,37,20,20,41,24,32,39,30,21,37,27,44,32,37,48,30,54,23,50,45,12,22,32,49,17,3,37,52,34,41,30,51,73,29,45,60,67,55,44,9,24,31,47,81,91,74,80,36,74,35,61,47,30,22,39,45,21,45,52,50,62,104,91,107,80,143,122,47,60,46,186,213,193,210,197,215,225,205,240,213,173,215,200,238,228,219,192,201,207,205,201,201,202,213,225,193,197,252,219,212,208,205,217,218,118,32,9,12,16,3,12,10,25,26,29,26,16,14,0,13,26,26,6,5,9,27,25,22,11,234,239,237,156,113,150,103,120,146,156,163,166,136,188,250,218,89,155,152,47,32,22,22,42,52,30,47,30,89,122,141,249,219,142,169,156,229,236,225,223,174,204,179,190,187,198,190,193,192,186,181,211,217,160,160,140,170,200,186,196,196,194,180,214,202,193,197,187,208,224,240,195,219,232,243,231,220,213,189,186,203,189,200,239,219,201,204,210,177,129,84,104,117,199,184,224,179,210,223,207,180,181,204,184,182,189,151,158,116,77,133,176,107,79,76,93,71,50,89,95,165,181,209,212,203,224,194,178,189,179,209,202,204,154,72,97,138,172,111,107,71,71,106,122,132,104,82,83,32,24,40,30,34,98,98,146,172,248,243,229,236,249,244,240,232,238,212,247,215,226,224,199,238,212,192,213,235,192,224,206,195,193,187,175,169,148,151,142,174,176,171,176,120,134,104,51,86,71,109,140,126,117,128,137,99,115,77,101,76,81,133,147,145,122,132,113,127,95,105,86,103,67,122,151,109,115,133,126,116,115,106,112,93,85,70,73,54,70,81,80,90,76,68,52,70,122,95,46,72,101,110,78,104,81,66,52,56,61,133,160,73,61,64,61,42,39,49,57,70,81,91,78,65,75,91,61,36,71,50,60,35,58,17,58,33,67,100,74,89,126,103,97,103,93,90,109,78,72,70,69,52,60,49,36,29,55,45,16,56,24,45,21,53,52,49,56,55,67,55,97,140,144,103,104,104,129,131,102,44,21,22,23,65,73,66,66,55,78,45,40,54,60,73,73,76,102,91,82,98,88,74,90,68,111,101,83,96,110,85,105,133,88,105,91,97,87,68,93,116,87,42,39,27,46,36,18,33,34,27,30,34,79,80,57,133,91,45,66,61,89,62,53,50,24,61,68,90,67,82,51,86,86,76,81,78,75,94,87,94,97,39,14,41,24,1,32,25,20,42,41,22,44,22,50,27,39,33,10,29,46,44,22,52,36,25,31,28,16,40,58,78,53,31,12,44,45,39,46,21,49,25,24,13,44,31,4,8,20,38,24,40,46,31,24,19,22,32,23,36,29,32,37,40,63,39,34,32,41,54,35,53,40,64,53,53,16,23,31,59,31,43,63,26,43,1,7,1,28,24,24,43,27,35,60,40,39,41,39,59,34,22,41,45,26,32,47,55,29,43,50,20,23,42,54,17,27,28,24,44,23,26,39,11,5,22,46,40,44,65,36,44,20,45,38,37,18,49,46,8,4,21,13,44,26,6,26,33,24,47,33,25,33,9,19,10,26,33,49,32,35,34,30,40,72,27,32,29,25,44,56,46,40,34,34,42,41,20,58,43,30,35,43,16,11,21,40,38,20,30,34,19,49,35,47,42,30,36,41,23,35,35,43,45,37,52,41,4,12,13,34,26,7,10,30,9,49,43,51,65,35,41,17,21,60,123,93,95,99,70,113,96,92,73,76,89,108,88,60,36,50,26,49,64,66,103,119,132,104,99,65,131,239,204,215,229,206,224,227,191,206,198,193,219,205,192,205,215,192,200,181,215,213,225,208,211,229,188,227,229,205,223,176,216,202,226,97,23,21,11,0,29,5,14,6,0,31,17,12,29,18,12,19,3,2,8,14,35,34,40,12,234,245,246,150,96,156,150,161,172,191,250,249,209,231,241,233,106,180,119,24,63,5,34,10,13,39,23,55,93,144,136,156,149,129,149,138,195,224,193,162,145,159,193,189,197,225,205,195,193,198,168,115,108,139,172,180,192,196,205,213,198,200,183,203,195,196,193,190,184,73,98,138,227,247,255,235,178,180,186,192,173,183,227,250,188,174,176,181,130,39,9,41,92,170,218,201,194,187,201,223,176,180,176,205,172,177,182,82,57,30,63,147,162,134,27,85,77,118,133,172,180,195,190,193,187,188,183,186,169,196,195,183,177,78,44,69,103,174,150,135,69,81,111,129,143,115,122,162,229,229,240,228,249,245,254,250,255,230,247,254,231,236,252,254,255,255,249,232,254,247,252,239,255,248,255,252,233,243,234,237,242,253,246,251,253,249,246,234,234,242,250,229,255,249,233,172,135,88,98,117,99,59,93,95,106,86,108,102,72,67,148,100,124,121,118,97,126,130,94,140,134,107,98,134,136,131,147,139,101,82,117,82,92,74,45,44,51,75,85,53,69,46,25,77,64,75,82,54,79,107,88,107,77,85,118,81,70,52,101,147,65,27,46,9,29,26,39,76,76,62,61,58,54,67,81,77,74,51,47,81,46,69,57,19,54,24,44,74,74,105,127,124,113,124,156,146,136,126,71,25,44,43,68,11,67,45,33,47,50,17,44,28,40,40,60,61,78,94,72,35,118,141,113,139,122,116,69,27,38,27,49,65,131,119,72,44,47,31,57,28,28,51,50,58,76,83,88,108,108,95,116,99,99,82,69,85,119,115,94,115,102,107,122,69,63,72,78,112,117,103,72,60,35,45,19,52,27,37,25,40,29,48,46,14,97,51,58,34,27,49,77,51,63,70,76,71,78,52,77,73,82,49,71,77,97,69,76,77,82,29,26,41,10,29,23,33,51,62,36,43,50,28,28,19,61,26,28,16,29,38,39,37,55,34,30,23,29,18,40,40,32,31,57,51,27,34,26,44,30,42,39,24,6,2,2,54,9,37,22,44,25,51,29,45,29,61,50,32,44,22,37,34,49,40,14,46,38,43,35,43,44,36,18,55,48,20,54,46,57,40,41,31,37,5,35,12,20,24,17,35,25,38,34,46,30,55,25,39,54,38,38,52,35,48,31,48,46,36,28,21,20,36,56,71,20,34,54,29,42,44,34,27,39,66,21,38,32,27,54,69,86,38,39,44,24,44,27,24,32,7,19,25,2,17,24,10,17,19,7,44,16,12,14,17,39,43,48,54,21,40,46,61,43,31,41,60,38,43,42,28,65,9,46,45,26,45,44,50,55,38,28,19,17,34,23,3,7,28,3,22,27,34,54,24,30,18,35,23,38,44,49,34,27,23,29,41,16,39,25,23,32,35,24,58,48,42,35,43,44,56,29,77,150,172,152,135,114,114,149,161,136,149,129,149,151,159,128,166,105,93,85,36,58,64,53,84,104,110,127,103,192,238,219,223,211,207,227,202,179,187,183,221,211,172,180,214,205,193,205,178,201,215,193,219,199,181,207,209,197,207,185,213,214,209,183,129,10,4,16,5,9,18,0,17,5,3,1,10,19,15,10,12,22,16,22,14,16,28,37,1,249,233,233,130,113,140,164,180,205,224,249,241,242,240,253,225,100,161,86,20,37,14,34,21,25,44,34,54,64,110,101,72,73,93,102,115,126,130,150,152,201,181,144,214,177,203,168,169,194,158,132,17,3,78,146,203,219,196,168,185,192,178,163,157,178,179,157,178,69,4,28,60,196,226,227,232,157,145,154,163,162,193,212,211,186,169,156,150,61,1,12,23,22,134,179,202,219,159,176,175,180,183,217,164,180,172,117,52,38,38,45,118,171,148,77,68,81,119,171,203,199,170,193,181,168,179,186,147,174,200,168,168,103,27,54,19,44,86,137,129,83,31,47,81,101,107,171,247,244,255,236,254,244,248,252,232,255,254,250,230,253,247,245,244,243,240,240,247,255,251,252,250,228,230,242,243,243,255,254,239,239,249,248,255,254,255,240,246,249,255,224,254,243,238,229,233,200,121,93,107,120,122,108,149,134,85,115,126,104,114,119,115,110,129,121,128,124,106,109,130,149,88,114,142,139,145,90,73,53,94,116,123,96,87,44,61,77,84,53,61,62,35,27,56,57,82,74,49,39,45,95,62,68,74,101,78,80,74,65,150,79,21,63,66,95,46,28,53,76,72,100,89,65,33,44,103,90,69,80,55,46,76,48,55,56,56,53,63,102,111,131,135,136,134,128,148,118,87,37,55,24,78,95,79,68,50,55,43,42,66,75,64,23,57,86,116,114,108,121,51,91,149,121,143,81,15,30,28,38,64,110,94,108,129,137,47,27,75,35,22,29,62,119,67,109,89,111,96,97,100,119,116,100,99,97,107,77,106,108,113,72,89,87,61,82,97,82,104,120,112,127,82,47,20,47,35,40,27,57,32,35,55,44,35,99,84,30,15,12,66,67,55,53,81,67,85,63,89,78,72,76,45,64,79,87,96,71,72,45,16,3,21,24,28,24,31,63,62,25,33,36,22,46,38,54,49,43,51,54,44,22,53,11,34,25,46,50,53,57,19,25,50,40,39,32,45,43,51,39,64,43,9,34,45,35,10,17,24,2,29,22,34,27,51,39,42,31,37,35,37,49,37,43,31,18,52,38,26,45,61,51,45,61,43,60,35,46,31,21,39,63,39,53,13,30,23,33,37,24,27,32,1,14,28,39,31,24,68,38,34,39,45,59,48,80,33,30,16,14,8,57,38,23,36,37,42,38,36,29,47,34,37,49,33,47,9,3,40,64,61,77,58,24,29,44,24,38,25,9,9,11,23,8,16,24,28,47,22,38,22,39,19,14,25,38,28,35,28,54,37,28,29,34,36,44,54,26,38,35,18,21,34,29,43,20,43,36,52,29,53,29,56,23,27,23,4,11,19,10,41,32,56,25,23,39,15,16,24,51,22,39,3,13,31,16,36,29,23,33,22,36,2,33,41,21,23,57,45,56,27,35,199,252,241,222,177,141,144,163,155,157,161,159,151,177,175,170,159,170,149,119,80,88,57,67,102,114,145,71,140,223,219,196,222,191,209,187,174,179,189,206,199,212,167,199,184,206,210,180,176,177,191,198,186,194,191,211,194,184,186,200,212,202,210,190,124,22,0,10,9,27,15,21,14,18,24,8,25,8,13,12,45,21,8,14,10,14,8,29,14,246,238,208,137,122,146,116,145,187,249,247,240,253,253,232,243,157,186,66,21,56,12,50,18,18,46,48,57,75,125,85,32,57,73,117,122,149,148,179,193,153,161,188,178,172,180,175,160,141,137,55,9,30,19,88,191,220,197,174,157,160,167,169,150,176,179,171,116,77,30,46,42,68,164,208,210,151,149,152,197,190,195,167,174,165,150,162,83,29,39,40,29,39,55,125,214,204,202,169,188,165,198,193,184,164,131,82,47,69,77,91,84,126,193,152,107,113,157,145,214,179,177,194,177,186,168,156,164,181,172,185,96,28,12,25,37,40,35,74,96,73,59,11,18,24,11,72,124,231,243,237,230,215,206,184,186,174,168,129,117,119,228,245,245,250,229,254,252,247,251,253,255,247,246,255,238,251,246,255,253,241,237,231,246,250,231,255,253,250,244,253,255,249,249,239,243,235,179,75,82,67,94,86,130,108,126,98,113,145,150,148,136,128,135,97,132,124,73,110,111,94,76,127,135,130,117,80,68,85,92,104,103,104,78,67,80,77,108,84,49,57,51,39,56,66,57,60,63,58,29,76,75,97,78,69,96,105,106,88,139,125,21,70,148,141,89,74,60,108,162,161,126,60,33,100,104,91,89,66,85,71,84,90,77,85,59,79,79,87,90,143,115,91,85,78,45,63,64,68,76,86,98,101,81,95,83,78,66,69,57,40,55,27,43,123,133,134,109,128,112,133,140,78,70,15,6,9,72,115,131,130,118,127,111,123,127,112,85,29,9,17,87,131,115,93,100,120,113,109,138,86,102,89,91,140,119,106,79,111,112,98,94,71,56,86,79,90,122,99,107,115,109,49,23,22,34,28,35,39,44,56,57,52,96,185,108,19,30,12,65,54,78,89,71,94,72,87,57,61,102,71,85,69,88,104,87,52,41,14,4,9,43,32,44,41,65,57,81,47,33,27,42,45,56,26,46,23,48,53,50,41,26,51,32,29,36,11,52,29,43,40,36,43,43,57,54,64,35,39,48,55,60,54,55,29,32,50,33,39,14,30,45,34,17,38,43,25,45,35,6,46,28,57,51,18,30,29,54,45,67,54,9,19,13,60,58,12,47,45,45,59,22,31,2,32,38,53,26,27,22,35,45,34,42,27,30,38,35,47,50,39,64,19,54,24,33,16,22,16,7,34,29,22,43,47,18,16,9,4,51,30,39,14,25,21,33,36,49,71,41,78,82,45,25,33,12,36,21,14,30,16,38,18,36,21,34,38,12,40,6,30,19,9,32,20,46,21,52,17,20,31,31,45,62,62,22,45,41,41,15,37,44,19,33,33,20,47,32,33,26,22,22,36,19,33,23,27,15,16,33,33,17,38,22,33,37,8,22,19,25,20,25,52,60,57,31,31,20,34,18,29,30,33,10,9,35,6,39,36,8,66,199,248,251,227,220,187,147,165,135,146,148,140,150,153,185,161,164,191,177,140,139,121,99,103,127,90,81,104,187,243,231,224,213,203,214,189,181,182,192,188,207,212,202,204,209,217,218,192,219,191,218,178,183,189,212,199,194,201,221,205,202,216,217,197,124,7,0,5,23,33,1,23,7,2,23,1,27,28,2,14,26,22,18,9,23,6,20,23,9,248,231,234,212,140,152,103,127,160,166,194,244,244,238,216,219,136,184,71,7,31,3,54,15,54,49,60,66,88,145,134,101,103,105,141,158,176,192,166,187,160,205,167,168,155,157,180,177,133,32,71,60,58,65,49,113,205,199,180,184,197,178,157,160,183,183,145,84,113,135,132,57,31,119,198,177,173,184,209,208,153,160,125,169,174,174,63,38,60,79,125,88,44,18,79,139,191,201,179,192,211,187,174,163,150,81,86,103,134,116,102,91,68,128,168,135,157,157,160,189,182,178,186,202,191,180,171,183,184,171,109,128,119,106,109,129,107,107,165,237,248,243,217,136,72,38,16,6,96,151,138,161,181,241,242,246,241,154,126,102,178,254,232,239,222,252,250,242,236,246,255,242,236,254,249,237,251,226,252,233,255,252,243,254,242,254,252,241,247,237,202,123,164,238,240,236,229,169,84,72,91,89,82,138,122,95,67,104,124,148,155,118,112,115,102,134,137,122,94,82,97,91,135,160,139,95,100,111,105,90,78,90,124,120,101,117,134,97,113,103,93,91,51,56,68,53,54,53,39,16,58,87,106,82,83,112,106,123,105,176,139,62,109,148,126,122,87,111,168,170,151,107,47,44,70,87,68,95,104,106,71,144,128,144,93,126,76,82,104,95,71,52,56,68,72,84,85,131,134,160,122,134,82,65,86,111,114,91,89,87,118,108,73,71,108,111,106,125,117,141,109,90,11,21,12,45,68,109,132,143,136,124,127,131,137,115,132,92,11,17,42,147,95,91,146,112,79,100,97,118,98,100,99,120,89,76,100,110,118,107,83,73,55,74,87,84,108,116,112,114,84,24,36,36,10,21,30,26,34,53,22,43,74,96,230,148,34,25,47,72,75,90,56,80,63,65,71,54,77,68,67,101,72,93,51,41,21,4,31,25,8,29,61,60,75,69,72,46,48,11,22,48,68,29,48,59,25,51,41,34,33,51,27,38,50,50,39,48,21,80,36,59,65,33,45,42,38,32,33,86,66,72,48,58,74,59,18,14,28,36,13,23,25,22,43,52,46,56,32,15,73,53,43,30,49,34,24,51,27,55,55,45,49,32,56,40,43,55,28,74,39,15,29,33,19,44,36,51,26,25,34,35,31,43,19,38,43,37,47,36,39,35,72,46,44,31,32,4,4,50,10,19,21,14,23,13,46,34,5,30,23,27,19,32,3,0,40,16,19,49,66,85,83,58,29,7,36,21,45,30,41,22,33,36,31,40,26,34,5,27,16,25,34,23,56,22,27,31,52,22,47,14,54,34,39,18,37,21,40,43,15,31,23,33,42,38,43,12,18,33,54,33,20,29,15,25,19,43,39,47,36,53,40,28,19,35,43,11,53,31,34,70,22,44,46,39,36,36,34,15,15,29,24,26,29,14,29,18,26,131,190,228,188,165,205,220,218,207,204,161,140,133,103,108,103,127,145,140,156,139,145,159,150,140,132,147,103,113,158,229,235,240,245,240,230,199,233,212,218,245,218,212,232,217,228,242,200,228,219,237,237,218,235,195,229,247,185,231,205,215,225,218,202,223,195,113,14,4,1,21,31,10,1,7,16,20,35,15,15,2,0,8,15,1,7,25,29,3,18,4,252,242,222,215,93,83,80,102,70,48,47,99,139,175,227,181,156,162,62,49,27,6,35,22,32,66,55,74,130,210,149,132,143,113,155,185,158,207,193,187,192,197,172,185,177,142,171,141,49,54,122,120,125,108,68,70,102,204,225,182,170,164,168,186,171,112,51,110,140,161,181,141,74,116,181,155,170,212,178,181,135,137,143,123,146,90,27,30,85,187,168,172,118,32,53,84,165,198,201,170,185,189,180,135,114,61,73,106,71,119,145,91,53,101,124,138,160,198,204,165,195,170,182,158,190,200,193,192,179,116,95,138,154,140,183,218,246,254,243,233,235,239,227,238,243,220,175,118,172,208,252,248,248,229,253,234,250,242,230,241,230,224,250,245,255,244,243,230,247,247,255,238,253,252,226,251,235,235,255,254,243,250,251,244,221,244,238,244,231,103,40,25,93,145,169,161,166,92,82,67,100,105,117,144,145,128,91,112,114,97,116,100,102,67,83,83,128,93,80,99,130,121,142,101,107,108,140,130,95,100,112,99,103,105,122,124,127,126,102,120,49,87,79,111,92,60,85,63,46,16,71,100,93,88,96,101,104,119,110,155,177,70,66,81,76,85,91,170,193,138,157,95,33,20,67,90,59,94,64,73,104,144,163,156,157,113,117,77,77,85,115,66,76,100,118,119,118,190,180,123,124,122,93,71,86,93,132,136,100,131,137,124,91,83,58,73,126,148,117,111,65,58,80,79,110,123,142,144,161,138,144,133,130,118,136,101,111,100,60,27,76,115,147,89,114,113,102,129,68,103,79,117,117,93,63,92,92,79,97,75,88,87,87,86,95,91,108,107,97,42,70,19,23,17,20,65,27,28,35,41,37,44,42,66,131,166,77,32,64,109,86,66,96,86,78,73,73,71,90,82,73,93,78,35,40,25,4,29,13,0,32,37,52,68,64,63,52,49,13,25,49,14,48,25,56,36,42,21,34,15,5,36,7,49,26,12,34,57,25,40,75,51,33,41,48,40,46,45,71,78,97,88,73,72,78,66,54,22,22,32,13,17,35,32,25,26,31,42,32,30,38,42,36,36,13,47,56,47,73,48,30,47,54,20,54,43,48,21,41,36,34,21,60,50,39,32,2,6,21,30,25,51,38,48,55,27,59,42,33,35,37,54,30,47,21,46,21,6,40,30,16,16,35,15,23,38,17,26,46,25,22,26,46,12,25,28,20,44,43,25,26,53,70,77,56,22,14,14,36,51,29,50,41,38,25,26,33,20,18,30,14,24,26,16,23,37,22,28,31,47,39,38,21,38,38,24,54,49,40,24,43,32,17,30,38,42,26,25,62,15,18,11,29,41,42,62,44,17,38,14,29,35,30,56,33,63,28,47,28,35,42,46,21,41,40,46,26,25,40,22,8,14,23,31,14,30,34,40,175,248,247,251,204,207,181,182,178,199,216,202,188,167,134,115,110,115,120,134,148,124,130,161,159,156,169,135,115,148,202,210,203,214,232,231,228,231,230,223,251,222,233,247,239,238,234,252,247,216,226,235,229,223,228,232,223,221,216,220,206,191,223,217,212,211,217,107,6,2,4,18,7,15,15,9,32,48,6,0,26,6,10,2,18,12,9,5,31,12,13,22,173,201,211,136,23,47,65,51,79,60,51,44,52,135,208,134,118,150,63,36,20,32,19,36,53,28,29,51,106,181,181,149,147,165,176,208,195,202,184,182,185,192,164,151,169,167,155,51,31,47,138,167,146,118,59,61,55,173,187,152,164,154,163,129,145,75,15,97,168,180,177,202,178,157,161,135,147,173,166,145,104,147,166,117,98,34,13,28,118,177,153,183,119,61,60,68,75,187,180,124,122,155,145,123,82,32,65,121,103,82,110,77,78,85,85,106,123,172,220,177,211,181,205,199,182,228,175,176,117,66,48,89,179,238,241,246,249,255,245,232,243,246,242,255,243,239,254,247,250,235,254,235,250,251,249,239,255,253,234,237,247,234,234,244,253,249,247,255,234,243,249,245,247,234,255,252,255,247,240,235,243,246,249,242,255,251,249,248,90,39,32,19,93,85,87,127,151,65,93,115,129,177,120,120,126,115,105,104,85,69,84,84,68,37,61,76,96,103,111,136,124,98,146,130,94,136,140,134,91,82,66,117,97,88,133,137,89,94,101,86,82,39,67,85,73,61,63,87,137,127,126,95,98,102,135,138,144,146,143,148,183,124,68,107,144,123,145,173,147,105,138,84,19,44,66,108,64,71,76,78,116,147,155,158,146,132,164,147,104,109,103,67,86,108,106,116,131,154,138,95,118,119,93,43,79,139,131,137,80,79,84,99,70,101,63,83,129,104,60,8,12,67,139,128,117,144,154,151,134,136,120,111,133,95,89,76,91,78,106,95,106,124,90,109,87,114,91,97,114,106,100,81,105,85,59,97,104,96,99,111,94,105,105,106,87,94,103,70,50,41,29,26,13,34,52,82,41,47,14,14,22,49,42,18,68,172,150,75,47,77,87,83,70,87,60,60,57,107,84,82,65,57,16,22,4,10,27,12,25,77,62,76,68,66,53,81,50,57,34,15,37,16,34,11,26,19,28,24,41,52,27,26,23,56,36,24,49,46,43,37,21,54,27,54,24,39,27,52,30,67,71,86,61,71,94,72,67,55,35,9,25,19,7,31,21,36,20,39,30,7,57,43,37,39,27,44,40,45,40,45,27,49,37,50,46,28,50,32,38,44,27,42,14,36,28,50,50,33,47,42,44,41,15,48,36,36,50,48,60,35,30,52,17,27,62,18,34,10,18,12,4,15,7,11,13,7,25,27,31,47,48,46,27,23,12,36,26,18,32,23,41,50,54,88,55,28,18,21,16,33,50,64,36,41,44,21,46,13,20,31,17,5,28,22,20,50,42,32,45,28,13,26,59,45,28,8,25,33,21,28,34,38,29,12,20,29,42,31,39,17,18,18,38,36,30,43,13,67,42,36,25,32,28,32,24,11,25,56,43,63,39,40,28,44,29,40,48,29,27,37,11,43,37,49,38,9,42,66,249,236,246,245,239,255,221,200,176,173,182,187,188,210,207,152,124,144,121,158,159,182,127,169,158,157,168,155,138,160,175,170,138,160,141,150,126,165,158,139,159,166,172,175,187,188,157,178,197,190,167,172,192,194,210,207,203,223,201,198,194,180,215,208,231,226,232,134,11,1,6,11,11,27,12,9,6,20,19,10,15,13,13,6,15,37,26,8,8,5,30,28,103,102,64,46,28,72,60,86,97,74,78,109,90,124,170,73,74,117,21,38,37,36,21,27,29,7,45,74,87,123,133,125,173,169,187,188,185,177,174,164,169,161,170,162,149,128,102,29,37,67,126,161,119,149,138,136,71,120,149,157,103,86,155,131,65,35,26,80,135,177,177,223,223,185,188,123,102,177,173,128,116,180,186,142,122,39,31,58,96,182,177,206,132,23,62,28,56,125,126,105,77,134,107,61,62,70,76,127,60,71,99,66,83,119,179,215,183,188,215,181,163,174,196,166,172,175,167,147,125,77,166,249,246,244,242,234,236,239,249,252,224,245,255,255,237,247,249,243,255,250,245,254,255,231,241,253,237,254,255,245,250,231,243,245,222,238,236,249,245,255,249,255,240,248,242,239,252,250,251,222,252,251,252,243,244,236,230,114,86,63,74,100,70,74,38,98,114,61,124,51,132,116,96,118,107,65,88,70,69,73,100,114,90,77,79,102,117,125,172,155,102,83,138,130,128,98,105,97,73,106,99,93,80,83,137,135,132,102,103,72,70,80,23,29,67,62,105,110,114,133,156,148,115,140,132,144,123,116,135,127,135,121,49,53,103,127,66,92,69,46,63,60,60,16,77,90,22,29,88,113,129,144,128,119,145,158,211,170,144,115,63,59,36,68,72,106,123,112,133,97,124,90,89,26,46,145,147,129,102,49,77,111,107,128,86,80,54,8,27,9,68,112,117,98,117,71,79,91,95,105,104,85,64,61,95,90,65,77,89,91,102,114,79,88,113,88,98,96,103,73,103,135,125,123,70,80,127,83,62,84,103,92,96,77,101,77,53,8,17,28,32,17,62,56,88,108,98,59,36,18,11,35,12,15,53,144,179,91,57,73,69,65,78,69,64,54,66,77,71,47,28,40,16,42,10,25,24,55,64,62,75,92,73,55,79,72,66,47,21,47,27,25,32,32,23,23,21,37,45,32,38,56,50,38,30,42,33,44,45,34,48,32,28,37,29,45,27,47,48,110,63,57,52,67,74,81,79,29,2,26,1,14,38,39,42,17,32,18,8,27,12,35,27,30,22,29,34,33,22,24,51,41,44,71,35,53,32,41,27,35,36,30,20,25,26,58,43,59,35,7,19,39,24,42,30,12,48,52,23,75,50,27,36,32,27,32,36,7,36,14,14,4,18,42,24,16,26,17,52,38,40,22,26,24,37,27,17,35,11,62,25,21,39,72,96,46,0,14,5,6,36,33,27,70,49,26,40,24,16,29,41,18,31,18,41,17,4,28,32,41,36,51,39,30,27,27,21,36,44,45,31,14,45,29,27,39,35,37,18,26,47,6,61,49,27,11,74,51,44,49,14,40,47,24,46,32,8,31,8,43,33,35,38,36,38,37,16,61,8,31,16,13,38,34,8,33,19,100,247,228,251,248,246,244,223,245,227,178,181,169,182,175,226,221,238,221,198,188,192,173,189,195,164,175,173,162,162,185,166,141,107,37,18,5,27,32,60,39,49,35,27,53,45,50,66,63,79,112,79,67,96,114,97,113,117,134,141,157,127,128,147,166,166,169,190,106,8,0,25,27,26,19,14,15,11,12,13,16,10,4,7,35,37,17,22,14,2,16,27,8,112,76,44,33,61,110,148,160,133,125,124,134,92,182,183,82,68,62,30,45,43,35,35,23,28,20,60,78,113,113,154,134,156,163,192,193,161,156,155,180,183,165,159,159,99,101,16,25,30,78,76,82,110,147,228,172,55,75,108,131,133,121,151,58,30,30,41,87,161,156,176,231,182,188,182,108,92,162,218,135,121,217,254,180,164,125,62,53,100,197,178,167,134,32,41,39,35,99,164,115,164,166,79,38,79,88,87,102,80,90,106,96,87,159,240,239,210,148,183,171,156,136,109,125,85,112,133,115,218,250,248,249,223,252,255,255,244,249,236,251,232,248,255,253,250,244,230,252,249,241,255,246,251,255,239,254,246,224,255,242,238,248,237,246,223,244,251,242,220,255,254,240,249,252,225,252,252,252,249,255,255,243,243,228,160,158,84,59,71,76,71,70,78,69,56,81,106,119,120,110,110,103,86,65,91,67,80,94,87,85,129,142,132,95,83,102,129,100,128,147,133,131,130,141,115,66,84,67,77,107,110,85,82,103,92,108,124,111,107,77,88,74,75,41,37,105,104,88,97,98,88,64,41,81,81,29,56,36,31,58,90,146,60,10,33,34,36,48,19,46,57,64,51,36,62,56,39,42,106,137,151,145,133,103,133,136,186,194,156,150,139,115,105,87,62,70,88,101,86,116,159,105,95,19,38,54,92,123,107,61,49,94,144,115,58,36,32,8,69,95,115,82,103,76,63,42,45,56,69,78,59,65,49,54,83,50,75,58,63,74,86,97,101,108,93,117,107,87,68,124,129,150,133,97,53,46,61,67,21,39,49,77,116,75,39,38,31,21,25,16,18,47,71,107,118,121,146,117,51,23,30,27,51,82,82,111,143,138,50,30,53,56,69,91,78,45,91,76,70,39,39,25,10,21,9,39,34,64,52,64,60,47,96,56,79,69,69,38,31,78,25,41,57,25,38,16,50,37,65,46,32,32,23,27,31,31,20,30,53,49,42,46,66,31,16,44,45,23,65,64,52,49,76,52,15,23,48,21,6,33,30,39,31,42,36,6,31,25,51,34,61,19,26,15,8,28,22,19,9,57,26,11,36,27,41,30,29,23,33,40,31,34,18,57,40,41,62,43,28,58,37,40,15,47,11,33,28,25,30,21,24,18,20,22,23,33,8,39,55,23,27,35,4,21,23,20,2,30,13,31,16,25,36,52,6,26,17,25,17,36,28,42,28,65,87,97,31,21,26,23,32,35,30,37,38,43,22,47,27,19,35,20,18,20,15,32,28,40,33,38,7,26,26,17,36,4,40,21,42,40,49,18,38,38,18,7,17,51,20,13,20,23,26,54,26,39,48,57,11,37,25,8,54,39,33,28,3,19,23,41,35,37,43,23,21,33,50,46,23,19,16,29,26,26,31,49,4,134,234,246,252,246,234,253,239,239,214,209,225,203,190,168,186,220,253,248,174,149,182,175,191,161,148,152,162,153,166,163,130,122,79,27,7,10,6,34,30,0,9,22,31,16,18,31,10,12,36,16,21,41,25,10,28,7,11,3,33,42,41,51,57,76,89,90,119,91,30,4,31,8,9,17,25,17,11,22,21,36,38,15,27,11,7,14,32,23,23,11,16,14,139,99,53,89,95,101,151,160,155,137,155,170,124,193,218,76,50,59,27,55,21,53,34,26,37,43,31,80,125,146,134,139,202,186,190,165,172,162,163,158,166,162,147,151,96,57,21,31,67,56,53,50,55,88,205,126,42,81,96,138,156,106,106,31,15,28,88,149,189,193,197,217,159,161,178,129,94,174,219,160,112,218,247,202,172,180,106,67,115,193,123,142,60,43,76,41,86,63,143,174,167,133,57,54,91,96,80,132,109,116,111,122,124,142,202,193,140,61,74,88,87,75,58,64,81,104,142,171,244,244,254,248,254,254,247,236,240,254,249,246,244,253,237,239,255,246,253,245,240,255,240,246,250,245,252,243,252,252,245,249,245,249,254,226,238,243,241,251,242,245,254,250,244,247,248,247,252,245,230,243,252,244,253,241,155,85,34,48,54,34,41,71,74,65,73,99,109,135,169,141,115,102,103,101,106,120,116,99,100,117,159,103,94,93,76,79,127,97,113,122,115,119,129,125,112,60,23,49,81,98,120,78,107,61,101,112,107,110,82,73,106,117,89,61,34,57,74,56,40,35,28,35,19,52,49,19,35,15,25,7,42,153,79,27,25,40,51,57,57,55,54,73,46,63,76,64,40,60,112,158,178,148,158,168,163,161,165,155,181,134,138,124,104,113,80,84,123,123,104,122,91,114,90,37,15,45,19,51,83,79,85,130,97,44,4,6,22,88,125,134,142,129,88,27,22,35,57,47,42,12,44,35,45,14,28,41,50,42,34,43,68,108,86,88,81,122,108,105,104,140,139,128,70,44,62,43,46,41,63,48,39,42,47,63,7,26,18,18,19,51,65,99,135,120,123,114,135,115,107,93,114,76,31,68,31,116,100,154,116,60,57,61,61,70,61,56,55,15,42,28,18,16,11,26,26,73,88,65,61,73,44,76,93,59,68,46,56,37,22,61,42,36,31,49,28,21,42,30,32,47,44,39,46,41,52,49,41,20,42,52,24,47,34,47,14,66,26,42,48,21,19,14,46,48,15,33,34,11,19,20,9,31,45,9,18,29,26,50,28,40,26,26,20,46,48,17,8,34,33,33,36,31,37,29,30,24,37,35,44,41,2,9,24,10,37,44,31,10,38,5,19,16,24,2,32,34,29,0,29,26,39,32,28,41,25,22,7,34,40,37,7,31,37,23,22,23,25,20,28,20,18,18,40,38,13,35,22,50,25,14,26,24,31,39,50,102,84,47,6,18,31,9,10,43,34,35,40,2,11,38,23,20,21,33,14,24,19,16,27,10,38,25,24,20,41,45,22,31,32,21,35,29,50,10,23,12,29,23,38,38,31,9,38,10,64,49,59,45,35,37,8,35,41,25,60,56,31,29,42,33,41,31,40,52,23,23,32,34,24,28,40,36,36,23,9,50,34,171,234,251,238,252,236,249,246,246,246,223,248,209,250,227,219,163,220,188,79,57,84,86,74,98,83,74,83,51,85,69,88,116,99,144,143,137,142,124,79,105,131,90,69,56,47,75,61,66,56,42,36,18,17,49,62,15,24,25,10,25,16,28,20,25,33,32,23,55,11,15,7,20,13,17,21,12,3,23,2,27,14,29,2,13,26,5,13,27,18,8,26,23,82,100,92,157,115,115,85,89,106,75,117,175,130,215,199,101,97,53,23,25,38,18,30,23,26,54,69,68,126,208,222,215,220,221,225,216,220,203,208,205,205,216,231,145,78,118,125,95,102,97,121,97,67,70,109,99,74,126,151,103,89,82,89,45,54,58,118,196,210,185,225,236,156,154,204,152,133,174,228,171,98,187,244,211,160,173,159,114,142,176,63,53,44,74,94,78,60,63,63,103,85,77,53,54,126,117,73,107,104,117,90,119,115,99,105,110,51,26,26,58,52,75,104,145,231,237,245,255,237,254,232,249,255,255,243,240,255,249,233,251,243,255,250,254,248,245,253,248,229,238,236,255,243,246,235,225,248,255,250,250,252,231,255,252,254,254,234,230,246,254,234,236,254,246,251,252,242,253,231,252,254,243,243,245,128,93,56,44,110,103,83,81,86,46,38,105,119,134,184,148,133,128,119,120,101,70,70,104,101,109,102,88,97,86,90,62,80,105,95,109,93,106,104,149,81,72,33,30,44,39,76,59,44,47,101,143,127,102,99,78,118,92,111,54,39,54,45,44,46,59,29,32,43,56,79,68,45,33,82,44,71,163,114,44,95,89,91,59,44,58,70,109,69,51,64,118,161,164,171,175,116,109,162,153,166,126,150,96,57,64,4,11,20,66,102,58,81,65,36,38,39,57,80,43,12,15,11,50,116,82,86,54,10,4,15,78,113,135,132,138,161,113,55,23,22,19,27,34,27,34,37,45,32,24,13,43,40,43,13,46,43,75,74,94,81,110,93,107,116,94,55,10,43,61,42,22,43,38,26,38,24,28,12,25,25,38,12,20,64,77,104,110,110,156,99,110,122,107,159,182,171,72,27,61,74,73,79,129,159,70,48,39,77,53,95,53,17,31,16,17,29,19,41,70,67,84,86,85,71,72,92,29,62,47,43,70,60,35,39,34,46,73,15,56,23,25,47,62,35,33,68,39,35,49,52,71,15,40,32,30,58,33,21,52,45,25,34,38,53,67,40,23,27,9,36,35,9,39,18,23,20,5,26,42,2,11,38,30,23,16,41,12,34,34,20,20,16,21,39,50,21,15,22,33,34,16,15,21,28,28,30,33,17,19,9,37,28,13,21,16,21,32,18,36,24,54,19,42,17,16,27,12,7,5,33,4,22,41,29,19,37,38,6,30,22,23,13,14,23,29,24,35,11,17,16,24,31,27,50,33,20,25,12,28,23,59,94,79,35,16,17,30,19,41,21,27,24,55,13,27,10,26,42,44,7,22,19,37,41,47,14,38,33,18,34,17,16,31,38,36,20,39,30,18,51,18,45,28,18,33,15,32,23,39,39,31,9,40,44,21,26,17,44,32,59,23,12,30,11,20,5,36,32,17,38,39,42,34,26,35,21,44,49,13,17,31,46,201,251,255,243,247,243,233,233,232,246,230,240,230,246,218,216,118,70,54,29,0,15,3,12,18,25,25,32,18,22,23,71,154,192,228,235,233,236,241,224,238,223,214,232,207,175,194,189,205,170,169,174,195,162,168,198,166,151,115,134,154,120,117,100,88,71,52,39,54,32,10,6,4,8,6,18,3,26,17,8,25,0,7,13,5,9,13,19,5,18,26,11,0,154,133,161,173,164,134,100,111,96,88,93,121,132,149,183,67,67,38,50,41,23,31,48,35,42,29,26,60,159,220,253,238,243,251,243,242,231,188,244,222,227,233,234,129,111,168,130,134,148,135,131,109,157,122,136,125,133,167,249,130,49,64,105,122,117,126,187,205,240,211,218,220,164,137,131,117,129,186,127,85,72,73,165,107,108,155,156,148,147,176,94,73,68,88,125,81,87,77,76,53,44,63,83,93,93,128,104,130,97,116,114,81,82,108,140,109,129,65,50,72,98,117,165,200,224,238,252,246,250,253,229,235,247,238,255,249,250,246,226,255,246,254,248,240,236,243,238,253,226,254,251,219,254,250,247,237,242,245,255,255,255,240,245,240,233,152,144,175,151,172,154,154,137,137,155,135,157,153,183,145,145,130,139,118,50,58,60,110,125,100,125,104,77,69,75,103,134,139,131,110,112,94,99,94,69,57,71,64,81,97,63,103,96,85,60,94,98,103,116,104,54,84,97,75,100,59,14,21,20,55,54,42,47,36,121,134,119,106,62,103,91,105,56,34,13,23,32,26,45,70,59,49,53,105,155,141,126,134,141,146,110,162,167,96,172,135,102,95,113,114,141,82,40,13,75,127,108,171,127,136,116,62,88,75,91,69,24,22,32,16,25,23,6,1,30,57,83,73,87,89,118,167,232,233,234,251,249,178,242,146,38,10,9,41,100,133,152,135,144,124,142,92,34,52,67,96,61,85,111,58,78,94,56,43,74,73,22,34,38,41,62,76,94,81,63,72,83,57,49,41,39,29,28,50,37,41,18,30,21,38,37,14,17,41,21,21,53,64,103,126,106,102,139,108,104,101,105,87,119,105,101,63,29,42,76,86,82,103,134,179,96,20,33,45,19,5,21,26,28,18,36,34,58,74,87,82,81,87,78,79,63,53,99,117,109,123,112,69,73,92,96,61,80,68,52,69,69,76,106,81,70,53,53,56,79,80,70,49,97,51,71,121,77,79,91,77,47,60,53,41,24,24,8,24,28,27,12,17,24,49,3,29,39,13,12,25,13,19,53,32,38,30,27,50,28,21,26,52,28,33,33,27,23,20,30,54,32,23,33,16,34,51,18,22,15,32,11,44,41,19,12,18,8,2,33,48,47,39,19,16,37,45,15,20,22,15,26,38,36,26,33,29,33,26,6,27,21,18,21,19,11,21,36,11,25,15,12,24,26,20,32,1,18,34,7,30,37,112,56,21,36,9,13,38,20,28,15,18,14,20,36,25,16,32,14,27,39,14,6,27,48,29,17,29,17,37,2,35,13,28,37,19,24,17,37,7,12,38,20,8,7,23,6,22,37,21,22,17,30,17,31,32,20,41,18,27,16,37,28,19,28,39,52,47,36,57,13,56,32,48,14,22,36,42,39,14,62,238,245,252,240,224,243,234,248,242,247,246,225,242,245,228,210,129,18,2,3,12,18,30,36,50,51,77,48,52,57,24,96,165,232,232,249,250,243,195,234,232,201,198,215,220,230,198,229,202,235,227,235,215,250,219,230,225,221,218,224,221,235,215,186,183,178,142,169,92,12,0,23,10,26,8,42,6,11,1,26,14,24,9,22,10,9,19,22,15,27,21,19,20,176,129,125,144,166,143,140,95,86,78,53,88,107,198,180,50,38,19,13,53,58,33,18,4,14,23,66,53,97,179,178,167,165,167,175,165,98,114,157,194,136,116,107,74,79,139,97,120,83,100,99,69,101,96,84,79,115,168,236,83,41,99,124,182,205,239,255,254,238,186,154,150,168,131,101,80,84,145,107,72,57,66,61,48,90,101,68,68,133,152,97,100,78,84,92,94,100,99,102,72,90,86,58,80,114,78,97,110,107,98,81,85,64,45,96,141,136,106,61,75,37,102,149,196,159,115,169,207,197,202,243,235,249,236,239,245,254,249,239,248,246,250,255,236,254,246,241,240,252,254,250,247,252,253,241,253,244,255,248,239,235,252,189,192,239,24,24,13,41,54,38,112,25,28,91,27,60,47,6,43,81,52,55,44,49,84,111,117,91,59,80,62,76,91,84,111,102,131,94,113,122,127,116,96,86,90,83,57,94,119,94,92,92,62,78,86,104,83,94,105,54,59,50,80,87,54,40,56,32,51,88,47,56,53,120,111,100,124,61,71,78,84,81,44,59,50,52,50,0,60,62,59,40,124,135,110,114,109,141,120,90,111,130,98,134,97,89,95,107,116,115,76,70,31,59,63,41,67,26,13,34,15,25,16,18,17,10,8,16,31,81,124,139,186,206,230,252,251,234,253,247,244,229,245,232,241,250,230,252,124,54,14,105,114,140,177,128,101,98,100,113,107,117,117,138,142,173,154,163,146,159,153,141,122,136,79,46,35,94,70,71,77,60,104,89,72,52,57,77,34,18,29,26,43,46,33,34,47,12,53,21,32,34,14,69,37,80,76,95,91,73,103,72,59,66,75,81,26,41,33,27,38,47,63,85,82,73,76,131,167,158,56,28,4,28,32,6,35,22,50,74,70,87,64,50,88,63,62,57,62,55,80,120,175,160,206,174,147,131,140,153,151,143,144,148,167,156,153,132,141,164,159,165,169,165,167,164,137,157,150,165,148,129,172,157,157,146,130,76,50,22,9,2,16,12,20,40,26,41,15,23,44,27,50,35,35,59,27,74,37,38,40,64,74,70,82,29,45,50,14,38,31,30,38,30,39,26,22,41,25,15,13,16,2,10,5,18,17,36,12,10,25,16,26,14,25,11,18,39,24,32,44,24,25,40,33,37,53,25,32,40,30,29,20,31,33,23,25,10,25,29,13,21,40,15,16,16,55,14,21,11,7,28,16,31,51,37,92,109,68,38,37,27,27,34,12,35,23,11,25,23,38,31,30,47,15,4,20,12,27,17,14,5,18,20,36,17,21,26,29,27,29,13,42,19,32,41,12,19,38,15,10,30,27,20,22,8,43,3,28,32,27,27,14,38,22,10,26,41,22,26,40,32,30,20,18,21,29,47,24,47,48,27,13,20,30,97,234,249,238,255,247,249,245,230,255,238,250,240,236,228,212,208,164,119,82,28,13,27,30,45,48,44,100,82,74,100,73,96,131,129,205,182,194,198,177,175,153,170,167,163,173,186,178,191,195,189,222,207,228,179,215,217,194,171,187,212,234,192,213,208,197,209,197,218,130,6,0,8,12,11,3,11,24,12,22,1,1,8,3,18,7,1,28,9,39,22,37,0,24,154,82,66,96,137,142,126,103,125,109,98,96,129,172,184,39,46,61,17,13,36,43,30,33,37,56,50,53,70,72,54,70,88,79,94,79,40,50,97,85,38,33,42,20,62,59,63,81,71,61,88,55,48,46,50,70,28,139,227,79,41,73,163,206,235,233,228,175,122,45,69,149,198,212,214,169,148,189,161,183,118,86,92,128,86,94,70,55,44,61,54,85,78,74,62,68,90,84,117,103,87,95,88,80,57,85,94,102,89,98,63,77,59,43,81,72,90,63,61,47,62,71,106,136,79,5,19,9,27,16,43,34,54,96,157,123,76,102,107,102,107,95,124,90,67,85,62,61,84,89,37,49,75,50,53,59,104,157,83,116,45,45,2,139,189,28,29,37,39,7,111,214,53,10,28,18,16,32,21,25,63,46,86,73,67,116,106,118,69,37,100,44,26,61,81,81,100,141,125,132,91,92,112,127,115,104,107,92,87,122,110,105,64,81,59,30,58,75,55,80,42,55,50,53,64,45,48,49,16,56,82,44,10,65,83,112,120,111,52,82,87,74,91,65,110,126,130,115,86,120,89,76,78,136,132,56,43,22,100,67,41,32,58,74,77,108,64,70,44,70,66,68,66,31,41,22,17,40,31,25,39,18,39,143,119,163,154,243,253,246,237,255,252,255,244,254,249,255,249,240,245,246,225,211,142,172,103,68,94,41,87,140,124,150,148,109,106,107,115,113,120,92,81,99,122,142,138,138,118,129,136,156,130,137,151,71,20,12,52,88,91,103,56,63,53,98,105,83,99,106,75,49,53,54,10,32,36,41,31,43,38,44,46,60,35,37,50,92,66,46,61,70,69,101,69,88,54,52,31,14,39,40,35,54,96,118,72,85,71,138,191,116,46,15,8,23,10,28,24,67,72,67,97,98,91,58,71,73,60,86,82,54,72,101,96,159,113,126,102,112,114,114,144,126,126,110,136,134,102,123,124,133,121,143,122,140,119,106,127,93,138,94,121,110,104,96,94,104,51,47,10,8,18,6,9,15,13,41,9,21,27,42,63,48,57,73,59,91,69,56,80,98,109,132,164,131,78,68,53,44,73,42,54,30,57,43,20,54,28,28,42,31,46,16,28,21,14,14,41,25,37,30,52,33,27,17,9,22,39,29,22,12,30,34,23,26,29,32,13,36,20,20,30,14,15,41,4,9,38,34,37,28,32,26,14,37,41,3,34,31,27,5,35,20,26,23,26,63,101,89,43,27,16,26,40,26,21,21,46,33,36,37,6,4,32,27,26,42,21,24,42,33,43,25,24,23,9,17,25,25,29,28,32,33,5,57,23,20,29,8,22,44,28,48,39,19,27,55,37,6,29,32,17,3,22,49,7,56,32,17,36,21,28,27,37,25,21,9,35,13,24,45,29,30,42,4,115,230,169,213,226,239,255,249,245,226,253,215,241,248,237,212,233,239,244,224,196,162,117,29,66,38,54,54,72,49,43,30,27,47,74,103,136,150,161,156,138,176,161,168,176,152,177,167,170,177,178,175,195,166,172,194,177,158,176,180,177,181,170,153,172,173,183,197,153,138,17,4,17,3,7,27,17,37,2,35,11,7,13,7,24,11,19,7,11,2,14,15,38,8,123,107,38,64,84,71,103,150,177,164,176,147,179,185,192,138,188,131,108,134,96,144,84,100,64,49,72,81,95,88,77,81,98,90,108,118,74,36,54,15,20,10,51,51,49,63,34,60,66,73,87,69,84,71,67,78,87,114,199,107,77,143,113,159,178,131,89,52,34,46,92,142,207,247,239,249,248,237,227,250,219,100,83,130,98,92,83,69,65,59,39,58,77,89,69,77,90,80,65,88,53,85,92,106,102,86,106,74,87,94,130,128,91,37,73,60,98,80,99,85,70,43,54,62,64,41,76,9,18,24,28,18,22,19,89,94,13,14,25,32,26,35,47,36,35,32,42,6,64,79,44,39,24,14,25,26,65,153,75,26,24,22,6,150,245,28,35,39,43,39,149,246,92,44,99,45,64,52,43,41,97,70,112,101,113,101,71,75,47,61,47,49,39,5,48,64,90,93,113,106,115,102,121,102,100,130,77,88,67,73,139,114,71,79,77,63,23,85,73,29,52,50,85,47,31,17,52,57,74,49,85,43,73,78,143,182,155,104,93,113,102,119,139,146,162,168,203,178,191,211,181,161,113,125,122,14,60,10,14,48,14,23,19,16,23,32,38,6,9,11,35,39,70,68,116,138,149,195,236,247,240,242,230,253,234,243,221,247,250,255,239,246,236,234,226,221,161,153,129,98,102,84,31,17,16,23,21,7,54,66,143,156,146,133,101,104,97,127,121,143,74,118,95,123,97,97,129,93,121,109,108,103,123,127,78,22,17,11,73,95,108,103,65,94,89,111,94,102,101,132,105,49,36,32,42,20,36,32,15,29,29,36,24,23,25,36,23,63,40,63,40,74,83,82,74,64,64,43,53,35,33,42,27,68,84,89,80,69,71,43,140,165,89,10,14,40,45,48,76,68,65,77,99,89,73,64,92,67,48,83,98,59,76,57,32,52,20,33,20,47,16,33,21,26,34,7,53,49,36,39,26,21,13,27,6,29,30,44,20,49,32,10,39,31,8,24,39,53,30,7,40,30,25,16,21,23,32,15,34,26,4,13,3,10,13,57,22,13,6,61,36,35,42,85,143,112,89,20,40,62,42,57,36,29,23,41,23,24,14,36,16,4,9,36,24,19,24,13,41,29,15,16,9,28,33,7,23,26,18,46,46,39,26,12,20,30,22,12,29,35,40,23,16,14,30,29,9,21,37,17,46,11,34,3,38,20,16,14,39,21,26,8,34,32,26,16,27,17,60,90,93,36,40,26,31,6,29,32,38,35,35,22,28,31,51,24,27,41,26,19,51,32,41,33,9,25,28,13,39,31,29,43,32,41,19,29,18,47,35,35,15,28,17,13,13,9,15,19,31,25,23,60,39,50,37,41,41,25,40,41,28,52,41,57,10,46,31,38,36,21,39,31,42,41,36,23,71,95,96,106,164,203,234,232,213,198,197,213,203,220,210,172,206,241,255,240,249,211,209,156,97,88,63,45,36,44,49,33,27,20,19,35,46,79,105,129,174,177,201,172,179,189,195,199,192,205,203,197,171,183,149,180,175,125,177,182,151,156,154,165,134,153,159,169,177,116,15,1,7,18,19,8,7,3,12,17,33,4,11,4,0,27,12,14,11,3,28,9,20,21,153,132,73,78,90,61,108,171,239,229,221,220,235,230,246,230,216,213,194,200,222,207,219,166,105,102,115,115,139,113,147,153,168,82,128,133,111,114,80,80,59,59,69,96,134,87,77,111,97,107,101,105,123,130,167,156,156,173,233,192,194,203,167,197,165,151,107,145,144,110,111,151,210,224,255,239,222,194,244,243,128,117,104,125,72,36,73,81,96,69,42,64,60,92,103,88,46,72,53,95,84,78,100,77,97,104,75,56,85,159,155,206,176,82,57,52,103,113,173,128,106,70,70,130,156,183,177,112,178,203,210,164,154,110,250,250,155,143,182,197,185,200,205,166,191,168,176,181,234,255,185,171,161,152,166,164,200,248,168,143,142,154,192,243,236,42,84,101,134,116,196,241,87,116,123,98,115,102,90,142,132,71,77,135,95,61,38,50,79,43,53,11,13,59,90,117,120,98,79,101,125,122,94,87,80,69,101,116,84,154,136,83,115,158,101,49,60,52,48,70,30,24,71,67,48,27,36,40,77,116,122,145,160,175,179,179,119,90,54,130,144,177,165,163,181,158,190,182,212,198,184,148,123,72,14,12,7,5,16,29,25,7,9,47,23,94,122,122,164,210,226,238,255,248,255,253,227,252,255,250,235,241,251,249,246,247,224,191,190,180,150,115,118,13,29,4,13,29,2,14,33,29,11,10,37,49,18,5,91,119,154,149,124,92,93,100,109,122,103,124,95,100,106,134,127,111,128,127,100,114,120,102,115,126,73,0,20,47,104,97,101,88,88,94,91,96,93,123,89,119,130,93,49,28,12,31,51,7,19,30,33,39,34,38,60,37,58,63,28,73,83,69,61,69,52,73,67,64,29,32,40,49,39,73,71,85,61,44,44,5,37,147,151,49,48,29,63,55,71,74,74,83,97,79,56,72,63,77,77,84,69,63,65,47,31,11,41,16,22,1,17,12,39,38,12,35,33,4,7,22,19,13,9,26,12,18,42,13,17,30,24,41,42,8,40,14,18,29,23,20,8,32,18,32,16,28,63,1,10,27,24,34,35,27,34,21,16,16,30,13,27,39,18,16,72,109,70,53,4,19,8,17,41,33,14,18,28,12,6,17,33,12,10,22,23,16,14,35,21,17,19,8,4,9,33,43,40,15,14,26,32,11,0,16,36,11,14,23,13,9,34,26,28,17,23,25,20,15,16,30,18,34,13,30,12,25,2,24,28,34,28,23,10,23,50,20,10,35,39,60,101,48,33,19,7,20,28,39,19,13,30,15,48,40,49,25,18,27,17,31,31,39,57,33,19,34,43,17,53,37,57,17,49,28,34,18,28,16,31,6,21,17,17,14,19,12,42,22,11,38,51,32,53,47,38,29,36,67,49,26,33,38,22,48,53,51,43,45,34,49,56,39,47,60,45,65,42,66,46,50,62,115,128,138,120,168,153,148,169,153,168,139,169,161,167,232,210,234,226,170,146,157,135,130,107,114,146,125,106,108,88,102,98,85,109,127,153,166,189,167,170,165,153,200,165,193,192,199,188,173,162,164,173,179,172,146,175,145,143,163,178,153,155,175,172,123,23,18,0,2,16,11,20,17,6,14,0,1,2,26,12,4,6,14,13,3,21,30,24,23,232,211,119,165,114,119,132,149,216,219,186,213,224,219,173,156,201,182,132,141,159,196,232,155,110,111,103,128,114,115,162,160,178,164,152,158,146,131,124,137,90,76,121,144,164,191,165,201,152,161,178,183,231,245,238,238,245,248,244,253,245,240,255,247,229,235,253,237,244,255,243,247,241,207,209,202,231,198,216,188,77,91,144,158,85,60,59,56,112,95,69,66,70,55,59,64,38,71,63,97,68,80,90,88,81,79,68,59,99,119,162,160,120,60,71,81,89,95,92,104,94,94,130,157,132,227,183,150,215,227,241,233,188,200,255,241,218,199,223,206,194,239,228,206,227,234,234,223,251,244,207,241,242,241,237,233,176,235,228,191,244,238,194,251,195,56,163,177,171,117,148,251,112,116,132,96,120,82,102,91,137,106,104,142,92,79,83,122,122,105,66,37,104,119,112,86,94,97,140,139,111,95,76,74,26,76,66,120,106,102,133,75,100,146,121,69,29,56,93,85,45,50,47,48,31,46,64,91,147,170,147,138,168,147,166,123,78,38,90,105,108,132,112,141,144,176,164,144,139,123,104,102,128,99,55,43,109,139,170,159,194,224,253,239,252,222,250,238,238,240,253,233,241,253,253,236,233,232,198,181,161,128,84,75,65,61,33,22,6,14,44,29,16,26,25,18,28,33,23,13,44,25,41,28,60,23,34,27,65,87,160,110,73,71,79,97,117,121,148,155,61,77,86,85,81,112,128,99,115,123,137,99,120,103,45,106,123,112,112,92,104,116,70,111,110,76,70,116,99,105,110,56,51,35,34,36,26,44,48,17,42,57,61,63,38,35,67,70,76,58,58,61,77,56,73,65,63,72,19,55,40,20,22,82,72,39,6,21,25,16,47,56,145,152,70,34,67,83,82,67,55,31,79,84,71,65,111,90,66,89,48,51,56,23,8,10,51,12,12,3,56,0,41,4,27,8,23,31,35,16,18,36,25,17,11,36,12,36,46,6,16,6,14,14,26,6,19,16,14,19,25,0,33,9,21,9,25,17,16,1,9,34,14,22,23,18,26,35,23,26,15,35,21,13,80,98,74,48,16,34,8,10,8,8,1,24,38,12,10,3,6,27,30,3,35,39,6,17,2,22,11,26,25,21,16,19,6,23,29,32,26,14,42,5,10,18,11,35,11,26,34,25,27,0,16,19,18,25,2,38,12,1,32,47,11,16,25,1,5,19,12,9,25,21,30,19,27,38,41,38,85,98,79,29,33,19,32,18,9,13,34,18,14,9,35,4,20,24,3,39,65,39,35,35,19,29,18,20,26,13,13,27,21,22,41,23,23,16,35,70,24,22,26,10,30,7,31,24,37,27,10,37,29,40,38,38,24,35,16,42,41,48,43,48,75,35,15,10,21,44,45,45,69,40,48,58,50,27,47,56,21,26,107,146,176,157,149,159,145,143,167,164,123,130,177,179,179,200,173,176,166,216,206,193,208,226,190,217,203,204,212,190,172,182,200,184,173,155,192,136,158,144,127,155,166,178,195,197,209,187,161,174,169,174,169,165,175,158,144,168,177,173,179,165,206,118,15,0,28,2,5,24,14,22,3,38,19,16,0,2,9,14,12,13,32,7,5,14,34,13,253,229,208,220,238,194,169,137,217,198,199,237,243,196,188,165,219,168,139,114,95,201,226,164,110,110,142,141,113,144,127,150,204,200,152,162,161,187,155,155,134,112,155,158,227,205,182,229,228,211,241,248,247,216,243,250,250,255,245,254,241,253,245,248,245,235,246,244,255,253,241,241,228,221,247,230,236,171,238,216,34,108,130,141,119,85,68,76,111,92,80,87,110,59,48,81,77,69,74,101,104,120,102,52,89,67,94,76,42,93,56,69,67,67,52,67,108,53,74,86,90,138,134,104,61,170,135,61,77,78,113,116,158,170,222,241,149,202,163,95,82,146,131,74,128,162,164,224,237,183,131,134,163,244,206,155,120,179,220,201,199,136,100,202,126,50,121,80,116,56,108,150,79,102,71,74,72,70,58,48,106,111,156,139,107,114,116,121,94,100,81,84,124,127,94,65,138,159,136,122,65,58,63,50,28,13,41,81,97,71,77,40,79,105,62,68,58,20,57,67,74,37,44,60,34,59,120,127,141,192,143,173,155,131,139,106,71,74,96,98,83,99,108,129,130,150,170,156,168,146,190,196,244,239,249,246,249,249,255,249,244,255,249,253,240,227,232,229,248,234,212,166,130,89,109,63,44,22,1,19,13,12,4,8,38,32,8,18,14,39,43,49,56,39,31,48,54,36,50,50,20,102,132,48,61,74,22,22,25,54,105,86,78,64,87,107,140,118,156,146,86,78,107,110,97,131,153,113,111,133,125,119,107,91,128,110,123,93,98,99,128,103,101,104,124,129,94,131,114,42,57,30,7,31,39,45,31,30,26,38,27,55,53,54,34,41,62,60,64,64,70,66,97,74,85,91,84,23,49,41,28,25,50,49,69,8,32,19,34,9,26,37,107,116,140,85,47,79,65,72,76,76,66,63,98,75,101,65,84,65,39,47,28,18,21,42,14,18,29,11,17,45,29,30,18,17,15,26,16,23,20,24,28,13,38,10,18,8,21,14,21,27,18,14,25,21,18,11,3,22,23,14,53,39,18,30,4,24,44,31,15,38,28,28,29,22,15,26,8,6,27,22,26,23,46,95,89,22,8,16,11,14,3,3,26,39,4,12,25,35,17,15,24,32,29,18,34,17,15,23,24,23,3,23,23,37,19,8,11,10,23,18,31,19,12,11,10,15,9,30,13,7,22,14,20,9,34,31,22,31,38,18,13,4,43,41,41,8,24,23,7,25,22,8,38,8,21,12,14,13,49,79,79,67,36,20,40,15,22,22,4,50,31,11,14,39,12,16,20,27,44,33,34,6,23,24,22,34,17,29,16,27,6,23,43,20,10,17,15,25,9,27,12,21,5,4,34,37,34,26,28,22,14,25,20,11,11,12,20,20,13,13,18,10,30,27,19,14,29,30,43,16,42,25,32,39,23,42,38,21,27,30,104,203,170,205,206,211,181,176,182,178,183,155,167,167,171,157,165,161,183,177,171,211,230,202,197,204,209,216,206,218,215,214,229,216,193,207,190,181,188,182,170,174,183,164,172,190,191,185,199,175,199,176,150,163,186,159,196,182,196,190,161,205,206,97,22,21,10,37,21,10,13,22,8,32,37,12,0,5,31,23,4,20,15,8,22,24,16,9,245,231,227,250,249,236,220,187,215,242,251,236,248,241,249,246,231,247,247,252,181,235,241,185,183,206,174,182,140,114,117,135,193,149,157,182,179,148,145,138,166,187,198,160,162,138,136,218,167,176,238,236,228,213,217,182,228,251,247,239,251,255,253,254,233,242,252,236,250,244,219,184,154,179,203,235,219,179,232,189,61,83,96,84,100,100,148,104,148,83,80,58,80,61,81,115,88,98,68,88,86,123,99,71,68,82,92,74,52,85,77,64,70,82,43,58,115,78,82,103,116,145,164,100,70,145,101,102,114,84,64,72,120,113,200,252,213,210,193,119,46,131,164,105,142,153,152,202,249,183,178,143,206,230,209,197,174,223,245,222,217,192,124,199,122,88,87,84,86,72,100,98,69,93,69,98,109,99,105,88,146,107,81,107,92,113,124,110,80,39,86,61,63,102,83,161,145,127,110,104,48,101,103,73,58,24,22,58,58,44,71,24,42,50,32,29,50,66,42,67,81,90,71,40,29,38,99,146,154,124,103,145,113,114,123,135,138,118,151,146,182,159,214,253,239,243,243,249,254,249,243,255,231,245,243,252,254,231,245,251,217,207,167,148,152,101,61,46,17,102,130,38,1,2,2,11,19,18,25,39,37,17,33,62,41,61,68,54,74,40,55,76,123,84,30,33,42,56,60,89,127,143,172,88,21,88,49,56,29,28,61,46,116,106,140,156,131,135,177,122,79,82,101,131,92,132,126,111,103,121,128,107,111,100,105,89,61,90,123,96,106,89,107,112,115,107,124,77,43,19,30,48,41,48,39,18,41,31,41,15,38,24,26,54,61,72,62,64,60,71,45,54,68,72,42,68,64,46,32,51,14,34,32,52,50,33,34,73,49,81,64,76,105,96,154,144,49,30,82,83,66,64,75,96,65,115,58,39,69,55,33,62,22,21,35,20,46,31,36,15,37,13,15,20,20,39,29,13,21,24,4,18,15,27,14,26,15,30,22,28,33,23,14,15,7,25,8,13,18,14,21,11,13,1,15,19,1,22,27,4,21,25,31,26,30,10,26,33,25,17,11,5,35,16,47,88,79,49,22,29,4,5,12,17,21,37,33,0,13,41,11,22,10,22,4,32,40,19,20,15,12,27,25,19,12,18,20,30,27,15,18,30,20,9,25,5,22,19,13,18,22,9,23,42,16,34,26,42,24,38,30,37,20,35,34,43,16,1,25,19,32,15,6,2,30,8,6,9,20,2,20,49,46,85,63,12,14,17,7,30,8,43,29,39,36,26,37,6,26,35,25,28,19,12,18,29,16,11,21,8,0,22,9,35,16,12,4,5,12,39,13,22,27,45,23,6,25,32,6,25,10,35,12,18,10,6,26,9,16,16,5,30,24,30,29,50,25,9,29,34,21,13,30,15,21,12,30,12,18,28,20,69,184,195,193,177,168,198,183,179,162,166,164,182,178,138,161,134,148,171,219,175,181,164,148,166,154,156,186,197,194,198,172,192,179,198,205,215,194,190,205,213,202,197,191,172,158,192,203,197,217,216,191,199,213,221,213,192,219,230,220,222,199,201,197,127,20,12,5,17,1,3,8,10,11,6,18,15,19,7,7,22,5,17,11,24,34,2,13,4,208,162,150,199,206,211,113,99,127,179,204,244,234,238,243,243,230,251,239,241,129,217,229,122,140,169,120,197,90,108,118,135,169,130,143,182,188,117,52,131,140,192,213,118,134,81,100,170,84,61,112,104,148,95,98,104,146,199,198,182,192,230,255,240,249,255,253,254,243,234,222,136,82,134,139,134,182,191,252,177,45,95,67,82,86,86,143,106,101,108,84,73,81,55,85,90,87,75,60,78,82,121,97,63,55,60,76,68,49,121,96,67,61,78,61,61,53,124,111,100,65,105,93,113,45,134,79,102,120,54,93,75,121,121,171,211,121,160,158,50,47,109,186,116,114,107,103,162,162,137,125,132,161,199,184,212,139,183,241,192,206,191,137,158,139,84,103,82,125,76,122,104,71,92,58,85,94,113,80,122,139,104,135,143,97,88,116,65,42,46,34,26,54,57,89,123,111,81,53,76,65,102,129,97,96,52,49,17,17,65,53,44,69,45,62,61,23,58,33,39,67,74,71,49,46,78,121,141,130,151,154,168,174,184,221,255,249,244,248,239,254,250,252,255,228,248,253,218,226,244,254,235,239,219,201,190,171,136,110,92,43,8,3,11,23,4,7,16,12,65,105,18,12,14,30,33,21,28,26,58,57,45,60,142,77,131,98,57,49,75,38,64,142,136,71,32,56,117,89,82,84,94,97,64,37,48,26,30,16,41,87,78,135,116,121,133,117,136,133,92,62,74,109,100,99,134,81,125,122,124,126,84,85,79,101,133,77,106,102,110,125,101,84,116,119,79,38,30,21,18,72,70,63,72,8,28,6,56,52,30,31,34,50,86,81,91,70,65,76,70,86,95,81,45,94,81,67,68,41,28,30,39,58,55,60,43,73,76,104,104,72,69,71,61,128,171,133,63,45,42,80,96,90,72,54,68,78,40,82,50,43,44,16,22,34,17,30,14,15,15,22,6,46,11,10,49,27,13,10,15,12,28,14,44,26,18,4,12,26,31,20,15,17,29,44,4,16,32,18,22,13,8,19,48,25,13,42,30,13,14,40,9,28,10,25,22,30,20,16,21,7,32,32,54,28,83,112,56,7,25,23,23,14,30,21,32,2,10,11,36,11,47,14,2,14,5,10,3,2,39,28,33,4,8,21,32,17,25,13,5,15,20,5,49,9,30,3,14,37,30,29,17,10,40,15,26,56,54,24,21,31,24,24,24,36,24,30,15,15,60,12,26,21,11,7,22,31,3,3,10,5,26,70,114,88,65,26,17,28,37,6,35,51,27,28,27,5,29,31,30,65,51,50,7,21,47,21,35,4,0,13,43,24,15,22,33,58,14,29,18,20,19,5,21,16,4,11,11,27,41,19,13,49,20,19,12,15,25,5,21,10,51,30,28,17,35,10,10,20,42,32,27,24,23,3,43,8,16,9,15,36,145,188,176,179,134,160,154,145,130,131,152,204,217,200,175,141,162,128,184,213,222,213,194,166,148,145,117,154,132,121,118,142,142,132,145,134,169,142,142,175,164,199,156,137,124,127,144,192,216,215,196,196,192,207,222,209,228,225,206,223,198,198,184,212,126,11,4,5,9,0,24,21,38,7,20,25,20,7,12,9,23,21,6,25,10,13,33,17,23,178,97,46,54,56,86,61,49,82,114,150,227,191,125,184,186,220,210,166,159,61,159,225,90,37,67,105,143,78,58,85,132,129,112,113,142,155,116,91,113,110,119,146,142,98,89,143,181,117,80,53,89,111,64,50,65,119,110,108,155,155,178,254,249,249,222,249,247,243,231,248,142,77,87,64,85,133,155,240,147,38,95,39,82,27,65,82,97,128,106,93,97,53,59,91,110,87,73,96,64,68,78,78,107,34,49,56,69,68,72,95,79,75,59,66,46,82,55,63,61,47,36,61,76,51,84,51,25,63,53,85,80,82,86,65,77,61,27,12,12,16,36,79,40,53,19,45,51,50,24,31,70,70,90,95,136,109,109,156,110,145,144,108,97,55,47,63,62,85,92,47,38,44,61,37,47,52,45,99,122,127,93,104,95,52,66,88,115,95,65,56,31,38,50,64,66,94,89,108,80,39,89,79,96,115,103,97,48,33,7,46,17,17,61,63,76,85,63,72,93,66,100,150,132,191,224,237,251,247,250,255,243,251,247,251,246,243,251,232,247,254,251,230,233,202,222,187,76,40,117,201,177,135,142,170,166,141,77,72,31,25,31,15,28,12,20,6,0,19,69,93,117,57,47,35,58,53,52,47,95,129,83,77,103,84,133,134,60,41,63,40,32,62,71,66,64,38,52,77,53,56,60,93,59,29,74,61,55,16,117,124,150,168,130,134,131,131,140,126,131,103,71,128,120,118,97,95,111,113,113,78,89,96,78,55,112,106,96,89,95,117,94,115,110,72,22,26,39,15,46,65,119,120,24,1,41,12,26,60,6,48,54,73,68,82,70,64,60,70,81,85,66,80,65,90,99,77,88,35,31,41,29,19,33,49,53,61,74,76,95,86,51,88,58,111,149,165,87,44,67,65,111,78,61,76,97,73,70,76,57,76,47,37,20,21,14,28,8,8,26,22,4,21,22,35,29,3,23,37,18,8,25,21,14,5,27,10,37,19,31,33,41,11,9,16,19,28,33,15,7,29,26,13,2,17,11,20,9,28,44,11,21,34,16,26,19,25,18,40,24,10,19,14,16,9,79,115,66,20,31,39,12,8,18,19,25,18,11,34,36,12,3,29,35,19,21,18,14,37,5,42,20,25,23,8,36,26,8,13,21,36,30,36,9,20,11,26,33,28,18,25,21,33,17,15,7,23,23,40,41,13,46,45,10,35,16,21,12,8,9,30,16,16,33,23,8,7,44,5,18,33,13,54,48,77,75,37,36,24,12,17,26,17,15,32,19,10,24,30,45,20,28,16,23,10,14,37,10,30,25,7,35,8,36,14,32,18,6,20,13,21,18,17,13,26,41,15,46,11,21,13,34,28,25,15,22,32,17,8,16,2,9,14,14,3,8,20,16,35,12,5,20,17,32,17,10,27,20,24,44,86,155,124,176,129,122,122,135,124,143,169,210,254,247,243,229,248,196,180,224,250,243,250,249,220,190,126,79,76,74,128,128,90,110,103,100,125,133,91,95,114,98,93,77,74,89,60,65,110,129,157,153,143,147,151,132,164,155,165,168,165,172,165,139,166,95,22,10,10,0,19,15,11,22,27,20,13,11,15,19,18,11,13,18,24,11,30,4,4,42,230,147,92,104,118,82,135,130,171,192,197,225,207,150,216,180,210,195,157,158,84,179,213,50,59,55,105,144,76,73,80,85,95,45,88,106,141,137,124,106,76,106,194,165,144,106,167,237,157,84,80,115,139,87,142,93,59,44,64,157,177,234,244,248,235,255,235,252,250,250,254,127,72,124,121,98,48,101,159,29,28,62,84,82,44,57,53,55,116,114,119,113,96,82,77,100,99,86,90,103,81,39,37,54,57,43,74,74,57,68,102,87,73,71,81,64,83,93,53,68,90,71,94,151,129,188,163,42,59,102,110,88,75,75,57,79,89,131,172,139,201,176,214,161,133,111,142,171,128,112,71,47,73,64,44,78,61,57,74,96,75,95,85,71,20,60,45,37,52,41,31,62,42,52,26,53,17,40,128,125,129,91,73,66,45,87,90,131,94,57,50,33,61,37,37,50,112,104,78,49,14,20,56,66,49,55,74,31,13,11,17,22,77,108,182,178,209,223,255,223,249,255,243,249,245,239,253,247,240,254,243,239,242,243,240,231,213,213,208,200,187,178,164,172,174,168,107,8,0,37,151,154,82,50,61,66,58,30,35,35,24,57,96,102,102,147,110,39,64,120,122,154,114,38,93,78,35,65,35,68,115,69,35,50,50,95,85,68,42,51,43,65,40,68,70,67,11,34,78,53,113,99,135,100,24,88,84,47,84,167,186,156,150,139,131,141,106,84,129,151,117,115,137,94,91,97,146,121,94,86,66,73,53,72,76,79,77,118,101,123,122,105,69,44,12,11,53,27,41,94,102,84,84,47,13,15,38,68,47,55,35,73,92,78,69,88,89,69,86,72,80,89,87,67,103,45,40,38,50,35,28,15,27,62,77,50,67,69,80,81,75,68,50,64,52,74,145,185,103,40,75,61,90,61,72,52,80,68,70,43,23,29,7,20,18,11,24,3,30,27,25,23,22,13,12,15,12,34,12,6,27,19,28,10,28,31,30,20,6,25,42,17,23,15,10,9,10,12,17,24,8,30,20,13,14,11,23,50,21,33,4,33,26,34,20,18,23,37,26,7,27,39,16,20,0,57,93,65,23,19,17,11,31,4,20,19,37,25,18,31,19,32,15,21,21,26,29,36,11,11,25,21,35,16,22,12,14,23,10,24,24,9,27,20,31,21,8,22,16,22,38,15,38,26,34,26,32,24,23,38,38,23,14,12,40,26,26,24,10,10,20,13,4,12,15,45,22,5,38,32,7,27,11,37,64,96,74,49,24,25,13,13,11,24,9,41,24,42,16,18,46,43,24,31,17,25,36,20,5,17,46,25,31,41,1,33,21,13,18,35,13,33,41,29,11,31,5,10,10,19,32,20,5,3,41,8,27,25,10,47,22,28,23,4,16,13,22,18,21,27,46,15,14,52,4,44,26,25,24,25,33,93,118,96,105,81,110,113,121,150,161,210,194,222,241,226,221,192,164,218,239,235,230,224,208,178,96,66,57,84,130,153,126,118,124,144,172,139,121,88,38,36,23,18,63,50,30,41,40,43,52,49,42,27,39,27,34,57,85,55,144,152,196,167,134,90,26,0,16,0,25,26,23,28,0,46,43,20,19,5,34,23,23,9,6,28,5,17,15,24,219,188,100,162,153,149,163,180,182,230,199,249,188,144,192,188,237,208,166,194,103,194,221,76,54,58,93,127,83,47,40,40,40,33,53,73,91,157,116,88,106,167,216,206,167,114,150,246,162,104,74,124,133,150,151,84,27,8,40,135,242,237,241,227,245,223,203,175,168,154,190,102,117,167,138,106,79,102,138,77,65,67,55,95,55,67,65,67,77,75,100,95,98,67,78,68,106,67,137,125,87,58,31,39,25,67,62,72,98,68,89,108,79,74,59,63,121,112,54,89,138,65,158,164,211,253,229,171,133,147,143,68,73,69,144,237,237,247,247,238,241,250,228,248,250,250,255,246,252,238,249,207,166,71,41,45,4,47,34,27,41,25,27,37,35,49,32,57,42,25,27,31,58,37,27,46,29,29,118,129,91,74,108,35,58,86,58,59,35,41,23,19,29,9,65,34,36,41,37,40,39,86,83,91,116,100,148,169,203,222,235,255,246,243,254,252,242,226,242,243,242,238,216,248,250,230,237,222,208,180,164,173,153,134,126,172,184,147,140,160,176,140,128,116,88,108,46,2,28,10,74,62,63,52,77,92,68,97,128,145,136,165,152,201,238,225,222,95,57,83,133,124,43,45,49,91,55,57,40,51,66,55,86,32,34,41,75,105,30,60,94,105,107,97,97,109,45,63,30,74,100,94,126,98,40,90,56,18,85,154,178,146,145,131,150,150,78,46,83,128,89,115,115,120,106,87,74,69,35,85,89,79,81,100,55,82,88,108,102,71,60,22,26,25,34,10,57,90,96,103,42,47,40,32,12,0,30,25,60,54,63,64,74,93,62,95,76,98,87,82,84,75,66,44,26,33,32,14,32,11,15,44,56,77,64,71,68,80,85,88,93,81,54,90,68,48,101,157,173,99,25,49,44,67,71,78,59,79,100,74,39,47,13,29,30,7,28,26,17,25,30,36,25,13,14,22,24,21,44,9,20,10,28,20,4,33,0,9,14,14,33,19,13,16,31,24,40,23,27,19,11,23,46,9,41,30,29,25,26,28,20,15,34,13,23,34,30,7,27,26,21,34,25,45,21,37,86,94,66,7,20,6,11,41,26,25,17,22,5,19,17,7,15,23,21,18,52,26,18,16,48,34,16,19,28,30,42,25,17,9,41,38,16,30,26,30,15,25,24,9,13,39,22,40,26,28,31,33,37,21,34,24,35,20,55,13,9,31,21,15,25,26,27,7,26,41,32,40,36,16,18,17,21,37,19,75,91,51,42,23,20,25,36,25,7,23,31,38,4,26,43,46,13,24,34,13,9,20,43,41,25,35,33,12,25,21,9,39,30,5,9,10,30,40,36,19,7,28,12,28,19,14,16,11,59,6,25,18,11,19,13,20,37,8,14,34,31,37,14,47,18,31,38,11,42,13,17,46,12,9,17,60,56,40,77,68,47,47,29,89,82,69,77,74,69,94,97,90,71,99,146,106,117,121,131,97,45,45,66,103,152,160,127,129,109,112,117,117,137,106,60,15,39,38,50,52,61,51,22,57,11,11,19,25,29,29,10,30,33,20,119,228,217,220,149,68,11,22,30,10,9,8,39,17,8,29,15,7,16,17,32,11,8,21,32,3,0,13,24,7,237,131,108,95,76,114,140,189,194,231,243,250,199,174,204,213,244,246,228,191,95,236,211,80,85,48,78,139,52,46,25,53,14,9,22,37,102,148,142,88,88,127,185,177,77,81,125,195,110,33,55,88,128,146,157,129,54,53,39,101,185,199,206,228,196,224,165,88,36,43,110,88,142,151,107,83,56,172,225,109,60,66,86,95,58,76,60,51,64,92,134,102,73,90,92,128,138,90,108,145,89,94,69,31,42,37,53,71,107,70,84,89,48,75,67,48,106,105,130,124,122,73,130,124,173,234,244,224,72,104,124,83,77,103,135,178,246,239,242,219,231,241,236,245,247,243,225,219,189,249,246,206,133,54,10,24,8,29,44,54,40,17,39,55,35,25,27,40,36,33,43,33,43,57,45,49,25,11,48,50,58,73,98,114,35,18,22,11,23,12,21,7,20,24,24,36,152,121,157,175,210,241,233,228,240,237,255,248,255,242,236,241,251,253,243,236,226,218,194,154,149,122,115,81,49,89,147,141,150,138,116,117,114,84,56,126,91,95,86,37,56,40,70,54,39,94,52,10,12,8,120,118,203,187,207,173,168,187,229,192,174,205,183,168,136,175,154,43,17,25,46,53,50,21,36,53,53,33,23,37,75,49,71,41,39,9,86,158,77,54,77,105,152,101,103,67,82,66,48,27,44,27,45,61,32,42,53,16,100,144,167,145,152,135,150,142,87,40,117,138,101,137,129,100,75,74,65,82,58,53,32,70,53,102,105,96,95,118,68,56,21,7,10,32,54,24,40,60,76,69,21,45,5,53,34,7,50,51,44,79,62,65,78,103,80,71,90,88,71,73,68,91,64,18,27,30,2,13,34,36,25,71,62,74,80,93,94,101,55,86,68,76,49,83,91,73,70,113,165,131,94,60,39,42,56,94,82,113,90,59,49,7,12,26,13,10,12,27,28,16,30,27,11,37,22,1,32,15,10,20,44,15,2,19,9,31,21,13,19,8,6,22,41,26,14,24,13,23,28,22,34,12,11,34,28,29,8,23,19,9,9,29,15,17,22,8,10,35,10,14,10,23,43,20,23,26,87,84,39,37,12,7,21,37,43,0,20,30,9,15,25,41,25,23,28,2,32,16,19,26,9,23,2,19,31,22,34,36,18,18,3,33,42,15,23,29,11,25,16,15,21,30,28,35,17,32,35,13,15,36,43,37,46,44,28,37,23,18,26,17,18,33,29,25,28,15,11,18,23,22,51,30,18,14,18,39,74,92,58,25,3,14,51,11,24,39,25,28,13,50,81,53,17,48,8,32,19,35,25,6,42,44,25,13,12,12,18,23,22,11,4,34,30,13,5,7,22,5,30,25,24,44,25,36,29,30,15,21,40,28,12,5,19,24,16,35,44,14,32,25,25,27,35,10,29,34,32,30,34,25,32,55,34,79,51,52,59,58,33,31,8,15,33,33,6,24,63,39,10,26,54,21,28,33,69,50,71,67,63,85,112,99,79,72,42,68,70,65,78,72,50,48,20,26,69,72,34,52,45,35,45,36,56,103,138,156,126,92,53,15,189,229,228,222,153,66,4,27,18,40,24,6,33,4,25,0,19,36,1,22,18,16,17,9,10,6,23,24,23,18,225,165,91,91,48,69,121,141,202,192,197,246,199,186,225,210,240,240,193,109,95,224,203,87,62,64,70,138,81,83,67,72,81,29,56,60,77,119,121,82,60,51,63,58,17,29,32,25,14,54,23,41,63,86,113,120,131,115,117,155,189,158,147,173,138,126,163,118,50,50,42,70,124,83,70,71,95,146,217,87,35,70,78,87,96,79,88,76,79,88,106,99,75,55,95,153,158,114,132,140,128,146,116,98,64,72,78,93,111,110,68,106,85,58,88,93,125,105,94,114,112,65,86,74,117,192,227,146,2,90,95,142,163,155,147,101,42,142,181,184,130,168,253,250,255,240,215,218,196,148,149,130,87,31,32,16,13,25,37,63,43,35,60,54,41,32,40,51,48,23,49,30,84,126,94,33,11,19,6,22,59,74,102,80,44,39,36,36,63,128,159,187,205,231,245,252,248,236,253,254,236,243,231,255,243,249,243,227,233,197,186,208,157,134,154,97,128,108,29,1,7,26,84,68,43,53,92,101,85,91,42,33,44,33,16,15,25,34,22,63,77,122,176,117,122,193,121,61,52,34,148,232,226,239,200,169,141,102,128,86,94,90,37,67,40,74,96,81,118,106,123,126,135,129,99,125,45,24,55,58,96,65,66,70,50,31,48,124,79,29,26,80,83,80,54,67,52,83,36,70,41,69,90,102,17,32,85,51,131,129,152,160,142,148,126,137,81,65,124,106,86,114,97,109,63,54,41,56,94,59,89,89,97,71,103,100,58,32,25,26,16,25,31,49,62,76,55,28,80,111,68,8,24,22,68,58,45,46,19,39,49,64,66,64,81,68,65,59,93,80,66,28,20,9,19,5,7,46,53,85,104,89,107,100,85,82,107,71,92,60,91,57,85,78,86,80,47,72,103,171,187,102,52,5,78,51,88,63,74,50,44,40,36,17,3,16,11,24,8,36,32,22,34,37,47,14,22,32,5,33,13,37,20,21,30,32,17,18,43,16,4,17,8,32,54,29,19,23,12,12,30,26,6,35,25,24,1,9,7,25,33,29,4,32,50,15,26,18,34,5,31,16,22,32,5,32,57,96,59,25,7,13,18,8,20,10,26,16,23,26,45,39,10,25,24,39,10,16,27,14,16,33,15,36,14,36,27,25,42,29,16,23,25,7,8,35,24,20,23,23,31,35,5,23,33,39,14,37,4,24,33,44,43,16,28,39,19,24,18,7,3,48,21,3,8,33,20,5,19,31,33,15,41,18,20,12,47,74,90,58,12,8,35,18,36,30,9,46,56,28,36,18,8,55,36,3,18,34,50,12,19,29,25,26,30,42,41,18,18,25,7,13,56,7,51,25,9,30,24,22,21,5,4,24,11,26,33,27,21,21,39,12,27,11,14,41,8,19,18,34,25,31,8,10,33,23,7,22,12,24,49,58,60,78,39,60,25,14,19,13,34,17,38,37,20,6,82,67,29,50,38,25,24,31,51,26,65,76,49,88,64,28,50,57,11,38,77,35,56,63,50,65,68,111,89,91,94,72,42,34,34,89,135,201,230,222,230,216,129,96,231,230,245,236,177,82,4,3,16,21,10,17,22,20,5,4,33,12,0,0,0,24,32,15,17,24,21,5,9,11,251,221,190,140,68,92,131,161,159,139,160,216,159,113,150,194,227,202,113,98,82,194,179,117,106,77,106,129,113,148,150,152,118,111,112,131,123,119,95,72,77,65,48,79,74,59,64,74,69,86,75,40,71,79,76,88,139,169,199,167,145,112,87,127,89,86,98,126,101,97,55,43,84,72,72,63,65,85,106,35,66,104,99,134,153,114,48,62,73,83,88,72,76,58,68,134,135,99,119,169,123,130,113,131,111,82,79,104,110,106,98,128,64,45,79,112,160,95,63,26,25,63,66,59,40,50,46,8,19,6,24,55,156,103,80,7,24,32,26,22,20,25,71,120,86,106,140,178,145,96,86,124,94,17,34,17,10,27,37,39,33,38,31,25,30,55,46,66,47,39,19,20,88,155,122,114,94,90,95,115,145,154,189,228,224,229,252,250,247,254,254,239,248,255,249,244,239,250,245,237,219,182,157,157,142,109,98,65,41,6,21,3,18,51,82,42,71,89,14,9,53,65,79,14,38,39,49,61,58,65,58,53,64,74,27,40,7,63,109,103,138,163,166,151,131,149,150,61,52,17,94,142,124,109,66,70,66,43,81,77,69,115,101,85,73,100,148,180,172,193,197,174,130,130,154,114,66,74,92,33,75,62,44,74,41,70,21,37,56,75,39,75,77,52,80,60,89,105,118,106,95,105,125,113,61,44,93,46,53,100,140,106,130,113,125,155,95,86,137,115,83,84,76,72,55,53,58,58,101,80,72,68,86,83,54,45,38,50,34,61,40,27,65,78,139,156,118,137,139,122,149,57,69,62,65,63,31,31,52,54,57,71,64,68,64,89,103,101,78,44,28,14,45,19,31,50,49,83,85,102,110,103,98,84,55,81,79,68,70,89,61,73,65,82,88,52,65,59,65,105,175,166,117,55,35,61,83,42,45,44,17,33,16,14,43,17,36,25,27,18,26,23,11,7,17,3,12,21,18,15,22,42,24,41,35,20,30,2,14,44,33,25,9,44,9,31,21,27,21,9,20,30,14,18,17,15,25,20,19,19,9,24,39,12,23,23,25,18,10,38,33,37,3,33,17,17,46,77,92,16,12,4,7,16,49,30,16,19,23,45,53,33,40,25,15,14,39,7,35,17,22,7,49,24,15,18,13,22,31,38,17,31,13,27,36,11,14,13,11,17,9,49,25,34,26,23,39,11,23,33,31,44,58,37,32,41,23,10,41,22,18,26,29,18,44,31,10,31,25,18,22,35,31,12,10,28,32,44,84,105,52,14,17,12,23,1,14,23,25,37,53,22,18,27,34,52,19,18,18,14,43,8,25,23,33,3,17,26,11,14,45,1,21,11,17,20,10,32,8,4,27,35,22,21,23,35,35,14,13,37,6,47,2,23,19,19,47,32,29,4,20,36,29,21,40,16,25,14,7,7,32,37,63,64,42,31,21,11,21,30,59,20,15,19,11,15,66,76,55,75,27,25,32,39,31,61,31,84,73,69,50,66,51,61,29,56,53,49,85,93,73,96,148,216,212,238,209,208,189,168,152,173,187,240,207,225,202,183,172,149,239,240,247,222,179,132,22,11,34,5,27,15,0,32,9,8,8,38,26,9,2,29,9,5,5,14,19,42,10,38,254,240,246,234,147,130,145,170,171,112,125,165,152,105,146,160,151,132,127,131,146,151,152,123,133,133,145,138,122,171,166,161,191,211,235,235,212,216,190,227,196,205,185,185,198,193,229,230,224,214,201,212,199,194,193,197,207,206,204,191,161,162,176,189,129,73,51,82,80,87,39,50,88,58,79,91,82,112,145,95,86,121,141,146,135,120,148,100,40,54,58,63,78,64,59,121,86,103,123,131,100,102,130,153,133,111,110,89,96,79,43,72,42,46,35,49,115,26,52,44,57,90,38,102,63,53,19,23,22,40,32,21,43,25,41,28,52,14,18,37,15,19,31,23,30,53,74,67,47,62,56,126,73,12,22,9,29,17,9,38,16,12,20,28,40,44,57,78,28,26,31,105,165,227,207,213,241,234,251,240,237,246,226,240,241,242,231,242,236,199,211,219,159,174,125,128,84,108,110,72,36,15,8,30,23,32,11,22,33,37,7,8,31,8,31,9,7,19,7,9,24,12,29,67,56,86,86,106,93,101,84,69,70,54,20,13,4,0,20,27,26,53,40,13,30,19,28,32,36,10,31,62,63,45,39,90,144,137,179,148,137,133,99,71,67,58,66,65,90,72,77,77,68,83,57,54,40,70,49,27,55,66,92,30,106,70,11,66,157,64,58,105,130,63,48,41,66,32,64,61,47,37,23,48,78,57,39,47,30,70,113,112,105,121,147,178,93,87,78,66,62,68,94,72,57,66,86,70,103,99,102,79,58,40,24,32,36,26,38,38,64,66,100,143,131,140,156,139,171,132,135,128,84,99,80,74,49,31,35,40,42,66,68,88,67,89,53,47,38,32,37,29,24,29,33,60,64,98,90,101,83,103,114,79,49,74,86,64,91,60,73,53,69,38,72,80,82,70,42,40,104,167,188,144,97,56,28,60,24,6,27,12,23,39,18,44,24,14,25,27,22,37,26,33,36,19,18,31,12,8,50,28,38,29,3,11,26,26,17,34,19,33,32,9,17,24,15,35,35,42,27,34,25,15,14,43,24,26,10,31,23,30,18,13,22,22,15,14,18,15,40,1,12,9,32,28,25,79,99,32,6,23,3,23,10,25,16,13,23,13,34,16,17,24,39,21,25,14,25,4,16,15,41,24,24,19,11,27,0,40,4,28,11,17,16,40,28,17,20,4,12,29,54,21,27,18,39,24,46,32,41,35,33,24,40,18,36,14,12,27,18,14,4,18,32,10,25,29,24,22,36,27,26,19,25,41,40,34,63,88,64,41,7,15,24,3,18,31,42,48,36,35,12,16,6,26,21,26,36,28,21,41,18,31,28,5,5,31,21,25,14,17,5,9,18,23,26,32,28,44,16,14,23,22,13,26,13,16,9,17,0,9,29,12,39,25,38,47,33,44,20,29,14,25,26,16,50,31,29,24,65,65,101,96,97,121,132,167,167,180,169,200,169,170,139,163,195,183,161,165,112,92,125,131,136,69,88,91,78,93,80,67,63,42,35,45,57,61,47,68,48,131,165,218,241,246,243,248,243,225,225,214,234,240,224,185,148,144,139,133,215,238,242,231,219,95,17,9,8,14,36,11,16,20,19,11,14,26,16,0,8,26,3,31,17,11,9,14,10,23,250,252,240,247,165,132,130,153,141,133,137,157,125,169,173,138,130,134,104,160,142,173,191,165,193,192,214,181,180,224,225,233,232,241,254,239,232,251,239,255,235,250,230,251,251,253,245,255,249,234,244,249,246,254,255,249,246,246,243,252,246,253,233,246,254,108,100,59,86,113,74,122,114,141,157,188,134,205,216,111,121,208,201,207,188,180,126,104,60,63,44,65,80,52,64,88,87,125,135,163,113,93,122,151,183,126,111,80,43,45,67,137,87,72,59,53,73,63,131,116,164,196,171,226,251,245,243,220,235,213,235,226,232,215,214,247,240,240,239,227,230,207,230,145,177,165,180,208,180,203,181,219,190,227,229,216,240,230,229,222,232,223,137,234,218,142,248,250,240,247,244,225,247,246,253,241,243,248,252,246,238,231,244,193,180,184,182,183,151,38,79,126,144,125,108,72,63,24,23,34,3,12,6,15,21,42,50,66,55,64,77,81,91,129,72,98,78,72,31,1,28,52,92,125,174,101,109,85,71,21,53,18,40,20,39,19,37,41,24,27,49,33,38,33,40,99,98,50,25,22,124,149,129,114,27,69,71,103,110,68,63,53,2,19,32,3,18,8,16,29,8,16,100,42,43,92,41,39,47,35,46,78,161,152,169,88,29,114,140,103,93,84,77,50,47,45,37,37,49,60,32,33,43,45,49,38,75,23,51,80,70,79,89,92,107,141,72,63,98,82,89,94,116,82,57,80,86,107,101,103,72,52,52,25,34,50,28,41,66,75,123,111,116,148,128,150,129,116,105,122,86,75,61,68,71,76,41,23,15,56,41,67,80,90,64,48,19,21,17,20,18,31,50,77,91,92,86,85,92,82,95,86,90,82,94,77,85,124,74,72,57,53,73,74,80,71,65,76,41,58,46,94,187,208,146,46,31,37,16,17,19,52,21,13,10,23,34,30,26,34,26,21,13,29,14,22,23,35,16,26,21,28,13,29,39,6,26,30,18,36,35,7,27,12,29,40,31,17,14,30,16,28,30,28,60,30,37,3,44,29,12,21,28,14,14,10,18,31,35,33,6,38,4,28,34,32,14,87,77,8,15,8,23,11,5,9,40,22,21,22,14,42,32,33,30,29,13,25,18,31,22,11,34,15,5,39,9,15,19,17,19,7,20,3,30,29,12,14,11,14,27,30,24,45,38,34,42,29,23,36,12,40,21,17,22,43,22,21,24,28,12,32,17,20,43,35,30,19,23,32,43,27,15,17,8,31,26,42,38,45,113,86,48,23,23,30,18,64,13,13,48,34,39,18,26,17,29,49,25,22,28,4,12,40,23,13,4,0,31,16,32,0,41,32,18,46,11,9,35,41,18,12,22,10,38,17,39,9,29,10,12,31,32,12,18,14,47,10,32,37,25,36,18,22,8,28,31,12,29,19,43,64,69,100,219,236,242,232,247,247,249,243,229,252,246,243,250,253,254,243,230,220,239,239,229,162,97,125,116,84,86,76,46,56,41,29,73,56,81,57,81,136,185,171,163,237,233,241,240,233,239,249,243,250,205,175,176,168,151,141,227,224,212,246,245,111,0,2,13,34,13,26,16,32,12,25,35,11,8,8,48,6,14,6,18,18,18,14,12,20,203,244,236,235,145,111,159,120,155,115,156,167,152,160,123,165,157,147,159,174,220,246,237,234,249,242,242,243,255,239,252,255,240,255,254,250,252,249,232,252,254,249,255,245,251,245,236,244,241,254,245,255,242,252,255,241,255,252,237,235,254,228,242,253,241,187,126,133,135,150,141,154,183,160,195,227,201,164,158,102,144,226,225,234,216,183,124,78,80,78,56,74,93,83,92,150,128,117,120,165,115,123,90,92,160,124,88,75,83,57,99,188,182,128,150,103,150,92,149,183,195,227,203,240,252,247,211,250,244,252,226,244,248,232,251,239,246,245,237,242,246,234,248,246,236,239,255,246,252,248,243,216,246,243,243,251,254,254,255,242,253,255,255,255,246,252,244,240,242,243,252,251,247,250,240,244,252,239,255,250,253,242,222,253,251,221,234,251,255,67,83,224,238,235,162,136,67,52,99,94,144,186,194,250,238,229,252,254,237,251,253,235,250,238,251,245,236,237,207,78,90,236,225,167,115,50,88,75,53,55,88,134,175,217,242,231,226,250,247,252,240,240,250,218,247,245,250,221,96,22,102,160,108,58,15,4,13,39,42,76,86,114,173,148,178,167,164,159,157,175,133,96,113,117,128,124,50,39,50,51,38,77,131,135,126,60,49,62,80,22,22,47,26,55,53,79,43,84,103,96,103,122,114,108,76,18,75,62,43,73,62,71,79,77,94,97,77,51,60,72,93,81,108,74,71,78,90,128,83,60,13,64,40,52,29,65,53,102,81,97,116,135,120,107,115,120,126,111,78,59,75,59,67,94,76,52,71,61,54,46,37,77,77,23,39,4,49,21,13,19,51,86,93,77,82,81,90,74,76,73,55,93,68,71,78,99,69,72,62,69,61,64,75,79,89,57,70,66,81,51,65,47,87,170,166,58,32,37,11,24,21,5,13,13,30,19,37,29,15,14,13,27,34,46,40,23,35,16,36,19,36,30,19,24,35,21,12,33,8,29,47,37,9,21,22,30,31,22,17,20,38,15,24,23,29,29,16,23,3,28,13,37,27,26,13,38,16,31,29,10,16,11,9,19,5,14,12,28,96,28,21,39,21,22,21,15,43,37,29,24,50,23,44,15,14,19,24,14,15,18,9,30,19,26,7,24,16,20,5,24,26,16,9,23,28,22,26,29,9,25,22,23,34,23,38,22,46,25,29,35,41,20,29,18,45,55,51,35,22,35,17,34,14,2,8,49,35,44,33,16,29,31,12,21,2,28,5,26,42,17,62,85,90,46,22,20,18,11,27,52,45,0,29,9,11,25,11,9,20,0,37,24,14,14,12,21,20,14,45,28,24,23,30,22,20,30,26,21,41,1,2,27,24,22,20,13,18,33,20,10,15,22,19,24,25,8,21,23,28,9,33,3,37,26,24,21,35,11,41,25,22,30,31,109,245,246,248,236,255,252,244,255,247,233,246,247,247,248,230,250,245,250,245,250,224,183,141,134,78,70,20,18,43,65,68,39,66,57,89,75,108,136,97,62,91,135,190,198,191,210,240,233,235,240,228,230,231,223,206,234,248,234,243,233,237,105,7,2,33,16,6,22,21,15,32,9,25,23,26,1,5,11,11,33,16,15,16,19,13,4,142,159,185,168,126,137,136,137,145,148,149,155,175,169,138,152,190,205,229,252,243,236,215,208,210,246,240,240,233,221,206,196,202,205,207,220,206,206,219,205,214,223,160,210,211,221,226,199,211,201,175,179,188,171,190,198,193,211,204,203,191,205,214,204,165,143,141,112,163,164,150,143,146,127,157,151,140,108,80,54,122,132,121,150,127,82,67,9,110,104,121,46,80,82,122,114,119,73,80,74,128,98,26,54,93,81,87,55,46,81,50,173,135,91,118,83,130,127,109,127,142,122,143,219,242,226,219,150,233,179,158,208,222,185,242,236,233,249,231,238,252,236,254,243,252,248,245,253,252,242,255,254,252,253,242,251,253,249,224,249,246,255,255,238,249,253,231,234,253,240,249,237,245,255,243,251,244,254,244,249,249,225,239,246,241,254,240,219,191,34,45,177,190,212,193,217,235,251,239,252,243,245,246,229,237,238,237,238,238,252,233,223,249,234,252,247,224,220,156,27,78,189,156,186,165,196,238,255,244,255,250,239,254,255,251,248,243,247,239,255,255,253,249,239,236,242,252,124,16,13,89,135,125,140,172,203,249,251,244,242,237,250,243,247,247,250,238,226,248,255,246,186,175,157,115,81,65,37,56,4,28,42,74,45,71,34,50,77,73,107,79,84,76,127,129,108,104,138,137,147,137,152,82,93,66,30,83,59,35,66,67,82,103,109,103,62,90,79,75,94,98,74,71,87,82,67,62,49,34,18,16,28,23,53,70,68,107,105,123,96,106,115,116,122,110,83,76,67,35,18,83,62,58,86,60,60,41,39,41,41,52,55,30,57,7,39,24,19,28,58,78,90,87,101,76,88,70,97,71,36,76,76,69,62,53,62,36,67,65,69,71,41,50,85,64,60,62,78,79,101,64,44,44,42,60,52,40,34,19,18,17,27,19,29,14,26,28,5,44,38,33,12,37,8,24,17,14,45,16,16,17,19,3,28,37,23,24,35,19,19,37,0,34,27,4,14,34,38,15,31,20,31,6,32,31,18,18,21,49,27,39,26,13,28,14,30,20,27,20,21,38,1,6,31,20,21,17,70,81,53,23,26,29,24,14,53,14,43,9,43,32,7,16,36,52,10,41,31,25,30,25,26,21,21,13,18,18,9,25,21,46,14,35,24,50,6,33,26,29,36,30,23,20,19,35,27,46,21,38,15,35,28,35,24,31,11,48,19,26,26,14,29,19,23,25,29,14,30,14,8,43,18,31,26,29,57,36,29,21,18,13,87,97,84,30,18,28,28,18,46,25,27,33,4,23,12,22,35,32,8,11,27,8,30,13,24,9,9,52,14,13,37,26,14,22,25,20,9,35,17,53,34,32,26,11,16,2,25,22,32,36,20,32,4,9,31,42,12,28,34,9,47,29,28,35,40,30,20,17,46,11,14,29,106,215,234,252,233,217,217,235,223,229,203,239,220,232,227,237,202,224,235,253,246,220,191,135,100,28,53,73,21,35,61,79,56,49,74,97,112,112,128,85,65,40,79,147,166,220,220,241,246,243,246,254,253,247,238,229,225,249,228,220,244,237,109,16,17,9,12,25,6,27,0,17,25,30,3,15,0,15,8,13,46,23,12,6,9,21,18,236,231,214,173,151,131,165,153,185,164,145,145,175,157,184,210,228,250,246,250,195,135,113,16,61,148,109,104,55,53,38,2,0,0,5,16,22,23,25,39,20,15,45,24,31,30,25,41,24,25,56,52,41,1,22,11,53,102,99,54,9,19,81,64,76,92,76,96,79,62,54,52,83,70,81,87,43,42,57,49,52,26,27,58,23,23,53,47,81,56,97,82,77,87,127,127,103,100,96,109,76,68,71,72,62,64,96,62,74,40,71,92,32,63,44,44,76,66,77,70,51,55,42,82,91,24,10,38,41,30,7,81,107,104,151,146,168,159,189,182,215,234,244,245,246,248,228,235,232,242,223,209,192,183,161,181,190,192,128,83,119,154,183,172,170,131,152,199,212,207,193,203,201,200,173,172,126,110,170,108,108,79,107,109,64,75,60,72,94,26,29,63,131,211,227,241,238,243,251,245,240,250,224,236,225,237,209,184,163,178,148,114,113,91,93,71,58,61,24,12,70,216,245,247,247,249,236,251,250,249,254,237,239,240,227,240,247,224,213,205,153,153,138,117,44,37,66,51,35,2,92,237,249,237,241,249,248,245,248,232,245,248,246,246,235,246,246,230,217,212,193,89,41,65,73,86,46,44,35,63,88,140,69,60,53,60,71,129,78,84,96,91,85,74,37,64,62,33,35,52,56,60,35,45,70,38,76,70,52,93,132,119,124,141,104,74,81,75,119,118,143,124,96,95,43,47,32,13,8,30,56,54,60,90,95,120,90,123,115,98,91,101,91,92,52,69,54,61,61,51,60,67,63,83,59,79,51,45,59,52,29,54,38,30,56,49,73,92,93,128,100,106,109,105,111,95,78,79,50,93,76,54,76,68,23,40,67,61,48,67,71,72,55,104,98,85,85,94,68,29,43,10,7,3,20,21,29,38,18,24,24,4,34,25,19,48,36,14,19,47,23,50,43,36,37,13,32,29,23,29,16,50,11,13,29,27,11,21,40,12,12,34,20,11,26,11,37,15,36,23,15,29,6,22,18,34,16,21,31,28,35,27,21,14,32,11,11,31,13,29,19,12,33,33,49,44,18,64,88,68,51,39,28,55,19,3,38,12,24,25,30,41,29,24,40,29,29,30,17,29,23,42,27,34,41,18,18,20,27,15,13,10,4,3,32,9,7,9,7,33,14,44,31,52,17,33,7,16,14,26,36,25,39,32,22,20,23,3,36,34,23,4,9,38,17,25,2,28,11,13,47,23,36,18,31,20,43,28,24,34,3,60,90,106,86,49,7,9,36,52,42,31,9,19,21,21,25,30,6,30,19,6,21,14,11,27,15,30,18,32,20,36,26,41,9,20,9,32,8,32,20,21,39,18,11,33,18,30,19,57,8,26,29,9,23,16,25,20,29,24,19,31,14,41,23,26,29,26,30,30,26,37,8,146,194,205,202,190,185,206,193,203,185,224,224,222,209,199,202,209,200,188,217,177,169,155,143,124,138,152,143,138,73,78,68,46,80,80,88,92,97,100,106,89,71,106,227,224,239,246,248,246,247,243,250,224,246,255,250,251,255,246,203,243,245,136,0,10,7,5,32,3,20,30,1,26,31,26,5,1,4,17,19,8,15,7,16,12,4,10,194,241,250,251,242,185,203,211,235,194,123,133,171,144,198,254,232,226,221,237,134,96,82,38,18,56,41,23,26,9,22,36,19,11,14,21,16,22,10,30,35,14,21,4,20,25,5,22,23,65,61,87,39,2,17,14,56,60,58,34,37,21,62,58,64,79,130,102,80,54,31,43,88,76,54,21,38,27,58,36,55,47,72,83,65,57,31,64,61,71,51,75,60,63,90,112,100,81,64,60,67,79,74,71,90,54,102,73,41,40,51,65,23,18,34,7,93,37,65,38,43,17,25,47,46,10,54,30,73,38,21,122,135,190,229,225,254,245,252,246,234,239,202,214,211,182,169,152,121,109,120,91,102,108,104,143,115,105,49,15,19,91,95,101,32,4,52,54,27,22,47,42,23,34,6,25,4,32,11,31,12,19,19,54,54,61,79,113,110,25,22,11,34,105,105,90,80,110,112,76,89,50,43,47,49,32,25,6,19,3,11,17,60,43,54,9,31,105,39,6,39,130,158,177,158,178,134,125,136,136,97,95,49,63,15,34,20,10,9,20,7,20,41,50,9,38,105,61,34,46,77,210,204,165,168,170,199,147,161,157,87,151,97,119,117,52,52,30,27,17,39,6,9,0,95,136,80,50,17,43,99,150,85,61,38,52,42,40,54,19,46,67,39,41,35,66,30,59,79,78,83,131,105,152,117,73,151,89,52,63,115,126,106,81,72,38,51,66,77,114,79,73,51,24,20,44,24,12,40,47,57,90,74,84,122,126,103,110,103,104,90,80,92,72,76,50,74,62,87,77,110,76,61,89,77,74,52,41,60,44,15,41,52,33,70,120,123,93,110,112,114,90,102,101,82,69,101,75,64,83,70,84,67,47,63,64,52,34,69,62,66,62,59,83,74,76,76,49,35,2,12,31,23,36,2,18,32,66,7,34,21,27,22,10,14,41,17,36,21,7,18,24,26,26,15,35,33,24,22,30,28,5,10,29,18,51,36,20,20,28,24,10,51,38,55,18,33,31,16,21,2,16,19,56,48,9,27,17,34,9,33,27,27,7,31,33,10,27,26,27,25,42,27,24,16,19,11,36,91,88,34,54,32,7,16,29,27,27,47,18,30,32,11,4,18,0,1,16,24,20,46,15,5,20,0,34,34,22,20,6,19,4,7,27,21,24,23,11,41,10,42,38,36,43,36,16,67,26,12,41,16,37,56,26,37,14,20,25,43,22,23,36,34,25,6,12,17,30,15,31,32,8,31,34,27,5,14,8,33,48,29,33,43,86,112,68,25,14,35,32,48,25,25,14,20,30,33,38,19,28,19,16,21,42,38,9,38,0,29,32,39,18,18,15,19,14,40,25,9,39,47,28,25,2,6,14,16,0,35,5,26,26,5,30,11,25,44,8,41,15,38,17,23,26,19,26,30,20,23,16,12,14,41,131,186,165,161,169,159,168,152,177,175,172,176,174,203,197,173,148,154,125,149,141,137,139,126,96,153,176,212,133,84,60,37,21,42,94,74,50,71,93,118,77,97,201,247,235,249,221,222,243,250,252,236,246,247,233,234,252,232,192,209,240,221,120,3,1,0,21,2,5,19,6,34,31,26,17,0,6,12,24,20,10,6,1,17,22,25,15,237,222,255,252,255,237,252,228,225,161,113,151,129,189,216,255,226,186,163,190,181,207,191,134,146,153,89,108,127,151,94,93,81,80,90,109,106,81,85,106,85,82,98,115,125,116,119,84,139,153,162,184,107,69,95,72,83,103,104,67,34,60,105,81,105,103,128,114,89,89,73,68,63,78,84,84,69,70,68,87,69,36,71,95,91,57,63,61,73,48,58,60,75,56,68,82,93,60,67,25,18,40,50,53,34,47,42,50,14,13,33,52,42,38,28,52,122,91,130,133,135,161,138,154,153,124,119,122,193,124,133,194,233,227,236,232,225,181,169,164,116,123,143,128,106,118,122,97,82,75,86,71,68,60,75,60,65,51,30,13,35,58,47,38,12,22,16,26,27,14,13,24,57,45,72,86,97,84,93,101,81,91,109,126,111,89,88,108,136,48,17,17,45,39,26,30,16,17,10,17,6,10,46,22,43,68,96,61,70,100,98,97,88,140,101,82,73,128,87,38,5,54,24,16,19,11,3,7,12,28,21,31,28,44,35,38,52,48,56,48,9,50,131,138,75,29,105,112,55,45,11,53,48,48,53,5,13,15,34,23,28,7,2,14,29,3,23,22,36,16,25,41,58,47,62,129,50,22,63,60,38,65,64,64,63,42,55,54,73,47,69,82,94,98,123,136,158,170,162,139,129,135,143,125,120,100,81,36,37,52,57,72,27,49,57,47,51,64,59,33,45,4,31,10,51,14,12,59,67,65,88,82,114,123,88,112,114,120,120,119,83,62,56,57,58,67,59,72,45,75,103,85,82,80,78,60,25,41,41,28,35,59,45,38,81,114,131,109,117,127,76,101,68,80,57,80,75,74,110,68,77,60,54,29,72,37,45,76,93,93,82,57,87,83,68,31,41,18,9,9,53,18,32,2,5,54,34,29,19,16,23,21,0,33,18,15,31,6,33,10,16,53,27,54,39,42,15,13,22,23,19,23,17,19,4,16,15,27,3,16,23,26,16,40,19,14,47,31,35,17,28,41,24,3,35,42,41,15,7,12,4,39,9,13,16,37,46,10,48,14,22,5,16,21,13,16,39,23,31,75,47,42,7,28,13,15,29,12,52,14,17,16,32,20,39,64,37,32,39,39,32,9,48,7,40,27,10,8,20,18,22,11,6,42,24,12,30,7,29,27,43,19,27,40,24,9,0,36,12,16,24,9,41,47,29,42,22,25,29,21,32,28,16,26,28,7,6,12,23,11,6,23,1,2,7,27,47,8,63,23,20,21,29,36,84,81,46,28,17,41,8,15,28,12,15,24,23,30,21,10,24,33,21,33,19,34,25,13,12,10,6,12,54,35,25,19,8,19,31,25,26,27,11,7,20,16,15,23,11,38,28,4,18,14,7,46,24,28,30,14,13,12,36,31,42,21,20,18,13,15,16,4,50,109,101,148,115,98,133,142,122,119,133,129,142,142,126,136,133,146,103,104,98,133,87,98,69,25,86,112,116,88,82,88,62,43,37,12,8,6,27,61,82,71,104,205,216,217,216,222,220,232,252,251,226,251,236,233,253,249,205,182,187,212,192,103,18,4,16,34,23,4,23,11,17,12,8,5,5,3,32,0,12,9,14,29,30,5,24,26,255,252,254,253,255,251,238,243,238,157,115,152,135,194,242,244,235,179,128,221,251,236,241,231,192,242,190,184,236,220,189,133,173,169,176,236,248,228,223,216,163,247,247,248,239,239,252,252,252,233,233,245,181,140,178,169,121,132,115,88,47,96,108,70,94,123,141,139,135,103,62,76,61,72,60,69,80,76,67,68,49,61,50,113,90,39,38,62,76,71,38,35,16,25,61,90,77,61,57,25,24,30,23,38,43,70,117,121,137,97,98,151,141,151,91,127,237,189,200,220,207,190,155,220,190,135,125,92,123,94,124,122,176,171,148,143,135,53,42,56,16,96,78,88,86,88,62,91,84,87,78,102,62,50,18,18,24,22,23,30,25,86,74,100,64,20,104,135,129,131,131,150,143,143,150,151,157,168,155,147,134,88,95,58,24,38,31,36,46,33,48,35,29,105,116,138,132,134,118,121,128,142,137,172,164,171,189,158,133,119,99,81,73,76,48,46,57,62,62,53,14,57,64,65,90,77,114,91,127,144,105,120,142,139,149,162,117,124,107,84,30,30,69,65,48,24,105,59,48,38,20,45,52,70,64,76,47,59,9,41,4,20,5,17,53,23,72,42,80,50,40,23,76,35,54,39,58,36,31,47,35,42,49,85,126,140,156,163,144,153,150,153,147,136,127,106,95,115,59,75,35,24,29,30,18,15,28,30,25,37,59,56,53,40,38,23,58,43,38,37,37,16,16,19,60,42,53,91,112,117,113,100,119,121,104,97,105,122,88,68,59,37,66,59,61,63,49,79,85,72,94,98,77,59,50,19,24,26,13,48,37,78,64,76,116,130,89,69,73,95,78,83,81,91,61,67,92,74,60,48,45,64,56,67,56,58,65,64,72,60,66,83,45,31,34,37,11,47,28,11,8,33,16,36,61,48,64,35,17,20,45,33,46,16,9,29,26,22,13,40,34,31,27,13,21,0,48,19,30,32,21,28,16,21,20,8,15,29,37,35,33,21,6,38,18,26,14,38,11,26,37,29,8,12,30,24,14,55,25,12,37,32,10,14,5,25,38,25,13,15,31,20,48,16,1,43,30,28,43,88,81,42,16,23,50,24,43,14,16,51,9,10,9,23,20,15,15,51,18,23,37,28,15,51,21,32,18,14,7,10,8,47,28,21,9,20,19,20,16,19,11,11,31,40,19,46,52,26,25,40,27,23,26,38,26,22,19,16,26,16,29,45,4,7,16,4,10,17,29,6,25,34,21,25,10,32,28,10,29,33,24,23,13,6,67,119,84,65,80,37,16,22,6,18,23,21,12,28,28,13,24,35,24,36,44,25,16,18,32,33,37,7,27,20,33,22,70,35,30,31,14,22,12,5,26,23,18,10,28,28,16,30,10,28,34,7,20,2,39,40,41,22,20,0,32,27,35,22,18,22,24,36,87,124,111,148,119,112,122,142,101,122,117,119,140,126,103,125,127,121,134,105,135,128,113,121,95,116,90,100,118,125,106,138,111,72,46,76,105,84,78,92,73,87,155,232,237,217,235,244,236,249,243,245,239,240,233,245,220,249,215,211,198,216,205,110,2,1,12,25,8,14,6,24,22,14,10,16,20,7,21,8,0,25,7,0,15,15,25,18,200,151,156,190,184,229,244,241,219,131,115,123,122,144,173,207,202,142,96,111,187,213,213,106,92,168,109,105,141,159,99,102,155,144,141,212,194,190,191,179,176,193,203,237,197,154,225,228,241,247,243,238,96,122,158,135,116,141,120,50,12,87,81,73,38,76,124,121,89,99,89,44,51,57,62,69,79,59,33,37,18,57,46,5,7,49,42,33,47,62,50,29,24,28,102,104,122,152,133,180,161,133,133,104,137,205,211,230,212,171,143,162,124,169,112,113,205,151,119,134,141,124,87,102,52,74,57,17,38,22,48,132,42,92,99,115,103,27,29,18,25,50,64,4,15,30,55,41,65,71,80,124,99,105,115,125,147,132,88,47,54,151,171,191,166,57,136,179,169,150,125,108,90,95,89,69,73,72,92,98,97,103,109,91,121,63,100,98,117,100,49,24,83,189,205,214,184,188,167,141,154,127,110,132,105,105,96,91,71,77,75,45,55,56,80,49,48,98,38,57,1,93,169,192,175,149,140,152,130,138,142,149,114,112,103,96,87,53,60,46,20,41,77,105,55,62,163,136,53,5,33,144,208,191,116,119,135,96,87,72,67,64,50,50,55,52,43,73,40,57,49,37,60,81,108,114,109,126,139,49,90,111,138,152,129,167,146,98,100,82,70,46,60,33,41,37,27,29,18,30,42,21,31,2,20,17,14,14,39,38,53,74,79,62,11,23,10,29,40,12,0,35,38,70,93,73,101,121,77,107,106,110,114,127,87,105,109,69,72,70,53,62,49,60,58,83,70,80,81,87,142,84,37,11,9,22,10,25,40,57,76,80,134,96,103,96,98,114,64,80,86,85,64,72,87,77,80,64,68,38,55,69,47,53,67,69,59,75,77,64,62,48,41,31,36,15,32,16,32,17,28,33,20,28,26,56,37,34,38,66,97,69,36,43,12,27,36,37,34,28,21,36,8,26,21,13,31,23,26,33,12,21,43,44,1,17,13,9,18,27,23,22,4,28,23,36,19,38,29,32,22,25,37,26,29,31,25,12,24,10,49,29,45,33,12,19,12,43,18,4,15,25,24,23,35,18,49,17,0,83,70,20,20,38,15,10,7,39,28,24,25,12,51,10,44,48,14,28,41,16,3,42,29,24,34,10,27,35,16,23,11,13,29,11,11,16,14,11,17,25,29,32,51,23,13,25,36,21,52,45,22,48,27,41,24,22,29,28,22,36,29,25,11,12,40,27,20,9,28,14,20,5,12,42,14,16,31,2,26,27,44,15,5,20,18,72,93,108,77,47,24,26,11,17,21,6,24,10,26,35,20,17,20,6,40,14,33,9,3,32,10,23,19,5,37,17,14,19,29,23,16,18,31,35,8,19,37,22,9,16,26,12,33,3,6,22,40,22,18,2,20,24,33,31,19,14,24,22,17,14,41,16,61,114,102,117,134,134,183,149,132,142,164,208,185,172,180,166,169,211,199,172,189,193,188,201,229,209,210,227,189,179,208,195,202,158,185,193,213,196,189,193,175,168,205,229,213,245,246,247,243,236,249,250,250,249,252,233,242,239,228,218,215,222,208,90,18,0,0,13,15,11,5,16,13,30,3,16,13,11,16,14,30,9,2,40,12,37,3,16,69,94,96,132,201,200,179,190,215,139,99,139,111,93,20,15,31,53,25,19,35,20,8,23,45,19,2,50,59,45,41,36,40,9,45,51,32,30,47,35,58,15,47,40,74,25,30,55,53,105,121,116,28,49,49,41,117,132,194,95,56,135,124,84,72,55,88,67,68,86,96,119,57,68,71,77,94,67,52,56,59,77,75,40,43,70,92,55,68,69,126,145,126,111,171,198,187,254,237,236,193,188,145,154,201,235,240,217,232,205,186,166,63,111,49,68,140,108,68,61,78,72,87,99,83,65,22,27,23,35,63,60,49,43,64,55,75,1,30,40,14,68,67,73,69,41,106,77,126,126,128,154,179,158,124,147,195,179,118,36,23,148,146,166,77,51,126,84,93,67,37,40,17,23,36,20,49,71,119,128,118,161,140,156,163,159,166,130,143,105,28,36,67,150,134,122,85,79,46,43,42,35,49,49,3,35,30,70,72,80,122,130,149,135,155,105,106,152,68,51,34,98,99,149,121,83,87,48,24,31,30,12,9,29,11,36,64,96,97,74,8,124,164,169,81,61,175,161,93,25,18,103,129,124,65,26,39,10,30,58,27,54,64,57,68,86,142,191,152,151,162,176,165,153,173,148,151,192,171,68,52,68,53,54,41,42,28,36,19,20,54,15,18,21,44,36,29,28,24,30,3,25,29,7,42,36,31,42,36,35,37,99,70,39,33,20,16,24,35,30,22,50,34,59,66,74,69,92,127,94,109,111,118,96,110,72,65,47,56,72,70,70,76,45,72,75,77,76,67,62,48,50,29,8,1,12,24,56,73,57,79,116,71,126,119,100,102,79,68,98,61,99,104,51,44,75,61,68,48,51,80,91,87,49,79,41,71,79,64,68,44,24,15,48,19,2,19,40,7,28,33,22,63,80,43,26,41,67,55,65,141,73,52,4,25,4,29,12,31,36,20,27,29,8,42,50,42,27,19,26,46,44,3,26,35,17,14,37,35,31,39,9,9,13,30,32,32,20,28,18,26,9,39,21,25,45,44,12,19,34,22,8,39,30,35,20,11,54,14,5,19,24,44,35,29,34,39,30,35,75,83,54,44,20,7,24,25,19,23,36,49,29,44,5,40,42,23,37,11,35,18,50,39,14,13,16,15,16,13,25,13,29,36,17,16,24,7,21,24,20,14,42,50,57,18,5,23,21,26,39,17,45,46,20,25,30,16,36,20,25,14,35,14,12,29,35,22,0,55,12,10,44,34,27,35,29,27,52,27,17,30,15,23,19,15,36,37,109,91,40,37,15,25,41,29,22,29,30,13,24,45,15,4,14,2,30,18,15,38,39,29,17,15,27,34,4,0,9,13,15,29,28,3,34,16,19,51,4,40,40,25,51,33,25,8,23,33,30,20,31,12,16,23,19,22,23,36,23,29,35,24,29,43,92,32,69,81,84,135,85,98,149,159,158,144,139,119,159,163,188,208,202,222,223,220,249,251,237,245,248,235,211,214,207,214,188,217,226,201,201,201,207,178,165,169,193,236,214,234,226,197,187,201,218,215,214,214,210,226,242,241,241,197,202,189,108,32,5,14,12,15,2,20,12,21,31,0,2,13,22,8,23,25,37,33,15,14,59,13,17,180,187,186,171,148,140,114,156,224,159,122,133,123,91,32,14,58,90,51,30,62,35,19,19,63,67,49,76,91,79,38,24,44,40,31,44,47,32,22,31,14,69,21,49,33,28,19,28,11,39,59,88,61,91,121,94,157,188,247,175,122,164,222,199,141,75,62,57,64,106,131,197,116,89,86,70,108,44,73,68,107,168,214,170,131,185,184,146,113,88,137,224,241,175,187,182,126,192,166,156,75,79,92,86,160,195,157,147,197,244,241,180,83,88,102,141,222,214,112,84,91,56,95,123,73,80,89,60,53,53,54,58,49,55,59,76,47,35,72,62,50,117,130,169,133,127,147,153,121,110,132,119,93,107,131,116,97,92,39,24,38,49,24,34,31,13,92,115,104,94,133,118,123,137,135,137,121,129,140,137,156,138,142,133,130,112,98,80,50,33,29,62,39,48,50,29,45,56,72,106,126,114,96,110,141,133,137,126,154,173,168,179,156,148,124,81,49,76,38,89,28,29,72,32,42,27,50,63,94,99,82,110,103,127,106,133,165,179,175,129,77,95,130,133,68,19,55,72,33,87,6,33,48,96,71,60,89,129,117,109,119,128,141,149,172,167,191,199,149,130,137,146,131,80,85,52,46,78,61,30,30,33,16,10,37,7,21,2,31,35,42,7,29,10,34,8,11,27,38,34,35,47,19,28,30,34,43,31,41,9,60,111,55,46,27,16,31,6,39,36,64,71,50,49,72,51,78,73,95,74,88,73,68,76,46,76,76,52,46,72,66,78,61,86,53,81,66,74,52,20,22,5,26,15,21,37,51,76,84,68,81,128,79,98,74,90,78,75,77,84,98,77,80,62,55,42,50,79,63,39,86,77,73,60,92,83,60,62,38,13,31,13,15,15,16,9,46,41,39,44,39,27,30,55,31,17,37,29,23,40,82,130,58,34,3,17,33,14,10,38,9,46,23,42,46,24,22,53,51,32,21,40,30,36,9,49,35,46,16,9,8,41,34,51,20,4,9,27,17,13,22,28,2,30,27,24,23,34,18,33,29,25,20,26,5,26,38,35,18,42,11,26,6,19,18,54,29,12,30,61,94,71,16,0,21,49,30,19,36,37,18,42,14,40,48,41,38,18,11,18,30,37,23,43,27,13,32,3,31,19,3,12,18,16,35,25,14,28,26,25,28,35,54,28,34,21,36,20,37,14,27,16,18,49,35,27,15,35,21,15,28,29,30,30,18,13,18,18,28,21,10,24,3,18,33,11,3,25,22,50,23,33,15,27,31,19,38,50,121,85,49,25,24,50,34,29,52,18,11,33,25,38,40,24,8,3,28,22,5,19,24,30,16,8,27,36,23,21,17,27,18,28,32,13,27,27,25,50,33,29,40,23,36,51,31,19,26,11,8,17,17,22,37,36,31,18,19,7,7,41,43,19,55,112,52,26,25,47,53,27,40,56,68,79,63,38,87,83,110,113,131,124,167,178,149,224,206,184,181,188,169,121,138,167,194,175,175,154,145,150,128,164,140,98,117,118,127,122,133,106,107,90,104,83,120,133,149,128,152,153,177,162,168,155,140,97,31,2,8,5,6,8,12,2,31,12,48,2,28,13,17,13,21,17,22,0,5,5,22,4,252,234,232,180,140,130,151,186,234,166,143,165,151,121,12,18,80,135,86,36,74,41,30,59,110,39,75,84,120,94,70,72,82,53,79,68,60,34,53,55,66,80,66,95,115,94,77,48,74,87,140,147,99,160,120,129,201,233,247,145,85,187,229,229,129,98,99,122,114,109,142,156,94,119,134,126,124,69,65,92,142,186,255,175,179,248,232,214,115,42,103,162,188,139,123,60,45,105,42,39,32,27,69,80,60,58,68,81,165,166,201,152,88,115,81,120,198,195,99,73,84,126,120,126,132,110,107,117,67,7,62,158,137,145,135,137,123,64,107,75,121,164,129,128,138,119,87,70,37,29,31,12,30,22,84,73,93,96,108,74,20,105,134,147,94,46,127,175,163,180,185,80,139,168,128,126,112,98,46,60,17,54,49,75,60,60,82,92,110,99,56,4,43,92,151,146,146,143,184,201,174,196,177,176,146,134,107,60,91,111,105,80,88,64,64,44,44,48,60,47,38,52,101,106,130,141,147,157,180,204,174,187,159,131,123,102,110,99,125,75,36,74,54,24,32,44,51,76,105,67,52,41,108,182,164,146,192,182,170,152,159,155,117,73,105,85,78,49,34,46,2,25,31,23,49,26,24,28,28,25,44,24,25,24,25,3,37,22,29,8,36,35,17,27,21,13,52,27,51,39,21,25,50,11,23,37,64,27,16,25,26,24,31,23,42,14,3,42,40,97,66,62,78,80,88,71,70,76,73,73,69,81,75,43,47,58,37,48,65,84,44,83,74,75,81,68,43,50,38,25,38,19,21,25,82,58,87,98,81,93,86,85,102,84,84,75,96,66,64,78,71,73,46,34,53,55,48,56,56,83,92,54,52,79,61,48,65,32,11,21,2,16,14,21,40,36,28,49,49,19,32,34,51,27,67,36,35,39,24,18,34,112,95,60,26,14,13,20,29,18,7,27,29,36,25,24,14,28,28,35,16,11,13,44,13,11,8,16,9,17,40,32,48,16,13,39,33,43,1,33,36,32,28,40,30,36,39,11,10,57,11,45,19,25,26,12,40,29,28,19,39,26,11,36,54,25,37,23,23,50,81,62,34,20,36,16,41,36,23,25,26,15,34,16,22,30,15,28,28,17,56,25,23,3,13,39,44,24,27,32,48,22,33,47,16,28,26,39,30,53,38,18,15,41,33,25,47,38,26,44,14,18,5,25,65,30,26,24,22,15,26,9,30,24,19,43,26,14,26,17,2,1,27,21,11,19,2,38,26,7,23,20,25,15,29,26,25,49,71,93,78,37,27,32,15,2,21,24,22,5,7,23,22,24,18,28,10,9,36,20,11,23,32,19,13,22,22,3,15,3,13,45,42,41,10,8,41,18,21,49,16,16,47,35,25,28,22,16,32,33,39,24,20,16,20,14,12,41,33,21,50,23,71,115,54,31,30,11,57,58,32,50,41,66,61,72,49,72,72,57,71,56,106,99,71,92,121,108,112,109,95,66,109,142,145,151,113,131,70,90,103,59,61,96,76,59,56,66,45,74,57,50,31,40,74,97,66,69,71,95,90,95,101,70,127,98,7,12,29,4,28,20,23,18,23,19,5,1,1,13,11,17,14,27,15,20,23,21,26,19,252,252,255,239,222,225,245,236,231,167,130,133,169,106,18,16,96,141,66,63,92,75,66,56,78,81,95,117,100,95,57,38,60,81,98,73,65,49,43,56,42,43,81,70,84,75,65,67,101,83,83,62,73,86,82,84,148,216,208,130,83,141,147,120,139,118,157,177,171,172,139,83,111,145,162,150,151,86,47,41,76,132,149,128,129,181,139,176,126,43,69,122,153,90,68,1,22,56,27,46,28,48,76,73,65,28,26,30,44,34,13,43,36,28,31,41,34,56,21,51,110,114,185,175,189,197,179,180,55,17,25,72,119,88,99,79,73,32,41,33,72,85,58,50,57,45,61,63,41,48,58,80,90,114,120,143,122,186,129,63,29,136,187,161,79,26,106,89,110,112,82,80,48,95,59,53,49,46,40,18,3,54,62,99,107,144,148,157,187,164,66,44,84,150,142,130,128,129,133,122,130,67,101,68,49,45,31,29,32,19,21,46,67,105,85,55,122,126,38,75,38,65,142,158,193,187,167,165,159,106,94,96,69,67,38,27,5,12,25,70,74,135,160,169,129,164,174,167,161,129,44,73,139,135,139,94,72,94,54,32,26,45,13,12,20,35,31,37,2,19,29,37,32,18,14,1,43,16,19,27,33,46,2,19,0,23,6,24,33,31,61,31,16,29,37,45,50,23,30,32,20,26,57,26,42,39,47,29,25,34,31,17,28,24,34,55,60,92,99,82,83,62,62,41,66,96,61,86,58,89,88,91,49,67,46,65,64,73,72,67,74,83,63,69,23,26,37,42,8,25,21,29,68,59,76,92,78,88,90,92,91,84,81,78,77,80,84,102,50,38,94,62,62,32,71,58,58,66,66,68,71,76,68,57,46,38,18,11,13,28,26,34,20,25,34,53,76,48,22,31,51,30,65,37,35,7,31,44,25,23,25,39,132,99,31,18,12,14,45,33,33,12,34,31,16,34,28,33,38,13,65,10,31,12,28,43,20,13,21,10,27,26,25,13,23,11,17,38,25,1,52,13,39,5,40,21,34,37,27,32,31,37,43,30,42,30,57,19,29,45,11,22,20,22,24,22,59,30,46,37,102,81,36,36,0,38,28,27,20,25,32,33,35,34,5,25,20,32,31,29,34,40,42,14,8,44,40,28,37,25,11,32,27,7,34,12,21,22,29,35,20,19,49,14,34,42,32,39,31,37,41,31,12,35,40,48,12,20,15,16,27,21,9,48,16,24,13,15,20,7,22,17,22,36,11,9,39,9,25,10,45,16,33,19,15,31,28,40,34,68,124,67,37,26,13,32,11,16,24,25,36,33,46,11,23,36,24,11,24,39,46,35,40,21,26,22,20,32,40,17,18,14,18,10,31,23,33,21,28,20,19,16,10,29,16,24,15,19,24,44,45,31,38,2,7,32,30,4,40,51,58,54,137,161,129,108,68,40,45,33,42,44,52,54,41,76,63,58,68,78,55,79,66,79,75,60,39,70,104,96,58,60,83,142,129,83,83,103,91,106,63,50,71,65,56,63,68,16,47,56,63,41,51,55,45,66,40,34,39,50,43,70,37,54,56,62,31,21,5,19,17,1,15,19,24,12,16,13,1,2,17,27,11,0,4,30,25,16,6,2,250,249,246,255,246,245,223,243,236,168,135,138,151,114,48,61,128,130,98,130,128,105,74,117,127,111,120,115,83,119,55,35,46,57,88,122,41,51,43,73,71,73,98,106,100,110,103,85,136,81,24,48,93,122,74,81,170,226,236,112,81,186,124,80,108,141,138,135,170,197,177,103,73,77,102,112,94,67,51,85,50,44,60,44,60,102,98,122,133,124,67,70,78,78,54,35,52,39,40,46,66,52,72,101,79,38,25,44,47,62,25,47,66,59,61,82,88,61,93,75,38,53,74,103,111,106,141,86,30,25,58,101,90,67,65,70,76,21,55,39,75,110,109,140,146,144,161,154,162,187,170,169,137,161,139,130,107,95,96,48,38,56,77,67,6,38,39,41,65,29,59,47,64,95,86,107,102,134,131,133,132,98,113,134,145,125,113,139,123,91,37,43,47,69,28,44,44,41,19,26,18,13,40,66,93,83,90,121,119,87,99,119,112,130,103,82,112,102,49,50,70,55,42,59,61,40,54,53,40,48,54,68,92,131,138,142,144,159,150,144,178,182,173,178,153,156,121,112,121,51,27,39,65,28,10,24,37,16,18,15,6,37,16,27,33,7,16,19,9,22,2,11,25,19,19,41,18,11,9,51,41,53,29,36,31,37,51,66,67,68,51,37,37,28,40,42,45,52,52,74,83,114,99,97,44,6,43,33,46,54,39,51,63,61,45,58,52,98,104,67,75,68,28,25,65,77,76,74,59,94,35,80,65,53,57,63,62,56,43,63,104,68,43,46,22,1,25,16,27,44,68,84,97,86,122,106,104,80,78,80,68,73,72,85,82,77,76,81,48,73,63,49,62,61,59,46,62,70,54,95,65,82,56,23,26,0,19,19,6,7,18,57,13,57,60,41,27,57,58,31,44,54,62,47,48,26,38,25,41,14,28,29,77,94,81,38,39,29,32,38,4,26,50,31,24,55,32,24,41,36,24,44,19,40,24,28,23,10,16,35,40,28,41,13,44,16,14,17,33,36,24,35,20,15,27,46,28,19,22,25,22,43,19,17,28,27,20,23,35,26,27,20,37,10,37,24,26,55,2,32,70,88,51,63,26,19,29,15,28,23,34,39,23,31,28,42,42,16,41,44,24,9,5,41,23,27,15,29,33,48,19,30,21,36,31,0,37,1,22,40,42,34,42,34,17,43,39,26,29,29,28,44,34,28,30,14,17,48,17,2,27,6,28,15,23,20,27,26,16,30,8,25,13,8,47,28,23,36,34,7,34,22,40,10,17,33,5,36,36,46,100,124,92,49,22,17,12,21,10,22,20,36,8,48,26,29,27,14,23,5,22,38,17,11,50,28,11,22,24,19,16,27,17,35,8,36,11,41,31,24,10,19,19,18,23,22,33,42,13,23,20,15,32,16,32,26,39,30,69,61,64,151,212,208,208,184,154,93,60,24,28,45,19,35,63,48,46,48,71,83,59,40,64,80,59,38,33,86,132,99,93,54,87,91,85,97,94,90,69,82,76,57,59,85,79,72,76,33,76,65,62,49,36,58,56,82,56,41,29,44,41,74,84,44,45,58,41,3,21,6,31,15,29,46,10,23,26,3,6,7,1,24,3,5,17,19,28,9,11,33,235,255,253,237,246,253,244,219,213,139,132,135,134,183,176,119,151,189,120,101,91,47,62,85,133,84,76,60,75,95,68,44,25,42,97,109,46,64,68,81,74,65,99,89,47,90,73,97,99,71,45,22,71,104,53,82,156,202,174,86,55,171,106,63,104,134,73,66,123,191,142,57,57,33,68,80,45,66,103,88,68,77,37,30,26,29,26,104,118,63,67,69,54,64,88,55,55,73,51,67,56,53,92,61,74,82,119,122,104,101,92,125,94,62,81,100,76,88,111,68,24,36,23,32,21,32,24,9,64,213,245,253,215,158,173,184,192,130,101,46,117,191,193,179,177,172,166,134,180,157,107,94,86,37,32,12,14,15,48,35,23,43,87,98,33,65,152,159,152,179,186,173,178,178,187,157,178,189,167,170,150,89,116,73,66,29,32,31,53,62,43,63,34,66,109,146,134,166,139,181,139,143,192,196,198,178,189,165,182,150,121,89,62,43,43,7,63,59,18,78,28,44,40,81,114,116,154,120,120,122,143,178,160,183,208,196,185,136,146,124,117,92,66,51,44,12,45,21,37,14,50,26,37,16,20,4,37,8,19,14,13,9,6,38,29,8,19,38,15,4,40,8,38,28,85,76,24,41,32,70,60,68,87,105,158,139,141,147,99,91,59,61,62,87,108,115,98,117,112,126,145,142,146,148,78,5,23,36,52,54,50,57,39,32,38,43,45,13,38,66,64,64,36,45,72,51,68,55,66,35,71,25,55,62,56,27,76,83,64,61,66,28,21,26,4,2,30,62,72,71,70,101,98,96,88,80,85,87,93,50,71,64,97,77,76,76,51,62,38,52,48,53,58,77,65,54,59,63,49,54,64,8,7,7,7,9,51,23,13,19,37,54,28,25,48,41,57,61,39,54,41,45,60,37,42,66,44,33,49,56,45,37,35,76,109,46,27,18,39,31,57,40,43,23,40,29,34,42,29,41,32,30,16,15,18,10,38,40,47,12,21,22,17,11,29,23,54,10,19,30,21,20,26,38,20,15,22,50,1,32,26,40,38,30,13,21,14,24,33,20,38,30,28,25,36,29,40,15,14,16,57,93,58,22,2,24,21,30,43,28,23,32,37,19,30,49,25,53,31,19,17,26,25,33,13,15,15,21,16,42,41,49,49,32,28,30,19,40,41,53,22,37,47,15,17,34,21,49,6,6,45,12,20,31,33,35,16,33,31,15,20,31,43,25,12,21,15,30,26,19,13,24,14,54,24,37,43,45,26,35,19,28,16,34,25,19,42,62,34,26,51,94,125,87,28,17,36,27,37,24,18,20,29,34,20,14,48,16,38,25,3,44,21,42,37,21,30,20,35,31,20,15,28,35,16,40,29,12,28,19,23,11,5,4,12,24,26,34,13,24,23,32,17,35,24,37,15,25,34,67,138,223,229,175,185,201,213,204,102,44,59,31,43,43,37,28,51,39,39,52,45,74,54,41,73,82,30,42,96,116,142,80,30,38,35,86,42,36,41,32,63,51,53,53,76,65,48,61,82,27,69,74,54,47,66,60,102,75,52,59,51,73,50,70,48,20,19,9,31,32,18,11,4,18,11,15,14,11,8,9,42,3,27,2,9,10,21,16,12,3,228,254,246,236,224,223,221,209,193,118,115,141,162,192,233,161,156,149,78,102,73,33,13,45,55,79,95,79,106,109,94,66,46,45,78,71,56,65,74,54,91,79,65,75,52,69,52,86,97,78,24,8,62,86,73,69,69,84,81,31,36,92,50,56,51,81,45,64,79,103,74,51,54,76,35,45,37,62,69,58,89,110,71,80,72,47,40,95,75,109,127,155,130,88,85,80,40,36,48,74,69,79,71,82,88,87,105,107,55,76,80,82,67,50,36,29,16,33,57,27,25,39,42,111,111,71,98,118,192,222,227,228,206,145,141,187,238,129,62,18,41,98,87,80,83,74,55,22,22,17,23,23,23,12,33,31,20,50,116,52,16,72,132,171,83,58,137,118,152,156,116,127,92,108,94,91,120,68,76,53,71,28,40,43,28,26,32,57,98,104,58,25,44,182,163,174,170,178,172,175,111,93,128,147,112,131,114,86,90,96,51,62,42,42,46,65,92,137,137,121,104,90,72,210,198,209,170,142,146,94,106,109,96,86,42,63,26,17,16,11,14,19,17,22,60,16,40,19,21,22,27,52,39,14,9,15,21,16,50,30,16,25,36,39,41,35,15,53,42,30,33,43,79,84,107,136,96,45,83,28,142,142,109,125,114,94,73,79,62,74,81,105,93,111,97,131,107,87,143,122,112,105,60,47,45,13,13,56,16,36,54,40,22,29,56,32,31,22,39,64,82,63,55,52,54,77,53,64,63,66,80,46,55,56,71,70,64,80,37,55,15,29,35,34,13,43,59,65,83,108,73,87,110,84,89,93,80,84,90,81,77,72,62,69,49,43,67,53,25,43,52,71,30,71,55,58,74,53,35,44,18,33,29,19,14,28,28,21,38,23,33,42,44,58,72,39,37,46,38,38,38,50,49,17,38,27,50,59,45,45,56,44,52,35,107,102,49,29,41,57,65,18,32,34,22,26,45,40,27,14,40,55,47,30,36,45,26,28,18,34,36,8,20,22,36,13,32,37,40,28,16,24,30,26,37,23,38,43,18,13,27,21,21,6,37,45,27,38,28,13,31,17,40,41,49,43,22,28,46,35,11,82,74,45,14,22,57,35,15,35,20,36,32,36,30,23,14,38,31,19,21,13,11,20,34,26,25,26,14,18,14,23,52,23,28,28,46,12,17,27,20,20,27,47,35,28,29,43,18,14,16,5,19,38,25,31,31,13,17,14,34,37,30,8,7,13,31,24,14,3,34,33,33,14,41,4,25,18,30,3,28,46,15,23,24,19,52,59,31,14,22,53,125,87,63,22,7,36,26,19,19,14,12,33,6,18,34,34,31,28,21,37,40,46,17,28,27,13,4,8,52,8,54,6,37,28,25,33,32,27,22,24,38,30,21,18,31,10,47,31,24,37,52,41,52,57,53,91,138,146,162,160,95,152,127,115,147,201,162,137,113,89,67,39,44,25,18,46,51,43,38,54,67,58,40,51,66,44,59,44,68,62,61,53,57,48,46,57,41,33,50,69,44,39,50,37,52,59,42,62,82,54,60,42,60,52,68,82,48,51,70,62,40,42,29,34,22,29,7,21,12,22,26,36,5,31,7,15,2,5,25,33,1,9,5,9,23,32,13,12,247,240,224,235,244,228,216,208,153,125,132,123,127,167,218,156,116,118,57,101,104,38,66,70,84,134,111,152,158,166,131,156,152,121,140,157,112,116,94,140,137,138,113,139,118,130,101,128,118,123,128,35,89,141,126,103,86,114,100,97,71,101,111,128,144,110,111,105,96,110,88,55,60,34,74,64,45,57,55,69,63,72,95,134,110,127,93,150,147,185,246,245,202,120,82,43,39,35,28,44,61,80,58,54,53,80,84,56,53,58,40,34,42,27,20,52,25,51,100,106,153,151,191,222,241,238,220,198,225,197,179,101,61,26,35,76,111,74,53,5,47,40,75,105,80,121,88,113,114,154,146,137,180,167,182,138,121,146,150,86,3,33,102,69,42,49,75,57,24,26,29,48,40,30,38,57,62,85,91,75,85,122,118,134,130,134,120,143,125,130,87,17,31,90,105,86,80,35,27,11,11,36,30,57,87,49,71,59,89,78,149,162,172,148,137,154,148,140,136,136,105,42,38,102,113,83,59,23,55,44,43,65,21,29,23,2,18,26,38,23,42,36,13,9,26,25,32,33,47,31,49,72,41,32,47,29,49,72,62,59,70,96,114,84,47,59,91,115,107,76,50,10,103,169,160,114,91,67,63,77,64,83,63,52,40,59,76,70,88,81,93,101,106,105,110,101,107,112,79,82,52,49,21,18,28,43,70,116,54,32,46,33,34,61,59,50,16,35,49,65,51,82,70,62,68,57,84,100,77,102,61,83,47,82,45,48,36,16,40,20,38,14,47,60,82,84,93,85,94,94,115,85,83,87,67,66,70,83,75,103,72,61,38,59,75,81,48,37,57,45,58,51,48,77,65,60,42,63,23,20,6,16,22,25,34,30,50,63,36,57,61,39,29,53,51,62,19,53,37,38,67,56,46,59,30,69,23,42,45,27,36,58,19,49,31,78,99,53,31,22,37,20,25,2,22,29,12,18,18,25,29,21,39,30,25,32,43,29,22,16,17,28,32,25,10,53,3,33,6,23,51,27,54,35,32,30,25,18,25,48,20,32,21,19,19,20,37,15,30,31,9,18,45,33,33,19,30,37,15,24,24,55,74,35,19,26,30,39,8,20,30,37,20,42,15,40,19,5,12,14,20,27,53,38,17,10,27,28,10,40,28,21,34,29,39,42,24,13,43,42,17,31,18,54,30,29,16,40,33,24,41,51,35,28,40,34,34,40,23,4,33,35,24,4,29,16,16,27,16,5,27,14,16,3,14,4,29,31,41,27,13,46,5,15,24,19,71,70,23,11,42,28,65,83,114,67,28,30,0,25,26,20,17,5,29,25,14,37,23,5,29,44,32,3,38,14,29,17,33,18,14,10,4,5,33,33,31,19,26,30,14,39,14,12,26,35,20,36,28,17,31,60,89,105,126,119,114,156,168,135,186,112,45,85,37,55,94,140,167,168,164,120,111,42,28,55,28,39,52,62,62,38,66,70,68,52,53,52,43,57,57,56,41,62,36,45,64,68,50,39,44,33,34,24,37,47,55,52,27,27,50,34,54,49,54,46,72,43,44,25,57,38,54,51,45,36,16,32,9,25,6,13,42,31,22,12,14,21,6,6,0,10,5,14,17,9,18,3,13,28,249,242,244,242,250,251,233,197,205,129,119,112,145,110,94,76,126,133,73,143,106,100,79,108,128,153,174,175,185,179,176,168,190,174,176,165,184,166,191,184,164,165,182,147,180,185,173,159,132,136,127,95,116,160,169,133,145,129,169,123,154,160,151,169,142,173,165,143,151,106,94,53,86,60,108,78,68,80,56,72,81,42,53,82,89,83,74,83,86,137,157,148,112,81,66,71,78,2,35,17,47,40,60,93,54,36,58,14,34,65,31,57,78,117,97,121,171,202,252,247,236,229,228,218,225,208,176,111,78,52,28,3,6,34,28,29,62,93,98,96,133,156,172,199,208,226,183,222,195,188,236,219,134,174,175,145,119,99,109,51,32,13,47,38,31,34,76,108,112,118,148,140,181,159,179,115,145,208,223,230,139,220,182,187,152,138,112,118,83,81,38,38,59,48,46,74,90,93,111,98,132,162,150,184,163,146,114,152,144,141,156,142,156,90,77,86,46,38,47,33,24,15,13,19,42,21,6,22,18,35,41,39,30,10,39,19,21,14,12,8,33,19,0,29,20,20,37,55,52,73,117,136,150,180,165,150,178,185,217,167,154,194,198,183,92,84,169,209,185,158,94,75,204,203,134,117,81,42,39,52,34,42,34,91,69,73,98,115,84,118,83,120,104,119,105,105,72,70,67,15,41,20,65,48,97,132,155,175,192,117,50,25,42,63,30,38,41,57,49,82,78,53,76,52,61,84,75,85,69,81,70,53,84,82,42,68,25,25,17,3,24,30,44,43,66,81,96,104,68,86,98,93,90,83,78,64,96,98,75,48,60,56,52,54,47,49,77,78,48,59,84,47,77,57,50,36,29,20,23,13,17,38,33,43,48,60,53,74,64,52,36,18,56,45,28,47,53,56,39,48,58,28,52,77,42,41,30,26,50,63,53,17,18,42,35,32,106,106,53,19,15,11,39,36,8,27,30,47,11,33,26,25,25,40,26,14,10,42,47,28,8,58,24,8,28,39,34,35,9,7,11,10,22,37,0,17,32,22,33,18,15,15,45,20,10,18,32,24,38,22,53,30,10,30,31,21,47,22,35,30,37,44,140,36,22,14,13,42,31,16,40,12,27,37,28,22,54,21,17,20,29,7,15,47,27,33,22,17,17,21,44,36,15,20,24,31,21,29,42,6,55,15,30,5,30,24,24,7,35,61,21,43,28,31,32,10,32,24,26,20,24,15,2,1,14,36,29,20,5,28,9,6,22,24,23,50,13,20,22,38,14,19,29,33,13,39,65,65,45,35,46,17,25,48,95,99,93,20,18,28,30,12,23,32,31,6,15,17,15,16,18,23,50,5,30,20,5,12,36,45,25,17,24,12,17,18,48,29,28,33,20,31,8,20,21,29,27,21,28,37,46,102,130,115,97,102,120,139,129,107,119,99,106,92,77,56,81,95,146,133,103,126,84,51,41,21,29,32,41,45,71,51,52,34,33,79,45,64,55,56,65,50,44,36,38,47,60,73,60,41,59,50,44,32,46,59,47,68,51,33,53,67,29,47,44,40,41,25,84,41,42,70,26,56,57,48,8,6,31,19,33,31,22,10,53,6,4,19,2,35,22,11,5,18,6,21,5,22,14,45,255,224,242,254,251,245,209,230,189,123,103,107,94,86,55,77,143,180,133,162,175,131,90,130,160,163,169,174,182,183,181,176,180,183,189,173,199,188,193,210,208,174,182,210,203,154,184,162,124,174,119,41,86,123,204,179,126,178,171,157,171,188,153,154,139,153,148,166,171,133,122,141,100,86,110,73,92,46,42,58,54,57,32,6,27,38,41,56,18,14,41,50,91,84,83,115,44,32,2,30,53,39,94,97,77,89,119,81,128,111,134,186,178,211,213,230,242,214,233,201,176,136,111,98,68,52,65,23,18,33,11,32,86,112,71,59,67,91,158,155,184,200,193,171,167,116,126,127,123,134,156,106,105,117,122,87,131,140,127,104,59,19,79,127,77,116,177,210,190,192,200,190,158,182,171,148,149,121,114,122,125,88,101,102,94,71,102,110,110,64,81,25,36,117,148,193,203,185,190,151,177,150,147,140,110,120,112,73,60,34,34,53,31,31,21,15,17,33,18,26,27,46,42,43,18,13,43,16,31,22,26,39,20,51,38,38,57,66,57,49,69,60,61,72,65,72,141,163,168,191,209,191,207,220,184,199,214,214,234,164,162,217,212,175,108,126,165,188,202,170,94,75,184,146,70,73,52,44,41,37,115,118,110,125,137,133,153,123,139,79,116,122,106,74,75,64,59,10,38,20,35,86,105,131,148,129,132,172,173,120,33,42,87,115,69,50,13,64,71,77,40,72,84,55,72,88,83,107,102,119,92,61,59,55,14,0,8,29,36,22,38,79,65,87,62,92,77,83,62,76,86,55,43,44,92,73,55,66,51,68,50,69,83,35,86,79,60,71,47,88,94,67,39,33,42,26,14,46,21,10,27,7,23,53,25,41,36,39,45,23,51,59,68,70,42,49,22,53,65,55,58,62,52,39,46,41,36,40,41,35,48,25,66,18,13,32,53,82,112,43,12,30,16,4,43,27,45,43,22,33,13,19,44,31,13,45,43,42,24,35,21,40,1,23,49,25,34,19,39,38,29,34,22,14,49,41,34,54,38,21,24,22,31,26,38,28,23,5,39,44,34,44,37,18,32,30,43,33,31,30,24,65,95,79,9,18,5,27,24,30,20,26,36,18,19,6,23,36,5,42,18,29,26,13,8,20,11,23,76,45,29,11,39,20,49,26,17,49,6,44,30,20,36,22,32,35,29,21,28,41,29,11,10,61,57,20,33,38,6,33,32,21,27,29,19,2,14,51,29,22,29,37,26,36,21,10,27,29,6,20,31,8,13,21,22,25,63,90,22,42,2,25,8,46,88,104,87,78,44,22,1,14,19,23,34,16,15,26,21,36,32,33,43,50,55,7,18,2,15,7,19,20,24,51,7,28,76,11,46,8,65,35,17,21,14,22,30,28,36,24,27,88,92,107,56,83,90,99,86,50,74,104,120,95,77,81,87,83,91,57,77,94,88,98,60,70,81,70,123,97,110,56,110,113,105,82,88,59,82,105,98,58,81,71,83,74,104,88,61,62,73,74,52,21,59,79,94,90,96,105,96,73,64,65,84,89,84,119,89,83,86,62,47,74,66,65,48,16,14,2,47,5,18,30,8,6,8,18,14,24,7,4,6,10,14,30,11,20,15,43,241,252,250,244,244,244,221,212,170,108,126,132,145,104,86,102,138,128,110,114,132,70,73,71,115,127,137,120,123,97,135,113,127,115,145,153,139,140,134,131,125,101,148,122,157,132,160,138,43,72,28,0,3,74,96,101,65,88,106,123,125,114,148,116,92,121,120,131,102,112,105,118,97,87,81,89,61,59,28,37,47,41,54,51,46,82,104,105,75,80,94,60,136,102,110,144,92,19,39,54,132,129,170,161,160,174,221,195,189,207,205,210,220,213,141,212,129,109,65,41,42,32,37,34,7,8,49,65,55,41,36,59,162,187,95,41,19,54,85,155,147,126,128,81,25,32,52,24,43,77,66,75,118,122,127,121,181,159,199,171,67,63,167,171,123,140,190,169,148,107,119,85,63,56,22,2,0,7,13,28,125,72,90,135,145,175,200,217,192,163,103,55,70,114,111,140,84,97,49,67,47,27,25,25,13,14,57,10,31,23,12,14,28,34,13,3,48,1,23,31,11,43,45,46,32,19,14,26,25,57,28,38,35,56,112,152,162,198,245,228,227,248,205,217,123,150,231,232,240,222,209,179,207,220,197,192,215,198,206,133,121,188,164,187,176,183,186,205,153,136,71,23,90,117,98,91,97,90,95,108,111,128,134,143,120,135,134,125,111,95,100,53,25,35,36,8,52,49,62,78,130,130,140,119,126,134,127,127,83,32,5,9,94,94,71,72,53,54,78,52,81,78,64,89,87,84,58,90,122,126,58,36,34,32,26,24,26,30,33,61,59,69,69,43,76,78,54,54,79,42,74,78,79,72,45,45,58,52,47,48,67,43,50,69,60,54,46,45,56,55,59,23,15,13,32,21,2,47,5,30,47,36,51,33,58,24,43,32,51,66,51,34,52,58,58,49,57,43,18,59,32,76,54,50,22,46,27,47,50,48,57,54,31,27,45,43,38,45,91,95,31,43,46,11,5,43,34,33,28,26,21,33,27,39,10,30,25,26,10,37,33,46,10,26,27,12,18,56,20,30,37,27,19,28,12,44,45,40,39,23,44,21,32,38,14,29,12,31,55,16,31,6,31,42,19,26,23,18,38,36,62,32,75,76,35,49,39,16,31,18,54,44,58,28,30,8,19,17,18,17,35,20,23,41,18,18,30,29,46,35,17,35,7,21,33,32,5,38,28,50,40,20,40,38,18,19,35,39,26,38,38,29,28,8,28,7,26,36,0,34,25,39,31,27,36,36,24,27,34,26,15,25,27,45,17,20,26,39,8,53,35,16,21,45,25,47,97,82,13,17,4,20,17,28,25,60,112,122,72,41,0,13,5,14,29,12,10,27,44,15,17,8,30,11,27,13,20,40,12,15,22,30,58,35,44,42,24,10,37,22,16,42,19,24,24,8,13,34,21,27,38,55,139,132,110,104,123,141,73,91,105,102,118,142,101,99,79,101,90,73,84,83,83,109,127,122,130,138,132,141,151,148,150,149,129,169,134,103,117,129,115,104,100,95,117,108,120,105,95,56,94,66,81,103,84,112,127,139,113,169,134,133,146,137,140,161,144,159,153,155,163,120,110,125,137,100,12,3,10,2,19,19,19,20,5,32,12,9,7,17,27,7,13,32,20,3,9,20,22,1,250,249,247,230,238,238,223,221,170,120,144,118,162,146,162,206,132,50,15,4,22,51,17,20,31,20,25,37,58,36,23,23,55,51,21,18,58,37,52,55,41,40,28,57,26,76,123,106,48,41,41,42,45,33,23,23,37,35,21,31,55,33,44,42,54,52,32,44,67,47,51,44,55,53,68,44,25,24,13,34,118,158,175,168,160,195,195,191,175,160,198,171,169,155,151,157,174,168,207,233,220,239,230,220,209,236,210,188,149,120,106,114,82,66,103,67,36,48,25,7,10,43,74,57,29,25,54,62,53,48,42,110,229,251,212,120,25,10,28,51,32,25,40,36,27,34,42,46,60,40,14,64,74,90,75,80,69,63,80,74,18,31,85,83,49,53,49,52,48,31,30,21,38,46,85,82,119,129,107,152,134,151,159,185,139,184,153,126,119,64,66,60,29,34,12,28,8,30,40,11,22,24,28,55,18,43,28,29,29,15,45,10,37,32,11,24,13,62,28,23,53,43,28,50,9,7,17,32,36,65,69,67,109,132,150,174,165,199,195,196,191,175,203,146,106,123,165,156,197,143,159,158,169,149,121,164,168,153,154,101,92,136,140,154,160,156,160,139,96,62,54,38,108,122,150,65,65,68,67,75,115,87,84,98,70,61,99,82,62,40,46,30,30,54,54,48,68,86,115,133,149,141,139,132,144,134,132,79,19,22,62,92,63,64,74,89,81,77,50,54,68,44,40,69,106,108,86,121,69,60,36,30,18,39,28,36,67,80,79,66,67,62,57,71,57,82,67,83,58,62,81,66,72,74,50,43,74,35,56,37,51,42,56,94,79,63,67,25,66,20,11,43,15,11,15,9,47,31,65,34,58,32,41,53,64,15,57,50,73,57,46,51,41,40,61,43,30,47,44,32,46,59,55,65,65,36,30,45,44,27,82,42,55,42,50,37,57,38,113,110,66,31,21,35,36,38,40,22,18,14,25,34,14,23,23,35,37,37,22,20,29,13,22,24,26,47,18,36,26,13,22,12,47,25,20,50,17,12,23,11,37,16,57,43,10,8,12,18,16,37,52,11,3,39,51,31,18,51,47,23,21,31,56,82,88,34,2,33,34,25,26,44,39,51,7,12,42,51,27,6,21,37,33,32,19,48,32,6,29,23,30,17,16,33,20,15,26,61,38,33,48,20,24,41,17,22,23,25,23,28,48,23,31,22,23,23,31,14,36,12,12,23,38,29,24,32,13,31,25,20,24,36,13,6,30,11,51,2,22,18,28,41,11,10,44,32,101,24,27,23,12,20,40,15,21,14,40,112,105,90,40,32,14,15,42,25,22,11,33,23,5,9,36,15,20,0,13,49,24,27,42,38,17,14,41,31,17,18,14,28,50,15,20,16,40,34,30,14,18,24,48,59,85,126,110,142,151,162,170,142,162,130,146,120,141,131,129,139,110,124,148,118,154,155,128,115,137,150,177,150,136,120,146,167,183,152,141,124,111,98,109,116,109,124,98,134,112,120,113,114,104,138,134,122,109,140,156,174,176,162,192,164,150,177,191,160,166,208,193,190,165,150,148,172,191,93,28,3,22,12,14,3,0,5,9,3,23,30,5,0,6,18,23,22,12,8,20,30,7,3,253,248,248,225,230,249,226,215,159,98,168,122,147,112,156,167,91,27,15,41,34,61,32,23,29,13,21,32,43,20,22,22,27,44,21,29,28,17,19,27,42,35,19,32,80,80,77,45,35,59,33,45,34,29,41,46,20,16,8,25,25,41,51,20,32,68,22,33,45,44,24,45,52,69,88,58,69,70,105,109,175,253,223,242,224,224,203,195,177,187,210,207,203,185,191,229,219,218,228,201,211,175,180,168,92,111,78,92,122,117,85,41,12,25,32,20,7,88,64,45,74,52,55,44,46,43,23,45,44,24,63,148,242,252,233,197,40,4,32,2,47,17,34,19,19,36,35,50,47,42,32,28,51,34,39,19,32,51,47,53,47,48,27,57,54,101,63,88,89,102,128,101,144,123,144,135,68,141,155,129,95,72,43,26,18,45,45,25,25,13,27,44,53,26,28,59,32,38,40,23,16,39,30,17,23,33,25,27,48,21,23,26,18,40,19,27,9,50,34,29,19,18,31,38,27,16,21,28,39,25,28,38,23,19,40,27,26,30,52,24,20,43,27,50,27,57,10,41,46,43,17,55,55,33,36,64,32,50,34,41,54,63,52,84,46,30,19,41,23,38,47,57,21,47,44,46,41,49,35,47,24,56,26,43,17,71,24,49,33,60,66,35,61,77,103,131,133,157,140,110,108,124,111,141,136,138,119,108,81,79,131,92,67,78,59,69,61,82,59,69,73,71,89,93,86,112,97,23,64,36,27,14,24,35,76,101,73,86,83,70,96,49,64,44,39,51,81,79,56,49,52,72,59,65,59,47,50,69,43,63,82,66,52,77,55,88,55,32,17,22,7,24,36,19,30,31,36,31,33,42,61,65,33,52,58,33,47,51,38,40,47,51,27,65,35,71,43,27,46,36,41,29,27,68,43,77,51,31,58,45,33,17,63,36,46,32,36,10,23,72,86,89,60,5,19,26,31,21,37,32,18,32,26,15,26,10,10,47,53,19,46,26,14,24,24,42,11,34,32,39,23,36,5,32,6,33,6,4,25,13,52,21,32,19,37,38,44,43,40,8,30,44,10,21,37,28,62,35,34,17,41,55,46,102,93,25,5,43,37,21,29,31,28,2,32,43,38,17,21,28,20,17,26,46,46,43,23,20,29,53,30,35,10,18,35,26,37,22,46,43,26,35,17,55,20,33,35,35,57,25,28,36,31,35,24,49,28,28,24,36,16,11,27,9,21,44,29,19,29,20,7,48,20,13,30,25,14,30,28,10,28,41,34,33,22,56,70,20,31,16,29,34,13,27,20,9,39,52,104,101,59,54,18,10,10,6,27,33,53,27,27,26,3,13,25,24,18,25,15,18,27,16,22,27,25,31,35,7,18,41,30,28,8,16,37,18,9,25,45,34,27,85,82,70,62,94,162,179,178,200,197,189,150,198,170,155,175,150,161,158,178,198,172,162,194,157,167,172,154,166,153,142,171,160,140,132,132,134,107,124,124,119,106,129,129,127,128,156,142,123,130,136,138,130,153,128,152,167,158,172,158,174,180,158,170,191,187,207,156,177,201,166,169,169,190,125,7,10,0,10,20,3,12,2,20,0,16,10,20,12,8,8,5,10,4,0,37,8,11,34,250,249,248,249,241,235,225,213,146,115,175,140,151,76,56,56,24,21,22,22,48,39,46,31,33,40,56,45,23,14,33,12,31,18,25,38,32,43,23,48,41,9,41,39,69,85,40,45,23,27,54,43,46,53,41,66,59,60,38,39,46,57,69,57,49,46,32,58,103,110,86,102,165,131,147,157,162,175,222,218,229,237,230,217,177,156,150,131,147,160,167,142,138,153,156,145,129,129,106,77,64,47,46,25,25,12,40,74,144,124,77,79,35,30,24,41,34,56,41,46,30,30,48,54,36,40,32,66,68,25,83,153,243,233,236,225,92,16,2,24,40,31,44,46,67,61,48,64,56,49,65,60,63,53,64,75,110,85,116,136,92,64,99,123,123,142,140,151,166,147,142,106,89,63,74,19,58,19,31,26,27,20,11,27,29,23,29,5,40,28,42,5,48,43,52,24,46,55,36,30,29,29,23,53,59,50,38,29,31,20,42,31,27,21,51,13,18,23,53,29,23,18,21,26,18,29,23,30,18,17,29,31,45,4,14,24,36,56,31,28,23,24,19,24,50,60,22,28,14,3,19,12,21,27,24,37,24,32,32,28,55,45,58,34,29,27,7,31,43,41,62,46,29,33,14,27,29,47,40,31,29,45,32,54,56,68,68,68,71,89,114,106,106,132,113,146,111,122,137,126,109,98,117,114,119,131,136,146,121,92,78,52,58,95,84,91,82,62,85,70,112,112,93,107,62,46,31,9,17,10,12,27,60,75,99,86,108,118,102,96,93,78,63,85,50,55,80,67,63,83,73,51,56,58,47,70,61,58,48,44,69,70,58,55,37,25,37,22,30,42,27,19,54,15,51,33,43,50,61,51,65,52,69,54,57,54,65,50,47,50,29,36,32,58,40,28,31,55,49,53,42,37,35,66,28,33,30,54,36,33,48,66,63,14,31,47,67,58,45,49,83,140,75,25,23,5,21,25,8,27,19,4,18,27,27,13,15,41,67,15,36,29,11,34,27,26,43,31,26,25,30,45,47,24,2,29,39,19,42,34,29,31,26,25,52,39,27,24,20,9,9,19,45,14,14,15,25,16,23,26,27,44,32,73,63,54,34,29,19,20,63,18,21,44,35,37,33,6,14,17,41,24,14,29,23,10,2,43,26,36,48,25,19,26,9,36,14,38,21,15,31,13,33,31,55,19,36,29,47,41,49,28,52,14,56,15,33,32,8,30,24,13,22,33,19,26,36,45,18,11,14,12,40,38,30,9,6,35,43,17,12,37,15,7,11,71,61,7,36,19,10,16,38,46,25,7,32,27,69,82,93,74,39,17,34,40,30,23,18,17,23,22,8,21,13,32,39,47,17,29,33,41,16,13,42,11,35,31,13,14,37,22,37,12,42,14,29,26,13,44,71,74,92,124,96,69,128,202,178,209,191,196,203,181,177,197,170,173,149,176,198,197,199,195,180,179,173,180,179,170,168,176,164,175,160,180,169,169,171,146,146,136,168,157,152,155,163,170,168,152,176,170,163,160,177,141,152,170,166,184,170,202,182,183,169,161,164,167,189,170,183,154,186,185,188,105,12,1,13,3,21,6,7,13,5,2,15,15,2,17,27,13,44,32,22,7,18,10,11,40,250,255,250,229,224,238,224,214,141,147,154,194,205,84,21,24,33,32,44,31,54,32,50,37,21,41,23,28,50,30,24,41,26,25,24,44,46,64,51,49,36,50,27,57,60,72,85,51,51,41,38,37,38,48,34,82,85,112,105,166,135,142,162,147,196,182,209,196,219,191,226,181,205,227,204,221,222,229,203,198,163,169,168,164,145,148,171,138,99,137,118,120,116,84,13,85,11,30,8,33,26,24,32,41,34,19,61,63,75,52,60,47,42,86,141,129,91,65,30,18,35,39,63,61,56,30,55,80,28,62,132,152,178,162,182,159,99,64,31,35,52,52,55,47,50,112,87,120,97,120,164,170,163,144,143,124,146,149,139,119,65,32,48,71,89,50,37,59,39,32,34,44,15,11,38,17,27,22,15,53,3,33,17,24,38,20,32,37,45,28,41,29,7,33,10,30,7,50,45,30,15,23,4,23,14,28,12,17,16,19,17,36,36,19,14,36,31,19,59,36,24,28,37,40,18,19,14,22,40,6,24,29,31,24,6,27,0,30,7,37,27,24,35,27,45,51,14,14,0,17,5,56,38,26,30,52,32,47,46,37,24,13,37,47,51,12,8,13,31,7,58,35,15,31,44,36,40,56,33,58,60,43,97,98,108,115,77,115,137,126,82,110,123,148,118,95,98,102,120,127,102,121,100,133,123,119,123,99,59,67,45,48,72,55,62,71,70,107,89,116,103,97,57,50,29,32,17,31,19,58,94,104,97,75,109,131,82,97,86,78,79,70,92,89,79,81,77,71,64,70,89,61,46,40,67,58,31,94,35,69,83,44,66,26,18,18,12,30,11,23,30,22,46,60,48,34,63,69,56,41,42,75,63,41,59,70,49,38,30,35,43,45,31,29,51,63,35,40,29,49,55,42,41,22,57,42,58,43,37,30,62,39,35,42,44,35,39,31,54,37,36,72,99,73,28,18,28,25,26,30,40,9,38,26,19,32,21,15,37,54,37,31,34,21,23,40,36,32,33,15,58,8,19,26,23,26,18,62,36,20,39,16,47,44,34,18,39,31,36,26,34,10,57,37,23,47,20,40,31,22,28,36,18,61,98,42,32,21,25,36,27,31,18,38,26,45,24,12,36,22,40,27,25,21,30,13,26,36,40,12,20,32,23,42,35,38,44,34,14,45,44,36,35,23,28,23,23,41,9,31,41,27,26,28,35,46,12,9,31,21,54,53,5,39,33,20,25,11,20,31,15,15,36,27,33,21,12,7,37,15,23,23,23,15,48,83,35,11,43,28,18,19,25,17,18,38,28,20,33,51,97,84,76,27,44,31,6,22,10,15,29,33,31,6,31,11,24,38,8,25,15,47,30,46,17,26,26,24,29,22,18,34,18,25,23,21,24,28,36,95,83,105,135,186,154,87,64,108,163,176,207,186,195,190,179,167,205,210,205,183,187,172,183,168,182,184,198,183,186,172,202,169,178,209,187,182,176,157,175,199,194,188,185,185,197,204,146,170,169,194,188,190,186,186,181,190,176,195,165,170,178,179,173,175,180,210,181,187,163,160,166,145,166,180,166,114,17,7,6,31,13,13,25,2,18,0,22,4,8,7,16,9,24,22,7,41,18,35,16,37,244,254,248,244,237,229,215,191,98,130,156,212,210,61,24,33,29,55,29,46,31,32,52,51,10,46,33,48,29,59,53,42,34,25,44,33,59,40,75,58,70,56,66,39,51,128,109,85,82,100,140,131,156,156,180,215,235,206,200,216,234,244,235,203,231,234,229,221,229,221,200,170,167,154,128,135,103,109,71,38,79,33,41,74,144,169,196,169,106,95,93,124,80,45,0,25,31,34,58,38,90,39,45,37,36,62,60,59,57,39,37,71,84,188,255,246,204,129,26,34,63,99,34,37,39,34,53,58,79,168,191,141,86,61,71,91,57,92,90,120,162,112,145,135,140,171,163,130,153,138,112,111,126,100,72,45,26,53,22,48,18,34,48,6,69,19,19,50,29,34,44,40,47,43,41,34,49,19,39,35,25,40,62,51,46,38,45,84,36,51,50,45,31,43,51,50,43,57,48,36,45,66,52,38,42,19,46,61,53,52,61,68,50,51,50,53,22,37,58,42,32,49,43,58,47,33,31,47,37,27,62,44,41,34,66,45,54,32,55,42,58,52,38,66,61,55,8,35,19,56,35,39,40,70,50,38,36,42,33,42,47,25,40,16,14,33,20,45,16,53,75,36,36,25,50,47,77,50,33,53,42,68,98,96,109,89,86,99,83,106,102,106,111,111,108,108,108,90,73,113,97,98,136,115,110,103,63,69,46,50,37,52,76,84,57,58,78,91,70,55,58,26,34,24,26,27,7,73,74,88,118,103,111,115,112,84,91,67,75,101,94,62,77,95,71,68,78,63,53,68,69,83,79,67,58,66,73,49,63,54,57,35,21,15,26,24,26,22,51,47,41,34,51,37,56,62,60,41,57,55,45,44,73,39,62,72,43,36,56,53,59,38,35,35,76,25,51,41,19,37,18,48,39,50,42,23,23,47,32,70,34,45,33,60,25,48,58,24,39,44,31,30,87,100,66,33,8,34,39,53,68,31,19,5,39,21,29,35,7,0,26,21,43,44,28,29,38,42,31,54,46,16,35,21,19,46,18,26,8,46,38,30,21,35,31,47,20,5,28,60,49,22,26,20,8,53,18,41,51,52,7,35,11,42,91,63,57,7,20,27,12,33,37,38,25,54,11,35,29,34,38,42,26,34,16,27,38,27,36,30,20,10,37,47,42,16,37,9,31,44,23,18,55,30,23,33,14,46,20,42,32,30,44,20,63,21,33,29,17,14,41,25,18,31,32,33,49,24,13,19,6,17,46,38,14,8,32,2,40,33,35,4,34,12,57,102,37,26,52,17,35,31,22,34,16,1,34,22,15,12,88,98,116,34,9,15,26,25,23,17,30,49,9,53,25,48,30,20,7,0,36,10,6,5,34,32,20,29,31,20,11,56,27,19,22,31,25,27,142,159,98,72,59,123,138,62,20,44,82,146,178,197,200,197,155,148,220,249,229,203,177,179,179,190,168,174,169,178,167,170,170,173,173,194,193,177,188,211,207,188,204,195,200,189,202,200,197,175,186,190,199,181,167,194,213,171,184,179,174,187,188,175,183,173,211,204,188,177,207,197,212,171,175,189,184,96,10,3,14,13,9,0,7,4,1,11,9,21,27,15,28,19,23,23,13,15,9,13,12,10,252,251,229,244,230,221,210,166,41,49,133,197,196,63,6,11,25,44,36,54,30,56,31,50,48,30,45,51,61,52,60,75,61,106,69,93,83,98,113,123,147,166,165,203,175,205,222,227,221,233,215,235,233,244,225,235,225,235,219,221,212,211,209,177,162,175,127,98,119,107,117,108,100,80,52,71,59,46,20,22,9,7,8,62,101,147,137,124,109,77,98,92,97,77,8,11,54,34,16,41,58,45,37,42,22,73,101,115,134,91,86,86,148,250,252,246,241,148,30,30,94,100,93,58,33,46,70,79,112,157,169,88,55,49,42,41,47,130,135,84,145,93,130,82,89,66,49,46,32,38,69,23,42,16,34,5,13,38,29,38,4,48,37,46,15,27,22,17,19,31,39,63,19,22,48,47,62,31,30,57,54,92,86,137,146,175,198,181,224,184,191,192,171,176,193,192,193,191,174,163,200,199,194,188,206,194,184,159,190,188,202,217,201,208,212,200,193,138,180,210,209,190,204,120,78,73,88,128,176,175,190,184,176,185,199,190,158,177,178,214,212,213,195,187,176,86,36,14,44,88,139,160,134,160,156,157,141,149,152,159,153,155,163,146,130,97,108,111,121,133,132,100,81,92,89,126,131,63,51,37,32,53,92,101,93,78,61,72,76,121,58,73,110,95,81,72,101,96,117,114,107,56,76,47,55,49,35,37,31,43,45,49,74,63,66,45,65,56,55,52,38,11,48,21,33,32,60,93,92,84,77,108,89,120,91,102,82,97,87,66,91,87,91,78,61,63,67,90,103,57,86,78,61,74,63,66,62,52,49,39,22,11,44,5,14,18,37,73,59,31,32,42,61,55,48,59,44,64,12,58,67,70,25,66,31,39,45,50,37,47,58,17,40,30,41,18,39,38,54,34,63,50,45,49,29,37,55,47,29,53,60,19,46,27,17,48,38,82,22,7,38,26,12,86,73,74,9,13,45,45,61,13,16,23,7,22,19,12,36,42,19,7,48,23,48,25,29,39,27,18,19,61,10,15,22,34,9,29,29,32,1,18,36,23,83,37,11,31,23,52,43,28,20,38,41,28,31,28,25,24,14,41,23,15,97,82,45,30,15,30,20,47,22,25,22,10,23,7,32,36,30,19,27,9,25,6,22,19,37,40,21,13,23,48,19,33,10,13,24,34,42,8,19,40,34,55,37,39,53,40,52,63,35,34,42,21,31,40,22,11,20,28,50,26,24,50,5,24,33,37,28,10,6,18,23,0,21,38,31,22,16,28,23,31,82,82,36,24,14,19,20,30,12,4,26,15,14,10,22,29,30,59,90,94,56,47,19,28,8,31,26,14,30,16,28,37,39,5,14,31,41,31,13,13,53,41,26,12,34,33,4,19,10,12,8,36,24,69,169,172,112,106,47,38,38,8,42,4,37,117,124,148,156,208,192,209,244,255,204,186,208,171,207,186,181,173,182,179,162,186,186,157,162,188,183,174,159,179,191,193,190,188,167,146,172,165,175,186,155,190,174,171,195,181,200,177,189,203,191,183,174,196,165,180,180,176,193,189,184,183,193,157,196,192,168,108,32,5,19,32,7,20,21,10,2,19,15,0,21,8,13,8,19,29,20,0,8,9,9,19,251,255,243,221,223,236,220,186,43,81,138,210,212,38,20,17,21,39,57,73,52,35,49,45,45,60,61,91,111,129,132,149,160,162,182,164,179,222,218,226,217,231,219,217,233,233,218,226,207,211,211,184,201,182,139,150,97,89,71,73,50,63,43,39,26,33,12,32,64,66,118,159,163,87,63,84,52,80,50,70,44,11,31,28,87,155,131,116,82,116,112,144,106,44,22,47,73,68,24,45,59,67,60,43,31,81,110,103,110,82,85,135,191,238,238,238,231,160,43,14,53,148,139,123,48,64,149,145,121,63,48,81,68,66,43,23,37,54,30,22,25,28,41,37,37,14,14,15,21,7,16,12,4,52,35,30,38,33,46,28,35,34,46,52,17,34,68,45,41,38,71,44,56,101,108,117,167,178,197,187,203,222,209,223,245,240,239,249,242,218,221,230,202,237,255,237,215,214,217,236,247,250,229,247,239,231,237,225,223,243,250,244,246,230,237,231,255,249,242,221,207,204,167,78,119,139,219,227,238,247,209,238,240,237,242,242,249,248,238,227,228,199,173,122,78,94,131,196,206,219,233,240,225,239,226,245,233,242,231,216,237,202,212,229,227,178,173,177,194,223,233,194,143,141,140,99,119,145,60,75,55,77,101,74,79,72,73,50,65,61,67,56,76,98,88,81,86,100,117,87,50,49,53,39,64,19,36,39,13,40,59,52,79,43,67,57,58,49,19,26,16,18,27,54,74,91,90,107,98,107,75,84,102,74,95,72,98,72,75,88,70,67,60,45,45,68,94,80,76,69,46,66,61,72,77,49,102,46,32,31,38,8,9,44,39,40,47,54,67,55,57,47,57,54,55,42,54,61,50,56,64,48,71,55,45,42,48,53,30,36,52,59,32,58,41,28,17,49,50,46,55,40,55,40,45,43,34,58,44,38,58,50,52,48,29,37,52,33,34,54,75,13,46,15,82,122,63,53,25,25,63,22,13,9,11,4,13,2,13,20,19,9,26,15,0,22,37,13,25,23,8,17,22,30,45,9,45,26,6,45,18,23,12,26,59,26,27,18,48,30,20,16,32,35,12,15,43,47,36,42,18,46,20,12,69,95,24,25,34,29,17,19,33,26,20,33,27,26,39,44,27,22,25,41,22,14,19,47,34,5,15,50,19,32,32,38,32,42,54,45,32,37,18,37,55,36,33,30,30,23,35,40,12,46,32,25,53,11,26,33,39,14,32,28,18,38,39,16,26,49,31,16,31,18,2,22,23,28,15,23,9,22,9,26,56,83,15,37,28,19,40,9,31,25,31,64,24,20,17,8,31,15,77,102,68,78,23,20,25,39,31,33,9,17,19,30,34,24,33,23,28,13,14,10,49,7,23,22,20,22,23,32,21,11,28,28,15,91,184,146,157,176,144,120,79,63,41,43,82,147,125,119,161,208,156,177,207,180,113,145,146,165,164,137,165,151,167,160,182,181,152,146,138,169,160,139,173,169,164,167,148,163,131,142,149,171,143,153,189,160,180,183,179,188,171,185,174,189,162,171,175,182,172,181,180,178,166,194,160,198,162,153,194,183,167,113,13,19,2,2,33,13,3,42,12,9,4,6,21,3,21,19,17,16,15,14,14,7,34,14,235,255,237,248,233,248,223,169,77,86,136,197,247,76,82,77,82,130,121,161,151,147,184,171,175,212,217,202,230,244,240,209,215,162,226,216,210,194,203,180,165,139,141,144,158,93,38,108,110,103,66,38,26,11,23,6,23,31,9,60,18,33,43,22,34,50,33,35,25,58,104,137,102,57,62,83,63,95,108,133,158,127,113,63,106,157,145,122,97,122,107,124,114,32,0,61,133,157,146,137,173,151,96,36,9,23,48,45,25,43,117,135,153,181,219,196,193,126,15,5,70,144,119,108,35,70,121,113,53,37,25,44,80,18,38,20,4,41,4,38,40,33,16,19,25,19,26,40,20,44,11,55,47,56,30,49,74,64,50,44,67,68,64,53,48,90,108,134,162,171,181,208,221,212,225,221,231,235,233,228,198,209,204,175,150,158,163,155,186,193,179,192,184,185,180,164,165,158,185,205,195,169,165,183,198,201,205,197,203,171,190,187,201,213,215,214,202,203,197,75,68,98,120,157,179,224,212,193,195,216,209,215,237,250,227,199,181,157,114,74,58,26,90,110,174,168,208,226,216,246,213,211,193,211,180,193,205,197,171,190,196,217,182,205,175,166,120,119,147,181,169,185,190,142,123,84,93,106,82,78,97,121,119,78,81,74,59,74,72,71,81,91,104,78,98,91,115,89,81,84,54,26,45,53,37,69,54,67,68,113,52,42,47,84,47,48,46,17,30,10,31,70,68,74,110,102,109,133,108,74,101,76,77,60,69,88,84,63,91,59,101,49,61,59,86,81,84,94,55,61,86,43,76,69,53,39,48,21,20,24,57,10,31,53,57,29,61,51,42,60,50,47,58,37,45,38,66,50,55,40,56,53,58,61,51,65,52,75,47,56,36,43,64,29,37,40,35,48,32,32,40,21,44,36,55,44,31,37,56,26,28,38,48,43,44,33,46,58,40,49,13,33,32,10,48,85,120,91,36,9,16,29,9,15,27,26,9,32,27,22,25,31,23,35,27,33,23,40,7,35,31,37,27,36,26,28,4,9,30,16,35,41,35,13,27,28,14,30,14,16,29,13,40,35,30,31,32,23,10,24,32,42,31,29,75,72,55,32,25,14,26,33,17,3,20,32,52,49,30,23,25,28,24,14,34,24,30,9,16,28,27,23,38,25,13,7,47,8,58,31,30,90,17,22,55,19,52,47,41,37,38,45,45,26,23,38,30,41,44,43,36,31,13,10,8,11,11,31,48,18,19,29,27,7,19,28,36,27,18,22,37,15,1,10,95,60,23,27,25,27,14,30,21,24,20,24,19,35,38,20,22,20,42,39,109,98,26,41,26,16,21,8,18,14,39,27,21,27,13,38,30,22,36,36,14,20,19,34,36,14,22,29,31,21,45,1,43,179,188,192,201,214,203,225,204,161,131,138,96,100,82,53,93,142,150,142,177,161,118,157,136,156,148,157,169,159,146,145,143,136,128,115,121,154,132,131,127,122,137,119,166,131,160,127,116,143,146,143,157,142,125,154,137,134,143,152,150,163,172,193,169,163,200,189,178,158,160,171,168,161,176,187,206,175,174,117,2,1,4,17,23,13,22,17,13,15,28,28,33,9,22,8,24,15,36,18,8,33,9,26,244,240,252,251,241,246,217,206,118,129,182,247,243,150,181,188,184,224,195,214,235,242,233,212,225,237,197,226,194,217,199,192,170,133,109,120,113,92,95,64,79,70,99,65,58,43,2,31,10,7,39,24,58,25,47,60,31,45,33,50,81,55,55,34,24,32,33,32,56,74,30,29,26,53,21,68,42,61,97,143,198,240,219,130,187,175,200,220,187,169,204,226,230,145,18,86,178,223,240,216,203,205,152,83,31,68,74,85,53,87,133,142,93,81,122,136,109,63,29,19,24,60,61,54,33,41,75,47,70,37,61,75,33,16,5,14,20,26,31,28,25,37,41,49,59,48,61,54,76,80,49,62,131,136,92,90,75,114,147,166,180,180,204,228,203,222,213,240,245,236,251,239,233,212,212,207,213,192,199,183,184,175,175,175,170,150,146,163,181,144,155,156,148,136,167,151,168,177,147,146,150,131,137,146,177,162,168,171,164,153,161,165,194,198,171,107,111,104,57,38,68,145,186,198,190,184,181,173,203,199,181,190,172,136,100,64,23,38,76,99,131,199,167,197,182,209,217,201,178,191,161,153,170,131,170,173,150,157,180,180,179,158,156,201,169,141,104,103,82,107,99,115,102,103,80,92,110,105,79,99,101,173,115,109,97,102,116,103,89,72,77,70,76,89,79,96,64,85,44,42,52,53,76,53,40,26,32,64,133,93,55,52,44,46,3,18,10,26,48,47,68,94,108,80,103,108,97,98,78,91,80,74,86,70,50,70,69,65,58,85,73,58,59,106,69,104,55,52,93,92,59,98,81,57,21,12,19,23,35,36,19,38,35,53,36,58,59,54,53,24,69,38,42,62,26,43,43,71,64,46,58,41,45,35,45,42,44,40,51,67,25,58,66,50,23,57,74,47,36,28,25,48,27,41,37,32,39,37,42,38,43,38,51,34,16,36,34,51,46,35,53,38,48,38,13,42,83,102,74,65,19,12,15,28,21,20,34,40,19,26,42,34,6,31,51,11,31,11,45,29,23,25,25,29,23,37,2,11,36,32,18,20,30,38,34,29,21,32,48,61,36,35,25,37,41,39,13,41,21,5,18,38,7,12,55,83,81,33,27,27,21,11,25,29,30,37,35,32,22,39,5,20,36,15,29,18,32,62,35,8,29,21,20,49,39,18,32,32,35,35,54,73,58,28,25,41,44,19,21,28,28,19,36,26,23,17,17,5,16,10,21,32,22,31,5,54,15,12,31,30,15,18,9,12,11,19,16,20,42,27,37,7,34,30,87,42,25,41,15,17,26,5,23,16,49,35,13,7,30,30,14,32,28,40,90,117,86,39,18,12,22,20,49,10,18,48,26,10,27,26,41,17,23,20,22,46,33,45,14,33,45,30,28,31,61,27,139,192,241,250,212,187,174,202,190,185,197,164,122,98,65,26,12,45,69,94,170,192,213,228,226,175,159,179,162,182,149,162,170,170,143,178,148,148,165,156,159,155,145,151,145,169,144,135,159,134,102,117,123,129,154,134,122,114,114,129,152,121,130,135,133,150,156,142,155,153,175,159,149,166,192,167,175,159,145,98,23,4,0,13,7,8,25,1,10,13,8,1,21,12,37,6,33,9,60,31,5,12,22,27,233,230,247,235,238,242,242,205,161,196,180,198,208,148,180,205,187,198,184,160,151,122,163,125,131,131,97,90,69,97,77,85,124,112,111,105,109,120,117,105,144,128,127,139,159,122,62,25,1,31,24,83,91,101,90,97,102,78,96,75,65,59,70,55,39,31,28,43,97,73,94,77,45,77,26,35,19,36,50,152,217,224,242,250,233,244,243,228,238,249,248,244,241,185,20,25,149,177,161,119,103,94,69,34,29,50,156,182,168,153,119,84,50,20,13,10,3,13,27,18,34,48,32,56,24,38,45,53,53,37,80,37,32,49,36,46,62,47,79,68,89,112,139,127,163,176,202,218,245,208,95,107,229,209,216,234,240,219,242,225,238,225,245,225,221,203,227,198,199,194,175,171,173,192,193,174,172,179,187,178,202,197,204,172,152,154,169,174,156,152,186,189,168,164,164,151,170,200,166,178,153,165,150,170,157,161,180,143,193,175,196,187,181,152,78,31,50,92,165,141,180,202,234,226,197,190,178,156,101,124,112,64,46,36,70,90,135,165,160,178,205,208,226,190,175,183,175,177,167,169,163,177,180,188,156,151,144,160,166,172,170,154,163,179,161,145,103,121,92,79,62,33,5,56,68,36,55,57,47,106,124,141,162,130,126,135,121,122,101,105,100,101,66,101,97,101,103,88,112,78,74,76,70,84,64,78,49,54,64,59,37,30,47,38,26,19,42,47,71,80,105,119,101,67,95,91,110,79,74,85,80,56,86,94,77,91,79,72,59,61,65,66,94,66,63,60,60,80,60,105,62,42,18,34,19,6,5,25,26,60,53,49,55,37,50,41,84,57,37,50,69,47,33,74,53,51,38,44,50,63,49,29,68,54,50,25,26,49,60,25,33,37,45,20,51,30,36,71,31,53,36,34,38,43,26,33,43,61,70,37,34,29,44,62,56,55,66,32,25,43,46,29,44,24,30,34,37,95,99,71,14,46,18,16,34,34,19,24,57,40,23,25,28,5,35,36,16,27,22,57,35,20,31,16,33,18,23,26,26,13,38,38,31,37,29,22,18,46,52,20,24,31,27,48,28,31,38,44,29,30,3,20,20,8,2,85,94,44,28,17,40,20,24,24,33,58,24,26,30,11,12,34,57,52,27,37,29,19,30,28,27,25,21,42,35,46,16,13,38,56,21,41,68,48,15,15,37,23,14,45,40,35,26,39,18,6,35,8,10,38,20,22,24,6,22,39,35,9,13,16,28,21,18,20,33,13,45,39,24,28,30,27,44,72,79,51,57,2,29,30,7,12,45,31,21,29,25,37,13,22,51,22,30,32,49,92,104,88,58,43,21,28,26,26,2,20,14,39,15,27,35,24,27,3,24,33,20,32,10,39,32,22,15,17,22,62,184,222,208,161,145,88,104,152,191,175,202,200,146,149,87,7,20,14,10,44,97,207,253,233,245,227,212,250,244,217,225,209,216,213,224,233,250,220,220,232,233,212,204,234,195,217,234,208,204,206,142,201,185,154,184,174,169,155,172,168,139,162,155,136,147,141,170,142,148,160,160,151,147,161,150,149,169,142,111,103,11,3,4,13,24,38,20,8,21,6,23,27,11,0,25,1,16,17,26,30,21,20,20,10,197,183,201,188,195,177,161,144,137,113,118,120,114,103,138,123,120,133,129,119,129,91,111,95,113,83,120,38,3,22,64,102,153,128,165,151,136,173,197,187,190,192,203,192,202,190,118,129,60,19,44,115,142,160,151,151,114,145,133,139,112,111,124,129,108,56,44,89,110,202,187,186,158,109,81,68,58,78,100,171,241,247,224,229,215,235,255,255,246,232,212,224,171,100,3,13,68,70,43,35,52,9,38,55,20,102,169,236,184,117,54,61,30,23,19,13,5,21,28,30,92,101,74,44,47,61,69,67,75,66,78,69,117,120,163,142,178,192,204,215,216,230,225,223,217,220,238,235,227,156,86,180,218,240,214,209,226,253,234,212,202,183,181,190,182,169,167,180,180,165,172,184,189,170,166,153,162,176,159,167,203,193,157,190,178,170,145,159,158,169,177,167,173,171,157,166,159,163,167,152,174,149,187,170,166,156,175,168,182,177,123,79,102,57,101,113,137,189,210,231,221,203,205,177,163,108,62,53,39,9,78,79,100,136,137,156,165,198,197,197,181,188,212,149,171,200,187,182,172,143,158,190,185,164,166,181,150,176,141,157,164,171,164,170,170,131,112,117,88,84,72,8,7,22,48,18,37,29,46,81,149,168,151,110,142,156,141,131,119,97,93,93,101,106,107,78,112,108,101,111,109,95,53,82,96,73,60,60,33,6,32,35,6,34,52,75,61,95,93,81,92,59,112,92,97,91,96,82,68,88,78,71,100,67,98,75,88,26,36,58,67,56,81,93,63,59,94,102,78,98,45,28,31,24,8,21,39,31,53,59,77,69,28,66,52,58,60,53,65,39,70,51,49,41,79,71,51,19,65,23,45,39,26,50,37,52,50,48,46,56,57,39,39,27,42,32,41,35,17,21,34,28,54,35,41,37,37,33,65,36,39,67,43,42,28,50,57,46,43,53,27,42,19,30,12,37,27,38,94,111,82,27,22,28,19,17,26,29,11,23,25,25,41,25,28,39,50,44,20,41,53,41,12,11,17,39,42,36,19,29,21,20,24,21,10,8,29,23,47,26,22,52,32,39,15,15,26,26,36,27,11,41,28,24,37,34,118,64,41,42,3,29,52,28,31,50,23,27,8,49,18,9,41,45,34,22,23,50,30,9,23,37,36,40,16,44,24,14,44,40,29,33,42,41,29,46,12,47,16,13,40,39,41,39,29,26,14,19,20,16,44,40,28,7,22,32,25,33,5,34,18,44,25,22,23,19,22,12,24,17,34,10,22,66,94,22,19,19,7,8,25,22,17,42,22,41,20,49,16,17,4,31,28,17,30,29,49,116,93,56,43,23,2,16,26,30,21,23,18,12,17,17,7,14,24,27,38,20,13,19,26,26,21,35,33,67,186,213,218,180,154,124,130,167,169,161,173,195,177,200,146,119,111,115,56,42,109,195,252,244,244,252,252,226,249,249,247,255,250,255,250,246,235,251,254,226,222,254,252,255,252,252,239,253,247,252,253,248,224,254,219,235,237,226,252,234,222,255,216,211,214,190,188,187,171,185,181,164,175,138,156,154,161,140,135,114,15,8,22,0,12,21,12,6,3,9,16,17,4,20,21,19,38,5,16,8,3,32,1,23,80,84,44,59,58,55,72,92,107,138,138,155,154,130,135,134,138,146,171,130,157,156,166,148,161,142,152,67,13,34,102,140,193,187,199,166,166,168,167,173,130,144,151,161,127,106,122,117,25,41,46,75,131,147,132,131,155,133,166,134,151,165,161,204,198,70,23,60,157,205,229,211,193,140,103,89,112,131,151,171,215,208,195,144,139,150,180,166,142,106,68,94,49,21,29,60,113,138,103,109,111,127,75,20,15,64,150,131,74,75,33,88,99,92,79,76,105,134,49,19,130,186,124,89,62,97,101,127,56,21,38,110,151,246,227,229,255,230,228,220,203,215,177,181,192,192,186,199,184,74,73,154,187,208,181,176,160,159,174,138,144,163,161,159,138,178,162,138,160,192,186,183,155,188,148,155,175,160,166,160,158,155,177,150,176,164,170,158,178,174,159,139,135,141,140,141,142,129,156,165,148,189,167,159,198,159,155,108,109,60,38,75,55,157,188,184,211,204,203,189,161,119,87,48,9,53,32,100,113,145,122,160,149,136,161,151,160,164,173,161,191,193,158,148,153,171,155,128,137,129,151,157,139,170,170,186,156,153,160,158,147,162,164,166,166,159,129,104,64,60,112,51,23,39,44,60,59,46,81,94,113,129,115,132,107,118,117,100,66,83,87,104,82,105,102,93,127,89,110,115,102,93,68,83,80,85,51,23,9,42,39,47,51,51,73,108,99,75,76,85,88,63,94,81,102,75,91,48,79,76,90,91,51,86,89,67,65,51,32,88,87,80,88,87,79,73,64,77,41,32,32,17,16,21,21,37,42,50,76,80,59,48,37,22,44,66,34,44,34,52,16,46,45,41,52,28,30,32,53,74,29,55,41,52,53,64,47,24,43,58,37,35,41,26,52,28,67,20,43,52,51,55,45,54,62,29,37,43,53,47,23,20,38,47,48,48,38,30,33,21,18,47,17,30,26,34,6,15,51,49,96,77,34,28,22,32,23,24,6,32,42,9,24,14,17,18,25,52,52,0,36,31,19,44,37,15,40,16,43,33,19,13,31,8,47,26,45,35,38,37,44,31,48,17,49,16,44,38,37,27,33,28,34,18,20,36,74,100,27,42,19,21,34,37,19,31,36,30,27,16,21,49,17,39,14,22,20,5,37,38,20,36,25,19,31,14,26,11,29,38,29,17,21,42,31,33,7,42,17,47,37,31,32,33,36,39,39,12,3,9,10,33,38,28,16,38,58,18,23,28,21,19,14,32,10,6,30,13,13,30,8,31,39,87,67,30,23,11,0,29,29,26,30,17,11,52,22,34,27,7,40,16,5,24,38,20,49,85,149,115,61,30,0,36,13,25,37,30,16,20,32,10,34,23,23,27,44,33,26,25,26,19,20,54,27,130,202,230,174,207,232,175,174,171,175,148,199,182,164,200,196,175,189,150,126,143,169,176,155,147,166,189,225,252,250,247,246,255,250,250,239,243,231,252,248,252,242,242,251,244,253,252,253,243,248,255,247,251,238,254,253,237,249,247,255,255,253,255,255,253,255,252,231,229,228,228,232,246,228,200,201,187,199,138,135,77,6,0,0,9,29,21,8,30,3,22,12,12,11,6,11,5,20,1,29,36,11,12,15,4,74,44,49,28,21,16,80,112,143,157,215,245,200,182,163,169,160,184,171,176,171,185,165,161,179,160,163,65,24,18,51,116,126,137,99,103,62,90,45,80,87,66,85,94,45,78,83,61,17,19,79,120,180,150,173,181,193,214,205,186,201,222,223,210,236,132,20,82,143,237,234,237,220,114,81,84,92,161,140,114,108,122,100,85,21,63,82,104,135,137,165,161,177,49,28,108,193,201,200,172,164,151,115,45,30,63,93,81,48,66,47,107,83,116,179,238,229,244,111,36,114,165,109,42,73,161,135,136,108,22,25,117,215,188,244,211,208,215,190,170,168,168,155,162,154,177,162,157,143,32,80,165,142,173,140,113,112,121,122,95,172,142,133,167,174,188,146,150,142,141,160,151,134,157,150,162,151,133,152,159,133,172,136,130,156,167,175,212,197,187,189,157,164,155,133,144,166,162,119,188,187,172,203,174,190,128,27,14,59,103,139,181,207,233,224,155,131,61,53,11,22,25,44,89,102,148,150,183,169,200,165,207,170,146,188,207,185,193,179,191,133,155,161,145,165,156,159,152,159,160,176,168,166,160,148,156,156,172,173,149,150,141,173,160,167,142,152,98,79,107,102,100,51,65,72,69,80,124,94,95,103,65,77,63,91,59,92,96,74,91,104,103,122,120,100,108,114,122,116,110,87,89,59,42,28,7,47,40,37,50,45,40,75,89,98,97,66,73,63,67,83,87,51,89,90,36,54,74,92,97,55,94,56,60,72,59,70,73,91,90,73,54,81,86,80,51,60,35,14,25,3,31,16,54,55,36,66,69,30,54,60,40,54,48,66,41,49,56,28,48,28,45,36,40,44,50,44,44,37,61,44,38,32,48,63,43,36,51,53,57,16,48,46,49,45,55,36,42,76,55,52,53,41,61,45,9,51,47,22,48,31,13,50,46,28,41,41,34,38,23,52,55,57,36,35,34,42,45,47,32,91,117,74,19,34,5,43,15,19,46,28,12,29,18,35,18,16,54,36,13,39,27,25,25,43,23,24,31,21,12,30,29,25,21,30,37,7,13,44,26,20,30,29,35,10,28,29,16,22,21,11,20,26,45,27,34,94,105,50,27,15,27,54,20,29,20,11,20,22,21,46,39,27,29,13,17,9,20,30,24,32,40,24,15,28,34,38,26,36,18,46,37,29,41,45,43,31,26,20,22,60,24,35,36,44,29,35,14,49,13,28,8,23,29,42,8,10,19,24,5,24,11,32,29,9,14,21,16,32,21,21,4,62,67,42,47,39,8,32,26,32,19,23,27,8,24,14,36,26,24,41,26,38,10,40,33,38,22,88,120,107,73,35,30,28,34,23,28,35,15,22,30,16,12,34,11,42,26,10,42,31,37,37,33,101,124,139,132,99,189,187,176,164,143,173,136,171,186,184,201,183,174,197,208,213,242,243,168,107,0,51,83,134,172,200,205,182,172,227,247,254,253,236,255,248,251,255,249,236,255,249,250,255,237,246,255,250,255,255,249,245,248,244,244,250,255,251,247,241,254,254,247,250,245,240,242,253,227,236,242,232,240,219,202,194,105,3,6,5,17,3,15,10,17,14,8,12,12,10,0,23,12,22,27,18,8,12,5,15,8,142,144,70,29,40,33,80,176,185,202,241,234,234,159,167,166,144,181,146,149,117,176,137,107,99,112,74,27,13,31,62,119,112,89,101,81,67,70,92,96,87,120,113,112,112,133,153,135,21,19,84,174,207,197,243,215,217,230,227,224,227,209,189,187,203,67,1,46,95,153,133,114,75,50,58,43,77,114,80,50,31,19,34,26,14,56,132,212,255,251,247,240,229,144,24,101,187,185,145,120,115,104,51,25,20,77,147,160,137,122,150,147,73,125,223,252,248,217,75,28,108,80,68,33,116,207,179,181,129,50,12,122,200,205,218,187,194,185,171,175,189,179,164,171,196,179,185,155,113,45,124,149,156,163,174,163,154,145,181,170,175,160,138,163,171,172,142,168,161,156,143,141,179,182,160,158,162,164,178,151,144,176,183,177,172,189,196,246,230,186,156,155,182,245,197,218,189,237,236,203,185,206,146,105,93,124,160,148,179,201,215,247,174,123,101,65,49,33,61,121,138,142,173,217,249,232,197,215,211,197,177,197,247,208,192,183,231,218,178,155,138,187,221,214,194,174,214,218,201,167,204,206,170,189,178,215,199,188,189,218,216,201,170,159,161,166,122,130,92,100,96,85,54,98,97,97,100,112,95,95,88,91,93,79,52,91,117,71,82,77,92,82,116,106,117,113,103,129,114,85,52,33,25,13,44,34,36,37,37,85,57,122,83,66,97,83,71,83,88,91,68,61,89,113,82,87,96,110,103,86,72,44,57,83,51,80,46,67,102,56,88,80,95,81,35,29,31,23,47,2,30,34,41,50,40,69,33,32,69,40,36,45,43,33,70,81,40,50,34,56,38,58,58,76,56,43,66,49,90,39,52,26,40,46,40,47,51,35,29,72,27,35,45,53,37,38,31,51,32,34,34,41,48,42,37,48,49,36,44,66,47,79,34,19,41,23,47,43,24,20,24,38,37,37,18,9,34,11,10,41,59,64,116,120,48,11,21,16,19,27,0,21,18,24,25,24,27,67,21,27,29,2,24,26,34,5,14,20,2,37,7,3,48,30,47,17,44,20,15,31,33,25,33,12,50,40,26,41,0,18,48,38,33,29,22,51,63,71,46,49,16,17,11,10,5,17,44,34,39,7,57,34,20,14,33,23,31,46,29,15,32,49,36,28,38,50,6,19,33,31,25,15,45,41,32,36,51,47,27,36,16,27,39,34,31,16,50,27,37,43,25,13,13,20,40,51,46,23,28,16,10,26,2,22,37,25,25,31,9,21,8,22,53,99,71,34,14,42,30,7,20,34,26,21,19,25,28,22,7,18,10,22,3,28,28,13,14,14,47,50,121,107,83,22,3,31,7,24,42,8,55,21,34,23,36,12,15,27,21,13,41,28,39,91,101,32,61,74,55,131,125,109,124,104,127,127,163,174,202,197,200,177,204,231,239,250,253,202,146,8,18,48,45,97,140,161,133,139,169,236,237,230,198,198,235,237,227,228,235,246,255,252,228,248,252,239,255,249,239,244,242,237,251,250,245,253,247,226,236,251,246,250,241,255,255,239,225,254,211,249,236,219,227,241,211,101,28,7,14,4,8,11,5,28,17,12,7,7,13,3,28,29,5,21,26,10,28,8,30,14,168,192,128,57,37,60,101,149,151,145,193,217,158,87,100,93,100,61,75,80,43,61,66,68,69,66,56,31,23,69,116,125,142,169,156,178,187,198,204,202,212,199,208,209,203,221,214,161,81,36,97,207,220,224,217,231,218,213,206,206,126,152,103,99,57,22,33,29,21,23,13,37,18,25,17,90,77,85,86,73,54,38,36,72,105,142,189,236,238,245,236,244,219,83,18,33,97,87,76,55,50,30,38,31,15,45,142,205,184,199,212,168,90,61,148,164,159,101,2,24,84,73,63,26,131,236,165,170,118,16,6,104,164,179,185,164,123,109,144,189,181,233,253,198,175,186,222,223,111,95,227,245,225,180,181,205,222,186,165,188,228,237,184,188,211,232,187,165,194,251,244,154,176,242,247,198,187,213,242,233,162,208,242,247,175,145,158,197,124,104,128,133,251,245,195,163,246,244,234,147,137,91,41,50,112,243,236,175,133,133,85,32,16,72,187,217,150,128,240,220,223,190,235,244,254,192,179,227,248,171,168,235,234,201,147,173,217,179,84,118,173,245,239,222,166,205,219,221,145,170,248,237,150,161,228,251,233,152,201,248,252,201,152,161,162,133,128,98,87,113,87,47,43,108,102,119,112,109,130,117,106,98,127,114,108,81,131,98,97,112,77,111,102,114,136,123,107,70,13,23,40,8,19,59,61,52,85,92,84,79,88,70,57,86,105,63,72,72,70,84,66,58,88,91,86,73,114,57,83,52,42,74,55,70,80,69,116,78,127,124,102,75,54,39,29,22,23,8,30,17,52,80,42,44,61,81,65,53,52,63,47,47,50,27,47,69,52,42,23,46,45,41,36,63,75,57,40,49,24,40,48,60,39,42,57,30,19,21,29,42,30,30,50,53,41,43,42,45,23,45,34,40,52,63,47,46,39,30,25,30,43,33,52,54,31,46,63,35,42,30,35,54,53,30,10,25,22,39,21,35,23,0,85,126,85,64,35,6,38,10,21,18,29,24,50,15,15,19,31,10,18,43,38,35,24,32,39,25,26,38,42,41,29,22,14,37,31,19,33,24,36,10,30,33,42,23,33,25,20,12,27,46,20,55,19,8,30,91,92,33,19,20,20,24,29,18,15,39,37,30,28,14,14,26,33,26,11,29,27,12,39,24,34,38,22,29,19,24,32,29,34,55,31,38,29,41,42,56,33,51,25,45,24,45,18,26,14,28,8,26,7,18,22,27,35,8,18,26,38,26,0,5,7,16,35,20,21,20,15,20,39,14,54,85,44,27,9,12,25,21,29,29,17,18,17,30,26,36,29,24,33,25,32,11,27,20,17,42,27,24,88,103,97,52,40,11,19,11,36,28,27,16,17,13,23,27,42,24,16,23,48,47,91,138,87,38,87,62,52,86,121,136,133,102,137,172,153,181,207,208,194,175,183,202,208,203,243,243,206,121,100,68,46,21,47,70,127,167,154,147,144,147,133,148,139,144,172,219,242,217,151,201,191,195,199,206,228,234,232,244,243,252,229,255,255,245,253,243,248,226,255,243,235,251,241,211,222,233,246,255,241,241,254,232,255,115,2,16,15,9,29,7,19,22,16,28,14,20,12,0,0,2,11,7,16,8,8,26,19,18,146,134,124,26,47,45,73,113,118,115,112,140,75,49,66,58,91,105,87,109,110,119,135,133,152,162,170,39,15,78,151,215,249,249,245,248,243,245,248,247,239,229,244,243,228,217,236,155,57,20,66,109,201,179,159,140,121,83,73,59,50,65,57,61,63,32,42,28,48,40,51,25,69,79,108,101,51,38,50,100,83,57,92,140,202,237,197,190,228,179,178,153,128,19,11,84,59,75,93,91,97,77,36,61,31,73,124,156,131,123,177,171,28,25,83,89,78,63,20,29,124,150,115,63,121,202,214,211,143,28,12,105,154,201,158,82,69,75,95,128,142,226,245,176,161,200,241,210,86,149,236,250,202,167,220,234,181,150,159,212,252,206,147,174,191,205,183,135,183,200,212,137,181,245,241,181,151,223,246,228,160,167,236,240,133,116,107,101,85,116,146,165,202,228,149,153,165,188,77,44,84,83,22,57,93,126,98,45,36,90,118,99,103,146,230,242,184,182,241,251,194,148,179,230,218,176,130,219,215,180,182,197,186,157,144,155,199,99,111,136,149,211,236,187,125,172,200,167,136,140,184,176,141,168,207,217,158,144,184,216,204,160,182,132,148,164,141,132,126,131,127,68,61,46,70,62,111,91,94,118,117,86,114,115,100,105,107,112,92,124,128,93,123,107,92,64,42,34,31,26,51,40,57,93,89,105,88,91,72,76,70,92,87,88,89,114,82,89,67,88,85,85,83,81,74,86,79,79,46,80,56,40,70,70,60,73,94,112,108,72,56,27,29,35,40,50,19,37,72,46,59,63,48,46,37,43,56,18,34,39,54,40,44,55,70,45,43,41,31,43,45,32,52,53,50,53,55,60,60,50,53,51,53,38,18,42,41,30,63,38,57,34,59,51,43,57,44,54,54,37,29,49,31,39,21,18,61,20,63,38,14,20,33,33,39,40,40,39,67,36,13,34,7,37,15,43,40,23,97,20,39,22,29,94,103,95,80,19,14,12,17,38,44,30,36,30,18,33,31,40,33,24,19,13,16,33,28,10,22,41,47,26,32,45,21,51,32,28,15,0,38,26,37,32,5,42,31,26,21,10,35,52,25,41,11,16,46,90,78,42,35,33,30,15,46,48,21,24,33,57,36,29,33,12,21,19,18,15,14,27,16,39,38,40,35,41,23,7,40,35,44,46,24,37,41,37,18,35,20,39,37,34,26,21,36,24,34,24,24,14,11,30,30,18,25,29,20,24,26,25,20,18,34,20,7,8,10,35,5,17,13,22,54,69,35,9,11,7,8,45,34,47,7,24,47,8,39,27,19,34,40,13,27,24,24,5,23,17,20,58,39,79,70,94,51,37,3,20,13,35,39,12,32,38,20,12,46,13,0,19,20,84,164,169,116,66,159,124,103,125,139,181,203,161,213,206,195,203,208,196,180,167,165,203,209,242,223,242,220,147,217,180,92,34,1,26,101,138,170,136,143,172,149,128,139,105,115,180,224,126,57,56,101,108,135,121,157,215,245,214,189,167,172,230,208,221,188,209,253,243,255,227,254,247,235,213,224,240,242,255,248,253,244,248,224,108,1,11,14,14,20,0,23,0,13,14,29,42,15,9,6,4,5,11,21,21,23,10,10,11,113,89,28,6,63,60,91,130,125,132,120,138,128,127,168,157,167,183,194,213,215,208,205,194,228,237,203,101,22,88,155,207,214,233,227,216,233,213,215,206,198,160,158,164,153,131,127,52,16,15,31,31,11,20,26,10,12,21,16,43,44,101,139,158,166,36,33,118,170,212,223,230,196,174,94,46,90,67,87,88,27,38,56,103,127,100,79,88,70,108,113,117,120,32,36,126,185,174,148,162,154,111,112,99,127,119,94,85,55,81,85,77,28,56,150,178,210,201,44,38,120,105,88,46,43,165,129,202,218,44,1,78,156,212,178,154,126,148,135,130,159,157,166,157,145,181,221,120,68,81,189,165,159,153,158,157,152,133,148,177,206,170,141,153,126,153,102,151,155,142,94,124,125,130,175,135,163,147,163,134,162,161,183,176,125,167,173,168,174,189,223,193,227,209,127,103,76,61,20,38,76,69,42,32,30,50,82,96,167,162,199,176,199,171,213,188,174,160,187,161,138,138,152,162,153,157,144,179,174,152,167,156,140,130,159,159,174,160,168,199,136,155,160,136,150,168,163,172,172,156,172,157,152,157,168,180,161,163,152,162,128,156,169,129,159,149,153,151,160,147,123,119,93,71,62,84,94,102,97,107,114,115,94,102,124,99,106,127,97,103,85,80,53,4,12,7,23,35,45,81,102,124,106,89,89,100,91,83,75,90,96,78,58,84,110,73,81,76,72,100,102,75,81,62,66,80,50,68,49,101,68,54,70,77,77,94,124,80,76,37,20,36,22,18,24,39,26,75,58,54,47,52,34,76,45,55,34,45,82,48,57,61,55,14,50,52,24,48,61,67,22,51,45,43,50,24,35,54,45,49,57,28,17,26,41,27,39,45,45,32,31,22,30,27,55,48,49,38,27,30,29,47,16,36,38,47,19,28,35,44,2,39,41,30,50,22,23,60,37,18,9,43,26,42,23,32,20,24,45,26,9,10,17,12,85,97,85,58,15,10,24,32,40,16,6,27,23,26,5,41,29,9,14,12,32,11,17,15,38,6,6,22,12,2,31,22,24,23,42,33,26,10,35,19,14,40,49,25,24,8,15,17,10,32,23,45,53,31,102,75,34,39,21,19,30,6,40,22,19,30,23,51,41,29,24,26,14,30,40,8,48,19,21,37,32,25,45,13,59,36,46,48,28,58,34,35,23,56,39,49,15,16,27,32,22,32,7,22,3,13,6,40,26,30,7,11,22,25,45,32,31,32,27,18,37,24,50,14,25,25,27,28,79,25,10,14,18,10,36,35,31,39,9,5,22,16,8,11,44,46,17,32,10,46,11,11,8,9,17,26,9,37,77,100,102,52,24,38,10,41,20,52,11,37,29,4,17,28,35,38,68,121,140,125,111,111,126,129,112,119,164,186,174,156,190,179,175,193,201,202,184,195,200,201,179,178,161,185,168,161,195,145,131,122,58,18,48,51,91,123,159,154,170,174,169,168,149,179,252,148,26,96,139,134,130,135,145,178,209,174,124,82,86,142,150,157,98,159,245,232,247,240,235,252,231,226,193,215,238,231,240,236,235,191,182,108,0,27,1,0,10,8,12,23,3,21,1,12,4,11,22,6,0,22,19,14,15,9,42,11,74,96,38,23,70,76,160,201,193,203,221,202,219,205,205,198,215,242,226,224,222,236,233,208,214,220,188,99,20,42,82,164,174,187,140,110,131,98,88,77,92,69,70,74,24,22,17,29,27,36,59,40,42,48,52,27,16,11,44,67,73,71,67,111,79,49,61,103,220,239,236,245,231,127,36,31,83,91,96,41,20,9,36,75,48,33,11,53,88,132,183,181,165,32,25,83,125,132,101,89,84,68,24,60,159,142,144,162,114,123,217,152,21,131,229,253,243,232,72,29,91,123,33,39,117,172,151,168,169,48,18,29,103,146,232,224,231,190,209,178,162,161,148,145,178,174,146,58,37,131,168,175,169,176,166,155,168,178,165,169,161,159,175,160,157,132,162,195,138,167,150,131,155,146,137,181,148,145,167,149,144,190,161,169,150,182,209,222,228,214,174,178,126,56,24,26,22,38,13,59,70,62,115,131,162,177,171,196,207,225,200,199,174,180,176,156,139,151,179,169,159,165,176,169,175,155,132,144,131,137,120,145,162,139,162,176,192,183,179,167,148,172,147,141,162,146,176,156,181,136,189,149,147,143,183,163,172,169,164,143,153,157,163,152,142,155,155,135,128,140,133,117,162,115,97,102,113,92,100,54,80,90,79,70,78,66,65,55,41,51,17,25,23,23,56,77,54,109,127,149,146,112,92,99,99,78,84,61,91,108,116,91,101,97,78,73,79,75,87,89,81,66,62,40,64,56,46,66,71,68,66,87,109,96,96,92,78,24,40,26,38,13,42,43,62,62,56,60,67,80,67,55,45,37,34,69,54,51,33,49,32,39,72,42,45,48,37,47,41,50,50,17,18,38,51,41,38,43,42,48,26,36,52,45,40,49,59,57,55,43,47,52,44,16,29,40,48,35,57,40,33,49,47,28,8,27,16,65,16,53,38,19,25,60,42,26,18,65,20,50,33,19,30,40,37,35,18,13,7,19,10,9,28,32,15,76,103,123,37,33,27,33,60,17,51,22,2,5,36,40,21,28,17,23,30,20,29,16,16,19,65,50,10,35,35,10,37,36,42,28,30,19,27,30,22,29,34,27,18,34,24,37,16,30,17,43,35,51,100,89,54,11,22,6,20,36,20,30,33,22,23,46,9,44,16,23,14,36,15,1,15,29,42,17,36,19,30,12,11,12,50,29,59,11,33,42,54,22,43,38,24,27,29,26,25,21,39,24,8,13,29,34,36,24,51,12,25,24,15,22,40,11,11,38,26,26,34,32,1,17,13,76,81,28,25,24,35,41,33,9,11,10,29,33,34,27,3,18,42,12,24,50,17,33,14,29,12,23,21,22,24,18,8,73,98,92,82,25,28,47,21,7,16,23,11,32,40,36,7,38,92,78,61,42,44,77,63,53,65,78,89,90,70,76,111,129,141,121,135,145,110,133,120,89,126,144,139,156,146,147,171,153,149,181,120,118,111,33,25,58,102,138,148,169,181,179,168,207,235,173,54,99,148,176,160,173,164,194,230,181,103,37,126,162,136,151,90,148,247,223,253,232,216,246,242,205,212,211,225,207,199,173,167,142,112,96,11,1,19,19,16,4,19,40,8,24,11,21,30,24,21,1,7,6,12,10,17,5,1,6,172,176,87,25,58,77,163,223,218,247,234,235,201,195,184,188,171,185,181,182,132,156,143,157,146,128,120,44,13,24,51,105,56,90,52,32,31,30,54,79,56,79,76,54,76,61,40,13,35,32,87,114,124,92,95,115,115,92,105,100,80,38,67,25,18,5,22,72,115,136,110,111,95,135,91,60,56,95,155,103,19,29,92,141,138,121,115,106,120,147,128,113,52,22,24,27,54,45,22,48,29,28,13,41,132,152,161,220,193,232,240,246,89,100,192,238,225,148,37,28,98,99,72,62,151,233,203,196,137,10,16,53,16,58,120,209,211,194,183,180,173,176,178,170,174,173,139,18,74,184,173,164,166,152,154,174,166,172,156,181,169,128,164,147,158,158,154,207,182,165,165,185,168,182,151,165,156,197,167,172,183,158,173,194,200,196,199,221,145,95,47,3,15,26,33,6,40,89,134,141,182,189,217,200,214,194,183,187,183,181,193,175,150,162,158,149,189,187,170,162,180,157,168,161,165,171,159,142,162,136,131,159,133,152,152,179,153,173,171,171,162,183,163,151,157,138,153,149,154,173,131,120,130,134,124,131,135,168,146,157,159,169,144,135,124,123,136,126,106,134,102,127,119,147,135,147,123,111,92,105,65,74,81,50,108,48,46,26,30,45,59,75,58,94,109,120,136,135,139,111,128,87,89,96,86,70,98,82,98,80,47,89,89,105,77,94,78,101,83,60,63,69,60,74,73,91,65,74,65,96,74,90,108,93,46,35,6,31,19,7,18,42,38,59,58,52,59,71,59,52,48,67,38,70,62,50,52,26,28,51,39,34,35,41,44,43,41,60,63,43,54,60,40,47,20,71,79,52,53,58,50,37,25,43,61,49,33,67,30,30,20,28,43,51,33,23,35,26,29,53,29,39,16,36,32,26,27,36,59,8,46,30,45,18,35,37,19,35,43,39,39,34,26,60,43,17,21,10,18,14,14,22,20,13,28,43,52,78,116,107,68,15,27,12,21,27,34,41,42,20,20,18,30,42,20,11,25,10,34,34,29,12,47,53,13,7,27,42,18,22,36,24,5,6,18,24,29,23,5,54,24,18,28,19,33,37,33,38,60,86,75,15,57,20,20,35,25,38,14,34,33,29,26,33,46,44,14,21,52,32,28,14,37,24,43,43,22,23,27,19,66,48,44,24,19,45,44,36,53,46,29,24,19,38,14,9,24,22,19,10,23,31,4,19,10,27,17,24,40,20,15,23,39,39,17,10,43,23,39,23,11,82,60,7,30,12,15,16,38,21,25,27,31,22,14,25,29,10,34,20,13,38,31,45,26,47,38,36,11,8,20,25,28,35,39,104,92,66,45,20,32,30,7,50,50,10,13,17,63,106,69,67,56,45,57,55,79,52,56,64,75,47,50,50,40,72,42,46,38,64,43,44,79,131,165,191,210,222,221,201,237,189,220,220,124,200,188,93,77,64,91,115,125,170,217,210,211,243,249,179,82,123,159,164,156,161,169,197,219,184,121,60,145,177,181,172,108,154,248,232,236,220,247,243,230,229,184,181,178,192,160,141,154,132,144,101,11,24,29,1,23,3,25,6,1,9,38,28,22,10,8,10,23,7,19,7,15,5,19,20,203,194,118,36,61,80,155,168,194,244,233,220,189,158,123,131,121,111,75,63,69,83,51,44,66,42,18,17,39,44,84,113,105,113,130,135,127,133,132,148,156,152,177,157,160,157,153,111,42,56,128,197,229,191,200,184,161,146,125,100,96,103,74,39,9,29,24,91,129,138,118,139,156,174,193,112,132,172,182,178,40,38,132,182,159,144,124,89,122,119,90,38,21,7,31,76,60,96,87,100,116,159,118,65,181,164,133,157,165,159,223,150,40,31,105,72,44,25,0,32,71,89,62,27,122,231,247,219,160,32,8,122,137,34,37,70,136,191,184,195,194,175,213,190,168,140,31,60,162,198,198,175,169,161,198,203,162,184,197,172,181,183,194,168,164,187,170,162,154,151,192,172,140,173,164,174,172,165,190,174,174,208,225,231,182,126,83,69,40,3,12,27,29,81,122,187,210,207,191,212,199,218,179,192,167,187,186,145,154,155,189,193,184,169,171,175,164,161,182,164,165,191,164,164,164,202,150,194,188,168,185,160,177,179,169,161,148,147,162,157,177,157,144,167,169,151,146,163,187,152,165,158,181,160,158,150,135,136,167,143,144,182,181,142,123,89,128,154,141,122,110,119,90,93,85,83,117,129,140,119,145,111,101,105,124,98,107,98,102,114,110,116,127,150,111,148,145,112,115,104,102,117,106,60,78,76,77,79,89,63,63,89,100,77,71,62,61,54,73,77,73,76,80,67,83,97,67,71,60,81,85,82,87,36,6,13,17,8,45,33,41,59,47,51,72,55,54,62,56,42,40,24,61,53,56,57,55,44,37,54,50,37,57,22,50,55,41,39,50,42,67,49,51,38,51,41,33,40,38,24,49,41,35,64,52,67,60,50,38,28,42,30,16,34,18,46,37,31,5,38,19,44,50,35,48,41,31,46,61,57,37,35,33,35,36,30,26,20,36,51,17,34,32,10,42,31,55,22,45,30,12,45,23,34,39,38,34,32,68,135,68,53,17,42,11,9,63,28,27,24,27,18,13,12,41,31,27,8,19,30,25,49,36,28,32,33,36,29,42,30,42,43,26,43,43,40,30,45,24,35,40,31,22,37,7,15,36,37,31,106,77,67,11,19,22,16,35,35,30,43,48,27,14,34,22,48,42,27,10,7,13,24,31,54,24,28,39,31,13,57,50,13,30,33,30,22,22,41,27,37,28,30,10,45,41,41,36,5,29,13,13,28,22,33,22,14,30,25,18,13,36,40,25,32,26,45,42,45,10,2,28,89,71,33,10,10,17,19,33,33,27,22,31,28,29,32,23,19,17,23,18,33,12,30,34,24,25,14,23,27,31,21,39,25,29,56,155,98,80,52,49,13,29,1,37,32,8,73,94,95,62,16,56,59,43,64,48,22,37,42,71,73,46,79,58,64,39,35,39,10,28,69,193,225,234,238,254,251,252,251,252,248,230,189,96,145,188,124,82,79,54,51,56,109,185,205,175,176,247,134,47,89,99,124,134,147,157,183,212,141,85,86,162,152,183,147,106,155,248,242,243,249,218,237,254,236,202,173,198,164,158,130,136,131,134,130,7,1,17,6,18,5,16,12,14,7,14,2,8,4,12,13,34,34,6,14,4,25,4,25,169,186,74,13,33,100,126,147,145,167,167,153,77,66,66,43,76,50,58,70,61,62,73,87,100,112,73,16,26,70,118,194,197,203,212,209,208,201,190,218,190,185,188,184,207,193,220,121,61,21,74,167,177,140,125,122,138,139,116,110,95,138,144,184,107,24,56,134,185,224,235,233,234,220,174,123,127,188,228,147,48,23,91,100,122,104,75,65,78,111,56,51,37,12,54,137,180,213,201,212,220,249,157,90,206,194,171,149,65,25,38,28,12,18,28,48,25,26,4,50,17,86,30,38,107,148,181,230,198,58,25,116,212,161,47,37,46,125,131,199,188,197,163,162,70,16,56,137,194,183,185,167,175,168,185,169,157,167,148,192,180,163,157,135,196,169,139,174,164,187,137,164,166,166,191,184,197,215,222,198,209,163,148,108,44,4,19,21,83,105,134,188,199,213,225,214,209,212,190,176,200,192,171,173,164,190,157,133,172,175,166,186,187,158,160,184,178,160,173,189,166,180,158,166,164,166,176,159,186,177,165,196,167,165,176,181,191,165,192,152,158,150,151,135,163,124,143,135,143,161,199,193,191,166,163,191,181,187,152,156,173,176,190,122,109,111,120,145,135,133,98,110,121,103,96,106,118,134,124,138,127,127,117,125,129,119,113,124,133,122,108,112,140,125,116,109,113,97,119,65,68,104,63,88,69,87,78,67,55,72,74,76,56,76,67,45,49,75,61,75,75,78,86,63,93,93,73,70,80,82,56,36,27,29,25,18,41,40,54,61,72,63,57,52,64,71,69,68,50,57,39,59,27,81,52,36,35,48,48,56,51,45,36,32,55,41,72,45,53,36,12,38,48,41,18,53,39,44,34,40,43,49,27,43,39,59,32,37,35,34,40,50,25,6,43,33,31,39,58,39,44,50,45,33,14,32,44,37,57,32,60,27,53,69,25,23,60,34,44,38,42,64,38,36,57,38,37,39,10,15,23,8,2,19,7,29,13,23,41,80,124,77,51,34,8,33,25,5,23,21,11,46,13,15,49,16,15,40,53,17,14,26,13,16,45,12,26,51,23,60,25,28,19,21,35,31,30,21,38,43,7,14,32,30,38,32,25,20,20,78,94,57,29,41,24,36,14,26,25,13,5,20,26,45,51,18,32,29,13,13,31,25,19,32,40,13,17,4,44,31,25,14,44,32,24,29,35,27,15,46,52,42,23,35,29,19,25,25,16,30,40,34,32,10,1,30,6,17,32,29,37,25,35,14,34,33,28,10,44,32,33,93,41,4,32,13,17,32,6,11,4,19,34,7,31,24,23,26,27,28,14,17,26,27,2,5,15,6,27,25,38,26,18,16,41,42,82,103,125,47,51,20,22,17,24,35,56,107,58,76,39,34,61,58,42,64,75,45,51,53,38,59,55,39,82,66,71,17,37,40,131,216,222,235,213,226,233,241,251,242,251,247,244,192,92,99,104,113,109,118,12,19,0,22,46,68,62,116,197,96,33,80,62,100,95,104,113,156,215,142,61,96,149,150,192,169,89,172,244,251,232,212,226,245,249,238,239,184,169,181,160,156,154,160,163,114,1,0,20,22,36,21,18,19,23,2,11,11,10,19,6,20,15,5,0,11,23,24,29,10,116,105,26,28,27,61,87,83,114,77,91,62,93,89,91,96,114,134,124,151,160,189,184,175,197,193,145,55,29,94,160,229,225,220,218,183,223,190,209,186,148,135,142,89,137,103,101,33,17,41,57,114,105,98,112,110,131,133,166,166,167,211,200,227,129,33,41,149,204,243,243,213,139,69,77,44,92,122,138,84,6,19,50,49,45,41,48,52,82,134,129,178,90,2,102,177,226,235,238,189,213,210,134,84,198,194,210,213,132,82,82,33,1,92,143,167,193,122,18,86,71,60,48,49,123,163,141,151,169,50,26,122,197,217,183,102,17,13,20,74,78,79,61,33,37,90,155,173,205,152,164,173,175,141,174,168,160,173,153,170,179,172,167,171,194,176,173,174,183,190,194,187,199,210,230,233,194,196,134,96,63,91,2,48,51,98,143,190,186,228,223,255,197,200,193,184,163,175,163,144,158,185,176,175,182,161,194,166,138,163,165,148,151,184,175,169,186,171,180,161,156,135,168,177,158,143,177,139,147,156,179,138,159,149,175,136,164,149,152,197,188,155,159,162,132,144,163,144,170,183,199,178,171,162,195,181,208,184,168,176,136,155,159,154,138,89,91,105,128,113,110,120,112,119,99,117,112,115,123,109,105,107,123,134,134,117,101,127,108,113,108,116,117,118,92,114,116,83,106,87,86,91,68,72,52,88,87,92,80,92,63,95,73,53,47,59,69,78,49,68,69,78,77,80,68,87,109,99,37,30,43,21,16,21,31,45,44,64,65,37,61,57,55,48,57,54,49,48,48,54,51,48,47,64,49,46,64,54,51,66,26,41,53,18,59,22,32,33,52,31,48,58,40,75,43,40,32,48,28,35,83,44,36,20,46,11,30,26,45,10,51,72,40,40,44,46,16,50,29,28,20,13,40,45,64,57,48,74,48,46,31,15,32,25,33,48,30,30,23,22,28,24,15,43,47,45,25,24,33,32,22,40,19,12,18,19,37,50,55,51,98,153,118,47,43,13,16,17,37,11,45,42,44,37,45,33,66,53,6,31,36,47,35,28,52,23,21,46,34,6,13,14,5,32,28,25,28,25,35,8,26,43,40,35,61,6,34,40,20,47,87,87,56,23,23,23,22,20,29,13,14,11,39,30,9,27,15,12,35,44,15,34,42,21,9,34,36,42,14,40,37,21,29,32,71,37,45,21,55,25,20,40,34,55,36,15,22,7,39,24,26,8,28,23,15,28,7,17,16,3,18,40,40,21,17,24,7,38,50,12,65,74,21,41,26,0,31,18,29,41,13,48,28,19,31,33,1,43,13,57,38,14,27,57,15,17,12,26,30,26,28,32,24,61,25,33,33,55,119,89,75,50,18,28,39,77,103,82,45,47,39,52,54,24,55,68,43,39,40,45,60,60,67,44,62,57,17,45,32,45,136,180,189,215,221,202,188,196,196,210,247,239,240,223,136,105,88,121,160,147,99,12,9,26,7,33,2,53,170,84,45,84,69,96,88,86,61,146,168,134,56,84,117,165,151,141,88,174,226,245,251,236,252,245,241,248,252,200,173,183,177,186,178,156,191,113,0,10,2,51,8,13,11,33,13,2,22,23,19,4,29,11,19,48,22,9,8,31,13,17,55,15,20,31,47,107,107,150,141,133,181,166,163,176,176,190,226,226,223,242,231,225,224,220,216,226,182,62,22,75,133,160,169,159,150,134,120,108,89,100,84,42,66,63,69,54,61,20,8,64,120,173,155,176,192,206,195,205,198,209,201,211,219,190,155,28,48,100,187,197,169,64,27,62,39,54,75,100,52,16,17,41,71,106,84,91,116,128,146,189,195,242,116,21,74,147,172,170,147,127,84,59,3,62,153,164,204,207,172,157,173,86,64,148,219,213,232,110,42,48,102,99,55,46,178,224,187,183,118,3,10,108,168,201,173,175,152,122,46,37,65,48,99,107,137,159,192,179,171,167,157,168,149,170,131,169,165,129,148,154,183,174,172,168,187,175,172,175,200,197,203,218,191,187,127,105,82,9,10,24,44,58,78,170,204,188,226,213,203,191,160,188,188,192,183,157,171,146,154,181,150,156,169,165,192,166,145,151,142,183,178,149,166,178,175,162,186,165,184,149,135,138,141,147,148,167,183,145,151,155,162,158,164,186,148,120,116,144,161,161,155,186,152,167,164,192,176,190,168,137,142,167,190,171,169,161,187,175,191,156,158,185,192,170,116,111,86,93,106,89,80,59,111,96,98,96,98,125,119,121,126,100,128,112,122,127,119,126,111,149,103,107,121,108,102,116,107,88,103,111,66,44,82,69,76,84,77,95,74,69,52,62,62,52,51,62,77,51,66,97,86,75,76,91,98,73,36,52,54,35,27,23,32,26,20,60,72,66,49,46,66,47,61,49,47,48,50,67,39,68,55,54,61,41,30,52,46,44,55,44,47,41,32,31,42,37,43,56,36,35,29,50,34,52,31,35,28,55,78,21,37,37,25,51,27,25,53,36,32,46,18,36,10,34,32,31,16,59,34,52,29,30,59,49,29,36,54,26,24,52,42,20,27,20,81,53,17,23,26,19,32,47,36,16,22,25,18,35,24,4,22,14,46,9,38,32,24,16,53,39,26,46,126,95,82,51,26,14,20,29,35,10,31,42,25,25,40,48,32,42,32,10,42,50,48,48,39,23,17,52,21,25,16,46,11,21,36,22,42,49,53,30,23,41,30,35,37,26,35,16,91,97,46,35,15,20,29,19,13,37,20,11,64,16,19,28,35,23,56,30,44,21,7,24,43,38,15,18,31,47,48,15,47,32,30,34,8,36,60,38,18,29,47,27,33,31,17,24,32,31,28,18,36,6,43,19,20,21,7,28,31,32,27,28,27,44,37,9,22,26,77,60,22,13,32,2,33,20,6,27,19,43,18,21,18,12,35,34,39,42,3,8,28,15,45,22,30,19,48,30,29,20,22,34,14,10,21,26,45,107,115,54,59,29,69,99,82,33,45,63,45,31,55,62,57,43,45,9,27,38,38,64,84,60,61,65,57,28,37,50,102,137,175,219,195,213,193,200,175,216,201,212,202,172,148,123,104,94,195,220,214,204,193,144,110,67,74,139,179,66,36,85,94,126,88,68,66,142,206,147,45,41,78,103,105,111,67,165,225,247,254,246,245,237,230,239,236,234,194,183,185,160,194,184,197,106,10,9,38,19,15,10,29,6,31,1,30,7,3,11,15,11,9,17,31,31,4,9,13,4,143,103,8,64,74,114,188,196,192,199,226,206,215,203,208,193,218,224,205,180,169,192,177,164,127,146,84,13,27,48,105,90,99,60,81,39,82,94,65,75,74,95,88,144,120,180,137,45,20,61,141,198,184,217,207,209,195,204,195,194,130,138,125,115,53,20,64,64,115,83,117,77,66,120,131,124,120,150,141,87,0,54,118,159,175,104,120,125,149,194,215,212,81,6,48,112,120,92,60,73,68,72,15,77,147,139,166,187,164,156,228,85,42,101,171,114,90,56,16,28,13,49,5,37,174,191,251,238,135,30,31,120,185,207,183,181,182,183,175,156,123,168,178,164,211,204,184,185,147,149,188,137,142,157,158,178,154,153,167,174,156,196,220,192,198,195,166,161,153,110,97,65,23,17,5,10,8,6,26,95,167,178,206,206,207,201,201,184,189,161,169,161,149,157,156,166,152,151,160,154,169,176,161,163,156,138,147,138,154,165,166,174,161,187,150,152,166,157,173,189,155,153,154,152,156,166,170,181,167,179,170,166,165,170,174,152,155,182,173,175,174,190,185,197,173,166,186,187,165,164,149,150,162,181,169,211,154,156,138,177,188,167,174,172,181,92,76,89,73,103,104,73,90,74,87,124,143,121,103,146,97,126,101,125,129,107,87,102,123,97,92,136,100,96,99,111,97,119,80,90,96,76,75,68,79,72,82,91,89,63,55,65,79,83,62,58,69,71,67,75,101,74,95,65,45,59,5,18,29,30,42,46,33,60,60,57,45,48,42,45,31,63,42,52,63,65,42,60,61,82,44,56,73,8,69,40,48,64,58,58,56,57,20,50,44,42,54,30,71,59,54,54,40,36,49,39,48,31,14,18,51,55,42,46,47,47,44,38,23,58,42,27,28,46,43,44,25,42,37,27,34,29,16,39,27,39,25,11,29,31,26,41,75,33,59,41,47,24,53,29,45,60,9,45,19,53,15,40,50,16,29,27,35,27,19,34,42,12,21,29,25,7,19,69,96,81,63,43,26,31,35,17,54,45,22,36,30,34,44,16,31,18,12,37,20,35,5,30,33,13,46,42,13,14,11,33,41,35,17,11,25,47,31,8,12,24,39,12,44,22,31,82,77,18,9,16,42,45,9,46,42,24,34,22,24,19,22,29,29,35,16,31,26,26,49,53,31,20,34,42,37,37,20,49,62,1,28,43,25,45,40,25,5,31,40,23,15,5,43,28,50,21,6,33,8,28,22,54,14,15,12,16,4,22,50,41,30,19,17,49,91,71,18,34,14,10,32,9,32,32,35,40,3,20,25,12,24,17,19,3,25,21,13,22,23,17,6,15,19,42,17,35,30,10,11,26,16,16,35,71,95,115,71,98,128,84,19,63,47,42,45,44,46,55,43,83,28,33,39,37,43,68,43,48,60,67,40,59,4,31,51,73,95,158,186,204,211,188,202,216,200,189,194,127,121,146,92,133,203,253,239,232,248,248,240,252,233,227,220,107,107,108,129,112,122,95,84,173,239,171,84,86,34,60,64,65,80,128,222,237,252,243,233,255,212,249,240,237,228,175,173,202,208,181,171,122,4,8,0,28,1,9,0,28,21,39,0,24,20,4,11,8,18,10,3,7,15,15,31,25,201,162,47,51,100,123,196,209,217,192,193,210,181,181,171,159,152,139,142,63,107,80,77,89,79,39,19,40,36,60,104,119,109,106,137,149,158,162,178,169,181,171,184,202,194,212,188,105,26,73,152,181,202,158,166,146,127,146,100,92,94,60,48,23,0,13,43,102,155,160,189,118,118,156,130,128,175,235,234,145,11,64,146,150,160,128,110,87,96,130,93,79,25,13,70,108,145,115,136,155,169,196,93,121,204,218,172,161,96,110,98,47,28,30,57,73,34,25,8,15,44,55,37,34,63,176,170,240,180,28,14,118,187,206,190,158,190,196,174,188,191,199,193,206,183,178,164,157,210,132,154,169,167,167,188,177,202,233,185,184,177,186,169,131,99,117,85,55,29,25,30,23,34,46,15,43,46,82,128,185,199,193,176,181,173,210,151,178,183,145,178,175,193,184,147,164,176,176,188,161,176,173,149,156,184,161,159,154,181,140,174,144,161,189,167,155,151,168,157,195,186,170,166,160,169,172,163,156,171,166,171,167,161,192,151,187,201,187,190,167,158,165,158,145,157,163,171,166,173,197,193,165,181,183,159,162,169,173,150,157,197,172,146,169,144,123,82,98,89,96,70,105,85,68,101,94,112,105,127,102,108,110,100,122,114,126,117,79,112,138,100,111,117,66,94,98,103,117,110,139,91,77,63,72,91,98,96,49,72,86,78,78,76,81,91,83,71,68,110,71,91,69,39,28,39,25,32,40,54,24,49,65,60,72,52,44,51,42,68,40,52,43,46,55,58,66,65,51,58,19,36,65,44,42,51,46,50,49,83,44,25,63,31,20,41,29,31,39,25,48,53,73,49,43,73,57,47,37,33,46,19,31,44,40,37,40,32,62,37,29,49,41,13,50,25,38,43,51,72,40,34,24,25,31,47,52,23,23,20,47,44,8,15,25,32,18,49,58,23,57,42,35,49,11,15,49,33,16,44,26,32,14,26,39,52,8,30,14,19,50,11,35,29,29,74,103,109,83,71,30,15,13,35,19,18,24,5,4,46,29,21,32,26,6,30,32,25,24,42,43,22,42,41,25,22,5,19,28,42,41,64,36,44,27,11,41,22,54,17,17,37,56,103,53,30,13,25,35,27,35,26,42,12,23,43,14,48,33,14,68,12,50,35,63,31,17,24,35,27,52,4,37,45,50,44,34,47,32,33,26,30,43,36,38,46,39,24,7,25,33,38,16,35,21,25,22,19,26,27,13,21,33,18,16,5,8,19,33,17,76,84,30,25,34,24,3,11,6,27,17,35,28,18,25,44,22,20,21,42,26,28,11,42,42,39,49,34,46,29,59,21,9,35,9,22,19,8,33,27,35,68,113,132,130,79,61,34,36,37,49,42,57,60,51,54,52,64,45,43,56,39,58,60,25,41,47,37,28,48,41,32,49,55,40,72,131,152,183,184,198,218,220,169,119,124,76,93,111,143,200,210,238,231,247,251,253,237,244,234,186,196,200,153,140,125,127,205,209,254,218,197,141,154,118,101,118,109,187,246,250,248,233,253,208,172,210,252,236,213,185,191,179,199,183,155,99,6,8,0,2,40,17,39,14,10,6,12,30,2,7,8,2,16,0,14,8,22,6,14,12,158,140,44,43,80,98,133,157,148,120,141,84,96,87,85,90,49,75,59,45,48,49,78,98,92,97,24,13,48,81,157,172,178,210,194,213,198,213,229,188,214,202,200,212,184,166,161,72,22,57,98,154,135,91,76,73,57,65,61,79,82,101,110,140,75,24,109,160,237,234,237,186,126,129,123,144,176,191,216,129,6,27,85,110,96,70,30,50,51,82,74,76,25,23,95,175,186,188,207,214,233,183,100,124,207,217,216,174,84,57,65,23,39,15,45,41,53,29,29,18,33,25,49,22,104,125,147,182,175,4,14,107,159,185,173,153,170,199,176,154,166,166,151,151,164,152,166,166,161,154,165,166,169,188,223,184,203,178,158,136,112,67,48,39,19,72,47,102,111,86,141,157,198,185,192,182,94,152,134,131,128,133,157,164,171,162,149,178,156,127,162,160,156,160,172,150,165,151,173,185,169,164,158,135,165,157,180,150,160,153,155,155,157,177,175,190,175,163,167,178,186,187,174,178,167,163,189,206,160,183,142,150,177,186,162,164,160,173,158,170,145,161,141,173,164,173,157,187,186,186,186,181,178,158,167,162,177,165,182,162,177,171,169,167,152,120,112,113,80,84,104,68,63,30,80,69,75,116,129,121,119,124,125,106,117,135,99,75,110,109,114,128,75,88,129,112,117,107,98,89,79,46,97,84,70,110,65,64,74,122,68,83,97,75,93,84,77,84,48,66,40,21,46,35,0,51,39,40,46,58,51,59,29,49,45,47,78,62,59,51,35,40,51,65,80,45,55,67,54,34,54,56,45,44,31,71,43,64,74,74,45,49,71,44,48,57,56,37,42,45,37,35,66,39,47,34,29,29,46,46,37,32,58,48,36,52,41,24,19,62,64,44,43,42,31,62,23,35,46,41,52,53,28,25,36,28,35,22,48,22,12,39,23,35,13,36,60,31,34,37,43,23,25,26,7,38,26,20,38,47,18,11,39,13,25,30,29,15,22,14,22,18,28,45,10,59,97,112,89,49,41,40,32,18,37,54,22,20,35,11,10,32,34,35,11,35,41,10,26,14,25,42,7,7,23,22,21,50,42,9,39,38,33,45,9,42,57,40,42,29,28,32,91,81,17,20,35,29,58,47,31,13,40,46,29,32,35,46,21,18,32,26,32,35,39,18,55,33,27,35,34,47,40,52,45,29,17,35,33,17,36,27,34,25,32,26,28,15,20,26,21,22,20,20,44,45,34,26,38,16,9,34,8,22,35,16,39,24,39,105,53,24,12,28,18,24,21,20,8,43,43,45,17,26,20,31,28,46,12,21,37,0,38,35,17,19,27,27,38,51,6,16,39,2,21,46,39,27,17,37,51,93,113,107,51,26,35,71,11,33,64,61,40,42,80,65,61,61,54,57,34,49,62,76,52,72,38,55,35,56,37,52,35,60,41,18,89,80,93,115,144,169,154,136,117,101,89,87,126,186,197,203,169,169,171,209,209,193,191,203,230,188,110,75,30,98,176,240,253,248,248,252,231,247,231,234,220,223,251,255,253,247,239,227,174,212,239,246,238,214,196,186,187,188,175,104,23,6,7,19,23,8,0,24,21,4,16,30,5,17,18,19,24,10,21,20,19,18,27,10,144,106,24,34,38,76,103,99,63,67,62,95,49,62,76,73,65,122,124,143,140,154,175,170,179,184,90,27,77,117,197,209,223,199,221,193,227,188,165,166,168,143,147,88,101,102,57,13,18,27,49,103,88,96,88,120,104,160,171,169,153,230,212,204,129,55,73,172,248,243,254,215,143,134,149,105,94,116,118,60,14,25,52,51,47,62,49,70,123,178,205,181,70,55,100,184,230,217,209,185,148,113,42,85,165,159,222,213,105,48,23,14,19,30,45,3,45,21,40,48,82,72,79,108,160,209,240,186,118,11,19,143,187,178,168,183,174,162,184,171,184,171,168,160,173,192,176,133,183,189,201,192,176,154,151,111,88,61,22,49,15,39,104,133,155,159,170,189,159,165,210,194,200,186,171,199,171,177,184,121,171,164,138,141,136,145,146,139,160,166,159,138,139,127,147,160,143,151,138,152,164,142,165,148,154,163,159,154,169,171,167,131,131,169,164,152,164,166,157,169,150,153,155,169,173,177,153,161,165,160,156,182,165,166,156,165,154,167,165,185,172,164,167,197,179,160,173,160,176,165,187,165,167,167,148,176,164,184,184,162,192,162,175,169,205,171,110,97,93,102,105,87,77,68,53,71,86,102,133,105,99,108,97,94,126,136,112,103,136,100,128,119,130,129,129,79,83,51,66,72,75,80,68,87,81,87,121,75,89,81,80,76,96,93,118,98,51,78,33,32,30,19,37,31,30,46,70,80,81,81,73,51,46,69,66,70,56,47,31,27,51,64,48,73,46,82,54,48,65,57,19,52,55,13,56,70,48,49,60,60,54,40,68,39,25,44,36,51,67,66,40,51,53,44,40,31,36,42,44,41,37,40,31,48,52,51,57,27,54,14,55,55,45,29,64,54,46,26,30,42,45,34,22,35,21,34,15,22,39,23,24,27,64,44,54,47,36,30,28,32,6,5,41,47,32,51,34,33,27,23,40,8,48,29,34,28,33,12,29,30,54,34,29,27,39,49,23,59,87,112,70,30,26,23,30,36,16,36,16,45,38,33,48,18,37,33,28,12,21,38,20,16,21,25,51,31,34,37,17,40,35,28,45,16,41,25,37,53,34,10,24,33,73,91,42,32,23,8,23,36,16,40,34,32,25,17,45,39,5,9,55,45,35,37,48,14,20,19,48,29,58,56,18,24,34,33,35,19,28,31,49,16,39,22,15,29,33,44,11,3,4,59,25,26,16,47,9,19,24,39,29,24,34,13,6,23,5,19,62,110,22,29,22,16,13,37,19,38,23,27,7,13,25,21,18,37,17,6,31,19,29,24,21,32,20,11,30,46,10,11,35,39,19,24,19,38,36,38,26,42,99,102,66,83,69,84,49,65,36,35,69,20,31,59,41,62,46,62,54,67,79,54,49,52,49,71,63,53,52,48,55,58,59,25,35,30,55,27,19,32,59,101,116,106,112,119,105,94,131,176,177,155,171,162,158,155,156,149,168,154,161,136,43,21,24,24,104,202,216,254,255,245,243,245,249,249,224,242,234,252,238,249,249,233,220,246,241,246,253,206,202,196,182,179,174,100,15,9,8,15,8,11,0,10,17,20,41,13,14,9,8,17,23,0,9,48,12,9,19,29,61,32,11,50,55,81,106,108,126,119,144,136,141,190,190,167,196,209,197,209,194,213,211,209,215,221,84,50,32,75,159,188,177,171,146,142,160,123,106,88,88,87,59,86,57,63,55,15,27,68,146,181,164,170,181,216,219,229,223,189,228,220,223,205,105,38,77,113,201,221,195,161,106,150,113,87,47,0,19,25,38,47,60,54,34,65,62,114,182,219,189,195,56,17,92,134,164,115,130,80,60,21,20,104,133,133,155,209,136,33,40,15,10,36,55,45,41,20,17,61,114,149,177,163,168,195,217,224,134,0,26,121,169,167,166,164,160,180,145,161,176,192,163,170,188,195,210,196,159,150,116,132,59,53,22,15,49,52,108,134,150,169,194,186,174,218,206,176,201,174,147,153,167,155,158,165,181,183,170,161,171,156,138,186,156,174,144,130,158,162,167,152,146,162,164,161,147,167,146,154,145,139,148,139,122,136,150,144,173,149,144,147,166,151,160,163,175,148,149,141,149,180,164,178,162,165,164,184,180,170,197,189,176,153,157,167,167,170,159,183,200,155,169,164,182,145,159,155,146,173,173,165,168,142,164,190,158,147,139,167,152,177,153,156,161,167,145,122,98,82,71,73,116,97,80,93,112,122,125,166,132,101,100,131,124,103,123,138,110,92,116,103,102,90,76,86,82,25,80,59,79,75,74,90,84,82,53,82,73,99,79,113,96,68,60,33,46,26,15,39,45,74,69,74,55,53,49,61,68,70,40,38,67,40,48,36,55,28,37,57,46,59,61,49,51,45,61,68,42,60,45,31,31,64,48,62,50,66,65,37,41,45,47,20,34,27,22,48,65,58,35,43,54,40,33,30,43,33,38,44,35,37,39,9,73,37,46,55,44,38,37,22,29,33,46,46,13,42,25,53,19,19,45,46,12,37,27,17,27,40,31,17,32,58,34,11,20,22,39,41,33,14,31,18,11,25,51,14,38,20,31,20,33,16,24,17,31,5,39,22,16,34,26,26,32,10,6,17,51,114,114,95,67,15,6,15,8,24,41,15,28,18,26,38,4,43,43,46,22,21,29,17,10,43,52,36,34,14,31,24,23,14,24,27,38,37,30,44,43,45,13,11,42,91,70,16,23,19,7,15,27,25,21,18,36,19,14,21,37,27,37,54,34,42,41,25,22,55,9,23,33,33,44,40,21,34,43,42,24,37,19,35,23,47,20,50,28,10,16,29,11,21,30,29,23,30,22,7,12,32,9,25,24,23,33,45,8,43,85,78,24,46,30,32,2,25,25,24,21,26,10,23,19,0,14,15,14,31,24,20,13,45,18,30,44,20,21,38,5,25,19,45,42,11,28,45,25,12,76,128,105,28,16,47,123,146,92,81,95,84,114,77,55,68,10,59,76,48,101,41,65,55,46,59,49,55,40,57,31,37,39,47,34,21,62,42,67,65,49,39,12,50,89,137,148,152,130,134,150,157,216,236,208,214,181,174,177,181,174,155,159,155,97,82,35,37,68,74,110,182,242,232,244,241,249,247,230,238,242,242,248,235,251,249,238,254,243,250,241,200,244,196,183,220,183,119,18,3,3,10,32,19,11,10,29,7,11,17,7,2,3,9,26,23,7,29,27,12,24,22,81,40,21,31,86,127,156,203,182,200,210,217,218,194,193,216,199,211,205,221,206,192,179,175,163,168,42,43,30,63,133,112,95,79,94,108,76,65,71,103,109,124,146,148,155,163,130,20,35,92,159,212,232,214,226,196,226,193,192,169,142,151,136,124,48,2,33,50,62,93,103,108,96,127,110,109,87,19,23,22,21,53,66,93,80,88,98,120,137,119,126,64,31,40,82,120,87,58,56,43,44,8,46,102,194,196,135,167,107,16,14,19,36,28,20,33,16,22,23,50,90,95,86,97,98,124,149,214,148,3,19,113,174,164,189,176,185,204,181,171,190,215,173,168,121,126,107,66,50,22,12,64,67,104,93,146,151,164,178,203,206,190,213,180,189,217,191,187,146,170,151,173,161,176,171,180,158,166,162,169,170,170,162,161,178,174,157,146,165,181,159,174,169,142,169,152,151,145,174,177,167,168,148,144,162,166,146,160,139,160,143,162,150,157,155,132,153,158,168,165,162,199,158,157,167,152,166,183,158,171,160,183,158,173,165,154,164,181,186,165,146,158,170,176,166,177,166,159,166,165,182,169,190,171,170,170,156,193,189,166,166,167,147,135,162,169,194,223,172,142,51,37,122,83,116,112,101,153,138,139,137,144,136,97,123,109,105,103,98,114,94,90,74,59,77,66,81,60,101,95,68,86,97,111,102,103,86,79,97,123,73,80,35,38,23,18,20,42,48,69,51,58,63,44,71,47,85,45,57,67,59,74,58,54,74,50,38,49,50,62,46,77,58,51,62,37,69,73,60,66,49,24,47,56,72,32,17,61,42,61,23,39,45,57,65,49,38,55,20,54,40,60,50,37,29,28,33,55,32,30,38,39,46,44,59,56,40,27,17,22,49,23,48,32,29,41,35,38,39,18,42,13,41,34,47,47,18,56,3,30,43,61,37,20,28,12,41,14,26,47,33,37,33,17,50,14,36,41,31,24,41,9,57,39,5,8,41,41,12,38,34,19,22,41,14,45,57,21,29,48,92,125,91,89,67,17,20,27,30,9,29,33,40,14,42,58,5,54,34,34,44,52,6,43,35,22,24,9,45,20,25,42,36,32,30,11,29,5,30,30,40,34,34,86,80,63,47,13,19,26,18,6,13,11,27,15,50,32,28,4,37,30,57,37,32,20,11,41,22,38,35,27,45,25,49,44,38,22,36,29,46,47,21,39,37,38,29,12,30,15,36,6,29,44,30,19,42,10,28,31,24,33,30,47,21,27,36,53,84,39,14,40,14,21,21,34,18,16,34,62,13,17,36,15,24,21,35,22,15,12,11,18,3,25,24,47,23,40,21,18,25,27,18,44,21,12,27,99,96,73,44,26,40,181,203,163,151,103,134,125,155,111,117,89,57,104,98,102,63,59,69,41,39,34,34,46,47,72,48,58,50,21,52,39,39,48,62,71,71,65,49,57,85,131,183,183,150,160,149,175,252,227,249,255,227,233,236,211,247,221,225,213,234,231,170,163,121,81,71,97,124,139,190,228,225,243,245,225,247,249,248,234,250,255,233,221,233,237,242,204,219,194,205,225,183,93,0,0,9,0,27,35,25,0,8,21,32,6,10,10,19,5,3,19,22,8,16,11,1,28,175,85,40,72,112,170,181,228,210,214,213,173,198,177,185,201,165,167,158,118,154,115,104,93,74,64,14,16,18,67,108,112,74,63,110,138,133,156,170,183,175,213,195,212,189,225,165,62,28,78,170,205,198,160,176,156,110,107,118,96,107,58,47,47,15,40,44,89,98,112,140,119,158,156,118,144,134,120,125,20,46,89,133,163,135,158,126,104,97,74,79,48,21,17,46,68,69,53,43,30,21,35,49,168,244,245,212,159,67,1,37,0,24,46,44,15,36,25,14,56,56,71,74,90,107,128,145,176,147,31,76,123,169,198,211,200,212,184,178,162,150,99,102,83,10,9,22,57,65,106,127,157,166,197,165,218,211,192,183,199,161,164,158,192,165,153,150,170,143,148,166,146,124,155,164,167,167,155,181,144,180,178,133,147,157,162,138,170,149,150,148,150,141,159,163,173,145,162,165,169,140,139,169,167,142,147,144,137,132,145,143,165,166,158,149,134,169,141,164,137,146,150,176,172,162,161,159,121,147,158,140,168,158,167,156,153,150,160,138,150,150,164,136,147,165,167,176,172,163,171,179,186,184,160,166,173,173,157,165,180,143,166,158,144,153,149,193,222,255,209,62,69,99,119,135,126,88,85,95,93,99,94,105,53,87,71,66,84,94,74,79,72,84,98,106,115,96,78,68,92,69,85,71,106,103,102,107,81,56,61,34,37,33,38,26,59,50,67,65,53,35,40,42,45,62,66,42,62,57,59,58,58,47,74,65,57,72,54,47,47,38,68,28,46,44,47,35,47,39,85,68,57,67,48,49,33,54,60,45,54,46,24,35,39,48,28,12,35,20,47,41,27,52,62,27,33,19,34,38,19,39,52,23,47,36,49,43,36,53,39,49,18,25,33,58,29,61,43,31,33,45,33,30,32,26,51,56,43,43,31,17,30,2,21,53,18,9,39,11,30,33,37,18,26,25,26,21,28,37,28,19,41,26,37,22,25,33,40,28,17,30,19,26,29,34,56,71,45,30,59,29,36,79,119,98,87,40,45,45,42,44,8,22,38,20,17,17,7,29,32,32,18,20,28,20,33,45,54,6,27,12,49,13,16,21,36,10,26,33,37,56,36,34,31,86,58,48,33,8,28,28,8,46,28,23,41,24,47,36,32,34,46,33,47,46,27,22,21,16,52,49,14,30,43,21,14,33,49,11,46,27,27,40,22,35,52,8,14,16,20,11,11,17,23,21,28,23,8,31,32,10,44,13,29,24,13,24,67,114,23,40,36,27,23,21,35,37,19,24,20,50,1,28,39,35,4,21,52,19,12,28,39,8,29,20,17,39,20,23,41,38,28,22,23,34,34,91,128,85,35,29,11,133,199,193,174,164,109,153,122,140,109,112,100,100,115,121,99,118,110,85,99,114,96,91,127,101,111,103,84,51,34,48,45,28,41,45,31,28,19,51,12,37,54,108,126,162,178,167,195,255,252,249,245,247,235,246,238,246,237,225,244,230,246,252,252,194,163,166,104,39,37,65,59,86,173,171,175,196,211,228,210,240,246,238,213,255,238,230,192,185,195,160,172,172,101,9,0,14,11,3,8,6,11,6,14,20,23,30,16,14,32,18,5,25,11,2,20,4,21,208,112,33,71,91,163,188,194,172,170,142,139,146,114,127,97,95,92,101,73,76,37,72,29,58,33,27,14,46,128,167,191,174,181,175,183,165,209,185,203,189,214,188,184,200,196,133,11,12,56,97,128,102,95,79,58,64,91,72,121,112,125,137,129,11,18,102,167,197,187,218,171,157,161,142,149,151,206,127,84,47,65,131,139,127,103,98,78,101,88,58,31,37,23,23,38,17,11,26,16,11,29,33,79,175,226,203,197,76,18,31,20,10,20,36,22,30,27,33,59,144,152,160,158,167,153,142,142,99,68,137,213,195,188,173,159,120,98,55,37,28,24,38,68,104,112,156,174,179,186,199,177,210,209,181,196,178,173,160,167,163,165,160,154,172,159,161,141,146,149,155,175,175,145,147,154,150,122,150,172,168,155,185,154,157,161,170,147,155,154,138,144,170,135,153,157,167,158,174,137,154,160,138,149,142,132,137,135,146,144,145,144,154,158,162,150,161,165,127,115,154,148,146,153,142,163,150,124,141,145,124,138,159,147,116,125,120,149,130,126,142,138,145,147,120,129,111,112,121,123,123,122,152,155,140,121,132,132,148,146,142,132,163,121,172,118,163,243,220,239,132,66,38,117,102,103,85,100,92,70,87,76,72,86,70,75,60,56,76,87,90,106,89,91,107,104,90,78,114,97,84,90,77,90,61,86,67,40,29,21,58,50,48,53,61,70,70,73,50,34,71,44,83,60,57,42,65,73,71,52,67,63,53,69,70,25,64,82,55,65,66,51,53,44,62,53,58,57,58,55,50,48,44,29,36,47,60,71,50,41,34,58,36,40,33,51,36,66,58,37,26,63,63,32,43,28,33,36,39,44,50,48,40,28,61,49,43,55,43,41,36,52,39,37,28,29,39,40,51,41,25,28,43,45,9,27,29,28,12,23,20,17,23,35,34,52,29,20,30,33,17,48,43,29,31,33,27,27,32,21,36,4,26,34,35,37,8,25,35,12,44,32,32,25,25,33,9,22,55,23,31,33,27,84,135,99,94,73,33,41,22,23,22,14,12,32,22,16,29,13,6,33,35,29,38,9,33,33,51,19,34,13,38,36,30,48,16,26,10,9,27,16,11,21,58,58,61,32,20,35,44,27,31,2,47,13,22,30,21,26,10,38,49,29,38,58,30,46,31,29,56,41,43,18,22,37,34,31,56,34,27,37,21,52,21,24,14,13,7,46,17,49,29,24,2,2,42,26,15,36,22,17,32,19,29,9,13,102,45,20,22,25,42,17,34,21,23,2,2,30,39,26,14,29,16,36,38,25,23,13,21,24,31,17,29,19,41,27,51,7,33,37,18,27,80,120,131,37,54,32,38,115,180,203,176,184,186,158,134,122,103,108,104,146,118,126,124,139,140,107,141,109,137,143,160,166,188,163,160,151,136,145,144,77,68,46,49,41,44,14,29,44,40,20,38,77,109,125,134,175,247,252,237,244,246,236,244,250,252,255,252,251,252,255,244,251,232,248,251,219,133,66,27,26,17,31,101,150,174,168,179,158,166,201,187,181,191,185,188,114,161,145,152,138,148,87,3,36,4,2,6,10,5,30,7,20,32,9,4,26,0,8,28,7,3,11,3,3,10,21,168,55,19,78,64,118,110,102,105,89,82,83,78,51,59,50,71,92,69,95,114,117,135,133,149,142,45,20,111,156,199,214,235,207,189,208,215,189,156,175,170,128,138,139,106,100,59,10,24,38,92,67,120,75,98,136,143,160,173,183,198,218,207,187,64,39,92,156,211,188,192,167,129,140,137,110,111,118,84,55,48,47,81,86,89,74,56,53,44,33,41,32,29,20,61,28,22,41,22,38,41,43,40,83,98,147,181,234,83,13,8,17,29,34,24,16,29,5,23,40,69,76,60,63,63,59,35,48,15,42,129,172,138,84,64,43,21,31,41,60,118,130,152,162,159,198,191,197,202,182,191,171,154,143,146,151,153,151,158,189,160,134,148,141,141,170,154,163,151,146,161,158,168,149,154,156,155,175,153,154,143,153,139,166,147,157,156,158,163,142,156,137,159,155,138,145,146,130,155,173,146,142,158,130,116,147,182,152,148,130,143,127,161,145,151,162,131,163,159,146,158,155,140,163,121,143,155,169,132,157,127,143,124,148,135,134,156,140,129,160,146,140,134,118,112,120,120,111,92,106,105,113,154,116,135,87,101,154,140,129,128,143,190,133,121,118,134,185,229,255,168,82,39,73,114,131,96,112,79,105,106,92,98,99,104,104,105,71,104,90,121,115,124,83,127,109,95,123,107,107,87,70,65,52,35,35,43,24,51,31,39,61,70,79,69,68,49,55,81,49,46,58,40,46,76,72,36,44,52,58,71,52,32,15,62,67,41,40,40,51,70,52,23,55,37,54,45,43,44,41,46,58,48,34,40,41,61,64,44,51,55,65,33,58,33,34,22,26,30,33,75,58,54,12,39,15,50,52,30,36,35,39,15,44,29,23,44,28,36,43,22,51,75,29,31,6,43,36,30,28,47,46,56,31,51,35,42,33,45,28,49,51,46,48,33,21,28,33,39,24,39,28,30,29,20,38,18,17,37,36,20,35,27,38,22,23,11,18,15,29,61,26,35,8,18,20,29,36,35,42,51,22,32,18,52,92,128,120,104,39,5,16,11,36,24,24,30,45,40,18,37,20,38,27,27,33,45,29,22,24,18,31,20,54,57,19,19,38,16,34,29,25,25,29,54,63,68,14,8,11,12,22,20,43,33,19,7,49,3,18,36,12,33,24,20,34,18,28,36,40,42,37,48,6,61,25,42,41,39,48,49,28,41,19,23,45,4,17,3,22,45,22,30,14,25,36,20,37,41,31,43,39,40,29,27,7,91,92,58,2,44,4,16,25,21,23,15,39,29,24,17,27,29,31,6,54,36,16,6,25,21,19,33,9,22,5,21,24,54,37,26,29,42,87,111,72,76,44,18,29,116,205,206,176,185,178,167,157,156,137,119,122,144,108,146,128,157,137,158,156,142,142,135,173,162,157,166,154,161,161,177,149,159,141,122,109,143,132,106,92,57,84,50,65,21,40,58,50,68,104,180,204,229,237,244,250,244,252,255,226,236,246,251,245,241,252,250,228,236,244,218,192,145,107,23,55,113,173,182,198,225,196,172,167,154,181,154,144,146,127,114,129,123,125,111,84,15,1,29,10,42,16,4,5,10,22,8,9,23,0,33,15,8,20,13,7,6,5,35,32,58,19,44,50,56,74,97,62,87,94,97,129,87,124,140,162,158,178,169,184,203,201,191,206,227,147,73,32,84,132,179,202,198,188,174,157,147,110,118,79,96,86,68,50,32,51,33,1,64,105,140,179,188,184,186,207,214,191,227,193,193,190,205,178,63,19,57,135,121,104,136,140,128,116,124,124,73,62,49,13,5,61,46,81,16,37,7,16,26,17,17,21,25,33,26,15,38,41,48,60,78,70,109,140,182,159,168,188,56,31,14,7,17,29,27,22,41,36,11,3,14,27,15,20,29,71,56,56,42,42,56,60,49,85,75,140,132,148,163,203,185,212,215,196,164,159,156,167,162,170,170,172,143,152,141,134,138,136,140,143,140,139,161,142,136,167,154,141,132,138,162,150,138,158,184,174,119,159,166,170,132,143,156,147,169,157,148,130,164,134,156,160,149,156,136,144,157,146,140,154,162,157,161,160,168,184,160,130,131,153,149,122,130,154,144,140,142,137,141,166,147,158,146,169,142,136,147,142,149,141,151,143,153,146,176,169,183,155,155,150,157,169,157,119,138,147,147,157,171,138,154,131,141,147,115,138,141,138,129,127,143,151,136,119,102,109,144,213,211,206,156,160,76,82,106,78,96,93,110,87,104,124,127,99,118,128,77,79,93,88,81,102,116,116,120,93,88,95,99,47,30,41,48,23,18,36,51,41,54,46,78,49,73,63,74,53,75,51,64,48,43,50,67,36,68,53,66,45,65,56,45,55,45,48,58,29,44,65,52,53,59,45,55,44,44,59,37,45,49,46,52,42,47,49,53,38,52,50,42,38,50,60,32,54,43,45,22,37,55,35,43,44,57,37,49,38,70,65,43,43,44,30,41,40,38,35,21,45,38,44,27,42,34,58,29,36,17,32,22,17,43,31,41,53,47,37,34,22,30,34,37,48,56,28,31,54,23,33,15,45,13,41,28,18,11,17,44,48,35,10,48,38,32,35,37,18,9,25,45,24,24,32,53,19,6,23,13,21,33,47,41,32,17,16,9,20,45,113,113,124,78,63,30,24,12,22,28,40,39,27,29,24,50,43,41,47,41,54,35,32,37,14,54,20,21,24,40,11,43,32,18,44,46,32,32,53,93,65,45,11,29,10,29,34,31,22,37,49,33,27,22,27,28,12,29,35,24,16,16,28,29,43,33,30,32,39,37,36,36,39,36,36,60,33,18,26,5,34,26,19,19,25,31,16,25,17,19,29,16,5,34,12,22,21,0,27,96,75,15,21,17,33,53,12,18,48,6,29,13,23,49,10,9,33,18,40,54,34,42,36,36,19,27,17,27,36,23,55,23,35,0,50,121,110,91,36,25,29,29,103,163,186,193,154,188,191,167,196,175,149,142,134,125,153,166,140,178,174,158,156,151,166,173,175,185,159,173,145,131,167,157,170,161,144,149,145,147,136,135,136,122,128,89,84,63,11,39,7,14,11,77,103,125,128,159,180,234,232,247,246,239,255,250,254,253,250,253,242,250,230,251,247,242,211,150,122,123,155,212,237,242,224,194,191,164,169,181,149,154,164,187,173,161,146,108,69,13,17,12,1,24,5,13,0,9,7,11,26,9,9,13,21,12,22,5,32,6,12,4,9,43,40,26,78,105,127,147,162,163,192,172,205,205,199,212,185,215,209,213,229,186,204,188,175,166,139,14,11,69,125,142,148,152,105,78,78,79,69,76,65,69,103,90,137,129,167,78,30,90,110,179,191,196,195,219,197,211,212,201,167,136,135,135,86,1,29,68,88,134,101,132,143,130,144,124,97,64,41,34,1,39,19,21,21,24,24,9,36,14,17,16,34,30,26,35,36,17,94,181,116,191,164,185,187,224,222,145,122,32,29,17,2,40,25,34,19,33,19,36,38,40,11,20,42,52,43,50,62,34,16,83,139,130,165,172,175,191,172,187,185,165,166,158,174,153,167,165,156,148,157,161,126,158,138,144,136,135,161,141,148,153,163,141,152,151,149,146,163,154,148,142,152,150,177,135,146,157,149,127,154,151,141,140,162,128,148,140,142,148,150,153,151,165,164,153,163,145,139,155,150,141,156,149,132,161,158,176,135,175,126,134,149,136,129,159,165,147,165,115,137,151,147,163,175,181,172,138,142,172,173,155,145,165,152,125,136,152,151,160,148,141,152,143,143,110,167,143,180,173,187,164,152,142,151,155,130,166,153,168,166,136,165,134,132,94,106,112,169,215,159,106,152,145,110,103,83,108,75,60,91,104,77,108,107,98,112,90,102,114,86,134,104,122,81,91,81,54,45,41,29,40,24,49,31,38,73,51,62,73,58,50,55,51,56,50,81,57,45,59,60,47,61,23,50,64,62,47,70,49,32,58,24,65,60,36,27,38,56,44,64,77,49,28,37,66,36,34,66,50,52,41,44,29,39,70,46,72,38,40,18,50,32,38,29,54,40,54,29,43,47,46,36,12,30,41,45,34,48,45,32,47,21,43,55,34,32,53,38,68,37,27,39,39,42,43,40,36,48,53,26,40,53,27,34,39,27,52,30,43,26,16,38,35,83,27,31,60,10,29,12,38,19,20,34,16,39,47,33,54,25,18,28,18,29,27,35,38,19,28,33,14,25,44,18,48,28,48,17,18,35,12,23,39,32,24,20,22,33,79,133,142,83,74,50,50,10,14,38,49,38,26,45,23,36,17,35,23,24,59,34,30,33,11,27,18,37,39,16,17,37,17,15,12,23,34,44,76,87,57,19,20,16,19,29,4,32,37,24,18,21,59,58,43,32,37,46,42,35,20,48,26,32,42,49,35,40,18,25,16,40,24,36,32,19,45,29,16,27,16,19,30,25,16,23,38,16,36,47,27,27,15,18,15,24,9,28,96,22,12,17,28,17,19,26,36,2,15,24,10,50,29,39,29,34,37,23,53,23,39,32,60,37,4,28,31,6,8,25,26,4,93,120,86,52,41,41,36,63,94,132,189,198,156,165,160,175,167,165,175,159,159,142,84,106,144,154,189,169,203,160,185,168,145,162,167,177,155,156,179,159,149,153,161,139,151,133,179,165,139,167,130,106,95,84,58,43,69,38,24,20,41,82,90,95,72,84,107,105,123,144,179,205,211,203,252,229,242,248,255,245,255,242,255,223,226,219,196,213,239,230,229,215,175,202,215,217,209,196,168,171,206,192,200,181,141,93,9,17,7,19,1,28,9,0,42,28,32,0,20,3,1,15,8,4,27,7,11,22,19,25,111,62,27,96,145,178,195,215,196,218,194,193,205,198,188,195,151,166,179,147,137,133,115,87,86,34,10,34,30,55,66,72,86,59,93,93,103,114,173,173,175,187,202,197,193,219,126,42,64,113,214,207,200,206,173,142,150,117,108,128,108,94,96,52,18,44,43,100,110,157,180,162,146,117,135,116,18,5,6,24,20,50,47,7,6,31,26,36,23,19,41,42,51,24,29,18,55,79,122,126,133,112,135,130,181,179,195,129,14,28,41,8,38,21,40,44,35,20,57,50,58,36,49,65,79,61,56,44,61,105,161,180,189,193,184,168,153,173,159,149,135,135,157,148,131,121,140,131,102,157,124,164,132,157,154,131,151,148,127,140,138,139,135,123,162,152,133,140,146,144,130,149,145,144,120,140,150,149,149,145,145,150,161,116,160,166,141,167,154,159,144,159,132,160,148,141,148,165,137,164,163,162,146,148,128,157,174,133,170,159,158,132,149,143,152,166,156,184,154,144,170,156,158,144,150,154,172,171,115,124,140,140,132,148,138,154,147,159,115,122,131,149,121,108,109,121,140,115,114,142,143,170,155,132,151,143,132,114,163,120,113,146,141,113,99,49,42,85,143,66,66,72,115,129,95,115,104,70,43,85,61,73,89,104,90,109,79,99,80,89,90,84,79,34,35,34,43,37,43,49,49,66,32,59,59,69,60,54,101,50,80,88,60,64,34,64,50,50,35,65,70,41,60,36,60,34,51,41,64,51,56,52,75,52,65,57,62,41,40,62,45,30,60,66,11,56,44,63,25,36,43,38,64,47,58,17,56,38,36,68,50,75,49,32,22,43,38,46,16,50,51,26,29,53,26,33,25,63,53,40,31,13,48,20,35,26,34,42,37,17,15,32,48,38,50,49,34,29,48,43,40,38,31,43,13,16,16,22,56,29,17,40,13,42,50,24,14,26,14,45,45,23,50,27,14,43,14,35,60,39,40,17,33,6,25,15,30,10,35,33,30,56,17,26,25,35,16,31,36,45,33,40,37,27,37,39,15,12,42,57,91,143,116,109,54,42,35,24,35,29,55,27,65,56,27,20,6,50,49,28,9,31,26,27,33,25,45,40,39,42,22,19,26,27,39,21,97,101,63,43,16,26,15,9,26,9,44,21,27,15,52,27,34,39,22,27,25,33,30,45,20,39,25,18,27,41,19,2,32,13,23,37,41,39,41,30,18,34,27,31,40,17,27,17,17,10,17,13,2,38,16,28,9,8,6,60,69,16,16,4,37,44,29,26,14,46,8,37,24,29,21,19,7,11,4,10,10,40,10,26,26,24,37,31,29,27,37,17,48,93,110,76,39,28,25,17,69,95,127,180,197,194,186,180,165,190,158,216,169,132,154,171,125,96,100,101,128,155,171,153,128,140,153,161,147,159,143,170,177,158,149,145,142,142,145,167,159,166,154,145,138,135,132,111,100,79,88,82,72,88,125,145,132,130,97,63,82,102,77,104,140,135,122,147,124,130,141,179,186,194,207,234,235,216,221,206,241,241,249,180,138,159,168,177,196,232,236,215,211,228,224,243,230,203,193,110,4,11,1,9,11,15,26,23,18,29,29,21,18,5,15,9,13,22,8,13,29,23,11,31,158,49,55,109,143,182,175,185,167,169,155,184,135,141,119,108,91,109,100,58,57,61,54,46,49,44,38,24,54,76,88,117,129,152,173,193,191,212,214,241,208,232,204,212,228,203,111,37,25,78,137,129,140,87,95,65,81,72,127,111,106,103,125,76,28,17,56,44,87,123,179,161,150,120,118,66,25,21,20,22,21,40,41,33,6,14,25,14,35,22,23,12,34,51,22,26,21,58,64,145,57,44,51,108,111,176,213,195,100,21,38,34,44,58,17,55,33,29,27,39,41,71,105,146,100,118,150,170,187,181,172,162,144,149,128,136,142,137,145,149,133,145,134,133,144,139,152,113,143,138,139,154,138,138,146,135,170,151,145,150,142,146,108,142,138,137,142,140,134,135,153,133,142,113,138,120,128,160,113,157,144,151,152,165,146,116,142,153,153,135,109,162,149,124,154,144,128,165,166,151,177,163,135,169,163,148,126,153,136,137,152,143,141,126,154,118,153,163,154,157,176,151,128,141,150,160,143,138,145,153,143,129,162,131,126,147,162,142,138,149,153,150,156,167,140,160,139,129,138,149,143,132,180,144,129,127,135,112,142,148,114,149,177,140,143,86,62,52,61,36,47,75,58,118,79,96,135,177,128,40,0,15,28,50,76,65,51,56,70,47,47,61,34,62,30,49,60,45,46,50,63,55,63,46,60,69,68,45,72,91,65,27,44,42,41,65,54,50,56,60,48,102,70,53,67,71,71,66,51,70,72,68,43,59,60,85,53,28,61,36,68,61,71,63,58,36,55,46,57,38,41,56,38,43,43,60,64,9,48,45,56,28,29,40,22,46,19,48,39,58,59,35,35,50,55,49,20,40,46,30,36,25,51,28,33,36,16,46,59,25,19,25,58,25,64,38,56,47,25,45,10,36,42,51,20,14,27,23,27,38,64,42,16,42,27,31,42,35,37,4,46,29,32,23,40,45,28,28,15,25,32,43,30,1,32,34,11,5,34,32,42,13,21,23,11,22,26,23,7,24,40,22,37,35,40,15,15,32,24,29,44,25,83,139,143,106,61,76,34,9,8,18,23,19,15,22,22,34,27,36,37,27,47,42,25,19,21,14,38,19,35,29,26,48,48,35,53,75,90,70,20,26,4,25,22,23,13,6,20,31,35,42,61,25,40,15,24,25,42,28,42,46,35,54,42,51,16,46,35,25,41,27,41,46,56,31,29,25,8,6,26,16,22,27,14,20,39,38,28,39,33,29,19,16,42,70,58,2,17,27,2,28,6,2,51,13,41,31,19,28,51,11,7,19,35,9,21,15,34,26,45,33,39,23,16,40,58,91,119,89,58,21,30,60,41,66,81,106,154,207,182,185,172,157,139,160,168,181,178,157,192,181,119,113,112,69,104,154,159,151,158,170,161,163,164,170,136,171,174,155,167,170,182,174,179,181,139,175,156,188,201,169,156,145,120,92,110,117,107,103,145,143,158,173,138,109,103,111,123,145,124,131,123,156,139,101,126,130,153,148,117,195,173,170,170,193,216,209,173,104,64,137,115,131,158,230,244,247,252,232,237,248,251,242,218,113,6,0,11,43,15,27,20,18,15,6,8,11,2,0,14,10,60,11,26,26,10,18,12,24,141,27,50,46,82,119,144,116,97,109,106,89,70,51,53,48,39,65,78,79,93,122,119,99,149,68,12,59,68,105,173,170,199,199,222,217,224,188,201,183,203,171,165,134,108,121,53,16,22,42,79,113,111,97,82,62,99,75,63,82,46,53,36,38,17,33,8,39,16,80,135,163,134,135,156,90,23,19,16,25,4,38,44,10,10,25,13,22,4,21,22,17,15,20,30,7,77,92,133,120,121,133,108,126,154,144,206,189,87,34,16,40,22,24,54,19,10,24,27,48,46,64,142,150,169,170,175,185,175,169,164,131,161,170,159,146,140,141,149,143,139,139,135,165,169,142,134,139,144,164,122,141,145,147,137,120,135,148,129,149,161,145,138,154,170,136,127,129,134,144,144,139,123,156,125,134,128,168,157,157,133,161,171,158,152,167,131,151,147,138,111,154,121,142,151,136,150,124,126,142,160,141,143,158,155,161,145,143,162,165,148,135,137,150,121,159,154,158,137,140,135,133,154,148,144,154,158,150,140,165,145,177,149,143,150,121,137,166,154,157,182,155,149,159,143,177,154,143,146,160,122,155,141,128,156,141,136,151,157,138,135,170,153,150,124,85,69,47,22,56,81,76,74,67,57,125,223,251,242,205,116,55,24,16,35,10,16,14,61,46,36,47,52,73,61,77,69,55,68,49,83,36,28,46,54,46,65,28,69,44,56,66,43,75,52,37,61,55,57,49,46,66,52,58,61,48,66,53,63,55,45,80,62,57,50,44,57,70,37,59,46,44,76,57,75,28,62,58,84,64,53,65,39,27,40,74,42,49,26,60,47,56,55,42,22,39,30,39,16,39,58,57,47,44,43,51,42,39,43,40,61,21,52,31,44,37,21,35,26,48,32,29,53,53,13,25,38,41,29,52,21,30,58,34,40,40,24,16,23,39,18,30,40,33,20,36,25,17,12,40,39,39,29,36,35,34,40,44,17,35,40,42,14,24,19,26,42,32,23,15,21,27,15,31,24,16,33,41,26,38,47,42,43,20,15,43,27,20,15,18,31,15,33,50,66,145,134,129,108,72,19,30,17,8,1,32,19,45,24,34,19,15,38,45,38,12,43,41,52,33,27,36,28,24,30,23,36,36,76,65,61,43,5,34,42,18,35,9,21,34,41,23,32,37,41,34,27,3,29,41,17,14,34,35,15,34,47,51,39,30,34,11,28,28,32,22,43,39,25,0,11,45,23,20,22,16,8,24,38,34,32,17,39,30,51,53,33,19,19,19,27,20,4,23,10,15,39,15,19,14,39,51,24,14,0,6,12,17,24,40,35,15,42,27,41,65,119,109,68,57,39,32,18,24,55,73,131,153,175,209,185,170,158,142,164,152,155,154,163,155,170,161,149,147,120,126,75,126,164,195,182,177,190,188,153,181,179,150,185,199,181,153,189,160,168,178,139,165,181,177,191,146,164,153,149,151,123,106,158,136,133,151,150,151,140,167,128,159,173,168,164,154,159,155,149,165,151,146,129,136,141,143,156,141,166,142,146,152,150,67,24,96,75,98,117,207,236,235,248,238,250,244,255,255,223,111,7,0,2,35,10,15,18,23,9,10,6,25,24,2,7,0,22,0,7,7,1,11,18,21,80,8,40,39,92,98,65,61,77,60,63,73,78,93,106,120,145,149,177,157,185,193,194,195,182,97,58,48,105,152,166,227,197,149,153,145,142,136,105,115,100,103,106,63,74,20,40,29,12,66,64,69,75,34,22,32,39,14,19,24,15,13,34,38,41,26,28,24,49,59,132,131,112,126,119,93,46,24,15,7,26,27,9,33,33,2,31,61,45,50,16,35,2,58,36,43,23,99,135,146,142,160,145,142,116,72,56,44,45,27,10,40,28,36,26,27,48,30,45,15,43,61,144,161,171,189,165,136,162,158,145,136,145,153,163,142,162,167,146,129,167,161,143,174,151,164,146,142,139,136,116,137,134,156,148,137,140,164,167,149,152,128,132,141,131,142,132,124,152,160,153,126,150,120,169,144,157,130,138,159,146,178,150,143,176,141,149,153,172,152,153,151,133,149,166,128,130,169,146,130,116,119,118,114,112,142,145,132,165,120,160,147,112,117,139,112,124,114,125,116,142,145,131,148,127,163,156,138,131,147,147,141,144,149,148,143,140,144,130,118,142,120,132,97,86,109,146,117,127,143,105,121,101,109,107,122,141,122,114,94,106,97,111,98,41,36,58,37,53,32,28,22,50,39,48,77,175,224,240,254,245,229,163,99,24,18,8,17,32,36,57,58,82,66,64,76,79,65,55,42,73,77,79,65,51,53,52,51,56,82,62,47,66,54,42,49,50,70,68,19,50,59,27,64,74,49,63,43,59,59,60,43,41,69,78,59,54,41,65,52,64,35,63,44,54,20,43,62,52,80,48,60,60,49,18,54,47,37,36,51,36,47,63,17,40,37,57,53,36,22,46,34,56,53,16,42,40,38,43,45,24,30,6,38,58,22,43,52,53,32,27,45,45,43,46,51,37,45,47,68,42,16,25,42,22,34,6,54,28,48,39,27,20,42,28,45,37,31,25,17,16,34,62,33,13,42,26,18,33,32,44,28,18,44,6,8,34,29,22,52,13,33,28,32,17,28,2,29,9,21,18,21,47,22,36,31,29,15,20,13,29,6,29,37,46,67,73,156,138,146,109,77,39,32,21,40,10,20,47,30,26,21,19,20,32,68,22,5,46,30,36,51,71,35,36,15,45,28,38,90,76,46,13,13,38,1,34,34,30,23,58,52,59,41,60,21,12,28,14,30,61,16,41,58,11,43,36,38,51,43,21,43,39,32,17,23,28,14,25,9,48,41,20,9,13,6,35,10,18,41,6,13,8,32,81,60,8,28,11,8,23,29,19,16,24,6,28,14,10,32,25,34,34,16,30,35,37,23,36,26,45,34,31,75,130,120,83,28,51,14,35,26,62,82,78,108,108,162,199,187,161,170,159,148,170,161,152,183,122,141,180,158,178,174,178,158,110,113,107,149,178,185,162,153,165,169,156,175,163,185,142,165,173,163,185,153,164,187,182,172,170,168,190,167,154,131,134,102,120,142,140,155,162,162,164,155,164,150,163,176,179,159,143,149,163,133,126,144,130,110,115,138,148,144,141,126,180,148,151,104,90,79,78,48,65,133,180,186,205,218,230,247,242,251,228,116,0,14,4,0,18,0,22,15,45,14,38,12,9,0,13,11,8,6,9,4,26,14,33,12,32,37,88,58,79,100,113,109,114,157,158,160,167,188,175,176,183,200,184,198,160,183,199,168,153,46,30,41,104,131,141,130,130,100,81,67,58,76,82,51,54,49,60,80,54,51,21,2,13,12,30,57,27,35,24,18,8,28,6,36,8,17,39,42,40,34,62,46,26,69,132,130,102,151,117,84,22,17,19,28,48,24,33,47,14,55,33,20,17,4,39,25,34,45,20,28,45,47,44,86,56,60,48,45,36,29,31,11,15,16,42,21,56,57,95,82,82,92,100,109,120,108,139,166,174,190,167,145,137,151,131,148,124,139,136,129,121,129,128,156,126,135,133,154,129,161,135,175,152,139,140,167,142,121,148,140,133,144,125,124,130,129,139,138,133,154,137,139,160,179,162,179,158,147,139,137,129,154,117,160,155,135,150,156,135,181,140,163,106,139,143,145,130,134,131,110,140,146,134,133,152,149,116,136,127,133,131,142,156,124,155,153,166,132,138,135,133,126,134,118,159,154,156,160,144,138,142,136,136,121,162,147,152,113,138,99,131,122,126,94,96,89,134,97,113,115,102,104,120,113,116,109,135,89,98,113,131,110,90,88,84,104,47,79,85,82,65,84,75,69,62,11,50,60,48,18,99,182,219,237,243,234,225,224,162,89,46,41,21,15,48,78,79,82,105,60,88,76,79,56,33,47,47,45,57,50,60,50,54,53,41,68,47,31,69,27,81,62,52,63,42,60,38,26,58,47,51,77,62,46,38,49,58,49,64,40,55,39,53,41,43,58,40,36,42,45,43,40,40,57,64,59,12,37,36,43,70,53,69,56,45,48,51,30,51,40,46,38,70,36,46,40,49,59,28,16,56,47,26,41,43,46,4,32,39,43,50,30,29,43,52,32,36,16,21,33,27,22,36,18,23,28,31,35,23,16,33,44,33,24,33,58,60,18,38,31,31,57,29,34,32,34,26,35,47,17,38,43,31,30,28,28,42,17,25,39,15,40,8,19,20,37,28,26,37,21,28,21,12,39,3,27,22,21,29,23,24,44,34,22,43,27,29,31,35,51,18,40,69,140,155,153,132,81,32,47,34,29,11,8,55,41,53,22,36,10,52,30,23,25,33,16,22,3,21,24,33,26,29,82,137,78,23,17,4,14,38,29,18,5,51,36,18,42,31,21,26,18,17,40,36,24,33,24,30,43,49,30,25,28,27,12,45,30,43,20,28,19,34,29,18,10,27,49,2,9,13,14,16,12,4,24,50,34,68,50,25,29,27,31,23,35,7,21,4,20,49,24,13,23,13,24,27,40,41,26,23,3,14,15,33,83,98,111,92,39,35,54,37,24,30,31,69,101,91,122,152,179,148,162,176,183,158,177,171,159,157,129,159,150,188,190,173,186,168,163,113,101,102,99,139,174,178,169,146,183,182,170,170,178,176,172,162,183,177,181,191,182,169,167,183,181,175,176,172,163,129,146,157,153,158,136,190,175,161,154,148,179,156,154,181,176,154,137,147,133,155,145,144,154,124,132,123,120,105,160,170,174,166,141,176,141,82,39,64,86,98,111,129,138,170,204,205,202,239,147,2,6,14,19,26,8,0,27,10,22,17,20,21,4,18,19,9,20,14,12,3,0,18,7,41,67,51,72,170,195,192,187,174,184,202,176,189,172,184,183,182,183,161,174,156,165,153,120,83,17,19,33,72,99,93,106,68,108,91,111,118,79,72,66,73,61,41,26,22,36,28,35,29,26,30,8,13,15,13,35,37,31,25,12,26,13,45,19,36,48,105,98,111,145,194,145,136,152,152,80,0,9,33,4,35,52,23,32,32,27,19,41,10,13,31,12,26,51,24,35,56,39,40,44,18,35,15,21,19,29,14,18,42,42,62,76,133,172,179,212,188,198,185,205,225,211,170,166,177,122,137,142,155,145,150,141,117,147,136,153,141,153,129,137,133,118,123,140,126,128,134,127,126,130,124,165,128,150,137,144,138,144,102,154,140,159,145,134,157,144,131,153,100,143,132,121,141,149,135,150,139,124,121,124,123,145,114,142,142,119,138,133,120,112,111,153,111,124,133,134,138,157,115,144,155,117,134,136,105,123,146,136,140,134,144,131,129,150,143,130,114,130,122,114,124,153,115,139,148,126,132,129,141,140,128,129,100,118,91,124,130,130,146,137,120,133,121,157,117,166,160,150,132,168,159,173,147,130,114,106,94,81,81,118,97,81,92,76,51,63,79,90,94,93,67,91,70,17,64,17,83,121,125,189,251,241,253,255,250,233,159,63,18,10,10,48,39,67,94,43,74,54,53,64,39,53,81,55,38,61,48,71,64,60,67,59,28,56,76,41,87,68,54,44,41,62,44,54,67,52,68,30,86,70,60,43,56,47,40,62,56,40,55,81,30,56,38,45,34,65,45,64,30,36,52,53,54,50,41,55,45,39,35,78,51,33,28,51,59,25,27,50,11,21,52,45,48,31,47,50,25,34,27,36,25,42,19,43,34,22,40,23,29,29,29,33,26,39,45,47,38,12,48,47,49,36,21,33,25,11,13,9,46,24,61,15,19,23,35,47,57,34,13,13,40,40,18,0,33,15,30,22,15,13,31,24,14,32,26,30,18,35,16,18,37,37,28,63,27,46,10,39,20,45,28,13,20,21,28,17,18,21,16,41,50,30,51,42,8,20,18,50,47,44,58,113,182,174,146,120,100,77,68,49,19,33,28,34,30,33,9,11,16,42,22,45,16,17,39,23,33,31,31,18,72,96,51,49,16,29,36,25,19,36,20,61,30,52,41,30,15,17,25,43,22,31,44,43,28,34,13,49,38,34,9,16,33,37,21,19,18,32,12,11,13,45,23,45,12,32,22,32,30,20,68,16,52,54,21,28,32,15,17,47,48,29,23,28,24,33,37,6,9,11,30,24,9,4,30,22,28,24,47,81,113,127,82,70,25,36,28,34,6,12,57,87,103,101,98,134,185,205,176,180,174,176,162,174,177,189,164,151,210,158,203,158,164,149,136,152,158,153,99,83,108,137,155,198,197,197,173,187,138,181,148,154,164,155,185,164,162,178,180,156,146,178,179,193,165,168,152,124,139,131,150,155,153,168,154,164,144,145,160,174,189,157,150,175,169,162,165,138,160,130,127,118,128,134,155,148,163,175,182,173,191,179,131,130,113,116,102,73,26,80,91,124,173,160,192,100,10,3,17,26,6,26,19,8,10,11,29,10,16,2,17,4,17,6,8,4,11,18,24,24,65,79,92,114,179,171,205,188,171,178,188,192,179,171,141,165,168,129,128,107,97,108,80,55,39,29,42,21,76,123,129,94,111,131,110,58,35,32,58,16,19,48,44,31,29,9,42,44,16,35,13,30,45,2,19,25,9,28,9,26,8,24,22,37,42,55,132,156,184,189,205,155,137,134,142,68,36,5,28,24,20,38,19,13,25,20,26,67,47,48,46,40,49,48,51,6,26,29,25,56,27,16,32,29,53,19,59,77,82,125,149,184,196,212,217,217,195,177,173,185,206,157,177,165,142,142,149,150,149,140,153,139,118,130,125,122,163,145,121,138,137,127,115,104,119,106,126,125,140,116,116,120,137,130,140,140,147,174,150,141,155,134,127,139,143,130,153,133,111,132,134,124,122,129,120,145,114,128,105,143,160,125,152,136,121,121,126,132,121,135,117,153,104,132,134,138,127,115,81,118,136,149,140,143,128,121,121,137,136,132,130,128,139,138,153,132,140,142,138,142,137,129,124,125,106,123,136,118,143,120,131,108,115,129,101,117,76,110,116,156,126,141,112,126,132,130,105,150,147,140,153,126,122,120,88,81,84,68,54,76,76,101,80,93,76,72,82,68,93,99,67,70,54,75,67,19,101,115,90,97,114,175,254,250,246,252,220,244,177,86,41,0,10,17,56,44,83,66,49,43,54,39,51,51,52,78,42,67,42,73,60,41,46,61,69,62,71,62,32,53,42,49,73,55,60,45,36,52,58,51,40,47,24,61,58,77,51,26,46,46,29,62,48,47,55,34,34,50,26,48,65,37,43,51,46,43,42,44,49,26,23,17,46,68,41,30,49,35,36,44,29,31,23,40,47,27,19,40,34,47,54,30,49,43,16,21,43,49,38,35,32,45,45,45,31,28,36,30,25,31,23,21,22,39,30,24,20,11,26,44,50,45,13,40,35,25,55,16,23,34,43,30,26,32,6,31,15,28,32,22,6,47,36,33,17,21,36,40,31,31,60,33,17,29,12,18,31,25,58,27,15,13,23,26,28,31,37,29,26,54,45,50,44,20,29,24,35,43,5,38,27,41,34,64,127,136,158,163,133,91,68,45,17,14,38,18,41,1,25,44,41,47,54,3,27,23,11,8,40,7,31,84,78,74,39,16,22,13,17,18,43,49,38,40,24,26,30,11,33,28,20,40,20,29,36,27,32,25,15,7,19,32,39,14,41,16,17,7,9,14,18,32,20,23,13,43,36,17,32,33,28,32,94,50,9,6,22,25,15,11,20,42,7,25,22,21,14,20,8,17,32,12,18,17,36,18,48,99,129,129,109,61,25,37,40,24,14,25,48,40,79,101,107,125,118,182,189,173,169,155,177,198,162,202,172,144,173,167,174,172,172,153,159,181,167,161,164,161,161,96,92,108,140,141,175,174,190,195,156,179,181,185,162,151,149,141,150,141,145,150,165,164,170,197,162,158,137,138,136,131,146,144,147,177,183,172,174,155,152,165,166,172,163,140,160,140,165,163,171,155,169,156,166,148,143,173,158,160,185,154,202,195,179,188,174,165,155,109,75,50,57,61,94,123,138,108,18,8,19,5,18,29,33,11,9,13,5,31,4,21,6,7,16,17,19,4,35,22,9,12,51,31,73,94,156,176,168,168,135,148,118,116,104,95,108,86,52,63,82,58,52,66,63,82,94,42,27,38,36,64,75,44,42,20,15,47,21,12,14,24,18,37,17,34,29,30,21,5,39,18,17,45,21,29,16,33,50,34,18,42,35,15,24,33,29,44,61,126,137,122,137,121,115,137,123,77,44,19,20,31,36,28,41,32,34,38,54,29,30,40,11,32,18,11,43,25,29,27,15,24,36,16,68,61,57,137,137,125,200,217,189,199,170,161,159,152,168,181,168,168,167,123,149,165,129,154,137,160,146,154,136,115,138,159,107,153,147,122,139,130,117,142,132,121,127,114,138,137,124,132,112,122,132,138,108,132,161,150,146,128,111,99,81,138,103,133,117,113,140,132,140,138,104,131,142,154,143,148,164,134,167,141,138,124,116,146,142,127,141,131,120,124,112,147,131,139,130,106,105,123,122,145,134,95,122,97,128,130,156,126,120,129,136,103,129,133,120,112,126,121,132,120,115,123,141,126,143,139,122,135,104,111,129,118,110,117,100,95,109,94,83,57,108,88,56,94,90,113,102,84,109,125,132,120,102,107,109,109,108,98,89,86,114,67,84,92,78,75,56,68,69,59,86,79,61,51,68,109,102,99,124,123,151,206,255,212,254,252,254,247,147,70,15,0,1,28,28,56,34,52,56,68,49,51,76,53,42,31,78,67,46,65,55,39,65,37,52,70,25,34,41,31,63,39,43,48,62,44,73,61,67,45,65,56,47,41,73,57,33,66,35,66,47,60,50,60,35,51,43,57,50,37,53,58,57,63,31,41,55,65,44,12,35,50,43,29,27,33,11,45,18,30,31,53,25,29,49,41,31,43,47,48,49,42,47,31,26,30,28,32,60,34,46,16,35,41,26,21,22,25,36,62,22,43,37,46,27,34,60,35,49,28,10,42,8,25,45,50,27,15,39,21,38,32,18,14,46,36,17,35,39,29,20,6,39,15,34,24,23,36,30,52,26,21,18,23,14,23,10,16,15,32,22,6,44,46,31,44,35,47,31,26,38,19,34,38,19,31,59,17,34,34,51,53,48,55,66,134,112,168,168,150,100,75,44,43,29,26,25,21,31,22,39,58,27,13,27,22,20,27,35,32,105,82,68,21,25,6,30,25,44,47,56,27,27,28,61,26,41,71,43,22,37,16,31,33,27,39,22,24,33,39,28,29,28,11,11,23,23,13,29,25,31,24,21,15,21,9,29,32,8,40,29,49,26,33,18,7,8,24,21,30,25,6,28,37,5,23,11,16,21,27,25,34,64,79,129,113,63,58,34,38,45,45,29,32,23,34,74,118,105,113,97,107,179,178,175,138,129,188,175,160,180,169,124,134,160,181,172,129,176,166,168,167,162,165,171,171,150,158,115,77,96,119,173,197,215,156,167,177,185,167,141,169,159,158,180,148,161,155,142,163,164,170,165,153,144,154,147,142,169,160,160,177,141,171,155,167,194,167,180,173,165,162,167,180,174,143,163,149,188,188,151,166,145,157,191,161,187,182,176,213,206,201,189,199,188,154,118,51,72,44,50,57,112,111,29,2,20,16,27,23,27,26,31,5,12,8,35,4,27,5,19,32,11,18,57,22,21,14,46,41,66,63,116,111,109,110,82,88,60,71,75,73,71,91,87,94,92,85,80,44,42,62,28,49,23,14,20,35,19,30,21,13,17,27,17,9,17,24,5,25,29,26,29,20,29,28,25,11,18,18,35,6,21,11,22,40,20,26,20,28,43,19,38,38,57,85,74,102,115,163,144,135,122,96,20,22,23,31,43,38,61,23,41,29,0,26,8,31,26,7,38,33,40,12,20,47,30,66,41,70,63,137,166,168,208,209,208,184,167,160,163,185,152,166,126,158,188,176,174,161,147,169,140,129,145,124,155,144,146,139,144,130,124,125,153,107,130,121,118,146,168,135,133,97,113,147,114,119,118,135,136,121,112,116,108,97,74,67,123,109,102,131,97,101,107,105,122,98,89,96,97,106,81,108,109,118,89,97,112,123,106,96,121,114,138,113,113,121,128,149,133,130,139,116,81,130,108,136,122,143,136,104,128,89,115,99,98,119,103,104,104,122,93,93,132,97,94,102,135,128,110,116,127,118,149,136,137,101,90,134,139,110,96,129,118,89,68,72,82,133,66,74,105,109,91,93,94,124,118,133,145,105,148,128,108,105,144,122,121,120,99,106,93,92,73,86,49,93,74,61,72,81,78,38,68,91,123,155,159,151,137,137,184,227,254,247,252,250,246,246,182,111,38,4,12,28,30,47,32,48,38,35,55,43,61,64,60,71,81,89,61,55,71,69,37,44,45,43,67,54,67,42,64,49,20,45,61,72,37,60,60,69,48,48,57,60,52,52,5,48,34,51,24,39,64,36,48,55,32,36,58,58,75,54,31,52,24,55,16,21,48,51,30,25,14,40,11,44,42,27,31,48,29,43,41,46,20,52,37,30,20,20,23,26,20,47,28,34,46,32,31,32,51,40,21,33,32,28,12,31,29,41,21,34,27,34,38,31,67,32,40,18,64,16,16,38,16,59,52,23,26,28,35,31,51,33,10,16,8,26,32,2,12,24,1,15,30,53,27,13,22,14,34,36,28,31,29,39,23,51,22,47,23,33,40,28,24,24,46,37,28,54,5,22,25,19,20,27,19,46,57,47,25,28,32,48,42,67,116,186,174,153,148,143,109,68,33,36,45,41,32,41,19,24,35,23,48,19,28,25,75,58,68,31,41,20,15,8,31,29,37,40,19,33,14,35,54,53,55,41,49,46,19,30,28,47,17,19,24,24,33,48,24,53,30,17,28,12,29,29,21,17,21,9,15,40,12,20,39,54,19,3,34,15,21,21,48,8,19,45,40,31,44,31,25,30,24,53,36,84,96,126,153,110,62,46,48,28,3,48,26,16,35,50,36,67,93,123,130,109,92,150,199,182,165,159,149,155,171,182,170,180,164,158,190,186,136,155,183,162,158,160,173,159,168,200,180,192,159,135,128,88,103,134,182,163,161,170,153,130,159,173,185,156,161,178,160,153,163,168,177,172,199,160,151,152,145,129,125,134,162,160,184,145,145,177,151,150,166,171,140,165,158,150,133,141,157,166,159,156,135,167,137,162,175,163,149,173,169,194,188,213,195,191,183,195,170,161,106,52,39,61,74,79,3,11,10,9,18,18,19,27,21,4,5,38,27,11,3,7,23,23,11,4,34,18,13,23,39,28,39,69,89,102,91,92,97,99,104,112,100,106,105,63,62,44,47,21,39,2,26,17,38,32,12,35,53,31,17,22,30,51,3,21,26,38,31,32,27,18,19,47,42,41,45,19,16,37,20,34,45,22,11,27,5,22,56,13,16,7,21,11,26,60,85,117,127,143,132,159,136,113,110,77,46,32,30,11,28,19,19,51,52,48,13,20,29,18,33,18,26,19,30,16,34,94,106,143,166,178,177,217,196,204,181,195,174,184,173,173,160,176,177,163,157,139,166,161,151,143,142,126,127,138,148,171,139,140,116,115,113,144,110,103,139,120,152,127,151,138,148,149,139,104,151,111,120,146,93,93,114,125,138,98,131,95,107,135,107,114,147,111,128,113,124,115,115,100,122,103,112,91,92,119,81,73,65,62,119,70,78,111,96,137,97,122,118,134,119,139,126,86,117,73,98,93,122,120,110,105,124,107,146,97,90,85,62,90,92,97,103,88,86,66,91,101,93,121,129,118,101,116,111,90,77,120,112,119,98,93,121,97,64,100,111,70,82,99,93,113,96,108,146,125,130,147,154,151,126,124,129,110,89,91,108,92,102,118,101,86,102,114,101,106,71,103,116,79,79,56,78,72,54,45,53,86,61,116,162,139,129,110,106,131,165,186,249,252,255,250,234,230,198,132,69,3,15,53,23,35,49,63,66,56,56,66,69,62,74,53,90,46,59,62,89,75,30,51,70,77,34,57,90,60,57,47,40,47,67,41,37,38,86,48,66,44,33,71,57,70,46,55,60,47,51,30,55,16,28,76,49,13,55,48,53,27,35,41,68,37,50,58,24,38,18,41,39,35,33,40,39,18,51,39,17,33,31,33,14,55,43,30,15,58,28,41,11,28,11,47,15,56,39,13,23,58,29,42,33,25,59,43,55,18,26,28,46,19,46,41,14,41,37,33,24,51,31,27,15,0,12,43,30,45,6,4,34,24,29,27,19,2,29,17,16,32,48,34,20,41,5,36,33,6,7,21,20,26,28,37,35,5,63,52,49,43,44,7,18,8,27,26,39,48,42,31,5,26,21,32,50,36,27,20,43,45,35,21,44,69,60,102,103,151,171,162,165,118,131,113,109,58,35,47,26,29,24,28,24,17,26,41,74,61,58,32,15,33,32,52,50,14,40,71,113,120,86,48,47,50,63,25,22,34,60,29,45,48,42,33,27,38,18,59,34,21,25,6,9,5,39,50,22,17,8,12,31,34,46,44,20,14,20,40,17,15,7,8,17,31,27,28,11,37,40,30,50,108,148,191,129,104,56,60,33,15,29,14,39,27,19,49,18,60,62,106,110,110,100,126,106,176,177,187,182,158,137,170,154,147,176,159,146,165,147,144,166,157,160,163,156,182,171,166,138,172,146,187,181,159,143,87,76,126,118,175,177,176,169,173,159,161,181,153,170,154,201,168,176,158,174,174,162,166,150,126,123,152,146,162,164,144,178,155,144,177,168,159,186,179,180,184,148,152,135,152,168,175,192,154,150,158,170,147,187,180,147,161,163,184,233,224,216,213,172,177,222,145,149,115,120,90,72,106,11,10,18,29,4,7,22,19,14,9,14,16,3,3,17,17,1,28,17,5,3,9,26,3,41,39,41,50,97,135,100,93,103,76,37,63,27,57,25,5,25,28,19,31,44,27,2,4,18,28,16,33,31,18,13,19,30,14,40,29,36,22,13,41,38,21,9,23,33,30,36,36,21,8,40,38,12,11,44,15,11,17,25,21,30,17,33,38,42,43,59,99,112,97,82,57,40,54,18,34,43,11,7,15,56,43,25,29,52,41,7,42,17,12,6,17,26,5,41,50,87,146,199,232,199,197,206,165,159,157,136,162,146,161,138,150,136,127,129,141,133,146,141,148,129,132,113,80,107,99,130,129,139,144,151,114,127,111,109,108,97,143,151,144,137,136,121,127,117,112,93,127,136,140,121,144,140,149,155,152,132,136,117,134,136,147,170,162,162,123,142,151,144,141,154,152,132,119,134,130,120,141,121,128,119,135,174,150,120,131,149,136,175,126,121,130,119,122,113,117,137,127,110,130,104,87,94,111,108,105,92,103,131,130,111,98,109,130,110,118,144,103,152,134,140,159,143,151,163,131,115,148,116,108,100,120,108,105,97,114,121,112,127,137,120,122,133,112,75,84,106,109,88,114,101,108,62,76,84,77,92,98,94,80,86,61,117,91,90,110,63,81,71,69,63,50,76,47,54,66,64,53,30,27,65,52,76,109,160,116,72,106,142,181,231,242,255,253,246,255,235,170,87,42,20,4,21,43,61,58,57,61,71,54,50,64,75,51,59,54,73,77,66,57,46,60,73,24,86,65,31,59,36,39,61,61,48,54,38,45,63,54,45,68,61,50,55,26,57,58,66,51,37,27,44,65,59,60,35,44,51,51,63,69,6,27,72,23,72,26,46,43,18,45,12,19,36,43,25,23,27,40,33,12,40,28,28,35,39,54,29,64,37,30,15,47,45,3,56,5,35,22,27,36,21,32,7,47,21,24,23,38,65,43,35,37,20,14,51,37,28,29,11,14,34,33,40,37,12,21,31,30,5,7,39,30,41,10,47,21,41,24,32,16,44,48,45,7,41,18,24,24,36,1,27,24,12,11,26,35,15,30,19,26,17,39,21,41,30,45,19,29,10,2,50,29,51,39,23,19,24,30,17,38,38,31,16,46,28,24,63,68,131,151,155,161,176,144,153,127,163,131,121,105,50,68,61,58,105,106,109,84,55,76,114,134,147,169,186,215,243,196,127,59,33,47,73,23,25,37,22,3,11,42,21,21,15,53,26,27,61,38,50,20,20,19,24,43,17,29,38,0,20,71,44,18,3,15,19,8,33,43,8,22,16,41,47,45,63,79,145,116,172,125,100,72,28,56,64,11,34,9,32,12,42,32,32,62,59,84,113,116,89,83,117,99,143,181,166,143,166,161,159,161,133,184,169,190,160,170,180,161,184,177,169,141,152,145,156,161,157,150,186,158,172,196,160,147,130,110,135,128,145,172,200,177,186,175,196,170,185,158,173,184,160,166,173,177,160,177,156,143,145,181,156,141,165,178,166,184,184,169,177,169,187,192,153,160,167,172,159,159,176,160,175,152,173,169,172,147,168,151,145,175,180,208,193,219,208,206,173,206,206,165,156,156,160,154,134,115,16,0,29,20,6,7,9,2,13,12,2,15,7,12,17,11,24,15,17,2,31,3,1,14,11,18,47,46,29,34,57,42,21,41,6,24,15,30,7,21,23,10,30,22,33,29,0,16,21,32,15,22,35,24,32,12,28,30,42,11,19,31,44,14,35,25,25,9,11,24,25,10,19,18,10,11,30,25,25,32,29,29,61,33,29,53,29,33,66,37,63,48,57,29,28,13,35,36,20,25,47,23,11,22,13,35,33,30,40,26,8,9,47,29,7,40,44,30,50,22,30,137,177,203,150,137,138,113,107,87,102,126,131,131,124,116,111,106,101,122,105,90,139,124,107,134,125,129,131,130,147,127,164,136,125,121,126,125,141,114,98,134,133,114,127,105,110,135,96,86,114,113,125,129,91,130,121,101,80,76,106,112,89,120,114,109,99,99,54,86,97,106,102,106,97,92,91,99,102,103,123,104,116,123,124,123,107,112,116,79,144,116,99,82,100,69,74,71,129,117,112,135,141,121,107,96,89,108,71,71,139,145,148,115,98,115,119,86,118,115,140,124,148,154,136,150,124,127,133,141,141,134,118,111,103,116,119,96,151,143,134,133,139,134,136,128,142,93,99,111,94,88,72,74,89,87,66,91,96,72,120,108,88,94,103,110,67,81,88,86,66,88,71,36,54,63,56,42,59,66,50,64,25,44,41,36,76,120,154,187,125,127,104,141,151,195,241,234,227,237,248,251,229,174,114,39,1,15,13,32,45,62,39,58,76,46,66,44,28,48,72,55,45,56,59,60,64,44,43,69,52,57,62,55,47,45,56,42,44,73,69,42,34,49,60,56,55,52,53,53,38,71,20,33,46,60,52,60,26,25,46,30,38,57,43,62,28,57,56,51,19,38,42,53,49,8,42,40,9,22,33,12,60,27,48,57,24,52,29,35,57,38,34,26,33,16,32,18,28,28,42,2,8,32,28,37,41,33,27,16,28,36,26,20,31,32,33,34,39,41,36,23,22,11,28,24,36,25,23,6,14,19,15,31,23,25,23,18,34,18,40,31,19,41,15,52,8,43,44,16,26,41,18,26,26,16,25,17,17,31,12,1,44,19,19,17,5,41,37,22,28,37,46,39,24,36,38,17,28,26,34,29,32,24,17,14,17,17,3,25,57,30,29,40,47,57,86,96,134,169,172,181,195,165,185,174,174,152,151,177,222,169,179,221,196,193,193,193,196,139,156,112,69,14,32,40,45,54,60,25,14,51,30,31,33,16,20,32,33,25,51,29,61,57,34,48,36,29,41,17,33,55,55,27,47,59,66,31,31,23,40,65,84,75,95,69,144,113,167,119,94,97,46,109,29,22,33,14,19,7,31,21,7,25,25,53,61,85,85,84,102,95,106,120,102,159,169,171,158,170,174,164,172,172,144,178,171,170,153,176,184,147,169,182,160,155,154,164,157,150,168,166,145,167,146,157,171,180,165,134,85,92,117,161,194,191,166,195,178,155,171,158,138,173,164,159,189,167,179,167,181,182,165,151,165,195,176,158,177,178,169,195,173,159,188,165,170,134,155,172,178,184,196,162,146,157,158,147,155,174,162,133,150,162,185,208,221,212,201,189,179,182,199,172,153,167,165,161,151,103,3,10,9,30,11,17,25,7,11,31,41,14,15,11,29,10,11,23,57,23,17,11,22,36,42,15,26,15,45,36,44,38,38,34,19,17,20,38,25,34,33,33,21,11,27,25,24,29,43,25,9,20,22,3,20,24,18,13,18,44,25,24,18,20,21,24,20,16,37,37,22,31,43,36,21,33,19,27,30,21,68,39,46,42,37,35,7,53,56,32,32,34,43,20,27,13,41,15,28,26,38,28,25,18,40,42,30,35,57,28,36,45,27,42,21,27,49,22,25,27,38,63,90,118,145,118,118,94,71,97,99,116,115,125,96,89,108,88,110,107,132,84,103,120,137,135,161,131,134,149,155,165,179,136,154,129,127,140,116,137,97,105,85,66,65,64,67,97,105,86,95,101,113,123,107,69,91,86,77,82,85,69,78,83,67,63,68,39,44,64,75,35,47,73,63,59,65,81,86,68,68,79,87,70,83,101,75,69,97,39,33,27,50,57,60,47,81,97,88,92,81,72,113,95,80,63,85,88,81,107,83,113,121,91,90,95,48,100,87,124,135,88,113,75,86,70,104,82,69,36,79,112,89,55,95,120,109,114,66,102,116,101,101,112,104,118,108,114,130,114,106,90,75,65,58,60,30,67,74,70,60,57,93,81,75,75,67,106,68,53,76,88,62,51,65,64,76,47,66,41,61,62,66,67,58,20,66,69,142,172,161,194,178,137,102,113,133,140,203,245,252,250,250,254,207,134,74,68,33,32,26,26,38,27,62,39,41,60,69,51,40,38,78,56,44,52,33,56,37,53,51,34,62,53,46,58,37,50,42,67,62,19,55,60,41,50,71,44,32,54,49,43,25,54,27,40,36,48,48,24,58,30,50,43,43,23,56,31,59,35,40,33,35,33,15,24,26,36,32,38,38,44,47,38,19,38,45,29,35,32,35,28,29,23,17,46,59,45,39,45,31,37,28,33,32,30,27,25,22,29,16,28,37,19,37,37,38,16,38,51,41,16,39,45,31,30,8,33,8,20,40,36,52,39,20,6,16,4,18,10,30,2,9,14,15,12,28,19,14,31,16,25,35,13,43,22,26,35,8,27,22,32,39,11,7,18,12,56,22,15,0,47,27,22,44,11,15,39,40,26,59,34,10,30,6,32,19,40,41,22,40,13,19,19,29,44,30,29,35,49,60,57,56,90,41,83,92,113,116,135,129,139,161,143,160,91,64,51,19,42,35,47,60,39,49,10,28,53,8,43,32,45,28,22,20,37,11,16,40,37,28,18,47,91,38,61,55,84,80,72,90,135,120,64,88,76,93,111,128,155,129,113,183,145,165,114,63,55,47,39,63,22,32,46,37,14,30,29,7,13,25,34,15,71,45,80,77,87,94,66,70,96,115,106,133,167,193,162,159,191,166,165,194,155,186,132,169,173,167,181,135,155,141,145,159,178,166,161,170,143,177,193,137,146,160,173,188,188,167,151,119,91,117,125,150,182,183,199,175,193,147,159,162,173,174,179,154,154,164,172,174,169,177,192,173,174,147,185,189,196,172,180,165,187,189,145,157,139,151,187,169,168,173,180,162,182,171,178,160,164,174,169,159,172,171,202,199,204,200,191,176,169,173,191,157,170,158,172,156,100,12,9,43,15,16,22,12,7,8,2,30,13,10,3,5,34,10,17,16,35,14,13,7,21,20,18,10,15,8,36,42,17,23,43,21,33,19,26,51,17,33,12,27,15,12,21,18,51,8,23,28,27,13,30,26,44,44,47,6,43,14,23,37,33,26,18,33,8,41,16,45,41,44,43,22,65,41,25,23,56,9,27,33,44,20,12,41,11,18,18,32,14,45,34,32,27,26,23,33,20,53,20,9,23,64,54,66,51,127,79,66,71,54,73,54,64,55,122,84,97,103,97,130,145,147,102,133,73,99,77,106,93,101,89,62,69,106,93,95,96,84,90,74,97,104,121,95,101,117,124,110,117,145,139,139,151,102,96,137,107,91,111,131,101,112,96,87,97,111,112,104,99,80,107,108,139,131,144,136,113,136,121,124,126,75,102,132,130,153,133,102,57,71,62,67,83,92,100,97,95,102,89,87,92,111,81,134,56,75,75,49,88,96,100,100,102,116,108,81,113,84,75,76,94,81,108,110,108,127,109,97,81,118,84,84,77,90,118,83,121,111,96,110,90,84,91,80,87,81,75,87,80,86,69,93,123,102,95,75,76,102,107,95,111,77,89,101,91,83,105,98,92,74,72,70,46,74,50,71,78,69,101,97,72,79,64,60,103,88,53,40,76,78,64,60,80,58,70,66,81,86,68,81,38,63,71,62,38,84,90,133,173,211,164,172,103,97,104,121,169,213,244,254,250,234,204,180,148,83,76,45,24,6,15,27,7,37,48,42,32,53,69,63,26,41,66,43,40,50,57,42,60,51,42,53,58,37,51,53,36,58,48,52,40,59,27,50,66,40,61,61,44,46,61,48,51,53,43,35,27,26,48,31,35,47,44,37,20,29,52,49,11,31,38,64,21,19,55,51,33,24,58,45,46,58,24,53,17,6,48,50,24,46,36,32,36,48,37,22,36,25,35,55,17,48,41,21,38,26,36,23,30,31,15,20,25,15,37,41,51,27,17,9,27,24,17,26,25,36,22,22,55,29,13,10,24,42,17,22,15,35,18,33,12,28,18,21,15,15,24,59,21,41,26,33,30,12,0,28,20,32,28,24,21,23,22,12,27,11,9,36,16,35,19,34,19,22,41,31,64,58,60,43,18,42,19,14,12,50,46,15,27,30,70,37,24,21,37,16,38,16,55,22,22,21,19,23,32,37,35,33,50,43,73,86,71,53,50,52,3,42,151,138,61,28,27,30,20,28,36,34,52,34,38,27,64,41,19,43,23,16,27,31,15,43,37,52,71,58,70,72,78,95,92,95,79,89,85,106,63,79,78,61,45,25,70,39,61,44,28,3,15,38,16,4,24,36,48,24,30,25,18,48,81,67,69,73,89,105,95,109,89,128,143,181,206,175,151,150,171,159,173,150,145,155,174,151,131,130,160,150,156,146,147,150,188,169,159,127,157,174,176,150,172,150,155,175,160,182,169,126,105,96,78,132,158,144,175,182,154,145,180,190,173,168,133,164,163,170,197,175,174,159,170,154,158,144,166,169,165,139,151,172,174,153,173,178,172,152,189,184,152,179,183,178,166,182,166,171,150,177,163,158,163,173,182,186,193,205,180,184,174,191,166,164,163,144,159,156,116,21,1,24,4,17,28,12,16,8,34,11,25,0,0,4,14,15,7,20,9,11,14,8,11,33,45,38,17,36,23,31,31,24,29,17,29,34,27,50,43,27,43,26,14,39,53,25,30,20,25,23,12,28,47,20,26,9,30,35,18,23,35,38,10,13,14,37,42,36,49,42,40,49,13,32,24,45,63,23,43,31,1,28,3,2,12,25,25,33,41,32,56,40,23,6,21,11,39,23,34,54,24,42,48,78,121,156,215,193,216,214,206,215,171,202,199,194,196,187,153,134,149,106,136,105,94,107,121,124,108,138,130,160,127,129,105,98,92,95,79,80,103,92,105,99,90,56,102,113,123,80,47,75,85,82,73,73,71,75,111,101,92,127,119,125,129,123,118,149,138,134,109,119,121,110,124,166,139,183,134,143,156,138,136,124,106,147,127,179,152,93,108,88,103,114,125,107,170,137,110,115,144,164,150,129,136,119,94,94,110,141,156,154,144,163,147,128,149,157,114,108,106,96,84,108,111,117,84,105,103,110,72,103,78,75,84,85,94,97,84,104,135,130,117,94,100,91,107,111,97,99,99,131,121,107,107,114,104,87,60,93,62,81,102,110,103,99,67,92,94,59,109,115,79,114,81,81,90,107,95,98,94,118,111,91,97,66,89,59,68,95,76,67,61,52,78,64,73,40,64,77,89,47,65,71,60,42,60,40,62,67,78,111,142,159,146,182,156,129,113,130,121,192,191,172,178,215,193,188,162,141,152,151,100,59,39,21,9,11,19,31,30,52,70,53,63,75,53,70,46,42,43,48,58,37,43,71,65,44,74,69,33,64,51,35,65,59,61,46,39,71,37,38,22,57,42,40,26,40,51,61,26,38,52,26,17,30,61,55,38,49,39,18,39,29,18,36,27,33,58,55,22,35,55,33,47,44,51,11,27,27,34,33,33,45,31,41,61,38,41,45,51,30,52,41,31,41,14,68,37,30,11,42,28,31,38,27,39,33,6,46,37,26,33,25,13,33,32,38,33,37,36,35,36,3,15,13,30,13,21,14,28,18,24,33,21,25,0,0,33,46,27,41,16,48,17,14,20,11,10,17,20,28,38,9,51,37,20,36,28,11,57,4,42,32,46,38,10,15,11,33,21,19,5,16,44,34,38,36,13,48,31,0,35,20,34,29,32,16,40,33,31,27,41,23,27,23,34,11,7,42,22,21,46,77,67,102,86,112,118,164,243,154,71,11,54,38,31,47,46,58,44,39,51,52,41,41,50,58,40,32,3,16,36,18,18,32,37,32,35,14,36,38,56,20,30,27,8,46,41,56,32,44,17,16,11,46,1,32,18,2,25,16,24,36,34,15,37,25,53,71,57,55,72,83,94,74,79,77,68,96,96,121,161,184,168,175,142,140,150,154,125,149,147,137,153,140,138,163,169,169,171,155,159,187,152,154,137,171,178,158,162,156,167,158,192,171,153,182,138,178,140,127,96,115,107,137,148,192,179,164,155,158,145,141,144,157,196,146,158,179,193,176,145,138,133,131,144,139,158,147,153,153,177,177,148,160,162,166,158,161,129,177,161,169,178,144,142,168,147,158,150,144,159,176,187,195,190,204,196,195,182,184,183,180,171,160,150,183,112,31,1,31,5,0,22,0,35,24,28,19,13,5,15,14,20,11,28,24,10,15,10,4,10,40,24,20,27,20,28,35,21,32,35,34,51,35,62,72,69,62,21,50,28,6,4,20,35,25,16,8,23,35,27,30,18,22,26,31,23,24,39,37,27,30,28,64,48,37,55,38,47,22,19,38,21,2,33,32,27,40,30,21,23,38,14,35,16,31,45,24,32,29,25,14,18,14,24,28,39,30,57,61,142,102,158,108,157,148,165,154,168,217,141,176,153,153,147,121,111,111,104,91,75,105,94,126,151,145,145,90,112,126,101,125,128,110,90,122,127,104,92,86,93,114,99,77,99,103,115,94,91,66,88,106,92,97,119,68,110,118,141,113,90,149,129,143,116,138,127,141,119,119,135,120,129,132,120,113,93,90,85,79,89,109,79,67,101,111,125,112,121,128,143,132,141,116,92,129,120,109,130,142,116,121,134,122,115,105,115,121,118,116,111,101,115,115,111,95,95,108,86,87,98,84,65,119,67,68,93,97,69,91,99,94,144,93,80,106,94,104,113,123,83,103,82,112,89,123,124,108,102,121,108,102,101,99,105,97,66,102,93,96,107,103,76,85,80,95,108,77,96,98,72,99,81,72,98,122,78,96,77,89,78,87,59,90,78,74,78,82,84,64,79,81,66,51,61,55,37,58,77,78,43,82,62,51,63,71,63,36,63,52,88,144,176,190,168,193,139,102,94,135,107,104,133,152,179,202,182,187,249,250,243,227,175,140,74,41,29,6,16,14,15,43,55,50,48,40,70,59,52,53,56,42,67,44,70,30,55,46,42,51,56,34,40,61,21,28,64,28,51,61,49,64,49,42,48,44,44,41,30,64,41,32,39,29,33,33,40,43,19,49,42,15,52,25,34,36,59,47,26,33,22,26,16,34,29,28,23,39,32,22,28,52,32,40,13,17,20,33,46,34,44,28,26,32,37,32,19,27,37,26,10,38,27,9,28,38,34,27,9,9,6,30,39,23,7,17,20,20,48,27,30,18,25,35,47,33,20,19,7,31,13,20,20,27,44,4,44,30,34,20,29,24,24,18,26,32,28,24,26,32,12,22,17,8,31,9,33,2,20,39,24,24,46,31,20,52,36,16,17,42,25,31,43,16,19,38,17,18,19,30,31,18,14,40,23,43,25,31,46,47,37,16,37,34,26,30,18,21,31,56,39,28,62,58,74,67,73,109,126,71,39,43,42,32,90,104,88,86,115,117,83,67,95,100,86,61,30,20,37,48,29,67,55,51,23,34,26,7,13,29,27,31,20,23,13,12,3,22,28,20,28,9,15,28,56,31,38,26,18,22,22,25,44,23,27,68,68,75,71,82,79,97,83,72,75,101,65,99,97,179,168,183,161,176,179,138,162,137,168,170,132,131,150,166,148,160,159,164,149,190,175,147,130,149,150,160,139,162,118,158,160,148,149,162,166,152,158,185,164,153,156,100,104,102,152,168,159,178,144,153,154,161,160,166,148,169,166,154,139,127,143,141,138,131,117,132,146,123,158,159,150,146,157,116,147,166,152,183,142,159,154,166,179,150,125,149,162,165,189,154,180,163,185,159,199,194,208,213,194,214,190,154,149,196,157,144,123,11,0,15,44,6,8,8,31,1,16,18,13,3,0,11,10,1,3,14,16,43,12,20,11,40,21,27,14,27,6,42,12,25,35,33,60,65,40,62,79,62,31,28,19,7,11,33,38,7,52,28,58,25,7,40,45,32,19,78,11,34,51,44,50,18,34,48,15,47,25,0,40,28,37,25,19,40,32,31,22,19,31,26,20,11,31,26,30,21,24,7,34,10,14,25,23,44,39,38,34,56,95,47,67,70,62,51,81,37,39,72,76,54,71,62,29,70,72,97,101,102,108,88,108,92,94,94,84,94,81,81,76,62,96,109,104,92,112,118,120,105,105,64,83,139,112,125,77,111,116,140,113,109,123,125,95,119,93,86,105,97,77,56,60,91,111,91,60,81,91,96,101,98,88,52,80,78,82,76,72,71,83,92,75,81,59,81,86,115,105,92,96,118,107,105,129,87,61,89,76,74,69,68,102,111,104,78,74,80,58,69,70,63,86,54,58,84,67,81,80,90,76,100,73,87,106,74,69,85,97,67,112,87,109,109,130,116,129,114,119,90,85,100,77,100,112,87,94,113,123,106,88,114,118,108,97,118,97,131,125,128,89,86,98,119,79,87,80,89,112,82,75,92,86,112,92,74,122,107,91,96,109,88,101,109,72,93,73,83,82,74,95,69,67,77,82,65,56,85,45,75,67,56,58,82,62,48,84,60,65,53,54,54,67,71,118,128,135,128,98,122,154,157,145,97,49,89,116,163,149,163,227,249,255,254,254,243,247,210,160,122,51,16,1,19,24,10,31,43,56,53,43,54,62,86,68,48,53,53,41,38,57,49,69,18,57,38,39,48,42,32,50,54,52,50,60,27,42,34,20,26,41,49,38,27,22,38,41,27,38,54,27,21,29,45,30,35,44,35,54,33,38,32,23,37,11,27,20,24,51,44,19,9,38,31,21,24,39,18,18,33,51,28,37,45,42,37,38,33,10,21,51,25,4,34,31,23,37,5,8,19,53,43,33,38,30,27,48,47,22,47,2,21,21,17,40,25,21,24,19,21,19,11,39,21,43,34,46,9,14,28,14,32,10,24,21,35,18,37,33,33,21,20,19,30,29,15,19,10,36,25,33,4,39,5,18,42,25,5,0,27,28,26,14,2,1,26,53,39,57,39,51,22,31,40,30,51,31,26,26,37,22,20,42,13,33,3,32,1,25,39,15,47,25,44,51,25,54,10,48,41,57,42,12,27,26,74,139,132,146,146,166,156,141,108,126,167,144,138,87,47,15,40,54,39,35,14,21,18,23,32,46,13,33,32,19,30,16,6,12,33,13,8,15,15,19,10,18,19,15,19,13,6,69,27,61,30,41,57,24,77,52,74,99,63,77,113,112,70,97,110,184,199,191,160,158,148,157,148,171,183,165,167,172,172,177,158,160,171,165,166,151,164,154,170,133,153,165,169,158,145,166,146,147,168,156,189,156,161,154,169,148,148,144,122,124,68,88,147,182,177,180,152,155,166,158,148,182,164,152,159,161,133,153,153,149,147,147,129,158,169,151,156,147,150,164,156,156,164,152,170,157,177,160,153,151,151,138,149,156,167,178,162,178,196,168,168,195,186,180,209,200,191,181,193,151,172,183,192,105,16,7,5,13,16,2,32,20,2,2,10,21,25,16,18,2,25,17,15,7,6,14,18,22,18,28,20,33,28,23,4,45,18,30,31,41,52,87,59,58,59,30,57,49,25,13,12,15,19,31,45,44,38,42,25,58,54,29,32,33,14,20,28,28,35,10,13,44,21,19,32,28,20,27,25,30,35,20,24,10,32,12,27,10,17,23,17,15,55,12,33,40,25,24,35,22,49,51,37,40,41,34,44,52,68,62,72,37,41,46,27,68,85,68,74,106,98,124,105,128,111,91,138,115,93,101,94,61,87,61,70,67,110,87,122,128,152,165,148,151,152,139,125,84,97,116,154,111,112,140,110,98,134,99,82,62,75,79,85,54,66,80,80,59,63,63,51,52,75,57,75,112,76,54,42,78,66,92,81,86,74,105,92,85,57,81,87,80,88,109,85,68,81,98,75,97,127,97,85,67,68,57,91,86,65,76,85,93,64,82,88,88,71,106,100,82,77,73,98,102,117,112,105,95,87,99,95,116,116,127,102,79,108,97,119,130,109,98,116,113,91,77,90,126,94,74,105,101,95,130,138,151,135,99,90,98,94,104,90,90,87,97,120,110,131,96,117,114,113,116,104,106,104,100,97,88,88,102,106,123,97,88,106,100,88,81,76,85,77,74,101,75,87,99,57,49,53,82,81,45,65,68,87,76,87,77,89,53,54,66,57,61,42,52,24,39,39,28,34,83,99,155,199,115,75,62,93,84,84,95,95,142,236,252,254,250,243,226,253,251,245,214,168,134,116,43,3,18,27,11,11,36,59,53,48,44,55,44,71,38,56,50,51,58,54,54,46,56,26,57,32,61,44,60,24,24,45,20,50,30,53,36,32,55,29,33,45,43,32,45,10,52,41,32,69,5,33,48,20,36,31,56,10,34,53,37,42,26,48,11,15,24,56,30,38,50,52,33,22,49,22,17,49,11,41,20,39,48,26,27,53,43,49,20,15,7,7,47,37,58,27,19,43,19,33,11,2,34,13,40,41,48,23,22,43,24,32,33,16,15,6,24,26,34,5,22,31,18,51,17,14,49,11,2,43,14,23,7,39,6,24,17,8,37,31,15,37,29,16,34,28,31,28,16,11,24,25,22,45,32,42,24,29,14,26,26,21,27,12,47,36,30,32,42,23,41,25,28,42,17,22,39,40,41,47,8,23,40,14,6,34,28,17,14,53,26,32,40,37,15,38,9,46,22,24,56,61,88,159,135,133,163,129,146,137,125,147,151,148,109,17,16,33,7,39,11,53,24,1,13,15,3,6,15,25,18,12,36,33,26,30,15,6,64,28,9,27,41,46,26,39,27,42,50,41,34,47,39,52,60,67,95,41,70,81,86,70,77,79,120,177,151,181,173,171,168,161,138,173,171,161,176,161,164,156,163,158,171,178,151,141,165,161,161,159,159,166,162,165,161,169,196,166,177,172,170,165,159,171,170,183,138,154,154,144,117,94,75,116,125,180,184,190,175,143,161,158,166,133,143,184,135,134,176,157,170,172,162,179,159,158,172,159,132,141,162,161,159,163,170,146,189,168,155,180,181,182,163,131,184,151,170,182,152,180,180,188,182,162,199,202,177,184,182,174,154,189,157,177,107,2,1,18,13,13,3,31,11,35,28,5,11,11,14,1,12,10,38,9,40,0,28,3,22,32,2,11,27,29,23,31,10,11,32,56,34,37,50,45,65,31,35,29,16,26,27,44,11,31,51,32,34,34,22,39,43,34,6,34,39,16,22,23,36,30,15,34,24,41,30,26,47,36,36,39,33,54,32,17,8,24,29,21,21,20,9,22,9,17,35,18,8,32,26,22,30,16,45,52,38,42,46,37,41,78,35,53,49,52,34,77,96,72,79,93,111,122,123,80,98,101,98,98,103,102,69,75,66,74,90,102,92,90,99,96,101,110,97,101,68,63,83,86,53,70,104,98,119,93,120,112,113,82,71,81,79,95,75,64,78,77,96,109,79,82,102,81,77,84,96,67,81,84,95,113,117,107,100,112,104,112,106,119,81,88,82,104,100,94,117,125,78,115,108,92,92,105,117,107,92,118,106,106,103,84,113,82,104,106,99,117,84,90,105,132,87,102,116,118,103,142,104,94,95,109,100,110,111,108,110,118,111,132,86,121,116,121,108,94,122,126,115,118,97,97,131,106,127,141,125,117,135,123,142,132,95,127,111,80,106,127,108,112,135,116,116,122,122,99,136,95,120,102,112,91,105,104,119,96,99,95,95,102,86,116,115,100,112,100,92,96,115,86,79,84,86,77,51,54,60,95,77,66,69,80,71,68,79,50,38,35,29,59,64,50,43,40,35,41,18,52,68,93,107,85,108,87,70,46,51,38,39,114,150,176,220,239,246,250,251,236,237,218,249,244,210,167,127,76,79,37,23,3,8,42,59,55,36,63,29,50,46,73,55,67,39,33,51,55,50,39,45,9,44,60,49,15,33,59,57,31,35,40,52,27,44,47,39,42,48,47,37,38,48,44,30,31,47,46,31,11,52,37,39,21,35,1,48,43,51,53,20,49,41,14,45,36,42,18,14,44,36,43,34,27,32,25,40,13,37,7,35,18,20,28,54,27,20,16,26,15,8,25,27,37,33,25,30,33,22,21,24,25,19,19,41,13,12,31,10,40,11,32,18,18,16,9,37,30,21,13,17,29,35,17,27,32,17,9,33,31,25,34,18,7,17,22,28,13,60,8,22,49,30,35,39,1,49,55,62,28,26,20,42,42,48,40,36,39,34,28,27,9,19,36,25,40,21,28,31,27,59,16,27,23,24,13,23,13,28,30,32,37,23,30,26,36,48,45,26,11,67,51,35,23,33,51,68,127,108,134,131,127,129,117,101,117,149,167,120,30,9,20,28,20,40,37,30,21,38,19,13,1,17,42,8,26,16,35,9,7,15,7,18,3,7,21,14,27,12,18,22,26,58,35,24,52,50,61,71,72,43,73,94,70,91,85,73,116,142,183,179,162,142,162,143,136,140,139,168,177,165,216,179,162,179,164,162,176,155,173,134,138,153,150,140,176,178,166,198,167,147,157,146,174,171,153,168,163,141,151,172,151,175,194,154,140,97,88,85,131,152,183,180,202,178,163,171,170,144,169,164,148,159,143,141,151,155,145,131,159,171,186,156,172,163,139,158,163,160,152,155,178,154,153,157,157,163,163,143,166,125,162,163,165,195,147,199,160,209,169,169,165,152,156,180,157,150,167,122,20,12,25,20,13,8,40,23,11,16,18,15,29,35,0,32,43,13,7,23,9,5,26,5,39,14,25,39,14,18,19,39,17,25,44,41,27,62,55,46,45,75,43,23,16,26,29,17,34,25,4,10,23,25,37,25,31,52,34,4,10,20,26,21,45,38,31,22,11,47,38,26,43,31,9,42,25,21,26,35,10,21,6,14,24,10,15,9,22,9,19,24,19,50,11,41,36,28,41,35,45,41,19,49,23,37,39,35,45,46,29,30,55,44,45,36,51,41,48,37,26,32,34,56,52,25,26,15,44,27,43,67,62,59,37,45,21,27,31,34,54,29,70,39,68,62,54,81,93,95,78,80,95,86,78,102,105,78,102,110,99,121,140,137,114,117,122,72,118,94,108,112,131,101,118,117,118,129,103,108,123,101,131,91,92,127,119,112,85,113,104,116,98,87,87,105,119,119,113,102,124,123,120,134,117,122,111,112,119,126,103,104,122,125,97,107,97,124,95,124,120,92,103,93,101,101,83,113,124,132,98,143,111,126,120,104,153,126,101,111,127,107,130,123,111,111,118,140,104,107,107,132,123,117,126,117,111,114,118,105,127,104,109,113,117,102,122,110,111,109,115,96,120,97,91,115,103,104,117,95,109,112,96,119,87,110,132,116,109,94,102,82,75,99,66,65,89,67,54,65,65,65,58,89,87,64,81,61,59,65,62,52,59,51,34,45,63,49,70,40,52,54,76,80,87,107,105,73,63,80,44,62,77,110,115,121,147,190,207,233,244,250,249,247,255,243,255,255,238,190,145,135,83,45,11,4,57,3,18,26,24,39,33,29,55,21,40,39,28,29,53,64,43,60,48,47,34,37,28,25,39,40,51,58,30,47,55,37,46,37,47,39,37,31,51,42,36,30,38,28,30,36,22,17,53,23,19,17,22,64,52,20,18,35,25,22,14,36,21,33,23,25,25,15,14,19,25,29,44,16,23,35,20,17,30,33,32,10,15,33,33,24,31,9,15,27,39,40,19,18,36,40,33,19,6,18,0,19,31,23,48,18,35,30,6,5,14,30,5,59,26,25,35,11,32,17,26,18,25,25,33,11,30,31,14,16,4,12,40,32,12,33,9,24,21,28,5,29,69,34,25,21,22,8,18,46,26,27,11,10,29,27,52,22,46,32,14,23,28,13,26,30,18,32,10,43,27,29,38,36,10,32,24,40,56,54,20,42,10,25,26,47,49,25,45,30,47,78,77,82,106,104,122,127,116,101,117,135,154,119,24,11,31,18,33,37,17,24,14,13,9,15,19,39,24,11,28,13,13,35,34,33,7,24,26,51,28,39,15,32,30,50,61,33,35,58,54,66,66,52,84,49,75,83,101,99,103,121,151,168,172,177,157,137,146,136,134,168,154,162,177,150,192,197,157,175,167,167,134,152,139,157,171,159,169,158,156,168,168,153,155,150,157,178,137,179,176,162,188,157,165,145,172,162,176,161,150,138,110,64,75,117,178,190,164,188,164,157,164,163,160,161,135,156,134,144,153,184,162,162,173,133,164,114,145,159,168,140,140,152,169,151,159,133,151,150,187,187,153,196,155,171,123,169,140,164,167,149,170,173,143,165,164,157,169,162,167,149,195,138,7,19,4,11,39,20,10,3,0,5,10,22,41,0,11,22,18,27,0,35,19,8,37,19,24,32,13,28,33,33,30,50,44,21,39,30,41,27,30,40,31,35,24,20,22,19,24,26,22,15,23,24,11,12,5,18,31,21,26,22,36,23,45,15,24,14,15,16,30,21,22,14,15,42,13,18,8,27,10,7,11,26,6,9,16,18,15,17,26,3,16,3,20,37,42,58,49,44,54,74,58,45,50,81,72,39,58,29,60,51,29,22,45,54,33,20,44,54,50,20,43,37,30,62,43,43,32,40,34,55,78,51,67,61,117,73,80,99,81,79,73,80,92,64,81,104,87,90,94,101,98,110,81,121,101,94,76,96,119,91,119,112,116,131,114,122,125,129,102,132,135,104,107,144,131,111,115,90,84,96,132,136,126,126,116,109,115,122,86,148,130,110,82,115,114,111,110,106,138,124,110,107,114,124,120,147,94,131,103,139,126,101,115,126,131,105,116,115,124,121,123,117,114,142,135,105,109,114,100,112,130,98,126,110,115,122,132,138,122,130,99,80,104,106,93,113,132,110,120,109,117,110,96,144,141,122,131,144,106,126,107,104,109,103,112,142,125,115,115,114,140,108,132,116,78,100,88,103,99,106,104,102,125,71,101,107,90,89,78,88,118,82,70,55,90,94,94,104,100,57,97,87,92,58,68,57,41,81,57,63,60,60,67,58,58,50,83,96,89,58,68,77,76,66,47,30,91,86,117,105,114,102,142,99,112,102,96,81,117,110,160,162,214,234,254,248,243,252,254,252,254,255,251,214,156,146,115,83,51,35,14,20,35,10,42,32,53,44,48,64,27,60,38,33,62,27,53,43,19,14,15,37,22,45,18,53,82,42,29,15,25,41,36,14,31,26,37,30,34,22,40,53,34,43,32,30,25,56,48,19,27,57,21,11,25,37,33,22,53,44,18,17,29,34,36,42,4,10,8,5,27,7,43,29,12,29,43,11,24,24,7,49,4,11,20,32,22,28,33,55,27,17,17,39,52,29,7,12,30,42,39,16,28,11,32,11,34,11,23,18,11,15,18,23,29,13,30,29,13,10,7,23,25,27,30,32,22,6,9,20,10,16,8,24,29,16,19,23,15,31,37,35,32,20,34,49,31,25,9,22,18,38,26,12,47,32,31,13,31,39,38,34,20,31,50,10,14,27,63,8,33,14,56,30,62,44,29,28,37,25,26,35,46,40,23,41,40,61,78,82,98,67,103,87,92,99,110,101,107,95,30,27,13,10,51,23,19,19,28,19,43,27,21,16,22,25,26,35,23,1,17,10,34,10,0,17,30,9,29,45,33,34,21,53,48,45,39,47,82,73,45,53,93,88,82,119,99,165,161,161,161,162,166,177,159,152,149,160,146,169,149,170,176,167,149,163,170,154,131,156,154,148,156,183,154,141,169,183,171,147,163,139,150,162,154,156,142,153,160,151,143,155,149,157,173,192,191,188,137,129,82,100,118,158,161,172,175,196,160,166,175,179,136,153,165,152,171,182,174,162,153,162,163,178,185,175,158,169,146,144,145,141,159,176,153,146,153,169,161,151,140,154,141,178,149,145,159,163,137,167,150,158,167,172,161,142,170,178,176,109,29,0,37,2,13,27,9,9,4,6,12,8,14,9,14,46,25,9,9,25,10,28,19,4,35,25,32,32,37,45,14,69,25,41,29,51,49,46,21,47,20,58,31,39,42,40,32,29,29,28,19,19,11,42,25,13,38,33,48,25,41,15,31,40,32,18,33,28,25,14,27,12,31,15,13,39,28,25,10,0,14,15,12,8,27,11,15,15,25,9,23,14,9,21,36,44,69,72,59,104,46,62,72,84,60,51,70,60,67,64,58,64,44,60,56,47,32,16,39,39,38,32,64,40,76,58,69,47,56,42,45,25,29,88,82,132,96,103,123,96,131,127,151,121,132,115,129,95,107,112,128,123,104,100,108,94,137,117,104,108,78,109,109,108,106,124,115,109,105,109,97,119,109,113,115,155,123,127,125,125,134,125,105,103,112,121,114,117,125,112,125,93,85,121,119,96,110,115,124,146,119,117,107,111,138,136,126,142,143,135,104,127,108,103,127,128,141,98,137,124,111,122,103,116,150,136,121,108,110,138,121,125,95,132,99,119,149,144,144,115,121,127,108,100,94,125,132,131,129,123,111,116,109,121,98,131,102,114,104,113,126,133,107,112,125,105,127,128,121,131,107,86,71,112,95,125,96,96,96,122,113,83,71,101,86,86,97,73,71,77,83,111,97,107,97,87,119,65,60,41,50,99,57,67,85,93,90,78,71,58,46,55,51,45,72,43,66,18,73,61,81,78,96,43,30,77,105,100,81,97,100,118,116,106,138,106,114,103,82,79,75,82,131,138,163,210,211,234,251,255,248,240,249,254,254,239,205,222,143,128,104,56,38,16,1,26,13,28,18,32,34,51,53,49,47,34,55,46,53,28,50,33,26,59,15,49,54,38,47,48,3,47,47,15,30,20,34,32,16,48,29,26,38,13,42,15,13,33,7,40,40,61,33,15,4,35,26,44,42,47,19,37,31,11,48,18,28,32,49,8,20,12,35,16,22,18,40,28,42,21,40,41,27,15,4,12,30,14,9,15,15,35,35,46,29,46,2,26,29,8,20,15,39,34,30,14,14,16,20,6,19,34,17,14,9,18,24,29,1,20,37,24,44,28,15,4,41,39,28,38,31,15,27,26,11,34,37,49,6,30,16,16,56,40,24,10,22,13,24,35,50,27,45,27,22,25,17,16,45,63,52,43,21,44,11,36,23,31,16,25,13,20,22,62,48,26,40,21,30,32,42,63,51,38,45,12,43,47,66,118,105,96,105,115,101,98,121,84,138,139,39,36,30,31,36,27,21,1,25,52,37,29,29,32,20,14,21,18,31,31,30,23,10,35,31,6,21,40,44,37,23,34,49,55,64,57,42,97,81,86,76,82,100,81,95,103,155,170,163,174,153,142,168,169,192,160,181,178,135,156,154,177,145,150,144,165,143,186,176,159,160,179,162,161,166,154,180,156,158,162,173,154,168,164,131,164,168,162,155,148,151,146,153,150,163,165,188,200,167,157,138,99,113,109,138,154,184,185,175,164,175,160,141,186,146,180,168,167,168,178,168,143,188,183,189,175,197,168,163,175,129,160,132,158,175,168,151,157,144,154,148,157,149,139,160,148,164,164,160,147,169,165,149,168,162,163,175,155,164,102,4,23,26,21,13,11,27,24,15,8,28,25,26,0,2,11,9,2,4,26,9,10,22,7,70,39,26,34,14,19,11,19,31,28,13,13,5,14,6,21,24,40,15,36,8,28,6,14,13,41,31,39,38,25,53,19,27,27,31,16,25,13,37,36,18,22,14,3,18,5,38,27,20,30,20,21,11,23,12,14,23,10,11,26,19,25,7,22,25,17,13,18,4,38,46,48,44,72,45,54,43,49,42,59,48,40,56,57,70,51,64,58,46,67,62,53,56,36,40,54,35,56,61,85,89,81,75,60,69,78,47,60,43,57,46,95,103,113,81,78,87,116,116,118,128,131,135,117,116,111,126,106,103,114,111,106,98,136,103,107,108,107,118,120,94,115,103,98,102,103,124,128,111,133,150,121,139,136,101,99,132,117,145,137,118,108,126,121,84,118,116,125,146,145,122,128,125,115,120,97,99,124,92,113,101,132,142,105,108,102,110,96,122,93,112,112,124,105,122,107,112,125,128,123,142,148,132,100,133,139,121,116,122,100,133,125,156,134,125,121,139,104,129,143,122,139,135,148,123,104,119,104,144,94,85,106,125,142,101,129,133,127,111,115,140,106,116,132,101,145,132,138,119,116,134,128,124,121,118,105,106,80,76,61,81,101,85,76,82,77,73,100,91,121,100,85,81,72,68,60,62,55,62,68,74,55,57,59,56,87,97,67,57,62,71,54,44,59,86,49,75,83,75,72,57,41,76,61,65,82,62,84,96,73,74,129,161,113,132,113,105,37,55,42,64,81,132,138,172,162,194,234,243,245,253,249,248,243,215,239,252,225,188,150,173,112,51,31,9,8,7,42,22,45,35,46,49,45,46,59,42,23,32,36,39,36,37,40,50,52,41,23,24,47,30,39,38,41,45,21,33,56,48,41,14,28,24,37,28,27,43,31,39,29,28,29,18,17,24,34,44,34,14,10,19,6,26,36,20,13,60,34,20,29,31,17,39,22,19,30,47,19,21,14,51,8,19,26,11,35,26,13,12,15,32,34,37,40,22,9,9,17,31,7,7,18,27,7,30,21,29,18,11,22,37,22,15,35,29,10,1,9,11,22,27,17,21,2,17,9,44,22,25,5,20,8,13,45,12,5,9,16,22,25,49,25,41,33,55,21,25,35,33,22,36,43,26,30,66,76,59,33,18,28,26,40,41,28,34,54,48,41,27,29,38,35,53,40,49,25,43,33,49,28,31,32,39,38,78,111,157,129,158,138,100,102,140,144,154,143,70,41,6,35,20,23,15,10,8,40,21,8,37,44,12,27,19,20,38,34,31,10,9,32,30,26,20,16,36,25,38,47,49,54,59,75,60,62,102,86,93,74,95,95,80,149,150,176,154,143,156,155,182,164,153,153,155,174,185,167,161,158,182,132,160,155,167,179,155,170,152,200,155,151,177,150,196,162,123,161,175,185,171,146,152,155,152,132,142,146,162,160,154,150,173,147,175,179,181,181,192,140,112,78,109,108,160,190,184,185,178,142,186,174,178,146,157,167,170,166,161,191,132,168,155,145,176,175,161,154,159,145,166,140,152,156,163,157,159,150,164,158,152,178,160,155,158,161,152,179,158,170,169,171,182,170,141,159,142,111,8,6,10,10,31,17,31,4,24,4,29,30,14,1,22,28,17,15,2,9,10,22,10,34,31,47,17,24,51,21,7,32,18,21,33,45,22,9,18,17,6,29,22,27,8,3,33,21,0,12,9,28,44,20,18,6,16,35,28,12,35,28,12,32,30,8,4,29,11,10,23,12,6,12,36,31,13,30,20,23,28,34,48,21,14,16,20,16,13,38,28,19,27,45,6,65,62,57,66,76,49,59,61,55,78,66,83,70,55,53,85,95,104,84,77,65,82,61,58,44,69,58,53,99,93,74,121,81,106,106,89,75,51,89,71,70,90,70,87,65,73,110,69,99,103,87,129,109,128,140,140,114,125,111,126,140,123,115,116,116,119,84,128,130,119,124,101,151,111,130,120,129,145,116,150,110,113,127,119,107,121,164,120,105,117,108,124,126,118,132,104,146,112,110,108,136,117,103,134,118,135,104,119,118,123,124,64,95,105,127,136,135,122,106,122,124,102,102,131,136,123,148,130,122,103,131,137,123,117,148,128,112,117,136,141,109,92,114,123,107,139,119,133,111,117,135,119,103,116,113,111,114,86,94,105,119,123,112,122,94,117,116,116,110,105,121,115,119,127,149,128,118,131,121,134,148,124,159,145,139,133,85,100,82,84,112,87,84,82,74,76,58,79,120,93,101,78,64,94,69,111,85,62,68,89,96,106,88,74,99,74,74,62,68,39,48,69,55,80,68,62,52,74,56,50,54,46,49,58,53,64,79,60,65,100,125,150,150,179,168,154,150,124,87,97,111,82,115,99,89,125,124,177,166,195,196,215,201,228,248,242,251,247,238,252,241,243,194,166,102,67,114,5,19,24,25,30,33,35,29,15,41,18,31,11,48,35,49,13,39,50,23,31,33,39,36,30,35,31,24,13,24,42,34,9,17,50,49,24,50,30,30,33,30,33,25,16,16,12,28,33,33,42,33,10,51,9,27,8,47,32,23,43,45,12,25,22,23,20,49,19,14,41,44,23,25,14,12,23,23,23,22,14,11,19,12,14,33,23,29,38,28,8,5,26,31,24,10,22,15,23,3,18,20,5,22,28,17,11,25,15,28,10,13,23,17,14,42,33,0,43,31,30,9,33,14,26,7,29,8,22,33,31,14,21,23,33,43,14,14,28,20,23,48,61,31,43,35,23,47,29,17,41,14,22,30,10,51,35,22,29,33,28,42,39,61,45,44,42,52,24,36,49,44,44,35,34,42,80,118,140,134,122,95,85,112,115,122,146,139,63,36,3,12,26,27,8,17,29,31,40,11,45,41,15,31,26,25,28,12,11,11,15,23,39,20,39,31,60,49,29,50,56,42,81,60,51,52,67,78,76,109,104,107,144,183,152,167,166,156,154,180,174,179,163,164,183,167,159,170,147,172,148,166,147,153,159,145,149,154,155,163,157,160,164,155,167,151,148,135,153,151,176,161,146,168,150,165,162,174,194,164,151,149,167,143,152,146,181,162,160,146,138,102,87,91,127,164,185,206,147,167,155,168,150,156,145,175,147,157,156,166,146,150,138,150,150,153,148,155,179,161,161,174,180,171,167,160,185,135,167,180,140,173,174,167,187,200,199,146,178,145,156,141,178,171,185,174,170,124,1,8,6,3,13,15,13,21,25,28,12,16,9,8,3,18,5,4,10,13,7,6,3,7,27,14,15,18,25,32,17,9,9,30,16,24,26,45,10,33,1,7,18,27,15,33,32,31,25,20,15,16,30,8,46,21,11,0,25,27,38,40,7,11,31,31,4,35,19,21,24,23,3,26,15,26,24,22,29,9,15,1,27,29,18,10,36,12,46,26,35,10,2,37,35,49,23,55,72,71,68,87,63,102,71,66,80,102,109,150,100,79,110,102,107,70,74,38,67,39,48,64,65,117,113,105,97,106,117,121,104,97,106,84,101,95,85,131,133,94,91,95,87,88,130,113,100,111,134,137,137,126,135,143,132,116,129,120,132,136,153,121,128,114,117,149,132,140,156,131,122,139,128,135,130,144,146,140,144,134,128,97,141,126,132,128,121,109,91,102,117,118,134,121,132,128,117,135,133,128,95,97,129,133,126,118,124,139,138,111,128,104,130,121,147,157,104,130,132,127,112,125,118,102,112,129,149,129,137,114,104,132,120,119,145,111,132,121,131,129,126,114,140,144,133,127,107,108,99,118,144,133,130,132,154,145,131,132,128,135,171,130,148,141,119,112,140,118,104,104,105,123,116,140,144,153,126,140,150,109,119,121,110,133,122,122,58,82,89,86,95,92,102,116,98,100,126,96,89,110,107,100,92,107,84,98,101,111,102,70,65,63,73,78,46,39,68,66,74,60,47,34,65,48,34,40,60,26,37,53,55,102,64,81,97,122,137,154,172,162,173,194,195,184,164,135,125,129,92,101,73,73,92,63,105,132,154,157,179,226,225,244,235,243,252,255,246,237,233,234,241,233,210,144,116,70,66,19,5,5,41,34,45,35,34,33,33,30,47,28,28,33,27,35,38,31,36,57,32,38,52,26,22,30,15,40,35,31,11,7,42,36,19,25,22,27,7,44,35,37,16,27,20,32,12,47,29,34,32,38,39,22,28,7,27,33,39,13,21,23,10,15,15,18,33,26,13,23,18,26,39,29,12,41,32,34,11,4,18,37,14,20,13,17,12,29,30,52,8,6,29,15,27,45,44,12,31,34,7,36,26,15,41,37,2,24,6,11,29,45,41,9,21,31,24,25,18,17,3,11,30,42,32,4,24,24,25,41,22,22,23,30,13,40,37,27,13,23,37,39,23,26,33,17,20,50,18,21,43,51,10,31,6,50,24,17,57,47,40,33,36,44,32,17,50,16,10,56,57,71,61,59,55,37,36,41,55,66,55,76,28,40,9,33,21,15,12,8,26,34,21,10,41,29,20,7,17,45,6,47,24,9,20,25,49,21,36,26,42,17,50,37,63,40,51,70,62,72,80,96,94,122,108,141,173,149,162,151,174,171,165,155,163,176,163,141,157,153,148,165,147,140,149,126,112,165,175,166,182,161,154,148,157,168,173,169,158,145,160,138,164,154,160,168,162,175,179,167,198,173,169,155,136,117,117,151,150,158,158,174,154,179,163,134,132,92,85,102,169,178,174,162,172,160,159,154,159,168,118,163,153,152,157,154,139,165,157,175,176,167,162,173,169,145,147,173,183,154,153,165,139,152,181,184,140,158,164,166,167,141,148,171,163,157,149,171,170,168,159,102,22,10,3,20,3,11,31,10,7,3,13,7,20,11,14,10,10,12,4,5,19,5,1,3,10,43,14,15,12,4,38,24,19,16,42,5,24,22,24,2,22,49,8,18,26,10,17,10,24,22,20,33,8,40,11,15,15,9,13,11,22,34,16,13,42,14,24,16,9,10,27,12,8,9,17,19,26,12,18,10,18,46,36,11,20,22,25,11,28,19,10,20,32,41,24,25,37,31,50,36,33,36,31,45,53,65,55,76,57,82,85,68,81,99,58,40,70,12,43,45,35,66,52,47,82,60,47,75,42,63,61,80,53,65,72,51,57,42,76,63,56,32,27,45,82,85,90,50,64,78,73,113,93,97,98,115,83,112,125,118,100,123,116,128,125,138,125,136,113,125,152,125,121,112,118,121,161,123,125,119,122,98,98,135,97,111,137,139,124,123,101,137,137,134,138,138,128,134,136,113,126,133,135,118,122,116,129,144,140,147,127,135,139,128,123,136,140,131,144,128,119,150,128,128,128,141,137,157,128,117,120,128,145,131,127,114,126,96,117,132,128,126,141,115,121,146,129,148,145,146,147,125,128,146,103,132,131,135,135,121,136,126,127,124,124,136,112,101,115,125,123,127,108,129,121,121,115,128,113,73,108,87,132,117,91,96,104,120,109,127,125,117,99,116,140,88,114,128,91,85,96,74,61,98,88,76,61,77,73,49,35,57,78,101,77,75,79,66,92,45,67,58,26,51,30,45,62,72,60,36,29,60,113,88,95,109,112,70,102,116,69,115,91,80,71,68,132,111,92,103,92,91,116,43,51,69,89,58,123,132,131,153,149,180,194,228,237,238,247,239,252,249,249,244,240,216,179,190,131,98,113,48,13,20,10,20,24,29,20,24,16,40,49,33,18,27,16,38,40,40,21,51,16,20,43,9,37,15,29,9,11,7,21,35,33,29,18,6,43,16,47,38,38,15,31,27,53,39,18,25,35,19,11,23,29,17,22,20,32,31,8,44,22,11,18,24,29,26,44,32,17,20,6,30,20,23,15,40,30,23,13,25,27,31,23,38,19,19,16,30,14,16,40,18,12,26,29,22,15,16,37,55,17,13,65,17,41,52,35,9,30,25,19,29,3,27,46,16,27,33,18,42,31,22,22,7,20,23,2,2,26,25,25,30,32,44,36,17,49,47,51,19,46,34,23,37,42,33,35,36,34,31,24,20,22,20,48,46,39,24,21,41,30,32,59,36,21,45,31,54,41,38,18,47,34,32,28,38,72,69,30,41,32,30,17,12,16,29,28,16,40,21,22,11,44,31,26,34,10,23,7,12,34,22,37,24,14,26,54,25,25,57,31,41,71,76,50,79,79,69,112,93,155,194,169,173,151,163,180,163,148,144,162,161,168,138,158,156,149,141,178,150,172,163,173,144,180,166,153,158,175,166,149,147,156,152,173,153,157,137,178,173,151,169,149,163,142,186,168,178,158,166,162,146,151,152,162,165,197,144,167,166,142,171,150,106,66,66,131,145,152,152,178,159,152,138,172,158,164,144,152,172,165,180,168,165,160,157,158,183,154,169,148,167,133,162,167,164,166,149,174,154,187,170,166,172,160,166,173,138,155,164,170,165,183,152,153,184,190,116,3,7,11,3,18,6,18,18,2,26,19,2,27,1,17,4,28,22,14,24,27,23,4,4,30,31,16,43,14,27,43,15,19,14,20,20,10,29,9,16,45,14,25,12,23,13,20,12,11,16,8,45,45,36,19,17,13,45,10,10,31,28,35,13,31,20,2,32,32,36,11,12,2,37,4,4,43,23,17,21,7,38,30,19,14,27,35,29,20,19,31,8,33,28,22,16,52,3,4,22,20,44,15,30,22,17,47,53,33,36,21,33,35,22,35,32,32,47,40,69,36,35,42,48,67,53,57,66,27,36,61,48,47,63,31,60,61,65,34,23,36,46,45,58,56,75,68,42,75,45,67,78,46,76,68,112,103,125,130,129,112,111,114,103,107,119,127,138,114,129,124,128,128,119,134,118,116,108,105,126,100,92,109,118,139,108,147,132,117,136,144,124,151,127,151,129,128,119,132,129,123,130,121,122,138,166,154,116,129,135,132,102,126,133,124,129,145,145,136,129,139,138,132,128,123,133,148,116,144,125,121,122,141,152,118,134,129,134,125,132,119,136,144,130,135,146,126,159,149,130,145,130,147,112,134,133,127,118,128,110,137,111,129,130,117,94,136,108,133,139,140,118,124,123,109,102,115,103,118,100,114,113,116,94,107,125,115,126,115,102,128,118,99,109,66,107,86,88,102,82,68,75,67,90,113,68,77,91,74,64,56,59,43,87,40,93,60,89,62,64,54,67,83,84,63,52,56,42,20,24,53,70,77,73,61,36,52,58,56,48,40,49,62,53,55,61,86,89,108,87,128,158,129,112,88,66,46,22,48,14,30,72,81,85,85,129,122,169,200,185,239,253,254,244,251,234,255,242,250,252,243,195,175,148,110,88,65,44,25,19,28,66,30,39,26,57,23,28,47,26,34,11,21,34,33,42,44,44,32,53,32,32,27,33,29,47,13,34,28,27,31,54,22,2,35,20,31,24,19,50,13,9,7,30,30,24,14,29,41,17,5,15,7,38,4,38,16,1,24,19,14,28,11,1,8,6,27,19,27,23,37,7,43,12,21,31,5,45,51,24,17,32,28,5,11,24,29,6,5,2,7,5,9,35,39,31,38,8,21,27,9,10,9,8,24,13,13,3,26,22,26,9,24,1,11,8,16,30,22,10,24,9,30,20,18,25,29,45,42,67,67,18,39,53,20,7,41,25,49,21,22,46,45,42,27,49,17,57,30,24,49,43,62,25,45,43,39,56,44,59,53,68,21,19,22,60,17,44,28,26,42,37,46,7,29,31,14,52,20,41,8,16,14,21,12,30,33,3,14,35,3,23,34,12,44,46,21,18,34,30,66,55,48,62,72,82,95,85,87,90,115,171,183,186,163,170,150,182,162,152,161,143,174,154,163,159,174,176,158,168,144,146,177,177,174,165,147,159,159,178,148,186,158,163,168,157,177,173,163,178,163,162,158,164,152,163,141,152,142,177,159,165,137,155,153,137,153,187,157,180,172,162,181,210,173,163,106,90,77,106,130,168,162,171,168,161,173,148,165,187,163,174,151,174,164,167,162,183,167,159,172,165,160,177,155,165,166,158,155,158,163,151,161,152,163,170,146,159,176,180,162,153,161,152,156,147,189,143,180,106,9,6,3,0,24,1,11,13,29,26,29,29,12,17,20,5,7,11,18,21,3,20,16,23,60,14,25,23,41,15,37,11,39,15,26,31,10,4,24,20,20,11,27,13,0,48,7,24,42,29,21,29,8,34,14,46,23,9,28,15,7,48,29,10,13,40,10,11,17,44,21,41,33,18,24,20,52,51,34,4,33,5,23,15,41,27,21,38,34,10,19,29,28,27,15,33,19,17,9,4,33,11,19,37,40,51,36,36,38,31,30,34,56,32,54,67,53,24,46,49,68,49,45,64,48,21,58,58,64,31,59,61,34,67,72,62,64,69,74,53,44,54,54,44,105,77,94,82,84,102,89,89,88,136,134,147,129,114,133,142,106,129,152,136,108,112,115,110,108,120,102,105,148,103,139,135,137,116,144,134,138,114,126,154,153,116,141,138,115,135,115,122,119,117,116,120,130,115,101,118,122,125,121,134,104,134,144,133,132,130,132,133,111,120,124,139,124,144,138,128,150,130,150,114,141,148,136,137,140,135,119,136,140,121,138,130,152,130,130,129,123,166,121,157,130,135,120,143,173,136,130,138,119,150,126,119,138,143,132,152,137,168,122,118,146,146,134,127,130,148,143,140,123,126,131,116,139,109,141,120,137,124,140,106,115,125,123,121,92,100,115,104,111,116,86,82,85,79,84,109,79,88,85,83,93,89,75,95,81,101,85,59,75,68,64,52,67,61,79,61,88,67,76,79,68,41,58,71,66,74,60,60,30,48,66,90,55,59,50,64,40,51,72,35,60,57,45,77,95,113,129,121,146,141,141,132,80,59,84,85,91,75,57,57,53,70,72,98,112,104,146,159,219,213,223,230,247,213,242,239,226,243,235,220,229,211,196,190,145,121,105,94,57,29,21,25,11,61,37,45,39,43,32,13,36,44,19,30,11,30,28,17,15,47,13,38,33,23,21,9,42,33,20,15,14,25,42,25,31,19,17,14,56,43,20,37,25,9,2,22,22,18,32,24,3,29,2,46,20,2,54,20,51,43,26,14,17,13,11,6,33,38,21,37,12,0,29,11,18,33,28,34,29,29,38,20,29,16,20,39,35,22,39,21,36,9,34,18,12,8,19,35,31,27,36,14,19,16,3,18,20,42,44,22,17,13,8,15,7,24,22,3,6,30,17,46,37,52,42,32,36,54,39,40,52,55,53,19,32,23,27,42,9,26,48,40,57,30,10,18,49,37,40,28,36,29,39,50,38,35,49,34,44,29,25,15,17,33,31,29,72,25,40,51,29,12,15,8,39,20,23,20,13,6,50,33,16,44,24,13,30,22,36,5,53,19,44,33,40,40,55,52,59,63,94,91,91,86,75,108,157,179,178,147,169,169,166,134,161,156,170,147,152,153,186,140,173,191,178,180,172,155,157,183,137,151,154,169,132,164,143,168,169,169,186,184,163,160,160,154,164,170,162,166,162,159,151,144,161,157,160,162,163,176,150,137,127,154,164,167,171,161,162,153,155,176,148,142,98,84,92,134,147,183,196,174,163,170,172,132,167,153,176,188,175,155,165,193,166,137,164,133,185,158,168,157,162,159,162,159,150,176,195,161,151,204,190,172,178,191,174,168,154,171,155,149,166,172,166,92,12,5,4,16,15,2,9,4,11,8,20,5,28,8,15,30,5,4,9,18,24,40,23,35,22,30,23,27,31,19,23,13,38,18,10,14,16,19,12,14,9,18,13,21,23,50,22,22,23,20,13,10,9,15,6,33,4,18,36,13,12,13,10,15,37,10,33,21,30,12,47,27,44,2,8,31,36,8,18,36,17,12,39,19,29,18,34,16,18,13,8,19,19,16,30,27,23,26,25,36,21,37,38,28,33,58,24,39,45,63,54,59,40,35,34,46,56,61,60,83,85,72,45,68,61,79,65,50,49,37,72,46,35,62,59,88,52,66,66,82,73,92,112,95,80,100,98,92,59,90,79,105,126,108,124,122,130,124,122,94,115,127,145,130,136,122,113,112,136,148,134,127,160,149,168,136,138,110,140,120,144,167,115,104,149,154,137,128,116,124,121,107,143,126,141,117,111,125,116,117,141,140,106,102,133,138,154,128,102,107,123,121,131,125,118,150,122,91,127,154,97,128,118,131,95,135,129,157,175,148,137,156,122,126,151,173,153,138,143,147,129,143,142,148,156,141,136,146,136,152,115,145,138,140,154,173,157,136,153,151,126,159,148,140,171,134,146,154,140,148,130,167,151,133,153,156,126,154,163,146,127,135,144,147,142,151,128,115,131,114,97,126,95,121,83,94,124,122,92,78,76,62,70,62,81,64,61,60,88,93,68,76,84,74,107,89,89,96,100,77,76,64,63,41,78,56,69,70,44,58,38,49,62,60,56,15,58,54,65,50,64,7,78,60,45,49,53,34,62,75,81,94,101,122,147,152,129,124,150,161,160,153,132,90,87,87,63,53,60,38,50,56,95,98,132,158,168,206,190,212,238,233,230,240,209,253,244,230,239,247,246,223,211,200,181,182,141,99,62,60,44,24,14,29,22,36,10,42,29,15,28,8,23,43,33,27,27,36,37,9,30,12,28,5,41,18,52,22,33,8,16,40,10,11,26,21,18,15,27,16,32,26,19,40,25,19,9,20,35,17,38,10,15,4,23,10,9,11,27,27,11,21,16,17,7,21,38,8,17,24,44,31,10,29,2,18,44,45,20,15,26,8,38,12,5,7,12,40,31,3,29,5,30,44,23,18,9,29,7,11,18,12,39,34,11,44,16,16,22,3,32,13,7,14,14,38,38,22,37,44,22,29,19,34,51,20,6,26,37,50,48,18,13,60,56,54,39,50,55,63,49,68,20,29,35,32,44,43,43,42,41,23,55,26,30,19,0,36,29,49,28,16,32,15,47,5,43,21,15,14,16,17,11,10,26,32,51,12,36,39,29,23,20,23,31,44,39,44,51,24,41,70,66,64,61,85,93,74,100,166,149,173,160,167,168,146,169,173,151,171,156,158,149,163,166,145,161,166,143,153,146,151,156,161,169,152,122,164,173,147,146,151,152,159,153,163,152,179,165,160,150,167,152,153,157,184,162,193,160,155,155,128,155,168,147,185,136,125,141,157,157,157,135,157,178,187,156,142,128,108,88,95,100,144,150,169,180,168,161,138,158,156,168,164,150,160,154,157,170,177,165,148,151,153,168,180,156,162,148,160,152,163,168,168,151,174,167,181,170,149,174,161,178,169,161,177,177,150,161,86,4,29,2,3,22,16,56,16,13,31,11,26,1,0,11,4,5,17,49,17,6,10,2,3,27,35,18,21,21,39,17,13,30,18,11,12,6,23,12,10,9,17,25,24,6,26,30,18,20,23,33,4,21,28,29,46,23,8,37,19,32,31,39,22,31,52,16,27,22,14,12,37,25,22,14,37,15,18,10,21,31,6,24,14,8,11,12,27,31,25,25,19,34,11,9,41,37,27,24,25,47,38,41,31,34,32,50,73,44,38,66,56,47,55,59,57,75,50,62,67,85,70,48,55,94,42,62,47,81,61,48,49,24,59,72,44,46,39,59,10,40,46,95,92,82,62,81,75,57,64,94,86,96,96,126,106,106,108,101,87,94,131,108,126,127,148,115,114,122,141,130,88,110,111,121,110,152,147,113,125,119,144,141,128,145,110,120,152,129,133,132,139,142,139,110,127,118,130,123,134,144,120,123,109,150,156,137,148,123,134,136,147,143,161,154,112,132,123,135,150,134,147,145,128,127,165,136,140,124,121,130,159,155,173,138,132,141,132,130,164,140,157,127,150,156,141,135,162,134,120,165,145,152,136,103,143,148,116,147,134,123,146,141,144,175,183,172,156,176,167,145,144,145,127,132,144,133,143,144,142,120,117,123,127,110,117,111,116,108,116,141,116,106,90,74,105,98,101,96,81,91,71,61,59,46,75,62,74,50,70,37,65,105,82,80,79,81,92,74,69,65,95,78,76,29,66,50,70,54,31,38,53,54,62,59,56,68,64,58,35,50,59,45,63,52,41,77,63,71,73,84,85,82,132,83,133,67,111,151,150,143,157,137,129,101,139,98,98,92,37,36,52,71,46,38,59,56,87,99,130,167,130,154,179,209,204,216,216,216,228,238,227,230,241,235,223,229,225,204,177,186,164,149,144,63,69,17,22,18,39,29,22,29,0,21,3,44,10,48,20,12,8,40,28,22,23,2,30,41,26,12,61,12,34,22,21,34,23,23,37,51,11,21,28,38,12,22,26,11,11,32,29,13,13,8,37,33,37,28,20,30,22,1,28,21,19,26,30,25,24,28,55,10,5,5,20,24,13,6,21,9,12,27,32,5,4,20,32,37,53,16,35,20,22,25,39,8,25,14,19,40,8,17,33,5,3,36,16,6,13,26,4,34,15,49,27,35,23,37,52,21,56,22,55,19,24,21,46,15,36,42,36,30,62,67,69,51,42,35,41,27,56,54,45,48,29,44,38,59,16,54,39,45,31,53,56,29,22,34,27,15,24,39,14,13,6,27,20,26,34,22,28,32,43,31,30,23,28,3,8,37,36,22,13,15,41,25,11,40,30,38,62,62,60,98,102,97,129,150,129,150,160,142,135,163,154,163,135,173,168,164,156,157,131,139,132,177,144,136,144,150,117,165,126,149,149,146,140,166,145,164,183,141,131,146,113,145,152,145,139,172,154,165,154,158,156,152,167,159,171,165,168,144,165,169,159,141,159,140,134,163,133,175,156,167,177,157,152,160,166,125,86,82,136,148,155,161,188,147,143,132,152,143,137,173,173,157,161,160,181,132,174,175,135,176,169,160,141,146,166,148,152,161,201,161,158,176,148,162,154,171,150,166,164,154,155,182,164,147,127,19,16,6,15,13,22,11,19,22,25,27,45,6,8,12,6,22,14,7,10,22,0,0,33,14,40,7,9,12,23,22,9,26,32,17,8,48,45,2,24,34,24,17,27,23,11,32,6,2,20,38,12,13,4,14,20,24,27,4,8,5,5,34,14,6,4,27,27,24,26,3,20,9,22,10,5,21,20,18,31,6,7,8,13,6,48,21,11,3,4,36,13,38,48,40,14,25,16,36,33,39,32,17,14,26,30,34,32,32,22,27,39,35,41,58,8,19,35,33,34,8,21,42,37,45,56,43,28,83,68,59,56,67,28,34,38,34,25,21,25,25,14,52,56,71,86,93,63,77,66,86,108,138,111,87,97,75,101,102,122,71,96,105,118,129,139,141,150,118,136,136,128,136,88,82,114,153,126,108,132,121,127,120,147,133,143,140,147,147,115,121,107,131,136,133,144,152,145,172,166,135,137,140,169,178,139,136,153,168,162,171,155,156,157,142,149,142,151,166,138,141,123,139,95,116,126,120,118,123,145,123,121,102,89,110,140,137,121,131,134,120,121,122,98,121,132,123,147,123,156,106,116,106,109,126,103,114,108,95,95,114,117,91,113,139,139,136,119,130,147,123,124,139,114,140,131,134,105,124,104,90,116,115,112,103,122,114,113,110,118,122,78,100,73,71,120,118,105,83,75,78,79,34,60,63,104,85,53,89,63,94,101,95,80,71,30,60,84,55,82,93,62,63,71,52,75,45,64,58,69,57,59,51,33,62,64,69,69,82,76,54,59,74,77,105,59,78,79,80,74,78,55,110,87,90,96,70,82,79,82,67,94,86,101,125,150,151,121,126,120,134,108,84,77,57,28,55,27,38,28,28,43,52,78,95,108,140,153,166,161,179,191,212,198,170,200,213,182,214,203,218,241,240,244,213,174,172,141,111,106,94,83,110,41,3,18,39,22,19,10,35,12,19,24,11,14,26,22,38,49,50,36,22,38,10,30,23,39,23,24,46,12,43,30,26,31,14,1,25,17,31,40,40,29,28,30,14,29,31,8,18,18,27,23,15,7,10,18,31,1,48,32,49,24,36,23,32,14,26,8,20,4,50,41,31,18,20,26,19,14,9,24,29,7,22,46,32,42,37,14,20,9,21,20,9,42,25,12,8,25,31,21,10,40,27,43,34,20,35,48,23,24,28,25,45,32,31,20,47,29,55,32,33,78,60,37,39,49,17,28,51,24,18,74,46,62,37,58,57,51,45,49,40,29,38,43,34,46,28,22,32,33,46,36,13,31,13,24,17,9,39,55,5,18,25,5,10,17,3,29,35,32,34,7,47,39,40,25,26,41,43,25,69,83,91,104,158,186,160,142,152,165,171,150,167,155,144,137,109,157,160,166,156,176,154,169,159,187,177,159,172,139,150,147,161,163,151,155,161,166,158,134,150,150,172,143,151,152,147,174,142,154,130,151,157,179,139,154,156,143,163,149,168,168,139,163,159,172,162,142,161,147,152,163,170,160,186,146,189,166,168,97,103,77,122,124,187,153,177,153,145,175,158,141,172,145,167,163,126,161,149,176,164,152,172,141,152,132,154,167,161,170,133,149,162,140,157,168,160,151,175,160,130,152,166,156,165,182,157,103,23,13,28,8,16,11,0,5,22,20,34,26,11,14,7,22,2,7,17,37,31,22,0,6,34,7,15,34,27,39,20,2,11,28,21,12,7,21,31,33,21,19,18,11,28,37,31,6,16,30,29,17,14,25,13,37,12,26,24,1,6,24,4,20,28,14,36,39,10,10,25,23,3,0,44,23,3,31,45,22,18,24,16,27,28,10,13,0,33,23,22,19,2,19,39,26,31,35,33,32,39,7,15,39,16,22,25,48,31,9,16,33,37,46,6,26,16,32,27,33,39,37,33,50,32,43,51,84,71,61,45,51,59,37,67,58,56,41,52,56,37,51,69,50,67,53,96,87,78,111,106,77,81,67,55,61,88,121,119,102,107,112,116,119,127,87,112,98,146,154,167,117,150,141,92,142,126,118,105,114,116,124,78,93,129,121,124,127,112,141,146,133,120,121,147,103,127,129,130,127,129,130,111,95,104,105,113,129,102,103,106,99,130,100,111,139,139,147,123,113,136,121,132,125,121,135,136,114,128,138,106,66,89,53,63,134,138,121,156,146,108,109,96,123,124,116,140,121,126,130,135,109,115,143,115,125,103,82,90,90,101,105,106,128,111,72,56,89,95,69,96,105,100,95,126,128,122,118,107,106,91,119,118,122,127,109,109,96,99,105,105,101,104,114,99,131,123,81,99,78,115,80,101,70,87,102,75,117,112,105,101,124,95,58,72,34,64,69,88,80,90,64,69,59,101,54,79,63,54,50,36,49,39,28,30,53,59,56,38,62,53,67,53,53,68,57,60,46,51,55,49,63,50,64,43,48,42,47,52,51,33,22,57,87,115,142,126,84,112,147,108,129,115,90,126,115,109,89,41,86,52,59,62,71,30,75,58,57,72,85,78,110,124,123,129,134,166,180,189,192,190,212,214,201,230,216,213,198,195,205,232,193,151,116,116,109,103,47,34,19,52,15,29,32,10,39,27,10,26,5,31,33,32,16,50,2,26,41,11,32,18,21,21,16,15,15,22,23,17,16,14,37,10,11,33,25,17,10,19,12,19,36,5,18,15,21,31,44,30,7,26,25,19,13,24,22,19,15,15,6,26,17,50,5,23,7,18,25,41,16,29,15,36,10,27,8,33,38,18,18,35,34,37,14,2,55,27,50,7,9,16,12,28,22,25,8,31,23,20,12,36,28,55,20,48,22,38,38,35,31,35,26,30,47,51,53,52,32,22,14,22,42,35,13,39,43,36,23,36,33,41,11,60,22,55,32,37,40,39,53,48,20,9,30,51,27,14,28,46,24,39,31,33,6,7,14,12,38,17,22,24,22,20,46,22,26,50,26,15,45,65,27,76,103,108,154,170,167,210,157,178,172,171,179,181,163,161,170,172,176,144,165,165,143,173,150,163,160,179,161,147,154,167,182,159,142,143,149,146,140,151,146,152,143,187,171,144,165,190,171,162,191,162,166,161,167,170,146,148,149,164,171,157,139,151,129,161,178,172,140,160,155,153,154,141,180,173,172,155,188,170,137,147,86,62,81,129,164,154,199,185,147,154,158,180,147,169,164,153,143,147,144,145,125,160,153,162,149,165,169,154,146,156,172,170,191,173,151,162,164,173,145,158,154,154,149,136,160,186,110,28,0,12,7,7,38,42,19,11,16,49,14,9,4,0,0,10,13,19,12,45,11,24,1,29,23,2,14,28,13,4,11,14,36,4,22,16,13,14,22,5,12,22,37,12,6,13,20,6,1,19,17,21,19,35,20,4,33,21,3,25,33,48,30,41,19,16,10,12,14,7,19,8,16,40,24,26,16,19,9,1,5,11,19,21,12,29,24,3,29,19,16,28,20,10,29,45,29,39,18,40,42,14,17,7,18,31,36,15,27,40,39,30,30,22,9,43,43,33,30,41,32,26,43,45,32,37,71,66,105,88,42,71,42,38,53,43,69,56,19,41,17,29,43,50,32,29,36,57,46,55,48,44,56,75,89,115,122,168,109,143,120,133,128,117,137,129,112,142,124,133,146,162,151,145,137,135,133,121,133,146,110,105,120,117,131,134,166,128,146,151,127,120,116,93,102,108,133,134,107,111,113,92,92,86,105,95,66,73,91,101,57,74,94,113,119,112,128,139,113,121,120,105,119,145,149,144,114,163,139,100,98,121,127,157,156,146,150,154,102,142,119,148,98,108,128,99,117,117,123,142,153,157,143,149,158,139,142,133,137,119,119,142,124,132,134,125,110,99,116,132,127,102,150,153,162,135,109,117,138,99,139,131,146,120,110,94,106,121,108,79,110,105,129,131,106,82,77,74,72,100,118,95,95,112,96,113,105,138,146,107,72,75,55,86,59,60,50,73,89,80,75,86,73,62,77,62,36,70,42,49,52,41,82,47,40,56,37,85,65,55,65,54,64,39,49,54,64,57,67,50,58,60,40,73,40,63,41,39,39,26,22,48,65,58,61,80,64,54,110,104,71,112,126,110,114,105,138,96,95,87,131,94,71,60,71,53,47,59,25,27,34,24,83,41,76,101,91,104,121,146,132,124,161,169,169,184,177,205,216,218,207,169,201,185,189,174,158,188,149,112,100,120,96,72,44,49,21,40,30,16,42,1,40,5,15,31,15,17,19,19,18,22,17,27,15,15,12,17,28,31,25,5,24,11,1,47,12,7,1,20,37,27,29,43,35,35,27,8,34,33,38,6,17,25,16,21,28,39,36,15,7,9,34,9,11,33,15,15,43,31,19,32,32,18,37,22,21,23,26,43,30,8,26,51,11,23,34,67,16,10,13,20,2,46,36,4,29,30,44,6,10,37,31,36,56,10,24,35,23,36,19,44,59,51,58,32,40,19,36,17,45,26,34,57,54,36,50,64,35,49,21,29,28,47,33,43,45,26,18,11,9,30,15,25,26,2,20,8,18,35,24,18,33,14,8,30,27,32,24,14,19,17,29,28,16,17,47,35,43,52,36,59,67,128,152,163,156,178,193,143,165,157,157,152,155,155,160,129,177,129,154,148,136,160,148,141,145,160,147,179,166,159,168,167,167,167,145,154,165,175,163,156,176,162,175,181,158,167,176,148,153,129,150,145,132,167,172,157,166,150,138,145,175,172,168,171,127,136,161,163,131,163,150,148,182,181,163,179,176,185,155,162,127,97,77,111,126,144,175,160,174,172,147,181,154,170,169,160,165,152,139,168,155,149,159,165,166,157,159,179,161,158,133,158,134,170,152,157,192,151,170,170,145,159,129,175,177,156,112,0,0,31,15,15,0,22,9,2,8,3,12,26,1,20,9,25,12,30,21,18,1,35,26,10,9,15,27,12,14,9,29,12,46,22,26,20,12,21,10,4,7,38,0,40,7,8,24,13,21,38,27,15,31,14,23,29,14,0,22,35,12,22,6,15,35,17,16,33,4,37,27,8,13,14,21,17,29,5,8,14,26,16,16,32,21,10,20,6,12,33,17,53,17,26,18,26,31,30,18,26,21,41,32,38,39,13,22,12,17,27,19,22,28,19,22,33,19,35,35,28,33,39,49,29,41,30,38,13,55,49,61,56,72,58,71,37,33,66,27,33,21,38,43,55,39,52,58,26,72,55,39,42,71,76,78,124,144,111,124,130,137,158,152,133,144,126,115,108,117,124,149,128,148,154,169,157,157,173,165,130,133,113,153,114,125,152,135,140,135,131,117,120,96,108,112,129,102,123,123,120,100,119,145,135,146,122,108,118,100,114,110,114,128,118,128,103,102,95,123,104,133,136,108,151,134,128,144,155,101,155,154,138,182,171,135,148,139,133,120,118,129,146,111,118,136,151,120,124,117,116,142,149,174,172,124,152,144,120,142,135,128,151,149,147,135,162,161,142,142,123,157,169,118,137,112,131,132,129,136,119,145,142,111,109,123,107,133,101,86,97,116,111,91,101,80,66,94,81,121,74,91,108,109,98,105,88,99,93,94,100,68,73,87,70,76,46,64,33,43,50,52,44,70,71,60,67,51,59,57,65,46,62,45,66,67,56,68,65,59,49,98,97,68,75,18,45,81,59,55,37,71,68,31,31,21,27,58,47,42,55,71,61,67,56,45,52,52,75,73,78,76,81,103,56,72,102,100,102,100,117,126,113,103,107,91,102,80,122,87,78,47,65,54,60,47,47,51,72,36,72,61,37,70,131,126,112,161,152,153,210,189,172,181,172,153,187,205,191,175,176,163,192,172,134,170,162,179,159,78,80,89,113,52,47,29,44,51,27,35,29,13,37,30,10,53,25,43,36,36,24,4,19,15,22,11,36,27,25,49,16,15,21,14,26,47,17,31,25,28,11,17,14,29,14,0,26,15,8,5,12,27,4,41,25,46,12,6,39,7,3,22,10,3,33,22,26,23,13,17,9,10,29,33,38,23,36,40,4,16,42,13,9,21,30,36,29,40,24,21,26,18,17,39,28,52,40,23,37,37,28,35,62,29,66,44,55,60,22,44,36,40,57,49,62,51,56,44,55,39,30,33,38,43,48,37,31,51,36,45,25,42,21,1,12,23,16,14,24,11,11,44,14,11,17,36,14,5,33,17,10,34,27,24,39,30,50,20,22,50,33,39,49,99,83,109,125,170,191,169,155,160,157,164,156,169,168,133,145,148,152,140,151,151,145,167,154,144,149,135,168,177,161,172,180,184,178,190,149,148,194,170,159,165,193,177,151,171,135,165,164,163,158,155,148,193,153,182,149,159,165,180,168,150,156,158,152,156,145,160,168,162,151,172,156,161,166,136,158,147,157,178,148,136,143,116,102,118,128,172,167,173,161,164,148,160,174,173,143,152,133,133,156,163,153,142,154,136,157,146,142,128,158,136,150,174,155,167,185,158,155,140,156,186,145,176,179,147,148,107,18,1,3,5,22,25,27,8,3,27,17,14,8,12,11,7,4,2,6,10,9,2,2,11,9,24,29,5,13,41,14,30,14,3,22,9,12,0,8,17,37,11,27,2,23,9,22,20,7,0,17,25,8,43,19,27,12,33,7,14,1,0,14,3,6,33,26,19,32,3,14,11,10,24,18,9,44,32,18,22,18,20,15,11,21,11,19,33,18,9,27,17,10,43,14,13,35,48,32,35,43,15,68,15,29,28,28,21,12,24,27,19,33,35,18,20,26,24,27,38,32,9,22,25,54,37,30,61,54,50,36,70,58,46,65,44,71,49,62,45,35,55,62,58,44,51,43,69,60,61,48,49,30,41,102,93,121,120,110,94,98,68,122,111,142,112,125,118,118,121,112,99,124,119,129,122,110,146,117,138,136,150,118,160,132,130,134,147,127,146,127,132,117,147,115,144,110,101,110,134,110,136,120,159,128,152,147,152,162,164,168,185,178,170,146,101,113,139,97,121,121,125,135,135,133,139,145,128,156,113,142,128,156,164,122,134,130,136,133,126,128,139,137,136,165,132,128,140,141,104,153,145,125,142,151,159,177,132,124,132,116,138,119,153,143,164,137,135,153,145,178,154,129,150,127,118,139,106,140,147,120,158,88,143,109,131,109,114,126,142,131,113,84,101,111,98,117,100,119,105,83,96,108,131,89,96,109,114,82,78,83,98,87,79,69,92,78,73,56,53,56,62,64,62,98,69,38,72,39,84,42,61,44,66,68,49,54,63,57,65,68,79,64,51,49,64,38,57,70,51,41,30,62,50,50,51,38,58,68,49,47,46,28,43,46,53,62,74,88,75,75,63,55,74,71,73,92,103,83,106,93,116,94,73,104,116,121,116,122,122,141,135,115,111,93,100,87,76,57,58,38,24,40,20,35,39,37,36,71,79,66,111,148,138,166,165,139,165,152,162,185,205,183,180,181,205,238,214,185,175,188,155,146,155,170,183,155,127,121,95,63,78,31,47,55,68,69,21,46,38,36,38,17,12,30,24,19,32,26,15,23,34,5,3,12,20,16,13,19,8,21,11,35,25,31,15,16,33,10,25,20,38,16,10,18,31,23,53,38,42,35,11,11,14,2,20,23,18,20,8,43,11,46,15,31,35,27,21,20,8,32,28,18,44,18,29,36,16,5,2,8,25,19,34,27,38,28,34,28,32,28,28,34,57,45,45,49,43,63,63,65,58,45,54,44,52,37,58,42,44,32,48,28,18,21,17,18,58,49,34,9,29,31,3,13,7,22,38,29,28,4,29,19,11,26,12,10,11,26,12,31,24,4,19,26,24,28,67,39,61,59,53,60,75,52,92,117,136,158,167,169,161,182,170,192,175,167,140,139,149,177,167,142,175,124,178,167,171,156,168,170,175,152,173,133,169,166,159,176,172,178,128,159,150,167,133,166,166,159,175,171,151,172,161,160,162,178,174,142,146,153,156,180,143,150,170,162,155,177,192,134,156,157,154,167,146,151,147,157,145,148,144,165,142,151,116,133,104,92,111,136,159,166,151,145,167,149,170,146,150,146,164,144,150,161,151,167,168,157,147,162,146,146,176,171,148,174,151,147,173,147,167,154,172,163,177,185,165,187,135,0,4,2,9,20,11,29,18,13,25,26,5,23,19,6,1,1,4,27,30,16,16,19,29,7,18,7,16,6,6,17,13,21,24,26,20,18,7,28,18,13,26,15,12,12,10,3,17,23,13,14,35,15,41,14,18,19,24,53,25,12,9,35,18,3,2,25,25,18,3,23,10,17,20,15,8,27,44,17,4,32,40,2,7,7,37,11,13,19,16,53,32,33,29,8,22,31,36,48,20,31,31,27,29,11,22,44,35,27,13,41,33,54,37,44,13,26,36,23,41,28,30,11,62,50,60,28,30,38,60,50,33,24,44,73,48,67,86,76,82,93,65,38,74,83,56,59,73,80,69,66,47,64,102,99,127,129,113,123,90,109,82,90,93,99,119,117,100,100,127,114,85,113,76,85,94,111,87,124,127,128,120,137,144,119,139,102,145,137,126,119,137,150,147,147,141,141,129,126,111,165,143,115,116,129,147,116,138,156,150,142,156,130,140,131,118,139,149,149,163,144,161,173,169,144,154,143,135,135,148,145,131,134,157,127,125,146,140,141,140,128,123,145,155,162,105,136,110,109,122,143,127,101,140,95,124,147,135,132,133,127,144,159,142,127,138,128,140,132,126,136,139,147,122,101,120,131,107,120,116,141,140,123,134,141,132,126,128,110,127,93,122,106,98,86,80,108,119,99,118,109,124,117,136,95,115,72,96,73,85,87,70,77,97,96,45,64,48,45,51,54,84,39,62,63,43,68,43,55,72,71,59,68,52,55,61,61,58,44,54,74,44,40,41,56,54,50,41,60,54,57,47,47,62,43,82,49,61,84,59,73,43,50,69,45,37,55,54,62,67,76,79,105,93,77,113,121,108,113,90,86,62,82,71,73,113,88,93,125,122,112,147,92,108,115,130,98,92,106,118,75,69,60,52,42,44,58,42,24,30,53,25,31,46,86,92,84,80,107,113,112,149,121,145,167,177,153,208,186,189,178,175,162,210,198,202,173,156,127,140,177,151,141,129,168,167,108,54,85,115,104,69,49,79,27,35,26,41,10,35,12,18,35,5,35,16,5,14,15,10,11,28,22,12,20,11,25,8,17,8,6,8,12,11,29,22,19,27,10,20,28,22,34,15,45,31,4,18,11,39,25,17,22,31,10,19,25,25,1,23,25,35,50,22,21,41,12,16,31,28,36,6,63,15,21,40,27,48,29,7,26,20,23,66,30,25,57,46,49,55,77,60,76,40,50,31,46,51,30,20,33,33,14,35,33,48,32,33,19,31,13,42,19,29,17,7,17,27,16,7,27,25,12,37,23,28,38,13,22,13,2,31,27,25,33,25,37,34,41,65,68,71,79,80,84,95,90,129,125,146,132,148,170,149,145,168,161,154,158,172,177,169,152,155,145,151,157,162,139,136,136,145,145,155,121,151,171,150,164,161,139,159,146,163,150,168,145,169,150,172,149,166,182,134,147,157,146,164,166,142,156,143,141,175,166,155,182,157,171,158,144,144,162,158,184,154,154,160,161,166,184,171,143,172,146,155,158,127,90,97,98,140,177,174,178,169,156,181,145,181,168,150,158,167,150,137,147,158,164,174,155,141,142,177,163,156,177,163,135,152,156,141,157,171,176,173,140,154,163,104,21,15,11,6,35,21,12,45,11,29,26,28,45,19,20,7,12,20,28,5,17,14,23,15,6,30,49,13,14,16,23,34,23,12,6,16,6,16,11,25,7,36,36,14,5,54,35,20,18,26,30,16,25,9,45,11,32,32,21,20,24,10,31,12,27,24,20,4,16,16,27,15,39,18,18,8,24,33,3,8,27,37,48,32,20,22,35,9,19,8,23,20,23,28,40,58,49,46,20,20,37,14,23,42,38,39,23,58,39,43,41,52,54,53,47,35,65,51,61,30,75,58,63,33,52,74,49,42,64,72,60,89,49,91,50,70,83,87,82,81,81,59,55,73,91,113,99,116,110,123,110,92,109,140,138,145,156,133,141,114,138,110,116,98,111,124,118,134,114,86,156,101,93,106,114,144,125,128,113,142,100,94,89,110,111,132,126,124,110,127,135,96,137,112,119,106,103,118,111,135,143,120,128,119,100,117,118,118,111,117,117,87,110,110,70,101,159,120,140,157,111,130,137,151,140,132,142,137,108,146,95,126,128,121,133,151,156,131,123,141,144,149,124,114,123,130,109,116,134,130,113,145,129,114,110,124,120,144,108,133,113,129,107,149,141,138,123,131,109,127,137,114,127,100,125,145,117,107,113,110,106,112,126,142,133,124,86,114,79,92,124,144,144,88,119,91,105,110,92,73,107,139,128,116,75,102,68,85,95,97,95,68,57,90,63,74,71,32,74,63,77,73,69,95,65,46,79,60,53,56,65,52,75,82,59,85,85,57,50,58,90,53,47,69,39,62,59,66,68,54,60,43,38,45,54,44,66,58,46,51,67,60,36,43,48,47,42,42,54,64,56,44,64,59,49,47,32,31,36,51,62,41,20,62,67,68,54,72,53,58,90,79,79,66,95,99,85,98,85,110,109,121,104,83,54,58,87,61,48,43,42,16,36,52,48,35,25,55,42,16,37,30,57,78,91,121,105,117,135,130,112,136,107,155,136,144,103,130,166,157,142,165,147,140,132,125,110,142,123,155,154,138,157,128,121,100,95,110,104,74,74,65,42,48,55,30,40,24,31,36,18,31,33,34,10,5,22,21,11,27,34,4,30,29,10,30,20,27,4,10,5,15,36,37,30,42,10,20,32,20,14,19,12,26,46,23,36,22,31,26,33,9,14,68,46,30,10,2,34,25,11,27,6,31,37,13,33,44,31,56,23,47,53,34,66,38,60,65,13,37,72,75,70,46,44,41,61,48,49,45,40,53,25,40,36,31,30,27,36,6,16,11,12,14,10,25,4,7,39,29,27,21,29,5,30,40,15,13,19,40,2,40,25,42,35,51,47,42,45,70,82,80,76,89,111,79,74,111,104,118,145,141,152,136,160,124,156,162,184,147,155,146,158,161,149,164,153,169,158,150,145,157,174,164,121,173,134,150,120,149,148,151,164,138,146,143,171,177,163,133,151,154,154,158,164,178,131,170,157,162,178,155,165,186,174,155,177,161,158,143,141,142,166,172,158,163,177,181,171,170,152,180,173,157,187,160,149,149,148,93,117,98,130,152,175,172,166,169,156,162,168,145,162,157,143,144,168,137,169,165,126,131,164,161,160,137,169,139,177,154,149,148,157,163,175,149,173,166,160,110,10,8,23,15,30,10,0,28,34,17,16,14,13,4,34,6,19,15,11,10,9,7,13,24,4,32,10,25,19,22,30,19,38,15,28,17,34,19,27,33,20,15,56,28,23,17,10,37,20,20,35,12,22,20,40,24,41,8,30,0,2,9,37,4,39,11,11,23,5,19,20,14,11,25,13,22,55,43,37,11,20,26,30,19,37,33,33,27,18,23,11,15,47,42,47,46,23,39,19,58,37,43,70,49,56,36,57,80,48,68,52,62,46,50,53,63,83,58,40,67,49,25,57,56,31,52,30,29,32,46,60,49,60,62,38,48,67,33,48,63,68,52,29,57,62,49,43,74,81,115,111,142,85,121,100,88,80,118,121,120,129,122,110,115,122,97,115,122,119,147,109,122,114,115,127,120,108,114,115,116,128,113,95,106,116,110,118,98,80,96,118,106,116,120,107,86,106,110,127,123,137,144,121,99,113,103,104,130,85,109,106,123,100,121,97,119,126,105,118,95,81,101,119,120,91,116,109,121,138,129,107,106,120,130,148,132,131,110,148,126,118,117,114,123,141,141,155,112,115,144,128,164,155,136,108,147,140,129,136,119,130,106,143,137,123,127,136,130,158,130,138,121,106,125,101,109,114,101,109,104,112,133,104,135,107,109,124,100,116,105,116,106,103,87,123,107,100,89,92,77,107,90,106,134,75,98,86,77,103,90,90,84,68,65,55,69,90,101,64,58,65,71,67,58,46,50,58,46,49,48,35,46,71,41,61,31,79,44,47,94,71,28,48,28,51,56,56,49,47,57,60,42,52,55,47,66,44,66,67,59,26,58,54,48,43,60,45,66,61,62,39,52,57,45,56,45,67,34,38,59,57,43,27,33,47,50,45,56,47,72,41,64,56,72,56,71,64,76,93,100,93,77,72,86,86,77,113,71,89,111,76,74,68,61,48,51,48,59,17,33,27,44,66,36,33,33,40,63,29,13,50,69,71,89,71,67,86,117,123,89,100,139,139,87,109,100,149,130,128,149,160,120,144,139,136,119,151,128,142,114,143,144,143,137,136,145,130,63,83,72,59,79,68,37,54,25,44,62,31,55,32,38,43,51,51,29,42,20,40,22,13,3,27,24,3,0,13,31,16,40,17,12,31,18,21,9,2,30,9,12,7,33,34,30,7,20,12,5,20,28,38,35,34,52,13,31,41,50,57,34,29,37,32,57,56,33,36,57,39,45,58,44,75,70,54,42,22,71,68,61,42,34,45,24,36,22,57,28,18,44,45,36,18,5,5,51,15,17,13,22,13,19,21,41,37,25,35,20,3,27,23,30,60,19,56,61,62,88,45,35,86,79,81,89,93,72,70,89,112,91,132,122,140,90,139,138,138,153,161,142,163,159,138,149,156,160,149,159,156,154,148,142,179,172,170,164,168,158,157,147,164,166,178,176,147,112,176,124,140,152,157,163,174,146,156,159,151,140,160,166,164,150,155,119,132,152,150,155,139,141,150,165,146,171,161,169,181,174,151,160,160,143,156,169,181,160,132,163,159,133,129,96,96,126,153,171,193,172,170,160,166,167,175,174,150,133,160,154,143,158,149,146,153,139,171,161,164,158,159,158,172,168,153,146,178,144,138,147,154,108,0,17,26,2,11,6,9,4,8,11,27,14,11,8,19,10,19,10,14,11,20,8,15,21,26,32,27,42,23,6,15,1,15,9,25,6,27,8,14,3,10,13,25,32,21,22,15,12,24,12,40,18,37,12,13,18,13,1,7,22,26,9,29,22,39,27,36,20,20,24,40,20,27,36,3,6,13,12,13,38,20,8,12,36,8,25,11,10,9,33,27,21,32,38,18,33,48,49,14,20,58,64,55,71,56,31,42,27,44,31,20,25,57,54,35,38,52,38,44,35,34,33,20,16,35,21,45,27,31,48,39,36,32,23,42,61,41,39,50,43,30,32,50,45,33,11,32,56,35,65,85,96,95,90,78,73,87,93,102,125,115,114,102,81,79,75,97,103,109,147,142,96,127,115,137,104,92,119,121,106,124,109,118,120,128,122,115,127,129,125,120,140,126,128,139,134,122,144,134,116,104,136,134,130,101,122,101,120,136,132,128,105,127,138,126,102,135,123,132,114,118,103,104,123,108,109,105,122,112,131,110,130,128,161,104,156,123,116,121,114,125,112,96,148,130,144,113,105,133,143,165,131,117,140,140,164,140,155,128,119,128,117,147,115,123,134,100,127,127,121,121,99,127,155,101,158,124,118,110,114,70,118,75,85,122,112,108,121,133,84,117,105,74,93,104,100,94,110,112,90,79,78,105,107,118,126,108,85,105,125,99,72,67,60,39,103,75,58,54,58,75,61,72,50,76,76,33,78,30,36,51,76,42,45,71,54,39,62,52,61,47,30,43,43,58,62,67,75,45,52,53,71,30,62,35,55,34,75,28,55,35,37,75,37,40,35,63,61,58,48,73,52,57,41,47,74,66,56,67,70,67,72,28,42,54,45,77,61,74,63,55,66,49,62,53,42,53,85,34,59,85,52,68,94,69,53,86,85,86,66,29,60,73,89,128,139,134,84,88,88,72,90,39,51,55,53,33,29,30,18,63,46,28,46,56,64,54,51,64,54,33,66,84,69,90,75,127,90,105,124,89,102,134,120,144,141,131,137,123,138,129,125,108,129,112,128,124,157,135,117,141,140,130,125,129,114,118,91,105,118,91,98,75,95,73,51,60,69,40,37,39,65,42,36,35,14,31,22,48,33,17,21,25,21,22,18,22,43,9,3,18,3,31,5,44,58,28,10,19,22,28,32,31,17,28,39,32,32,38,48,50,43,51,54,35,44,43,50,33,16,47,36,20,65,61,41,32,54,54,48,62,10,30,31,34,23,58,18,23,19,21,22,16,0,10,25,31,45,1,28,26,24,27,34,40,44,15,55,48,58,50,44,33,75,61,42,57,83,76,72,57,47,53,76,56,49,85,60,81,78,86,108,132,149,169,155,154,160,140,138,177,147,147,158,184,127,94,121,160,153,150,169,151,174,158,152,161,180,177,159,170,147,165,118,131,143,144,156,155,140,141,141,151,141,143,140,159,162,151,157,167,160,148,156,171,174,153,155,142,130,149,140,141,139,162,133,143,171,147,149,172,152,155,188,171,154,146,158,175,149,158,120,117,96,111,149,145,185,183,171,170,154,129,156,149,161,154,179,171,139,164,193,156,159,137,184,176,156,160,168,169,169,164,144,172,161,169,163,177,99,24,0,14,3,20,24,40,20,35,11,32,14,23,7,14,4,16,19,5,14,16,12,19,19,22,6,31,6,12,38,16,16,8,28,7,35,42,37,28,23,51,30,18,34,22,15,50,19,20,26,0,6,28,22,24,3,15,32,1,19,13,19,19,9,15,25,25,2,8,24,20,5,7,43,18,7,24,12,12,20,35,30,9,12,30,31,6,8,18,28,9,48,24,31,46,37,63,46,40,61,65,63,66,58,65,55,55,21,20,27,51,46,40,46,35,44,32,22,37,34,45,19,27,25,41,27,59,68,33,39,14,40,48,64,37,53,36,29,78,54,22,33,53,24,44,31,43,46,60,25,78,60,91,111,92,95,92,106,100,128,95,94,102,95,116,120,107,144,117,109,122,118,107,144,142,132,121,124,127,131,122,121,124,120,119,102,124,122,127,127,130,136,137,118,118,137,117,112,126,107,111,112,103,100,114,94,86,127,121,128,108,118,139,138,115,105,117,102,124,119,134,145,142,113,133,131,114,148,115,134,130,123,126,140,129,123,147,151,136,124,150,119,123,134,129,125,135,119,121,117,143,160,155,172,177,160,144,150,130,161,147,116,127,134,141,106,94,113,120,142,130,137,113,126,147,133,104,131,134,139,124,123,150,96,111,120,111,93,88,107,73,112,94,93,117,97,83,71,73,73,102,79,87,106,110,86,107,106,104,104,91,108,78,72,85,55,49,62,77,74,60,59,51,73,75,95,48,56,34,59,47,45,70,48,79,53,72,71,37,55,81,42,51,55,26,58,54,67,66,58,76,65,63,38,72,48,50,29,55,65,80,53,76,29,37,48,57,51,58,47,62,74,44,33,66,92,57,57,61,67,79,68,69,50,50,50,62,71,64,50,59,63,68,40,62,62,44,59,68,76,61,38,52,39,44,27,51,45,45,60,25,55,62,103,106,86,94,115,95,135,82,98,97,72,72,82,73,53,38,66,56,23,45,49,57,50,19,42,25,49,28,35,57,31,67,58,68,66,50,50,54,40,99,89,101,100,63,90,101,97,127,86,77,122,109,110,124,123,112,122,132,108,144,112,144,96,159,169,150,153,95,103,146,86,91,132,97,161,110,157,112,125,151,141,100,88,112,91,79,83,45,66,58,55,45,50,41,36,60,21,9,39,23,21,24,24,11,6,31,50,22,24,5,42,12,51,32,10,34,43,25,39,50,42,42,51,51,71,18,69,80,52,50,50,62,61,85,68,58,64,32,25,29,18,42,8,26,14,23,19,46,9,15,33,26,12,39,48,23,45,44,30,28,28,34,41,38,48,69,62,61,25,47,53,73,52,61,66,56,63,59,78,82,86,77,75,61,78,53,82,122,97,116,154,191,197,167,127,149,121,113,143,189,144,156,146,146,155,147,157,154,158,156,143,132,153,159,157,137,151,122,152,146,136,135,156,150,167,169,169,151,174,151,142,150,172,147,139,172,158,159,140,124,143,149,144,148,154,157,142,168,198,176,155,155,140,149,166,147,156,149,146,152,117,172,150,163,172,154,152,159,141,107,101,95,91,108,157,171,170,149,133,130,147,135,171,160,123,158,155,153,173,180,173,168,172,147,179,145,131,171,164,162,149,156,169,168,140,167,122,6,3,11,3,30,20,32,9,3,13,14,12,14,0,5,12,5,9,4,0,23,3,9,27,15,26,24,31,19,31,1,27,17,5,33,36,7,40,26,11,0,28,6,20,38,9,23,28,15,15,14,25,5,21,27,29,24,14,47,22,33,7,24,12,29,21,26,14,5,5,6,16,23,26,15,2,8,40,19,15,25,25,27,45,1,35,13,18,35,6,34,10,35,24,16,49,26,55,33,37,41,17,41,43,42,29,28,48,18,44,51,44,14,16,26,39,21,29,44,28,9,48,35,5,34,31,26,24,29,23,8,39,15,29,43,17,30,43,17,31,39,22,40,24,1,29,53,31,18,42,65,56,72,83,88,94,116,120,122,124,102,133,89,114,127,113,123,87,95,94,121,102,121,110,82,108,91,131,102,98,129,104,119,107,120,120,115,105,135,92,110,121,119,107,116,119,112,113,123,97,133,114,129,133,127,108,114,97,105,96,106,121,92,101,115,107,93,99,121,114,108,123,107,122,114,118,138,126,124,119,117,122,101,73,87,121,108,98,112,113,99,152,144,142,131,130,109,146,118,102,94,132,143,152,148,125,108,141,127,137,118,98,110,121,135,112,143,163,137,160,136,124,137,94,134,116,120,129,136,150,123,136,132,126,121,92,122,102,99,87,94,90,95,111,98,109,80,87,68,52,90,84,75,71,64,103,109,102,87,96,83,96,103,104,97,59,87,112,63,96,72,72,29,44,62,89,72,26,60,73,68,72,82,48,64,66,58,39,81,45,48,54,58,50,76,68,61,58,31,47,60,53,57,55,41,31,49,69,55,45,56,47,42,32,57,75,51,76,42,43,12,56,44,53,46,32,64,56,55,83,33,56,64,59,56,39,37,33,52,73,34,60,55,71,48,47,48,29,61,46,43,51,62,50,40,57,17,40,56,57,57,61,66,67,51,45,72,43,35,85,68,84,64,65,79,86,94,95,94,75,86,65,89,93,87,71,77,73,82,54,43,45,46,27,44,36,34,50,21,47,40,33,37,32,44,32,46,50,50,47,68,64,61,71,58,59,80,64,91,97,109,109,116,90,99,125,130,139,117,121,145,123,109,102,139,123,105,135,146,155,125,122,153,145,146,144,144,132,167,152,109,154,135,124,112,104,123,48,93,99,73,83,44,37,66,10,15,32,30,53,54,46,24,7,34,11,4,24,22,51,28,1,24,38,64,44,51,41,22,37,57,81,37,47,45,41,76,55,47,35,11,44,33,9,32,31,23,41,27,30,38,50,55,25,22,36,54,67,42,68,62,65,73,66,44,59,51,51,48,31,63,70,42,44,55,20,46,50,69,61,70,75,104,67,78,80,74,74,80,84,142,129,150,149,171,177,160,162,116,165,134,145,157,138,174,160,143,185,171,170,157,157,179,165,143,125,169,157,128,126,145,158,168,169,152,138,162,164,138,152,153,161,161,152,145,157,188,161,157,163,187,158,182,153,139,148,185,150,170,164,174,194,156,161,160,173,147,139,164,165,165,162,152,141,162,175,165,182,154,136,149,143,171,120,123,99,81,143,163,170,170,151,152,185,174,178,166,176,158,162,157,155,147,144,163,138,177,130,133,134,159,140,176,155,150,155,160,143,160,94,8,5,24,6,15,2,11,8,25,6,28,25,24,3,40,33,28,25,19,8,15,1,14,36,18,5,47,34,9,30,11,34,23,22,4,5,13,20,5,15,17,17,17,41,23,0,34,39,16,22,25,31,7,12,41,10,12,16,38,23,18,43,31,22,22,25,24,7,12,15,5,18,31,14,28,3,22,13,25,16,18,32,45,40,27,4,18,30,25,11,36,19,30,6,8,22,52,26,21,15,14,21,32,33,17,36,47,3,15,46,22,6,48,38,25,31,14,5,31,30,21,53,30,32,27,16,24,33,39,22,42,9,35,55,33,31,45,36,71,55,60,50,29,26,30,23,37,22,44,36,51,75,75,96,97,82,83,120,92,118,118,125,109,119,107,91,107,99,115,114,123,118,97,133,101,96,74,73,71,139,97,104,111,84,101,109,120,128,103,127,111,105,101,129,97,117,103,119,130,105,132,128,131,132,124,109,102,105,106,116,129,146,138,79,111,127,106,102,123,101,79,115,107,131,135,109,146,103,115,100,109,101,90,69,106,79,94,94,114,120,116,123,85,134,93,125,109,121,106,115,123,103,101,100,117,119,104,133,113,127,147,117,142,113,134,151,131,137,139,137,125,86,105,90,114,119,115,125,95,134,103,110,120,109,95,122,105,121,114,94,112,100,94,103,104,104,107,98,110,62,91,64,92,80,72,52,95,82,81,52,85,73,99,98,107,109,115,102,96,108,84,92,103,88,96,95,110,118,88,65,50,52,46,60,63,62,57,73,34,92,25,31,39,49,63,67,57,64,69,42,83,61,80,58,32,54,77,46,62,56,44,45,34,56,58,59,57,39,58,67,58,53,56,36,30,42,43,29,92,60,25,35,21,40,36,52,46,29,42,17,58,49,54,30,34,47,43,48,26,13,40,54,61,61,24,59,61,48,43,67,41,35,38,56,44,61,58,66,38,67,34,47,54,59,70,59,68,62,63,60,74,51,103,107,123,87,97,82,59,75,63,52,65,55,45,43,60,56,53,13,64,47,37,36,31,21,34,26,45,39,48,48,42,41,45,32,51,42,42,73,44,55,56,68,61,75,89,75,96,82,86,86,98,98,120,105,116,111,117,136,112,110,101,109,147,114,107,124,140,139,113,114,144,147,154,155,147,150,124,146,160,157,140,145,166,177,171,190,210,211,238,183,176,176,159,198,95,138,67,50,33,58,94,41,7,33,20,41,25,28,52,42,56,47,46,41,38,70,47,46,26,36,15,20,50,34,30,27,40,30,45,51,60,43,41,54,55,74,31,49,59,23,15,40,49,21,32,41,41,18,20,36,41,44,62,56,31,50,82,68,59,94,83,75,70,77,48,62,59,58,85,89,108,149,144,167,164,149,145,150,165,146,136,149,147,168,154,152,151,186,184,157,189,179,150,142,170,136,131,129,179,171,157,152,157,160,165,171,183,158,159,177,167,160,175,146,182,140,152,166,172,157,173,177,145,143,142,185,152,184,186,165,178,162,164,166,175,155,160,135,162,183,158,192,149,179,163,157,156,153,178,177,187,161,172,117,103,90,124,143,161,172,164,153,157,172,155,148,158,161,144,176,193,181,162,148,183,161,148,144,180,160,158,162,163,144,180,162,156,125,1,14,1,28,5,12,9,26,6,16,4,24,26,6,21,24,1,0,42,8,29,10,22,11,36,26,61,29,48,49,40,33,54,31,59,43,49,36,24,27,21,15,13,10,26,11,11,40,14,28,28,28,32,19,28,29,39,25,10,27,8,37,9,10,19,22,52,1,19,19,19,0,23,17,20,33,11,20,24,16,17,19,3,28,7,26,5,42,37,21,43,14,27,35,39,34,46,15,37,25,33,42,24,40,74,9,21,13,6,48,20,29,29,19,34,10,16,29,31,26,14,36,2,30,23,34,21,19,36,30,35,44,50,29,24,3,24,46,44,25,69,28,71,80,86,53,42,54,48,64,77,62,102,114,77,88,59,79,76,73,78,118,116,94,105,109,122,94,111,119,92,98,108,109,118,92,120,103,105,99,107,95,111,89,107,101,94,129,103,116,120,115,112,107,112,131,138,122,128,96,104,107,108,108,108,117,123,118,94,112,108,131,123,128,108,123,140,125,110,96,102,132,95,114,97,133,132,118,108,125,118,113,91,103,92,92,102,90,78,70,92,103,78,97,118,97,119,112,116,102,117,81,75,73,112,117,86,121,123,126,120,121,134,110,163,128,94,126,102,96,99,132,125,110,104,124,133,97,121,121,95,80,95,85,101,98,106,107,114,123,86,112,97,99,106,106,120,110,105,128,103,83,76,84,73,87,37,72,58,59,77,60,88,94,78,81,107,79,98,95,105,121,95,115,101,107,89,96,80,61,87,101,61,55,68,47,83,46,63,51,58,29,83,60,72,67,52,44,49,44,45,62,65,29,41,66,56,67,45,56,64,57,61,40,46,67,65,75,65,55,61,49,21,15,53,68,52,43,38,57,29,49,51,61,35,44,53,59,58,66,21,64,59,80,60,50,38,39,66,26,37,54,68,50,62,59,63,73,58,74,33,29,50,37,22,57,21,23,36,56,35,67,51,21,44,47,49,28,60,41,74,69,66,47,65,65,71,63,82,53,52,86,73,70,58,66,77,83,69,67,74,52,43,36,51,71,70,72,46,33,57,23,11,70,22,34,41,24,53,42,53,30,27,36,33,39,30,44,49,39,45,41,64,80,73,77,93,57,70,101,97,64,102,102,106,113,79,129,117,141,129,124,140,134,148,119,163,184,167,154,138,150,190,209,228,237,238,245,250,247,235,240,241,238,255,255,251,233,244,231,234,247,251,227,177,156,105,15,20,10,9,41,33,50,32,64,35,29,61,34,18,34,39,24,34,48,55,24,25,21,10,36,52,56,75,58,78,49,30,48,39,35,50,27,67,46,35,36,48,42,37,29,46,58,27,39,42,60,38,54,61,60,42,46,44,52,68,37,66,70,50,74,106,127,132,135,149,145,188,164,160,170,151,147,171,175,160,157,166,156,170,165,177,169,128,128,150,135,172,147,165,190,160,144,159,169,162,148,165,164,165,143,147,171,165,149,133,166,142,172,151,160,156,163,184,163,164,165,156,149,168,155,163,169,173,146,179,168,164,159,141,160,174,150,158,129,174,165,164,174,170,161,153,163,164,178,113,86,96,103,137,161,179,163,138,150,156,131,118,132,149,143,170,166,155,172,193,156,137,142,167,176,157,181,166,155,141,152,159,110,1,0,12,7,7,19,9,29,12,18,14,10,15,7,28,6,7,11,28,11,3,17,43,19,57,78,79,89,61,124,84,113,122,111,124,128,90,48,50,52,26,34,23,21,24,9,48,21,19,16,2,28,24,14,25,9,18,21,16,23,19,11,18,0,18,26,18,25,17,23,6,7,18,15,42,36,10,6,41,9,31,14,15,21,9,17,17,30,27,30,27,12,5,23,40,33,20,27,50,12,48,46,34,47,53,47,47,7,19,38,41,20,55,45,19,12,4,11,21,6,11,25,20,17,27,47,11,34,17,41,12,50,28,44,55,20,30,43,45,32,41,51,72,51,29,37,70,50,62,52,65,62,78,68,60,63,67,77,77,83,90,110,89,57,77,56,98,112,115,103,99,101,110,94,129,113,107,132,108,140,125,105,100,105,132,123,117,133,117,121,125,129,123,117,138,114,106,96,114,90,112,112,103,136,100,119,102,128,92,143,104,131,119,127,123,131,117,156,114,135,116,141,106,116,135,119,115,121,129,133,104,109,153,130,104,107,54,86,92,136,111,117,141,112,112,94,114,123,134,121,113,110,120,107,138,128,106,112,119,135,146,137,151,138,126,121,80,118,101,103,116,104,108,107,145,145,107,103,115,112,113,88,134,101,113,108,105,87,115,110,80,86,114,106,102,95,86,93,95,93,84,78,87,89,72,94,81,83,92,84,64,78,68,45,44,66,58,58,77,48,57,58,84,66,67,62,71,98,63,71,36,56,68,53,61,69,35,73,68,54,55,51,49,67,55,69,77,69,53,68,53,50,35,39,58,57,48,73,76,70,50,52,52,52,34,44,52,45,50,35,44,37,47,41,31,32,38,27,57,67,39,53,58,48,66,51,49,57,57,40,37,35,54,49,44,64,52,44,32,22,60,49,46,53,30,57,62,45,54,46,29,50,45,35,32,50,22,56,28,55,54,63,37,21,28,49,70,56,57,53,52,58,22,56,39,75,63,27,47,60,48,48,56,45,76,62,41,66,65,70,62,81,64,70,70,69,79,69,72,53,63,71,54,66,45,30,94,75,48,34,69,28,30,41,43,51,55,52,45,38,20,30,40,34,54,36,55,44,37,45,55,64,45,53,79,95,68,78,84,102,59,115,135,112,128,135,124,131,148,170,180,187,238,250,251,239,242,249,248,213,255,249,253,246,250,248,241,246,252,252,255,251,250,241,243,247,230,187,77,38,3,9,22,8,26,67,43,57,42,48,43,41,48,18,45,35,51,61,36,41,27,39,35,36,44,22,37,18,37,31,47,11,39,31,32,27,37,20,45,16,42,22,38,33,32,48,60,33,37,69,42,44,83,46,69,51,57,51,62,40,59,59,76,86,80,107,139,184,143,154,183,159,148,156,160,179,171,156,150,154,169,178,178,168,133,136,167,154,140,154,160,161,142,140,185,171,164,183,174,186,149,170,152,166,171,164,174,151,145,181,156,126,158,169,162,184,155,152,156,137,141,138,158,150,154,157,174,157,174,149,150,147,168,146,150,146,165,172,147,165,153,166,157,158,128,163,121,144,104,62,113,91,140,132,174,170,154,146,140,112,153,153,146,145,168,166,156,159,151,146,151,161,156,153,172,163,168,155,136,129,26,2,17,20,2,21,25,7,1,33,29,10,7,13,28,9,7,9,32,17,30,10,24,2,153,154,183,182,175,169,182,153,152,146,147,136,129,102,64,51,23,38,47,12,7,17,19,37,8,38,20,10,1,25,16,29,20,4,1,12,34,16,26,17,16,25,36,23,8,32,40,18,21,34,17,22,13,8,10,10,17,7,2,16,3,20,19,0,2,31,13,33,29,22,25,32,21,16,23,8,73,21,59,44,34,32,50,0,26,47,44,23,52,36,11,34,22,24,37,20,2,29,21,28,39,16,16,50,13,20,21,40,49,23,50,37,54,36,58,64,48,33,52,53,52,30,18,45,33,38,49,57,65,59,65,48,62,60,80,58,95,73,90,57,84,85,110,104,99,86,112,118,133,117,96,116,121,130,113,126,134,110,135,135,128,116,86,102,106,117,143,123,139,100,128,120,117,136,121,122,135,103,133,132,103,106,110,98,85,106,119,136,125,112,82,152,123,133,102,125,123,111,124,123,126,111,110,118,94,127,105,122,135,148,143,135,108,134,143,135,145,138,118,130,117,124,113,124,114,118,108,112,142,140,101,94,109,132,83,95,95,110,116,112,109,129,132,95,102,113,97,103,114,98,116,119,114,98,122,113,126,97,92,98,112,116,87,71,82,107,83,84,96,83,82,91,83,100,104,94,88,108,83,86,60,94,85,98,100,103,94,78,94,80,67,99,79,107,76,67,61,27,58,29,53,54,66,82,84,62,67,60,55,59,45,55,90,66,51,64,38,63,43,56,90,40,60,46,55,55,71,48,63,44,36,59,40,57,43,63,47,28,36,55,67,50,56,71,47,42,49,37,49,47,72,46,42,50,42,51,51,34,60,53,6,44,39,38,36,56,55,55,38,63,53,34,50,67,34,62,39,51,56,29,42,33,24,41,66,46,45,69,49,65,51,51,39,31,60,56,46,53,32,37,51,59,49,64,57,44,77,63,48,41,32,58,43,58,49,68,48,60,8,48,46,59,38,58,61,45,55,79,48,25,51,47,33,40,65,75,77,45,84,56,20,46,61,69,50,62,41,65,38,65,63,43,45,44,49,37,52,33,28,39,47,29,44,37,48,61,44,37,34,34,57,52,32,70,45,69,44,37,60,65,69,88,57,98,111,161,136,181,206,210,212,220,212,229,243,244,247,242,252,250,253,231,244,255,253,246,252,250,253,252,244,253,255,243,203,168,152,121,31,4,8,20,14,17,49,52,53,57,15,48,33,35,30,35,45,39,14,53,42,26,38,27,24,23,18,33,37,53,41,46,43,47,39,51,35,31,33,31,26,22,58,38,39,27,27,50,69,60,58,59,60,57,49,52,76,62,64,74,64,100,93,90,126,154,119,117,131,142,131,140,158,157,175,164,151,126,147,160,143,139,161,171,146,165,144,163,151,148,142,164,176,169,169,184,161,145,129,129,153,158,167,161,150,166,144,165,161,162,136,160,173,169,186,172,185,164,178,135,139,156,165,145,152,155,138,160,146,127,144,150,178,183,118,139,158,161,147,166,164,171,188,138,167,185,135,87,73,109,150,153,178,177,188,166,153,164,144,123,149,142,155,164,156,135,159,143,131,165,185,159,147,181,130,139,168,110,23,6,10,9,11,3,0,1,18,15,21,15,8,5,19,46,24,15,15,11,6,18,31,17,175,151,184,146,131,132,121,133,109,75,64,35,51,27,47,33,32,26,18,14,16,30,2,33,8,16,26,26,22,7,5,20,25,14,25,20,13,6,24,38,8,25,26,45,5,23,14,14,8,2,3,21,11,27,9,10,43,12,36,29,18,20,27,53,27,54,36,11,18,50,47,31,21,54,40,17,35,34,39,49,18,14,17,30,13,31,77,17,39,43,31,29,18,29,27,26,20,29,19,42,62,23,41,22,18,25,59,45,56,71,14,29,51,43,51,45,68,63,60,96,66,48,43,40,64,36,86,69,101,106,107,101,61,74,90,94,85,133,120,101,95,132,110,126,81,120,137,99,113,126,102,121,99,107,110,113,107,126,111,116,104,96,124,101,109,112,109,115,126,112,133,116,115,117,125,120,132,118,107,108,110,111,116,108,100,109,104,115,105,97,125,111,122,115,108,104,104,100,101,98,105,97,115,112,115,119,108,141,131,114,156,120,114,114,122,104,121,132,114,116,124,113,100,99,151,149,96,106,103,114,117,123,115,120,108,106,77,109,76,112,111,94,96,97,111,130,101,143,112,120,135,127,105,101,130,110,109,120,108,111,102,87,110,85,95,100,107,75,101,86,108,85,68,122,102,92,105,92,92,82,102,91,98,131,133,128,105,119,119,105,96,98,73,74,72,74,93,47,65,57,65,86,87,75,84,55,51,68,76,50,49,63,52,62,69,59,67,34,45,66,66,57,59,50,58,93,29,69,47,41,79,55,70,68,42,62,70,42,56,44,41,44,41,53,44,49,24,83,49,36,47,53,55,50,52,35,42,57,34,24,39,9,32,45,41,29,35,38,63,28,38,26,42,49,27,40,42,39,36,36,23,62,37,38,34,60,47,59,32,28,49,59,28,23,51,26,35,36,35,35,30,46,40,52,43,47,51,39,66,50,8,59,39,38,35,64,52,48,42,76,44,36,44,46,40,43,25,35,37,67,37,51,52,39,59,81,46,55,67,67,55,40,70,64,42,69,79,51,55,75,41,71,87,71,87,58,46,56,68,39,70,61,59,50,60,60,41,50,33,65,47,48,25,44,47,27,45,34,56,60,72,60,27,32,71,68,77,75,82,94,110,122,88,155,157,148,184,178,213,227,255,255,252,255,248,247,255,252,224,254,238,255,236,255,245,233,232,246,228,111,115,26,0,15,7,19,45,33,42,24,26,23,31,51,68,33,38,32,27,25,19,47,44,38,48,38,41,25,44,39,17,22,40,39,20,23,49,38,12,21,13,26,48,19,41,73,60,69,52,44,58,68,57,74,46,56,64,39,64,53,91,106,100,109,145,117,140,154,133,134,133,142,126,142,119,137,133,121,121,133,163,107,146,143,155,150,117,149,169,158,165,145,125,135,146,141,147,135,151,152,170,168,171,143,146,137,160,153,138,160,146,182,161,168,165,161,146,138,157,127,150,175,160,172,170,142,186,146,166,151,148,156,174,135,157,147,151,138,135,165,170,195,174,175,150,124,127,107,77,134,142,162,191,177,191,179,158,142,157,156,159,145,160,165,176,179,160,173,126,169,174,181,163,149,152,132,14,0,28,16,12,7,36,34,11,0,12,12,8,18,14,32,6,10,16,30,20,32,1,5,107,108,79,49,88,62,15,5,35,39,10,37,29,17,24,19,42,7,42,20,39,22,29,25,33,46,14,22,28,44,12,5,5,4,29,29,7,31,22,8,22,21,27,25,9,35,24,29,28,17,6,14,9,16,12,4,28,21,16,7,24,15,22,29,14,27,65,46,30,31,36,40,19,58,34,17,48,60,59,50,32,47,20,40,34,43,16,59,33,30,57,14,11,37,36,24,25,27,38,38,51,29,15,39,32,23,31,48,51,35,55,50,43,40,31,53,33,47,62,39,35,58,62,59,66,79,89,90,110,126,124,96,100,146,115,110,136,117,152,138,116,106,130,132,143,106,111,114,117,119,131,123,119,115,123,112,115,102,128,120,114,131,113,132,90,115,121,110,121,106,128,115,124,121,127,149,119,120,120,127,108,124,102,108,118,134,126,97,128,109,113,133,117,138,137,140,116,113,112,128,126,118,113,124,135,125,111,106,130,99,119,97,140,86,142,139,129,135,100,120,132,118,157,123,122,149,115,124,117,118,145,150,135,141,133,135,154,111,121,124,121,115,100,122,128,95,120,125,129,121,82,94,106,84,124,148,124,120,123,116,157,142,124,109,116,121,118,107,116,116,107,100,91,116,89,92,117,86,93,92,77,76,80,89,78,79,97,99,105,108,74,85,82,70,108,93,58,59,64,65,55,61,46,74,95,76,51,71,51,45,59,49,76,65,43,48,49,41,26,47,63,74,71,38,62,49,64,44,64,59,36,51,70,47,59,42,63,55,43,44,75,41,66,36,47,62,51,32,48,42,47,67,48,42,53,39,37,12,18,36,5,53,34,40,62,41,36,40,48,54,24,39,40,36,29,55,58,46,29,74,58,42,37,56,25,22,53,45,43,31,31,44,17,65,51,57,42,50,59,28,33,46,50,60,35,60,37,26,50,35,54,51,46,33,71,44,47,38,47,38,33,77,45,36,17,65,45,50,41,59,31,62,61,79,40,41,61,54,50,80,15,42,58,70,33,86,39,47,93,62,53,61,61,72,91,67,60,66,53,66,58,84,72,51,72,64,23,59,79,76,84,73,59,55,54,56,31,75,18,56,54,58,51,43,74,34,44,39,47,68,44,56,48,54,49,50,63,73,103,104,135,156,151,203,182,160,186,209,203,247,255,255,254,249,245,241,252,242,217,228,234,232,173,94,11,1,12,18,33,2,18,12,21,29,34,49,53,27,54,35,25,40,56,38,44,54,47,34,15,40,21,42,44,13,28,35,34,31,39,31,46,23,37,37,50,41,30,43,47,35,54,41,46,51,42,68,50,55,42,66,67,58,92,68,114,130,173,173,138,142,146,150,132,125,130,129,139,148,162,147,145,151,160,136,156,157,154,145,155,132,165,134,152,133,148,156,156,157,137,155,147,140,121,138,148,155,167,150,148,157,142,149,151,157,155,130,148,188,150,146,164,161,146,155,140,154,183,168,175,160,165,175,139,150,170,138,159,157,150,181,145,158,157,166,172,155,142,111,99,66,113,143,142,162,174,143,155,165,132,149,156,144,156,160,133,156,151,138,153,145,160,140,127,139,163,111,26,3,9,5,9,10,12,22,21,18,14,5,32,12,24,0,0,11,29,11,15,19,24,18,64,34,32,51,39,12,26,29,19,60,37,16,48,8,16,24,25,34,38,32,17,29,18,19,30,3,23,2,11,25,15,20,10,31,5,33,11,4,12,27,29,36,25,19,19,28,2,43,40,36,35,27,3,26,38,12,15,16,31,8,22,16,11,17,19,45,24,26,10,48,35,32,52,43,49,18,21,31,13,34,9,43,61,18,45,45,43,46,39,52,55,38,33,28,58,51,41,49,28,33,38,26,31,38,34,44,48,17,36,26,16,42,20,15,23,17,41,55,53,48,48,47,43,69,67,39,85,74,110,110,115,119,121,98,120,132,126,105,124,106,107,104,130,108,113,112,124,139,95,115,123,114,124,109,109,101,116,124,119,123,138,134,110,125,148,122,91,139,117,115,121,124,131,141,124,143,123,97,117,152,126,105,139,107,121,118,117,122,122,118,126,137,96,137,123,114,110,128,129,136,120,132,119,145,106,121,117,92,120,114,128,103,121,121,129,121,115,125,127,108,102,112,137,133,122,122,132,128,113,143,150,123,154,145,101,140,119,138,137,121,116,118,148,153,127,118,119,147,102,125,124,138,115,95,115,101,103,106,111,94,98,127,130,131,120,106,113,94,111,89,93,84,110,99,74,78,100,112,79,90,64,68,61,68,78,104,66,105,81,95,95,67,61,102,90,67,83,62,75,59,43,85,42,34,59,60,44,86,59,79,52,60,44,56,66,53,50,68,83,42,59,62,49,71,66,61,59,56,78,50,56,56,53,56,34,59,67,65,41,60,52,71,61,54,21,48,37,56,54,54,83,26,38,30,30,34,51,26,47,41,14,48,51,33,30,43,45,25,53,40,13,33,48,25,17,50,56,55,43,48,46,50,34,32,37,56,60,30,51,73,25,72,55,20,58,34,44,87,75,66,48,24,29,47,35,55,25,44,45,29,56,37,59,52,59,9,28,34,69,45,37,29,40,37,47,60,48,33,48,35,39,35,23,37,62,18,34,24,30,43,40,49,36,37,61,33,53,41,35,42,39,52,54,50,28,78,33,59,52,46,37,64,58,42,44,75,83,71,68,80,34,71,71,71,66,64,71,77,73,71,87,84,86,81,58,79,45,51,69,97,73,59,84,82,56,51,81,23,64,44,39,43,65,88,93,35,58,104,149,175,227,249,226,240,222,249,251,255,226,245,252,246,242,247,190,130,59,11,12,11,17,28,16,34,7,35,24,17,39,37,36,43,39,36,63,20,46,50,45,23,59,30,39,55,26,50,22,27,45,25,52,40,29,42,31,23,50,47,60,38,30,54,61,37,49,43,47,41,54,82,68,51,76,77,76,78,131,124,130,137,132,149,126,118,159,141,163,157,179,180,173,156,173,165,165,168,178,144,150,136,169,169,153,170,146,163,154,163,147,114,142,170,129,159,145,153,185,158,174,148,133,177,178,161,148,169,160,161,168,153,138,161,153,169,153,152,143,186,164,162,171,157,145,166,180,167,154,144,157,145,175,162,149,166,164,173,158,139,139,78,78,108,144,159,174,193,161,171,152,152,169,154,169,170,157,167,164,178,140,156,168,169,172,176,160,108,34,0,26,19,8,3,10,5,27,13,33,41,18,9,9,29,30,20,25,13,16,13,14,54,44,40,31,34,35,28,35,26,17,29,37,29,43,2,40,27,9,13,33,26,20,9,30,25,46,25,21,20,8,15,25,7,9,10,22,5,17,20,6,19,8,30,17,12,21,26,16,12,9,26,13,14,7,32,23,14,18,1,45,13,22,11,33,40,19,24,36,49,26,24,18,48,36,17,36,13,2,27,43,29,29,4,24,32,23,9,42,30,19,17,53,19,39,20,26,43,29,25,37,33,27,62,30,45,13,42,44,38,27,24,32,39,26,18,36,34,72,35,24,65,30,39,33,63,47,72,62,50,68,78,116,112,92,103,94,116,110,96,128,117,109,119,113,111,107,115,143,126,127,104,97,111,115,124,108,125,142,134,114,106,126,126,112,120,127,124,133,107,132,122,123,125,138,101,128,126,138,116,134,119,107,105,118,104,128,112,133,90,115,107,113,82,84,115,77,110,121,134,138,131,118,124,131,166,122,108,126,115,117,106,100,106,124,92,135,131,110,112,93,122,124,123,121,110,127,132,100,126,115,113,131,122,126,117,117,141,107,103,96,110,135,144,135,131,108,104,126,135,110,122,98,141,118,90,125,112,98,102,86,115,126,120,127,111,107,86,134,103,96,87,54,93,91,89,97,81,86,100,83,92,102,77,91,66,97,91,106,91,106,101,78,60,49,39,46,63,82,51,43,30,56,58,24,43,52,46,17,45,44,71,42,66,77,60,71,52,39,69,47,76,49,58,71,56,47,73,52,43,57,47,53,43,72,45,64,55,48,54,59,61,67,71,41,42,73,70,59,58,59,58,42,71,29,63,56,53,41,56,34,51,35,63,41,79,30,80,54,46,59,60,54,70,30,18,54,50,27,31,44,68,51,49,14,36,36,30,26,39,38,38,62,33,36,30,52,46,46,60,40,38,53,17,32,42,30,11,45,44,50,45,47,31,24,8,64,30,78,42,30,30,74,44,34,47,41,32,23,53,44,55,38,26,41,37,39,20,39,49,61,46,65,25,34,41,21,40,40,49,39,56,38,31,51,45,66,55,61,50,51,18,17,46,39,46,40,40,56,35,56,67,72,51,74,73,68,60,69,31,91,75,96,72,60,77,77,81,95,95,105,106,143,113,135,114,151,153,141,145,137,97,103,107,89,118,82,9,20,64,92,130,130,163,190,175,181,253,253,249,246,244,244,249,249,249,248,246,220,228,199,185,196,220,192,190,163,205,165,48,23,36,36,44,25,37,30,30,14,64,25,47,35,38,42,13,37,52,17,14,35,30,22,44,23,19,49,25,18,29,41,54,55,38,54,54,39,62,66,79,67,80,97,59,94,100,72,107,96,113,122,119,122,123,118,116,146,152,175,166,165,172,186,159,153,186,166,153,149,156,158,162,146,180,158,147,148,154,156,174,160,139,147,155,150,157,156,167,161,209,181,164,165,155,152,148,163,159,151,173,154,151,155,174,167,153,168,155,159,153,180,163,138,155,154,164,150,170,155,151,144,172,152,175,152,167,163,173,160,173,175,134,105,88,106,128,155,157,170,160,140,136,166,181,179,163,163,159,161,186,149,157,168,166,176,151,168,111,3,36,5,10,19,22,18,6,14,12,9,11,24,20,28,3,9,16,10,17,16,12,16,32,29,19,3,53,50,21,25,30,10,25,33,36,10,35,17,28,29,14,21,19,33,11,13,17,33,12,42,33,23,30,51,35,12,15,6,14,50,19,25,6,34,19,41,44,19,4,21,59,10,12,27,32,1,2,13,25,20,5,34,19,3,51,26,10,26,30,30,25,27,24,26,27,19,32,39,22,37,42,30,33,57,37,36,49,17,35,25,56,19,38,16,24,22,33,42,34,53,24,33,56,28,40,15,35,47,54,42,39,41,30,42,39,17,42,24,45,63,57,74,49,75,66,71,75,60,72,99,93,87,93,135,99,109,95,127,112,121,114,84,88,97,92,121,111,116,135,127,103,110,110,108,140,109,117,125,117,119,117,132,121,112,139,111,111,109,88,101,78,116,118,91,95,99,91,105,136,119,110,144,115,101,117,112,120,126,115,113,106,93,90,96,118,101,109,109,94,108,146,119,123,115,124,122,122,117,123,114,126,131,131,114,127,137,99,89,116,110,105,130,116,117,109,129,114,123,133,142,103,111,120,128,112,128,133,133,119,81,122,102,117,129,127,115,115,99,110,114,135,130,112,112,108,103,102,124,124,124,135,134,107,149,123,153,102,92,118,94,76,76,126,144,121,136,111,113,114,112,112,96,97,144,78,86,67,53,108,82,107,85,91,100,75,57,36,56,90,78,91,66,29,48,66,64,40,52,42,77,62,63,30,73,44,76,49,60,52,77,35,57,79,65,53,29,61,43,58,53,47,45,40,76,50,55,53,84,86,72,54,52,77,43,62,66,38,78,38,69,54,77,34,37,42,51,48,26,62,32,63,47,44,33,77,60,31,44,24,35,63,43,44,29,31,37,21,38,45,40,66,43,46,38,21,60,46,56,41,26,43,35,56,45,52,53,75,64,54,47,44,26,49,41,31,50,44,19,52,44,12,59,36,23,40,27,34,53,16,39,32,32,40,26,42,56,22,70,36,35,68,56,48,38,28,22,61,46,22,40,52,46,36,36,56,34,32,44,37,19,40,37,55,34,39,34,59,39,42,50,32,37,27,10,21,45,65,16,53,52,23,22,49,40,54,56,82,64,52,63,64,62,75,44,42,64,78,96,76,82,85,101,132,116,113,135,140,157,148,134,157,117,160,191,173,180,136,139,114,128,121,132,101,93,105,145,82,113,213,241,251,246,245,242,241,246,252,236,248,252,243,245,246,254,247,249,234,248,243,217,61,33,35,22,42,32,24,7,29,32,21,37,36,35,10,32,15,16,27,13,19,37,18,34,42,25,29,13,53,23,22,18,40,50,69,46,70,66,68,45,55,46,71,92,66,119,109,122,109,133,121,95,127,109,112,122,129,122,169,119,143,118,144,154,153,165,125,161,178,172,169,166,160,169,136,154,157,158,141,153,148,188,154,181,141,147,118,156,154,158,191,203,189,173,146,182,162,147,170,161,163,175,167,154,165,169,160,153,186,152,151,163,160,159,168,153,131,163,184,165,143,160,147,182,138,155,157,136,172,138,164,151,185,128,123,70,88,138,151,170,170,161,129,140,159,152,157,162,159,151,180,162,163,154,164,161,147,148,107,17,6,3,21,23,10,17,0,2,7,29,22,42,20,10,15,9,26,22,31,8,0,15,8,15,28,21,31,32,41,10,21,17,39,12,29,21,13,35,50,32,32,12,24,9,11,30,10,9,17,14,13,26,6,29,21,18,14,11,24,17,12,14,30,2,24,25,40,26,1,20,18,7,5,17,29,27,27,19,33,32,24,13,12,45,40,13,27,3,12,4,33,27,24,34,19,34,36,49,30,32,32,56,49,21,18,54,60,24,40,36,32,69,43,48,15,26,40,26,38,51,56,39,32,39,14,24,71,48,49,52,65,39,47,44,53,55,64,42,43,57,52,48,74,47,85,113,88,98,102,99,98,86,85,95,105,130,112,128,115,106,106,123,99,116,129,114,117,117,124,119,131,94,111,119,107,117,116,103,118,121,151,113,111,127,95,126,121,84,111,109,114,132,104,81,112,106,145,132,134,129,114,106,114,116,93,120,116,142,122,97,101,142,124,134,127,120,138,107,94,108,101,120,129,131,104,121,119,99,125,118,148,144,112,122,139,150,131,125,128,112,114,113,146,123,117,110,136,136,112,124,116,116,146,147,137,145,124,117,119,108,125,117,130,126,138,124,108,117,114,106,109,131,123,83,80,112,110,121,109,127,110,101,96,125,116,108,105,102,120,99,108,130,119,116,138,136,111,106,123,115,108,109,132,120,117,115,74,77,85,90,72,62,59,85,82,110,108,90,109,115,90,81,73,50,57,75,65,95,77,69,65,56,42,85,72,67,48,80,76,56,68,37,60,67,62,53,75,76,55,47,75,51,55,69,66,69,61,57,61,39,23,64,64,56,57,62,55,60,48,44,40,65,68,51,62,67,37,36,49,41,52,44,70,44,37,45,85,32,31,77,31,20,25,37,50,32,72,32,27,56,55,48,28,41,57,40,39,37,16,40,44,54,50,49,27,57,52,29,22,42,38,27,25,47,41,55,54,63,38,42,32,36,34,27,49,59,36,50,40,57,68,76,28,16,55,41,19,36,35,25,50,55,33,46,4,22,22,12,54,57,43,11,6,62,45,27,22,54,41,49,29,63,27,46,34,33,34,28,34,50,45,54,8,47,49,32,43,37,45,66,26,60,43,62,65,60,73,58,55,48,36,44,50,57,30,71,67,65,89,74,92,100,97,115,120,120,122,98,111,127,128,131,140,183,125,175,177,147,140,135,224,197,163,128,108,117,66,70,149,136,181,225,250,254,255,255,254,250,255,229,255,248,231,250,244,242,253,242,227,212,43,15,27,5,12,19,20,27,34,34,21,19,28,9,9,23,24,27,25,42,32,22,15,7,33,15,22,12,44,26,50,25,29,33,20,32,67,54,31,44,62,55,65,91,65,43,79,113,138,102,127,104,120,153,135,142,147,131,131,161,134,149,140,125,134,169,147,155,157,178,170,166,180,173,180,134,149,151,151,151,156,187,152,157,174,155,140,152,124,145,159,170,154,145,158,152,112,168,131,163,153,138,157,163,161,135,165,160,130,149,163,154,143,139,178,149,155,165,161,161,161,168,169,153,159,149,144,127,172,167,151,177,177,148,132,105,74,83,80,161,168,180,153,175,165,177,138,166,168,129,141,147,171,151,152,164,162,163,112,20,1,7,12,26,18,20,13,21,18,24,3,1,2,20,14,7,6,6,7,15,39,39,9,50,0,18,17,20,29,5,29,33,44,5,19,27,31,27,21,36,38,31,10,20,37,45,16,16,25,17,16,6,37,11,20,11,16,38,25,25,2,9,13,6,12,30,9,18,28,13,40,36,0,16,5,12,18,22,35,24,24,34,29,24,29,16,25,11,29,12,14,38,24,52,11,29,40,21,38,9,15,33,39,52,26,11,40,9,24,46,46,20,57,51,21,23,13,15,48,40,66,40,68,36,31,38,52,66,14,31,41,42,41,46,47,69,43,56,39,43,42,44,48,63,60,78,109,103,104,88,83,103,99,112,125,96,129,124,110,115,124,110,148,129,126,119,120,129,113,85,141,126,114,113,136,119,120,118,146,140,97,124,120,118,122,126,115,153,135,115,96,128,144,103,134,127,114,132,123,131,141,129,95,114,149,114,98,101,110,109,109,113,116,134,126,113,123,136,129,124,105,105,118,103,112,109,116,146,130,130,92,126,125,120,125,107,128,126,124,107,138,121,139,135,98,110,140,119,144,135,132,141,87,130,125,126,140,145,93,126,148,123,124,104,142,98,126,121,122,120,98,118,130,99,119,90,103,101,94,132,114,104,118,113,95,94,92,103,84,127,108,121,63,90,96,109,134,87,110,70,78,109,92,110,90,104,116,116,112,60,72,100,81,90,105,104,106,85,103,80,117,109,96,109,89,90,93,101,74,98,91,73,70,60,57,75,62,75,57,56,40,58,29,79,46,35,68,51,65,62,59,66,58,52,60,47,65,83,36,35,40,60,52,59,58,25,51,74,64,74,66,66,54,60,36,53,40,37,54,39,41,42,54,36,59,46,43,46,51,40,21,68,68,49,38,40,84,47,51,45,60,49,42,61,44,68,52,61,62,34,50,49,98,48,52,80,30,39,48,32,18,50,49,55,54,22,67,27,69,41,36,68,60,61,61,47,49,41,41,42,40,46,50,38,52,49,44,45,33,39,56,43,13,38,43,45,27,39,44,73,64,13,46,60,66,42,44,47,53,24,59,59,20,19,60,35,30,41,38,53,60,30,43,29,46,58,27,11,41,36,55,18,47,42,41,58,33,30,74,60,38,68,60,53,51,65,54,45,66,45,51,65,76,85,73,59,55,74,69,105,95,92,107,114,123,162,147,104,120,144,172,164,156,157,205,137,87,111,116,115,89,115,153,190,195,237,251,255,252,243,250,253,237,253,221,250,255,252,249,221,37,1,17,26,17,45,53,35,12,57,15,42,38,27,42,28,28,30,35,3,64,32,34,27,28,19,32,28,20,21,30,44,35,15,11,39,26,31,64,62,34,45,57,52,53,48,62,75,68,117,114,147,148,152,149,91,107,132,181,162,145,122,145,137,168,131,152,136,140,175,141,176,124,148,161,158,161,167,156,161,157,151,144,142,150,166,161,161,146,155,166,202,141,141,159,176,139,143,147,144,136,124,164,144,152,150,142,153,147,144,153,172,125,150,166,168,156,162,135,147,154,160,124,164,162,170,145,142,143,148,158,157,156,129,148,142,125,76,75,103,129,143,174,144,158,135,113,145,155,154,171,167,159,166,167,164,171,147,140,27,8,14,8,12,37,21,17,16,12,33,20,1,20,13,28,29,17,15,33,46,19,6,16,13,26,23,22,25,15,21,41,26,36,31,38,22,14,33,18,21,16,18,4,20,36,8,14,9,39,57,32,30,7,25,34,31,7,8,10,8,20,13,23,6,26,20,16,21,31,25,40,11,20,35,9,37,29,16,5,21,24,25,25,23,12,3,3,38,8,9,17,41,3,21,23,45,20,28,36,29,15,39,3,22,13,32,44,20,17,47,51,21,43,38,23,22,22,43,21,16,35,43,37,18,41,27,38,65,18,26,41,23,46,9,31,47,57,38,38,22,25,53,44,46,53,74,68,91,117,118,87,100,124,124,129,103,112,97,123,106,135,148,112,124,137,129,132,117,127,123,126,136,133,120,140,131,114,138,113,143,114,126,94,131,138,129,114,146,129,121,94,116,130,141,138,125,126,139,123,90,119,108,103,140,120,154,108,137,126,149,110,113,157,136,100,141,137,145,129,137,144,144,116,120,149,118,111,128,126,153,138,152,134,115,110,116,125,146,109,131,134,138,131,120,132,131,129,147,102,157,121,120,134,118,148,142,121,111,133,127,146,154,115,139,131,122,104,134,135,108,127,118,110,118,142,96,116,112,105,131,116,113,116,117,95,100,129,111,128,100,105,66,115,123,76,130,81,85,76,80,107,68,97,103,97,124,110,148,89,85,89,110,94,95,126,105,79,95,105,96,92,117,120,121,126,149,112,96,90,103,122,110,89,109,82,79,55,46,52,51,57,45,75,66,64,61,88,35,44,61,60,65,52,70,31,68,53,53,26,68,50,53,76,54,44,47,48,21,49,22,78,45,38,68,47,52,71,48,58,65,34,38,68,51,42,59,38,52,81,70,70,72,41,73,61,38,36,49,64,44,31,65,57,51,44,54,44,35,54,48,20,38,58,19,35,55,56,40,42,69,30,14,41,26,59,55,54,58,32,42,30,39,66,40,16,37,41,27,32,60,54,58,21,45,38,22,43,31,49,50,47,48,31,29,30,29,65,40,36,38,35,34,50,56,41,45,50,44,46,54,23,7,33,34,49,19,38,25,46,34,37,16,49,47,45,40,53,58,2,40,58,30,8,39,34,47,40,58,70,49,53,44,29,53,64,65,71,30,70,45,58,70,53,58,40,52,62,55,84,90,109,77,74,85,86,101,99,100,103,110,136,123,155,183,180,179,124,131,144,155,154,72,78,83,114,107,166,189,210,178,197,213,215,206,212,215,196,227,235,175,12,21,12,32,15,34,29,49,21,20,46,8,43,34,21,30,27,36,29,23,12,25,24,24,14,43,39,32,42,25,37,20,41,40,6,39,39,39,54,42,47,61,73,59,58,73,48,69,64,95,126,140,117,132,109,108,122,131,177,186,175,149,163,150,155,163,135,119,126,156,161,150,156,139,176,158,163,190,158,156,160,141,146,140,176,153,147,169,144,171,134,174,164,151,160,151,116,117,151,136,172,165,120,156,155,164,114,130,153,133,155,158,159,146,152,131,141,154,139,143,180,170,160,167,158,161,156,150,134,134,185,186,165,145,168,171,137,136,115,98,102,147,143,182,160,166,167,147,158,166,159,174,130,192,140,165,169,170,112,1,16,11,3,4,14,13,18,13,10,7,11,31,4,15,11,28,1,5,1,22,4,20,30,0,31,10,11,5,38,32,24,46,35,38,24,21,33,43,47,7,10,16,19,12,8,44,24,18,15,16,3,23,14,21,6,44,6,29,22,0,20,26,23,2,31,14,13,19,24,4,3,5,18,12,11,11,15,27,34,23,12,25,17,14,12,14,25,22,51,27,3,2,29,27,18,39,13,28,24,24,50,31,17,28,18,35,39,34,51,23,19,35,16,34,33,23,49,24,37,54,42,34,64,28,40,31,24,7,45,23,36,47,41,39,42,24,52,42,46,54,38,35,43,41,69,60,24,53,80,58,60,102,97,100,106,125,93,130,134,111,132,139,128,100,130,128,93,139,120,120,152,142,144,121,112,122,138,149,123,125,139,134,132,128,136,138,125,144,115,135,138,118,138,119,131,125,117,123,116,131,163,136,128,132,92,136,151,123,162,110,129,133,116,110,98,109,124,120,133,155,151,152,135,111,139,147,144,159,124,114,165,123,134,133,151,130,144,118,140,131,113,105,117,141,131,124,123,123,118,123,126,135,131,143,136,155,129,117,158,122,122,138,127,109,146,165,116,135,141,142,111,127,143,113,133,139,138,142,133,99,105,131,130,121,123,111,122,118,120,115,84,121,117,132,142,120,110,114,109,117,92,109,88,102,102,116,100,93,115,77,124,118,95,115,121,123,110,121,110,86,83,82,72,59,73,86,88,89,96,95,121,86,119,81,65,80,68,68,76,56,69,107,79,36,81,24,56,57,44,50,33,66,31,78,55,51,44,75,44,71,64,52,54,44,72,70,47,54,49,59,59,52,42,36,54,70,37,56,54,50,59,50,18,70,63,62,54,29,43,43,54,30,36,64,45,25,37,37,71,47,39,47,59,62,28,32,61,28,63,28,64,26,40,34,45,21,41,45,44,56,36,44,27,50,9,57,33,20,29,58,32,43,43,61,43,47,41,23,29,27,71,24,31,51,35,50,50,26,15,16,50,29,36,53,42,40,50,40,25,34,49,48,34,15,53,48,37,44,10,36,61,41,32,22,41,35,62,19,38,52,41,52,43,29,17,32,53,73,22,39,37,62,51,56,35,29,27,12,48,37,46,16,54,45,39,38,48,26,26,42,31,50,36,53,48,84,42,54,55,95,44,71,51,52,70,59,55,66,80,96,100,116,84,144,170,142,110,143,190,195,166,128,143,94,137,157,98,140,146,145,157,101,99,94,117,85,107,120,183,99,25,34,16,29,19,26,39,20,13,21,31,23,31,20,27,39,26,17,30,6,30,2,18,10,24,4,44,32,16,31,47,33,24,27,25,45,41,48,76,30,58,51,58,45,46,77,75,67,63,104,107,90,77,130,90,102,131,136,137,147,130,175,180,160,151,160,143,147,180,163,160,156,137,133,165,159,154,143,153,157,141,136,175,146,167,155,167,170,160,161,137,170,172,151,138,171,142,148,145,170,191,162,145,150,144,135,115,131,178,155,149,179,169,180,151,137,139,166,147,176,165,152,148,145,182,160,169,162,195,129,162,160,161,152,165,143,166,160,108,99,81,125,120,142,166,177,186,161,176,147,142,149,128,158,177,146,150,187,108,17,1,0,6,29,23,20,11,38,19,8,2,3,0,19,24,4,19,14,31,17,2,3,17,28,8,26,32,28,49,27,35,17,10,14,20,25,30,24,20,17,21,20,32,48,15,12,29,30,13,21,9,5,9,19,12,37,25,11,20,23,26,27,9,51,37,29,25,17,23,15,33,24,12,31,32,21,18,33,41,27,25,39,24,36,48,29,36,22,13,6,21,42,25,23,19,30,24,24,11,15,28,39,8,44,16,21,34,27,31,10,57,25,7,42,31,43,20,21,48,15,19,31,15,36,33,31,34,27,32,25,25,6,31,32,44,18,36,45,44,78,51,35,39,61,63,54,44,45,70,52,71,85,100,100,111,109,134,130,142,119,131,136,126,131,123,113,123,113,123,115,144,153,115,126,129,120,108,137,129,114,121,147,150,131,149,139,112,144,136,129,120,128,128,137,140,125,151,113,141,144,137,118,99,117,121,122,113,117,140,149,107,107,114,93,95,122,124,142,122,94,139,142,123,128,129,107,95,149,149,142,146,152,118,124,119,135,136,136,127,131,131,127,124,111,110,137,153,139,107,150,147,149,158,120,139,108,87,120,124,138,149,124,126,147,141,128,129,167,155,136,132,132,128,112,101,130,116,130,118,110,119,111,117,115,112,109,119,125,111,121,130,109,138,115,138,143,138,113,120,114,115,115,110,124,126,118,107,102,96,71,88,100,63,89,84,96,107,88,89,85,58,43,35,52,62,60,62,72,61,66,59,55,79,75,33,57,60,85,54,106,69,84,77,60,66,55,82,85,63,62,62,53,30,63,57,59,38,52,74,77,74,54,52,47,72,55,61,51,51,38,62,46,68,46,73,56,14,61,78,49,40,52,49,51,52,47,41,58,40,34,67,27,32,50,52,37,33,28,51,43,47,54,37,53,34,69,47,26,45,71,40,47,50,58,46,31,40,22,58,35,41,44,26,58,35,38,22,34,45,39,58,26,38,38,58,37,58,43,56,46,29,22,54,20,20,42,33,49,46,50,66,49,48,39,10,38,56,31,45,17,47,76,28,18,57,7,57,51,49,19,24,61,18,42,34,71,22,51,48,38,43,29,31,24,54,38,23,32,57,44,25,39,43,30,55,26,36,34,48,33,52,33,33,25,49,51,42,40,24,37,56,66,45,53,47,52,62,44,87,58,24,64,58,37,53,52,50,56,58,35,91,97,100,82,135,118,111,105,144,191,223,196,209,189,188,213,214,190,147,128,76,84,71,85,70,74,26,39,79,40,26,30,15,13,32,33,41,8,7,3,22,24,36,41,14,42,10,38,50,25,38,25,4,16,30,35,30,23,35,34,34,48,51,56,33,46,56,38,67,40,35,44,51,70,77,46,55,74,80,51,94,100,79,85,121,98,103,95,108,101,149,170,187,178,173,149,158,142,157,179,166,179,161,154,156,193,150,141,144,142,141,131,155,158,144,144,136,161,165,161,151,157,147,141,163,177,153,148,181,161,145,169,141,170,171,148,168,143,140,173,177,165,138,157,131,146,148,180,183,163,156,144,156,129,176,165,173,166,171,161,155,160,152,154,134,149,176,151,153,111,118,86,65,108,154,158,170,192,182,163,160,138,133,144,141,151,124,148,111,12,0,15,55,30,13,37,16,13,27,11,3,3,1,1,12,39,2,18,7,10,26,18,28,30,13,22,16,28,27,12,12,61,8,26,9,24,22,44,22,19,28,30,24,23,26,19,2,17,16,28,20,30,28,27,23,16,8,37,3,11,9,5,34,5,25,23,28,32,12,31,20,27,23,19,28,20,21,32,23,35,32,14,25,15,25,10,41,37,21,39,14,42,21,50,55,37,25,31,31,35,20,35,7,15,20,17,23,26,31,4,42,17,43,34,43,35,25,14,30,22,34,23,14,41,18,24,31,5,39,44,42,16,28,26,45,34,57,46,44,43,41,59,56,63,57,76,87,70,57,13,38,72,58,95,92,122,120,106,121,101,100,130,101,85,102,144,140,141,126,132,129,127,105,149,125,149,133,163,133,104,152,147,120,101,119,135,94,118,128,109,124,134,136,139,117,122,114,129,108,134,126,110,121,133,113,139,116,126,118,117,146,133,119,115,124,116,124,85,151,124,127,121,109,136,117,115,147,113,127,133,128,131,137,137,163,152,138,139,134,103,101,94,152,125,122,136,134,123,150,121,129,99,108,139,105,123,121,104,84,136,132,122,120,114,139,160,118,100,132,115,107,99,108,124,95,117,127,96,125,131,113,100,126,124,112,107,129,105,109,113,131,126,124,126,142,123,93,138,105,106,121,136,110,124,104,83,119,124,121,93,103,79,83,98,121,80,101,91,47,63,76,90,78,34,77,82,64,69,25,57,43,38,39,78,37,16,42,66,73,65,72,14,59,67,26,68,42,43,62,58,49,57,60,67,81,66,60,68,60,52,72,60,71,64,55,45,61,55,18,45,71,60,57,49,63,61,44,23,83,81,57,56,56,58,47,46,60,54,52,50,47,65,65,56,55,57,24,37,46,56,50,40,53,52,73,37,31,47,70,31,62,58,69,24,40,38,54,36,48,61,29,38,35,25,40,63,50,34,55,60,49,37,38,57,25,57,64,64,47,42,44,36,49,15,50,44,34,39,55,71,23,55,49,69,48,26,35,23,46,11,26,61,51,29,37,51,37,31,36,43,41,42,7,22,52,45,32,33,8,38,67,27,24,35,48,32,54,43,51,28,32,34,15,56,29,11,33,29,37,35,42,44,29,37,22,40,50,28,35,23,14,46,42,40,57,47,51,58,53,49,73,32,50,72,42,65,66,57,59,49,68,89,101,85,126,115,110,107,120,195,190,200,170,176,164,188,171,162,123,107,114,140,140,164,140,138,126,130,101,54,4,32,3,17,15,30,41,23,14,18,38,22,37,37,7,8,15,12,35,2,17,30,18,20,11,18,35,56,29,52,53,61,29,33,42,54,45,33,49,27,76,55,79,83,63,71,63,78,50,102,78,100,104,158,119,133,148,135,123,141,133,161,141,158,137,160,151,128,164,159,154,137,160,157,167,133,129,156,142,197,110,145,180,168,169,166,137,155,160,172,168,142,165,158,178,147,170,162,127,161,180,159,139,166,160,156,173,139,156,170,189,146,166,163,171,142,141,164,139,144,143,151,172,168,148,156,146,143,157,139,159,154,144,150,159,125,159,169,159,149,119,118,112,97,92,133,169,190,166,144,186,176,140,124,142,149,160,166,113,25,4,1,19,5,2,16,5,5,35,7,18,4,7,25,20,14,37,20,3,6,2,15,16,13,29,13,15,15,16,20,18,31,33,17,20,14,23,40,19,11,25,15,32,16,12,31,10,1,14,34,42,6,8,14,24,26,23,31,14,39,15,10,30,7,18,24,15,29,37,10,16,30,21,17,31,19,29,8,16,26,2,32,3,36,18,20,36,33,36,1,5,48,21,51,17,18,33,20,40,18,41,43,56,42,29,37,21,24,22,19,44,4,42,32,29,29,33,7,32,28,27,20,42,10,17,55,34,32,38,48,37,28,30,43,39,47,37,29,37,53,46,53,38,68,66,48,75,45,65,60,60,25,31,62,106,57,91,51,59,87,104,116,110,105,141,127,139,135,133,119,130,149,125,144,142,123,102,101,119,114,105,138,127,133,107,120,123,108,131,128,150,135,136,130,119,132,114,102,125,133,102,152,119,151,139,128,114,123,119,132,147,113,155,127,132,120,135,131,121,109,119,138,134,134,142,118,135,131,149,127,135,117,137,133,139,151,140,137,113,111,146,115,128,128,140,150,105,129,117,129,119,120,140,131,154,157,153,162,129,113,118,129,128,120,146,126,143,154,136,128,111,115,137,140,116,115,132,115,133,122,149,143,105,111,117,101,108,106,115,99,111,112,83,110,99,97,118,115,115,148,144,137,133,130,97,90,119,137,121,128,106,77,100,114,128,101,121,103,111,125,115,118,102,115,89,83,72,92,64,57,77,87,92,111,82,76,88,98,71,80,49,73,50,57,61,60,60,70,48,44,65,45,42,57,52,58,52,97,83,70,37,49,27,47,46,36,19,78,47,43,47,48,67,82,49,35,45,64,32,33,66,40,57,47,41,57,44,43,67,11,52,51,41,48,46,65,41,44,60,45,42,42,59,26,37,54,34,66,31,42,22,73,49,61,59,31,22,42,43,48,45,59,55,56,33,46,62,41,48,49,59,29,49,34,44,34,45,60,24,47,55,40,43,48,41,33,34,51,51,72,42,30,20,41,43,36,23,36,26,44,40,59,46,61,17,30,41,27,25,56,69,21,54,45,14,34,49,33,51,27,25,33,39,63,34,43,40,26,26,35,42,29,30,44,41,39,27,32,63,44,58,46,29,60,64,45,36,30,47,39,78,70,26,47,40,46,81,63,46,66,58,68,40,54,73,67,64,74,63,61,81,108,107,111,68,63,79,88,92,129,162,132,155,127,141,113,87,112,114,129,178,165,182,166,198,203,210,193,145,74,39,24,8,23,3,27,13,38,34,29,33,1,26,23,14,32,12,25,32,42,33,41,59,31,30,22,22,32,43,17,39,45,33,55,32,59,46,67,61,60,83,58,57,53,80,74,63,50,71,98,125,127,121,137,140,151,135,168,152,166,149,151,144,114,121,124,127,111,121,131,148,119,143,107,137,147,148,174,156,164,159,141,159,172,166,132,167,160,167,165,186,158,171,167,147,139,153,156,136,176,152,169,135,166,154,145,144,160,170,150,132,159,158,167,162,158,174,176,158,171,165,183,159,157,147,147,150,147,125,145,152,134,177,134,144,144,142,161,129,183,162,133,122,97,85,102,119,152,180,167,150,173,133,151,135,156,153,147,102,15,3,18,14,13,4,10,25,31,26,12,7,11,22,11,4,9,8,7,5,14,3,16,33,34,7,19,24,36,24,27,31,9,12,32,23,45,12,0,30,21,32,9,12,18,38,30,28,7,22,24,9,15,43,12,15,16,18,9,25,31,9,13,16,11,12,29,24,22,44,13,5,14,28,20,2,19,18,14,13,39,4,10,18,32,34,4,27,28,20,5,34,12,13,43,11,49,54,34,55,38,48,23,44,38,18,48,20,49,29,27,44,30,48,23,49,20,35,25,41,55,8,42,38,41,26,38,45,18,40,51,51,38,25,29,28,69,74,72,52,32,57,60,65,87,65,119,121,72,111,69,70,72,85,104,106,108,110,99,98,106,124,125,107,115,129,117,124,111,102,94,132,146,144,129,126,102,119,136,120,135,129,125,128,121,133,112,122,149,141,137,134,106,140,112,139,152,122,151,131,132,106,142,127,113,141,141,124,139,115,104,116,168,137,145,147,137,165,132,130,158,143,129,141,143,141,171,113,141,110,118,129,114,126,139,134,110,109,143,145,117,121,140,124,131,133,131,125,143,119,136,134,146,147,141,135,150,122,132,115,144,120,120,141,141,152,142,149,122,141,120,145,145,127,173,146,141,150,148,128,171,126,132,127,131,111,129,122,127,123,127,107,84,102,113,85,74,89,92,92,99,109,137,97,105,86,80,85,95,101,102,128,111,93,128,126,81,82,91,132,115,110,127,92,86,98,106,106,85,79,101,81,99,66,95,104,109,78,82,55,107,66,105,72,68,54,83,65,53,37,29,60,55,61,75,80,68,39,44,44,62,70,57,47,37,29,61,45,50,50,61,51,54,39,60,35,66,49,67,57,59,54,29,59,59,23,44,50,76,53,50,49,44,43,46,58,49,42,49,27,40,49,77,47,44,62,40,74,68,30,60,40,54,47,43,35,59,50,71,50,31,57,45,48,50,55,52,43,41,38,32,52,47,78,42,40,52,60,28,46,58,40,56,73,49,58,36,21,44,46,33,39,34,43,68,31,33,27,50,28,36,46,30,43,48,38,33,51,43,36,30,40,16,67,27,48,20,47,58,58,35,56,25,33,30,49,46,50,20,39,62,45,48,60,18,55,37,53,24,34,39,23,39,32,32,36,56,33,47,27,37,28,18,41,42,48,66,34,40,70,65,44,37,63,42,63,43,55,50,47,73,56,49,63,63,80,64,51,66,61,95,78,103,92,84,65,98,102,121,93,117,109,139,149,124,132,148,183,178,164,110,50,40,31,27,42,38,38,23,31,10,46,14,38,34,45,51,25,12,35,26,33,47,18,29,43,55,41,45,17,53,37,32,42,41,62,42,50,57,45,69,61,73,46,60,64,55,67,54,70,120,104,117,126,125,112,119,145,140,105,143,127,152,121,166,142,153,144,168,159,138,155,153,113,141,135,176,148,165,161,135,175,174,172,152,179,164,127,149,171,160,160,155,157,157,193,140,126,161,143,145,180,140,132,161,159,136,138,150,145,168,153,116,131,151,178,169,170,189,137,149,164,154,148,153,150,143,154,157,148,145,125,154,105,141,139,146,128,121,141,156,140,153,128,119,83,103,73,97,139,146,149,156,169,158,148,163,144,147,116,26,4,17,18,23,15,19,5,21,17,8,21,0,16,28,0,19,6,3,7,42,18,12,9,21,12,32,10,39,27,15,31,11,36,27,21,37,28,39,17,30,34,28,38,19,11,43,22,34,22,8,1,21,6,50,6,18,25,20,22,14,24,32,5,20,14,18,22,34,25,17,16,28,40,6,1,33,5,37,44,24,16,23,32,26,18,19,28,14,6,30,53,48,35,27,31,32,48,28,49,49,52,29,40,45,62,29,49,18,25,33,34,36,37,36,44,16,55,44,52,58,34,37,50,35,29,33,15,42,28,25,48,48,67,48,45,46,61,61,28,59,58,42,68,75,80,91,98,86,82,89,105,131,110,115,136,143,152,126,130,116,118,118,115,113,136,108,95,92,93,106,131,128,101,121,107,95,106,104,113,115,130,130,121,139,117,157,146,137,142,155,183,162,116,125,123,125,125,129,107,134,116,135,158,114,112,146,135,123,146,155,133,131,118,126,120,132,149,130,131,137,146,133,136,138,135,121,133,129,144,152,134,125,139,130,143,129,145,153,122,142,127,117,143,151,140,127,152,139,127,153,146,179,127,151,134,126,127,127,115,170,150,151,141,144,136,120,135,134,128,129,124,117,133,114,136,150,159,162,116,161,173,149,120,147,146,127,122,116,141,123,92,93,100,91,115,108,114,83,111,96,75,90,99,83,68,89,86,92,86,93,124,118,129,120,101,82,55,95,22,56,78,83,85,81,99,100,117,84,103,101,89,95,51,63,84,74,54,75,58,55,25,27,61,32,57,22,64,32,52,69,34,85,80,56,63,52,57,66,55,81,83,70,40,71,97,73,48,64,52,61,54,69,63,63,59,52,59,39,51,56,46,49,54,50,52,37,37,69,47,34,50,40,59,50,45,44,35,49,45,66,46,22,49,73,51,70,55,59,62,36,58,55,48,56,40,22,50,50,55,41,50,46,20,34,55,57,37,28,26,53,34,47,50,29,54,49,48,50,50,45,45,39,48,68,36,16,52,26,54,23,25,40,48,41,52,57,45,48,59,44,56,28,57,25,44,33,47,47,29,44,61,53,34,44,57,32,53,54,53,39,55,36,48,61,32,22,40,46,39,46,39,40,60,40,48,29,30,36,24,33,40,28,63,27,49,74,32,50,51,44,46,64,33,62,56,58,34,62,53,31,45,49,43,68,62,70,56,64,54,57,57,56,57,57,61,61,64,61,29,35,78,77,48,49,80,94,115,95,72,70,74,106,130,71,78,65,113,142,152,116,39,13,4,2,25,37,32,26,26,16,24,29,51,9,38,56,49,41,3,9,37,14,25,47,26,48,48,31,64,33,45,31,48,28,59,55,37,64,53,38,56,89,51,58,63,40,45,64,66,82,104,106,105,119,83,92,123,110,128,124,132,124,139,156,192,144,199,173,118,172,162,155,157,148,157,203,166,151,183,173,158,147,145,161,166,156,133,157,156,150,163,143,143,135,154,133,169,147,170,146,164,167,165,136,155,154,138,149,133,137,134,123,141,141,146,152,148,140,156,162,151,152,159,145,134,164,146,146,150,145,138,155,170,155,156,155,138,116,146,146,143,139,146,143,166,107,93,119,119,104,125,152,162,161,145,164,132,156,94,7,11,30,0,5,18,14,9,15,23,8,0,20,28,28,15,23,13,42,19,6,14,21,21,38,28,17,35,28,13,34,19,52,22,38,14,4,22,21,18,23,3,16,11,18,53,33,27,38,22,5,13,36,18,34,23,34,32,25,9,24,22,12,20,35,27,30,13,0,13,40,6,8,29,39,20,3,2,24,36,19,38,23,39,10,22,16,11,21,31,13,14,25,36,62,24,35,35,6,20,18,34,3,29,22,30,34,16,22,28,51,17,39,24,42,8,39,32,65,44,31,47,45,62,31,18,14,32,44,44,27,53,30,61,15,32,51,41,32,48,50,37,51,54,73,78,74,90,65,85,58,69,123,115,124,98,109,118,135,129,159,114,107,136,135,157,129,121,111,127,128,150,135,133,132,117,111,126,138,122,115,124,100,97,131,111,115,133,129,102,119,136,135,119,126,124,118,137,144,130,112,141,126,100,124,125,124,138,137,135,119,119,114,114,112,137,137,116,137,122,113,138,151,125,120,144,136,140,120,126,122,123,130,144,142,148,136,142,127,127,112,140,114,115,142,130,125,134,120,127,141,139,167,152,154,159,141,149,124,158,138,140,142,141,153,118,136,141,127,132,134,139,133,164,146,134,148,158,139,138,155,121,133,95,121,124,137,130,143,107,121,91,92,103,118,123,112,88,86,91,113,99,131,143,103,81,108,112,92,110,106,101,106,127,108,98,119,114,94,94,106,62,105,88,69,91,110,108,77,101,67,86,40,66,82,81,41,31,58,84,71,30,53,56,89,64,71,93,54,48,85,87,76,87,65,90,69,53,71,96,91,66,94,86,68,75,80,83,102,80,59,59,51,47,53,55,70,50,65,78,69,39,41,61,32,47,99,58,62,62,31,48,68,72,41,57,54,50,37,75,57,53,52,43,49,67,69,46,68,54,52,55,32,35,40,68,66,35,29,49,44,55,53,44,83,51,49,47,55,49,8,28,82,33,48,42,47,34,32,52,58,49,50,41,38,35,31,62,22,54,35,40,20,41,42,30,24,40,59,47,8,59,54,41,49,24,12,36,37,36,28,36,42,15,27,42,52,54,41,32,54,25,44,31,58,24,22,36,39,46,16,66,38,46,54,54,22,32,32,11,42,17,20,50,11,34,18,40,36,37,29,30,28,33,49,30,57,68,69,57,54,50,61,76,61,53,54,69,55,39,58,25,54,57,74,45,72,61,74,73,63,69,84,75,61,60,101,98,104,103,105,102,103,87,92,71,58,77,115,116,68,45,30,30,35,33,51,30,13,23,49,29,29,41,16,19,37,41,24,23,22,39,66,22,34,46,24,28,17,15,19,53,47,47,67,41,38,58,29,35,59,45,31,67,35,67,36,57,67,71,82,90,113,124,133,121,126,147,121,129,96,107,130,114,164,158,168,172,162,168,149,130,138,138,142,168,149,144,169,173,170,150,162,159,147,139,160,151,157,160,163,180,185,154,147,150,152,145,155,145,146,183,161,157,152,152,136,159,126,136,158,154,159,147,148,148,156,127,147,139,145,159,146,131,139,141,156,160,154,158,166,144,139,137,147,136,169,128,140,143,151,157,132,114,157,151,148,124,122,104,98,136,127,162,150,148,164,148,152,115,0,3,17,10,35,20,35,1,29,23,3,6,16,1,27,11,19,13,37,28,0,4,36,22,33,31,30,33,23,9,18,36,27,14,33,25,23,25,54,48,7,26,43,23,31,28,12,18,13,13,12,29,15,17,13,25,18,27,22,41,22,17,24,41,4,32,35,19,32,18,10,4,15,26,46,29,18,6,49,25,36,12,11,23,28,33,42,18,19,33,36,19,29,30,9,29,21,11,33,30,32,9,11,32,21,34,19,32,40,52,52,51,33,50,32,52,53,33,23,17,29,63,26,37,46,64,56,35,64,52,46,22,35,30,48,70,30,43,55,65,54,48,41,79,62,54,58,69,73,102,91,82,95,83,94,84,85,87,99,102,153,159,148,128,102,120,152,148,121,159,136,139,114,142,127,116,145,132,138,142,153,129,141,147,137,123,122,137,129,118,127,121,124,130,141,114,168,131,133,140,162,122,137,147,148,133,129,117,134,104,141,115,130,117,133,144,131,116,122,111,125,132,120,125,110,150,146,146,148,98,130,142,131,144,134,121,142,140,144,148,142,137,143,131,130,153,144,137,130,148,149,143,161,141,133,149,148,158,143,160,151,127,113,114,118,132,155,130,114,135,144,124,137,147,152,122,124,117,124,136,136,127,129,137,116,140,107,137,106,143,171,147,159,136,122,150,114,159,123,130,138,139,143,150,129,144,131,113,133,127,104,116,105,103,74,91,116,83,100,108,117,106,99,111,85,119,82,65,109,97,95,108,96,106,79,86,83,101,80,65,52,61,62,55,94,96,116,89,90,111,95,67,81,86,88,49,69,109,78,80,75,72,78,86,94,67,65,104,98,83,101,76,48,71,68,67,64,56,80,37,63,28,56,42,83,48,69,49,62,46,46,32,65,59,58,48,45,48,34,58,75,42,60,48,72,38,63,46,60,56,30,26,37,15,52,44,44,23,40,36,52,29,54,38,58,53,61,46,48,43,52,44,31,52,64,26,39,62,31,25,49,53,39,55,50,63,51,46,23,59,15,39,52,37,51,42,45,25,50,44,47,32,35,41,53,47,66,50,32,47,36,24,28,50,42,48,45,26,40,26,47,42,38,57,29,33,23,33,38,28,43,44,61,50,45,31,66,35,29,41,24,43,47,48,40,55,60,37,24,42,45,67,20,70,57,41,36,48,44,39,64,51,54,62,55,73,48,54,60,58,65,49,58,44,37,104,50,94,99,94,86,52,105,127,105,85,88,89,97,99,100,68,87,79,88,83,66,65,52,72,70,55,19,2,51,21,20,27,36,4,15,31,27,47,51,40,38,30,27,20,31,30,25,41,42,55,15,38,22,61,17,20,36,31,48,43,29,50,36,35,57,62,65,70,39,56,52,37,60,80,85,73,91,120,125,121,139,147,123,137,150,143,105,135,135,173,148,144,140,171,139,158,175,166,148,164,154,151,140,171,155,152,174,163,173,160,179,147,141,167,165,167,167,143,168,140,160,183,168,153,164,166,157,179,164,170,161,164,178,165,162,165,163,136,140,156,136,158,138,185,188,164,159,143,129,155,143,126,170,128,137,148,149,154,165,168,160,174,138,147,123,155,159,150,129,153,140,148,153,73,68,91,116,120,158,163,189,193,157,114,12,10,11,13,0,14,7,19,31,11,11,5,4,0,15,5,9,0,33,10,12,22,14,24,30,23,15,14,31,33,27,16,22,24,12,20,8,25,9,22,30,39,22,16,19,22,4,24,3,7,30,28,24,50,31,14,36,22,37,17,15,14,36,15,13,30,22,13,30,26,25,35,23,18,30,35,14,14,37,34,9,5,35,15,43,10,12,28,26,48,26,23,4,20,8,45,7,18,46,37,21,20,47,29,10,42,25,51,42,23,23,30,27,10,43,53,47,45,61,31,23,36,30,26,32,43,47,63,58,55,53,41,29,63,59,67,64,37,36,64,30,56,70,49,84,71,84,83,69,66,99,139,102,92,100,77,80,81,81,107,90,121,117,137,126,146,125,107,158,95,124,128,129,115,130,140,125,141,152,175,123,143,138,125,122,95,112,156,96,134,117,140,131,116,107,123,137,138,141,128,157,148,142,158,124,147,134,150,134,127,130,95,136,133,168,169,142,140,143,165,130,126,111,111,143,143,134,148,119,136,139,132,145,145,140,155,139,129,132,160,160,113,141,146,156,137,115,135,134,118,125,135,131,109,119,139,150,137,130,126,115,90,92,82,124,115,122,134,125,114,121,125,116,126,116,116,100,111,165,123,129,114,126,126,145,126,98,105,129,113,140,152,141,151,117,108,137,111,125,116,116,124,155,110,120,131,110,150,93,77,90,114,81,72,80,73,48,87,94,93,72,115,73,100,75,76,115,74,123,90,103,115,135,89,50,73,90,58,106,74,72,57,48,77,61,102,90,80,56,87,94,28,69,44,57,62,42,61,63,50,61,56,68,66,84,69,70,80,75,82,39,66,73,61,76,72,81,58,45,65,93,75,48,55,31,47,33,50,43,45,73,68,46,30,74,62,49,48,33,45,53,45,42,67,22,60,46,31,49,56,34,55,51,35,50,58,80,39,42,58,62,45,40,46,13,29,41,36,37,14,54,24,68,49,55,50,39,53,32,45,46,40,36,42,66,41,59,20,31,37,50,59,31,51,32,34,53,36,40,33,38,45,19,37,23,45,21,25,33,22,36,40,53,16,34,48,27,44,29,46,43,47,26,61,22,45,40,40,60,50,26,58,65,62,15,34,31,31,36,47,32,57,29,32,63,43,32,30,40,58,32,42,52,37,58,33,48,31,62,51,30,34,64,53,55,34,46,85,73,48,62,30,58,81,71,61,80,69,87,76,59,55,80,67,77,69,50,78,56,82,71,69,57,66,62,71,51,47,54,69,68,64,51,30,35,30,12,23,8,16,31,52,32,30,40,18,20,41,20,59,29,37,22,44,41,24,59,24,55,32,45,56,46,33,38,57,48,47,52,46,45,54,64,63,82,36,75,57,47,91,70,62,88,84,113,113,84,95,121,143,144,124,133,139,145,123,146,165,135,117,134,138,166,175,150,136,150,144,156,159,130,164,150,151,178,167,136,156,149,116,154,160,139,154,165,136,145,159,147,158,144,154,134,178,153,156,150,170,165,144,128,147,121,134,160,140,171,149,172,143,155,143,169,165,153,195,171,157,132,145,158,148,139,162,135,151,159,158,137,170,153,160,130,148,152,171,165,132,123,105,93,67,77,111,141,170,154,143,160,110,11,15,34,7,21,11,26,5,19,0,21,24,12,5,19,5,13,4,10,23,3,19,9,8,9,39,50,38,23,30,15,16,21,26,27,33,24,28,24,27,14,9,26,43,4,11,22,28,23,17,5,9,23,38,24,33,23,15,16,22,36,0,11,31,11,31,48,10,12,13,13,18,17,42,26,16,14,47,26,4,28,4,34,39,14,31,19,13,19,19,31,22,26,39,48,3,20,20,43,46,46,48,11,9,39,35,27,12,28,38,14,12,22,40,49,58,35,18,62,45,27,42,80,44,60,55,72,73,24,61,46,67,64,38,41,43,37,57,97,57,75,90,95,100,106,100,99,124,64,82,74,120,79,89,98,72,83,71,87,61,121,111,136,120,142,141,143,120,133,124,127,124,140,113,127,135,128,113,140,131,123,100,126,115,124,113,106,128,124,148,122,154,123,114,136,141,122,125,118,100,140,119,131,152,134,129,152,134,136,156,132,117,146,134,146,124,139,133,121,131,121,123,117,150,132,119,114,120,146,113,168,132,139,132,157,142,155,142,144,144,122,124,134,135,117,125,131,134,122,130,112,122,121,137,114,102,133,125,138,118,134,105,137,108,123,128,109,109,121,110,132,140,152,147,137,135,129,98,112,103,111,102,114,89,136,109,105,112,125,124,147,125,123,85,120,103,125,134,105,127,147,136,125,129,132,96,85,95,121,106,83,99,116,80,79,82,97,90,104,72,78,89,86,81,74,77,74,36,65,32,54,78,75,104,65,95,62,60,53,46,52,61,62,63,56,63,69,71,90,55,51,87,70,43,81,65,59,54,52,71,65,49,80,39,51,50,62,45,72,68,63,50,69,49,53,43,48,41,64,59,82,61,50,69,37,66,46,58,71,32,51,43,59,34,67,55,68,65,59,20,35,46,50,48,47,34,29,57,57,55,64,57,53,49,48,61,57,66,32,49,40,62,46,60,47,63,42,49,51,76,56,54,51,41,44,75,37,42,57,64,34,50,32,43,59,35,54,51,59,38,40,60,44,32,34,57,35,48,51,41,64,60,51,54,52,1,58,44,66,38,30,43,36,74,73,48,56,30,38,25,49,37,47,57,38,57,50,58,52,74,44,47,14,10,29,29,33,64,36,37,47,44,42,53,48,30,34,41,34,35,47,32,54,68,34,36,34,32,45,51,65,45,72,71,64,70,68,60,70,54,59,48,40,48,72,74,70,44,64,43,72,69,63,71,89,75,81,55,66,63,56,53,62,42,69,74,62,66,53,62,32,27,37,38,44,28,33,51,44,29,40,26,54,29,24,5,31,34,26,56,52,34,29,22,28,39,7,69,31,32,47,52,60,54,48,54,68,30,58,58,57,38,52,70,66,68,59,52,77,72,95,101,65,51,71,84,90,89,95,116,137,106,125,149,132,149,134,144,153,144,199,155,133,181,151,144,143,150,128,127,129,121,153,132,145,144,126,166,152,153,139,160,161,146,161,167,156,162,170,154,158,145,147,178,150,179,158,153,143,159,157,133,145,138,173,180,141,140,143,155,147,168,164,137,160,170,170,148,153,137,151,153,142,165,149,180,141,145,135,132,144,145,137,140,127,141,121,155,175,137,136,103,89,76,95,129,145,162,163,106,26,0,9,37,16,10,2,12,16,2,14,32,19,5,7,15,13,12,42,12,20,0,3,9,17,30,48,1,28,26,18,7,11,52,36,22,31,9,29,27,7,32,5,11,10,28,13,36,23,23,11,34,30,18,23,11,39,26,2,11,12,17,38,5,15,17,11,11,4,10,9,8,15,27,24,35,35,37,5,37,18,26,2,19,23,19,40,32,12,26,33,39,46,30,24,45,13,49,26,18,25,42,35,27,36,11,33,36,36,35,27,28,5,35,19,57,53,35,28,52,46,39,29,35,65,59,58,72,56,35,54,33,77,49,53,49,35,72,68,74,79,94,66,77,88,96,71,79,80,56,99,80,67,88,64,88,94,110,117,105,108,131,123,118,152,134,126,148,128,131,132,107,126,138,140,101,120,128,119,117,132,125,155,136,104,153,133,130,119,136,164,126,160,144,143,124,101,125,144,113,120,144,143,143,101,111,105,129,124,162,127,133,133,127,134,126,125,87,132,127,131,126,97,134,132,133,133,162,127,134,142,142,135,143,119,131,135,124,144,139,126,130,131,143,129,118,112,152,153,139,112,132,165,135,155,139,115,135,151,150,167,134,134,130,155,121,132,120,127,128,117,125,133,129,156,115,123,134,120,116,121,107,147,134,121,119,130,137,130,130,156,117,128,115,104,129,141,141,122,99,137,115,139,138,144,107,89,125,129,118,123,105,130,153,115,123,154,126,114,84,101,80,97,103,87,59,79,89,63,75,63,95,86,86,83,64,73,83,74,100,100,102,118,97,81,86,96,67,56,102,85,96,54,31,73,61,67,65,81,73,84,56,51,76,52,79,40,53,46,54,56,59,41,78,57,63,29,59,47,66,60,59,51,61,46,58,69,55,45,66,46,49,56,56,77,43,42,74,48,24,24,55,49,46,54,27,55,59,56,32,62,41,52,58,58,40,52,61,42,35,48,61,39,49,26,44,36,45,32,25,69,47,57,16,52,42,44,41,66,46,46,44,34,28,59,52,63,39,42,48,58,51,36,53,49,58,55,66,50,57,26,26,43,58,25,24,52,30,73,21,54,30,52,48,46,57,31,16,48,66,18,48,40,32,41,29,49,56,48,30,26,59,29,44,36,33,39,47,38,58,23,47,54,37,40,24,38,41,78,43,61,15,49,26,43,37,52,40,37,39,33,53,57,62,50,45,47,81,70,48,37,56,52,39,49,51,69,86,46,43,64,79,88,45,86,88,55,95,97,107,108,93,79,58,45,75,59,54,24,75,28,23,45,47,9,24,12,39,31,26,48,20,21,61,46,52,31,35,60,53,52,43,29,19,54,41,36,40,32,42,21,34,33,60,40,52,45,63,38,37,68,59,61,49,62,22,46,53,77,69,88,77,52,56,51,92,82,92,97,125,136,144,132,162,121,132,162,171,151,157,148,171,146,181,173,177,133,156,166,170,186,140,151,149,157,148,150,126,157,115,167,183,162,116,125,148,171,162,157,151,165,148,158,163,131,154,159,134,109,147,169,168,150,166,166,137,144,142,149,174,145,149,139,142,145,138,158,160,158,156,159,139,173,165,161,154,142,146,131,155,144,130,137,160,146,133,136,142,142,148,144,120,78,84,78,106,123,127,164,102,16,18,15,11,12,11,11,18,12,35,20,21,13,4,20,33,15,9,10,20,3,25,11,25,38,32,34,5,48,16,40,18,11,36,11,30,7,41,18,41,30,11,23,15,31,32,25,33,9,29,36,6,10,28,32,38,35,14,6,27,10,19,7,11,6,17,20,14,11,9,14,27,10,18,38,36,12,9,29,17,16,17,44,23,34,44,38,14,41,22,19,36,30,22,64,33,17,33,20,49,34,35,25,26,28,28,12,55,15,9,39,61,3,39,44,29,16,33,51,40,58,24,24,51,45,46,77,34,76,84,71,71,82,84,73,91,73,87,90,76,90,79,95,103,66,90,57,70,55,95,71,82,92,107,135,104,134,126,111,113,132,137,132,132,122,129,118,98,125,132,134,127,110,135,122,117,148,153,167,152,144,170,152,139,152,153,147,147,125,107,156,129,125,133,127,136,141,129,145,134,143,136,162,142,151,134,118,123,147,142,126,141,149,115,124,144,104,137,124,166,149,128,158,159,145,149,153,170,127,120,154,131,127,114,103,142,101,129,134,160,124,146,120,109,147,118,126,87,110,124,137,134,145,108,118,123,137,128,150,149,164,163,133,130,137,146,123,143,146,146,133,128,138,122,144,138,121,104,128,134,124,120,138,125,123,136,143,181,118,116,137,138,123,149,138,130,160,101,110,107,105,113,140,143,138,140,144,170,148,165,147,142,151,129,125,143,151,128,134,112,117,83,132,111,133,105,119,110,98,64,70,104,86,113,86,69,76,70,96,95,121,103,70,93,76,67,53,88,47,86,37,61,75,69,55,85,83,64,35,60,74,80,69,78,44,59,95,51,50,58,65,39,53,42,55,62,56,31,62,38,63,34,61,55,56,64,43,34,64,46,26,60,53,43,59,49,48,23,40,30,36,38,53,43,29,74,48,61,67,35,35,44,44,48,49,29,53,42,52,55,59,53,73,57,42,71,59,45,53,36,43,46,42,67,51,55,47,36,13,54,26,28,46,53,30,30,43,35,37,37,37,43,70,49,42,56,38,31,50,47,28,29,33,36,37,13,35,37,58,46,25,54,56,53,44,36,79,45,36,56,35,39,66,46,75,46,62,40,33,45,34,33,37,40,49,30,59,51,25,31,40,60,55,50,29,41,14,17,50,62,46,65,66,35,37,33,49,43,82,67,38,45,50,29,42,33,46,38,41,38,54,44,43,60,38,46,42,69,52,65,75,72,55,72,87,76,86,93,72,106,113,89,63,58,30,62,41,49,46,74,60,53,52,25,32,39,44,44,25,38,43,38,37,49,30,37,13,29,32,23,35,4,35,26,36,56,40,52,38,32,44,30,40,59,50,47,42,57,47,51,70,35,62,51,44,66,59,86,67,50,52,60,50,55,55,107,105,104,118,126,151,141,155,184,168,121,181,139,181,168,165,144,170,171,168,159,163,169,160,180,152,191,163,178,148,138,166,147,164,191,157,176,168,131,132,155,170,169,182,156,150,156,178,173,160,189,153,176,162,153,148,172,152,159,148,142,145,165,171,150,156,113,123,141,139,157,173,163,149,148,139,181,119,176,140,168,138,181,161,155,166,170,138,140,139,134,120,177,150,162,166,172,159,121,75,81,114,82,126,119,38,19,14,16,7,23,23,2,25,44,13,20,27,12,4,31,13,14,33,7,16,15,41,25,4,9,46,29,33,14,14,18,49,34,24,34,19,22,32,18,5,16,21,14,28,20,9,30,5,11,31,14,22,35,17,31,11,16,17,4,8,26,18,22,47,24,16,15,20,9,42,25,30,46,13,7,18,19,34,33,17,10,33,29,31,11,39,43,25,34,25,29,17,30,20,5,22,36,30,38,45,64,45,33,33,28,33,33,48,30,51,66,27,35,40,29,44,54,55,48,50,29,50,24,56,45,73,67,94,103,93,99,107,122,88,112,71,101,76,81,71,57,115,68,133,94,96,84,104,123,115,118,137,138,109,116,121,132,138,131,109,138,127,102,130,107,118,128,93,134,128,137,121,114,126,117,157,167,126,154,137,143,123,137,116,117,129,125,136,140,132,132,119,146,131,164,128,133,141,137,137,125,161,161,159,162,167,148,155,139,141,138,144,140,140,141,142,158,157,123,162,152,155,157,169,151,137,132,145,137,129,157,122,146,114,113,109,96,123,115,122,131,140,157,129,113,137,103,158,110,138,150,149,118,142,150,142,147,137,155,140,155,128,141,153,148,129,148,141,126,152,165,147,159,133,114,148,131,148,159,131,112,118,127,99,156,134,126,131,112,105,101,140,130,131,116,148,127,125,114,154,155,137,115,106,139,153,159,114,118,132,113,107,140,135,85,88,118,125,79,88,98,66,62,79,114,93,69,112,99,59,56,92,59,74,94,82,89,79,76,49,72,61,28,50,43,46,60,51,65,49,63,44,51,84,68,60,66,58,57,60,71,64,71,65,72,73,44,65,74,80,71,57,55,66,61,19,64,70,87,50,51,54,65,22,50,28,64,65,44,49,56,32,39,57,43,20,55,33,45,50,47,46,91,50,28,62,67,68,67,39,32,49,47,56,34,38,52,87,22,50,57,47,29,50,33,38,51,46,38,63,43,55,42,41,48,42,58,32,43,52,57,36,57,52,67,50,36,34,38,24,49,41,56,42,49,45,35,46,34,46,46,31,26,46,55,53,47,43,20,33,52,60,55,29,51,21,41,74,52,59,39,33,56,63,54,59,56,13,36,44,19,37,31,22,39,43,12,62,12,16,51,46,28,29,29,37,40,47,48,52,41,52,46,43,40,44,44,39,38,57,34,46,34,33,33,49,50,68,46,37,69,39,62,58,67,57,71,60,60,52,67,70,69,75,76,59,100,32,76,42,78,58,67,81,48,52,63,35,54,39,57,34,51,49,18,25,42,55,45,31,45,33,24,14,7,60,50,28,32,51,31,39,45,17,46,32,24,23,82,40,19,25,41,61,8,29,32,62,32,53,68,51,48,67,60,43,63,40,41,77,32,51,44,53,69,83,70,88,124,149,135,154,173,150,145,140,144,173,168,162,140,165,185,187,164,159,167,170,190,185,176,166,155,153,153,160,148,143,177,165,156,168,144,149,140,157,158,157,152,161,154,135,145,186,141,176,145,140,153,130,172,164,156,160,160,164,163,143,153,166,122,151,152,130,137,147,156,148,140,143,159,141,129,137,157,152,163,158,163,164,154,153,159,126,158,122,124,160,111,149,168,168,132,117,79,99,102,119,103,29,14,17,13,35,12,1,35,18,15,13,2,20,20,19,0,1,17,9,7,19,13,3,22,26,36,33,22,35,15,28,19,24,26,32,29,51,24,23,37,12,28,6,13,15,6,31,32,35,15,9,15,23,8,21,20,26,45,20,18,56,48,19,18,13,24,20,36,25,2,12,20,46,46,34,20,23,37,7,27,21,33,33,39,32,39,5,28,45,26,16,21,12,19,36,44,41,38,30,33,7,42,25,51,36,38,31,13,58,19,11,23,23,44,43,53,50,52,50,31,39,75,39,52,50,58,55,80,71,72,71,79,86,80,65,97,73,74,84,48,62,34,62,98,91,94,98,143,105,95,120,157,138,141,115,143,128,125,124,124,138,128,116,141,148,123,134,145,151,136,108,154,126,149,156,164,130,112,135,130,127,105,104,142,109,113,122,128,141,126,130,135,119,141,114,111,132,141,140,139,127,133,162,134,138,122,152,132,157,147,150,138,122,136,109,156,137,138,122,154,154,148,152,155,168,153,125,130,142,94,109,126,134,135,133,133,114,147,120,150,130,160,136,110,163,161,139,159,159,132,135,131,155,166,158,150,125,107,137,134,137,154,165,110,123,140,131,119,144,147,137,140,159,117,142,137,168,190,161,121,131,114,137,140,136,137,156,157,160,168,130,160,133,101,107,102,91,98,140,125,115,108,118,74,94,132,120,104,67,94,122,96,111,99,89,89,74,81,86,93,83,91,70,44,78,91,120,100,114,76,80,89,68,61,85,56,74,80,94,92,74,63,81,60,61,40,53,31,60,59,45,53,47,38,64,90,54,50,45,86,26,72,70,58,43,41,37,71,58,38,85,58,49,79,56,71,51,75,64,56,48,58,54,47,56,49,58,63,66,60,65,65,40,66,43,74,54,46,56,59,47,53,50,33,59,56,47,74,46,56,59,49,64,18,65,55,52,28,59,42,56,54,64,68,48,71,47,24,47,67,44,62,26,57,60,23,69,28,32,43,46,51,31,26,48,16,42,49,34,69,51,47,48,22,26,42,64,29,40,41,47,53,42,47,30,15,38,19,27,54,48,28,26,37,29,45,54,27,41,27,36,36,40,35,13,32,48,24,40,49,39,40,25,68,41,51,27,46,22,51,53,28,55,34,46,43,62,36,54,40,44,45,55,52,37,43,55,46,53,58,61,39,56,31,37,47,39,61,73,62,58,60,43,54,46,54,59,70,73,51,92,70,76,102,106,74,58,42,48,56,55,64,38,68,68,75,40,61,42,37,56,52,65,46,49,29,30,50,56,37,27,50,29,44,44,21,11,40,51,50,26,38,25,50,40,14,53,53,28,52,48,41,42,65,70,47,38,45,54,22,63,70,60,47,27,62,52,46,61,50,89,52,83,77,64,63,62,49,64,91,112,82,108,122,145,137,136,119,161,160,129,128,145,148,175,147,123,160,134,188,166,161,167,169,164,191,151,148,158,141,157,172,145,159,142,124,150,157,160,159,148,145,166,168,136,140,143,114,139,130,168,160,149,181,175,156,168,160,145,163,171,155,164,163,140,160,141,178,160,165,154,170,156,163,138,130,147,153,144,126,163,148,155,147,136,137,143,143,155,156,153,143,173,144,130,122,116,78,93,81,16,21,32,0,0,9,7,26,6,12,17,18,22,6,18,18,7,8,19,6,14,17,23,14,27,31,11,39,48,25,20,15,38,29,23,31,11,51,31,32,13,30,20,18,11,21,13,12,6,24,34,9,43,13,27,18,43,3,29,36,35,20,29,16,26,34,25,29,30,12,28,29,21,25,40,31,20,26,25,39,23,25,11,34,30,22,20,40,49,49,32,11,38,40,17,40,24,30,16,38,52,24,29,40,45,35,35,48,40,48,32,40,22,25,29,34,47,57,47,67,41,32,63,24,40,52,57,47,46,54,48,28,36,40,79,53,85,47,28,55,15,48,54,71,58,90,104,86,109,137,130,118,138,126,139,144,167,164,148,162,141,169,154,149,129,124,138,141,126,124,152,145,158,154,141,138,149,126,134,131,122,98,113,112,141,142,121,135,131,160,161,150,164,167,140,149,140,144,114,141,128,108,124,125,128,109,159,142,123,131,150,131,143,125,151,126,113,130,101,140,141,108,141,141,113,144,141,134,145,171,134,142,143,140,157,134,114,158,147,154,141,141,122,134,132,143,141,167,158,143,138,130,102,124,119,132,139,136,146,151,145,161,134,134,107,115,110,135,135,129,178,120,142,125,114,134,143,163,148,151,162,164,151,144,146,112,156,132,138,118,149,158,130,129,126,101,117,110,113,112,83,112,126,115,92,92,84,84,60,73,104,69,118,129,99,103,89,92,95,59,107,99,121,97,92,90,90,86,111,95,101,97,93,13,56,49,68,53,86,80,69,65,73,82,120,45,92,78,94,77,73,65,73,61,46,69,39,58,68,54,68,57,53,41,43,31,66,51,76,37,60,57,41,46,57,55,47,70,59,60,55,56,52,56,32,39,39,56,46,54,70,52,39,62,46,40,65,54,25,57,48,65,44,62,69,42,28,49,67,44,66,61,49,47,38,75,66,57,62,68,64,54,40,47,38,56,36,30,43,19,50,39,41,36,46,43,60,52,51,31,40,53,45,55,51,17,53,18,35,47,53,43,54,54,45,27,29,18,19,49,50,13,36,29,55,46,34,59,47,49,45,47,45,47,41,58,28,58,44,23,55,33,45,64,53,37,23,45,28,54,56,20,58,52,54,25,42,51,66,31,62,53,32,34,29,38,67,33,52,23,44,37,50,43,48,44,50,45,57,52,58,42,52,44,50,49,58,58,58,67,52,21,37,46,60,46,43,44,49,51,38,69,53,59,68,60,59,47,73,41,73,70,88,69,71,53,55,44,44,39,33,53,47,29,29,25,17,56,46,20,34,31,30,61,51,56,25,62,39,31,45,44,42,31,25,31,31,55,29,41,37,42,50,46,47,51,53,57,47,55,48,28,58,39,71,52,39,61,60,54,84,60,90,62,75,74,52,73,92,107,140,106,125,105,126,130,121,130,159,178,150,138,127,138,169,216,132,150,177,167,145,151,164,161,155,156,138,151,168,161,161,144,147,133,143,156,167,140,129,156,132,140,156,157,122,121,134,156,149,172,193,179,158,150,153,132,173,161,154,178,170,173,130,136,138,163,121,165,132,147,152,186,164,182,126,156,144,154,172,157,156,166,158,127,161,131,135,141,153,157,156,133,163,145,146,143,118,100,76,65,25,11,40,0,11,37,46,12,2,24,7,27,22,31,20,12,24,5,17,8,27,26,11,9,17,16,15,20,43,8,17,11,30,31,28,43,33,37,15,21,28,21,29,48,19,7,9,14,29,30,10,22,25,21,18,31,43,19,31,11,38,38,32,43,22,32,17,43,22,23,37,27,32,8,18,25,23,32,28,10,21,20,36,23,22,33,7,50,24,45,30,47,42,31,34,45,38,30,27,34,22,36,51,24,48,50,35,33,49,69,16,18,31,25,50,42,54,55,58,65,35,34,75,41,50,47,60,44,62,58,44,27,42,40,59,76,70,55,57,41,75,65,41,80,45,85,94,130,136,108,102,99,135,95,120,137,156,147,191,155,148,113,144,117,98,133,142,143,124,143,143,134,125,143,156,117,121,123,139,150,127,155,159,121,145,152,134,136,103,132,131,127,133,154,156,145,138,143,150,139,142,120,131,130,135,146,127,123,128,131,131,135,138,132,111,134,153,111,118,126,130,121,133,131,129,158,151,128,171,152,135,135,148,157,149,153,151,157,139,133,128,140,134,126,145,122,158,165,144,133,113,132,142,143,151,180,140,111,127,131,137,137,123,117,135,120,126,118,115,117,126,168,126,137,138,146,103,81,129,105,115,120,100,115,102,92,100,94,94,103,80,101,100,142,131,129,117,129,137,124,131,149,141,143,122,139,113,106,106,92,120,121,137,99,101,94,122,146,111,100,110,136,129,130,114,80,74,75,49,44,76,91,71,57,93,60,65,74,62,54,69,62,67,57,98,104,88,89,93,58,72,75,63,76,72,84,77,78,76,44,28,68,76,65,57,72,68,72,52,55,74,40,37,71,60,56,45,44,51,21,38,42,36,41,72,35,44,55,30,42,43,65,29,42,53,38,54,81,56,65,61,48,36,67,56,45,62,37,70,49,52,65,54,58,48,44,57,34,22,47,45,54,44,63,33,34,59,56,59,51,25,56,29,55,53,62,57,42,50,18,51,40,23,35,31,48,48,26,53,47,45,35,30,39,22,44,36,40,40,64,40,73,48,62,36,43,27,33,39,56,50,66,40,41,47,44,38,50,35,49,47,48,36,45,28,27,12,46,38,38,29,35,38,62,55,43,5,45,44,47,74,41,42,40,47,47,42,21,42,66,35,72,28,32,54,60,65,51,66,58,38,41,52,67,72,41,63,50,41,32,34,35,43,56,23,56,48,51,57,54,40,49,66,44,43,43,59,58,54,36,87,57,49,77,78,64,67,59,30,55,66,21,28,37,26,53,32,37,30,30,25,34,37,30,27,36,40,30,52,36,64,26,47,38,44,32,50,46,11,21,57,43,43,40,31,43,64,53,37,33,54,19,58,69,50,55,66,51,57,77,87,86,70,94,95,89,102,104,109,142,133,119,132,128,164,135,158,156,160,172,182,167,153,163,161,195,151,171,155,150,155,162,138,144,150,163,150,130,167,155,164,129,165,127,171,138,173,142,164,133,180,165,162,164,161,167,164,173,168,155,166,162,176,169,163,163,138,166,166,167,144,153,160,167,141,134,164,150,145,153,164,154,121,136,141,129,139,153,140,155,149,137,136,120,153,134,147,134,143,134,149,151,123,134,152,136,146,116,82,80,30,5,27,6,26,17,10,1,11,33,14,9,33,0,22,13,1,14,39,5,33,32,22,21,34,19,22,21,45,37,8,18,31,19,8,34,32,6,31,28,22,21,17,40,41,15,16,14,32,14,22,14,13,19,29,26,22,9,17,26,33,26,40,18,43,24,3,15,33,40,29,15,48,21,32,30,50,31,41,23,43,31,23,21,29,29,9,20,29,9,31,11,22,42,51,36,47,41,47,57,26,26,18,54,58,15,32,78,57,54,65,55,87,57,52,54,67,91,99,83,74,85,102,118,94,111,157,133,138,126,135,140,113,107,113,107,82,63,91,63,66,97,110,95,98,87,88,97,74,86,79,90,99,108,93,83,75,97,110,97,108,113,127,143,123,119,127,146,131,125,138,151,159,155,156,161,122,113,145,150,137,139,130,137,141,173,156,132,146,145,141,153,120,155,142,160,159,158,154,161,121,126,138,126,125,142,143,138,136,169,166,134,152,126,146,163,170,158,146,164,154,140,155,154,143,152,192,152,142,156,170,132,130,128,119,120,89,123,111,125,141,137,122,144,130,132,162,150,136,116,111,145,165,150,142,163,158,163,147,141,117,96,107,156,138,191,156,155,175,165,115,108,122,129,147,163,165,129,161,179,154,170,131,151,145,176,149,145,139,149,148,148,135,107,132,122,133,128,132,149,127,120,120,112,130,136,143,98,93,127,158,123,117,145,151,114,101,151,144,143,122,98,97,126,70,97,118,115,170,130,94,58,57,91,105,68,98,112,113,90,85,104,100,88,82,61,78,50,44,61,46,65,78,80,72,76,73,64,71,47,92,93,85,90,82,52,96,75,85,51,98,64,58,63,55,79,56,84,42,27,73,51,44,46,56,39,54,62,58,26,65,62,22,42,38,30,45,70,18,32,17,37,78,32,61,58,48,60,45,25,54,37,41,55,46,75,57,27,54,45,42,64,43,43,59,69,46,28,60,77,30,40,33,59,32,36,32,54,50,50,41,32,49,34,32,65,50,53,29,41,39,37,64,49,21,39,47,49,55,53,58,25,64,44,51,46,50,53,42,38,37,41,45,14,64,70,43,40,27,30,46,52,66,29,46,37,40,59,44,36,28,46,45,32,36,25,10,51,70,31,36,55,46,36,20,36,40,29,52,41,65,29,44,59,57,57,37,35,76,32,25,54,62,52,59,62,39,42,45,58,54,44,31,52,56,30,37,29,29,49,42,32,50,52,44,32,75,45,48,44,46,68,47,53,66,58,48,41,78,43,85,50,32,67,47,40,66,42,43,36,27,39,57,25,51,31,20,23,49,50,53,48,35,39,40,29,76,41,66,18,33,28,58,47,56,11,29,42,48,42,52,40,16,55,57,46,53,72,67,75,70,56,49,60,92,75,97,124,116,109,100,132,135,134,135,147,141,166,176,172,175,156,157,149,134,152,175,173,177,170,158,166,156,163,181,161,162,151,164,178,166,156,178,155,163,153,161,163,142,153,134,170,150,138,151,144,153,148,160,129,158,141,166,163,139,182,145,157,173,151,124,154,151,142,156,143,134,161,144,182,143,161,153,144,158,146,149,144,150,155,148,125,157,142,148,149,129,148,121,129,113,135,141,176,159,160,159,161,130,132,29,2,28,21,8,33,7,21,25,31,12,11,19,8,10,9,3,9,31,15,23,13,46,13,7,23,31,26,14,15,33,33,23,13,32,20,33,20,23,33,25,21,19,28,5,14,24,31,28,36,22,14,20,32,28,42,16,7,20,24,16,32,53,36,11,29,15,10,11,42,16,26,40,11,26,4,12,24,38,6,26,44,54,12,20,22,24,24,31,52,45,36,47,24,48,48,41,41,27,24,61,38,14,45,57,61,51,46,31,43,72,62,58,82,29,64,61,60,75,73,81,57,91,104,99,121,123,129,134,114,122,115,120,116,102,115,85,80,63,78,75,109,96,118,94,90,96,83,106,89,101,105,106,100,107,70,101,90,91,113,119,108,117,162,154,135,149,136,140,130,159,121,141,157,126,150,142,117,115,148,138,139,156,142,134,136,164,121,142,102,118,130,165,132,149,137,143,149,143,134,126,117,135,114,144,144,150,119,128,152,167,136,146,156,155,158,156,141,139,155,151,142,150,141,146,155,178,161,171,157,152,139,146,122,138,128,113,137,126,144,130,151,125,148,128,146,145,149,104,104,146,140,137,182,147,140,170,174,140,152,122,117,133,142,145,151,141,159,123,174,126,122,139,147,144,166,135,130,137,177,163,133,148,121,146,158,136,139,159,137,158,153,128,115,123,123,138,145,148,113,105,100,92,118,121,121,124,119,111,131,112,142,138,135,94,129,110,104,120,134,142,127,112,94,95,99,127,124,136,163,111,77,84,80,96,87,132,106,103,94,102,113,93,77,88,84,79,36,54,65,74,72,65,55,51,65,48,47,59,72,77,86,86,62,93,55,82,83,95,79,64,89,73,48,70,78,20,47,37,37,55,54,38,72,46,42,48,45,48,44,59,47,62,53,42,34,58,50,62,58,40,42,78,67,54,39,35,55,47,35,42,36,37,43,45,57,45,67,67,60,30,40,61,47,65,48,55,46,51,77,49,58,48,70,28,49,52,34,48,54,45,44,46,31,46,56,28,35,56,54,42,39,57,66,38,45,32,32,60,69,23,45,50,47,36,57,47,40,67,30,39,38,41,53,52,49,40,46,17,26,41,49,50,42,84,47,26,48,44,34,46,44,70,38,39,56,44,54,61,62,40,72,44,35,42,36,53,32,19,26,13,34,65,79,37,45,37,20,40,38,49,55,46,19,50,41,36,30,46,43,49,57,41,54,24,62,48,27,52,32,65,49,53,46,81,46,40,39,46,53,53,56,69,31,66,51,44,27,52,53,67,49,30,42,46,42,36,20,39,29,50,38,50,21,29,44,20,46,35,24,38,54,34,51,43,38,27,15,37,19,51,57,52,48,57,30,48,81,42,35,70,41,43,62,25,42,58,76,52,77,79,58,84,82,73,84,88,128,82,123,105,130,164,131,149,161,161,158,170,153,166,156,190,149,162,158,168,148,164,163,140,160,151,156,168,158,133,152,171,180,169,169,154,159,157,149,143,175,141,149,134,182,156,155,154,148,146,173,172,160,141,125,129,147,139,160,150,160,174,161,153,151,137,125,151,169,158,130,160,142,127,136,136,145,130,154,137,144,154,154,143,125,111,121,136,156,145,145,115,137,138,174,137,138,132,144,138,146,165,100,25,5,1,19,8,1,5,16,17,17,39,18,25,31,19,27,13,19,21,31,17,25,19,10,8,32,27,1,26,29,29,21,31,24,16,31,17,41,27,15,22,37,23,43,23,13,26,33,16,18,14,30,13,10,4,30,16,42,21,42,21,4,4,24,14,11,31,18,43,27,21,50,38,27,58,23,43,39,67,48,39,35,25,48,22,34,27,22,40,35,19,18,21,16,14,50,36,46,34,42,22,35,43,34,73,56,49,57,57,65,57,71,70,72,57,56,76,100,89,89,91,92,118,128,98,109,121,92,135,115,109,107,123,99,114,94,158,104,124,102,150,106,110,121,124,135,131,143,149,132,139,151,139,108,126,114,125,133,120,163,169,139,125,151,152,148,158,151,162,149,165,165,151,139,132,137,139,132,158,136,123,138,153,151,145,137,143,144,126,142,138,151,128,144,130,132,154,113,141,139,147,148,130,143,160,157,118,143,142,144,140,131,127,127,134,152,142,101,148,125,136,139,137,153,153,130,144,127,134,152,135,126,151,161,158,121,139,140,149,168,173,159,144,150,172,176,132,149,131,146,157,146,148,150,150,128,139,151,152,155,146,159,143,119,143,158,141,113,139,135,134,133,126,108,85,116,119,124,121,112,122,131,124,132,126,112,124,130,127,135,124,27,4,20,12,23,20,7,5,17,4,28,8,19,4,0,31,20,16,5,5,25,28,18,1,15,35,40,25,31,8,33,4,1,22,10,16,33,11,3,3,9,7,28,14,1,11,14,15,12,16,28,23,6,11,21,7,22,24,20,5,14,5,9,6,11,10,24,15,37,21,21,26,35,18,33,12,0,25,21,3,26,26,34,11,22,4,29,27,13,11,17,29,7,8,15,20,11,19,36,21,26,15,41,38,27,46,5,14,23,18,14,11,7,24,12,13,26,18,40,34,11,24,22,20,4,31,10,23,15,18,9,9,25,18,14,31,26,11,14,16,11,17,17,9,14,10,10,1,16,11,2,16,4,29,3,3,16,29,45,14,15,36,13,27,26,29,11,3,15,13,22,11,39,1,14,3,16,6,17,7,43,18,25,0,20,24,17,7,18,5,25,2,4,11,14,30,15,8,38,28,23,7,16,42,42,7,10,23,25,6,11,16,30,14,47,36,17,28,18,15,41,21,26,16,24,34,23,23,14,15,4,15,37,13,10,23,10,0,9,25,7,17,5,7,27,15,5,15,6,4,27,6,13,36,8,12,18,2,1,14,15,13,46,8,31,35,21,23,19,10,1,32,23,17,3,18,42,28,11,3,23,18,10,45,0,15,12,37,11,3,5,31,34,26,12,30,3,4,13,4,11,10,27,1,15,10,10,8,13,4,10,10,4,24,16,5,22,16,10,45,46,45,26,11,25,49,19,19,3,10,12,13,16,24,25,6,4,17,19,3,14,15,45,40,10,16,28,12,11,10,19,14,6,51,16,29,42,13,29,1,25,26,13,10,31,5,19,6,22,30,20,18,37,11,18,25,12,20,14,12,2,15,26,11,1,24,24,21,15,9,38,38,46,8,7,23,12,15,19,18,24,2,28,5,2,16,25,30,3,16,58,19,15,31,15,26,20,11,43,17,31,26,21,3,12,8,29,8,2,24,38,24,18,10,1,20,16,32,15,7,19,1,0,7,0,26,4,2,34,8,9,32,32,26,42,26,39,37,14,51,23,25,8,43,14,14,33,33,18,6,21,11,20,29,23,13,21,18,21,26,11,35,17,20,21,48,25,16,16,33,21,28,19,32,33,37,26,48,39,27,19,12,20,45,19,21,35,49,25,53,56,20,40,6,10,21,28,56,24,31,48,36,28,21,22,70,25,24,34,36,47,29,82,58,49,63,58,64,71,40,68,53,42,46,59,74,94,99,79,97,103,128,109,78,100,112,117,100,129,104,115,92,106,91,82,144,140,131,126,127,131,128,148,141,115,138,138,137,143,124,139,114,151,124,150,147,126,159,145,155,170,139,158,136,158,126,146,157,134,164,174,137,143,98,149,144,132,129,157,149,123,110,134,126,112,145,139,161,139,137,138,145,146,131,109,127,108,125,138,148,121,130,112,131,154,123,170,155,150,137,130,126,153,142,122,131,129,117,134,148,126,162,117,136,157,143,151,103,143,156,120,150,154,157,159,122,159,130,140,144,145,152,117,164,162,143,151,160,129,147,142,151,116,144,140,142,148,130,159,152,156,124,156,120,160,118,118,99,148,186,132,125,111,113,122,122,125,123,132,124,129,134,149,132,130,118,123,128,145,125,41,0,24,35,37,7,18,31,15,20,13,18,12,23,8,39,16,23,16,30,2,23,21,17,16,21,8,12,21,7,22,17,28,27,4,3,35,4,20,30,8,3,15,19,42,29,12,31,18,18,13,11,19,24,14,25,51,11,41,8,11,10,1,19,14,2,11,16,25,19,11,11,16,12,52,19,8,18,12,18,42,23,8,12,24,19,12,17,27,28,13,42,5,24,19,12,35,7,15,15,17,12,19,21,12,35,13,35,11,7,0,22,21,28,18,13,31,21,1,5,15,14,41,8,21,41,30,0,26,16,6,17,22,13,28,8,20,30,9,26,30,6,12,34,45,42,58,23,4,4,14,3,41,7,25,1,7,10,2,20,37,24,22,31,4,29,6,5,29,23,11,13,22,31,30,33,13,13,24,1,4,6,10,21,3,13,4,30,22,17,26,19,13,30,47,32,17,18,0,18,12,18,12,10,9,5,8,18,3,35,12,41,17,13,7,10,24,28,34,24,6,39,9,25,30,13,27,10,4,18,10,31,32,8,23,39,26,14,13,35,28,18,10,25,23,4,20,6,25,40,18,38,15,11,25,22,30,12,32,20,25,2,20,16,23,9,31,20,5,22,13,15,14,23,23,21,18,12,11,16,5,1,15,4,40,18,20,18,7,21,8,22,1,15,28,30,17,38,20,8,33,17,23,14,10,1,22,45,20,19,19,12,10,25,3,3,13,12,29,7,0,23,13,1,10,18,29,54,5,10,22,12,13,20,16,21,6,12,3,5,23,19,17,10,13,16,12,24,31,20,36,27,23,1,32,30,18,8,24,27,12,24,50,28,5,6,5,1,24,30,13,33,27,18,23,5,2,31,15,1,8,32,17,21,13,8,15,34,30,7,20,22,15,4,7,10,35,14,14,41,14,42,13,34,16,20,14,22,29,26,26,8,12,16,30,24,22,10,4,0,9,21,13,8,11,20,10,17,29,12,15,14,17,20,6,18,11,19,6,11,24,22,3,26,31,0,3,12,32,8,21,5,18,20,26,9,5,33,19,25,31,38,20,16,14,5,17,10,34,12,26,8,12,19,38,28,30,17,33,32,6,21,38,19,15,3,31,22,35,7,21,3,30,43,17,14,10,40,28,34,16,30,26,40,18,6,29,9,18,2,36,25,34,16,30,3,30,14,13,28,36,38,11,38,38,51,25,8,36,21,22,17,37,17,33,38,24,50,31,28,25,20,18,30,38,40,9,31,22,14,42,33,35,40,36,53,10,47,44,34,60,57,35,49,45,70,56,39,33,31,48,53,25,40,53,35,52,49,32,60,55,38,50,65,65,56,71,101,91,91,77,98,116,112,110,84,118,110,104,102,117,128,141,112,114,152,125,142,130,128,147,142,144,134,112,156,130,139,119,147,130,131,140,131,130,119,145,106,112,103,128,112,129,123,107,135,133,110,131,154,148,130,149,148,138,129,162,120,112,148,138,126,141,124,122,139,134,126,137,121,140,103,134,147,154,115,117,147,104,110,127,153,157,147,163,172,152,137,159,152,144,128,123,144,155,136,134,131,142,115,136,126,128,123,101,120,107,113,136,138,120,113,97,85,111,111,133,111,105,143,102,89,141,106,118,149,135,122,115,106,96,149,119,123,153,90,136,138,115,123,130,148,150,122,124,144,135,124,127,116,126,118,151,138,138,144,96,136,121,138,99,98,84,79,103,94,62,71,85,66,78,81,97,101,84,88,85,121,59,102,107,100,100,132,89,64,81,82,74,94,59,63,43,62,53,50,71,93,56,98,73,68,54,53,46,36,61,70,70,50,51,54,86,42,63,62,46,89,67,67,45,77,54,68,74,72,69,71,47,67,71,51,63,58,56,70,63,52,83,57,39,40,21,23,61,69,49,46,35,39,23,47,44,62,43,36,40,49,52,46,39,37,54,62,25,14,39,45,26,67,43,25,52,65,40,35,54,51,39,49,38,37,43,34,56,43,35,30,53,51,35,47,32,28,40,30,48,40,32,23,18,44,48,48,30,30,37,34,29,36,46,44,45,37,52,42,58,45,29,30,34,33,51,42,49,35,50,46,22,44,41,50,66,57,36,15,31,40,53,52,27,43,62,33,40,37,36,33,36,22,54,46,22,42,52,40,32,58,44,22,42,36,32,55,31,42,60,43,59,73,54,41,60,24,51,51,35,64,65,46,54,68,60,55,62,69,81,68,80,90,62,48,78,71,82,68,74,42,82,95,84,82,28,48,61,86,73,42,70,60,56,67,31,27,30,33,64,29,57,43,20,48,13,28,29,10,40,32,51,31,46,29,32,31,28,64,28,11,39,56,27,39,32,42,20,22,54,47,45,74,79,46,63,43,59,52,34,51,44,85,82,77,66,68,84,70,119,126,94,111,131,114,145,148,146,141,140,128,134,156,128,147,184,156,139,158,142,122,146,153,152,150,156,149,107,166,136,172,149,171,146,164,162,150,146,166,163,163,142,141,158,181,149,143,121,162,149,147,133,168,147,150,157,127,159,161,169,160,166,149,168,141,169,146,164,168,147,152,177,174,156,144,148,133,171,138,154,153,146,152,153,118,162,146,127,137,142,174,137,160,102,85,85,86,117,152,141,174,135,168,102,9,17,25,0,15,1,9,20,13,12,23,0,23,10,30,24,18,18,14,35,6,43,28,16,11,8,22,3,30,41,21,12,9,10,20,33,35,17,33,8,34,24,46,23,49,38,30,7,27,11,26,13,12,34,11,13,49,24,9,44,10,10,30,35,27,14,20,37,37,7,20,14,31,29,26,37,16,19,21,6,43,45,30,33,28,37,35,11,31,41,8,16,32,30,24,38,13,61,40,27,50,40,11,38,16,51,33,22,12,30,28,28,12,8,36,56,63,64,47,49,35,24,40,48,56,50,71,61,61,68,58,31,64,51,48,76,82,57,59,47,62,89,111,89,89,89,97,75,81,61,57,70,91,97,101,71,100,72,84,89,110,110,115,151,132,123,121,133,137,125,161,157,99,119,117,126,134,108,121,112,120,126,146,121,134,94,144,128,122,136,134,142,149,107,141,134,151,131,130,142,124,146,134,153,136,113,107,117,123,142,165,138,128,129,156,135,143,117,116,127,123,118,129,130,86,138,95,122,129,145,123,121,147,123,140,145,176,136,114,140,110,129,115,117,108,159,118,122,130,127,126,123,116,114,139,122,152,123,136,135,94,120,133,115,138,131,113,124,106,117,97,125,153,147,130,116,112,122,113,79,130,131,99,124,97,93,124,116,104,125,115,114,106,79,99,118,104,129,117,116,123,133,147,127,102,72,90,108,77,98,105,109,94,65,41,79,92,99,98,49,83,78,72,96,60,63,77,58,58,48,81,67,111,80,23,51,48,87,65,56,51,68,85,52,49,102,55,50,69,50,38,69,64,57,33,68,52,71,49,57,50,50,54,63,46,62,52,77,77,52,57,76,39,37,71,48,63,60,64,67,46,62,59,68,40,82,32,64,72,50,58,70,40,58,39,63,51,51,38,53,66,50,59,42,42,38,43,28,35,14,28,49,60,63,50,45,32,27,65,47,60,39,52,40,46,46,24,74,53,57,37,60,50,37,49,43,43,45,49,50,48,37,43,52,29,57,52,49,39,23,67,43,49,55,36,33,25,55,45,35,59,34,36,43,40,48,39,58,26,31,40,50,40,32,26,32,28,51,60,45,36,62,70,56,26,71,50,54,49,35,54,31,50,19,54,41,37,23,35,54,29,11,40,34,37,34,47,56,29,40,35,33,54,65,34,61,65,37,63,42,75,48,52,64,52,41,23,55,53,46,68,60,68,51,23,58,45,49,49,47,90,73,69,67,75,91,56,62,29,66,77,48,44,32,64,63,42,68,70,63,53,54,40,49,23,40,63,44,40,38,39,50,39,7,40,38,31,27,32,37,22,36,4,31,42,33,16,32,30,22,49,50,20,54,30,30,38,44,47,43,46,57,60,46,76,84,40,100,62,98,97,56,49,72,77,94,65,90,74,154,139,122,137,124,124,114,153,134,152,152,152,158,179,181,138,169,119,148,162,124,145,142,156,161,138,139,156,146,153,140,168,174,160,152,178,147,161,170,166,162,152,156,154,160,130,157,120,138,166,149,149,136,145,122,160,161,132,156,155,144,161,163,173,157,169,166,150,170,143,151,127,154,166,146,153,181,161,134,138,128,182,153,143,146,141,123,170,139,128,126,113,88,77,68,78,104,154,170,171,114,14,5,7,6,23,19,18,10,13,38,33,15,28,1,15,25,44,13,19,29,44,10,21,13,24,13,28,51,16,31,28,22,15,17,44,9,14,28,9,37,9,38,24,5,10,20,4,28,18,32,13,19,15,14,28,32,25,28,21,17,5,26,15,41,27,16,12,23,19,27,14,10,28,49,3,42,60,14,4,41,26,29,24,11,28,40,38,21,17,12,32,42,20,20,33,45,30,23,29,30,39,48,26,36,12,18,23,39,24,41,10,28,11,32,30,52,25,11,82,41,48,40,51,34,79,31,54,28,37,31,36,20,69,47,39,67,49,66,67,93,77,89,83,89,72,90,84,70,68,76,79,100,112,106,80,107,95,108,111,91,131,123,131,123,132,137,146,126,135,112,108,155,119,106,146,126,138,127,127,103,96,145,138,134,121,109,126,150,126,149,136,124,147,159,128,128,113,123,100,102,113,135,104,129,144,93,81,96,127,126,124,124,123,114,126,128,148,132,146,138,143,111,141,151,145,153,145,133,142,146,139,130,122,111,110,136,131,99,134,111,121,137,121,132,122,134,112,133,128,143,100,125,142,137,122,135,136,130,117,153,133,137,131,127,99,115,101,125,133,106,119,143,147,135,112,143,129,114,105,122,132,121,134,117,90,114,148,129,140,151,133,139,99,112,134,113,130,132,144,110,116,130,134,146,124,98,120,105,120,121,135,92,135,118,131,142,114,144,121,83,79,106,112,121,86,73,70,86,74,85,76,102,123,76,118,92,58,70,71,81,71,111,117,109,100,77,67,75,78,86,59,96,47,38,97,65,62,100,67,71,40,34,66,42,59,26,68,64,27,32,31,52,45,50,47,60,66,73,62,65,42,77,47,56,37,21,70,67,70,47,55,38,56,47,55,73,44,55,41,65,48,44,40,39,46,54,62,43,14,68,53,78,70,53,63,46,50,51,79,62,31,42,47,68,60,46,73,25,78,38,53,34,42,40,24,48,23,47,21,48,52,55,44,48,66,13,41,46,53,30,61,36,30,47,37,36,79,68,59,49,41,42,47,46,55,44,46,18,8,57,52,28,26,33,39,76,64,26,30,48,35,41,56,33,37,29,51,42,30,54,38,22,44,49,47,25,40,38,23,56,37,43,42,18,46,32,33,37,37,59,20,25,49,18,21,47,40,47,65,46,41,43,50,64,55,66,57,55,63,50,34,37,34,67,63,60,31,65,58,75,71,88,61,77,92,88,84,90,75,84,54,87,106,48,58,58,57,71,41,30,52,36,31,36,43,19,40,13,50,35,26,35,25,41,18,58,19,20,56,36,41,22,21,24,55,40,28,51,25,23,27,22,21,44,47,33,42,41,25,52,68,51,64,70,53,62,37,77,59,51,75,61,51,48,72,99,78,78,127,134,141,127,141,142,124,146,154,158,134,166,156,170,171,175,158,173,158,149,160,164,159,159,175,164,126,161,145,146,158,133,169,174,135,152,138,158,167,172,186,156,165,162,160,160,172,165,149,143,152,156,136,149,163,148,154,158,138,145,161,151,153,154,138,134,147,144,133,150,152,165,189,158,156,152,167,150,159,122,143,166,137,136,131,141,143,142,165,134,147,146,157,143,94,86,93,87,114,122,167,123,20,3,20,23,12,27,1,22,15,20,9,16,1,26,6,17,10,34,32,12,23,19,53,33,27,37,12,19,38,3,41,8,14,9,14,26,6,32,6,25,45,26,7,6,14,15,13,40,31,8,21,7,27,51,20,29,4,35,3,15,16,38,13,18,10,8,41,29,32,9,2,19,16,13,46,14,18,16,19,14,44,7,16,8,20,27,4,10,2,17,28,30,62,25,10,18,35,14,33,17,31,50,39,11,30,7,9,14,33,44,34,54,34,41,13,26,21,36,39,28,52,48,45,54,70,53,65,61,69,42,52,64,83,75,84,71,103,83,91,79,66,82,84,75,68,78,87,71,75,84,69,109,108,116,133,139,109,120,126,129,141,131,123,121,137,112,96,137,120,117,132,139,137,136,103,142,151,138,162,142,166,128,165,137,154,145,141,156,149,133,125,139,119,106,131,133,138,151,133,138,135,107,144,145,142,125,126,113,121,110,106,125,128,115,130,140,106,125,143,146,150,166,149,151,148,140,136,145,137,147,143,153,145,106,122,132,103,132,132,138,128,153,121,120,119,128,113,116,95,119,147,126,121,141,133,129,122,142,162,161,158,159,114,142,131,162,146,150,126,119,117,148,124,133,110,127,131,115,122,140,171,139,156,103,117,124,148,155,126,128,105,118,137,145,137,125,132,116,129,128,123,133,138,110,140,123,147,148,168,146,139,132,134,158,152,144,156,134,138,113,87,100,113,106,117,80,91,95,96,108,97,97,84,80,94,52,95,79,101,112,98,110,88,80,59,88,46,68,102,77,69,71,53,47,44,77,70,33,56,86,45,85,66,55,48,54,65,74,69,43,54,61,43,59,80,59,70,38,62,61,64,67,72,66,43,76,53,43,64,51,48,46,71,63,40,57,67,54,67,61,55,27,28,34,60,60,54,36,53,53,74,53,23,51,54,47,59,43,22,44,38,49,58,70,47,37,47,65,61,15,30,26,52,47,55,62,58,65,45,26,53,30,55,64,34,56,38,59,39,34,28,27,52,36,51,28,22,36,59,51,42,49,45,45,51,45,40,32,27,53,42,31,47,46,3,46,39,44,40,42,55,58,42,42,20,29,42,42,35,66,31,59,53,52,47,39,28,38,22,55,22,48,38,20,41,35,31,32,20,39,37,44,54,34,24,47,40,36,62,32,35,55,57,23,72,30,67,42,58,42,23,21,28,58,49,77,44,73,83,58,92,62,63,91,72,79,80,112,92,86,104,68,65,38,43,66,55,47,50,43,59,33,27,33,14,6,66,52,46,31,37,31,44,26,31,66,28,36,44,14,38,19,45,33,48,24,44,51,31,40,38,30,35,16,54,42,37,29,44,77,53,67,64,58,46,61,63,46,69,35,62,33,94,69,38,107,69,76,94,120,137,141,151,162,148,128,121,148,167,195,150,163,158,180,171,156,159,150,173,154,182,171,163,164,197,165,154,179,182,156,182,153,147,135,118,154,134,187,171,164,153,153,172,171,143,159,165,151,149,158,145,148,167,160,160,150,140,160,156,145,154,151,141,160,135,129,131,161,157,143,184,150,159,149,159,156,158,163,147,171,158,171,135,147,131,146,112,156,146,135,174,150,132,112,103,81,98,113,139,128,5,5,0,15,1,21,7,22,17,11,15,45,17,14,53,19,15,30,18,6,39,14,16,39,34,29,5,19,47,33,19,5,14,29,27,8,25,22,34,27,29,25,12,14,8,15,31,20,29,7,18,15,2,33,36,19,21,24,9,20,8,13,18,11,8,11,30,14,20,36,44,42,11,11,12,36,28,3,29,4,36,39,36,22,42,25,30,15,17,8,28,36,27,40,31,41,18,44,28,28,26,35,41,20,23,64,40,36,10,36,40,43,26,19,33,40,29,40,45,61,48,33,41,37,76,57,75,83,122,102,70,102,113,91,100,120,56,91,78,96,59,54,104,67,131,67,92,108,96,112,91,123,128,115,149,129,124,127,130,129,106,121,117,119,98,130,134,105,117,117,132,140,143,131,130,146,151,149,139,147,156,138,97,109,133,132,141,129,133,119,131,127,130,97,121,121,116,133,134,123,136,150,159,156,116,178,166,159,130,150,118,148,157,161,130,144,151,152,154,169,153,161,130,137,124,154,132,146,137,163,166,142,155,145,165,144,108,122,126,132,120,109,134,143,162,96,134,122,125,102,151,147,120,162,131,137,142,160,165,122,153,147,119,136,117,148,163,139,126,146,140,156,141,148,118,134,126,121,126,159,120,102,123,147,101,108,150,140,139,138,107,109,142,125,118,121,112,107,122,122,136,118,108,122,124,142,128,125,111,137,132,93,116,119,108,122,108,90,114,89,112,86,89,106,92,86,97,93,92,87,79,80,62,78,60,65,69,82,72,72,79,75,67,49,53,39,54,29,46,66,49,57,41,63,34,22,66,34,54,51,50,64,66,47,64,61,62,54,65,90,52,76,67,50,39,57,57,37,53,57,50,69,67,42,67,53,40,45,63,64,55,40,45,34,25,55,50,35,46,50,27,47,50,43,54,37,42,50,69,60,46,49,48,44,39,28,47,42,53,32,29,31,23,56,47,15,71,28,42,44,46,46,30,49,36,45,39,62,29,48,34,55,58,20,41,37,51,52,46,61,40,20,47,49,39,38,37,32,53,45,39,27,51,41,37,47,49,31,42,31,39,20,65,45,47,46,51,49,52,46,21,39,44,38,54,38,34,44,26,35,46,27,49,36,32,25,27,37,35,25,57,56,35,42,33,48,44,21,49,19,24,33,59,47,51,62,70,57,45,23,37,45,67,36,27,46,20,36,44,57,43,43,43,61,47,79,49,64,49,65,56,55,78,72,88,98,78,74,57,59,79,53,60,53,52,64,81,84,29,43,44,28,48,32,18,46,50,40,22,43,48,44,9,38,43,33,24,50,21,60,35,54,34,56,30,31,40,56,37,29,35,46,49,58,48,50,51,43,29,46,55,72,65,49,56,70,58,46,73,39,56,38,78,56,72,112,97,65,66,101,128,112,132,157,153,113,135,141,175,149,137,137,132,161,177,198,188,184,175,191,158,171,166,166,212,156,139,150,141,168,161,150,162,147,157,159,169,166,158,121,156,174,151,171,162,166,150,152,133,163,146,142,142,154,125,152,145,159,157,154,137,161,148,150,162,143,142,148,141,140,134,138,142,143,152,132,148,153,138,146,156,149,153,148,141,131,142,138,124,177,158,165,145,129,119,97,80,77,111,100,24,0,14,9,26,6,23,16,0,21,28,17,9,14,54,41,22,3,18,21,32,30,18,30,11,42,20,43,16,15,8,24,14,25,41,9,5,27,25,45,28,32,16,0,35,12,12,22,10,19,41,10,37,0,9,18,21,29,37,23,16,13,32,44,17,6,24,25,34,19,37,7,38,33,11,6,5,30,17,19,7,37,21,13,54,12,4,25,26,0,18,30,31,24,38,25,0,27,38,20,21,51,30,40,40,34,34,7,41,17,20,12,28,46,45,29,52,55,40,42,55,70,24,42,49,82,72,61,70,51,112,50,62,64,71,67,82,66,62,53,33,59,70,111,95,81,99,102,111,146,157,143,130,133,145,144,148,132,138,123,126,154,129,127,110,110,123,147,158,132,125,128,147,132,124,163,129,104,113,138,120,88,134,117,108,123,140,117,130,142,120,121,141,136,136,88,154,131,126,127,140,120,111,120,110,151,137,117,153,166,134,143,140,111,106,131,130,150,147,141,142,160,169,151,142,155,136,127,137,130,123,136,126,150,98,148,148,137,127,136,125,135,154,117,148,124,136,138,163,129,125,139,137,137,147,139,140,141,115,143,117,148,151,146,136,147,143,134,156,156,167,155,161,131,169,134,157,169,145,131,136,125,152,125,156,140,138,138,147,148,160,131,121,87,92,85,115,94,117,126,112,93,75,92,115,101,100,96,81,68,114,142,75,94,105,91,105,77,78,70,89,70,88,70,73,79,85,94,95,104,75,80,42,40,78,82,93,71,78,78,63,73,66,60,64,59,32,64,62,58,53,46,69,53,66,52,44,70,36,41,47,80,64,71,64,62,58,57,66,68,67,56,38,61,63,55,51,63,69,42,63,53,58,67,72,42,56,32,46,61,50,40,55,92,41,55,48,54,43,42,50,53,73,45,63,61,26,45,48,70,46,66,48,34,53,59,66,44,50,55,69,26,40,45,36,43,51,61,59,23,36,52,67,28,53,42,51,46,59,40,31,37,60,49,24,54,49,41,63,37,46,39,58,27,58,30,55,44,57,48,40,21,36,67,33,40,45,43,68,50,32,59,42,41,40,45,53,28,16,25,61,54,19,37,41,28,32,40,39,58,31,32,60,68,59,42,29,45,42,40,59,75,42,65,29,36,35,43,51,57,31,36,61,46,44,51,79,52,29,37,79,68,56,65,39,72,56,77,67,73,40,54,64,45,69,67,60,55,43,24,54,80,96,78,95,72,45,63,41,60,57,40,41,76,55,65,69,57,61,39,76,57,29,46,56,19,43,28,42,37,28,43,30,56,23,33,13,56,26,41,59,39,37,46,54,16,47,41,25,36,36,49,28,53,60,29,51,43,44,52,60,44,49,50,32,56,67,72,68,42,61,42,67,44,71,52,60,48,72,113,121,104,109,141,131,150,135,137,170,129,143,141,120,154,167,164,162,165,127,153,138,151,166,141,151,146,158,156,188,155,169,170,178,166,142,169,164,163,129,180,152,165,150,136,154,141,139,153,170,175,176,156,166,174,162,157,178,138,158,172,135,167,170,174,169,149,171,158,163,170,139,165,143,140,162,160,136,122,157,154,162,135,150,142,163,151,153,120,113,116,135,137,159,146,140,126,91,59,88,80,8,7,41,9,31,13,5,4,15,31,10,8,18,2,23,10,45,12,19,26,4,7,34,52,12,14,59,30,22,8,22,13,19,37,25,32,11,18,12,28,19,23,33,24,26,18,24,16,28,14,13,2,19,22,52,24,28,19,8,40,25,11,40,17,24,21,16,44,19,27,6,22,35,43,16,3,18,11,20,32,47,20,10,21,4,40,7,28,4,52,44,22,19,19,23,15,33,52,35,62,44,25,18,18,39,31,9,14,39,30,27,39,48,34,24,45,28,56,41,61,56,43,54,63,72,68,61,54,74,73,29,39,57,46,52,31,74,73,45,28,24,64,28,49,48,89,79,95,125,128,146,135,136,136,140,152,181,165,142,151,153,165,150,159,149,126,148,152,132,120,151,131,128,143,120,134,125,114,140,153,119,117,126,133,138,143,143,119,130,132,151,152,130,149,128,144,156,147,133,124,110,118,121,140,140,122,122,134,123,140,137,154,133,100,125,119,121,98,95,139,136,131,150,148,143,147,152,173,148,136,144,138,137,140,145,151,152,155,143,122,150,124,156,139,151,128,121,141,139,142,123,148,115,107,126,114,123,150,139,150,135,169,146,138,129,120,117,149,135,122,151,145,139,111,132,132,114,153,142,139,146,139,140,151,127,112,157,164,143,136,112,127,121,119,142,118,115,100,121,137,105,106,119,113,122,97,120,107,94,75,92,97,89,94,107,110,86,104,97,83,110,107,108,118,63,85,89,72,101,75,117,112,80,63,50,56,65,63,70,67,82,66,70,90,89,72,37,83,42,82,70,86,70,79,70,48,64,48,65,85,53,54,38,43,80,54,68,58,34,40,53,48,55,53,55,49,69,42,61,49,49,52,53,48,61,39,57,71,69,52,74,56,23,51,68,41,61,72,69,50,30,40,33,65,38,42,51,56,40,39,53,58,46,36,51,23,63,50,39,52,60,48,47,44,34,36,54,31,22,37,55,60,56,44,63,46,56,30,50,36,34,11,59,40,68,32,45,58,23,26,41,36,51,30,27,46,44,47,43,37,34,53,63,11,42,34,43,61,34,34,43,36,55,43,39,63,38,57,47,72,39,36,53,43,28,45,47,43,22,33,15,45,48,41,41,32,32,57,36,34,62,42,48,42,24,61,58,31,42,45,57,32,25,54,36,49,63,63,66,41,29,39,38,40,36,63,47,67,57,38,57,27,61,52,62,33,12,50,53,69,59,41,57,79,86,69,66,29,52,37,83,74,79,48,47,45,35,43,73,48,38,43,47,47,53,39,20,14,26,29,46,41,20,49,40,20,21,12,55,62,53,47,38,21,26,55,27,44,53,52,36,53,44,33,61,52,51,48,48,44,37,74,39,41,86,36,47,63,43,57,92,47,94,64,90,98,77,66,94,119,139,111,126,139,128,120,140,144,162,182,159,139,120,137,169,149,141,129,146,137,136,139,146,176,161,129,129,148,164,157,127,171,140,152,152,146,123,148,148,137,152,131,161,161,134,155,138,131,174,172,144,155,159,152,163,141,154,162,159,157,155,147,122,143,167,120,135,160,158,162,150,143,128,172,156,130,139,155,167,141,153,159,162,137,118,164,146,170,137,157,161,181,134,126,162,136,131,76,70,74,33,18,20,8,9,29,2,17,12,5,15,26,6,32,25,8,18,18,16,25,31,43,26,5,24,15,39,8,8,43,21,40,28,16,30,26,39,46,26,2,24,7,9,16,8,47,23,22,22,32,17,53,20,9,5,6,39,20,30,30,13,46,28,23,12,20,21,15,3,34,17,32,26,27,21,19,31,12,34,22,33,27,16,4,6,36,43,14,40,21,4,30,38,35,12,49,56,42,48,40,22,35,10,46,49,23,32,21,61,24,41,41,45,35,42,52,54,33,38,47,56,47,42,53,63,40,27,32,29,64,26,31,43,31,52,53,71,68,52,74,45,65,45,54,79,83,104,110,116,77,123,119,107,122,149,137,157,149,144,145,120,137,119,120,138,122,124,148,125,134,142,141,124,120,123,102,114,141,123,125,115,124,163,160,122,131,99,134,140,120,152,140,144,132,131,152,141,148,133,130,157,152,129,121,109,131,137,133,130,126,123,124,120,128,103,130,124,109,120,109,121,115,148,150,159,131,143,142,163,160,163,127,140,154,142,148,120,151,133,149,156,131,144,127,139,122,152,138,136,136,119,135,124,140,150,137,144,127,135,152,127,127,149,125,122,134,112,114,107,124,145,145,139,118,112,113,119,88,117,136,105,120,99,80,102,91,84,80,103,94,55,76,107,118,140,134,151,94,120,125,124,150,166,147,134,117,100,101,100,90,104,119,103,98,83,97,108,120,106,110,121,133,130,109,121,108,88,44,46,60,89,93,94,89,76,52,62,69,69,66,88,63,80,64,104,95,84,71,64,76,67,28,52,75,58,63,84,110,80,65,44,63,46,72,53,55,65,81,54,47,60,50,45,54,64,67,62,74,36,54,60,49,43,37,52,47,81,58,43,51,73,56,41,76,48,51,55,67,44,65,58,56,46,59,71,43,72,61,77,67,79,64,39,34,40,69,57,63,59,48,27,47,65,28,41,23,83,44,54,44,71,63,51,47,58,14,60,38,41,43,33,36,45,39,48,22,47,16,28,40,40,43,41,31,37,60,50,11,23,35,55,52,52,57,51,39,51,16,49,56,32,40,52,49,43,48,72,36,56,63,29,25,48,64,52,44,43,42,63,42,31,63,50,59,41,55,49,18,33,35,29,55,57,66,77,40,44,23,62,52,51,52,66,59,60,44,55,64,42,63,31,53,30,44,29,46,62,24,69,68,60,40,45,17,29,57,61,33,42,30,57,59,41,61,37,43,68,43,53,74,53,61,40,74,63,54,57,63,68,47,26,29,56,29,41,59,57,35,29,59,46,41,42,57,36,23,20,50,40,29,27,13,37,19,35,47,54,52,42,37,30,29,51,51,15,63,63,18,37,43,33,60,67,22,54,80,36,52,49,70,87,75,81,46,92,78,108,111,130,137,129,143,135,125,157,141,154,134,167,168,191,157,153,158,161,172,184,157,155,133,139,149,141,142,146,162,147,125,162,172,175,163,161,156,142,154,141,145,165,144,147,135,191,160,182,191,161,152,168,160,163,199,177,163,142,195,153,153,144,167,146,150,166,184,150,170,157,170,155,147,177,163,145,164,151,138,158,145,159,140,167,166,127,145,162,133,146,134,154,146,122,154,139,158,143,145,143,130,117,69,23,16,12,21,26,12,15,14,8,25,12,29,36,26,32,26,35,18,27,25,8,28,3,14,38,50,9,9,29,37,31,34,19,8,22,17,46,40,5,37,17,55,31,21,40,27,20,16,30,35,4,16,37,18,14,27,16,22,40,35,20,36,8,20,20,13,31,20,16,17,44,40,47,12,16,22,11,33,21,34,25,34,19,35,33,11,23,26,38,29,24,23,23,59,29,27,24,62,22,55,42,38,58,55,31,41,54,68,66,37,56,63,29,46,38,64,80,55,82,77,80,70,109,118,119,130,122,124,139,122,99,123,104,118,103,98,85,62,86,91,85,105,114,114,100,109,103,87,56,67,101,106,117,99,92,123,114,101,87,109,77,108,132,128,154,120,157,163,123,141,146,133,164,147,156,116,136,135,140,164,151,145,131,154,145,150,135,128,119,140,136,112,113,151,137,151,155,158,176,135,126,136,135,136,150,157,122,127,144,153,143,155,140,151,175,177,167,151,162,165,166,153,138,141,147,161,164,157,145,155,141,116,131,103,128,131,113,128,133,117,128,134,138,133,159,122,121,123,140,148,143,154,149,136,140,156,143,162,152,133,128,104,113,159,144,165,128,143,142,141,139,121,136,131,156,138,153,150,155,175,163,136,125,133,149,174,130,118,152,145,123,131,132,129,119,130,125,138,127,137,127,92,116,129,106,140,126,112,118,151,124,123,130,142,103,140,131,120,142,125,134,116,110,96,90,116,106,151,152,113,105,92,57,96,99,69,93,102,97,108,98,120,116,93,84,67,77,62,80,71,72,72,85,62,35,81,62,58,69,74,99,99,58,71,62,59,65,56,102,76,72,67,73,42,76,78,45,43,56,51,42,60,70,63,38,51,53,60,56,56,41,35,66,49,49,77,47,31,61,61,31,59,43,75,57,38,58,72,50,50,51,83,44,56,64,47,56,49,64,68,45,72,47,58,56,19,34,59,79,72,25,18,34,40,42,38,52,51,52,56,37,56,55,27,53,51,29,56,15,54,20,30,49,45,37,51,41,51,34,49,50,45,51,61,45,48,22,64,15,49,27,27,56,62,42,59,42,55,52,58,40,36,23,52,36,50,71,43,52,69,55,44,68,41,46,44,53,73,55,29,54,35,26,69,37,45,39,38,50,58,34,61,39,48,30,44,40,30,50,54,52,53,45,39,22,50,41,54,49,61,60,62,33,49,25,33,52,46,44,48,51,71,61,51,53,40,33,35,49,49,38,45,59,67,54,58,72,53,63,39,62,52,30,64,63,50,48,44,54,45,47,41,61,49,45,51,41,43,68,21,26,16,39,37,34,53,39,61,48,34,56,61,70,44,66,55,56,50,49,50,61,31,43,66,52,57,72,65,48,74,49,40,74,80,89,97,100,122,113,105,81,134,126,142,144,151,163,174,187,169,150,179,166,160,133,154,168,152,173,172,183,146,153,161,163,141,157,160,175,165,178,156,160,150,147,130,132,152,168,138,140,146,155,142,158,143,172,172,135,148,158,133,171,146,169,138,136,161,152,132,146,153,140,140,133,154,157,147,138,151,132,135,149,160,147,165,143,142,138,171,128,118,119,175,133,127,125,105,132,136,130,123,163,145,162,151,132,125,133,101,21,9,19,3,16,17,26,1,24,6,34,17,32,14,41,21,30,14,51,13,28,4,12,18,24,25,34,22,16,19,22,28,41,27,39,42,30,24,4,24,14,15,23,10,6,47,21,19,38,37,38,6,19,35,31,35,11,6,37,23,27,38,13,36,10,14,21,38,38,9,9,11,28,22,30,49,15,52,53,28,22,9,66,27,4,27,21,25,16,32,22,9,40,34,47,38,27,31,46,37,36,39,34,68,41,35,60,49,55,58,25,57,47,61,46,62,99,98,115,73,83,61,84,116,116,113,127,129,113,139,133,128,90,128,86,72,93,66,101,93,98,99,111,113,105,91,100,98,98,72,94,92,100,114,100,103,97,90,99,109,141,115,155,144,146,144,142,117,103,118,151,157,145,125,146,187,117,143,135,145,137,118,137,146,154,161,141,146,142,120,127,130,138,126,134,138,121,134,161,125,128,131,143,123,120,110,146,142,136,153,170,134,142,166,165,157,133,130,160,169,154,136,148,136,148,160,165,148,142,160,160,143,130,143,110,121,110,136,157,109,116,115,142,137,144,137,115,121,140,135,121,123,141,131,142,152,152,163,149,130,131,140,137,147,143,175,169,155,141,144,145,106,113,131,141,153,150,147,153,147,137,148,122,122,147,169,147,163,131,151,158,126,126,138,113,120,124,130,121,129,137,112,133,127,121,113,132,102,126,120,121,138,125,120,127,117,115,122,123,133,130,113,128,99,106,117,115,111,147,131,113,73,62,80,91,100,102,92,112,110,67,95,105,81,104,55,61,65,67,75,50,89,72,79,64,58,87,56,59,76,52,92,102,104,95,61,79,72,83,74,94,80,79,74,92,84,35,69,47,54,42,66,53,39,57,57,38,51,60,27,41,51,51,52,65,38,42,35,29,51,40,48,51,21,64,54,43,33,38,48,52,31,59,67,30,47,49,69,58,57,35,49,62,29,77,44,64,72,42,32,44,51,56,53,46,67,37,48,48,55,56,57,60,36,32,55,48,22,39,39,57,40,28,11,32,38,29,57,64,30,30,30,51,60,62,57,43,67,45,63,40,23,26,49,40,55,31,27,44,50,51,54,47,33,54,47,58,42,37,25,30,45,52,53,39,52,53,60,53,31,40,72,34,52,68,53,24,49,34,47,45,52,55,40,35,32,51,54,57,21,48,53,69,64,26,48,53,64,17,41,28,42,32,50,54,47,55,47,31,33,38,19,58,64,44,71,55,68,56,41,66,50,39,65,56,61,49,32,55,44,68,52,43,40,53,54,24,37,72,51,23,28,74,50,34,64,39,39,42,27,56,47,48,40,38,45,73,14,53,23,61,76,58,37,42,43,45,45,40,42,67,35,47,87,53,44,52,78,36,71,67,82,55,67,104,86,85,122,118,87,75,103,124,142,131,145,153,168,160,171,168,179,193,167,166,163,163,169,170,150,163,172,182,157,161,168,163,164,174,183,160,172,143,155,142,140,149,165,162,160,174,156,180,147,169,143,164,145,159,138,131,147,158,166,147,155,147,131,164,152,136,142,125,119,126,160,146,160,167,125,139,133,145,146,155,166,143,130,142,155,165,123,127,130,130,139,149,130,122,121,126,139,147,114,169,114,125,137,138,109,15,4,21,15,0,15,13,30,7,35,19,0,47,27,16,12,17,42,33,19,4,37,20,26,16,26,39,30,22,30,19,27,25,23,28,32,23,21,30,21,27,54,8,29,12,27,7,41,17,37,31,37,18,14,38,23,32,42,1,28,48,20,20,39,54,24,23,29,41,23,25,12,45,41,51,28,27,27,18,37,24,14,18,23,38,37,21,11,36,40,18,28,33,24,17,17,46,42,25,26,37,29,21,54,53,48,67,42,57,67,55,60,62,83,62,83,56,86,111,70,108,108,110,135,77,122,118,87,120,100,121,136,108,106,109,103,118,103,132,116,115,130,114,143,112,121,123,138,149,142,124,128,136,119,137,123,152,124,148,169,156,135,139,153,158,150,149,139,180,137,144,152,154,130,146,131,137,143,133,141,124,140,157,129,132,138,163,136,159,109,132,163,137,143,171,136,116,115,134,155,140,152,115,122,138,154,157,164,144,151,143,123,144,125,131,148,114,129,113,141,135,102,149,154,159,162,152,86,135,134,149,135,162,139,136,156,141,124,163,141,150,108,157,152,162,162,134,144,144,131,151,149,170,151,144,149,142,162,149,145,155,140,139,152,145,117,126,131,131,141,139,151,147,105,145,122,125,114,119,113,125,143,155,150,146,142,111,105,137,152,97,13,13,5,23,10,25,16,2,0,7,24,49,31,16,54,45,0,30,12,13,10,14,7,30,24,19,24,38,22,12,2,16,39,20,22,24,7,27,7,32,28,20,29,13,16,22,28,5,2,43,17,8,28,17,13,7,5,12,15,30,15,32,30,5,23,24,40,5,40,0,7,5,5,21,34,7,18,25,20,28,41,53,18,14,30,13,12,14,2,11,9,31,15,1,26,21,11,24,20,23,21,20,5,10,8,12,21,18,20,33,50,23,7,15,28,26,18,32,16,33,8,21,25,13,5,25,33,27,5,3,10,34,8,37,13,26,14,19,23,12,4,14,20,10,21,4,19,1,29,8,26,30,26,28,36,29,26,9,9,15,21,17,18,25,24,44,2,27,17,26,9,36,23,33,4,7,11,6,8,21,28,27,11,27,15,17,31,41,9,38,20,24,31,25,14,16,12,9,16,16,16,23,21,30,22,8,28,29,10,1,17,11,20,16,6,21,18,11,35,3,11,5,9,20,19,16,25,7,34,24,9,32,22,17,13,16,27,12,19,24,34,18,0,13,30,5,8,10,26,10,26,19,17,12,12,38,20,2,13,12,35,16,29,3,42,35,15,40,15,11,5,19,17,8,45,39,20,10,40,12,17,27,6,13,42,9,17,7,18,22,14,14,20,18,22,1,5,15,24,18,33,40,33,12,11,11,11,36,8,45,19,29,40,2,20,9,6,28,9,15,14,25,29,13,25,30,32,19,16,8,24,21,45,30,27,45,15,28,29,5,29,18,11,20,9,32,14,18,47,2,10,17,15,19,29,5,13,14,17,35,21,18,5,16,12,11,5,6,29,30,21,28,40,10,31,3,13,38,14,2,21,22,15,29,4,9,6,3,16,23,13,33,13,28,22,17,22,19,20,29,43,8,3,20,17,15,21,21,21,13,33,6,2,12,2,21,16,19,31,21,14,19,21,31,26,25,37,3,15,17,14,20,27,29,25,2,16,8,14,22,12,36,50,33,38,11,26,28,20,23,23,15,34,21,26,39,29,50,34,24,18,34,25,30,15,16,39,14,17,3,24,21,41,19,55,41,9,13,28,22,19,28,24,5,29,51,19,14,35,16,27,28,0,39,35,36,34,39,35,35,31,41,35,40,46,47,13,25,26,13,50,32,23,48,37,34,34,34,25,11,38,28,27,41,27,36,47,25,43,16,9,30,49,41,54,74,47,58,64,52,66,41,57,52,39,72,106,88,87,97,118,119,129,82,102,112,89,103,129,97,132,118,108,126,103,111,91,100,128,123,135,115,136,112,130,99,139,157,120,144,125,131,117,153,138,101,155,129,147,162,160,136,139,149,160,134,132,169,142,160,112,164,133,120,121,139,137,146,160,136,110,157,147,160,143,119,140,157,157,132,151,155,146,134,120,132,114,134,144,166,166,174,129,139,144,165,125,119,159,134,120,116,132,140,144,116,123,116,127,128,105,144,143,153,155,144,149,127,139,129,150,123,169,138,151,137,128,126,169,136,152,136,158,146,164,149,148,147,140,139,165,128,131,132,154,125,115,166,137,143,141,120,145,147,145,136,118,108,130,171,132,126,135,93,107,115,157,147,132,112,121,124,123,128,118,118,108,110,147,103,17,0,3,9,15,36,5,37,5,26,35,17,37,15,1,27,21,33,1,16,7,1,22,21,10,7,12,6,0,22,9,12,13,18,6,19,27,17,19,24,16,22,9,32,18,7,29,27,18,18,1,17,47,9,25,13,22,21,13,10,13,15,23,21,23,10,24,18,34,8,14,24,33,6,10,0,33,1,13,37,2,12,37,9,35,34,5,21,37,10,28,13,12,43,15,23,5,31,10,18,13,18,25,26,45,4,20,10,20,11,16,7,14,10,25,16,17,3,9,9,10,9,12,0,8,6,11,20,20,17,17,3,23,12,22,11,7,17,10,7,16,22,14,34,25,16,14,18,6,23,3,16,31,16,11,21,37,12,12,2,19,28,30,19,12,25,18,35,14,29,26,23,8,39,16,20,2,31,14,42,16,8,29,22,18,5,5,15,26,16,1,18,20,27,23,5,16,17,21,17,24,28,33,10,25,16,9,19,23,27,19,28,33,18,30,21,12,16,12,19,7,29,20,12,10,4,32,19,21,43,18,32,28,22,31,29,13,12,16,36,19,13,30,32,28,22,24,14,22,38,26,2,31,16,13,16,11,24,15,12,34,8,26,10,33,13,5,8,8,6,27,19,12,7,27,17,48,24,10,1,5,17,25,11,6,38,13,15,20,14,21,14,23,26,22,0,23,36,20,32,16,8,12,19,8,11,24,1,13,12,26,29,23,2,21,8,31,20,25,3,19,35,6,13,5,16,26,19,3,14,20,16,3,20,10,42,15,18,13,27,19,3,48,4,8,19,14,22,26,11,27,18,12,9,13,28,37,1,14,23,16,23,21,17,30,13,26,20,15,23,21,9,29,37,16,17,24,21,25,4,8,6,20,34,12,17,10,19,0,23,26,0,14,24,4,15,31,4,3,8,35,30,22,1,13,29,21,15,8,42,6,14,13,41,2,17,43,17,7,7,30,4,17,38,20,8,8,15,35,19,11,13,24,31,16,23,36,26,8,29,27, \ No newline at end of file diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/estimation_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/estimation_16x16.txt new file mode 100644 index 0000000000..7216dbcb17 --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/estimation_16x16.txt @@ -0,0 +1,2 @@ +30,45 +12,8;12,8;12,8;12,8;12,8;12,9;12,9;12,9;12,9;12,9;12,9;11,10;11,10;11,10;11,10;11,11;11,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,11;9,4;10,12;11,12;11,12;11,12;11,5;10,11;10,-1;12,8;12,8;12,8;12,8;12,8;12,9;12,9;12,9;12,9;12,9;12,10;11,10;11,10;11,10;11,10;11,11;11,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,11;11,13;11,11;11,12;11,12;11,12;10,9;10,11;10,-1;11,8;12,8;12,8;12,8;12,8;12,9;12,9;12,9;12,9;11,9;11,10;11,10;11,10;11,10;11,11;11,11;11,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;12,12;11,12;11,12;11,12;11,12;11,11;10,11;10,11;10,-1;12,8;12,8;12,8;12,8;12,9;11,9;11,9;11,9;11,9;11,9;11,10;11,10;11,10;11,11;11,11;11,11;11,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,14;10,13;10,8;10,11;10,0;12,8;12,8;12,8;12,8;11,9;11,9;11,9;11,9;11,9;11,10;11,10;11,10;12,11;11,11;11,11;11,11;12,11;12,12;12,12;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,13;9,13;10,24;10,12;10,11;10,0;11,8;11,8;12,8;12,9;11,9;10,9;11,9;11,9;11,10;11,10;11,10;11,10;11,11;11,11;9,10;11,11;12,12;13,12;12,12;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,13;10,9;10,7;9,11;9,0;11,8;11,9;10,9;11,9;11,9;10,9;10,9;10,9;11,10;11,10;11,10;11,11;11,11;11,11;11,11;11,12;11,12;11,12;11,12;11,12;11,12;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,10;9,11;9,11;9,0;11,9;11,9;10,9;10,9;10,9;10,9;9,9;10,10;10,10;11,10;11,11;11,11;11,11;11,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,11;9,9;9,10;9,11;9,0;11,9;11,9;11,9;10,9;10,9;9,9;7,9;9,10;10,10;11,10;11,11;11,11;12,12;12,12;11,12;11,12;12,12;19,14;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;10,12;8,-2;8,12;8,12;8,0;11,9;11,9;11,9;11,9;10,9;10,9;10,10;10,10;10,10;11,11;11,11;11,13;12,12;13,12;10,12;3,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;9,11;9,25;8,12;7,11;7,0;11,9;11,9;11,9;11,9;11,9;10,9;10,10;10,10;11,10;11,11;11,11;11,11;12,11;11,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;9,12;8,15;7,13;6,11;5,0;11,9;11,9;11,9;11,9;11,9;11,10;11,10;11,10;11,10;11,11;11,11;12,11;12,10;12,11;12,11;12,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;9,13;9,14;8,14;6,11;4,11;3,0;12,9;12,9;12,9;12,9;12,10;11,10;11,10;11,10;11,10;11,10;12,11;12,11;12,11;12,11;12,11;12,11;12,12;12,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;9,12;9,14;9,15;8,16;5,15;1,12;0,0;12,9;12,9;12,9;12,9;12,10;12,10;12,10;11,10;11,10;12,10;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,13;9,13;9,14;8,16;7,17;3,16;-2,12;-5,7;12,10;12,10;13,9;14,10;13,10;12,10;12,10;12,10;12,10;12,10;12,11;13,11;13,11;13,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,13;10,13;9,14;9,15;7,16;6,18;0,18;-8,12;-19,3;12,10;12,10;13,10;13,10;13,10;12,10;12,10;12,10;12,10;12,10;13,11;13,11;14,11;13,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,13;10,13;9,14;9,15;7,16;4,17;-1,17;-13,12;-30,-2;12,11;12,10;11,10;16,10;14,10;12,10;12,10;12,10;12,10;12,10;13,10;14,10;18,10;13,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,14;10,14;10,16;9,17;5,18;0,19;-12,14;-39,-11;12,11;12,11;12,10;12,10;12,10;12,10;12,11;12,11;12,11;12,11;12,10;12,10;12,10;12,10;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;11,13;10,14;9,15;8,16;7,18;3,19;-4,17;-13,6;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;11,13;10,13;9,14;8,15;7,17;5,17;1,15;-3,2;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;10,13;10,13;11,14;9,14;9,16;8,16;5,13;0,-2;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;10,13;9,13;9,14;9,14;8,13;3,-5;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;12,11;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,12;10,12;10,13;10,13;9,13;9,13;9,12;4,-7;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;10,12;10,12;10,13;10,13;10,13;10,13;10,13;10,12;6,3;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,13;10,13;10,13;10,12;10,12;10,13;10,13;11,14;10,14;8,12;2,-1;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;10,13;10,13;11,13;10,13;10,13;10,13;10,13;10,13;9,13;8,13;6,12;0,-6;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;10,13;10,13;10,13;10,13;10,13;10,13;10,13;10,14;11,14;10,14;10,14;8,14;5,13;-1,-10;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;10,13;11,13;11,13;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,12;11,13;11,13;11,13;11,14;10,14;9,13;9,13;9,13;9,14;9,14;9,14;9,14;9,14;9,14;7,13;5,12;2,0;10,13;10,12;9,12;9,12;10,12;11,12;11,13;10,13;10,12;10,12;10,12;11,12;11,12;11,13;10,13;10,13;10,13;10,13;10,13;11,13;11,12;11,13;11,13;11,13;11,12;11,12;11,12;11,13;11,13;10,14;10,14;9,15;9,15;10,15;8,13;8,13;8,15;7,14;6,13;7,13;8,14;8,14;5,13;4,12;2,-1;10,13;7,12;7,12;7,12;8,12;8,11;8,12;8,13;8,13;9,12;9,11;9,12;9,12;9,12;9,12;9,12;7,13;7,13;8,13;8,12;8,12;8,13;9,13;10,13;11,13;11,13;10,13;10,13;10,14;10,15;10,16;9,17;7,18;6,16;5,12;8,15;8,15;1,13;1,10;4,10;6,12;7,15;4,14;3,12;2,1;-1,13;5,12;6,12;4,12;6,11;5,9;2,5;3,6;0,1;6,9;6,10;3,10;6,11;4,11;3,10;3,10;1,8;-7,3;-6,-1;3,8;4,10;2,12;-11,13;1,13;-4,13;-6,13;-6,13;0,13;-3,14;0,15;0,15;1,16;-11,13;1,14;3,13;-2,5;-8,2;-17,-8;-3,0;0,6;1,10;2,11;-1,10;2,11;1,0; \ No newline at end of file diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/exhaust_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/exhaust_16x16.txt new file mode 100644 index 0000000000..719c3f04b3 --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/exhaust_16x16.txt @@ -0,0 +1,2 @@ +30,45 +6,4;12,14;6,16;24,9;30,2;7,11;11,12;13,10;12,12;30,11;9,13;10,11;4,11;1,-7;7,-13;9,-32;1,-12;22,9;29,5;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;28,-9;12,12;12,12;12,12;12,12;10,5;12,12;0,-1;9,29;14,3;4,31;11,24;-10,7;5,23;-15,-32;13,6;13,6;0,6;27,3;10,9;12,11;14,3;2,-19;-4,14;16,-13;12,12;17,10;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;6,18;10,17;9,12;12,12;12,12;12,12;3,9;26,12;-16,-1;8,31;11,-10;17,-17;16,-22;13,14;10,18;12,12;12,11;20,-23;3,9;7,4;12,13;10,13;12,12;15,24;11,-6;12,12;12,12;11,11;12,11;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;14,9;12,12;12,12;12,12;12,12;24,17;4,11;17,12;-32,-1;12,8;13,11;13,-4;13,25;14,-26;12,12;20,-8;12,12;12,12;13,12;13,12;12,12;12,12;14,11;11,13;3,8;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;14,16;5,6;5,12;-32,-1;12,11;13,14;12,13;12,12;8,-2;12,20;11,13;12,14;12,12;12,12;10,7;11,-10;13,14;11,13;12,12;-16,-10;12,12;12,13;9,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;16,25;12,12;18,12;-32,-1;12,13;12,12;16,14;16,30;11,7;10,-4;11,15;12,12;9,22;13,15;11,-12;12,12;12,12;12,6;4,-22;12,12;12,12;15,24;12,9;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;15,9;19,6;27,12;-32,-1;11,5;13,26;6,-20;13,8;13,-2;8,-14;12,22;10,1;11,-14;13,24;15,10;14,17;12,-8;12,12;11,-9;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;5,12;-32,-1;12,12;9,-8;12,12;30,1;25,-3;12,12;12,17;8,14;8,24;8,-2;7,20;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;8,16;8,10;12,12;-32,-1;9,11;5,6;14,1;-4,27;12,12;19,7;-18,-32;22,-20;11,20;11,16;10,20;12,12;12,12;12,12;12,12;12,12;12,12;31,-22;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;7,-4;12,12;13,12;-32,-1;12,12;12,12;11,4;12,12;12,20;12,7;12,12;10,20;13,8;19,-1;12,11;24,19;13,8;14,6;11,9;2,16;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;11,11;14,26;17,12;12,12;-32,-1;20,18;10,10;15,-17;14,-2;12,12;11,26;13,27;10,12;13,14;23,9;8,20;17,0;12,12;12,11;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;10,13;12,1;13,12;-32,-1;16,11;12,12;9,13;22,0;-32,-5;9,8;15,6;-8,-2;12,12;10,-21;12,3;12,12;8,3;12,12;12,12;12,23;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;11,11;7,12;31,-1;8,16;11,7;11,18;11,9;12,3;13,-4;5,24;8,17;12,12;12,14;12,12;-9,14;7,-2;12,16;15,6;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,14;11,12;21,-1;13,20;27,-9;11,6;14,8;7,22;19,7;4,1;-24,21;12,19;11,18;9,-24;14,-20;12,7;13,-28;10,15;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;13,12;24,-32;14,5;9,18;25,-26;24,-27;12,11;10,16;7,26;12,12;17,17;13,4;6,18;12,20;30,3;16,17;12,16;12,12;12,-21;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;6,-32;12,12;12,12;-16,11;12,12;10,11;25,-9;27,5;22,28;8,13;13,3;12,30;-5,-8;18,-31;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-9,-32;13,12;12,12;12,3;18,29;15,-9;1,19;-9,14;-3,-3;-21,8;13,6;-12,12;-2,-13;20,-32;-19,-31;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-24,-30;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;27,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;11,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-5,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-21,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,15;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-29,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;17,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;1,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-15,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-31,-1;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,9;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;3,-1;12,12;12,12;17,20;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;-13,-1;12,12;12,12;-8,25;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;21,-1;-2,5;-3,25;-18,27;1,-31;0,17;-9,26;-23,27;-23,20;-27,18;1,30;3,15;0,0;3,22;3,5;0,0;0,0;0,0;-17,-18;-15,-16;-1,15;-4,9;2,11;-13,-2;2,13;-5,16;-7,-2;-7,7;0,18;-5,-11;0,-10;0,0;2,-5;-5,-32;4,-27;2,21;-8,7;-8,1;-17,-10;-4,-1;-13,0;-8,5;1,10;0,-1;4,12;5,-1; \ No newline at end of file diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ground_truth_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ground_truth_16x16.txt new file mode 100644 index 0000000000..850b7eda8f --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ground_truth_16x16.txt @@ -0,0 +1,2 @@ +30,45 +12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12;12,12; \ No newline at end of file diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/localVar_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/localVar_16x16.txt new file mode 100644 index 0000000000..5e4ea8eed9 --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/localVar_16x16.txt @@ -0,0 +1,2 @@ +30,45 +0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,1;0,0,0,0;0,0,0,0;0,0,0,1;0,0,0,0;1,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,2,2,4;25,115,115,529;196,644,644,2116;225,420,420,784;576,696,696,841;4900,3920,3920,3136;9604,6664,6664,4624;9,120,120,1600;1024,2208,2208,4761;3249,5016,5016,7744;484,1870,1870,7225;9,288,288,9216;64,384,384,2304;324,882,882,2401;10404,8058,8058,6241;1936,2332,2332,2809;196,322,322,529;1,10,10,100;4,16,16,64;4,100,100,2500;1156,2482,2482,5329;3600,5160,5160,7396;81,594,594,4356;0,0,0,1521;0,0,0,10000;1,134,134,17956;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;1,0,0,0;1,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,4;25,110,110,484;196,560,560,1600;529,713,713,961;1296,2196,2196,3721;4096,6784,6784,11236;6889,9960,9960,14400;441,1659,1659,6241;1764,2520,2520,3600;4225,4875,4875,5625;529,2208,2208,9216;36,558,558,8649;36,276,276,2116;529,1564,1564,4624;9801,11088,11088,12544;3600,4620,4620,5929;225,360,360,576;4,2,2,1;1,9,9,81;1,44,44,1936;2025,3060,3060,4624;4624,4760,4760,4900;16,168,168,1764;0,0,0,1681;0,0,0,10609;1,133,133,17689;9,0,0,0;1,0,0,0;4,0,0,0;9,0,0,0;9,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;1,1,1,1;0,0,0,0;0,0,0,0;0,0,0,0;1,-1,-1,1;0,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;0,0,0,0;0,0,0,9;81,198,198,484;225,540,540,1296;2209,2162,2162,2116;6400,5760,5760,5184;6241,7347,7347,8649;7396,7998,7998,8649;8464,7084,7084,5929;2916,4104,4104,5776;3249,6498,6498,12996;784,3724,3724,17689;400,1820,1820,8281;784,1288,1288,2116;1849,1935,1935,2025;8649,4650,4650,2500;4624,3060,3060,2025;1089,891,891,729;625,175,175,49;81,0,0,0;441,693,693,1089;3969,4347,4347,4761;2809,2862,2862,2916;0,0,0,0;0,0,0,2116;0,0,0,10816;1,134,134,17956;1,1,1,1;16,0,0,0;25,0,0,0;9,0,0,0;25,0,0,0;1,0,0,0;1,0,0,0;4,0,0,0;9,0,0,0;9,0,0,0;9,3,3,1;9,3,3,1;1,0,0,0;4,2,2,1;1,0,0,0;0,0,0,0;4,2,2,1;4,4,4,4;1,0,0,0;1,4,4,16;196,378,378,729;256,784,784,2401;4624,4692,4692,4761;16384,8960,8960,4900;6889,4067,4067,2401;8649,3534,3534,1444;17689,8246,8246,3844;6889,8798,8798,11236;4761,8832,8832,16384;1024,4032,4032,15876;3136,5656,5656,10201;6400,5920,5920,5476;5776,5700,5700,5625;6724,7626,7626,8649;2916,5130,5130,9025;7744,6688,6688,5776;5184,3528,3528,2401;4225,2665,2665,1681;4356,3564,3564,2916;7056,3528,3528,1764;2500,600,600,144;1,0,0,0;0,0,0,225;0,0,0,10404;1,-135,-135,18225;16,0,0,0;25,5,5,1;25,0,0,0;36,0,0,0;16,0,0,0;9,0,0,0;9,0,0,0;9,0,0,0;9,3,3,1;16,0,0,0;25,0,0,0;25,0,0,0;25,10,10,4;4,4,4,4;4,6,6,9;1,0,0,0;4,2,2,1;1,0,0,0;0,0,0,0;1,9,9,81;576,720,720,900;100,450,450,2025;1849,1763,1763,1681;8464,5796,5796,3969;3969,5733,5733,8281;5041,6177,6177,7569;7921,7476,7476,7056;8100,7740,7740,7396;6561,6075,6075,5625;1296,2088,2088,3364;4489,3618,3618,2916;7056,4368,4368,2704;5041,4331,4331,3721;4900,5530,5530,6241;1849,3311,3311,5929;7225,4930,4930,3364;6084,3744,3744,2304;7396,5934,5934,4761;6241,7110,7110,8100;8100,8730,8730,9409;5184,5688,5688,6241;1,35,35,1225;1,54,54,2916;0,0,0,11881;1,-137,-137,18769;64,0,0,0;49,0,0,0;16,-4,-4,1;25,0,0,0;25,0,0,0;36,0,0,0;25,0,0,0;36,0,0,0;16,0,0,0;9,0,0,0;16,0,0,0;9,0,0,0;25,0,0,0;25,0,0,0;16,4,4,1;100,10,10,1;49,14,14,4;25,10,10,4;36,0,0,0;36,96,96,256;1764,1680,1680,1600;1600,1880,1880,2209;1024,1184,1184,1369;1681,1517,1517,1369;2704,3484,3484,4489;2025,3330,3330,5476;2809,2915,2915,3025;4489,5293,5293,6241;5329,6132,6132,7056;1764,2100,2100,2500;3025,2090,2090,1444;3249,1596,1596,784;1764,2310,2310,3025;4489,6164,6164,8464;1521,2808,2808,5184;729,810,810,900;900,330,330,121;1764,882,882,441;2025,1530,1530,1156;5041,5467,5467,5929;3136,4144,4144,5476;0,0,0,676;0,0,0,289;0,0,0,12321;1,-139,-139,19321;9,0,0,0;9,0,0,0;36,0,0,0;9,0,0,0;4,0,0,0;9,0,0,0;9,0,0,0;9,0,0,0;9,0,0,0;9,0,0,0;4,0,0,0;9,0,0,0;9,0,0,0;121,0,0,0;36,0,0,0;1089,231,231,49;1225,210,210,36;1369,185,185,25;1225,245,245,49;1764,1218,1218,841;3364,3654,3654,3969;3969,5292,5292,7056;4761,5313,5313,5929;2704,2964,2964,3249;2209,3290,3290,4900;1681,2952,2952,5184;3721,4453,4453,5329;12321,12432,12432,12544;9025,9405,9405,9801;2116,2392,2392,2704;2704,2704,2704,2704;2304,2256,2256,2209;1089,2244,2244,4624;4761,7176,7176,10816;900,1890,1890,3969;100,150,150,225;324,342,342,361;576,600,600,625;441,945,945,2025;3600,4860,4860,6561;441,1092,1092,2704;1,2,2,4;0,0,0,3025;0,0,0,12100;0,0,0,19881;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;4,0,0,0;9,0,0,0;9,18,18,36;256,320,320,400;1600,720,720,324;2209,611,611,169;3249,798,798,196;3364,522,522,81;3600,600,600,100;4761,1725,1725,625;2916,2376,2376,1936;2809,2703,2703,2601;6084,3900,3900,2500;3844,2914,2914,2209;3249,2850,2850,2500;3600,3480,3480,3364;9025,7505,7505,6241;19321,15846,15846,12996;6889,9545,9545,13225;1225,2485,2485,5041;1600,1640,1640,1681;961,1147,1147,1369;729,1431,1431,2809;4489,5025,5025,5625;400,920,920,2116;81,108,108,144;576,336,336,196;576,552,552,529;289,1139,1139,4489;2304,3984,3984,6889;4,58,58,841;0,0,0,0;0,0,0,1849;0,0,0,12544;1,-144,-144,20736;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;9,0,0,0;0,0,0,0;4,0,0,0;4,0,0,0;4,0,0,0;16,0,0,0;9,0,0,0;121,11,11,1;484,22,22,1;576,24,24,1;1024,32,32,1;529,138,138,36;1681,451,451,121;2304,2928,2928,3721;3969,5355,5355,7225;4096,3776,3776,3481;6889,4980,4980,3600;3249,4161,4161,5329;1296,3276,3276,8281;4900,7280,7280,10816;17424,15972,15972,14641;8100,11070,11070,15129;3600,5100,5100,7225;576,1272,1272,2809;841,1102,1102,1444;400,620,620,961;144,228,228,361;1936,924,924,441;225,315,315,441;25,125,125,625;1444,1292,1292,1156;256,480,480,900;169,728,728,3136;1156,2754,2754,6561;9,117,117,1521;4,68,68,1156;9,174,174,3364;1,114,114,12996;1,-149,-149,22201;4,0,0,0;1,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;4,0,0,0;4,0,0,0;1,0,0,0;1,0,0,0;0,0,0,25;4,0,0,0;196,0,0,0;400,0,0,0;400,0,0,0;1225,315,315,81;2116,414,414,81;1936,616,616,196;2601,1530,1530,900;5929,4774,4774,3844;4624,4964,4964,5329;7225,4760,4760,3136;3969,3276,3276,2704;1024,2560,2560,6400;9216,9600,9600,10000;12321,10989,10989,9801;5476,7104,7104,9216;3364,4292,4292,5476;289,578,578,1156;676,286,286,121;256,96,96,36;49,105,105,225;784,672,672,576;144,180,180,225;64,136,136,289;1444,988,988,676;144,252,252,441;144,696,696,3364;1296,3132,3132,7569;1,46,46,2116;4,94,94,2209;1,61,61,3721;0,0,0,14400;1,-150,-150,22500;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;9,48,48,256;900,600,600,400;1521,507,507,169;1936,440,440,100;3364,580,580,100;4225,585,585,81;3969,693,693,121;4225,1755,1755,729;4356,3234,3234,2401;3136,2912,2912,2704;5776,3192,3192,1764;5041,3550,3550,2500;4624,5236,5236,5929;13456,10672,10672,8464;7744,8096,8096,8464;4225,4290,4290,4356;2601,2499,2499,2401;324,720,720,1600;676,728,728,784;169,286,286,484;256,288,288,324;1369,851,851,529;289,374,374,484;225,300,300,400;961,651,651,441;196,280,280,400;256,768,768,2304;1681,3772,3772,8464;1,53,53,2809;1,0,0,0;0,0,0,0;0,0,0,14161;0,0,0,22801;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,9;256,0,0,0;441,0,0,0;625,50,50,4;784,56,56,4;1089,132,132,16;841,232,232,64;2025,945,945,441;2116,2162,2162,2209;1296,2268,2268,3969;3025,2585,2585,2209;6084,4992,4992,4096;5476,5550,5550,5625;6084,3900,3900,2500;5041,2911,2911,1681;1225,1365,1365,1521;2025,1980,1980,1936;784,980,980,1225;961,682,682,484;529,552,552,576;1156,1258,1258,1369;2209,2021,2021,1849;576,744,744,961;625,650,650,676;961,930,930,900;576,432,432,324;289,884,884,2704;3025,4895,4895,7921;2209,1974,1974,1764;16,0,0,0;16,204,204,2601;1,118,118,13924;1,141,141,19881;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;0,0,0,9;0,0,0,0;1,0,0,0;1,0,0,0;9,3,3,1;9,6,6,4;16,8,8,4;16,68,68,289;1681,1681,1681,1681;2401,2842,2842,3364;576,1560,1560,4225;1296,2556,2556,5041;8836,8272,8272,7744;2116,2668,2668,3364;1600,1120,1120,784;2025,1530,1530,1156;1444,1330,1330,1225;1849,1677,1677,1521;400,740,740,1369;1444,1596,1596,1764;676,1222,1222,2209;1849,2107,2107,2401;1521,1677,1677,1849;676,1066,1066,1681;1521,1716,1716,1936;441,882,882,1764;900,1230,1230,1681;256,784,784,2401;3136,3472,3472,3844;7396,6450,6450,5625;3136,2352,2352,1764;256,1056,1056,4356;1,107,107,11449;4,278,278,19321;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;1,0,0,0;4,0,0,0;4,0,0,0;4,0,0,0;4,0,0,0;4,0,0,0;9,6,6,4;100,390,390,1521;3136,3304,3304,3481;3844,3100,3100,2500;484,1254,1254,3249;4624,5916,5916,7569;10201,8585,8585,7225;1024,1536,1536,2304;1369,1591,1591,1849;2025,1710,1710,1444;1600,1400,1400,1225;1369,1369,1369,1369;400,680,680,1156;1369,1147,1147,961;1521,1131,1131,841;2025,1080,1080,576;1521,897,897,529;1521,1014,1014,676;2209,1598,1598,1156;441,609,609,841;1024,832,832,676;361,627,627,1089;1521,1326,1326,1156;3136,2632,2632,2209;8100,5940,5940,4356;3136,5488,5488,9604;9,393,393,17161;7056,5376,5376,4096;0,0,0,0;0,0,0,0;0,0,0,1;9,0,0,0;0,0,0,0;0,0,0,0;0,0,0,1;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;1,0,0,0;0,0,0,0;9,0,0,0;144,0,0,0;256,0,0,0;100,0,0,0;256,112,112,49;576,408,408,289;3025,1925,1925,1225;2704,2808,2808,2916;1764,2814,2814,4489;7744,6864,6864,6084;5329,5256,5256,5184;1369,1850,1850,2500;961,1302,1302,1764;1521,1560,1560,1600;1936,1628,1628,1369;1600,1400,1400,1225;1600,1280,1280,1024;1681,1517,1517,1369;3249,2451,2451,1849;2401,1960,1960,1600;2209,1504,1504,1024;2209,1598,1598,1156;2209,2021,2021,1849;1089,1221,1221,1369;1369,1258,1258,1156;1156,1190,1190,1225;1521,1287,1287,1089;1156,1088,1088,1024;3969,3528,3528,3136;4096,6720,6720,11025;25,670,670,17956;8836,6298,6298,4489;0,0,0,1;9,3,3,1;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;1,0,0,0;0,0,0,0;0,0,0,81;400,280,280,196;900,210,210,49;784,140,140,25;961,589,589,361;1369,1221,1221,1089;2401,2107,2107,1849;2704,3068,3068,3481;4624,4556,4556,4489;4489,3819,3819,3249;2025,1845,1845,1681;1521,1287,1287,1089;961,1116,1116,1296;1225,1365,1365,1521;3136,2072,2072,1369;2704,1976,1976,1444;2401,1960,1960,1600;2704,1820,1820,1225;2601,1530,1530,900;1225,1015,1015,841;1764,1218,1218,841;2209,1645,1645,1225;1444,1406,1406,1369;1444,1406,1406,1369;1681,1722,1722,1764;2025,1980,1980,1936;2809,1802,1802,1156;1849,1849,1849,1849;2601,3672,3672,5184;2916,6912,6912,16384;16,660,660,27225;7921,5785,5785,4225;1,6,6,36;400,120,120,36;144,36,36,9;225,0,0,0;64,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;0,0,0,0;64,0,0,0;0,0,0,0;0,0,0,0;361,0,0,0;1,0,0,0;256,16,16,1;576,24,24,1;961,217,217,49;2304,1248,1248,676;3136,2856,2856,2601;2916,3402,3402,3969;4225,3835,3835,3481;3025,2970,2970,2916;1681,1804,1804,1936;1156,1088,1088,1024;1600,1320,1320,1089;625,825,825,1089;1024,768,768,576;2401,686,686,196;3025,935,935,289;2209,1081,1081,529;3136,1344,1344,576;2401,1519,1519,961;729,999,999,1369;1600,1600,1600,1600;2025,1845,1845,1681;1225,1400,1400,1600;1849,946,946,484;2209,658,658,196;2025,1170,1170,676;2601,765,765,225;2601,1224,1224,576;3136,3360,3360,3600;3025,5500,5500,10000;225,1905,1905,16129;7569,6438,6438,5476;676,910,910,1225;4225,2210,2210,1156;1444,684,684,324;1521,39,39,1;1444,38,38,1;1444,152,152,16;1296,468,468,169;1296,576,576,256;1024,480,480,225;961,496,496,256;1156,544,544,256;1369,666,666,324;1600,680,680,289;625,275,275,121;841,348,348,144;1600,920,920,529;2209,2021,2021,1849;3481,3304,3304,3136;5041,3550,3550,2500;3844,3162,3162,2601;4225,3120,3120,2304;1369,1295,1295,1225;625,800,800,1024;841,1218,1218,1764;1521,1092,1092,784;225,255,255,289;784,588,588,441;961,558,558,324;1849,817,817,361;1521,741,741,361;1764,756,756,324;2116,828,828,324;900,660,660,484;1444,950,950,625;1444,874,874,529;1296,792,792,484;1369,703,703,361;1936,968,968,484;1089,660,660,400;1156,714,714,441;2209,1504,1504,1024;1444,1064,1064,784;1444,1558,1558,1681;529,1426,1426,3844;144,756,756,3969;1521,3315,3315,7225;3600,4320,4320,5184;2401,1666,1666,1156;2116,460,460,100;2025,405,405,81;2401,490,490,100;2025,540,540,144;1296,612,612,289;1936,1364,1364,961;1936,1672,1672,1444;1444,1064,1064,784;1089,528,528,256;1681,1066,1066,676;1936,1496,1496,1156;1849,1419,1419,1089;2116,2070,2070,2025;4096,4096,4096,4096;2809,3445,3445,4225;5776,5624,5624,5476;2916,3348,3348,3844;1521,1053,1053,729;1156,1020,1020,900;289,578,578,1156;676,702,702,729;1089,1089,1089,1089;400,500,500,625;400,220,220,121;169,156,156,144;400,240,240,144;361,133,133,49;289,187,187,121;529,322,322,196;324,180,180,100;576,216,216,81;441,252,252,144;400,240,240,144;256,144,144,81;529,276,276,144;400,320,320,256;484,594,594,729;1156,1088,1088,1024;625,525,525,441;729,945,945,1225;225,750,750,2500;144,732,732,3721;1225,2450,2450,4900;576,1296,1296,2916;2209,1175,1175,625;676,364,364,196;1296,576,576,256;1296,756,756,441;900,630,630,441;1600,840,840,441;5041,1562,1562,484;5041,1917,1917,729;5041,2059,2059,841;3721,1952,1952,1024;676,962,962,1369;1444,1292,1292,1156;784,840,840,900;1024,1440,1440,2025;4624,3876,3876,3249;3136,2912,2912,2704;5476,2738,2738,1369;1849,731,731,289;961,620,620,400;676,572,572,484;441,399,399,361;1369,999,999,729;1024,960,960,900;729,810,810,900;324,324,324,324;144,72,72,36;36,30,30,25;25,15,15,9;36,24,24,16;9,12,12,16;36,24,24,16;81,54,54,36;49,42,42,36;25,20,20,16;25,15,15,9;49,42,42,36;81,81,81,81;3969,1512,1512,576;2025,1530,1530,1156;2209,1269,1269,729;2401,2107,2107,1849;400,1220,1220,3721;144,720,720,3600;2025,3330,3330,5476;3249,4788,4788,7056;3844,4402,4402,5041;2500,2350,2350,2209;3721,3477,3477,3249;2601,2703,2703,2809;1369,1554,1554,1764;3136,3472,3472,3844;8100,5490,5490,3721;9604,4410,4410,2025;14884,5002,5002,1681;10609,5768,5768,3136;2401,2450,2450,2500;2401,1176,1176,576;2025,1125,1125,625;2601,1683,1683,1089;5776,3192,3192,1764;3249,2964,2964,2704;3481,3186,3186,2916;1849,1935,1935,2025;1296,1224,1224,1156;676,702,702,729;729,837,837,961;1296,1260,1260,1225;900,900,900,900;729,702,702,676;441,252,252,144;289,170,170,100;64,128,128,256;81,72,72,64;25,20,20,16;16,12,12,9;9,9,9,9;49,49,49,49;81,72,72,64;25,20,20,16;9,3,3,1;9,3,3,1;25,295,295,3481;8281,9737,9737,11449;5329,6935,6935,9025;4225,4355,4355,4489;3969,4032,4032,4096;729,2430,2430,8100;144,744,744,3844;2916,3618,3618,4489;5476,4366,4366,3481;2809,2385,2385,2025;4624,3400,3400,2500;4489,4489,4489,4489;2401,2352,2352,2304;2209,1410,1410,900;2500,2100,2100,1764;6889,4399,4399,2809;8464,5060,5060,3025;11881,5886,5886,2916;7569,5220,5220,3600;5184,5040,5040,4900;5184,5112,5112,5041;5625,5625,5625,5625;6400,6800,6800,7225;8281,8190,8190,8100;2916,3564,3564,4356;2209,1974,1974,1764;1936,1584,1584,1296;625,675,675,729;841,870,870,900;484,660,660,900;625,550,550,484;729,702,702,676;676,728,728,784;169,169,169,169;100,0,0,0;25,20,20,16;144,96,96,64;9,9,9,9;4,0,0,0;1,1,1,1;9,15,15,25;81,81,81,81;81,63,63,49;4,4,4,4;1,0,0,0;16,104,104,676;4356,4356,4356,4356;5476,5328,5328,5184;2401,3332,3332,4624;3844,5332,5332,7396;900,3060,3060,10404;121,638,638,3364;3721,5063,5063,6889;4225,3575,3575,3025;4096,2176,2176,1156;5041,2982,2982,1764;4096,2688,2688,1764;4096,2816,2816,1936;3249,3477,3477,3721;3136,4032,4032,5184;5041,4899,4899,4761;5184,4680,4680,4225;5184,4464,4464,3844;3969,4032,4032,4096;4900,4410,4410,3969;5625,4500,4500,3600;6400,5120,5120,4096;5776,4560,4560,3600;5476,3774,3774,2601;1681,1722,1722,1764;1156,1088,1088,1024;1156,986,986,841;289,374,374,484;576,480,480,400;400,420,420,441;289,289,289,289;289,204,204,144;225,150,150,100;49,42,42,36;1,1,1,1;1,1,1,1;25,20,20,16;81,45,45,25;1,3,3,9;1,1,1,1;0,0,0,1;9,21,21,49;225,195,195,169;64,64,64,64;1,3,3,9;784,784,784,784;4624,3468,3468,2601;4761,2691,2691,1521;3969,2205,2205,1225;6084,4056,4056,2704;1296,2412,2412,4489;529,1426,1426,3844;4225,5980,5980,8464;4225,4875,4875,5625;5041,4544,4544,4096;4356,4422,4422,4489;3364,4292,4292,5476;5776,5700,5700,5625;4761,4278,4278,3844;3844,3534,3534,3249;4225,3770,3770,3364;4225,3315,3315,2601;4356,2838,2838,1849;3136,1904,1904,1156;3136,1792,1792,1024;4489,2345,2345,1225;4900,2170,2170,961;3364,1682,1682,841;2209,1739,1739,1369;784,1064,1064,1444;529,667,667,841;441,462,462,484;361,323,323,289;256,256,256,256;196,182,182,169;49,28,28,16;9,3,3,1;9,15,15,25;100,90,90,81;49,56,56,64;9,6,6,4;4,6,6,9;144,84,84,49;1,4,4,16;1,2,2,4;0,0,0,1;4,6,6,9;121,99,99,81;121,132,132,144;361,646,646,1156;2304,3504,3504,5329;5184,5112,5112,5041;4489,2613,2613,1521;5776,1672,1672,484;5041,4402,4402,3844;576,2496,2496,10816;529,1403,1403,3721;2916,4320,4320,6400;3136,4480,4480,6400;5184,5544,5544,5929;5929,5467,5467,5041;3481,3717,3717,3969;4489,4355,4355,4225;5929,5313,5313,4761;3969,4725,4725,5625;3136,3584,3584,4096;2401,2156,2156,1936;2601,1836,1836,1296;1936,1452,1452,1089;1681,861,861,441;1936,308,308,49;1936,176,176,16;1600,560,560,196;1849,1290,1290,900;1156,918,918,729;1369,777,777,441;324,306,306,289;225,150,150,100;64,40,40,25;9,3,3,1;1,0,0,0;1,0,0,0;1,0,0,0;16,16,16,16;81,99,99,121;100,110,110,121;49,42,42,36;196,84,84,36;169,52,52,16;36,12,12,4;1,3,3,9;49,35,35,25;121,44,44,16;361,361,361,361;3364,1914,1914,1089;3481,2301,2301,1521;4356,4686,4686,5041;2704,4524,4524,7569;3364,4698,4698,6561;2304,4992,4992,10816;324,2376,2376,17424;529,1449,1449,3969;2401,2450,2450,2500;2209,2115,2115,2025;3249,1881,1881,1089;3136,1848,1848,1089;2304,2016,2016,1764;1936,2332,2332,2809;3969,3843,3843,3721;2916,2700,2700,2500;1156,816,816,576;676,234,234,81;841,174,174,36;961,248,248,64;784,168,168,36;484,132,132,36;529,184,184,64;841,464,464,256;961,1054,1054,1156;1764,2772,2772,4356;3025,3795,3795,4761;1024,896,896,784;1024,64,64,4;625,0,0,0;400,20,20,1;121,11,11,1;1,0,0,0;0,0,0,0;1,0,0,0;16,8,8,4;64,80,80,100;225,330,330,484;225,405,405,729;529,598,598,676;324,324,324,324;169,156,156,144;121,143,143,169;289,391,391,529;2500,2750,2750,3025;2304,2448,2448,2601;1444,608,608,256;1681,615,615,225;1296,612,612,289;961,589,589,361;1444,2166,2166,3249;324,1674,1674,8649;484,1386,1386,3969;1521,78,78,4;441,21,21,1;324,18,18,1;121,55,55,25;400,300,300,225;841,464,464,256;1369,481,481,169;1369,777,777,441;361,285,285,225;225,75,75,25;225,45,45,9;256,48,48,9;256,48,48,9;225,30,30,4;196,42,42,9;324,162,162,81;225,210,210,196;841,435,435,225;1156,1020,1020,900;2704,3172,3172,3721;3136,4256,4256,5776;2704,3692,3692,5041;2209,2538,2538,2916;1156,1020,1020,900;484,242,242,121;361,19,19,1;256,0,0,0;196,-14,-14,1;196,0,0,0;484,22,22,1;900,90,90,9;1600,280,280,49;841,725,725,625;576,744,744,961;64,184,184,529;441,1029,1029,2401;2500,2500,2500,2500;81,99,99,121;25,20,20,16;121,132,132,144;196,168,168,144;81,54,54,36;121,440,440,1600;64,608,608,5776;49,784,784,12544;324,306,306,289;1,8,8,64;0,0,0,0;0,0,0,4;9,12,12,16;169,52,52,16;196,84,84,36;400,480,480,576;196,364,364,676;81,72,72,64;49,28,28,16;100,60,60,36;81,45,45,25;64,40,40,25;81,45,45,25;81,54,54,36;100,90,90,81;169,182,182,196;196,168,168,144;841,145,145,25;784,84,84,9;1089,297,297,81;1156,544,544,256;1225,910,910,676;1369,1295,1295,1225;1156,1122,1122,1089;676,572,572,484;529,368,368,256;625,375,375,225;841,638,638,484;2401,1617,1617,1089;5929,3619,3619,2209;4489,2881,2881,1849;1600,680,680,289;16,56,56,196;441,966,966,2116;1600,1560,1560,1521;36,48,48,64;1,2,2,4;16,12,12,9;64,64,64,64;81,99,99,121;64,328,328,1681;1,71,71,5041;49,777,777,12321;289,17,17,1;0,0,0,0;0,0,0,1;0,0,0,1;4,4,4,4;9,3,3,1;25,45,45,81;121,297,297,729;144,288,288,576;36,36,36,36;25,10,10,4;64,16,16,4;49,21,21,9;64,16,16,4;49,14,14,4;81,36,36,16;64,64,64,64;121,132,132,144;196,210,210,225;64,80,80,100;9,12,12,16;64,16,16,4;144,12,12,1;441,21,21,1;961,0,0,0;729,0,0,0;441,0,0,0;361,0,0,0;400,0,0,0;900,150,150,25;3249,969,969,289;8281,3458,3458,1444;10000,6700,6700,4489;3025,3795,3795,4761;16,120,120,900;400,340,340,289;2500,2100,2100,1764;64,248,248,961;4,12,12,36;4,4,4,4;36,12,12,4;121,99,99,81;144,480,480,1600;4,138,138,4761;64,592,592,5476;576,0,0,0;0,0,0,0;0,0,0,1;49,0,0,0;1,2,2,4;9,6,6,4;169,182,182,196;64,120,120,225;81,108,108,144;16,4,4,1;25,10,10,4;49,14,14,4;49,14,14,4;64,8,8,1;25,10,10,4;16,8,8,4;16,12,12,9;36,18,18,9;25,30,30,36;64,40,40,25;9,9,9,9;4,4,4,4;729,0,0,0;9,0,0,0;961,0,0,0;961,0,0,0;676,0,0,0;289,0,0,0;529,0,0,0;289,0,0,0;784,56,56,4;1936,88,88,4;784,140,140,25;3721,244,244,16;16,12,12,9;100,100,100,100;1849,1247,1247,841;1024,992,992,961;1,20,20,400;4,6,6,9;16,4,4,1;25,15,15,9;81,36,36,16;1,15,15,225;9,213,213,5041; \ No newline at end of file diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1.png b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1.png new file mode 100644 index 0000000000..ebf23e3c66 Binary files /dev/null and b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1.png differ diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1_12_12.png b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1_12_12.png new file mode 100644 index 0000000000..92941218c8 Binary files /dev/null and b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/raw_1_12_12.png differ diff --git a/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ref_frame_16x16.txt b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ref_frame_16x16.txt new file mode 100644 index 0000000000..b1a877a2fc --- /dev/null +++ b/media/libvpx/libvpx/build_debug/non_greedy_mv_test_files/ref_frame_16x16.txt @@ -0,0 +1,2 @@ +486,720 +214,214,214,213,214,212,213,215,214,214,215,214,214,214,215,214,214,214,214,216,214,214,214,214,216,215,215,214,214,216,217,216,215,214,217,217,218,217,217,215,216,217,215,218,216,216,218,217,218,219,218,216,219,218,218,220,219,219,219,220,222,219,222,222,222,223,222,223,224,223,222,225,223,224,224,223,225,224,224,224,224,224,225,224,226,226,223,224,226,226,226,226,224,224,227,225,226,227,227,226,225,225,224,224,227,227,227,228,227,229,230,228,230,230,228,229,230,230,230,230,229,229,230,230,230,232,232,233,230,230,231,232,233,232,233,233,232,233,233,231,233,233,233,235,234,235,237,238,236,235,236,236,237,237,236,237,236,236,238,239,239,241,241,240,241,240,240,239,240,240,239,241,240,241,242,241,242,242,242,242,242,243,244,244,245,244,244,244,243,243,244,244,244,243,243,244,245,245,243,244,245,246,246,244,244,245,247,247,245,245,244,247,245,245,246,244,245,245,245,245,245,245,245,245,244,245,246,246,247,247,247,246,247,247,247,247,247,247,247,247,245,245,247,247,247,247,246,247,247,247,247,246,246,247,247,247,246,246,247,247,247,247,247,246,246,246,246,246,247,247,247,247,248,247,247,247,247,247,247,247,247,248,248,247,248,246,247,246,247,247,247,247,247,247,249,241,234,247,247,247,250,249,249,252,252,252,251,251,252,252,252,252,252,252,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,253,253,252,252,253,253,250,250,244,251,252,252,253,253,251,251,230,211,171,147,80,131,251,251,252,252,249,242,206,236,252,252,251,251,252,252,252,252,249,243,215,212,212,184,169,186,193,186,179,152,136,129,129,132,141,118,71,35,27,44,76,82,53,44,33,36,68,94,117,144,123,107,77,36,45,21,18,34,48,55,47,57,66,66,62,61,68,68,74,73,71,65,120,214,249,252,249,248,223,184,205,244,251,251,250,250,251,249,249,250,251,252,250,250,238,240,252,251,248,230,195,218,217,183,229,249,245,229,136,177,243,234,143,105,108,67,101,177,193,223,171,92,55,21,48,55,45,91,71,36,36,57,76,63,17,159,250,250,250,250,253,253,251,251,252,252,252,252,252,252,252,252,194,110,29,128,248,226,206,165,176,235,252,252,250,242,247,247,249,249,247,248,247,247,246,247,247,246,247,246,247,247,246,247,246,250,243,226,233,252,240,127,96,63,7,118,238,246,250,250,250,248,219,210,225,212,198,213,226,237,251,246,242,236,182,170,229,246,250,250,248,247,246,244,244,242,243,243,242,244,244,244,244,243,242,241,242,241,241,241,241,241,241,241,239,240,239,239,238,238,239,238,238,237,236,237,238,235,234,235,233,234,233,231,231,230,227,228,229,229,227,228,227,227,227,229,229,227,225,226,223,223,224,222,224,222,223,222,222,230,233,239,237,217,212,225,230,224,223,222,218,217,218,216,214,219,217,216,217,217,215,214,215,217,217,213,214,214,213,214,214,212,210,211,210,208,210,208,208,210,206,209,207,202,205,204,201,201,200,199,200,197,199,195,199,114,5,1,4,7,9,8,10,10,11,11,11,11,214,214,214,213,214,212,213,215,214,214,215,214,214,214,215,214,214,214,214,216,214,214,214,214,216,215,215,214,214,216,217,216,215,214,217,217,218,217,217,215,216,217,215,218,216,216,218,217,218,219,218,216,219,218,218,220,219,219,219,220,222,219,222,222,222,223,222,223,224,223,222,225,223,224,224,223,225,224,224,224,224,224,225,224,226,226,223,224,226,226,226,226,224,224,227,225,226,227,227,226,225,225,224,224,227,227,227,228,227,229,230,228,230,230,228,229,230,230,230,230,229,229,230,230,230,232,232,233,230,230,231,232,233,232,233,233,232,233,233,231,233,233,233,235,234,235,237,238,236,235,236,236,237,237,236,237,236,236,238,239,239,241,241,240,241,240,240,239,240,240,239,241,240,241,242,241,242,242,242,242,242,243,244,244,245,244,244,244,243,243,244,244,244,243,243,244,245,245,243,244,245,246,246,244,244,245,247,247,245,245,244,247,245,245,246,244,245,245,245,245,245,245,245,245,244,245,246,246,247,247,247,246,247,247,247,247,247,247,247,247,245,245,247,247,247,247,246,247,247,247,247,246,246,247,247,247,246,246,247,247,247,247,247,246,246,246,246,246,247,247,247,247,248,247,247,247,247,247,247,247,247,248,248,247,248,246,247,246,247,247,247,247,247,247,249,241,234,247,247,247,250,249,249,252,252,252,251,251,252,252,252,252,252,252,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,253,253,252,252,253,253,250,250,244,251,252,252,253,253,251,251,230,211,171,147,80,131,251,251,252,252,249,242,206,236,252,252,251,251,252,252,252,252,249,243,215,212,212,184,169,186,193,186,179,152,136,129,129,132,141,118,71,35,27,44,76,82,53,44,33,36,68,94,117,144,123,107,77,36,45,21,18,34,48,55,47,57,66,66,62,61,68,68,74,73,71,65,120,214,249,252,249,248,223,184,205,244,251,251,250,250,251,249,249,250,251,252,250,250,238,240,252,251,248,230,195,218,217,183,229,249,245,229,136,177,243,234,143,105,108,67,101,177,193,223,171,92,55,21,48,55,45,91,71,36,36,57,76,63,17,159,250,250,250,250,253,253,251,251,252,252,252,252,252,252,252,252,194,110,29,128,248,226,206,165,176,235,252,252,250,242,247,247,249,249,247,248,247,247,246,247,247,246,247,246,247,247,246,247,246,250,243,226,233,252,240,127,96,63,7,118,238,246,250,250,250,248,219,210,225,212,198,213,226,237,251,246,242,236,182,170,229,246,250,250,248,247,246,244,244,242,243,243,242,244,244,244,244,243,242,241,242,241,241,241,241,241,241,241,239,240,239,239,238,238,239,238,238,237,236,237,238,235,234,235,233,234,233,231,231,230,227,228,229,229,227,228,227,227,227,229,229,227,225,226,223,223,224,222,224,222,223,222,222,230,233,239,237,217,212,225,230,224,223,222,218,217,218,216,214,219,217,216,217,217,215,214,215,217,217,213,214,214,213,214,214,212,210,211,210,208,210,208,208,210,206,209,207,202,205,204,201,201,200,199,200,197,199,195,199,114,5,1,4,7,9,8,10,10,11,11,11,11,214,218,218,215,218,217,215,215,216,218,217,217,219,218,216,215,219,217,216,217,218,221,221,221,220,218,219,220,220,218,217,219,218,220,220,217,218,217,220,223,220,221,219,220,219,217,219,217,218,219,221,222,220,220,220,221,221,221,221,221,223,224,223,225,224,223,224,223,224,226,226,228,227,227,227,225,228,229,229,227,226,228,229,229,229,229,229,230,229,226,226,228,227,227,230,229,229,227,228,229,227,230,230,230,230,231,230,230,232,228,230,230,229,231,230,232,231,230,231,232,233,230,233,233,231,234,235,236,235,236,234,232,234,233,236,237,236,237,237,236,239,237,236,238,238,237,237,238,240,238,239,241,240,240,240,241,241,240,240,240,241,240,241,242,243,240,242,243,244,244,242,244,244,244,245,244,244,244,244,246,245,246,246,245,245,245,247,247,247,247,246,245,246,245,246,246,246,247,248,248,249,248,248,247,248,249,248,249,248,249,248,249,247,249,250,248,249,248,248,248,248,249,249,248,248,249,249,249,249,249,250,251,250,249,250,249,249,249,249,249,248,249,249,248,249,250,248,249,250,250,251,251,250,250,250,249,249,250,250,250,250,250,249,250,249,249,250,250,249,250,250,251,251,251,251,249,251,251,251,252,251,250,250,249,251,250,250,251,247,249,250,250,250,252,248,233,242,250,249,250,249,249,250,251,251,251,251,251,250,252,250,245,248,251,252,247,249,252,252,251,250,249,250,250,250,248,250,251,249,248,242,243,249,252,251,251,251,251,251,250,251,251,251,252,250,251,245,237,252,240,207,180,212,229,246,251,247,228,171,179,151,123,49,149,250,250,252,252,250,238,178,234,251,251,250,250,253,253,252,252,252,252,252,252,252,252,252,252,252,249,253,253,252,252,253,253,249,249,209,216,252,252,252,252,252,252,253,253,245,190,135,134,118,107,49,100,248,248,250,250,253,253,252,252,253,253,252,252,252,252,252,252,252,252,252,252,252,252,249,242,205,167,207,246,251,251,251,251,252,252,251,250,252,252,250,250,235,252,252,252,252,245,252,252,218,167,225,251,172,125,116,104,100,85,87,96,91,76,69,62,55,77,95,93,65,34,41,49,68,90,52,32,41,65,69,59,17,150,247,247,249,249,252,252,250,248,245,247,249,249,237,245,250,252,223,173,118,103,162,210,240,188,165,207,234,248,251,247,245,249,249,249,249,249,249,250,249,250,248,248,250,248,248,249,249,247,249,250,251,239,225,252,222,119,104,47,8,136,242,246,250,250,252,251,223,209,223,229,214,202,218,235,252,242,192,113,42,15,89,181,225,246,248,248,248,247,247,245,244,244,244,244,244,244,245,242,242,242,242,244,241,240,241,241,240,240,239,239,239,239,238,237,238,238,238,237,236,234,234,236,236,232,232,233,232,234,231,230,230,228,229,229,227,226,226,228,227,226,225,226,224,225,227,222,223,224,222,222,220,224,231,229,203,153,120,83,73,129,183,212,225,227,226,222,222,221,216,216,214,213,215,218,217,217,215,214,214,212,213,214,214,211,212,212,212,212,211,213,213,210,210,209,207,208,208,207,206,205,202,203,205,201,200,198,201,196,200,115,3,1,6,8,9,7,10,10,10,11,10,10,214,218,218,215,218,217,215,215,216,218,217,217,219,218,216,215,219,217,216,217,218,221,221,221,220,218,219,220,220,218,217,219,218,220,220,217,218,217,220,223,220,221,219,220,219,217,219,217,218,219,221,222,220,220,220,221,221,221,221,221,223,224,223,225,224,223,224,223,224,226,226,228,227,227,227,225,228,229,229,227,226,228,229,229,229,229,229,230,229,226,226,228,227,227,230,229,229,227,228,229,227,230,230,230,230,231,230,230,232,228,230,230,229,231,230,232,231,230,231,232,233,230,233,233,231,234,235,236,235,236,234,232,234,233,236,237,236,237,237,236,239,237,236,238,238,237,237,238,240,238,239,241,240,240,240,241,241,240,240,240,241,240,241,242,243,240,242,243,244,244,242,244,244,244,245,244,244,244,244,246,245,246,246,245,245,245,247,247,247,247,246,245,246,245,246,246,246,247,248,248,249,248,248,247,248,249,248,249,248,249,248,249,247,249,250,248,249,248,248,248,248,249,249,248,248,249,249,249,249,249,250,251,250,249,250,249,249,249,249,249,248,249,249,248,249,250,248,249,250,250,251,251,250,250,250,249,249,250,250,250,250,250,249,250,249,249,250,250,249,250,250,251,251,251,251,249,251,251,251,252,251,250,250,249,251,250,250,251,247,249,250,250,250,252,248,233,242,250,249,250,249,249,250,251,251,251,251,251,250,252,250,245,248,251,252,247,249,252,252,251,250,249,250,250,250,248,250,251,249,248,242,243,249,252,251,251,251,251,251,250,251,251,251,252,250,251,245,237,252,240,207,180,212,229,246,251,247,228,171,179,151,123,49,149,250,250,252,252,250,238,178,234,251,251,250,250,253,253,252,252,252,252,252,252,252,252,252,252,252,249,253,253,252,252,253,253,249,249,209,216,252,252,252,252,252,252,253,253,245,190,135,134,118,107,49,100,248,248,250,250,253,253,252,252,253,253,252,252,252,252,252,252,252,252,252,252,252,252,249,242,205,167,207,246,251,251,251,251,252,252,251,250,252,252,250,250,235,252,252,252,252,245,252,252,218,167,225,251,172,125,116,104,100,85,87,96,91,76,69,62,55,77,95,93,65,34,41,49,68,90,52,32,41,65,69,59,17,150,247,247,249,249,252,252,250,248,245,247,249,249,237,245,250,252,223,173,118,103,162,210,240,188,165,207,234,248,251,247,245,249,249,249,249,249,249,250,249,250,248,248,250,248,248,249,249,247,249,250,251,239,225,252,222,119,104,47,8,136,242,246,250,250,252,251,223,209,223,229,214,202,218,235,252,242,192,113,42,15,89,181,225,246,248,248,248,247,247,245,244,244,244,244,244,244,245,242,242,242,242,244,241,240,241,241,240,240,239,239,239,239,238,237,238,238,238,237,236,234,234,236,236,232,232,233,232,234,231,230,230,228,229,229,227,226,226,228,227,226,225,226,224,225,227,222,223,224,222,222,220,224,231,229,203,153,120,83,73,129,183,212,225,227,226,222,222,221,216,216,214,213,215,218,217,217,215,214,214,212,213,214,214,211,212,212,212,212,211,213,213,210,210,209,207,208,208,207,206,205,202,203,205,201,200,198,201,196,200,115,3,1,6,8,9,7,10,10,10,11,10,10,215,216,216,214,217,215,216,216,218,216,216,219,217,218,215,216,217,216,218,217,218,215,215,215,217,218,217,219,217,219,217,218,218,218,219,217,217,215,215,217,217,218,218,217,219,217,220,219,220,221,218,220,220,218,220,218,220,222,218,222,221,222,222,222,224,224,224,227,224,223,223,225,226,226,226,224,226,229,228,226,226,224,227,225,226,226,226,227,227,229,227,227,227,227,226,229,227,225,226,225,227,229,228,227,229,229,230,229,230,232,230,229,229,231,229,231,231,230,231,232,230,230,232,230,232,233,233,233,234,233,234,232,234,235,233,234,233,234,234,233,233,235,237,236,235,236,236,239,238,238,238,239,237,237,239,239,237,239,240,239,239,239,240,241,241,240,241,243,240,242,242,241,242,243,242,244,244,244,244,244,245,245,245,245,246,246,245,244,246,246,245,246,245,245,245,246,247,245,246,248,248,248,247,247,246,247,248,249,249,248,247,248,249,249,248,247,247,247,249,248,249,249,249,249,248,248,247,248,248,247,249,248,249,250,249,248,248,248,248,249,248,249,249,249,249,249,249,249,248,248,248,249,249,249,250,249,248,250,249,248,249,250,249,247,249,249,249,249,249,250,249,249,250,250,248,249,249,249,249,248,248,248,249,250,249,249,248,248,247,248,248,248,248,252,244,232,244,249,248,249,247,247,248,249,250,249,249,249,248,250,248,243,249,251,252,250,252,252,252,251,247,248,249,247,247,247,248,248,248,249,246,242,242,247,250,249,250,249,248,247,248,248,247,250,247,250,237,230,245,188,181,208,243,244,252,252,248,216,170,179,157,134,49,149,250,250,252,244,250,229,141,191,247,247,248,243,248,252,250,251,251,252,241,252,252,208,240,251,241,249,252,252,252,252,252,252,252,252,252,252,252,252,252,252,253,253,252,252,239,165,127,125,110,104,36,85,248,248,249,249,252,252,251,251,252,252,252,252,252,252,251,248,252,252,251,251,251,252,250,240,230,202,204,249,251,251,251,251,252,250,250,249,251,250,250,248,235,252,251,251,242,242,252,241,149,112,205,242,193,130,101,104,102,115,109,96,97,81,81,64,66,68,72,117,110,65,50,47,82,90,52,29,37,70,65,57,12,141,246,246,249,249,252,252,250,250,244,244,247,252,240,244,250,252,245,219,178,95,113,186,242,204,186,220,225,248,251,250,246,247,248,248,248,246,249,249,248,249,247,248,247,247,248,247,247,247,247,248,248,242,222,252,210,116,104,35,11,142,243,247,251,248,252,243,224,207,219,229,222,203,209,230,234,234,196,146,60,66,145,208,231,243,245,245,244,244,244,243,242,244,244,241,243,241,242,243,240,239,240,239,239,239,239,239,238,239,238,236,237,237,234,237,235,236,237,235,233,234,232,231,230,229,230,228,229,229,229,228,226,227,224,225,226,226,225,224,224,224,222,224,224,222,222,222,221,222,222,224,230,231,241,225,173,131,103,77,48,94,171,217,233,230,227,222,219,219,217,215,215,214,215,217,215,214,212,212,213,214,213,214,215,214,212,212,208,212,210,208,212,210,212,210,207,208,206,204,204,205,205,202,200,199,200,196,200,197,200,114,4,0,4,7,9,8,10,10,11,11,12,10,217,217,217,215,215,213,214,216,215,214,215,215,213,215,215,214,215,214,216,217,214,217,217,216,217,214,215,214,214,217,215,214,214,215,217,217,216,214,214,216,213,215,216,214,215,218,217,217,217,217,217,217,217,218,220,215,217,218,219,220,217,218,217,221,222,218,222,221,221,221,221,222,222,222,222,225,224,224,223,223,224,224,225,227,225,225,225,224,223,225,226,226,226,225,227,227,228,230,226,226,226,226,229,226,225,228,228,227,228,228,231,231,231,231,230,232,229,229,229,230,232,229,230,231,232,233,231,231,230,231,232,232,232,232,232,232,230,233,232,232,233,232,233,236,236,235,237,236,236,235,234,237,234,235,236,236,238,238,239,240,239,239,239,237,239,241,241,240,239,241,239,241,242,241,241,241,242,244,243,244,244,244,245,245,244,244,244,244,244,244,244,244,244,244,244,244,245,246,245,245,247,246,246,245,246,247,247,248,247,248,248,247,248,248,247,247,248,248,249,248,248,247,247,247,247,248,247,247,247,248,248,247,246,247,247,247,247,248,248,247,250,248,248,249,248,249,248,247,248,248,249,249,248,248,248,247,247,247,247,249,248,248,247,248,247,247,248,247,248,248,249,249,248,248,249,248,248,248,249,248,247,248,247,247,248,248,246,247,247,248,249,247,248,251,236,228,245,249,250,250,249,248,247,248,249,249,248,248,249,252,246,243,251,250,251,236,215,238,242,248,250,247,249,248,248,247,247,247,248,248,247,244,241,243,248,248,249,249,248,248,248,249,248,248,248,251,230,222,248,215,214,237,251,240,239,252,247,206,159,167,146,125,43,152,250,249,251,243,252,239,171,208,249,249,248,244,246,251,249,251,250,248,233,252,218,174,238,252,250,251,252,229,225,239,252,252,251,251,252,252,250,246,235,252,248,252,252,252,234,156,132,135,119,116,32,77,248,248,247,247,249,252,250,250,244,247,252,250,252,252,245,247,252,251,251,251,252,252,251,245,241,214,202,248,252,252,250,250,252,251,251,250,252,252,250,245,239,252,250,250,233,243,238,143,95,145,249,249,200,137,106,101,118,126,115,98,87,83,74,69,65,62,67,92,123,102,60,56,99,85,40,27,41,73,57,57,12,122,246,246,249,249,252,252,249,250,243,241,246,251,241,242,247,252,247,226,212,129,108,152,222,219,195,227,224,248,252,252,247,244,249,249,249,247,248,248,247,249,247,247,247,246,247,247,247,247,247,247,248,248,229,252,199,110,97,29,14,152,243,248,250,249,249,250,225,206,217,229,228,206,204,226,249,232,207,200,188,190,220,240,244,245,244,245,244,242,242,241,243,242,242,242,243,241,240,240,240,239,239,239,239,239,239,239,238,237,240,237,235,236,233,234,236,235,234,234,232,231,232,231,229,229,230,229,226,229,229,227,228,226,225,225,225,227,225,224,224,223,222,223,221,219,218,218,220,220,226,231,240,246,241,198,150,122,106,87,69,110,166,209,238,239,236,227,223,222,217,219,217,215,217,217,213,214,215,215,213,211,214,214,211,210,213,211,210,212,211,208,210,208,210,208,207,210,206,206,206,207,204,202,198,199,201,198,201,198,200,113,4,1,4,7,9,8,10,9,10,10,10,10,217,220,218,217,218,215,215,215,215,214,214,216,214,214,213,214,216,213,215,215,216,218,219,218,215,214,216,217,214,216,216,214,214,213,215,217,216,214,215,215,216,217,218,215,216,213,215,216,215,215,215,218,218,218,219,218,218,218,218,218,217,217,219,219,221,220,218,222,220,222,223,222,222,222,224,222,223,224,222,224,226,225,225,224,224,224,224,224,224,226,227,226,224,228,226,228,226,225,229,227,229,226,227,227,228,229,227,230,229,229,231,231,229,230,230,230,229,231,233,231,230,230,232,232,231,232,230,232,230,230,233,231,231,232,234,234,232,236,237,235,235,236,235,236,237,237,236,235,235,236,236,236,237,237,236,236,237,238,237,239,241,238,238,241,239,239,240,240,240,239,241,241,241,242,241,241,243,244,243,244,244,243,244,244,244,244,244,244,242,244,244,244,245,246,244,243,244,244,247,247,246,247,247,247,247,247,247,246,248,249,247,248,248,247,248,247,249,247,247,248,246,248,248,248,248,247,247,247,247,247,247,247,247,247,247,247,248,248,247,248,248,248,248,248,249,249,247,247,247,248,248,247,248,247,248,248,248,249,248,248,248,247,249,249,248,247,247,247,248,248,248,249,249,249,250,249,247,249,249,249,249,248,247,247,247,249,249,247,247,247,247,247,249,248,228,230,246,248,249,248,249,250,248,247,248,249,250,249,249,248,242,245,252,249,243,171,119,187,232,249,249,246,248,248,249,249,247,247,247,247,247,247,240,240,244,247,250,247,248,247,247,248,248,247,247,250,231,245,252,227,221,214,201,201,237,249,247,200,156,162,139,118,37,157,249,248,252,242,252,249,184,202,247,248,248,245,243,250,249,251,252,251,225,241,225,222,252,252,246,250,227,172,211,241,252,252,248,246,252,251,250,241,230,252,244,251,252,252,233,150,134,140,120,123,33,77,248,248,247,247,249,252,250,247,234,246,252,249,250,250,241,251,252,249,252,252,252,252,252,241,250,221,189,239,252,252,250,250,252,250,250,250,252,251,250,238,245,252,251,247,228,232,183,161,148,212,253,253,169,107,93,89,98,101,87,76,74,59,62,53,50,49,46,54,79,100,63,65,100,76,39,30,48,71,59,60,12,134,246,246,249,249,252,252,249,250,244,240,244,251,244,240,249,252,246,223,236,174,105,130,200,209,179,216,208,227,251,251,248,245,250,248,248,247,247,247,247,248,247,247,247,246,247,246,246,247,246,247,248,251,236,245,182,105,91,19,18,163,243,248,251,247,252,242,226,204,217,229,223,216,205,223,234,217,206,227,252,252,251,251,247,247,245,244,243,241,241,240,240,241,240,239,239,239,239,239,239,238,240,238,236,237,236,236,236,234,236,233,235,236,233,234,232,232,231,231,231,229,228,227,229,227,229,229,228,225,225,225,224,224,225,223,222,224,221,222,224,220,220,219,220,218,219,220,219,228,231,242,238,200,171,146,126,114,107,95,80,78,102,127,181,221,238,231,223,220,221,216,215,214,212,212,214,213,212,214,214,212,210,211,212,211,209,210,208,209,210,210,211,207,207,208,205,207,205,206,204,201,203,202,201,199,202,198,200,196,201,114,3,1,4,7,8,7,10,10,10,11,10,10,218,220,220,220,220,216,216,217,215,214,217,217,216,217,213,213,217,216,215,217,216,217,217,217,216,215,217,217,217,216,215,217,217,215,215,215,215,216,215,216,214,213,216,216,216,217,216,217,216,216,216,218,217,216,218,217,217,218,218,219,218,220,218,220,220,218,220,220,223,226,225,222,223,224,223,225,224,225,224,223,225,222,224,224,221,225,222,224,224,223,225,224,227,225,226,224,226,226,223,226,226,227,228,226,225,228,230,229,227,229,229,229,231,228,230,229,231,232,229,230,229,231,233,232,232,232,231,232,232,232,236,235,233,232,232,232,234,234,233,234,234,236,236,236,236,237,237,234,238,237,235,239,236,238,236,235,237,237,237,238,237,237,239,240,241,239,241,241,239,241,239,240,240,240,242,243,242,242,242,244,243,243,244,243,242,244,244,244,242,243,243,244,244,245,248,246,245,246,246,246,247,247,247,247,248,249,249,249,248,247,249,249,248,248,247,247,247,245,248,248,248,249,247,247,246,246,247,247,248,249,248,246,248,249,247,247,248,248,248,247,247,247,247,248,248,249,248,247,249,248,248,247,247,248,247,248,249,248,247,249,249,247,247,247,247,249,248,248,248,248,248,248,249,247,249,249,247,249,247,248,248,249,249,249,249,248,249,249,249,248,248,247,250,247,228,239,251,248,249,248,249,248,248,248,246,248,248,250,250,247,243,246,252,249,240,182,169,231,248,252,249,246,247,247,247,247,247,247,247,247,247,248,247,241,240,243,248,249,248,248,247,247,246,247,249,250,230,245,250,186,171,199,220,229,250,252,247,201,169,177,155,125,42,168,249,249,252,242,252,252,175,174,239,246,249,247,243,247,252,250,252,249,242,250,229,237,252,251,212,229,243,232,252,248,252,252,248,246,252,247,250,241,234,252,247,252,252,252,226,146,125,128,118,111,29,83,247,247,247,247,249,252,249,246,231,243,252,250,250,250,241,252,252,250,250,250,252,251,248,246,243,212,173,226,252,252,250,250,252,251,250,251,250,252,250,237,249,252,252,237,218,210,220,219,222,252,250,229,129,99,92,92,96,86,85,81,81,78,71,73,69,60,57,52,63,83,67,81,107,63,40,32,53,74,59,62,12,141,246,246,249,249,252,252,250,250,245,241,247,251,248,241,248,252,249,217,237,207,129,139,195,229,208,207,182,200,244,251,251,243,248,247,247,249,248,247,247,247,247,247,247,247,245,246,247,247,247,247,245,249,245,234,161,92,83,17,25,174,246,248,251,250,250,250,227,204,217,228,230,227,209,224,245,201,201,237,252,252,246,252,247,244,245,242,241,239,241,240,239,239,239,239,239,239,238,238,237,239,237,237,236,237,236,234,232,233,233,233,235,234,232,232,231,229,229,230,230,227,224,226,225,225,226,224,225,224,222,222,223,223,220,223,223,222,223,221,221,220,220,220,219,219,218,221,228,231,240,209,159,130,110,96,96,95,88,84,80,75,49,34,67,135,200,225,224,222,219,218,218,216,212,210,214,215,213,214,213,211,210,212,212,210,213,210,211,213,212,211,209,207,210,208,209,209,206,206,203,204,202,203,201,198,200,196,198,194,201,114,3,1,4,7,9,8,10,9,10,11,10,10,219,222,221,219,219,214,217,218,215,215,215,217,217,215,214,214,215,212,216,217,213,215,216,217,218,215,217,215,213,216,217,215,214,214,215,214,213,213,217,216,214,215,215,215,217,216,217,215,216,215,217,215,215,216,214,219,219,220,220,219,219,219,221,219,221,223,223,224,223,223,224,223,223,222,223,224,225,224,221,224,220,222,225,221,223,224,225,224,223,221,223,225,223,225,222,226,227,225,226,226,228,225,227,226,226,227,225,227,228,229,230,228,226,230,229,229,230,231,232,227,231,232,231,233,231,234,234,234,233,232,232,232,234,233,235,233,231,232,232,234,233,233,235,235,234,235,237,237,236,237,236,236,236,237,239,236,237,237,235,237,238,236,236,237,237,238,237,237,239,239,240,241,242,243,244,242,243,243,242,243,243,244,242,243,242,244,244,243,243,245,245,243,243,244,245,246,245,245,246,245,246,246,247,248,246,248,249,249,249,249,247,247,249,248,248,248,248,247,247,247,247,248,247,248,247,244,247,247,248,247,246,247,248,248,249,249,247,248,247,247,246,248,247,247,248,247,247,247,247,247,248,247,247,248,246,247,248,248,248,247,248,247,248,248,247,247,247,247,247,246,247,247,247,247,247,248,248,249,249,247,247,248,249,248,248,247,248,247,248,249,249,249,250,242,234,247,248,248,249,248,248,247,248,246,247,246,247,248,248,245,244,250,249,249,252,247,252,252,251,249,248,248,248,246,246,246,246,246,245,247,248,248,247,247,240,239,244,245,250,248,247,248,247,247,248,248,224,237,215,182,215,230,243,250,252,252,248,204,174,169,157,127,39,173,249,245,251,241,251,251,198,187,240,248,249,248,242,245,250,249,252,251,245,251,224,229,244,243,242,251,252,236,252,252,252,252,250,244,252,249,249,241,235,252,229,245,253,253,226,128,107,116,103,118,33,83,248,248,247,247,249,252,250,247,235,246,252,248,250,248,245,252,252,250,252,252,251,248,247,247,241,220,181,229,252,252,249,247,250,252,249,250,250,252,249,236,252,252,250,226,229,252,253,253,250,252,247,186,119,114,110,121,117,114,113,116,124,110,112,109,108,75,96,86,91,105,80,118,124,61,35,27,62,77,62,62,9,136,247,247,249,249,252,252,248,250,245,239,247,251,250,240,244,249,252,219,230,234,148,141,179,217,214,187,165,187,227,248,252,242,245,246,247,247,247,247,247,248,247,247,247,247,247,246,246,246,245,245,246,247,252,225,139,94,74,10,29,176,243,248,252,245,249,243,220,201,216,228,230,225,215,236,234,180,189,223,241,239,230,245,243,242,241,240,240,239,239,241,239,239,239,239,238,237,238,239,238,236,237,237,236,235,235,232,233,233,233,232,233,231,228,229,229,228,226,227,226,224,227,226,226,223,225,224,222,224,224,224,223,221,223,220,221,223,219,217,218,217,219,220,219,218,219,226,231,231,188,139,110,98,95,92,97,95,95,91,87,73,63,52,35,57,129,200,219,218,219,216,216,216,215,214,212,211,212,212,212,212,212,213,210,212,212,211,212,210,212,209,210,210,209,210,207,211,208,206,208,204,204,203,200,199,198,197,200,195,202,113,4,1,4,7,9,8,10,10,10,11,11,10,222,223,221,220,220,218,219,218,218,217,217,217,217,218,217,215,217,214,215,219,217,217,218,218,216,217,217,215,215,213,215,216,212,216,215,215,217,214,213,214,214,214,218,216,216,218,215,217,217,217,216,218,218,218,219,218,220,220,218,218,219,221,219,221,225,224,223,222,222,223,222,222,224,224,224,224,223,223,224,225,225,221,223,224,223,226,224,224,224,226,225,223,223,224,226,224,224,225,224,227,225,227,229,226,227,227,228,227,227,229,228,226,230,229,228,229,231,232,232,234,231,233,232,231,234,233,233,233,233,231,233,232,235,234,234,235,234,235,235,237,235,235,237,236,234,236,237,236,238,235,237,238,236,237,239,239,237,239,239,239,239,238,239,240,238,239,240,240,239,242,241,241,242,241,241,241,242,242,242,243,244,243,242,243,243,244,244,243,245,245,243,244,243,244,245,246,246,245,246,246,247,246,247,248,248,249,247,247,248,248,249,249,247,248,249,249,249,247,246,247,249,249,248,247,249,249,247,248,247,247,247,245,248,247,248,249,248,247,248,247,249,249,246,248,248,247,248,247,247,248,248,249,248,248,247,247,249,249,249,248,248,247,247,248,247,248,247,247,247,247,248,248,248,247,249,249,247,248,248,248,248,248,250,251,249,248,248,248,249,249,248,249,251,240,240,250,250,249,248,249,250,249,249,249,248,248,247,249,247,243,248,251,250,249,252,252,252,252,249,247,246,247,247,247,247,245,247,247,247,247,245,247,247,248,244,240,242,246,249,248,247,247,247,249,249,246,226,248,242,211,232,244,234,218,241,250,249,201,162,160,139,113,41,176,249,243,251,239,250,252,212,200,246,248,249,248,246,245,251,250,252,247,242,247,178,202,252,252,249,250,246,227,246,242,251,251,250,245,251,251,249,243,235,250,163,158,234,249,225,137,119,113,107,110,29,90,247,247,247,247,250,252,250,249,240,246,252,249,251,245,248,252,251,250,252,252,251,246,248,252,238,245,198,208,252,250,250,250,251,252,249,251,250,252,244,240,252,252,248,241,252,252,252,252,222,224,169,122,113,118,128,122,126,119,93,91,95,101,104,94,86,79,85,83,84,95,99,136,115,51,34,25,61,72,60,60,12,140,247,247,249,249,252,252,249,251,248,238,244,249,252,244,241,248,252,226,218,244,163,100,132,167,184,171,168,198,215,243,252,243,247,247,247,247,248,247,246,246,247,246,245,246,246,247,248,245,247,244,248,247,252,217,118,87,68,11,30,179,243,249,251,247,246,240,214,198,216,229,231,234,220,233,237,160,178,225,233,232,229,244,240,240,243,239,239,237,238,238,237,237,237,237,239,237,237,236,237,237,236,235,235,234,233,232,231,231,232,230,229,229,227,229,227,227,228,225,224,227,225,226,224,222,225,224,224,225,223,222,222,224,223,222,220,220,221,216,216,217,218,216,219,220,222,229,227,184,136,105,92,101,125,132,120,109,106,98,91,85,74,60,50,36,57,150,201,211,219,217,219,216,213,213,212,210,212,213,213,214,214,212,212,212,210,210,211,211,210,209,207,210,210,209,208,207,208,205,205,202,201,203,199,200,199,196,202,196,200,113,4,1,4,8,9,7,10,10,10,11,10,10,220,222,221,220,223,220,220,222,220,218,217,218,217,219,219,215,219,218,217,218,217,218,215,216,216,214,217,216,217,215,213,214,216,216,215,214,214,212,214,215,213,215,214,212,212,214,217,216,216,215,219,217,218,221,219,220,218,220,219,218,221,220,221,219,220,221,222,224,221,222,220,220,223,223,221,220,220,221,223,225,222,222,222,220,222,224,222,221,224,222,222,222,222,226,223,221,224,222,222,226,227,224,227,226,227,226,226,228,227,228,227,229,227,229,231,230,231,231,233,233,233,232,233,232,229,232,232,233,231,230,232,232,232,234,235,231,235,236,234,237,237,236,234,237,236,235,236,233,235,237,236,237,238,236,236,236,237,238,236,237,237,237,238,239,238,238,239,240,240,240,241,241,241,241,241,241,240,240,242,242,241,243,242,242,243,242,244,243,244,244,243,244,244,246,246,246,246,246,247,247,247,247,245,247,247,247,247,248,248,248,247,247,248,246,247,247,249,249,249,249,248,249,246,247,248,247,249,247,247,247,247,246,247,247,246,246,247,248,249,249,249,247,248,249,247,247,247,248,247,247,248,249,248,247,247,247,248,248,247,248,249,247,247,247,247,248,247,248,247,248,248,249,248,248,248,249,249,247,247,248,248,249,250,248,248,249,249,248,248,248,248,251,247,238,245,251,248,250,249,248,249,249,250,247,248,247,247,248,245,245,248,250,247,245,248,252,247,246,247,247,248,246,247,247,247,247,246,247,246,247,247,245,247,248,248,244,240,240,247,248,247,247,247,246,250,242,236,252,240,224,234,203,189,214,244,250,248,201,167,153,141,109,37,187,250,245,250,236,251,252,228,178,208,247,250,250,247,242,251,249,252,245,242,205,172,241,252,252,249,250,237,190,225,241,252,252,249,240,252,252,250,239,237,249,192,212,251,251,227,142,130,137,113,110,28,95,247,247,247,246,248,252,249,247,237,240,250,252,251,243,252,252,251,250,250,252,250,245,251,252,245,248,204,215,252,252,250,250,251,252,249,250,250,252,241,244,252,252,244,252,252,252,252,174,95,78,64,58,84,77,73,81,83,92,64,45,48,43,49,46,49,52,44,46,51,57,57,80,80,59,47,50,72,69,58,53,11,139,247,247,249,249,252,252,250,251,248,238,242,246,251,245,239,245,252,232,206,229,188,144,144,162,220,194,172,221,219,239,252,248,248,247,247,247,245,246,246,245,246,245,244,244,245,245,245,244,245,243,247,245,250,223,115,89,63,7,42,194,245,249,252,247,247,229,215,197,215,230,230,234,223,236,234,146,179,228,236,231,225,241,237,241,240,238,239,236,236,237,237,236,235,236,236,236,235,235,236,234,235,233,232,231,231,229,229,229,228,228,227,226,229,225,225,225,222,224,227,224,226,224,222,221,224,223,220,222,220,222,220,219,222,219,220,219,219,219,218,216,216,217,217,220,228,225,188,139,106,92,83,110,134,135,122,105,94,80,72,61,61,53,48,36,22,101,177,204,223,221,218,214,216,214,209,212,211,211,212,212,211,211,212,211,208,206,211,210,208,208,208,208,207,209,206,206,205,206,204,200,203,203,200,198,199,198,198,192,201,114,3,1,4,8,8,8,10,10,11,11,10,10,216,220,220,219,221,216,219,218,221,219,218,219,219,219,216,215,217,215,215,217,214,216,215,214,218,216,214,218,218,214,216,216,213,214,214,215,214,212,214,214,214,210,214,214,213,214,214,217,215,215,217,218,217,215,218,217,218,221,220,223,223,221,221,221,220,221,222,224,218,222,222,219,219,221,221,217,219,221,222,219,221,222,222,221,221,221,221,221,221,222,219,224,222,220,222,223,225,224,224,225,222,223,225,226,225,224,225,224,227,226,228,229,227,228,226,227,229,229,230,230,229,232,232,230,231,232,231,231,233,233,234,230,232,232,232,234,233,234,234,234,236,234,234,235,233,234,233,234,237,235,236,235,235,236,237,236,235,238,236,235,237,235,236,238,236,236,237,238,238,240,237,240,240,240,242,241,242,241,241,241,242,242,241,241,241,242,244,244,243,243,243,245,244,243,244,244,245,247,247,246,247,247,246,247,246,247,248,248,248,247,246,248,247,246,247,248,249,248,248,248,248,247,249,247,247,247,246,248,248,248,249,247,247,248,247,248,247,247,249,248,248,248,247,248,248,248,248,247,249,249,247,247,248,248,249,249,249,248,248,248,247,248,247,249,248,247,247,247,249,249,248,248,250,249,249,249,249,248,249,249,249,249,248,249,249,248,248,249,249,249,247,250,246,237,247,250,249,248,249,249,248,249,249,249,249,248,247,247,242,246,251,249,246,244,247,249,247,245,246,247,247,247,248,247,247,247,246,247,247,247,247,246,247,247,247,247,242,240,242,246,247,247,247,247,250,241,234,252,230,201,184,187,222,238,252,252,249,206,175,173,153,111,44,190,249,247,249,236,249,252,223,167,205,246,250,251,249,245,247,250,251,239,244,227,218,251,252,250,238,249,215,211,242,244,252,252,250,243,252,252,248,238,237,252,251,252,253,253,224,145,134,132,117,111,27,95,247,247,247,247,248,252,250,247,242,242,249,252,249,246,252,252,252,252,250,252,250,243,252,252,243,250,222,227,252,252,250,250,251,249,248,250,251,252,240,248,252,249,246,252,253,196,123,75,50,28,2,9,28,33,45,39,44,98,84,46,35,32,32,30,36,33,29,37,62,76,60,48,46,53,55,68,76,62,46,49,12,106,240,242,248,248,252,252,249,251,248,239,241,245,250,247,238,244,252,246,200,227,238,187,204,215,250,232,193,244,231,240,252,249,247,247,247,248,247,246,247,246,245,245,244,245,244,244,244,244,247,244,247,244,251,241,141,95,62,6,53,210,245,250,252,247,246,240,217,199,216,230,231,233,225,250,230,133,180,232,231,227,224,242,237,239,241,237,237,236,236,235,235,236,237,236,235,234,232,232,232,230,230,231,232,232,229,227,228,226,227,228,224,224,224,224,223,224,224,224,225,224,223,223,223,223,223,221,222,221,220,222,218,217,218,221,218,219,215,215,218,217,218,214,218,223,229,210,155,112,89,77,75,90,105,105,102,95,85,73,65,60,62,60,57,47,24,95,186,221,239,228,221,217,218,217,214,215,210,209,213,213,213,212,212,212,209,211,210,208,210,209,208,210,206,208,207,205,206,204,205,200,200,201,198,197,197,197,198,192,202,114,3,1,5,8,8,8,10,10,10,11,11,10,218,222,219,217,220,218,217,220,221,221,222,222,217,216,218,216,216,214,215,215,217,216,217,216,215,217,215,216,216,214,217,217,216,217,214,216,214,213,215,214,214,214,215,213,215,215,214,214,215,217,217,216,216,216,218,218,215,219,220,219,222,220,221,221,221,221,221,223,221,221,219,221,222,219,222,220,221,219,219,221,219,220,220,223,219,222,223,221,223,219,221,224,221,221,223,221,224,222,222,224,222,223,224,222,223,224,223,225,226,227,225,226,225,226,229,226,228,229,229,231,230,229,232,232,231,232,231,232,232,232,232,232,232,235,235,234,236,234,234,235,236,235,234,236,233,235,235,235,237,236,236,236,236,237,239,237,237,237,238,237,237,239,239,236,236,239,240,239,238,237,237,239,241,241,241,241,242,242,241,240,241,242,240,240,240,241,242,241,244,242,243,244,243,244,244,244,244,245,245,245,247,246,247,248,248,247,246,247,247,247,246,245,248,248,249,248,249,249,248,249,248,248,246,247,249,249,250,247,247,247,247,247,247,248,249,249,249,247,247,247,249,249,248,248,249,249,249,248,249,249,249,250,249,249,249,249,249,249,248,247,248,247,248,249,247,247,247,247,249,248,247,248,248,248,249,248,248,248,248,250,249,249,250,249,249,247,248,248,247,249,250,250,240,241,250,249,249,248,249,249,247,248,247,247,249,249,250,243,241,245,248,249,245,246,249,248,247,244,244,247,247,247,247,244,247,247,247,247,246,247,245,245,247,247,247,247,247,241,240,243,247,247,248,247,249,236,231,243,191,196,228,225,239,237,249,251,249,207,170,164,153,115,43,192,249,248,249,236,251,252,239,193,211,247,250,250,250,245,246,250,252,244,252,235,225,251,249,226,211,249,240,240,250,247,252,252,250,241,252,252,247,236,232,252,252,252,252,252,222,140,125,130,113,109,29,99,247,247,247,247,249,252,250,248,244,246,248,249,245,248,252,252,252,250,250,252,246,248,252,252,241,243,201,190,252,252,250,250,248,251,250,251,251,247,237,250,252,247,241,253,195,91,28,20,57,26,12,27,33,42,54,51,56,81,68,51,39,31,34,36,37,34,36,55,89,84,58,44,28,35,40,66,78,61,50,59,10,93,227,240,250,250,252,252,249,252,248,242,241,244,250,249,236,241,253,253,207,206,247,210,202,224,251,249,201,237,231,214,247,246,244,244,247,246,246,247,246,246,245,244,245,246,248,245,244,242,246,244,244,243,253,251,148,102,69,6,59,212,242,250,252,242,243,231,219,201,213,227,227,225,222,253,217,114,173,231,232,226,226,243,236,241,240,239,237,236,236,235,233,235,236,235,234,233,233,232,233,230,231,229,230,229,231,232,227,227,224,225,225,222,226,222,222,225,223,224,222,223,223,224,223,219,221,219,221,222,219,222,222,220,218,215,217,219,217,216,215,217,217,222,229,236,240,198,134,101,91,85,78,91,87,93,97,92,99,94,93,89,91,86,81,86,65,117,194,219,243,243,245,233,227,224,214,213,213,213,213,211,210,209,210,211,212,212,211,208,206,208,209,206,207,207,204,203,202,204,203,203,203,200,198,198,198,198,200,193,202,113,4,1,4,8,9,8,10,10,11,11,10,11,218,223,222,220,222,220,220,219,222,220,219,219,218,216,216,218,214,215,217,216,216,216,218,217,217,217,218,217,215,217,215,216,217,218,217,215,214,215,214,212,215,213,215,215,213,215,212,214,213,215,214,215,220,215,216,215,220,219,218,222,220,222,222,222,223,223,221,223,221,222,222,222,221,220,219,220,222,218,220,221,220,218,220,221,219,219,221,220,220,222,221,223,222,222,223,221,221,222,223,223,222,224,225,224,225,221,223,224,226,224,224,227,227,228,229,230,230,232,230,229,232,232,232,231,231,234,232,233,234,231,234,235,234,234,233,237,236,235,236,237,238,235,237,237,235,236,234,235,237,237,238,237,237,236,236,238,237,237,237,237,240,238,239,239,236,238,239,238,239,240,237,239,239,240,241,239,240,241,241,240,242,242,240,241,243,243,243,243,244,245,244,244,244,244,244,244,245,246,246,247,247,247,247,248,248,247,247,247,248,247,247,248,247,246,247,247,247,247,247,248,249,249,250,248,248,249,248,249,249,249,248,247,247,248,248,249,249,249,249,248,248,247,249,249,249,249,249,249,247,249,248,248,248,248,249,247,248,247,248,248,249,249,248,248,248,248,248,248,247,248,247,248,248,247,248,248,249,249,249,249,247,250,249,249,249,247,249,249,248,249,249,249,240,244,251,248,250,249,248,248,248,248,248,249,249,250,249,244,242,248,247,247,245,247,249,247,248,244,246,247,245,248,247,247,248,246,245,247,245,245,247,245,247,245,247,248,247,246,241,240,244,248,249,247,251,237,242,247,221,236,252,236,208,225,249,250,248,200,160,152,138,97,44,198,249,247,250,239,252,252,244,190,208,246,251,251,250,249,246,249,252,243,250,212,190,243,241,242,251,252,243,252,252,245,252,252,250,242,252,252,247,232,226,250,251,251,252,252,215,127,105,109,103,101,29,107,247,247,247,246,250,252,249,247,244,243,245,248,242,252,252,252,251,249,250,252,244,249,252,252,238,235,189,168,244,249,249,249,249,251,250,251,251,246,241,252,252,234,230,219,156,98,33,11,44,39,34,49,46,44,51,44,55,87,73,55,44,30,32,37,32,28,46,77,89,71,57,44,32,34,45,77,80,61,59,55,19,145,242,242,250,250,252,252,248,252,247,244,242,244,247,251,239,237,250,252,220,192,240,220,201,217,231,240,194,214,193,171,221,240,244,244,249,250,247,248,249,245,246,247,246,246,245,244,244,243,247,243,246,241,253,245,139,101,67,5,67,218,240,250,252,245,239,232,222,203,210,224,225,229,224,251,202,95,168,231,227,227,227,241,236,239,239,236,234,233,234,233,232,235,234,232,232,231,232,230,231,230,232,230,226,227,227,229,227,224,225,225,224,226,226,222,221,222,223,223,222,222,222,220,222,223,221,220,220,222,220,221,219,218,220,218,217,217,217,217,218,223,228,235,245,245,221,167,129,117,108,111,113,118,114,107,93,85,92,93,92,87,82,77,79,75,70,72,73,119,197,225,246,246,249,226,212,214,214,215,212,210,210,211,210,208,208,210,210,206,209,208,207,207,204,204,202,205,206,201,202,203,202,202,201,200,200,198,205,199,201,113,3,1,4,7,9,8,10,10,11,11,10,12,218,223,222,221,221,219,222,222,220,219,216,219,218,220,219,217,215,217,217,217,220,214,215,214,214,218,213,214,214,214,214,215,216,214,214,216,214,214,216,214,214,214,216,213,214,214,213,215,213,214,216,214,217,216,216,218,217,218,221,219,222,220,220,220,222,224,221,221,221,220,220,220,222,220,221,218,220,221,220,220,220,220,217,218,218,220,219,218,222,219,218,220,221,221,218,220,222,218,221,223,222,221,223,224,224,224,221,223,221,223,224,226,225,226,228,227,228,229,230,231,231,230,232,232,230,232,234,232,231,233,233,232,232,233,231,233,235,232,234,234,235,234,234,233,234,237,233,236,235,234,235,234,233,234,234,236,237,235,236,236,235,235,235,236,237,236,235,237,237,238,237,237,239,237,239,239,237,238,239,239,240,240,240,242,242,242,242,242,243,240,243,244,243,244,244,245,246,246,246,245,246,247,246,248,247,247,247,247,248,247,248,248,246,247,248,247,248,246,248,249,247,247,248,249,248,248,248,248,247,247,248,247,249,249,247,247,249,249,248,249,249,249,247,249,249,249,249,247,247,248,248,247,247,248,247,248,249,249,248,248,248,247,248,248,247,248,249,249,248,248,248,249,249,248,249,248,249,248,248,249,248,248,249,249,248,248,249,248,249,249,251,246,237,247,250,247,249,248,248,248,248,248,248,249,248,249,246,241,246,247,247,245,244,249,248,248,249,244,244,247,247,247,247,247,247,247,246,245,245,246,246,247,245,246,246,246,247,249,245,242,241,246,248,248,248,237,251,252,236,250,231,190,209,236,252,252,249,209,166,152,131,91,47,199,250,250,249,235,252,252,247,191,174,238,251,251,250,247,242,249,252,243,250,191,191,247,252,249,251,251,237,246,232,232,252,252,250,240,252,252,249,231,225,252,252,252,252,252,207,120,102,112,94,98,24,101,248,248,247,246,249,252,250,246,245,246,246,242,244,252,252,252,252,249,250,251,241,251,252,252,241,245,227,178,237,249,250,250,248,251,250,249,251,241,243,252,247,217,220,239,227,190,121,91,93,83,79,84,62,46,49,47,53,89,80,54,49,35,36,35,30,36,52,89,92,61,44,34,39,44,57,83,79,60,63,50,37,191,246,242,251,250,252,251,249,252,244,245,242,241,246,250,242,236,244,252,236,193,230,237,202,219,221,222,201,197,180,168,208,238,246,243,246,246,246,247,246,247,246,246,245,244,245,242,244,242,244,242,245,242,253,240,128,94,57,4,80,224,243,250,252,241,237,228,222,204,211,223,226,227,235,250,186,80,159,228,232,223,229,240,234,236,234,233,233,233,233,234,232,230,230,230,230,229,229,229,227,228,226,226,228,224,223,226,225,226,225,225,224,222,225,223,223,224,222,222,221,222,221,220,221,218,220,222,218,217,216,219,217,216,218,216,217,216,214,219,226,233,246,246,221,185,159,143,135,126,118,122,126,127,113,110,77,59,69,49,49,50,46,51,47,45,47,42,43,20,25,106,163,199,224,209,208,214,215,212,210,212,212,210,210,209,206,208,208,206,206,205,205,205,205,206,205,204,202,203,202,201,203,200,200,201,202,203,203,197,205,113,4,1,5,9,9,9,10,10,10,11,12,12,220,222,219,220,223,221,221,219,222,221,219,219,220,219,219,219,217,217,216,217,216,216,216,214,215,214,214,213,213,214,214,216,217,214,214,214,216,215,217,216,216,215,215,215,214,216,214,217,215,217,216,215,217,216,218,218,218,216,217,221,221,221,221,220,220,220,222,223,221,221,222,219,222,222,219,219,222,222,217,221,219,217,220,221,219,217,218,219,221,219,218,220,219,218,221,221,221,220,219,221,222,221,221,222,224,222,224,225,222,222,225,227,227,226,229,228,227,230,229,232,230,229,232,230,232,232,230,233,232,234,235,234,235,235,234,234,234,235,235,235,233,233,234,234,237,237,236,236,237,237,236,236,236,237,236,239,236,236,237,237,237,235,237,237,236,238,238,238,238,238,239,239,239,239,240,240,239,239,239,239,240,240,241,239,241,241,242,244,242,244,244,244,245,245,245,245,245,246,246,245,246,246,246,246,247,247,247,248,247,247,246,247,248,245,247,247,246,247,245,247,249,248,249,248,247,249,249,249,249,249,247,247,249,248,248,249,249,249,249,249,248,248,249,249,249,249,249,249,248,249,248,249,248,247,249,248,249,248,249,248,249,249,248,250,248,248,249,249,249,248,247,249,249,248,249,248,248,248,249,249,248,248,248,248,248,249,249,249,249,249,250,242,239,249,248,249,249,247,248,248,248,248,248,247,249,250,244,243,249,249,245,243,247,249,247,247,248,247,245,247,246,247,247,246,245,245,247,245,247,246,247,247,247,246,246,247,247,248,247,243,240,242,247,249,247,235,248,239,205,199,198,212,232,249,252,246,249,208,174,169,153,96,53,209,249,249,249,234,252,250,247,190,151,222,250,252,249,247,241,247,249,231,223,201,227,251,252,252,250,250,203,198,222,239,252,252,251,241,252,252,250,229,234,252,251,251,252,252,210,131,122,126,111,99,21,113,248,248,247,245,249,252,250,246,244,248,246,235,238,252,252,252,252,250,250,247,245,251,249,252,248,249,240,181,228,249,250,250,249,252,249,251,250,238,247,252,243,238,252,252,252,252,193,151,132,81,71,102,91,66,71,69,74,65,46,54,46,41,36,33,38,46,66,82,71,53,35,38,47,46,68,81,79,61,62,39,41,207,247,239,252,250,252,250,249,251,244,247,244,239,246,249,246,231,241,252,251,210,218,247,213,195,174,180,223,205,188,214,217,234,246,242,249,247,246,247,247,246,245,246,245,246,245,244,245,243,246,241,246,241,253,232,122,97,48,3,90,226,241,250,252,242,237,230,223,205,207,224,230,230,244,253,169,68,151,226,221,221,232,238,237,237,234,232,231,232,233,232,229,230,231,229,229,225,229,226,225,227,225,226,225,226,226,225,225,224,226,225,223,223,222,222,223,223,221,221,221,220,220,220,221,218,218,221,217,219,218,218,216,214,217,216,217,215,218,226,229,241,226,177,146,116,102,94,91,85,69,74,71,63,66,94,89,65,52,39,36,31,34,33,31,31,32,41,56,62,48,39,10,21,113,174,202,214,211,214,210,211,209,209,210,208,207,210,210,205,206,207,207,208,207,208,207,204,205,206,204,202,203,205,201,202,205,203,205,199,204,113,3,1,5,8,9,9,12,10,10,11,12,12,219,222,220,221,222,219,218,220,220,219,219,221,218,217,218,218,216,217,218,214,217,214,215,216,216,222,216,213,214,215,217,215,216,217,216,216,215,215,214,216,214,213,214,212,215,214,214,217,217,217,217,217,217,217,216,218,219,220,219,217,219,219,222,222,220,221,220,221,222,220,220,222,220,216,219,219,218,219,218,217,221,222,222,218,217,219,220,221,219,217,218,219,219,220,219,220,222,224,223,223,222,222,224,220,221,223,223,225,225,225,225,227,229,229,228,228,227,227,230,229,229,229,229,231,231,229,234,232,229,234,232,234,234,233,234,234,234,233,235,233,236,235,235,233,233,236,234,235,233,234,237,235,234,236,236,237,236,236,238,235,237,238,237,239,237,237,237,238,238,237,238,237,239,238,238,239,238,240,240,239,240,240,240,241,241,241,240,243,244,243,243,244,245,246,246,245,245,245,246,245,247,247,246,248,245,246,245,245,247,245,248,247,245,247,248,249,247,246,248,247,248,247,249,249,247,247,248,248,248,247,248,248,248,248,247,249,249,247,249,249,248,249,248,249,248,248,248,247,248,247,249,248,248,249,247,248,248,247,248,248,247,248,248,249,248,248,247,248,247,247,247,247,247,248,247,248,249,247,248,248,248,248,247,247,249,248,248,249,247,250,249,240,243,249,248,248,248,248,249,249,248,247,247,248,248,247,242,245,251,249,245,242,245,248,247,245,247,246,242,246,247,247,247,246,247,245,247,247,247,247,247,247,245,247,247,247,247,248,248,250,244,239,243,249,247,229,242,213,200,236,236,240,234,236,251,244,249,209,170,169,154,100,57,206,249,249,249,236,252,248,248,205,179,239,252,252,249,250,243,248,245,245,246,223,251,252,250,226,235,239,213,240,240,246,251,250,251,240,252,252,250,224,225,252,250,251,252,252,217,139,123,137,116,110,28,109,249,249,248,247,250,252,248,244,244,248,245,232,246,252,251,251,252,250,248,245,246,251,249,252,248,245,242,187,229,250,250,250,248,250,250,251,249,238,250,252,238,245,252,252,252,252,162,100,87,43,50,92,105,72,97,163,139,90,44,70,84,44,37,50,66,82,94,79,59,45,42,39,44,45,62,81,75,60,63,33,40,208,248,240,252,249,252,251,250,250,243,247,244,239,244,248,249,233,237,252,252,223,204,247,194,144,135,153,232,221,186,229,230,228,245,240,247,248,247,248,245,247,247,246,245,244,246,245,244,242,245,242,245,241,252,225,117,97,47,3,104,230,239,250,250,239,235,224,221,205,205,219,228,230,247,234,148,62,145,226,225,225,230,237,234,236,235,232,234,232,229,231,231,229,229,229,228,229,226,224,226,227,225,227,227,223,224,227,225,224,225,225,224,224,224,223,222,221,223,222,220,223,220,220,222,219,220,218,217,220,217,218,218,217,218,215,215,215,217,225,210,168,137,102,83,73,46,24,22,36,45,41,46,42,46,73,73,63,57,34,34,34,33,35,32,33,43,68,71,67,54,42,24,11,83,159,197,214,207,210,211,209,209,210,207,208,212,209,209,208,209,207,207,208,205,204,205,207,206,206,206,205,205,205,205,204,201,202,206,202,206,112,4,1,4,8,10,9,10,10,11,11,12,12,222,222,222,219,218,218,217,220,220,219,221,219,219,219,217,215,218,217,215,215,214,215,217,216,216,214,215,217,215,216,217,215,214,217,215,214,217,214,217,216,217,216,214,215,217,215,216,217,218,220,216,219,220,219,220,219,221,222,219,219,220,217,218,222,221,222,222,220,221,217,219,218,218,218,218,219,219,221,219,220,218,218,219,218,217,218,219,218,220,219,219,219,223,221,220,220,220,223,221,222,219,220,222,222,224,221,221,221,223,225,224,224,224,226,227,225,227,229,225,229,229,228,229,229,231,229,231,231,230,230,230,229,231,231,231,234,233,234,232,233,234,235,234,233,233,234,235,235,235,233,234,235,234,234,232,236,237,236,236,235,236,234,237,237,237,238,235,237,238,237,237,236,239,237,236,237,238,239,237,239,239,239,241,240,242,243,241,242,242,243,243,243,243,244,244,244,246,245,244,245,246,246,246,246,246,246,244,244,246,247,247,247,247,245,246,249,248,249,247,247,248,249,248,247,247,248,248,248,247,248,249,250,250,248,247,247,247,247,249,249,248,248,249,248,248,248,248,249,247,248,248,248,248,248,247,249,248,248,248,248,248,248,248,247,248,248,248,248,247,250,248,249,247,247,247,247,248,247,248,248,249,249,247,250,249,249,249,248,248,250,248,239,246,252,248,249,248,248,248,247,248,247,247,248,249,246,242,247,249,248,246,245,247,246,247,248,249,247,242,246,247,247,247,246,247,247,247,247,249,247,246,247,245,246,246,247,245,246,248,248,247,243,241,247,246,240,252,243,237,252,252,220,217,242,251,248,249,208,165,156,140,83,50,202,249,249,249,236,251,246,249,228,193,243,252,250,250,249,246,247,247,252,245,236,249,249,225,219,250,251,239,252,252,244,252,251,250,239,250,252,250,221,225,251,250,252,252,252,214,131,115,119,105,103,26,124,249,249,247,245,252,252,248,246,242,252,247,244,252,252,251,251,251,252,245,246,251,252,249,252,248,242,240,181,225,249,250,250,248,252,250,250,245,237,252,249,240,252,251,253,247,153,79,68,95,61,51,84,100,79,123,196,181,131,92,159,139,67,45,69,111,118,137,104,69,51,38,45,45,35,60,79,74,56,65,35,41,213,249,243,252,249,252,250,250,249,243,248,247,238,244,247,251,238,234,247,252,235,198,239,222,142,128,166,239,221,172,227,241,226,245,244,246,250,246,248,244,244,246,245,245,244,246,245,244,244,245,241,246,241,253,214,111,93,34,1,118,234,240,250,252,243,237,229,221,205,204,215,230,231,251,236,131,60,141,228,215,222,235,236,237,234,235,234,230,231,230,229,229,229,229,227,229,225,226,228,225,229,224,224,224,222,223,225,226,224,228,224,221,225,224,223,223,221,223,221,221,222,218,219,220,217,219,217,216,217,217,218,214,215,214,214,215,212,216,213,155,101,66,23,40,48,32,36,36,44,46,46,46,43,45,64,66,63,56,37,40,36,33,37,35,43,63,74,71,65,46,48,18,49,155,193,210,212,207,212,208,210,208,209,210,208,210,210,208,208,208,208,207,207,206,205,207,207,207,207,207,206,205,207,203,206,205,202,207,200,206,113,4,1,4,9,10,8,10,10,11,12,12,12,217,221,220,220,220,218,219,220,217,218,220,218,221,219,218,218,216,216,218,215,218,217,214,217,214,215,215,214,218,214,216,217,215,217,214,214,214,213,213,217,217,216,218,217,218,218,218,217,214,218,218,218,220,220,218,216,220,220,220,221,221,220,222,218,220,222,219,220,220,220,220,220,220,221,219,219,220,218,218,217,218,217,217,218,220,218,218,219,219,220,220,220,221,222,220,221,220,219,219,220,220,221,221,220,219,222,222,222,223,224,226,224,225,225,226,226,227,227,227,227,228,229,229,229,230,228,230,229,230,232,231,232,234,233,234,232,232,232,234,233,234,234,233,234,235,236,233,235,235,233,233,233,235,237,234,235,236,235,235,234,236,234,235,237,236,236,237,236,237,237,238,239,237,237,237,240,238,237,238,238,239,240,241,241,241,241,241,242,244,243,245,244,243,244,243,244,245,244,246,244,245,246,245,247,247,247,247,247,247,245,247,247,248,249,246,247,248,248,249,248,248,247,247,247,247,247,248,247,248,249,248,248,248,248,247,247,248,248,248,248,248,247,248,249,247,249,249,247,249,248,248,248,248,247,247,248,248,248,248,248,248,247,247,247,247,248,247,247,247,247,248,248,248,248,247,247,248,248,249,247,247,248,248,248,249,248,247,249,247,251,244,237,249,249,249,249,249,249,248,248,248,249,247,249,249,241,244,248,248,247,243,248,247,246,247,245,249,249,244,244,247,247,247,246,245,245,247,246,247,246,247,247,246,247,247,247,248,247,247,247,249,249,243,244,242,245,252,240,241,241,201,212,245,251,252,242,249,202,152,155,134,84,51,198,249,249,248,238,252,247,249,219,174,214,248,251,249,249,246,250,240,252,222,176,216,239,249,249,252,250,242,252,242,237,252,251,250,241,249,252,250,227,234,252,250,251,252,252,210,120,99,108,99,103,24,117,248,248,247,245,249,252,249,244,244,252,242,235,252,252,250,250,251,250,242,247,252,252,250,252,250,235,239,164,172,246,250,251,249,249,250,250,240,241,252,243,243,252,223,234,174,92,81,81,136,121,71,54,68,45,79,170,151,116,87,157,128,48,40,39,72,100,137,118,72,50,42,48,44,47,68,78,71,57,66,32,51,219,249,248,252,248,252,249,250,249,241,247,247,239,240,244,251,240,234,245,251,246,199,230,250,192,164,176,225,244,205,223,248,224,237,244,239,248,245,246,246,244,245,245,244,245,245,246,247,245,247,242,247,242,252,207,107,89,31,0,125,236,239,250,249,240,234,222,220,207,204,212,226,235,247,229,128,58,137,230,217,222,231,237,236,235,232,229,232,229,228,230,229,228,229,226,224,226,225,225,226,224,222,224,224,224,225,226,226,224,224,224,224,223,220,222,220,220,220,218,218,220,220,217,220,217,217,218,214,216,214,217,215,212,215,214,213,215,215,214,153,92,65,21,15,25,47,66,57,55,49,47,47,41,58,79,73,63,55,41,36,32,33,35,30,50,82,80,75,56,49,27,61,180,221,216,214,214,216,212,209,210,206,207,208,209,208,208,208,207,211,208,208,209,206,208,209,211,208,208,207,206,206,206,203,202,203,204,205,198,205,113,3,1,5,8,9,9,12,10,10,12,11,11,220,221,218,217,219,218,215,217,217,217,220,216,216,219,220,217,218,217,216,217,218,218,217,218,218,217,216,216,217,215,217,216,217,215,217,218,215,217,216,216,217,217,217,218,218,217,219,218,219,219,217,218,220,218,217,217,217,219,220,220,218,221,219,218,221,219,219,220,221,222,222,222,222,222,220,220,219,219,219,218,220,220,218,219,218,220,219,216,219,216,220,220,221,219,218,222,220,221,220,221,222,221,222,224,222,222,225,224,223,222,226,227,226,227,227,226,228,229,227,229,227,229,230,230,229,230,232,229,229,233,231,230,232,232,231,232,233,233,232,231,233,233,234,235,234,232,233,232,233,234,232,235,236,235,235,233,235,236,234,235,237,236,237,236,237,238,236,239,239,236,237,238,238,237,239,237,238,239,239,239,239,240,241,241,241,241,242,243,242,244,244,244,244,244,244,245,244,244,246,246,246,246,245,246,247,247,247,247,246,248,247,245,249,248,248,249,247,247,247,248,248,249,248,248,249,249,249,247,249,249,248,249,248,248,249,249,249,248,249,250,249,249,249,248,248,248,248,249,249,248,248,248,248,247,247,248,247,248,248,248,247,247,248,247,248,248,247,247,248,248,247,247,247,248,249,249,247,248,249,249,248,248,248,248,249,247,247,247,249,251,241,241,251,249,249,247,249,249,248,249,247,247,248,248,246,241,246,248,248,244,245,248,247,246,245,245,248,249,247,245,247,248,247,246,246,246,245,245,248,246,247,248,248,248,247,249,247,248,247,248,248,249,248,244,235,233,253,214,185,199,218,236,249,250,251,243,249,209,166,169,144,85,60,202,249,249,248,240,252,249,247,217,163,210,251,250,250,249,251,246,237,235,181,194,246,252,252,252,250,241,211,232,236,239,252,251,251,240,247,252,250,229,236,252,250,251,252,252,207,129,117,113,103,99,21,122,249,249,247,245,252,252,249,246,245,251,215,190,237,249,251,251,252,250,237,249,252,250,250,251,251,240,243,152,163,245,250,252,249,248,251,249,239,243,252,227,235,217,156,218,180,170,150,60,120,109,73,46,54,51,67,127,131,119,63,133,109,47,52,27,65,81,106,113,80,51,44,41,39,57,81,77,73,59,64,32,49,222,249,249,251,248,252,249,249,248,243,247,247,240,240,244,251,245,233,243,251,251,204,213,252,213,162,172,211,241,214,215,249,205,202,232,233,249,249,248,246,243,247,245,244,244,244,244,245,243,244,243,246,245,252,200,104,87,24,1,143,236,242,250,250,241,231,224,219,211,206,212,226,229,249,224,127,56,135,234,211,224,232,231,238,232,233,230,227,229,229,227,228,227,225,226,227,226,223,225,223,225,224,223,226,224,226,226,223,222,223,224,223,224,219,220,221,219,222,218,218,217,218,221,216,214,217,214,217,217,214,216,214,215,213,214,214,211,218,216,170,132,115,77,72,77,66,77,79,67,55,49,50,49,59,72,67,62,58,44,39,34,31,34,41,73,86,86,70,60,44,40,148,217,226,217,214,217,212,212,206,207,207,206,210,209,211,210,208,209,208,210,208,208,208,208,210,210,208,211,206,203,206,206,202,202,203,200,204,201,208,113,2,1,5,8,9,8,12,10,10,11,12,12,219,222,218,217,217,218,217,217,217,219,218,217,220,216,217,217,215,217,218,215,216,218,217,218,217,217,217,217,217,214,216,216,216,218,216,217,218,217,217,217,218,216,220,219,217,218,218,219,221,220,218,220,220,220,219,220,221,220,220,218,219,218,219,218,218,220,221,222,221,221,221,220,219,221,220,221,219,217,221,219,219,220,218,217,218,221,220,218,219,217,217,218,218,219,220,218,218,220,220,221,220,221,222,220,222,223,222,222,224,224,223,222,223,224,223,226,225,225,227,224,226,225,226,226,230,230,229,231,230,231,231,232,232,229,232,232,232,232,232,232,233,232,235,234,232,233,232,232,232,230,231,232,230,232,234,232,235,237,237,237,236,236,237,236,236,237,236,237,239,239,237,239,240,240,238,239,239,238,239,239,240,241,241,241,242,242,243,243,242,243,245,243,244,245,244,245,245,243,244,245,245,247,245,246,245,245,246,247,247,245,247,247,247,248,247,246,249,249,248,248,247,247,248,248,248,247,248,248,249,248,249,250,250,249,248,248,248,249,249,249,249,247,248,250,248,248,248,248,248,248,247,248,248,247,249,249,248,249,249,249,249,248,248,248,248,249,248,248,247,248,248,248,248,247,248,247,248,247,247,248,248,248,249,249,250,250,249,248,249,249,239,244,252,247,248,249,249,249,250,250,248,249,249,250,245,242,249,249,247,244,245,249,246,248,245,245,252,252,252,252,251,248,245,246,246,245,247,246,247,248,247,247,245,247,247,247,247,247,247,247,249,249,248,248,226,229,239,190,209,240,249,220,218,241,249,244,249,211,173,177,150,101,57,195,249,249,248,242,252,249,249,235,181,208,252,252,249,249,250,250,218,209,214,243,251,251,246,244,249,208,199,241,237,243,252,252,252,240,248,252,250,230,235,252,253,253,252,252,231,160,137,134,105,95,15,113,248,248,249,249,252,252,251,250,251,251,206,203,243,248,251,251,251,246,240,252,250,251,250,251,252,239,250,191,176,248,252,252,250,248,252,246,235,248,248,207,218,207,207,252,193,186,167,86,134,117,85,53,51,47,77,163,134,98,62,147,114,51,64,56,106,96,94,101,71,51,39,42,36,56,91,73,73,59,59,32,45,214,249,249,251,246,252,249,250,248,242,247,249,244,239,241,249,246,233,240,250,252,222,205,252,241,169,165,195,225,215,199,238,199,179,215,224,245,249,249,247,244,245,245,244,244,243,244,245,245,249,250,252,252,251,208,125,101,33,7,151,238,248,251,252,234,234,226,218,211,206,214,220,232,248,223,126,53,132,231,214,226,232,236,233,232,234,230,230,229,227,229,226,226,228,225,226,225,224,223,225,223,222,226,224,223,222,223,223,222,224,219,221,222,220,220,219,221,219,220,217,217,218,214,214,214,215,214,214,217,215,213,214,212,212,213,212,212,218,232,202,168,151,116,107,88,71,71,89,87,72,65,50,57,66,69,57,54,53,39,38,43,49,58,69,85,80,65,66,50,44,11,74,190,208,221,220,210,210,208,212,207,207,211,208,210,210,208,210,209,210,209,208,211,210,208,206,209,206,206,206,205,205,204,205,205,204,202,205,200,206,112,4,1,4,8,10,8,10,10,11,11,11,11,217,219,218,217,220,215,216,219,219,218,217,216,216,217,217,214,217,215,217,216,216,218,215,217,216,215,217,215,217,217,218,217,215,216,214,215,215,218,217,215,219,219,220,219,218,220,219,221,222,221,221,222,223,220,220,222,221,221,217,221,220,220,220,220,221,219,221,221,221,219,217,219,221,221,220,222,220,218,223,218,218,219,218,221,219,219,217,218,220,217,220,219,218,221,218,221,220,219,221,220,220,221,219,222,220,221,222,222,223,222,223,223,224,224,226,225,226,226,224,227,226,225,224,228,228,227,227,228,230,230,229,228,231,231,230,232,230,230,234,233,232,231,231,231,231,232,232,233,233,233,232,234,233,232,233,232,234,234,237,236,235,237,236,236,237,237,237,237,238,239,240,239,237,239,241,239,237,237,238,239,239,240,241,241,242,243,242,242,242,243,243,244,245,243,244,244,244,244,245,244,245,245,246,246,246,247,246,247,246,247,249,249,249,247,247,248,249,249,249,248,248,248,248,248,248,248,248,248,250,250,248,248,247,247,249,248,249,248,248,248,248,247,248,248,249,249,249,249,248,249,248,247,248,248,249,249,249,248,248,249,247,249,249,247,247,247,247,248,249,248,247,247,247,249,248,248,247,248,249,249,249,248,249,249,249,248,249,249,250,245,236,245,249,248,249,249,247,249,247,248,248,248,249,248,242,246,249,248,245,244,248,248,247,248,247,248,249,245,252,245,240,247,247,246,246,245,246,246,248,245,247,247,247,249,247,247,247,247,247,247,249,249,249,250,237,239,252,212,230,250,220,203,227,247,252,243,249,210,168,167,138,88,51,189,249,249,247,242,250,246,249,243,180,192,246,250,249,247,252,249,245,245,229,250,249,249,198,187,242,237,241,252,247,247,251,250,251,240,246,252,250,239,244,253,253,253,253,253,230,153,127,122,108,92,16,115,248,248,250,250,252,252,252,252,252,252,239,244,252,252,250,250,252,244,244,252,248,250,251,250,253,244,250,188,161,245,252,252,250,246,252,243,236,252,242,229,251,232,250,253,191,250,186,91,127,79,67,43,57,60,81,178,148,70,31,152,122,57,61,46,113,101,69,56,44,48,46,39,36,61,93,81,71,55,64,30,41,211,249,249,250,244,252,248,250,248,243,247,249,247,239,243,247,249,236,237,249,252,234,202,244,252,206,172,199,205,214,220,229,202,186,211,217,239,250,247,247,245,245,246,245,246,244,246,252,252,253,253,252,252,251,227,139,103,31,10,172,241,250,250,253,253,240,229,218,212,205,206,223,227,249,222,127,52,131,229,206,229,236,231,236,230,235,229,229,230,229,227,229,228,226,226,225,225,222,226,222,221,223,222,224,221,225,224,222,224,222,220,219,220,219,219,219,218,218,216,219,215,215,217,216,215,216,214,215,215,212,214,214,214,212,213,213,212,218,229,210,163,129,107,99,80,53,61,80,83,81,107,118,113,102,71,62,60,48,42,75,112,111,104,106,107,76,64,60,48,29,24,120,179,197,218,214,214,208,210,210,208,212,210,209,210,213,212,210,212,212,211,209,210,212,207,207,210,206,208,207,206,205,206,204,204,205,202,206,200,204,113,4,1,4,8,10,8,10,9,11,11,11,11,220,220,218,220,218,217,217,217,218,216,218,217,215,218,219,218,217,217,217,217,217,218,218,216,217,215,214,217,216,216,217,217,217,215,215,215,215,215,215,214,218,217,218,220,218,219,219,219,220,220,219,219,220,220,219,219,218,216,219,218,217,218,219,217,219,218,220,219,220,219,219,222,221,221,219,223,220,220,221,217,220,220,218,219,217,218,217,217,220,219,220,220,218,220,220,220,221,221,217,220,220,220,222,219,222,221,220,221,221,222,222,221,223,224,223,226,223,226,229,225,228,228,228,227,228,229,226,230,230,227,230,230,230,231,232,230,232,232,230,232,230,229,231,228,231,235,232,231,233,232,232,234,233,234,236,231,235,235,233,236,234,235,237,235,234,237,236,237,238,238,237,239,240,239,238,238,237,236,236,238,239,239,241,241,242,242,243,243,244,243,244,244,244,245,244,244,242,243,245,245,244,245,245,245,245,245,247,247,247,245,248,249,248,249,248,247,248,247,247,248,247,248,247,247,247,248,249,249,248,248,249,248,247,249,249,249,248,248,249,247,248,249,248,249,248,249,247,248,249,248,248,248,249,248,248,247,248,248,248,249,248,248,247,248,248,248,248,249,247,247,247,247,247,248,249,248,248,247,248,249,248,247,248,249,250,248,249,249,249,243,239,249,249,248,248,248,249,247,248,247,248,249,247,244,242,247,249,247,244,246,248,248,247,248,248,246,239,186,172,205,234,249,247,248,245,245,249,248,247,247,246,247,249,249,248,247,247,247,247,247,247,248,250,251,238,252,245,195,207,204,226,231,249,252,249,242,249,203,153,152,124,83,42,186,249,249,247,240,247,246,249,242,190,184,239,250,250,250,252,252,241,230,208,217,239,239,224,239,252,242,252,252,245,248,248,249,251,240,249,252,250,246,237,251,245,243,251,232,151,103,109,122,126,123,57,144,247,247,250,250,253,253,252,252,252,252,244,253,252,252,250,250,250,240,247,252,248,249,249,248,252,237,248,194,144,238,252,252,250,246,252,237,241,252,236,243,252,249,252,247,211,253,202,103,78,32,32,27,60,48,69,194,154,68,39,148,105,47,64,38,109,101,69,54,39,48,45,39,32,44,75,72,73,57,64,36,42,211,249,249,249,243,252,248,250,246,244,249,248,250,241,242,245,249,241,234,245,252,248,200,221,250,235,184,166,158,201,230,222,211,211,213,214,236,248,248,248,247,244,243,244,244,247,252,252,252,253,253,252,252,248,181,99,69,13,34,186,235,212,214,234,234,232,226,215,210,208,208,219,229,247,212,126,51,134,227,208,230,232,234,234,233,231,227,227,225,227,227,224,225,226,224,223,222,224,222,222,224,224,220,220,222,223,225,222,222,222,218,221,220,218,217,217,217,216,216,214,215,214,213,214,214,215,212,212,213,209,211,211,212,214,212,210,210,213,212,166,122,105,92,105,97,88,70,78,71,96,148,141,145,123,109,95,71,49,54,125,150,139,124,123,122,108,83,60,39,31,135,220,221,210,210,213,211,210,208,209,210,211,211,212,211,209,211,212,210,210,208,208,207,208,208,207,209,207,208,208,206,207,205,206,203,204,204,204,199,205,113,2,1,5,8,8,8,12,10,10,11,10,10,217,222,217,219,219,215,219,219,219,218,216,219,220,215,216,214,217,215,215,217,215,217,217,217,218,217,218,218,220,216,215,219,217,218,218,218,217,217,216,216,218,215,219,218,217,221,220,221,219,220,219,217,218,218,220,217,218,218,216,219,218,218,217,217,219,218,220,218,218,220,219,221,221,220,218,220,218,217,221,215,220,220,216,219,216,219,218,220,221,218,220,219,221,219,219,221,220,217,221,220,217,220,220,221,219,221,221,222,220,222,222,223,222,222,224,222,224,223,224,226,226,229,227,228,227,227,228,227,229,230,230,229,231,228,229,230,229,230,229,230,231,232,232,229,230,232,229,229,230,230,232,233,232,232,233,232,232,231,232,234,233,233,234,236,237,237,237,239,239,238,238,239,238,237,237,236,237,237,238,237,239,239,241,240,240,242,241,243,243,243,244,243,243,242,245,244,242,244,244,244,246,245,246,245,246,246,245,247,247,247,246,247,246,245,246,247,249,249,248,247,249,248,248,248,248,248,248,250,249,249,249,249,248,249,249,249,249,249,249,248,248,249,248,248,248,248,249,248,250,248,247,248,249,249,249,249,248,248,248,248,248,248,247,247,248,248,248,248,247,249,248,248,247,248,247,249,248,249,248,247,247,247,250,248,247,248,248,248,249,239,242,250,247,248,248,248,249,248,247,248,249,249,249,243,243,248,248,247,244,248,249,248,247,247,248,249,232,169,185,229,240,249,246,246,246,247,248,247,247,246,247,247,248,249,247,248,248,247,247,247,249,249,248,249,234,252,225,154,209,247,253,241,237,249,247,243,249,204,164,162,136,82,48,202,250,250,247,238,248,247,249,242,188,196,244,250,250,247,252,250,238,184,147,210,252,252,245,252,252,237,217,234,241,250,250,245,252,242,246,252,250,250,205,134,82,80,91,99,107,118,135,143,155,165,128,150,198,190,189,188,196,192,177,170,183,157,201,250,251,251,251,251,249,240,249,251,249,251,251,248,252,237,239,222,165,227,252,251,250,248,252,233,245,251,231,249,252,249,254,206,200,253,127,39,59,27,30,29,61,47,63,188,164,81,35,130,107,61,64,39,112,101,71,55,41,48,50,37,34,34,49,71,75,60,66,32,46,215,249,249,249,242,252,247,250,248,244,249,247,250,245,245,248,251,244,233,244,252,251,203,196,249,239,146,117,139,190,240,201,182,232,228,216,244,248,250,248,246,247,244,245,243,251,251,252,195,126,104,87,92,99,79,66,63,38,39,85,132,142,144,197,222,220,222,211,213,212,207,220,225,249,229,129,50,139,226,202,231,234,233,237,229,233,228,228,227,226,226,228,226,226,225,225,225,221,223,221,222,223,222,225,222,222,221,222,223,221,219,222,220,217,219,217,215,215,215,215,214,215,216,214,213,216,214,213,214,211,211,212,211,210,214,209,213,211,215,182,127,94,84,110,112,96,74,63,47,83,137,132,128,128,117,94,64,49,55,113,131,116,102,105,118,102,82,68,34,40,159,223,222,223,215,213,212,212,210,208,209,212,211,212,213,212,213,211,212,213,212,212,210,212,210,206,209,208,208,209,207,206,206,205,206,205,204,206,201,206,113,2,1,5,8,9,8,12,9,11,12,10,10,215,220,218,218,220,215,217,217,219,217,216,218,214,215,214,214,214,213,214,217,215,215,219,218,217,215,218,218,217,217,216,215,215,217,217,217,216,217,217,217,218,217,217,218,218,220,220,221,217,219,222,217,220,219,215,217,219,218,217,218,219,219,218,219,218,215,220,218,220,218,219,220,218,220,219,221,217,220,220,217,220,218,216,216,217,219,220,220,218,219,221,219,218,220,218,216,217,219,218,220,218,219,222,218,221,221,220,221,219,221,222,221,222,221,222,223,223,225,224,222,227,226,225,228,229,227,224,229,227,226,229,228,229,229,229,229,230,229,228,229,229,230,232,230,230,229,229,232,230,233,233,232,231,231,233,232,232,233,233,234,234,235,236,235,236,237,237,237,238,238,237,239,239,238,239,237,237,237,239,239,239,239,238,239,240,240,241,241,242,241,242,242,243,243,243,242,242,244,244,244,244,244,244,245,245,245,245,246,246,245,246,247,246,247,246,245,247,247,247,247,247,247,248,247,248,249,248,248,249,249,249,249,248,247,247,248,247,248,248,248,247,247,248,249,248,248,248,248,249,248,248,249,248,248,248,248,249,248,248,247,247,249,247,247,248,248,249,247,247,248,247,248,247,249,247,248,249,249,248,247,248,248,248,247,247,247,247,249,246,237,244,249,248,249,247,247,247,247,248,248,248,248,246,241,247,248,248,244,245,249,247,247,245,246,247,248,250,243,252,251,247,247,245,247,247,247,247,246,245,246,248,248,247,247,247,247,247,248,247,247,246,249,249,246,230,252,238,205,230,251,246,191,224,249,247,245,249,209,171,173,145,95,65,219,251,251,248,238,246,246,249,249,207,190,240,250,249,250,252,252,227,179,206,248,251,251,238,252,247,174,182,232,235,252,248,243,252,242,244,252,249,251,178,95,66,57,83,84,105,131,135,141,139,150,129,132,113,88,88,94,100,87,86,77,63,53,173,242,250,250,251,251,248,244,252,250,250,249,250,248,252,244,247,225,156,218,252,252,250,250,252,230,249,246,243,252,251,251,199,113,183,186,99,69,61,65,36,24,79,47,62,197,162,65,36,131,98,66,70,39,108,93,58,48,42,49,49,36,34,33,45,72,79,57,67,34,45,214,249,249,249,240,252,247,250,248,243,249,248,251,246,241,248,251,249,232,240,248,250,216,184,238,238,162,125,170,217,244,202,180,244,243,218,231,244,251,249,247,247,245,244,248,251,251,165,84,55,23,6,6,9,11,36,56,49,59,46,22,69,56,140,200,206,215,201,207,210,211,220,229,248,234,136,53,145,224,206,234,232,235,232,229,232,226,229,227,224,227,227,227,227,225,225,222,222,221,219,221,222,226,225,224,225,222,220,222,222,219,222,220,217,218,215,217,217,216,215,218,218,217,216,215,214,212,213,214,213,214,211,214,212,210,211,211,214,229,217,167,123,86,105,118,103,69,56,42,70,118,119,131,125,112,81,61,41,47,112,111,104,90,99,107,94,78,59,33,29,139,198,208,224,213,214,210,210,213,213,211,213,211,211,215,213,212,212,211,214,211,211,210,212,212,211,212,210,211,209,208,208,205,205,205,203,205,207,202,207,113,3,1,4,8,10,9,10,9,11,11,11,11,219,221,219,219,221,217,218,218,217,220,215,217,216,214,217,217,217,214,217,217,216,218,214,215,219,218,219,215,218,217,218,218,217,217,217,217,217,218,218,217,217,218,220,217,219,221,219,220,219,220,218,220,219,217,217,216,220,220,218,220,219,219,220,220,220,220,220,220,222,220,222,222,223,223,221,221,219,221,220,217,220,217,219,222,218,222,219,219,219,219,222,219,222,222,221,221,222,218,220,220,219,222,221,222,221,222,223,223,220,222,221,221,223,223,224,221,224,224,224,226,227,228,227,229,229,227,229,229,229,229,227,229,231,229,231,231,231,230,230,230,229,230,232,232,232,233,233,232,234,232,232,234,232,234,234,234,235,233,234,235,232,234,235,234,237,236,235,235,237,237,237,240,237,239,241,239,240,239,240,240,240,241,242,241,241,243,241,243,241,242,242,241,243,241,242,242,241,243,244,244,245,245,245,245,245,246,246,247,247,247,245,246,245,246,249,249,248,248,247,247,248,247,248,248,249,248,248,247,248,249,249,249,248,249,248,249,248,248,249,249,247,249,249,249,248,249,249,248,249,248,249,249,249,248,248,248,247,249,249,248,248,249,248,248,248,247,249,249,247,248,248,248,249,249,250,249,247,249,248,248,249,249,249,248,248,248,248,250,245,237,248,250,248,249,247,247,249,248,247,247,248,249,244,242,248,249,248,245,246,248,247,248,246,246,247,250,252,252,252,252,241,244,245,249,247,247,247,244,245,245,247,246,247,248,247,247,247,247,247,246,247,248,250,245,237,252,243,211,224,202,213,219,241,252,246,244,249,207,172,162,138,80,61,218,251,251,248,238,248,244,250,247,205,188,220,249,250,250,252,246,243,204,225,249,248,243,181,208,239,220,224,242,244,252,251,242,251,244,243,251,251,251,180,102,93,84,85,84,80,84,80,81,78,77,67,79,80,71,64,84,94,71,81,75,66,47,149,241,251,251,250,250,246,247,251,249,252,249,250,247,252,250,247,238,157,196,250,250,249,249,248,231,252,237,248,252,235,227,145,115,210,237,160,81,66,84,64,57,95,65,59,168,151,75,48,131,99,56,66,71,127,86,55,45,42,51,48,35,36,26,43,75,70,56,67,34,47,211,249,249,249,242,249,249,250,248,245,250,249,250,248,241,245,249,252,236,239,248,249,233,184,220,251,221,166,203,222,246,245,211,252,243,179,202,233,248,252,246,248,244,245,252,253,197,106,65,52,59,48,28,24,39,54,61,63,63,55,53,33,2,115,196,200,211,192,202,206,204,219,225,251,249,143,58,152,222,203,233,232,235,234,228,234,224,226,229,224,226,227,224,225,224,223,223,223,224,222,223,223,225,225,223,225,223,221,221,222,220,220,219,218,221,217,216,217,215,216,216,215,215,214,215,214,213,212,214,212,210,213,209,208,210,210,216,213,229,206,161,129,101,116,109,92,69,53,35,80,121,107,115,96,86,78,62,45,52,108,105,104,107,91,82,69,60,63,23,58,174,193,207,218,211,213,209,211,213,212,212,216,213,213,214,213,214,211,212,213,210,213,212,213,211,208,214,210,209,210,207,208,206,205,203,204,205,208,205,206,112,4,1,4,7,10,8,10,10,10,11,11,11,216,220,219,217,221,218,216,217,217,216,214,217,216,217,217,215,217,217,216,217,214,215,217,214,215,217,218,215,216,215,214,217,217,214,214,213,215,217,214,218,217,215,218,217,217,218,219,219,219,217,220,217,218,218,218,221,217,221,219,217,220,221,220,219,219,217,221,219,218,220,222,222,219,220,219,220,219,222,220,218,222,222,220,221,220,220,220,220,219,218,220,222,222,222,224,222,220,220,220,221,219,220,222,222,223,220,220,223,219,222,221,222,222,221,224,221,223,225,225,225,225,225,226,227,227,227,227,230,226,228,230,226,228,227,227,227,229,229,230,229,231,230,229,228,229,232,230,233,230,231,231,230,231,232,230,232,233,235,235,232,233,234,232,234,234,232,235,235,236,235,235,236,239,237,236,238,237,237,239,238,239,240,241,241,240,241,243,242,241,243,242,243,242,241,243,242,242,244,243,244,245,244,244,245,245,245,245,246,245,243,246,247,246,247,248,246,248,249,248,247,247,248,248,249,248,248,248,248,248,248,248,249,250,248,250,248,248,248,248,248,247,248,248,247,248,247,248,248,247,248,247,249,248,248,248,247,248,247,248,248,248,248,247,248,249,248,248,247,248,249,248,249,247,247,247,247,247,247,247,247,248,248,248,249,249,250,249,249,240,240,250,249,248,248,249,249,247,248,247,246,249,249,242,245,250,247,245,242,247,247,245,247,246,245,246,247,247,247,249,242,239,242,243,246,246,246,245,245,245,246,245,245,245,248,247,246,247,248,247,247,247,247,249,243,237,252,219,179,199,214,232,223,252,252,246,243,249,201,152,145,122,75,56,215,251,251,248,236,245,242,249,246,204,177,218,249,250,250,252,248,248,198,198,235,239,243,208,230,253,223,244,252,241,252,251,243,251,245,243,249,251,251,176,114,102,73,57,47,56,51,45,47,41,48,42,45,41,48,37,60,77,45,55,50,52,45,38,135,227,246,248,246,244,248,248,250,251,248,249,249,252,252,238,230,162,191,251,251,250,250,240,233,247,209,228,171,141,193,184,212,253,253,181,109,73,94,62,55,98,60,72,168,148,60,39,113,77,57,66,77,125,78,48,42,47,57,44,42,42,24,42,72,72,56,61,27,53,213,249,249,248,242,249,248,251,248,244,250,248,250,250,242,244,247,251,239,236,245,250,242,186,209,252,242,174,198,224,233,247,215,244,234,167,180,214,243,250,246,244,243,243,252,250,152,85,67,68,79,79,62,57,61,59,63,61,60,61,68,40,20,151,215,210,218,190,197,195,203,221,226,252,244,151,63,150,213,207,234,228,233,231,230,230,225,228,226,225,226,222,223,225,220,221,221,222,221,222,224,222,223,223,221,222,219,219,220,219,218,218,217,218,218,215,216,215,213,214,213,211,213,213,213,213,211,212,209,210,212,211,213,208,211,209,210,212,225,202,147,116,77,73,61,53,55,55,42,81,131,122,110,77,62,50,53,39,51,101,103,113,107,85,66,55,50,49,17,73,184,200,214,217,210,213,210,210,212,212,211,211,211,215,214,212,212,213,211,212,212,212,211,211,211,210,208,208,210,206,206,207,205,205,204,204,202,205,201,208,113,2,1,5,8,8,8,12,10,11,12,12,10,215,218,218,215,219,216,215,216,215,216,217,215,214,215,213,213,215,212,214,215,215,220,216,217,216,215,217,214,219,217,216,215,214,217,217,216,214,214,215,217,217,217,219,217,217,217,218,218,217,219,217,220,219,219,222,220,220,219,219,221,220,217,220,220,220,220,219,220,219,219,220,219,222,220,221,221,220,221,218,219,221,220,221,221,221,223,221,222,225,225,223,222,221,221,221,222,224,219,222,222,222,221,220,222,220,220,218,221,221,220,222,222,218,220,222,222,225,224,224,225,225,224,226,227,225,226,226,225,226,229,229,229,228,226,227,229,230,229,230,228,231,231,229,229,229,229,230,230,231,231,231,234,234,233,233,231,232,233,232,234,234,232,233,236,235,232,232,233,235,234,235,237,236,237,239,237,237,237,237,239,240,240,241,240,240,241,240,241,241,242,243,242,242,243,244,244,244,243,244,244,246,246,244,245,245,246,245,245,246,245,246,247,248,248,246,247,248,249,247,247,248,247,249,249,248,247,248,248,248,248,247,249,248,248,249,249,250,251,249,248,248,249,248,248,248,247,247,247,249,248,249,248,248,248,248,249,248,249,247,247,249,249,248,248,248,247,248,247,247,247,247,248,248,247,248,247,247,249,249,247,247,247,248,247,248,247,248,248,238,242,249,248,249,249,249,250,248,248,247,247,249,244,242,247,249,248,245,245,247,247,247,247,245,246,248,246,247,245,247,247,242,242,242,245,245,244,247,245,247,247,245,247,246,248,248,247,247,247,248,247,248,248,249,238,230,247,189,201,244,243,240,207,235,251,244,244,249,194,160,154,131,72,72,232,252,252,248,236,246,242,250,249,213,191,221,249,250,249,252,245,238,164,177,242,251,250,222,248,252,231,245,252,242,252,251,241,250,246,241,248,251,251,174,120,105,71,55,56,60,62,60,61,53,57,53,53,49,55,36,61,85,43,48,52,68,22,31,155,240,248,249,244,245,249,249,250,249,247,250,249,252,251,223,203,149,188,250,250,249,249,235,240,237,190,208,187,206,245,214,246,252,179,160,157,118,113,73,45,94,79,65,151,161,89,42,105,96,78,64,61,105,62,51,50,48,57,44,57,53,26,49,74,70,57,61,26,51,217,249,249,248,246,246,247,252,247,246,249,248,248,250,243,243,247,251,245,234,242,250,250,196,194,246,245,171,186,226,227,237,204,225,228,188,189,201,238,252,248,246,244,245,252,229,123,92,85,88,91,87,71,46,44,49,53,50,55,46,55,39,35,187,234,225,231,195,200,202,208,212,201,240,251,156,68,151,205,202,235,226,231,229,225,233,227,226,226,223,225,225,224,225,223,222,220,222,221,221,223,223,223,222,222,221,217,218,220,217,216,216,217,216,214,215,217,217,214,215,214,212,213,209,214,212,210,212,210,212,211,213,211,210,214,210,212,209,224,199,137,103,69,71,57,44,49,45,41,101,143,131,120,87,68,46,44,39,67,108,101,112,99,78,73,55,46,39,13,103,193,200,217,215,211,212,211,213,213,212,212,212,214,214,213,212,213,212,211,214,210,212,210,212,212,209,211,207,208,208,206,207,206,204,204,207,206,207,203,207,112,3,1,5,8,9,8,10,10,10,11,12,12,216,216,216,214,214,214,213,217,215,213,214,215,214,214,216,214,214,214,215,214,214,216,217,215,216,219,216,217,220,217,219,218,218,215,218,216,215,217,216,220,218,217,219,216,219,218,219,220,219,217,217,219,220,220,220,221,218,222,222,219,221,221,221,221,221,221,220,222,221,220,222,222,219,220,221,219,220,221,222,222,218,220,220,223,222,221,224,222,222,223,223,222,222,221,223,221,221,224,221,222,220,221,219,217,221,219,220,222,219,224,222,221,223,222,223,223,227,224,225,227,227,224,224,227,225,226,229,229,228,228,227,229,229,229,229,229,229,228,229,228,229,229,229,230,231,231,231,232,231,231,232,233,231,233,233,230,233,233,232,232,235,234,232,232,232,233,235,236,236,236,236,237,236,237,237,237,237,237,237,239,242,241,241,241,241,241,241,242,242,243,241,242,242,242,245,243,242,243,245,244,244,245,245,245,245,246,245,245,245,245,245,247,246,247,247,246,249,249,249,248,247,246,247,248,248,248,248,248,248,248,249,248,247,248,248,248,249,248,248,247,249,248,248,247,247,248,248,248,247,247,249,249,249,248,248,249,248,249,248,248,248,247,248,247,248,247,247,247,247,247,247,247,247,247,248,247,249,249,248,248,249,248,248,249,248,247,249,247,237,246,249,247,249,249,249,247,247,247,248,249,249,241,243,248,248,246,244,248,249,247,247,247,246,247,247,247,246,245,247,247,244,242,243,242,243,245,246,247,247,246,246,246,247,247,247,247,247,247,247,245,249,249,251,235,236,252,218,231,253,253,221,188,226,245,245,239,250,198,169,169,144,84,84,240,252,252,249,236,245,242,250,251,228,184,197,246,251,251,251,236,214,189,215,249,250,248,234,252,253,225,232,244,240,252,251,245,249,246,241,246,251,251,165,125,107,72,50,57,66,70,76,64,64,68,61,61,57,63,53,87,92,70,95,79,75,25,114,241,252,252,248,241,248,249,247,248,247,247,248,249,250,252,214,218,155,141,244,249,249,249,231,246,236,241,252,241,252,252,242,220,141,106,178,242,180,136,107,89,108,66,61,164,181,101,80,129,115,115,72,70,107,63,65,61,51,55,41,57,57,39,59,72,70,55,62,28,47,208,249,249,248,247,246,244,250,247,247,249,249,249,250,245,242,246,252,247,232,241,252,252,206,188,232,248,191,171,229,214,195,207,218,223,226,212,193,230,246,249,246,252,252,252,230,116,94,83,87,108,94,59,39,35,37,41,41,49,35,52,31,35,192,234,234,234,217,222,222,212,228,222,250,231,126,73,167,204,211,232,225,234,227,227,227,224,227,224,223,225,224,223,224,222,224,221,221,221,222,224,223,222,220,221,221,220,219,218,219,218,217,216,218,216,216,216,214,215,214,215,212,211,212,213,214,212,211,212,212,209,208,212,206,211,210,212,211,220,192,132,94,66,80,73,56,52,56,50,102,152,131,122,95,66,51,47,38,93,115,100,111,93,76,69,53,50,18,42,161,209,209,215,210,211,211,210,208,211,211,213,215,212,213,214,214,213,211,211,211,210,210,211,210,212,211,211,210,209,208,208,207,205,204,206,206,205,208,203,207,112,3,1,4,8,9,8,10,10,11,11,10,11,218,220,218,217,218,216,214,215,216,214,216,215,214,214,214,215,217,215,217,218,215,216,214,217,215,216,219,217,216,217,219,216,218,216,217,217,216,219,217,217,218,217,220,220,220,221,218,220,220,221,217,218,219,219,221,219,219,219,218,222,222,220,223,220,219,221,220,221,222,223,223,220,222,220,220,222,220,223,222,221,224,222,221,222,221,221,222,221,220,219,222,223,221,222,223,222,222,222,223,221,222,222,220,222,221,220,220,221,222,219,220,221,224,226,223,224,226,226,225,226,227,225,229,228,227,228,227,229,227,228,229,228,228,227,227,227,225,226,229,229,229,228,229,229,231,231,231,230,229,231,232,233,232,232,234,232,233,234,232,234,234,234,234,234,237,237,237,237,237,237,235,236,236,236,235,234,236,236,238,239,239,239,240,240,241,240,240,241,240,241,242,242,242,242,242,243,243,242,244,243,245,245,244,246,246,245,245,246,246,246,247,247,248,247,247,248,247,247,247,247,248,247,248,247,247,248,248,248,248,248,248,249,248,248,248,248,247,248,247,248,248,247,248,249,248,248,249,248,248,249,249,247,248,248,249,248,247,248,248,248,248,248,247,247,248,248,249,248,247,248,248,248,248,248,247,247,247,249,247,247,248,247,249,247,247,247,249,243,238,247,249,248,249,247,247,250,247,248,247,249,247,240,247,248,247,245,244,248,247,247,247,247,246,246,246,247,247,247,247,247,245,244,245,244,244,243,244,244,246,247,247,247,246,246,245,245,247,247,245,247,248,250,252,236,246,252,210,233,227,206,233,213,233,241,232,237,250,197,167,154,130,77,83,233,251,251,249,238,244,240,250,252,222,190,196,243,250,252,250,228,221,195,217,250,251,246,245,252,231,181,189,235,238,252,251,243,246,246,236,243,250,240,165,136,116,88,57,60,66,50,53,59,54,57,58,53,50,60,55,80,80,57,73,78,69,26,161,241,249,249,244,241,251,248,249,249,248,250,248,249,250,252,232,241,165,125,227,246,249,249,226,241,235,252,252,252,252,252,229,196,113,81,195,246,231,177,121,94,110,85,64,153,173,112,81,96,114,122,81,91,131,113,100,67,53,54,34,52,54,33,57,74,74,55,61,32,39,200,249,249,248,248,247,242,251,245,247,249,248,248,249,248,241,245,250,251,232,236,249,252,222,184,222,251,207,157,181,150,158,227,225,196,234,240,197,226,248,247,252,252,253,253,191,101,81,73,91,104,96,57,37,39,34,37,40,46,39,46,26,37,197,237,239,248,234,245,239,243,248,246,247,210,96,80,184,204,205,234,226,234,228,227,229,223,226,224,224,225,223,224,224,222,224,223,221,222,222,221,222,222,221,218,221,218,220,219,214,218,216,216,217,214,213,214,214,212,215,214,214,214,209,214,212,210,213,212,212,208,211,209,207,209,207,211,208,218,198,139,93,59,78,79,61,57,69,62,123,152,128,122,88,66,50,42,69,117,121,110,116,101,81,56,47,30,27,118,214,223,218,216,214,213,212,210,207,210,212,213,214,214,213,213,214,215,213,210,210,210,211,212,211,208,211,212,212,211,209,208,207,206,208,207,206,205,208,207,207,111,3,1,4,8,9,8,10,10,11,11,10,12,216,221,218,216,217,216,217,216,215,217,214,214,217,214,215,213,215,217,214,215,214,216,218,214,216,217,215,215,217,215,215,214,215,216,216,218,216,216,217,217,217,218,220,218,221,218,218,219,219,222,219,219,220,219,220,218,217,220,220,219,222,221,221,220,220,221,220,221,219,220,222,220,220,222,221,220,222,220,222,221,219,221,218,219,218,218,219,220,220,218,220,219,220,219,219,220,220,221,221,220,222,220,222,220,220,220,218,221,220,222,222,222,224,222,224,223,224,225,226,227,226,225,226,228,227,228,228,225,228,229,228,228,226,227,225,226,229,225,226,224,226,228,229,229,229,228,230,230,229,229,230,232,231,234,233,231,232,233,232,234,236,234,235,235,235,235,235,235,235,235,236,237,236,237,237,237,237,237,238,238,239,239,239,239,242,241,240,241,241,242,241,243,243,242,243,242,244,242,243,244,243,245,244,245,245,246,246,245,247,246,248,249,246,247,247,246,247,247,247,247,247,247,247,247,246,247,247,247,247,247,247,247,247,248,248,247,248,248,249,248,248,247,247,248,247,249,248,248,249,248,247,247,247,247,248,248,248,247,246,248,248,247,247,248,248,247,248,247,247,247,247,248,248,248,248,247,247,247,247,248,247,249,248,247,247,248,249,239,240,249,248,248,247,249,248,247,249,247,248,249,243,241,248,249,247,244,246,248,246,247,246,245,247,247,246,246,246,246,247,247,247,244,246,246,246,245,244,246,247,247,247,247,247,247,246,245,247,247,247,247,247,247,250,227,245,241,167,196,213,237,253,236,229,218,222,233,249,186,147,142,124,69,83,234,251,251,249,237,243,239,248,251,225,196,205,247,251,252,250,221,211,201,229,251,252,231,209,219,186,177,218,242,241,251,252,247,246,243,237,248,250,237,152,129,124,97,69,69,67,57,51,44,51,51,47,49,46,52,44,68,80,51,54,62,51,35,162,239,250,250,243,244,252,248,247,247,248,249,249,250,249,252,236,252,198,132,233,247,249,247,225,237,239,252,253,253,232,184,164,182,163,142,213,249,218,124,83,66,85,77,67,151,149,74,78,84,100,116,79,113,158,133,98,51,54,61,46,63,50,34,57,70,73,56,64,27,49,217,250,250,248,248,249,239,248,243,247,249,247,246,248,249,240,244,248,251,235,234,245,252,231,181,212,244,223,121,117,181,195,244,222,165,211,244,213,223,245,246,252,252,224,165,113,77,88,76,85,101,87,56,39,36,34,40,45,42,36,46,24,46,208,234,234,234,222,231,234,234,252,252,245,205,92,97,194,203,216,229,227,233,228,225,227,225,227,227,226,225,224,225,225,224,222,225,225,222,222,222,221,219,221,222,220,216,218,218,216,217,214,216,216,215,214,212,213,214,213,212,210,213,211,208,211,213,209,207,210,208,209,210,206,211,207,207,208,217,210,160,107,72,89,85,69,66,82,70,120,168,136,120,88,55,57,45,83,150,122,119,117,90,77,58,39,22,98,203,229,221,215,215,214,212,210,210,211,211,213,214,212,214,214,214,212,213,213,212,211,213,212,209,211,213,212,211,212,212,209,209,208,208,205,205,208,205,208,203,208,113,2,1,5,8,8,8,10,10,10,12,10,10,214,218,216,216,216,214,216,218,217,214,217,216,216,216,216,214,214,214,213,213,214,216,214,217,215,215,214,214,216,214,216,214,217,217,217,218,214,216,216,218,218,218,220,217,217,220,219,218,218,220,220,219,219,217,221,219,217,219,220,220,218,220,222,219,223,223,222,221,218,222,219,218,220,220,219,219,219,219,219,217,219,219,217,218,220,218,220,219,220,220,218,220,221,221,220,221,220,218,220,221,222,221,218,220,220,220,220,222,222,220,222,222,222,224,221,223,223,223,225,226,227,226,227,226,225,226,227,228,226,226,228,229,230,229,229,227,227,229,227,228,227,229,229,228,229,229,230,230,232,232,231,231,232,232,234,234,235,234,234,233,233,235,235,236,236,234,235,235,237,236,237,238,236,240,238,238,240,239,239,239,240,240,238,239,241,240,242,243,240,241,242,242,241,243,243,244,242,242,244,244,245,245,245,246,246,247,245,247,246,246,246,245,247,246,248,248,248,248,246,247,247,247,247,246,247,247,248,247,247,247,248,248,247,248,248,248,249,249,248,248,247,248,249,249,249,249,249,248,248,249,249,249,249,248,248,248,247,248,248,248,248,247,248,248,250,249,247,247,248,248,248,247,246,247,247,248,248,248,247,247,248,247,248,248,247,250,249,238,245,250,247,248,247,247,248,248,249,249,249,248,240,244,249,248,246,244,247,248,247,247,246,246,247,248,247,247,245,246,247,248,247,243,246,248,247,245,244,244,247,245,248,248,246,247,247,247,247,247,247,247,247,247,249,218,232,232,184,229,238,252,234,205,231,224,212,224,250,190,163,150,124,71,89,236,251,251,249,237,245,241,246,252,226,199,203,247,252,252,248,211,207,206,232,233,233,186,197,244,223,214,238,252,243,252,252,249,247,245,243,252,249,181,113,117,108,87,43,46,51,37,45,40,43,44,39,43,43,49,45,71,78,60,67,69,42,42,169,235,250,250,241,245,252,249,249,250,247,249,248,250,248,252,238,252,212,141,227,247,250,245,223,236,250,250,227,179,123,152,195,198,168,176,246,245,139,106,73,61,87,74,60,142,160,97,82,85,116,110,85,129,150,122,80,45,52,63,71,81,56,42,61,70,71,52,61,29,57,229,250,250,249,249,252,237,244,241,246,249,247,249,248,251,241,241,245,251,241,229,242,252,245,183,200,240,229,164,161,223,229,248,242,174,207,250,211,196,235,239,251,236,123,87,46,62,78,71,87,94,89,55,45,35,34,44,33,41,30,46,21,57,221,236,240,240,212,213,214,231,243,251,246,202,92,117,204,199,215,234,230,236,229,229,229,225,227,226,226,227,226,225,224,223,225,224,224,224,222,222,223,222,222,220,218,217,219,218,216,217,217,215,214,214,213,214,214,213,214,210,210,211,210,211,210,210,210,208,208,208,212,207,210,210,205,210,208,214,221,193,143,100,96,90,62,58,76,60,125,158,141,129,80,56,48,39,93,136,123,120,107,84,53,51,25,50,170,223,231,218,217,215,212,212,210,212,214,213,217,215,214,215,213,214,212,213,215,212,213,215,214,214,213,214,215,211,210,212,210,210,209,206,208,208,208,206,208,205,208,113,2,1,5,7,8,8,10,10,10,12,10,10,217,218,218,218,217,215,214,216,213,215,215,216,215,212,216,215,214,215,214,216,214,214,216,215,214,214,216,215,213,214,214,213,214,217,217,217,218,216,215,217,218,219,219,218,219,217,216,217,219,219,217,218,220,219,219,218,218,217,217,217,219,218,219,221,218,217,219,219,219,219,220,219,218,220,219,220,221,218,220,218,217,217,220,221,219,219,218,220,219,220,221,220,222,220,218,220,221,219,222,221,217,220,219,217,219,220,219,220,221,221,220,221,220,222,221,222,223,222,222,221,225,225,224,225,224,224,225,225,227,227,226,227,226,227,228,227,226,228,229,229,229,228,229,229,230,227,228,229,231,232,234,232,229,232,232,233,233,234,235,234,234,233,235,234,233,236,237,236,235,235,237,236,236,237,237,237,237,237,239,238,238,239,237,239,239,239,238,240,240,239,239,241,241,239,242,242,243,242,242,242,243,245,244,245,245,245,245,245,246,245,244,244,245,245,248,248,246,247,247,247,247,246,247,247,248,248,247,247,247,247,248,248,246,248,248,248,248,247,248,248,247,248,249,248,248,248,247,247,247,247,247,248,248,247,248,247,247,248,247,247,247,247,247,249,248,247,248,248,249,247,248,249,247,249,247,247,248,247,248,247,247,249,248,248,248,250,245,237,248,250,247,249,247,248,247,247,249,247,249,245,242,248,249,248,244,243,247,247,247,246,245,247,247,246,246,246,248,249,247,245,247,243,244,247,247,248,246,246,244,245,246,249,249,248,247,246,247,247,246,245,246,249,245,222,252,245,213,245,211,206,213,219,248,239,209,224,251,193,171,163,139,80,90,238,251,251,249,236,244,236,244,251,239,210,187,235,249,252,245,210,186,162,206,219,244,223,230,252,239,227,246,247,240,252,250,250,247,242,235,253,207,128,106,114,85,34,27,36,37,26,24,25,29,23,26,28,21,32,36,68,79,57,73,69,47,128,236,246,250,250,239,245,249,250,246,249,249,248,247,250,248,252,237,240,206,127,196,248,250,237,219,225,248,191,152,188,196,222,215,183,176,194,247,235,161,147,120,76,77,89,66,127,184,141,87,72,120,115,94,138,154,112,89,59,56,59,76,83,55,47,59,66,67,47,61,25,51,221,250,250,249,249,252,239,241,240,245,248,247,247,248,251,244,239,244,250,245,230,239,251,250,188,190,235,246,207,173,224,233,233,252,222,223,252,198,165,217,234,251,239,115,73,56,61,87,78,93,103,91,60,38,37,39,44,31,37,32,47,18,67,230,234,234,234,217,212,216,231,237,250,246,185,80,128,208,200,224,231,231,233,230,230,229,226,226,227,224,226,225,223,223,225,223,222,221,222,222,219,220,218,218,218,220,217,215,216,214,218,212,214,215,213,216,213,213,211,211,213,211,214,211,208,212,208,209,212,208,207,207,208,208,208,207,207,208,212,223,215,160,110,109,99,71,57,75,63,114,153,135,125,90,61,58,50,99,130,106,110,94,59,59,39,24,116,211,225,224,220,222,214,213,214,212,211,214,213,214,214,214,216,214,214,214,214,214,214,213,212,214,214,214,211,212,212,213,211,209,212,209,211,208,207,208,205,210,206,208,111,3,1,3,8,9,8,10,10,11,10,10,11,219,221,220,215,216,213,214,216,214,214,214,214,218,213,214,215,213,217,215,216,215,215,214,216,216,215,215,217,219,216,218,217,217,217,217,218,216,220,217,217,218,217,220,218,220,219,218,220,218,218,219,217,219,220,219,220,222,220,219,218,218,221,219,218,221,220,219,221,221,220,218,222,223,221,221,221,222,222,220,220,220,222,220,219,220,218,221,220,219,222,220,220,220,220,218,220,221,219,221,219,218,220,221,220,219,219,217,221,220,219,221,222,223,223,223,225,226,225,224,224,224,225,225,224,225,225,226,227,226,227,227,227,226,226,229,226,228,227,228,230,228,231,228,229,229,230,232,230,231,229,232,231,229,231,230,231,232,234,236,234,233,236,231,235,236,235,237,234,236,236,235,236,237,239,238,237,237,237,238,238,238,239,239,240,241,240,240,240,240,240,240,240,241,242,240,243,242,242,245,243,246,245,245,245,244,246,245,246,245,245,245,245,245,245,246,246,246,247,247,248,248,246,246,247,248,248,247,247,247,247,247,247,247,247,247,247,247,248,248,248,247,248,249,249,248,249,248,248,248,248,247,248,248,248,249,247,247,249,249,249,250,249,248,247,248,247,248,248,248,248,247,247,247,248,249,247,248,247,247,247,248,247,248,249,250,252,241,239,251,249,249,249,248,249,247,248,247,247,248,242,244,249,249,247,244,246,249,249,247,247,247,246,247,246,245,245,246,247,246,245,246,244,244,247,246,248,247,246,246,244,246,246,247,248,247,247,246,247,245,245,245,250,244,217,253,236,195,199,197,241,236,243,252,245,221,219,250,181,158,145,125,79,94,234,251,251,250,239,244,237,244,251,231,209,179,210,245,252,241,193,151,173,242,249,252,224,240,252,224,208,237,240,235,251,250,250,249,237,224,248,193,119,119,111,60,20,24,43,36,23,24,23,26,22,21,26,22,21,31,71,79,60,77,53,84,218,251,251,250,249,245,248,249,250,244,248,247,248,247,252,245,252,235,214,190,116,179,246,249,231,192,200,219,177,207,237,248,252,214,200,195,218,253,253,188,135,111,73,86,78,66,120,167,137,100,105,151,131,98,135,134,105,89,62,56,62,66,74,68,54,63,75,69,50,59,24,54,218,251,251,249,249,252,243,236,236,244,249,248,248,247,250,247,239,243,249,249,228,234,248,250,200,178,230,243,226,167,182,232,224,247,214,208,248,204,176,206,229,251,235,121,97,69,83,92,98,116,113,91,53,48,37,36,45,39,44,34,51,18,78,238,238,245,245,225,227,224,236,236,242,246,167,69,143,213,202,224,232,233,235,229,231,227,227,225,224,224,225,224,224,226,223,223,220,218,220,219,219,217,217,215,215,216,215,216,215,215,214,214,212,211,212,213,213,214,211,212,210,211,212,208,212,210,209,210,208,208,206,208,206,207,205,207,209,206,211,220,218,170,122,113,110,88,81,79,65,126,140,129,132,95,74,73,81,118,115,107,106,92,84,77,35,62,206,234,222,221,216,222,214,213,212,210,214,214,213,215,215,214,214,215,216,216,213,215,214,212,212,209,212,213,211,212,214,213,211,209,212,209,208,211,210,208,207,211,207,210,112,3,1,3,8,9,8,10,10,11,10,10,10,215,217,217,217,216,214,214,215,215,217,216,214,214,215,215,214,216,214,213,215,216,214,214,213,214,214,216,216,215,217,217,217,217,217,215,215,218,218,218,218,217,216,218,219,219,218,219,218,216,217,216,215,217,218,219,216,219,219,216,218,218,220,222,222,222,221,223,221,218,220,219,219,220,221,220,219,220,219,221,221,221,218,218,219,217,218,217,219,218,217,220,220,218,220,220,218,219,217,221,223,219,221,222,221,222,221,219,221,222,222,222,224,223,222,222,223,222,221,224,222,226,224,222,224,224,224,225,224,224,226,225,229,227,226,228,226,227,228,226,229,227,228,228,226,230,229,229,230,229,229,229,229,231,231,230,231,231,232,234,234,235,236,237,237,237,234,235,235,237,235,236,236,237,240,238,240,238,237,238,238,241,241,239,239,240,239,239,241,240,241,241,242,242,240,243,242,242,243,244,244,245,246,247,245,246,246,246,245,244,245,247,248,248,248,247,247,247,246,247,247,246,246,246,247,247,246,247,247,247,248,246,248,247,248,247,247,248,247,249,248,247,248,247,247,249,249,248,247,247,248,248,247,247,247,248,248,248,249,248,249,248,248,248,247,247,248,247,247,247,247,248,247,248,248,248,247,247,247,248,247,247,247,248,247,249,250,237,241,249,248,249,250,248,247,247,247,248,248,247,242,246,250,247,245,245,248,248,247,247,246,245,246,246,247,246,245,245,245,245,244,247,245,241,247,244,245,247,247,246,245,244,244,247,247,247,247,246,246,245,245,244,250,236,212,238,188,185,233,232,252,236,220,238,237,224,220,239,156,133,136,118,71,87,234,251,251,250,237,242,239,242,249,233,201,173,218,250,252,225,176,180,214,252,252,252,210,236,252,211,190,228,238,236,252,248,250,249,237,232,254,185,132,114,79,46,9,30,41,31,27,27,29,27,27,22,24,22,29,40,76,74,61,59,34,128,243,252,249,252,246,245,247,247,248,244,249,246,248,248,248,246,252,244,224,199,110,172,245,251,226,199,206,220,214,250,250,253,230,177,196,206,219,253,251,193,176,135,95,73,83,71,121,194,165,104,87,123,107,84,108,137,121,96,59,50,54,59,76,77,68,66,80,77,49,63,22,58,222,251,251,249,249,252,247,232,229,244,248,249,249,247,248,247,237,239,245,249,229,229,245,251,214,171,222,242,232,175,155,219,223,214,205,204,229,223,199,195,217,252,227,119,99,76,89,110,110,113,89,70,60,47,37,34,27,38,34,26,45,17,82,234,234,234,234,227,229,235,243,234,242,232,157,79,166,216,207,225,228,231,231,230,228,226,223,224,225,222,224,223,223,219,219,221,220,218,217,216,215,216,216,217,214,212,212,214,214,212,214,211,211,208,210,210,208,209,208,210,208,207,208,208,208,209,206,207,207,205,206,206,205,204,206,205,204,205,208,217,226,189,143,121,106,93,84,92,70,111,133,116,120,85,76,84,90,122,112,107,125,124,105,86,33,102,226,232,217,216,214,214,212,214,211,212,213,215,214,214,214,212,214,214,215,212,214,214,213,212,213,213,209,209,212,213,210,211,210,209,210,209,211,206,206,209,207,208,204,208,113,2,1,4,7,8,7,10,9,10,11,12,10,219,218,217,216,221,214,216,216,213,217,216,214,215,215,216,214,214,215,212,215,214,215,216,216,216,214,216,215,216,213,218,218,216,217,217,219,218,218,217,220,221,219,218,216,217,215,219,220,217,217,217,218,219,220,219,218,218,217,218,217,217,219,218,218,218,220,218,219,221,219,220,218,218,219,219,219,218,220,220,220,219,221,219,218,219,219,219,220,219,220,220,221,219,219,217,220,221,219,222,220,220,222,220,221,219,223,222,223,222,218,222,221,222,224,222,224,222,223,226,222,223,222,222,224,223,224,225,224,227,226,225,227,226,226,227,227,228,229,228,228,228,229,228,229,229,228,228,227,229,227,230,231,231,234,230,232,233,232,233,234,236,236,232,234,236,236,237,237,237,236,237,237,237,237,238,238,239,238,238,239,238,240,240,239,240,241,243,242,242,241,241,242,241,242,242,243,243,244,245,242,245,245,244,245,245,246,246,246,247,247,247,247,248,248,248,248,247,246,248,248,248,248,247,246,246,248,247,247,247,247,247,247,247,248,247,248,248,247,249,248,247,247,248,248,247,248,248,248,247,248,247,248,249,248,248,248,248,247,247,247,248,248,248,248,249,248,248,248,248,248,248,249,248,247,248,247,247,247,249,248,247,248,249,249,250,247,235,246,251,248,250,248,247,247,247,247,247,247,244,241,247,249,245,243,246,248,247,247,245,246,247,247,247,247,245,245,247,246,246,245,247,246,241,246,247,247,247,248,249,246,246,245,245,247,247,249,249,247,245,246,245,251,232,212,244,220,232,249,239,238,178,206,240,242,226,235,246,169,161,150,122,75,94,235,252,252,250,236,241,240,244,252,243,222,194,225,252,252,222,188,188,232,250,250,239,196,240,252,215,201,241,249,242,252,247,250,250,239,230,251,181,101,61,36,20,2,6,29,29,19,28,32,34,38,39,35,32,38,45,73,58,54,39,33,174,243,251,242,248,245,249,250,245,247,244,249,248,248,248,250,247,251,249,245,247,149,166,247,249,212,212,251,252,240,252,251,248,166,82,148,175,226,252,252,201,134,126,75,65,94,128,141,151,122,74,46,47,60,56,90,128,129,99,59,49,66,61,76,95,66,66,88,80,55,61,19,61,223,251,251,249,249,252,249,232,223,243,246,247,248,248,249,249,239,237,244,252,235,226,241,251,229,167,208,243,235,202,153,207,208,179,218,223,214,216,224,198,205,252,218,113,85,57,74,67,50,44,30,37,47,47,36,27,26,21,24,18,39,12,88,240,239,248,244,229,235,240,247,245,249,246,154,95,190,218,211,227,229,231,229,230,227,225,225,224,224,224,224,220,221,219,218,219,218,217,219,218,220,219,218,216,214,214,214,215,213,212,213,213,213,210,213,212,210,211,208,210,208,207,211,207,208,207,207,208,205,206,208,205,205,206,205,207,207,206,213,217,228,201,146,116,86,83,91,83,78,125,130,105,95,65,69,96,112,134,116,110,128,123,98,79,24,45,160,185,211,223,214,215,210,215,213,213,215,212,212,217,216,214,215,216,214,213,214,215,213,214,215,213,212,210,210,212,211,212,213,210,213,209,209,210,206,209,205,206,207,209,113,2,1,5,8,8,8,12,9,10,11,10,10,220,222,220,219,219,214,214,214,214,217,217,217,215,214,215,214,214,213,214,214,213,214,214,215,216,213,214,217,216,216,216,215,217,218,217,217,217,218,217,215,219,218,219,218,219,220,219,220,219,218,219,220,220,217,220,218,219,220,217,222,218,219,219,216,218,217,219,220,220,220,219,221,219,218,219,218,220,218,216,219,219,220,220,220,218,220,220,218,216,217,219,218,217,217,217,218,218,216,218,219,218,219,220,218,221,222,220,219,220,222,219,222,221,219,223,222,222,222,225,222,223,222,222,225,224,225,224,225,227,225,226,227,225,223,227,228,226,227,225,227,226,226,229,225,226,228,227,227,227,226,229,231,232,231,231,230,232,233,232,234,234,237,235,234,234,233,238,234,236,235,237,237,235,238,237,237,239,239,239,239,239,238,238,239,240,239,240,242,241,241,240,241,241,240,241,242,243,242,244,244,244,244,242,242,244,244,245,245,245,246,246,246,246,246,244,245,247,247,248,246,246,246,246,245,247,248,247,247,247,247,247,248,247,247,248,247,248,247,248,247,247,247,247,248,248,248,249,248,249,247,247,248,248,248,247,247,247,247,247,247,247,248,248,247,248,248,247,247,248,248,249,248,248,248,248,247,247,247,248,248,248,247,248,249,250,243,238,248,247,248,248,247,247,247,247,248,248,246,240,241,247,247,245,242,248,247,246,247,246,246,247,247,247,246,246,246,246,247,247,245,247,247,242,247,247,247,247,246,247,247,247,244,244,244,245,246,247,247,244,244,246,251,232,232,252,233,243,219,182,225,210,229,252,239,225,239,250,193,171,162,137,79,92,238,252,252,250,237,243,238,244,252,247,232,199,209,249,252,218,196,198,235,250,250,229,182,242,252,232,230,252,252,245,252,248,251,248,239,223,240,164,94,70,49,39,7,7,14,30,27,27,48,51,55,53,55,49,47,33,44,18,64,110,112,231,246,250,244,243,246,249,249,244,247,244,249,247,248,247,247,248,250,252,240,250,166,155,243,217,195,221,252,251,245,252,252,247,140,84,131,173,236,250,250,171,101,65,43,35,93,142,154,141,70,31,36,47,48,60,92,107,77,65,59,60,68,70,70,81,58,70,105,85,49,61,21,55,214,251,251,250,250,252,252,238,228,244,246,248,247,247,247,249,240,234,242,250,240,225,236,252,242,171,198,242,237,223,171,188,216,180,216,239,189,201,242,212,204,252,190,80,44,6,4,6,7,10,7,13,29,41,33,33,29,30,25,33,36,18,102,233,233,234,234,229,237,246,250,249,234,234,150,114,204,215,213,225,230,230,227,229,227,223,223,223,222,222,222,221,220,218,219,219,217,219,218,218,217,216,215,214,214,215,213,213,213,212,213,209,212,212,214,211,209,211,209,210,210,209,208,208,207,208,206,206,208,208,207,207,208,206,206,206,206,207,209,210,196,160,142,118,93,83,83,87,75,124,134,111,116,75,75,95,112,134,113,118,126,113,94,73,29,20,105,171,212,226,217,216,212,214,215,214,215,217,217,217,214,215,216,215,217,216,217,215,214,212,212,214,214,213,208,210,212,212,211,211,212,210,211,210,210,209,206,210,208,210,111,3,1,3,7,9,7,10,10,11,11,10,11,221,220,220,217,222,215,216,219,217,217,217,216,214,216,217,215,217,216,214,215,213,215,216,218,217,218,219,218,218,217,219,218,217,220,219,219,216,219,219,217,219,218,218,222,221,221,219,221,220,222,222,220,222,221,221,219,222,221,222,222,220,220,220,220,219,220,216,218,220,218,220,218,218,220,220,219,221,220,221,220,219,220,218,220,220,220,217,218,218,218,219,218,217,219,220,218,219,218,220,219,220,222,220,221,219,220,219,221,222,220,224,222,222,223,221,223,223,223,223,223,225,223,222,225,224,226,224,225,226,227,228,225,225,224,225,225,225,227,226,225,224,225,226,226,225,225,229,229,229,229,229,231,229,231,231,233,231,232,232,231,232,232,234,236,236,235,235,236,235,236,236,237,237,239,240,237,238,239,240,239,239,240,239,240,241,241,241,241,242,240,241,240,239,241,240,243,244,241,244,244,244,244,245,244,245,245,244,246,244,244,246,246,245,244,245,245,246,247,247,246,246,246,247,246,246,247,246,247,247,247,247,246,247,247,247,247,248,247,248,248,247,248,248,249,248,247,247,247,247,248,248,248,248,247,247,248,248,247,247,248,247,247,248,248,248,249,247,247,249,248,248,247,248,247,249,247,247,248,247,247,247,248,249,248,249,238,240,251,247,247,247,248,249,247,247,247,249,245,241,246,249,248,244,246,249,247,247,247,246,247,247,246,247,247,247,247,247,246,248,248,247,249,242,244,248,247,249,247,247,247,247,246,246,244,242,246,246,246,246,245,247,250,227,235,252,196,202,214,232,252,236,250,247,234,219,243,250,191,163,140,123,69,91,238,252,252,250,238,243,240,244,252,238,214,183,184,240,251,217,199,194,239,250,250,232,196,252,252,253,218,232,251,246,251,248,251,249,242,228,241,228,194,176,178,199,226,221,155,82,18,36,63,61,61,60,54,46,71,101,116,73,183,210,170,250,250,251,241,237,244,245,248,245,246,244,247,247,247,247,247,248,250,252,238,250,184,146,206,185,185,237,252,241,242,240,252,192,116,175,198,202,249,250,250,183,125,117,86,89,145,195,205,198,135,75,80,102,113,118,127,118,84,62,55,49,51,50,62,74,55,62,110,89,50,66,18,56,214,250,250,250,249,252,252,246,227,241,247,247,249,248,247,248,242,235,239,250,247,227,232,252,252,183,181,238,236,229,179,159,169,168,221,243,184,178,241,223,181,222,167,94,84,55,58,53,54,59,45,49,53,33,27,42,47,40,36,25,35,10,117,241,235,249,237,230,242,245,250,249,249,245,139,134,217,214,216,225,225,228,227,227,226,223,224,224,221,221,221,221,221,220,219,218,219,217,217,216,215,215,216,214,213,214,213,213,213,213,211,212,213,212,214,208,211,210,206,208,207,208,210,208,208,208,208,209,207,207,208,207,206,205,206,207,207,205,210,203,171,141,127,134,115,91,79,77,82,117,135,128,131,98,85,100,113,132,111,114,131,117,103,72,21,98,191,201,218,217,216,218,214,214,214,214,213,215,218,217,215,217,217,213,216,217,215,216,217,216,213,212,214,212,212,212,213,211,208,207,213,208,205,210,207,210,207,208,207,207,112,3,1,3,7,9,7,10,10,10,11,10,10,217,219,217,217,219,216,216,216,217,217,215,213,215,217,217,214,218,216,214,216,215,215,215,217,220,219,215,218,218,218,217,217,217,217,220,221,217,217,218,220,220,218,220,219,218,219,218,219,221,221,220,221,220,218,218,220,218,219,220,220,217,220,220,217,219,218,221,219,218,219,217,218,218,220,223,219,220,220,221,221,219,218,217,220,220,220,220,219,219,221,222,221,221,219,221,220,220,219,219,219,218,218,219,217,217,220,221,218,220,220,220,221,220,220,224,220,221,222,221,220,222,223,221,222,221,221,223,222,225,223,223,225,225,224,226,225,226,229,225,226,225,225,226,226,229,229,228,228,229,230,230,228,232,232,230,231,231,232,232,232,231,234,235,236,237,235,237,236,236,236,236,236,237,238,237,237,237,237,237,239,239,242,240,240,241,242,242,244,242,242,241,241,240,240,241,241,243,242,243,243,242,243,243,245,246,244,245,246,246,245,245,246,245,245,244,244,245,246,247,246,245,245,247,246,245,247,246,245,247,246,247,246,246,248,247,246,246,246,246,247,247,247,248,247,248,248,247,247,247,248,248,248,248,247,247,247,247,247,247,247,246,247,247,246,248,247,247,249,248,248,247,247,248,247,248,248,247,247,247,247,247,248,247,250,247,236,244,249,247,248,247,247,247,248,247,247,247,240,242,248,247,245,242,246,248,246,247,246,247,246,247,246,246,247,247,247,246,247,247,246,247,249,244,244,247,247,248,247,247,247,247,247,248,246,244,241,243,245,246,247,248,249,218,226,233,212,240,240,252,252,208,214,235,227,214,241,249,178,134,125,124,70,85,239,252,252,250,237,244,240,244,251,239,209,186,191,252,252,203,192,200,247,251,251,242,224,253,253,225,144,173,226,241,250,249,251,246,247,241,252,253,253,250,252,252,252,248,193,113,26,81,118,61,55,57,43,63,204,250,236,158,226,160,141,250,246,252,226,223,245,245,251,246,243,242,245,245,246,247,246,247,247,252,229,243,187,160,195,161,196,252,252,252,204,168,206,148,127,213,212,222,252,252,252,247,190,154,158,164,211,218,210,203,137,99,115,132,154,145,144,133,100,87,65,60,55,37,56,71,59,57,102,88,55,63,17,62,217,250,250,249,248,251,249,244,219,233,247,248,248,244,245,248,245,234,238,247,248,229,227,250,252,190,169,225,241,226,176,103,142,198,214,249,198,179,250,214,150,192,214,229,243,242,250,248,243,247,241,245,235,147,91,93,81,56,25,31,31,19,127,233,230,234,234,240,244,246,252,234,215,217,122,152,223,214,217,218,227,223,223,222,223,222,221,222,221,221,218,218,218,218,219,220,216,215,215,215,215,215,214,214,213,212,211,211,212,213,211,209,209,208,210,210,209,209,208,207,208,207,207,207,206,208,206,207,206,207,207,206,207,206,205,205,206,207,211,218,194,139,117,116,109,90,81,83,78,121,134,133,139,99,110,119,137,133,108,113,117,117,101,69,22,122,225,223,225,221,215,215,213,217,214,213,216,214,215,217,214,214,216,214,214,214,216,216,214,214,214,212,211,212,211,211,211,208,207,207,207,207,207,206,205,208,208,208,205,208,113,2,1,4,7,7,8,10,9,10,10,10,10,218,219,217,216,220,217,217,217,216,216,217,215,217,215,215,216,214,216,216,218,217,217,215,217,217,215,215,216,216,216,217,217,219,217,217,220,218,218,218,217,218,219,221,221,219,220,218,219,220,220,219,220,220,217,221,218,218,219,221,221,218,220,220,220,220,219,217,220,220,220,221,220,219,220,220,219,220,220,219,218,222,220,218,220,220,219,218,221,219,219,219,221,221,218,221,218,219,218,217,218,217,218,219,218,219,219,219,221,220,219,222,221,218,222,220,221,223,220,222,222,222,224,222,223,222,222,221,223,224,222,225,223,224,226,226,227,225,226,228,226,228,227,226,227,226,226,227,226,227,227,227,229,231,230,228,231,231,234,232,232,234,233,234,234,235,235,235,237,237,235,235,236,237,237,238,238,237,238,239,240,240,241,240,241,242,242,242,241,241,241,241,242,241,242,242,243,242,243,244,243,243,243,244,244,246,245,245,245,245,246,246,245,244,245,245,245,245,246,244,245,246,246,246,246,247,246,246,245,246,247,248,247,247,247,247,247,248,247,248,248,248,248,247,247,247,247,247,247,247,249,247,247,248,248,248,247,248,248,248,249,247,247,247,247,248,248,247,248,248,248,248,248,248,246,247,247,247,247,247,248,248,248,247,249,247,236,246,249,246,248,248,247,247,248,248,249,246,240,245,249,250,245,244,248,247,247,246,247,245,246,248,246,248,247,246,247,247,246,247,247,246,249,245,243,247,246,247,247,246,247,246,247,249,248,247,246,244,242,246,246,250,247,221,245,252,237,252,231,211,201,184,223,234,225,211,241,249,176,152,146,144,85,92,242,252,252,249,237,245,239,244,252,247,236,206,203,252,243,199,198,214,247,250,250,228,177,253,237,194,90,65,191,244,252,250,246,246,251,243,251,253,237,224,224,249,251,246,183,101,5,72,105,68,58,52,37,50,223,253,253,152,97,40,88,234,240,248,214,214,247,248,252,245,242,240,244,248,247,247,244,247,245,252,193,222,203,163,195,154,216,253,240,225,146,181,246,124,147,231,201,239,252,253,253,247,184,101,103,104,108,97,74,93,99,71,53,44,49,57,61,66,60,63,66,78,71,41,60,78,63,58,102,89,52,61,16,70,218,249,249,249,249,252,250,249,224,226,243,246,248,246,247,247,246,237,237,245,251,232,224,244,252,212,160,213,243,227,205,172,178,226,224,244,234,198,249,225,162,199,236,252,252,252,252,252,252,253,253,252,252,203,134,121,99,60,35,27,29,13,143,234,227,247,238,245,252,251,253,218,185,191,125,176,229,212,216,222,227,226,224,224,221,220,220,220,219,219,218,217,217,217,217,217,217,214,214,214,213,215,212,213,214,214,212,211,210,208,210,210,207,206,211,207,208,208,208,210,206,208,208,206,207,207,205,206,206,207,207,205,206,207,206,207,207,206,216,215,181,154,141,136,123,98,82,74,82,120,155,159,147,109,110,113,107,116,103,122,150,132,112,44,54,197,220,212,222,220,222,215,214,216,216,217,219,213,216,218,215,217,216,215,215,215,215,215,213,214,214,211,211,209,210,212,211,210,207,205,208,208,207,207,208,210,209,212,207,208,112,2,1,4,7,7,7,10,9,10,10,10,10,217,222,219,217,221,216,218,217,216,215,215,217,218,216,215,214,216,217,216,217,215,214,219,216,217,219,217,216,217,217,217,219,217,219,218,218,217,219,222,221,219,217,220,221,220,221,219,220,221,223,220,220,222,220,220,223,220,220,219,219,219,219,218,218,220,220,221,220,219,218,217,221,219,217,219,219,222,218,220,220,217,220,218,218,219,217,220,221,217,219,220,218,220,217,218,220,219,221,219,218,220,218,221,221,218,220,220,219,219,220,222,221,219,219,221,222,220,220,220,223,223,223,223,225,224,225,226,223,223,224,225,226,227,224,224,226,226,225,225,227,229,227,227,227,227,226,227,227,227,229,230,230,230,229,230,230,230,233,235,232,232,233,232,232,234,236,237,235,235,235,236,237,240,239,237,238,239,240,239,239,240,240,239,242,243,243,242,241,241,241,242,241,240,241,241,242,242,243,243,243,243,242,244,244,242,242,244,244,244,244,244,244,244,245,244,244,245,245,246,246,245,246,245,244,245,245,246,248,246,245,246,246,246,249,246,245,247,247,248,247,246,247,246,248,247,247,247,247,248,246,247,247,247,247,247,247,247,247,247,247,247,248,247,247,247,248,247,247,246,246,248,248,247,247,247,247,247,247,247,247,247,247,248,250,240,235,247,249,248,248,247,247,247,247,246,247,244,241,247,248,246,244,247,250,247,247,246,245,245,244,246,246,247,247,246,245,245,245,245,246,246,247,246,242,245,247,247,246,246,247,247,247,247,247,248,247,244,242,242,244,250,247,226,252,252,241,230,175,207,225,211,235,240,223,212,246,250,184,159,151,146,109,124,246,252,252,250,243,246,240,242,250,249,234,215,195,242,239,211,216,232,248,248,248,172,100,212,216,171,80,35,142,232,250,250,246,242,252,236,234,251,219,202,186,222,250,222,150,92,2,60,92,62,56,51,29,36,212,251,213,71,21,11,71,215,239,250,207,224,250,248,252,245,241,239,242,247,246,247,246,247,248,251,215,208,220,204,178,143,219,214,147,164,175,253,253,141,198,234,199,248,252,252,252,248,170,108,84,17,6,9,2,57,110,68,36,20,27,27,46,74,50,37,30,57,56,38,69,78,68,57,100,93,52,61,19,61,206,249,249,249,249,252,251,249,231,226,240,248,247,246,247,246,249,242,237,241,250,237,222,239,252,229,162,195,240,234,233,214,194,234,224,237,224,174,231,234,199,210,215,250,252,252,252,252,252,253,253,252,252,174,124,118,88,55,27,30,25,22,173,234,230,234,234,252,252,253,252,204,188,189,135,195,228,213,212,218,227,223,224,223,221,218,218,219,220,219,218,217,215,215,216,216,216,214,211,213,214,214,211,211,212,211,212,211,210,209,209,210,208,208,210,207,207,209,207,205,209,206,206,207,207,207,205,207,206,205,205,206,205,206,206,206,209,205,210,222,199,152,123,118,115,93,79,96,119,141,137,128,120,88,82,69,66,71,87,123,129,113,91,39,36,163,215,216,227,217,217,219,217,217,215,215,217,217,217,214,215,217,217,214,215,216,214,214,216,214,214,210,213,212,210,212,211,208,207,207,207,208,206,208,208,211,208,210,209,208,112,2,1,3,7,9,7,10,10,10,11,10,10,217,218,217,218,217,214,214,216,217,218,217,215,218,216,217,217,217,217,216,217,215,217,218,219,218,219,219,216,214,217,217,216,216,216,218,217,217,220,218,217,220,218,217,220,218,221,217,218,220,220,222,219,222,220,225,222,219,220,219,220,220,220,219,219,218,219,218,221,221,220,218,218,218,217,221,221,219,220,218,217,220,218,217,221,221,221,221,221,218,218,220,220,219,216,221,220,221,222,219,222,221,220,219,218,219,219,219,218,220,220,219,221,220,220,222,221,222,221,223,222,222,224,224,223,223,224,223,226,224,222,225,224,226,225,224,227,228,228,227,226,226,227,226,227,228,227,229,230,230,229,229,230,229,229,230,231,230,231,232,234,231,231,233,234,234,235,234,234,236,237,237,237,238,239,239,240,240,239,239,239,241,242,240,241,243,242,243,243,244,242,241,243,242,243,243,242,243,243,243,244,244,243,244,245,245,243,244,244,244,244,245,245,245,246,244,245,245,246,245,245,245,244,245,247,245,244,248,246,246,246,245,245,247,247,246,247,247,247,247,246,247,247,247,248,248,247,247,247,247,248,247,248,248,247,247,248,247,247,247,246,247,247,247,246,247,247,247,247,247,248,247,248,248,247,248,247,248,248,248,247,247,246,248,249,237,236,248,247,249,249,247,247,248,249,249,249,241,242,249,248,245,243,247,247,247,247,246,247,246,246,246,246,247,245,246,247,245,244,245,244,246,247,247,242,244,247,246,247,246,246,246,246,247,248,247,247,247,244,244,241,249,238,215,247,224,191,215,217,246,247,232,241,235,221,210,245,250,179,150,137,132,117,147,247,252,252,250,249,251,243,245,251,247,225,206,197,234,236,228,204,219,249,248,248,144,38,156,202,188,113,9,106,226,248,249,244,243,252,232,221,243,213,201,177,213,251,186,137,88,1,74,86,62,61,50,36,30,204,248,132,22,6,26,147,237,245,251,214,241,252,247,250,245,245,239,242,247,245,248,244,250,244,249,197,149,166,188,194,137,201,190,174,226,229,248,219,84,187,217,198,251,248,251,250,249,218,103,78,23,8,14,11,37,72,62,41,36,36,33,60,87,55,37,34,63,51,29,74,80,66,58,104,97,51,62,18,62,206,246,249,249,249,252,249,250,240,226,234,247,247,247,248,247,249,245,237,239,249,244,223,231,252,248,171,179,235,240,235,222,174,214,235,223,221,177,202,218,203,218,205,230,247,248,248,244,241,242,239,253,252,146,108,112,79,50,24,36,19,33,208,243,234,250,246,253,253,235,223,195,200,186,148,211,228,213,211,217,224,219,222,222,220,218,218,218,218,215,216,215,214,218,215,213,214,215,215,214,214,212,211,210,212,211,210,210,210,211,209,210,207,206,211,206,206,207,205,208,207,209,206,204,207,205,205,206,206,206,206,207,208,206,204,206,206,206,210,214,182,139,118,100,73,60,64,101,136,149,141,106,78,49,49,48,47,62,83,105,102,78,60,22,21,133,196,220,231,218,219,219,217,220,219,217,217,219,217,217,216,218,216,215,215,215,215,214,215,214,215,214,214,215,212,212,211,209,209,209,211,208,211,211,210,209,206,211,208,210,112,3,1,3,8,9,8,10,10,11,11,10,11,215,216,218,216,216,215,217,217,216,218,218,218,214,217,216,216,218,217,214,217,218,216,218,217,217,217,217,217,216,217,217,219,218,218,216,218,217,217,217,217,218,219,220,219,219,220,219,216,215,218,217,220,220,218,218,219,220,219,220,220,216,221,220,220,220,218,219,219,219,219,220,220,218,220,222,219,222,217,219,221,219,220,218,219,219,219,221,220,218,221,220,219,219,216,220,218,218,220,217,220,218,217,218,217,219,219,221,220,219,220,220,221,221,222,220,219,221,223,223,223,222,221,222,221,221,222,223,224,224,227,223,224,225,224,226,225,224,226,227,226,224,224,226,227,227,229,227,225,228,229,229,231,231,229,231,231,231,231,229,232,232,234,235,234,235,234,235,235,237,237,237,239,239,238,239,240,239,239,238,239,239,239,241,243,240,242,243,242,243,244,245,244,244,242,242,242,243,243,244,243,242,242,242,244,245,244,244,245,244,244,245,245,244,245,244,245,245,244,244,244,244,244,244,244,247,246,245,246,246,247,248,246,246,246,246,247,247,247,247,246,247,247,247,246,246,246,246,247,247,247,248,246,246,246,246,247,247,247,247,248,247,247,247,247,246,247,247,247,247,246,247,247,247,247,247,247,247,247,247,246,245,245,249,247,234,241,248,247,248,247,248,247,247,248,249,244,241,247,248,247,242,244,247,246,247,247,247,247,246,247,245,244,245,245,246,245,244,244,244,244,246,246,247,243,243,246,246,246,246,246,245,247,246,247,247,245,247,246,246,241,248,226,202,234,213,224,244,232,252,237,205,221,229,213,207,246,249,172,141,136,120,98,97,152,201,245,249,249,250,243,244,249,250,226,211,207,228,203,160,122,181,244,246,233,125,54,135,190,197,138,73,159,243,250,249,241,245,251,237,222,237,211,198,178,218,252,175,138,77,1,75,85,67,61,56,38,34,206,242,147,39,71,192,241,251,250,242,196,236,252,244,250,245,246,236,240,245,244,244,245,246,249,246,184,69,68,130,147,206,250,241,240,232,227,247,158,87,193,193,212,252,249,252,251,249,225,141,87,55,57,38,25,74,71,51,54,37,35,50,74,70,46,45,42,58,46,35,74,83,67,56,106,97,53,61,24,61,196,247,249,249,249,252,247,246,246,228,229,245,245,247,248,247,248,245,236,237,247,247,222,227,251,250,183,167,222,244,242,237,178,191,245,222,208,211,196,188,211,229,207,224,247,244,244,243,236,239,234,252,244,131,106,100,76,49,22,38,15,56,226,234,234,244,252,253,237,213,222,202,205,176,163,220,225,217,211,220,221,219,222,219,217,219,217,217,218,215,216,216,217,215,214,214,214,214,213,213,213,213,211,210,209,208,210,210,211,209,208,210,209,206,207,206,206,207,207,207,205,205,207,206,206,205,205,206,207,205,207,208,208,208,205,208,209,207,210,213,170,128,112,99,98,75,71,124,152,162,163,143,104,71,63,71,73,87,110,120,105,83,36,44,154,217,216,215,219,219,219,218,217,218,217,218,219,216,216,217,217,214,215,215,215,215,218,218,216,213,214,214,215,215,213,212,209,211,212,210,208,208,208,208,206,208,208,209,208,209,112,2,1,5,7,8,8,10,9,10,10,10,10,218,218,218,217,218,217,217,216,214,217,216,217,217,215,217,217,217,217,216,217,218,218,217,217,217,219,218,217,219,217,217,217,217,218,219,218,216,217,218,217,217,216,219,219,217,219,219,219,218,218,219,219,218,217,221,218,219,219,218,218,218,219,219,219,221,219,219,219,216,217,217,217,220,220,221,220,220,223,221,220,222,220,220,219,217,220,218,218,222,219,219,220,219,220,218,217,218,221,220,220,219,219,219,218,221,220,218,220,222,221,218,221,223,221,221,220,223,220,222,224,221,221,223,222,223,224,222,223,223,223,225,224,225,225,225,227,225,225,227,225,227,228,226,229,229,230,229,227,228,227,228,230,229,227,228,226,229,230,230,233,231,232,234,233,236,237,235,235,236,234,236,238,239,239,239,240,239,239,240,239,240,241,242,241,243,242,242,244,242,244,244,244,243,243,242,242,242,242,242,242,244,244,244,244,244,244,244,245,244,242,245,243,245,245,242,245,245,247,246,244,245,244,245,246,245,245,247,247,246,247,247,247,247,247,247,247,246,247,247,247,247,247,247,246,246,246,247,246,246,247,247,247,248,247,246,247,247,246,247,247,247,247,246,246,248,247,247,247,247,247,247,247,247,247,247,247,248,247,247,247,249,249,249,243,234,245,249,247,248,248,247,247,246,246,246,242,241,248,249,246,242,245,248,247,248,247,247,247,246,246,245,244,245,246,246,244,245,245,246,245,246,247,247,246,242,247,245,245,246,245,245,244,247,247,246,246,247,246,248,246,249,231,226,252,232,242,252,228,219,183,193,229,232,212,207,249,248,182,153,140,127,92,60,53,59,175,249,247,249,240,242,249,251,240,224,214,154,117,126,38,99,232,249,244,155,52,88,175,232,202,141,204,251,251,249,242,247,251,238,226,231,206,196,179,223,252,169,141,75,0,75,84,73,62,54,33,41,221,247,197,116,195,249,249,249,249,230,200,240,249,247,250,246,247,235,237,244,244,245,243,246,245,248,189,41,7,32,115,237,249,247,229,151,198,223,139,183,242,202,231,252,249,252,247,251,251,212,162,89,76,57,86,142,105,66,50,46,45,54,69,49,43,56,51,42,24,41,81,81,69,51,107,106,50,64,22,56,195,244,249,249,249,252,248,245,248,234,226,242,246,247,246,247,246,247,236,232,245,250,229,220,244,250,198,163,208,242,246,238,188,168,239,222,188,217,179,161,208,236,210,205,230,238,239,240,237,240,232,253,237,130,100,97,71,48,28,37,0,84,239,235,246,253,251,245,203,203,229,212,206,165,180,226,225,214,206,220,221,219,220,217,217,216,217,216,217,215,214,217,214,217,214,214,214,212,213,211,214,212,211,213,212,210,209,210,208,210,206,209,210,206,208,206,208,207,207,208,203,206,206,205,209,206,206,207,205,207,207,205,206,207,209,208,206,209,215,228,211,161,136,117,115,132,135,154,169,162,165,151,116,94,96,93,99,100,107,111,101,77,54,136,217,239,233,216,219,218,220,218,217,217,214,219,219,219,216,217,218,214,218,216,215,217,217,217,214,214,215,214,214,215,212,211,210,209,209,208,209,207,209,209,208,210,210,212,208,212,113,2,1,4,7,7,8,10,9,10,11,10,10,217,217,218,216,218,217,214,215,216,214,215,214,217,218,217,217,216,214,216,219,219,218,218,217,218,217,216,215,214,217,217,216,214,216,217,218,218,220,219,218,218,217,218,220,221,220,220,220,221,222,218,219,219,217,218,219,220,220,220,220,219,219,219,219,220,217,219,221,219,220,219,219,218,218,220,219,220,219,218,220,221,220,220,221,219,218,220,219,217,219,219,218,221,222,221,219,221,221,218,220,219,220,220,219,222,219,220,220,222,221,219,222,220,220,222,221,222,220,222,226,222,223,225,224,225,224,224,223,223,223,221,224,226,225,228,226,227,228,225,228,228,227,229,227,225,228,228,227,229,226,225,228,229,228,228,229,230,230,232,232,230,233,232,234,233,236,235,230,235,234,235,234,236,235,235,237,238,240,238,240,240,240,240,240,240,241,241,241,241,242,243,241,241,242,242,243,242,242,242,241,241,243,243,244,242,243,244,243,243,243,244,245,244,245,245,245,246,245,246,245,248,247,247,248,246,245,245,245,247,247,247,246,247,246,246,247,245,246,246,246,247,245,246,246,246,247,246,246,246,246,247,247,248,247,246,248,246,247,247,246,247,247,247,247,246,247,247,247,246,246,246,246,247,246,246,245,247,247,247,247,247,247,248,241,236,245,247,247,247,247,247,247,247,248,246,238,243,248,246,244,242,247,247,248,247,246,245,245,245,245,245,244,244,244,245,245,244,244,244,246,245,244,247,244,241,245,246,245,246,245,244,245,246,246,246,246,246,246,247,247,252,229,230,252,222,240,213,178,221,220,225,237,236,211,208,251,250,181,161,155,131,86,55,53,6,59,185,227,249,243,242,249,252,242,231,186,87,99,120,18,62,203,252,252,186,124,125,182,243,222,179,210,252,252,249,245,247,250,244,229,230,204,196,182,226,252,166,142,71,1,75,87,71,55,56,32,41,224,249,191,143,236,252,230,237,243,230,202,240,251,244,248,245,248,236,237,244,244,244,244,244,249,249,173,54,61,55,71,220,246,247,192,166,249,226,163,232,248,207,247,252,250,250,245,252,252,249,204,108,57,7,49,118,92,55,44,53,47,61,79,43,50,75,50,32,13,46,84,83,68,48,106,108,53,62,22,50,182,244,248,249,249,252,247,244,249,239,224,239,245,244,245,246,245,247,237,228,239,250,232,222,240,251,220,163,191,236,238,235,184,119,188,183,148,225,196,132,190,226,183,162,205,231,236,242,235,237,231,253,231,127,108,92,76,43,34,34,15,112,233,234,234,253,253,226,187,223,234,218,185,152,193,224,226,213,203,218,219,221,220,218,215,217,216,217,216,214,215,214,215,213,216,216,211,213,213,212,211,211,212,212,212,211,211,211,210,209,208,208,206,206,206,205,206,205,205,207,205,205,205,206,207,206,205,206,208,205,204,206,208,207,207,208,208,209,217,232,206,167,144,119,111,111,113,115,105,100,89,91,91,72,60,50,52,51,55,62,60,58,45,108,215,226,227,226,218,216,215,220,218,218,221,219,219,220,220,218,217,218,217,218,217,216,215,214,215,214,217,214,215,215,212,211,211,211,210,210,208,210,210,210,208,210,211,212,210,210,112,2,1,2,7,9,7,10,9,10,10,10,10,217,218,218,216,218,216,217,219,215,217,217,218,215,215,218,216,217,219,217,220,219,218,218,216,219,218,220,219,218,217,219,219,218,218,218,220,217,220,220,217,222,218,219,218,219,219,219,221,219,220,217,219,219,221,221,219,221,218,222,222,220,220,218,219,223,219,219,222,218,219,221,220,219,217,219,218,218,221,219,221,221,220,221,220,222,224,222,219,220,221,222,222,221,222,222,221,221,221,219,217,222,223,221,222,223,222,223,224,222,222,220,222,221,222,223,222,225,222,224,225,223,225,225,225,225,224,224,225,226,227,227,225,226,226,226,226,226,227,229,227,228,229,227,227,227,227,227,229,229,231,230,229,231,231,234,233,234,234,231,233,232,232,233,232,232,233,235,236,236,235,235,234,234,237,237,238,240,239,240,238,239,241,241,239,241,241,239,242,242,242,242,243,244,243,244,244,244,244,244,243,244,243,241,242,241,241,244,242,244,242,244,244,244,244,244,246,245,245,245,246,246,245,247,245,246,246,246,247,247,247,247,247,247,246,246,247,247,246,245,245,246,248,247,247,248,248,246,247,247,245,246,246,246,245,246,246,247,247,247,247,246,247,247,247,247,247,247,247,246,247,246,246,245,245,246,247,247,246,247,247,247,247,249,237,238,248,247,246,246,247,245,246,246,248,244,239,246,247,246,242,244,247,246,246,246,245,246,245,245,245,246,246,245,245,244,244,245,244,244,245,245,245,247,246,241,243,245,245,247,245,244,244,244,245,245,247,246,244,249,248,250,224,229,230,187,204,213,222,250,233,226,231,231,205,212,250,247,178,150,150,132,81,56,98,78,111,203,235,250,237,244,249,252,229,218,183,50,115,175,25,60,205,248,253,224,191,186,201,241,221,189,204,245,251,246,244,247,250,244,234,224,197,194,184,232,250,154,149,71,1,80,84,70,51,56,33,47,234,248,170,158,250,251,215,232,235,210,199,238,246,245,248,245,249,240,236,244,243,244,244,244,247,251,198,181,205,125,67,137,191,190,195,226,252,218,168,251,228,210,252,252,249,249,249,252,252,249,201,134,89,28,6,14,9,29,39,42,44,55,63,49,59,83,52,23,19,48,92,86,69,46,105,112,56,63,19,46,180,239,248,250,249,252,247,245,249,246,225,231,244,245,245,245,243,248,242,229,236,248,240,221,231,252,236,169,181,228,244,221,196,96,92,166,183,237,236,159,186,225,171,161,190,224,240,238,236,238,230,252,225,119,105,94,73,45,33,29,4,148,245,250,250,253,241,199,206,251,241,213,156,146,207,222,230,209,207,223,217,221,220,218,216,213,213,214,217,218,216,216,213,214,212,210,212,212,215,213,212,212,208,210,210,209,211,210,211,212,210,209,207,208,207,206,209,205,204,207,207,207,206,205,207,207,207,208,207,206,205,206,207,208,210,209,210,211,216,212,163,113,97,68,59,69,50,41,44,36,35,71,84,69,42,28,33,20,37,50,62,30,42,167,212,217,222,221,222,216,218,218,218,222,222,221,220,222,219,218,220,217,220,219,217,219,217,219,218,219,219,216,217,216,213,212,212,212,212,213,214,212,213,214,211,210,211,213,211,210,112,3,1,3,7,9,7,10,10,10,10,10,10,214,217,218,215,216,215,214,216,218,217,218,215,217,217,215,218,217,217,218,218,217,217,217,215,219,218,217,219,218,220,218,219,217,219,219,218,218,217,217,218,218,220,220,218,217,216,219,218,216,219,220,220,220,218,220,220,219,220,217,217,218,218,221,219,220,219,220,219,218,218,218,218,219,218,220,219,222,222,221,222,221,222,221,222,222,221,221,221,221,219,221,222,221,222,222,221,222,221,221,222,219,220,220,222,223,221,221,221,221,219,222,222,222,222,223,221,222,225,225,223,222,224,224,222,225,224,224,226,224,226,225,225,226,222,224,225,225,224,222,224,226,225,226,227,226,226,226,228,226,226,227,229,231,231,231,232,232,231,232,231,231,230,231,232,231,232,233,232,235,233,233,234,235,236,235,237,237,237,237,237,238,240,240,240,240,240,240,242,240,241,242,241,243,243,243,244,243,242,242,241,244,242,241,243,241,242,243,242,244,243,245,244,245,245,245,246,245,246,248,246,248,247,245,246,246,246,248,248,248,248,247,247,247,247,246,248,246,248,245,245,246,245,244,244,247,246,244,245,244,244,245,245,245,244,244,245,245,246,246,246,246,245,246,246,245,246,246,246,246,247,246,245,246,245,246,248,247,246,245,246,245,249,244,234,240,247,247,247,246,246,245,245,247,245,240,242,247,247,244,241,245,246,245,245,244,245,245,244,244,243,245,246,245,245,245,245,245,245,244,244,244,244,245,246,241,241,245,245,245,243,244,244,244,246,246,246,245,244,247,249,249,212,229,233,201,236,226,227,230,184,209,231,231,205,214,250,244,164,138,139,128,67,110,238,224,227,232,223,242,230,234,242,253,237,223,168,86,174,168,66,127,232,253,253,228,206,203,214,244,214,197,198,231,251,239,242,244,247,244,233,225,198,195,186,232,233,144,143,61,3,82,86,68,47,51,29,46,235,248,134,165,250,245,218,226,220,206,200,233,246,243,245,242,246,239,231,240,244,243,243,241,249,240,238,252,252,169,100,58,69,202,222,250,250,153,177,252,207,222,252,247,250,245,250,249,252,243,221,223,212,147,127,115,71,42,12,18,54,62,63,50,93,90,37,29,19,54,92,86,68,44,108,107,55,61,24,50,179,243,248,250,249,252,245,243,247,247,231,225,240,246,247,246,243,246,242,226,230,247,242,218,224,253,246,176,168,215,241,226,206,140,140,203,212,234,251,198,212,224,185,199,198,217,236,237,235,235,228,253,212,116,103,88,77,34,34,16,31,194,233,251,251,244,208,202,229,252,201,159,111,163,218,223,229,201,207,221,217,218,216,215,214,215,212,215,213,213,215,212,211,210,211,210,210,213,211,210,211,210,208,208,210,209,208,209,208,208,210,210,207,207,207,207,206,205,205,205,205,206,207,206,208,206,205,206,206,206,208,207,206,206,210,206,208,209,214,195,127,91,69,36,53,57,43,49,37,35,39,53,72,66,55,41,30,28,44,59,63,18,73,198,221,225,220,216,218,218,218,218,220,218,221,220,219,220,218,220,217,217,217,217,215,215,216,215,215,215,216,215,214,213,214,212,210,212,214,213,210,211,213,212,210,212,214,212,207,211,112,1,1,5,7,8,7,10,10,9,11,10,10,217,220,217,217,217,214,216,218,218,217,216,217,217,217,216,215,218,217,219,218,217,218,218,218,221,217,219,217,216,219,220,219,221,220,219,220,218,219,218,218,222,217,218,220,219,219,219,219,218,220,219,219,218,220,221,219,218,217,218,219,221,220,218,218,221,220,221,220,218,218,220,221,222,222,222,223,222,225,222,221,224,224,224,223,223,223,222,221,221,223,221,222,221,222,224,221,223,225,223,222,224,222,222,223,222,224,223,221,222,222,223,224,222,224,223,227,226,226,229,225,226,225,225,225,224,226,226,225,226,225,227,226,225,225,225,226,226,227,226,225,226,227,229,226,226,226,225,228,229,229,229,228,231,230,228,230,232,230,232,231,230,233,233,234,233,232,232,232,234,234,234,234,236,236,236,237,237,239,239,239,241,241,241,240,239,242,241,241,242,241,244,242,242,244,244,244,242,244,244,244,244,243,243,244,244,244,246,244,245,245,245,246,247,247,248,249,248,249,248,248,247,247,249,248,249,247,247,247,247,247,247,247,246,247,246,246,245,244,245,245,246,244,244,244,245,246,245,245,248,246,245,244,244,245,247,246,245,245,246,248,246,246,246,247,247,246,246,247,246,247,249,248,247,246,246,247,245,246,246,245,247,249,241,230,242,247,247,248,246,247,246,246,247,244,238,245,248,247,242,241,246,247,247,247,246,245,246,246,245,244,246,245,245,245,244,245,245,245,244,244,245,245,244,244,242,241,243,245,246,245,246,246,244,245,245,247,245,246,245,249,246,222,251,244,226,245,200,193,217,206,222,238,233,201,217,249,243,172,146,142,123,59,127,244,250,249,209,202,240,229,229,235,246,240,247,202,93,165,204,153,217,251,253,253,208,208,202,211,236,210,218,211,237,252,239,247,242,248,243,236,230,207,191,181,237,218,136,145,57,2,83,91,71,46,49,36,36,235,242,121,183,244,245,217,223,212,206,213,236,246,242,245,243,245,239,227,236,244,243,244,243,248,246,241,252,252,167,129,69,60,209,238,245,232,101,184,245,196,238,252,248,251,247,250,249,252,229,236,252,252,253,253,249,186,84,14,57,150,71,41,31,76,94,31,31,24,55,100,85,67,42,98,113,57,61,20,56,189,243,248,250,249,252,245,244,247,249,239,222,238,245,244,245,244,244,244,232,232,245,247,224,219,248,252,193,165,203,239,233,222,168,152,222,236,225,245,199,192,212,199,231,208,211,235,237,237,235,230,252,204,110,105,81,75,41,23,8,73,242,249,251,251,211,199,215,252,243,156,80,70,181,223,225,233,199,211,218,215,219,214,214,216,216,213,214,214,212,212,213,212,211,213,212,214,211,209,210,212,212,209,210,210,208,209,210,209,208,208,210,209,207,208,208,208,207,205,207,205,205,207,207,206,207,206,205,205,206,206,205,205,207,208,208,210,212,220,195,146,113,78,60,65,63,56,60,54,39,57,69,55,48,43,39,30,38,55,58,42,19,123,224,224,229,221,220,219,217,219,218,222,222,217,220,220,219,219,219,218,217,218,218,219,217,215,217,215,215,214,214,215,213,214,213,213,214,214,212,211,212,212,212,210,212,210,210,208,212,113,1,1,4,7,8,8,10,9,10,11,10,10,217,219,217,220,220,216,216,218,217,217,217,217,218,215,216,217,216,219,220,219,218,220,218,216,219,219,218,217,217,219,220,221,218,218,220,220,220,219,218,218,219,219,220,218,220,220,219,220,219,221,220,218,222,222,222,218,219,221,217,220,220,218,222,219,219,220,220,219,221,222,221,221,221,221,222,224,224,224,223,224,226,226,224,225,226,224,224,224,224,224,224,225,221,222,226,226,225,224,223,224,224,225,225,223,223,223,226,225,224,224,227,224,224,224,224,226,226,225,227,226,228,227,224,224,228,226,225,227,227,227,224,226,226,225,224,227,227,229,228,228,230,227,228,227,230,229,229,232,227,232,232,229,232,232,233,232,231,232,232,233,233,234,236,235,236,235,236,235,234,234,235,235,234,235,236,237,238,238,238,239,239,239,239,240,241,240,241,242,241,241,243,243,243,243,243,243,243,243,243,245,244,243,245,244,243,245,244,244,245,243,244,245,245,246,249,248,247,247,247,246,247,247,247,247,247,247,247,247,247,246,247,247,246,246,246,246,245,244,244,244,244,244,245,244,245,244,244,245,245,244,244,245,246,246,244,245,246,245,244,244,246,245,246,246,245,247,246,247,246,246,246,246,247,246,246,246,246,246,245,244,246,247,236,235,245,247,247,247,247,248,246,247,249,241,240,246,247,246,242,244,246,246,247,246,244,245,245,245,247,245,245,245,244,244,244,244,244,245,244,244,245,244,243,244,242,237,240,245,244,243,245,245,244,246,244,245,245,246,245,249,241,220,252,231,192,211,205,231,238,226,242,240,229,198,220,250,246,178,153,151,131,53,112,243,251,251,218,176,225,240,237,228,240,229,228,208,150,184,189,205,252,252,251,251,202,212,202,212,242,229,245,229,242,251,239,249,244,247,241,234,232,198,177,184,239,198,134,141,53,3,78,81,64,49,50,41,51,239,249,111,160,237,243,221,215,217,226,215,234,249,241,244,242,245,241,228,231,241,242,243,244,248,246,234,253,221,134,156,90,62,195,234,245,177,85,216,232,203,248,249,249,247,247,249,250,250,215,238,252,252,252,252,249,208,124,33,84,180,71,30,15,69,84,32,34,19,67,104,81,65,36,98,109,57,60,28,54,196,249,249,249,249,252,247,243,248,248,244,224,232,245,244,246,244,244,246,231,226,239,249,231,218,241,252,208,163,193,234,235,221,185,145,186,236,224,234,204,167,173,199,246,221,203,236,239,235,237,230,252,198,113,107,88,70,29,8,27,156,234,251,252,230,204,213,234,234,202,125,52,60,188,219,227,228,198,211,216,217,218,213,218,216,215,214,215,214,212,215,215,213,213,212,211,212,212,210,212,213,212,210,208,208,210,210,208,209,208,208,209,209,210,209,208,209,211,208,207,208,208,207,205,210,207,206,208,205,206,206,205,207,208,207,206,210,214,219,204,172,173,155,118,108,81,69,73,58,94,113,84,67,51,45,39,37,46,58,27,40,147,224,243,221,221,225,221,222,217,216,218,218,218,219,217,218,218,218,217,217,217,216,216,216,216,216,215,215,218,215,216,215,213,215,214,215,214,213,215,212,212,212,211,211,210,210,212,210,210,110,3,1,3,7,9,7,9,9,10,10,10,10,219,221,218,219,222,218,218,219,219,219,220,221,218,218,221,218,221,219,219,221,218,222,218,219,220,216,221,219,217,218,219,218,219,220,219,218,219,220,219,219,222,221,222,220,220,222,221,223,223,221,222,220,219,219,219,222,222,222,220,221,221,220,222,223,223,221,223,222,223,223,222,221,221,224,225,225,224,225,226,227,228,229,228,229,227,226,227,226,226,228,226,228,227,226,227,225,226,229,226,225,229,227,227,227,226,228,228,228,227,227,229,226,226,228,223,226,224,227,227,224,229,225,226,227,226,229,228,227,226,223,226,225,225,226,226,229,228,225,227,229,228,229,229,229,229,230,230,231,231,232,234,231,232,233,234,236,233,234,234,235,236,236,236,235,235,236,236,234,233,236,235,235,236,235,237,237,237,239,238,239,237,238,238,238,240,241,241,239,242,242,243,243,242,242,242,242,243,243,244,244,245,245,245,244,242,243,245,245,247,246,246,245,246,246,247,247,246,247,246,247,247,247,247,247,249,247,247,247,247,247,247,247,246,246,246,246,245,246,245,244,244,245,247,247,245,244,245,246,247,246,244,244,244,245,246,246,245,244,245,244,245,246,246,247,247,246,246,247,247,247,245,245,246,246,248,246,247,247,246,246,249,247,235,237,248,245,246,247,246,247,247,247,245,237,242,247,247,244,241,246,246,244,247,245,245,246,245,245,245,244,245,244,244,244,244,244,244,244,244,244,245,245,245,244,242,235,239,243,244,244,241,244,243,246,244,246,246,246,245,249,233,212,239,191,194,238,229,252,252,226,220,228,219,193,221,250,247,174,144,143,130,54,108,244,252,252,249,202,194,203,235,246,237,220,214,187,180,203,202,221,252,252,250,247,185,219,206,215,232,187,230,229,239,242,236,250,241,249,241,235,229,200,181,187,245,181,118,136,49,8,66,73,62,49,57,41,46,212,168,87,156,206,238,212,213,226,232,213,230,248,242,247,242,244,241,232,234,239,241,242,244,247,247,217,220,182,150,212,118,41,150,206,227,156,158,249,222,210,251,248,250,247,248,246,251,243,210,241,252,252,249,248,249,177,117,39,84,168,57,31,19,75,73,29,32,20,65,93,83,69,42,79,111,62,51,27,45,192,249,249,249,247,252,246,242,248,246,248,224,224,244,245,245,244,244,247,237,226,236,247,236,215,232,252,225,169,181,223,239,223,199,160,165,228,231,204,198,166,134,175,240,235,201,218,231,229,233,237,252,191,120,111,88,50,16,4,83,243,252,252,248,210,208,231,252,252,158,104,36,66,199,212,231,223,193,214,213,218,217,212,214,214,214,213,214,213,211,211,213,211,209,212,211,210,211,212,213,212,211,210,211,210,210,210,209,206,210,209,209,208,207,211,208,209,209,208,210,211,210,209,209,208,208,208,208,209,208,206,208,208,207,205,207,211,213,213,189,193,226,225,194,146,103,72,47,34,80,126,107,81,60,45,43,34,51,29,36,154,230,235,245,222,225,221,220,218,214,216,216,220,220,218,220,218,220,219,216,217,218,217,215,217,215,216,215,214,216,214,217,214,212,213,213,213,214,214,211,214,211,213,213,212,211,208,214,212,211,111,2,1,3,7,9,7,10,10,10,10,10,10,220,224,219,221,222,218,219,220,220,219,220,220,220,219,219,222,218,220,222,220,221,219,222,222,221,221,218,219,221,219,221,221,219,220,219,219,220,220,222,221,224,220,223,220,220,222,222,225,220,224,222,220,222,219,223,220,220,222,222,224,221,224,224,221,221,222,224,220,222,222,220,224,222,223,225,225,225,227,227,226,228,227,228,230,229,229,228,228,229,229,227,228,229,227,226,229,229,229,229,230,230,228,230,230,230,230,228,229,228,226,231,227,226,229,229,229,227,226,227,224,227,226,225,226,226,228,229,226,226,228,228,229,230,228,227,229,226,228,229,227,229,230,229,227,229,230,232,233,230,232,232,232,234,234,235,232,235,236,236,235,234,235,234,236,235,232,234,234,236,236,235,236,233,237,238,237,239,238,239,239,239,240,238,240,239,240,240,240,240,241,243,242,242,242,244,243,244,244,244,243,244,245,243,243,244,244,245,245,245,245,245,245,245,244,246,246,247,246,246,247,247,247,247,248,246,247,247,246,247,247,247,247,246,246,246,246,246,246,245,246,244,244,245,244,244,245,245,246,245,245,244,244,244,244,243,244,244,244,245,244,245,245,245,245,246,246,246,246,245,246,247,247,246,245,246,246,246,245,246,246,249,244,233,241,246,246,249,245,245,244,245,249,244,238,244,248,247,242,242,246,245,245,246,246,245,246,245,245,245,244,244,244,244,244,246,244,244,245,244,243,244,244,242,242,242,237,237,241,241,242,243,243,242,243,243,244,246,247,244,251,227,211,239,207,226,250,240,252,224,199,226,230,216,196,225,251,243,158,127,136,115,44,111,244,252,252,249,214,187,181,214,246,251,240,204,174,198,205,203,221,247,252,249,237,152,203,203,190,153,94,173,204,230,228,231,248,241,249,243,233,231,214,179,190,242,163,119,132,41,20,65,56,55,57,50,57,31,49,96,138,177,208,227,201,197,218,234,209,230,247,241,245,243,245,240,233,229,237,242,243,243,248,245,228,231,171,175,224,125,45,108,222,250,160,208,251,207,224,251,248,250,245,248,244,251,233,209,248,249,250,243,242,248,156,101,26,87,155,37,28,25,83,66,26,30,22,61,84,82,78,50,92,116,62,51,39,25,128,244,244,247,247,251,246,242,248,246,250,232,220,238,248,247,247,245,246,243,227,234,248,240,216,226,251,237,174,175,217,236,231,210,171,163,217,223,161,197,198,114,150,223,227,162,173,220,226,235,241,252,181,110,104,66,41,34,115,183,248,250,250,231,215,229,234,254,220,135,98,30,103,211,215,234,216,195,208,212,219,215,210,214,211,213,212,213,212,211,211,210,210,210,210,209,210,210,208,210,212,210,212,212,211,211,208,212,209,207,207,209,210,206,208,208,207,208,208,208,208,207,208,209,209,207,208,208,206,208,207,206,205,207,206,206,208,213,198,179,197,223,235,213,157,103,57,25,13,23,43,65,65,51,46,33,34,22,8,110,223,244,241,228,223,226,220,215,217,215,217,219,219,218,217,219,219,218,219,218,217,217,217,217,214,214,217,215,215,214,213,213,212,212,215,214,213,212,212,213,210,212,211,208,210,212,212,212,211,212,113,1,1,5,8,8,7,10,9,10,11,10,10,220,223,221,221,219,217,218,219,221,220,217,218,221,222,219,219,221,219,220,219,220,223,219,219,220,217,219,218,218,220,220,221,219,220,221,221,222,220,219,217,221,221,219,220,220,220,220,221,218,218,223,223,222,222,220,221,223,222,222,222,223,222,221,221,220,222,222,221,222,225,224,223,225,225,226,226,226,227,229,229,229,230,229,229,231,230,231,230,231,230,228,227,230,229,229,230,230,230,230,232,231,230,231,230,232,232,228,230,228,229,231,229,230,231,228,229,228,229,227,226,229,226,226,226,224,226,227,227,229,229,229,227,228,228,228,229,231,229,229,230,229,229,229,231,229,230,230,232,230,230,232,228,230,232,232,232,232,236,234,234,234,231,232,235,235,235,236,236,236,237,237,238,239,236,237,237,237,237,237,239,240,239,237,237,239,240,241,241,240,240,240,240,241,241,244,242,243,245,245,243,244,243,243,242,242,242,244,245,245,245,244,244,245,244,245,245,244,247,246,247,245,245,247,248,247,246,244,245,246,248,247,246,246,246,247,247,245,244,245,244,244,245,245,244,246,246,245,246,244,244,244,244,245,245,245,244,244,244,245,244,245,245,244,244,246,245,244,246,244,245,245,245,244,244,246,245,246,246,245,246,248,240,230,241,246,245,246,246,246,245,246,247,239,239,247,247,245,241,245,247,246,245,245,244,245,245,244,245,245,244,244,244,244,244,244,245,246,245,244,243,243,243,242,240,243,239,237,241,242,242,244,245,243,244,242,244,245,247,244,251,225,232,252,214,236,245,204,208,220,229,245,248,224,203,224,246,237,161,135,128,113,41,104,245,247,251,251,228,218,179,188,231,245,252,224,184,228,199,204,224,240,252,239,169,92,206,196,152,103,17,83,174,233,222,232,247,240,251,245,232,226,210,165,182,245,179,149,141,60,49,63,65,59,43,52,51,40,35,103,206,208,229,243,193,192,217,233,212,231,246,242,244,242,244,240,233,229,234,240,244,241,248,244,224,184,119,160,210,125,97,214,249,244,181,240,251,196,232,252,247,249,245,248,247,251,222,217,251,247,250,242,244,247,155,99,24,94,155,38,27,33,90,55,23,32,25,64,76,82,77,53,89,116,74,45,46,10,72,226,243,249,249,251,247,241,248,244,248,237,217,235,246,247,247,245,247,245,230,234,244,246,221,220,249,247,189,169,207,238,230,213,182,119,162,194,151,204,231,162,158,218,217,151,160,203,223,235,251,236,146,100,81,66,25,84,237,252,252,250,229,221,226,249,252,252,183,124,91,32,150,223,214,236,209,199,214,212,217,214,212,213,213,214,213,214,211,212,210,210,211,210,211,211,212,210,211,211,212,210,210,211,211,210,210,211,209,208,208,209,207,208,208,208,210,208,208,207,208,206,207,210,209,208,207,207,206,207,207,205,206,205,208,210,212,213,188,174,198,216,222,220,189,155,140,110,71,58,49,31,33,24,6,49,93,153,209,237,241,235,230,228,221,219,216,216,218,217,217,220,219,217,218,218,216,218,217,218,218,218,218,217,217,214,217,213,214,215,213,214,212,213,214,214,212,212,213,212,212,211,214,213,210,213,211,213,211,211,113,1,1,4,7,8,7,10,9,9,10,10,10,221,224,220,221,222,217,219,220,219,220,221,220,220,224,221,221,221,220,224,218,221,220,219,218,217,217,217,219,220,219,219,220,220,223,221,220,222,221,220,218,222,218,219,220,219,222,220,221,221,221,221,223,223,221,224,221,221,223,221,223,220,222,223,220,224,223,224,224,223,225,224,225,225,224,227,226,225,227,229,229,230,230,230,230,231,232,232,233,231,231,231,231,231,229,231,232,229,230,230,229,232,229,229,231,229,231,228,227,230,229,230,228,229,229,229,228,227,229,231,227,229,228,228,228,227,227,227,226,229,228,229,227,226,229,229,231,228,227,229,227,227,229,229,230,230,231,230,231,232,231,232,231,231,231,231,233,233,232,234,233,234,235,234,236,233,233,237,236,237,237,235,239,237,236,236,236,238,237,236,235,237,237,237,240,238,239,239,240,240,239,241,241,240,241,241,242,242,241,244,244,241,241,241,243,244,243,244,245,243,245,245,245,245,245,246,244,245,244,245,245,244,244,244,245,245,245,244,244,245,246,247,246,246,245,246,245,245,246,244,244,245,245,245,245,245,245,244,244,244,244,245,245,245,245,242,245,244,243,243,243,243,245,244,244,244,243,244,244,244,245,244,244,245,245,246,246,244,244,246,247,248,234,232,245,245,247,245,243,246,245,247,244,235,240,246,246,241,241,245,245,245,245,246,246,244,244,245,244,245,243,244,244,241,245,243,243,244,243,243,243,242,242,242,242,243,238,235,238,241,243,243,244,243,244,244,244,243,245,245,252,220,229,243,191,208,196,190,240,242,248,249,243,223,205,225,241,234,164,151,146,115,40,99,240,248,251,248,232,243,208,170,207,238,252,227,203,248,200,221,224,232,253,182,109,62,154,157,87,37,3,95,201,246,229,241,251,252,253,252,249,246,234,176,199,252,223,199,162,71,52,71,72,63,49,47,54,42,49,200,248,231,252,252,211,207,243,252,227,247,252,252,252,250,249,244,239,233,234,241,243,243,248,243,205,118,66,131,136,101,174,251,252,202,174,251,236,200,244,248,249,247,247,249,249,252,212,223,251,245,250,243,244,248,150,101,29,98,161,42,22,41,91,49,24,29,27,63,84,81,71,50,76,107,74,48,48,11,92,242,245,248,248,250,244,238,247,242,246,245,222,227,244,246,246,245,245,248,232,229,240,246,229,217,244,251,200,165,194,229,234,219,174,106,80,144,185,219,248,191,182,221,217,185,184,197,219,242,252,205,117,89,90,64,92,180,234,253,253,234,232,233,234,252,234,216,142,136,74,60,182,220,224,230,205,202,212,215,218,213,213,215,212,213,211,214,210,212,213,211,211,210,211,210,211,213,212,211,210,210,212,211,208,210,210,209,208,212,211,208,211,209,209,208,209,210,206,209,210,209,210,208,208,211,210,206,207,210,206,208,210,207,211,211,215,211,187,189,207,211,221,229,233,238,234,221,226,201,137,77,16,92,211,239,242,251,251,249,249,232,225,222,217,217,217,218,217,217,220,218,217,218,218,219,216,216,217,218,217,218,215,215,216,218,218,215,216,214,215,215,215,215,214,212,216,214,212,213,212,211,212,213,210,211,210,214,212,210,112,2,1,2,6,8,6,9,9,9,9,9,9,221,224,222,223,225,222,222,220,222,222,222,223,223,220,220,223,222,220,224,224,221,222,220,223,221,220,223,220,221,222,220,220,221,221,219,218,221,221,221,222,221,221,223,223,221,221,223,224,223,228,221,221,224,222,226,225,222,222,223,224,226,226,226,227,228,226,226,224,223,226,225,226,224,228,226,225,227,228,229,227,229,231,230,231,231,231,232,230,231,231,232,232,232,232,232,232,231,232,231,232,231,230,232,228,231,230,229,229,227,230,228,227,229,229,230,229,231,232,229,229,231,227,229,229,229,229,230,232,230,230,232,230,230,230,228,231,229,230,230,229,229,230,231,230,231,230,232,234,232,232,233,231,232,232,232,234,233,235,233,234,237,237,236,235,234,235,237,236,237,235,236,237,237,239,238,239,240,239,239,237,237,237,239,238,241,240,239,241,241,242,240,241,242,242,243,242,243,242,242,243,243,243,242,242,243,243,243,244,243,244,244,244,246,244,245,246,245,246,245,246,245,244,246,244,244,245,245,245,244,244,244,245,245,244,245,246,247,245,245,244,245,246,245,245,245,243,244,244,245,245,244,244,244,244,243,244,244,244,245,244,244,245,244,243,245,245,245,246,244,245,245,246,245,244,246,246,245,245,244,245,245,232,235,244,245,246,245,244,245,245,245,240,236,244,245,244,240,241,246,245,244,244,245,245,245,245,245,245,245,244,243,245,243,244,242,243,244,242,242,241,241,240,243,243,244,241,234,240,243,243,244,241,243,245,245,246,242,244,246,248,209,224,212,170,215,227,233,252,237,214,227,237,221,212,231,245,239,182,175,155,126,41,93,238,242,251,243,230,249,230,197,207,222,252,225,171,236,156,179,214,217,227,118,32,10,131,127,64,33,89,227,249,252,252,252,252,252,252,252,252,252,252,220,226,252,250,202,146,63,51,69,76,58,54,56,53,49,85,233,248,252,252,252,239,248,252,253,253,253,253,253,253,253,253,252,252,252,248,249,251,248,251,247,214,132,66,71,43,69,181,250,227,105,173,251,214,207,245,247,248,246,249,245,252,245,202,232,251,244,250,243,249,248,146,73,11,84,132,40,20,66,92,41,28,23,35,64,81,83,70,55,71,107,77,44,49,12,83,243,244,249,249,249,245,238,246,243,246,247,225,220,239,246,246,246,244,247,236,225,235,249,232,214,236,252,216,166,184,225,231,223,203,150,127,169,216,219,235,193,162,207,208,208,212,194,218,247,242,164,126,117,65,86,193,252,252,251,250,235,240,244,248,252,252,168,106,132,73,93,203,220,223,231,204,200,214,213,219,213,213,214,212,211,209,213,211,211,213,212,210,211,210,211,211,211,212,210,211,212,212,209,211,211,210,209,208,209,208,211,212,212,210,208,210,209,208,211,212,209,209,210,210,211,211,209,210,209,209,210,212,210,211,215,217,201,184,199,214,213,218,226,231,244,249,251,251,233,178,114,80,184,234,252,252,251,251,238,231,225,220,219,217,217,218,217,216,217,217,219,219,217,219,218,219,218,215,216,217,216,215,218,217,216,217,214,216,216,214,217,216,216,215,213,215,213,213,213,213,212,212,214,213,212,211,213,212,212,112,2,1,2,6,8,7,9,9,9,10,10,10,221,224,223,223,223,220,222,223,222,220,220,220,222,222,218,223,222,223,225,221,225,222,222,222,222,225,222,222,221,220,219,219,220,220,222,222,223,223,222,222,223,223,223,225,225,224,224,224,224,225,226,224,224,224,226,223,222,225,226,226,225,228,227,227,227,227,227,223,226,227,226,228,225,227,226,226,229,227,230,228,228,230,230,234,232,231,232,230,229,229,229,231,232,232,234,234,234,233,232,231,232,234,230,232,231,232,231,230,230,229,230,229,231,229,230,231,231,230,230,229,230,230,231,229,230,231,230,231,234,232,233,236,233,232,231,231,231,232,230,230,231,231,231,230,229,231,230,230,232,230,232,230,232,234,233,235,234,233,234,234,234,235,233,236,236,235,236,236,235,237,237,237,238,237,240,241,240,240,240,239,239,238,239,239,238,240,241,241,240,240,241,242,241,241,242,242,243,242,242,242,241,241,241,241,242,242,241,243,244,244,244,244,244,245,245,245,246,245,246,244,244,244,241,243,244,244,245,246,246,246,245,244,245,245,247,245,244,245,246,245,244,245,245,245,244,244,243,244,243,244,244,244,244,243,242,244,244,243,245,245,244,245,244,244,244,244,243,244,245,244,245,244,245,245,243,244,243,244,243,246,242,225,237,245,244,245,243,244,245,244,245,235,236,245,245,242,239,243,246,245,244,242,242,244,244,244,243,241,243,243,243,241,244,243,241,242,241,241,241,241,240,241,241,241,242,240,237,239,241,242,242,242,244,243,244,243,243,242,246,246,208,234,224,208,247,240,243,230,194,214,226,236,217,212,236,248,242,177,171,162,141,66,103,233,243,250,241,226,248,237,214,210,203,251,192,117,179,82,129,206,208,204,85,21,106,220,225,214,205,243,253,253,253,253,253,253,253,253,231,208,193,193,141,148,192,150,122,78,32,45,44,48,49,47,44,51,34,54,178,163,157,213,181,163,204,240,247,229,252,252,252,252,253,253,253,253,252,252,252,252,253,253,250,244,162,91,64,4,43,198,247,177,67,177,250,200,220,246,244,248,244,247,242,252,234,200,238,249,244,248,245,249,241,144,77,9,18,67,45,30,74,92,35,27,24,36,62,76,81,71,54,70,114,81,42,55,12,82,241,243,247,247,248,242,237,245,243,245,249,233,217,236,245,244,244,243,246,238,224,231,246,237,211,227,252,226,173,173,215,232,227,218,206,167,188,236,221,208,194,174,174,187,212,224,200,216,249,212,145,117,97,80,146,245,252,252,251,241,234,239,242,247,253,218,111,85,115,74,119,204,211,226,225,205,200,211,212,215,213,214,212,210,212,209,211,211,210,210,209,210,211,208,209,211,212,209,209,209,210,210,211,212,212,209,210,208,209,210,209,210,210,210,208,208,210,209,209,208,208,210,210,208,208,208,210,210,211,211,209,211,209,212,216,208,185,184,203,214,218,219,217,220,226,234,240,246,210,154,106,71,198,233,228,245,235,234,225,221,220,218,216,216,219,215,218,218,215,217,218,218,217,217,218,215,216,216,217,217,216,217,216,217,215,214,215,214,214,217,214,212,215,214,212,212,212,214,213,213,212,213,210,208,213,212,212,210,212,113,1,1,3,6,7,7,10,8,9,10,9,9,220,224,225,224,225,221,223,222,222,224,221,221,224,221,222,223,223,223,223,222,222,223,220,223,224,221,224,221,222,222,221,222,224,223,225,224,223,225,223,227,225,225,225,226,225,225,226,226,227,227,226,226,227,225,226,227,229,228,227,229,226,226,227,227,227,226,226,226,228,227,226,227,228,229,228,229,228,229,229,229,232,232,231,231,234,234,232,233,232,231,233,233,233,233,233,234,234,234,232,233,234,231,233,230,232,230,230,232,228,231,231,231,231,230,232,231,228,231,230,231,234,229,231,233,232,232,232,234,233,234,235,234,235,237,235,234,233,233,231,230,232,231,232,232,233,232,230,231,231,232,231,231,232,232,233,232,233,235,234,234,235,236,235,236,236,235,237,236,237,235,236,239,237,237,239,238,240,238,239,238,239,242,240,241,240,241,242,241,240,240,240,242,242,241,242,241,242,242,243,243,241,241,242,241,243,242,243,242,245,246,244,244,245,245,245,245,245,244,244,243,242,241,241,242,244,245,245,247,244,245,245,245,244,244,244,244,246,245,245,244,245,245,244,244,245,245,244,243,244,243,244,243,242,244,244,244,244,244,244,242,244,245,245,245,245,245,245,244,244,244,244,245,245,244,244,244,243,245,245,246,235,225,240,244,244,246,243,243,244,245,243,234,240,244,244,240,239,244,245,244,245,244,243,242,245,244,243,242,242,245,243,243,242,242,242,241,243,241,240,241,241,241,241,241,241,242,236,235,240,242,242,240,243,242,244,242,241,241,248,240,211,250,229,217,252,213,183,223,224,237,242,235,214,214,237,250,236,151,143,165,176,117,145,247,249,251,244,224,242,233,222,226,198,229,144,53,122,33,88,198,206,245,190,204,252,252,252,252,252,252,243,229,206,155,155,137,124,115,99,88,65,42,24,15,13,54,56,21,31,44,42,46,47,43,42,42,38,42,45,27,12,9,8,10,10,17,34,34,69,105,131,163,186,220,231,247,253,253,252,252,252,252,252,252,226,144,94,25,43,207,229,157,131,231,248,199,235,250,248,249,244,248,245,252,222,202,246,246,247,249,248,252,231,208,134,18,1,14,45,63,98,78,29,30,20,44,63,73,84,65,53,62,107,88,46,54,11,82,242,244,249,249,246,243,240,245,243,245,246,241,217,227,244,244,244,243,246,245,228,229,244,241,216,222,249,238,184,166,203,229,224,224,220,187,169,223,220,174,190,198,155,166,226,238,220,220,241,180,102,89,61,108,226,252,250,250,246,245,242,244,239,248,249,181,68,87,118,81,142,204,214,229,233,214,218,227,221,221,213,213,214,211,213,211,212,212,212,210,210,210,211,214,211,211,210,210,210,211,213,208,209,210,209,210,209,212,211,210,213,212,210,211,211,212,212,209,209,211,212,211,210,210,208,209,213,213,213,214,215,213,213,216,218,198,177,191,212,217,222,221,219,219,220,225,231,243,195,149,90,65,189,223,218,231,224,225,218,219,220,218,217,218,220,220,222,218,217,218,219,218,217,218,220,217,215,217,220,218,216,214,216,217,214,215,216,217,216,216,215,214,214,214,214,214,214,215,215,211,212,213,212,211,211,214,212,209,213,112,1,1,4,6,7,7,10,8,9,10,9,9,223,224,224,222,225,225,224,224,224,222,226,224,225,222,221,225,221,223,222,222,222,220,225,222,223,225,221,224,224,222,224,223,225,222,223,224,222,223,223,223,225,225,224,227,227,226,226,225,226,227,228,227,227,230,230,229,229,228,227,229,229,230,227,226,228,227,228,227,228,229,228,229,227,230,231,230,232,230,231,231,231,232,232,233,232,231,235,235,233,236,235,236,235,237,235,234,236,235,232,231,231,231,231,231,231,230,229,231,233,230,232,229,231,234,231,230,229,229,230,231,230,231,232,233,235,234,237,238,237,237,236,235,235,234,234,234,234,232,231,234,232,232,232,233,234,232,234,234,234,233,233,232,232,232,233,235,234,235,236,235,236,236,236,236,237,236,237,235,235,237,238,238,238,237,239,240,238,240,238,238,241,240,240,240,241,242,241,242,242,241,241,240,239,240,242,243,242,240,241,242,242,242,243,243,243,242,243,243,243,244,244,244,244,244,245,245,244,242,244,242,241,241,242,242,242,243,243,243,244,245,246,246,245,245,245,244,245,246,246,246,245,245,243,244,245,244,244,245,244,244,243,243,244,243,242,243,243,244,244,243,242,242,242,242,243,243,242,242,242,243,243,242,241,244,244,243,244,244,243,247,232,226,244,243,244,244,242,244,245,245,237,232,243,246,242,237,241,243,242,243,243,244,243,244,243,242,244,243,244,242,243,243,243,242,240,242,242,242,242,239,240,241,241,241,242,242,236,234,239,242,242,241,242,240,243,240,242,239,248,232,203,234,203,205,214,187,220,247,240,241,231,236,212,217,240,246,239,156,134,145,160,143,187,250,248,252,243,223,241,230,221,240,224,224,128,62,94,3,83,220,252,252,253,253,253,252,210,173,138,137,104,96,99,73,77,68,69,74,59,50,29,12,6,14,10,48,76,29,43,53,50,54,53,51,49,45,42,39,42,36,38,30,8,8,9,10,10,10,10,10,10,11,11,11,27,53,101,130,171,213,243,249,252,245,248,210,132,69,44,115,142,142,181,250,240,200,243,249,249,248,245,246,247,252,209,208,249,245,250,247,252,252,219,221,163,42,11,7,22,83,114,73,29,31,21,41,67,78,80,63,50,50,89,85,45,54,14,71,238,244,249,249,247,242,236,246,241,243,246,247,223,220,241,244,244,243,243,246,229,224,240,244,218,215,243,243,193,163,190,225,222,220,224,199,163,210,222,146,196,231,156,172,231,249,240,211,185,118,82,56,85,176,252,252,250,249,244,244,242,243,239,252,234,155,63,84,111,95,153,209,233,242,251,248,236,249,234,220,216,217,214,212,213,209,212,212,212,213,212,210,210,212,212,211,212,210,213,211,210,214,210,210,211,210,211,213,211,211,214,212,213,211,210,212,212,212,213,211,212,212,211,211,213,213,214,215,213,214,214,216,217,218,220,195,181,203,216,220,221,222,222,221,222,230,240,246,193,144,79,65,194,216,219,230,217,224,217,219,218,217,216,215,219,220,219,217,218,220,220,220,218,220,217,214,219,217,219,217,217,217,214,217,217,217,216,216,216,215,215,214,216,213,214,215,214,213,211,214,213,214,213,212,212,209,213,212,210,112,2,1,3,7,8,6,9,9,10,10,9,9,223,226,225,224,226,223,225,226,225,224,225,223,225,225,226,225,222,223,223,226,225,226,226,226,225,223,227,225,225,224,224,224,225,224,225,225,225,227,224,225,226,226,226,228,227,226,229,229,229,229,227,229,231,229,230,229,231,229,230,230,228,230,230,231,231,231,229,230,231,230,230,231,230,231,234,233,233,233,232,234,233,232,231,232,234,232,234,236,235,234,236,236,235,236,235,236,234,232,232,232,233,230,232,232,233,233,231,232,231,230,230,229,232,232,232,231,229,232,229,229,232,231,234,235,236,238,239,237,237,237,238,236,236,237,236,235,235,236,238,236,236,235,234,233,235,235,235,237,239,237,236,238,236,236,236,236,235,237,236,236,234,235,237,234,235,237,236,237,240,238,238,239,237,239,239,239,239,239,239,240,241,240,239,239,241,242,240,240,241,243,243,241,241,239,240,243,242,240,241,242,241,242,242,241,243,241,243,243,242,242,242,244,242,244,243,243,243,243,244,242,242,242,243,243,243,243,242,243,243,245,245,243,244,244,245,245,242,243,246,245,246,245,244,246,244,244,242,242,243,243,242,244,244,243,243,243,243,243,241,243,243,242,243,243,243,242,244,243,243,244,244,244,244,244,243,244,243,243,246,244,225,231,245,244,244,242,243,244,244,243,232,236,246,245,240,237,242,245,245,245,242,242,243,242,242,241,240,242,242,242,242,241,242,241,240,239,240,241,242,243,241,242,241,241,242,243,240,232,237,240,241,241,241,241,242,240,241,236,247,219,192,222,174,182,229,226,248,252,202,212,230,235,210,217,238,247,241,162,141,128,108,81,155,243,232,242,233,220,238,227,222,237,238,252,182,106,103,35,151,245,253,253,159,121,150,127,98,78,51,55,60,78,93,100,105,100,120,122,110,101,87,94,112,105,61,79,95,73,61,51,56,56,54,43,50,82,69,54,94,92,40,56,83,68,70,63,54,28,12,12,9,10,11,10,9,11,10,12,10,12,16,57,110,119,204,167,147,125,65,71,31,122,224,248,231,203,247,247,250,248,247,243,250,252,198,220,251,244,251,246,253,253,199,207,129,33,37,16,18,51,85,63,33,27,19,48,69,74,76,55,42,38,80,90,47,50,14,62,228,243,249,249,246,239,239,245,242,245,245,251,229,213,236,244,244,244,242,245,231,220,236,245,222,208,236,244,208,169,180,219,222,219,222,207,164,178,234,199,219,252,188,163,231,246,246,156,105,89,70,71,131,241,252,252,250,249,247,242,234,237,238,251,229,137,54,64,109,108,162,216,238,247,226,198,200,234,245,225,214,215,214,214,211,211,213,210,214,211,212,212,209,213,210,210,212,213,214,210,214,211,214,213,212,217,212,213,214,212,214,214,215,213,212,213,214,214,212,212,211,210,212,214,213,214,216,216,217,217,219,220,219,223,217,186,189,213,218,223,222,218,221,224,231,244,250,240,167,123,63,65,174,207,222,230,217,222,218,219,221,218,219,219,217,219,219,218,220,221,219,219,220,218,217,217,218,219,217,215,219,219,216,218,217,218,216,214,216,215,214,215,216,214,214,215,213,214,214,213,214,211,212,214,213,213,213,212,212,112,2,1,3,7,8,6,9,9,10,10,10,10,222,225,225,224,225,222,223,225,227,224,227,226,223,224,225,227,224,224,225,225,229,227,224,225,227,229,226,226,225,224,227,227,228,227,226,227,225,227,227,226,228,228,228,227,229,227,229,229,229,231,230,231,232,232,233,231,231,231,232,232,229,231,232,231,232,231,232,231,231,232,231,231,229,231,231,229,231,231,232,232,233,233,229,231,234,234,234,234,234,235,233,232,233,233,234,232,232,230,229,230,234,234,232,232,232,232,231,231,232,229,232,232,230,231,230,232,234,231,231,232,232,233,234,235,235,236,237,237,237,236,237,235,237,237,238,235,235,238,237,237,236,236,236,238,239,239,240,239,237,240,240,237,237,237,238,237,236,237,236,237,235,235,235,235,237,236,238,236,236,236,238,238,237,238,237,239,239,239,240,240,239,238,240,239,238,239,241,241,240,240,240,242,241,241,241,241,242,240,241,241,242,241,241,242,241,241,241,242,242,243,242,241,243,243,244,242,241,241,240,240,242,240,240,240,241,241,240,239,240,241,241,242,241,240,244,243,244,244,242,243,242,243,243,244,243,242,242,242,241,241,241,242,241,243,243,243,244,244,242,242,242,241,241,242,242,241,242,242,243,241,242,241,241,243,242,242,244,243,246,238,221,234,244,243,244,243,243,242,245,238,231,240,245,241,237,238,242,241,243,242,241,242,242,243,241,241,241,242,241,241,241,240,240,240,240,240,239,241,241,240,240,240,239,239,240,242,240,233,235,240,239,240,240,240,240,237,239,236,248,211,214,236,191,226,243,212,208,213,210,229,232,237,201,212,240,248,242,148,122,124,103,34,98,228,229,237,223,208,236,225,215,235,248,252,225,214,167,73,147,159,127,106,55,47,46,57,80,73,66,60,49,57,77,112,185,204,232,244,251,251,249,252,247,195,78,68,129,126,83,45,64,62,52,49,150,206,139,152,200,154,95,193,251,247,250,250,252,249,238,238,223,174,102,54,21,7,6,8,9,8,10,9,13,11,81,77,83,96,73,88,22,112,246,247,221,211,248,244,249,247,245,242,252,239,191,232,252,244,250,243,252,252,196,193,104,17,91,69,23,31,49,64,42,28,26,45,69,66,68,52,39,38,82,89,42,55,14,67,228,243,249,249,244,239,239,247,241,244,240,246,237,212,228,240,241,244,238,244,233,218,230,243,229,207,229,247,218,174,171,210,220,215,218,202,139,143,197,185,246,252,173,141,177,241,246,155,93,83,65,91,198,252,252,252,250,248,242,237,235,240,234,251,206,138,73,79,119,123,167,218,251,178,110,75,45,160,221,213,220,210,214,214,215,214,210,212,211,210,211,211,214,212,210,212,213,212,214,212,210,213,213,215,214,213,214,214,212,212,213,210,213,212,215,216,212,213,213,211,211,211,212,214,214,215,215,215,216,217,217,218,223,224,201,182,201,218,217,218,220,223,225,234,248,248,232,179,117,89,25,34,169,208,225,229,217,224,217,220,218,218,218,218,217,215,218,221,219,218,216,218,215,216,217,215,219,215,217,217,216,217,215,218,217,217,216,215,214,214,216,214,213,214,216,214,214,216,214,215,214,215,214,212,215,213,214,212,214,113,1,1,4,6,7,7,10,9,9,10,9,9,223,227,227,225,227,225,226,226,224,226,227,225,226,225,224,225,226,226,224,224,227,227,227,225,227,225,226,226,227,229,227,230,229,227,228,230,227,229,230,227,227,229,229,231,229,229,229,230,231,230,232,233,234,232,235,235,235,235,234,234,234,232,230,233,232,231,231,232,232,232,231,231,230,230,233,231,232,233,233,234,231,234,231,231,234,234,234,233,234,233,234,233,232,234,233,235,232,231,233,232,232,232,232,232,230,230,231,231,232,232,232,232,233,232,233,232,232,233,234,233,234,236,237,237,237,238,238,236,237,238,238,239,237,237,237,236,235,235,238,238,237,239,238,241,242,239,242,241,240,242,240,241,241,240,239,239,240,240,239,239,240,240,239,239,239,239,239,240,240,236,237,240,240,239,239,239,241,240,239,238,237,237,239,240,240,240,239,240,241,240,241,240,241,240,241,242,240,240,241,241,241,243,241,241,241,239,240,239,242,244,245,242,242,244,243,241,240,241,239,240,239,237,237,238,238,239,239,240,238,238,240,240,241,241,241,244,243,243,243,243,245,242,244,244,242,244,243,243,243,242,242,242,243,242,241,243,241,243,244,243,242,241,242,242,241,241,242,242,240,243,242,240,241,242,241,241,243,242,245,230,220,238,242,242,243,240,242,243,245,233,229,241,245,241,235,241,244,242,242,242,241,243,243,242,242,241,242,241,242,242,242,242,240,239,240,239,240,240,239,239,238,241,241,240,241,241,241,234,235,240,239,239,238,239,239,238,240,240,249,211,221,242,203,219,206,186,221,240,235,247,236,237,197,212,240,247,238,148,130,124,102,41,114,240,240,246,224,208,230,218,214,246,253,253,234,176,167,98,76,59,54,68,64,89,81,61,110,165,180,136,90,47,30,126,239,249,251,251,252,252,253,253,249,204,95,79,140,129,95,75,82,79,68,91,174,194,153,126,147,139,94,198,247,236,251,252,252,252,253,253,246,160,79,69,33,139,225,220,195,148,113,69,47,53,91,46,48,39,39,81,22,150,251,251,217,221,249,245,247,245,244,240,252,226,194,240,247,242,245,242,250,252,217,197,90,16,132,101,32,23,21,57,57,38,29,57,79,54,60,46,33,39,71,92,46,51,20,106,245,245,250,250,244,237,237,244,240,241,241,244,243,215,221,240,240,244,239,242,238,218,225,240,233,208,221,243,226,182,166,199,220,211,218,191,151,106,126,152,150,144,94,73,84,189,240,177,139,83,69,170,252,252,250,249,247,242,244,240,231,227,237,245,185,148,98,104,131,125,167,209,247,175,57,20,8,33,147,201,218,217,212,215,213,213,212,210,211,212,212,212,213,214,213,211,212,212,213,214,214,212,215,214,214,217,212,214,214,213,214,214,213,213,214,214,214,214,213,214,213,213,214,213,216,216,216,214,216,216,217,217,224,222,192,186,208,220,220,222,227,232,245,247,250,201,145,107,63,18,24,138,221,223,227,225,221,222,219,217,218,215,217,219,220,218,220,219,217,220,220,220,217,216,218,217,217,217,220,217,217,217,217,218,217,219,218,216,217,216,216,214,215,216,214,215,214,216,217,214,215,216,213,212,213,214,214,211,213,112,1,1,4,6,7,6,10,8,9,10,10,10,227,231,226,227,229,229,229,226,230,226,228,228,227,228,227,227,226,227,227,226,227,227,228,227,226,229,228,226,228,227,230,230,229,229,229,229,231,230,230,231,230,229,230,230,233,231,231,233,231,233,233,234,235,235,234,236,236,232,235,234,232,232,232,234,235,234,235,232,232,232,233,234,232,234,235,235,235,234,234,233,234,235,234,235,234,232,232,234,233,232,235,232,232,234,234,235,231,232,234,234,236,232,231,232,232,232,232,232,233,233,235,233,234,235,234,234,234,234,234,235,236,237,238,236,238,241,239,238,236,238,241,237,237,237,237,237,236,238,236,238,238,240,241,240,240,239,242,240,243,243,242,242,241,241,241,242,240,242,242,242,242,241,241,240,239,239,240,240,239,240,240,241,241,240,239,239,240,239,238,238,238,237,237,238,237,240,239,239,240,241,241,241,239,239,242,242,241,239,241,242,241,240,239,237,237,238,239,240,240,243,244,243,242,241,242,241,239,239,239,237,238,238,239,238,239,240,239,239,237,240,239,239,242,240,244,242,242,242,242,243,241,241,242,243,244,243,243,244,244,243,244,242,243,243,244,242,241,244,243,244,243,242,241,241,240,241,241,240,241,240,241,242,241,242,240,241,242,243,242,224,225,241,241,242,241,241,243,243,239,227,235,244,243,238,236,241,241,241,244,242,241,243,242,241,240,241,241,241,240,240,241,240,240,240,240,239,239,239,240,240,240,238,239,239,240,239,241,236,230,237,239,239,240,238,239,239,241,242,246,203,225,210,153,205,218,223,252,249,231,232,232,242,191,214,241,249,242,152,136,141,122,46,123,244,246,251,236,213,225,216,223,241,251,238,108,74,118,134,108,51,50,65,73,161,240,243,252,252,252,252,183,114,67,77,193,240,249,247,212,199,171,145,143,128,76,74,111,69,52,55,44,49,46,46,51,55,51,53,45,45,45,50,86,89,105,143,147,165,203,155,146,75,52,40,73,230,252,252,252,252,251,236,158,133,110,39,52,47,44,60,37,179,252,252,214,230,250,244,247,242,243,240,252,214,198,246,244,244,247,243,252,252,224,195,75,6,131,83,23,39,17,34,55,55,43,58,63,36,55,49,29,26,61,87,47,43,26,141,246,246,250,249,245,234,237,244,240,242,240,243,247,220,214,237,241,244,239,241,239,221,223,235,238,211,214,238,232,190,163,185,217,211,217,212,198,169,115,61,39,16,27,81,82,183,230,162,128,79,126,219,252,252,251,247,243,242,246,239,229,229,234,234,175,152,125,129,135,137,147,128,196,117,46,29,5,54,164,207,227,220,214,214,212,215,211,213,212,211,212,212,211,213,212,212,214,212,213,213,216,217,214,217,216,214,216,214,215,215,214,216,218,218,217,217,217,217,214,213,215,214,213,216,217,214,217,221,219,219,220,222,227,211,183,193,215,217,226,232,239,246,252,232,171,124,78,28,4,83,190,231,247,225,224,225,222,222,217,217,217,219,221,219,220,220,220,220,216,219,220,219,218,220,217,217,220,218,217,217,217,217,218,219,216,218,215,218,217,214,217,219,217,216,215,214,214,216,215,214,212,213,214,214,214,212,215,212,212,112,2,1,3,8,9,7,9,9,10,10,9,10,227,229,229,229,230,229,227,229,230,230,230,230,230,229,229,229,229,229,229,230,229,229,228,227,228,229,231,230,230,232,229,231,232,232,232,231,230,232,234,230,232,233,231,232,232,232,233,235,235,234,235,237,237,237,238,236,236,237,236,235,234,235,234,235,236,235,238,237,235,234,232,235,233,233,236,232,234,234,231,232,232,233,235,233,234,234,233,235,234,235,234,233,234,231,233,233,232,234,233,234,232,235,234,231,232,234,235,234,235,237,235,236,236,234,237,235,235,236,237,237,237,237,237,239,238,239,237,239,240,236,238,238,237,238,237,237,238,237,238,238,239,239,241,241,242,241,243,243,242,243,242,243,243,242,242,243,242,241,242,243,243,242,240,241,241,241,242,240,243,242,241,244,242,241,241,239,240,239,237,238,238,239,237,237,238,238,240,240,239,241,241,241,243,241,242,241,240,240,240,235,234,235,234,237,237,237,240,240,241,241,240,241,241,242,240,239,238,237,236,238,237,239,242,241,242,243,240,240,238,237,237,237,241,241,240,242,241,241,240,240,241,239,241,242,241,243,242,242,242,242,243,243,243,243,242,244,243,244,245,242,241,241,242,240,240,240,239,241,239,240,240,240,241,240,240,240,240,244,239,220,229,242,241,244,241,241,242,243,235,227,240,244,242,234,237,243,241,241,241,242,241,241,241,239,240,241,239,239,240,239,238,240,240,240,239,239,240,240,238,238,238,240,239,238,240,237,241,237,231,237,239,241,238,240,240,239,241,244,244,198,217,208,198,240,237,224,234,208,194,230,234,243,186,213,245,251,245,156,141,148,128,51,134,244,247,251,240,224,234,229,225,234,160,83,47,37,85,131,122,64,27,0,46,208,251,251,250,250,244,216,203,172,143,142,76,57,67,42,14,33,53,17,16,37,29,24,74,53,25,21,15,21,21,29,31,28,27,26,22,19,19,30,22,31,33,21,37,36,37,42,47,43,41,41,31,157,236,237,252,252,250,196,149,154,129,62,66,74,70,86,61,148,243,226,214,237,247,248,245,244,245,245,252,200,208,248,243,247,245,243,252,252,205,156,55,7,134,70,21,64,39,33,32,62,60,46,42,25,51,47,30,30,56,93,51,46,16,84,243,245,250,250,241,235,238,243,242,241,243,243,246,229,208,228,238,239,239,239,240,222,218,233,237,214,208,232,236,198,166,178,215,214,216,219,217,201,145,94,78,41,7,42,71,200,227,127,87,81,189,251,251,251,249,246,249,249,244,226,231,236,253,236,167,162,148,149,143,143,131,47,90,79,36,33,16,177,248,228,231,216,217,215,212,213,213,214,212,213,214,213,212,212,214,212,214,213,213,217,217,215,216,217,218,218,216,216,215,217,215,214,216,216,218,217,218,218,215,214,215,216,215,216,219,217,217,219,221,220,222,224,223,198,179,207,218,225,238,244,249,248,196,139,99,56,5,26,151,227,246,246,237,228,225,223,221,217,216,217,220,220,220,218,217,217,220,219,219,218,217,219,220,218,217,217,218,218,217,217,217,217,217,218,220,218,215,217,217,216,216,216,215,215,215,215,214,212,213,215,213,211,212,211,213,212,215,212,212,112,2,1,2,6,8,7,8,8,9,10,10,10,227,229,228,229,228,228,229,226,229,230,232,231,229,230,229,230,231,230,229,229,230,229,229,229,231,230,231,233,234,232,232,234,234,236,233,234,233,232,232,232,234,234,236,235,236,235,234,236,235,237,236,236,236,235,237,239,237,237,236,237,238,237,237,235,236,236,236,236,237,237,236,236,237,235,234,235,233,232,233,233,233,233,234,235,234,236,236,236,233,230,235,234,232,234,232,232,232,234,234,234,234,232,232,232,234,232,232,233,234,232,234,234,232,234,233,235,235,236,236,237,237,236,237,237,236,237,236,238,236,237,237,236,237,237,237,237,237,237,238,240,240,241,241,240,242,243,242,241,242,242,243,244,243,245,243,244,243,243,244,244,244,244,244,243,242,243,245,244,245,245,244,245,243,242,243,241,239,239,236,237,239,237,240,237,236,238,237,238,239,239,239,239,240,240,239,241,240,239,236,231,229,231,234,235,239,240,241,240,239,240,238,239,240,238,239,237,239,238,238,239,240,241,241,242,243,240,239,239,237,237,236,236,238,237,239,240,241,240,239,240,240,239,240,240,241,240,240,241,241,242,242,243,244,243,245,244,244,245,244,241,242,242,241,240,240,241,240,240,240,240,241,240,238,240,240,239,240,244,231,216,232,241,240,241,241,241,241,241,229,232,243,244,239,232,240,242,240,242,240,239,239,239,238,240,241,239,239,240,237,238,239,239,239,239,239,237,237,237,237,237,237,239,239,240,240,237,240,238,228,234,240,238,239,240,237,239,237,244,237,207,241,220,219,250,209,178,214,221,221,239,237,240,175,216,246,252,246,165,145,139,124,57,144,243,248,251,239,223,242,245,220,133,69,51,51,63,91,174,192,143,72,26,12,92,192,174,139,114,78,53,55,101,134,130,80,12,8,27,15,19,73,45,16,20,13,30,73,57,22,13,10,20,14,20,29,19,16,17,18,19,18,17,20,23,22,19,24,34,30,28,34,33,31,25,31,39,38,34,48,134,61,66,86,137,117,81,185,200,150,115,89,83,130,182,204,245,247,245,246,244,244,250,249,190,220,247,244,247,244,242,253,253,172,146,50,2,141,61,35,110,56,39,27,36,65,53,41,20,47,55,33,26,51,90,50,50,18,65,228,243,250,250,239,233,239,243,243,242,239,241,247,238,209,221,237,237,240,236,238,225,214,229,241,220,206,228,237,207,174,171,205,214,214,216,186,122,89,142,189,123,28,15,32,160,213,143,84,129,227,249,249,249,245,245,247,247,238,226,232,234,253,216,160,161,149,152,143,155,126,44,132,130,88,51,54,219,249,247,227,212,220,214,214,212,210,212,213,215,211,212,213,212,212,212,212,212,214,215,214,215,216,217,214,214,216,215,214,214,214,216,214,217,217,217,218,217,218,219,217,217,217,216,218,218,220,221,218,218,221,224,215,185,189,214,228,241,249,249,222,173,122,79,37,7,91,205,241,243,248,233,229,223,223,219,216,215,218,220,220,218,218,219,217,217,218,220,218,217,217,218,217,217,218,219,217,218,220,218,217,217,217,217,216,219,217,215,217,217,217,215,214,214,216,216,215,215,212,212,213,213,215,212,214,214,214,213,214,113,1,1,4,6,7,7,9,7,9,10,9,10,226,229,226,228,227,226,226,227,230,229,232,230,229,231,231,231,230,232,232,230,229,229,229,230,230,231,229,230,233,233,234,235,236,236,236,237,237,234,233,235,234,234,235,236,238,238,237,237,234,237,238,237,237,235,237,237,236,235,237,237,236,240,238,238,239,235,238,237,236,239,238,239,238,239,238,235,235,235,235,236,236,237,236,235,236,235,236,236,235,236,236,234,234,234,235,234,232,233,235,235,231,232,232,234,235,234,235,233,233,235,231,232,234,232,236,237,235,236,238,239,237,235,236,236,236,237,236,238,237,237,239,237,237,239,238,237,240,239,240,241,241,243,243,244,243,242,244,244,244,243,244,244,244,243,243,245,245,244,245,245,244,245,243,246,244,244,244,245,245,244,244,244,244,243,244,241,239,239,238,239,239,239,239,240,241,239,239,238,237,240,239,239,239,239,239,236,236,232,232,231,230,235,234,236,239,238,238,239,238,239,238,237,239,238,240,240,239,239,239,243,243,244,244,241,241,240,237,235,235,235,237,236,237,238,237,240,239,240,239,238,240,238,240,240,241,240,239,240,242,242,243,244,243,244,244,245,244,244,245,244,242,244,242,240,241,239,241,240,239,240,239,240,239,240,238,239,243,241,223,219,239,241,239,239,238,240,243,236,226,237,244,242,236,233,241,241,240,240,239,241,240,241,240,240,240,240,240,240,238,237,237,237,237,239,239,237,237,238,239,239,239,237,238,239,236,238,240,236,229,232,237,238,238,238,237,239,238,247,233,207,244,206,184,206,202,214,250,245,245,248,238,237,178,223,247,252,243,145,119,110,96,46,148,243,250,250,237,227,245,252,190,99,61,55,37,63,141,240,235,187,166,92,58,44,27,47,17,11,42,18,38,39,20,93,128,98,59,25,15,25,72,47,26,19,17,31,68,65,22,14,8,16,17,17,24,17,13,21,11,17,18,18,18,21,19,16,22,20,22,17,18,27,17,24,22,25,32,24,40,34,30,39,56,98,76,91,187,234,170,97,81,63,87,163,219,248,250,248,242,243,243,252,236,184,230,245,243,245,243,243,253,253,166,132,37,6,136,48,44,109,47,44,27,24,44,57,51,24,46,55,36,28,55,94,53,50,19,51,219,242,249,249,236,235,239,241,243,242,241,242,243,245,213,212,236,237,242,238,239,232,214,226,237,226,205,222,239,217,179,164,199,216,207,215,179,113,43,19,116,161,117,98,69,114,187,139,101,188,251,251,249,247,246,249,248,239,229,233,241,248,253,195,156,154,141,148,137,159,120,54,186,188,120,89,6,93,195,208,226,214,214,214,211,216,215,214,215,214,214,215,215,213,212,212,213,214,214,217,217,217,217,219,217,215,214,217,217,217,217,217,217,216,218,219,219,218,218,220,218,219,219,218,221,220,219,217,218,218,223,224,203,186,204,230,246,247,247,190,143,103,51,12,36,156,229,240,251,240,232,225,223,222,218,219,219,217,216,220,219,218,219,218,220,218,217,217,218,220,220,220,219,218,220,220,218,218,218,218,218,217,219,216,218,218,216,217,215,215,217,215,214,214,216,214,215,215,213,214,214,215,212,210,214,214,213,213,215,113,1,1,3,7,7,7,10,8,10,10,10,10,227,230,229,229,229,226,228,227,229,231,230,230,231,230,230,231,230,232,232,231,234,232,230,232,232,231,232,231,232,232,234,232,234,237,234,237,237,237,238,237,237,236,237,237,238,239,237,237,234,236,237,236,238,238,239,239,237,238,238,238,237,237,237,237,238,239,238,240,240,238,239,239,240,239,237,236,238,237,238,237,236,237,236,238,236,235,235,236,236,236,237,238,237,235,237,237,234,236,236,236,237,235,237,235,235,235,234,235,235,236,236,234,235,235,234,234,235,238,235,237,238,236,237,237,236,237,236,236,237,237,239,239,238,237,239,239,239,241,241,241,241,241,241,242,243,242,242,242,243,242,242,242,243,243,245,245,244,245,245,246,246,246,245,245,245,244,245,244,245,246,244,244,244,241,240,237,237,237,238,241,240,238,238,237,239,240,239,239,238,239,236,237,236,235,232,231,232,232,231,230,232,232,234,235,235,236,237,238,237,237,237,238,239,240,240,239,240,240,242,243,243,242,241,241,240,237,234,235,234,236,236,236,237,238,239,240,240,239,240,239,240,239,239,240,241,241,242,241,240,241,241,242,242,243,245,245,245,244,244,244,243,243,240,240,238,239,238,239,240,239,240,239,238,239,239,238,242,240,220,222,239,241,240,240,239,240,241,231,224,241,243,238,233,235,241,241,240,240,240,240,240,239,241,239,239,239,240,239,238,239,237,237,237,237,237,237,238,238,238,237,239,239,237,239,238,238,238,240,230,226,237,238,238,239,235,240,238,248,226,206,223,168,192,232,228,246,252,240,227,237,239,240,185,229,247,252,231,119,95,89,71,17,126,242,248,250,235,228,245,253,194,101,57,6,2,12,90,188,161,154,177,132,54,14,11,43,36,29,49,24,37,39,17,15,46,134,127,80,48,29,78,49,17,15,14,21,55,73,24,14,14,13,21,22,26,19,16,15,16,15,19,17,15,17,15,20,24,23,18,19,24,20,19,20,23,25,26,21,18,24,19,48,67,95,80,45,53,63,84,81,87,22,66,193,230,250,247,245,244,243,243,252,223,185,238,239,244,244,242,244,253,249,116,117,44,15,144,39,49,98,37,40,25,29,35,35,66,45,46,59,67,104,134,121,57,55,20,44,214,243,250,250,235,236,238,241,240,240,242,241,241,248,221,207,230,234,241,236,238,235,218,224,234,231,206,216,237,223,191,166,188,212,209,217,221,198,155,50,37,120,149,155,136,94,99,96,149,228,252,252,250,246,251,248,244,232,228,240,245,234,233,171,152,130,111,134,133,155,106,73,184,158,113,75,11,3,71,184,222,224,217,213,217,215,215,214,214,214,213,215,216,215,217,214,216,216,214,217,217,220,219,217,219,219,219,217,222,220,217,219,218,221,218,220,220,218,219,220,220,219,220,222,221,218,218,220,219,220,227,222,198,199,229,246,252,227,160,118,72,24,4,36,162,229,249,249,232,232,225,224,220,216,220,222,220,220,221,220,220,220,220,220,222,221,220,220,220,220,220,219,220,220,219,220,219,219,218,218,220,218,217,220,219,218,219,217,217,217,215,215,217,216,215,215,214,215,215,216,216,214,216,214,214,212,213,212,214,113,2,1,2,6,8,7,8,8,10,10,10,9,229,231,227,229,230,227,228,229,228,229,232,229,231,231,231,233,231,231,231,232,232,233,233,232,230,233,231,233,232,230,231,231,236,235,235,237,237,237,236,237,237,237,237,238,239,240,239,238,240,238,240,239,236,237,239,240,239,238,242,240,240,239,237,240,240,239,241,241,239,240,239,239,239,237,238,239,237,236,239,239,239,241,239,240,240,239,237,237,238,237,237,236,237,237,238,237,236,237,237,235,235,236,237,237,234,235,236,233,234,235,234,236,236,236,236,235,236,236,237,235,236,237,239,238,238,237,235,234,237,236,235,237,237,239,240,239,240,238,240,240,241,241,241,242,241,243,243,242,242,242,242,243,243,244,244,244,245,245,245,245,245,245,245,245,247,246,245,246,245,245,244,243,245,242,239,236,237,239,240,241,239,238,238,239,241,240,240,241,237,236,236,233,234,233,233,232,234,233,234,234,232,230,229,232,232,232,235,239,239,238,239,238,241,240,238,240,239,240,241,241,239,239,239,237,239,237,235,237,236,237,236,236,239,236,239,239,239,240,237,239,240,240,241,239,241,240,242,243,243,243,241,243,245,244,244,245,245,244,244,244,242,242,241,240,241,239,239,238,241,239,239,240,237,237,237,240,242,234,216,226,241,240,241,241,241,244,237,224,232,241,242,234,231,239,241,241,241,240,239,239,239,239,240,239,240,240,237,239,238,237,237,236,236,237,238,237,237,237,236,237,237,237,236,237,237,237,237,240,232,227,235,238,238,239,237,240,237,249,218,206,225,206,245,250,232,217,226,200,213,241,241,241,191,234,245,252,222,111,101,94,77,8,100,239,249,250,238,227,245,252,207,84,28,19,41,87,75,48,28,51,136,163,119,38,23,71,38,41,54,22,51,35,16,26,11,22,77,131,110,104,111,47,18,10,16,11,46,76,24,11,15,14,20,17,21,16,16,16,14,16,14,13,17,18,16,20,24,20,17,16,19,21,15,18,20,20,28,15,22,18,31,63,69,97,65,51,47,22,42,74,93,21,56,198,240,250,251,244,241,240,245,252,204,193,244,236,245,243,241,240,253,232,95,110,36,27,145,34,63,91,32,38,17,32,29,25,51,59,71,98,153,174,169,154,77,52,17,34,204,243,250,250,234,235,239,242,241,240,238,240,238,244,232,204,226,237,239,240,236,241,227,228,240,237,211,211,235,227,199,171,178,214,210,222,231,238,221,114,93,147,150,152,127,75,19,57,196,247,249,249,247,247,250,245,232,226,238,244,246,253,210,153,151,117,95,123,131,151,97,104,213,141,88,65,4,49,163,204,222,222,217,217,212,217,215,212,212,214,216,215,214,216,215,216,217,217,218,219,220,222,220,220,219,219,221,221,221,219,219,221,219,220,220,220,221,218,219,220,222,222,222,223,223,220,221,221,221,222,229,219,199,224,249,245,184,132,87,32,5,23,98,176,240,242,236,233,228,225,223,218,217,217,220,220,220,222,222,221,217,220,220,220,221,218,219,220,220,221,219,220,220,219,220,220,220,219,218,219,219,217,218,220,220,217,217,220,219,217,215,218,217,216,215,215,218,216,216,217,217,214,214,214,214,216,216,214,212,111,2,1,2,6,8,7,9,9,9,10,9,10,228,231,230,229,229,227,228,229,228,229,229,227,232,232,232,232,231,234,231,232,230,231,230,229,232,231,230,231,233,231,233,235,236,237,237,239,236,237,237,238,237,236,237,237,239,239,240,240,240,241,239,239,240,239,239,240,239,239,240,239,239,241,239,240,241,241,240,240,240,239,241,239,237,239,239,238,239,239,237,236,238,237,238,240,240,239,239,239,237,240,238,238,239,237,237,239,237,237,236,235,237,233,237,235,234,235,232,234,235,234,235,235,236,234,238,237,236,238,236,238,237,237,237,237,238,237,235,234,236,236,237,237,237,238,241,240,239,241,239,239,240,242,241,240,242,243,240,243,243,242,244,242,243,243,242,242,242,244,244,245,246,245,246,246,246,247,243,244,244,243,242,243,244,241,240,240,241,241,242,241,240,239,238,240,241,241,240,239,237,237,234,235,234,235,234,235,233,235,235,233,232,229,229,229,232,234,235,236,238,240,239,238,239,239,238,239,239,238,236,237,237,239,237,236,236,235,236,236,236,235,237,237,237,237,237,239,238,238,239,239,239,239,239,240,239,240,242,243,244,243,244,243,244,244,244,245,246,246,245,244,244,243,241,241,240,240,239,239,239,237,239,237,237,237,239,240,242,226,213,232,241,239,238,237,240,241,231,224,235,242,240,231,234,240,240,241,240,239,239,238,239,239,238,237,239,237,238,238,236,237,237,238,237,236,236,235,237,237,237,237,237,237,237,237,237,237,237,240,236,229,235,237,237,239,239,240,240,248,220,227,244,213,232,220,184,195,235,226,235,247,242,242,200,236,248,252,220,106,88,85,73,11,108,240,249,250,235,228,245,253,199,69,57,78,107,93,54,35,26,29,71,155,181,113,64,75,46,55,65,48,60,36,26,32,16,22,13,39,107,164,156,66,30,12,11,11,40,71,31,16,13,14,22,17,17,15,15,14,15,13,18,19,12,13,17,26,22,18,17,16,18,14,16,13,18,18,19,17,13,19,49,83,75,70,54,53,39,22,51,68,78,33,72,193,238,250,247,241,240,236,245,251,190,201,245,234,247,239,242,243,251,218,77,116,44,33,143,28,70,84,21,36,21,27,30,25,32,54,76,100,133,134,124,104,71,54,13,33,207,242,249,250,229,234,239,241,241,238,238,236,237,242,236,212,223,244,252,252,252,252,251,249,246,244,218,205,231,227,202,171,171,202,215,223,236,243,217,144,144,171,156,148,130,78,54,133,228,247,249,249,248,246,248,240,229,231,245,244,243,234,181,147,151,134,124,128,132,151,133,156,207,107,78,49,9,116,238,237,228,218,217,218,212,215,212,211,214,214,214,218,217,216,219,220,222,222,224,222,220,223,222,220,222,218,222,222,222,222,220,222,217,218,219,220,220,220,223,227,230,236,239,233,228,222,221,220,219,227,231,214,217,237,209,151,114,61,9,14,117,175,196,243,248,237,232,227,221,221,217,217,218,219,223,220,220,221,217,217,219,219,217,217,219,217,219,219,217,217,218,219,219,218,218,220,216,218,220,217,217,216,218,219,217,218,220,218,218,219,218,215,217,220,218,215,214,214,215,215,214,214,212,213,213,213,215,212,217,114,1,1,3,7,7,7,10,9,10,10,9,9,225,227,229,229,229,227,226,229,227,229,232,230,233,231,230,233,232,232,234,232,234,232,232,234,234,234,231,232,234,232,233,235,235,235,235,236,238,236,237,240,238,239,237,240,239,240,242,237,240,239,240,240,239,240,240,240,241,241,241,240,240,241,241,241,241,239,240,240,241,240,239,240,241,240,240,241,240,239,239,237,239,238,238,239,239,239,238,239,239,239,239,237,239,238,236,235,235,236,236,237,237,235,236,234,234,234,234,235,235,234,235,236,236,237,237,236,235,235,236,236,238,236,236,237,237,236,234,234,235,236,237,237,236,237,239,239,239,240,240,240,242,241,241,242,241,241,241,243,242,241,242,242,241,239,237,236,241,241,244,246,244,243,241,243,242,241,240,240,240,241,242,241,242,242,243,244,244,242,243,242,242,240,239,238,239,240,239,238,235,235,236,237,236,237,238,234,236,235,232,232,230,229,229,229,232,235,237,239,239,239,239,239,239,242,239,240,239,239,237,237,236,235,237,233,234,233,231,235,235,234,236,234,237,235,238,238,238,239,240,239,239,240,241,242,242,240,241,241,242,243,245,245,243,243,245,245,245,246,246,245,242,242,242,242,241,241,241,239,241,239,238,239,238,237,238,242,240,218,216,237,240,238,238,238,240,238,222,226,241,241,237,232,239,241,240,241,239,240,240,241,242,239,240,239,237,239,238,239,238,240,240,237,237,235,237,236,237,237,238,237,236,238,236,238,239,240,238,240,237,228,234,239,237,237,239,241,243,250,215,232,229,169,202,213,215,236,252,241,252,244,239,238,201,242,244,252,210,87,63,53,59,6,107,239,247,250,234,229,245,253,200,73,62,66,64,59,61,55,44,50,39,99,188,178,122,71,27,55,68,54,61,43,32,34,24,35,22,31,20,63,154,97,68,42,23,13,34,76,30,12,12,14,16,17,18,14,17,17,15,19,16,16,15,17,32,33,27,20,12,17,19,14,17,16,15,20,21,15,22,21,68,101,77,62,43,54,33,39,69,66,61,21,77,207,246,248,248,239,240,235,247,241,180,212,244,236,247,237,240,243,251,206,62,105,50,46,137,22,76,73,21,36,8,22,22,28,35,30,53,101,110,77,59,53,48,57,30,79,222,244,250,250,228,234,239,239,244,239,236,236,236,239,248,229,238,249,252,252,252,252,252,240,251,249,223,205,225,229,208,180,164,196,214,224,207,178,193,183,183,183,156,137,109,99,128,227,252,252,250,247,251,247,243,231,232,244,249,247,253,226,159,148,154,151,150,141,141,155,173,188,129,61,59,35,12,150,241,231,234,217,218,216,214,214,211,212,210,216,214,221,228,230,243,249,248,240,234,232,230,227,225,223,220,220,222,221,222,221,223,223,218,219,219,221,222,225,234,239,248,248,251,249,228,226,224,223,230,236,240,214,197,167,124,83,28,6,80,198,217,215,235,240,238,230,227,223,218,219,218,220,219,218,218,219,220,219,220,220,218,217,217,219,218,217,220,221,218,217,218,218,217,217,218,218,220,219,217,218,218,218,221,221,218,218,219,215,216,219,217,217,217,215,215,214,216,215,214,214,213,215,214,214,214,214,216,214,216,113,1,1,4,7,7,7,10,8,9,10,9,9,225,229,226,225,229,226,226,227,228,227,229,230,230,229,231,230,230,233,232,231,232,235,233,234,235,235,236,234,233,236,237,235,236,236,238,240,237,237,237,238,238,237,239,239,240,240,240,240,238,239,240,239,239,238,237,239,239,240,241,239,241,241,239,240,239,238,238,238,240,239,239,239,238,240,238,239,241,239,239,239,240,240,239,238,237,238,239,237,234,237,237,237,239,236,236,236,235,236,236,235,237,236,237,235,235,235,234,234,234,236,234,235,236,234,236,233,234,235,232,235,233,235,237,236,237,235,234,237,237,237,237,237,237,239,237,237,239,238,240,241,242,242,242,241,242,242,241,242,240,239,238,235,234,234,235,234,236,239,241,242,238,235,234,232,230,231,233,235,239,240,241,242,243,244,244,245,244,243,244,242,241,241,239,239,236,237,238,238,237,236,236,235,235,237,235,235,235,236,236,235,231,231,230,231,232,233,238,239,241,241,242,241,242,242,241,241,239,240,239,235,234,233,232,232,233,229,231,231,232,233,234,233,232,235,235,236,236,235,237,236,236,237,238,238,239,238,238,239,242,242,243,244,245,245,244,243,244,244,244,241,242,241,241,244,242,243,241,241,241,242,239,237,237,237,238,242,235,216,222,240,240,239,239,237,241,234,220,230,240,239,232,229,238,240,241,240,239,239,239,239,238,239,238,237,239,238,239,239,240,241,239,239,239,237,236,236,238,237,237,237,239,239,238,239,238,239,239,239,239,227,229,236,237,240,240,239,245,243,209,216,199,190,233,236,239,246,252,223,221,239,238,233,200,239,243,252,212,100,81,81,72,9,106,238,245,250,232,229,244,252,231,114,46,5,17,53,66,54,40,48,44,54,115,171,180,108,32,30,57,53,62,41,33,33,33,39,30,35,30,19,63,119,157,112,72,33,29,74,35,14,14,13,20,18,17,17,17,16,13,18,15,12,19,26,39,37,28,19,21,22,21,27,16,22,27,27,27,29,30,40,63,83,86,67,52,57,42,58,92,87,54,12,107,232,250,248,242,236,239,233,249,231,174,222,240,232,244,236,238,244,251,193,54,90,51,57,125,22,84,65,10,32,10,25,19,24,30,31,26,62,85,56,45,33,48,43,41,117,230,246,249,249,237,239,247,252,251,247,247,242,239,241,253,237,212,206,234,240,176,164,144,157,236,250,232,205,227,238,218,191,168,187,216,195,123,75,98,136,137,127,107,81,68,126,198,234,250,250,252,249,250,246,237,228,237,247,248,234,234,191,151,148,157,150,148,139,150,181,155,125,69,33,51,15,34,208,247,233,238,220,220,216,214,214,216,215,217,215,218,231,236,247,246,246,251,251,252,251,249,243,231,228,225,223,223,219,219,221,220,222,221,222,222,222,223,232,241,243,235,222,227,246,244,226,227,232,243,246,232,175,136,98,53,11,26,126,229,242,222,203,229,237,232,226,221,219,214,220,221,221,220,218,221,217,219,221,218,220,219,220,219,218,220,219,220,219,220,220,218,218,220,219,218,219,217,218,217,218,220,217,218,219,218,217,215,217,217,215,214,215,217,214,215,215,214,215,215,216,215,214,214,214,216,214,215,213,213,113,2,1,2,6,8,7,10,9,9,10,9,10,230,229,229,229,229,229,227,229,227,226,230,227,229,229,229,234,231,232,231,227,232,231,232,233,230,234,236,234,237,236,236,239,238,238,240,239,240,239,239,239,238,239,237,239,240,240,240,238,239,237,237,237,237,239,240,239,239,240,239,240,242,241,238,239,240,240,239,240,240,238,240,240,239,239,239,241,239,239,237,239,241,240,240,239,239,238,238,238,237,239,238,237,237,238,237,238,239,238,237,237,236,234,235,234,236,236,236,237,236,235,234,235,236,236,234,235,234,234,234,233,235,234,237,237,238,237,237,238,239,239,240,240,241,239,239,240,239,241,240,240,240,240,241,240,242,241,240,242,240,237,237,236,237,238,238,238,237,235,235,234,230,230,228,230,232,234,236,239,241,244,245,244,245,244,245,245,244,245,242,241,240,240,241,237,236,235,235,239,239,239,237,236,235,237,236,236,237,236,234,234,233,232,232,233,234,236,239,241,242,244,245,246,243,242,243,242,242,241,238,237,233,233,234,231,232,230,232,235,234,236,236,234,234,235,236,235,235,235,236,235,236,236,236,236,237,238,237,240,241,242,243,243,244,245,245,246,246,245,245,242,241,241,243,245,246,245,243,243,241,241,240,239,239,239,239,242,229,214,231,242,240,240,236,240,239,224,223,237,240,236,228,234,240,239,241,239,238,239,239,239,239,239,240,238,237,239,237,240,241,242,242,240,242,240,239,239,239,238,240,239,241,241,238,240,239,237,239,240,242,229,229,238,240,241,240,241,245,240,203,223,215,216,252,245,237,248,236,193,226,238,241,225,192,241,241,252,227,146,142,134,119,42,139,243,248,251,234,229,245,251,241,128,27,2,12,57,62,45,48,45,28,35,39,91,184,152,53,24,41,50,55,36,17,23,23,33,17,35,15,60,189,225,242,214,193,126,110,118,31,8,11,12,14,14,17,19,17,14,16,22,20,16,34,37,33,33,37,37,32,35,35,39,39,34,39,40,41,39,44,53,75,79,69,65,67,64,56,78,92,88,45,59,174,245,251,244,243,236,239,235,252,213,177,234,234,237,241,235,237,248,251,175,40,59,65,69,108,29,87,56,12,32,10,24,20,16,24,34,28,38,73,87,70,48,76,51,36,138,230,246,250,251,246,250,252,252,253,253,251,244,240,244,253,208,121,43,30,79,28,53,89,74,196,241,249,248,249,250,231,204,173,185,216,208,112,17,17,40,49,37,44,27,29,122,229,245,247,247,252,249,249,240,233,239,247,252,252,252,235,161,149,156,159,162,142,139,150,156,126,61,25,24,33,2,91,239,252,252,238,225,229,221,218,213,216,218,216,220,225,238,228,179,150,152,198,237,250,250,252,252,241,226,226,225,222,220,222,222,221,221,223,222,221,222,225,235,228,168,123,86,112,204,228,232,240,248,250,238,166,114,77,24,5,83,205,237,250,217,183,218,231,231,223,218,220,220,220,220,219,220,221,221,222,220,220,221,220,219,219,221,221,220,220,219,220,220,220,221,220,221,218,220,221,218,220,220,218,220,220,220,219,220,220,219,218,218,218,214,217,220,218,217,217,218,216,216,215,216,215,215,214,214,214,216,217,213,214,112,2,1,3,7,8,6,9,9,9,10,9,10,229,230,228,227,231,229,230,229,230,229,229,230,229,232,233,232,232,235,234,232,233,232,231,232,232,231,231,232,232,235,236,234,235,236,237,239,240,238,239,240,239,239,239,239,240,236,236,237,234,233,234,237,236,235,237,237,238,238,238,238,239,239,238,237,239,238,239,240,237,239,238,239,239,240,237,236,237,237,238,236,239,237,237,239,240,239,236,237,237,239,236,236,239,238,238,237,238,236,236,236,234,233,232,233,236,236,236,234,235,235,235,235,234,232,234,234,232,234,232,234,234,235,236,235,236,237,236,239,238,238,239,237,239,240,239,239,241,239,240,241,241,241,241,240,241,240,242,242,241,240,239,240,240,238,237,236,235,232,229,230,229,226,228,232,234,237,240,241,243,245,247,247,247,246,245,245,244,241,240,239,237,237,238,236,235,235,236,238,236,235,237,236,234,235,233,233,234,230,232,232,232,233,233,234,235,238,240,240,242,244,243,244,245,245,242,243,242,242,238,238,237,234,233,232,231,230,232,232,234,235,232,234,231,231,232,232,234,233,232,234,233,231,233,236,235,232,235,237,239,242,243,244,244,244,245,244,245,244,243,242,242,242,244,244,244,244,243,242,240,241,239,238,240,239,238,241,222,214,233,239,239,238,238,240,234,217,223,238,239,231,225,235,238,237,239,237,237,238,237,238,238,239,239,239,240,239,239,240,241,241,242,243,242,240,241,239,241,241,239,240,238,237,237,238,238,239,239,239,241,235,232,239,242,241,243,242,249,237,215,241,223,232,242,222,211,203,230,215,241,245,237,214,189,240,239,252,231,160,147,140,122,53,171,244,247,250,227,229,245,249,236,129,40,1,11,49,66,57,41,30,22,30,42,35,121,180,121,53,44,44,53,28,19,20,18,19,24,30,35,190,252,252,250,179,130,162,178,176,63,16,7,5,14,13,17,14,17,14,17,22,21,33,43,37,24,45,70,60,71,72,71,65,45,46,42,49,50,47,54,65,77,79,72,70,70,60,61,74,84,64,33,57,182,246,250,246,238,234,236,234,250,197,182,236,232,240,239,239,239,250,249,173,74,46,73,74,69,38,93,45,15,32,8,21,16,21,18,29,28,86,171,185,145,78,89,84,50,124,217,242,251,251,250,239,217,235,247,248,242,229,244,249,250,195,82,23,3,11,10,72,96,24,137,246,251,251,253,253,249,216,186,179,214,227,151,59,41,54,46,54,34,17,24,52,158,240,250,250,250,247,245,238,238,239,245,250,234,229,187,149,153,153,160,158,138,141,146,139,85,27,29,15,31,8,92,226,246,246,249,229,228,216,221,215,214,214,216,218,230,237,166,89,81,48,18,109,119,152,199,229,230,225,226,226,225,224,223,222,220,224,221,220,220,222,223,231,186,93,36,17,11,69,184,232,251,251,208,148,101,56,4,27,153,232,250,250,220,183,202,225,229,220,219,220,217,220,221,220,218,219,221,220,217,217,219,217,219,221,219,218,217,217,221,220,222,221,218,220,217,218,220,218,218,220,218,218,219,217,217,217,220,219,216,220,219,216,217,219,219,218,218,214,218,215,214,218,216,214,214,216,213,213,217,214,214,212,215,114,1,1,4,7,7,7,10,9,10,11,10,10,226,230,228,230,232,229,227,230,230,231,233,232,232,231,232,232,233,234,232,233,232,231,233,233,234,232,232,232,235,234,234,235,233,235,236,237,238,238,239,237,238,239,239,239,237,237,237,236,237,236,235,235,237,238,237,238,237,239,238,240,240,239,241,239,237,238,238,239,240,238,239,239,238,237,237,238,237,239,237,239,239,237,239,237,239,237,237,237,237,236,235,237,235,237,236,236,237,236,237,237,236,236,235,235,236,232,235,234,234,234,234,234,232,234,234,234,236,235,234,235,234,235,235,235,235,237,240,236,237,237,237,237,237,239,240,241,240,241,240,240,239,241,242,241,243,241,241,241,243,243,242,240,241,237,235,235,231,229,231,230,229,231,231,234,236,237,239,240,243,246,247,247,245,246,246,243,240,238,239,236,235,236,234,234,235,235,237,237,236,235,236,235,233,235,232,232,233,232,235,236,233,234,236,235,237,240,242,244,243,245,244,244,245,244,244,243,244,244,244,243,239,236,234,231,230,229,230,233,233,232,232,230,230,230,231,231,231,232,231,232,231,230,233,231,233,232,235,239,240,244,245,245,244,244,244,244,243,244,243,241,244,244,245,246,244,244,243,243,241,241,242,241,241,239,240,240,220,223,240,240,239,238,237,240,226,215,231,237,235,226,228,236,237,238,239,237,236,237,239,239,240,240,241,240,239,240,241,242,241,244,243,243,243,242,242,242,241,240,240,238,239,240,239,240,239,237,239,239,244,237,233,241,243,240,242,242,249,237,217,244,207,190,220,203,188,221,243,229,251,236,237,204,185,241,236,252,223,147,133,124,105,49,169,244,248,251,228,228,244,249,238,188,119,35,15,31,47,44,26,18,20,21,36,45,68,130,179,137,64,49,49,27,20,17,14,24,31,150,250,250,250,250,224,105,19,2,110,203,128,70,29,9,13,10,14,15,11,20,15,20,24,57,125,84,26,30,59,79,85,89,80,59,39,33,31,35,42,42,55,62,75,86,69,62,62,46,61,77,72,50,9,66,211,246,250,240,234,234,233,235,246,184,191,241,238,251,252,252,252,252,250,200,117,41,86,80,34,47,93,36,17,24,15,44,33,27,17,23,38,115,184,160,118,49,60,100,96,133,203,240,252,252,211,135,74,33,55,176,229,238,252,252,250,146,73,29,2,30,16,8,12,12,132,187,155,133,175,234,250,247,188,172,207,238,191,94,78,66,57,63,39,28,29,1,95,235,250,250,249,245,247,242,246,246,247,252,253,202,150,142,155,153,162,158,136,141,145,123,48,9,16,22,20,27,28,14,128,222,250,250,224,222,221,221,223,218,227,234,247,250,171,93,104,57,7,7,8,10,27,147,219,225,235,223,228,228,233,237,239,235,228,225,224,224,222,227,183,89,45,29,8,91,211,238,250,177,123,76,27,5,97,212,245,246,250,226,181,187,222,227,220,220,223,222,221,221,221,220,219,219,220,219,219,218,217,219,220,220,220,220,218,221,219,217,219,218,219,220,218,221,218,219,223,218,221,219,217,218,217,217,219,219,217,219,218,218,218,219,220,217,218,217,217,215,215,217,215,215,214,216,214,213,216,217,216,212,216,113,2,1,4,7,7,8,10,8,9,11,10,10,229,228,229,229,229,226,227,229,229,230,232,232,232,233,231,231,231,230,230,229,231,231,233,235,232,232,233,235,235,234,236,235,234,237,237,238,238,237,239,237,236,237,235,238,237,234,237,236,236,237,234,237,237,237,238,237,239,237,238,237,237,238,239,238,238,237,236,238,237,237,237,239,238,238,237,236,236,236,237,236,239,237,236,237,237,236,236,237,235,234,236,237,235,235,236,234,236,237,237,236,237,235,233,235,233,232,232,233,231,232,234,232,233,233,235,233,232,234,232,234,235,234,235,235,236,234,235,234,234,234,236,234,236,238,237,240,241,239,241,240,239,239,239,239,242,241,242,244,242,240,238,237,234,232,232,232,233,233,233,232,233,234,235,236,233,234,237,240,241,244,244,241,240,239,237,235,236,235,233,233,232,234,234,233,234,235,237,237,235,236,234,233,233,232,233,233,234,233,236,235,234,234,235,237,238,239,241,242,243,243,244,244,244,246,244,244,245,245,243,243,239,235,232,232,230,230,233,232,234,231,231,231,231,232,232,230,231,231,230,229,229,231,229,231,234,235,237,238,241,244,245,246,245,244,244,244,244,244,244,244,243,244,245,244,246,244,244,242,241,241,241,240,239,241,244,237,222,231,243,240,240,236,240,238,220,223,237,239,232,224,234,239,236,239,240,238,239,242,240,238,241,242,243,243,244,241,241,243,241,243,243,242,244,244,243,242,243,242,242,241,242,240,240,242,242,240,240,240,241,239,235,240,244,242,244,242,248,226,214,214,179,211,231,222,225,244,252,201,225,237,235,202,185,238,235,252,212,151,139,128,102,39,170,243,246,251,227,232,243,244,244,253,227,118,35,1,3,25,22,16,20,17,32,48,60,50,121,184,130,87,39,23,23,10,19,18,91,235,248,232,198,121,103,45,13,21,8,94,150,85,104,49,32,11,13,15,14,14,13,23,22,157,249,137,29,11,67,111,83,61,48,36,22,21,19,24,29,37,55,63,85,88,71,53,35,34,50,55,46,29,103,173,209,238,232,237,236,236,233,243,244,186,218,250,252,252,252,252,252,252,248,199,133,32,84,77,12,52,86,29,25,10,97,157,62,25,12,23,65,135,168,113,77,39,45,131,130,137,211,239,251,249,164,87,26,5,18,122,222,249,253,253,165,87,31,7,9,7,18,128,145,89,60,12,6,14,17,81,193,235,207,165,193,241,214,129,101,64,34,39,37,30,107,68,119,234,250,251,250,247,249,243,246,245,248,234,234,176,147,147,153,155,167,155,139,138,134,137,99,96,41,40,33,28,27,27,18,95,212,234,246,219,223,222,225,230,239,249,249,249,170,84,12,60,198,176,62,18,21,136,242,237,232,230,237,245,250,250,249,249,234,228,229,228,236,249,226,167,127,66,69,194,246,208,164,109,64,16,28,160,236,243,252,249,231,186,174,212,224,224,223,223,222,220,224,221,219,221,219,220,221,221,220,219,221,222,221,219,220,221,223,222,220,220,217,218,221,222,222,220,221,219,218,220,220,221,218,218,220,218,217,218,219,219,218,217,218,219,218,220,219,215,219,219,217,217,217,216,216,217,215,215,217,216,216,216,217,112,2,1,3,8,8,6,10,9,10,10,10,10,228,232,229,229,229,227,231,231,231,231,231,233,233,233,234,232,232,233,232,233,232,232,233,231,232,231,234,233,233,234,235,238,237,239,239,239,238,237,237,234,235,234,234,235,236,235,236,234,235,236,237,238,237,236,237,240,239,235,235,237,237,235,235,235,237,238,237,238,237,236,236,236,236,236,236,235,237,238,236,237,235,235,237,236,236,234,237,236,237,236,235,235,233,235,237,236,235,234,236,235,233,236,232,234,234,233,231,231,232,231,232,231,231,233,233,229,232,232,230,232,231,232,234,234,233,233,234,232,232,232,235,236,237,240,239,240,240,238,239,237,239,243,241,242,241,239,240,239,239,236,235,233,231,232,231,233,233,236,240,236,236,235,235,234,234,236,237,240,237,239,236,235,232,229,231,230,231,232,232,231,231,233,230,233,233,232,235,234,236,234,234,234,232,233,233,236,235,234,235,236,236,237,236,238,240,239,238,239,239,239,242,243,245,245,245,245,244,244,241,237,232,229,229,231,231,230,232,233,232,233,232,230,230,229,232,232,230,230,229,232,232,230,233,232,237,237,236,241,243,244,244,244,245,245,244,245,244,244,245,245,245,242,242,244,243,243,242,242,242,242,244,242,244,244,246,236,224,236,241,239,240,239,240,231,217,229,241,237,227,226,237,239,239,239,240,241,244,245,245,245,245,245,246,245,245,245,241,242,243,244,242,242,241,242,242,243,242,242,243,242,244,244,243,241,240,240,239,240,240,240,237,241,245,241,243,245,247,221,207,222,211,244,252,235,235,252,215,182,230,233,237,196,191,243,236,253,206,157,150,139,109,43,177,243,247,251,228,236,245,244,235,252,251,174,136,54,1,7,15,14,16,15,27,48,67,38,30,132,187,139,50,12,10,12,24,51,40,65,64,39,90,51,37,45,15,21,12,63,62,71,160,148,92,54,25,19,19,17,7,23,20,125,210,89,33,15,56,102,66,48,34,15,20,17,17,29,18,30,63,81,96,73,49,53,29,31,27,51,92,130,172,134,174,226,232,243,238,246,252,252,252,229,251,251,252,224,161,143,171,241,244,194,127,15,82,91,12,60,67,27,29,5,161,193,53,26,12,35,97,174,195,150,117,91,75,97,107,141,236,248,251,251,179,77,29,5,23,194,251,251,248,168,78,21,1,15,2,58,211,249,235,97,7,5,8,13,21,1,54,188,204,178,188,237,233,169,144,81,30,35,11,66,214,165,151,240,243,251,250,248,249,240,243,248,251,251,214,158,150,153,153,165,167,148,138,135,140,169,181,216,185,76,14,13,35,29,30,7,74,211,248,243,223,219,234,237,249,240,182,134,87,43,42,155,241,241,191,59,32,154,241,241,240,238,242,240,245,235,229,244,249,237,240,239,249,249,250,223,142,74,15,110,167,131,109,48,14,74,190,241,250,250,242,235,196,171,196,222,226,221,223,223,222,221,219,220,220,219,219,218,219,220,222,221,220,219,218,218,219,221,220,221,221,219,221,218,220,220,220,220,222,222,220,218,219,218,218,220,221,219,218,219,219,220,218,220,219,219,219,218,220,217,218,219,218,217,217,221,214,214,214,215,217,215,216,213,215,113,2,1,2,7,8,7,10,9,10,10,10,10,227,229,230,229,229,228,230,232,230,230,232,230,234,234,230,232,232,231,234,233,234,232,231,232,229,230,232,230,233,235,234,235,233,237,236,237,237,232,234,233,233,235,234,236,235,232,235,234,234,235,233,235,236,235,239,237,236,237,236,236,237,234,235,236,236,237,235,236,235,236,235,234,235,235,235,236,234,236,237,235,235,235,234,232,236,235,233,235,234,232,234,234,234,234,234,235,234,234,233,232,234,233,232,235,235,233,233,231,232,231,230,230,230,232,232,232,232,231,230,231,231,232,233,231,230,230,232,231,233,234,232,235,237,240,237,237,236,237,238,236,239,237,239,239,236,234,234,234,234,235,232,232,234,232,231,234,234,235,236,237,239,237,237,238,237,237,239,239,237,235,233,232,231,231,231,231,233,231,233,234,231,232,232,230,230,232,233,232,230,232,233,232,232,230,232,233,233,234,235,236,238,240,241,240,239,238,238,240,240,240,241,242,243,243,245,242,243,242,236,232,229,227,227,227,228,228,229,229,231,233,232,231,228,230,232,233,234,231,233,235,234,233,232,235,238,238,240,242,245,245,245,244,244,244,245,246,245,246,246,245,244,244,244,242,244,241,243,243,243,243,241,244,244,245,246,231,227,241,241,241,240,239,240,224,220,235,240,234,225,231,238,239,241,241,245,247,246,246,245,245,245,244,244,244,245,245,243,243,243,244,242,241,240,242,244,242,243,242,242,244,245,244,244,241,241,241,241,242,241,243,238,241,244,242,242,245,246,220,230,233,222,249,239,212,188,212,222,211,251,238,231,196,197,242,239,252,200,155,123,125,105,44,185,243,249,251,226,236,243,241,223,248,233,194,232,164,145,69,3,3,12,14,22,53,72,42,23,73,162,195,93,24,8,20,46,46,35,29,27,38,96,82,44,30,12,25,19,65,72,32,36,113,185,135,84,51,30,19,14,27,29,31,47,50,38,15,17,55,56,39,19,10,17,15,21,24,15,35,50,71,100,66,41,19,17,44,88,191,196,169,123,133,222,235,243,251,251,251,251,251,252,227,252,252,248,138,101,99,45,141,159,133,99,10,94,98,19,65,60,24,35,8,49,110,60,29,14,48,165,204,179,123,112,96,40,44,57,106,248,248,250,234,123,49,16,1,104,240,252,252,175,68,24,2,33,70,101,220,253,253,135,23,28,78,102,56,11,8,19,129,198,184,198,231,228,169,145,63,24,13,33,189,234,223,148,217,248,247,248,244,244,242,245,234,234,227,186,156,156,152,156,164,164,146,135,133,156,212,201,250,232,107,88,136,97,44,24,23,18,97,233,237,236,224,235,242,196,123,54,10,1,18,11,25,162,226,220,125,17,58,191,232,249,247,211,145,124,83,110,199,241,238,237,214,214,242,229,185,156,107,61,46,39,39,42,10,82,194,248,248,246,239,238,210,171,180,216,227,225,223,221,223,223,221,220,217,218,218,221,225,228,231,231,231,224,221,220,219,218,219,220,220,218,217,219,221,216,217,221,220,221,217,219,221,219,220,220,220,217,220,218,218,217,218,220,217,217,220,218,218,219,220,217,217,216,217,216,213,214,214,213,215,216,218,216,214,214,114,2,1,4,6,7,8,10,8,9,11,10,10,225,229,228,229,229,226,228,229,232,231,233,233,230,232,232,231,233,232,232,235,235,235,230,229,229,227,229,230,232,234,235,234,234,236,233,235,234,232,235,233,233,235,236,238,232,231,234,232,235,234,235,234,234,233,234,236,235,237,239,239,236,235,239,236,236,237,234,234,235,233,235,235,234,234,234,231,232,232,232,232,233,234,234,234,232,233,235,231,232,236,232,233,234,234,233,232,233,231,231,231,232,231,231,236,233,236,234,232,231,227,230,230,228,230,233,231,232,232,232,233,232,231,233,234,232,232,233,234,233,234,236,236,236,236,235,235,239,234,233,233,233,235,231,232,233,231,233,231,233,235,237,236,236,235,232,235,236,236,236,237,239,238,238,238,238,239,237,238,236,236,231,230,233,232,235,233,235,235,236,235,231,235,232,233,232,233,235,232,232,231,232,232,231,231,233,231,232,237,239,239,241,242,241,240,241,241,240,241,241,241,242,241,242,242,241,241,240,237,232,228,227,227,225,228,229,227,228,231,231,233,234,229,232,231,233,236,231,234,233,234,234,230,236,236,237,241,240,243,245,244,245,244,244,242,242,245,246,246,246,245,245,246,245,244,242,245,244,245,245,244,244,243,246,247,241,230,233,244,242,241,242,242,239,225,232,244,242,236,230,239,243,241,245,245,248,249,248,247,247,246,246,247,244,245,244,244,245,243,244,242,241,241,240,242,242,244,245,244,244,245,245,244,245,244,243,243,242,244,245,245,242,237,242,243,240,246,241,215,225,224,186,204,209,181,192,240,240,230,248,235,232,193,201,241,241,252,191,145,112,116,96,39,185,244,248,250,226,236,244,242,225,237,225,209,252,252,245,174,71,12,1,5,4,21,53,36,21,9,16,109,123,56,21,26,36,34,25,28,32,45,94,83,42,29,19,29,34,67,79,40,30,27,64,136,165,128,76,41,28,32,33,30,29,30,36,20,17,39,41,33,15,15,18,11,20,27,21,42,40,35,63,69,112,146,174,179,171,230,191,130,120,169,249,249,252,252,250,250,210,160,105,68,130,194,146,53,122,104,25,21,11,23,49,13,74,103,42,74,50,22,32,3,75,116,57,32,17,78,184,193,123,63,43,129,141,74,24,55,216,236,183,99,38,8,13,5,42,191,194,148,80,12,11,98,239,251,252,252,240,125,34,111,246,248,244,188,95,15,15,125,188,203,203,235,191,104,63,18,8,24,97,205,251,243,125,185,250,245,249,245,249,249,250,253,236,201,160,152,158,151,160,167,161,143,136,133,170,235,177,148,110,103,232,248,208,109,35,23,12,25,159,233,249,236,247,201,120,59,2,3,9,17,39,17,37,154,205,144,53,14,94,219,249,249,205,100,49,19,9,75,179,210,220,137,102,130,111,109,137,138,113,92,39,22,27,35,154,238,249,248,239,238,223,179,167,209,226,225,224,222,223,221,220,221,220,224,228,233,243,247,249,249,249,246,233,226,223,224,219,221,221,222,221,217,222,219,221,221,223,223,219,221,222,219,219,221,221,220,218,220,220,220,220,221,220,219,218,218,219,218,219,216,216,215,217,216,215,216,214,217,216,214,214,217,216,213,218,113,2,1,4,7,7,8,10,8,10,10,9,10,229,230,229,229,230,227,229,230,230,229,233,232,231,232,230,234,232,233,232,231,233,231,232,229,226,225,227,227,230,230,231,235,236,237,234,235,233,232,234,234,234,235,235,236,232,231,232,232,232,232,232,231,232,233,235,234,236,239,237,236,240,235,237,237,236,238,235,237,236,235,234,232,232,230,232,232,230,231,231,231,231,231,234,232,232,231,232,234,233,232,234,232,233,235,234,234,233,231,232,233,232,232,231,233,236,235,233,230,231,229,230,233,230,231,231,228,229,231,232,234,235,235,235,234,234,232,232,232,234,235,235,235,235,235,232,232,233,230,232,232,233,230,231,232,228,229,231,228,231,232,232,235,235,231,234,235,233,237,236,237,239,237,239,240,237,234,234,234,234,233,232,232,231,231,232,232,233,234,235,235,236,236,235,234,234,234,233,231,230,231,229,231,232,231,233,235,236,236,238,236,239,239,239,241,240,240,241,242,240,240,240,240,240,240,240,237,235,230,229,226,226,225,226,227,227,231,231,230,231,232,231,230,230,232,232,235,235,232,232,232,232,232,231,234,236,235,239,241,242,244,245,244,244,243,243,243,244,245,245,246,245,246,245,244,246,245,245,244,245,245,245,245,245,247,240,227,237,244,240,241,241,245,236,229,242,245,244,237,238,244,246,247,247,247,247,247,247,246,245,245,246,246,247,247,245,245,244,244,244,243,242,240,241,242,244,245,244,244,245,247,244,243,245,244,244,244,242,243,242,244,241,235,239,241,239,246,233,206,212,187,177,220,221,222,231,252,226,189,237,237,233,196,202,241,242,252,187,159,122,118,94,38,186,243,249,250,225,236,244,240,230,228,221,230,247,252,251,218,147,142,109,32,4,19,20,5,6,10,9,29,93,108,33,8,21,17,23,17,27,35,65,82,39,31,20,42,51,68,83,48,38,26,49,37,61,153,164,113,66,47,38,34,30,23,22,30,33,31,30,29,24,15,18,15,21,30,27,49,19,45,123,118,188,249,252,222,200,226,176,122,107,193,250,250,251,251,158,97,34,2,5,9,24,13,34,21,31,65,17,29,31,19,61,33,78,125,63,84,47,15,35,5,170,200,56,29,39,88,122,112,94,36,29,160,136,55,35,9,26,55,56,27,32,119,127,60,27,4,23,46,12,44,175,250,253,253,252,252,136,37,112,241,252,252,251,185,96,18,14,162,223,225,237,192,118,56,26,17,6,114,185,194,233,162,58,121,234,245,248,248,250,252,234,234,211,171,142,141,145,151,162,169,157,141,136,141,181,225,168,46,9,62,214,239,240,162,73,22,17,8,75,213,247,247,248,183,90,43,4,19,145,210,145,76,11,21,98,118,68,19,27,149,241,242,222,123,78,21,2,20,122,203,231,173,64,26,4,19,45,54,78,105,111,115,117,135,194,238,246,242,237,233,195,165,196,226,227,223,222,220,222,223,219,222,230,238,249,249,251,251,249,249,233,239,241,224,221,221,223,222,220,219,222,220,220,220,218,220,222,221,221,220,220,220,222,221,218,219,220,221,219,218,219,219,220,218,219,218,216,220,219,218,217,216,217,217,216,216,217,215,217,217,215,215,217,215,214,113,3,1,2,7,9,7,10,9,10,10,10,10,226,227,228,227,227,228,230,228,229,228,229,229,227,229,229,229,232,231,232,231,227,231,228,230,229,229,226,225,228,230,234,231,232,234,234,233,233,231,234,233,236,235,232,231,229,229,229,229,229,229,230,229,233,232,231,233,231,235,235,235,235,235,235,233,232,232,232,234,234,231,232,230,231,230,229,232,230,231,232,230,232,230,231,232,232,234,230,232,232,233,233,233,234,235,235,234,234,231,232,233,235,232,231,233,233,236,232,230,231,231,231,231,231,230,229,230,228,230,230,232,233,234,236,234,234,232,232,234,233,236,235,236,235,233,234,231,232,231,234,235,234,233,231,234,235,230,233,232,234,235,234,232,231,234,234,233,234,233,234,236,237,237,238,235,232,232,229,229,230,230,229,229,230,228,230,232,233,233,235,236,235,236,233,234,232,231,230,229,229,229,229,231,229,229,229,229,232,233,234,234,235,236,237,239,239,238,239,240,238,237,240,238,239,240,238,238,235,231,232,228,225,228,227,230,231,232,232,230,230,231,231,229,231,229,230,230,229,232,230,230,231,230,233,230,232,235,237,242,243,245,245,245,246,246,245,246,246,246,247,247,247,247,244,245,246,246,245,246,246,245,246,246,248,246,235,230,241,244,242,241,242,244,232,235,247,246,241,239,244,248,247,247,248,247,247,247,246,245,246,246,246,246,246,247,244,244,245,245,245,243,244,244,243,244,244,245,245,245,246,246,245,244,245,245,245,245,244,242,241,241,241,236,237,241,239,244,231,201,208,207,221,252,242,224,218,223,198,203,241,237,227,192,207,241,245,250,188,162,127,129,95,39,194,243,250,250,225,236,246,242,234,216,222,237,217,226,237,228,233,252,215,220,245,207,199,210,205,186,169,145,154,184,154,95,64,40,11,9,27,20,55,77,39,33,34,67,57,70,90,44,39,35,46,40,34,40,93,167,159,122,78,47,23,19,33,28,36,31,24,31,23,13,17,16,19,26,14,92,148,204,207,163,245,252,198,168,234,223,139,86,116,216,251,251,253,150,50,7,1,17,21,26,26,28,35,16,135,198,121,56,10,10,66,49,72,152,109,93,32,18,17,51,252,229,83,32,38,96,110,98,66,52,36,62,75,46,31,16,27,22,8,88,213,249,243,179,81,22,3,41,33,137,252,252,252,252,223,130,19,59,232,244,245,201,84,37,17,2,93,247,251,251,185,108,59,26,42,12,48,225,243,177,147,76,8,51,217,244,248,248,245,247,235,225,189,149,131,136,139,150,166,165,152,141,140,141,190,219,186,134,21,7,151,230,243,191,83,28,11,16,27,125,231,236,242,185,90,40,15,137,236,238,237,180,83,24,15,93,117,57,6,42,187,228,236,145,44,18,7,43,168,217,245,202,101,51,27,30,25,19,54,129,175,196,191,150,99,149,234,236,242,214,170,187,222,231,223,224,224,220,221,222,220,231,239,246,247,247,244,193,151,133,135,190,228,227,231,230,228,225,223,222,221,221,221,219,221,223,223,222,222,222,223,220,221,222,220,221,223,221,221,220,219,217,218,220,221,220,219,220,219,220,220,219,219,218,218,218,217,216,217,216,216,218,219,217,217,113,2,1,3,7,9,8,10,9,10,10,10,10,228,229,229,228,229,223,227,229,230,227,229,228,229,230,230,232,231,232,230,231,231,231,232,232,235,232,230,230,230,230,231,230,230,230,231,232,231,231,229,227,227,229,229,227,225,226,226,227,226,227,227,229,229,227,231,232,229,232,233,233,231,229,233,231,230,230,226,229,229,229,229,229,229,231,230,230,226,230,230,229,230,229,229,229,231,230,230,230,231,232,233,235,234,233,232,236,234,231,233,230,230,230,229,230,231,229,231,231,230,229,229,231,228,229,230,228,229,229,230,230,233,233,233,235,235,233,235,235,234,235,234,233,234,235,234,234,235,235,237,237,235,235,235,236,235,236,236,234,237,239,236,235,232,232,233,231,229,229,230,232,235,234,231,232,230,227,228,227,226,226,226,225,228,227,229,231,233,234,234,232,230,229,231,229,226,226,226,229,229,229,227,229,229,232,232,232,232,232,232,231,234,231,234,235,235,239,237,237,235,236,237,237,237,237,240,237,237,235,235,235,233,232,229,228,230,231,231,229,229,229,231,229,226,227,227,230,231,228,231,233,233,232,234,234,232,232,237,241,242,245,248,245,245,245,244,245,246,247,246,245,245,246,245,245,246,246,245,246,246,245,244,245,246,243,232,231,243,245,245,244,245,237,227,240,247,243,239,240,246,245,247,247,246,247,245,246,245,244,246,246,246,245,246,246,246,246,245,244,242,242,243,245,244,244,245,246,246,246,246,244,245,245,244,244,244,244,244,244,242,241,244,234,234,238,238,246,224,215,228,217,230,244,212,175,195,238,227,230,250,235,221,189,208,241,243,245,187,164,120,114,96,52,198,243,249,249,225,237,245,245,234,208,221,245,225,232,240,233,243,252,249,252,252,252,252,252,253,253,252,230,252,252,252,252,251,251,205,128,81,46,71,76,32,51,66,89,67,83,100,44,38,43,48,44,34,38,34,44,114,155,152,112,29,7,31,41,39,31,39,33,26,34,31,16,15,76,202,246,250,235,173,198,250,230,103,104,219,243,164,73,128,231,252,252,170,62,11,0,22,27,29,29,34,35,50,15,92,241,241,141,41,35,78,57,84,144,119,101,27,39,36,101,251,204,72,32,27,57,59,61,56,59,55,64,61,46,38,21,35,19,33,135,250,250,248,248,187,97,68,53,61,204,245,245,236,142,74,33,9,132,248,248,244,194,91,38,59,154,240,246,244,168,95,36,23,37,36,8,112,243,246,174,91,31,15,15,165,235,245,248,237,239,225,206,170,147,134,142,150,156,164,166,148,139,138,131,140,119,136,78,4,64,182,222,249,210,107,37,11,20,21,8,114,214,249,243,124,32,5,7,66,181,247,247,160,64,20,83,183,144,74,20,45,162,155,103,50,14,6,90,227,235,249,219,146,106,65,60,55,33,62,123,141,127,125,64,17,34,191,233,239,189,171,215,227,227,223,223,223,222,221,221,223,232,233,192,160,150,135,110,73,100,71,110,222,234,250,250,242,238,230,228,224,220,221,222,223,222,222,221,218,221,220,220,221,220,221,220,220,220,220,220,221,221,220,218,220,220,217,217,218,218,220,218,218,218,217,219,218,217,217,214,217,218,218,217,216,115,2,1,4,7,8,7,10,9,10,11,10,10,228,230,231,230,226,228,231,227,230,230,229,232,231,232,232,232,235,232,232,233,234,236,232,235,234,237,234,234,233,230,231,229,232,232,232,231,229,227,228,225,226,226,226,227,224,225,224,225,226,227,229,227,228,229,229,230,230,232,233,232,232,231,229,229,229,230,231,230,229,229,230,230,231,230,227,229,229,229,230,228,230,229,230,229,228,230,229,232,234,234,236,235,235,235,234,237,235,234,234,230,232,228,229,231,229,229,230,231,230,232,230,231,231,228,230,229,229,231,232,234,232,233,237,236,236,235,235,238,237,238,237,236,234,235,237,238,237,236,239,239,239,241,236,236,235,234,239,236,236,234,235,235,234,235,230,231,232,228,230,230,230,231,230,228,229,228,227,227,227,227,227,226,227,227,226,226,226,229,230,229,227,227,227,227,226,227,227,228,228,227,228,228,228,228,230,228,230,232,232,232,230,232,233,234,236,237,237,237,237,237,238,237,240,239,239,239,240,239,241,239,236,234,230,230,229,230,231,230,229,229,228,228,229,229,231,233,232,234,232,235,236,235,238,237,235,235,238,243,244,247,246,246,247,246,247,246,246,247,245,244,246,246,245,245,247,246,246,247,246,245,244,245,245,240,229,235,244,243,244,244,244,235,232,244,244,241,237,243,247,247,247,247,247,247,247,247,246,246,246,246,246,246,246,247,246,246,247,244,243,244,244,244,244,245,244,244,246,246,246,246,246,245,244,245,245,244,244,244,245,243,243,236,232,237,240,242,221,210,222,194,185,212,204,205,231,251,233,220,241,235,218,196,213,240,242,244,187,147,107,109,83,47,200,243,249,250,224,239,245,246,228,197,227,251,234,237,228,223,229,240,227,247,253,239,240,250,249,251,248,212,251,251,252,252,253,253,249,205,108,27,38,107,152,159,141,101,50,74,107,53,34,39,52,45,34,41,36,39,32,27,145,244,203,202,229,190,99,4,69,173,190,199,181,205,251,251,253,253,200,145,163,215,248,200,56,77,185,217,169,84,153,240,251,214,90,14,2,14,28,32,32,35,34,33,39,32,22,166,251,237,128,66,80,71,81,140,136,84,24,103,88,77,182,102,53,45,14,31,29,49,47,78,99,67,54,48,55,55,48,23,23,168,251,251,252,252,241,122,41,34,28,92,104,63,36,15,29,32,14,134,247,247,247,247,228,251,253,253,252,211,108,61,7,23,33,38,16,45,200,245,249,196,133,99,44,2,122,231,245,248,234,232,217,189,155,138,137,145,158,156,166,163,142,141,135,129,99,27,7,18,80,210,246,228,250,228,149,69,11,13,19,29,16,50,202,217,198,108,16,13,35,169,241,244,228,113,55,13,95,151,85,60,42,33,36,50,14,52,185,245,249,239,249,249,222,173,113,70,36,37,31,34,53,43,100,110,21,26,141,216,206,171,196,227,232,231,235,237,233,232,228,226,227,238,202,139,92,44,39,31,39,86,70,121,233,238,252,252,252,252,246,236,229,226,223,223,222,221,223,222,220,220,222,222,222,223,221,219,222,223,223,220,220,221,222,220,220,221,222,220,219,222,219,220,219,220,219,216,217,217,217,217,218,217,218,216,218,114,2,1,4,7,8,8,10,9,10,11,10,10,228,230,228,227,230,228,229,229,227,227,231,227,230,231,230,231,229,232,234,232,230,231,233,235,232,234,233,231,232,230,230,229,229,230,231,230,228,228,226,224,225,224,223,222,222,224,221,224,225,225,227,229,227,225,229,229,229,229,227,228,229,228,226,227,230,227,227,229,229,228,229,230,228,227,225,227,229,229,228,229,227,227,228,228,228,229,232,231,232,232,232,235,232,235,233,230,231,230,230,229,229,230,229,230,230,230,230,229,231,231,230,230,228,230,230,229,230,231,233,235,235,236,235,234,234,233,237,238,238,236,236,236,233,232,233,236,234,234,236,236,236,235,232,229,228,230,231,230,229,229,229,230,230,229,229,228,226,223,227,228,229,231,229,229,226,227,227,225,226,225,224,224,223,224,224,223,224,223,224,225,226,229,225,224,224,227,228,225,227,226,226,226,224,224,225,229,229,228,229,228,230,229,232,233,232,233,234,233,234,236,235,237,236,236,239,238,239,238,236,236,232,232,228,227,229,229,232,230,229,227,227,226,227,229,229,230,231,231,234,234,234,236,237,236,235,236,240,242,244,246,245,245,247,248,246,246,246,246,245,245,244,244,245,245,245,246,245,246,246,244,245,245,244,235,229,240,244,244,244,244,242,231,239,246,243,238,239,245,247,247,247,247,246,248,246,246,246,245,245,244,245,247,246,246,245,245,245,244,244,244,245,246,244,244,243,242,244,245,246,245,244,244,244,244,245,245,246,244,243,242,243,237,231,235,240,242,215,205,200,185,211,232,226,226,242,240,188,205,242,234,218,197,214,242,241,240,191,161,120,102,81,44,194,243,250,250,223,242,241,248,225,192,231,251,240,230,232,237,206,212,200,224,245,142,165,230,221,252,246,191,247,250,251,250,250,251,236,186,75,2,13,120,203,169,148,100,46,59,101,61,30,41,51,47,42,44,39,30,44,10,90,241,252,252,248,245,165,37,130,242,252,252,252,252,252,242,249,222,133,157,161,239,246,146,106,87,169,220,149,96,181,244,251,174,46,6,8,17,29,37,38,39,44,46,37,36,15,116,249,249,174,83,65,41,72,148,135,49,36,148,92,65,68,36,70,38,19,28,41,54,38,89,117,69,61,53,50,60,64,15,43,209,250,250,252,202,133,80,34,17,21,32,29,38,41,45,33,37,18,92,238,239,248,248,253,253,252,249,139,43,4,1,24,32,42,25,38,197,248,248,247,245,178,153,165,86,141,234,248,252,234,225,202,174,153,139,134,142,155,155,159,155,136,137,138,157,176,141,208,228,229,249,251,247,251,251,209,123,53,21,17,20,27,27,22,116,215,232,237,229,241,249,250,250,240,173,90,27,7,17,36,37,25,35,27,37,199,242,250,250,251,251,248,248,224,163,108,63,34,29,24,24,11,46,137,226,148,43,57,125,146,188,224,237,240,245,249,249,248,246,229,226,237,243,210,131,76,36,15,24,7,25,34,95,168,166,198,227,243,243,251,251,230,226,227,222,223,222,220,220,220,223,220,220,221,220,218,220,222,220,219,221,221,221,220,221,220,217,219,221,220,219,218,220,218,218,221,220,217,216,216,215,215,216,218,217,216,114,3,1,3,7,9,8,10,10,10,11,10,10,227,230,230,229,227,227,227,227,229,229,229,229,227,229,229,226,229,229,232,230,230,232,229,230,231,232,231,230,229,227,228,227,226,229,230,230,231,228,228,226,225,222,222,222,223,225,225,227,226,226,227,226,224,224,226,229,229,229,229,227,227,228,227,228,228,226,226,227,226,227,226,225,226,226,225,225,227,226,226,227,227,227,226,226,227,229,230,231,233,232,232,231,229,231,230,230,229,227,228,229,233,229,230,232,229,229,230,231,229,227,227,228,228,227,229,229,230,232,234,236,234,236,237,234,236,237,237,238,236,235,233,234,233,233,234,232,233,235,236,233,230,229,226,227,227,225,230,227,229,230,231,232,229,232,231,226,225,221,225,228,229,229,226,224,228,226,224,224,224,224,224,222,222,224,223,224,224,224,225,224,229,228,226,225,225,224,225,225,226,225,224,225,226,225,227,225,225,227,226,227,225,229,232,230,231,231,231,233,234,233,237,236,237,238,239,240,238,235,235,233,231,230,228,229,228,233,232,229,229,228,225,226,226,226,228,226,229,230,231,232,232,235,237,240,241,242,242,242,244,246,246,246,249,246,247,247,247,247,246,248,246,246,247,245,247,248,246,247,245,245,245,247,245,233,232,244,246,246,247,245,237,234,244,248,243,238,242,244,246,248,247,247,247,247,247,246,246,246,246,245,244,245,246,245,245,246,247,244,245,245,244,246,245,243,245,245,245,246,246,245,245,244,245,245,245,245,244,243,241,241,242,238,231,233,243,240,214,206,215,214,243,252,221,193,207,231,211,224,247,237,213,204,221,244,239,242,201,166,138,121,85,48,198,243,249,249,220,241,241,252,219,184,236,248,244,230,198,228,200,148,130,188,240,168,199,243,242,252,248,205,248,249,237,222,192,190,168,148,123,73,78,160,207,141,137,108,53,53,92,65,31,42,55,44,42,45,43,35,42,16,80,221,179,199,253,232,182,51,96,240,252,252,252,252,251,190,208,191,165,183,194,247,193,151,158,175,204,206,160,139,215,247,248,147,30,9,6,24,29,22,129,207,202,179,105,53,7,97,247,247,182,95,20,15,57,105,95,32,24,79,63,72,79,42,55,34,16,47,49,37,34,33,48,57,61,66,34,41,28,29,195,249,251,190,88,33,5,24,29,23,24,33,38,39,42,36,39,35,38,14,93,201,239,241,238,212,136,66,22,3,19,43,41,49,46,74,207,246,251,251,251,251,196,199,252,155,151,242,252,252,242,216,187,160,145,141,137,143,157,149,158,153,135,141,141,185,245,246,251,251,252,252,252,252,253,253,246,210,117,50,15,12,23,23,30,8,68,215,245,245,248,248,251,251,246,174,110,37,10,18,17,34,27,27,25,118,232,234,247,247,250,250,250,207,138,98,47,27,18,14,21,25,24,14,146,226,240,202,96,89,153,214,248,245,245,245,246,246,238,241,225,229,244,242,192,129,74,27,18,12,75,107,59,42,26,8,15,18,89,183,231,238,249,231,228,226,228,223,223,222,221,223,222,221,222,223,223,220,221,220,221,222,221,222,222,219,221,221,220,220,220,221,219,220,220,218,219,219,220,217,217,220,218,218,219,218,218,113,3,1,3,7,9,8,10,9,10,11,10,11,224,226,227,225,226,227,227,229,229,228,230,227,230,229,225,227,226,227,230,231,230,230,230,229,228,230,229,228,226,222,220,222,224,227,227,226,225,223,224,223,225,222,223,224,222,225,224,224,224,223,224,224,224,223,226,224,224,225,226,225,224,225,223,225,226,221,225,223,222,224,223,222,223,225,225,226,226,224,225,225,226,226,226,227,225,229,228,230,231,229,230,230,230,229,229,227,227,227,226,227,229,230,230,230,227,227,227,229,227,225,227,224,225,227,227,227,227,231,231,230,232,234,234,235,235,234,236,232,234,234,233,234,233,234,234,233,231,232,234,232,231,231,229,231,230,229,229,229,230,230,231,232,232,230,229,227,226,225,225,224,226,226,224,224,220,220,222,221,223,221,222,222,222,222,222,221,221,220,224,224,222,224,222,224,225,226,225,223,224,223,224,225,222,224,223,223,225,225,227,226,227,226,227,226,227,230,231,229,230,232,233,234,234,236,237,236,236,236,232,232,230,229,229,230,230,229,232,229,228,226,224,225,225,227,226,226,226,227,230,228,230,232,235,239,241,242,243,244,243,245,245,246,246,247,247,245,245,244,245,245,244,245,246,246,245,246,245,245,244,244,247,247,242,233,239,245,244,244,245,245,235,238,246,246,240,238,245,246,247,246,246,247,245,247,246,247,247,245,246,245,245,245,244,244,244,245,245,244,245,244,246,245,246,246,244,242,245,245,245,246,245,245,244,244,244,245,243,241,243,242,241,238,229,230,244,235,214,219,223,218,237,215,182,191,230,249,229,243,250,235,212,207,219,244,237,239,202,167,138,115,86,57,203,244,247,248,220,239,241,252,208,184,241,244,246,213,178,202,177,158,148,203,252,199,220,246,252,252,249,203,224,198,186,194,146,171,166,200,213,143,154,177,203,140,162,117,63,68,89,73,30,46,54,41,38,46,42,36,41,14,76,190,139,154,208,170,183,122,95,190,243,247,249,245,205,196,251,210,189,199,194,224,183,176,203,190,213,200,156,167,230,248,240,120,24,6,11,26,39,150,251,251,251,251,147,53,8,86,247,232,162,98,21,10,42,61,74,49,55,87,57,71,66,40,35,18,43,49,33,35,37,29,28,43,57,51,43,16,72,225,253,253,133,44,6,7,32,25,27,25,34,35,36,44,37,43,42,45,45,48,37,15,68,101,91,60,38,33,48,68,74,83,81,101,103,131,190,213,169,154,163,146,113,95,128,93,76,172,234,234,234,196,165,151,146,140,135,142,151,151,162,148,136,142,144,163,166,141,226,234,200,188,225,237,246,246,250,240,149,60,18,25,30,25,27,35,16,26,166,222,251,251,252,252,212,130,68,22,14,15,22,30,29,31,33,28,139,198,228,235,251,251,246,159,93,51,25,24,20,46,77,59,44,19,81,163,235,215,179,178,206,235,248,247,217,150,145,134,145,223,226,235,248,212,149,97,64,28,5,45,93,89,67,51,39,35,39,35,35,9,82,208,243,243,225,220,226,226,223,222,220,222,223,222,221,223,222,221,222,221,223,222,219,221,222,222,221,221,219,218,218,219,219,217,216,218,217,217,220,219,218,216,219,220,220,216,217,115,2,1,5,8,8,8,10,10,11,11,12,10,224,228,225,224,226,226,227,225,228,227,228,229,228,227,229,229,229,230,231,231,230,231,231,231,231,229,228,230,229,229,225,223,224,225,224,221,222,222,222,223,224,221,224,223,222,221,222,222,220,222,224,224,220,222,224,222,220,221,222,223,223,224,223,222,222,224,223,224,226,224,224,224,225,225,223,225,228,225,225,224,225,227,227,226,228,229,229,229,229,230,230,230,230,231,229,228,228,228,229,230,230,229,229,231,229,229,229,226,229,227,227,228,225,226,227,228,228,227,230,231,231,233,233,233,235,234,233,232,230,232,234,236,235,236,234,232,234,234,235,233,233,235,235,237,234,232,232,227,229,232,230,232,230,230,229,227,226,226,227,224,224,223,224,223,222,222,222,222,223,223,222,222,224,221,220,221,218,220,219,218,222,222,223,224,226,224,224,225,222,224,223,222,224,221,222,224,223,227,229,229,228,229,229,226,227,230,229,229,232,230,233,232,232,235,234,237,239,236,234,232,233,233,232,230,229,232,231,232,230,230,230,231,231,227,229,227,227,226,229,229,227,230,231,236,236,240,243,243,246,246,245,245,245,247,248,246,245,244,244,245,244,246,247,245,247,246,245,246,246,247,248,249,241,235,244,246,244,245,248,240,236,243,247,244,239,240,246,246,246,246,244,247,247,247,246,247,246,246,247,245,245,245,244,244,245,244,245,244,245,247,247,246,246,246,245,244,245,245,245,245,244,244,244,243,242,241,243,242,241,240,240,241,230,227,243,231,211,214,211,181,200,210,198,224,252,252,219,218,245,233,208,212,223,245,231,244,204,163,139,113,84,50,200,244,248,250,222,244,242,252,206,194,249,244,249,238,194,211,211,195,178,218,252,220,234,247,252,252,208,149,145,150,180,198,211,248,229,218,188,155,158,162,189,128,174,124,81,86,86,79,30,49,54,43,37,39,44,36,42,12,121,244,170,164,201,170,171,127,104,150,225,248,250,240,194,218,252,210,205,160,175,244,143,170,212,176,217,168,139,184,229,250,237,120,21,5,10,22,124,242,253,253,237,153,67,21,5,84,246,226,154,86,15,14,70,91,66,59,81,91,76,59,44,39,32,33,44,38,32,33,34,32,35,41,44,55,35,117,243,247,242,103,15,1,19,31,29,34,37,44,45,53,57,62,69,72,78,81,85,75,80,53,26,32,41,50,42,56,64,60,67,69,69,66,63,60,48,27,12,14,15,27,36,48,48,39,13,100,237,243,224,174,145,145,138,141,137,137,145,148,155,143,134,141,144,140,69,10,6,8,12,14,11,19,53,92,133,147,128,81,47,51,51,53,50,46,41,43,16,41,146,198,201,160,96,46,27,28,20,16,24,30,27,32,33,35,33,29,18,73,199,217,243,179,83,38,21,12,89,214,211,132,57,37,39,15,24,45,95,218,244,235,249,222,141,83,55,14,38,153,217,239,243,171,95,70,31,49,105,89,84,57,47,54,39,38,34,30,35,38,21,77,211,225,231,221,223,226,225,225,223,223,221,224,223,222,224,222,223,223,224,222,220,221,225,224,221,222,221,222,220,221,221,220,221,220,220,217,220,222,217,217,220,220,220,217,220,116,3,1,5,9,9,9,12,10,11,12,13,12,224,225,227,224,226,222,222,226,226,223,227,227,225,226,227,229,227,228,231,229,229,232,229,229,229,230,231,229,231,230,228,227,223,223,222,221,222,220,220,221,223,222,222,221,221,222,222,222,222,222,220,218,221,219,220,221,220,220,222,225,221,220,220,219,222,218,222,223,219,223,223,223,225,224,223,224,225,223,223,225,226,225,226,228,226,228,229,229,229,231,230,229,229,229,232,230,229,229,229,227,229,230,228,227,229,231,227,226,226,226,227,224,227,227,226,229,226,229,229,227,231,230,234,235,233,235,232,230,231,230,234,235,233,233,232,234,231,232,235,234,235,235,237,239,235,232,230,229,230,229,227,226,227,227,229,227,226,225,224,226,226,225,223,223,221,220,221,220,222,224,223,221,222,222,220,222,222,220,220,219,223,223,223,225,224,224,225,222,225,224,222,223,221,221,222,223,222,224,222,223,225,224,226,227,227,227,227,229,231,230,232,233,230,234,235,234,236,235,234,236,234,232,230,231,232,230,232,232,232,234,234,232,230,230,227,225,228,229,228,226,226,226,227,229,232,237,239,241,241,242,242,241,242,243,245,245,244,244,245,246,246,245,246,245,244,245,246,246,245,247,248,246,238,236,246,245,245,247,245,237,236,246,247,242,237,242,245,245,247,245,244,244,245,246,246,247,246,246,246,246,245,244,244,244,245,245,244,244,244,245,246,246,247,245,246,245,245,245,245,244,244,244,243,243,240,241,240,242,240,238,237,240,232,224,241,224,204,197,192,197,239,226,208,229,244,223,185,220,246,230,207,213,226,245,231,245,199,151,124,99,73,55,205,244,249,247,222,244,243,252,203,208,251,241,250,244,208,237,232,212,187,227,252,227,224,193,200,209,162,128,148,184,236,251,251,252,241,198,163,160,133,135,180,111,192,132,88,81,78,84,25,53,56,46,33,43,44,38,34,23,158,243,206,222,247,192,199,160,61,94,184,233,242,203,175,244,249,211,162,130,222,203,91,174,188,160,205,139,127,186,233,249,237,127,26,2,10,19,152,244,250,214,64,3,1,10,6,133,251,224,168,86,22,26,32,66,64,52,61,61,55,51,39,33,43,38,36,37,40,39,35,33,34,28,39,55,47,126,196,198,77,9,6,38,42,50,63,73,94,105,121,133,132,130,128,109,95,78,69,69,59,55,56,43,47,60,58,58,58,63,65,69,69,70,69,76,75,66,59,61,66,66,71,81,87,69,44,102,229,226,187,152,137,141,138,142,142,141,149,146,153,141,134,145,149,141,71,17,15,30,40,42,39,34,40,39,39,43,42,42,38,35,39,52,57,55,60,54,55,42,36,27,15,39,47,55,43,44,27,24,34,29,32,30,26,31,33,29,30,33,27,125,222,195,155,47,32,26,42,126,231,194,128,49,34,29,27,12,43,169,230,236,242,184,120,72,49,16,28,170,241,241,215,115,79,29,44,145,162,139,77,35,45,38,31,32,33,43,37,35,27,12,127,217,226,232,224,225,225,224,224,225,222,221,224,223,223,222,223,223,223,224,223,222,223,223,224,222,221,223,223,223,223,224,222,223,221,223,224,219,220,220,219,221,221,219,218,115,4,1,4,8,10,9,10,10,11,12,11,11,222,227,225,224,227,223,225,223,225,226,225,226,226,226,226,227,226,225,227,226,228,230,231,227,228,229,229,231,229,229,228,227,228,226,225,223,222,221,221,218,220,220,220,222,220,222,224,223,223,219,218,219,218,220,222,219,220,220,220,223,220,221,222,220,221,221,221,218,219,219,220,222,221,223,221,224,224,222,225,224,225,227,227,227,229,228,226,230,229,230,230,229,229,229,229,229,230,228,227,227,229,227,227,229,229,229,229,227,227,228,227,227,226,225,224,224,224,227,227,227,228,228,229,230,231,230,231,231,229,230,231,232,231,230,233,232,232,233,232,234,235,234,232,234,233,231,232,229,227,227,224,225,223,222,225,226,227,226,228,226,226,226,226,224,221,222,221,220,224,224,224,222,223,222,220,223,221,222,221,219,221,221,221,221,222,222,222,223,222,220,219,221,223,220,220,222,221,221,221,224,222,223,226,225,227,228,226,226,227,227,229,229,230,232,232,236,235,236,237,235,234,237,237,236,235,237,237,236,236,236,234,235,234,230,230,230,231,229,229,225,227,229,226,228,230,234,236,236,238,237,238,240,241,241,242,241,242,245,244,245,245,246,244,244,244,244,246,245,244,245,246,243,234,238,247,248,246,247,243,234,241,247,246,241,241,245,244,244,246,247,246,246,246,246,246,248,247,247,246,247,246,246,244,244,245,244,244,244,245,245,246,246,246,247,244,244,245,245,246,245,246,243,243,244,241,242,242,242,240,238,235,239,232,224,237,222,205,206,212,228,252,221,187,189,219,229,214,235,250,227,204,217,229,246,227,247,200,150,124,94,66,48,204,245,250,249,223,244,244,252,203,215,250,238,247,249,211,223,252,248,190,195,220,181,139,113,160,206,196,214,201,217,252,252,252,252,240,163,137,169,139,160,190,130,190,106,78,73,83,105,22,45,57,41,41,45,38,43,33,26,163,243,195,223,249,232,237,225,112,38,125,196,202,171,174,233,208,172,135,162,252,203,75,171,187,151,199,119,116,198,231,248,247,151,45,3,8,10,99,240,247,202,66,4,8,1,111,250,253,251,173,82,30,19,50,109,84,49,57,57,48,44,39,40,35,36,36,38,39,38,39,34,33,33,53,56,53,113,108,83,74,72,93,97,112,116,108,109,104,92,86,86,72,60,62,59,56,57,66,69,72,76,77,82,98,114,121,127,131,136,142,133,146,149,139,147,146,149,136,123,122,120,113,111,97,94,78,92,200,199,155,139,134,144,139,145,147,150,153,154,152,138,135,144,159,144,83,51,43,46,54,53,64,60,59,59,58,59,50,54,50,43,41,44,39,39,40,40,42,47,45,42,50,53,60,62,58,64,67,56,53,50,42,43,37,35,37,28,33,30,37,9,67,171,179,139,57,35,19,26,178,223,151,86,42,29,39,13,47,205,239,240,249,224,143,83,49,14,98,226,247,218,150,83,49,122,201,186,148,79,49,40,25,29,28,36,39,35,38,33,49,16,78,196,218,235,228,222,224,221,224,223,222,222,225,224,222,224,225,223,223,222,222,222,225,225,222,222,222,225,223,222,222,224,225,223,222,221,222,223,220,220,221,220,222,221,220,114,4,1,5,9,10,9,10,10,12,12,11,11,223,225,226,221,226,223,222,224,224,223,225,227,224,226,226,226,226,224,226,227,228,230,227,229,231,229,231,230,231,230,231,232,229,229,229,227,226,222,218,222,217,220,219,218,220,219,220,217,217,219,219,218,220,220,221,219,218,220,217,221,220,219,220,221,222,220,222,221,217,222,220,219,222,220,221,222,222,222,224,223,225,228,225,226,226,225,226,228,229,231,229,231,231,230,232,230,232,230,230,230,227,228,229,229,230,231,229,230,233,231,232,228,227,225,223,224,223,226,225,225,227,226,229,226,224,227,227,227,228,227,229,229,228,232,229,232,232,233,231,230,233,233,234,231,230,232,230,228,227,224,221,222,224,223,225,224,226,226,225,227,226,225,226,226,222,222,222,224,224,224,222,220,221,220,219,222,220,219,220,218,220,219,220,221,218,219,220,217,221,220,219,222,219,221,221,221,219,223,222,223,225,224,225,223,226,226,227,223,227,227,227,230,227,232,232,231,234,232,234,235,237,239,240,238,240,240,239,239,237,237,237,238,237,236,233,230,231,229,230,229,229,227,229,231,233,234,232,235,236,237,237,239,240,241,242,241,242,242,242,242,242,242,242,243,245,246,246,246,245,247,247,240,233,242,246,245,246,246,237,236,245,246,243,237,242,245,245,246,246,245,245,246,245,245,245,246,246,246,246,246,245,246,246,245,244,244,244,244,243,244,245,246,247,246,245,244,245,245,244,245,244,244,244,244,242,242,239,240,237,237,235,238,235,226,232,220,213,216,215,200,220,196,187,221,247,247,229,240,247,222,203,219,230,243,220,249,203,164,142,108,71,49,208,244,249,246,220,241,245,251,190,214,247,236,248,251,211,195,227,232,139,101,134,152,168,168,229,250,230,238,222,238,251,249,251,252,197,109,129,174,171,186,200,139,165,69,48,55,69,97,19,46,57,46,42,40,45,44,21,31,168,234,189,215,243,222,240,248,131,49,110,188,226,161,144,160,108,130,148,220,252,150,122,213,189,194,183,106,162,218,240,244,249,188,70,3,4,2,43,210,252,252,201,133,182,244,253,253,252,179,91,36,12,33,27,74,76,42,41,45,42,39,39,33,35,37,36,38,35,36,42,44,51,63,80,95,99,85,67,82,100,92,73,74,65,59,57,56,113,60,62,73,85,99,112,123,135,139,141,145,138,138,133,130,133,119,107,95,77,67,65,58,56,55,49,46,49,43,42,45,39,36,35,24,24,24,31,40,103,140,145,142,139,139,142,147,156,154,154,155,152,136,136,152,162,134,69,27,38,15,24,27,35,36,38,44,50,46,54,57,50,56,58,64,59,56,57,53,52,51,49,48,46,42,41,49,57,63,63,72,73,73,75,73,66,62,59,57,44,50,36,36,26,30,153,134,77,34,32,178,236,212,160,98,62,27,33,19,119,227,250,250,249,227,152,99,56,18,35,152,157,128,105,43,99,204,234,185,107,69,33,24,101,146,124,82,51,39,36,34,44,9,77,193,205,231,231,225,226,222,225,223,223,224,222,223,225,225,223,220,225,224,221,224,224,222,222,223,225,226,224,226,224,225,223,226,224,221,224,220,221,220,219,223,221,220,221,116,3,1,5,8,9,9,12,10,11,12,12,12,221,222,221,223,225,221,223,221,220,222,222,223,225,223,224,226,228,226,226,228,226,226,229,229,229,229,229,229,230,231,231,234,228,231,232,229,227,221,221,219,220,218,218,218,215,218,217,215,218,215,218,219,217,219,218,217,217,217,221,219,218,219,217,216,218,219,221,219,217,217,217,218,220,220,218,220,219,220,222,222,225,224,223,224,225,224,226,229,229,228,229,230,229,226,227,229,231,229,229,226,229,229,228,232,230,231,231,231,230,232,229,230,230,227,227,225,222,223,224,223,225,224,224,224,223,223,225,225,224,227,227,227,230,230,232,232,231,232,231,231,232,232,230,231,231,229,229,228,225,222,223,223,223,226,226,224,224,224,226,226,227,226,223,226,225,225,222,221,223,220,223,218,217,218,217,220,219,218,219,220,221,219,218,218,217,218,218,218,221,221,220,220,220,218,220,220,221,223,221,222,220,222,224,223,224,225,225,227,225,225,228,227,230,229,229,231,231,234,233,235,236,239,239,238,240,240,237,238,237,236,237,239,237,234,235,232,230,229,229,229,228,228,227,231,231,232,234,234,237,235,237,237,240,243,242,244,242,241,239,238,240,242,243,242,244,245,246,246,244,244,246,235,229,241,244,245,248,244,234,238,247,244,240,239,245,246,246,247,246,247,246,246,246,246,246,246,245,246,246,245,245,244,246,246,244,242,244,244,244,244,246,245,246,246,245,244,243,244,244,244,244,244,244,245,243,243,240,239,237,237,237,239,239,232,229,217,210,207,180,195,238,214,241,250,252,236,192,225,249,217,203,219,229,240,215,252,202,160,149,117,74,55,214,244,250,246,221,240,247,243,177,214,248,245,252,252,218,147,159,161,122,114,156,211,227,223,252,252,234,240,212,234,251,239,252,247,177,120,142,169,175,188,203,150,148,72,63,24,32,90,27,69,57,36,34,35,46,44,23,93,220,234,191,212,237,213,234,242,128,24,90,198,225,148,139,136,140,159,165,217,174,112,155,243,209,240,192,85,187,238,239,241,247,224,127,28,2,9,12,88,223,240,249,249,253,253,253,210,112,45,1,12,24,27,34,30,39,51,41,39,45,43,45,42,44,51,57,62,69,74,87,103,111,110,98,77,63,60,45,50,57,50,53,61,76,86,100,118,129,128,125,139,146,138,128,117,93,70,61,55,46,39,35,34,33,30,31,29,31,27,24,25,17,23,19,17,24,30,35,33,34,36,30,23,21,13,22,11,32,96,141,152,143,146,148,154,155,155,157,160,151,136,138,153,166,121,53,32,33,28,23,18,16,14,13,16,18,22,23,23,26,26,33,32,35,42,48,56,58,61,62,64,69,63,63,61,59,62,55,53,47,48,54,58,66,71,77,78,72,73,66,57,55,48,34,72,70,27,157,239,248,232,150,111,63,31,34,18,70,210,241,241,229,144,104,83,69,33,21,15,14,10,19,9,92,234,227,143,101,45,58,196,235,235,241,184,118,75,51,31,39,8,72,203,209,229,234,225,228,222,226,223,224,224,222,225,225,224,221,222,227,225,226,225,224,224,225,227,224,224,222,228,225,222,226,223,224,222,222,223,219,221,221,221,223,220,221,116,3,1,5,8,9,9,11,9,11,12,11,11,219,220,222,218,223,220,221,221,223,222,220,222,222,221,221,222,222,223,224,227,226,229,229,226,229,227,230,229,229,231,229,229,226,228,229,225,226,224,221,221,218,216,216,216,213,214,216,214,217,218,217,216,218,217,215,214,217,219,217,219,217,216,216,215,216,214,216,218,215,218,219,216,220,216,216,217,218,219,219,220,219,221,220,223,225,222,224,225,225,226,228,226,226,225,224,224,225,227,226,229,228,229,230,227,229,229,228,227,229,229,230,230,231,229,226,224,225,223,221,221,222,222,222,220,220,221,222,223,224,226,227,226,227,231,230,232,229,230,231,229,230,229,230,232,231,229,226,224,224,223,223,223,221,221,222,222,222,221,222,225,222,222,223,221,222,224,225,222,221,219,219,221,219,218,218,220,217,219,221,222,222,221,221,217,217,216,218,217,220,218,215,219,220,221,220,220,219,218,219,219,221,221,220,220,225,224,224,223,223,225,227,227,226,229,228,229,230,229,232,235,236,236,235,236,237,236,237,236,235,236,235,235,232,232,231,229,229,229,227,227,228,226,229,228,230,232,232,235,234,234,235,239,241,240,240,241,240,240,238,239,240,241,243,243,244,243,243,244,244,244,241,227,225,238,241,241,246,239,229,240,246,243,238,241,245,245,244,244,245,246,247,247,245,245,247,247,247,246,245,246,245,244,244,244,244,243,244,244,244,243,244,245,245,245,244,244,245,245,245,245,245,244,245,243,242,242,240,239,236,236,235,235,237,235,224,214,199,196,211,240,252,203,218,230,222,206,196,229,248,214,201,218,223,237,214,252,194,149,134,108,70,53,210,244,250,242,219,238,246,238,181,224,250,250,248,232,187,144,167,182,187,160,181,237,247,225,246,252,220,228,187,197,239,238,252,239,179,146,141,163,192,171,206,146,151,136,101,37,37,111,58,75,61,36,36,35,38,46,140,221,236,224,186,210,240,211,233,234,122,22,41,139,196,187,217,188,167,201,188,195,159,99,182,232,194,252,163,83,198,235,242,236,243,250,203,127,53,3,3,25,57,145,198,216,203,163,110,60,37,15,41,60,53,64,61,66,65,69,75,76,80,84,86,91,99,91,91,93,86,76,63,63,57,50,48,53,55,64,73,85,103,117,137,141,139,134,127,107,80,67,53,44,39,28,29,28,31,27,20,23,25,25,19,14,19,26,25,30,28,31,35,19,17,21,23,36,34,35,25,18,25,33,39,38,19,21,14,47,111,139,155,158,143,150,148,150,145,148,160,155,141,132,141,157,158,101,40,23,29,46,49,29,21,13,15,16,16,24,24,20,20,24,22,21,20,16,19,19,22,27,31,43,44,46,60,69,80,91,94,89,83,76,67,60,57,53,48,49,54,60,68,75,69,74,67,54,59,56,180,237,248,211,130,75,39,34,32,38,23,22,142,198,148,99,79,70,67,66,55,37,11,15,12,125,234,241,227,135,95,55,19,100,198,228,248,234,165,104,64,30,46,10,77,206,208,232,235,224,228,222,227,224,224,224,225,226,225,226,225,224,226,225,224,227,227,225,226,225,225,224,226,226,225,225,224,223,222,223,221,218,220,222,219,220,222,217,218,115,4,1,4,8,10,9,10,10,11,11,11,12,220,224,220,220,221,216,218,220,220,220,221,220,219,218,222,222,222,224,224,228,227,226,229,227,228,229,228,227,227,226,227,226,222,224,223,220,221,221,222,222,219,219,215,216,219,217,214,214,217,215,217,217,215,216,213,217,217,215,218,216,215,214,215,215,214,215,216,215,217,216,217,219,217,218,221,220,218,218,218,219,222,221,221,223,222,224,224,227,227,224,223,225,225,225,226,224,229,228,229,227,228,230,228,229,229,227,229,230,229,231,227,230,230,228,228,227,224,221,222,220,223,224,223,222,220,219,221,220,220,224,223,223,227,227,228,231,229,233,232,233,230,230,231,230,230,230,226,224,222,220,223,219,218,222,219,220,222,224,224,223,224,223,223,222,222,224,222,220,223,222,223,221,222,221,221,220,220,223,222,221,224,222,219,220,218,218,217,218,219,217,220,218,219,218,219,220,217,219,218,221,223,223,222,222,225,224,224,224,225,225,226,225,224,226,227,228,229,230,231,232,234,232,234,234,234,236,236,235,235,234,234,232,230,229,229,227,227,229,227,227,227,227,230,230,230,234,234,236,237,235,237,237,240,239,240,240,237,239,237,238,241,240,243,243,243,242,243,245,245,244,240,225,229,242,241,244,245,233,232,242,245,241,238,244,245,244,246,245,246,247,246,246,246,246,247,245,245,247,246,246,246,245,244,244,245,242,244,244,243,245,245,245,246,246,245,245,245,245,245,245,244,244,244,245,243,242,240,237,237,235,235,237,239,240,218,208,208,217,228,253,194,78,131,198,236,232,223,243,247,209,203,219,222,232,215,252,188,146,128,101,63,52,214,244,250,242,220,246,252,239,194,229,230,205,189,189,200,185,238,248,241,173,159,232,247,226,238,251,223,196,145,187,241,247,252,226,176,165,160,181,206,154,193,133,165,185,156,76,55,125,66,89,65,45,39,41,30,61,216,246,219,198,177,210,239,208,234,237,130,66,54,103,165,219,252,210,177,130,173,237,167,165,200,184,153,210,122,69,200,228,244,240,239,249,252,243,210,134,73,113,54,7,14,5,10,8,10,17,45,67,74,92,105,113,120,113,115,110,97,103,100,82,67,61,59,55,51,53,52,53,62,65,80,90,108,129,136,144,137,131,110,94,75,52,43,35,32,27,23,29,28,19,15,12,15,25,25,23,18,25,29,29,26,14,24,40,35,37,37,33,35,26,19,30,41,44,35,19,19,15,16,22,31,53,33,38,51,97,196,181,161,161,155,158,151,150,148,158,158,149,135,133,143,164,155,82,29,12,16,40,60,54,36,27,14,19,31,27,25,18,13,19,23,23,28,22,16,13,17,24,19,22,23,22,24,27,28,35,47,55,69,78,81,87,90,83,80,76,66,64,57,55,51,57,61,69,59,50,171,208,176,159,99,70,43,31,34,33,36,39,16,31,76,126,137,71,74,100,96,153,196,214,245,246,253,253,236,173,104,36,5,12,15,55,198,228,199,128,71,45,46,8,82,206,211,233,234,225,226,226,226,225,226,223,227,227,226,227,227,228,229,226,229,228,228,227,227,228,226,229,225,225,223,226,227,221,225,222,222,219,219,223,221,222,225,220,218,115,4,1,4,9,10,9,10,10,12,12,11,11,214,220,218,218,219,217,216,217,220,219,218,217,218,221,220,221,220,222,225,224,224,225,226,226,227,224,226,223,223,227,223,224,222,221,222,219,220,216,218,219,218,214,215,218,214,215,215,213,215,212,215,214,213,215,212,213,213,213,213,215,210,212,215,212,213,213,213,214,213,212,215,213,215,215,218,216,217,217,218,220,219,223,222,223,224,222,224,224,225,223,223,223,223,224,225,228,226,227,227,228,227,227,228,226,226,226,226,229,228,227,227,227,227,224,225,224,222,219,220,221,222,224,223,221,218,221,222,219,220,222,223,224,224,224,225,226,226,228,228,227,228,228,227,227,228,229,227,222,222,219,216,218,220,220,221,222,220,223,224,223,223,222,223,221,225,222,220,219,219,221,222,224,221,220,222,221,220,216,218,220,218,220,219,217,217,217,219,217,217,218,217,217,215,215,215,214,217,217,220,218,220,222,221,221,222,222,222,220,221,222,222,222,223,225,223,223,226,226,226,226,228,227,228,231,232,233,234,231,232,235,234,232,229,229,228,226,227,226,227,226,226,226,226,229,233,233,234,234,233,235,236,234,236,236,236,238,236,237,235,236,238,239,241,240,243,242,243,245,244,244,235,221,231,243,241,245,240,226,236,244,244,237,237,245,244,246,245,246,246,246,246,246,246,245,246,244,244,244,244,244,244,244,244,244,245,243,243,242,242,242,244,246,246,246,244,245,245,246,245,245,243,243,244,242,241,241,238,236,235,234,232,235,237,240,214,208,213,222,217,202,125,13,103,220,251,244,220,235,244,203,206,218,219,229,215,252,193,160,141,110,61,46,214,244,251,248,228,250,252,219,165,179,181,187,210,227,247,214,229,241,250,182,140,228,249,230,230,239,167,179,197,205,245,250,236,197,165,178,160,192,210,155,201,134,182,187,148,83,42,138,81,87,81,50,42,49,24,70,231,246,197,176,177,212,240,208,232,246,169,160,111,88,149,228,250,178,112,103,212,198,110,152,207,177,154,190,71,76,201,214,226,217,229,242,252,252,251,227,177,183,110,54,29,18,19,29,45,46,46,46,50,46,49,53,53,53,52,53,49,49,93,47,53,58,63,67,77,88,103,120,122,129,139,131,110,90,69,55,42,35,28,19,22,14,14,12,14,22,26,29,26,24,17,12,22,27,23,19,25,37,30,32,29,27,39,43,42,45,43,33,25,22,39,46,46,48,34,26,21,16,24,24,41,52,60,87,105,170,210,171,165,158,159,163,158,158,154,163,165,149,134,135,146,164,140,65,24,24,23,44,64,62,60,37,15,28,33,29,18,14,15,13,16,19,30,27,22,18,17,29,27,28,27,29,24,24,22,15,16,21,21,24,34,35,46,59,65,78,83,90,98,93,90,79,69,59,57,51,50,36,50,79,77,76,65,60,54,50,48,41,33,37,22,54,101,75,53,42,136,225,243,244,249,249,247,247,244,222,130,47,6,11,14,76,220,235,202,143,91,47,37,6,102,211,219,238,229,225,227,222,226,223,225,226,226,227,227,228,226,228,229,229,226,227,228,225,229,225,226,226,224,225,224,224,222,223,222,222,222,223,219,220,222,222,224,220,220,116,3,1,5,8,9,9,12,10,11,12,12,12,214,217,214,216,219,215,215,216,218,218,219,218,218,221,221,219,221,220,221,223,224,224,224,224,225,227,225,224,225,224,225,227,224,225,224,221,223,218,217,214,213,217,215,212,213,213,213,214,215,214,212,215,214,213,213,214,212,211,214,212,214,212,212,212,212,214,214,213,214,211,213,213,213,214,216,216,217,218,219,217,221,220,221,224,222,224,224,223,225,223,223,224,223,224,226,226,230,228,229,229,227,227,224,227,226,226,224,227,226,224,223,225,222,221,221,221,223,219,222,222,223,223,222,220,219,222,222,220,220,222,220,222,226,223,225,224,222,227,228,227,227,228,227,229,230,227,224,222,223,220,221,224,224,223,221,221,223,225,222,223,226,222,224,222,221,224,223,224,225,225,226,224,224,221,221,221,220,220,219,218,219,218,217,217,217,218,218,219,217,217,219,215,216,217,216,215,215,219,218,216,218,220,220,221,222,221,221,222,220,221,222,221,223,224,224,224,225,226,226,227,229,227,228,229,232,233,232,231,231,232,233,235,232,230,227,224,226,224,226,228,227,225,225,227,228,231,232,233,234,235,238,238,239,237,237,237,240,240,237,237,238,240,240,240,241,243,244,244,245,243,230,221,236,241,240,244,231,225,239,244,241,235,240,244,244,245,245,245,244,245,244,245,245,245,247,245,245,244,244,244,245,245,245,244,244,243,243,245,244,244,245,244,244,244,245,245,245,245,244,245,244,244,242,243,242,239,239,236,236,234,236,236,237,242,214,210,210,198,176,223,142,16,149,237,248,208,190,232,242,203,206,217,220,225,217,252,191,163,145,124,71,45,215,243,250,250,223,228,193,154,151,189,221,234,245,248,252,210,189,201,232,178,149,228,252,210,163,202,182,208,233,223,251,252,201,163,159,162,153,184,197,182,226,142,186,172,133,72,35,132,83,95,68,53,49,47,24,74,243,243,207,180,184,220,244,212,226,249,179,222,179,79,147,230,217,156,111,142,245,168,59,159,236,202,184,172,92,139,212,188,196,196,208,219,228,245,253,229,187,184,92,58,59,57,62,66,84,84,86,83,88,85,79,77,74,74,76,85,86,91,93,118,135,127,134,122,113,108,90,78,68,47,40,37,27,27,24,22,22,23,23,21,19,12,12,10,14,23,27,30,33,32,18,12,16,19,15,15,37,42,27,27,38,46,50,52,51,45,43,34,23,38,51,52,54,54,45,29,33,48,53,48,51,55,97,131,155,239,216,160,165,159,162,162,160,160,158,170,164,147,140,138,152,169,123,51,38,39,26,36,54,64,71,55,36,33,31,24,24,22,13,17,11,21,32,30,30,19,20,28,24,33,33,30,30,27,24,19,23,24,22,21,20,20,23,18,20,21,27,39,53,70,83,98,114,118,105,86,74,53,49,55,51,57,68,76,75,73,72,68,63,55,48,40,43,44,39,39,27,69,101,100,94,78,113,167,214,227,198,172,160,180,238,242,248,240,177,141,99,40,17,9,139,222,229,237,227,229,226,226,227,224,227,229,227,227,231,230,227,229,230,229,229,230,231,229,227,228,226,227,226,227,225,225,224,225,224,225,229,224,223,223,224,225,226,223,222,116,3,1,5,8,9,9,12,10,11,12,12,12,212,217,213,214,215,213,217,214,217,214,216,217,218,218,217,221,220,220,220,221,222,223,224,222,223,222,225,223,222,225,224,226,224,225,224,223,224,219,219,216,216,216,213,215,212,214,215,213,216,212,214,214,213,218,214,216,214,213,214,215,213,211,213,212,213,214,212,214,214,212,215,213,214,213,215,215,215,218,218,218,216,217,219,220,222,220,223,222,223,222,226,226,224,227,224,226,225,226,228,225,225,227,225,225,226,226,227,227,224,223,224,222,221,222,224,222,224,222,226,224,222,222,220,220,217,217,217,217,218,219,219,222,222,222,224,224,225,226,229,229,228,227,226,228,227,224,223,217,221,222,222,226,222,224,223,222,223,224,223,224,224,224,224,222,224,224,225,224,223,225,224,223,222,220,220,220,221,221,218,220,217,218,220,219,222,218,218,218,217,219,216,215,216,217,218,217,217,215,217,217,218,216,217,219,218,220,220,221,222,223,223,223,223,223,223,222,224,225,227,227,227,225,225,227,226,229,229,230,230,228,231,232,231,231,226,223,226,225,225,227,227,227,227,227,230,230,231,233,234,235,236,236,237,237,235,237,239,237,238,237,237,237,237,238,240,241,240,242,245,236,223,224,237,241,241,243,224,227,243,242,238,234,244,244,243,244,244,245,244,244,244,244,245,244,244,244,244,244,244,243,243,244,243,243,243,244,244,246,245,244,244,244,244,244,244,245,243,243,243,244,242,243,245,243,242,240,238,237,237,237,237,237,239,244,218,199,192,204,219,252,143,49,169,221,232,215,207,237,244,198,209,218,216,222,219,252,183,151,125,105,71,46,211,243,250,208,152,188,187,183,202,225,249,249,252,241,252,204,177,188,207,164,136,221,223,184,189,231,199,229,248,217,252,240,173,160,170,158,154,186,188,216,245,132,180,163,148,84,19,118,76,90,71,44,44,48,22,76,235,245,221,177,188,221,239,213,221,245,171,230,216,101,146,218,186,130,132,195,252,185,118,199,244,189,218,217,131,188,209,199,211,192,184,185,199,219,232,183,128,78,10,2,6,8,8,10,21,36,48,65,79,78,79,77,77,83,83,78,80,78,61,69,61,59,54,48,46,45,57,77,71,26,15,25,18,20,14,12,15,18,20,32,32,18,15,12,12,16,23,45,44,40,34,22,24,15,15,20,36,46,29,19,39,47,46,52,48,40,28,27,44,54,57,58,51,50,46,33,34,49,58,57,57,56,119,132,181,234,198,160,159,147,165,161,162,165,166,178,165,148,142,145,160,164,103,48,53,49,37,23,25,56,70,68,54,36,34,30,38,33,24,19,17,27,35,33,32,29,16,17,25,32,37,36,29,32,32,27,29,27,34,30,19,17,25,22,15,16,15,24,24,23,30,34,43,60,68,83,91,97,93,82,71,63,59,53,53,58,66,74,78,79,77,76,69,61,62,54,45,38,39,42,42,36,38,27,19,96,199,230,250,250,252,252,243,195,142,107,41,26,14,64,223,232,236,236,227,230,228,226,228,226,227,227,231,232,229,229,230,232,229,229,226,230,230,226,227,226,228,230,228,227,226,226,228,228,228,227,225,226,225,226,226,224,225,223,224,115,3,0,4,9,10,9,10,10,12,11,12,12,212,214,213,212,213,214,213,212,215,214,214,215,214,214,217,214,216,218,220,222,220,221,223,222,220,221,220,221,222,224,225,223,222,225,222,222,225,220,220,219,219,222,219,220,220,218,216,213,214,212,215,214,214,213,214,214,213,215,215,215,214,214,214,212,214,212,213,213,214,211,211,216,213,214,216,214,217,216,215,217,219,217,221,222,217,221,222,222,222,221,222,222,225,226,224,225,224,224,224,221,224,223,224,226,224,224,224,225,224,224,222,222,222,223,223,222,225,223,225,224,223,221,219,219,219,216,215,217,217,220,219,222,225,222,222,225,227,227,225,224,226,226,224,225,224,223,221,219,221,223,225,224,224,224,222,222,222,224,225,225,224,222,225,224,225,226,224,226,225,223,221,218,218,220,220,221,220,220,219,217,218,218,221,221,222,221,221,219,219,219,219,219,218,220,217,218,215,216,219,218,221,217,217,218,218,217,222,223,222,221,223,222,222,223,221,223,222,224,224,221,224,225,225,225,226,226,227,228,229,229,231,230,231,230,227,223,225,225,224,226,224,225,228,228,230,229,232,231,232,233,234,233,235,236,236,237,237,237,237,236,236,237,236,239,239,239,238,240,241,231,215,223,239,237,243,236,222,236,245,241,238,238,244,245,244,244,245,245,244,245,244,245,245,244,245,244,244,244,245,244,244,243,244,244,243,244,243,245,245,245,244,244,244,244,243,241,243,244,243,243,243,242,242,242,241,239,239,237,239,238,238,238,239,243,211,197,205,224,237,252,114,55,195,234,251,232,229,249,243,199,211,217,212,223,223,252,188,154,125,108,71,55,203,204,181,184,183,239,226,214,230,227,246,242,240,235,252,212,174,200,226,177,133,212,244,237,226,248,193,207,238,182,218,213,159,174,189,178,196,208,200,229,231,116,164,188,163,105,33,91,72,89,68,47,47,52,20,74,233,241,217,152,171,218,236,214,216,243,173,214,230,113,134,178,131,162,167,221,241,132,162,240,236,175,221,212,147,205,216,214,214,196,201,199,190,193,194,132,95,87,70,81,69,57,46,30,20,16,32,39,32,29,27,23,29,28,22,25,17,25,24,18,36,36,38,32,36,71,119,142,81,24,24,18,15,14,11,14,19,28,24,24,35,34,33,18,13,16,27,45,41,49,50,44,41,31,20,15,24,27,25,24,34,46,45,39,41,35,19,39,55,54,56,58,47,33,30,26,25,46,59,68,54,68,136,134,211,252,177,156,150,150,163,163,165,165,174,186,165,145,146,150,170,164,84,56,76,66,60,36,40,62,67,73,63,42,37,35,34,34,24,19,23,36,30,30,34,33,30,17,24,31,30,36,35,37,36,32,29,29,41,37,22,16,16,22,18,16,27,36,31,29,27,19,18,14,18,25,33,45,61,69,79,95,93,90,82,72,65,59,61,59,61,72,76,81,77,72,71,69,59,56,61,51,57,47,46,37,20,104,206,227,239,212,172,132,110,67,35,9,74,221,245,245,235,236,232,231,231,230,229,230,231,231,233,233,228,229,232,230,229,229,229,227,229,231,228,226,229,227,229,228,228,229,229,231,229,230,228,226,228,226,228,226,225,225,222,116,5,1,4,9,10,9,10,10,12,12,13,12,210,213,211,211,214,212,213,209,213,213,214,212,213,215,213,214,213,216,218,219,221,219,220,219,221,220,220,221,220,223,222,221,221,224,222,220,222,220,220,220,222,221,219,221,219,219,216,213,214,214,214,214,212,214,214,214,214,213,214,214,212,211,214,211,209,211,211,208,210,210,213,212,211,213,214,212,214,217,216,216,217,217,219,217,220,218,217,219,221,222,224,222,221,221,221,225,223,222,220,219,218,221,220,219,222,221,223,223,220,221,222,220,221,219,220,221,221,220,222,219,217,218,217,218,217,218,217,220,217,219,221,221,222,221,221,220,222,222,223,222,222,224,220,221,221,222,222,219,224,223,222,224,222,222,221,220,221,223,222,221,221,223,224,224,222,223,222,225,224,220,220,217,220,220,223,223,222,221,219,221,218,221,221,220,223,222,223,222,217,221,221,221,219,216,216,214,216,215,215,216,217,217,217,217,216,218,218,219,220,222,222,220,223,222,222,222,224,223,223,225,223,222,223,226,225,225,228,229,230,228,228,228,229,229,228,227,224,222,224,226,223,224,225,225,227,226,228,229,232,234,234,233,233,235,234,235,236,236,235,234,233,235,236,236,237,237,236,236,236,222,211,224,234,237,240,226,222,240,244,238,234,241,245,244,245,243,244,243,243,243,244,242,244,242,242,243,242,243,242,243,244,242,243,243,244,242,242,243,244,243,244,242,242,242,241,242,242,242,242,243,242,243,241,240,241,238,237,238,237,235,237,235,241,239,215,193,209,223,199,228,96,97,239,246,252,233,219,242,238,196,212,215,209,219,231,252,198,178,146,120,95,66,176,209,224,227,216,250,226,212,225,219,240,231,238,228,250,205,169,203,243,218,152,235,251,251,221,178,150,192,209,177,206,187,156,187,204,181,208,218,183,200,204,113,182,174,159,147,74,95,68,95,70,48,53,50,18,71,230,245,207,124,171,223,236,215,211,244,165,184,205,127,112,78,98,165,200,217,146,135,199,248,219,150,198,167,126,198,206,214,221,214,216,206,200,214,178,152,183,198,228,241,241,247,245,244,239,217,149,73,55,51,46,50,42,37,35,27,31,36,34,40,60,53,39,16,53,127,161,162,70,23,26,14,11,11,9,21,43,46,39,26,21,37,44,41,28,15,19,19,40,51,47,53,46,44,39,29,23,21,13,24,49,54,46,32,23,22,44,53,55,47,45,56,44,30,12,24,44,56,56,60,51,47,108,147,234,234,159,152,145,150,167,160,161,161,179,185,156,140,149,159,170,142,61,39,56,76,69,38,42,74,71,76,66,37,34,30,36,27,22,16,17,24,23,36,37,35,32,22,21,17,25,40,27,33,35,22,19,29,35,31,25,17,16,12,19,30,41,44,39,32,23,18,14,15,16,17,19,20,25,26,36,42,53,69,81,95,104,103,93,85,83,77,68,66,60,62,63,66,69,69,68,71,77,74,63,63,42,16,24,24,38,53,50,53,64,27,56,189,243,243,250,236,235,235,231,229,228,231,231,231,235,233,232,232,229,231,228,231,231,228,228,228,230,229,230,230,230,229,228,230,229,229,230,228,229,229,227,229,227,224,227,228,226,225,224,116,3,1,5,9,9,9,11,9,12,12,12,12,210,212,210,212,211,210,212,208,212,211,211,211,213,213,214,212,215,216,217,218,220,219,218,219,218,219,218,221,219,218,223,220,220,221,219,219,221,220,218,220,219,218,218,220,216,215,216,214,215,213,215,215,215,215,217,215,214,217,214,215,213,213,214,211,212,208,208,211,211,211,211,212,207,211,213,211,212,213,214,214,215,214,213,216,216,217,218,218,221,220,223,220,219,222,219,222,222,220,218,218,220,218,219,221,220,220,222,220,221,219,220,220,218,218,216,218,217,216,216,216,217,214,218,218,217,219,220,220,218,220,220,220,221,219,220,221,220,222,223,222,224,226,222,222,220,219,219,220,222,221,222,222,223,224,223,223,223,221,219,219,219,219,221,222,222,221,221,222,222,219,218,219,221,223,223,227,223,221,222,223,222,221,223,222,223,224,223,221,220,220,221,219,215,217,214,214,216,217,218,215,219,218,217,219,220,218,219,218,219,217,220,221,220,222,221,223,222,223,223,220,224,223,219,224,225,226,228,226,227,224,227,227,226,227,227,228,227,224,225,224,224,225,224,225,230,228,228,229,230,233,233,233,232,233,234,232,232,234,233,232,234,234,235,235,234,235,235,237,231,215,210,227,234,236,235,218,227,242,241,234,236,243,242,242,241,242,242,242,244,243,244,242,242,243,242,244,243,242,243,243,244,242,243,242,242,242,241,242,241,243,244,242,241,243,242,241,242,241,243,241,242,243,242,241,242,240,239,238,237,235,237,234,241,235,217,194,193,195,210,249,95,118,247,247,251,189,198,238,233,197,214,218,208,225,237,252,186,146,118,108,75,68,220,245,251,244,223,251,207,212,226,213,239,229,235,231,246,225,181,185,220,200,143,217,251,214,170,160,178,224,244,213,191,183,166,200,199,151,203,203,160,176,193,120,171,182,126,174,157,131,94,90,69,47,47,56,18,67,233,247,210,131,177,229,239,216,209,245,175,162,181,158,117,58,88,181,194,192,151,145,224,246,209,149,174,115,117,218,214,225,225,210,213,215,220,230,179,158,225,241,242,243,251,252,253,253,250,210,162,132,131,129,125,118,111,95,82,80,93,90,85,96,89,53,31,21,94,136,147,144,55,27,24,12,15,12,10,33,54,45,54,42,18,23,40,50,47,36,27,28,32,49,52,49,48,46,46,45,41,36,27,46,62,46,51,45,39,49,51,57,50,38,23,30,38,19,15,35,58,60,55,49,38,19,55,164,251,217,145,148,141,157,160,159,162,167,191,177,146,146,157,165,174,124,40,21,31,58,67,35,36,62,79,77,48,23,25,31,33,34,34,19,14,15,22,32,32,34,37,31,21,18,20,23,27,30,30,22,12,23,27,19,20,23,30,40,43,51,55,49,46,32,20,14,17,12,23,33,29,33,26,29,23,23,27,22,28,39,57,73,78,96,123,146,152,152,134,126,109,93,82,66,63,59,61,59,60,57,55,61,47,42,44,46,47,52,41,11,116,223,250,250,243,235,238,234,233,229,230,231,235,234,235,236,233,231,232,232,231,232,231,230,230,231,230,229,229,229,230,227,231,232,230,228,229,229,227,228,223,228,227,224,226,226,226,224,224,116,4,1,6,9,9,9,12,10,11,12,12,12,206,214,212,210,210,208,209,209,211,210,210,210,212,211,212,214,213,216,218,215,217,218,218,215,217,220,219,220,218,220,220,219,220,220,218,219,221,219,221,220,220,220,218,218,218,216,213,216,214,215,216,214,216,216,213,213,214,215,213,216,216,215,216,214,213,214,214,212,212,211,212,214,212,210,212,212,213,213,212,214,214,214,215,214,215,216,220,221,219,218,218,217,216,216,217,217,216,218,219,220,218,218,219,218,219,220,219,218,217,218,219,217,216,217,215,218,216,212,215,215,215,216,215,217,217,217,218,218,220,220,219,220,217,217,219,222,222,218,221,221,221,223,221,223,221,217,219,220,221,222,220,219,221,223,222,223,224,223,221,222,220,218,221,221,220,221,219,222,221,220,222,221,221,221,223,222,222,222,224,225,222,222,220,219,221,221,222,224,222,223,221,218,218,217,214,215,219,217,218,219,220,220,220,220,221,220,218,218,217,219,218,218,220,218,221,222,222,225,222,222,223,222,223,223,221,224,225,222,224,224,225,224,226,224,224,226,225,226,225,222,221,222,224,226,228,229,230,229,231,233,232,230,231,233,232,232,232,232,232,234,234,232,234,231,233,233,235,237,229,214,215,232,235,237,226,216,235,242,237,232,236,244,243,241,242,241,241,242,241,241,242,241,243,242,241,242,241,242,242,243,242,241,241,242,242,240,241,242,241,243,241,240,240,240,240,241,241,241,241,242,242,241,240,241,242,240,240,239,239,235,237,235,244,231,214,183,194,227,240,249,104,115,233,221,227,196,209,243,235,203,226,222,211,217,216,209,131,126,104,93,68,57,216,245,252,236,216,249,199,215,228,210,238,230,236,229,243,234,198,184,183,164,111,155,201,220,222,178,198,246,252,230,193,191,172,208,185,146,210,208,179,174,196,136,187,138,71,207,198,161,125,97,61,36,44,57,19,64,226,248,193,101,160,216,235,213,210,243,190,156,181,218,159,78,121,145,185,213,98,159,237,239,201,185,190,97,170,246,222,223,225,228,224,207,213,224,169,200,239,224,224,222,232,227,211,154,106,89,90,108,133,128,113,113,110,133,146,139,128,105,114,150,125,68,29,24,114,139,145,130,42,25,29,25,17,12,13,19,49,55,58,39,22,13,23,46,50,54,48,39,42,42,48,52,47,50,48,47,53,45,46,50,49,48,54,50,54,53,50,52,40,34,27,43,47,35,45,55,71,77,76,81,77,50,94,206,234,202,145,141,145,156,165,159,166,180,191,162,137,147,160,174,169,110,47,41,41,48,57,44,33,49,72,60,39,15,16,22,30,39,34,26,17,14,21,27,20,26,35,36,29,22,17,21,32,36,32,25,17,12,19,33,43,44,49,53,53,56,54,47,41,28,21,16,22,35,42,44,40,31,27,21,15,19,18,25,24,22,26,51,77,67,61,76,98,134,145,145,156,159,157,148,141,136,129,118,103,100,97,90,89,74,60,46,27,16,22,7,99,222,247,248,238,232,235,231,233,231,233,235,235,234,236,235,233,235,233,233,231,232,232,231,232,232,230,231,231,229,229,230,230,231,229,230,230,227,229,227,225,226,226,225,225,226,226,226,223,115,3,0,5,9,10,9,10,10,12,12,11,11,205,210,208,209,210,208,210,207,212,212,213,210,211,210,210,212,213,212,212,216,216,217,217,217,217,217,220,219,219,217,221,219,218,220,220,219,219,220,218,217,217,218,216,215,215,215,217,216,215,214,215,216,215,213,212,212,211,213,214,212,213,214,215,212,216,215,215,216,212,212,212,212,211,212,211,213,213,210,212,213,214,214,215,216,214,215,214,216,218,215,215,215,214,215,214,214,216,216,216,220,217,220,219,220,221,217,220,218,220,217,220,217,216,219,215,215,213,214,214,214,216,214,216,215,216,219,217,217,217,218,218,217,219,217,215,218,218,218,218,219,220,221,221,220,220,218,222,223,221,222,218,221,221,221,223,219,220,220,222,222,220,221,219,219,220,221,222,219,219,221,222,223,223,223,222,221,223,222,220,222,222,219,219,219,218,219,223,223,222,222,223,222,217,218,218,218,218,220,222,223,222,221,221,221,220,218,219,219,218,216,218,217,217,222,220,221,221,222,223,220,224,224,222,223,223,222,222,222,223,223,227,225,227,232,227,227,225,226,226,222,224,224,222,223,228,230,230,231,229,230,231,232,231,231,234,231,233,234,231,232,234,233,232,232,232,232,235,234,224,211,220,234,236,238,219,224,240,241,234,228,242,243,242,243,240,241,242,242,240,240,242,242,243,244,242,242,241,241,242,241,241,242,242,242,242,242,241,241,243,245,243,242,240,241,241,242,243,241,243,241,242,240,240,241,240,241,240,241,240,238,239,237,244,227,220,194,215,235,235,250,89,96,203,220,247,219,237,249,239,213,227,209,174,177,188,215,161,149,125,101,63,60,216,243,249,231,218,243,191,222,226,212,236,227,234,230,239,235,196,180,199,178,113,147,220,243,250,177,188,246,252,229,182,179,171,214,189,184,226,208,202,159,184,151,184,106,21,221,199,160,167,113,69,21,33,56,17,66,227,248,182,83,132,194,233,218,210,244,191,151,185,247,199,94,76,132,210,156,72,171,240,233,199,196,194,142,199,239,221,219,221,223,229,226,229,213,163,200,231,227,239,237,232,197,127,79,49,62,77,93,87,48,39,35,45,66,98,114,85,39,32,103,116,89,39,41,135,127,148,120,29,18,28,42,34,18,16,30,54,61,61,32,13,13,20,48,56,43,32,32,34,42,50,47,52,48,50,49,50,54,48,56,61,68,70,69,77,77,83,92,97,94,101,111,110,111,121,122,119,124,127,135,117,84,152,248,249,190,147,149,144,160,160,159,171,186,188,150,143,153,166,179,159,96,59,65,57,65,66,66,70,61,61,60,55,51,47,48,48,47,45,34,27,27,31,20,15,19,28,38,36,35,31,33,36,36,37,32,24,27,37,50,54,53,54,57,55,52,49,36,25,22,25,40,42,43,39,35,37,38,34,19,16,15,17,16,29,28,38,93,114,98,68,43,27,25,33,43,47,53,61,66,75,88,104,107,113,112,110,95,80,69,39,27,22,16,36,111,221,241,242,243,238,235,231,231,233,233,231,233,235,236,235,234,235,236,233,233,234,234,232,233,232,231,234,233,232,230,232,229,230,229,227,232,231,229,228,230,227,226,226,227,228,227,227,225,223,116,4,1,5,10,10,9,10,10,11,12,11,12,205,208,205,207,206,207,207,206,209,210,207,207,210,209,210,210,210,211,212,214,216,214,217,214,215,216,212,216,217,214,214,215,215,216,214,215,215,214,216,215,214,212,212,212,215,215,215,218,214,212,214,214,216,214,212,213,210,210,210,208,209,209,211,212,211,213,212,211,214,212,211,209,210,212,211,211,210,212,211,211,212,210,212,211,214,214,214,213,211,214,214,215,214,211,214,214,214,216,213,215,215,217,220,217,219,218,217,219,217,218,216,215,218,215,214,216,215,213,214,214,211,215,215,216,216,215,215,214,218,216,215,214,215,217,213,216,217,217,223,220,222,222,219,221,220,220,221,220,218,217,215,218,220,222,220,219,219,220,218,219,218,215,217,214,217,219,218,218,217,220,219,222,220,218,219,218,218,218,219,217,216,217,218,219,217,219,219,219,219,218,219,221,218,218,217,215,217,216,218,219,220,221,219,220,220,216,217,218,216,219,218,217,220,219,223,222,219,222,222,223,222,220,221,222,223,224,223,222,220,223,224,224,227,226,230,229,224,225,223,222,223,220,222,225,226,227,226,225,226,229,231,231,230,231,232,230,230,231,232,232,229,232,233,232,230,229,233,229,217,208,224,235,239,232,217,232,244,239,229,232,240,240,240,240,241,240,239,239,241,241,241,240,241,241,241,242,240,241,241,241,242,241,243,242,242,242,242,243,242,242,243,242,240,242,240,240,241,240,240,240,240,241,240,240,240,237,239,238,239,239,239,238,241,225,217,200,209,187,207,234,89,123,232,243,252,221,223,241,224,190,198,179,170,202,230,251,187,172,142,125,81,62,220,243,244,227,218,232,185,226,225,213,235,224,231,229,234,232,186,184,198,187,165,174,229,248,250,164,185,243,235,190,131,156,187,217,184,194,220,204,193,128,171,149,164,88,24,225,191,136,174,144,92,20,19,51,17,72,230,247,177,93,157,200,239,220,206,244,189,139,195,246,207,102,81,170,219,141,74,178,234,220,193,190,163,129,203,228,219,214,215,222,225,222,240,211,158,206,209,212,222,236,239,148,105,72,56,77,79,50,29,33,42,56,55,39,54,96,92,58,16,57,109,109,52,64,138,119,147,102,23,14,27,58,54,50,49,51,60,57,48,23,11,9,20,46,46,30,22,25,30,46,55,60,68,76,84,93,112,117,125,136,134,137,129,117,118,107,113,116,109,109,100,101,100,104,118,113,117,109,117,128,99,93,195,250,234,176,144,150,150,156,161,166,181,186,169,136,141,156,165,180,140,78,55,47,43,46,51,54,56,59,57,56,51,57,63,57,64,62,63,63,60,59,50,48,42,46,42,51,49,41,39,36,35,37,38,42,37,37,41,50,52,55,54,53,54,48,41,33,37,41,43,52,43,35,27,36,47,42,44,37,21,15,15,16,33,35,52,112,131,110,89,49,27,27,42,38,29,19,26,28,22,26,28,37,40,44,50,55,51,60,83,104,150,208,244,244,252,252,241,244,237,236,232,230,233,232,233,234,233,233,232,231,232,232,233,233,234,233,232,232,233,233,233,233,229,230,231,229,230,231,230,229,228,228,227,225,227,227,222,222,225,228,226,223,223,116,4,1,5,8,9,9,11,9,11,12,11,11,204,209,205,206,207,206,206,204,204,204,208,207,208,208,208,210,211,212,210,213,212,213,213,213,214,214,215,214,214,214,214,213,214,213,214,216,214,216,214,217,214,213,214,215,217,213,214,214,214,213,212,210,212,213,210,212,210,210,211,207,207,208,207,207,212,210,209,209,210,212,212,212,211,212,213,213,210,211,212,208,210,210,213,212,214,213,211,213,212,212,211,211,212,213,214,214,215,214,214,217,214,217,215,215,220,214,217,217,215,215,214,215,214,216,216,214,214,214,214,214,214,213,216,215,214,215,215,214,213,216,217,216,215,214,217,217,217,218,219,218,221,221,217,218,219,219,217,217,218,217,217,220,221,218,222,222,220,218,217,217,214,217,215,217,217,217,218,217,219,218,216,219,218,216,217,218,216,215,216,216,217,218,217,217,217,215,215,214,214,218,219,216,220,220,217,216,214,217,215,216,217,217,220,220,220,219,220,218,218,219,218,218,218,220,220,218,221,221,222,221,222,221,220,221,222,221,222,224,223,223,224,224,226,225,224,224,223,224,222,219,221,221,224,226,225,226,228,229,230,230,231,232,230,231,232,231,232,232,232,231,229,230,230,230,228,230,231,224,214,214,231,239,236,221,218,239,244,238,231,236,242,240,241,241,240,241,241,240,240,238,239,239,240,240,240,240,240,242,241,241,241,241,242,241,241,243,243,242,242,243,242,242,240,241,241,241,241,240,240,240,240,240,241,239,240,240,239,240,237,239,236,238,243,217,210,177,186,201,235,242,97,152,246,245,232,180,201,208,201,192,216,204,197,229,245,252,187,170,149,129,84,69,222,243,245,223,220,225,189,235,222,215,234,225,231,228,231,234,205,196,165,162,188,182,230,250,248,154,165,218,187,165,143,177,207,210,171,189,203,205,203,145,184,150,140,65,70,250,165,84,137,160,143,58,12,29,11,70,231,248,174,115,201,208,236,219,207,249,185,155,208,205,143,77,107,183,185,134,123,199,230,210,190,158,123,150,213,229,227,212,213,220,224,222,237,194,168,230,225,214,206,231,183,107,107,69,73,87,44,33,36,61,95,108,115,84,56,103,103,71,30,39,109,125,70,95,129,112,145,78,16,19,8,41,59,57,61,56,58,47,36,28,24,28,44,67,70,76,88,99,111,122,122,131,141,138,135,127,129,122,116,118,111,117,106,107,105,96,106,109,111,110,112,107,116,113,110,116,101,103,93,92,54,99,231,250,247,162,140,151,151,159,162,174,185,178,150,135,150,163,181,175,112,63,46,39,41,50,55,56,58,58,60,59,59,62,53,51,50,50,49,50,53,47,50,54,55,63,61,63,63,61,59,53,53,47,47,42,41,44,44,48,49,53,52,57,60,54,46,41,46,49,50,55,36,21,24,33,49,48,49,35,24,16,14,21,38,38,57,106,114,115,87,59,29,27,65,73,66,62,66,58,48,46,48,53,56,66,58,72,169,238,251,251,252,252,252,252,252,252,241,240,238,234,234,234,233,234,236,236,235,234,235,234,231,233,233,233,232,232,234,232,233,233,233,232,231,233,232,230,231,232,229,229,229,229,227,227,227,226,228,227,226,225,226,226,223,116,4,1,5,8,9,9,10,9,11,12,11,11,200,206,206,206,205,202,204,205,205,206,207,207,207,206,207,206,206,208,209,210,209,208,211,209,211,212,210,211,212,212,213,214,214,214,212,211,213,212,213,215,213,214,214,212,214,213,209,209,210,211,212,208,208,207,206,206,207,208,208,206,208,208,208,210,210,208,208,209,209,208,210,210,210,209,207,210,208,210,209,211,210,208,210,212,212,210,211,212,212,211,211,212,212,212,212,210,212,213,212,213,214,214,214,214,214,215,214,214,214,214,214,214,215,215,215,215,213,211,211,213,213,212,212,212,212,214,214,214,215,213,215,214,217,217,215,217,215,216,217,217,217,218,218,218,219,218,217,216,216,217,218,220,217,219,219,216,214,214,215,215,216,216,215,215,216,214,216,214,215,219,214,214,216,215,214,215,215,214,214,213,215,218,216,218,216,217,213,213,215,214,214,215,215,214,217,215,214,214,213,214,214,217,218,217,217,217,218,217,217,217,219,220,218,219,218,218,216,220,222,220,220,221,222,220,219,220,221,221,219,221,220,220,222,223,222,220,222,223,222,221,222,221,223,226,225,225,227,229,229,230,230,232,230,229,232,229,230,231,230,230,229,230,230,229,228,229,229,222,211,218,233,237,234,216,225,240,240,231,229,241,240,241,241,239,241,239,239,238,239,238,238,238,239,239,239,240,240,240,240,240,241,241,241,241,241,241,240,240,241,239,239,241,240,240,239,240,242,240,240,239,238,240,239,238,238,237,237,237,236,236,234,239,237,219,201,191,214,223,250,238,96,136,214,212,203,169,200,226,218,211,240,214,200,229,240,251,177,164,136,110,72,66,218,242,242,221,224,218,194,241,221,217,231,225,230,226,231,235,211,229,174,143,185,161,201,246,222,128,170,219,189,210,181,184,209,197,179,189,196,207,214,179,212,158,102,78,148,249,146,48,97,134,168,128,55,36,8,37,194,246,174,128,202,193,227,218,210,252,166,149,194,123,118,113,97,148,170,132,151,227,222,207,196,147,107,169,238,230,229,216,210,219,222,223,236,178,179,235,225,236,229,240,159,96,115,73,89,76,39,24,80,130,128,117,91,117,113,109,118,84,37,41,97,125,85,106,124,108,130,61,11,14,5,15,33,55,57,62,66,70,80,77,125,89,157,143,134,138,119,135,133,132,128,129,128,118,109,112,113,109,106,90,84,87,93,95,78,67,79,66,61,59,51,46,43,39,44,40,58,60,24,36,6,119,238,250,234,148,138,152,148,162,168,178,183,166,137,139,155,169,179,166,82,33,27,17,22,17,27,25,30,36,28,34,37,38,38,44,43,44,53,49,56,55,56,56,48,56,53,53,54,53,57,63,61,65,67,61,61,69,66,65,58,60,58,57,59,37,34,33,24,35,56,51,30,22,18,31,49,54,49,31,24,19,17,32,50,41,54,109,114,125,99,56,30,46,109,130,141,147,149,131,117,119,133,144,147,149,138,123,181,243,251,251,252,252,249,249,246,240,237,241,236,238,237,233,235,235,237,238,237,236,236,235,235,234,233,234,233,232,233,233,233,233,232,232,232,232,232,230,232,233,231,231,229,229,228,227,227,227,226,226,227,226,228,226,223,116,2,0,4,8,9,8,10,10,11,11,12,12,201,205,201,205,205,203,204,204,204,206,208,203,206,205,206,207,207,207,206,210,206,209,206,205,210,211,210,210,208,210,210,209,212,210,212,210,210,213,211,212,212,213,211,210,213,208,209,210,210,210,208,208,207,207,206,208,210,207,209,208,206,207,209,208,210,208,208,211,211,209,210,211,209,208,207,209,208,208,210,210,210,213,208,209,210,211,210,212,214,213,213,213,213,211,212,213,212,213,212,214,214,215,215,214,214,213,214,214,211,215,213,215,214,213,216,211,215,214,211,212,211,211,214,212,214,212,214,215,213,214,214,215,215,216,218,217,216,216,217,216,215,216,216,219,218,215,217,216,217,216,219,218,217,217,217,216,214,214,217,216,216,218,216,214,214,215,214,215,215,215,215,215,216,213,212,215,214,215,216,215,216,214,214,217,216,216,217,216,214,214,214,214,212,213,215,215,215,215,215,216,218,216,216,218,217,218,220,218,219,218,222,221,221,221,218,217,220,219,218,218,219,218,219,220,219,219,219,220,217,218,222,219,221,220,221,222,218,221,221,221,223,222,222,225,226,226,227,226,228,230,230,230,229,226,229,229,228,227,228,230,229,228,229,226,228,234,229,220,212,221,234,239,226,214,233,239,235,225,232,240,239,242,240,240,238,239,239,238,240,239,239,236,237,239,238,239,238,239,240,239,241,240,241,240,240,241,240,241,241,239,239,240,238,240,240,240,242,239,240,240,237,239,238,238,239,237,237,236,234,237,231,242,239,224,226,220,233,217,245,194,96,161,200,213,222,207,234,240,233,219,242,209,192,229,236,249,172,155,127,101,63,61,215,240,242,220,224,212,199,242,218,214,229,227,229,225,229,232,207,214,155,151,196,133,150,188,200,175,203,227,206,223,173,177,212,192,186,186,193,207,212,196,219,163,80,103,203,250,141,37,67,91,132,171,139,63,6,24,167,245,168,120,215,185,220,218,210,248,162,159,198,160,190,147,81,119,156,155,198,244,214,201,213,150,98,187,238,229,234,218,208,217,222,228,229,166,193,236,225,233,233,252,141,87,112,74,100,63,19,47,118,133,90,59,19,33,88,126,123,81,43,40,99,115,89,120,114,120,120,44,34,46,49,71,86,114,120,122,133,128,124,122,131,131,121,110,109,117,127,136,134,133,122,112,101,78,63,56,56,52,53,52,41,39,28,36,53,54,62,60,50,39,31,29,30,30,35,29,24,27,23,23,12,159,252,252,227,136,148,143,154,167,175,190,172,147,133,147,161,173,181,139,63,33,28,15,27,33,19,28,24,30,28,17,18,19,17,17,16,18,22,17,25,35,30,35,38,45,55,56,57,58,56,58,57,57,56,57,58,59,72,80,85,84,73,75,65,56,42,27,29,29,48,46,25,16,14,25,42,54,46,39,31,22,39,48,48,41,56,113,118,127,108,60,42,106,152,139,110,83,109,128,140,136,127,116,104,123,123,124,107,108,198,237,248,245,237,238,238,238,236,239,237,238,238,236,235,235,236,236,236,238,238,236,234,235,235,234,233,232,234,235,233,235,233,233,233,231,231,230,230,232,231,231,228,225,226,227,226,225,226,226,226,226,228,226,222,117,4,1,4,8,10,9,10,10,11,11,11,11,200,204,202,202,205,199,201,204,203,204,204,204,206,206,205,205,207,207,207,206,208,205,205,206,207,208,209,208,209,209,207,210,208,209,209,208,211,210,210,211,210,210,209,210,208,209,210,209,209,207,208,206,206,208,210,211,207,208,208,205,207,206,206,206,208,207,207,205,206,208,208,206,208,209,206,209,208,209,208,208,208,209,209,208,210,210,210,210,212,210,212,213,209,209,211,210,212,212,213,214,213,213,212,213,212,212,214,212,214,212,212,212,211,212,213,212,211,212,214,212,211,211,213,212,210,212,214,211,210,211,213,215,214,216,217,215,218,216,216,219,217,217,214,213,217,216,216,216,217,218,218,217,216,215,216,216,215,215,217,212,214,217,212,214,212,213,215,213,214,214,214,214,214,215,214,214,216,214,215,216,214,214,214,215,214,217,216,214,215,214,214,214,213,215,217,215,214,215,214,215,218,217,218,217,216,219,217,217,220,219,221,220,220,221,217,218,214,215,217,214,216,215,217,217,218,216,215,219,217,220,220,219,220,217,217,217,218,218,218,220,222,220,225,225,224,224,224,227,227,229,228,229,225,224,228,226,227,224,226,228,226,228,226,227,227,230,224,212,211,225,236,231,215,217,233,235,227,223,235,238,238,239,236,236,237,235,234,237,237,237,237,237,237,235,237,236,236,237,236,239,237,238,240,239,241,239,239,238,237,238,239,240,239,238,239,239,239,238,239,239,237,238,238,236,236,236,235,236,233,234,236,249,245,252,237,231,211,169,203,149,103,196,234,243,237,197,225,243,222,210,236,200,200,234,237,249,173,164,137,105,61,66,223,243,241,226,221,194,199,240,214,215,227,224,227,224,226,229,190,198,142,141,210,149,150,199,220,190,189,209,209,219,172,188,217,184,198,184,197,202,212,196,193,141,56,152,234,250,155,47,81,66,89,136,168,134,28,21,177,245,171,124,222,197,212,216,215,226,139,194,236,184,174,127,132,145,136,173,225,249,207,202,207,145,110,182,229,226,230,223,211,215,219,231,218,157,207,233,223,234,229,252,123,78,97,73,103,49,22,50,130,123,65,74,48,29,71,125,119,70,35,47,105,107,92,121,112,117,120,104,136,138,142,146,138,125,125,125,120,125,117,134,130,96,111,95,79,83,82,79,67,57,53,38,40,53,47,37,26,28,53,66,59,29,19,55,78,76,87,90,81,56,32,32,33,31,33,30,29,26,30,11,71,213,251,251,208,137,150,146,156,173,178,178,159,136,138,152,163,174,168,113,47,34,29,28,40,34,37,36,32,30,18,21,19,15,24,16,13,18,19,14,19,17,13,19,17,16,20,23,25,24,40,46,51,58,61,63,62,63,71,74,70,71,70,75,83,80,76,67,61,59,61,55,36,24,21,24,37,52,49,48,53,47,51,49,46,29,54,113,123,127,105,49,73,146,157,99,55,41,70,106,107,81,57,49,46,49,51,60,87,78,113,216,243,242,241,234,237,238,236,236,237,237,236,237,238,236,236,239,238,237,237,236,236,233,232,234,231,232,234,235,232,233,233,232,233,231,232,232,229,229,229,227,225,227,225,226,225,227,226,224,227,226,226,225,223,117,3,1,5,8,9,8,10,10,11,12,12,12,200,204,201,202,200,200,201,203,200,200,203,200,203,203,202,203,205,202,205,207,205,207,205,207,208,208,206,206,210,208,207,207,205,207,207,206,207,206,207,208,207,208,208,210,208,207,209,208,207,207,207,207,207,209,206,206,207,205,207,206,204,205,206,204,203,204,205,207,204,204,205,205,206,205,206,206,207,210,207,208,207,208,207,210,211,211,211,209,211,209,210,210,210,211,213,212,210,212,211,210,211,211,211,213,214,211,214,214,213,214,212,212,212,210,210,209,213,210,210,212,212,212,212,210,212,213,213,211,211,214,214,216,217,217,215,214,216,218,219,216,216,216,215,216,218,216,217,219,217,215,217,216,215,214,216,216,216,214,216,216,214,212,212,213,214,212,212,213,213,213,215,214,212,214,213,216,214,212,214,213,214,213,214,216,213,212,213,214,215,214,213,214,214,216,216,212,214,215,214,214,217,218,215,218,217,217,217,218,218,218,220,219,218,216,215,214,215,216,214,217,218,215,217,217,217,218,216,219,218,216,219,216,219,217,215,217,217,220,221,218,219,222,221,224,221,220,227,226,225,225,227,226,226,227,227,227,225,228,227,226,227,226,228,224,228,231,220,212,213,225,232,224,211,224,236,231,224,227,237,235,237,238,236,237,235,235,234,233,235,235,236,235,235,235,234,237,237,237,238,239,239,238,239,239,239,237,237,240,239,237,238,239,238,239,239,238,238,235,237,237,234,236,237,236,236,235,236,235,236,239,242,252,233,200,181,183,169,155,211,160,113,195,227,236,213,193,226,238,219,201,235,199,204,237,234,249,173,168,144,120,71,66,221,244,242,227,213,181,200,238,216,215,227,226,225,226,226,233,208,217,150,160,227,188,210,206,231,199,146,204,218,200,172,174,193,181,204,177,193,197,216,188,160,103,58,203,250,251,177,63,94,93,57,74,130,191,112,77,201,247,176,107,193,190,211,209,212,207,147,198,222,154,112,154,192,115,93,175,238,248,203,206,185,124,120,185,226,226,229,229,213,216,223,239,206,159,223,232,227,234,234,252,124,69,85,78,117,52,18,48,129,113,69,107,106,120,124,125,92,45,29,66,120,107,99,124,101,125,118,108,151,137,142,137,138,146,128,119,99,90,93,84,67,53,53,49,38,29,19,33,57,65,50,19,29,72,82,71,44,27,71,95,82,37,49,95,90,57,75,118,108,77,40,29,33,34,30,29,30,27,36,14,148,247,249,249,190,135,156,143,170,179,178,165,131,126,146,154,165,175,152,79,36,33,27,28,32,38,29,29,23,22,21,22,22,21,21,21,24,16,22,20,19,19,20,21,21,24,13,18,18,17,47,53,46,32,23,30,36,39,51,64,81,87,79,81,72,71,72,74,80,85,85,80,69,62,59,51,46,42,53,51,52,47,43,39,34,31,48,111,118,120,108,47,80,151,121,75,40,52,97,96,82,70,76,81,66,57,45,29,36,60,80,165,236,240,244,237,238,239,237,239,237,238,238,236,238,236,236,238,238,237,238,236,236,235,235,236,234,235,233,233,231,232,232,231,231,232,232,229,228,229,228,224,227,229,227,226,227,227,225,226,226,226,228,225,223,117,2,0,5,9,9,9,10,9,11,12,12,12,197,201,199,202,202,200,200,201,201,202,199,199,199,198,202,201,201,201,202,204,204,204,206,207,205,206,206,205,208,207,207,207,205,207,207,205,207,206,206,208,205,207,203,206,206,204,208,206,206,205,206,206,207,206,205,206,204,206,204,204,204,202,204,203,202,201,204,206,205,204,205,205,205,206,207,208,207,208,208,207,207,207,208,209,207,208,208,208,208,208,211,208,208,208,210,211,210,209,210,211,208,211,210,211,212,212,214,213,212,210,210,213,211,211,211,212,213,211,211,211,211,210,212,212,210,212,211,212,212,213,214,214,214,213,215,214,216,217,215,213,211,214,214,217,218,217,217,214,215,214,216,213,213,213,213,217,213,212,217,213,214,214,212,214,211,212,210,210,211,211,212,212,212,213,214,212,213,211,212,213,212,212,214,212,211,212,212,211,211,213,212,214,214,212,212,212,214,214,214,214,215,215,214,217,217,217,219,217,218,219,221,218,219,215,214,217,214,215,216,216,217,216,216,215,218,216,216,217,214,217,217,215,217,217,218,219,217,217,220,221,222,220,222,220,221,222,223,225,224,223,224,227,227,226,226,226,227,226,227,225,225,225,225,226,227,225,218,210,216,229,230,216,215,232,234,227,220,227,235,232,236,237,236,237,235,232,233,234,233,233,234,233,234,234,235,234,234,237,237,237,237,239,237,236,237,237,238,238,237,237,236,237,238,239,237,235,236,235,235,236,235,235,236,237,237,239,242,242,247,244,244,220,121,78,79,123,169,209,250,153,97,187,224,242,229,209,235,240,209,202,235,194,204,234,234,247,170,155,125,107,66,70,219,244,242,234,207,184,216,240,217,215,226,225,225,226,227,236,210,226,183,180,229,184,204,200,232,205,136,211,195,158,149,150,178,181,207,172,193,193,220,184,146,90,71,231,246,251,184,64,100,105,81,30,62,141,155,144,217,250,171,87,182,201,212,204,204,191,179,202,201,179,150,206,204,79,44,171,247,236,197,195,174,111,120,210,231,229,235,230,221,217,225,244,196,174,236,232,231,238,237,252,143,74,82,84,117,56,26,38,112,130,69,66,87,108,89,75,51,29,49,117,135,108,101,115,107,128,118,135,163,129,117,104,92,81,57,44,40,32,25,25,19,21,45,73,61,38,23,39,82,92,64,17,44,113,127,125,79,61,108,108,88,42,78,108,75,39,25,89,114,88,48,34,33,32,36,33,29,39,19,46,207,249,248,248,180,141,151,153,181,165,156,149,132,132,147,153,167,177,127,59,27,29,25,29,27,22,27,25,22,23,27,19,23,22,23,24,23,22,22,26,22,25,21,21,18,20,18,17,22,16,48,68,50,31,17,15,18,20,33,51,52,53,60,60,70,83,88,96,96,83,76,69,75,75,75,80,73,68,61,64,54,44,36,23,30,35,49,112,113,122,101,53,96,120,96,64,39,78,106,98,111,126,135,146,97,120,69,54,34,39,58,132,225,233,243,242,239,241,235,238,237,236,237,237,237,240,238,236,238,238,234,235,233,233,233,233,234,231,232,234,233,232,232,232,233,230,229,231,230,229,229,227,229,229,227,229,227,226,225,225,230,225,228,228,222,117,4,0,4,8,10,9,10,10,11,11,12,11,200,201,200,199,200,201,200,199,199,201,204,202,201,201,200,200,203,202,204,204,200,203,205,203,203,204,206,205,207,206,207,210,205,205,207,206,208,206,208,207,205,206,204,205,203,204,207,206,204,204,204,203,205,207,206,205,207,207,208,206,205,206,205,205,205,205,207,207,205,203,205,203,204,203,206,208,206,207,206,208,206,206,206,206,206,208,208,210,209,208,211,208,210,208,208,209,208,211,210,209,211,212,209,210,212,212,213,211,211,211,212,213,212,211,212,210,211,212,210,210,210,211,213,212,212,213,212,212,214,215,213,214,214,212,214,214,213,213,214,214,214,214,214,214,214,216,214,214,214,213,214,214,214,215,213,214,213,214,215,213,212,214,214,211,212,211,212,211,211,212,212,213,213,216,211,212,212,211,213,211,213,211,212,214,214,211,211,212,214,214,212,215,214,212,213,211,212,214,215,213,214,214,213,216,215,217,217,217,218,217,217,216,216,214,214,214,214,214,215,216,217,213,214,214,214,217,214,216,215,215,217,216,218,218,217,218,219,220,222,220,222,224,218,222,222,224,224,223,225,223,224,224,225,224,224,226,224,226,223,224,226,224,223,223,227,224,212,206,221,231,228,212,220,234,231,221,218,229,233,232,234,234,233,235,232,232,231,231,233,232,234,233,234,235,234,235,236,236,236,234,236,236,237,237,237,236,236,236,236,237,236,236,234,237,236,235,236,234,235,235,235,236,238,243,246,247,247,241,234,206,180,165,102,49,42,145,211,210,228,139,125,221,244,251,233,215,232,239,202,195,236,187,205,235,234,249,162,150,122,99,61,64,219,244,246,238,213,205,238,247,222,214,228,229,224,227,224,237,200,198,170,174,232,188,196,187,234,188,107,179,164,165,182,167,172,197,217,183,205,196,228,178,171,98,74,236,237,251,181,62,98,115,92,49,7,77,124,177,237,251,166,91,202,212,222,200,180,197,209,199,196,151,159,249,192,86,95,174,244,224,194,216,169,106,165,238,238,236,233,233,227,214,229,247,183,188,241,231,234,235,238,252,181,96,98,94,127,72,37,27,66,128,113,65,28,43,44,36,40,53,106,139,136,70,83,113,105,139,95,69,68,42,33,24,19,22,26,55,68,38,24,19,13,24,63,96,94,87,57,60,107,105,72,25,63,123,122,122,129,127,113,108,96,38,72,110,84,43,39,95,100,83,44,29,35,33,29,37,29,39,15,88,250,250,249,249,172,152,157,160,174,152,143,143,137,145,158,160,171,168,102,35,26,29,27,27,31,23,17,26,25,26,24,24,27,20,25,24,23,22,27,26,22,24,21,25,23,23,26,23,21,24,58,71,51,33,18,21,29,54,63,57,64,63,50,30,20,24,32,45,65,77,81,87,81,74,69,63,64,75,80,76,78,66,64,55,51,45,54,120,122,117,102,60,96,118,84,57,35,93,124,113,120,85,66,67,105,155,122,79,56,47,51,107,214,231,237,244,235,237,236,237,236,236,236,236,238,238,240,238,233,234,233,233,233,233,232,233,232,231,230,231,233,233,230,231,229,230,231,232,230,227,230,227,228,226,225,229,225,227,226,226,227,228,229,225,224,116,3,0,4,8,10,9,10,10,12,11,10,12,199,203,197,200,201,198,199,198,201,202,201,203,203,202,204,199,201,203,203,204,203,205,205,204,204,205,204,203,206,202,205,205,205,207,204,206,204,205,206,206,205,205,202,206,202,202,204,203,203,202,205,203,205,203,205,204,203,205,202,204,202,201,204,202,204,207,206,204,204,202,202,203,202,204,204,203,202,203,203,203,208,206,207,205,205,207,205,208,206,207,208,207,210,209,208,210,209,205,208,211,208,208,208,209,209,208,211,211,209,209,211,211,211,210,211,210,208,210,211,209,208,209,212,211,210,211,212,212,211,212,213,212,213,213,213,213,215,214,214,214,213,216,212,211,213,214,214,214,214,213,213,212,215,216,211,214,214,212,215,212,212,212,210,214,211,210,211,210,210,208,211,212,211,211,212,214,212,214,214,211,213,214,213,212,211,212,212,211,213,214,212,212,212,211,212,213,212,214,214,214,212,211,213,212,213,215,214,215,215,214,214,213,214,214,214,214,213,215,214,214,214,215,213,214,217,212,215,217,216,216,216,217,217,217,216,215,218,219,221,220,220,219,221,222,222,222,221,223,222,223,223,222,221,222,222,223,223,223,223,222,223,222,222,223,222,218,208,205,224,231,217,210,225,232,226,217,225,231,231,231,232,233,230,232,232,230,230,232,230,230,231,230,233,232,233,233,232,236,236,236,235,236,234,234,235,235,234,235,235,233,234,233,232,234,235,236,234,235,235,237,238,244,249,249,246,234,211,180,166,170,194,226,174,89,127,212,191,212,224,132,129,221,237,238,202,184,222,235,196,196,229,184,213,236,235,248,175,169,141,113,60,65,219,244,251,238,198,214,251,251,222,211,226,227,223,226,221,236,203,191,158,147,207,185,190,166,178,163,141,189,162,185,212,190,189,215,214,192,211,196,230,174,186,104,73,235,227,247,176,56,89,118,106,56,27,49,56,146,230,251,144,83,198,212,227,191,144,170,208,157,129,141,168,243,158,88,98,147,237,213,205,241,181,101,171,246,237,235,232,230,229,216,229,239,175,203,239,231,237,231,245,252,219,127,87,103,131,101,62,23,35,93,128,114,67,49,49,49,76,108,131,136,78,28,93,107,116,121,53,21,15,17,20,20,11,19,59,92,96,77,43,21,21,23,68,114,122,131,122,103,108,100,74,27,64,116,94,88,88,116,132,104,89,42,34,70,100,81,79,103,93,64,33,34,32,34,34,29,37,41,39,161,234,249,250,234,166,149,155,149,163,174,191,163,139,153,152,155,170,150,82,19,26,27,29,27,26,22,23,26,23,24,26,22,23,24,26,23,23,24,23,26,25,25,24,22,25,22,24,30,19,33,58,66,48,34,20,31,62,67,71,60,46,42,39,34,22,16,17,35,56,44,33,38,48,61,71,77,69,67,67,63,61,67,83,89,91,79,96,134,124,117,107,70,97,110,88,56,34,89,133,131,93,56,39,29,57,113,122,90,54,55,54,89,207,231,234,240,233,238,231,233,236,236,237,235,236,236,238,235,235,235,234,236,233,232,232,232,233,231,230,232,231,231,230,231,227,228,227,227,230,228,228,226,226,226,224,223,224,225,223,224,225,222,226,225,222,118,3,1,6,9,9,8,10,10,11,12,12,12,195,199,199,198,200,197,198,199,200,199,200,200,199,200,201,201,205,200,200,202,204,205,203,203,205,203,203,202,203,204,201,205,202,205,205,203,205,201,202,202,203,206,203,204,204,201,202,203,201,203,206,203,204,204,204,202,202,203,201,201,203,202,204,204,203,205,205,204,204,205,205,202,205,206,203,203,203,202,203,205,205,206,206,205,205,204,204,206,206,205,206,205,206,206,206,207,207,206,207,208,208,207,208,208,210,209,209,210,209,207,209,208,208,210,211,209,208,210,210,209,211,210,211,209,211,212,209,207,210,211,211,212,214,212,211,212,213,212,211,213,212,212,214,214,214,212,210,209,210,211,209,210,211,211,211,213,213,211,210,212,211,213,211,210,211,210,208,208,212,209,210,210,208,210,211,214,212,210,212,211,211,212,211,210,209,211,213,211,211,210,211,212,208,210,214,211,212,212,212,212,211,214,211,211,213,213,213,213,214,214,212,212,215,214,214,213,213,211,213,214,211,213,216,215,216,217,216,217,215,215,215,219,217,215,217,217,217,217,220,219,220,221,219,221,219,221,221,218,220,218,220,221,221,221,221,221,219,221,220,219,220,220,221,223,221,216,206,209,226,224,211,211,226,231,222,216,227,229,229,229,231,230,229,231,229,231,231,229,230,227,230,228,228,230,230,232,232,232,233,233,234,231,234,235,231,235,234,234,233,232,233,232,234,234,232,234,237,237,243,247,248,246,239,215,184,166,170,184,200,218,236,252,205,115,167,238,227,242,226,116,110,201,219,222,196,191,222,236,193,196,230,186,220,245,251,252,191,181,144,117,63,60,217,244,249,192,142,171,231,249,221,210,224,228,226,228,219,236,209,214,178,137,189,180,174,154,162,164,191,188,171,204,213,187,164,217,207,188,207,194,226,165,187,90,75,231,226,247,167,54,81,110,98,59,33,40,12,89,199,246,148,81,190,206,238,192,103,174,177,95,143,168,213,195,80,100,123,119,205,209,219,235,172,108,169,235,232,238,230,230,233,215,230,228,170,219,239,233,236,241,251,251,212,81,57,87,136,135,95,45,22,50,100,133,147,138,112,122,139,153,140,82,27,17,105,109,113,116,37,21,21,20,19,17,26,42,93,125,132,128,80,39,31,25,66,101,91,93,101,125,117,93,80,24,54,104,92,45,10,39,75,101,95,60,26,33,91,122,118,118,83,50,44,47,59,57,65,64,72,50,87,239,251,245,250,246,162,153,148,144,164,206,231,170,138,146,147,166,171,134,73,43,43,41,41,38,35,32,36,37,33,34,29,25,27,27,29,27,24,24,22,26,25,24,22,23,23,22,24,25,22,36,66,64,43,24,27,56,65,55,54,68,63,37,22,23,21,21,53,85,90,66,45,51,42,21,23,36,48,57,68,73,70,66,71,72,79,95,104,106,101,118,112,74,87,116,94,67,36,63,122,141,121,65,64,81,70,105,118,99,60,48,54,85,206,232,233,242,234,239,230,232,236,236,236,236,237,235,236,236,236,236,235,236,233,233,233,233,233,231,232,231,231,230,231,231,228,229,227,228,229,227,227,225,226,226,225,226,224,224,224,222,224,225,224,224,224,118,3,1,5,8,9,9,10,10,11,12,12,12,193,200,194,198,198,196,196,198,197,194,198,199,197,199,199,198,202,198,199,202,199,201,200,201,201,202,203,202,203,201,205,203,204,205,203,203,200,204,203,203,205,202,203,207,203,204,202,205,206,202,203,203,203,205,206,206,203,204,204,206,204,201,205,203,204,206,204,206,205,203,204,205,204,203,204,204,202,201,204,204,205,202,204,205,202,204,204,207,205,203,205,205,204,203,204,205,202,203,206,206,208,208,208,208,207,206,207,207,206,205,205,207,210,208,208,208,207,208,209,208,208,208,208,210,210,210,212,210,209,210,209,210,210,208,207,210,212,210,210,210,211,211,211,214,212,212,212,207,208,210,212,207,206,211,207,210,211,209,211,210,210,210,209,208,209,208,207,207,209,208,211,209,209,208,208,210,206,207,208,208,210,211,210,211,210,212,209,210,213,210,209,211,211,211,210,212,213,210,213,212,209,210,211,211,211,212,212,211,213,213,212,213,212,210,213,213,210,210,211,212,212,213,212,212,215,213,213,213,215,216,214,215,216,216,218,216,217,217,217,217,217,218,217,215,217,218,216,218,218,218,218,219,220,218,218,218,220,218,218,216,218,217,217,222,216,210,202,212,228,216,203,218,228,225,217,220,229,229,229,228,228,226,226,227,226,229,227,227,226,226,227,225,229,227,230,231,230,229,227,229,229,232,231,231,232,234,231,232,231,229,230,229,232,234,237,240,243,247,245,237,216,191,170,168,181,197,217,230,239,242,248,252,188,83,166,247,219,247,186,95,129,217,242,247,226,216,240,241,194,205,235,192,227,249,253,253,187,160,125,97,55,58,210,243,246,127,36,65,164,238,224,210,226,225,226,227,222,232,206,191,170,167,207,194,181,205,167,173,213,179,184,203,205,141,114,213,190,183,199,195,224,162,188,83,89,235,228,247,168,56,74,110,97,56,33,42,18,48,181,243,161,126,196,214,244,189,152,217,196,141,173,196,200,158,86,177,196,116,163,178,218,221,143,113,181,230,232,236,233,230,235,224,235,211,173,229,237,232,240,250,250,248,122,34,29,65,118,137,138,80,47,32,44,98,141,160,142,131,129,146,135,66,13,41,128,96,125,99,34,28,12,23,30,27,38,80,114,105,100,120,114,77,36,25,70,86,68,38,36,66,89,92,74,25,43,97,94,67,35,16,61,113,135,104,71,72,108,128,123,131,119,107,113,127,133,139,125,131,128,68,148,234,250,249,252,234,159,150,152,177,212,232,202,140,131,139,158,191,177,125,84,79,72,69,74,69,66,57,57,64,59,56,57,53,47,50,45,41,39,39,35,31,31,32,26,25,27,25,24,18,22,50,69,60,39,26,36,60,67,49,56,76,69,46,24,26,33,50,86,124,129,95,76,58,32,26,21,38,69,61,42,36,38,52,67,84,94,86,83,90,108,117,104,72,79,123,122,81,46,38,74,136,149,150,141,103,88,119,122,92,61,47,50,98,211,233,234,245,236,235,233,235,235,234,236,235,235,235,234,234,235,235,235,233,232,232,231,230,232,231,231,232,231,228,227,229,228,228,227,226,227,225,227,229,226,224,225,226,225,225,224,222,223,224,228,225,222,117,5,1,4,8,10,9,10,10,11,11,11,11,197,198,197,195,199,197,197,199,198,195,197,201,198,199,200,197,200,198,200,200,198,201,201,200,200,201,204,202,205,203,201,203,199,201,201,202,203,202,206,201,203,203,201,200,205,206,202,205,203,203,203,203,205,205,206,204,206,205,203,203,207,203,202,203,204,205,204,205,204,201,203,204,206,203,202,205,203,201,202,203,202,201,204,203,202,205,206,202,203,203,203,203,203,205,204,204,205,202,204,204,204,205,207,207,206,206,206,206,208,204,206,206,204,205,205,205,206,207,206,207,207,205,208,207,207,208,207,208,209,208,207,209,209,207,208,210,209,207,208,209,208,208,209,209,208,210,210,212,208,208,208,208,210,208,209,208,209,208,206,211,206,206,208,206,207,210,210,207,209,208,209,210,208,208,207,208,209,206,208,207,208,210,208,211,211,210,210,209,211,210,210,212,209,211,211,212,212,211,211,212,211,211,210,211,213,210,212,211,211,213,214,214,213,210,210,212,212,212,211,212,211,213,213,214,215,213,214,215,213,213,215,215,216,216,218,217,218,216,216,214,217,219,215,217,216,216,216,218,220,217,215,216,216,218,221,217,217,217,218,216,216,217,217,217,214,207,201,218,226,208,208,220,226,220,213,223,230,227,228,227,226,227,227,227,226,228,225,227,227,227,229,226,228,227,229,229,228,229,230,231,230,229,232,231,231,232,229,228,230,231,232,233,235,239,243,243,236,220,195,173,167,178,199,214,230,239,242,244,239,236,238,245,181,79,164,239,168,163,142,102,155,243,251,251,238,230,248,246,202,217,245,199,197,194,194,193,142,137,122,123,92,101,207,240,190,43,2,5,109,237,226,211,223,223,227,223,224,230,201,171,164,173,205,216,202,222,175,178,217,141,156,187,181,156,150,231,187,192,205,223,242,188,205,82,114,243,237,251,167,67,82,120,105,57,37,49,15,73,200,247,190,137,205,221,249,200,167,249,222,173,165,142,224,150,98,226,235,145,113,166,251,192,94,100,202,243,236,239,231,231,235,229,236,192,179,236,232,237,250,250,251,144,33,2,19,30,76,118,142,139,89,54,41,27,88,143,112,58,46,122,129,73,20,72,135,103,137,97,27,27,19,24,38,34,67,101,113,77,54,103,113,103,63,28,63,92,76,48,13,27,68,101,103,68,113,146,150,139,134,140,139,147,162,162,146,145,131,123,117,117,123,123,118,125,127,130,130,125,100,79,200,251,251,251,252,235,159,152,165,214,252,242,165,117,129,139,174,217,177,113,89,84,66,58,58,57,57,54,59,55,57,62,62,63,66,65,72,70,64,66,59,63,62,54,53,50,49,39,35,35,43,66,72,63,44,19,33,68,79,74,72,71,60,43,24,48,71,58,57,79,88,71,46,30,25,22,24,62,87,74,44,19,21,15,23,33,43,59,81,90,107,113,100,80,66,104,146,116,67,52,39,56,84,82,72,64,89,133,113,83,51,42,60,111,218,235,236,244,234,235,233,236,235,234,233,232,234,236,232,232,233,232,231,231,232,232,232,232,233,232,232,231,232,231,230,229,227,229,228,230,230,225,226,227,225,224,224,226,225,224,225,224,224,227,225,226,226,116,5,1,4,8,10,9,10,10,11,11,11,11,193,196,194,196,197,195,195,198,198,198,197,197,198,199,196,195,199,198,200,200,199,199,199,201,201,199,199,201,201,202,201,200,201,200,203,203,200,203,200,201,203,201,204,201,199,205,202,203,202,201,203,201,203,206,204,204,203,203,204,203,205,205,204,204,204,205,205,202,204,204,203,207,203,202,202,201,203,202,203,202,203,201,201,204,201,201,203,204,205,203,204,205,206,205,204,206,202,206,205,203,205,202,205,204,206,205,204,206,208,206,204,206,203,204,205,207,204,203,205,206,205,204,205,208,208,206,208,207,206,208,206,207,208,207,207,207,207,206,208,209,207,208,207,208,207,208,210,208,208,208,209,207,210,211,207,207,208,207,207,207,207,207,206,208,208,208,207,209,208,207,210,206,208,208,208,209,209,210,208,208,207,208,208,210,208,211,211,208,211,211,210,208,211,210,210,211,212,210,212,210,210,214,213,212,211,209,211,211,213,212,211,211,212,212,212,210,208,212,212,211,212,212,213,214,214,213,213,215,214,212,212,216,215,213,216,216,214,212,215,215,215,215,215,215,217,217,214,215,214,214,215,216,217,217,217,214,217,217,217,216,217,216,218,217,208,203,201,219,221,203,212,223,223,214,216,226,225,225,224,225,226,225,224,224,227,226,225,226,227,225,227,226,225,226,227,225,227,228,230,229,227,230,227,229,229,226,229,228,231,233,239,241,239,235,219,199,179,167,177,192,210,227,238,244,241,239,238,232,230,228,235,236,182,81,159,222,162,153,113,100,139,207,214,220,183,186,227,225,190,214,241,174,113,97,107,131,142,156,152,163,154,141,162,167,144,48,1,3,113,237,236,217,231,229,230,224,227,220,213,191,187,186,183,199,184,200,148,151,163,110,164,196,201,199,199,250,212,227,235,242,253,206,188,62,103,242,252,252,185,81,81,121,105,57,36,48,23,75,229,246,212,159,212,236,247,186,184,247,235,163,103,187,248,113,68,208,249,181,129,163,252,190,69,130,227,243,243,236,228,230,233,234,236,171,186,239,229,250,250,250,170,41,1,8,12,38,48,70,118,129,153,105,66,41,31,107,104,55,17,33,83,78,33,96,125,102,141,75,26,31,17,30,29,42,91,100,97,80,72,76,78,101,95,76,100,111,112,86,83,100,120,157,156,160,173,163,162,155,139,150,143,121,113,118,108,106,105,99,108,117,116,105,109,121,117,116,103,98,59,83,220,252,250,251,253,230,160,168,199,234,234,199,136,121,130,143,195,216,141,75,65,51,65,59,65,61,59,64,64,69,61,59,63,59,57,53,54,56,57,56,57,60,64,66,70,71,71,65,68,67,69,81,81,71,52,39,35,63,91,91,84,63,43,31,21,60,90,74,55,70,87,62,32,24,23,21,39,81,91,65,41,24,19,14,15,19,23,19,29,35,61,97,101,86,55,60,109,146,113,70,58,49,50,48,50,78,111,130,90,64,43,43,69,141,238,241,240,235,231,239,232,232,232,231,231,232,234,231,230,233,232,230,232,231,230,231,229,231,231,230,230,229,229,229,227,227,227,226,226,225,224,227,225,223,224,224,225,224,223,225,224,225,226,224,227,224,224,118,3,0,5,9,9,9,12,10,11,12,12,12,194,196,195,193,194,194,194,193,197,196,196,198,194,198,199,196,200,195,195,199,195,200,198,199,200,198,200,200,201,199,201,201,201,200,200,202,202,202,202,202,203,202,204,203,204,206,202,206,204,203,204,205,206,202,203,203,206,205,203,204,206,203,205,206,203,204,204,202,205,207,206,202,203,204,203,204,201,202,201,200,204,203,202,202,203,202,201,202,204,204,202,203,205,205,202,204,204,204,205,205,203,201,206,203,203,203,203,205,206,204,206,206,206,204,204,204,205,204,203,206,205,206,210,206,207,208,208,208,206,208,206,207,207,206,207,206,208,208,207,206,206,208,207,209,210,208,208,208,209,208,210,208,208,207,207,206,206,208,206,208,207,206,209,207,207,209,208,209,210,206,208,207,208,209,207,208,208,206,208,208,206,213,211,210,213,209,211,211,209,211,212,211,209,210,210,210,208,211,212,208,212,214,211,212,211,213,213,210,212,210,210,210,212,212,211,211,212,212,211,213,212,213,210,211,212,212,210,210,212,213,213,212,212,211,214,214,214,217,214,214,214,214,214,213,214,215,215,214,213,215,214,214,213,214,216,214,214,215,216,214,214,214,217,214,209,202,206,223,208,203,219,224,220,212,218,225,224,223,226,224,224,223,224,225,223,227,223,224,224,224,225,224,227,226,226,229,226,224,227,226,229,228,228,227,228,229,232,237,239,239,234,221,202,182,174,181,193,210,223,236,241,244,243,239,236,230,229,228,225,225,232,230,188,84,149,234,197,179,124,102,118,141,141,146,127,125,157,160,135,174,188,122,99,98,114,133,148,162,164,176,166,153,154,154,145,93,51,61,147,239,241,225,241,234,230,223,228,217,219,175,179,208,171,182,154,167,147,127,185,153,210,249,241,236,214,249,219,220,181,153,129,90,98,12,59,196,238,249,150,60,73,125,114,66,39,50,28,81,237,246,246,160,193,240,239,189,201,246,198,139,127,230,246,112,78,217,252,226,123,97,184,174,109,165,237,242,239,236,231,230,227,237,228,153,201,237,237,250,251,203,62,2,9,6,31,43,39,36,54,94,125,142,113,62,38,62,111,97,42,23,28,48,38,110,119,99,128,56,27,29,17,26,32,74,107,96,99,65,51,59,74,122,130,142,145,144,153,166,168,160,142,137,139,128,139,128,123,116,116,110,100,88,69,67,64,69,59,56,58,59,56,49,44,49,49,41,34,46,11,69,237,250,251,251,252,221,162,188,232,252,246,155,119,130,131,152,217,206,103,37,14,16,24,29,29,25,33,32,38,44,46,50,46,51,48,50,59,57,54,61,60,58,61,56,60,57,55,59,57,60,64,67,66,76,76,68,63,70,88,82,69,54,39,38,28,57,99,112,113,100,79,51,33,28,26,24,51,90,86,59,36,29,24,19,22,24,24,21,19,18,55,92,103,93,64,32,52,109,136,131,109,96,82,81,101,127,139,112,72,44,35,60,95,201,246,243,239,231,234,235,232,231,230,233,233,233,232,232,230,230,232,230,230,231,230,228,230,229,227,227,227,227,229,229,228,225,225,226,227,225,224,224,225,225,223,224,223,224,224,226,225,224,226,228,225,224,225,118,3,0,5,8,8,9,12,10,11,11,12,12,194,194,194,193,193,189,193,193,192,194,195,194,194,196,196,196,194,195,196,194,197,197,196,199,197,197,199,199,196,198,198,197,201,198,198,199,196,201,200,199,200,198,201,200,201,200,200,200,201,202,200,201,199,201,202,200,202,200,200,201,201,203,201,201,201,201,203,201,202,200,201,203,203,200,199,202,201,198,200,202,201,200,203,201,199,200,200,200,200,200,201,201,200,201,201,201,201,205,203,203,203,201,203,202,200,201,204,203,203,202,202,203,203,206,202,203,204,203,204,205,205,205,205,205,204,204,204,206,203,204,207,206,207,204,206,206,207,203,203,206,205,206,207,205,205,205,206,205,202,207,208,205,206,203,206,206,205,206,206,204,205,208,206,206,207,206,207,208,207,208,206,205,208,206,207,205,204,208,206,206,209,207,210,212,209,211,210,206,210,212,208,207,210,207,208,210,209,210,208,210,208,206,209,208,209,211,211,210,210,208,212,211,210,211,213,211,207,210,210,208,210,209,210,210,210,210,212,210,210,212,210,214,214,213,212,212,215,212,214,214,213,214,212,210,213,216,213,212,214,214,212,212,214,213,212,212,212,211,212,213,214,213,214,211,203,201,210,220,202,207,222,222,216,208,220,222,222,223,222,222,224,222,223,223,225,223,220,224,223,223,225,222,223,226,224,225,226,224,224,224,223,227,228,229,234,234,235,229,219,203,186,175,179,191,205,219,232,240,242,241,237,233,229,227,225,227,223,225,224,226,230,223,190,80,149,233,220,178,91,78,107,147,153,149,137,122,152,134,111,143,153,116,106,103,95,103,112,125,139,152,139,141,148,144,133,103,100,108,141,211,219,204,234,242,237,228,227,211,222,151,162,206,170,208,162,199,226,178,234,210,238,249,228,217,145,126,103,89,60,20,14,18,54,32,29,94,129,134,71,42,52,99,104,63,54,62,27,59,184,226,155,66,130,178,167,174,164,128,88,61,125,228,208,76,86,185,206,188,137,38,76,139,112,208,238,231,243,232,232,231,223,241,223,159,211,243,249,250,225,114,11,5,8,18,42,44,27,19,37,51,89,102,112,96,46,36,90,127,96,56,46,38,33,122,124,105,110,45,26,22,19,37,67,116,129,115,114,94,99,113,117,133,130,134,135,126,133,131,137,141,126,122,112,113,107,85,89,76,59,51,49,44,37,34,30,41,48,35,24,29,28,24,32,39,36,42,53,55,15,121,251,251,252,252,244,166,162,209,244,234,199,131,122,130,133,173,226,172,74,24,15,25,27,30,22,12,22,27,42,49,45,30,17,20,30,29,27,39,39,33,36,41,43,45,50,53,56,59,57,57,55,55,57,59,59,65,66,70,69,72,76,79,85,85,72,81,107,117,124,94,50,38,31,29,24,36,72,96,81,53,31,26,27,27,31,27,30,27,25,21,57,95,104,99,66,41,32,83,130,141,144,154,159,143,144,139,96,66,42,38,48,66,174,241,249,241,235,236,233,230,230,231,233,233,233,232,233,233,233,233,233,231,231,230,228,229,230,229,229,229,228,229,228,227,225,226,226,226,229,229,228,228,227,226,226,226,226,226,226,228,226,225,227,225,228,226,222,117,4,1,4,8,10,8,10,10,11,12,11,11,191,194,194,195,194,192,194,194,195,191,192,195,193,194,196,193,194,193,195,197,195,197,194,197,196,196,198,198,198,200,199,195,201,200,199,200,197,198,198,198,199,198,200,198,197,199,198,200,200,199,199,198,199,200,201,200,199,200,201,200,202,200,199,202,200,200,202,202,202,200,201,200,200,202,201,201,200,202,199,200,201,197,199,200,200,198,200,200,199,200,202,200,201,201,201,200,200,204,199,202,202,200,204,202,202,206,203,203,204,205,203,201,203,202,203,202,203,202,203,203,203,203,203,203,204,205,205,204,202,204,204,204,205,204,206,206,203,203,205,203,203,205,204,204,204,204,204,204,205,206,206,204,203,204,206,207,203,203,205,203,203,203,205,205,203,206,206,204,205,203,206,207,206,208,206,206,207,206,207,207,208,208,203,206,205,206,209,207,210,208,207,209,210,206,207,211,208,208,211,210,208,207,211,210,210,210,209,211,209,207,208,208,211,208,208,208,207,208,209,210,210,210,210,209,210,210,211,211,210,213,210,212,212,212,213,212,211,212,208,211,212,212,211,209,212,211,212,211,212,211,210,212,211,212,212,214,212,210,212,214,213,212,215,210,202,198,215,214,199,212,220,217,207,211,220,222,221,218,222,220,222,221,220,222,220,224,221,221,225,220,224,223,222,222,221,225,223,222,227,225,229,232,232,233,232,217,203,188,174,177,188,202,217,229,235,237,236,235,234,229,227,225,225,225,223,225,223,224,223,225,227,223,198,78,133,212,209,156,51,47,106,166,179,182,160,157,179,159,132,165,170,117,112,85,63,62,72,90,91,94,90,95,106,109,93,87,84,76,84,173,184,173,222,232,237,229,222,205,227,149,157,217,165,204,190,236,252,198,203,142,122,117,77,71,26,2,14,23,81,129,131,167,127,98,133,143,165,177,105,42,35,87,95,49,40,44,36,25,107,160,139,66,83,144,151,145,118,65,61,84,121,173,130,84,112,158,186,181,129,49,39,76,117,225,245,227,238,235,234,233,226,246,213,158,229,248,251,246,119,36,2,3,18,34,47,29,19,18,31,56,75,87,95,94,73,40,42,89,133,122,93,68,76,154,124,122,106,29,47,62,105,142,141,153,134,123,125,112,124,125,129,135,134,146,145,136,133,123,101,90,69,56,51,45,42,30,29,36,46,53,30,36,41,17,32,59,66,52,34,41,37,28,32,42,51,50,87,69,20,186,251,251,252,241,164,113,165,229,252,241,152,112,129,128,134,198,229,134,47,18,14,27,30,33,21,18,30,34,51,63,51,34,19,40,42,35,48,49,39,26,13,27,31,21,25,23,30,25,28,31,36,40,48,52,61,67,63,59,59,53,56,61,91,120,118,117,98,105,100,73,63,57,44,40,36,70,100,96,78,47,32,29,27,30,34,33,30,31,31,28,56,87,111,105,75,41,53,123,134,118,100,101,122,128,100,69,57,39,42,33,45,160,245,245,249,230,234,231,230,228,230,232,231,233,234,232,232,232,231,231,232,231,230,230,231,231,231,230,228,228,229,229,229,228,226,228,229,229,227,228,228,226,226,225,226,228,227,227,227,228,226,228,227,225,228,228,224,117,3,0,4,8,10,9,10,10,11,12,12,12,193,196,193,192,192,192,193,193,193,196,192,191,194,192,194,193,194,195,194,193,195,197,194,198,197,196,199,196,198,198,198,197,198,198,199,200,197,199,198,198,198,196,198,199,199,197,200,199,198,200,200,200,200,199,198,197,198,199,201,200,200,202,201,200,201,200,201,202,200,200,200,200,201,200,200,200,200,200,201,201,199,198,200,198,200,199,198,202,202,198,199,200,201,201,200,203,200,201,201,202,202,200,203,202,203,200,203,201,201,206,205,205,202,201,202,201,202,203,201,203,204,202,201,203,204,204,203,205,203,201,206,202,203,204,202,204,204,203,204,202,203,203,203,203,205,205,203,206,202,205,205,203,205,205,206,204,205,204,204,203,203,203,201,205,204,204,205,206,204,205,206,208,208,205,209,206,205,207,205,207,208,205,206,206,204,206,210,206,208,210,207,207,207,206,207,208,206,208,209,210,210,210,210,211,211,208,207,210,210,209,206,207,208,207,210,209,208,210,209,208,208,208,208,208,208,208,210,208,208,210,208,211,208,207,208,210,210,209,210,209,206,209,211,210,211,212,211,209,211,211,211,211,213,212,211,211,211,211,211,210,210,214,207,204,200,202,216,203,200,218,218,211,207,216,220,216,220,218,220,217,220,220,218,218,222,221,220,222,221,224,222,222,224,223,223,224,225,225,230,232,229,230,218,203,190,179,181,186,200,213,225,232,233,235,234,232,229,227,226,223,223,225,225,225,222,226,223,227,220,226,223,223,200,83,120,199,239,172,64,87,129,171,172,171,156,163,189,164,152,180,158,113,119,88,60,69,59,59,51,59,61,55,57,61,69,82,81,53,60,157,155,125,159,186,219,225,219,207,237,155,187,242,146,133,127,184,172,67,56,19,0,12,9,59,66,58,74,77,117,143,183,159,136,151,142,189,234,238,126,52,57,120,111,48,41,47,29,75,207,247,199,114,172,210,173,196,163,149,182,168,217,189,130,164,199,206,203,181,138,66,59,54,112,243,242,233,233,232,234,231,229,245,195,157,240,249,252,156,26,3,2,11,29,44,32,18,21,26,46,53,88,106,89,100,89,57,38,35,76,122,153,149,148,165,110,119,128,128,161,163,165,147,133,125,113,131,145,139,144,130,121,108,97,90,70,56,47,35,32,24,25,30,26,29,45,32,22,47,77,81,60,67,66,29,36,63,76,63,34,40,43,33,39,42,53,29,62,69,84,230,251,251,250,191,138,130,190,245,251,192,127,118,131,129,152,218,199,103,22,10,15,23,32,40,29,24,28,21,47,55,43,32,29,57,49,39,58,56,33,21,33,50,41,23,22,29,24,19,20,22,24,39,43,35,25,33,35,39,52,52,59,62,69,80,83,83,87,89,97,98,89,90,83,74,103,127,129,114,77,46,30,25,29,32,34,30,27,32,33,30,61,91,106,106,77,44,83,133,94,66,35,49,103,93,67,47,39,50,25,55,185,242,249,249,233,233,229,229,229,229,230,229,231,231,232,233,231,229,230,231,230,230,231,230,230,229,230,227,226,229,228,227,227,227,228,228,227,227,227,225,225,226,227,226,227,228,227,226,226,228,226,225,229,227,227,224,222,118,4,1,5,8,9,9,12,10,11,12,11,12,193,194,190,193,189,191,191,190,193,193,193,193,193,191,193,191,195,194,194,196,193,195,194,196,196,196,196,197,197,195,198,195,199,196,195,199,196,199,198,198,198,195,198,198,195,198,196,196,198,197,199,199,199,198,198,198,195,198,200,196,200,198,198,201,196,200,200,199,199,197,201,198,199,200,199,198,198,202,197,199,199,198,198,197,200,199,199,199,199,198,200,201,199,198,200,202,202,201,200,202,202,200,200,200,200,200,200,201,199,200,200,200,201,201,202,200,201,200,203,201,200,203,201,202,202,203,203,201,203,203,204,203,201,203,204,203,202,202,203,205,203,203,203,201,204,204,203,205,203,205,205,201,205,203,200,204,204,203,202,203,204,204,206,205,205,202,204,206,206,206,205,206,205,206,206,203,206,205,206,205,205,207,205,209,206,207,207,206,208,207,208,207,208,207,206,206,206,206,208,207,205,206,209,207,208,208,208,208,207,208,208,207,210,206,205,206,207,208,204,204,206,207,208,207,206,206,208,210,205,207,208,208,207,208,208,208,208,208,207,210,207,210,211,210,210,211,212,210,212,211,211,212,208,210,209,208,212,212,210,210,209,210,207,204,197,208,216,198,208,219,215,206,208,218,216,217,217,217,218,214,220,216,217,220,218,221,218,219,222,218,221,219,223,225,224,232,230,230,230,217,202,190,178,177,188,199,213,223,228,234,233,232,229,226,224,223,223,222,224,223,221,223,225,226,222,226,223,222,219,227,217,223,196,65,121,212,252,217,136,139,151,171,159,152,146,168,191,158,130,149,141,116,132,107,87,84,57,53,48,63,65,55,49,53,66,90,73,47,73,150,131,76,90,132,197,229,214,211,239,170,194,252,134,45,20,23,27,9,40,66,78,117,137,178,180,189,191,143,134,130,144,141,99,95,84,102,146,172,107,37,41,109,117,57,57,57,33,47,146,181,131,71,121,144,120,146,141,145,156,122,127,117,83,136,123,118,149,129,150,136,79,51,134,243,246,232,230,231,234,232,230,245,182,188,248,248,188,49,2,12,5,28,42,36,24,21,27,45,56,54,83,114,102,88,99,87,59,46,39,43,64,75,109,141,95,137,157,133,157,139,149,141,129,137,130,127,111,76,55,43,39,31,28,37,44,32,21,27,19,18,27,38,41,49,78,55,30,42,81,96,56,64,82,63,57,75,77,49,25,29,46,55,56,49,51,48,53,50,140,243,251,251,241,186,161,153,214,250,224,147,113,123,139,131,175,237,164,83,70,57,55,55,50,49,45,35,34,29,29,60,56,52,56,47,33,30,51,51,38,33,48,54,29,22,27,27,26,19,24,31,46,65,64,45,26,16,29,32,31,41,35,36,40,51,62,76,84,87,81,76,79,83,94,103,110,114,105,104,92,66,54,50,41,39,34,31,33,29,35,30,60,96,103,105,83,47,41,58,57,46,55,91,95,75,49,44,50,28,116,239,249,249,250,236,232,233,233,227,227,228,229,229,230,231,230,230,231,231,230,232,230,230,231,230,231,229,229,231,229,229,229,227,227,227,228,227,227,227,227,226,226,228,227,228,227,228,229,226,227,229,222,225,228,226,228,226,224,118,4,1,5,8,9,9,12,10,11,12,11,11,189,192,193,192,194,191,188,190,190,191,189,189,194,189,191,191,192,193,191,193,193,194,193,193,195,197,197,194,194,195,197,195,196,196,198,196,194,199,195,196,198,198,196,195,195,193,196,195,195,197,195,195,198,196,197,198,199,200,199,199,197,197,199,198,198,198,198,199,196,196,198,199,199,200,199,196,198,196,200,199,196,198,201,198,198,198,196,198,196,197,198,198,197,198,199,199,198,200,200,199,200,199,200,199,201,200,201,200,198,203,200,199,200,201,199,201,201,202,200,199,201,201,200,201,201,199,200,202,200,201,203,202,205,203,202,202,204,202,204,203,199,206,203,201,203,200,204,204,201,201,200,201,203,202,201,202,205,204,203,203,205,204,205,207,205,203,203,206,205,206,206,205,207,204,206,205,205,205,204,203,203,205,207,206,207,208,206,205,205,205,202,205,207,205,207,207,205,205,206,206,204,207,207,205,206,206,206,207,204,207,207,207,205,205,205,204,206,204,207,206,204,206,205,206,205,207,209,204,206,206,205,206,206,208,206,205,206,206,207,206,208,209,207,208,208,207,208,206,209,208,210,209,209,208,207,210,209,209,209,207,211,212,205,207,202,220,218,202,214,217,208,205,211,214,215,215,213,213,215,215,217,217,217,218,217,217,219,217,218,219,221,224,226,230,230,228,218,204,194,181,177,185,195,209,220,226,229,230,231,229,225,222,222,222,219,222,223,220,226,223,223,223,221,223,220,223,219,222,218,227,216,222,178,51,93,194,251,208,143,152,167,176,164,161,142,146,173,138,105,132,139,121,128,95,83,71,50,60,49,47,54,56,61,66,70,85,75,48,57,113,111,107,148,180,217,233,207,218,241,159,191,239,141,95,115,105,76,97,198,216,223,247,237,247,225,252,232,215,229,225,227,99,96,160,139,173,184,213,134,45,39,89,113,54,43,48,28,45,114,153,141,81,72,99,124,158,133,134,115,86,120,51,43,131,132,122,138,131,160,160,111,45,119,234,237,240,229,228,235,224,239,237,183,220,250,181,79,7,5,7,16,43,42,25,25,26,46,50,59,53,29,76,106,94,92,102,103,75,57,48,41,21,57,112,101,132,139,133,139,127,120,87,77,53,34,39,49,36,33,34,30,17,29,58,62,50,31,35,33,18,30,46,40,45,72,81,62,72,93,79,48,37,77,110,143,131,91,81,83,100,103,113,123,114,120,118,131,116,100,206,243,243,244,207,154,125,197,251,193,133,120,131,133,139,201,226,127,72,124,136,156,64,91,69,63,67,61,57,58,57,71,73,57,50,43,42,55,67,63,57,46,36,22,22,27,25,29,32,27,24,43,65,62,42,25,32,57,48,49,59,56,36,18,19,30,32,31,45,48,61,78,81,85,80,79,74,77,87,93,91,91,84,73,68,63,51,46,39,34,29,59,93,101,110,80,51,35,30,46,66,101,101,83,54,44,54,66,186,248,248,251,251,238,234,234,230,226,229,228,230,230,228,232,233,233,231,229,229,231,231,228,229,230,228,231,231,231,230,229,228,225,228,228,227,228,228,228,229,229,229,229,227,229,227,228,227,229,229,226,229,229,228,228,226,228,227,225,117,4,0,5,10,10,8,10,10,11,12,12,12,189,191,193,193,192,191,190,190,192,191,191,192,192,193,195,191,193,189,189,193,192,194,193,192,191,193,192,192,193,193,195,194,195,195,196,196,193,194,195,198,196,195,196,195,196,198,194,194,195,194,196,195,195,196,198,197,197,199,197,196,198,195,196,200,197,199,198,198,198,196,200,198,196,194,196,199,198,200,198,198,198,198,200,196,197,198,196,197,196,196,198,198,198,197,198,199,200,199,198,200,201,200,201,198,199,201,200,201,201,203,200,200,201,199,201,200,203,199,199,202,199,200,197,199,200,199,200,200,202,200,200,201,200,202,201,204,205,203,204,205,204,201,203,202,203,199,202,203,200,205,203,203,204,202,204,203,202,204,204,204,203,203,203,203,205,202,203,204,203,204,205,205,204,206,206,204,205,203,205,205,205,207,204,205,205,205,205,205,207,203,204,205,205,204,203,206,205,205,203,205,205,203,206,204,205,205,204,208,206,206,205,203,206,203,204,204,204,205,204,203,203,203,203,203,204,206,204,205,205,205,205,205,205,206,207,205,207,207,207,208,205,206,207,207,208,209,207,208,208,208,208,205,207,208,208,207,209,208,207,208,210,206,214,212,217,244,219,206,216,214,208,205,213,213,213,216,213,214,216,213,218,215,217,219,217,220,217,221,224,225,228,225,227,217,200,194,180,177,184,191,205,218,225,227,230,229,228,225,223,225,223,224,223,221,218,222,224,222,222,223,224,223,222,224,223,221,221,222,222,227,213,217,168,42,78,173,241,210,147,166,170,183,178,176,162,159,165,132,135,163,141,117,77,20,26,34,29,33,29,33,31,36,32,46,65,66,63,37,33,77,105,166,238,250,244,226,205,224,240,168,191,234,171,203,238,203,185,186,249,247,247,251,236,244,236,247,213,200,192,227,181,105,131,177,188,227,244,252,193,80,79,108,115,52,33,44,25,81,214,247,227,146,131,196,251,251,201,160,151,181,221,103,71,222,237,242,224,186,223,191,163,69,94,216,224,238,229,225,236,225,242,231,185,249,220,97,27,1,11,11,39,47,22,23,26,39,51,61,54,12,79,111,108,94,83,111,107,122,99,70,66,32,83,119,103,139,121,66,50,34,29,33,34,24,16,44,68,60,49,60,53,30,21,55,84,53,24,34,43,46,49,52,48,50,68,111,156,171,170,142,137,153,155,153,162,164,145,143,137,139,125,110,93,77,71,62,63,56,52,22,115,217,250,234,144,87,174,245,167,128,128,133,136,149,229,207,89,51,35,29,43,42,41,38,39,42,43,44,48,50,49,59,63,65,69,69,69,69,73,65,58,56,51,48,42,40,47,36,31,27,32,56,54,44,46,59,60,47,55,61,51,36,25,44,50,30,34,33,30,29,26,32,42,52,66,74,84,85,81,79,80,86,100,111,101,93,79,63,61,50,71,96,99,115,85,57,58,81,116,133,118,81,54,38,45,93,221,245,250,250,248,239,233,234,231,226,227,226,230,230,230,230,229,231,232,231,231,229,228,231,229,228,228,228,228,229,227,227,230,229,228,229,229,230,228,229,229,228,230,231,229,229,231,232,230,229,229,230,229,229,227,230,230,227,230,230,226,117,3,0,4,8,9,9,10,9,11,11,11,11,190,193,194,192,195,191,190,191,191,193,192,190,192,191,192,191,193,192,190,193,194,191,191,193,193,192,193,194,194,194,196,195,193,194,197,195,192,194,194,194,195,194,194,195,195,194,193,196,194,195,199,194,198,197,196,195,195,196,194,198,194,196,198,193,199,196,193,194,196,195,196,195,194,195,196,198,197,198,196,198,196,198,197,195,200,196,196,196,196,197,195,198,198,196,195,198,197,198,199,198,200,198,199,198,197,197,198,197,198,199,200,199,198,201,198,200,199,198,198,199,199,200,200,201,201,201,202,199,199,200,200,199,201,201,203,202,201,200,202,202,201,204,200,200,203,201,200,200,202,202,202,203,201,201,203,200,201,202,201,203,202,203,201,203,205,204,204,203,202,205,205,203,204,202,206,203,204,203,206,206,205,207,205,204,202,202,203,205,205,205,203,204,204,204,205,205,202,203,205,203,205,203,203,202,204,204,202,202,202,206,203,203,203,202,204,202,203,203,203,201,202,201,202,203,202,206,205,203,206,204,203,203,202,204,204,206,203,203,205,205,203,206,205,203,206,204,207,206,206,206,205,206,205,205,208,208,204,206,207,205,207,204,212,201,137,184,190,202,215,212,202,206,211,212,213,214,213,213,214,214,215,215,213,217,220,220,223,224,226,226,218,205,195,182,175,180,190,201,214,222,223,228,227,223,224,220,220,220,221,221,222,222,220,223,223,222,223,220,222,223,224,221,222,223,222,221,222,222,222,226,214,218,149,51,103,196,252,227,156,161,175,184,172,168,158,162,187,160,165,184,149,64,4,1,17,32,21,22,25,26,22,22,27,53,58,63,41,44,73,128,167,196,234,234,234,217,200,227,233,159,208,239,152,210,243,201,194,217,249,230,234,221,213,175,189,186,140,145,148,231,160,88,150,171,171,215,224,251,189,72,68,94,121,54,32,35,14,84,227,246,242,164,128,234,252,252,178,174,187,213,222,118,163,249,250,251,212,204,240,228,214,85,91,206,212,230,232,222,236,224,249,229,195,245,119,20,15,1,22,32,46,29,22,26,36,53,52,61,19,85,219,178,122,89,67,89,94,103,109,107,108,99,134,128,123,124,52,23,19,17,14,29,69,45,18,46,78,85,59,61,74,55,44,74,83,48,42,46,66,103,140,157,165,190,121,180,200,198,187,169,147,131,117,88,65,53,43,45,39,37,36,29,29,29,25,23,19,24,21,27,15,109,158,203,143,91,185,234,150,130,126,140,140,174,234,169,61,21,10,10,20,14,18,17,15,21,19,24,18,22,27,29,27,34,36,36,44,42,46,51,57,60,64,63,63,69,69,61,54,54,47,56,73,61,59,57,38,30,48,55,39,33,53,62,54,36,42,39,34,31,19,22,33,46,53,50,42,46,61,68,75,74,71,74,83,97,101,100,97,89,94,103,100,116,96,108,155,165,164,116,71,48,37,41,100,216,249,249,250,239,237,233,233,228,225,226,228,231,230,228,229,231,228,228,228,229,230,229,230,231,228,228,229,228,228,228,230,228,229,230,229,229,229,228,230,229,230,233,231,229,231,231,230,229,230,233,229,228,227,231,228,226,227,229,231,229,227,118,4,1,5,8,9,9,10,10,11,12,11,11,193,190,191,190,192,190,190,194,192,192,192,191,192,192,192,192,194,191,190,192,190,190,192,194,195,196,195,196,195,194,194,193,196,193,193,194,191,193,192,191,195,192,193,197,193,194,194,196,194,195,198,193,193,193,195,197,195,197,196,196,196,194,196,196,193,194,196,197,196,194,195,194,194,197,197,197,197,197,195,196,197,198,198,196,196,196,195,194,196,197,198,198,198,199,198,198,196,198,198,197,196,196,198,199,200,198,197,198,198,198,199,198,198,198,197,199,198,197,201,199,199,203,200,201,201,200,200,198,198,200,202,203,200,202,200,197,199,198,201,203,200,201,200,200,202,199,201,199,199,201,199,202,201,200,200,202,202,200,203,201,201,201,200,204,204,203,203,205,205,203,204,205,204,204,203,204,205,203,208,207,204,204,202,203,202,201,203,202,204,202,203,203,200,202,203,205,201,203,200,203,204,202,202,200,203,205,201,201,200,200,204,201,201,200,200,203,202,201,201,201,202,202,202,201,202,202,202,203,201,204,203,202,204,203,205,203,202,204,203,205,204,204,205,204,205,204,205,206,207,206,206,204,205,203,206,207,205,208,207,206,208,197,209,129,45,86,156,210,215,204,202,206,213,212,211,210,207,210,212,211,214,215,217,220,223,223,222,214,201,192,184,174,181,189,200,215,220,225,226,224,225,223,221,220,217,215,219,218,221,221,219,222,221,222,222,222,221,221,222,220,220,222,222,222,223,221,224,220,223,222,215,207,146,55,119,217,252,230,158,159,155,170,165,162,154,175,200,164,171,183,130,62,11,2,24,38,30,28,33,25,29,35,33,54,59,53,30,64,124,199,239,214,229,235,230,214,204,229,227,158,196,211,166,206,199,158,152,212,230,197,185,171,155,141,184,170,167,174,198,252,116,112,181,164,167,206,224,240,182,66,49,71,116,73,32,40,13,61,222,239,216,154,107,213,251,163,117,173,206,189,164,149,213,249,238,242,171,184,228,200,221,127,76,180,211,216,241,231,237,228,252,228,187,162,28,2,16,14,42,45,36,17,24,39,52,54,63,23,67,208,252,203,145,110,57,69,65,82,71,80,89,91,142,132,136,120,33,15,19,19,17,35,86,78,42,41,85,83,41,29,51,92,126,146,144,151,171,190,200,202,200,191,179,168,142,123,102,72,59,40,31,25,16,22,22,16,18,22,15,21,20,22,18,17,23,15,21,20,23,14,27,10,48,144,136,112,205,207,135,139,133,142,147,201,237,128,40,23,8,9,14,13,17,14,15,15,14,17,18,14,17,19,14,16,15,21,20,14,19,21,24,28,28,35,37,46,46,49,59,56,57,65,67,70,71,61,52,49,57,69,64,67,62,46,29,26,39,35,31,28,39,38,53,78,76,66,41,24,24,34,36,46,61,70,77,89,88,83,86,95,106,96,94,108,97,95,109,95,66,55,42,45,61,73,179,242,249,249,237,236,229,231,231,226,226,231,231,229,230,229,230,233,230,229,229,228,229,226,228,231,229,230,228,228,229,228,231,228,230,231,229,231,231,231,232,232,231,231,231,231,230,231,230,230,229,231,230,229,229,229,229,229,229,228,230,229,226,118,3,1,6,9,9,9,10,10,11,12,11,11,191,193,193,192,194,193,188,192,190,190,191,191,192,188,190,188,188,191,193,189,192,193,193,193,191,196,193,191,193,193,194,191,188,192,194,192,191,192,193,191,193,193,193,193,192,196,192,193,193,191,194,193,194,192,191,193,193,193,193,196,193,195,195,192,195,193,194,196,195,193,194,194,194,194,195,196,193,198,198,196,196,197,196,196,198,195,198,196,195,198,198,195,196,194,196,198,193,197,196,195,195,194,198,196,196,198,198,198,198,197,197,196,196,199,196,200,200,199,199,198,200,198,198,199,198,200,199,198,198,200,200,200,203,199,200,201,200,199,202,200,199,199,200,200,202,202,199,203,199,196,201,198,200,201,200,201,200,202,202,200,200,202,203,201,202,200,200,200,202,203,202,201,201,200,202,204,202,203,205,203,202,201,201,202,203,200,201,200,200,201,200,203,203,200,200,201,199,200,200,199,202,200,202,202,200,200,200,200,200,199,200,199,200,201,200,199,199,200,200,199,199,201,203,201,201,200,200,200,201,202,202,203,199,202,201,200,201,202,205,205,205,205,205,204,206,204,205,205,204,204,202,204,203,203,204,206,204,206,206,208,207,203,191,100,23,77,181,214,214,203,201,211,210,214,210,207,211,212,213,216,220,221,219,222,212,199,192,181,177,180,186,196,210,216,223,227,223,225,220,220,219,219,220,217,219,219,218,218,220,222,222,221,220,220,221,219,220,221,221,222,223,222,221,222,223,221,224,220,224,218,212,207,126,61,135,217,252,229,155,157,160,173,163,166,184,179,179,136,156,162,163,156,110,123,130,100,29,21,47,33,33,29,43,80,67,67,74,99,173,231,234,208,219,230,220,206,209,234,223,148,179,186,163,203,180,106,107,187,160,160,153,162,186,189,231,212,217,197,232,217,58,136,191,161,165,199,232,240,183,68,43,68,125,79,32,50,18,66,233,241,214,156,103,210,200,122,129,196,198,150,146,164,236,245,236,240,168,198,207,187,224,189,90,116,196,206,229,227,229,240,253,220,127,66,1,15,21,22,53,33,20,22,27,50,51,66,29,42,186,245,249,198,143,115,56,84,84,80,71,66,50,67,138,137,151,100,25,22,14,22,22,19,53,82,93,103,96,79,77,110,141,160,182,195,191,190,189,164,137,107,71,47,33,32,69,20,22,22,15,19,16,13,17,15,19,22,24,30,29,23,24,23,16,23,23,23,22,21,22,20,18,22,23,71,94,148,228,197,135,141,139,151,159,224,216,96,34,18,7,16,16,17,17,16,16,16,17,20,20,15,18,21,15,16,19,19,19,16,15,15,16,16,13,15,18,17,21,18,27,31,28,42,42,50,55,60,66,65,71,81,81,74,65,49,40,39,42,41,41,44,43,33,59,83,77,65,36,36,61,57,54,64,66,49,37,44,54,75,92,93,87,97,98,102,88,59,41,39,41,45,63,87,84,107,213,244,248,246,235,235,235,229,230,231,230,229,230,231,231,230,230,229,231,229,228,228,228,230,228,228,231,230,229,229,232,232,231,231,231,232,231,233,236,234,234,234,233,234,232,232,234,231,232,233,234,235,229,233,231,229,232,231,232,232,230,230,227,117,4,0,4,8,10,9,10,10,11,11,11,11,193,193,194,190,192,192,188,193,190,190,190,192,191,189,190,190,190,191,193,192,193,192,191,191,191,193,191,192,193,193,193,192,192,189,191,192,191,192,193,190,191,191,193,193,191,192,191,193,191,192,193,195,195,192,192,192,193,193,193,194,192,193,194,195,193,192,194,193,194,194,193,193,191,192,191,195,193,193,196,195,196,194,195,194,195,197,197,195,196,195,194,193,195,195,195,195,196,196,193,195,196,196,195,196,198,196,194,196,198,195,198,196,198,199,197,200,198,198,199,198,196,199,198,198,198,198,201,200,199,199,198,199,199,200,203,202,201,200,201,200,199,200,201,202,201,199,201,199,201,201,198,201,197,200,200,199,200,199,201,200,200,201,201,201,199,199,199,200,200,200,201,201,200,202,202,202,206,202,200,202,200,202,201,202,200,199,204,200,202,200,201,202,202,200,200,203,199,202,199,199,200,200,203,199,200,201,199,201,198,198,200,196,198,198,198,201,200,199,199,199,201,201,200,199,199,199,201,201,199,202,200,198,201,201,201,202,200,203,201,201,204,202,204,202,202,205,204,202,205,201,201,203,202,201,200,204,205,205,204,208,206,202,212,143,115,186,217,223,211,199,208,208,213,211,212,214,214,220,219,220,219,213,201,190,184,176,179,186,194,208,217,220,224,223,222,220,217,217,216,215,214,215,215,216,219,219,221,220,220,222,220,221,222,219,220,222,221,222,221,222,223,223,219,221,223,218,222,217,225,218,215,192,119,66,143,223,250,225,172,178,172,188,170,169,152,124,156,133,145,166,183,203,196,223,213,148,51,40,57,29,33,29,53,109,83,120,127,132,217,250,240,198,212,229,212,206,212,236,224,163,191,185,193,170,152,145,122,183,182,196,193,211,215,232,230,213,205,166,201,132,49,178,198,159,158,197,241,234,174,60,39,50,104,75,29,46,19,66,236,246,201,162,144,220,206,167,171,179,191,160,117,175,220,221,231,224,174,198,207,195,235,240,130,68,142,158,198,223,235,250,253,202,63,10,3,22,28,39,46,21,22,21,49,55,61,34,31,160,247,247,248,206,155,107,41,61,53,76,72,76,55,70,132,122,141,81,20,22,18,44,50,61,70,94,139,165,167,141,141,141,139,121,104,93,63,51,37,29,24,15,16,16,15,17,17,14,21,20,13,17,22,19,22,29,34,40,43,46,48,43,37,27,22,29,18,17,27,19,24,17,23,27,23,23,50,177,250,196,142,146,143,156,185,243,188,67,31,14,15,20,15,24,20,15,19,22,18,15,20,18,17,15,17,18,19,19,17,17,16,17,16,16,15,14,13,15,15,14,16,15,17,18,16,21,22,25,36,33,39,49,58,61,60,69,78,84,78,72,69,54,57,41,45,72,64,53,50,76,83,66,79,83,75,55,26,27,44,38,32,40,55,78,98,105,90,63,48,55,68,93,121,128,95,130,240,244,248,241,234,237,231,231,232,232,231,232,231,230,231,231,230,231,231,229,228,228,231,231,230,229,229,232,231,232,232,231,231,232,233,233,233,235,236,234,234,236,234,236,236,235,234,235,234,236,235,234,232,233,232,229,230,229,230,232,231,231,226,117,4,1,4,8,9,9,10,10,11,11,11,11,190,193,194,189,189,190,188,193,190,189,191,190,191,188,191,191,189,191,193,190,193,191,191,191,189,194,192,193,192,192,191,191,193,191,192,192,190,191,193,190,191,191,190,191,190,194,191,192,193,191,193,191,190,193,193,194,193,194,193,193,193,192,193,191,193,194,191,191,192,194,193,193,194,191,191,193,191,191,191,189,193,194,191,194,194,193,195,193,194,193,194,194,194,195,195,195,194,195,193,192,194,193,194,195,193,196,195,194,195,193,197,195,195,198,196,198,196,196,200,196,197,198,198,200,199,200,198,199,200,198,199,196,199,198,197,198,197,200,200,200,202,202,200,201,201,202,202,200,200,199,201,199,203,200,199,200,199,199,199,199,200,199,199,199,200,200,198,199,198,196,200,199,197,200,201,202,200,200,201,201,201,200,201,199,200,198,201,200,199,200,199,202,200,199,200,200,199,201,200,199,200,200,199,198,198,199,198,197,198,199,199,198,200,196,196,198,198,198,196,198,199,200,199,199,200,198,199,201,198,199,199,198,200,201,201,202,199,199,201,199,199,200,203,200,199,201,204,203,203,202,201,202,201,204,203,202,202,205,204,207,202,207,222,194,204,233,231,217,204,202,210,217,218,228,230,225,223,213,208,200,190,185,176,174,182,193,206,212,216,219,220,220,218,214,217,213,212,216,211,214,215,215,216,217,217,217,219,220,220,220,221,221,220,221,220,220,219,222,221,220,221,220,220,221,221,219,219,218,224,213,210,189,117,79,157,229,247,203,166,168,169,201,179,146,113,79,138,145,158,167,173,184,163,177,175,136,48,46,68,31,40,28,69,97,56,158,193,185,234,234,234,198,224,236,214,213,221,242,229,170,170,213,235,179,199,166,151,215,221,239,233,222,213,225,179,183,158,166,234,133,112,236,220,191,178,212,251,239,162,66,39,38,87,78,24,31,25,46,218,246,210,160,174,248,239,174,157,200,240,163,106,194,246,230,248,221,170,209,208,212,246,252,199,94,76,132,213,239,252,252,250,121,19,8,4,25,42,42,24,16,22,45,56,59,44,22,140,241,251,251,251,241,196,122,30,38,34,54,54,59,40,83,135,112,136,66,32,67,100,155,169,173,171,150,143,136,107,71,39,34,33,24,23,19,12,18,19,14,19,12,12,18,15,17,26,22,20,22,22,21,22,24,31,40,47,43,40,51,53,47,41,28,28,20,18,25,18,22,22,15,31,24,26,24,32,177,234,180,146,144,147,161,196,222,134,34,22,15,21,22,27,20,23,23,19,26,21,16,14,19,19,15,15,17,18,24,22,19,19,18,22,19,16,18,14,19,20,17,17,16,24,21,16,19,16,17,17,16,22,19,23,27,34,39,48,68,79,88,101,102,89,80,75,92,99,89,80,76,61,48,68,77,61,38,45,73,65,40,23,24,19,45,83,97,101,83,110,127,129,142,146,123,91,181,247,247,243,239,235,235,231,231,233,233,231,231,232,232,231,231,231,231,231,230,229,230,230,232,229,228,230,228,230,231,231,231,231,233,232,233,236,234,233,234,234,234,233,235,232,232,233,232,233,231,233,235,229,233,231,230,231,228,230,230,231,231,229,118,2,1,5,9,9,9,10,10,11,12,12,12,190,193,193,190,191,186,190,191,189,190,190,191,188,192,188,190,191,189,191,188,193,192,191,193,192,191,192,193,191,192,191,191,195,189,190,191,191,189,190,191,191,192,190,192,191,189,192,190,191,194,191,193,191,189,195,190,191,191,191,192,191,194,191,191,194,193,194,191,193,193,190,194,192,193,193,193,192,191,193,192,195,194,194,192,194,194,193,194,195,196,197,193,196,194,193,195,193,193,193,194,192,193,192,194,195,193,193,194,194,192,195,195,195,195,194,196,195,197,196,196,197,196,197,196,195,198,196,195,196,196,199,199,198,198,200,199,200,198,201,201,200,203,200,203,202,200,202,199,200,198,198,200,199,202,200,200,199,199,198,198,200,199,201,201,199,199,199,200,200,199,199,199,199,201,200,199,200,199,198,199,200,199,199,199,199,199,199,200,199,200,198,198,203,199,198,199,198,201,202,200,199,199,198,198,198,199,198,198,198,198,197,195,199,196,194,195,196,196,198,195,196,198,194,199,198,196,198,198,198,199,197,198,200,199,200,199,198,199,200,200,200,198,200,201,200,202,201,201,203,200,202,201,201,200,202,203,203,204,205,208,198,209,209,191,216,230,218,209,197,207,224,228,248,249,244,224,198,193,185,179,178,183,190,200,212,216,220,218,219,218,215,215,214,213,213,212,212,214,213,214,214,214,217,217,216,220,223,222,222,222,223,222,222,223,222,224,222,224,225,223,226,226,228,229,227,229,229,226,229,219,213,185,145,128,182,239,238,184,143,121,157,205,174,166,149,92,130,162,175,176,178,162,120,128,142,133,55,53,71,41,43,32,52,46,31,192,251,243,253,253,251,230,249,252,236,241,249,252,238,119,125,224,252,210,214,184,167,229,250,250,236,193,215,230,188,245,207,222,252,174,208,250,252,229,196,235,252,252,190,65,46,53,100,85,18,35,26,51,200,245,237,147,198,248,252,186,185,250,245,131,120,246,252,252,252,227,211,225,215,234,251,251,245,149,83,137,230,246,253,253,146,31,6,9,12,38,44,25,18,13,39,55,61,47,23,121,240,247,251,251,252,252,207,112,46,50,45,55,55,44,44,111,136,120,146,141,154,167,162,175,156,128,93,55,36,25,33,18,15,16,19,22,14,18,14,18,21,19,27,24,28,27,24,23,24,28,22,22,30,26,30,37,45,50,42,47,48,39,46,42,36,24,14,17,23,20,22,17,23,22,32,27,29,27,25,184,235,160,148,143,154,171,200,190,82,11,15,17,22,27,25,27,21,28,31,29,30,20,19,18,17,16,19,19,16,18,22,27,23,22,21,17,20,22,21,27,31,31,27,31,32,29,32,25,21,18,16,16,15,15,16,18,18,20,19,25,34,35,42,57,69,82,102,119,125,123,101,77,57,45,68,72,61,56,78,89,62,34,28,31,18,56,96,106,102,90,121,132,142,132,133,104,103,237,248,243,245,234,238,234,230,236,234,232,231,232,232,233,233,232,233,233,233,232,231,231,229,229,229,228,228,228,230,230,231,231,232,231,230,232,235,233,233,232,235,233,232,234,233,234,233,234,232,231,229,232,231,229,231,229,230,230,231,233,231,231,229,118,3,1,5,9,9,9,10,10,11,12,12,12,190,193,192,190,193,192,190,193,191,190,193,191,189,188,193,190,188,192,193,190,191,190,191,191,190,192,189,187,189,191,191,190,188,190,189,190,190,191,191,189,189,189,188,190,191,191,188,187,193,190,191,190,191,190,187,192,189,191,191,191,191,191,193,191,192,190,189,193,190,194,191,190,193,189,191,194,189,190,194,194,195,193,193,195,193,194,198,194,196,194,195,194,191,196,195,193,193,196,195,196,194,194,195,193,194,196,195,193,196,197,195,195,197,194,194,195,196,195,198,195,194,196,195,198,197,197,196,197,199,198,198,200,200,199,200,199,200,202,201,200,199,200,199,200,199,199,199,198,197,200,201,199,200,198,201,198,197,198,199,200,198,198,199,199,199,199,198,199,200,199,201,201,199,200,200,199,198,199,201,200,201,200,199,199,201,200,200,198,198,198,196,199,198,198,199,198,198,199,199,199,198,196,199,198,198,198,196,197,196,196,195,196,198,196,198,198,195,197,197,196,198,198,196,196,198,198,198,196,196,199,197,198,196,196,200,200,199,198,199,198,200,199,200,200,200,200,199,200,203,200,200,202,201,201,202,204,203,205,206,204,197,207,193,180,208,211,213,204,199,217,224,214,177,164,170,174,179,178,184,196,204,214,221,223,222,222,221,219,218,220,217,218,218,218,223,219,221,224,223,226,225,229,229,230,232,236,237,237,240,239,241,241,241,243,239,242,244,243,245,244,249,250,251,252,250,252,252,252,252,243,231,211,191,151,189,248,250,199,165,105,139,213,202,236,215,155,177,172,196,193,170,139,101,124,139,137,64,52,69,38,52,39,46,24,20,188,247,247,252,252,250,237,250,250,245,251,252,252,230,99,92,207,237,186,192,177,162,195,214,181,202,179,224,230,206,249,201,233,214,150,208,243,227,182,139,200,246,235,160,76,48,57,96,91,28,29,39,39,172,230,140,83,152,211,151,131,169,201,160,81,112,202,222,205,198,193,186,181,155,163,200,235,218,117,66,108,186,198,200,147,47,14,10,10,32,45,33,18,14,34,53,54,56,18,93,222,252,252,252,252,250,217,171,101,50,70,60,78,72,57,69,125,137,127,167,143,134,100,52,42,29,27,22,18,19,16,15,19,20,13,18,26,29,33,31,33,34,33,41,45,41,44,31,32,24,24,34,28,30,26,38,46,43,48,62,53,67,79,49,30,29,18,21,24,19,22,25,21,22,28,35,30,40,23,51,217,212,144,146,141,165,180,185,163,80,33,22,28,32,22,28,29,26,22,33,34,28,28,18,18,15,17,15,15,22,17,16,18,27,26,30,27,22,26,24,32,36,39,50,37,37,37,35,33,24,21,17,19,18,21,16,14,15,14,17,15,17,17,15,24,30,33,44,54,63,75,89,103,97,90,102,111,103,83,77,60,41,34,27,32,29,74,106,108,112,86,105,125,135,126,119,87,131,240,246,241,243,233,233,233,236,229,232,234,233,236,236,235,235,235,234,234,232,231,232,232,231,231,232,232,233,233,233,233,236,235,231,233,234,232,234,235,233,234,234,235,236,236,233,235,234,232,232,231,232,230,228,230,230,230,230,230,231,230,235,232,226,118,4,1,3,8,10,9,10,10,11,12,11,11,192,195,194,193,194,191,193,193,193,193,191,196,192,191,191,191,192,192,194,191,193,189,190,191,189,190,192,192,189,193,188,188,193,190,192,190,192,191,193,190,190,195,189,191,193,190,192,192,191,191,189,192,191,191,194,191,192,190,189,192,190,193,191,192,194,189,192,190,189,194,190,193,192,188,190,193,192,193,193,191,193,190,194,195,193,194,194,193,193,194,193,192,196,193,194,196,194,196,195,194,193,196,193,195,196,195,195,195,196,196,196,197,196,195,195,195,196,196,196,198,198,199,200,199,200,200,199,203,202,200,202,200,203,202,204,202,200,201,199,200,198,200,199,198,199,196,200,197,200,202,201,203,199,200,199,199,200,202,198,199,200,198,202,200,200,199,199,200,200,198,199,200,198,199,198,198,201,200,200,201,201,199,199,200,199,199,196,198,200,200,197,198,199,198,199,199,197,199,198,198,198,198,198,196,198,198,197,198,197,196,198,197,198,197,199,195,197,199,198,197,198,198,195,197,198,197,199,196,195,199,195,197,197,196,198,198,200,198,198,198,198,201,199,200,199,199,200,202,203,200,202,201,205,205,205,205,205,206,208,204,201,206,188,200,217,220,222,216,217,225,208,103,23,11,85,179,208,223,229,236,242,246,245,243,241,240,241,240,240,240,244,245,244,247,246,248,251,251,252,252,251,251,252,252,252,252,252,252,252,252,252,252,252,252,252,252,253,253,253,253,252,252,252,252,250,250,252,252,252,252,245,218,210,147,159,244,253,225,205,114,141,211,225,245,242,204,190,170,176,168,158,120,101,124,133,139,91,93,89,81,97,83,95,80,84,181,187,171,185,169,160,141,153,154,158,183,174,180,149,74,78,127,127,111,104,106,93,75,84,70,105,97,129,110,95,120,95,124,99,66,96,100,116,105,32,42,73,78,102,66,46,51,78,88,37,31,33,46,50,47,48,9,35,43,29,24,49,71,50,53,61,32,12,6,56,107,122,128,80,48,21,40,61,47,68,72,76,88,69,51,30,19,12,23,45,35,28,21,23,47,60,63,24,72,208,245,252,252,254,254,211,170,138,85,89,100,93,98,99,84,99,137,123,135,125,55,29,20,15,16,16,16,18,17,20,21,17,19,25,23,18,24,37,44,38,38,36,36,37,41,45,49,50,44,41,33,33,31,30,29,37,54,55,71,60,91,154,117,66,41,17,19,21,20,25,22,24,24,33,35,41,34,53,18,82,245,186,132,144,140,182,189,169,144,90,70,77,57,45,33,24,29,27,27,28,34,31,31,24,19,21,16,18,19,19,21,18,15,16,18,29,25,29,30,28,33,32,39,43,46,39,38,41,34,31,22,21,23,18,21,22,19,22,22,19,18,18,21,18,16,17,20,17,20,29,32,36,49,61,73,86,94,101,101,83,63,50,45,42,40,33,70,101,108,108,99,103,107,115,106,112,59,93,222,234,243,247,230,233,232,235,233,234,234,233,234,235,235,231,231,234,234,234,233,231,233,235,236,235,234,234,234,233,233,233,235,233,233,234,233,233,233,234,234,237,236,233,232,232,233,233,232,232,231,230,233,231,232,233,231,232,230,231,232,231,232,229,117,4,1,4,8,10,9,10,10,12,12,10,12,192,195,193,193,195,191,189,191,192,192,190,191,192,192,192,191,192,191,191,189,190,192,191,192,190,190,194,193,191,193,190,190,192,191,191,190,190,189,190,193,192,192,189,193,191,190,192,189,192,189,191,194,190,190,190,193,191,191,190,191,189,191,191,189,193,191,190,193,189,191,189,190,195,191,193,190,192,194,193,194,195,193,194,194,191,194,194,193,196,193,195,193,191,195,192,193,192,194,194,191,193,195,194,193,194,191,194,194,194,196,192,194,197,194,197,197,195,194,197,196,196,199,197,198,199,200,203,200,200,200,200,201,200,200,200,199,201,201,199,199,197,199,200,201,198,197,198,199,200,200,201,199,201,201,202,202,202,201,201,201,198,200,202,202,201,200,199,201,201,200,201,200,200,201,201,200,201,200,200,199,199,199,198,199,200,197,200,198,198,200,199,198,198,196,199,197,198,198,198,198,197,197,197,195,197,197,197,197,197,198,197,196,197,197,195,194,193,195,195,196,195,196,196,198,197,193,196,196,198,197,195,201,200,198,200,201,200,200,202,202,203,202,207,207,207,210,210,212,214,214,217,217,219,218,220,223,223,226,228,218,223,224,210,224,236,245,242,229,226,223,201,123,59,95,191,246,252,252,251,251,251,251,251,251,251,251,251,251,250,250,251,251,251,251,251,251,250,250,251,251,249,250,246,248,249,242,244,242,241,238,239,225,214,228,209,214,222,214,214,199,184,170,166,167,159,163,171,187,192,179,153,157,172,104,104,172,170,171,163,95,122,147,150,189,160,141,142,116,117,107,120,107,98,114,107,117,114,120,125,115,130,129,144,134,121,116,79,74,68,60,59,48,55,41,31,60,50,62,69,69,88,58,40,53,50,57,48,24,29,42,60,38,56,36,37,27,19,68,41,43,57,28,53,85,36,19,18,4,46,63,52,53,59,88,54,35,40,42,47,36,65,49,57,44,20,26,41,57,32,49,74,49,12,10,26,78,110,126,137,71,20,9,16,37,64,67,84,72,47,45,23,16,23,42,43,27,25,30,51,53,63,32,50,185,248,248,252,252,252,181,142,128,94,95,122,142,142,142,137,120,130,141,118,130,92,21,14,17,14,19,13,14,20,19,26,28,27,23,27,28,31,30,26,36,43,41,41,50,56,48,45,52,46,50,53,50,46,30,37,35,36,47,50,50,44,99,159,116,69,43,16,21,21,24,23,24,29,21,33,44,46,45,51,17,97,226,144,129,147,175,217,178,154,141,103,92,112,106,64,36,36,26,27,33,32,30,27,24,27,24,18,24,24,23,24,19,23,20,18,16,16,23,26,27,25,30,27,33,49,56,46,37,44,41,35,28,24,29,24,20,22,20,22,24,22,27,24,21,22,16,17,19,18,19,21,16,17,23,24,29,33,45,63,76,89,96,93,87,77,67,53,81,103,108,119,94,76,72,73,66,61,18,96,233,238,247,242,232,238,231,236,233,232,234,233,233,233,234,235,234,233,233,235,234,232,235,233,234,236,234,234,233,233,232,232,233,233,233,232,233,233,232,232,233,232,234,232,232,231,231,234,230,231,232,230,232,229,229,230,230,231,231,231,230,231,230,227,118,3,0,6,10,9,9,11,9,11,12,12,11,192,193,191,192,191,190,191,191,193,192,191,192,191,188,191,193,187,188,189,191,195,190,192,191,191,191,189,189,190,189,189,191,190,188,191,191,188,188,190,189,191,189,187,190,189,191,190,188,190,188,189,192,189,188,187,186,191,191,190,191,190,190,189,189,193,190,191,192,189,191,189,191,192,191,192,191,190,191,192,192,195,193,194,194,194,196,192,192,193,193,195,193,195,193,194,194,193,194,192,195,195,194,194,194,195,193,194,196,194,195,195,196,195,195,196,195,196,194,196,198,196,197,198,196,198,201,200,201,202,203,202,199,199,198,200,200,199,200,196,196,197,197,198,196,196,196,199,198,200,201,199,200,200,201,200,201,200,201,200,205,201,199,204,199,199,198,200,200,199,201,201,203,202,201,200,202,199,196,199,199,199,198,199,199,199,200,198,196,198,197,198,198,196,197,198,198,197,197,198,198,197,199,198,197,198,199,195,195,198,198,198,196,195,194,196,195,196,196,193,196,196,196,195,194,196,194,195,194,196,202,201,210,217,217,220,220,222,224,228,227,226,231,230,235,236,235,237,237,241,242,245,245,243,244,245,246,247,248,246,233,236,225,222,228,219,229,203,193,203,219,234,203,200,227,246,242,227,218,210,207,194,190,184,184,183,178,177,177,173,169,167,166,168,168,159,157,152,150,146,145,146,136,137,134,130,125,121,117,113,111,107,93,100,137,101,85,100,95,96,89,81,71,66,64,64,73,66,61,67,65,67,68,81,70,73,70,62,80,79,69,87,71,64,69,59,79,77,69,74,69,83,84,87,93,89,98,104,119,107,112,119,109,116,113,104,96,71,63,65,63,61,57,56,50,45,55,52,61,79,99,103,78,59,68,58,66,61,47,64,69,83,65,69,49,71,45,31,67,53,51,53,54,80,105,60,65,60,24,61,83,72,59,62,97,63,40,46,42,39,49,83,74,87,71,55,54,70,62,61,103,78,56,50,16,69,63,63,133,142,160,85,31,20,20,53,81,96,75,42,34,17,23,45,42,31,21,29,51,50,62,39,39,165,243,252,252,252,252,160,90,81,65,44,87,139,148,162,166,173,155,156,139,110,126,67,16,18,17,19,20,23,29,29,29,34,34,23,17,18,21,29,29,26,25,37,55,61,64,60,49,49,54,57,54,49,55,70,67,60,48,34,32,37,63,79,97,92,61,45,27,21,22,24,26,26,26,23,27,38,44,50,33,49,19,110,184,100,132,156,223,250,162,144,141,110,100,95,74,64,39,23,29,36,35,35,34,29,33,31,31,36,34,41,41,34,29,21,21,21,19,19,17,19,22,16,25,23,39,56,49,56,46,45,42,39,35,24,27,28,27,24,26,29,21,28,29,22,26,25,24,27,18,18,25,17,19,21,17,19,22,20,19,23,27,40,50,62,84,103,109,104,124,112,102,112,85,59,56,57,57,50,24,159,239,249,249,238,233,236,231,233,232,232,233,234,233,232,235,235,233,235,234,235,236,235,235,233,233,235,234,233,233,235,236,235,234,234,232,233,233,234,233,232,233,231,230,229,232,233,231,232,229,232,231,229,230,228,229,228,229,230,226,229,232,231,228,227,119,4,1,5,8,9,9,12,10,11,12,12,11,191,194,194,195,194,189,191,193,192,191,192,193,190,187,192,192,190,189,190,190,188,188,189,190,188,187,189,188,188,190,188,187,191,190,187,187,188,188,188,189,189,188,186,188,188,188,190,188,191,189,189,189,190,190,189,189,189,190,190,189,187,187,189,189,191,190,188,190,188,189,189,190,190,190,190,190,192,189,189,191,192,193,193,194,192,192,193,193,195,192,193,191,193,194,189,195,193,194,196,193,196,196,196,194,196,196,196,195,196,196,192,194,196,194,194,196,199,198,197,199,198,197,199,201,200,199,199,202,203,201,200,202,200,200,199,199,198,197,195,199,196,196,195,196,196,196,200,200,199,200,202,200,201,200,199,200,201,202,201,200,200,201,201,200,201,200,201,201,201,200,200,200,200,200,200,200,200,199,198,199,199,199,198,198,199,198,197,195,198,198,193,195,197,196,198,198,198,194,194,196,196,194,197,195,198,196,195,196,196,198,197,199,194,198,197,195,196,195,197,195,194,196,194,193,194,193,195,192,200,207,214,239,240,241,242,244,247,243,245,242,244,243,241,237,237,235,232,231,226,223,222,217,217,210,204,204,201,200,197,185,188,196,199,164,127,139,127,132,141,146,158,150,157,155,144,129,116,107,102,97,87,81,84,76,77,79,71,78,73,70,66,61,64,62,61,93,51,49,51,50,54,55,48,48,51,41,49,48,46,46,46,38,49,118,82,51,60,58,70,63,86,84,90,93,91,99,85,60,55,63,57,66,66,53,71,66,56,60,63,57,72,61,47,62,47,61,63,55,74,63,68,81,68,83,78,88,85,83,91,90,84,74,84,83,87,88,77,75,69,74,77,70,78,75,57,64,51,77,81,62,86,87,79,81,53,88,74,72,71,65,65,57,75,45,64,54,49,82,66,45,42,50,72,107,72,56,41,15,73,102,75,81,71,99,68,45,53,42,39,44,50,27,64,72,63,45,65,77,73,112,63,44,32,20,69,72,30,70,127,127,162,94,36,23,12,56,76,55,34,31,22,41,53,33,21,27,47,49,61,45,27,137,244,246,252,252,253,170,131,124,135,115,59,65,80,108,105,134,130,131,150,124,109,122,60,14,22,24,19,20,29,35,31,34,37,42,35,24,19,16,19,18,22,24,16,32,48,53,57,60,61,60,48,57,54,55,91,118,113,92,70,66,86,103,104,73,43,33,27,25,16,32,48,46,37,31,35,38,46,47,50,49,47,30,109,146,96,161,193,234,234,138,144,130,79,60,33,43,49,33,23,30,43,37,43,39,49,53,38,42,41,39,30,31,44,39,35,29,26,23,18,22,15,18,17,19,36,55,61,62,78,63,48,47,43,36,31,34,35,33,34,30,32,33,31,31,32,40,40,35,27,27,26,21,20,23,23,23,20,18,15,16,18,17,21,24,25,31,52,63,72,88,91,96,102,85,61,56,59,70,60,59,193,240,249,249,239,236,236,234,236,231,234,235,233,234,236,236,235,235,233,234,236,234,233,236,234,236,236,234,237,237,235,234,236,234,234,236,233,235,235,235,234,233,231,231,232,233,231,231,233,229,230,231,230,231,231,232,230,231,230,227,229,230,231,231,227,117,4,1,4,9,10,9,11,10,12,12,11,11,193,196,193,194,193,192,190,194,191,191,192,193,192,189,191,190,189,190,189,190,190,189,193,190,191,190,188,190,188,189,190,189,190,190,189,189,189,188,191,187,189,188,186,190,189,189,189,188,191,190,189,191,190,193,190,189,191,189,190,191,187,188,189,189,193,189,189,190,190,191,188,193,189,191,191,190,193,189,191,193,193,192,194,194,191,193,193,194,194,194,194,193,194,192,193,193,194,195,196,196,194,197,197,197,198,196,196,198,198,196,198,196,196,197,196,198,198,197,199,198,198,202,201,200,202,201,200,199,200,202,201,204,204,200,200,200,198,198,199,199,197,198,202,199,200,198,200,200,199,199,200,201,200,201,199,201,203,202,201,200,198,199,201,199,203,203,201,202,198,201,199,196,197,199,199,199,198,200,200,197,199,196,196,198,197,198,196,197,199,195,198,198,195,196,195,196,194,192,196,194,196,195,194,194,194,196,194,195,193,193,196,194,196,196,195,197,196,196,195,194,194,193,197,193,192,193,193,192,206,205,190,198,198,193,191,188,186,182,179,173,171,168,157,154,153,147,141,132,124,121,118,115,114,111,103,100,98,101,107,94,101,141,172,107,40,58,67,81,71,65,72,65,70,69,65,62,64,61,61,63,56,63,55,60,67,62,68,68,69,62,60,57,52,55,50,55,51,49,49,50,57,50,56,56,50,55,54,54,53,53,53,46,61,119,91,52,55,55,62,69,89,102,107,107,106,113,99,70,61,57,67,59,66,65,57,66,61,59,56,56,61,58,52,57,53,54,59,53,59,55,59,55,53,54,54,51,52,54,51,53,51,50,53,53,52,58,58,55,55,59,51,59,57,52,46,45,46,51,73,53,39,62,69,68,59,73,76,61,54,54,48,37,57,44,59,36,61,103,61,33,24,44,49,87,70,48,41,8,54,92,82,96,77,98,73,42,52,42,41,49,32,19,56,75,55,22,66,57,62,98,46,41,23,14,81,45,36,25,18,72,78,127,93,31,19,42,60,41,31,35,45,55,37,24,25,39,56,55,50,25,113,235,250,250,252,252,174,155,211,246,251,216,69,9,8,15,41,49,51,68,113,111,110,114,46,19,25,15,27,26,19,24,26,29,31,43,52,38,24,22,22,17,23,21,19,18,20,38,42,50,54,49,56,59,69,80,93,93,117,123,103,111,113,115,98,61,37,18,22,18,27,69,71,66,77,90,74,54,63,66,59,46,48,46,116,123,108,200,235,253,207,117,147,117,51,31,48,70,53,35,24,22,36,48,57,51,47,58,54,51,48,43,38,36,42,41,35,44,37,25,26,23,24,22,21,23,36,56,63,69,71,51,33,32,29,29,42,48,42,41,41,42,42,42,44,46,43,40,44,44,42,36,31,34,30,26,27,20,21,22,18,24,22,22,18,18,22,18,23,25,33,47,59,84,101,89,65,69,87,103,102,84,181,239,249,249,242,234,236,236,234,234,234,236,235,235,234,234,234,234,234,233,234,236,233,235,236,236,238,236,234,235,234,235,234,234,236,235,236,236,232,233,233,234,232,233,234,230,232,233,233,232,232,231,230,231,230,231,228,229,232,229,230,232,231,230,227,118,4,1,4,8,10,9,10,10,12,12,12,12,191,194,194,193,192,190,189,191,191,192,189,192,193,189,191,191,191,190,188,189,191,191,190,191,193,192,191,191,192,190,188,189,190,189,188,190,189,188,189,189,187,187,190,191,190,186,189,189,186,189,188,189,188,187,190,189,189,188,188,191,191,191,193,191,189,191,190,190,191,191,191,191,191,192,190,190,190,190,191,193,193,191,192,193,191,193,195,194,195,194,195,196,195,196,195,198,194,195,195,196,196,197,201,197,198,199,198,198,197,199,199,199,198,198,200,196,200,199,199,202,199,200,199,200,202,202,201,201,201,203,203,202,200,200,199,199,199,198,200,200,199,200,198,199,200,200,200,199,199,199,199,199,199,200,200,201,200,201,199,201,200,199,201,200,201,199,200,199,200,198,198,198,198,198,199,200,198,198,199,199,196,196,200,197,195,196,196,196,194,194,195,196,193,194,195,193,195,193,194,196,194,194,197,195,195,194,194,194,194,196,194,194,190,194,191,194,195,193,195,193,196,193,191,192,196,195,194,191,198,153,71,62,57,51,49,42,44,35,39,37,36,35,31,31,31,26,24,25,24,25,26,27,24,20,25,28,33,35,37,34,37,108,154,84,16,30,43,49,49,43,44,43,48,48,52,48,52,48,50,52,51,55,53,49,50,54,51,57,50,57,53,44,54,48,51,52,53,49,50,52,49,53,48,50,53,48,49,50,50,45,48,43,34,77,65,44,43,36,48,41,42,68,53,50,47,55,53,47,40,39,45,46,39,45,48,40,48,39,42,44,36,40,38,37,37,43,39,34,43,37,36,42,36,38,30,36,36,36,37,33,36,35,36,33,32,37,43,38,39,39,40,35,33,36,32,33,24,45,81,43,37,57,57,48,36,51,55,51,42,51,30,39,42,31,53,14,36,70,54,21,17,27,22,47,36,55,42,3,27,60,78,105,70,97,81,44,57,39,54,46,19,24,60,73,37,10,36,35,56,71,16,35,19,18,53,21,9,23,19,14,40,49,108,75,36,59,33,33,34,42,53,43,28,17,37,55,53,54,23,89,222,244,252,252,252,185,164,207,249,249,250,204,68,19,6,16,33,62,43,22,95,106,118,110,34,23,23,23,34,19,20,20,19,22,30,39,46,53,47,32,24,20,23,26,21,24,19,21,19,29,35,38,55,56,59,72,88,76,66,80,93,117,125,125,141,104,55,23,17,18,68,93,42,95,155,141,92,48,45,49,36,41,72,61,130,159,168,230,234,252,176,115,153,100,48,79,138,100,86,41,37,22,18,24,36,50,53,56,59,59,57,59,49,40,46,43,34,36,42,38,25,27,21,22,21,17,31,35,44,49,50,44,39,30,30,33,31,38,37,38,38,46,57,60,60,55,47,47,49,49,49,50,46,41,44,40,37,38,28,27,31,27,18,25,21,21,24,17,16,19,30,33,68,94,98,98,83,110,139,149,141,117,139,191,241,245,236,232,237,233,235,233,233,233,233,233,233,233,233,234,233,233,235,231,233,235,234,236,235,236,236,236,236,235,237,235,235,236,234,236,233,234,234,233,235,231,232,233,231,233,231,230,230,229,229,230,229,229,229,229,228,228,231,232,231,228,227,119,3,0,5,9,9,9,11,9,11,12,11,11,187,192,188,193,191,190,192,192,191,190,191,190,193,191,193,191,192,190,191,193,192,192,191,191,191,192,190,191,189,188,190,188,189,190,186,188,189,186,190,189,187,190,188,191,191,189,188,189,191,188,188,189,188,191,190,190,188,189,191,191,189,192,192,190,191,190,191,189,190,192,188,193,191,190,193,192,193,192,193,195,195,195,194,195,197,194,194,194,193,194,193,194,196,195,197,196,199,196,195,198,197,200,199,199,201,201,202,200,199,199,200,201,200,200,202,201,200,200,201,200,200,200,200,201,203,203,205,203,205,205,204,201,200,200,200,201,200,203,203,201,200,200,200,199,200,198,201,201,200,200,203,200,199,201,199,201,200,200,202,201,200,201,200,198,199,198,198,199,200,200,196,199,198,198,198,198,198,198,199,195,199,196,198,197,194,195,195,196,196,192,196,195,194,196,194,197,194,195,197,192,194,194,194,193,193,196,190,193,196,192,196,190,195,197,193,196,194,194,193,194,193,192,194,194,195,194,195,193,198,117,6,3,5,6,10,9,10,8,10,11,10,11,10,10,10,10,10,10,10,10,11,11,11,11,10,11,12,12,10,12,10,32,77,46,6,11,9,21,25,29,32,29,34,32,35,32,36,35,35,36,32,37,35,32,38,33,34,35,35,34,31,37,33,32,33,35,36,33,33,34,35,35,30,30,34,36,33,32,32,32,35,31,24,45,53,34,25,27,29,28,32,36,33,31,33,28,32,35,26,26,27,27,33,33,31,34,30,29,35,29,29,31,30,26,26,29,31,31,28,28,30,25,30,32,27,28,25,26,27,30,29,31,31,34,31,25,29,29,29,29,27,29,30,30,28,35,19,35,75,36,41,46,56,36,27,42,53,32,16,24,10,29,23,26,29,24,65,43,21,14,27,54,58,73,31,31,28,30,35,37,73,104,72,91,85,45,55,47,64,78,43,35,84,96,87,74,78,86,120,128,95,123,114,118,132,103,131,120,127,108,84,174,159,155,101,27,31,28,41,50,44,24,21,33,49,54,64,30,73,207,247,247,252,252,186,164,216,251,250,250,245,143,73,61,47,53,75,90,32,41,119,105,131,99,27,31,20,25,36,26,24,17,20,24,21,29,46,50,56,49,31,33,39,36,29,27,23,24,21,21,26,22,29,26,46,71,88,91,87,91,112,146,138,157,160,131,78,37,21,26,120,88,45,113,120,105,69,36,25,15,19,57,97,71,138,203,237,252,236,234,150,121,163,83,53,131,122,119,95,47,30,22,20,17,22,39,47,59,63,57,58,47,56,45,39,43,39,40,34,46,45,38,27,21,22,19,22,20,29,35,51,62,60,67,67,57,49,48,46,46,41,57,71,64,60,53,55,54,47,45,51,53,50,55,55,52,51,52,46,32,22,21,19,27,23,29,25,17,22,18,30,39,77,99,106,103,107,159,164,163,155,124,101,82,144,206,228,239,236,234,233,236,236,234,235,233,232,231,233,235,234,236,235,235,234,235,235,235,237,236,233,236,236,234,235,236,236,235,234,233,235,234,232,233,232,231,232,232,232,230,231,230,228,231,229,230,231,228,229,229,229,228,230,230,230,229,226,119,3,1,5,8,9,9,11,10,11,12,12,11,188,191,191,189,191,190,189,190,191,191,190,192,191,190,190,190,190,190,190,192,192,188,192,192,191,191,190,189,191,190,189,189,188,189,188,187,188,189,189,191,188,190,189,187,190,188,191,191,191,190,188,191,189,189,192,188,193,191,189,191,191,191,188,193,189,190,193,190,194,192,192,193,192,192,190,194,194,194,194,193,193,196,194,193,196,195,197,196,196,193,192,193,195,197,198,197,198,198,199,199,200,200,200,201,201,201,202,202,201,203,200,202,203,198,200,200,202,201,200,203,203,203,204,203,204,205,203,202,205,205,202,205,203,203,202,202,205,202,201,201,202,200,201,203,199,198,200,202,201,200,202,200,202,200,200,199,197,200,198,199,197,199,200,196,198,198,198,196,198,198,196,196,198,196,198,198,197,198,198,197,196,196,196,195,196,195,195,195,198,196,192,194,192,192,194,194,196,193,194,194,193,193,193,192,193,195,193,192,194,192,193,197,193,195,194,195,194,192,196,193,193,192,192,193,193,194,193,196,202,145,75,63,46,54,53,48,49,39,43,38,36,33,31,29,23,21,21,21,17,14,16,16,12,12,12,12,10,11,11,12,10,15,51,42,10,11,10,12,12,17,24,24,30,29,29,29,31,31,28,33,29,31,30,34,31,31,29,29,26,24,27,99,19,14,16,13,12,13,13,13,13,14,13,14,13,13,14,15,15,14,13,14,15,15,14,15,13,15,17,15,14,13,14,14,15,14,14,15,19,22,32,56,50,36,45,55,56,46,32,30,35,39,57,50,22,29,28,30,36,39,51,47,39,44,58,60,41,35,39,33,33,44,34,62,83,53,36,122,41,130,57,129,54,134,123,145,144,134,144,122,81,104,124,122,108,126,160,132,102,140,168,191,191,206,224,210,185,110,177,236,246,238,236,188,146,213,232,249,189,89,57,78,59,92,86,46,43,57,212,251,214,171,177,243,243,201,151,143,223,247,245,251,252,226,246,243,250,252,252,252,252,252,250,153,49,38,15,31,43,47,25,12,23,48,48,66,36,47,194,245,249,249,252,188,155,203,246,249,253,253,208,99,82,81,82,91,86,121,53,87,140,106,136,84,20,22,24,28,31,27,25,18,22,26,23,29,34,46,58,54,51,54,59,59,55,51,43,49,62,55,56,60,56,54,113,153,134,123,98,105,130,118,91,92,86,71,63,61,43,69,157,108,83,106,55,42,42,32,18,14,38,94,95,72,128,234,234,234,236,234,164,146,168,71,67,153,107,114,105,52,25,9,21,15,15,20,18,30,43,45,46,54,52,51,44,34,37,39,39,44,51,51,39,29,35,45,45,30,40,55,56,66,76,84,75,59,59,61,58,56,55,59,61,54,50,49,56,62,63,45,33,45,60,62,51,39,36,38,30,26,20,15,28,29,23,29,31,24,17,18,24,44,83,94,104,103,108,146,152,150,127,103,75,20,73,191,235,246,235,231,237,234,235,236,233,233,235,233,234,234,233,234,233,234,236,234,236,236,234,236,235,236,234,234,235,233,233,236,235,235,235,234,234,232,233,233,232,231,231,234,231,229,230,230,229,230,228,230,229,229,230,228,229,227,230,231,226,118,4,1,4,9,10,9,10,10,12,12,12,12,189,193,190,192,191,189,190,191,193,191,191,191,191,188,191,190,190,191,193,192,190,191,194,195,193,193,191,191,192,191,191,191,191,193,188,187,191,189,190,188,189,192,188,191,190,191,191,191,194,191,194,191,190,192,189,191,190,192,191,193,193,190,191,193,194,194,193,193,191,193,193,193,194,191,194,194,194,198,193,196,194,194,198,197,195,194,195,195,197,196,196,198,199,199,200,199,200,201,201,201,201,205,203,199,201,202,201,204,203,201,203,203,206,204,205,203,203,203,205,206,204,204,205,204,203,202,207,206,205,203,206,205,205,202,202,205,201,203,203,203,205,202,202,204,201,200,203,201,201,200,202,201,200,202,201,199,198,199,200,200,199,199,198,195,198,198,198,198,199,198,198,198,198,198,196,197,198,198,197,197,198,196,196,194,197,198,195,195,196,196,195,193,193,195,193,195,195,194,194,194,193,193,195,192,192,193,193,196,195,193,195,190,194,195,191,194,192,193,193,194,193,194,199,194,193,196,193,198,214,208,209,228,228,231,231,229,230,229,229,227,227,224,223,221,222,221,217,217,219,216,213,214,210,208,206,206,202,194,195,194,183,151,159,174,174,183,181,184,177,177,175,173,170,167,165,160,163,162,160,158,159,166,166,166,174,173,181,180,185,184,188,196,194,199,200,204,206,207,209,211,215,217,216,224,225,225,225,232,232,231,234,234,223,170,192,231,236,244,243,235,214,213,213,209,212,208,219,235,243,241,246,235,122,106,211,250,175,83,61,139,222,248,239,173,165,206,217,238,247,250,250,250,240,242,241,163,78,75,64,33,44,50,59,208,248,235,249,241,252,252,249,248,249,252,252,252,252,238,246,217,125,112,164,173,122,128,225,232,217,252,253,253,253,253,252,237,176,153,251,251,252,252,252,238,227,252,252,252,220,117,69,65,35,72,89,48,35,70,230,245,246,226,218,228,251,226,140,186,249,250,252,252,253,224,242,250,246,252,252,252,252,252,210,92,28,2,22,37,48,27,15,22,35,52,55,54,19,84,239,243,249,236,168,167,210,239,251,251,250,252,143,27,69,71,71,78,80,101,53,115,147,107,138,70,15,27,24,33,29,24,27,21,19,27,18,20,31,36,40,44,54,64,71,67,83,86,109,139,127,122,128,142,146,163,184,162,145,121,100,113,99,72,45,36,36,42,39,56,73,100,131,87,74,60,28,55,45,20,22,26,93,115,93,49,108,240,250,248,252,246,180,170,160,52,72,150,108,106,80,42,23,12,19,17,17,21,24,18,18,18,19,43,56,50,47,43,46,47,40,43,46,43,44,38,46,68,70,61,59,62,64,66,73,68,72,76,63,54,47,51,53,51,63,65,66,62,57,57,53,38,21,16,22,32,32,22,15,18,19,17,19,18,24,27,28,27,29,32,24,23,22,46,89,89,97,95,84,120,127,118,86,88,137,164,222,247,242,243,236,239,235,235,236,235,232,234,237,234,233,235,235,234,234,234,233,234,235,236,236,236,237,238,236,233,234,235,234,236,235,233,235,234,233,231,232,233,235,232,230,231,231,229,229,231,228,230,228,229,230,228,230,230,230,230,231,231,225,117,4,0,4,8,10,9,10,10,11,12,12,12,188,192,190,186,189,188,188,189,191,190,189,191,190,192,191,187,192,193,192,193,193,191,190,189,190,192,190,190,192,190,190,191,191,191,192,193,191,190,189,191,190,191,192,191,192,191,192,192,189,191,193,191,191,190,197,190,190,190,191,192,188,193,188,191,192,192,195,191,192,191,190,191,191,192,193,193,195,193,193,194,194,196,195,195,196,192,194,193,196,198,196,197,198,198,199,201,202,203,202,203,205,203,202,203,201,199,203,202,203,204,202,205,203,203,206,204,205,206,204,205,203,205,205,204,204,205,205,203,206,205,201,203,201,204,205,203,205,203,204,203,202,200,201,201,199,200,202,200,199,201,199,200,202,198,200,200,200,201,196,201,199,198,200,199,200,195,196,198,198,196,198,199,195,198,198,198,195,196,197,195,196,194,196,195,195,195,195,195,193,195,193,193,193,193,195,193,192,193,193,192,193,193,195,193,194,194,192,194,194,193,193,191,195,194,192,195,194,195,195,192,193,194,193,194,193,193,193,196,205,208,223,241,243,248,249,251,251,251,251,251,252,252,252,252,252,252,252,252,252,252,253,253,252,252,252,252,252,252,252,252,252,252,252,248,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,252,252,252,242,252,252,253,253,253,253,253,253,252,252,252,252,252,252,252,252,249,243,113,118,243,246,237,203,203,247,253,253,252,252,252,252,253,253,252,252,252,252,252,252,249,182,85,81,64,29,37,41,61,217,252,252,252,252,253,253,252,252,252,252,250,250,250,241,248,230,103,92,168,173,60,47,232,247,224,252,252,252,220,229,186,176,141,155,251,251,251,239,252,198,195,250,246,252,184,99,75,75,43,61,87,46,41,53,207,247,198,212,122,162,241,177,165,207,234,234,223,251,216,163,208,213,214,227,252,252,243,206,97,13,4,5,25,50,32,19,16,40,50,57,52,38,63,145,244,244,239,160,152,219,247,252,250,243,248,181,83,60,89,75,53,77,90,74,55,133,127,113,132,53,20,25,29,34,33,29,21,18,21,21,19,26,21,28,29,39,54,61,76,83,138,173,157,169,170,157,155,147,141,160,158,118,87,73,68,68,53,45,39,28,19,21,33,46,53,63,64,62,67,62,73,74,32,23,32,89,161,113,71,41,48,186,234,252,232,187,122,101,86,26,38,91,68,77,67,39,21,10,21,14,19,21,21,25,22,24,20,16,26,34,39,29,26,28,22,23,28,34,35,29,31,46,56,51,46,48,58,72,72,53,56,81,75,59,59,62,63,57,63,69,69,64,55,41,28,23,20,17,16,17,20,18,17,20,15,16,23,19,27,32,27,34,40,36,30,23,27,61,89,97,103,97,69,74,81,76,33,103,244,244,247,247,241,241,239,238,234,234,235,233,236,238,235,236,237,234,236,236,235,235,234,235,233,232,235,234,232,234,231,233,233,231,233,233,232,231,231,232,230,232,233,233,231,231,231,230,228,230,229,229,230,230,228,228,229,230,227,227,230,229,231,231,227,118,3,1,5,8,9,9,11,9,11,12,11,11,185,190,188,188,190,188,189,188,190,188,189,189,189,190,189,190,189,191,189,188,192,192,191,190,190,191,191,190,190,188,190,192,191,190,191,191,191,188,191,194,191,192,190,192,191,190,194,191,192,193,192,191,196,193,194,196,192,195,193,192,192,192,194,193,192,192,191,192,191,192,194,190,191,191,194,195,193,195,195,194,191,193,194,193,195,194,195,195,198,194,198,198,198,199,197,200,200,200,203,204,201,201,205,202,203,203,200,205,203,201,202,201,205,203,204,204,202,205,203,203,205,205,207,206,206,205,205,205,203,202,203,201,200,200,199,201,200,200,203,199,202,200,199,200,199,200,200,199,200,198,199,199,198,198,198,197,199,198,198,200,199,198,197,198,198,198,197,197,198,196,197,195,196,194,194,196,195,193,196,196,194,193,194,194,194,193,194,195,191,191,196,192,192,194,194,193,192,195,193,195,193,192,193,193,194,193,193,193,192,193,195,189,192,191,193,193,193,194,190,194,192,192,195,191,193,193,191,192,195,196,200,202,207,208,206,209,209,210,213,213,210,212,213,215,216,215,217,219,222,220,221,220,223,226,227,230,221,222,230,231,236,234,234,229,229,235,237,239,241,244,241,243,245,246,246,246,250,250,249,251,251,250,251,251,251,251,251,250,251,251,251,251,251,251,252,252,252,252,250,249,249,249,249,248,248,248,247,246,246,245,245,251,246,215,239,251,244,246,244,246,244,247,248,248,248,247,249,249,226,148,214,174,51,83,201,245,244,251,247,251,253,253,252,219,212,219,196,191,189,196,200,183,164,179,175,94,51,68,60,50,61,38,34,160,204,209,222,211,244,245,252,245,252,252,251,251,232,208,223,212,187,200,245,229,146,95,209,193,149,203,198,170,152,209,196,189,144,188,251,239,239,212,243,171,169,227,220,251,163,91,77,75,65,59,99,46,39,66,224,248,174,124,82,175,204,122,138,198,227,226,217,230,181,163,186,193,206,223,251,251,215,106,19,2,6,17,44,34,24,17,28,54,56,59,20,107,184,210,246,239,163,145,212,250,252,252,243,239,147,152,134,121,117,49,57,59,91,83,57,128,119,118,121,39,19,31,22,28,31,28,24,18,23,24,25,20,19,22,24,40,46,54,67,80,167,170,160,198,168,152,119,98,105,130,125,89,77,59,56,51,33,29,25,16,14,22,21,29,33,39,47,29,35,49,46,39,18,48,142,190,175,97,43,35,16,112,239,239,183,99,45,22,19,15,14,38,50,66,56,28,21,16,12,19,23,23,21,20,31,28,23,22,21,17,19,18,17,17,16,14,19,19,14,21,22,21,34,33,33,40,50,67,55,43,57,71,73,55,36,33,54,68,59,44,34,30,24,17,19,20,16,19,18,21,23,22,21,17,17,24,22,19,32,38,43,56,49,45,36,23,36,71,103,103,107,107,63,49,73,76,26,98,235,239,247,247,239,241,236,237,235,236,234,233,236,237,236,235,238,236,234,236,234,234,236,234,233,234,234,235,233,233,233,232,234,231,232,232,231,231,230,233,230,232,233,230,231,230,230,231,229,231,230,229,229,231,228,229,231,230,231,229,231,233,231,231,228,118,3,0,6,9,9,9,12,10,11,12,11,11,186,190,188,188,188,185,186,186,185,186,188,191,188,185,188,186,189,189,187,189,188,190,192,191,192,192,193,192,191,191,192,190,190,192,193,192,190,191,191,193,193,191,192,191,192,193,191,192,192,192,194,194,196,193,194,193,194,194,192,194,193,197,195,195,194,194,193,192,195,193,195,196,195,193,193,196,195,196,198,195,195,195,195,195,195,192,194,196,197,195,196,199,197,199,199,198,199,200,201,200,201,201,202,201,203,204,203,201,202,203,201,203,203,202,203,203,203,202,205,204,204,204,203,205,203,204,204,204,204,203,202,201,202,201,200,199,199,198,199,200,199,197,201,201,198,199,200,199,200,200,198,198,197,196,198,198,199,200,199,198,194,196,198,194,196,194,196,196,196,196,195,196,195,195,194,197,195,195,194,193,193,193,194,193,193,193,192,193,193,194,194,194,194,191,194,193,194,193,193,194,194,193,192,192,191,191,190,191,190,191,194,192,192,192,191,193,191,191,193,190,191,193,194,193,190,190,192,193,194,194,191,195,194,195,195,194,193,193,196,194,197,196,194,198,196,197,197,198,199,198,198,198,200,199,201,203,194,198,203,206,207,206,200,200,207,206,205,206,207,208,209,212,211,210,214,216,213,214,216,215,216,217,218,218,220,220,218,220,219,218,221,221,222,221,221,221,220,222,222,221,219,220,220,220,221,221,221,219,220,222,218,221,224,203,217,227,220,222,217,219,218,219,221,220,216,218,223,222,178,74,152,174,92,83,165,229,186,186,177,165,156,157,153,124,130,128,115,100,89,108,116,92,84,102,108,90,110,140,129,139,145,127,75,57,55,39,42,36,48,54,56,65,99,150,168,149,128,137,156,186,207,244,252,252,247,127,132,122,65,122,180,174,200,228,207,215,134,198,251,234,241,205,244,171,168,219,220,251,160,88,70,73,62,53,97,52,33,67,232,247,170,142,120,201,156,59,117,174,229,225,221,218,161,190,189,179,208,238,251,251,145,44,6,1,11,29,42,30,21,32,48,56,62,29,70,217,230,219,249,167,138,207,252,252,252,248,241,146,132,245,211,141,112,62,36,34,64,36,59,131,95,124,107,29,28,24,29,36,34,27,22,22,19,26,24,21,20,21,21,32,41,46,59,45,108,113,138,203,148,114,92,78,95,123,111,74,83,76,66,50,32,31,21,18,24,21,19,18,21,23,23,26,22,20,21,21,26,131,211,199,145,63,36,27,9,56,152,106,100,38,35,72,63,33,45,83,75,60,52,50,40,21,21,20,22,24,26,29,22,31,31,25,27,26,24,24,21,22,22,16,17,21,22,19,17,17,19,31,50,38,47,58,41,35,48,69,74,56,33,15,23,35,34,29,22,19,20,19,18,23,22,20,23,24,25,19,25,42,45,33,21,23,43,58,58,57,54,48,39,25,46,88,101,105,106,105,69,60,86,98,53,62,205,232,243,249,238,239,231,237,236,235,238,235,234,237,236,234,236,235,236,236,234,235,234,234,234,234,235,233,235,234,232,233,236,235,233,233,231,231,234,233,233,233,231,231,231,230,230,230,230,231,231,230,231,228,232,229,229,231,227,231,232,230,231,230,228,118,4,0,4,8,10,9,10,10,12,11,12,11,185,188,187,188,187,185,188,187,188,186,186,186,185,188,188,188,187,187,189,189,189,189,189,191,191,191,191,189,192,191,191,191,191,191,191,190,192,191,191,191,191,191,192,193,193,194,196,191,191,194,191,192,196,191,193,194,191,195,193,193,194,194,195,194,194,193,193,194,197,194,196,195,194,194,195,197,194,197,199,197,196,198,197,195,197,196,197,198,198,195,196,198,198,199,198,201,200,200,201,199,200,200,201,199,202,202,201,202,202,202,201,202,200,202,201,202,204,203,204,203,204,203,205,203,201,203,203,203,202,202,201,203,203,200,199,199,200,199,200,198,198,196,198,199,198,198,196,197,197,198,198,198,196,198,200,198,199,198,195,197,196,196,195,194,193,196,196,194,196,196,193,194,194,194,195,195,193,194,196,193,195,194,193,194,195,193,192,193,193,191,194,194,192,192,193,192,191,191,191,191,191,192,193,193,193,191,192,193,190,189,192,191,193,190,192,193,193,192,193,191,189,192,190,190,192,192,191,191,191,193,194,192,193,194,193,192,193,196,195,194,194,197,195,194,195,196,198,195,196,195,196,198,198,199,200,197,188,198,202,200,205,198,193,200,203,204,202,200,204,203,200,206,208,206,205,205,207,206,206,207,208,209,210,210,211,212,211,212,212,212,211,211,214,215,216,215,216,216,215,215,216,215,217,214,215,215,211,214,216,217,217,216,222,191,198,222,214,217,212,214,214,213,214,214,211,213,223,223,179,102,179,174,113,93,106,141,106,98,71,67,57,64,96,95,112,102,105,119,111,141,160,113,98,149,167,162,182,198,197,200,195,162,118,74,43,25,17,12,10,10,9,10,11,10,10,11,10,24,39,53,78,129,155,199,198,104,126,139,97,174,231,218,245,232,208,175,107,196,239,231,236,203,244,171,166,221,216,251,157,85,75,65,60,44,101,63,40,57,222,246,139,183,196,223,152,100,166,182,233,220,222,196,150,212,184,163,217,249,251,181,49,4,3,7,33,38,34,24,26,48,53,65,27,49,184,246,232,216,184,120,191,250,252,252,250,250,149,131,218,252,237,134,91,81,85,25,37,23,64,123,87,120,84,25,35,22,33,38,33,30,19,24,21,20,21,23,26,20,24,23,25,35,33,32,46,38,114,165,108,92,76,76,94,98,69,55,62,51,47,35,29,35,27,24,23,19,19,24,24,15,21,23,24,24,15,28,21,82,131,106,81,53,33,15,10,19,69,85,46,15,59,147,110,106,203,246,191,118,135,125,68,29,16,21,25,23,18,26,29,27,30,34,37,32,33,31,28,28,24,28,33,39,39,33,28,23,27,46,55,50,50,42,39,36,41,52,65,66,48,27,17,19,18,17,20,18,22,20,22,22,22,26,21,24,24,24,73,111,81,54,35,25,60,71,55,59,54,55,46,24,57,90,99,95,95,108,81,64,79,77,52,61,176,230,238,249,238,237,233,236,235,235,236,234,233,232,234,233,234,235,235,236,233,233,234,234,233,234,233,233,234,235,234,232,234,234,234,234,232,233,233,232,232,232,233,231,234,232,230,231,230,231,231,230,229,230,230,231,231,230,230,230,232,233,232,231,227,117,4,0,4,8,10,9,10,9,11,12,11,11,184,187,186,188,188,187,187,186,188,190,185,187,185,185,191,186,187,187,186,190,190,187,186,188,188,188,189,190,188,188,190,188,190,192,189,193,191,191,192,191,191,189,192,190,192,191,189,192,192,190,192,194,192,191,192,193,193,192,193,192,192,194,192,194,194,192,193,196,194,193,198,193,191,196,196,195,194,196,198,194,199,199,198,199,199,198,199,196,195,198,197,198,199,196,198,199,198,198,199,200,199,198,200,198,199,200,198,200,202,200,199,202,201,203,201,202,203,202,205,201,202,204,205,203,200,204,200,202,200,200,203,202,203,201,200,199,199,199,197,197,197,199,200,199,198,198,198,196,196,196,194,194,197,196,196,197,193,194,195,195,195,193,193,193,198,195,194,193,195,193,191,192,194,195,191,193,191,194,193,193,193,191,193,191,193,193,190,192,190,191,193,188,192,192,192,191,189,191,191,192,192,190,191,192,191,190,191,190,188,190,190,190,193,191,190,192,192,192,191,190,192,188,188,187,190,191,191,192,191,190,192,193,190,193,191,193,193,193,194,193,197,195,197,196,194,197,196,197,193,195,198,198,198,196,198,190,190,200,199,201,202,195,194,200,205,204,203,206,200,205,204,201,206,202,207,205,204,205,207,208,207,210,208,210,210,208,207,207,210,210,212,214,214,213,214,214,214,212,213,213,214,214,214,215,213,212,215,212,212,214,215,214,220,185,187,217,212,215,212,212,214,212,212,212,209,210,225,219,190,126,135,121,95,97,98,98,92,104,103,120,89,64,63,39,93,153,181,212,214,214,229,147,73,159,230,188,160,185,177,145,138,132,102,83,159,214,209,202,206,145,139,129,46,6,18,37,115,62,28,11,9,23,19,30,58,34,106,137,116,239,251,233,249,176,162,148,107,212,232,232,234,200,246,176,169,217,217,251,152,87,75,71,63,34,96,76,30,45,201,189,146,222,216,200,110,132,204,198,244,213,226,177,149,224,171,172,235,248,212,75,5,9,8,22,46,32,24,23,42,54,66,45,32,155,248,248,227,159,139,174,237,252,251,251,251,170,131,207,251,251,198,84,104,112,120,45,55,24,81,135,101,134,77,24,35,26,37,37,31,32,17,19,24,24,18,24,29,21,23,24,28,24,24,21,41,69,130,120,79,74,61,63,61,49,46,34,36,36,30,33,39,44,45,35,23,21,23,29,24,19,21,27,26,21,29,22,33,86,88,51,49,33,22,15,14,20,64,105,64,69,124,143,86,84,184,231,201,92,155,118,69,45,33,33,39,38,23,25,24,28,29,34,31,31,37,24,24,29,31,53,68,63,65,53,38,27,31,38,36,47,50,35,36,38,45,52,65,80,67,53,46,34,27,26,19,22,21,26,19,21,27,21,24,28,22,80,177,163,101,97,72,59,71,71,60,61,55,54,43,32,57,84,90,90,95,112,89,65,78,80,42,45,191,235,239,249,235,238,234,235,236,233,235,232,233,235,233,236,235,234,235,233,233,233,234,233,232,231,233,233,233,233,232,233,232,233,233,235,233,233,232,231,231,233,234,232,232,232,231,231,231,231,230,231,232,229,232,230,231,234,229,232,232,232,233,233,229,118,3,0,5,9,9,9,12,10,11,12,12,12,186,187,184,184,185,186,190,186,187,189,186,188,186,186,189,186,188,188,186,187,185,186,186,188,186,186,189,188,190,189,189,188,191,191,188,191,190,190,191,193,190,189,190,188,193,190,192,192,190,193,189,191,194,191,193,193,193,193,194,192,192,194,191,191,190,191,193,193,194,191,194,198,195,196,196,196,196,196,194,195,198,199,198,196,200,197,195,197,198,198,198,196,196,195,196,196,197,196,198,201,199,199,199,197,199,196,198,199,200,202,202,202,201,203,203,200,201,200,201,200,202,204,205,203,202,203,201,201,200,201,202,201,199,202,202,198,198,197,198,198,200,198,198,194,196,199,196,198,196,196,195,195,196,196,196,194,197,196,192,194,195,193,194,194,194,195,193,193,193,190,191,193,191,194,196,195,193,192,193,193,194,192,194,190,190,193,190,190,191,191,192,191,190,191,191,191,193,194,193,193,191,193,191,190,191,190,190,193,190,192,190,190,193,191,191,191,191,189,193,191,191,193,189,191,192,191,193,193,193,193,192,193,191,190,193,193,190,192,194,192,192,196,196,196,195,195,196,194,196,199,197,196,194,198,199,192,192,201,200,200,201,191,196,203,201,205,203,204,205,202,203,206,206,205,206,204,205,204,206,207,209,211,210,209,210,209,208,211,210,210,211,210,211,214,212,212,212,212,214,213,214,212,215,213,213,215,212,214,212,212,213,211,226,196,199,230,221,218,213,213,212,211,212,210,211,207,226,194,131,108,96,83,84,71,60,62,84,153,211,251,193,99,161,205,252,252,252,252,250,214,228,143,73,149,198,156,109,126,121,104,90,84,95,100,211,241,245,248,248,248,246,188,118,93,119,215,243,200,174,101,62,67,23,28,27,14,38,22,68,190,212,225,177,126,185,141,139,225,226,236,229,201,244,179,168,214,214,251,151,70,77,65,67,36,101,106,66,60,117,151,180,248,199,123,65,153,227,211,249,211,224,163,170,225,191,200,250,235,99,16,6,18,17,36,40,26,19,38,56,62,50,27,128,243,250,250,170,113,188,229,253,253,244,245,170,142,209,251,252,242,124,108,108,82,74,51,84,72,120,144,119,151,69,21,37,27,38,34,33,32,19,23,19,26,24,21,24,24,35,30,30,33,27,32,51,88,110,74,54,60,51,51,34,34,34,33,32,19,17,39,49,44,51,46,24,24,25,27,29,24,26,25,27,25,23,28,53,107,91,47,43,22,17,16,24,31,72,135,124,106,145,121,67,48,18,67,80,47,76,62,61,61,60,67,59,57,42,26,21,23,21,27,28,27,33,27,29,29,53,61,50,34,34,47,50,50,50,36,26,39,51,68,75,73,65,64,80,86,82,79,77,69,63,50,40,34,33,29,29,27,24,28,32,45,77,136,137,94,99,152,135,89,79,63,63,61,51,48,49,30,55,86,92,89,84,107,92,75,70,83,40,44,200,236,242,249,234,238,234,235,234,235,233,233,234,233,236,236,236,234,233,234,233,235,233,233,232,233,233,233,231,230,233,233,233,231,233,233,232,232,234,231,232,232,233,233,231,231,231,231,231,231,231,230,233,233,229,230,230,232,233,232,234,234,233,233,230,118,3,0,4,9,9,9,12,10,11,12,12,12,186,188,184,185,188,185,184,186,185,186,186,187,188,187,188,188,186,185,188,185,186,187,189,189,187,188,187,189,188,188,189,188,189,191,188,189,188,189,187,190,190,188,192,190,192,192,190,191,191,190,191,192,190,193,191,192,192,190,194,191,193,193,191,193,193,194,195,194,193,194,195,195,196,195,196,194,194,196,198,193,194,196,196,198,196,195,198,196,196,199,196,198,198,198,198,198,196,198,198,195,197,198,200,200,199,200,199,199,202,199,201,204,199,201,200,200,200,200,201,199,202,201,200,201,199,201,198,203,200,198,201,199,201,197,196,199,198,197,197,200,197,196,199,196,199,196,195,198,196,195,195,196,195,194,195,196,193,197,196,194,195,194,195,193,194,194,192,192,196,192,192,192,193,193,192,193,193,194,193,194,192,191,193,190,194,193,194,193,190,193,191,190,191,191,190,191,191,192,189,190,190,188,190,192,192,189,193,192,190,192,191,190,192,191,191,191,192,191,192,191,191,194,194,189,191,193,192,192,194,194,195,194,193,194,194,194,192,196,195,193,197,195,198,196,196,196,195,196,198,200,198,199,196,199,196,188,200,200,200,203,198,192,199,203,203,205,204,207,205,205,202,203,201,203,210,205,205,206,205,207,208,209,212,212,210,211,212,209,210,211,210,211,211,212,213,213,212,214,213,213,215,214,213,215,214,213,211,213,214,214,213,217,235,206,216,249,232,220,214,212,214,210,211,212,208,209,233,172,90,77,55,55,67,43,39,44,75,147,208,244,200,198,252,252,252,252,253,253,191,116,111,90,73,66,63,76,65,53,55,48,50,57,57,56,76,93,110,117,141,152,140,97,53,67,113,209,230,215,196,149,158,127,108,120,165,122,47,41,33,66,137,206,189,169,206,154,166,232,228,237,225,202,241,187,170,212,217,251,153,79,80,58,62,25,98,115,96,85,120,146,178,226,169,141,123,198,224,216,247,218,224,163,187,229,201,217,237,142,33,9,12,19,41,42,27,20,33,50,62,55,22,98,235,245,251,218,111,152,232,246,252,252,249,169,137,202,250,250,252,158,87,106,87,27,29,24,59,93,123,134,125,131,46,21,37,25,38,36,34,24,23,21,23,27,21,26,25,25,34,29,33,35,34,33,60,92,87,71,52,49,41,38,34,16,20,23,16,21,18,29,54,60,63,58,45,24,23,29,24,24,23,29,29,30,31,37,79,105,87,66,42,23,30,42,57,75,92,123,151,111,81,113,88,68,49,22,29,51,65,53,69,68,82,84,64,70,59,40,26,15,19,21,24,24,23,27,24,30,55,55,26,16,19,24,50,77,83,75,76,97,109,122,124,97,80,71,61,59,59,77,93,97,103,89,78,71,63,63,53,34,31,50,73,99,115,97,59,42,52,62,63,57,63,57,53,59,53,53,52,37,45,84,91,97,88,100,92,66,69,65,27,63,228,241,244,248,234,239,233,236,234,233,235,233,233,234,233,234,234,236,236,235,235,234,235,236,236,234,231,231,233,233,234,233,233,233,231,232,232,231,232,234,233,233,232,231,231,231,232,232,231,231,230,231,231,230,234,231,230,233,230,233,233,231,233,233,229,118,4,0,4,7,10,9,10,10,11,12,11,11,187,191,188,188,188,188,186,185,185,186,188,188,187,184,187,185,184,187,186,188,189,187,186,188,186,186,185,185,188,187,188,186,188,187,186,190,187,190,188,188,190,189,191,187,191,192,187,190,190,189,190,189,191,190,190,193,194,192,192,192,193,194,193,193,193,194,194,194,196,193,197,195,191,193,197,196,194,194,193,197,196,195,197,196,197,196,197,196,199,194,198,198,197,195,195,197,196,196,196,199,198,198,197,198,199,196,198,199,199,199,200,199,200,201,202,200,201,201,201,203,202,201,201,199,198,200,199,199,201,198,198,199,198,200,199,197,199,198,198,198,198,196,196,197,197,196,195,197,195,196,196,194,194,194,194,193,193,193,195,194,193,194,193,195,193,195,195,191,196,193,195,192,193,195,192,193,192,194,191,190,193,193,194,194,193,192,191,193,191,192,193,190,191,191,190,190,191,190,192,190,191,190,188,190,189,189,191,192,190,191,189,191,193,189,192,192,190,192,191,193,191,191,193,192,191,191,194,193,190,193,192,193,193,193,196,195,194,198,198,198,198,198,196,196,198,196,196,196,198,199,197,198,200,203,196,192,198,201,203,203,196,198,205,206,206,205,205,207,208,205,204,202,202,206,208,208,207,208,209,209,210,210,211,211,213,212,210,210,211,211,213,213,212,215,213,214,214,210,212,210,213,214,213,214,213,217,214,214,215,212,212,218,232,178,150,214,230,217,212,211,213,212,212,212,211,207,230,155,61,36,11,61,108,103,98,90,84,80,60,78,48,63,153,157,152,122,108,74,16,6,29,21,24,24,12,33,29,26,27,29,25,21,29,19,29,28,33,34,24,33,28,31,31,45,74,82,79,83,96,119,135,128,119,145,181,145,112,89,66,59,152,245,204,221,212,141,195,230,233,239,223,203,238,197,173,208,214,251,156,82,93,52,64,14,69,73,36,76,116,125,160,234,186,145,156,216,199,200,242,223,205,160,206,229,217,203,147,50,3,18,16,36,42,27,22,27,51,61,56,28,81,222,247,247,221,139,137,205,249,249,250,251,173,124,186,238,250,250,169,90,96,108,34,46,109,87,64,73,101,128,126,111,33,23,39,27,38,34,35,27,23,26,26,24,25,23,25,30,33,38,36,42,35,60,144,177,141,87,66,61,64,60,29,16,18,17,16,24,24,21,57,72,72,79,64,44,27,26,28,27,29,30,40,37,31,49,86,95,61,37,29,25,52,60,63,66,64,89,149,145,96,76,83,97,87,87,55,48,69,64,64,65,75,67,55,55,53,44,27,16,16,22,20,27,29,27,29,32,55,44,23,22,19,24,57,84,91,101,98,92,77,71,65,53,50,47,44,34,39,61,69,78,92,104,110,116,129,120,89,57,81,126,128,113,78,53,35,24,27,23,20,30,37,40,49,55,57,51,52,40,45,80,95,98,97,103,92,64,39,42,45,139,246,246,249,249,240,240,233,236,236,236,234,236,238,238,238,236,236,236,236,234,234,234,234,236,237,238,234,233,233,233,236,236,238,235,233,234,236,234,236,236,236,236,236,236,233,232,234,233,234,233,231,233,234,232,231,235,233,234,234,234,234,232,235,236,228,117,4,0,4,8,10,9,10,10,11,12,12,12,184,189,184,187,189,182,185,185,187,188,187,186,186,184,184,186,184,186,184,184,188,184,186,185,186,186,184,188,184,184,185,186,186,186,188,186,188,187,187,189,186,185,184,190,189,187,190,188,188,190,192,189,187,192,188,189,189,187,193,190,191,191,190,192,188,189,193,191,191,192,194,192,193,193,193,194,195,196,195,194,196,196,194,194,195,195,195,193,194,194,191,194,196,195,194,193,197,194,194,197,196,194,196,195,194,197,196,196,200,199,199,201,198,200,200,200,198,198,203,199,199,198,198,200,196,200,200,198,195,196,199,196,196,197,195,196,196,195,194,195,195,193,193,192,192,195,194,193,194,191,194,193,196,194,192,195,191,193,190,192,196,190,190,192,192,192,191,191,192,192,194,191,192,193,192,194,190,191,194,193,193,193,191,192,190,191,191,190,190,192,191,189,191,192,190,190,189,191,190,190,191,193,190,190,191,187,191,189,188,191,190,189,190,190,189,190,190,192,191,191,191,190,191,192,193,189,190,191,191,191,190,191,193,192,194,196,194,197,197,197,197,196,197,196,197,196,198,198,197,199,197,200,200,203,197,193,202,200,204,199,194,200,207,206,206,210,205,205,206,206,206,206,202,204,204,205,208,208,207,205,207,207,210,211,212,212,208,211,211,211,211,214,214,214,214,214,212,213,212,211,212,210,214,214,211,212,212,216,211,211,211,216,208,92,39,129,198,215,215,212,211,211,212,210,208,212,221,144,52,56,79,97,86,75,83,56,54,69,41,28,23,26,15,27,44,14,46,54,19,15,15,14,14,21,21,18,23,20,26,29,17,19,22,18,24,19,24,26,27,26,19,28,25,44,48,59,59,52,27,39,95,55,71,63,72,92,75,87,47,68,189,243,192,200,183,147,213,223,230,236,218,204,234,198,173,206,213,251,147,77,95,59,66,19,61,83,23,46,118,155,219,250,163,78,138,200,173,202,236,223,182,156,210,232,221,141,64,5,10,22,31,48,34,25,24,41,60,63,32,61,201,244,250,230,139,159,187,231,250,233,251,178,114,177,220,249,254,193,94,83,107,84,26,68,118,83,67,76,92,118,131,96,27,28,35,33,43,33,31,24,22,26,31,24,23,25,24,32,38,48,70,53,69,167,201,180,143,122,129,130,92,49,24,15,21,19,18,23,25,27,63,73,74,91,76,61,41,29,29,27,27,32,39,38,34,42,55,43,24,26,23,26,42,48,50,44,50,59,121,155,125,118,87,98,106,96,90,69,60,63,70,67,66,49,40,36,30,36,34,20,18,17,20,27,23,30,23,28,57,51,31,21,23,33,46,51,53,50,39,30,29,31,33,33,33,28,26,32,45,55,58,57,62,69,76,87,105,118,107,103,123,123,98,71,47,27,21,26,21,15,21,21,34,53,60,60,55,54,56,38,44,95,105,99,76,97,103,66,29,61,101,179,247,247,250,250,240,243,237,236,235,234,233,234,236,239,235,235,236,235,234,233,235,235,234,233,235,234,236,235,235,234,235,236,233,236,235,234,235,233,234,233,233,234,233,235,233,233,235,234,234,234,234,233,233,233,233,230,231,235,229,231,233,232,236,231,229,118,2,0,5,8,9,9,10,10,11,12,12,10,184,190,184,187,186,186,186,187,189,187,187,186,188,185,188,188,185,187,185,186,185,186,185,186,189,188,187,187,188,188,188,187,190,187,188,188,188,190,187,190,189,189,189,188,188,190,189,189,190,188,193,190,191,191,190,191,188,190,191,190,191,189,191,191,191,193,190,193,194,192,193,191,193,194,194,194,193,195,191,197,195,195,196,194,194,196,196,195,196,192,196,194,194,195,198,197,197,198,197,199,196,198,196,198,198,195,199,200,201,201,203,200,199,198,199,199,196,199,198,197,199,200,199,196,197,198,199,197,199,197,197,198,197,200,196,194,198,196,196,194,192,196,194,195,194,193,192,194,192,192,192,191,194,193,193,193,192,192,193,192,193,192,190,193,189,192,193,192,194,191,194,191,191,192,193,191,193,194,194,194,192,192,191,194,194,191,191,191,189,191,193,192,192,194,192,190,190,189,194,192,192,193,191,191,191,193,189,190,191,191,191,190,190,190,191,192,191,191,190,193,192,189,193,193,193,189,190,193,192,194,192,194,194,193,195,195,196,196,197,198,198,200,201,200,202,199,199,200,200,201,200,200,200,200,193,199,203,203,206,196,198,206,209,205,207,207,207,209,207,209,210,208,208,203,203,205,209,210,207,207,210,210,211,209,212,211,210,213,212,211,214,212,210,214,212,212,214,212,213,211,213,213,212,210,212,214,210,212,212,214,213,222,195,78,7,88,191,216,216,210,210,209,210,211,207,211,219,177,107,54,44,63,34,33,46,38,39,63,53,36,24,22,24,45,63,29,59,44,17,21,14,24,19,19,18,20,19,22,24,24,20,15,25,14,18,23,17,27,25,22,22,21,25,51,44,44,76,78,39,49,66,51,60,43,52,44,50,53,27,76,191,191,133,200,156,159,229,208,233,229,222,208,232,206,173,203,216,251,152,73,96,62,73,30,93,104,30,51,139,215,251,251,136,79,159,207,181,207,237,223,152,154,214,242,198,68,15,7,7,30,43,38,23,18,42,55,64,42,47,177,246,246,239,150,162,220,210,246,250,235,182,124,171,227,251,251,251,134,77,103,95,73,35,84,84,39,88,95,81,114,124,85,23,30,37,31,41,33,33,26,25,24,28,29,29,28,27,42,98,147,163,137,146,199,169,137,122,119,130,88,53,33,17,22,19,23,24,23,28,31,66,72,66,90,85,69,55,31,24,34,33,35,38,33,28,22,19,17,22,21,18,19,23,34,38,36,45,35,52,74,74,122,129,101,94,102,110,104,93,83,70,66,62,46,33,23,21,26,33,27,22,20,23,26,25,27,33,21,36,59,48,41,34,41,39,28,25,23,17,17,21,17,15,19,25,25,32,38,46,46,50,49,35,43,53,62,64,67,65,70,79,69,58,50,32,24,21,23,25,23,32,27,51,76,69,67,64,54,55,44,45,107,122,101,78,84,100,63,67,110,95,98,165,221,249,249,242,242,238,238,236,235,234,236,236,233,235,234,234,236,235,236,234,232,233,233,233,234,233,234,234,233,234,233,234,235,233,235,238,236,237,236,233,235,235,233,233,235,235,234,234,236,235,233,231,231,233,232,232,234,233,232,232,233,234,232,229,120,3,0,5,8,9,9,12,10,11,12,13,12,185,191,186,185,188,185,187,186,187,186,188,189,187,189,191,188,185,188,189,185,187,186,189,186,185,188,186,187,187,187,188,188,184,186,186,186,189,189,186,187,189,190,187,188,187,186,190,188,189,189,189,188,189,190,189,188,190,191,190,190,189,189,191,192,190,189,192,191,193,192,192,191,191,194,194,196,193,193,195,196,196,196,197,193,196,196,193,195,196,196,195,196,195,194,197,197,196,197,194,194,195,197,198,196,197,199,198,198,200,200,198,197,198,199,198,197,198,198,197,196,200,198,196,198,196,196,196,196,197,196,193,196,196,197,197,195,195,195,196,197,196,194,195,195,192,194,195,193,193,192,193,191,192,193,192,191,191,191,189,192,194,191,193,192,191,191,189,192,189,192,193,190,192,192,193,194,194,192,192,192,194,196,194,193,191,190,190,193,193,191,193,192,193,191,191,192,192,190,191,192,189,193,190,190,192,189,191,192,191,189,191,192,188,192,192,193,190,193,193,192,192,192,193,191,191,191,196,193,191,192,190,192,192,194,194,196,196,198,198,198,199,201,202,200,201,200,201,202,200,201,199,200,203,198,192,202,206,205,204,194,201,207,209,208,206,207,207,210,209,208,208,208,208,208,205,204,205,209,212,210,210,209,210,210,210,213,212,211,211,210,210,210,212,212,214,213,212,213,212,211,210,212,212,211,209,214,211,213,211,210,214,223,218,127,90,165,212,220,215,210,211,209,208,210,208,215,211,212,122,5,2,21,29,33,35,31,37,64,56,45,36,29,25,66,66,42,74,41,21,28,13,23,23,15,23,15,24,27,27,26,17,22,18,21,19,18,19,25,31,22,17,19,42,53,36,57,77,80,55,67,72,56,63,38,57,63,53,39,33,128,178,151,170,220,151,179,230,206,235,227,222,203,228,211,174,201,214,252,151,71,104,63,72,41,74,98,39,62,157,223,249,218,127,151,234,226,184,207,242,212,147,177,212,235,131,26,9,4,19,45,40,26,12,37,54,61,47,29,155,244,249,249,160,156,219,242,222,236,252,171,135,182,226,249,252,252,244,96,84,108,77,76,29,57,93,96,111,83,86,126,126,71,21,29,29,34,39,29,33,24,27,29,26,51,80,49,49,135,226,231,198,151,142,177,143,122,112,83,59,38,32,18,17,20,23,21,25,29,36,32,46,72,68,93,90,68,57,36,31,32,31,36,34,33,26,22,19,17,16,15,17,16,18,21,23,21,28,27,29,27,53,122,132,131,103,83,97,100,109,96,87,75,55,50,30,22,19,22,27,32,29,17,21,21,28,32,24,25,22,39,61,60,61,57,35,20,15,16,22,18,19,22,18,17,19,25,21,33,41,39,37,35,33,46,41,39,43,43,45,55,62,52,43,33,21,21,23,19,25,30,36,31,61,89,71,73,68,51,61,43,38,104,116,111,94,87,101,88,102,109,62,27,18,89,228,236,248,237,235,240,238,238,237,236,232,236,235,235,236,237,236,235,235,235,235,235,235,234,234,232,234,234,234,236,233,235,234,236,238,236,238,236,236,236,233,235,235,235,233,233,235,232,233,236,234,233,231,232,233,233,230,231,231,231,236,233,229,118,4,0,4,8,10,9,10,9,11,12,11,11,184,189,187,186,185,187,187,187,188,186,188,188,188,186,186,186,187,187,188,188,186,189,187,185,187,188,188,186,187,187,188,186,187,187,187,185,184,187,185,188,188,186,187,186,189,188,186,188,190,186,187,187,191,189,189,192,190,192,188,190,190,191,191,189,193,191,190,191,190,192,192,193,192,193,195,195,196,195,193,196,195,192,194,194,191,192,195,194,194,192,196,195,196,197,196,194,196,197,194,196,195,199,199,196,197,197,197,199,198,196,198,196,198,196,198,196,196,201,199,198,198,199,195,198,198,196,199,194,197,197,196,197,195,194,194,194,196,194,195,193,193,195,193,195,191,192,194,194,194,192,193,193,193,191,191,193,194,191,191,193,193,191,192,192,190,194,193,189,190,192,191,190,191,192,193,191,195,193,192,193,191,193,192,193,192,191,191,191,192,194,194,193,193,194,191,193,194,193,192,190,191,188,190,192,190,191,192,193,191,192,193,193,191,194,196,191,193,194,194,195,196,195,193,191,192,193,193,195,193,194,194,193,194,194,194,196,195,198,198,200,204,204,203,203,203,201,202,202,204,204,204,202,205,198,198,206,205,210,198,196,207,207,208,208,208,207,208,209,207,208,206,208,208,208,209,203,203,203,205,210,210,210,210,209,211,209,210,211,210,210,212,212,210,213,210,211,212,210,213,212,212,212,213,212,213,214,212,215,212,214,211,224,228,202,201,217,231,222,214,212,212,212,212,212,213,212,207,212,144,43,11,21,11,27,30,27,30,42,55,33,32,36,28,54,58,47,72,39,30,54,39,44,42,24,29,24,19,25,21,29,21,15,21,18,24,21,25,37,31,24,26,29,64,59,46,71,87,78,72,92,51,51,55,48,61,42,59,31,25,135,221,194,212,231,137,207,231,204,237,221,223,198,227,212,173,198,215,251,152,62,92,65,64,38,57,89,52,38,149,237,233,208,126,178,251,218,186,197,244,207,167,208,193,155,61,3,12,6,41,42,25,20,29,54,60,54,29,126,245,245,251,169,154,214,243,247,200,245,201,129,198,238,250,250,253,253,202,45,89,101,66,64,26,31,50,76,61,48,98,136,128,55,21,36,33,35,40,32,34,23,22,29,42,106,139,133,147,189,224,180,120,89,76,108,102,83,62,34,29,19,23,19,20,26,25,28,34,39,38,37,31,47,62,66,62,57,75,57,37,36,27,37,38,31,25,24,23,17,16,15,14,15,15,18,18,19,18,18,19,23,55,100,129,131,116,103,87,64,75,93,90,79,65,57,34,12,17,15,32,38,27,24,20,20,31,32,25,20,25,18,21,39,39,31,19,22,22,24,28,23,23,27,26,32,27,19,27,21,23,31,32,31,43,57,45,44,35,23,24,26,35,30,28,23,20,22,20,24,35,30,29,29,66,85,64,72,66,55,57,51,35,84,106,114,99,89,105,103,122,80,72,82,34,23,153,231,249,247,235,236,237,233,236,234,232,233,236,236,235,236,234,232,233,234,236,234,233,234,233,234,234,235,235,235,236,236,233,235,239,235,235,235,235,236,234,236,234,234,233,234,233,233,234,233,233,231,232,232,231,233,232,234,235,234,235,234,229,117,4,0,3,8,10,9,10,10,12,12,11,11,184,190,187,185,188,185,190,189,187,188,188,188,188,188,186,185,186,189,186,188,187,185,190,188,186,186,186,189,188,186,187,184,188,189,188,190,186,186,187,187,188,187,185,186,187,187,189,188,188,187,188,188,190,188,191,188,188,191,186,191,188,189,194,190,191,191,190,193,195,194,192,195,194,193,193,192,196,197,195,195,192,194,194,192,194,192,192,197,195,193,194,195,195,196,195,193,194,194,197,198,198,197,199,200,196,198,198,198,201,197,199,199,196,197,198,196,198,198,198,198,197,196,198,197,195,198,200,197,197,198,196,196,195,193,193,194,196,198,194,193,196,196,192,193,193,191,193,191,194,192,192,191,191,192,190,190,194,191,191,193,191,191,191,191,193,191,191,193,192,193,192,191,194,192,192,193,191,192,194,192,191,191,192,193,193,193,194,192,192,192,194,192,195,194,196,197,196,198,194,193,192,194,190,189,192,192,192,194,195,193,195,193,194,197,196,194,194,195,194,198,198,193,194,194,194,195,193,192,193,193,195,196,196,196,198,199,198,198,200,203,205,201,204,205,205,206,206,207,205,207,206,207,205,195,202,207,208,207,199,201,208,208,207,208,209,209,208,208,207,210,211,209,209,212,210,206,207,204,204,208,211,212,210,210,212,210,210,210,211,212,209,211,214,210,213,212,210,210,212,214,213,213,212,214,212,214,212,214,214,214,213,219,221,205,223,224,220,222,214,214,212,210,211,210,212,212,198,219,182,129,126,79,12,8,21,23,24,37,50,33,21,28,24,45,41,54,73,37,53,73,49,69,60,31,41,23,18,23,15,22,20,25,20,25,28,30,30,28,33,34,49,64,90,87,77,87,83,59,75,79,50,51,37,38,20,19,51,36,90,210,234,197,228,200,134,228,223,203,232,218,224,198,224,216,176,200,212,251,147,59,94,48,72,63,74,116,49,46,171,217,247,185,71,180,244,213,196,191,247,197,190,229,129,68,14,9,15,20,49,33,23,24,46,63,57,27,107,237,250,250,171,145,212,239,247,233,201,199,150,186,249,249,251,249,251,243,95,24,89,84,44,48,59,93,75,57,27,57,126,144,117,38,21,33,28,35,36,34,28,23,24,29,60,135,112,124,119,115,115,71,44,41,37,47,49,43,31,19,16,19,16,22,36,33,51,75,65,55,49,35,31,30,34,45,46,41,62,74,64,43,33,36,30,33,32,27,27,18,17,20,15,19,16,17,17,15,16,14,18,27,36,55,63,81,86,82,78,67,71,68,71,78,67,51,33,19,14,20,27,30,29,29,21,17,28,26,26,28,23,26,20,16,29,22,21,23,24,24,25,29,26,27,27,33,33,29,29,26,19,34,48,46,58,60,73,68,54,36,21,19,19,18,21,23,24,21,28,31,31,37,33,29,62,78,64,66,62,55,60,54,38,76,97,118,105,92,104,109,109,78,98,115,83,39,106,223,245,245,241,231,235,237,234,233,233,234,234,232,234,233,234,235,231,232,233,234,234,234,236,236,233,233,234,235,235,234,236,237,235,234,236,234,234,234,233,234,233,235,234,234,235,235,233,232,231,235,234,232,233,232,230,231,233,233,234,232,229,119,3,0,6,10,9,9,11,10,11,12,11,11,187,191,188,188,187,189,186,187,188,185,189,188,191,189,187,188,188,187,189,188,189,189,185,189,188,187,189,187,188,186,187,188,186,188,188,188,188,188,187,188,186,183,188,188,187,186,187,187,186,187,189,188,188,185,190,191,186,189,187,188,188,188,190,189,191,191,190,192,193,192,193,190,194,195,195,196,193,196,195,195,196,196,197,195,193,195,197,195,195,196,196,196,195,195,195,195,195,196,194,194,196,194,194,195,198,200,197,196,197,196,196,197,198,196,197,197,197,194,196,196,196,197,198,196,193,195,196,194,197,193,194,195,194,197,196,195,192,194,196,192,193,193,193,194,193,193,192,190,196,194,192,194,193,194,192,191,192,191,190,191,191,190,193,193,190,191,193,192,192,191,191,193,191,192,192,193,191,191,194,193,193,194,196,196,193,198,198,194,194,194,196,194,195,196,195,199,198,197,196,192,193,191,195,194,195,199,197,196,196,198,195,194,195,198,199,198,198,196,197,195,194,193,193,193,194,192,193,194,193,194,195,195,196,197,198,200,199,198,202,203,203,206,205,205,205,204,208,208,206,206,208,209,204,198,208,210,209,205,197,207,208,208,210,210,210,209,210,211,210,211,212,211,211,210,208,207,212,208,204,205,207,211,210,210,213,208,209,212,209,210,209,210,210,211,211,214,212,211,212,212,212,210,213,213,212,212,211,214,212,214,212,218,215,198,211,204,214,218,210,210,211,212,212,209,216,210,201,200,184,163,200,203,63,12,12,14,22,27,47,29,19,22,18,42,44,57,72,35,35,39,32,53,51,38,36,22,20,19,17,22,18,21,22,23,27,24,30,30,20,36,53,68,70,58,68,63,61,42,74,56,35,47,21,61,67,89,189,179,151,210,197,154,223,175,143,239,210,210,226,217,223,193,226,217,176,192,211,251,150,75,120,63,74,92,107,129,73,50,178,246,250,149,42,175,233,238,227,188,251,185,213,225,55,17,8,6,24,37,45,22,29,51,53,61,26,82,222,243,251,178,140,198,241,246,240,229,151,132,191,239,252,249,236,247,230,120,45,38,111,63,39,96,88,95,100,103,57,96,139,139,109,28,25,32,31,42,33,37,32,19,26,41,98,101,61,47,29,34,38,33,35,30,21,19,27,21,16,25,21,19,42,70,72,68,96,118,108,76,52,43,40,35,30,27,27,30,28,39,55,48,31,24,31,35,32,24,22,21,19,22,18,18,18,19,16,16,16,15,14,22,34,34,38,47,43,59,73,64,71,66,56,62,54,48,31,15,14,14,24,30,42,32,26,23,18,29,33,26,26,28,29,27,25,25,26,24,24,27,24,27,26,25,33,32,31,34,33,32,27,54,71,53,53,56,62,74,71,53,36,24,27,26,21,28,32,27,26,30,31,30,34,32,58,75,55,57,69,64,56,56,37,69,99,120,114,101,114,106,110,76,66,84,84,54,80,189,241,243,245,235,235,236,236,235,236,236,232,234,234,232,233,234,234,232,234,235,234,236,234,235,234,235,236,236,234,233,235,234,233,233,234,235,234,235,234,235,234,234,234,232,233,234,234,233,233,235,233,233,234,235,232,231,234,234,234,234,229,118,3,0,5,9,9,9,11,10,11,12,12,12,184,188,186,188,189,185,186,187,186,186,186,188,186,184,187,188,185,186,184,187,186,186,189,187,188,188,189,186,185,188,186,186,189,186,186,186,185,188,188,186,186,189,188,188,186,186,187,188,187,187,186,188,188,188,192,188,187,190,189,192,191,190,193,191,194,192,191,194,193,194,192,192,192,192,195,197,197,194,196,197,194,195,196,194,196,193,194,196,194,195,195,194,196,196,196,194,195,197,194,191,192,194,195,196,195,199,197,194,194,196,196,195,196,196,195,194,195,196,195,195,196,196,194,197,195,196,196,193,196,195,192,193,194,195,193,193,195,193,193,194,193,193,191,194,194,192,191,192,193,192,194,191,193,192,193,194,191,191,191,192,191,193,193,192,193,193,194,193,191,191,191,190,192,194,191,192,193,193,194,196,196,194,195,198,200,198,197,196,198,198,199,198,198,196,198,200,199,198,197,197,197,198,197,198,198,199,199,200,197,197,196,195,195,197,197,196,197,197,195,195,197,193,193,194,192,196,195,194,196,196,198,198,198,200,199,200,199,202,204,203,207,207,208,206,206,206,207,208,208,209,208,209,203,203,213,213,212,200,204,212,213,214,211,213,211,210,212,213,211,210,212,208,207,210,211,209,210,209,206,206,204,209,210,208,212,211,211,211,212,212,211,211,214,211,210,211,211,212,212,213,211,212,212,212,211,212,213,213,212,214,213,217,216,199,207,197,208,220,211,214,207,208,212,208,218,206,199,195,180,134,193,249,141,67,15,10,17,19,43,29,18,23,15,44,40,59,61,23,34,26,27,36,32,34,32,23,23,23,25,30,27,27,27,19,21,25,35,33,27,26,42,71,42,41,58,59,57,37,61,23,22,49,119,234,214,200,252,211,113,129,133,174,232,149,163,243,205,210,225,223,222,196,225,219,172,190,208,251,152,83,152,84,78,38,49,127,62,58,207,244,224,65,47,214,226,228,218,199,251,185,172,127,13,26,4,18,46,39,29,27,44,57,64,32,63,201,248,248,190,133,190,237,247,239,228,185,97,166,245,249,252,217,241,198,149,112,39,71,106,58,66,105,57,36,27,90,101,114,134,133,98,24,35,32,26,42,33,35,32,25,24,90,111,61,37,23,24,16,17,23,19,22,18,18,22,23,19,21,25,38,68,83,103,88,86,104,106,103,78,66,59,47,39,32,26,29,32,28,26,29,24,28,35,39,34,29,29,24,29,24,17,21,16,20,21,16,19,16,12,19,30,35,37,31,30,38,53,63,66,65,64,55,40,40,29,13,14,15,23,38,49,48,31,24,24,27,29,25,25,24,26,27,28,28,28,29,24,31,32,24,31,28,36,39,38,48,44,37,36,57,73,62,46,39,44,46,57,70,65,61,52,47,43,28,29,28,32,30,28,27,33,28,49,76,55,59,73,60,57,62,37,67,98,114,112,110,113,100,98,92,94,83,67,50,64,170,241,240,241,238,238,237,234,237,234,233,235,231,234,234,235,237,235,236,235,233,235,234,233,232,236,236,233,233,233,234,233,235,236,235,234,233,235,235,234,234,234,234,234,233,233,234,233,234,233,234,234,234,235,233,234,232,233,233,235,235,227,117,4,0,4,8,10,9,10,10,12,12,12,12,186,191,186,188,187,185,187,187,188,185,187,187,187,188,186,188,185,185,189,187,187,188,187,188,188,185,189,186,186,190,184,190,187,188,188,184,189,187,189,190,190,189,187,188,188,187,187,187,190,189,190,189,187,188,190,191,188,191,191,188,192,193,194,194,192,193,193,194,195,194,194,192,196,194,194,195,196,196,194,194,196,198,198,197,195,196,198,196,195,195,194,194,197,197,196,195,195,194,192,195,195,194,197,196,196,196,195,194,195,195,194,195,193,195,196,193,196,196,198,195,195,197,194,198,196,196,196,194,197,193,194,195,196,195,193,196,194,194,194,192,195,194,195,193,190,193,192,193,196,194,193,192,194,194,195,193,193,194,194,195,193,193,193,195,193,194,194,193,194,192,195,193,196,194,191,196,196,195,194,194,196,197,197,198,196,198,200,198,200,201,200,198,199,201,200,200,205,202,202,203,201,201,200,201,200,199,201,200,200,200,199,199,200,199,200,197,195,194,196,196,193,195,196,194,198,195,196,198,198,201,201,202,203,202,203,200,203,206,206,206,207,207,208,208,211,210,210,210,212,211,212,212,202,205,213,211,208,204,208,213,213,217,213,212,214,211,213,212,212,212,210,211,210,210,212,211,212,211,214,209,205,206,210,212,212,211,211,213,210,210,213,214,212,213,212,213,211,212,213,213,214,213,216,212,214,214,214,216,212,214,211,217,216,198,209,198,209,220,212,217,212,211,211,211,220,203,198,192,217,163,187,226,132,125,31,38,80,37,40,31,47,47,55,65,35,53,29,7,39,33,31,34,31,29,33,30,27,31,36,36,28,34,30,31,20,27,40,29,27,21,44,70,29,51,36,36,45,18,24,24,117,162,208,252,240,208,240,136,72,118,155,220,232,136,192,241,207,214,215,224,221,197,227,219,174,186,204,251,148,79,148,87,80,31,12,80,49,59,220,248,136,18,94,236,128,153,208,196,249,151,114,44,1,27,6,38,47,31,22,37,51,66,48,46,182,246,251,218,147,189,236,247,234,223,191,115,126,228,251,250,222,224,181,134,217,128,50,48,92,61,79,124,78,87,32,65,84,103,133,131,81,13,34,40,29,42,32,33,36,19,27,83,80,30,24,15,14,19,18,19,22,23,24,24,28,25,19,28,27,44,61,61,79,71,80,89,105,129,106,77,67,56,54,48,31,30,27,31,33,24,28,32,34,35,35,33,26,27,33,27,21,23,18,19,20,18,15,16,17,23,31,36,37,39,35,35,33,43,72,69,60,55,39,33,28,15,16,12,30,55,59,53,39,26,22,26,30,27,21,21,22,29,27,25,29,25,27,23,30,32,27,32,40,88,114,108,98,83,55,48,62,57,62,48,30,35,49,73,87,85,81,76,66,61,51,47,49,41,30,22,30,26,49,72,62,63,75,66,55,63,39,65,101,110,113,97,121,89,49,66,77,82,49,37,39,125,240,238,241,241,237,238,233,238,236,234,235,236,238,236,237,237,238,235,236,235,233,237,236,234,236,236,231,235,235,236,237,233,234,235,234,237,235,233,234,235,235,233,235,233,234,233,233,234,232,233,230,232,232,231,234,235,236,234,234,233,229,118,4,0,4,9,10,9,10,10,12,12,12,11,184,189,189,188,189,185,186,188,187,189,187,187,188,186,188,187,188,186,185,188,185,188,186,187,189,186,188,186,187,189,189,190,191,185,189,191,188,189,188,189,188,188,187,189,190,187,190,188,186,188,189,190,188,190,190,189,190,190,191,190,193,194,194,193,191,193,195,194,194,196,193,196,197,196,196,196,194,195,195,193,194,195,195,196,199,198,196,198,196,194,196,195,198,195,195,194,195,195,194,193,193,192,194,194,191,195,193,195,195,194,195,194,194,191,192,196,196,193,195,194,194,195,196,195,193,197,196,194,197,195,194,196,192,194,193,192,195,196,193,192,192,193,193,192,193,194,192,193,197,191,192,191,194,195,191,196,194,194,194,194,196,197,196,196,196,194,194,193,193,193,194,194,191,192,193,194,192,192,195,194,195,194,196,195,195,198,198,199,200,200,200,201,202,201,201,203,202,203,200,200,203,202,203,203,203,204,202,201,201,203,202,202,203,201,201,200,199,198,196,193,194,195,196,200,198,197,198,198,201,203,205,205,204,204,203,205,207,206,208,208,207,210,210,212,212,211,213,213,214,214,217,210,200,212,214,214,205,204,214,213,214,213,213,212,211,212,214,212,209,214,211,209,213,213,212,211,211,212,210,210,208,208,206,207,214,214,211,211,212,212,212,211,214,214,212,214,213,213,213,213,214,214,216,217,215,216,213,211,211,214,214,214,215,198,208,200,207,220,212,217,212,214,215,212,220,199,195,192,231,196,199,189,99,104,51,156,220,185,214,226,246,247,241,235,198,186,168,164,206,156,57,16,43,50,48,35,27,32,33,45,32,33,33,24,31,37,66,77,87,35,42,65,22,51,22,46,46,66,160,186,247,214,224,251,139,129,166,109,138,197,194,248,207,133,218,236,209,210,217,224,216,198,224,220,174,187,205,251,150,63,130,84,72,29,30,79,53,53,213,236,106,34,97,124,53,162,218,208,220,108,61,12,7,26,22,48,35,21,31,48,63,50,34,148,249,249,248,184,203,235,250,236,202,182,122,147,189,245,251,218,223,170,125,194,253,144,29,47,89,68,75,143,148,148,63,63,81,99,144,126,62,13,33,39,35,44,30,34,31,18,19,33,37,23,20,16,21,20,24,27,27,29,25,32,34,25,31,36,44,92,81,92,99,76,79,90,115,99,49,55,58,75,71,41,29,32,27,28,27,27,34,33,38,37,39,31,25,32,33,27,21,25,25,29,26,19,23,17,20,24,31,32,37,38,34,42,39,42,46,65,68,49,45,35,26,18,20,27,28,31,25,29,37,26,19,28,28,24,21,26,26,29,29,26,28,28,21,25,33,27,34,26,71,158,192,191,184,153,134,100,49,51,67,55,27,21,44,55,65,68,66,68,77,98,112,107,89,55,30,25,24,24,50,79,63,62,72,71,59,63,41,50,90,106,118,101,112,102,56,33,50,85,81,64,23,65,222,235,244,241,233,239,236,236,236,236,236,235,236,235,235,238,234,234,234,233,237,234,235,234,235,236,235,236,234,236,235,236,234,233,235,234,233,234,235,235,235,236,234,235,232,233,232,233,231,231,232,230,233,235,231,232,233,234,235,233,229,118,3,0,4,9,9,9,11,9,11,12,11,12,182,189,185,187,188,184,187,185,187,186,188,186,182,185,185,185,188,186,188,185,186,188,185,188,189,186,189,186,188,190,188,189,189,189,190,190,192,190,189,189,192,188,188,190,189,190,192,191,192,189,190,191,187,189,193,190,192,193,194,193,191,194,194,190,193,195,192,194,194,195,196,195,197,195,195,193,195,197,194,196,196,195,194,196,198,198,197,197,197,195,196,195,196,196,195,195,198,196,194,195,194,193,194,195,193,193,194,194,191,193,195,194,194,192,196,194,193,193,194,192,193,193,195,198,195,195,195,195,196,197,197,195,194,195,195,196,194,196,194,191,195,192,193,194,193,195,194,194,195,195,194,194,194,194,194,194,196,195,195,195,196,196,195,198,196,196,196,198,197,194,198,195,196,193,191,196,193,194,195,196,196,196,198,197,196,200,202,201,202,204,204,204,204,203,204,202,205,203,202,206,205,206,204,204,205,204,204,203,202,206,206,202,204,203,204,201,199,202,199,198,196,198,199,198,203,199,199,202,201,204,204,208,207,204,207,205,206,205,206,211,209,207,211,209,212,213,214,214,214,213,217,207,207,216,217,211,202,212,215,213,214,214,212,211,212,211,213,214,212,214,214,215,213,213,216,213,214,211,212,212,212,211,208,209,210,214,214,213,214,211,212,215,214,215,214,215,214,216,216,214,216,215,215,215,215,214,215,216,211,216,214,216,217,198,208,199,207,220,212,217,214,214,216,216,216,199,190,190,234,195,191,172,104,107,50,178,250,250,253,253,252,252,252,252,251,249,252,249,248,194,49,53,128,113,67,39,24,31,33,42,33,33,33,35,32,90,208,251,243,105,115,188,132,165,168,170,169,217,252,252,252,214,226,179,83,139,178,147,182,189,181,250,173,146,231,225,214,206,218,227,214,195,222,218,178,187,205,251,150,53,107,79,67,24,31,97,61,41,190,209,110,61,9,7,69,226,251,212,176,42,24,17,5,33,39,39,21,24,50,59,56,35,122,239,252,252,200,213,247,251,252,222,173,119,155,215,221,249,236,228,174,117,184,235,249,138,19,25,85,69,26,77,119,102,40,78,90,110,139,119,50,16,45,34,35,39,28,35,29,15,24,19,16,29,24,22,24,24,29,29,35,37,32,30,37,36,39,36,47,97,91,110,117,97,128,117,89,63,48,55,64,63,61,41,24,28,27,29,27,27,34,31,33,42,33,29,26,31,30,26,23,27,28,27,30,27,22,23,18,24,33,34,39,37,38,42,46,50,51,57,59,57,48,38,34,41,47,35,29,22,17,24,20,21,23,23,26,27,23,24,30,31,31,24,27,27,29,27,30,31,34,33,75,155,173,176,171,165,165,158,95,60,98,59,30,21,23,31,33,40,39,45,49,59,93,112,95,58,31,22,24,22,50,83,69,63,80,76,51,63,45,45,90,108,118,99,117,99,61,94,103,120,113,109,60,34,193,236,243,246,232,240,233,234,236,233,235,234,234,234,234,237,235,231,234,236,234,235,236,235,235,236,236,236,234,234,236,234,235,235,233,235,234,234,237,235,234,235,235,233,232,234,238,238,235,236,233,235,236,235,234,232,234,235,235,234,230,119,3,0,5,8,9,9,11,10,11,12,11,11,184,188,184,185,187,185,182,187,185,188,186,185,188,184,186,184,187,186,187,186,185,187,188,186,188,187,189,188,188,189,188,190,189,188,190,189,187,190,189,190,191,190,190,190,192,191,189,189,190,189,190,192,192,192,193,193,193,193,193,194,197,193,191,196,195,193,195,194,195,194,193,195,194,193,196,192,193,196,196,197,195,196,196,197,196,195,196,196,198,198,193,196,196,196,198,194,194,195,195,194,196,195,194,194,193,195,193,196,193,194,195,190,192,193,193,194,193,193,192,193,193,193,196,194,195,196,195,194,197,197,195,198,195,196,196,196,196,196,196,193,194,194,193,194,194,195,195,196,198,194,196,193,194,194,196,198,192,196,197,192,196,194,195,195,196,198,196,198,199,198,198,198,196,196,197,197,198,198,199,199,198,200,199,198,200,203,203,204,206,203,204,205,206,205,204,204,203,204,205,206,208,205,203,205,203,204,205,204,205,205,206,206,206,205,205,205,203,203,202,203,202,201,202,201,202,203,204,204,203,206,206,207,209,208,208,205,208,208,208,210,208,212,212,213,214,214,214,213,214,213,214,203,208,215,214,208,201,214,215,213,215,212,214,212,214,213,213,214,212,214,213,214,214,213,212,214,214,214,215,213,214,216,215,209,207,211,215,217,217,216,216,216,215,217,218,215,216,219,216,217,218,216,217,216,216,217,216,216,215,216,216,215,220,200,206,201,204,221,213,220,213,213,212,212,217,196,189,193,239,207,204,177,113,113,57,176,248,235,249,241,250,250,240,225,195,210,205,214,227,99,42,75,145,152,81,39,24,30,37,44,42,35,34,38,33,29,158,238,243,145,87,186,207,250,250,240,226,251,252,230,198,145,221,205,143,191,188,122,136,147,186,250,146,168,236,222,216,201,223,224,214,193,219,222,180,190,205,251,154,53,95,79,79,33,36,75,63,40,192,209,89,54,6,4,134,245,252,214,96,11,19,6,18,38,37,29,16,44,55,59,30,97,226,245,250,201,201,240,252,252,253,216,131,144,217,241,237,246,247,178,123,173,231,247,250,153,41,43,71,81,45,29,39,42,53,107,97,108,141,104,38,21,43,33,41,39,26,43,28,19,21,20,22,18,30,26,23,29,34,37,39,41,33,30,29,33,41,34,48,62,69,55,67,90,102,81,65,42,50,80,61,57,56,45,37,22,23,30,27,28,34,34,31,36,36,34,26,33,30,31,29,24,28,28,36,28,28,29,18,24,32,35,37,41,44,48,55,54,62,56,54,66,55,40,32,43,49,32,19,18,15,18,17,21,20,22,25,30,21,25,27,34,27,24,30,27,26,33,33,45,65,81,127,156,151,141,139,139,152,147,98,75,97,63,27,29,21,19,20,20,19,24,18,29,51,82,90,63,40,19,24,24,51,80,66,73,79,74,54,63,48,38,88,98,113,102,112,113,92,127,110,79,48,86,90,39,173,237,234,246,236,237,233,233,235,232,233,233,233,233,236,234,234,234,232,233,235,235,235,233,234,233,233,234,235,236,232,233,233,235,235,234,235,234,232,235,235,233,232,234,233,236,239,241,241,240,240,240,239,238,236,236,238,237,238,234,229,117,4,0,4,9,10,9,10,10,11,12,12,11,184,188,185,186,187,184,186,186,185,186,187,189,190,186,188,185,185,188,191,189,187,191,187,188,189,187,189,187,189,191,191,191,190,188,189,188,187,189,188,190,192,191,192,190,190,192,189,191,192,190,192,194,193,195,196,188,193,195,192,195,194,195,196,196,197,196,195,197,195,195,196,196,194,194,198,192,195,196,194,198,198,198,196,197,196,195,199,198,199,196,195,198,195,198,195,194,194,193,193,194,196,195,195,194,193,193,193,193,194,195,192,193,193,191,195,191,193,194,193,196,196,196,193,193,196,194,196,195,197,196,196,197,195,198,198,198,196,197,195,192,196,193,196,196,196,194,194,196,195,194,194,195,193,197,198,195,194,196,198,195,194,197,196,196,195,198,198,195,198,198,200,197,199,199,199,201,198,200,200,200,200,200,202,201,202,204,202,200,202,204,206,206,205,207,207,206,206,206,208,208,207,207,207,207,208,205,208,208,208,209,209,206,209,208,209,207,206,208,205,206,204,204,206,203,206,202,202,206,205,207,206,210,210,208,210,211,214,212,212,211,212,213,214,214,214,215,214,214,215,217,211,204,211,215,212,203,206,216,215,215,214,214,214,214,214,211,212,214,212,214,213,214,214,212,217,213,217,218,214,218,215,216,219,214,208,207,213,217,218,219,217,217,218,217,218,217,218,218,216,218,218,218,219,218,220,217,218,215,214,216,216,215,219,201,206,201,201,220,212,214,214,213,212,215,212,196,184,205,243,220,213,130,62,67,29,115,183,206,218,201,214,207,204,179,143,157,150,179,196,92,65,94,158,160,109,50,16,22,38,48,38,40,36,34,39,35,113,208,231,122,11,59,179,243,249,219,184,220,174,183,158,141,217,210,197,205,162,108,151,169,229,243,134,198,234,226,220,199,224,222,215,195,220,219,179,185,205,251,155,47,73,84,87,57,55,84,69,61,207,171,63,48,4,42,206,248,253,145,33,10,10,11,35,46,25,15,34,52,64,26,77,213,249,249,204,200,235,250,251,248,252,170,157,216,236,245,215,246,193,119,182,223,249,249,252,202,100,30,16,62,71,66,56,72,101,89,103,123,121,84,29,24,34,32,44,36,28,38,29,22,25,21,27,29,27,33,28,33,36,35,42,36,35,33,27,35,39,39,49,77,57,70,101,93,78,64,54,44,64,56,46,59,51,50,37,19,22,28,31,30,28,34,39,34,38,36,30,33,29,29,32,26,23,35,34,35,30,27,29,21,34,37,41,52,41,49,56,56,60,63,53,50,57,44,32,30,33,22,13,16,18,15,19,24,18,29,25,27,26,23,29,29,31,30,28,24,31,43,88,131,149,156,165,151,130,126,127,127,137,134,80,55,74,49,33,28,24,24,24,20,21,19,18,27,33,64,87,75,43,23,24,16,43,78,66,75,76,72,52,59,52,26,83,107,105,97,102,105,107,119,53,64,76,85,107,36,159,236,226,248,238,239,234,235,237,231,236,233,233,233,233,233,233,234,236,233,234,233,231,234,235,236,233,233,235,234,234,233,235,235,233,234,233,232,233,233,234,232,233,236,238,241,243,244,243,245,244,243,243,242,244,244,245,241,238,234,227,117,4,0,4,8,10,9,10,10,11,12,11,11,183,190,186,185,186,185,184,190,186,186,186,184,187,183,184,184,185,185,185,184,187,185,187,189,189,188,187,189,190,187,190,191,191,187,188,189,188,191,188,190,187,189,192,189,189,189,192,190,192,191,193,193,191,192,191,190,191,193,194,194,195,194,193,196,193,196,193,194,196,194,196,193,194,196,196,192,196,197,194,196,195,195,196,195,195,195,198,194,195,196,196,196,194,193,191,193,195,193,191,193,193,193,195,194,193,191,191,194,190,192,192,191,196,192,192,194,192,194,196,196,192,192,196,194,194,194,194,195,193,196,194,196,195,195,193,193,194,194,196,192,196,196,196,197,195,196,195,191,196,194,194,195,195,193,192,196,193,196,197,193,197,195,197,199,198,198,198,200,196,197,199,198,200,199,201,202,200,200,198,200,199,200,204,201,201,201,201,200,201,202,205,206,205,206,205,205,208,208,206,206,207,208,206,206,206,208,208,207,210,208,205,207,208,208,206,206,208,207,206,208,208,206,207,207,206,203,205,206,204,209,209,208,211,209,211,211,212,212,214,214,213,215,216,214,214,212,212,213,212,214,205,203,213,215,208,202,211,217,214,212,213,214,214,212,212,212,214,215,213,214,213,213,214,214,215,216,215,214,214,213,217,216,214,217,214,208,206,212,217,218,217,218,217,218,219,218,220,218,217,218,220,219,219,217,218,219,217,217,216,214,216,212,219,202,202,202,199,220,212,215,212,211,214,217,210,191,192,209,232,159,111,76,25,38,19,47,69,135,187,178,190,183,188,170,141,175,185,234,210,106,120,108,173,186,126,66,11,24,39,39,43,34,41,33,62,140,200,241,219,150,57,9,129,213,213,146,129,183,164,215,197,176,214,198,158,148,162,173,205,203,251,216,130,224,232,231,215,198,223,217,217,195,222,219,174,183,203,251,158,48,71,65,63,54,56,78,89,62,131,94,34,47,5,93,249,249,198,68,12,7,6,27,47,32,12,30,47,66,37,56,189,243,251,190,196,238,251,252,249,251,191,159,228,235,237,228,215,195,135,171,235,245,249,249,253,219,84,35,7,34,79,111,71,107,61,59,112,125,122,64,19,25,31,33,45,38,29,36,32,25,24,27,27,30,31,29,32,32,35,37,40,34,37,37,31,30,45,29,41,94,72,107,125,105,70,66,86,74,84,54,30,49,51,40,27,18,27,27,23,34,29,29,34,34,33,27,33,32,28,32,25,26,30,27,33,32,33,34,22,30,44,48,62,61,45,50,57,51,53,54,49,35,33,39,29,23,24,19,19,16,15,27,23,27,24,23,28,25,25,25,29,31,27,27,26,29,36,104,152,157,160,152,145,132,124,113,116,121,133,122,61,36,44,30,21,29,27,25,27,27,27,19,19,22,21,42,74,71,44,22,27,19,37,81,66,66,68,68,54,56,54,25,77,105,107,92,95,103,100,105,54,110,149,159,125,21,144,232,235,249,240,242,236,233,238,236,238,237,236,235,235,234,236,238,237,238,233,231,236,237,240,237,236,236,235,236,238,238,234,233,233,231,233,234,233,234,233,233,235,239,241,244,245,244,244,247,245,244,244,246,243,244,243,240,236,232,229,118,3,0,4,8,9,9,10,9,11,12,11,11,184,188,187,186,186,184,185,186,186,190,188,184,185,183,186,186,187,185,185,187,188,190,189,188,193,189,190,191,190,193,189,190,191,188,191,188,189,190,189,194,190,190,193,190,190,191,192,192,190,193,193,194,194,191,197,195,195,195,194,195,196,195,194,196,193,196,194,192,195,194,196,195,193,196,196,193,197,195,195,195,196,196,192,197,196,195,196,196,199,194,195,196,194,196,194,193,196,194,194,195,194,194,194,195,192,192,193,191,194,192,192,193,193,195,196,193,193,194,193,194,191,191,191,195,198,196,195,195,197,195,195,196,193,198,193,192,193,194,196,192,196,196,196,196,195,194,194,193,195,196,194,194,194,195,197,196,195,195,194,196,198,196,198,197,196,200,199,199,201,199,200,200,201,201,200,202,200,201,202,201,200,201,204,203,202,201,203,206,206,204,206,207,205,210,207,207,205,205,211,206,207,207,206,208,208,208,208,207,208,208,208,207,209,207,208,211,210,210,210,210,212,211,211,210,213,212,207,210,210,214,212,215,214,211,214,214,213,216,217,216,217,214,214,215,216,213,211,213,214,213,202,206,216,214,206,206,213,214,216,215,212,214,212,214,215,211,214,212,214,216,213,214,212,215,216,212,215,214,212,212,214,214,215,214,216,218,210,209,213,217,217,220,219,218,217,217,221,219,219,218,219,219,220,220,219,218,220,218,216,214,213,212,222,205,203,201,199,218,214,214,214,211,215,218,207,192,185,211,186,77,26,9,13,27,32,63,29,58,129,161,200,196,216,178,150,216,229,251,178,102,132,106,188,189,145,84,7,26,35,39,44,33,44,26,78,173,219,249,232,207,137,26,93,138,164,123,137,186,182,223,203,202,164,160,142,164,188,190,214,208,251,182,150,241,223,235,208,200,227,219,221,197,225,218,175,181,201,251,155,51,95,73,47,60,58,70,91,48,48,36,78,59,9,146,250,234,95,19,14,4,23,44,36,18,23,47,63,47,45,165,246,246,178,167,232,251,252,243,252,191,155,211,242,239,216,236,160,100,158,229,249,249,247,247,239,159,156,116,58,59,55,70,64,57,38,62,131,130,112,56,16,29,33,35,39,35,34,37,23,25,27,24,27,28,32,39,34,28,36,33,37,37,34,27,30,36,33,36,75,97,92,91,67,69,50,56,66,37,54,43,39,59,36,27,22,17,23,23,33,34,28,33,30,36,35,31,33,29,29,28,25,24,31,33,32,37,34,31,35,64,90,84,88,71,51,53,48,43,40,39,33,30,21,28,32,22,19,16,17,23,39,34,25,20,21,25,24,24,19,28,29,29,33,28,28,24,75,143,153,128,118,115,116,115,112,104,111,122,133,115,57,27,20,16,24,29,30,31,26,31,22,26,28,21,24,36,61,52,35,23,20,26,39,77,71,62,61,60,57,46,55,29,65,107,104,100,95,98,101,104,49,80,125,125,91,13,124,227,240,250,242,243,240,240,243,240,241,242,243,243,242,242,243,243,243,244,243,243,245,245,244,241,239,237,238,241,242,241,239,236,237,239,238,238,239,238,235,237,240,241,243,244,246,245,245,247,247,245,246,247,244,242,241,241,242,244,234,116,3,1,6,9,9,9,10,10,11,12,11,11,182,186,185,183,186,184,184,186,184,184,186,186,188,186,186,187,186,188,188,188,190,189,192,190,188,193,191,190,193,190,189,190,190,191,191,191,189,191,191,190,193,192,191,190,191,193,194,192,192,193,194,193,193,194,196,195,195,193,193,194,191,193,195,197,192,193,195,193,196,193,194,195,191,194,194,193,195,197,193,194,194,194,196,194,193,193,195,197,198,196,194,195,195,195,196,196,195,191,194,195,192,191,193,190,192,194,194,195,193,195,192,193,196,194,192,194,196,191,193,192,191,193,191,193,191,190,194,198,195,194,195,194,194,193,194,192,191,192,193,193,194,193,193,193,193,195,198,194,196,195,193,196,195,193,196,194,192,196,197,196,198,199,199,198,199,201,200,200,198,202,199,197,204,201,203,202,200,203,200,201,203,201,201,200,202,203,203,206,207,209,208,205,208,208,208,208,208,206,205,208,210,211,209,208,210,208,208,207,209,207,207,208,207,209,212,212,214,213,213,213,211,214,214,217,216,212,214,214,214,217,218,217,216,215,217,217,215,217,219,218,215,215,214,212,215,213,210,212,214,212,203,212,218,207,203,208,215,214,213,214,214,214,210,211,211,211,213,215,214,213,215,214,214,212,214,213,212,214,215,214,214,215,213,214,217,216,219,212,209,213,217,222,218,220,217,218,219,221,220,217,218,218,222,220,220,220,217,217,219,214,216,213,221,206,199,206,198,217,214,214,216,214,216,218,208,189,193,192,150,77,16,10,33,60,71,87,42,70,163,190,221,217,224,166,157,222,240,225,124,100,139,98,178,182,113,81,14,24,34,40,39,31,48,19,79,153,194,249,191,215,170,37,64,162,221,169,192,188,168,162,160,128,142,192,160,190,179,176,175,214,251,151,169,241,220,240,207,204,224,218,222,193,222,221,174,179,204,251,159,48,108,96,69,75,61,63,96,60,60,114,195,103,26,180,248,133,36,17,6,16,40,42,22,23,43,57,57,38,135,240,249,199,147,214,252,252,236,232,188,149,205,234,249,214,219,217,71,84,201,244,247,246,246,229,93,85,135,147,85,36,37,46,47,51,36,100,143,137,122,45,15,28,29,32,47,35,31,39,24,27,29,21,26,29,31,35,37,31,35,33,29,34,34,32,29,35,31,38,56,74,84,67,48,59,41,27,41,35,31,26,42,43,23,22,20,19,19,24,29,28,29,29,32,29,33,32,32,32,30,28,27,29,24,37,35,35,42,37,50,126,163,133,98,111,75,52,45,35,36,36,34,27,27,26,33,29,25,32,31,51,57,38,23,19,24,21,21,28,27,24,26,31,29,23,28,29,95,150,126,119,114,110,107,102,109,109,116,117,130,106,54,26,21,24,22,29,26,30,32,29,37,34,25,21,24,24,32,31,24,26,24,24,36,80,68,60,60,66,61,48,59,30,70,118,114,101,97,101,92,102,72,51,47,51,42,7,141,241,247,249,244,242,243,244,244,242,244,246,246,247,247,247,248,249,248,248,248,249,251,248,249,247,245,246,246,247,245,246,244,244,244,246,247,243,243,243,240,242,245,246,245,245,247,245,247,247,246,247,247,247,244,244,243,245,249,250,237,116,4,1,4,8,10,9,9,10,11,12,12,12,181,186,186,184,189,185,183,183,185,187,184,184,185,185,184,185,187,186,188,188,188,190,190,191,192,189,193,193,191,192,190,191,191,191,192,189,191,190,190,191,189,192,192,191,191,189,193,194,193,195,191,195,194,190,195,191,193,195,192,193,196,193,193,196,195,195,191,195,196,194,194,195,195,193,197,198,198,196,195,195,194,194,191,194,194,192,195,194,195,195,196,198,195,193,196,192,192,195,191,192,191,191,191,193,192,191,191,193,193,192,193,190,193,193,189,190,193,194,192,192,192,192,194,194,194,191,191,194,191,193,191,191,194,194,191,192,193,193,192,193,191,192,195,194,195,194,194,194,194,196,195,198,197,194,196,195,193,200,200,198,198,196,200,199,201,203,201,202,201,200,202,200,201,201,201,202,203,203,201,201,202,203,201,200,205,204,205,208,207,204,208,210,206,208,206,208,208,205,209,207,211,211,211,211,207,208,209,210,208,208,208,207,212,209,211,215,211,214,216,216,216,215,216,215,215,215,215,215,216,217,215,217,216,216,217,216,216,215,215,215,215,214,214,213,213,214,210,210,216,208,205,214,214,207,203,211,214,212,213,213,213,214,211,212,212,212,214,214,214,213,213,212,212,213,213,212,214,215,214,214,213,212,217,216,214,216,217,219,212,209,211,217,222,223,220,219,219,217,221,219,220,218,218,221,219,219,220,216,217,215,220,215,221,207,200,204,196,217,217,214,219,215,218,224,211,199,211,232,204,128,97,89,96,100,79,87,56,120,216,188,207,215,232,185,168,223,230,171,97,103,151,106,174,174,52,60,31,21,38,39,45,32,46,15,72,145,190,232,165,210,199,58,60,206,249,211,220,194,146,129,164,136,150,189,155,139,106,145,189,247,249,143,200,241,221,243,206,205,221,218,221,194,227,222,174,182,206,251,158,53,100,93,74,56,46,50,96,87,128,185,237,135,89,201,143,54,21,15,13,33,42,28,26,37,54,62,35,105,226,246,215,149,194,245,252,250,225,175,146,201,239,252,239,203,183,121,87,179,237,247,246,245,227,78,14,49,112,132,78,24,27,45,50,51,41,115,144,136,113,34,19,36,30,33,45,31,29,39,27,28,25,25,28,26,30,40,35,33,32,29,38,36,34,31,26,29,34,33,44,59,65,42,53,77,32,42,60,30,34,28,30,32,14,27,24,18,22,27,34,25,28,30,29,37,33,35,39,32,33,30,27,25,26,34,37,37,42,39,63,129,156,125,98,92,76,61,45,29,33,33,29,24,28,25,25,32,44,47,53,53,40,25,21,26,19,22,23,19,27,22,27,28,27,27,30,22,96,144,122,117,109,106,110,113,111,113,113,108,116,94,53,28,22,24,28,32,33,31,26,30,36,33,32,29,26,28,29,26,26,21,24,25,33,77,71,61,66,72,69,47,62,34,63,125,115,111,96,98,86,85,92,84,57,39,20,89,249,249,249,249,244,248,242,244,244,243,247,249,247,248,249,249,250,248,250,249,249,249,247,248,250,250,249,247,248,248,247,247,247,246,247,247,247,245,244,244,244,244,245,245,248,248,247,247,248,246,247,247,245,245,244,245,248,250,251,250,236,116,4,1,4,8,9,9,10,9,11,12,11,11,183,187,185,184,185,184,185,184,184,187,186,184,186,183,185,188,188,188,188,187,191,194,191,194,196,194,194,191,193,192,191,194,192,191,190,189,190,192,192,192,193,191,193,194,191,191,193,192,190,190,192,191,190,190,194,195,192,194,192,194,193,193,193,196,193,193,195,194,199,196,194,195,196,198,197,199,196,197,194,195,196,197,198,194,195,196,194,195,196,194,195,196,198,196,193,197,194,188,193,193,191,194,192,190,193,191,193,191,191,192,193,192,192,192,192,190,192,191,196,195,191,194,193,192,193,193,192,192,193,193,192,190,191,192,193,192,195,195,192,194,193,194,193,193,194,194,196,190,195,197,195,200,198,198,201,197,199,203,200,199,200,200,200,200,200,200,200,200,200,201,201,200,203,200,201,204,200,202,202,203,202,202,204,204,206,204,207,208,206,208,209,209,209,209,209,207,208,209,208,210,210,208,209,211,210,210,210,209,211,210,211,211,212,214,213,214,214,214,212,216,215,214,215,215,214,214,217,215,215,214,214,214,216,214,212,215,215,214,214,213,214,214,214,213,212,213,210,213,213,205,205,215,211,203,210,211,212,213,210,212,213,214,213,212,213,214,214,214,213,212,214,213,213,212,213,214,212,214,212,210,213,215,213,215,217,215,217,215,220,216,211,212,220,230,233,234,225,221,221,219,219,218,220,217,220,218,219,220,217,216,216,214,222,208,198,208,196,214,220,219,222,224,229,231,210,197,209,223,225,186,187,169,132,119,78,134,129,120,221,208,213,234,244,194,179,220,208,128,96,115,153,113,181,191,24,29,20,37,71,62,55,31,40,11,110,176,198,223,186,227,205,108,77,190,207,170,169,170,179,194,230,178,205,166,95,109,98,147,182,244,210,134,227,245,234,244,205,211,223,224,222,192,226,222,173,178,205,251,153,48,97,71,50,61,40,49,63,86,135,163,243,193,183,163,37,18,22,11,27,42,29,22,36,50,66,34,84,207,249,229,148,181,232,249,252,239,186,149,205,242,252,247,234,170,74,125,152,230,249,249,244,220,87,21,37,89,100,117,136,71,43,44,52,49,70,141,122,129,95,23,23,30,33,36,48,35,31,36,24,25,22,26,34,26,29,37,35,35,32,33,35,34,29,31,28,27,32,29,45,43,35,24,44,56,42,68,57,37,32,28,38,23,17,31,23,27,24,24,31,29,34,33,29,38,33,31,38,32,33,26,27,28,31,39,33,34,32,37,59,121,120,87,96,82,69,59,34,28,32,29,23,22,25,25,25,34,45,34,39,43,29,24,27,23,19,23,24,22,23,30,27,30,31,22,33,26,91,139,121,122,101,99,89,71,76,86,101,105,111,81,53,29,19,30,26,29,33,26,32,30,33,38,32,32,29,28,30,29,33,29,26,29,35,81,72,65,69,66,70,49,63,40,50,117,115,107,105,107,78,49,75,105,112,99,173,246,252,252,251,248,250,249,247,242,244,247,248,248,248,249,248,248,249,248,248,251,251,250,249,247,249,249,249,249,249,248,246,250,248,248,248,247,247,246,248,246,244,244,245,247,247,246,248,248,247,245,246,245,244,244,245,247,249,251,250,249,236,116,3,1,4,8,10,10,10,10,11,12,11,11,183,189,187,184,187,185,185,188,187,188,185,185,188,190,187,187,190,190,192,191,191,194,193,195,194,194,195,193,193,189,190,192,192,193,191,191,193,191,196,194,192,193,194,196,194,194,195,191,190,193,191,193,191,192,193,191,193,193,195,194,195,194,195,195,191,194,194,194,194,196,198,198,199,198,198,198,195,195,196,196,198,199,195,196,198,194,195,196,195,197,196,195,195,196,198,196,195,194,193,196,193,191,190,191,195,190,192,193,192,192,193,191,191,192,192,194,194,193,194,196,197,196,196,194,193,194,195,193,192,193,194,194,193,193,193,195,196,195,195,195,195,196,197,195,194,196,194,197,198,198,200,203,205,203,201,202,203,203,202,202,202,203,205,201,201,202,200,205,202,200,203,202,202,200,204,203,199,200,204,205,204,206,205,208,208,205,208,208,210,211,211,212,209,211,212,212,212,210,212,209,211,211,210,214,212,214,212,212,213,213,213,211,215,212,214,215,212,216,214,214,215,213,213,214,214,215,217,213,215,216,213,214,215,216,213,212,215,214,214,215,214,212,215,214,212,214,213,217,212,201,212,215,207,206,212,214,213,211,213,216,212,214,214,214,215,214,214,213,213,212,212,214,213,211,213,214,214,212,212,214,214,214,217,216,215,217,217,219,218,223,215,211,215,229,248,248,239,226,222,221,219,219,219,221,219,218,219,217,220,217,220,213,220,211,200,211,200,222,229,226,231,225,218,205,169,158,167,142,142,182,240,202,159,126,95,205,189,129,218,214,225,242,237,170,155,219,184,111,115,123,170,121,184,211,27,5,29,92,105,91,68,36,20,83,221,209,200,217,211,221,216,168,77,92,107,135,173,198,199,240,241,208,208,125,98,139,170,177,185,221,143,122,210,216,224,237,212,215,229,230,226,195,228,222,173,177,205,251,145,71,100,79,88,68,48,45,48,56,95,184,252,252,211,81,1,24,21,27,42,32,18,29,48,62,39,59,184,244,242,159,173,224,250,250,252,205,150,203,247,252,251,243,234,95,97,194,183,245,247,247,234,83,25,43,93,116,116,166,199,129,77,73,93,97,123,142,122,127,78,20,25,36,32,36,47,28,33,39,24,27,18,25,29,25,30,36,36,37,31,29,38,31,32,30,27,29,34,35,27,19,19,24,24,24,34,65,56,38,33,21,24,22,24,25,22,30,26,30,33,29,32,31,31,37,36,33,34,35,29,25,25,32,32,31,40,36,31,39,51,91,130,99,74,83,55,38,38,30,26,20,25,21,24,28,28,42,46,48,40,30,24,23,24,21,23,22,27,29,29,32,31,34,31,26,27,26,53,106,125,118,99,76,60,54,35,38,66,88,101,88,62,37,27,27,31,30,28,30,29,30,28,29,32,39,36,33,35,33,41,34,33,35,38,84,72,66,68,64,68,51,69,41,45,111,105,110,107,112,95,61,68,105,102,113,187,206,242,252,251,251,248,251,247,247,249,247,247,248,248,249,248,249,249,248,251,250,248,250,250,249,251,249,249,251,251,251,248,249,249,249,251,249,248,247,247,245,244,245,247,247,247,247,246,245,246,247,247,246,246,246,248,249,251,250,248,249,236,117,3,1,4,8,9,9,10,9,11,12,12,12,185,190,187,188,186,186,187,187,188,189,189,188,190,189,190,189,188,190,191,191,193,194,192,195,191,193,195,190,193,193,193,194,193,193,192,193,193,197,196,192,196,193,193,196,193,193,193,192,194,194,194,193,194,192,192,195,192,197,196,198,196,196,197,196,196,194,196,198,198,196,198,201,200,200,200,199,198,194,197,199,196,198,200,197,198,197,196,196,195,196,195,195,198,198,195,193,192,193,194,192,192,193,193,191,194,193,193,194,193,191,194,192,194,193,193,191,194,196,193,196,195,198,198,195,196,195,196,196,194,193,194,197,197,195,196,197,200,198,198,199,196,200,197,197,197,198,199,198,201,200,202,203,203,203,205,205,202,204,204,203,203,204,203,203,203,205,205,203,205,205,205,204,204,202,202,206,203,204,204,203,205,206,208,206,206,207,208,210,210,211,210,211,209,210,211,212,212,211,211,212,214,213,214,213,211,214,214,213,214,212,213,211,214,213,211,212,210,214,214,216,214,210,214,216,214,214,214,214,215,214,216,214,212,215,213,214,213,214,215,211,215,212,211,213,211,214,211,216,208,203,212,211,204,209,216,212,214,214,213,214,214,213,211,211,213,213,211,214,213,211,214,214,215,213,213,214,212,214,213,215,214,215,215,216,216,216,219,217,221,223,219,215,196,178,192,217,231,229,222,222,220,221,221,217,222,218,219,218,217,217,218,216,224,218,207,219,208,222,222,207,195,181,177,172,155,165,158,99,118,172,231,181,169,124,79,229,218,90,146,182,192,215,201,154,155,195,154,120,134,142,198,130,192,238,62,15,62,128,119,104,84,38,24,145,250,217,185,219,226,187,185,151,50,40,61,154,215,223,210,231,167,144,198,142,108,153,190,202,234,239,127,126,192,179,194,203,190,203,227,234,228,199,235,229,178,177,212,252,128,67,125,109,96,79,43,36,34,82,159,234,252,252,144,22,4,26,30,38,34,18,23,43,61,54,45,153,248,248,171,171,222,248,252,252,234,152,197,244,252,252,250,245,177,126,186,235,198,244,244,244,104,11,48,87,118,113,125,153,177,114,51,33,70,77,99,134,117,132,63,15,29,33,29,41,51,32,35,37,23,27,23,27,29,25,33,39,36,36,39,31,29,36,36,32,30,36,33,32,29,26,26,23,18,18,38,59,47,37,47,23,17,26,21,23,26,29,28,31,37,30,31,32,32,39,34,34,32,34,31,27,27,23,34,39,30,33,37,38,50,78,110,127,96,56,52,35,23,27,23,19,22,20,23,32,33,44,59,54,32,21,19,19,22,25,23,22,25,30,31,26,31,36,31,25,29,27,33,63,106,126,93,74,77,74,62,42,34,68,108,114,95,58,33,29,27,33,28,30,30,32,30,33,38,34,36,35,36,38,39,39,34,35,42,85,74,60,61,64,77,53,71,52,33,110,116,110,115,116,104,88,66,89,53,27,5,79,218,247,250,250,248,250,249,251,249,247,247,248,249,249,249,250,249,249,251,251,249,249,251,252,252,251,250,251,251,251,250,252,250,251,251,251,251,247,248,245,244,245,246,246,247,247,245,245,247,246,245,245,245,246,248,249,249,249,250,249,236,116,4,1,4,8,9,9,10,9,11,12,12,12,187,193,190,186,191,191,187,191,189,189,191,190,191,190,191,191,192,191,193,195,194,194,194,191,193,194,194,195,194,195,194,193,193,196,196,193,196,196,197,195,198,198,196,195,193,197,199,195,196,195,195,196,194,197,196,194,196,195,195,196,196,191,194,196,196,197,196,198,198,196,196,199,198,201,200,200,199,196,198,199,198,198,195,198,198,196,198,198,197,198,197,195,197,197,194,197,195,195,194,194,193,193,193,191,193,192,193,192,192,194,195,194,192,195,195,193,197,196,196,197,196,197,199,199,198,198,195,197,198,196,199,199,198,198,198,200,198,199,200,200,199,200,200,199,200,199,199,200,201,203,205,202,201,202,203,204,205,203,204,204,202,202,203,204,204,206,202,206,203,202,207,203,205,206,206,206,204,205,206,205,207,208,206,208,208,209,212,210,210,210,210,212,211,211,210,208,210,210,212,212,215,214,210,214,213,212,211,212,212,214,214,211,213,212,214,213,211,215,213,213,214,212,214,212,214,213,214,211,211,217,217,214,214,214,213,214,214,212,212,213,215,212,212,213,212,211,212,214,204,204,216,207,204,212,213,214,214,212,212,212,211,214,212,212,212,211,212,212,214,214,210,214,212,212,214,214,214,212,215,216,214,215,216,216,216,217,217,219,218,222,218,224,184,85,71,125,199,230,225,225,220,222,221,220,223,219,220,218,221,220,224,222,230,225,207,213,188,189,178,163,175,178,192,200,182,201,207,188,178,185,198,127,174,117,56,201,196,94,146,152,156,198,199,162,144,170,134,137,145,165,220,125,192,252,126,56,89,157,143,124,120,57,34,181,251,191,157,219,199,130,150,158,108,63,83,160,229,207,192,197,127,159,190,168,133,125,159,196,252,244,142,196,233,202,198,190,178,183,205,211,213,194,235,232,187,183,217,238,89,49,101,96,91,50,28,28,59,129,198,249,252,164,50,13,11,27,37,36,24,20,34,55,55,43,141,243,249,182,169,222,247,249,249,235,154,177,243,251,249,244,245,159,179,192,226,248,202,248,236,107,34,44,90,115,114,120,114,81,120,91,34,19,45,52,103,127,120,117,42,23,29,31,34,44,53,27,39,40,22,27,24,23,29,26,33,37,34,38,31,34,33,35,39,30,31,32,32,30,29,28,27,28,23,23,31,47,43,55,51,21,24,19,26,37,24,31,37,34,34,30,29,28,34,37,31,33,32,31,33,26,27,29,30,34,32,31,31,60,59,56,119,135,96,64,47,40,38,26,21,18,16,22,26,27,36,47,54,49,23,23,23,18,22,18,29,25,27,26,24,33,32,30,32,29,24,29,22,38,71,98,109,91,100,105,71,64,42,55,99,124,145,100,53,33,25,29,30,29,32,39,31,34,38,36,39,35,38,41,38,37,40,35,37,89,76,61,62,64,81,55,70,56,28,99,116,116,117,112,112,71,58,47,48,11,59,214,251,251,250,248,251,250,249,249,249,249,250,249,249,250,249,251,251,249,249,249,251,251,251,251,252,251,250,251,251,252,250,251,251,249,250,249,248,248,248,247,245,246,247,245,245,245,245,247,246,245,244,244,246,247,249,249,249,250,250,250,237,115,4,1,4,8,9,9,10,9,11,11,12,12,187,194,191,193,191,191,193,191,192,191,192,193,192,189,194,194,195,197,194,196,196,195,194,198,193,194,197,191,196,195,192,194,192,194,197,195,194,197,196,196,198,196,199,198,194,196,197,196,196,195,198,198,198,198,198,196,195,198,195,198,197,196,197,196,198,197,197,199,196,198,199,198,198,199,197,196,199,195,196,196,199,199,198,199,198,198,199,198,198,198,195,198,196,198,198,195,196,194,195,196,196,194,195,193,194,194,195,195,194,195,198,196,196,195,195,197,196,197,196,198,196,198,201,200,199,198,198,198,198,200,201,201,200,200,200,198,200,199,201,201,200,203,201,202,203,201,200,200,204,205,203,203,202,200,201,200,202,204,203,204,203,204,203,203,203,203,204,202,204,206,202,205,206,205,207,207,204,206,208,208,208,209,208,206,207,210,210,211,210,208,212,213,210,210,210,210,209,208,211,211,213,213,211,212,211,213,211,211,213,210,213,209,213,213,210,212,213,213,212,213,213,212,213,214,212,213,213,212,213,211,212,212,212,213,210,212,211,213,214,213,214,209,212,215,212,211,211,210,202,212,212,201,205,211,214,212,212,212,214,209,211,211,211,213,210,214,211,210,211,211,212,211,212,212,211,214,214,214,213,214,216,217,215,215,214,216,217,215,216,217,221,226,203,133,94,116,177,217,226,227,219,219,220,218,222,220,222,225,226,224,224,221,217,205,177,175,164,178,190,195,210,219,229,223,198,214,232,245,211,182,202,129,172,111,62,191,173,118,197,205,191,224,222,167,134,157,130,143,141,179,222,92,174,249,175,67,77,151,149,137,120,61,32,185,251,151,134,209,183,149,185,224,196,108,85,169,200,157,198,218,187,190,188,141,107,145,182,214,252,208,144,227,242,235,238,230,207,188,202,200,199,179,217,214,178,186,214,201,69,50,80,67,64,25,25,19,54,116,147,246,201,57,19,10,17,38,41,31,15,27,53,60,39,56,193,243,189,162,210,246,249,249,234,162,160,217,251,245,236,236,164,152,240,214,245,240,212,246,92,35,44,86,123,110,119,101,59,59,121,85,35,21,130,186,157,132,110,105,37,18,36,34,35,42,47,33,39,34,29,28,22,29,27,28,33,32,34,35,33,33,31,33,33,36,33,29,30,29,31,33,33,34,31,27,24,34,77,84,64,33,19,30,31,36,31,28,37,37,32,29,32,28,30,36,33,29,36,30,32,34,22,28,32,28,34,37,76,96,66,53,63,100,98,68,49,40,44,41,29,19,21,19,17,27,38,44,39,33,24,16,26,26,22,24,20,27,25,27,35,30,35,30,31,29,23,24,26,29,36,68,101,118,122,118,86,71,50,47,84,108,158,168,96,45,28,23,31,34,34,35,34,37,38,41,39,33,39,35,34,38,36,40,39,87,81,62,64,63,74,54,63,63,25,84,116,110,120,107,105,72,53,56,18,112,248,248,248,248,252,252,249,251,249,247,248,249,251,250,251,250,250,248,248,248,249,250,248,250,251,251,250,250,250,251,251,251,249,250,249,248,247,248,248,247,247,245,246,245,245,245,247,247,245,245,246,245,245,245,244,246,248,249,250,251,250,250,237,116,3,1,4,8,9,8,10,9,11,12,11,11,189,194,195,191,193,192,191,193,192,193,195,195,195,194,194,196,199,198,195,196,194,196,197,194,197,196,193,193,194,193,195,195,194,194,193,194,193,196,194,194,195,195,196,195,194,196,197,196,197,196,198,198,195,196,198,198,200,199,197,199,198,198,199,197,200,198,198,199,198,199,197,198,197,198,197,194,198,196,196,197,193,198,196,198,200,198,198,200,199,195,197,197,198,195,196,198,196,196,198,197,197,197,196,197,197,196,197,194,196,196,195,197,196,200,199,197,199,198,198,197,199,200,200,199,198,201,200,200,200,200,203,201,204,202,200,203,200,201,203,201,201,202,204,200,203,201,200,204,200,204,203,201,203,203,202,202,201,202,201,201,203,202,202,204,204,203,203,205,205,205,207,205,206,204,207,207,207,208,208,208,207,207,210,210,208,209,207,207,210,209,209,211,209,210,212,210,212,210,210,210,210,212,208,211,212,212,213,211,212,212,210,211,213,211,212,211,211,212,210,212,212,212,213,211,213,212,214,212,212,213,212,214,212,213,211,211,212,210,212,211,213,211,212,213,212,212,213,208,201,212,208,200,209,213,211,212,211,212,213,212,212,210,211,212,212,214,212,211,213,212,210,212,212,212,214,213,214,214,214,217,215,214,217,218,216,215,214,217,216,218,217,228,232,217,208,188,183,194,211,225,223,224,222,221,225,222,224,222,218,209,201,191,191,188,173,192,190,211,229,227,235,231,235,219,188,211,227,242,185,160,215,155,178,109,96,236,193,114,210,239,215,230,219,155,138,160,133,156,136,195,206,75,174,249,183,57,75,144,137,120,101,54,27,190,242,122,160,244,190,181,216,249,224,108,66,139,194,197,237,227,226,198,145,119,141,193,215,237,251,170,149,238,234,236,235,239,222,214,237,229,214,181,205,199,166,167,179,177,71,56,79,50,62,31,27,45,86,107,134,184,84,21,22,9,29,45,28,19,31,45,57,51,42,96,223,190,157,207,238,249,247,240,164,166,218,245,252,229,231,162,153,224,250,221,243,237,171,108,25,53,83,116,118,119,80,58,46,56,141,78,32,31,130,164,150,132,117,94,22,26,32,38,39,44,45,29,39,34,22,25,23,28,27,33,33,32,35,34,32,36,33,29,40,39,33,32,29,34,34,31,33,35,30,28,28,32,65,81,69,47,24,30,34,37,32,31,29,27,33,27,29,27,27,37,33,27,35,39,29,29,29,28,33,27,49,84,118,149,90,44,51,59,92,89,66,57,41,39,26,16,17,24,24,21,41,44,29,22,19,24,29,26,29,24,27,27,29,33,32,33,32,34,29,31,30,27,27,21,30,36,56,79,91,86,65,59,43,47,58,92,157,190,158,66,25,23,22,28,30,34,34,39,38,36,40,37,33,37,37,36,37,36,36,83,78,57,62,55,75,54,60,71,25,81,122,110,116,109,120,87,80,59,66,223,252,252,249,249,252,252,251,250,247,249,250,252,250,250,252,250,249,248,248,248,249,248,249,248,248,249,250,251,250,251,250,249,248,249,249,249,248,247,248,247,246,247,245,245,245,244,245,246,245,246,246,247,247,247,247,248,248,250,251,252,251,251,237,116,3,1,4,8,9,9,10,9,10,12,12,12,191,196,194,195,195,192,193,191,193,194,195,194,196,194,196,195,194,194,195,195,195,194,193,195,193,195,194,191,195,195,193,196,196,193,193,192,195,194,193,193,192,196,194,193,194,194,197,196,197,197,195,194,197,198,195,194,198,199,196,197,197,195,195,196,199,197,196,197,197,198,198,199,198,199,196,196,198,195,199,196,196,197,195,198,196,198,201,198,199,198,195,197,194,197,197,194,197,196,198,196,194,198,199,198,196,197,196,194,198,198,197,197,198,198,197,200,198,198,199,197,199,198,198,200,200,200,198,201,199,201,200,200,202,201,203,203,204,201,205,207,203,203,203,206,203,202,203,202,204,203,200,201,203,203,203,200,205,202,201,203,200,202,203,201,202,205,205,200,204,206,204,206,205,204,205,206,209,208,207,207,207,208,206,208,208,209,207,209,210,209,208,208,211,209,209,207,210,210,211,208,208,211,207,210,212,209,212,214,212,211,211,210,212,211,211,211,212,210,210,211,209,211,210,210,212,211,211,210,210,211,212,210,211,214,210,212,210,211,212,210,212,211,211,211,209,211,212,202,205,214,199,205,212,210,212,211,212,210,210,209,214,212,211,214,210,212,213,211,212,212,214,213,212,214,213,214,214,212,214,214,214,215,213,216,215,216,216,216,217,217,220,221,227,234,241,234,204,189,198,215,225,229,228,224,223,214,207,198,193,193,194,199,207,214,201,214,211,219,233,227,227,223,224,204,182,206,224,239,155,150,227,172,191,97,102,249,217,119,175,220,177,194,212,158,148,160,150,165,143,214,215,94,178,237,204,102,91,138,129,115,70,43,27,189,223,105,200,249,190,186,210,217,160,94,66,131,184,234,250,215,198,141,171,142,169,222,215,249,245,148,169,241,225,237,227,235,217,214,238,235,227,202,234,224,177,162,151,171,98,57,65,54,85,60,101,150,169,178,98,99,36,15,21,11,44,37,22,27,36,63,47,58,133,181,186,157,199,233,250,250,244,179,152,215,246,251,244,235,171,137,214,252,249,208,249,213,41,36,35,92,114,115,127,87,40,82,62,74,146,67,26,33,28,59,118,127,122,75,21,26,33,38,48,56,45,28,44,31,23,28,24,28,27,34,34,28,34,39,28,31,33,35,33,33,34,33,30,32,38,34,36,33,37,32,29,31,40,61,71,54,27,29,36,36,28,29,29,30,27,29,32,29,34,33,30,32,33,36,28,28,30,35,44,74,113,98,141,190,112,50,40,40,79,96,75,59,46,39,29,18,27,26,27,28,34,34,24,17,17,24,25,30,31,27,29,33,32,35,35,34,41,33,34,30,31,37,28,25,30,30,23,28,40,44,49,49,35,32,42,46,95,162,133,60,28,21,23,30,38,37,36,38,34,34,40,36,35,35,38,41,33,38,30,78,84,53,65,65,81,56,57,81,27,70,124,113,122,109,119,97,78,62,47,200,250,250,248,248,253,253,250,250,249,249,249,252,251,251,250,250,250,247,248,247,248,248,247,249,248,250,251,251,251,250,249,249,248,250,249,249,249,249,248,247,248,246,247,246,245,244,246,247,246,246,248,248,247,248,248,251,250,252,251,250,250,251,238,115,4,1,4,8,9,9,10,9,11,12,11,11,197,199,198,196,196,196,196,196,194,192,194,197,196,195,196,196,196,195,198,198,196,198,198,198,196,196,196,199,199,196,197,194,193,194,196,195,194,194,195,196,196,195,194,192,192,194,195,192,194,196,193,195,196,196,194,195,196,196,195,197,198,196,195,194,195,194,196,196,193,197,196,197,198,196,198,198,198,198,197,199,198,201,199,197,200,199,201,200,199,196,196,195,195,196,195,196,197,196,198,196,196,199,196,198,196,198,198,197,203,200,197,200,199,200,200,199,200,202,201,199,198,199,198,200,200,199,201,201,207,203,202,203,202,205,206,207,204,203,206,206,205,206,205,206,207,203,205,203,199,204,203,202,203,204,205,204,202,204,201,201,203,203,204,203,203,205,204,205,205,205,205,205,206,204,206,207,207,207,208,209,210,207,207,210,207,207,207,208,209,207,208,209,209,207,208,210,208,206,209,210,211,211,210,210,208,210,211,210,210,211,210,209,210,209,209,210,208,209,210,211,211,210,211,210,211,212,211,208,210,211,210,212,209,211,210,210,214,211,210,210,214,212,211,211,210,212,209,203,210,210,199,207,212,213,211,212,212,210,212,211,211,211,210,212,210,211,211,211,209,212,212,214,212,211,214,214,216,214,213,215,216,214,214,214,215,216,215,215,214,216,218,220,225,226,233,241,234,216,205,202,206,214,214,207,204,200,203,200,207,211,215,223,224,230,207,214,209,212,227,217,222,221,219,196,181,210,227,229,141,161,240,190,190,88,96,246,218,108,150,205,167,199,220,160,161,157,157,167,154,242,214,96,187,234,232,151,93,122,130,114,68,39,21,191,210,108,219,241,171,141,170,184,164,154,101,113,169,243,192,170,198,177,155,114,170,189,216,250,225,137,191,241,225,238,225,236,211,208,232,229,225,201,237,237,206,185,162,215,109,19,36,57,99,70,124,167,211,180,60,38,16,19,21,29,41,21,21,35,50,54,42,137,216,169,162,196,237,245,248,252,188,161,199,237,250,233,241,178,139,207,251,252,247,199,240,125,18,49,70,121,112,123,91,34,64,127,63,89,146,50,29,41,96,118,120,127,116,65,12,26,39,42,47,53,42,30,48,33,29,30,20,32,24,34,32,23,33,31,31,33,32,36,36,31,34,32,30,37,37,39,41,39,42,32,30,34,32,49,70,59,34,26,34,38,35,21,27,28,27,32,26,33,29,30,31,33,36,36,31,33,57,84,109,119,121,137,153,147,87,37,29,31,59,65,55,50,49,46,26,27,24,19,19,27,31,18,22,16,23,24,21,30,26,30,24,26,31,29,37,36,34,36,34,36,31,32,33,31,25,30,28,26,24,28,54,54,30,20,20,23,27,46,51,39,32,19,27,29,33,38,37,42,40,33,40,39,36,36,33,37,34,37,30,78,83,57,66,57,76,65,56,73,30,57,127,120,117,106,117,101,60,41,22,130,244,244,246,246,253,253,250,250,251,252,250,250,251,252,251,249,249,249,250,250,248,249,248,249,249,249,249,249,249,249,249,249,249,251,251,252,251,251,250,248,247,247,247,245,245,244,245,245,246,249,249,249,251,250,249,251,250,252,252,252,250,250,237,115,4,1,4,8,9,8,10,9,11,12,12,12,198,202,198,200,201,199,200,200,200,198,198,198,198,198,196,199,199,197,198,195,198,198,199,200,202,199,198,200,204,202,198,198,198,198,198,196,196,193,195,197,195,198,195,194,198,195,194,195,191,192,194,194,197,194,194,194,193,195,193,193,196,198,198,194,196,198,198,198,195,198,196,198,196,196,198,195,199,195,198,198,200,202,198,199,199,199,199,198,199,198,198,197,195,194,195,198,198,198,196,198,198,200,198,199,200,200,200,200,201,201,200,201,202,201,201,201,200,201,201,201,202,202,203,199,200,203,204,205,203,205,207,204,205,206,208,211,207,208,207,207,207,207,207,205,206,208,206,204,204,205,204,205,205,205,205,202,205,203,205,207,205,206,207,205,203,205,206,206,206,206,204,207,204,206,208,205,207,204,206,208,206,208,207,206,209,208,206,208,208,207,208,208,210,206,207,208,206,207,210,208,213,212,207,209,210,209,208,210,207,209,209,208,209,208,208,208,211,210,208,210,212,212,211,211,211,209,211,213,212,211,209,207,209,210,208,210,210,210,210,212,212,210,210,212,209,213,206,202,212,202,201,214,211,212,210,210,211,210,211,210,211,209,211,212,210,211,212,209,212,212,212,212,212,214,213,214,215,214,214,213,215,216,214,215,213,215,216,216,219,221,223,223,227,230,230,233,228,220,196,181,178,181,191,200,212,214,218,221,222,226,225,223,222,225,204,211,204,204,223,217,220,221,218,188,184,213,230,220,126,172,246,191,192,74,80,214,211,135,148,229,192,216,226,142,153,159,168,167,168,247,205,99,192,236,247,190,88,112,124,120,66,30,28,185,193,108,212,189,124,130,197,208,205,205,109,98,162,202,163,201,210,188,148,59,117,192,225,250,198,131,217,234,227,237,222,236,206,211,230,227,223,197,233,232,202,160,158,222,99,27,47,59,53,62,89,131,155,121,43,23,12,19,35,31,29,21,32,51,56,42,97,229,217,143,194,229,246,230,250,217,162,222,235,252,237,230,191,143,200,248,252,252,246,174,118,50,27,84,98,123,118,90,47,55,115,144,46,104,137,39,33,135,208,164,136,120,108,53,15,13,19,40,44,46,36,28,41,29,28,27,19,24,31,32,30,31,30,29,29,35,30,27,36,27,36,33,26,34,35,34,41,45,44,34,31,36,26,45,68,62,37,24,35,36,29,31,23,25,31,28,29,30,30,34,31,33,32,37,30,51,137,170,118,112,148,153,119,59,27,27,24,27,49,64,64,60,53,38,23,16,19,20,21,25,22,21,15,25,25,21,22,24,26,24,20,27,28,31,32,34,38,31,34,35,33,33,28,33,33,25,27,31,22,35,61,53,31,24,22,21,16,23,27,29,33,28,25,30,35,39,35,36,41,37,39,36,35,35,37,39,31,39,34,74,88,55,67,61,79,66,46,82,39,43,120,114,116,107,113,106,57,48,15,81,237,240,246,246,253,253,250,250,252,251,252,251,251,249,249,250,248,248,249,250,250,248,249,247,248,249,248,248,247,248,249,249,249,251,250,251,251,250,248,248,247,246,246,246,245,245,246,247,247,246,248,250,249,249,249,252,249,252,252,250,251,250,237,116,3,1,4,8,9,9,10,9,11,12,12,12,200,202,202,201,204,200,200,203,205,203,202,204,202,201,202,203,202,199,202,200,199,201,200,204,204,203,201,204,205,202,204,202,201,200,199,199,200,198,198,199,198,196,196,198,198,197,199,196,195,197,198,198,196,195,195,196,196,198,195,194,195,196,196,198,197,198,199,198,200,198,198,198,200,200,200,200,198,200,199,199,198,199,199,199,201,200,200,200,198,198,198,198,198,198,198,198,200,198,198,198,198,199,198,202,200,200,202,199,201,198,198,204,200,200,203,204,201,201,202,202,204,208,206,206,208,205,208,205,208,208,208,209,208,209,208,207,206,209,210,208,208,211,207,208,207,207,209,206,206,206,206,207,206,205,206,205,206,206,204,207,207,205,203,204,204,204,206,205,205,205,204,205,208,205,208,208,206,207,205,206,206,206,207,208,206,209,210,206,207,206,207,209,209,207,206,207,207,208,208,209,209,208,208,208,208,208,208,209,208,208,208,207,208,209,209,210,210,211,211,209,208,210,209,207,210,210,210,210,211,210,210,210,210,210,208,210,211,208,210,210,211,208,210,211,211,214,201,205,210,200,211,214,214,214,212,212,211,213,212,211,212,212,215,214,212,214,212,212,212,214,213,212,212,214,215,214,216,214,214,215,215,218,217,218,219,222,221,223,228,226,225,226,223,215,205,192,182,183,188,192,191,186,193,209,221,227,229,225,225,223,224,223,219,223,202,208,208,204,222,218,222,224,217,186,189,217,231,213,134,190,245,194,191,81,88,210,230,149,138,233,201,209,193,127,152,161,174,157,180,250,205,125,201,234,249,229,130,112,125,117,76,43,20,169,167,103,167,139,150,193,234,215,214,199,94,63,136,196,200,239,213,188,118,69,147,208,241,251,168,147,233,229,230,232,223,234,204,214,229,227,222,198,231,230,198,137,133,232,134,65,92,86,38,73,127,94,96,66,37,31,14,36,39,28,25,22,43,61,46,83,201,243,160,171,232,244,234,214,193,162,208,252,251,245,234,184,152,206,249,252,252,252,237,84,39,14,27,101,105,127,87,35,66,100,153,122,33,132,125,35,35,115,201,155,125,127,98,45,10,19,16,22,27,30,32,35,44,26,24,26,22,25,29,33,29,33,30,28,33,34,32,29,27,34,31,28,34,27,29,40,43,43,39,36,38,34,33,41,60,72,45,25,27,38,33,26,31,24,31,30,29,31,29,32,32,25,28,29,33,44,104,169,133,109,114,84,45,23,14,13,21,26,47,72,71,50,40,38,25,23,17,19,23,20,21,20,21,21,23,19,27,27,24,23,25,30,25,30,29,31,38,32,36,39,32,31,31,31,31,27,31,30,22,41,59,47,34,27,23,27,29,29,33,24,29,36,29,29,34,38,36,37,38,35,40,32,33,37,34,41,37,39,32,78,91,55,69,57,74,71,50,78,51,34,110,120,115,107,109,116,67,52,16,62,229,237,246,246,253,253,251,251,252,252,252,251,249,248,249,250,249,249,250,249,249,251,250,249,248,249,249,249,248,247,249,249,249,250,249,250,250,251,251,248,247,247,247,246,245,245,247,246,248,249,248,248,249,249,250,252,251,250,250,251,250,249,236,116,4,1,4,8,9,9,10,9,12,11,12,12,200,204,201,201,205,202,203,201,202,202,202,204,203,203,200,202,201,199,205,202,203,206,203,207,205,204,205,204,206,206,204,205,202,202,201,198,200,200,198,198,199,199,197,196,199,197,197,199,198,198,195,195,196,196,197,196,197,195,195,197,198,195,196,198,199,198,196,198,200,200,199,201,199,198,200,198,200,199,203,200,198,198,198,199,199,199,201,201,196,198,196,195,198,198,197,197,198,199,198,198,199,200,201,200,200,201,200,202,204,201,201,200,202,203,204,202,201,202,205,202,204,209,208,208,208,208,208,207,208,209,210,210,212,210,208,210,208,210,208,208,209,208,209,208,208,208,210,210,207,206,207,208,207,207,207,203,206,206,206,207,205,205,205,203,206,203,205,206,203,207,206,206,206,203,205,205,207,205,206,207,206,208,205,205,205,204,207,205,206,207,204,205,208,207,207,210,208,207,208,206,208,207,207,208,206,207,206,208,208,208,209,208,209,209,210,209,209,208,209,207,209,210,209,209,209,208,210,208,210,210,209,208,209,208,208,207,211,211,207,211,209,208,210,210,210,210,200,210,207,200,213,214,214,214,213,212,212,212,214,213,214,212,211,213,213,212,214,213,215,214,217,221,221,224,225,224,226,225,227,230,229,233,236,234,233,235,237,236,234,232,226,212,198,187,190,197,204,215,224,235,232,224,218,217,225,229,233,233,234,232,231,233,232,239,217,221,220,216,238,231,234,237,226,195,206,231,247,214,141,214,252,205,201,86,122,238,241,161,120,214,187,191,165,110,154,172,179,165,202,250,214,152,217,234,251,247,126,111,97,105,80,33,29,129,157,117,175,171,174,218,228,187,167,145,107,78,124,170,215,235,175,149,136,136,188,238,246,248,149,169,241,221,236,227,225,235,204,212,227,227,220,195,230,236,187,125,181,252,154,68,85,89,60,151,151,72,58,65,57,25,33,40,33,23,30,39,60,43,55,178,248,192,148,194,244,234,223,192,149,192,233,252,244,231,199,136,202,248,252,252,250,250,154,36,34,3,41,99,118,87,31,56,104,130,151,87,37,137,112,31,22,82,130,120,126,127,88,26,23,26,25,28,24,45,39,35,48,26,27,24,22,32,27,31,36,30,33,35,30,36,34,35,32,27,31,29,30,33,34,31,34,35,36,41,38,38,34,29,63,70,52,29,21,31,26,30,32,32,30,34,35,31,32,27,31,27,31,37,32,33,46,115,105,46,49,38,24,21,13,14,14,31,41,59,66,47,42,35,27,25,21,21,19,14,18,25,21,21,23,25,25,24,25,24,27,27,24,33,28,35,40,32,36,30,31,35,29,33,30,32,26,25,27,43,63,42,28,29,23,29,32,26,32,31,33,33,32,33,27,33,42,38,36,39,36,39,35,34,34,35,36,37,32,80,96,57,72,55,71,78,50,74,53,33,95,115,116,109,110,115,69,51,19,54,218,237,247,247,251,252,252,252,251,251,249,248,247,248,249,249,249,249,250,249,249,247,251,250,248,250,249,248,249,248,249,248,248,249,248,251,249,250,249,248,248,246,248,248,247,247,248,248,249,249,249,251,249,250,249,252,252,251,249,249,250,250,237,116,4,1,4,8,9,9,10,10,11,12,11,11,200,204,204,202,206,200,203,204,202,201,200,200,200,202,202,199,200,200,203,203,205,204,203,204,204,205,205,206,206,203,202,203,203,202,200,198,200,199,199,198,198,198,199,200,199,198,197,197,199,197,198,198,196,198,196,196,197,194,195,198,197,195,195,197,194,196,200,200,200,200,201,200,199,198,199,198,198,201,201,201,200,200,200,198,198,197,198,200,198,198,198,195,199,199,197,197,197,198,200,201,200,201,198,201,201,199,202,199,200,204,204,204,204,203,204,204,200,203,204,204,206,207,205,206,208,207,209,208,210,209,211,211,209,210,209,209,205,207,208,209,210,206,205,208,207,206,207,206,208,206,208,207,205,203,205,205,206,206,205,206,205,206,204,205,204,205,206,203,204,205,205,202,204,203,202,204,205,206,205,205,204,205,205,206,203,204,206,205,206,206,206,206,206,205,205,206,207,208,205,207,206,206,208,207,211,208,208,208,207,206,207,207,207,206,209,208,208,209,208,208,208,210,209,208,211,210,207,210,208,209,208,206,208,209,209,208,210,209,210,209,210,208,208,209,213,207,199,211,203,205,216,210,212,211,212,213,213,214,212,212,212,212,211,212,210,212,211,217,220,225,241,244,244,244,247,248,250,251,252,250,248,251,248,246,244,247,246,237,233,226,219,221,228,236,249,252,252,252,252,252,252,252,252,245,239,235,239,243,249,251,252,252,252,252,248,245,248,243,252,252,252,252,249,217,235,251,252,234,167,232,253,196,162,66,110,244,251,172,141,224,223,214,161,123,165,192,200,189,239,252,234,170,231,249,250,250,123,83,72,65,63,34,12,120,185,156,202,205,198,216,179,163,165,174,147,105,111,142,195,177,184,167,163,201,234,251,252,237,146,205,246,231,240,230,233,237,207,212,225,227,219,195,229,237,184,145,202,252,150,51,63,70,92,175,110,53,53,57,57,38,43,35,24,29,44,53,53,44,135,243,205,165,171,213,237,224,196,152,193,233,252,252,246,206,145,178,243,250,251,251,248,172,35,14,50,42,86,122,86,49,59,107,134,130,131,64,44,141,92,29,33,45,107,125,121,137,71,16,29,28,33,40,47,56,35,33,46,26,20,27,24,27,29,27,32,31,32,33,33,27,29,33,29,29,31,30,35,31,29,32,25,29,33,31,32,29,36,34,59,74,51,37,21,22,28,34,33,31,33,26,32,35,29,31,36,34,33,37,34,32,40,88,66,45,49,26,23,14,14,17,15,21,39,59,58,49,42,36,27,18,14,21,19,19,21,19,24,26,27,24,25,24,25,24,26,26,29,36,29,33,36,34,34,30,36,33,26,33,32,28,31,29,33,51,55,42,31,28,33,32,33,29,31,35,28,29,32,31,30,31,35,39,38,31,34,37,31,35,29,34,32,33,31,71,98,59,72,65,54,62,52,62,57,28,89,119,107,107,102,115,75,49,28,30,183,236,247,247,252,252,252,249,251,249,249,248,248,248,249,249,249,248,248,251,250,249,250,248,248,248,249,250,249,249,249,248,246,247,248,249,248,249,249,248,248,247,247,248,247,249,249,249,249,248,249,250,250,251,251,252,251,251,250,250,249,250,237,115,3,1,4,8,9,9,10,10,11,12,11,11,201,204,203,202,202,201,202,202,202,201,202,200,201,200,202,204,201,204,203,204,206,200,203,203,203,205,203,201,202,204,205,203,202,201,201,198,200,201,200,200,201,199,198,200,201,202,199,199,199,196,201,200,199,197,196,197,196,196,197,195,198,198,197,197,198,200,197,200,202,199,201,201,198,199,199,198,199,200,201,200,200,200,202,200,200,199,198,201,198,201,198,198,199,198,200,199,199,200,198,201,203,201,201,200,202,200,200,202,200,201,202,203,201,202,204,203,204,204,206,205,208,206,203,208,208,209,209,207,207,208,210,211,212,208,208,208,207,208,207,208,208,205,206,208,207,207,207,206,206,206,208,206,207,206,205,205,204,206,206,206,207,203,204,203,204,202,205,207,203,205,207,205,205,205,203,203,205,205,205,207,204,204,204,204,206,203,206,203,203,207,204,207,206,205,208,207,205,206,205,205,210,208,207,207,207,210,207,207,208,207,209,208,209,210,209,208,207,210,210,208,210,209,208,210,210,210,210,209,210,208,210,209,209,208,209,210,209,211,210,212,211,208,211,211,214,204,205,210,201,211,214,213,212,209,212,213,213,214,214,212,213,212,212,212,211,209,211,212,212,206,203,205,202,201,204,204,207,207,216,211,182,173,170,171,176,165,151,137,138,147,164,188,199,206,207,206,208,210,203,201,201,199,191,183,187,186,183,189,197,202,196,198,197,200,187,185,189,181,199,199,201,201,183,164,186,205,214,173,124,186,197,133,112,19,57,176,171,128,124,190,200,189,145,117,153,179,180,182,235,250,207,160,220,247,252,241,103,95,82,70,57,15,34,149,212,151,213,194,184,200,179,213,224,229,183,165,103,108,178,190,232,173,190,208,244,252,250,240,164,246,252,252,252,247,250,248,212,212,227,226,220,194,236,231,162,158,212,230,113,39,53,72,139,189,84,42,59,87,60,37,49,17,20,34,52,53,43,111,229,220,172,185,187,216,229,195,141,182,235,253,250,250,241,149,171,233,250,250,249,249,171,39,8,12,74,81,121,95,30,74,105,137,126,93,101,41,57,136,77,24,73,173,160,132,131,118,68,16,32,36,34,53,69,62,33,39,42,27,29,26,21,29,29,32,29,29,33,27,32,30,31,34,30,55,73,67,63,58,44,34,30,27,32,29,30,30,35,35,54,76,59,36,23,26,29,33,33,33,31,27,33,31,28,30,36,30,30,36,33,32,33,53,47,47,55,34,17,14,15,15,14,23,37,55,56,46,44,39,31,20,19,20,18,22,16,24,27,21,25,24,24,29,28,24,26,25,29,32,30,33,37,37,33,33,33,32,29,34,32,33,31,31,40,57,59,35,29,37,27,32,34,31,31,31,38,29,30,30,26,32,37,32,35,38,29,37,34,32,34,35,33,32,31,71,95,59,72,69,41,37,38,48,44,29,71,113,113,102,99,118,94,55,27,28,157,233,249,249,252,252,252,250,249,248,249,249,247,249,249,249,249,248,249,249,248,249,249,248,249,249,248,247,248,247,247,245,246,246,245,248,247,249,249,248,248,246,247,246,248,248,248,248,248,248,249,250,249,250,249,252,250,250,250,249,250,250,236,117,4,1,4,8,9,9,10,9,11,11,12,12,201,204,201,200,205,199,202,202,201,203,201,200,202,203,202,200,203,203,203,204,203,203,204,202,203,203,203,202,201,200,199,204,203,202,203,200,200,201,200,201,199,199,200,198,198,199,201,200,200,200,201,200,199,201,200,200,198,198,198,198,200,201,200,201,202,200,200,200,201,200,201,201,201,200,200,198,200,201,200,198,201,200,198,201,201,203,200,199,197,200,203,202,200,199,200,199,202,200,200,202,201,203,202,204,203,202,205,203,202,204,202,204,205,203,205,205,203,207,205,205,206,207,207,208,209,210,208,207,208,209,211,209,209,208,207,208,207,209,207,210,210,207,208,208,207,208,208,207,207,207,209,207,206,205,207,207,207,205,205,205,205,205,203,203,203,202,204,206,203,204,206,204,205,202,203,206,205,204,203,203,204,204,206,206,205,204,203,205,205,205,204,206,206,207,210,208,206,207,207,208,209,208,207,204,210,210,208,207,208,212,210,208,210,209,210,209,210,210,210,210,208,208,208,210,213,212,210,210,208,207,207,210,211,209,213,213,213,212,211,214,212,212,213,213,215,205,210,208,203,213,214,214,212,209,212,211,212,213,212,214,210,212,211,211,211,213,211,212,188,130,108,99,98,96,92,90,78,74,102,88,56,60,57,54,44,34,26,19,19,21,38,56,63,57,62,63,54,53,49,51,50,48,43,45,59,79,89,89,93,84,69,55,50,51,51,52,57,54,51,50,50,53,54,50,48,53,57,53,24,29,46,89,132,48,5,19,13,14,35,71,78,70,72,77,83,86,89,92,103,106,98,73,103,108,119,128,64,119,122,84,79,36,37,129,127,77,128,94,100,139,137,159,165,145,125,114,53,98,138,158,175,101,99,111,185,227,243,157,127,226,218,227,226,220,224,232,210,210,229,231,220,193,243,223,163,177,198,200,114,57,49,83,203,208,68,36,75,107,69,40,35,15,29,49,56,36,89,215,239,196,207,219,198,210,200,140,172,225,252,252,249,249,173,165,227,246,250,250,249,177,35,33,51,82,99,88,92,44,64,118,136,129,77,73,89,33,77,133,71,21,100,201,182,136,126,122,41,18,40,33,40,51,57,43,27,39,48,24,30,28,20,29,33,31,33,30,32,27,31,35,46,57,73,153,164,150,158,153,95,40,42,36,23,28,33,27,39,35,57,77,61,41,23,27,27,36,34,27,31,30,35,33,27,32,34,35,30,32,32,31,28,26,36,45,34,21,15,13,15,15,14,28,33,40,52,49,46,42,39,23,18,16,19,26,21,25,20,22,26,27,23,22,27,21,24,28,29,32,28,36,35,31,32,36,34,27,35,32,29,36,31,29,42,62,52,33,25,22,27,33,29,27,34,33,29,29,27,31,29,29,33,35,33,36,35,32,33,40,32,36,33,36,32,65,106,57,68,84,51,43,39,38,45,26,59,113,106,107,103,116,103,55,32,27,120,232,251,251,250,249,250,249,249,246,247,247,248,248,249,249,248,249,249,247,247,247,248,249,249,248,247,248,249,247,246,246,245,246,245,246,248,248,247,247,247,246,245,247,245,245,247,249,248,248,249,247,249,249,250,252,249,249,248,251,250,249,237,115,4,1,4,8,9,9,10,9,11,11,12,12,200,205,204,203,204,203,202,200,201,202,201,201,202,201,203,201,200,201,200,200,204,203,204,200,201,204,202,202,200,201,203,201,201,202,202,200,200,201,200,201,200,200,199,200,199,199,198,198,202,198,200,198,200,200,200,201,199,201,199,200,202,200,200,198,200,201,198,203,201,199,201,200,200,199,200,201,199,200,201,199,196,199,202,199,201,199,201,201,198,201,199,200,203,200,202,198,199,202,201,202,203,203,203,201,202,203,204,204,203,204,202,201,205,204,206,205,203,205,205,206,211,207,205,208,205,207,205,208,207,207,208,205,207,207,209,210,207,207,206,206,208,208,205,207,208,206,207,206,206,204,205,206,206,205,205,203,206,204,203,204,203,205,205,203,204,205,204,205,203,202,203,201,202,205,203,203,204,204,201,202,203,205,205,203,205,204,205,206,206,208,205,206,206,207,210,207,208,207,205,208,208,208,208,208,208,208,208,210,211,208,210,208,210,209,210,212,209,209,210,210,211,208,210,212,210,211,211,211,211,210,211,211,213,213,212,213,214,214,212,214,213,212,212,222,220,214,224,213,211,215,212,214,213,214,210,209,212,211,212,212,212,212,210,210,212,213,211,216,199,163,153,153,148,145,145,134,104,89,103,117,115,129,136,128,125,118,118,113,110,106,105,108,102,106,100,87,97,103,103,107,105,105,105,107,110,128,131,128,125,122,113,101,103,110,109,104,111,103,108,108,104,105,96,97,104,109,106,77,53,78,97,132,171,77,61,95,67,56,44,63,89,87,84,107,114,106,99,92,86,85,94,65,66,66,68,89,59,119,123,89,78,50,72,100,83,42,66,58,50,64,59,66,52,46,38,44,39,77,105,83,68,38,47,25,59,98,95,46,44,108,98,109,116,111,137,199,205,213,230,230,224,195,245,203,160,195,178,195,162,114,66,120,253,189,43,28,40,78,62,33,21,17,45,51,44,60,189,249,217,226,248,242,188,193,129,122,204,239,251,251,239,171,165,219,249,249,248,248,198,53,26,72,116,123,119,59,36,72,117,139,122,75,66,97,76,34,90,131,63,27,50,126,144,131,134,98,37,18,25,24,27,29,31,22,23,49,46,31,28,23,29,27,30,31,31,33,34,33,69,68,130,146,155,170,140,169,179,160,132,89,64,38,29,25,27,29,35,33,51,78,66,46,20,22,29,30,30,33,33,30,35,29,30,33,34,30,29,31,34,33,27,25,18,21,19,15,21,20,16,17,16,24,30,41,63,65,61,51,33,20,17,22,21,23,23,23,23,24,24,25,29,24,21,24,27,28,29,27,26,37,37,39,32,32,39,27,33,37,27,34,33,27,53,64,47,32,27,27,22,31,34,34,39,46,42,33,35,34,34,34,35,30,31,35,36,35,30,33,36,33,29,40,27,62,103,60,75,83,56,64,67,50,53,29,56,119,115,113,106,121,99,57,36,35,83,209,253,253,249,247,250,248,246,248,248,246,246,247,248,249,249,249,249,248,246,247,248,249,249,248,248,248,249,248,246,245,244,245,247,247,244,247,247,248,248,247,245,244,247,247,249,249,248,249,251,249,249,248,251,252,250,251,250,251,250,250,237,116,3,1,4,8,9,9,10,10,12,12,12,10,199,205,202,201,206,197,202,202,199,203,201,202,202,201,202,201,199,200,201,200,200,202,201,200,200,202,200,199,201,200,200,203,200,199,201,199,202,201,200,201,199,201,200,200,200,199,199,197,199,198,199,198,198,199,197,198,200,200,198,199,200,201,198,199,200,198,198,202,198,196,201,199,198,199,199,201,200,202,202,197,200,200,200,201,199,200,198,201,200,199,200,201,199,199,198,198,201,201,202,201,200,200,202,202,201,201,201,201,201,204,201,203,205,202,204,203,202,203,205,204,204,206,203,204,206,206,205,206,206,207,206,204,208,205,207,208,207,206,203,208,206,205,207,207,205,204,204,205,203,202,205,203,206,206,205,203,203,205,205,202,202,202,201,203,204,205,204,202,201,204,204,203,203,200,203,203,201,203,204,203,202,202,204,205,204,204,206,206,205,205,204,206,206,207,209,208,207,207,208,210,208,207,209,209,211,208,210,211,208,212,212,210,211,210,210,210,211,211,209,212,211,208,210,210,210,210,208,212,212,212,213,213,212,213,216,215,215,214,213,215,213,212,217,231,216,222,230,210,214,214,213,212,211,208,212,211,214,213,210,212,210,212,212,211,212,214,217,222,223,229,236,236,233,225,225,219,190,183,211,218,235,247,248,249,247,251,252,252,251,251,248,245,244,244,236,219,236,244,243,246,242,246,246,249,247,244,239,227,223,222,225,225,231,244,240,231,240,227,244,249,248,242,208,222,239,251,233,187,208,241,237,204,177,89,150,249,234,205,173,165,226,191,168,191,200,205,173,232,250,250,238,185,220,240,240,233,107,110,113,74,76,46,76,173,191,152,142,174,171,134,103,122,160,125,174,178,118,100,90,151,117,123,156,130,153,139,139,68,101,166,152,163,151,151,161,208,206,210,228,229,217,197,240,177,168,198,159,216,171,92,101,198,252,112,2,94,105,49,39,26,15,24,49,43,39,145,240,220,230,249,252,228,177,131,69,145,233,244,248,247,168,158,223,242,248,248,246,196,54,27,62,108,110,111,87,33,48,99,139,121,71,48,97,96,61,38,109,136,53,26,42,108,138,132,139,97,26,9,15,16,30,27,31,40,34,44,51,25,28,23,20,31,26,33,32,27,37,39,116,183,190,180,124,99,100,149,104,62,74,75,59,28,32,24,27,32,36,27,46,83,67,41,24,21,28,29,30,30,30,34,33,30,30,29,30,34,32,33,29,26,30,26,19,19,18,19,21,16,17,18,26,36,29,35,64,73,69,44,27,29,13,17,24,23,24,17,23,29,24,24,21,22,24,26,29,24,30,32,27,35,33,34,36,34,40,38,34,34,33,32,30,33,63,65,39,26,22,24,33,38,52,65,64,60,59,62,71,75,61,40,28,33,33,34,38,29,31,34,30,34,31,38,27,57,104,59,64,83,63,103,109,65,61,33,55,125,105,116,104,112,107,57,30,48,64,159,249,249,249,249,249,249,245,247,247,247,246,248,247,249,249,249,250,250,247,247,248,248,249,248,246,245,245,246,246,247,245,245,247,246,246,246,247,247,245,245,247,247,246,246,249,249,248,250,249,249,248,249,251,252,250,251,251,251,250,251,237,115,3,1,4,8,9,9,10,9,11,12,12,12,198,200,200,202,203,200,199,199,199,201,202,201,202,202,200,202,200,201,203,200,201,201,201,199,200,201,198,200,200,198,198,199,201,199,198,199,200,198,200,201,199,200,198,200,198,198,198,197,199,197,199,198,200,199,199,199,198,200,196,200,198,197,202,199,200,199,195,198,199,199,198,199,201,200,200,198,198,198,200,200,199,200,199,198,200,200,200,200,200,199,199,200,199,198,200,200,200,200,199,201,199,201,201,199,200,200,202,203,201,201,202,201,202,200,203,203,202,205,204,202,203,201,203,207,204,204,204,206,203,206,208,205,206,205,205,206,208,207,208,204,206,206,205,206,204,204,203,206,205,204,202,202,205,203,202,202,205,205,205,204,202,202,199,201,202,202,202,203,201,200,204,205,201,202,201,201,203,203,202,200,201,204,202,203,203,205,205,204,206,204,206,207,206,208,208,208,210,209,209,207,210,208,207,208,207,210,210,210,212,211,211,210,212,210,212,214,212,212,210,211,212,210,212,212,210,212,211,211,213,213,213,214,214,213,215,216,216,215,213,214,213,212,214,224,171,146,185,201,214,215,213,212,210,212,212,210,212,214,211,211,214,214,215,216,217,217,216,214,212,209,210,212,208,211,217,225,202,197,235,241,242,246,242,240,243,245,248,247,249,251,251,252,250,252,248,236,249,251,251,251,251,252,250,249,249,248,245,241,231,230,227,231,233,244,245,231,246,232,244,251,252,245,212,229,245,252,236,211,248,249,248,214,167,77,150,246,249,249,202,190,242,192,142,140,122,103,139,230,252,252,253,227,249,252,251,251,155,124,129,85,81,53,57,184,248,218,233,252,210,203,209,247,252,212,252,252,221,129,128,204,164,201,221,211,252,252,209,139,195,249,251,251,245,245,236,242,210,212,225,226,214,194,220,167,197,192,164,227,111,30,71,198,226,34,47,205,131,41,29,17,27,33,39,24,116,219,208,218,246,253,250,206,114,91,130,202,248,229,235,180,152,218,247,249,245,246,200,56,28,57,107,110,113,84,33,35,87,129,124,79,47,97,105,64,50,61,135,128,45,30,100,159,142,130,142,82,23,16,15,31,31,42,48,49,45,50,42,25,25,21,28,27,35,28,29,29,39,74,88,157,171,127,97,108,122,128,81,66,83,77,57,45,33,20,24,32,34,25,56,81,66,47,23,21,26,31,34,32,33,33,31,29,27,31,31,29,31,31,43,50,38,30,27,26,29,22,18,23,20,23,38,39,33,38,48,56,56,41,30,17,15,21,21,27,27,22,24,26,27,27,24,27,29,29,28,29,32,31,25,36,32,36,39,34,33,34,37,34,28,35,29,39,66,61,37,23,19,27,35,33,49,61,79,77,92,131,128,102,74,58,44,35,36,29,33,34,31,32,32,35,29,38,28,57,106,65,57,74,68,118,128,70,66,36,57,128,108,108,97,115,101,64,29,57,62,100,243,244,249,249,245,250,244,246,247,248,247,247,248,248,249,249,250,247,247,248,246,245,246,248,245,245,246,246,246,246,245,245,244,245,246,248,247,247,247,245,245,246,246,247,246,248,248,249,249,248,248,248,250,250,249,251,250,252,250,250,237,116,4,1,4,8,9,9,10,9,11,12,11,11,197,202,200,200,200,196,201,200,198,203,198,199,201,200,199,201,200,200,200,199,202,202,200,198,198,199,200,199,199,199,198,199,196,200,198,196,203,197,199,201,199,200,197,199,197,196,198,197,199,200,197,200,201,203,203,200,201,200,200,200,198,200,199,199,200,199,200,199,198,199,200,200,199,198,199,200,197,198,198,196,199,199,199,199,200,200,198,201,200,198,200,201,200,199,199,201,203,200,200,201,202,200,200,204,200,201,200,200,202,200,201,201,200,200,201,204,201,203,204,203,205,203,203,203,204,207,202,204,205,206,207,203,204,203,206,205,203,205,205,207,205,206,206,204,201,203,205,203,204,204,207,205,203,203,202,204,203,204,202,202,203,201,203,203,205,204,203,201,203,203,201,203,202,199,202,202,203,202,202,203,203,203,204,202,204,205,204,206,207,207,206,206,205,207,209,207,206,207,207,207,207,207,209,207,210,208,210,211,209,211,211,210,212,214,213,214,215,213,212,213,215,211,212,212,213,215,213,213,213,215,215,215,216,216,217,214,217,213,215,214,211,214,214,207,92,52,150,201,218,217,213,214,214,214,216,210,213,214,214,216,217,219,216,214,208,203,199,194,189,192,194,197,203,208,217,225,205,199,224,219,223,223,216,219,217,218,218,219,221,221,223,224,225,225,222,212,226,230,221,226,225,224,222,223,220,222,222,222,222,213,210,208,208,213,207,202,218,207,221,228,229,214,180,212,222,236,199,186,232,234,234,193,153,66,142,244,241,232,138,102,117,66,42,41,28,27,20,50,87,110,106,74,133,162,179,157,59,105,108,94,111,59,55,148,194,159,185,219,174,188,199,249,225,176,208,168,212,135,104,159,95,150,208,212,252,252,189,158,224,241,251,247,241,244,237,243,206,210,224,225,218,197,204,181,220,169,134,184,87,53,63,176,159,1,144,240,85,39,22,23,39,45,30,52,194,202,203,244,252,252,230,143,66,150,192,242,244,223,177,150,214,249,249,247,246,209,63,32,59,101,116,113,89,31,35,96,130,132,96,63,102,113,69,51,44,61,141,111,41,32,53,148,151,134,141,60,26,28,28,37,40,58,57,60,50,48,36,18,28,22,24,28,36,32,27,33,41,48,50,81,114,112,105,102,117,137,104,81,66,47,42,44,37,19,20,29,34,26,57,84,68,51,25,19,23,27,36,28,31,33,30,33,39,37,33,35,33,39,89,82,39,33,28,37,35,33,30,22,25,26,33,43,39,42,41,44,42,33,21,16,19,19,24,23,25,25,23,26,24,33,50,43,35,28,31,27,27,32,27,35,37,36,34,36,37,35,33,36,33,35,30,46,83,60,36,22,16,29,33,35,37,63,90,87,117,123,102,98,97,91,61,42,37,34,33,33,34,34,34,31,31,40,26,59,111,71,54,51,45,70,76,50,51,33,44,122,109,110,93,107,117,72,30,51,65,51,201,240,249,249,247,252,247,246,247,245,245,245,248,248,249,250,249,249,249,248,248,247,246,246,246,245,244,244,244,244,244,245,244,244,244,244,245,246,245,244,248,248,247,247,246,246,247,246,248,249,248,249,250,252,248,248,250,251,250,250,237,116,3,1,4,8,9,9,10,9,11,12,10,12,198,202,200,198,202,199,200,199,199,200,200,199,198,199,200,200,197,199,200,199,200,198,199,198,198,199,198,199,198,198,198,197,198,199,199,196,199,197,199,198,198,198,196,195,198,200,197,198,199,198,200,200,200,200,199,201,200,200,199,200,201,199,198,199,199,201,200,199,200,199,198,199,199,196,199,199,200,200,199,201,199,198,198,196,199,200,199,199,200,198,199,199,199,199,200,200,200,202,199,201,199,200,200,199,202,201,199,199,200,200,200,199,200,200,199,200,202,204,202,202,205,204,203,205,205,206,206,205,204,205,205,201,203,203,203,205,205,203,202,206,205,204,204,204,201,204,204,205,201,204,206,203,204,203,204,204,202,202,201,202,202,204,203,203,201,200,202,201,203,203,204,203,201,203,203,202,202,205,204,202,203,205,203,203,204,204,205,204,207,205,206,208,205,208,205,207,208,204,207,206,208,207,209,209,209,210,208,209,212,209,212,212,212,212,210,214,213,214,213,212,214,214,214,214,213,214,216,214,217,216,214,214,213,213,214,214,213,213,212,211,212,211,218,206,115,118,191,211,220,212,216,215,217,224,221,218,216,217,214,214,210,203,198,194,192,188,194,195,199,208,210,216,217,220,221,228,213,202,222,214,216,219,213,215,214,215,216,216,218,219,218,220,216,226,225,210,232,231,219,217,216,219,218,217,219,218,218,220,219,222,214,208,203,206,205,188,207,197,212,228,225,203,175,209,220,232,171,176,224,222,233,185,151,67,145,241,236,216,83,31,44,49,79,105,110,84,88,79,68,74,58,27,27,45,66,64,12,32,42,41,81,63,43,94,93,48,71,110,83,98,107,122,105,66,70,49,107,92,53,54,29,144,217,203,250,248,154,175,229,228,235,227,228,229,226,234,203,213,226,229,226,191,193,207,236,151,122,162,115,89,96,192,90,17,212,203,38,26,24,34,54,41,30,116,188,186,232,253,251,241,197,114,121,208,222,229,240,187,145,206,252,252,248,248,217,71,34,60,101,112,113,91,35,36,107,147,134,96,55,90,117,78,52,40,36,72,146,105,45,22,31,130,139,134,123,47,31,29,33,40,39,45,45,47,42,28,21,24,23,25,29,29,34,31,28,29,29,61,87,77,77,69,59,42,66,87,63,45,22,29,33,23,31,29,25,27,36,29,63,91,73,50,24,21,21,30,32,33,33,28,29,57,85,58,33,50,60,112,135,98,62,52,43,39,56,76,49,26,23,23,37,43,52,43,39,43,32,25,29,19,20,24,24,29,25,28,24,26,23,69,98,50,42,34,25,33,34,31,32,36,37,38,34,36,36,33,33,32,33,31,29,66,88,57,30,21,18,30,42,46,60,77,73,61,77,87,88,103,112,106,81,66,52,36,36,33,33,35,32,36,31,38,26,65,122,79,45,36,28,32,39,32,33,24,38,107,111,107,94,100,106,72,39,46,69,28,136,237,252,252,250,248,249,249,247,246,247,247,249,248,248,248,249,249,249,248,245,244,245,245,245,246,244,244,244,244,244,244,246,245,246,245,244,244,244,245,247,247,246,246,248,247,245,247,248,249,248,249,250,250,250,250,250,251,250,250,237,116,4,1,4,8,9,9,10,9,12,12,13,13,200,203,200,201,200,196,199,200,200,200,198,199,198,196,199,200,198,199,199,198,198,198,198,197,196,197,198,197,198,199,199,198,198,199,199,198,200,194,197,199,195,199,194,197,198,196,198,196,198,198,196,200,196,198,197,196,200,198,197,199,201,199,198,198,200,198,199,198,199,199,196,200,200,200,198,197,199,199,200,199,200,198,199,199,198,198,198,199,199,198,198,198,198,197,199,199,199,198,199,200,200,200,198,201,199,199,203,198,198,199,199,199,198,200,199,199,200,201,202,200,201,201,200,203,203,202,202,203,203,202,201,202,203,202,204,203,201,203,203,204,201,201,205,203,201,202,203,202,203,201,203,200,202,205,202,203,200,202,204,203,203,200,203,200,202,202,202,201,202,204,200,202,205,200,203,203,203,203,202,204,203,206,205,203,206,205,203,203,204,207,206,207,208,206,207,205,208,206,206,207,206,208,206,205,207,206,208,210,209,208,210,210,212,212,212,213,214,212,212,214,213,214,213,212,212,214,213,214,214,215,215,212,214,214,214,213,213,211,214,213,210,213,221,223,189,215,237,222,220,222,228,231,242,241,227,215,211,207,199,195,192,191,189,193,198,206,211,214,215,216,217,218,216,217,217,224,206,198,220,210,213,216,213,218,215,217,217,217,217,219,218,219,215,227,231,204,231,236,219,223,219,219,217,216,216,220,220,221,221,220,225,220,211,212,204,194,201,194,210,225,229,198,182,218,226,232,172,193,229,232,240,191,158,65,152,245,244,222,136,130,149,167,190,212,211,220,244,248,243,235,210,153,190,211,221,200,59,31,11,17,77,66,41,100,153,125,141,152,129,118,113,110,99,110,130,138,158,108,58,47,67,183,234,213,252,244,153,211,239,232,251,240,248,245,244,249,222,232,240,244,241,184,196,244,237,170,149,164,160,108,128,195,68,76,246,123,6,37,22,49,50,39,96,128,184,226,253,253,250,203,161,168,184,230,199,214,192,155,199,238,252,252,247,213,72,29,57,105,114,108,99,37,47,117,143,144,86,66,59,63,83,55,48,52,52,89,153,94,44,19,63,157,115,122,101,33,35,36,24,22,23,19,19,25,27,16,24,24,22,24,27,36,31,34,34,50,64,72,82,61,49,40,47,36,25,53,53,27,48,59,27,26,46,46,31,22,29,24,63,94,75,55,26,23,22,27,35,29,33,31,33,85,114,80,74,139,165,174,150,131,159,132,79,80,155,141,60,31,23,21,25,44,61,48,42,41,29,29,33,19,23,22,24,53,49,32,26,36,64,108,108,76,64,52,35,38,71,60,34,37,36,40,35,31,33,35,33,28,32,30,34,79,96,52,30,19,22,61,65,43,51,57,57,47,38,41,62,71,72,78,71,61,43,33,29,33,33,37,35,30,31,41,28,61,121,81,52,51,49,36,32,38,31,39,44,104,114,107,98,98,105,73,43,35,77,29,41,188,247,247,246,246,249,251,249,247,248,246,246,246,247,248,248,247,247,246,247,247,246,245,247,247,244,244,244,244,245,244,244,245,244,244,246,244,245,245,245,247,245,246,245,246,246,245,246,246,246,249,249,250,249,249,251,251,250,250,237,115,4,1,4,9,9,9,10,9,11,11,11,11,198,201,198,198,198,198,198,196,199,199,199,197,198,197,198,198,199,197,196,199,196,196,198,198,199,196,196,199,195,197,195,195,197,194,198,197,198,197,196,197,198,196,196,196,195,195,193,198,197,198,198,196,198,196,196,198,196,198,197,197,199,199,200,199,196,196,197,195,198,198,196,199,198,198,198,196,201,197,196,198,196,200,198,198,199,198,197,200,200,199,199,201,200,197,200,199,198,197,196,198,199,199,198,199,198,199,200,199,199,198,201,202,200,200,198,200,200,202,202,201,204,201,200,201,201,202,201,200,201,203,203,199,204,201,201,203,203,204,201,205,203,201,202,201,203,203,201,203,201,201,200,200,202,202,203,201,199,200,202,203,201,199,200,201,201,203,203,202,203,201,201,205,203,201,201,203,201,203,204,202,203,205,203,203,205,204,206,205,205,203,202,204,204,207,205,206,204,203,207,205,207,206,207,205,207,208,206,208,209,209,210,212,210,210,211,212,212,212,212,213,216,212,211,212,214,213,214,212,212,213,212,213,212,215,215,212,214,213,213,213,212,213,222,211,207,236,237,229,225,230,236,217,217,222,211,204,200,201,199,201,205,212,217,221,226,225,230,226,225,227,226,225,223,224,223,225,209,199,223,224,223,227,224,227,229,230,231,232,230,230,230,234,234,241,228,159,189,238,238,240,234,235,235,235,239,238,238,239,240,243,241,244,244,240,236,213,225,214,216,237,235,208,203,239,246,247,191,219,249,249,250,186,134,63,150,245,252,238,208,207,217,223,226,216,215,252,252,252,252,252,252,246,252,252,250,250,126,38,28,39,88,57,39,162,249,251,252,252,229,173,234,247,251,231,242,251,215,230,180,120,146,210,241,225,252,246,167,242,252,252,252,252,253,253,252,252,251,251,252,252,252,185,210,251,191,129,124,199,207,116,158,155,61,170,220,50,8,33,33,52,49,132,134,127,217,253,252,244,218,141,180,212,206,208,179,159,141,199,224,252,248,248,222,67,24,49,94,112,114,97,43,52,108,141,140,94,55,56,29,41,64,55,57,61,51,98,150,78,38,14,96,158,102,117,85,33,35,31,13,17,19,19,23,20,25,28,23,22,23,21,28,34,31,33,36,80,104,69,40,43,46,44,57,48,55,84,75,53,65,63,40,41,60,54,39,23,28,27,62,114,95,67,30,16,24,24,35,30,34,31,38,66,81,86,88,113,122,93,57,58,121,134,115,120,141,112,57,31,17,24,43,98,113,60,34,33,31,30,26,22,25,22,29,57,61,46,59,73,95,113,129,158,146,91,73,133,137,73,31,27,38,35,35,30,36,36,31,36,32,31,44,97,92,47,27,16,46,86,69,41,39,65,74,53,37,55,68,61,50,46,47,36,27,24,27,33,33,34,33,34,33,35,29,56,114,89,58,62,61,48,58,53,47,50,51,123,121,104,98,93,111,82,41,32,76,49,30,89,224,236,245,245,250,251,250,251,248,246,247,245,246,248,248,248,246,245,246,248,246,245,245,245,244,244,244,244,243,244,244,244,241,243,244,244,246,244,244,244,245,244,246,246,244,246,246,248,247,249,248,248,250,248,249,249,248,249,236,116,4,1,4,8,9,9,10,10,12,13,12,12,195,201,199,199,200,196,198,198,198,198,197,198,198,196,198,197,197,198,194,196,199,196,197,199,195,195,196,196,195,193,195,195,196,200,197,194,197,193,195,196,196,198,193,197,197,194,198,197,195,197,198,201,196,200,199,197,198,198,200,198,199,196,198,199,199,199,199,196,197,199,198,199,198,199,198,199,200,198,198,196,198,198,199,202,202,201,200,200,199,198,199,199,198,200,199,200,200,198,201,200,201,201,198,202,199,199,200,198,199,201,201,199,198,200,199,199,200,199,201,198,199,202,199,200,203,201,203,200,200,203,202,202,201,201,203,203,203,203,200,202,200,201,202,201,203,201,203,201,201,203,203,202,200,200,201,204,201,202,204,200,199,200,201,200,202,199,200,200,199,203,200,200,203,199,203,203,200,202,202,205,201,202,203,202,203,202,203,204,203,205,205,205,203,200,206,205,205,205,205,204,205,207,206,207,207,207,210,209,209,213,211,211,215,213,214,212,213,212,214,215,215,218,216,217,217,220,217,219,221,219,222,222,225,224,224,225,224,225,229,230,230,230,230,214,216,244,245,244,240,240,203,110,97,165,216,227,227,235,241,245,248,250,251,249,251,252,251,251,248,248,247,248,247,248,249,247,235,233,248,249,250,251,251,251,252,252,251,251,252,252,251,251,252,252,217,106,150,233,250,250,252,252,251,251,252,252,251,251,252,252,252,252,252,252,252,236,235,233,229,225,229,200,208,236,242,234,179,224,234,232,219,140,113,59,143,240,240,217,187,182,165,150,139,147,175,234,246,246,247,246,222,167,217,240,252,228,89,69,69,82,64,18,37,133,199,199,194,202,170,129,190,215,236,190,212,199,171,245,186,139,145,183,194,185,246,188,149,227,227,233,232,217,218,216,214,212,205,226,239,240,234,135,184,215,99,74,103,195,191,123,173,92,54,209,150,2,28,35,26,43,122,204,133,153,246,252,241,215,152,142,236,227,181,186,136,122,189,229,243,252,244,236,91,22,48,90,113,111,103,46,51,102,129,139,84,63,56,33,14,19,55,53,54,55,49,76,120,78,33,26,128,153,96,118,69,21,23,14,18,21,16,17,21,24,23,29,23,26,27,22,27,25,30,31,32,58,106,97,65,53,40,67,83,78,108,134,118,105,112,94,67,53,50,46,33,19,26,26,67,113,107,74,36,22,17,22,33,34,31,36,83,95,45,33,24,27,24,27,29,19,27,29,33,31,26,43,77,53,31,33,112,201,145,64,35,31,29,27,27,24,21,19,27,34,47,45,47,48,51,41,33,83,113,102,104,116,94,60,34,29,33,31,35,34,34,31,34,37,41,31,60,108,75,44,29,22,69,76,71,56,54,107,101,74,74,97,99,64,39,35,31,24,33,29,23,29,27,34,31,36,37,37,26,52,115,90,75,70,67,63,76,71,57,62,50,121,131,107,95,88,111,90,45,22,163,177,63,46,135,227,250,250,251,247,251,252,251,246,248,247,247,245,247,247,246,244,244,245,244,244,247,245,244,244,243,242,244,242,242,244,243,242,243,244,244,244,244,243,244,244,245,245,246,247,246,248,246,248,248,247,247,247,247,248,247,247,236,116,3,1,4,8,9,9,10,9,12,12,12,12,200,203,200,200,199,199,196,196,200,199,198,200,202,198,199,199,196,196,195,196,196,196,196,195,198,196,197,198,196,195,196,195,196,196,196,196,197,195,196,197,197,195,194,197,198,198,196,198,197,198,196,195,198,198,198,200,196,198,197,197,199,194,198,200,197,199,198,199,199,198,198,197,198,202,199,197,200,197,201,200,200,200,199,200,200,199,201,199,198,200,200,202,200,199,200,200,202,202,201,202,202,200,199,198,196,199,200,198,201,200,201,199,201,203,199,202,199,201,200,198,201,200,199,200,200,200,200,200,200,200,201,201,205,203,202,202,200,200,200,202,200,203,203,202,200,200,202,204,203,202,201,203,202,200,201,202,202,200,202,202,200,200,201,200,200,200,199,201,200,202,201,200,201,201,202,200,202,200,201,203,203,202,201,203,203,201,204,206,205,205,205,206,206,203,204,208,208,206,206,206,208,206,208,208,209,208,208,213,213,213,212,212,211,212,214,212,212,215,219,226,234,237,240,239,242,243,246,246,246,247,247,249,249,251,251,251,252,252,252,252,251,251,246,223,236,249,251,248,235,240,194,95,101,192,244,248,252,252,249,249,251,251,250,249,249,246,249,248,245,245,244,243,242,244,248,248,240,232,241,246,243,244,244,240,242,239,241,235,237,232,221,228,226,218,173,104,133,184,205,222,221,218,218,219,217,213,214,211,205,210,205,198,198,198,199,174,181,184,178,186,169,155,158,166,167,146,116,138,127,125,126,106,106,77,96,139,150,135,112,93,94,83,77,89,85,114,137,126,138,108,100,57,51,90,118,114,58,86,107,86,51,28,53,62,57,59,51,59,76,35,42,38,83,61,47,74,53,107,112,104,107,104,103,80,133,71,68,130,98,95,79,68,77,72,73,58,76,90,86,98,141,101,71,125,98,117,112,161,149,120,191,61,79,217,74,5,45,23,25,92,204,203,140,193,251,251,226,148,141,200,249,202,157,175,149,192,234,251,250,250,247,103,29,47,93,112,113,105,51,44,104,137,134,91,62,67,34,18,12,17,39,53,56,57,46,71,110,73,33,41,152,137,101,118,49,19,19,11,16,19,23,22,21,25,26,24,27,23,24,25,28,30,31,34,31,28,67,106,93,54,54,122,139,105,116,139,142,120,111,97,60,44,37,40,32,20,19,26,52,78,84,64,33,30,23,22,30,34,27,58,130,96,34,21,17,26,26,25,30,28,28,24,24,28,24,47,103,80,36,42,134,172,90,49,42,32,31,27,29,32,18,29,53,34,23,21,20,17,22,25,21,25,30,30,30,29,32,74,63,33,38,32,34,33,33,31,34,40,46,53,83,100,61,39,21,42,88,94,85,67,75,170,144,103,117,133,128,78,53,51,29,34,46,34,21,24,29,33,33,34,37,37,31,43,91,82,79,86,70,69,83,71,68,69,48,123,132,110,102,92,112,94,38,34,207,236,177,121,38,168,245,245,246,244,252,252,249,249,250,248,248,247,245,246,246,245,246,247,244,244,245,245,244,244,244,244,242,242,243,243,244,244,244,243,244,244,244,242,244,244,245,246,245,246,246,246,247,248,248,248,248,248,249,250,249,248,236,116,4,1,4,8,9,9,10,10,11,12,11,11,198,198,196,198,199,195,196,198,197,197,198,195,196,197,197,196,198,195,195,196,196,196,194,195,194,195,196,195,195,193,196,194,193,197,194,193,198,196,196,194,196,199,197,197,197,195,197,196,196,197,196,196,196,198,195,197,197,196,197,196,198,196,198,195,195,195,196,195,197,198,197,200,196,198,198,196,198,196,199,200,199,200,200,199,198,198,200,199,199,200,200,199,199,200,200,199,200,200,199,198,200,202,199,200,200,199,200,198,200,200,198,201,200,203,200,200,201,201,203,199,201,201,199,202,203,199,199,203,201,202,199,201,202,200,201,203,201,202,202,203,199,203,203,201,205,199,200,202,200,202,203,200,201,201,200,202,200,201,201,200,200,199,200,200,201,199,202,200,198,202,200,201,201,201,202,203,201,203,202,203,204,202,201,203,203,203,203,204,205,203,204,204,205,205,208,207,208,207,208,209,206,206,209,207,208,212,211,212,212,213,212,211,214,211,214,210,212,227,228,246,250,250,252,252,251,251,251,251,251,251,251,251,250,250,250,250,248,248,242,236,230,220,221,219,188,184,190,182,186,199,190,155,170,192,191,194,191,181,174,173,174,171,176,167,166,159,155,160,157,163,162,162,159,157,163,159,158,153,156,159,158,155,151,151,151,152,151,144,160,148,119,125,121,137,128,112,130,118,116,122,127,127,119,116,113,111,106,100,99,97,93,87,80,87,80,96,85,100,126,110,134,116,112,118,92,72,51,57,46,60,71,83,96,74,77,75,81,79,61,76,90,81,90,73,38,39,41,46,34,21,34,37,19,16,41,52,53,110,125,101,52,46,67,63,33,22,23,41,44,25,47,16,37,21,24,41,33,59,31,80,97,69,78,57,33,8,35,55,25,27,17,23,29,42,46,18,46,35,11,27,128,100,42,97,134,147,141,174,104,136,198,63,151,231,41,15,40,3,55,178,206,191,170,223,250,244,179,134,191,242,250,158,127,174,210,246,252,252,249,249,129,27,42,85,121,117,104,49,49,98,133,143,89,70,68,39,19,13,15,23,46,68,64,68,51,95,127,67,23,55,160,130,118,118,42,18,18,15,17,16,24,26,26,25,20,24,27,23,26,25,30,29,31,30,34,30,38,78,77,53,48,80,88,76,114,134,103,82,59,54,47,29,32,39,39,24,18,31,54,68,72,51,38,53,35,24,30,30,28,50,115,89,33,27,15,27,35,37,41,34,37,33,22,29,31,51,114,90,40,31,46,62,44,40,44,33,25,32,33,24,19,39,56,35,24,20,17,24,26,24,24,23,25,23,28,27,38,93,88,42,35,35,34,27,32,36,43,71,77,81,94,81,54,27,38,69,107,100,73,51,81,130,113,120,128,134,120,78,62,49,39,51,49,32,23,22,26,31,37,31,32,43,32,34,54,41,54,65,46,42,50,53,65,77,48,113,139,122,111,90,118,108,37,32,190,227,229,167,25,46,213,238,247,247,249,251,248,251,251,248,247,245,245,246,245,245,245,247,245,244,245,242,243,243,244,244,244,243,244,244,244,244,244,245,242,242,244,244,244,244,244,245,245,248,247,247,246,245,247,247,245,247,247,248,247,247,236,115,4,1,4,8,9,9,10,10,11,11,12,12,195,199,195,196,196,196,196,195,198,196,198,198,195,194,194,197,194,198,197,198,196,195,196,194,195,193,193,195,196,195,195,194,195,195,195,194,195,195,194,196,193,195,198,194,196,196,192,195,196,195,197,198,197,195,195,198,197,196,198,196,198,195,196,196,196,198,195,197,198,198,198,196,198,196,194,198,198,198,195,196,199,196,199,198,200,198,198,200,201,200,199,200,199,199,201,202,201,200,200,200,199,198,201,203,201,200,200,203,203,200,200,198,200,200,200,200,197,201,201,200,199,202,201,201,202,199,202,201,200,201,201,198,200,200,200,200,200,201,200,200,201,202,199,199,201,203,202,200,198,200,201,200,200,200,201,201,201,200,200,200,198,199,199,200,200,199,199,202,201,200,201,201,201,200,202,201,203,201,203,205,200,203,203,204,205,202,204,204,203,205,205,205,206,208,211,208,209,209,210,210,209,208,210,211,211,212,212,212,213,213,214,213,213,212,212,210,212,223,190,177,183,180,181,175,176,171,170,167,166,163,158,155,151,150,143,140,141,134,129,125,116,119,169,179,98,73,87,91,97,103,101,108,113,104,98,89,93,82,72,83,71,73,76,79,73,66,66,61,67,80,84,81,73,62,62,64,68,65,64,67,66,63,65,59,59,60,69,64,101,105,48,53,52,89,112,108,119,106,88,63,62,64,56,56,50,49,47,47,45,51,48,47,45,40,43,63,61,51,79,102,115,121,107,94,111,92,60,53,25,44,61,71,88,68,63,77,72,83,94,89,109,90,90,87,36,24,24,29,28,32,45,56,47,44,41,39,69,118,135,110,65,57,68,80,56,41,36,53,62,50,55,22,57,34,37,63,36,51,27,61,90,71,86,72,46,16,16,37,24,32,23,37,43,35,46,27,53,39,7,25,70,60,41,84,114,114,163,161,84,155,179,93,210,236,49,18,23,6,121,191,190,207,201,229,242,201,148,181,242,249,230,128,97,163,232,246,253,253,249,160,36,41,87,111,118,84,57,46,95,127,137,94,57,59,39,19,14,12,25,47,81,101,103,117,88,141,141,44,12,76,161,112,129,112,33,21,11,16,22,19,23,23,30,27,21,23,25,21,24,26,28,31,27,29,37,28,35,54,77,69,35,33,25,68,98,76,56,49,37,53,46,24,37,51,43,28,21,24,53,68,60,49,55,61,33,31,33,27,33,34,96,95,41,29,24,29,35,45,40,38,37,36,24,29,32,73,122,74,23,22,28,31,37,37,36,37,33,37,30,29,19,34,53,30,25,19,20,27,27,27,26,23,24,24,25,33,41,109,101,46,36,25,36,34,32,31,68,87,82,99,85,63,39,26,57,77,89,77,53,42,29,55,113,142,127,82,71,57,58,55,42,46,36,27,31,29,25,27,32,36,33,32,33,35,30,29,29,32,31,28,27,29,59,69,43,108,141,134,113,89,115,100,52,14,129,154,147,129,25,29,168,237,250,250,252,252,248,249,249,248,245,245,245,245,245,244,244,245,245,244,243,242,243,244,244,242,242,244,244,242,244,242,244,244,244,244,244,245,244,244,244,243,245,244,244,245,244,245,246,246,244,244,246,247,247,246,236,116,4,1,5,9,9,9,10,9,12,12,12,12,196,199,198,195,199,196,195,198,197,199,196,195,197,198,196,192,196,197,195,193,197,194,194,196,193,193,194,198,195,193,197,196,194,196,195,195,195,191,195,196,196,195,195,196,196,196,196,194,193,196,196,196,196,195,196,195,194,195,195,196,195,194,199,196,196,198,198,198,197,197,197,200,199,198,198,199,199,198,198,196,196,195,200,199,198,199,198,197,198,200,200,201,200,199,202,198,199,199,199,200,199,200,199,200,202,200,201,201,202,201,200,202,196,200,200,200,200,199,200,200,201,200,200,198,200,198,200,202,200,201,201,202,200,201,202,201,200,201,200,200,200,202,204,200,203,201,199,202,200,201,200,200,202,201,200,201,202,202,200,200,201,198,199,202,200,201,201,200,199,198,200,201,201,198,199,201,200,204,203,205,203,203,205,203,205,202,205,206,206,207,208,210,210,210,211,212,213,212,212,214,212,213,214,212,214,214,213,214,213,214,214,211,212,211,212,212,210,207,104,19,17,13,12,12,11,11,9,10,11,10,10,10,10,10,10,10,10,10,12,10,12,9,61,91,27,5,10,9,12,10,13,12,10,12,12,12,13,12,13,13,12,12,12,12,12,12,12,12,13,13,12,12,13,13,13,14,13,13,14,13,14,14,13,14,13,12,14,12,21,23,12,13,12,14,15,17,23,18,14,14,13,14,14,15,14,13,14,14,14,14,15,15,13,15,13,18,17,15,18,15,60,76,60,66,53,72,89,58,21,10,11,17,30,22,35,67,67,85,90,84,84,66,66,26,8,26,26,27,30,31,33,41,40,31,32,22,43,93,133,114,74,65,68,86,46,32,24,25,25,44,34,12,58,28,33,28,12,30,11,27,38,39,77,95,48,27,21,20,27,32,33,29,36,27,29,11,33,39,12,13,24,12,39,83,66,86,174,105,95,194,189,171,248,190,21,38,15,46,150,174,210,238,212,233,201,144,184,240,249,249,199,141,133,174,227,248,249,249,156,44,38,81,109,109,112,46,53,102,130,138,89,66,55,35,19,10,18,27,46,86,117,124,123,129,101,156,134,30,18,105,156,102,122,87,29,20,9,20,19,22,24,24,25,24,27,24,24,25,23,24,29,27,27,36,34,32,33,55,93,59,27,22,21,60,77,67,39,30,27,35,36,24,46,57,42,32,21,19,40,55,61,45,49,53,32,29,27,29,37,21,66,96,45,30,28,30,38,39,37,38,37,38,31,28,28,74,113,48,19,24,29,41,30,40,41,39,38,36,31,24,16,25,57,36,24,21,19,29,23,28,28,22,32,24,29,28,65,122,81,39,31,29,30,33,36,44,73,85,81,93,81,50,35,23,54,74,67,59,41,37,33,45,88,98,71,56,49,36,47,54,45,39,34,32,33,27,28,29,34,35,27,37,33,33,33,27,39,33,37,36,27,29,37,48,27,96,139,131,119,83,106,101,58,30,45,57,24,42,26,28,182,244,250,250,246,252,249,250,249,249,246,244,245,245,244,245,245,245,245,244,244,244,244,244,244,242,244,243,244,244,243,244,242,244,244,244,244,243,245,243,244,244,243,244,243,244,244,244,244,244,243,243,244,246,243,245,237,116,4,1,5,9,9,9,11,9,12,12,13,12,196,201,197,199,198,195,197,196,198,194,195,196,193,197,198,194,194,193,194,196,195,197,194,196,196,196,198,195,196,194,195,195,196,198,196,195,196,194,197,196,194,196,197,193,195,198,196,197,194,194,197,193,195,196,195,196,194,194,195,196,196,195,198,196,198,198,196,197,196,195,195,198,197,197,196,198,198,197,196,198,200,199,199,195,200,200,199,200,198,199,200,200,199,199,199,198,198,199,200,198,198,199,201,198,198,199,198,200,199,198,200,199,200,200,199,202,200,202,199,199,200,200,199,198,202,200,199,200,199,200,199,200,201,200,202,201,200,203,200,201,201,199,200,203,201,200,201,198,200,201,201,200,200,203,201,202,201,201,203,200,201,200,200,199,201,203,201,200,199,199,201,201,200,200,199,200,203,200,204,205,203,207,204,204,205,203,204,207,209,208,212,213,214,214,214,212,214,214,215,214,216,217,213,215,216,214,214,213,214,213,213,212,214,213,212,213,216,211,115,48,41,28,32,28,25,22,19,19,19,18,16,17,16,14,12,12,13,10,11,10,13,8,24,37,12,9,12,10,11,10,12,11,11,12,12,13,12,12,12,12,12,12,13,13,13,13,13,14,12,12,12,12,13,12,13,13,14,14,13,13,13,13,13,13,14,14,14,14,13,14,14,15,13,14,14,14,15,14,14,14,17,15,14,14,15,16,15,17,15,15,15,14,16,15,14,14,14,15,14,15,13,34,46,28,37,41,59,79,49,21,13,12,27,14,48,76,62,91,81,62,82,64,48,14,10,14,13,15,20,24,33,32,24,42,38,10,44,95,120,105,80,61,72,81,49,23,15,30,19,36,23,26,61,39,57,32,36,54,30,39,66,63,71,92,63,64,49,42,129,32,21,17,18,15,21,17,36,42,48,117,93,75,134,95,34,125,168,82,165,210,160,171,161,49,29,139,104,103,148,174,241,244,207,199,141,166,229,250,250,231,160,139,176,166,234,247,249,165,41,36,72,116,108,103,57,58,101,134,139,104,63,64,39,14,13,15,29,48,84,111,125,116,111,113,92,154,117,22,24,121,141,95,125,74,22,19,11,18,16,24,23,21,24,23,24,24,22,23,22,26,28,29,28,33,35,29,27,38,79,71,39,33,21,39,48,41,36,19,15,27,33,33,49,61,50,33,22,23,44,66,63,46,60,55,31,23,21,24,34,26,65,97,42,31,24,24,38,37,37,33,35,35,28,33,32,77,103,40,21,19,30,42,36,43,44,46,38,38,29,22,19,19,49,37,24,22,18,27,25,22,28,22,29,27,29,31,61,112,61,27,29,29,35,24,33,53,83,84,81,92,69,44,27,17,48,60,65,59,57,55,48,57,53,52,52,39,42,42,47,44,37,38,29,26,28,33,27,29,33,36,36,28,33,36,33,35,36,36,36,42,36,33,35,31,26,93,142,130,116,78,98,100,67,33,38,57,38,43,20,91,250,250,250,247,245,252,247,252,249,246,248,247,245,245,244,243,245,245,245,244,243,245,246,244,244,243,244,244,244,244,244,244,244,244,244,244,244,244,244,244,244,243,244,244,244,244,243,244,244,244,242,243,244,245,246,244,235,116,5,1,4,9,10,9,11,10,12,12,11,12,198,200,200,197,198,196,194,196,194,194,193,194,194,196,197,193,194,195,195,193,194,194,196,195,195,196,195,195,196,194,195,195,193,196,194,193,196,197,199,194,195,196,195,196,195,194,192,194,196,195,195,194,196,195,194,196,196,198,195,196,196,195,195,194,196,195,195,197,196,196,196,196,196,195,197,198,198,199,200,196,198,198,199,194,194,199,200,200,197,198,197,195,200,198,200,199,198,198,198,199,200,200,198,198,198,198,198,199,199,200,198,201,201,201,201,198,199,198,200,200,199,201,199,200,199,199,202,199,201,200,200,203,200,200,204,204,201,203,201,204,200,201,203,199,201,200,200,202,201,201,202,199,200,201,202,203,200,202,203,202,202,202,204,204,201,202,203,203,204,205,206,202,203,202,202,205,203,203,202,205,206,205,206,204,206,206,206,211,212,212,212,212,212,212,213,212,213,213,213,215,214,215,214,214,217,213,211,212,212,211,212,213,215,212,214,215,219,231,212,215,223,219,226,223,224,222,222,221,216,217,214,212,212,208,205,201,198,195,190,183,174,164,165,158,147,158,160,159,155,151,152,144,139,136,131,129,122,117,117,118,119,120,125,125,123,127,127,125,129,131,129,130,129,134,134,136,139,138,129,130,145,141,139,142,141,145,144,139,135,95,117,146,136,132,109,107,109,108,127,136,143,157,156,153,158,155,158,164,159,160,159,158,161,158,157,162,147,157,149,139,142,126,145,181,146,115,157,156,139,168,135,91,66,15,87,160,144,156,130,112,104,101,157,168,179,193,191,199,199,198,165,151,175,191,203,123,75,104,117,108,75,57,81,124,141,142,168,198,183,200,169,132,168,172,219,198,155,161,172,219,215,151,118,107,109,192,234,249,247,226,214,232,233,229,229,220,222,232,240,239,234,232,163,63,55,158,149,118,226,170,87,136,70,4,133,239,133,102,150,201,252,222,159,134,153,206,249,249,233,184,85,132,178,194,240,246,186,40,27,67,102,107,98,58,51,90,135,133,92,71,59,41,16,10,13,22,46,80,111,110,113,103,101,109,108,158,96,9,34,137,128,101,122,52,21,20,11,16,12,24,25,21,24,25,24,17,23,24,24,27,29,26,29,31,27,30,25,33,91,83,55,52,61,81,65,43,34,22,20,33,29,38,53,55,61,46,14,25,47,33,29,31,60,58,29,31,20,24,30,31,85,104,39,27,27,22,34,33,35,31,30,37,27,33,28,78,110,44,30,19,28,44,39,55,75,81,74,62,49,32,19,21,48,30,26,21,16,30,22,27,30,24,29,22,31,32,63,100,47,26,32,28,34,36,38,67,89,82,71,61,44,28,22,19,48,66,75,78,76,70,60,77,69,47,33,28,46,42,35,36,36,33,31,28,29,29,33,29,32,34,33,33,32,36,35,34,36,32,34,41,41,37,33,39,28,101,143,130,120,75,100,101,69,29,40,104,117,143,216,253,252,252,251,247,252,250,249,247,247,248,245,246,247,244,245,244,246,246,245,244,243,244,244,244,245,244,244,243,244,244,241,242,244,244,244,243,243,242,244,243,244,244,242,243,243,242,241,243,244,244,244,244,245,245,244,244,234,116,3,1,4,9,9,9,11,10,12,12,13,12,197,200,199,200,200,195,195,195,197,196,195,196,194,194,194,196,197,196,195,195,194,196,195,197,196,192,195,194,198,197,194,194,193,194,193,195,198,193,197,197,194,197,196,194,195,196,195,195,194,194,197,194,194,195,194,195,196,194,195,195,195,193,196,194,194,195,196,197,194,194,196,194,198,198,195,200,198,199,198,199,196,194,199,197,198,197,195,198,198,195,198,199,196,199,199,196,197,197,200,196,198,198,198,198,198,199,198,200,198,198,199,199,198,197,199,200,199,200,200,200,201,200,202,200,200,201,202,201,200,201,200,202,203,202,205,203,203,203,201,204,205,207,206,203,203,202,202,200,203,205,205,205,205,204,203,205,205,203,205,204,206,204,207,205,205,205,203,206,208,206,202,205,203,205,206,205,207,205,206,208,204,205,206,205,208,208,209,211,212,212,214,212,210,213,213,212,214,213,214,210,211,214,212,212,214,214,213,211,213,214,212,212,214,214,213,215,218,229,232,250,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,252,252,253,253,253,253,253,253,253,253,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,253,253,252,252,251,234,249,252,252,252,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,252,252,251,250,232,244,252,252,246,194,219,251,232,251,241,214,168,103,162,236,192,187,159,151,125,145,238,250,252,252,253,253,253,253,252,252,252,252,248,184,93,101,109,107,83,68,89,153,234,244,252,252,216,252,248,218,220,233,252,211,193,219,244,252,248,208,132,117,122,219,252,252,252,252,252,252,252,252,252,252,252,252,250,231,251,251,138,19,92,184,110,145,223,98,104,181,113,100,230,252,128,111,173,225,246,184,98,149,209,244,248,248,204,145,129,139,215,227,248,202,57,34,64,99,101,99,58,44,88,127,139,95,61,57,45,16,11,16,23,46,81,108,117,113,108,99,100,97,112,154,80,7,52,147,121,105,111,44,21,17,11,18,15,24,24,19,24,21,19,26,22,27,27,26,30,28,24,27,26,36,26,43,99,80,92,89,71,72,68,58,43,42,34,28,29,36,44,54,69,45,18,23,46,46,25,15,33,39,31,28,16,27,27,36,115,105,38,31,22,23,33,32,33,29,29,33,34,38,31,90,128,58,26,23,34,39,33,125,188,189,186,166,122,74,25,26,48,31,24,18,20,26,29,24,27,27,27,21,29,28,63,103,47,31,33,26,37,33,41,78,83,57,42,30,24,22,19,22,57,92,77,75,92,71,53,74,81,58,37,31,33,24,25,33,42,29,22,33,32,31,29,29,26,27,38,35,34,33,38,34,30,35,33,42,35,37,35,37,31,90,146,127,123,80,93,103,79,18,89,239,252,252,252,252,252,252,251,249,249,249,245,245,247,247,246,245,244,244,243,243,244,244,244,244,244,242,242,242,244,244,244,242,241,242,243,244,243,241,241,241,241,241,241,241,242,242,244,242,241,241,241,241,242,244,241,243,244,243,244,242,234,118,3,1,4,9,9,9,10,9,11,12,13,12,197,198,199,197,199,197,196,196,195,198,195,195,197,195,194,193,196,196,193,195,197,195,198,197,196,196,196,194,196,196,198,199,195,198,196,197,196,193,196,193,195,195,193,196,195,199,198,196,196,195,199,195,195,196,193,194,194,194,196,193,193,196,197,197,194,194,196,195,196,197,196,197,198,197,198,196,194,198,198,196,196,198,196,197,198,199,198,195,196,199,201,198,196,198,198,199,199,198,198,198,198,196,196,198,200,199,198,199,199,196,198,199,199,198,199,201,201,201,200,200,198,200,200,203,201,201,204,200,202,202,202,204,202,202,203,203,203,206,204,203,205,207,206,205,204,204,204,203,206,205,207,205,206,207,205,206,205,205,203,205,207,206,206,206,205,206,206,208,207,206,207,205,208,206,206,208,207,207,208,208,206,207,207,207,211,211,211,213,211,210,214,212,212,212,213,213,212,212,214,214,210,214,213,208,214,216,214,215,214,214,213,214,213,214,214,213,214,215,218,225,227,231,234,232,236,235,237,238,239,241,240,242,242,242,242,244,247,245,245,245,247,246,249,247,244,249,249,250,252,250,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,251,252,252,252,251,244,252,252,252,252,252,252,252,252,252,231,239,251,252,252,252,252,252,252,252,252,244,247,251,251,251,249,250,249,248,251,250,250,249,246,252,245,232,246,237,230,212,232,250,251,221,185,227,236,235,251,232,214,184,141,217,208,154,170,181,182,153,170,237,249,252,252,249,247,251,249,219,244,252,252,250,149,83,97,91,103,70,78,75,133,237,231,230,174,184,249,231,217,171,181,222,193,199,207,235,249,247,226,128,71,115,163,188,227,248,235,227,245,247,244,244,246,228,239,185,179,243,214,87,14,151,164,93,154,134,57,109,191,156,128,228,226,115,137,208,229,238,144,125,230,252,252,251,227,155,227,215,195,230,241,200,71,43,63,98,103,101,62,36,88,124,128,97,52,59,42,17,14,12,26,48,84,108,114,120,113,114,103,97,99,118,145,63,6,77,151,110,110,100,31,20,16,14,18,18,24,16,22,24,21,27,24,23,28,28,32,30,27,29,30,25,31,22,35,67,63,116,99,69,59,36,37,39,37,45,33,24,30,29,51,59,45,20,17,40,49,28,24,43,37,26,27,14,25,22,62,133,95,36,27,22,24,31,33,35,32,31,39,30,42,33,98,158,73,37,16,38,36,54,216,251,250,241,223,205,157,62,39,50,27,27,16,18,27,22,27,26,29,23,18,31,28,75,109,62,29,29,24,34,32,41,64,56,40,25,29,36,30,26,53,116,110,79,84,70,52,28,34,55,50,47,44,39,29,26,48,45,26,24,21,29,33,32,31,27,32,37,33,33,38,31,37,32,31,33,34,30,41,37,39,29,76,141,132,125,87,94,105,86,20,95,238,252,252,252,252,252,248,250,247,247,247,245,245,245,247,247,244,243,242,241,242,244,242,244,244,242,242,242,244,242,241,241,243,243,242,242,244,244,242,242,240,241,241,241,240,241,241,241,239,240,241,241,241,242,244,243,242,244,245,244,244,233,117,3,1,4,8,9,9,11,9,11,12,12,11,196,200,197,199,200,196,197,197,198,198,198,199,196,194,195,199,195,196,194,194,196,194,196,194,197,197,195,198,194,194,195,197,198,197,199,197,196,194,196,196,194,192,193,195,196,193,196,196,194,196,198,194,194,194,194,194,194,195,196,195,194,195,194,196,198,195,195,198,195,196,197,195,198,195,193,198,195,194,198,199,196,196,198,195,197,198,197,196,196,196,199,198,195,196,199,196,199,196,198,197,197,199,197,196,195,199,196,198,198,197,199,199,199,199,197,197,198,200,200,199,202,200,202,203,203,201,205,205,204,204,203,205,205,201,204,203,205,206,205,208,203,206,206,207,205,202,206,205,205,206,208,207,206,205,205,206,206,206,207,205,206,208,207,204,203,205,205,205,206,208,208,208,207,207,207,207,206,208,208,208,205,205,207,208,210,210,208,211,210,212,214,211,212,212,213,212,211,212,212,212,213,215,211,212,213,214,214,211,212,212,214,213,214,214,213,215,214,214,215,216,215,215,215,214,215,216,218,217,217,218,217,217,217,218,217,218,218,217,220,216,217,219,217,217,212,212,217,219,218,219,216,218,219,219,222,221,224,222,223,223,222,221,223,222,221,222,221,223,223,220,223,222,222,220,214,224,221,225,218,210,225,220,223,227,224,224,221,223,226,203,197,216,220,225,223,222,224,220,221,219,210,212,214,217,218,214,220,215,216,217,217,217,218,215,220,213,202,214,207,189,180,212,224,231,193,166,215,218,208,227,207,188,153,133,190,169,154,186,194,187,151,171,221,231,230,221,222,218,226,217,186,220,223,230,250,123,81,87,78,84,69,88,61,151,208,155,203,168,190,239,210,172,105,179,220,193,197,187,222,228,233,235,153,73,113,136,141,200,223,221,209,217,223,219,220,226,210,214,186,153,184,176,47,61,193,125,112,135,96,72,87,141,135,84,160,185,113,171,216,234,212,155,203,249,246,246,229,142,125,198,229,214,247,175,72,49,56,106,104,110,71,42,89,122,134,98,58,60,39,17,14,13,29,48,87,112,116,117,113,111,106,94,101,101,125,148,49,5,100,143,104,117,82,25,19,8,19,19,17,22,21,27,24,21,26,24,27,26,29,29,30,32,27,27,28,38,33,32,34,39,93,77,47,85,75,48,57,66,69,43,21,20,26,40,55,45,19,22,43,41,32,56,67,39,26,22,16,30,23,79,128,66,27,26,24,27,33,30,27,32,32,36,31,41,36,64,137,101,60,46,83,82,126,194,185,144,159,155,152,104,55,49,53,22,21,18,16,30,25,28,25,22,26,21,30,27,69,118,74,39,27,26,37,31,41,60,60,44,34,65,74,27,53,117,144,134,83,62,59,34,30,48,71,43,34,37,29,31,36,45,35,23,30,19,25,34,29,32,24,26,36,30,32,37,32,34,35,37,29,33,34,41,36,39,31,63,138,137,122,93,95,115,98,18,69,237,252,252,250,248,251,244,249,248,244,245,245,245,244,245,244,244,244,242,244,242,243,242,239,241,239,239,242,241,241,242,242,242,243,241,241,242,242,242,240,241,239,240,240,241,241,239,239,239,239,241,242,244,243,244,242,244,244,244,246,244,233,116,4,1,4,8,9,9,10,10,12,12,12,12,199,199,199,198,199,198,199,199,198,199,198,196,198,196,196,198,195,198,196,197,196,192,198,194,196,196,195,196,196,195,195,195,194,197,195,196,197,197,199,194,195,196,196,196,195,195,194,194,193,194,194,192,196,196,194,194,196,195,196,195,198,195,194,195,193,193,195,193,194,195,196,195,193,195,198,197,194,196,196,196,196,195,195,196,193,193,197,197,197,197,197,193,194,198,194,196,196,196,200,197,197,198,196,196,198,198,198,197,200,198,198,198,198,199,198,199,199,200,200,203,203,202,202,202,201,203,206,206,206,204,204,207,206,205,206,205,203,204,206,204,206,205,207,208,206,208,205,206,207,205,207,205,206,206,204,205,204,203,205,206,203,203,206,204,205,205,205,207,204,207,208,204,208,209,208,207,208,209,207,209,206,209,210,208,212,210,211,214,213,212,211,213,213,212,211,212,212,211,212,212,212,213,213,212,212,212,212,212,211,212,211,212,212,214,213,213,215,213,217,215,214,215,215,215,214,214,213,215,217,215,215,214,215,216,214,217,215,214,214,214,214,213,214,214,207,208,211,211,212,212,214,211,213,213,212,212,212,214,211,211,214,212,212,211,212,214,213,213,213,210,211,212,213,207,208,215,212,215,212,196,212,212,209,216,214,214,213,213,220,202,188,217,224,214,208,209,209,208,210,209,204,196,202,209,209,207,208,208,208,208,206,205,210,207,211,205,191,207,199,173,178,210,216,217,165,160,207,218,212,212,195,169,141,110,164,173,166,187,160,132,103,143,213,222,221,214,217,215,218,210,178,214,214,225,235,107,64,60,60,77,71,92,68,172,202,169,215,175,220,186,167,166,138,227,217,181,158,181,226,221,229,250,155,67,123,124,150,200,215,227,210,215,219,219,219,234,208,212,158,105,173,113,45,140,191,108,122,116,65,80,76,129,143,83,169,185,150,205,229,214,173,163,168,175,134,110,88,45,5,105,179,204,197,60,40,61,98,112,110,72,41,86,125,134,102,61,56,41,16,11,17,25,47,87,111,117,114,104,117,104,92,97,103,102,131,139,34,14,117,134,103,109,67,22,17,15,21,21,21,19,22,26,24,22,25,29,39,46,47,46,40,36,33,27,32,39,30,28,25,35,76,95,103,146,136,90,67,40,35,33,30,27,39,55,46,57,41,26,42,34,43,83,63,33,21,18,22,24,27,90,110,43,29,26,28,36,30,35,30,34,32,32,33,36,40,31,124,109,94,200,235,214,127,86,52,33,31,24,43,37,35,57,36,16,27,18,24,29,24,27,29,28,26,24,29,24,54,108,95,49,30,26,34,35,49,79,67,37,76,116,75,44,103,137,165,150,90,96,61,47,41,70,79,34,56,47,26,39,42,36,23,24,25,23,22,29,38,26,26,29,29,30,33,33,36,32,31,36,31,33,39,38,37,49,31,57,133,135,130,103,101,115,99,18,55,232,252,252,250,242,249,242,247,245,245,245,244,245,243,244,244,243,241,243,241,241,241,240,241,241,239,239,240,241,242,241,240,241,242,242,241,240,240,241,240,240,239,239,239,239,240,238,238,239,240,242,242,245,244,244,242,242,244,242,241,242,233,116,3,1,4,9,10,9,10,9,12,12,12,12,195,199,198,198,199,194,194,196,197,197,193,196,196,195,193,195,195,196,195,193,196,195,196,196,197,193,191,195,194,194,194,195,195,193,197,195,195,194,194,194,194,196,196,198,196,195,195,193,194,195,195,195,193,196,198,195,194,196,197,193,194,196,196,196,196,194,194,194,193,194,193,194,195,196,194,193,194,195,194,193,194,196,194,195,195,194,193,195,197,193,194,194,195,195,195,194,196,194,196,197,196,194,192,198,196,195,198,197,198,195,198,198,196,199,198,200,202,200,200,201,204,203,203,202,204,203,203,205,203,206,205,204,205,204,206,204,202,201,201,204,203,206,206,203,204,206,205,206,204,202,204,200,205,206,204,203,202,202,202,203,203,205,204,205,206,205,206,207,207,205,206,207,206,206,206,208,208,209,208,211,209,210,212,210,211,211,211,212,211,209,212,210,210,212,212,212,211,212,210,211,213,212,210,211,210,211,211,211,212,212,213,212,213,213,212,212,212,212,213,212,212,213,212,213,214,212,213,212,214,215,214,215,214,215,214,214,214,211,213,210,211,212,212,214,210,211,212,211,209,212,213,210,209,210,212,211,208,210,212,208,209,207,210,209,210,212,207,208,209,207,207,209,210,203,205,210,209,211,210,195,208,215,203,215,219,212,212,206,222,215,192,230,234,213,208,204,210,204,203,208,203,199,199,205,209,204,206,205,203,205,205,206,207,205,208,205,189,207,195,159,184,211,214,214,149,160,208,210,215,220,198,167,125,107,164,161,149,168,148,133,98,125,203,221,221,213,212,212,222,206,172,213,211,229,223,102,77,72,89,69,73,95,61,193,216,199,224,184,184,143,190,180,192,235,147,142,158,198,229,218,222,247,137,61,112,134,174,185,201,212,208,224,217,216,226,234,213,175,108,100,160,65,69,192,156,114,139,71,27,50,62,139,178,128,187,213,183,228,221,146,101,72,55,42,25,27,17,10,10,110,184,150,81,21,57,95,108,121,77,35,64,118,131,94,62,59,45,21,14,14,27,45,81,106,111,114,107,103,97,96,103,102,93,98,125,118,22,19,123,123,103,108,57,21,20,15,22,21,16,23,21,27,25,29,50,68,83,80,78,66,57,51,39,32,32,35,34,31,22,50,139,159,122,155,135,97,86,38,27,33,36,45,54,59,59,72,44,27,43,33,62,89,64,37,23,19,18,26,27,82,108,64,37,27,30,36,32,30,28,35,34,35,35,37,34,66,178,139,74,209,232,186,128,108,94,78,44,41,43,32,41,61,30,19,22,29,24,27,31,24,29,27,29,27,30,30,34,95,107,57,33,23,30,37,63,91,66,49,137,141,74,87,120,119,179,165,118,138,74,79,68,51,56,28,52,38,23,40,33,31,29,26,25,21,28,30,26,28,29,29,31,28,31,35,33,34,29,32,35,34,33,41,41,38,35,49,131,136,128,116,101,111,101,27,52,229,252,252,250,239,246,243,244,241,241,242,239,241,244,243,241,244,241,240,239,239,240,239,237,238,238,240,241,240,241,243,242,242,243,240,241,244,241,241,240,239,240,239,239,237,238,239,239,238,238,241,242,241,241,242,239,240,241,241,242,241,233,117,3,1,4,8,9,9,10,10,11,12,12,12,195,196,198,197,198,194,195,194,195,198,196,195,195,195,194,196,195,195,194,196,198,196,198,193,195,197,194,196,195,195,196,195,197,197,195,198,195,194,195,195,197,196,196,196,195,196,198,198,197,198,196,194,196,196,196,195,197,196,197,195,195,194,196,198,196,198,194,194,196,194,198,196,194,196,194,196,195,195,195,195,196,195,194,194,193,195,196,192,194,194,195,193,194,196,194,196,194,194,196,193,195,195,196,196,198,195,197,196,196,197,198,198,197,196,199,199,200,200,200,201,202,202,202,206,203,202,202,201,203,203,205,204,202,203,205,204,203,203,204,202,203,202,200,201,203,203,200,202,201,202,205,203,203,202,204,203,202,203,205,205,206,205,204,207,205,204,206,205,206,206,206,205,206,206,205,207,208,208,209,207,206,208,210,211,209,211,210,210,212,209,210,211,210,211,210,211,209,210,213,212,212,209,208,212,212,211,212,211,212,214,211,212,212,212,214,212,212,212,214,212,214,212,213,214,213,214,213,213,213,211,214,213,213,214,213,214,214,213,212,213,213,210,214,212,210,211,211,212,210,209,211,208,212,210,212,210,208,210,210,209,207,206,208,209,206,208,207,207,207,206,208,208,208,203,205,209,210,210,211,196,203,216,197,207,223,208,208,204,215,198,131,166,214,210,208,203,205,203,203,206,205,201,196,202,205,204,206,204,201,203,208,203,206,205,206,208,189,209,191,150,188,210,215,208,151,176,207,210,212,222,205,162,133,120,155,152,162,185,178,184,144,153,210,217,224,212,212,210,218,201,169,212,212,235,216,134,105,80,110,78,78,77,50,164,200,199,181,169,185,178,222,186,202,186,136,165,177,221,223,217,225,241,120,80,125,128,186,160,162,193,207,232,213,223,233,240,178,127,110,125,130,43,148,183,97,148,133,34,10,42,36,107,174,111,141,167,167,195,127,64,27,27,41,56,66,60,53,47,47,196,197,62,43,43,103,107,111,85,26,50,105,129,94,61,60,43,19,12,15,25,48,79,101,111,112,114,103,90,101,99,105,101,93,105,122,100,15,30,136,114,110,116,50,25,17,11,22,22,21,24,24,26,37,64,93,117,118,109,101,85,78,71,65,54,44,50,43,48,39,55,130,149,87,116,118,110,117,45,62,63,38,49,57,66,72,77,38,29,44,33,67,98,71,34,15,15,19,27,24,32,76,73,45,43,46,47,45,48,46,47,45,50,48,48,46,104,177,95,31,50,76,71,55,117,143,127,102,66,45,37,46,57,43,35,27,26,29,28,32,30,30,31,35,29,42,31,63,144,113,53,33,23,27,24,52,93,67,105,185,114,87,110,117,95,133,150,141,162,77,73,77,44,37,26,38,29,26,34,33,28,27,22,23,27,24,30,30,29,39,38,29,34,39,33,31,36,33,37,32,34,38,37,36,39,36,43,119,136,125,116,98,104,111,35,36,217,250,250,250,238,246,242,245,239,240,240,239,241,241,242,241,241,239,237,239,238,236,237,237,237,237,236,238,238,239,243,243,242,242,240,240,244,241,243,240,239,240,239,241,240,242,241,240,239,239,241,241,241,240,241,240,240,241,241,242,243,233,116,3,1,4,8,9,9,11,9,11,12,11,11,193,200,198,196,199,194,196,198,196,197,195,198,195,197,197,195,197,196,195,196,195,195,196,196,194,195,197,197,197,198,199,195,195,196,196,195,196,198,197,196,196,196,196,197,195,195,197,196,195,194,196,196,196,196,197,196,196,195,196,195,194,197,194,194,197,194,196,198,196,196,197,196,197,194,193,196,194,195,197,194,195,194,193,195,193,196,194,193,194,193,195,194,193,193,194,194,196,193,195,195,195,199,195,195,196,197,196,196,198,198,198,198,198,198,198,196,198,200,200,201,200,202,202,203,204,203,202,203,201,203,203,200,203,200,203,204,200,203,200,203,201,201,205,202,203,203,201,204,201,199,205,202,201,200,200,203,203,202,202,201,202,205,203,204,203,202,204,204,204,204,205,205,205,205,205,206,207,207,206,207,207,208,208,207,208,209,211,209,208,208,209,209,208,210,210,209,210,210,209,211,212,212,210,211,210,212,213,211,211,208,212,211,209,212,214,214,214,214,214,212,214,214,214,212,211,212,213,212,213,214,213,213,209,212,211,212,213,212,214,212,212,211,212,214,212,211,209,210,208,211,211,210,210,209,210,210,210,208,210,208,210,207,207,206,208,207,205,208,206,208,208,209,205,200,210,209,208,208,212,197,196,214,169,166,207,205,210,206,208,175,61,73,171,203,209,202,202,203,203,207,203,201,196,193,205,206,204,203,203,202,205,202,201,204,202,208,188,206,190,149,196,207,216,203,153,182,203,209,210,220,202,154,115,113,171,178,185,195,171,198,176,167,206,211,223,216,214,211,221,198,168,211,212,234,199,141,116,64,104,87,97,79,24,120,167,151,168,198,177,208,198,121,198,196,178,197,200,225,215,217,229,233,117,130,164,136,183,141,141,178,194,230,222,227,240,176,105,132,139,148,84,74,214,139,19,148,147,35,54,66,33,62,108,71,62,78,84,82,53,27,43,53,81,93,92,56,73,71,83,190,123,52,62,74,113,105,85,34,54,106,136,108,63,61,46,23,12,12,29,54,88,102,108,115,110,115,96,94,111,110,102,90,84,124,143,102,44,90,162,112,116,116,44,19,12,17,25,19,17,24,28,42,67,104,116,141,131,111,101,91,94,101,103,100,93,78,79,83,85,93,96,118,84,135,105,157,122,50,113,57,52,61,63,77,71,70,36,33,37,28,61,78,57,29,17,15,22,27,23,20,27,39,77,90,84,91,82,87,97,98,89,94,95,92,98,83,56,30,29,24,52,50,41,93,122,125,105,70,38,27,22,37,45,39,41,32,37,39,40,43,39,40,45,46,53,43,85,122,72,35,27,25,22,27,38,67,70,113,175,87,70,120,113,87,83,87,110,127,50,59,83,52,48,43,46,39,39,47,45,31,19,21,23,24,31,37,36,48,53,61,69,63,56,42,32,35,35,28,37,31,33,39,33,38,40,36,110,136,132,125,95,104,113,48,26,199,249,246,250,235,245,240,242,240,237,240,239,239,241,239,238,237,235,237,236,236,234,234,232,232,232,233,236,239,241,244,242,241,242,239,238,240,242,241,242,242,243,242,242,244,243,243,241,242,243,242,242,242,240,239,236,237,237,239,241,241,234,116,3,1,4,9,9,9,10,9,11,12,12,12,196,198,198,195,196,196,197,196,198,197,196,197,196,196,195,198,195,195,198,197,195,193,197,198,195,197,195,197,196,194,197,195,196,198,197,196,194,196,198,196,198,196,195,200,196,198,198,195,196,196,196,198,198,198,198,195,196,196,195,195,195,198,196,196,196,194,196,198,197,195,199,194,195,197,194,196,195,194,196,195,194,194,196,195,195,194,194,196,197,195,195,196,194,198,195,194,195,198,198,195,197,198,198,196,197,194,196,195,194,198,195,196,199,196,199,199,200,201,200,200,204,203,202,203,200,202,202,203,203,199,202,201,199,204,201,200,203,200,202,202,202,200,201,202,203,202,200,202,201,201,203,200,201,200,203,202,200,201,204,202,200,203,201,202,200,203,205,202,206,204,203,203,203,206,205,205,204,203,206,205,208,207,210,210,208,211,205,207,208,207,208,208,210,209,209,208,211,210,210,211,210,208,210,212,210,210,212,213,211,212,213,212,214,212,213,212,212,213,213,210,211,212,213,210,211,212,210,213,214,212,212,212,211,212,212,213,211,212,212,210,211,210,212,210,209,210,210,210,209,209,210,207,209,208,210,210,208,208,207,207,210,210,206,209,208,207,208,208,208,207,206,210,201,201,208,207,208,206,208,199,193,212,160,141,198,204,210,205,206,196,105,100,170,202,213,205,201,203,203,205,205,203,198,194,199,204,203,204,204,201,202,201,203,203,205,207,186,209,188,158,205,207,218,192,145,190,205,208,208,221,196,146,114,99,160,181,178,157,143,196,166,153,204,201,215,211,218,213,221,195,169,214,217,228,156,131,106,75,118,88,115,72,26,103,170,179,205,211,169,193,135,123,215,201,188,179,208,224,213,214,234,222,134,201,201,160,161,128,145,144,163,226,231,239,198,74,84,142,180,152,60,141,235,96,57,196,127,55,77,66,62,48,59,56,63,66,42,51,44,53,50,39,44,40,44,39,29,38,30,56,46,59,90,85,112,78,43,48,94,137,112,70,62,50,23,11,15,26,49,91,110,112,115,111,112,110,97,103,115,102,93,79,112,179,163,125,108,130,164,101,104,105,35,25,12,14,24,18,22,27,41,61,103,139,125,117,109,106,98,92,111,126,139,137,119,93,97,120,120,103,93,95,105,118,98,151,78,24,73,45,62,78,92,102,66,50,37,27,33,20,30,38,29,23,18,19,19,23,22,26,24,26,77,95,90,95,69,111,149,124,101,136,139,121,100,46,21,21,29,34,67,56,71,142,141,141,116,56,31,27,25,17,26,51,56,56,63,59,51,61,69,73,78,78,84,69,69,58,28,27,23,21,29,28,27,40,32,75,118,59,56,79,100,85,70,74,87,98,73,86,117,81,55,71,68,55,56,62,42,30,25,27,29,30,44,48,59,63,71,93,102,93,76,60,46,31,28,33,35,32,34,40,34,33,39,25,100,143,134,127,93,97,106,48,23,189,249,244,250,236,243,235,239,239,237,238,237,238,237,236,237,236,234,235,235,232,232,234,233,234,236,236,238,240,244,244,242,241,239,239,243,244,243,244,243,243,244,244,244,243,243,242,242,244,244,243,242,241,240,239,237,236,237,239,241,241,232,116,3,1,4,8,10,9,10,9,11,12,12,12,194,199,196,196,200,195,195,197,195,197,194,194,194,194,196,197,197,196,196,196,196,194,195,196,194,196,196,196,194,196,198,195,197,196,194,195,195,198,197,195,199,196,198,196,194,196,195,196,198,197,196,195,198,198,197,196,196,198,198,197,193,196,197,197,197,198,197,196,197,196,198,197,196,197,197,197,196,195,197,194,195,196,193,193,194,196,192,193,196,195,197,195,194,196,195,196,199,194,197,197,196,199,197,196,196,198,196,197,195,196,196,196,198,199,201,197,199,200,200,202,200,200,198,201,202,202,200,200,200,200,203,199,202,203,202,203,200,203,200,202,203,200,200,199,200,199,199,200,201,204,202,202,203,202,203,203,203,200,200,202,202,203,202,202,203,202,202,204,202,202,203,204,205,204,205,205,203,205,206,205,205,205,207,207,206,207,206,206,206,207,206,207,208,208,209,207,211,210,209,211,207,209,211,210,210,209,210,209,212,209,211,211,209,210,210,209,209,211,210,210,213,210,210,213,212,213,210,210,212,210,212,213,210,211,209,211,212,209,210,211,212,209,211,207,208,210,207,211,209,209,209,208,206,206,208,209,208,207,208,206,205,206,206,204,206,205,203,207,205,205,206,205,200,202,207,205,206,204,207,200,189,211,188,165,201,205,208,202,205,213,181,189,197,199,212,203,205,201,199,203,199,201,200,193,193,199,202,201,203,202,202,204,200,200,198,210,189,203,182,151,208,207,217,177,147,195,200,211,207,223,192,149,116,72,122,148,160,159,150,201,172,156,197,191,205,204,207,210,224,194,169,212,219,210,130,134,103,75,119,94,116,73,44,158,216,205,243,171,131,207,150,156,214,178,160,172,217,224,215,211,241,203,153,242,221,173,146,148,158,125,162,219,235,245,137,31,107,153,164,150,145,191,167,72,88,160,86,58,54,38,56,44,48,59,69,79,73,61,53,44,29,21,17,23,21,18,15,16,15,35,54,85,97,83,82,47,43,84,125,108,72,63,47,27,12,13,21,46,89,104,109,111,107,110,99,100,100,103,105,89,85,115,185,221,170,145,132,146,160,95,110,92,23,19,12,17,19,21,32,26,56,107,125,132,115,92,94,96,90,76,87,92,91,95,76,55,61,79,88,77,68,82,92,83,44,76,43,35,67,37,59,82,83,78,53,48,37,23,30,26,18,16,18,20,16,17,24,27,26,29,39,62,74,64,47,39,28,21,34,32,24,25,36,31,33,32,21,30,24,48,76,46,121,195,169,131,86,43,27,40,35,26,14,33,40,34,51,33,38,55,49,52,78,87,116,124,81,60,31,30,30,21,33,27,27,26,22,73,91,54,37,48,94,83,80,95,79,104,101,114,137,55,42,78,74,55,47,49,46,43,43,42,44,53,54,63,70,71,88,106,107,98,90,71,63,45,32,34,28,38,36,29,35,36,39,29,87,137,134,128,94,93,108,59,19,171,246,242,249,232,238,237,236,237,234,237,233,233,235,233,233,232,231,231,233,235,233,235,238,240,240,240,243,244,242,243,241,241,240,241,241,242,242,241,241,244,243,243,243,241,241,241,243,241,240,241,239,241,236,235,234,234,235,237,236,237,231,117,3,1,5,8,9,9,10,10,11,12,12,12,198,198,198,196,197,199,196,196,196,196,197,197,195,195,197,196,195,194,195,194,196,196,198,194,195,196,194,198,198,197,196,195,196,194,198,198,195,198,195,195,196,196,195,198,197,196,198,197,197,196,196,196,194,196,198,198,196,197,198,196,196,198,194,195,196,194,198,194,194,197,198,194,197,199,196,196,196,197,197,198,195,195,196,195,198,195,194,196,194,194,195,195,195,196,197,198,196,197,198,198,199,198,199,198,198,197,197,197,196,198,198,198,200,197,199,200,201,200,200,201,201,200,199,199,202,201,201,201,200,200,201,201,200,202,201,203,203,200,201,200,200,200,202,201,199,201,201,202,203,199,201,202,203,204,202,202,203,202,200,202,199,201,202,203,202,201,202,203,204,200,203,204,201,202,203,201,202,205,206,205,205,203,206,204,202,204,204,207,206,208,208,207,207,207,210,209,209,209,208,210,209,211,211,211,210,207,209,208,209,210,212,209,208,209,208,210,208,211,211,208,210,210,208,208,210,211,211,212,211,210,211,212,212,212,210,208,210,210,209,211,211,208,212,210,207,207,208,207,208,208,209,208,209,208,207,206,206,207,207,205,206,205,205,207,205,205,206,206,206,205,208,204,197,204,206,206,206,205,208,203,191,211,203,190,210,207,206,198,205,214,201,221,209,197,208,201,203,200,198,200,199,203,202,197,192,197,203,203,203,202,205,202,201,201,199,206,188,202,167,156,211,205,217,168,147,194,202,212,206,221,188,143,101,74,140,178,183,168,169,211,169,137,186,200,205,195,201,200,214,192,173,215,217,198,134,162,109,79,108,82,99,44,66,198,231,218,211,138,177,218,171,172,145,141,167,200,232,220,217,212,245,177,167,250,228,188,132,163,184,152,157,189,225,236,107,34,138,135,155,182,225,187,84,65,67,89,42,29,27,14,35,23,27,36,40,45,43,37,27,21,22,15,19,17,16,18,16,19,18,54,74,97,106,80,50,42,83,106,105,67,62,56,21,13,11,29,46,81,113,112,112,107,107,98,96,107,101,100,87,75,119,184,235,197,120,128,131,142,145,97,115,83,15,20,17,16,22,24,34,41,110,182,152,115,105,88,78,86,94,70,48,45,48,51,48,49,46,55,69,70,72,71,76,62,41,44,35,39,48,35,46,70,58,56,52,47,49,41,38,27,17,15,17,20,26,26,24,36,45,63,87,85,77,64,67,66,35,28,26,23,29,27,26,25,35,27,30,42,40,57,66,71,148,189,146,103,62,52,46,37,35,34,29,25,22,23,24,17,17,24,21,19,31,68,103,116,112,74,56,54,43,37,36,31,27,21,48,105,87,53,29,63,116,85,89,81,108,157,92,54,61,44,41,51,51,36,37,47,50,56,56,56,60,60,76,87,99,107,108,112,99,97,95,88,76,61,43,37,31,31,34,26,35,32,40,28,79,140,135,131,98,94,102,59,15,156,245,241,249,230,233,234,232,234,231,235,232,232,231,230,232,231,231,230,231,232,230,238,244,246,245,243,244,244,244,242,241,241,241,242,240,242,239,240,241,241,242,241,242,240,242,242,238,239,240,240,241,239,237,235,232,233,234,234,237,237,229,117,3,1,5,9,9,9,11,10,11,12,11,11,196,198,196,198,199,197,197,198,196,196,197,201,199,196,195,195,197,195,195,195,196,197,195,194,196,197,196,195,192,195,198,195,196,195,195,198,198,198,196,193,198,194,198,198,198,199,194,195,194,192,194,195,196,196,197,195,194,196,195,195,196,195,192,193,194,195,195,196,196,193,196,197,195,196,198,197,196,196,196,195,195,198,198,196,196,195,195,195,196,195,196,195,195,198,196,197,200,198,199,198,200,199,199,200,199,200,195,198,196,200,201,198,198,196,198,198,200,199,199,199,199,200,200,201,199,199,200,200,201,202,201,199,202,200,198,201,200,199,199,199,200,200,203,200,200,200,200,202,200,202,199,201,201,201,202,199,200,201,201,201,199,200,200,201,202,204,204,203,200,201,202,203,202,203,203,203,204,204,205,202,207,206,205,207,205,205,205,208,206,207,208,205,207,206,208,207,208,207,206,211,206,208,211,210,210,208,210,210,211,207,211,212,208,209,208,210,210,210,212,211,211,209,209,208,210,211,208,210,211,210,210,209,209,211,210,209,211,210,208,209,209,208,209,208,210,210,206,207,205,206,207,206,207,207,204,207,207,205,206,205,206,203,205,207,204,206,205,206,206,203,206,202,199,204,207,206,205,202,206,207,191,201,213,189,198,209,206,199,200,208,195,207,203,194,205,201,205,204,202,204,201,201,203,203,198,195,203,203,205,205,203,204,203,205,201,208,194,199,169,170,216,206,210,158,156,200,201,212,209,217,174,137,100,104,187,207,178,159,179,214,165,128,186,197,206,205,193,195,210,186,175,219,208,196,155,176,131,81,95,97,81,30,70,169,182,194,207,158,203,225,174,149,153,167,181,221,229,223,220,224,243,136,168,249,233,198,141,169,193,150,124,181,216,216,98,76,146,138,162,202,249,151,49,59,72,57,20,23,14,11,19,20,21,19,24,20,21,18,20,17,15,18,17,17,18,18,18,20,37,73,86,101,94,50,60,77,105,99,63,57,55,29,12,11,23,44,76,106,120,107,106,111,96,98,106,102,96,89,83,110,167,206,197,133,89,106,99,116,127,98,114,61,19,24,12,24,24,31,29,61,163,187,131,107,75,54,50,42,51,43,32,35,34,38,47,40,31,44,63,76,77,72,64,46,41,40,34,36,41,43,55,74,59,61,57,46,60,62,60,39,23,18,16,19,36,64,69,70,76,122,128,98,74,75,78,58,40,30,28,32,36,41,46,57,69,75,88,106,124,139,161,149,133,141,125,122,118,92,93,97,85,78,67,64,51,43,37,29,27,19,23,19,44,76,88,104,92,99,92,73,71,59,57,44,41,25,71,119,87,63,25,78,127,84,103,89,129,136,61,44,53,45,31,30,35,44,43,50,53,55,60,57,77,90,84,90,86,87,100,96,85,79,84,84,78,71,55,43,33,30,35,31,33,33,39,29,69,143,139,130,104,89,104,71,16,152,245,244,249,234,235,229,231,236,229,233,232,230,232,232,232,232,230,229,229,229,229,238,244,245,245,245,244,242,243,242,239,240,239,241,240,240,241,239,240,241,243,241,239,240,240,240,241,239,242,241,240,242,236,235,233,234,233,234,236,234,229,118,3,1,4,8,10,9,10,10,11,12,11,11,197,199,198,199,199,197,198,198,199,198,200,200,197,197,197,198,196,195,197,194,194,194,197,193,194,196,193,197,195,195,196,195,195,194,196,194,194,195,195,196,194,196,196,195,193,194,195,193,195,196,196,191,193,194,193,193,195,194,193,193,193,193,193,194,195,192,195,195,193,196,195,195,196,195,197,195,196,197,194,196,196,195,196,197,197,197,194,195,195,196,197,195,196,196,198,198,196,198,199,197,198,196,196,198,197,198,199,199,196,199,200,199,200,198,198,199,199,199,198,196,198,199,199,200,200,201,200,203,201,201,200,199,200,198,199,201,200,199,201,201,199,199,202,204,202,199,199,200,203,201,204,201,200,199,200,200,200,201,200,203,202,203,200,202,202,203,202,199,204,201,203,203,203,204,202,201,202,203,201,202,202,202,204,203,206,205,206,206,204,204,204,208,207,207,208,207,207,205,207,208,207,208,208,208,205,206,211,210,209,208,210,209,208,209,207,210,208,210,210,208,209,211,212,210,209,210,210,208,210,211,208,208,209,211,210,209,210,208,208,207,209,207,210,208,208,208,207,207,205,205,207,205,206,208,204,205,206,205,204,205,207,203,206,204,202,201,201,206,204,203,206,199,199,206,204,205,204,202,205,206,190,195,208,194,194,205,203,196,204,205,193,201,200,192,201,203,203,204,202,206,202,203,203,202,202,196,203,206,205,205,202,203,204,204,203,210,198,195,160,176,218,212,208,156,174,206,203,215,209,210,165,133,105,116,177,167,144,164,196,223,172,125,184,202,207,205,202,201,204,181,177,215,196,199,165,157,103,83,118,107,84,57,71,99,152,222,202,174,196,175,162,157,187,194,201,230,220,226,221,237,227,98,165,244,235,215,160,173,166,131,130,188,219,213,165,134,147,165,182,213,235,95,35,66,47,35,16,21,17,15,22,20,20,18,17,18,17,17,22,20,14,13,20,17,14,15,18,25,59,87,81,82,55,47,81,101,102,66,59,53,24,14,19,29,46,81,100,113,116,108,107,100,94,103,105,103,94,89,128,183,184,143,133,99,81,88,67,98,115,109,122,57,17,22,10,24,33,35,30,71,157,159,119,83,57,44,37,33,40,37,43,45,39,51,45,30,24,34,51,60,61,54,53,44,46,41,35,38,33,45,50,64,61,57,62,55,59,69,67,63,51,35,27,34,72,91,78,77,94,98,86,57,36,35,34,30,29,35,42,68,89,127,158,185,212,203,205,196,185,184,175,151,93,55,74,79,89,113,119,139,133,117,127,131,132,127,97,82,67,53,45,28,34,44,47,70,63,71,71,48,63,76,81,78,62,35,86,117,81,52,51,119,111,90,108,64,102,109,53,52,56,46,39,39,46,56,51,52,57,64,65,66,101,98,63,53,50,61,62,62,61,59,61,69,70,66,62,48,34,30,35,32,36,29,38,24,60,144,145,136,108,89,110,90,45,165,241,241,250,240,239,234,233,234,231,233,230,231,232,232,232,229,231,231,232,233,235,244,245,244,245,244,244,241,241,240,240,240,241,241,239,241,240,241,241,241,241,240,240,240,239,240,239,240,240,240,240,240,239,236,234,234,235,234,236,233,225,117,3,1,4,8,10,9,10,10,11,12,12,12,199,201,201,196,196,198,199,200,198,199,196,196,196,196,198,198,198,196,194,198,198,196,195,194,196,196,195,196,195,196,196,195,194,195,197,194,195,194,193,194,196,194,193,193,193,193,193,191,194,194,193,191,193,194,194,193,193,194,194,193,191,192,193,195,191,194,194,193,194,193,196,194,196,196,194,194,195,196,195,195,196,196,196,194,196,196,194,193,196,195,197,198,199,196,196,199,198,195,194,196,193,194,199,197,196,196,198,198,195,198,196,197,200,200,199,198,200,198,198,199,198,199,200,200,201,199,200,200,200,200,200,200,202,202,200,200,199,201,203,201,201,201,201,201,202,203,201,201,202,202,201,201,200,201,200,199,201,203,201,201,202,202,202,199,201,200,202,203,202,201,203,203,199,202,203,201,203,204,202,203,204,202,203,202,203,204,206,206,202,205,208,206,209,208,205,206,207,208,208,208,208,210,206,207,208,207,208,205,208,206,208,207,206,210,208,207,207,208,208,208,208,207,208,207,209,210,208,210,209,208,208,209,209,210,209,208,208,208,208,206,208,207,208,206,206,206,206,207,204,206,206,208,207,205,204,204,203,205,204,203,204,200,203,204,202,204,202,201,203,202,202,198,200,205,202,206,205,204,203,206,195,188,208,199,193,203,198,199,205,205,193,202,205,192,198,206,207,201,201,204,205,203,202,206,203,194,197,201,203,204,203,203,204,202,201,210,202,184,153,174,211,214,199,165,183,203,205,212,208,197,148,114,93,82,143,161,157,185,207,224,180,130,171,196,203,212,203,200,211,180,174,201,176,207,137,83,81,86,133,112,73,65,61,130,214,234,208,170,129,164,165,146,181,163,203,230,214,229,221,247,198,81,183,236,235,225,185,164,124,151,158,188,212,216,204,160,184,213,196,231,191,66,62,55,24,18,22,22,15,22,18,19,21,20,17,16,19,15,17,18,18,18,15,19,19,19,16,41,85,84,66,52,46,71,97,98,68,60,53,27,16,77,83,68,87,115,114,120,117,111,99,98,98,100,109,93,93,107,160,202,127,72,83,89,94,98,72,91,113,122,122,41,14,19,14,38,35,35,33,93,171,133,89,67,48,42,42,42,49,50,42,48,49,37,31,22,23,33,39,57,51,48,44,44,46,33,34,35,34,39,45,53,48,46,55,60,55,63,68,73,72,59,64,86,103,81,63,52,52,63,59,45,24,18,17,27,48,72,129,191,217,214,200,191,165,142,135,87,73,58,87,107,54,38,34,33,62,39,77,88,46,50,53,56,84,128,150,164,160,147,115,75,63,44,32,42,29,30,27,26,41,44,59,69,71,68,103,93,77,91,97,136,112,93,88,45,102,101,41,46,42,44,46,45,55,55,55,61,67,69,59,50,73,75,46,47,45,46,51,51,54,53,51,52,54,62,58,51,44,33,32,30,35,29,36,28,51,144,147,133,106,89,113,98,58,143,210,229,250,250,244,239,236,232,227,233,231,229,231,229,230,229,229,236,239,242,244,245,245,244,242,241,240,240,242,239,239,239,237,239,238,239,241,240,240,238,240,240,238,239,239,239,239,236,239,239,237,235,230,230,228,230,233,233,234,233,229,118,3,1,5,9,9,9,12,10,11,12,12,12,200,204,201,199,199,200,200,198,198,199,197,199,196,198,198,196,198,197,198,199,199,199,198,195,195,198,198,197,196,195,199,198,195,194,196,197,198,194,194,198,195,193,193,195,194,196,196,194,196,194,195,192,193,194,194,195,193,193,194,193,193,194,195,193,194,193,190,194,193,193,196,194,195,194,196,196,197,198,197,196,197,198,195,196,195,195,197,197,197,195,198,198,197,197,194,196,197,196,197,196,198,197,196,199,198,195,198,198,197,199,200,198,198,198,198,199,197,199,199,201,203,199,199,201,198,201,201,199,200,202,201,200,201,201,202,201,202,201,200,202,202,202,202,200,203,201,201,201,201,201,202,200,202,202,203,203,202,202,202,201,200,203,200,202,202,201,201,203,205,202,204,202,202,202,203,203,203,206,204,203,204,207,208,207,203,206,207,206,208,205,208,209,208,209,208,209,208,208,209,209,207,209,208,208,208,209,208,204,208,207,208,208,208,209,206,208,210,208,208,208,209,209,210,208,208,208,208,207,209,209,209,210,206,207,206,207,207,205,208,210,207,205,208,206,207,207,205,205,205,205,206,205,204,205,205,204,203,204,203,202,203,200,200,198,203,206,202,204,203,205,202,194,203,203,203,203,205,206,203,207,200,190,206,203,194,199,199,201,207,206,193,200,209,197,195,202,207,206,204,205,202,206,204,202,208,199,195,201,207,208,204,203,201,203,202,207,204,173,152,178,208,214,188,155,183,200,205,214,196,190,128,93,88,89,169,175,182,200,207,218,170,123,163,201,203,200,203,205,211,184,183,182,165,196,132,110,75,71,114,86,77,71,71,169,250,220,164,158,146,203,170,127,150,156,214,222,214,225,223,248,170,102,207,234,238,235,200,165,139,169,169,182,178,186,181,167,213,221,212,225,180,87,67,35,8,19,17,22,22,19,20,19,19,19,19,20,16,14,16,17,16,15,16,15,15,23,31,61,88,71,47,50,69,88,88,65,61,55,27,15,74,154,173,138,106,108,110,120,110,96,95,103,98,101,98,98,103,100,117,118,68,54,88,87,103,107,81,98,107,127,109,29,19,19,25,41,29,28,26,56,92,68,54,47,39,38,43,46,47,54,55,48,39,30,21,22,27,36,37,44,48,39,39,38,32,35,35,27,31,37,45,53,52,53,53,53,51,54,59,61,66,72,80,90,80,52,42,34,31,30,31,20,17,37,59,103,160,190,187,169,148,111,77,54,50,49,57,61,56,57,68,74,62,56,58,46,49,58,48,49,52,45,44,46,46,49,55,63,86,122,141,155,142,99,71,53,35,26,14,19,27,23,32,44,56,76,82,83,93,105,118,122,95,80,59,48,81,57,43,49,40,37,43,53,56,56,57,69,66,61,53,43,45,39,44,51,47,48,47,40,46,46,46,50,53,59,63,61,56,47,36,29,39,30,36,34,49,136,151,128,107,87,110,100,70,97,133,190,240,244,250,250,242,233,231,231,230,229,229,229,229,227,233,239,245,245,245,247,247,245,243,243,242,241,243,242,240,239,237,239,239,239,239,240,240,240,239,237,237,239,237,238,236,237,238,236,236,234,231,228,229,230,232,232,236,236,229,118,3,1,5,9,9,9,10,10,11,12,13,12,198,199,198,199,197,200,198,198,200,196,198,200,197,198,198,196,196,196,195,200,199,196,196,194,197,195,193,198,196,199,201,196,195,194,196,196,194,194,195,194,193,193,194,193,193,196,197,195,194,194,194,192,194,194,192,193,193,191,190,193,194,193,193,194,191,194,194,193,193,192,196,193,194,198,194,196,196,195,196,196,197,194,197,196,196,198,198,197,196,197,198,196,195,196,195,196,196,194,195,193,196,199,197,196,196,197,196,197,197,197,198,197,198,200,198,198,199,199,198,200,199,200,201,199,202,200,200,203,200,200,200,199,202,201,200,203,204,202,202,202,200,200,201,202,200,198,200,202,200,199,202,201,201,204,204,203,202,204,201,204,203,201,203,200,203,203,202,202,203,203,204,203,203,207,205,202,205,206,203,206,205,205,206,206,208,204,205,206,207,206,206,205,206,208,208,211,208,207,209,210,208,208,207,209,209,208,210,208,209,207,208,208,207,208,209,209,208,209,209,207,207,209,208,206,209,209,207,207,208,207,208,205,207,207,207,207,208,207,206,205,206,208,206,205,207,207,207,205,204,206,203,205,202,203,205,201,203,203,200,201,204,202,201,203,203,202,203,202,203,203,198,199,204,203,202,202,201,200,201,206,201,187,203,207,194,195,200,206,205,208,197,197,208,202,193,195,205,205,204,203,202,202,203,205,205,203,198,195,203,205,202,203,204,204,200,207,208,160,153,184,207,216,176,154,188,201,208,210,196,189,130,107,93,79,153,167,195,206,208,211,160,130,167,212,213,207,198,198,213,185,176,175,184,209,155,172,123,81,96,70,87,83,58,177,244,153,173,169,179,245,148,144,173,185,235,214,216,218,227,242,147,129,212,227,238,233,218,190,173,178,172,182,157,178,159,162,203,204,202,199,153,92,55,7,9,16,11,22,24,19,18,19,20,18,18,17,16,18,18,21,14,16,18,17,16,23,53,72,83,56,41,77,96,92,64,62,51,33,7,44,145,194,215,139,74,97,106,116,105,92,101,104,97,92,95,103,107,98,85,81,91,130,127,90,101,101,83,112,113,131,100,26,22,19,24,27,15,19,21,33,53,50,46,38,31,36,41,50,56,62,57,36,33,23,36,55,43,48,46,46,48,46,31,35,35,29,33,29,27,37,35,43,50,54,57,49,51,50,50,54,55,63,67,59,43,33,34,23,16,17,24,46,78,149,203,205,170,126,80,55,49,51,52,51,56,53,52,53,50,42,36,49,76,79,69,65,61,67,63,62,65,55,47,39,37,44,53,61,61,69,66,78,113,127,134,104,70,59,37,26,23,14,23,27,35,53,65,66,71,83,78,73,64,57,55,48,51,44,43,50,27,29,42,55,57,59,69,64,52,55,48,48,39,31,40,40,45,50,49,51,48,46,44,52,55,57,63,64,62,49,37,37,34,35,35,36,36,116,156,129,103,84,107,93,86,114,90,98,173,224,243,244,233,229,230,230,231,228,227,227,227,228,234,243,245,244,245,245,244,244,244,244,244,241,239,239,239,238,237,240,238,238,240,238,237,238,238,237,237,237,237,238,236,234,233,232,233,233,232,231,229,231,232,234,236,237,230,117,3,1,4,8,10,9,12,10,11,12,13,13,194,199,198,200,200,198,200,196,197,197,198,200,196,198,198,198,199,196,198,196,195,196,196,198,195,196,198,196,198,197,198,195,195,194,192,193,195,195,196,193,191,193,193,193,194,193,191,196,194,192,195,194,195,195,191,193,194,192,193,192,191,193,193,194,195,194,193,195,194,196,198,197,196,195,196,196,195,196,196,196,198,198,198,198,194,196,198,198,196,193,195,194,196,198,197,196,195,196,194,196,198,196,198,196,197,198,195,198,196,195,198,198,196,198,198,198,198,200,198,202,204,200,203,203,202,203,200,201,204,204,201,202,203,200,201,201,205,203,202,205,201,201,201,201,203,202,203,203,203,202,203,201,203,204,202,203,203,203,205,205,203,204,202,201,203,203,201,205,203,203,203,203,207,204,205,205,207,206,205,205,205,206,206,206,205,206,207,205,208,208,207,207,208,207,207,209,207,208,207,208,208,208,210,208,207,208,211,210,211,208,212,208,206,207,207,209,208,208,208,208,207,207,209,208,208,208,208,208,206,207,207,206,207,207,208,205,207,207,206,206,205,205,206,206,205,206,204,204,206,205,207,203,201,201,204,204,201,203,200,201,204,200,202,202,200,199,198,202,200,201,198,198,203,200,202,200,202,202,201,202,202,187,198,208,192,192,203,203,205,206,194,196,205,203,195,191,202,205,204,202,200,205,201,201,206,202,200,196,199,207,202,202,202,201,202,204,208,155,160,191,206,213,172,166,191,198,213,212,192,199,142,115,81,74,143,160,206,198,212,206,164,154,165,212,212,214,213,199,204,182,172,167,213,215,180,187,124,133,131,95,93,76,51,142,215,178,210,188,200,217,155,191,198,214,235,212,219,212,233,214,139,159,211,225,232,230,225,200,194,182,166,175,169,189,171,178,213,209,180,135,93,46,23,8,9,14,12,21,25,23,17,19,19,16,19,17,18,16,13,17,18,17,19,21,25,46,60,75,59,46,74,103,101,69,60,55,25,15,15,74,134,141,162,87,50,99,98,107,101,100,111,104,93,99,101,101,109,91,84,136,171,154,122,96,107,100,86,110,125,137,83,17,23,19,14,20,17,22,12,19,38,40,44,40,44,65,73,70,58,45,35,30,26,42,79,81,65,63,57,53,45,39,32,31,30,28,33,31,25,27,27,36,36,46,54,41,45,42,41,41,47,53,46,40,32,18,18,17,29,55,98,172,218,204,157,104,57,49,58,51,53,42,40,34,27,29,21,24,21,17,20,46,70,55,57,56,57,59,50,54,51,50,33,20,27,25,35,43,49,67,70,77,74,74,83,102,114,101,84,67,47,32,18,16,19,32,48,53,57,58,55,51,54,50,45,49,46,36,33,27,23,35,54,55,54,58,54,54,44,49,48,37,36,28,36,45,39,48,51,57,55,48,46,48,46,47,51,51,56,49,44,45,39,36,36,38,28,99,155,130,107,92,97,89,102,108,92,72,78,117,134,182,217,226,230,224,229,227,225,229,228,232,240,244,243,239,238,242,244,244,244,244,243,241,239,238,237,236,237,239,238,238,240,239,237,238,238,235,236,236,235,235,233,233,231,228,229,230,230,232,233,232,233,234,236,235,229,117,4,1,4,9,10,9,11,10,12,12,13,13,196,200,199,198,199,200,197,196,199,197,198,198,194,198,198,196,198,197,196,196,195,196,197,196,197,197,198,196,194,194,194,194,193,193,194,193,192,193,195,193,194,193,193,194,194,192,193,193,193,195,195,193,194,193,194,193,194,195,191,192,193,194,192,194,192,195,196,195,195,194,198,194,197,198,195,195,196,197,194,199,196,195,197,196,196,194,197,195,194,195,193,193,194,196,196,194,194,195,196,194,198,195,195,199,198,198,198,200,198,198,198,195,196,197,199,198,200,198,200,202,201,203,203,202,200,203,203,203,201,202,203,203,203,203,205,201,200,201,201,202,202,204,202,203,205,204,203,203,203,203,205,201,201,202,202,200,201,203,202,202,204,203,201,201,203,201,203,203,203,206,204,205,203,204,206,204,203,206,206,207,204,206,205,207,209,205,206,205,206,207,207,208,206,208,206,207,208,208,207,208,208,209,211,208,208,208,208,209,209,210,210,208,209,208,209,210,207,208,208,208,208,209,208,206,206,206,205,208,207,206,207,204,207,208,203,205,207,206,206,204,203,206,204,205,205,205,206,201,204,203,204,206,200,203,204,200,201,200,201,200,198,201,199,201,200,201,201,197,201,200,192,198,201,199,200,200,201,199,199,203,203,188,192,206,193,191,200,204,199,203,193,192,208,204,198,190,197,203,201,203,201,201,202,203,202,202,203,199,196,200,201,203,200,203,200,206,210,153,174,190,201,208,170,168,185,202,213,194,185,175,119,119,83,91,159,181,216,190,217,196,165,157,139,201,209,212,212,207,218,185,163,179,229,198,143,149,106,139,149,127,121,83,29,135,208,201,230,151,205,206,152,193,188,221,226,212,218,208,239,188,139,181,208,226,225,226,228,205,200,175,145,154,165,196,191,208,207,229,188,79,44,16,8,12,12,12,13,18,25,27,25,17,19,17,15,20,15,18,13,14,17,15,14,24,36,49,80,60,51,71,91,98,68,60,54,29,14,14,46,110,63,45,113,69,63,119,113,115,119,116,117,99,97,105,100,101,90,71,101,174,179,131,106,94,110,98,79,118,124,127,70,10,24,15,19,25,19,19,20,19,30,37,56,85,83,79,68,52,35,29,33,48,71,95,126,110,75,82,66,37,31,27,32,29,30,29,27,30,25,25,21,23,31,35,42,35,29,28,29,29,31,29,31,27,23,20,27,54,113,194,227,200,142,77,44,49,57,50,42,36,24,21,16,17,17,13,16,17,15,14,17,40,59,50,67,84,84,78,59,46,42,32,18,13,17,16,17,25,30,34,43,55,68,80,83,73,68,83,102,111,92,68,47,27,17,18,30,37,41,41,35,44,41,40,51,41,32,29,24,24,24,48,47,42,57,42,54,44,40,66,43,31,45,56,56,50,48,43,39,47,54,54,46,43,43,43,48,46,49,47,44,47,43,47,39,40,28,86,153,135,120,99,92,79,101,122,106,90,69,56,87,178,227,230,228,223,226,224,229,230,232,238,243,243,234,234,238,239,239,242,238,237,237,236,236,235,236,236,237,239,238,237,236,235,235,234,234,234,235,233,232,234,232,232,230,228,226,227,231,230,231,232,232,233,233,232,227,118,4,1,5,9,9,9,12,10,12,12,12,11,195,203,198,197,198,196,201,198,198,195,195,198,196,199,196,193,196,195,198,196,194,196,195,195,196,197,198,198,198,196,196,197,193,195,195,193,193,191,193,192,192,191,193,193,194,192,193,195,192,193,194,193,194,193,192,192,190,191,193,194,193,195,195,192,192,191,194,195,193,196,191,193,194,193,196,194,196,195,193,193,193,195,195,197,194,193,193,194,195,194,195,194,194,195,194,194,193,195,194,198,199,196,198,198,200,199,199,199,198,201,199,199,199,197,200,201,200,202,200,201,203,200,203,203,202,204,203,203,202,199,202,204,204,202,201,202,202,204,202,202,201,202,202,203,202,203,202,200,205,202,202,200,203,203,199,202,203,200,201,203,203,203,202,203,203,202,203,205,204,203,203,206,205,204,206,203,206,206,208,210,205,208,207,207,207,208,208,205,207,206,205,206,207,208,207,210,209,211,209,209,209,208,209,209,210,209,210,209,210,207,208,212,208,209,208,207,209,208,206,208,206,208,208,204,207,206,206,207,206,208,206,207,208,204,206,204,204,205,205,204,203,205,202,204,202,203,203,201,202,199,204,201,201,203,200,203,201,201,200,198,201,198,200,200,199,202,201,200,200,199,196,199,201,199,200,201,200,201,201,199,206,191,191,203,191,190,199,202,204,203,192,193,201,201,201,191,191,201,202,203,201,200,200,198,202,201,204,199,191,200,202,204,199,201,202,208,206,155,185,194,198,202,157,171,194,203,208,181,149,137,128,141,101,87,155,207,225,191,221,188,162,155,122,195,206,207,212,203,223,188,165,189,214,135,107,129,96,123,132,140,140,107,51,131,201,208,167,139,230,195,127,145,182,224,222,217,214,214,237,160,159,188,210,237,217,225,230,213,204,172,134,128,164,197,191,141,131,173,128,59,22,7,8,12,20,17,17,19,22,24,24,27,17,19,15,14,18,15,16,18,15,19,19,29,46,69,63,51,85,99,90,68,60,56,30,17,16,20,96,110,39,59,109,104,126,151,149,145,117,112,117,108,110,107,101,93,101,125,137,152,137,108,107,101,109,89,81,115,128,120,50,14,17,15,22,22,23,33,39,34,25,37,94,95,76,69,53,51,43,52,73,96,110,125,132,94,52,44,39,24,26,29,26,31,27,26,24,18,17,16,17,17,18,27,32,30,29,34,38,44,49,48,56,61,61,65,124,203,224,200,135,71,46,53,53,45,31,19,21,12,19,18,17,17,14,18,13,16,14,20,15,37,53,61,93,102,112,75,58,45,31,22,10,16,13,16,17,15,18,17,23,32,35,45,59,69,77,78,67,73,103,110,88,67,41,27,30,39,34,30,31,29,35,38,41,33,28,21,17,22,24,30,31,43,46,53,49,40,73,62,38,72,96,94,87,69,57,50,37,36,35,49,51,44,45,37,39,36,41,38,26,31,37,50,45,47,33,73,162,148,123,109,90,79,98,116,118,101,112,190,217,243,243,227,232,227,229,227,230,237,235,240,242,236,236,237,240,243,241,239,236,235,235,234,234,236,237,237,236,235,236,235,234,234,233,232,232,232,231,230,233,233,233,231,226,224,223,224,225,228,228,230,230,229,229,229,228,117,3,1,6,9,9,10,12,10,11,12,11,12,196,199,198,198,197,196,198,198,198,195,198,198,197,195,196,198,198,195,195,197,196,197,198,196,198,196,196,196,197,196,194,195,192,192,193,191,193,191,193,190,190,195,190,191,192,191,191,192,193,189,192,191,191,194,189,190,191,189,192,193,191,192,193,197,193,191,191,193,193,194,191,190,193,190,191,192,193,193,192,192,191,193,195,193,194,193,194,196,194,197,195,197,194,194,195,193,194,196,198,195,198,199,199,199,198,198,198,199,197,199,199,198,199,199,200,199,200,200,201,200,202,201,200,203,200,202,202,201,203,202,202,204,202,202,202,204,203,202,204,203,201,202,203,203,201,203,203,200,203,201,202,203,202,202,202,204,202,203,201,201,206,203,203,204,202,204,206,202,202,204,202,205,205,204,205,205,207,207,205,204,205,206,206,208,208,207,208,207,210,209,207,207,208,208,210,212,210,210,210,211,208,208,210,210,211,212,212,211,212,211,209,209,210,209,210,211,208,208,208,205,207,205,207,207,207,207,206,208,204,208,209,205,205,205,204,203,203,202,203,202,203,206,203,204,202,200,203,201,203,202,201,200,200,204,201,199,202,200,199,200,198,199,200,200,198,197,199,200,200,196,196,201,200,199,200,199,200,200,201,201,203,194,187,197,194,192,196,201,201,203,196,188,201,203,203,197,189,199,202,199,200,200,200,201,201,202,200,200,196,194,198,203,199,202,197,208,199,147,198,195,200,198,162,180,192,207,204,183,160,146,149,159,111,98,179,226,226,202,227,188,176,174,135,199,203,206,211,201,220,171,160,194,180,126,119,123,88,121,119,112,134,115,77,136,176,197,174,163,234,139,93,155,196,231,220,222,214,224,228,145,184,193,210,239,213,223,226,224,210,181,142,150,186,186,137,74,33,67,66,31,22,9,14,14,20,16,17,20,20,24,27,26,19,17,15,17,16,14,20,12,15,17,27,43,57,62,49,77,100,99,71,57,55,33,16,10,22,40,107,90,30,88,119,98,107,113,112,90,71,77,98,124,121,107,92,92,162,206,145,106,100,97,114,104,102,87,80,124,141,110,34,12,16,21,36,35,45,49,38,29,14,38,84,78,67,59,62,63,60,63,60,77,74,49,45,47,37,31,20,20,25,24,28,28,32,30,34,38,43,46,55,67,66,72,79,84,104,136,160,169,176,182,187,176,170,168,187,198,139,119,46,48,53,41,28,17,13,15,15,17,15,19,15,16,17,15,15,17,19,16,15,31,48,61,67,61,65,53,40,40,23,16,15,14,16,15,17,17,16,18,16,19,19,27,35,46,54,69,77,74,64,80,119,117,93,83,74,69,95,54,51,51,41,37,40,36,29,20,15,21,16,22,30,31,44,48,39,50,56,40,69,112,139,131,104,92,75,61,51,41,34,34,44,46,39,39,34,27,32,27,24,21,18,30,39,53,34,62,159,155,127,107,92,84,92,117,114,100,141,236,250,250,242,227,229,225,229,229,234,239,242,240,241,241,235,239,243,243,240,239,239,237,235,237,237,237,238,235,235,235,233,233,234,230,229,230,230,230,229,230,232,230,231,227,224,226,221,222,223,224,225,229,230,229,232,233,229,117,4,1,4,9,10,9,11,10,12,12,12,12,196,202,198,196,200,196,197,196,198,196,199,198,195,196,194,196,196,197,198,197,198,198,199,198,198,196,196,196,196,194,193,194,192,193,193,194,191,193,191,189,193,192,191,190,194,191,191,192,190,192,192,191,191,194,192,193,193,191,192,191,193,192,191,192,193,192,193,193,192,195,193,193,193,190,193,193,192,193,193,194,194,192,193,194,193,194,194,193,194,195,196,195,194,193,193,196,195,197,197,196,199,200,200,198,202,199,198,200,202,201,195,198,198,200,202,200,199,198,200,202,204,201,200,200,202,203,200,202,202,202,202,202,202,200,202,205,202,200,202,203,204,202,203,203,201,203,202,200,204,202,203,203,203,205,200,203,202,199,203,203,203,201,200,202,202,203,205,205,203,204,204,205,205,205,206,206,206,204,204,207,204,208,207,207,207,207,206,207,210,209,207,210,210,210,212,210,209,210,211,212,211,213,213,211,211,208,211,210,213,211,211,213,209,211,211,211,212,209,211,210,208,210,209,208,210,209,211,210,207,208,209,207,206,202,204,204,202,205,203,202,202,203,203,202,203,202,200,202,202,200,200,200,201,200,198,199,198,198,200,199,198,199,199,200,198,199,201,200,199,194,197,200,199,196,198,200,199,198,199,198,202,196,184,191,196,195,196,202,200,201,196,192,200,201,202,200,189,195,201,199,199,200,203,200,201,200,201,201,198,193,194,202,198,204,198,211,188,150,204,195,193,194,169,174,193,210,203,196,172,157,147,139,103,122,218,248,239,212,235,193,189,192,141,200,206,204,211,206,218,155,162,205,199,151,161,182,106,96,105,119,122,81,74,118,181,213,184,207,180,95,113,188,228,230,219,225,210,236,203,139,210,171,211,240,211,222,219,229,222,193,160,178,209,181,77,39,53,38,33,24,21,23,17,19,21,16,22,18,19,21,24,27,17,21,15,15,17,12,15,20,19,21,28,48,55,51,66,94,90,73,63,55,33,14,13,19,37,64,108,56,47,117,90,60,48,41,54,48,45,45,51,95,117,100,82,91,155,162,108,78,76,80,86,83,94,78,80,123,137,95,25,9,13,23,34,36,33,28,18,21,16,18,47,41,41,33,28,38,36,34,39,45,43,39,44,48,47,46,49,60,66,70,76,81,98,124,146,169,170,175,187,191,191,181,177,178,174,185,185,175,160,149,134,121,113,85,68,55,51,53,46,39,29,16,13,16,15,15,17,13,22,17,15,21,18,19,12,17,17,16,17,26,50,52,48,40,55,42,34,32,13,13,14,15,16,17,19,17,15,16,22,21,14,21,19,27,35,41,55,71,79,78,75,111,131,139,156,145,157,146,139,144,125,105,93,77,65,57,43,39,40,31,32,36,26,35,30,36,36,32,59,79,99,106,96,70,72,84,74,65,53,47,47,47,50,51,40,30,26,27,23,20,21,23,27,50,49,49,148,153,122,117,94,90,95,108,109,99,102,157,221,242,242,234,230,223,226,230,235,241,241,240,241,238,240,244,244,245,241,239,241,239,238,238,238,239,237,235,234,230,231,231,230,231,231,230,228,228,227,229,229,227,227,227,227,224,224,225,221,223,223,227,229,231,235,237,231,116,3,1,5,10,10,9,12,10,12,12,12,13,199,201,200,198,198,196,198,199,198,199,198,198,197,198,196,195,197,192,195,195,194,196,199,196,199,198,198,198,195,196,193,195,192,194,194,194,193,191,191,192,192,192,193,191,190,192,193,194,191,192,193,192,192,193,192,191,193,191,192,191,191,192,191,191,192,192,191,194,191,191,190,192,194,191,191,192,193,192,194,191,193,192,192,192,190,193,192,193,193,194,194,197,195,194,195,194,195,196,198,195,197,197,197,199,198,199,199,200,198,199,200,198,199,200,200,199,199,201,204,202,200,199,201,202,199,202,202,200,202,201,201,201,201,201,200,201,199,202,204,203,203,203,202,203,203,204,202,201,205,202,201,203,203,204,202,202,202,202,202,203,205,203,202,203,203,201,203,204,203,204,203,204,204,205,205,205,206,204,207,210,208,207,206,208,207,208,208,208,210,208,210,211,211,210,211,212,212,210,212,212,212,214,211,208,208,209,209,211,211,209,211,210,209,211,211,210,210,210,210,210,209,207,208,207,210,211,210,210,207,211,207,208,207,203,204,202,204,201,205,204,201,202,201,203,202,200,201,199,201,202,201,200,200,200,202,200,200,203,198,198,199,200,200,195,198,198,198,200,196,193,197,198,198,198,199,198,198,199,198,196,199,200,179,188,205,196,193,199,199,201,195,188,199,200,199,198,190,190,200,202,201,200,199,200,200,197,199,200,197,195,191,200,196,200,199,213,178,141,204,191,189,182,158,182,196,200,192,164,121,117,119,124,112,132,210,216,194,199,228,190,189,191,132,190,204,203,208,203,206,158,193,217,209,168,185,191,118,137,112,105,112,90,83,148,223,211,194,186,172,133,148,220,229,227,218,220,211,237,172,141,219,145,203,235,212,221,211,229,224,194,181,186,212,172,59,47,57,41,19,13,24,19,19,23,18,17,22,19,16,19,22,23,18,20,14,12,16,17,19,17,19,27,46,48,42,66,92,92,68,60,56,31,16,14,19,33,61,93,108,70,94,124,71,68,71,57,57,45,33,39,29,45,88,99,98,86,108,113,81,84,81,77,90,75,83,68,70,118,118,77,27,28,28,30,42,40,36,36,49,47,46,50,53,57,61,56,64,64,69,81,81,80,90,121,135,147,150,166,173,175,178,167,92,157,164,174,169,164,123,126,137,127,98,81,67,56,57,57,59,64,60,56,62,60,53,49,48,48,43,42,37,17,14,15,13,16,14,15,19,17,16,17,14,20,19,15,21,19,17,19,14,30,61,57,41,41,39,41,36,17,16,18,15,16,16,16,18,19,16,16,14,18,18,15,15,20,19,23,34,55,63,77,86,74,71,64,66,73,113,101,124,137,138,146,155,151,153,150,150,139,110,87,69,66,57,51,49,37,52,36,41,45,48,71,63,39,44,51,59,66,69,65,56,55,55,67,62,43,38,27,28,30,24,20,25,51,42,43,131,150,128,113,99,93,92,110,103,101,95,89,164,235,249,248,231,223,226,233,244,250,250,250,251,251,251,251,250,250,247,239,236,234,232,236,235,234,231,230,229,229,228,227,229,228,229,229,229,226,225,226,228,227,227,225,224,225,222,222,222,224,225,227,229,233,235,235,231,118,3,1,5,9,10,9,12,10,12,13,13,12,198,203,200,199,199,198,198,198,201,196,198,197,195,198,198,199,199,197,197,196,198,197,196,197,197,195,195,195,195,195,192,194,193,191,193,192,191,193,192,192,192,191,193,190,193,193,193,193,192,195,192,193,192,190,192,190,190,191,193,191,191,191,191,192,190,191,190,188,193,193,191,194,193,190,192,190,192,195,191,193,191,192,193,190,194,193,196,196,192,193,194,195,196,195,194,196,196,197,194,196,199,198,198,197,199,198,200,199,198,199,197,201,201,201,200,200,200,198,201,199,200,201,201,201,201,200,201,203,200,202,200,202,203,203,204,203,203,202,203,203,202,202,202,202,203,201,201,201,203,203,202,202,202,204,202,205,204,203,205,202,205,205,202,206,205,203,204,205,203,205,205,201,205,205,204,207,205,206,207,207,205,207,211,208,208,210,208,212,210,210,211,211,212,212,214,213,213,213,214,212,211,211,208,210,209,208,209,209,212,210,211,210,210,211,208,211,211,208,208,206,207,207,207,208,210,210,209,208,208,210,211,210,208,207,208,206,206,207,207,206,204,204,203,204,204,202,204,205,203,206,204,205,204,203,204,201,203,200,202,203,199,198,200,200,197,200,200,200,194,193,197,198,199,197,198,198,196,197,199,196,202,199,178,190,203,198,193,199,199,200,198,188,197,198,199,201,193,188,196,203,201,199,199,198,198,196,200,198,199,196,191,199,198,202,198,211,165,141,205,196,186,177,170,185,200,197,176,136,90,105,115,126,128,123,138,125,124,165,226,182,181,189,127,186,206,202,209,204,188,160,204,195,170,114,176,196,107,138,139,146,99,47,78,171,233,190,176,176,185,171,188,236,229,229,221,219,215,236,142,167,225,129,215,230,217,223,209,226,226,203,195,171,158,133,51,61,45,15,19,13,19,22,25,24,22,18,19,22,19,16,17,17,23,29,18,16,17,15,17,19,21,36,46,42,63,98,99,78,63,57,35,18,13,16,34,61,92,117,133,97,106,107,77,99,78,55,63,43,42,47,33,32,66,106,147,150,160,150,139,147,134,139,136,135,139,118,118,127,118,89,104,135,141,156,152,148,150,163,178,186,179,182,178,175,181,172,176,168,170,166,160,139,143,158,150,149,143,138,136,124,107,99,77,66,66,69,65,64,56,53,58,61,63,57,57,59,57,57,57,61,63,59,60,54,47,46,43,40,35,26,16,23,22,18,21,15,15,17,18,17,16,19,17,16,16,23,17,18,19,18,16,41,92,81,64,54,49,51,32,15,13,16,15,16,18,17,19,21,17,18,16,17,17,18,18,19,22,25,19,25,46,59,68,74,76,66,71,63,66,68,59,66,57,62,68,67,86,100,125,152,148,143,153,151,150,140,120,102,83,73,67,56,53,57,50,39,36,32,37,36,42,51,55,52,57,69,67,60,38,30,31,33,31,30,55,53,41,30,107,149,116,113,103,93,97,104,104,116,110,92,162,224,234,246,235,229,242,248,251,251,252,252,252,252,253,253,252,252,251,237,231,229,230,231,227,226,225,226,227,225,229,229,226,228,229,225,226,227,226,225,227,227,225,225,224,224,223,225,226,227,228,229,232,234,236,234,228,117,3,1,5,9,9,9,11,10,11,12,12,13,201,205,200,198,199,194,196,198,198,198,195,198,196,197,198,199,199,195,197,198,198,199,199,195,194,194,195,194,191,194,193,195,195,193,192,191,192,190,190,191,191,191,191,191,191,191,189,191,191,188,189,190,193,191,190,192,192,190,190,189,191,191,188,191,189,189,191,191,191,192,192,190,193,191,191,192,193,191,194,191,193,192,192,194,193,193,196,195,195,194,194,196,196,199,196,195,198,197,196,194,194,197,195,197,199,199,200,200,198,199,201,200,201,200,199,200,202,200,199,200,200,200,202,200,199,200,200,200,201,201,201,201,201,200,200,203,202,201,201,199,201,202,202,201,200,202,204,202,203,203,203,204,200,203,203,202,204,203,204,202,204,203,200,202,203,202,203,202,202,206,205,204,206,205,206,204,206,203,207,207,205,210,208,208,208,210,211,211,212,208,211,212,212,212,213,213,213,213,212,212,212,210,210,209,208,208,208,208,208,207,209,211,207,208,210,210,210,209,210,210,210,208,208,207,207,208,208,209,209,212,210,208,207,205,206,207,209,207,208,207,206,208,207,207,206,205,208,207,207,207,208,207,208,207,206,205,205,206,205,207,202,203,203,201,201,202,199,200,195,194,200,199,199,199,199,198,198,198,196,199,201,196,182,188,201,199,196,198,198,200,196,189,196,199,198,200,201,188,193,198,198,200,196,198,198,199,200,198,198,199,192,192,198,199,200,206,162,150,199,196,182,181,171,177,198,193,190,143,101,111,101,121,116,97,94,79,81,145,219,180,183,199,135,177,204,200,212,202,172,172,182,141,148,89,179,200,103,109,99,136,129,90,77,168,186,164,193,178,188,151,185,232,221,230,222,220,226,225,128,193,225,131,224,226,218,226,210,224,217,210,181,93,82,84,54,59,13,16,25,13,18,19,21,24,22,21,24,19,19,16,15,21,19,25,18,17,19,17,16,24,34,44,41,49,90,101,84,66,56,40,16,14,17,37,63,84,113,122,101,83,103,110,83,90,88,73,89,75,61,64,52,49,71,100,132,133,134,120,116,122,113,119,119,117,126,111,110,107,92,91,119,147,148,162,158,147,141,151,151,146,147,141,128,128,125,110,98,86,87,75,66,63,57,61,62,65,63,63,62,62,68,65,61,70,63,69,67,63,56,56,61,55,54,49,46,44,44,42,39,39,32,37,36,24,26,27,29,35,35,39,41,45,46,47,36,16,16,13,20,19,14,18,16,22,20,17,18,21,19,21,20,72,132,119,97,84,78,77,47,15,14,13,16,21,21,17,21,17,19,18,18,19,18,18,23,42,57,55,48,32,31,39,37,41,47,50,54,55,60,63,66,66,72,71,66,66,63,59,63,57,63,65,62,83,94,115,125,89,135,130,127,121,121,106,89,86,71,62,56,53,54,48,48,53,59,61,53,50,34,18,26,29,39,52,54,47,33,21,84,127,118,107,102,95,89,110,106,120,119,99,130,134,146,211,229,238,248,248,246,246,231,226,233,240,247,247,230,235,247,224,226,227,225,224,221,222,220,222,222,224,224,224,229,227,229,228,226,229,226,225,223,223,223,224,226,226,227,228,228,230,230,229,230,233,232,230,228,117,4,1,5,9,10,10,10,10,12,12,13,12,202,206,203,202,201,198,197,199,200,200,199,199,200,199,198,195,197,198,198,199,196,194,198,195,194,195,198,194,194,194,193,195,196,194,193,193,193,193,193,190,191,190,190,191,189,191,192,190,190,192,189,193,192,192,193,191,190,189,191,188,189,191,192,189,191,191,191,192,191,192,190,193,191,190,194,190,191,193,190,193,191,192,195,192,196,196,196,196,195,197,196,198,199,198,198,198,196,199,198,196,197,199,198,197,199,198,199,199,202,200,200,199,200,200,198,200,199,200,201,198,202,202,202,201,201,200,200,202,202,202,201,203,202,200,202,202,202,203,202,205,203,206,206,202,207,207,206,208,206,208,207,206,206,206,204,203,203,203,205,203,205,206,203,201,204,206,206,205,203,205,204,202,204,205,206,207,207,206,206,208,209,208,208,210,210,211,212,213,212,212,212,210,211,212,212,214,213,213,213,212,213,210,207,208,209,208,211,208,207,210,208,208,210,209,208,209,207,208,210,210,210,208,210,208,206,208,209,209,209,212,208,208,209,209,210,206,206,208,211,210,208,210,207,208,208,208,209,208,207,210,208,209,208,207,208,207,208,203,206,210,206,205,209,208,205,206,204,203,196,199,200,198,199,196,199,200,198,198,199,196,196,196,187,185,198,200,194,195,195,198,198,188,197,198,197,201,199,197,190,197,200,199,200,198,201,199,199,198,199,200,194,190,194,199,205,199,156,164,203,197,163,159,171,185,192,182,182,133,105,106,91,105,93,77,66,62,71,139,215,178,190,204,148,175,203,207,212,185,153,165,159,154,173,111,194,171,76,91,88,95,73,108,120,184,199,199,204,147,160,134,186,221,205,218,214,219,235,207,121,223,211,131,226,218,224,224,218,214,220,205,105,56,46,60,53,36,12,15,18,15,23,19,16,21,19,18,25,13,19,20,14,19,18,19,23,19,16,20,20,31,46,41,44,74,99,94,65,56,42,18,12,19,35,65,92,100,110,84,41,43,98,107,111,148,113,131,177,120,103,125,102,124,142,119,100,71,61,52,52,52,55,63,55,57,53,55,65,63,60,58,61,62,67,68,59,63,60,58,63,63,66,60,59,62,65,63,70,66,69,65,69,71,67,65,63,75,67,67,65,55,56,54,54,53,47,47,41,41,31,29,36,27,27,27,21,20,23,19,18,21,17,21,20,16,17,19,22,43,57,65,66,75,82,80,61,23,12,18,19,19,26,19,20,23,23,24,19,23,19,24,32,112,177,146,133,128,139,135,64,18,12,16,17,18,20,17,19,19,17,19,20,21,22,19,33,70,94,89,77,69,56,41,36,21,18,19,24,27,27,32,36,42,42,51,54,50,57,55,55,64,63,64,65,65,66,63,62,59,62,65,60,69,87,106,112,127,133,128,128,109,107,94,88,95,83,83,81,69,58,53,42,39,42,45,46,30,36,27,69,127,127,118,104,102,100,103,103,118,114,88,82,47,59,164,232,245,249,223,165,139,115,109,118,138,157,141,98,154,215,217,227,220,220,218,217,221,219,221,222,218,219,221,221,223,225,226,229,227,225,226,226,222,222,225,225,226,226,228,228,229,230,228,229,229,229,228,225,117,4,1,5,9,11,10,12,10,12,13,13,13,200,203,202,203,204,200,200,201,201,200,201,201,200,198,199,200,198,198,198,199,197,196,198,193,197,197,194,196,193,196,193,195,196,196,194,193,195,192,195,191,189,192,192,193,191,191,192,193,190,190,191,190,192,191,190,190,189,191,193,191,190,191,189,192,191,191,193,188,190,190,191,190,191,192,190,191,192,192,194,189,193,193,194,195,195,196,194,197,196,198,198,199,195,196,198,197,199,198,199,199,199,199,199,200,199,200,200,198,198,200,198,197,200,200,200,202,200,198,199,200,200,200,201,201,202,201,199,202,203,203,203,203,204,204,205,203,204,204,203,205,206,207,209,209,208,210,208,206,209,208,210,209,208,211,208,207,204,205,205,205,206,205,205,204,205,204,204,205,205,201,202,204,202,204,205,204,208,207,210,208,207,211,209,211,208,210,212,212,211,212,213,213,211,211,212,212,212,210,211,208,210,210,209,210,210,208,207,209,210,208,207,209,207,210,209,208,210,208,209,208,209,210,208,207,210,208,209,213,211,210,209,207,210,210,208,210,209,210,210,209,208,209,209,209,209,208,208,208,208,209,210,208,208,207,207,208,210,208,207,208,206,208,208,207,207,206,208,203,198,202,201,200,199,199,199,197,198,199,198,197,193,193,193,184,195,201,193,194,196,199,197,188,194,199,195,199,200,192,188,191,197,197,196,196,196,197,196,195,196,197,198,190,190,193,208,188,153,176,197,200,154,150,184,189,185,165,143,105,107,100,74,91,66,56,49,47,71,140,214,177,190,212,155,170,213,211,207,168,157,183,163,171,199,127,149,122,77,93,98,90,53,48,71,184,202,231,171,136,176,131,207,220,201,206,200,211,230,179,129,247,203,139,232,220,224,220,227,218,228,187,59,44,58,52,31,22,17,19,21,15,22,19,19,24,18,20,23,16,19,14,18,18,12,27,21,23,20,23,31,39,38,48,64,84,95,62,59,47,17,14,13,32,61,89,104,103,114,76,20,35,85,80,105,144,101,130,141,113,114,94,116,162,155,113,93,88,79,63,73,71,75,94,93,93,83,87,96,85,84,66,75,99,90,84,77,75,74,83,85,76,75,69,71,72,72,75,74,66,63,61,58,50,46,44,37,45,38,36,37,23,32,30,26,31,25,22,19,19,18,16,19,17,14,17,18,17,18,14,15,15,16,19,14,16,21,18,33,85,102,116,126,139,143,148,111,36,12,14,17,23,23,17,27,22,23,23,23,22,19,24,39,122,169,146,142,139,152,151,78,20,15,14,18,15,19,24,18,18,21,26,18,20,22,20,40,109,135,131,131,112,102,84,59,23,10,14,14,15,18,15,18,19,21,25,21,30,29,29,30,37,41,51,51,50,57,57,61,66,63,62,65,62,61,56,57,95,59,65,65,70,89,101,113,116,120,123,131,141,137,142,124,105,98,84,76,72,95,60,118,153,157,143,116,116,105,111,103,106,105,87,83,65,80,203,244,244,237,113,49,62,66,66,53,43,53,49,11,78,187,211,228,221,218,217,217,219,216,217,217,218,220,220,218,219,222,224,225,225,224,224,225,226,226,225,225,224,225,223,223,225,224,223,227,226,225,225,224,119,4,1,6,9,10,10,13,10,12,13,13,13,202,203,202,203,204,204,200,204,202,201,203,202,202,202,202,201,200,199,200,198,198,198,198,197,196,198,197,193,195,196,196,193,193,194,194,194,194,193,192,193,196,194,193,193,192,192,191,193,189,188,189,190,191,191,190,191,190,190,193,190,190,191,191,189,190,192,192,192,190,191,193,193,192,190,192,189,193,194,192,196,194,196,197,196,196,195,194,197,196,197,196,198,199,198,199,200,202,200,201,200,198,200,200,201,203,200,202,201,200,199,201,201,202,199,200,203,200,203,202,200,199,200,201,200,201,203,202,203,203,205,205,204,205,207,208,206,205,205,208,207,208,210,208,210,212,211,211,211,212,211,212,212,211,212,210,211,210,209,209,208,208,206,203,204,204,205,202,204,205,203,205,206,206,206,207,209,208,210,211,210,210,207,208,208,210,212,212,212,210,210,210,212,211,210,212,210,210,210,212,210,211,211,211,213,210,210,210,209,210,211,208,210,210,210,210,211,211,211,211,210,212,213,211,212,211,211,212,210,210,212,210,211,212,210,210,212,211,210,211,210,210,210,212,211,208,208,210,209,207,209,207,208,208,207,208,206,209,209,207,210,205,204,210,206,203,206,208,200,200,207,203,203,204,201,200,201,199,199,200,194,191,198,199,183,191,202,198,198,198,198,195,182,194,199,195,200,194,196,188,187,198,197,196,194,195,195,197,195,193,194,197,193,191,187,209,179,155,185,193,199,146,158,187,179,180,155,153,129,117,86,74,87,48,35,27,44,85,173,223,178,189,207,156,159,205,215,197,170,199,201,169,185,203,107,110,129,95,121,107,81,67,44,22,117,183,204,170,183,185,144,212,217,212,214,195,210,217,148,164,252,199,160,235,231,226,228,234,217,233,164,39,55,51,31,23,16,17,17,14,23,24,19,18,19,26,21,25,19,19,16,13,16,12,23,27,23,28,29,37,42,46,65,80,87,69,59,46,23,11,15,30,51,83,106,110,103,119,86,23,61,107,83,99,89,62,80,76,68,63,64,84,122,101,61,40,31,41,53,80,66,46,55,57,66,59,104,120,103,112,76,49,45,53,42,45,39,36,46,37,37,38,44,37,33,38,32,33,31,25,24,23,18,22,23,16,22,19,27,32,30,38,33,33,36,40,24,13,16,14,17,19,15,15,15,16,20,16,17,16,20,20,14,22,14,24,21,55,177,178,179,179,179,186,184,169,56,7,16,21,23,25,26,22,20,24,27,23,22,21,21,54,121,163,153,142,144,148,162,108,37,11,11,15,16,18,19,19,18,25,26,36,19,23,22,55,152,163,147,160,159,157,157,108,34,13,12,13,15,16,16,14,18,16,17,17,20,17,18,22,18,23,24,25,27,31,35,40,40,49,50,48,59,59,60,61,66,66,64,64,61,66,65,62,70,65,62,70,85,105,125,129,134,139,146,141,141,152,132,146,155,146,125,97,96,106,118,122,125,120,112,116,120,181,241,244,244,185,46,4,49,74,80,52,36,37,27,12,86,199,216,229,222,217,218,218,220,215,217,217,215,216,217,214,218,219,220,222,221,220,220,222,221,222,223,221,221,221,224,223,223,221,217,222,223,225,223,222,118,4,1,6,9,10,10,13,10,12,13,13,13,204,205,203,202,203,199,202,203,204,202,202,202,203,200,199,201,200,201,202,199,198,198,200,196,198,199,196,198,198,198,197,196,194,194,194,194,194,193,196,195,193,193,193,194,191,193,192,192,194,193,194,190,194,193,192,194,192,193,193,193,194,193,191,194,192,192,193,192,195,191,193,194,192,192,195,194,195,194,196,198,198,198,200,199,196,199,200,200,197,196,199,200,198,201,201,199,200,199,199,201,201,200,203,203,199,201,200,201,202,201,201,199,199,202,201,201,199,201,203,203,205,203,204,203,202,205,203,204,207,204,206,208,207,207,208,208,210,210,207,210,208,209,210,210,212,211,211,211,212,212,213,214,212,211,212,211,213,213,208,208,208,207,208,204,201,201,203,202,206,206,207,205,207,210,208,211,211,209,210,209,208,208,208,208,208,211,211,210,210,211,209,208,211,210,212,213,212,213,214,212,213,212,212,211,210,208,209,211,210,210,212,212,212,211,210,211,211,210,211,213,212,214,213,212,213,210,211,212,211,211,212,211,211,212,212,212,210,210,211,209,210,210,210,209,208,209,211,210,209,208,208,208,208,210,206,208,212,208,207,207,208,207,206,206,206,207,208,201,202,207,205,206,203,203,200,199,202,199,200,195,194,198,202,185,190,203,198,198,197,200,198,186,194,200,196,198,196,198,192,185,193,198,198,197,196,193,194,195,196,196,196,191,192,188,208,169,161,195,186,203,146,160,198,174,182,178,169,152,136,106,105,81,51,55,57,110,154,218,226,176,185,210,158,130,198,214,188,186,213,191,157,156,177,116,165,177,113,108,90,84,76,52,23,93,184,223,171,187,171,150,227,219,206,209,202,219,206,144,184,248,181,144,217,238,236,236,223,174,153,100,45,66,29,18,18,16,19,18,20,19,22,19,21,21,20,22,15,21,19,15,16,14,18,17,22,30,29,40,42,42,60,77,90,66,63,47,24,18,12,30,47,78,109,114,115,107,109,76,39,83,112,107,121,84,56,119,79,42,68,45,59,110,116,57,19,81,72,63,100,75,28,21,21,32,36,95,98,92,101,53,24,22,23,28,24,24,19,15,19,18,19,15,19,17,17,19,21,21,14,19,14,16,21,16,21,17,17,46,57,61,69,73,76,77,85,47,15,16,13,14,14,18,20,18,20,17,19,16,16,20,20,22,20,14,24,16,74,195,184,185,173,174,173,181,175,59,15,13,13,25,27,25,25,25,27,25,25,26,26,23,53,101,142,163,141,139,144,162,148,56,16,10,13,26,22,17,23,24,19,28,24,21,26,22,67,144,155,157,164,169,170,180,162,70,16,9,14,14,13,18,16,19,16,21,21,16,16,14,19,19,17,16,20,24,31,28,29,29,31,31,29,39,36,39,43,44,53,53,60,66,66,72,74,76,73,69,72,68,64,69,71,71,72,70,74,78,114,77,91,84,71,69,57,64,74,91,98,101,110,107,113,130,151,175,153,153,119,36,27,46,70,83,70,54,50,37,17,148,242,226,234,218,223,220,219,223,219,220,217,215,216,217,217,219,217,219,219,217,220,219,221,220,219,220,220,219,220,221,221,220,219,219,222,223,224,222,222,119,5,1,7,10,12,10,12,12,13,13,13,13,205,205,203,205,203,202,201,201,203,204,205,202,200,201,202,200,199,199,200,200,201,202,201,203,201,199,199,199,199,198,198,196,198,196,196,195,195,196,192,193,193,193,194,193,193,193,193,194,191,195,195,193,195,195,193,192,196,194,193,193,195,193,193,194,195,193,192,196,197,194,196,196,193,198,195,194,196,195,198,198,199,199,198,199,199,199,198,198,200,201,199,200,201,200,200,198,201,199,200,201,201,202,199,203,200,200,202,199,201,200,202,202,202,200,202,202,201,202,201,202,205,205,205,205,206,205,204,205,204,205,204,207,207,205,207,205,210,208,208,208,209,212,209,210,212,211,211,208,212,211,211,213,211,212,212,212,212,211,213,208,210,208,204,203,202,204,201,202,203,204,205,204,206,209,209,210,207,208,208,208,210,209,211,210,211,210,211,210,209,210,208,212,210,208,213,211,213,214,213,212,212,211,212,211,210,211,210,211,213,210,210,214,212,212,212,212,212,210,212,212,211,212,211,213,213,214,214,211,212,212,212,212,211,212,210,210,212,211,209,211,212,207,209,208,206,208,208,208,209,210,210,207,208,208,210,208,211,212,206,209,205,204,207,206,205,209,207,200,205,208,206,205,205,203,202,201,199,199,196,193,195,200,203,186,187,201,199,197,198,198,200,190,194,199,193,198,196,198,197,186,188,196,196,194,195,193,194,196,194,195,197,194,193,188,203,160,171,195,186,200,151,192,207,177,184,154,141,139,141,129,113,62,46,55,105,188,207,237,218,171,186,208,177,144,194,208,181,208,214,161,118,149,199,142,196,161,90,117,87,94,89,61,51,164,242,233,177,156,154,160,214,216,216,213,199,225,191,134,206,182,130,139,174,232,244,237,169,88,56,41,54,46,12,20,18,16,22,19,18,18,21,24,19,19,21,21,21,18,19,15,14,15,17,18,23,29,46,41,38,65,71,79,72,61,54,26,19,41,48,59,69,95,117,117,120,93,98,89,55,95,118,103,125,120,124,134,65,58,56,51,87,134,132,76,88,143,146,71,111,132,53,22,24,39,76,120,92,89,92,46,32,27,25,28,28,32,22,16,15,18,19,14,17,21,15,16,14,15,18,19,16,16,16,22,14,22,20,89,136,139,156,160,165,174,190,80,10,15,8,15,13,15,18,15,17,16,18,18,19,21,20,17,18,15,25,19,74,171,130,135,124,107,117,116,114,42,18,15,15,24,24,26,29,29,28,29,28,32,29,22,52,78,141,174,141,148,139,155,168,75,17,13,11,24,20,16,22,28,29,24,22,25,28,29,72,143,151,150,165,158,163,176,194,108,24,14,11,14,17,18,16,17,18,21,17,16,16,15,16,17,19,19,20,31,50,47,44,43,34,33,35,29,19,19,19,22,22,29,28,34,37,36,49,47,55,57,58,65,63,74,77,79,79,80,73,71,73,66,70,62,65,73,84,80,73,50,38,39,40,53,58,63,56,43,24,40,43,29,37,47,66,71,63,50,59,22,52,231,250,232,232,219,224,218,219,222,218,220,219,218,218,221,220,220,220,218,220,220,220,220,220,222,221,220,219,220,219,220,220,219,220,221,222,222,221,221,223,118,5,0,6,10,11,11,13,12,13,13,14,14,203,210,204,204,205,202,205,202,204,203,205,202,204,205,201,203,202,201,202,202,203,204,201,200,203,200,201,200,199,201,197,198,198,199,198,198,198,194,196,198,197,196,194,194,196,195,194,196,194,191,193,196,196,193,195,196,195,196,195,195,196,197,195,198,194,196,199,194,197,197,198,198,197,199,198,194,198,196,198,200,199,199,200,200,200,200,199,199,199,200,199,201,200,200,200,200,201,199,198,200,200,200,203,203,201,203,203,203,200,201,205,203,202,202,204,204,205,206,203,204,204,205,205,207,206,203,205,203,207,206,205,203,204,206,206,204,206,206,208,208,208,211,208,207,209,212,210,211,211,211,210,211,210,209,214,208,211,212,209,212,209,208,206,202,202,203,205,206,206,203,204,204,207,209,209,212,209,207,209,208,207,208,210,210,210,210,211,210,209,208,210,212,211,209,210,210,210,211,211,210,211,211,213,211,211,212,212,212,210,212,211,212,212,210,212,212,210,211,208,211,212,211,210,212,213,211,211,212,211,210,211,211,212,210,210,211,210,208,208,209,209,207,211,211,207,209,208,208,208,207,210,211,209,210,213,210,214,222,215,208,208,208,209,207,203,207,202,199,206,207,207,206,204,206,203,205,204,204,198,193,201,199,202,191,183,198,196,200,201,200,202,190,197,201,198,200,196,199,198,194,187,194,200,194,196,195,194,195,192,193,194,193,196,189,193,148,175,199,182,204,163,210,220,170,153,123,129,155,161,135,110,43,41,46,103,213,214,232,209,168,190,210,189,145,201,205,175,222,177,137,143,187,222,141,165,109,81,113,89,99,75,46,61,199,251,203,141,160,173,174,217,205,202,212,209,230,166,150,188,87,75,84,117,205,243,217,92,56,57,41,49,27,12,25,21,19,19,15,22,16,22,22,19,21,15,20,21,18,16,18,14,15,23,23,21,39,45,36,61,70,74,66,60,54,31,12,63,111,103,98,97,103,112,120,103,90,118,135,123,118,93,81,110,123,121,119,69,73,93,88,87,90,124,117,88,147,167,87,94,149,103,45,42,63,132,147,93,92,87,43,44,42,43,49,57,57,21,12,15,13,17,16,18,15,19,17,17,19,17,16,17,19,19,16,21,23,29,160,208,192,208,197,199,194,227,120,7,11,11,15,18,19,16,18,17,17,19,20,23,16,27,26,19,22,27,19,81,134,81,62,54,51,53,52,60,36,24,14,21,24,23,27,28,28,28,31,34,28,28,31,59,87,142,173,131,149,140,147,174,86,22,15,12,22,23,23,29,27,29,30,17,28,26,29,87,134,132,122,123,130,130,148,173,112,32,13,10,16,24,17,19,18,15,18,15,20,21,19,18,17,19,17,20,40,84,92,81,73,67,63,59,43,21,16,15,15,18,18,18,19,21,23,22,24,27,29,32,33,39,38,44,53,49,57,63,64,66,69,74,89,99,108,128,128,104,72,42,24,32,42,61,77,60,46,40,46,44,44,57,75,75,65,69,71,54,22,143,238,248,231,227,221,223,216,217,217,217,220,217,217,221,220,218,220,216,218,217,217,218,215,217,219,219,217,220,221,220,219,219,220,220,217,220,218,223,217,218,120,5,1,6,10,11,10,12,11,12,14,13,13,206,205,205,205,203,202,203,203,205,206,203,204,202,203,204,200,203,203,200,200,200,200,200,201,199,198,199,201,202,200,200,198,200,199,198,200,199,199,198,199,198,196,197,199,198,198,198,198,194,198,197,197,199,198,198,197,197,199,200,199,199,199,201,199,196,197,196,198,197,198,201,199,198,201,198,199,200,199,200,200,199,200,198,200,201,201,202,201,200,201,201,202,203,200,201,201,204,203,200,202,204,200,201,203,202,203,204,204,204,200,203,202,201,203,203,203,203,203,205,204,203,204,205,205,204,208,206,205,206,207,208,207,204,203,207,204,206,207,205,207,206,207,208,206,209,208,209,210,208,212,208,208,205,207,210,210,211,208,212,211,208,209,208,208,205,203,204,204,205,205,206,205,206,207,208,209,208,209,208,208,208,209,210,210,209,208,211,209,207,210,210,211,210,211,212,209,209,207,211,210,211,211,210,211,212,213,213,212,214,214,212,212,212,214,212,211,213,212,210,212,212,213,212,212,208,211,213,213,213,210,211,210,211,211,210,209,211,209,208,208,208,207,210,210,210,210,209,205,209,208,207,209,205,208,216,204,203,222,218,208,206,206,207,206,205,206,201,201,207,205,208,204,205,209,209,208,210,206,200,203,206,205,206,198,186,202,204,201,205,203,207,196,199,208,205,206,203,202,203,201,189,191,200,198,200,198,196,197,195,193,194,198,195,199,185,138,188,198,189,204,162,208,173,142,168,149,164,190,195,159,101,49,44,35,105,189,195,225,200,171,198,188,125,112,165,176,185,201,155,160,176,224,173,85,175,131,93,101,71,92,55,41,56,190,207,150,154,179,197,184,228,212,201,198,200,227,144,164,178,75,73,61,79,165,235,190,49,42,64,46,33,14,19,27,21,17,22,20,17,22,22,21,20,20,19,21,19,21,17,15,17,16,24,24,37,46,39,48,63,77,66,60,63,26,26,62,144,208,142,115,110,109,121,100,105,104,153,201,171,109,37,73,117,103,115,109,90,107,74,66,72,51,95,130,146,164,176,94,76,148,130,81,27,63,157,132,87,99,71,42,62,66,85,106,132,106,31,13,15,12,23,19,15,17,17,19,16,18,21,15,22,15,23,25,20,27,39,177,194,172,186,161,166,166,200,95,9,16,7,16,21,16,19,19,23,23,21,26,25,23,28,30,23,27,22,34,96,113,60,46,43,36,39,45,46,25,22,18,20,24,24,28,26,24,30,34,35,36,32,27,51,70,124,141,113,134,128,137,159,79,24,16,10,22,23,29,26,29,30,24,29,31,27,36,98,139,97,63,64,66,86,85,112,83,29,17,12,20,17,21,17,17,22,16,21,19,16,21,23,22,21,19,18,48,147,181,156,153,141,133,126,88,41,19,14,15,16,18,19,17,18,20,19,19,22,19,19,23,18,23,21,23,32,29,35,39,37,40,44,52,45,67,116,129,117,93,120,138,147,148,116,92,75,74,61,57,60,87,111,112,100,95,118,107,90,78,165,238,231,225,230,217,223,219,222,218,214,219,218,218,216,219,217,217,220,215,215,216,217,216,216,218,217,218,217,217,216,217,220,220,218,218,218,219,220,218,220,118,5,1,6,10,10,10,13,12,12,13,13,13,202,206,203,204,206,202,202,203,204,203,205,201,205,203,199,202,200,200,199,200,199,198,201,200,199,197,201,201,200,201,198,197,199,201,202,201,200,200,198,196,198,196,198,198,198,198,196,198,198,196,197,198,198,198,198,197,198,198,198,198,199,199,199,199,199,201,198,197,199,198,199,198,198,199,198,196,200,199,200,200,198,199,200,201,198,199,200,201,201,200,201,203,202,204,201,201,202,200,201,201,203,201,201,203,200,203,204,203,201,202,200,201,202,200,202,203,205,203,201,204,203,204,203,204,207,203,207,204,205,208,203,206,206,206,207,205,206,204,206,205,205,210,206,208,208,207,208,207,210,211,208,207,204,203,208,208,209,208,209,212,208,206,208,205,203,206,203,204,206,208,209,207,207,208,212,209,207,206,205,207,207,210,208,210,210,210,209,208,209,210,208,210,212,208,208,210,207,208,207,207,211,208,210,211,209,209,212,211,212,212,211,212,212,211,214,210,209,213,210,211,210,209,208,209,211,209,210,209,212,211,210,210,209,209,208,208,210,208,210,211,208,208,208,210,208,209,205,206,208,206,207,206,207,205,207,173,133,170,202,206,207,203,206,205,208,208,198,205,206,204,207,204,205,206,205,207,207,205,202,207,211,210,210,205,191,201,210,207,209,206,210,199,199,210,208,210,206,209,207,207,197,192,204,203,202,200,199,202,200,198,197,200,198,208,184,141,198,208,184,181,131,150,102,98,170,168,176,184,185,146,101,51,53,37,90,177,180,222,203,173,205,163,89,121,99,147,202,201,156,178,190,202,129,89,198,167,115,98,82,91,60,49,57,165,194,174,179,210,195,178,230,215,219,205,201,206,132,192,149,46,116,83,83,165,229,162,39,59,47,24,23,14,19,16,23,23,22,23,20,23,18,21,23,17,18,19,19,17,19,19,18,21,28,33,42,41,53,63,64,63,64,59,36,12,48,125,178,188,101,90,117,113,111,92,99,115,147,183,181,114,42,63,114,131,113,120,123,108,101,74,64,42,50,118,165,198,152,62,57,149,151,71,6,44,153,121,91,98,79,76,126,144,171,173,196,174,44,9,9,14,23,16,19,18,19,21,17,20,19,16,23,22,22,21,24,19,55,171,135,102,102,80,87,79,122,57,19,20,9,21,24,24,18,21,27,26,28,27,20,27,34,27,25,30,29,37,96,97,53,50,42,41,45,46,42,27,19,22,24,28,29,24,31,39,39,41,42,38,27,26,48,52,93,107,69,85,80,83,101,60,29,19,15,28,29,31,34,31,33,31,24,30,24,47,115,120,78,47,48,45,51,56,62,55,29,17,16,19,23,22,22,17,21,21,21,25,22,22,15,21,18,22,19,46,154,190,179,182,184,189,189,170,94,27,14,17,21,17,21,19,17,19,19,24,22,20,22,21,21,19,21,16,24,36,39,39,33,38,29,34,29,46,97,113,105,172,251,251,245,177,85,42,29,32,66,79,90,101,97,106,96,101,122,135,128,81,130,211,216,233,226,216,222,217,221,221,219,217,216,218,217,215,217,218,216,218,215,214,217,217,217,216,217,218,214,214,217,217,216,216,217,217,218,216,223,217,218,120,4,0,5,10,12,10,13,12,13,13,13,13,202,204,202,203,206,203,203,207,203,203,203,203,204,203,203,200,201,202,200,201,200,200,200,200,199,200,201,199,199,201,199,199,198,199,198,196,199,198,198,197,199,199,198,199,194,199,200,198,198,198,197,198,199,200,201,200,200,200,200,199,199,196,199,200,199,199,199,200,198,200,200,200,200,202,202,200,200,199,199,200,200,200,199,200,201,201,199,202,199,201,202,199,201,201,203,200,202,201,199,202,201,201,203,202,200,204,204,203,201,200,201,201,205,202,202,203,204,204,202,204,203,204,205,205,201,203,204,205,206,206,204,209,209,205,208,205,205,206,205,205,206,205,210,208,208,208,208,212,210,212,207,206,207,206,207,204,207,208,208,210,207,208,206,205,204,205,204,205,208,207,208,207,209,208,207,206,204,205,206,206,207,208,207,208,208,208,210,208,209,213,210,211,211,208,207,209,208,208,211,209,209,211,210,211,211,208,211,211,211,208,210,212,210,211,211,211,211,211,212,213,210,209,210,210,208,211,210,209,210,209,212,210,210,211,208,210,211,210,208,210,210,207,208,207,208,208,207,207,208,208,203,205,208,207,207,162,105,141,195,206,210,205,205,205,205,205,200,203,203,205,206,204,205,204,204,206,206,199,204,207,207,211,210,210,194,199,211,205,210,208,208,200,199,210,206,210,207,208,210,207,205,195,203,207,207,207,202,206,205,203,202,203,201,212,184,151,206,214,187,160,87,120,78,78,141,127,146,155,149,121,91,50,44,33,98,182,182,223,201,185,212,174,148,152,86,124,224,196,167,146,160,203,150,126,214,156,100,110,98,94,63,60,62,178,213,202,193,217,187,179,232,208,221,213,229,193,122,207,143,49,88,71,91,181,230,165,73,68,32,11,22,14,19,14,20,24,21,21,22,21,22,22,22,18,16,19,16,20,22,19,21,27,33,39,43,52,71,72,63,59,62,34,26,7,63,106,103,118,47,83,110,96,108,93,104,100,93,119,168,154,83,76,97,120,119,123,132,124,106,68,44,30,41,49,95,108,69,26,62,171,118,49,4,54,159,110,103,104,105,141,167,179,193,175,196,170,41,7,8,13,24,15,21,23,24,19,21,24,22,20,31,29,28,25,29,23,71,140,74,47,57,41,50,47,56,36,24,22,16,22,23,25,28,26,24,27,26,27,24,25,37,34,22,33,27,44,105,93,66,66,60,62,68,71,62,35,35,41,40,49,53,52,50,66,66,61,59,56,54,55,64,63,108,76,38,49,39,46,52,41,24,20,22,31,26,33,37,36,29,34,32,27,33,53,117,113,63,47,44,37,45,41,52,38,20,25,18,23,24,26,27,27,23,18,26,26,27,19,18,24,24,22,21,54,133,174,154,166,174,181,185,196,146,39,14,16,15,20,18,24,21,20,26,21,22,24,23,23,23,17,22,23,24,44,71,77,62,63,59,60,40,50,102,110,111,208,249,249,224,114,49,28,11,74,123,97,71,48,53,72,82,86,116,119,117,59,73,218,232,232,226,215,225,217,219,218,218,217,219,217,215,218,217,217,215,214,216,214,217,214,214,217,213,216,217,217,218,217,215,214,215,218,219,219,222,217,218,117,5,0,6,10,11,10,13,12,12,13,14,13,202,205,203,202,205,200,201,203,200,202,202,200,202,200,200,201,201,201,200,202,200,198,199,199,202,201,199,198,198,196,198,200,198,196,194,195,198,198,194,198,199,197,198,198,198,194,196,198,198,198,198,199,200,199,200,198,200,201,201,201,198,200,198,197,200,201,200,200,200,199,199,199,201,200,201,199,201,203,200,200,203,203,200,203,200,200,202,199,200,200,198,199,199,198,199,199,202,199,200,201,202,201,199,202,200,200,202,201,203,203,200,203,203,202,204,203,204,201,205,209,203,206,205,202,204,205,206,202,205,206,202,206,204,204,205,205,205,202,205,204,203,206,205,207,208,207,204,205,208,207,208,206,207,207,206,207,208,206,209,208,206,207,206,204,205,206,207,207,206,205,205,207,206,204,203,202,203,204,204,206,206,208,207,209,209,208,208,208,209,210,210,210,210,208,208,209,207,208,208,206,209,208,207,210,210,210,212,208,209,209,209,211,210,210,210,208,211,211,211,214,212,211,210,210,210,207,209,210,210,209,211,211,210,211,207,208,211,208,209,209,208,208,205,207,205,205,206,207,207,205,206,205,205,206,212,202,177,185,205,205,206,206,204,203,204,200,197,203,204,204,205,204,203,205,206,207,201,200,206,211,208,209,210,211,198,197,211,207,211,208,212,202,199,211,207,211,207,207,207,209,207,198,200,208,211,208,208,207,208,206,206,206,205,220,178,156,208,213,183,154,117,158,118,101,144,143,165,167,161,130,93,49,52,43,122,210,186,224,200,184,217,198,219,198,101,128,200,183,145,141,181,206,164,128,197,153,103,120,96,86,79,42,42,152,210,176,163,221,174,189,229,206,225,210,231,172,138,232,170,111,125,54,103,226,250,174,65,46,10,15,22,16,21,16,23,28,20,21,22,16,19,23,19,16,19,16,15,19,21,27,27,32,48,37,43,69,76,76,60,61,42,18,15,30,100,74,53,82,62,87,86,95,103,93,112,104,80,87,153,182,153,101,76,93,85,104,115,107,95,58,39,33,37,29,27,45,39,45,123,128,57,29,4,70,150,89,101,97,95,118,118,127,132,117,138,106,27,22,13,13,25,24,30,22,26,31,23,20,29,29,29,36,31,24,31,27,87,123,54,49,53,40,50,42,53,28,21,29,16,23,22,23,26,27,28,34,35,34,34,50,56,52,57,67,65,79,150,157,137,138,133,144,149,161,127,126,144,152,173,132,164,169,174,183,178,179,173,174,168,172,168,156,153,72,45,47,38,46,45,49,40,42,42,46,49,45,47,42,45,33,29,35,33,59,112,120,89,63,57,57,50,53,59,36,20,20,17,27,23,27,29,30,27,24,33,22,21,24,24,21,22,25,25,63,131,144,111,110,122,130,137,160,123,52,16,11,13,19,21,23,21,21,26,23,24,22,23,26,22,18,21,24,22,64,152,164,149,134,125,122,93,116,121,101,95,184,249,177,95,37,32,23,75,142,120,76,66,55,42,75,86,72,79,85,90,56,122,237,247,243,242,231,232,221,219,215,217,217,214,214,217,217,217,216,215,216,214,214,216,214,213,214,212,215,214,215,216,214,217,216,217,218,219,217,219,216,220,120,4,0,6,9,10,10,13,12,12,13,12,12,202,205,203,204,203,199,201,202,200,200,203,200,201,200,202,201,202,200,198,198,199,201,200,199,199,197,198,198,197,199,199,198,197,198,197,198,195,198,201,195,200,195,196,200,196,198,196,196,198,199,201,200,197,198,198,199,199,197,200,199,201,199,199,200,199,198,200,202,199,199,198,199,199,200,199,199,200,199,201,199,199,198,200,202,200,199,201,200,200,203,201,200,200,198,201,199,201,200,200,202,200,199,200,203,201,202,202,203,201,201,202,200,203,202,204,204,203,202,203,205,202,203,204,205,203,204,203,203,204,206,204,203,203,202,205,203,203,203,202,204,205,204,206,204,205,206,206,205,206,208,207,210,207,207,206,204,207,207,205,206,207,208,205,206,207,206,208,209,208,207,206,207,207,205,205,204,203,204,204,206,208,208,208,208,208,208,208,209,209,210,208,208,211,208,208,209,206,208,208,208,208,209,208,208,211,208,209,208,210,208,212,211,210,213,208,209,211,209,212,211,209,211,208,210,208,207,210,209,211,208,208,210,208,208,208,207,208,207,207,209,209,209,210,208,206,208,205,203,203,204,206,205,205,210,213,215,213,213,212,208,208,206,208,204,203,198,201,206,202,204,206,206,205,209,206,202,199,202,210,207,208,210,207,210,200,194,209,205,207,207,211,206,198,210,210,210,207,206,209,209,211,201,196,205,211,208,210,210,210,208,210,208,210,217,174,174,215,198,170,185,159,204,163,145,198,189,208,189,194,163,99,51,52,51,134,226,191,219,197,193,212,198,237,216,114,137,157,137,166,181,203,205,141,138,225,144,103,118,93,97,70,40,21,106,176,154,174,217,163,197,223,206,214,213,222,141,159,250,194,120,113,98,178,250,226,120,51,19,8,22,22,12,23,15,24,28,23,26,19,21,20,20,19,17,17,16,16,21,21,21,30,45,42,47,66,76,72,62,61,43,23,17,15,69,113,41,63,109,78,85,71,95,109,103,111,113,105,83,133,191,188,153,90,79,86,113,108,71,75,58,34,24,27,24,27,29,40,135,151,71,18,24,9,69,119,73,106,80,53,53,45,52,46,48,61,51,25,19,15,23,29,28,27,31,31,30,31,27,30,39,37,30,31,29,31,35,107,110,49,63,54,49,59,60,56,29,30,35,37,49,50,59,66,73,82,92,118,131,149,178,182,188,199,216,203,198,204,184,185,181,171,170,160,162,171,177,183,193,193,184,177,170,173,177,175,177,173,176,176,164,155,151,142,103,95,81,77,95,98,98,116,131,151,168,152,157,145,132,122,98,77,73,61,79,157,173,148,133,124,122,120,112,92,39,13,22,17,24,30,23,30,35,29,33,27,26,35,24,25,25,26,27,22,74,124,122,67,45,47,57,69,85,86,45,26,12,17,22,18,23,22,23,21,22,20,24,21,25,24,24,22,19,25,93,203,221,211,206,211,205,186,184,147,105,81,164,199,71,17,29,20,54,132,142,67,66,118,89,89,97,83,70,54,56,79,55,104,225,244,244,250,246,241,226,223,216,217,214,214,214,215,216,213,214,213,215,215,212,214,212,214,214,211,214,212,215,214,214,216,216,215,213,216,217,216,214,218,118,4,1,6,9,10,10,13,12,11,13,13,13,200,200,200,199,203,199,200,201,199,202,201,200,200,200,200,200,199,198,199,200,198,199,200,197,198,198,198,197,199,199,200,199,197,197,198,198,196,200,194,196,198,195,197,195,197,195,198,199,198,199,199,199,201,201,198,199,201,200,200,200,199,199,200,201,202,198,198,199,199,198,200,200,199,198,199,199,199,201,200,200,201,201,201,200,198,199,200,200,200,202,201,202,201,199,203,200,202,200,199,201,199,201,200,202,202,200,203,201,202,203,199,203,203,201,203,203,206,201,200,202,200,204,203,203,203,203,205,204,205,203,201,205,201,203,205,205,205,203,204,204,203,202,205,202,203,205,203,206,207,206,208,207,208,208,204,205,205,203,206,208,205,208,207,207,208,207,207,207,208,208,208,207,205,206,205,205,205,205,205,205,208,209,207,207,210,208,208,208,206,208,208,208,208,206,207,208,209,207,208,208,208,209,207,210,210,208,209,210,209,209,208,211,208,210,211,212,212,209,210,211,209,209,208,206,206,207,210,208,209,209,208,208,209,211,207,206,207,207,207,207,208,208,206,207,208,207,207,205,205,206,204,205,206,204,206,211,209,203,207,208,204,205,205,206,203,198,203,205,207,206,206,206,206,208,206,203,200,206,209,207,205,208,207,212,204,190,205,205,207,206,208,203,196,211,207,209,209,206,209,209,211,206,197,200,211,210,208,209,210,208,212,208,213,214,166,188,224,199,184,194,190,208,175,173,195,179,184,172,206,175,100,54,41,38,141,231,193,214,190,191,207,196,232,204,144,136,141,164,179,201,183,169,140,171,234,137,105,98,77,90,72,38,27,111,191,170,202,221,157,206,213,204,211,214,200,124,178,244,188,121,91,67,162,206,136,70,18,12,13,22,21,17,27,22,18,24,23,22,18,19,22,20,20,15,17,18,21,22,21,31,44,40,48,62,77,78,57,61,39,24,15,21,29,92,108,27,85,118,91,92,87,122,120,104,108,117,119,105,133,165,185,167,129,101,85,99,76,67,80,53,30,19,26,30,42,50,119,164,82,38,15,20,7,75,104,74,106,74,41,43,40,43,45,42,50,35,16,25,17,22,29,28,33,31,33,28,31,32,30,37,38,38,32,36,35,42,136,136,95,126,76,91,105,106,91,73,106,129,128,182,191,201,206,206,214,214,208,203,200,202,198,196,193,183,162,149,125,98,122,87,78,73,61,71,61,71,70,130,126,63,64,61,60,63,56,62,59,60,65,55,59,67,62,67,77,67,71,81,89,85,112,128,146,125,173,179,184,179,175,178,187,177,169,112,184,184,167,159,160,161,155,166,134,73,33,35,30,37,32,31,29,31,32,35,35,31,39,29,29,37,37,33,29,93,124,97,49,37,47,41,48,54,56,37,23,17,17,23,29,22,23,25,20,27,22,26,24,21,20,27,23,21,24,71,169,188,190,196,194,199,188,194,146,107,89,136,141,27,27,34,12,71,137,117,46,67,128,111,125,111,77,69,57,47,51,39,32,152,221,220,227,220,239,232,222,217,217,214,215,214,213,216,214,215,213,213,213,214,213,213,214,212,213,214,215,214,214,214,214,214,214,214,215,215,219,213,217,119,5,0,5,9,11,10,12,12,12,12,13,13,203,203,201,201,200,198,202,201,200,202,203,200,200,200,199,198,200,198,199,200,197,199,197,198,198,197,201,196,195,199,196,198,195,198,198,199,195,195,202,197,201,199,197,200,197,199,199,198,201,197,199,201,199,202,201,199,201,200,200,200,200,198,199,200,199,200,200,201,201,199,199,201,202,201,198,199,201,200,200,200,202,201,200,201,199,200,199,201,200,200,199,198,203,199,202,199,199,200,200,201,200,201,200,199,201,201,200,202,203,204,203,205,206,203,203,203,204,202,202,204,201,201,202,204,202,203,205,202,203,202,203,204,204,202,202,205,204,204,205,204,204,202,204,203,206,203,205,206,204,206,206,208,204,204,204,204,205,204,203,205,203,206,207,206,208,208,206,204,206,206,206,206,203,203,202,201,205,205,205,204,207,210,209,208,208,209,206,206,206,208,207,207,210,208,207,208,208,208,206,206,207,207,207,210,209,206,209,208,208,207,207,207,206,208,208,211,209,208,212,208,208,208,205,209,208,208,207,206,207,207,207,209,208,206,205,206,210,209,206,206,207,206,205,207,203,206,207,205,206,205,206,206,204,205,203,205,207,202,204,203,202,202,207,207,200,199,206,205,206,209,206,206,204,208,205,201,205,209,210,210,207,206,208,210,208,191,204,208,204,206,208,205,196,205,208,209,209,207,205,205,208,211,202,196,209,212,209,208,208,206,210,206,215,210,162,201,229,198,193,205,181,177,147,136,143,134,143,162,219,174,92,51,39,43,134,233,195,201,184,194,202,203,214,201,200,174,181,196,190,171,147,174,121,171,230,141,107,77,75,98,73,48,53,169,222,187,220,208,155,206,208,207,205,218,172,124,199,234,186,127,115,51,64,97,48,37,16,9,14,16,27,21,22,17,20,24,23,25,16,17,21,14,19,17,15,21,18,20,24,47,44,40,67,75,72,65,60,42,28,31,48,66,78,139,109,71,127,122,129,119,93,123,126,121,107,115,118,109,126,136,150,145,112,81,59,55,53,38,52,37,24,31,39,59,101,139,112,60,24,21,20,15,16,91,92,81,117,72,51,44,46,55,59,58,51,28,16,24,19,28,33,29,40,46,44,48,50,55,64,69,74,81,84,84,96,128,185,183,177,190,168,178,182,184,184,169,176,179,189,196,182,181,159,141,141,121,103,88,77,75,66,61,60,57,57,66,59,59,65,61,59,59,66,69,73,80,78,77,69,64,74,75,75,70,69,71,66,65,65,60,65,71,68,74,79,68,64,66,64,62,60,60,58,56,56,63,66,67,77,96,113,125,120,117,125,118,113,113,115,122,123,143,145,113,125,131,111,93,71,60,51,47,52,49,47,45,40,42,34,34,36,33,35,90,117,92,63,49,48,46,49,53,54,35,23,22,26,25,24,29,24,22,24,26,27,27,26,24,25,25,19,22,27,66,125,134,130,126,132,130,128,137,118,104,83,121,101,33,33,26,11,82,137,103,54,54,84,98,121,99,78,65,46,49,61,44,37,103,108,99,128,165,214,234,224,215,216,217,219,217,215,214,213,214,212,215,214,214,214,214,214,213,215,215,214,214,213,216,214,213,213,213,217,216,218,214,217,117,5,1,6,10,11,10,13,12,13,13,13,13,202,205,203,201,203,201,200,200,199,198,198,199,200,200,198,200,200,199,200,199,196,199,198,198,200,196,196,196,198,197,198,197,197,199,199,198,196,197,199,198,199,197,201,201,200,197,201,200,196,200,200,200,199,201,200,200,200,196,199,199,198,199,200,198,199,201,201,202,200,199,201,201,199,198,198,200,198,200,199,198,199,200,201,200,202,199,200,200,198,198,199,200,199,199,200,200,199,200,201,201,199,200,202,197,199,201,203,201,204,203,202,205,201,202,205,203,204,201,203,203,203,205,204,205,204,202,202,203,202,202,204,204,203,205,201,203,203,200,204,203,205,201,206,203,201,206,203,205,202,203,206,205,205,205,203,203,206,203,203,206,206,206,205,207,205,205,205,203,205,204,206,205,205,204,203,204,202,205,207,205,206,204,207,207,207,206,206,207,204,207,207,206,207,207,208,207,206,205,207,207,205,208,208,206,205,206,208,207,208,208,205,208,207,209,206,208,210,207,210,210,208,209,208,206,209,208,207,207,207,205,205,207,205,206,205,203,206,207,206,206,206,206,204,206,204,202,206,204,205,205,202,204,203,202,203,202,205,205,206,206,203,202,205,203,199,200,203,205,205,204,203,206,209,207,200,201,206,206,208,208,206,207,204,206,207,190,198,205,206,206,207,205,195,208,208,207,208,207,208,204,207,209,204,195,202,211,208,209,210,205,209,207,217,201,162,213,227,181,163,154,150,118,95,127,147,163,164,198,240,179,97,44,22,36,142,240,208,207,189,210,218,220,193,189,225,179,199,184,170,175,170,183,107,146,196,139,108,84,91,98,67,46,63,177,218,193,223,190,159,210,202,205,206,214,145,140,211,221,179,90,81,65,83,54,22,33,10,17,16,22,28,21,23,24,18,22,26,17,22,21,15,14,19,16,16,22,20,30,42,42,42,52,69,68,57,71,63,45,68,110,133,146,141,158,139,117,130,106,108,95,81,103,119,130,108,110,92,67,81,75,100,102,81,53,38,43,37,35,27,25,40,55,83,134,124,73,39,18,20,15,16,11,31,110,90,92,121,66,56,54,61,89,91,100,76,35,42,54,63,72,77,87,125,133,156,162,180,190,200,208,198,194,188,186,162,168,181,152,156,132,138,128,116,115,103,83,74,71,127,72,65,65,62,57,67,64,69,70,70,76,65,70,71,70,70,71,77,72,63,56,62,60,55,61,61,59,57,51,45,43,47,49,48,49,41,50,38,44,49,45,48,50,53,53,58,55,61,58,53,59,58,57,51,53,57,59,57,56,57,54,63,61,59,57,55,63,65,57,59,67,72,85,99,96,156,167,168,172,155,142,139,139,137,133,113,95,83,67,62,53,53,45,40,96,141,146,109,93,80,66,74,74,65,37,23,19,25,29,31,32,29,29,23,30,28,24,27,27,24,24,24,25,29,70,107,87,62,53,54,53,53,55,66,96,95,111,92,41,27,19,20,71,139,113,76,45,41,51,68,68,61,53,46,61,71,69,59,73,55,34,61,87,154,213,219,220,218,217,217,213,216,214,214,214,213,212,210,211,212,213,214,212,215,212,212,213,212,214,213,213,213,214,216,214,217,215,217,120,6,1,7,10,11,10,13,12,12,14,13,13,200,200,201,200,200,199,200,198,198,200,199,200,199,199,199,198,201,198,195,196,197,198,196,198,197,194,197,195,198,198,196,200,197,198,198,195,197,196,198,195,198,200,198,200,198,199,200,199,199,199,197,197,196,199,201,198,200,196,198,201,200,200,199,199,200,200,200,200,200,199,199,200,198,199,198,197,199,200,200,200,199,199,199,199,199,200,199,199,199,199,200,198,200,200,201,200,202,200,199,204,200,200,201,202,202,200,200,201,201,201,202,200,202,203,202,202,202,203,203,203,203,205,202,203,201,202,206,204,204,202,206,204,204,205,202,203,204,203,204,205,205,203,203,204,204,201,204,202,203,204,203,203,205,204,203,206,203,203,205,205,206,205,207,205,205,205,202,204,207,206,205,207,204,205,205,203,205,206,205,205,205,205,208,206,206,207,206,206,205,205,207,203,205,205,203,205,205,207,207,205,205,208,205,207,208,205,209,207,206,208,204,207,208,208,208,207,206,207,208,204,207,208,205,208,207,207,207,206,207,206,206,205,206,205,206,207,208,206,205,205,204,204,203,204,203,203,203,202,206,203,202,206,203,201,203,203,205,205,203,203,201,203,207,203,195,203,206,205,207,205,205,203,205,204,198,203,207,204,204,206,205,206,207,205,208,191,194,205,205,206,204,203,195,204,206,206,207,206,207,207,206,206,207,194,198,206,206,206,206,205,208,207,216,193,166,227,217,141,116,129,153,111,102,157,190,206,204,237,252,192,103,57,40,48,135,246,241,222,216,241,246,243,163,139,158,126,141,160,202,185,191,195,87,100,177,153,108,89,86,80,59,41,57,129,177,192,221,182,163,208,200,202,212,203,124,162,213,217,193,96,75,51,44,39,16,29,18,17,25,16,27,22,21,24,23,23,21,26,25,19,20,15,18,18,23,23,23,41,39,42,61,66,59,63,115,145,131,133,145,150,156,149,137,137,128,113,100,72,71,71,72,79,87,107,110,108,63,52,50,24,55,67,76,59,32,32,22,25,27,40,63,71,79,67,42,27,18,19,19,10,16,12,53,116,87,105,116,77,101,119,155,179,181,193,167,160,189,202,210,213,203,203,197,190,186,185,184,179,169,151,136,117,103,88,66,70,68,58,53,57,59,54,56,59,63,61,65,72,77,74,76,87,87,86,81,78,67,64,60,57,55,48,47,40,40,36,31,36,31,26,28,21,27,25,25,22,20,28,21,19,24,20,18,19,19,24,16,16,21,18,23,24,23,23,19,24,28,26,29,31,29,34,33,35,37,38,42,44,49,55,58,61,56,62,67,66,71,67,63,63,62,60,64,65,64,72,80,92,105,120,128,142,151,149,159,157,163,158,148,143,136,120,122,159,170,178,163,151,155,154,150,140,97,36,27,24,29,34,27,37,38,36,32,24,30,23,31,27,24,28,23,28,33,80,99,68,56,43,51,45,47,41,35,76,92,112,92,52,29,15,20,55,98,113,93,58,31,24,33,42,59,49,53,63,67,65,66,67,55,38,48,48,85,200,235,236,232,219,216,214,216,215,214,212,212,210,212,214,212,211,212,211,213,212,209,212,211,212,213,212,210,211,215,215,217,211,216,119,4,1,6,10,11,10,13,12,13,13,14,14,196,201,199,198,202,198,198,200,200,198,201,199,199,198,197,200,199,197,197,197,196,198,198,196,198,196,198,198,197,197,196,196,196,196,196,196,197,198,198,196,198,198,199,197,198,197,199,200,199,200,198,199,198,199,198,198,201,198,199,200,200,199,200,200,198,199,198,200,200,199,199,198,200,198,199,200,199,200,199,201,198,199,199,198,199,199,201,199,199,203,201,201,199,200,201,199,199,201,200,200,201,199,201,200,201,200,200,199,201,200,201,205,202,201,203,204,205,203,204,202,202,203,202,201,202,205,204,204,202,201,204,202,204,204,202,205,204,202,204,202,203,201,206,202,201,205,201,205,203,204,204,203,203,204,204,203,204,203,204,203,202,205,205,205,202,205,206,206,206,205,206,204,203,203,204,203,205,207,206,207,205,205,205,205,206,205,205,204,205,204,204,206,205,206,205,204,204,204,205,206,205,204,205,206,209,208,207,208,207,206,206,205,204,207,206,208,208,205,208,205,205,203,205,204,205,209,205,207,205,205,205,205,205,205,207,203,205,204,203,205,205,205,203,203,203,202,203,201,201,205,203,202,201,202,205,203,203,204,202,202,200,201,206,199,199,205,201,206,206,204,204,204,205,199,199,205,204,205,204,205,203,204,206,206,210,195,191,204,206,206,206,206,196,203,207,205,208,204,206,204,205,206,208,199,194,205,209,205,205,208,206,206,216,185,181,236,211,110,89,145,173,123,124,162,178,183,177,203,191,148,93,89,105,65,98,192,192,160,162,193,202,211,117,69,71,45,103,193,232,193,178,156,86,107,206,164,103,97,87,87,65,68,56,96,182,204,221,172,161,209,200,201,220,187,122,186,213,220,214,122,83,39,17,17,15,30,17,28,29,23,23,23,25,23,23,23,22,25,22,19,19,19,21,17,28,24,36,42,45,65,78,71,63,127,174,180,175,160,134,122,128,122,98,94,91,86,79,72,86,74,66,63,62,72,84,84,54,47,56,41,79,58,69,46,23,23,23,27,50,64,57,42,29,26,19,16,17,15,15,15,23,11,74,116,87,107,112,137,180,185,195,194,187,141,152,156,186,174,167,140,126,112,93,119,61,53,57,53,47,55,53,51,53,51,45,48,47,47,59,48,54,63,56,61,59,56,58,52,54,50,47,45,39,41,32,31,30,24,26,21,22,22,18,21,21,18,15,19,19,16,15,14,16,16,17,21,19,21,23,24,21,21,23,20,19,21,17,15,16,16,21,21,20,18,20,19,18,16,19,19,15,19,19,20,19,21,25,26,31,29,30,33,31,39,39,39,48,50,53,53,54,62,63,64,63,69,68,69,71,68,68,68,71,71,77,88,106,120,124,134,142,145,145,146,132,127,127,150,160,154,162,164,136,84,69,62,54,57,53,50,52,52,46,44,37,36,37,34,36,31,30,29,36,83,101,78,66,57,57,55,61,44,40,101,100,113,111,50,28,18,15,30,64,98,104,77,53,33,27,41,49,52,63,57,60,61,57,64,67,50,64,68,111,227,244,244,250,236,232,228,223,217,214,212,212,212,213,212,211,213,214,211,212,212,211,213,212,214,215,212,213,214,215,217,218,214,215,118,6,1,6,10,12,10,13,13,13,13,14,14,200,200,200,199,200,196,200,199,198,199,196,199,198,195,196,197,198,197,195,198,196,194,198,198,197,195,198,198,197,197,196,195,196,198,196,196,198,195,198,198,199,198,196,198,198,199,199,198,196,200,199,199,201,200,200,198,199,199,198,199,198,198,200,199,200,197,198,200,199,199,196,201,199,200,202,198,200,198,200,200,197,199,201,201,199,199,200,200,198,198,201,201,200,200,200,200,199,198,201,202,200,201,198,198,198,201,198,199,200,199,201,199,201,200,201,204,201,201,201,204,205,204,202,203,205,201,200,201,203,203,201,201,202,203,204,200,201,203,201,203,203,202,202,203,203,203,203,202,202,201,202,202,203,203,203,203,202,203,204,203,205,205,205,203,201,204,204,204,205,206,206,205,205,206,204,202,205,207,205,206,206,203,206,203,204,205,203,203,203,203,204,204,206,205,203,207,203,204,204,203,206,205,204,204,206,203,204,206,206,206,204,206,207,205,205,206,206,205,206,206,205,206,204,205,205,204,203,203,202,203,203,202,206,204,203,204,203,204,203,205,203,205,205,202,202,203,203,203,206,202,200,203,202,201,203,202,204,205,203,204,201,202,203,197,197,203,201,202,201,202,205,204,200,196,202,205,204,204,204,207,204,204,206,202,209,200,189,200,204,205,203,205,193,202,208,206,207,205,204,204,203,206,208,202,194,198,208,206,205,205,200,205,213,181,193,217,162,92,101,132,130,106,120,131,133,129,125,124,95,101,66,92,139,84,41,87,87,39,36,33,60,98,64,37,22,17,77,171,203,177,146,148,105,155,244,146,88,90,94,97,76,67,36,123,226,216,222,166,166,210,198,209,222,158,131,210,208,230,213,108,60,30,29,18,12,24,34,49,50,37,28,22,28,28,19,22,23,19,18,22,22,18,20,23,25,36,42,44,61,76,93,69,87,141,162,175,160,143,102,92,115,89,54,32,39,45,50,66,63,62,56,55,52,55,59,71,61,56,47,45,134,132,69,43,21,29,35,45,44,38,31,18,19,17,18,17,20,16,15,20,17,22,98,111,90,113,102,113,118,95,93,85,67,66,59,55,56,51,50,50,51,53,53,55,52,48,51,54,49,48,55,48,44,43,41,38,34,34,34,36,31,39,35,29,28,27,29,23,19,21,21,19,16,18,17,15,16,16,19,15,17,17,19,23,21,21,19,18,19,15,18,17,19,25,27,17,23,24,19,26,22,27,29,28,24,19,19,15,18,18,24,23,30,41,46,45,33,26,17,17,17,17,17,21,27,23,25,22,26,23,20,29,21,22,23,20,24,20,27,26,28,32,33,39,43,48,48,49,60,65,69,70,67,67,66,62,65,69,68,68,67,70,71,72,70,68,68,76,92,93,99,118,120,118,139,145,153,151,131,139,139,129,122,102,92,87,82,73,71,68,59,49,50,96,141,149,132,110,110,102,101,96,111,130,110,121,112,63,26,20,15,21,33,66,101,98,80,61,48,46,43,47,54,60,66,68,62,64,76,64,103,103,129,217,217,231,246,246,249,245,241,234,229,227,223,220,218,213,212,212,214,210,212,215,211,213,214,214,214,214,212,213,215,214,216,214,217,117,5,0,6,10,12,10,13,12,13,13,14,13,198,203,200,200,199,199,200,200,201,198,199,198,199,199,197,198,198,197,197,198,199,197,197,198,199,197,194,196,198,196,194,196,196,197,199,200,198,197,196,197,198,198,200,199,198,197,196,199,198,199,200,201,199,199,202,200,200,199,200,199,199,198,197,198,196,200,199,197,199,198,200,200,200,199,201,200,200,200,199,199,199,200,200,201,200,199,199,199,200,200,199,201,200,201,202,200,201,203,202,203,202,202,200,199,203,200,201,201,201,201,199,202,200,201,201,199,200,200,203,202,202,202,203,204,200,202,204,202,203,201,202,203,202,202,200,201,201,202,203,201,202,202,203,203,204,203,202,204,203,203,202,203,203,202,202,204,204,203,205,204,204,204,203,203,204,204,202,204,206,207,208,205,206,206,205,203,204,206,206,208,204,204,204,205,205,204,205,203,205,203,204,204,201,205,206,205,204,202,203,206,204,204,204,204,205,202,203,204,204,207,207,205,206,205,204,207,207,206,205,205,206,205,204,205,203,202,200,203,203,203,203,203,204,203,204,201,203,203,204,203,200,201,201,203,202,202,204,203,201,202,203,202,203,201,203,203,201,205,205,205,203,204,203,194,198,205,202,202,203,205,204,204,200,197,203,204,203,204,203,203,202,204,203,202,207,202,190,198,205,203,200,204,195,199,206,203,205,204,206,204,206,205,206,207,194,194,204,206,206,206,203,208,204,171,194,148,84,84,103,103,82,91,127,127,111,98,116,113,72,90,78,87,117,88,64,120,125,79,53,13,6,57,69,61,42,31,38,61,118,165,183,189,145,193,239,117,103,113,94,103,73,62,41,145,243,218,217,164,174,214,199,214,217,128,143,213,208,234,190,100,49,36,40,62,56,36,47,64,45,64,99,44,19,22,19,23,22,22,24,21,17,21,19,27,38,42,39,56,79,105,147,135,116,117,127,138,130,121,87,56,73,71,54,40,33,31,31,42,43,42,44,45,44,47,50,62,66,57,97,159,214,203,134,66,35,24,21,21,19,19,19,17,16,17,19,19,16,19,14,23,16,39,118,108,109,127,77,62,62,52,52,51,51,59,54,54,49,47,50,44,48,48,48,44,39,44,41,36,33,31,33,33,25,30,24,19,21,18,22,19,17,17,22,19,15,19,19,22,21,23,21,25,22,18,20,17,17,15,22,17,21,23,27,31,33,34,29,26,18,15,20,28,29,29,23,19,19,18,19,24,22,21,24,32,33,27,19,19,20,16,21,35,69,88,100,97,75,36,15,16,19,23,25,30,20,18,21,15,24,17,19,22,24,30,22,19,19,17,20,18,18,20,21,25,21,24,25,27,28,28,37,36,44,46,43,50,51,49,57,66,63,65,68,70,67,67,67,67,63,62,67,67,69,79,103,116,130,143,146,152,153,167,162,163,170,182,188,178,133,172,165,142,136,149,168,184,173,166,162,169,172,157,153,153,120,122,128,71,30,16,11,16,20,40,66,96,112,95,78,69,51,45,48,57,72,64,66,63,74,89,112,102,84,108,99,129,185,219,239,243,247,248,249,248,243,241,231,227,222,217,214,211,212,209,208,210,212,213,211,211,212,210,214,211,212,213,216,120,5,1,7,10,11,10,13,12,12,14,14,13,200,200,200,200,200,196,199,199,199,201,198,200,198,198,199,197,198,198,195,199,199,196,196,195,199,198,197,198,196,197,197,198,200,199,199,199,200,199,199,200,200,201,201,200,201,199,199,201,200,202,199,199,200,200,200,201,200,199,199,201,202,202,202,199,201,200,199,201,199,198,198,200,200,199,200,198,200,200,200,202,202,204,201,200,201,202,201,201,203,203,203,203,204,201,202,203,201,203,203,203,203,203,201,203,202,201,199,199,199,200,201,201,202,199,200,201,202,204,203,202,202,201,203,203,201,204,203,204,204,201,201,202,203,201,202,201,201,203,202,200,203,203,203,200,201,202,200,201,203,205,205,203,204,203,205,204,203,203,202,203,205,202,202,203,203,203,203,203,206,204,206,206,206,204,205,207,205,207,206,202,205,203,203,201,200,204,203,204,204,203,205,203,203,205,204,205,203,203,206,204,204,205,203,203,206,201,204,206,203,205,203,205,205,204,205,204,205,204,205,203,205,204,201,204,203,202,202,202,203,204,204,201,204,202,201,204,203,202,200,203,201,200,203,202,203,203,202,200,201,201,202,204,201,200,203,200,201,202,203,207,204,203,199,196,201,203,202,203,202,202,203,201,196,202,205,202,203,203,204,203,201,201,202,200,205,204,190,196,207,202,202,205,193,199,205,202,204,203,204,205,204,203,204,207,202,191,200,206,205,206,203,209,201,170,198,102,16,35,68,98,80,107,147,130,116,103,136,119,73,115,88,78,124,99,96,193,190,137,110,44,36,71,93,97,76,62,64,49,84,155,207,210,157,202,210,117,126,124,116,108,81,73,42,153,238,210,214,157,179,214,193,226,195,101,157,214,213,231,157,78,53,31,127,213,116,49,53,37,69,214,210,35,8,24,16,26,22,19,15,21,23,27,28,38,46,44,53,75,89,114,159,165,155,114,95,97,101,107,66,23,31,51,54,54,42,40,39,34,31,27,35,36,38,34,29,46,38,89,162,178,190,146,95,42,12,15,8,13,14,13,21,21,29,36,39,44,24,20,18,22,18,56,135,107,122,125,58,49,51,47,47,44,37,42,40,35,35,32,31,28,30,27,26,23,24,22,18,17,17,17,16,17,19,18,14,19,17,18,17,17,16,16,16,20,26,21,22,21,24,24,21,28,29,33,27,16,22,17,15,21,19,29,46,64,76,67,58,41,21,14,26,34,17,20,33,27,14,18,19,17,29,35,22,17,29,41,37,19,22,14,31,89,105,120,146,128,83,41,20,17,24,32,27,18,21,21,15,19,17,20,22,20,24,34,38,27,22,16,16,16,20,21,22,22,23,21,16,22,17,24,18,19,23,22,26,23,30,29,27,32,41,42,47,50,53,61,66,66,69,70,69,66,70,74,69,69,77,73,71,81,77,84,97,99,113,129,136,147,162,167,161,150,145,135,129,129,132,141,151,153,155,158,159,132,116,129,130,84,28,20,14,15,17,27,41,59,106,139,132,99,69,59,48,36,46,52,53,65,69,63,78,71,65,53,20,25,67,114,144,172,200,225,234,237,245,246,247,244,240,231,222,217,215,212,209,212,211,211,209,209,212,210,212,213,215,212,215,118,5,0,7,10,11,11,14,12,12,14,14,14,199,200,199,198,196,194,196,196,198,197,197,197,195,197,196,196,197,198,195,194,195,196,199,197,196,198,196,197,199,195,200,199,195,200,199,201,201,203,201,198,198,199,200,199,199,199,202,203,200,202,201,202,200,200,202,200,201,200,202,202,202,202,203,203,201,200,199,200,199,198,198,198,199,196,199,198,199,202,199,200,200,203,200,200,202,202,203,204,203,204,205,204,201,203,204,202,203,201,202,201,202,202,201,202,203,200,201,200,200,200,197,200,200,202,200,199,200,200,201,201,203,200,201,203,201,201,204,202,202,201,202,203,200,203,201,201,201,199,201,202,201,200,200,202,203,202,202,202,201,203,203,202,201,200,202,201,201,202,201,202,205,203,202,202,201,202,202,201,203,200,204,205,204,204,203,205,203,203,203,201,203,204,201,201,202,201,205,201,200,203,202,202,202,203,200,201,203,201,202,203,203,200,199,201,202,202,206,203,201,203,203,202,202,203,204,205,202,202,203,203,203,202,204,201,202,203,204,202,201,203,199,200,203,201,202,200,201,202,200,202,201,203,204,202,202,200,201,201,200,200,201,201,201,200,198,199,200,200,200,200,202,203,193,194,201,201,201,200,200,200,200,196,195,200,203,202,200,202,200,201,200,202,199,201,203,204,191,191,201,200,200,203,193,198,206,203,206,203,203,202,201,203,203,204,203,193,193,203,205,205,200,210,195,175,214,108,23,43,77,110,96,116,136,132,110,90,92,76,64,64,57,61,119,95,39,108,135,105,94,66,57,45,45,74,76,80,84,81,112,150,194,151,118,212,202,116,118,90,99,118,71,62,36,138,225,205,207,151,182,212,193,231,160,77,183,221,226,217,113,70,42,50,174,244,101,63,42,33,163,249,243,57,2,12,12,17,17,24,17,16,25,30,35,51,46,49,74,79,93,56,61,105,142,128,79,81,86,92,63,41,31,43,53,50,50,50,51,43,34,28,29,31,29,26,34,49,85,136,154,134,77,43,21,15,9,13,14,13,16,29,33,44,58,53,59,46,24,24,16,23,16,81,136,98,126,122,56,27,21,19,22,20,23,21,18,25,21,18,22,19,19,21,22,18,16,15,17,17,15,14,18,18,18,23,17,19,17,21,18,16,20,19,29,28,20,24,22,22,21,20,22,16,26,37,37,28,25,20,20,19,24,56,99,118,109,99,87,58,25,23,33,22,15,26,30,23,20,17,18,20,29,30,21,19,19,32,47,29,19,14,55,131,158,132,108,95,78,61,28,20,33,25,15,24,24,17,17,16,16,18,23,23,16,19,33,36,31,18,17,15,17,28,27,26,32,29,21,24,19,19,16,16,20,22,23,25,26,26,26,20,25,29,25,27,30,30,31,36,40,39,46,48,53,55,62,69,79,84,87,92,82,81,79,77,71,65,66,59,107,64,67,69,70,66,69,72,68,101,105,91,91,99,101,74,86,120,130,85,38,17,14,17,17,20,18,31,56,89,104,104,102,87,64,43,36,42,48,54,56,54,57,57,61,59,47,46,41,44,60,76,105,132,143,156,177,203,213,233,245,246,241,227,220,217,216,213,213,214,212,211,212,214,215,212,216,217,219,119,6,1,7,11,12,11,14,13,15,15,15,15,198,201,198,200,201,196,198,197,197,196,195,197,196,197,198,195,193,196,198,197,198,196,198,197,198,196,194,199,194,196,198,196,199,199,203,203,202,203,200,200,200,201,200,198,200,200,200,200,200,204,202,201,202,203,203,205,204,203,204,204,205,204,203,203,201,202,200,201,199,199,200,199,198,198,200,200,203,202,203,201,201,201,202,201,199,200,203,205,202,203,202,205,205,202,202,201,200,204,202,200,200,202,203,201,201,202,202,203,202,201,199,200,200,198,201,199,200,199,200,202,201,200,200,202,201,202,199,202,202,201,200,200,203,201,202,202,203,202,199,200,200,200,200,200,202,202,201,201,204,203,203,202,201,203,202,203,201,202,203,202,203,200,202,203,201,200,202,200,203,203,203,202,204,203,204,205,200,203,204,203,204,203,201,202,201,203,200,203,202,200,203,202,202,201,199,202,203,200,202,202,202,202,201,200,201,199,200,203,201,201,202,203,201,203,202,202,201,200,200,199,202,202,200,201,200,202,201,200,200,200,202,197,200,201,200,201,200,200,202,201,199,199,200,200,199,201,200,201,202,199,200,200,201,200,201,202,201,199,198,200,203,201,193,196,202,200,201,200,200,201,198,194,197,202,201,201,202,202,199,201,200,200,199,200,201,203,195,185,198,202,200,200,191,196,203,202,203,204,204,202,202,202,201,201,203,194,190,199,203,202,199,207,189,179,231,153,76,69,91,122,96,101,109,110,69,20,28,28,30,41,36,30,105,88,13,48,78,127,87,35,55,14,36,67,63,48,29,59,107,137,166,104,121,221,193,118,81,66,83,91,74,66,34,149,227,206,204,150,191,208,193,231,109,81,211,221,230,148,69,69,46,35,127,138,69,62,34,128,217,246,231,91,49,15,11,25,17,17,21,25,34,35,49,49,51,72,80,94,95,59,46,63,115,135,93,59,62,77,82,73,47,39,51,53,44,46,51,43,45,40,34,32,28,33,71,135,159,173,125,50,33,13,12,14,11,14,15,31,40,51,53,53,55,59,53,34,21,18,16,22,19,103,133,98,138,116,45,14,11,14,14,14,17,15,15,17,21,22,21,20,24,24,19,20,24,22,18,15,16,17,15,21,21,24,23,22,24,24,20,19,16,33,36,18,19,24,26,17,16,21,21,19,26,22,29,41,35,23,18,19,31,110,155,156,136,116,111,69,46,30,27,22,22,26,14,17,25,17,22,29,19,17,25,23,17,31,48,40,29,14,68,160,157,140,109,104,112,66,37,29,23,14,18,26,19,21,18,16,18,18,24,23,18,18,21,40,44,25,20,18,24,42,49,59,63,51,46,33,17,18,16,23,27,29,26,19,22,30,23,24,28,25,25,20,21,19,23,21,18,22,23,26,27,23,30,37,43,44,44,55,56,59,58,57,64,61,61,63,64,64,71,66,68,75,72,75,69,73,71,66,60,61,63,49,90,109,119,95,37,25,15,15,16,19,22,17,23,39,60,89,119,115,87,59,45,37,36,35,43,40,50,48,49,52,43,57,66,67,65,61,59,63,64,73,81,102,122,155,197,223,246,246,235,227,220,216,216,218,216,217,215,217,219,218,222,223,222,117,6,1,7,11,13,12,14,13,15,15,15,15,194,198,199,199,199,196,196,196,196,196,197,198,198,196,194,195,195,197,196,195,198,199,198,196,197,198,197,196,198,195,196,199,200,201,201,205,207,205,204,203,202,204,203,202,201,201,200,201,203,202,204,204,201,203,205,202,205,207,205,204,205,204,205,203,203,205,200,200,203,202,200,198,199,201,205,205,206,207,202,204,203,204,205,203,205,203,203,202,200,202,202,203,200,201,200,202,204,202,204,203,203,203,203,205,205,201,204,203,203,203,200,201,200,199,199,199,199,202,200,199,201,199,200,200,200,203,203,203,202,200,202,200,202,204,199,199,202,202,201,200,200,200,201,200,200,202,200,202,200,201,202,203,202,201,201,202,203,201,201,202,202,201,201,201,202,199,202,202,201,202,200,202,203,203,203,202,201,200,200,202,201,201,200,201,200,200,202,201,201,201,199,201,201,203,202,200,203,203,199,201,203,202,201,202,202,201,201,203,201,201,202,200,203,204,204,202,199,200,200,200,200,200,201,198,200,198,200,200,199,201,198,199,200,199,199,200,200,200,199,201,198,197,199,200,199,198,199,198,199,198,199,200,200,199,201,200,199,200,201,199,203,200,193,200,201,199,198,199,201,202,196,196,201,202,199,199,202,200,199,201,200,200,199,199,199,204,195,186,195,200,198,199,194,195,203,201,202,201,203,201,199,200,199,199,201,198,189,194,200,201,200,208,181,184,233,180,125,69,70,103,76,89,81,76,46,19,26,18,21,27,46,32,112,104,49,89,125,176,110,55,38,12,55,60,57,27,11,42,83,154,189,112,140,188,159,134,87,76,77,81,74,68,38,163,235,211,201,147,199,205,206,220,76,106,224,227,170,69,49,35,14,27,32,36,31,55,143,220,237,205,169,197,224,198,115,21,14,14,23,31,36,45,43,56,74,81,80,76,85,78,91,94,96,140,97,62,46,51,74,74,60,32,35,41,54,51,41,46,45,42,35,37,48,103,159,182,160,89,48,33,19,13,13,12,18,27,45,55,54,57,54,60,60,54,52,28,17,17,15,20,35,129,133,103,141,103,36,10,10,14,16,14,16,16,20,24,21,18,24,21,17,23,23,17,23,35,31,23,19,15,19,20,33,36,29,30,27,27,22,18,29,36,23,19,27,24,22,18,20,19,24,27,24,22,16,31,55,36,19,18,50,144,170,174,120,101,117,72,56,35,25,31,22,19,14,19,21,30,35,19,16,14,19,26,33,31,33,52,32,24,65,113,105,111,108,73,73,48,39,32,19,19,21,16,14,19,24,20,24,19,18,19,21,23,16,33,50,35,22,17,49,92,106,108,97,84,71,46,24,15,26,32,28,19,22,23,18,20,25,21,19,24,30,29,28,25,17,19,17,19,24,20,20,22,22,19,22,20,23,27,31,30,27,34,32,37,38,38,44,42,46,48,54,57,58,69,61,65,70,72,68,67,70,77,113,106,105,96,49,21,16,18,17,18,20,20,22,20,24,40,51,69,73,66,59,50,43,31,31,35,42,45,41,41,37,45,60,72,88,84,81,66,60,60,63,63,54,64,97,150,212,233,245,241,230,218,217,218,220,222,220,220,222,224,225,224,225,119,6,1,8,11,12,12,14,12,14,15,14,14,193,199,195,198,197,194,196,195,196,196,196,197,195,194,195,194,196,194,193,195,195,194,198,195,196,198,197,200,195,195,198,200,201,199,204,203,205,205,203,204,203,203,205,203,201,203,204,203,205,206,205,205,204,203,204,203,204,205,207,208,206,208,206,206,206,206,205,206,204,204,205,203,203,203,204,205,206,203,205,205,203,205,203,202,206,206,206,204,203,205,202,203,201,202,201,203,202,203,202,202,204,204,204,202,203,203,202,200,201,200,198,199,200,199,198,199,201,200,199,201,199,198,201,201,200,201,198,201,200,201,202,200,201,200,203,201,199,201,201,200,199,202,200,199,201,199,201,201,202,201,202,201,199,200,198,200,201,199,202,202,202,203,200,201,200,201,202,200,201,200,201,202,204,201,199,201,200,200,201,199,199,201,200,202,201,201,200,203,201,201,201,197,202,200,200,202,198,199,200,201,199,201,201,202,201,200,201,199,201,202,202,200,200,203,199,201,204,200,200,200,203,201,200,201,200,201,200,201,200,199,201,202,200,196,198,201,199,197,199,199,199,200,198,199,198,199,199,200,199,195,200,199,196,198,199,198,197,199,199,198,201,200,198,199,198,200,200,199,200,197,191,197,200,200,199,201,199,202,199,199,200,197,199,198,199,200,198,185,193,201,200,198,191,193,200,203,200,199,198,198,199,199,198,200,202,198,193,189,200,200,203,205,173,190,225,200,169,84,59,73,61,70,70,54,13,26,27,24,35,29,54,32,97,89,30,79,145,184,102,84,50,25,63,50,29,18,93,159,157,186,193,130,139,142,167,156,102,104,75,72,78,60,29,159,230,208,191,143,205,200,219,203,63,156,241,169,77,9,7,9,27,27,39,23,23,135,221,248,212,125,171,252,252,252,215,51,4,30,43,54,57,43,45,65,93,84,82,88,83,86,100,99,92,122,100,66,51,40,55,62,55,30,21,24,39,60,51,40,27,27,32,55,127,158,183,136,47,24,15,17,16,11,17,24,44,53,54,58,59,59,62,61,60,62,51,21,20,16,25,16,47,146,121,115,137,83,33,13,12,16,22,20,17,20,30,22,22,31,21,16,19,18,18,24,27,25,32,32,23,18,15,37,45,50,57,52,53,51,31,27,40,24,17,26,21,17,21,21,17,20,23,19,25,25,23,20,39,49,32,16,73,139,129,117,104,99,88,61,53,35,29,31,24,19,19,16,23,34,32,20,19,19,17,27,35,29,22,49,40,28,63,82,81,127,100,77,62,31,49,24,23,25,13,19,17,19,23,32,24,15,19,15,21,32,24,23,40,44,27,30,111,168,166,157,117,104,91,53,26,24,27,20,22,26,24,19,18,18,17,27,32,21,23,27,37,27,21,20,16,21,26,29,27,22,24,22,17,17,20,22,15,21,20,24,27,26,25,24,29,24,26,24,23,29,29,34,29,35,36,39,39,44,41,52,107,97,106,106,46,28,19,19,22,22,31,25,21,19,21,20,22,29,42,48,52,55,44,41,36,31,41,39,33,33,39,42,48,64,72,76,74,76,64,63,57,54,56,51,51,66,127,170,205,236,227,219,221,223,225,225,222,221,224,227,227,224,224,118,6,2,8,11,12,12,15,14,15,16,15,15,196,199,196,197,197,194,198,197,196,196,195,195,197,194,193,196,192,197,196,195,196,196,199,197,198,199,200,198,199,198,198,198,200,203,201,201,203,203,203,204,203,204,204,205,203,203,203,205,205,202,204,205,203,205,207,206,203,202,203,204,208,208,208,206,208,209,205,205,208,207,208,211,205,203,203,204,202,205,204,204,205,201,204,203,203,204,206,205,205,201,204,205,200,203,200,201,203,200,202,202,201,203,203,200,202,202,203,201,201,203,200,201,200,199,199,198,199,199,198,199,200,200,201,201,201,199,196,200,199,199,200,202,202,201,200,199,200,197,198,199,200,199,199,200,200,200,198,201,199,198,202,203,199,200,199,200,199,200,201,200,201,200,200,201,201,198,200,200,198,201,200,199,204,199,200,201,200,202,199,199,200,201,201,203,201,201,200,200,202,200,199,199,200,199,200,199,200,201,198,200,200,200,199,200,199,199,202,199,199,200,202,200,198,199,200,199,198,199,197,200,198,198,201,196,198,199,199,201,199,201,199,197,200,200,199,200,199,197,197,196,199,198,198,202,198,198,198,199,200,198,198,198,198,197,196,199,197,198,199,200,201,194,200,200,199,200,200,200,199,194,192,200,198,200,199,198,200,195,199,198,196,196,196,196,196,200,198,185,186,200,201,198,190,189,200,199,197,198,196,196,198,199,199,198,199,200,195,187,196,201,202,199,173,199,217,198,194,132,99,81,66,77,72,41,4,23,36,62,40,35,60,38,67,69,12,45,137,146,73,57,19,33,66,55,52,69,180,220,201,212,184,129,143,165,207,171,103,122,90,77,78,50,25,158,224,205,187,147,208,196,224,159,63,213,202,74,24,24,24,44,33,36,43,29,117,211,242,242,145,130,210,249,249,249,163,71,75,105,124,108,62,41,49,87,93,66,109,101,96,95,97,97,88,116,95,66,49,68,79,62,63,35,34,24,27,38,44,39,29,39,66,118,154,154,87,32,21,17,15,13,14,23,43,55,63,59,56,61,62,63,63,56,52,41,27,15,18,19,24,12,70,149,107,114,132,71,25,16,15,17,21,16,21,27,26,22,29,25,18,15,14,17,21,22,28,26,17,35,33,23,15,36,64,71,92,102,106,79,34,33,36,18,29,27,17,17,20,22,27,28,19,18,17,20,30,28,27,47,46,20,61,111,81,78,85,104,89,41,44,39,22,18,24,34,21,24,34,17,21,24,23,23,22,22,19,29,35,55,47,31,55,81,89,111,133,98,52,33,36,26,16,26,24,16,20,22,26,25,19,22,17,16,26,24,24,16,37,57,25,64,156,165,173,156,98,112,92,63,46,27,23,14,24,27,21,18,17,20,18,34,28,16,21,22,38,35,26,22,17,21,24,27,28,27,27,23,21,18,21,20,19,25,26,30,26,28,31,24,25,25,23,24,23,17,25,21,20,23,21,24,21,26,24,43,98,96,108,106,58,31,19,14,32,46,50,55,41,27,28,19,15,19,22,20,26,30,39,37,34,39,37,50,45,33,37,39,45,49,59,59,54,56,59,55,55,59,61,67,51,30,50,69,118,199,217,219,224,221,225,223,224,225,225,224,225,225,224,118,6,2,8,12,13,12,15,14,15,15,16,15,198,198,197,198,196,196,196,196,196,198,195,196,195,193,195,194,197,194,195,198,195,195,200,199,199,200,198,199,196,197,198,196,199,200,203,201,202,203,203,204,204,203,204,200,200,204,202,200,201,203,204,203,203,204,203,205,203,203,202,205,205,207,207,208,208,207,208,207,207,208,211,207,206,205,203,206,205,205,205,203,204,205,206,203,205,205,204,205,203,204,204,203,203,203,204,205,201,204,206,203,201,201,202,203,203,202,202,199,202,203,201,204,200,198,199,198,199,199,198,200,198,198,202,199,201,200,199,200,199,198,201,200,200,200,199,201,200,201,200,198,201,199,198,201,200,199,200,200,201,200,200,201,200,200,199,202,201,198,200,200,199,201,199,199,200,200,199,196,199,199,200,201,202,200,199,200,200,201,202,202,202,201,202,201,201,201,199,200,197,200,202,198,200,198,199,200,199,200,201,200,198,198,198,200,200,198,201,199,200,199,200,199,198,198,196,199,198,198,198,198,199,197,198,198,199,200,197,196,196,197,197,196,198,197,198,196,198,196,199,198,196,198,195,199,197,198,196,197,198,194,200,200,199,199,198,199,197,198,200,199,193,194,198,199,198,198,198,196,195,193,196,199,198,198,198,198,198,196,196,199,196,195,200,195,196,195,198,186,181,198,199,197,189,190,199,199,196,198,196,197,198,196,196,196,200,197,199,187,190,198,206,191,170,204,198,174,158,134,134,135,146,133,145,112,61,83,88,124,83,60,91,38,78,66,4,53,147,130,112,150,77,49,57,116,185,184,223,206,181,163,154,141,133,188,215,121,94,123,95,99,74,38,24,158,226,209,178,150,218,200,232,131,67,204,98,29,6,73,187,117,82,53,36,125,227,251,213,157,159,196,246,225,214,155,76,97,119,130,142,104,62,80,102,83,65,69,110,101,90,98,92,102,100,108,89,75,70,65,65,56,53,37,32,28,19,25,22,33,36,96,148,146,143,71,27,21,12,16,11,14,27,48,60,57,61,59,59,61,68,71,66,62,44,29,22,19,17,21,29,12,95,154,99,122,122,57,25,15,16,21,21,22,30,31,27,26,21,16,17,22,16,19,27,21,21,27,22,23,39,35,19,69,102,99,104,101,81,51,46,38,28,24,23,21,17,24,19,27,36,22,18,18,16,22,30,32,22,40,49,22,48,81,80,80,117,149,81,36,42,26,15,15,17,28,41,36,20,22,16,19,33,34,23,15,16,23,34,71,57,24,48,68,53,87,102,66,35,33,37,17,15,15,24,26,22,27,19,14,22,24,21,27,26,14,22,24,40,66,35,104,158,143,139,114,105,89,76,59,37,26,15,20,21,17,20,20,23,26,23,21,27,23,26,22,29,44,31,23,17,26,34,39,41,37,32,31,31,26,19,25,30,32,25,22,25,21,27,25,20,26,29,36,27,19,23,18,21,23,18,21,17,21,24,55,112,103,118,112,61,37,15,13,30,56,78,78,69,57,43,33,23,19,19,19,21,19,19,30,29,35,43,61,61,46,38,36,41,45,46,50,50,47,51,51,57,60,56,50,51,37,50,36,57,163,202,219,227,221,224,225,227,226,226,225,226,225,224,117,6,2,7,12,13,12,15,13,15,15,16,15,197,199,195,198,197,196,196,197,197,196,198,194,196,194,195,198,197,198,197,197,196,198,200,200,200,200,197,198,201,197,196,197,198,202,201,202,204,201,203,200,202,205,202,203,198,202,204,203,203,198,201,204,204,204,203,204,204,205,205,206,207,207,209,208,209,203,203,205,205,205,204,206,203,207,207,206,207,207,205,208,203,205,207,204,206,203,205,203,202,203,202,204,202,205,203,201,204,201,203,203,204,205,204,201,201,200,200,201,199,201,201,202,200,200,199,198,198,199,201,200,198,199,199,196,199,198,196,200,200,200,199,198,200,199,202,201,199,201,199,198,199,198,198,198,200,203,200,199,200,200,200,200,199,200,199,198,199,200,198,198,199,199,198,200,200,199,200,200,200,199,200,198,200,197,199,199,199,201,199,200,200,199,198,200,200,200,198,197,198,199,200,200,197,198,200,198,199,199,198,198,198,198,198,199,199,197,198,198,199,199,197,198,197,199,198,195,199,198,196,199,198,198,198,197,199,199,194,198,195,196,199,196,197,197,198,198,198,197,199,199,200,198,195,199,197,199,198,195,196,192,197,196,197,199,195,198,196,196,197,198,196,192,201,198,196,198,195,196,191,192,196,199,196,196,196,195,194,193,198,193,194,194,195,196,195,197,198,190,179,194,198,198,187,184,197,197,196,194,196,196,193,194,194,195,198,196,196,189,185,191,205,184,173,203,181,150,132,138,174,179,199,193,208,199,126,105,178,207,177,115,125,59,132,94,5,84,166,137,177,247,153,56,57,170,232,221,228,168,140,124,169,146,116,178,174,92,83,123,96,101,73,32,24,156,226,208,175,160,226,217,248,141,70,115,30,5,43,129,237,160,110,34,99,225,251,242,149,172,217,252,250,189,117,66,77,114,107,117,88,53,89,136,100,65,57,77,90,86,97,85,79,107,101,89,89,83,59,64,50,36,42,32,30,26,14,21,26,20,52,146,157,145,112,49,17,12,15,13,15,19,51,66,59,59,59,62,63,62,64,71,74,83,74,56,38,23,22,22,26,27,128,145,100,127,107,56,30,31,41,42,33,29,37,33,29,18,15,17,22,24,23,29,24,13,20,26,29,33,37,41,28,104,149,134,113,74,85,66,49,44,18,14,17,26,25,21,31,26,19,27,23,20,27,27,19,29,25,45,53,32,21,60,87,94,152,125,53,38,40,27,22,19,16,35,37,32,24,17,18,22,39,39,24,16,17,18,40,92,72,28,47,62,32,43,55,41,27,35,35,15,16,16,16,33,36,17,18,18,19,29,35,27,19,14,19,20,50,78,43,107,157,111,96,101,93,103,67,36,37,19,29,23,12,19,23,20,33,30,21,20,15,27,33,31,28,39,44,28,16,39,54,64,77,71,66,70,51,24,29,35,29,28,28,22,23,22,22,27,33,33,28,35,33,26,23,19,23,18,21,22,18,22,23,75,128,110,111,107,71,31,19,17,23,50,79,92,85,79,71,53,43,36,21,17,18,17,17,26,19,27,41,49,64,69,58,44,40,37,37,51,56,54,54,53,50,51,48,49,54,45,67,53,65,165,203,221,231,223,224,225,225,223,222,228,228,223,225,118,6,2,8,12,12,12,15,14,15,16,15,16,196,197,196,197,196,198,197,195,197,194,196,199,196,198,199,198,201,198,198,200,198,200,203,203,203,200,202,202,199,201,201,201,197,199,201,200,201,202,202,202,203,200,201,202,202,203,202,200,205,204,201,202,201,202,204,205,203,203,204,203,206,208,206,206,205,206,205,205,206,204,207,206,205,207,205,205,207,207,206,205,206,205,205,203,204,205,204,203,203,203,201,203,207,204,200,201,201,204,203,204,204,203,203,201,204,201,201,202,200,201,200,200,200,199,201,201,198,199,199,199,198,200,202,197,199,196,199,203,200,200,201,199,199,200,199,200,199,199,200,198,200,201,200,199,199,196,199,198,199,200,197,200,198,201,199,201,202,196,199,198,199,198,197,199,199,198,199,200,200,198,198,196,199,199,197,200,198,198,198,198,198,198,199,198,199,199,198,199,196,198,199,196,199,199,198,198,198,199,199,198,199,198,198,198,198,197,197,196,199,199,198,198,198,199,198,200,198,199,198,198,200,198,200,199,199,196,198,198,196,200,198,199,199,198,200,198,200,198,196,198,198,197,198,199,198,199,197,199,195,195,198,195,197,198,196,198,196,199,201,198,190,195,199,200,198,199,198,193,192,195,196,197,197,198,195,193,195,194,194,195,193,193,195,192,196,194,196,191,179,192,197,200,186,181,194,194,195,194,194,194,194,195,193,195,194,192,196,193,185,188,204,174,179,202,177,175,159,177,186,183,198,172,189,204,159,82,139,243,208,134,117,67,141,121,50,129,181,128,174,207,113,101,122,171,189,177,155,129,152,149,198,136,98,182,171,111,99,92,84,102,70,42,24,155,225,212,178,166,234,198,201,152,69,44,4,7,26,143,143,87,72,69,189,250,252,189,159,223,252,252,249,129,43,63,108,130,118,76,41,77,133,134,83,61,69,112,92,60,105,117,99,104,90,80,105,105,93,88,71,59,53,33,29,25,16,21,24,37,97,144,126,84,54,28,15,15,10,15,15,32,60,63,63,61,63,59,69,66,46,41,39,60,78,89,81,46,18,22,17,46,147,134,98,128,92,41,29,66,100,78,41,32,37,31,29,19,13,18,19,27,31,27,18,15,16,23,41,34,37,41,39,134,150,134,128,99,106,67,53,41,16,13,15,19,29,38,27,23,13,19,26,27,37,22,16,21,25,49,71,36,23,60,73,73,61,44,37,36,46,28,23,24,34,33,20,31,30,26,30,35,27,22,31,23,17,37,45,94,77,25,60,54,26,33,33,27,27,37,39,20,15,16,24,24,23,27,18,14,21,32,32,21,18,16,24,23,68,93,40,78,116,101,87,83,153,126,46,39,29,26,24,21,16,16,22,27,28,25,19,21,18,19,33,31,23,30,47,34,19,55,84,117,127,124,135,98,45,28,36,24,26,30,28,22,21,21,23,27,34,32,24,27,33,41,28,18,29,26,26,27,27,28,22,83,127,102,103,103,77,42,20,16,26,54,76,93,89,80,83,76,67,57,41,27,20,21,22,21,21,22,25,33,56,76,74,65,48,33,43,54,55,50,46,39,42,47,61,71,65,54,61,43,89,204,223,226,228,222,223,223,225,222,224,226,228,223,223,118,5,2,9,13,12,12,15,14,14,15,15,15,198,198,197,199,195,197,195,198,199,195,198,198,199,197,198,197,198,199,199,200,200,202,203,203,203,204,205,202,201,200,203,200,200,203,203,202,203,201,202,202,202,203,200,200,199,200,198,197,201,199,200,202,199,200,203,204,203,203,201,205,207,205,207,206,207,206,208,208,207,206,205,207,207,207,205,203,208,206,204,208,203,205,206,203,206,201,203,206,203,203,204,203,203,205,204,204,205,204,205,204,202,203,203,203,203,203,202,202,203,201,203,202,198,203,200,199,200,201,200,199,201,199,198,198,200,200,198,198,199,200,198,201,200,198,200,199,198,199,198,198,199,195,198,200,198,199,199,198,197,198,196,197,198,196,198,196,198,199,198,198,197,200,198,197,198,197,199,199,198,197,198,195,198,195,196,195,193,197,194,197,197,197,196,198,201,198,199,197,196,197,197,199,196,194,196,196,198,198,198,196,198,197,193,198,198,196,199,198,199,198,198,198,198,199,197,198,198,196,196,199,197,196,199,198,197,198,198,200,199,199,199,198,199,199,198,198,199,197,199,198,198,199,197,199,197,198,199,195,198,197,198,199,197,199,196,197,199,198,199,197,194,193,197,198,199,197,197,194,194,200,198,197,196,198,195,194,196,193,199,195,194,195,195,191,191,194,197,196,180,187,195,198,187,179,195,191,191,195,194,194,193,194,192,193,193,193,196,196,184,189,200,171,186,194,175,175,168,178,178,159,160,149,172,212,200,121,78,157,167,139,110,61,122,81,17,113,182,131,130,104,99,156,184,192,145,136,133,147,166,170,195,119,155,210,184,149,100,81,75,92,68,44,25,151,227,209,181,151,181,128,130,118,39,17,6,17,26,49,53,40,83,173,233,250,218,169,207,250,251,251,190,45,44,90,132,132,91,42,45,113,142,93,71,46,85,140,77,93,170,162,109,103,92,101,110,117,118,97,84,73,65,46,43,30,26,26,38,88,144,135,67,36,18,15,16,17,23,20,20,41,69,71,62,63,66,72,86,78,42,25,27,31,39,65,99,80,30,21,15,58,157,126,107,123,73,39,22,84,119,73,47,42,37,18,22,27,21,17,28,27,18,23,24,14,21,26,31,34,43,50,49,130,131,110,121,94,98,57,46,41,12,14,14,21,33,36,31,16,16,23,26,37,24,20,19,17,18,60,89,43,30,59,45,47,38,31,30,36,46,28,24,37,26,22,16,22,32,45,47,23,20,18,24,42,57,34,39,109,71,28,62,44,32,36,26,26,31,39,33,21,21,32,30,17,20,28,25,29,37,22,17,26,27,23,34,36,79,93,37,41,81,98,101,137,170,96,31,35,29,16,19,19,22,24,22,27,21,21,23,19,19,34,29,27,34,30,54,40,37,82,102,149,122,100,99,72,48,37,33,22,33,25,23,25,24,18,31,37,19,29,27,22,28,45,38,20,24,28,30,29,26,31,26,80,125,102,103,107,87,46,21,16,21,46,63,81,86,92,87,87,87,76,67,50,36,27,21,21,23,20,22,24,34,51,69,73,72,51,39,42,32,39,43,46,51,48,74,87,73,57,57,57,148,231,231,228,225,222,224,223,224,224,224,225,229,224,225,118,6,2,7,12,14,13,14,14,15,15,15,15,199,203,201,203,200,198,200,199,200,199,201,203,198,201,200,198,201,201,201,201,201,202,202,203,201,203,203,200,200,200,200,200,200,203,203,202,204,199,200,200,200,203,199,201,203,202,203,200,200,202,198,202,203,203,203,203,204,203,205,205,205,205,205,208,206,208,208,205,210,208,207,206,207,208,208,207,206,207,208,207,207,207,206,204,203,203,204,203,203,206,205,203,206,202,205,205,202,205,205,205,200,200,204,201,204,202,202,203,201,200,201,202,200,200,200,203,202,201,200,198,201,199,199,198,200,199,198,202,198,196,199,196,200,200,198,199,199,198,198,200,199,198,199,199,199,196,197,197,197,198,196,198,196,198,198,198,197,198,198,196,199,197,195,196,195,198,198,196,196,198,197,194,196,196,195,195,195,198,197,195,197,198,195,193,198,198,196,196,195,195,199,197,196,196,195,198,195,195,196,195,196,194,198,198,197,196,195,198,197,196,197,197,197,196,198,196,195,198,196,198,196,197,198,195,199,198,198,198,198,198,197,198,198,198,198,195,198,199,198,198,200,198,198,198,197,197,197,201,196,198,199,199,200,198,194,196,198,196,200,193,191,195,196,199,200,197,194,193,196,199,196,196,194,196,197,196,195,193,195,195,195,195,195,195,195,195,197,195,182,183,193,199,187,177,193,196,194,195,193,194,192,192,193,191,194,195,192,196,189,188,192,170,184,176,140,133,123,137,133,120,145,166,191,220,220,185,115,66,94,124,104,66,98,73,4,76,161,125,77,73,151,194,208,170,143,168,160,170,159,164,184,132,198,210,193,162,86,101,85,78,61,35,23,153,230,224,174,83,132,124,69,39,7,29,43,47,50,55,39,53,183,250,252,214,179,203,245,250,247,226,81,26,63,105,133,99,45,55,76,105,92,71,63,23,71,140,137,186,219,153,111,105,101,121,100,85,79,77,77,66,61,46,42,33,34,30,54,125,149,98,34,19,14,10,17,35,46,25,14,48,69,66,66,69,81,92,90,60,35,27,23,20,32,44,82,109,42,25,13,79,165,107,117,116,57,43,61,99,103,76,62,44,26,14,12,24,29,32,26,14,17,18,27,31,27,22,15,20,50,60,46,95,92,85,105,101,83,40,46,33,17,17,15,29,19,22,29,26,18,29,30,17,26,24,20,23,28,66,92,40,34,55,29,27,26,27,27,31,49,33,31,34,30,24,17,19,36,53,49,30,23,18,27,60,57,29,54,120,68,24,61,38,26,39,23,28,26,39,38,23,38,31,21,19,22,25,36,39,27,19,15,19,27,52,48,25,90,96,35,35,71,87,88,75,61,43,35,35,17,16,18,19,29,39,27,16,16,18,27,29,33,25,16,19,28,39,69,49,56,117,127,152,126,92,110,85,51,38,35,33,22,18,23,22,29,36,32,21,16,27,28,30,33,45,42,27,35,38,43,43,51,57,40,108,134,111,101,104,94,44,24,16,18,20,36,74,96,112,112,98,93,86,81,76,57,45,27,17,19,22,19,18,26,24,37,62,73,71,56,50,53,48,49,52,55,65,72,66,69,62,57,137,230,239,234,224,225,222,224,225,221,223,224,226,229,224,224,117,5,2,7,11,13,12,13,12,15,16,15,15,203,205,205,204,203,206,201,203,202,200,204,200,201,203,203,202,203,203,201,201,203,203,203,201,202,203,200,205,203,201,202,201,202,200,201,200,202,203,199,200,201,199,202,202,200,203,202,200,203,203,205,204,202,204,202,201,205,204,203,204,205,207,207,208,207,207,207,208,209,210,208,208,209,209,207,205,212,208,208,213,207,208,210,208,208,205,205,204,205,205,205,206,205,203,203,207,203,203,203,201,203,202,203,203,201,200,201,201,202,201,201,201,200,203,200,199,199,200,200,200,200,197,200,200,198,200,200,199,199,198,199,198,196,198,200,198,199,199,200,198,200,199,196,199,198,198,199,198,198,198,197,198,198,198,198,197,198,199,197,198,197,196,199,196,197,198,196,196,196,195,200,198,197,196,199,197,195,199,195,198,197,197,197,196,199,195,195,196,196,196,198,198,195,199,198,195,196,196,197,191,196,198,193,198,198,194,196,198,196,194,194,198,197,196,195,197,198,197,198,197,196,198,198,198,197,198,197,196,196,196,198,199,197,199,198,198,199,194,199,198,197,197,195,197,197,198,197,195,197,196,198,200,198,198,197,194,198,199,197,191,192,196,198,199,200,197,188,193,198,200,195,190,194,196,194,195,195,192,196,193,193,198,195,195,193,193,194,196,186,182,192,199,194,180,192,197,193,198,193,194,195,194,193,195,193,191,193,198,190,192,183,168,188,160,128,124,124,138,145,136,168,191,188,212,216,210,185,85,26,70,100,81,101,69,5,55,116,109,60,104,191,189,185,151,166,195,162,134,138,189,169,118,188,156,178,146,77,119,91,83,61,30,19,155,236,217,171,72,122,120,57,34,4,117,171,87,69,60,46,98,231,249,203,171,205,229,247,247,215,99,45,51,81,92,98,53,81,122,117,71,57,64,41,13,34,102,166,215,195,133,109,109,100,114,91,53,51,65,75,57,59,50,39,37,29,34,85,141,131,74,22,17,12,12,18,47,67,32,19,51,73,74,78,96,111,105,68,31,31,20,11,15,22,35,85,116,53,27,21,113,157,94,121,103,50,41,92,120,102,91,61,45,24,14,17,19,36,27,19,15,15,14,24,41,27,15,21,12,46,81,41,62,69,68,131,132,72,32,37,29,15,26,29,21,20,17,26,32,41,33,17,15,16,29,34,46,25,63,100,39,41,51,27,29,32,22,27,34,46,46,27,22,22,28,26,36,37,30,46,48,35,29,57,42,33,40,78,131,60,32,67,40,37,49,33,33,36,47,36,26,28,23,30,28,18,34,39,29,27,20,17,24,43,49,34,29,101,96,37,41,66,50,39,44,31,22,37,39,21,17,17,18,35,34,22,21,19,24,31,36,29,26,22,17,24,49,89,51,84,129,134,144,105,116,113,80,56,45,36,27,23,23,19,24,36,39,24,21,16,17,28,47,35,41,48,33,48,60,70,81,116,121,98,142,149,118,102,111,102,46,26,19,18,31,46,84,117,131,125,114,85,84,90,86,76,63,49,29,19,21,23,21,18,24,24,35,54,73,78,79,64,51,44,53,68,55,56,47,65,40,75,207,243,241,225,220,225,221,220,221,220,221,220,224,228,223,225,118,5,2,7,12,12,12,15,13,15,15,15,15,205,206,205,206,201,201,202,201,202,203,205,204,201,203,201,200,199,200,200,200,201,199,200,202,200,201,202,204,202,203,203,202,202,203,204,203,201,199,199,200,201,202,201,201,201,200,201,200,203,204,200,203,203,202,202,203,203,203,206,202,205,208,206,207,207,208,209,207,207,206,208,205,208,210,208,208,208,208,209,208,210,209,208,207,207,207,204,205,207,210,207,204,205,202,206,205,204,204,201,204,205,204,204,203,203,203,202,202,201,202,202,203,201,200,200,200,200,198,201,198,200,200,200,200,200,199,199,200,195,198,199,199,200,197,198,197,199,200,198,199,196,197,197,197,198,198,199,200,199,198,199,200,199,198,198,199,197,198,196,196,199,199,199,198,198,198,195,198,196,197,196,196,196,195,199,198,196,196,197,196,195,198,196,196,201,197,194,195,194,196,195,194,194,196,197,198,195,196,196,194,195,194,196,193,194,197,193,196,195,193,196,196,194,194,195,195,195,196,196,198,195,195,196,195,196,196,195,196,194,195,196,195,195,196,198,195,195,195,195,195,196,196,195,197,196,195,194,195,194,197,196,194,198,198,196,195,194,195,198,191,194,197,197,196,196,195,190,196,197,196,194,193,193,194,193,194,194,194,195,192,195,194,191,191,190,194,193,197,190,177,193,199,196,187,192,200,194,194,193,193,193,192,195,195,192,194,193,195,194,198,178,165,189,173,153,156,170,179,176,168,198,200,185,202,203,212,213,158,59,26,76,76,106,102,38,74,105,114,113,141,156,168,181,161,177,167,138,122,160,208,152,102,130,118,188,141,86,129,91,88,72,31,38,158,191,163,155,87,103,78,32,25,16,164,162,80,68,39,132,156,244,227,146,199,233,237,244,218,93,56,65,87,92,72,54,96,134,159,114,77,66,29,20,10,29,97,123,133,101,81,105,105,106,106,93,81,59,53,67,67,66,46,36,36,30,59,125,149,110,49,14,11,14,25,39,63,73,47,18,48,84,98,112,122,141,117,53,27,37,27,19,21,21,31,100,121,49,28,28,131,153,88,125,92,41,30,73,99,68,63,51,47,31,16,22,28,24,24,26,17,16,30,26,23,27,20,19,20,75,89,30,59,55,85,141,97,57,29,34,31,14,33,27,20,16,16,18,46,54,34,20,19,17,31,57,46,17,73,103,36,46,59,31,38,40,25,30,44,50,33,19,21,20,30,44,39,25,22,23,39,65,66,35,15,24,31,99,145,54,39,83,66,78,82,51,76,74,58,47,20,19,18,26,31,38,38,25,23,30,33,23,42,32,27,38,56,123,92,30,43,63,33,25,38,29,21,33,40,26,21,24,23,24,22,28,31,19,36,38,26,29,32,23,28,41,65,98,49,83,130,119,113,98,95,100,69,42,37,22,25,29,24,27,31,28,29,24,19,17,28,29,33,36,42,58,36,71,101,106,120,155,157,107,118,126,118,103,116,113,55,26,24,40,58,66,78,87,99,108,103,102,88,95,104,97,90,69,43,25,22,22,21,18,23,23,24,36,51,74,81,71,62,57,52,53,41,35,37,54,30,96,220,241,235,228,225,229,220,223,222,221,224,224,227,228,224,225,117,5,1,8,12,12,12,14,12,14,15,15,15,203,204,202,204,202,203,199,202,203,204,203,202,205,203,202,200,198,199,200,198,199,202,201,199,199,200,200,200,199,201,202,200,200,199,200,199,199,200,200,201,200,199,200,199,201,203,201,200,201,201,201,203,202,203,204,201,202,202,202,205,206,205,204,208,210,207,207,206,206,206,206,207,208,207,208,208,209,208,208,208,206,209,208,207,208,206,206,207,207,205,204,206,204,204,206,202,200,201,203,202,202,201,200,201,203,201,201,201,200,201,200,199,201,200,199,201,199,200,199,197,200,199,200,198,196,198,198,197,196,196,198,200,197,196,199,195,196,198,198,196,198,197,195,199,196,195,198,196,195,197,196,199,197,198,199,197,198,197,198,197,196,194,197,196,196,198,197,197,197,196,196,194,196,197,194,193,193,196,197,198,195,197,196,194,197,195,194,195,194,194,194,195,196,195,195,198,196,195,196,194,196,194,194,195,195,194,193,196,194,195,196,195,195,195,194,196,194,195,196,195,194,193,194,194,195,194,194,193,195,192,193,194,193,195,196,194,195,194,194,195,194,195,193,195,194,193,194,193,196,194,193,194,193,196,194,192,196,195,191,191,193,194,195,193,194,193,192,196,194,193,195,194,191,193,192,193,193,191,195,195,193,194,194,194,191,192,194,196,191,176,186,195,196,179,188,195,191,196,192,194,195,191,191,191,192,194,193,196,195,207,174,158,174,157,154,161,171,165,155,156,196,200,179,200,194,204,214,205,130,33,56,71,94,96,42,100,119,138,144,124,156,186,188,157,135,144,153,158,191,195,120,102,148,156,222,132,87,116,76,103,72,36,52,132,137,93,132,100,76,42,23,25,17,89,84,56,47,127,214,174,195,172,174,248,246,244,210,97,53,71,102,93,54,46,73,127,160,116,83,78,39,12,12,27,51,84,75,39,25,26,55,89,108,118,113,113,107,81,72,69,52,33,31,31,33,89,143,141,81,23,16,14,31,67,66,70,71,74,48,65,111,112,120,116,112,74,34,21,24,31,27,24,23,67,148,105,33,22,39,148,136,96,124,81,36,24,64,96,84,54,34,45,34,27,32,21,14,16,27,31,35,29,21,12,19,27,33,46,91,86,37,60,52,47,63,41,39,35,35,39,26,16,23,23,25,17,41,41,38,44,32,21,21,53,33,35,34,81,105,36,60,86,43,73,64,37,69,67,56,36,19,25,19,35,42,36,24,21,20,41,73,57,34,23,25,19,125,163,49,54,124,123,191,183,133,185,141,74,37,12,16,14,21,37,43,24,23,25,21,38,55,41,14,24,33,61,133,90,32,55,52,27,24,31,26,24,36,41,25,27,35,24,17,14,23,30,44,40,23,16,22,38,39,53,33,58,103,48,64,88,88,91,90,133,107,57,40,29,22,19,33,35,39,33,17,17,26,27,31,37,21,19,24,53,68,54,121,146,148,143,119,96,69,68,90,118,98,107,115,60,34,37,57,63,44,45,44,45,59,108,117,109,101,96,106,104,88,58,34,23,20,18,23,20,17,25,26,33,56,76,81,78,66,37,26,27,35,45,48,18,132,231,243,247,246,243,239,230,234,235,235,236,232,226,229,226,223,117,5,1,6,10,13,12,13,12,14,14,15,15,202,201,204,201,201,202,202,204,203,203,205,201,199,204,200,205,200,200,204,198,202,200,200,202,199,200,199,199,200,199,201,198,199,200,198,201,200,201,202,198,201,199,198,200,200,199,200,201,200,200,201,203,203,201,201,202,201,201,203,203,205,205,206,209,209,208,208,208,208,207,208,206,208,206,207,208,205,206,206,204,208,210,206,208,207,208,207,203,203,204,205,204,204,202,205,204,203,203,201,203,200,200,200,200,200,200,200,200,199,199,199,200,198,198,198,197,200,198,200,198,199,197,195,198,196,197,197,198,198,197,196,197,198,195,196,193,195,198,194,198,196,197,198,197,198,196,198,197,198,197,197,197,198,197,196,199,197,199,198,198,198,195,198,197,198,199,196,198,197,195,195,197,198,197,196,196,197,195,196,196,196,198,199,197,198,196,197,198,195,198,194,197,197,193,191,194,196,196,194,195,195,194,197,193,195,196,196,197,196,194,195,196,194,195,197,196,193,193,193,196,194,195,196,193,198,194,191,195,194,196,197,196,194,196,194,193,196,194,195,194,191,195,195,194,194,196,193,192,194,194,194,193,195,195,193,192,196,195,189,190,195,193,196,196,190,193,192,193,194,195,192,196,197,194,194,193,191,192,196,192,194,195,192,195,193,193,193,194,197,180,184,194,195,179,178,195,192,194,193,193,193,192,191,191,192,196,195,196,199,199,155,135,142,114,116,133,150,132,132,156,194,196,179,196,192,194,206,216,172,118,113,69,71,79,44,122,134,123,132,114,182,214,169,143,139,161,165,184,189,142,112,124,169,203,219,100,86,105,68,87,60,39,39,121,108,64,111,77,51,29,18,24,34,73,41,34,117,211,252,162,152,193,229,250,246,215,101,49,73,102,96,74,50,65,85,125,106,78,71,44,26,11,29,46,79,117,69,43,36,37,36,40,89,121,130,125,121,120,110,74,50,27,24,35,34,101,151,110,47,22,17,25,55,63,46,42,54,77,110,138,126,116,77,41,38,24,17,11,17,20,30,33,46,132,139,47,27,20,47,155,127,97,123,75,34,24,55,117,93,44,35,40,42,33,31,17,12,14,19,43,39,24,14,19,15,35,62,42,97,88,37,63,37,29,32,27,37,28,43,37,18,19,14,24,29,36,36,21,16,38,47,49,51,26,16,24,40,97,107,29,92,137,129,200,144,134,178,124,71,36,27,27,35,32,20,27,35,24,39,50,36,50,57,33,37,69,142,168,52,39,81,87,147,125,95,141,99,62,46,17,21,19,29,36,33,31,26,20,26,48,44,38,19,25,22,66,141,86,32,57,55,25,43,38,21,32,39,39,26,37,32,19,15,17,22,30,40,32,18,26,19,35,61,44,26,65,101,50,37,62,61,84,148,165,92,33,37,29,18,21,26,45,45,27,22,19,26,44,41,29,16,19,19,61,85,63,138,154,145,139,101,92,86,56,66,116,97,103,112,66,36,37,57,42,28,32,30,31,51,100,122,104,94,100,108,108,102,73,36,25,30,26,23,19,16,22,24,30,42,60,76,81,63,35,19,22,31,42,33,39,188,233,250,250,248,248,249,249,249,247,246,246,240,232,227,225,223,116,5,2,7,11,12,12,14,13,15,15,15,15,204,205,205,206,203,201,204,205,207,205,202,204,202,204,201,203,200,200,202,201,201,200,200,200,201,200,202,200,200,200,198,200,199,200,199,201,200,198,201,201,200,200,200,200,200,200,200,201,202,201,205,205,201,202,204,204,204,202,203,205,205,202,204,206,205,206,206,207,206,206,206,206,206,203,206,205,203,207,206,208,207,205,206,203,204,205,206,204,204,202,203,206,204,203,204,203,203,203,201,202,201,201,200,200,202,198,198,199,198,198,198,196,200,196,195,199,195,198,197,197,198,194,197,198,196,198,198,196,194,196,196,196,195,196,200,194,197,196,194,198,196,199,195,195,196,196,197,195,195,195,196,198,196,194,195,196,196,195,193,196,197,196,198,197,194,197,197,197,196,197,197,198,197,195,197,197,198,198,198,197,196,197,198,198,201,199,196,199,199,195,196,198,193,195,196,197,195,195,197,196,197,193,195,194,196,197,196,198,197,196,196,194,195,194,193,194,194,196,193,196,196,194,195,194,195,194,192,193,193,193,198,194,195,192,193,193,195,196,195,195,193,193,195,196,193,192,197,194,194,194,194,193,192,192,194,192,193,195,188,192,193,192,193,189,189,193,193,190,191,195,194,193,196,195,193,193,193,193,193,192,193,192,190,191,194,194,193,192,195,182,182,196,196,178,176,194,194,195,190,190,193,192,193,193,196,197,195,189,181,185,142,131,134,122,142,154,166,152,152,171,195,196,177,192,193,191,200,208,171,169,163,73,70,94,90,190,151,130,157,122,197,174,136,156,167,185,162,166,168,154,141,115,160,200,177,88,87,103,71,82,48,33,82,129,95,74,89,44,35,29,14,40,61,75,37,92,191,245,247,146,197,252,244,245,208,105,52,62,107,105,79,55,72,107,95,77,72,69,45,22,15,33,60,83,115,155,118,71,53,38,42,39,32,70,116,129,131,119,121,106,74,45,26,23,43,134,155,85,31,10,16,38,61,43,22,19,15,34,77,136,145,101,60,24,22,17,31,32,15,16,26,27,41,68,51,25,31,13,66,162,115,108,132,67,32,16,41,85,61,35,28,42,33,22,26,23,19,16,33,32,32,36,20,16,37,52,39,55,119,81,32,56,26,27,31,22,35,26,40,40,18,18,15,19,39,39,28,17,23,18,46,71,40,18,14,22,34,110,110,31,83,102,116,181,122,132,120,85,52,29,40,48,32,21,16,24,31,48,48,23,21,22,48,67,85,36,92,148,57,32,43,39,42,44,38,39,36,53,50,23,24,28,30,24,22,35,33,31,40,33,25,43,39,30,39,80,137,79,29,78,69,48,71,53,38,55,49,46,39,19,21,27,18,19,32,27,26,33,30,22,38,49,31,35,35,83,104,47,29,57,61,96,115,81,50,40,39,29,26,32,33,33,42,39,30,28,55,44,28,35,21,24,41,104,83,77,139,119,107,102,108,89,76,52,49,113,103,105,113,73,35,47,45,32,29,23,27,26,46,83,105,114,98,103,106,113,125,82,33,39,76,56,30,24,17,23,19,24,39,51,65,81,66,41,26,24,25,43,31,55,208,226,249,236,205,229,246,243,228,222,219,230,240,235,227,221,221,118,5,1,8,11,13,13,14,13,14,15,15,15,204,205,203,203,204,205,202,205,203,202,205,203,205,204,202,203,198,202,203,200,200,199,199,200,200,198,200,199,200,199,201,201,199,199,200,200,198,198,201,199,199,200,201,201,200,200,202,206,204,201,206,205,204,203,205,204,204,204,205,203,205,205,203,204,203,203,204,204,204,206,205,204,205,204,206,203,203,207,206,203,203,205,204,205,206,206,203,203,206,204,205,205,204,204,204,204,202,203,201,204,203,203,200,202,200,198,200,200,200,198,196,199,198,197,198,197,197,194,195,197,198,196,197,198,195,198,197,196,196,198,198,196,200,198,199,198,198,199,195,198,198,197,195,194,198,196,195,195,194,196,196,197,196,196,195,194,194,196,193,197,198,194,195,196,197,196,195,197,198,196,198,198,196,197,197,197,196,195,199,198,198,198,196,197,196,197,196,196,196,198,198,198,198,196,199,198,195,198,197,195,195,195,199,196,195,198,197,196,199,196,197,196,196,196,194,196,193,198,198,195,196,196,195,195,199,195,194,197,195,194,194,193,196,195,194,194,196,194,194,197,191,196,194,194,196,191,193,192,193,194,194,193,193,195,197,193,194,193,188,193,194,194,194,187,189,196,193,191,192,193,193,194,196,193,191,193,191,194,194,191,193,191,189,193,194,191,192,192,196,186,176,194,202,183,180,191,191,193,193,193,194,194,195,195,193,186,179,174,188,190,146,162,168,161,187,193,193,160,185,192,192,198,173,191,194,194,196,200,142,158,173,73,84,103,134,248,155,140,192,136,167,155,154,172,171,153,136,173,170,181,167,99,154,186,173,113,89,98,85,87,50,50,115,139,97,86,72,22,27,42,46,55,72,55,94,214,244,228,172,139,243,251,243,198,89,51,64,101,128,115,73,57,81,129,105,78,71,39,25,13,26,60,88,113,130,157,131,116,112,73,50,38,37,29,47,92,122,133,126,118,124,103,62,37,53,113,117,56,19,10,15,49,64,33,19,15,13,16,25,74,117,107,68,25,33,85,100,73,35,21,22,28,31,22,26,21,28,10,91,160,103,122,128,63,22,14,18,34,30,25,29,36,30,15,17,23,31,33,25,18,16,36,39,48,51,29,18,57,126,76,38,57,23,29,34,21,30,29,47,46,23,16,21,33,26,27,29,27,15,34,50,41,47,38,20,31,54,97,100,40,40,55,34,49,46,40,41,39,51,33,44,48,30,25,21,18,44,53,42,28,27,31,56,95,90,28,69,118,46,36,40,42,51,45,45,46,26,47,43,25,45,32,19,22,20,26,39,42,29,24,23,29,44,65,43,54,129,71,40,127,113,128,159,95,120,123,78,51,17,17,13,21,29,35,29,21,17,23,33,46,50,26,16,21,47,100,102,46,39,67,50,44,45,33,37,29,47,35,30,41,25,23,25,42,49,63,51,21,18,32,34,59,86,122,84,68,132,101,87,129,104,105,83,34,36,99,107,103,121,75,41,53,47,30,21,21,16,28,48,63,120,132,112,113,101,118,126,79,26,85,137,84,44,26,23,19,20,24,34,49,65,81,76,60,33,36,45,37,37,29,115,145,130,118,84,141,180,175,174,156,152,171,209,229,222,216,219,117,6,2,8,12,13,12,14,13,14,15,16,15,202,201,200,202,203,203,201,200,205,205,203,205,201,203,201,203,200,201,200,201,201,200,200,199,199,198,201,200,200,202,201,201,200,202,202,201,201,199,198,198,200,199,200,201,201,200,201,201,202,203,205,201,203,203,202,206,204,202,203,204,205,205,205,207,207,204,203,204,205,206,206,203,203,204,205,202,201,202,204,205,201,202,205,204,202,203,205,205,204,203,203,205,203,201,204,200,200,201,200,201,200,202,201,201,201,197,199,201,200,200,199,197,198,198,197,200,197,198,198,196,198,198,198,198,196,196,197,197,197,197,196,198,198,196,197,193,196,197,194,196,194,195,193,197,194,192,195,195,196,194,196,196,196,196,194,198,194,196,195,196,195,195,197,196,196,196,196,198,198,197,193,196,195,196,195,193,196,194,196,198,196,196,196,195,196,196,195,195,196,194,195,196,196,198,195,195,198,196,196,197,196,195,196,198,198,195,196,194,196,195,195,196,194,198,197,195,194,198,196,196,195,193,195,194,194,197,194,196,195,195,195,191,195,195,194,193,193,193,194,195,193,194,192,193,193,193,192,192,194,193,191,191,192,193,194,193,193,191,190,194,194,192,191,188,193,192,193,194,193,192,192,193,194,194,193,191,190,191,192,194,192,190,192,193,194,191,193,191,192,189,176,190,198,188,182,187,192,194,191,193,196,193,192,185,178,176,181,186,202,191,158,172,163,165,181,181,171,157,194,192,190,198,174,190,194,191,195,194,143,166,166,61,61,89,134,220,121,116,153,115,160,169,183,171,134,122,151,184,175,187,141,121,198,195,188,134,91,117,95,93,61,66,120,90,81,101,57,17,42,81,63,76,47,76,210,253,252,178,171,178,246,248,210,100,36,65,101,121,121,83,69,60,92,113,89,79,41,24,18,22,49,87,112,118,119,137,130,116,121,111,98,59,34,40,31,37,73,118,141,147,147,141,142,116,68,37,40,31,20,14,13,46,65,41,28,21,12,19,15,26,80,122,88,33,122,188,139,72,29,24,36,46,35,27,25,22,27,27,115,157,101,128,121,52,20,12,15,18,22,25,33,36,22,15,14,23,41,29,19,15,11,24,56,62,26,16,15,29,111,77,43,67,23,45,50,28,48,39,47,39,26,33,30,25,16,11,29,33,34,38,30,17,33,60,41,79,56,90,105,34,39,36,38,42,29,45,37,33,58,49,26,29,52,49,35,61,74,50,63,73,66,68,117,113,127,92,131,130,33,101,51,61,69,62,62,74,55,59,63,57,58,51,48,46,53,39,42,42,31,23,21,28,50,59,34,50,124,73,36,108,96,139,173,121,158,155,80,42,21,13,16,19,36,37,23,18,17,17,34,50,43,21,26,21,43,113,101,48,43,63,36,24,34,24,31,35,33,46,42,28,23,19,20,34,64,59,33,21,18,22,58,66,74,124,71,56,100,83,108,130,139,136,68,37,28,95,114,93,114,87,44,47,52,34,24,25,26,39,44,59,113,140,126,126,112,110,107,56,24,122,164,90,55,33,21,19,22,27,35,49,62,76,77,68,48,42,41,49,50,43,47,25,36,54,34,50,105,131,134,113,89,101,149,189,206,214,220,118,6,2,9,12,13,13,15,14,15,15,16,16,203,204,202,202,202,202,202,203,200,203,205,201,204,201,201,201,200,201,201,199,202,201,200,200,201,202,201,201,201,202,202,203,201,201,202,199,203,200,200,200,200,201,202,203,201,199,200,200,200,204,203,202,202,200,203,203,203,205,203,205,204,202,202,203,204,203,204,204,202,203,204,203,202,202,202,204,205,202,202,202,203,201,202,201,200,200,201,204,203,201,202,200,200,200,200,200,200,200,201,202,199,203,200,200,199,198,201,199,201,200,198,199,200,198,196,198,197,198,199,196,199,198,198,198,198,198,193,196,196,197,193,195,198,193,196,195,196,194,194,195,193,193,193,194,195,196,194,194,194,195,195,194,193,195,194,194,199,196,193,196,195,195,196,193,194,196,197,195,196,196,195,196,195,194,194,195,198,195,196,194,195,196,194,196,195,194,196,195,197,197,196,193,193,195,196,198,196,194,196,195,195,196,197,195,193,197,195,196,198,194,198,195,194,196,195,195,196,196,196,193,194,195,194,194,195,195,196,194,192,193,196,193,194,195,193,193,194,193,193,194,190,194,191,193,196,191,193,193,192,192,191,192,194,191,191,191,190,189,190,194,193,191,188,189,193,192,193,191,190,193,193,195,192,193,195,192,192,193,193,191,191,194,193,190,193,190,193,191,190,193,180,184,198,187,183,189,190,195,197,193,190,181,173,177,182,184,191,191,208,179,128,136,114,120,148,133,136,152,199,187,181,198,170,188,194,189,195,193,159,194,174,55,53,69,122,174,74,70,134,139,146,173,162,140,139,145,190,173,142,179,136,160,227,167,171,119,91,130,90,90,64,68,97,49,60,89,52,15,50,76,78,64,61,177,250,250,199,175,234,208,246,234,99,49,57,99,118,114,73,57,62,71,85,84,77,53,24,12,24,47,83,113,117,116,107,117,119,107,106,105,121,107,75,56,47,49,36,47,98,135,159,165,152,151,127,79,47,27,19,16,12,27,51,49,46,33,14,21,18,20,42,103,113,101,183,148,48,22,23,51,68,61,55,41,28,30,31,45,142,141,106,141,110,46,17,15,16,16,20,23,30,42,31,16,23,30,29,25,24,22,12,37,45,45,38,22,14,42,127,71,62,92,62,107,83,61,100,70,55,35,21,43,29,20,17,14,16,42,44,27,17,19,18,55,73,164,193,135,106,33,32,43,46,51,39,48,44,35,56,48,53,52,74,101,105,119,119,118,143,161,177,182,188,191,180,183,212,187,161,184,171,160,157,149,162,160,150,150,127,132,137,127,139,124,103,95,70,64,69,53,53,69,45,45,26,59,135,63,28,60,49,43,51,42,50,52,56,51,28,21,20,28,29,25,28,24,16,33,30,28,43,41,32,37,66,102,96,43,50,56,25,29,34,23,28,31,43,37,28,33,24,23,29,37,39,52,41,26,25,50,63,41,79,138,79,39,57,61,66,117,127,87,52,31,27,97,120,93,120,91,41,42,51,41,29,24,23,31,41,42,59,83,93,110,105,97,92,38,93,180,162,118,84,59,33,23,23,26,37,48,61,69,76,75,52,35,39,57,69,67,64,37,33,40,40,81,111,107,101,84,73,62,95,166,198,212,220,118,6,3,9,12,13,12,14,13,14,15,15,16,202,206,201,200,202,203,204,204,204,200,202,202,200,201,202,204,203,201,200,200,201,204,203,200,202,201,202,201,200,201,200,202,200,201,201,200,199,201,203,199,202,200,201,205,199,200,204,200,200,202,202,201,201,201,202,200,201,201,202,202,200,203,201,202,201,200,204,201,202,200,203,201,200,202,204,201,202,201,199,201,199,200,200,199,201,198,198,198,200,200,198,200,199,197,198,198,201,199,200,200,199,200,199,200,202,199,200,199,199,202,200,201,198,198,201,199,198,198,198,196,198,199,198,198,193,198,198,196,198,195,198,197,196,198,196,194,194,196,193,196,194,194,195,195,196,195,195,195,196,194,196,196,196,192,191,196,195,195,193,194,195,193,194,196,195,193,192,194,194,192,194,195,192,194,194,194,197,193,196,195,193,194,195,194,192,195,195,195,195,195,197,196,195,193,194,193,196,193,194,194,192,196,196,195,193,196,196,195,198,193,193,195,196,195,195,193,193,193,191,195,196,194,195,193,193,195,194,194,193,193,193,193,195,192,192,193,193,192,193,193,193,193,195,193,193,194,193,194,193,193,192,191,192,193,191,194,191,187,191,191,193,190,190,190,194,191,192,191,187,191,193,193,193,194,194,191,190,190,191,191,191,190,191,191,190,189,189,193,191,190,181,181,195,190,186,191,194,194,188,179,175,174,179,190,191,190,192,195,203,150,115,121,118,141,154,144,136,167,206,187,182,194,173,187,194,189,194,196,171,201,184,101,94,102,96,99,35,77,190,171,148,142,132,151,156,191,190,122,150,193,120,178,203,116,177,122,95,118,66,83,61,65,67,33,37,65,45,29,53,62,49,61,159,249,252,200,163,215,249,211,213,122,35,66,92,122,111,69,63,71,81,83,79,69,47,35,15,33,51,78,114,118,122,110,103,111,106,104,100,95,108,120,120,133,70,37,49,42,33,63,125,152,162,157,150,144,105,68,51,33,18,13,22,34,37,28,22,25,17,18,27,67,112,143,178,99,27,19,33,95,117,97,75,57,33,34,39,69,148,133,108,144,101,29,17,15,14,21,19,23,29,49,40,29,34,16,14,21,29,30,36,36,19,21,48,41,43,101,145,50,99,145,136,231,153,153,202,112,60,29,23,23,22,33,19,15,37,33,32,31,25,16,44,50,39,194,113,85,112,19,39,36,42,45,42,51,54,57,122,93,122,151,181,200,206,213,164,209,218,210,203,197,203,182,186,188,201,193,185,201,186,181,184,195,204,205,201,205,191,189,194,198,200,192,148,179,175,165,162,149,136,120,89,65,41,109,149,51,23,37,33,34,31,38,35,24,41,40,24,34,37,25,14,17,27,24,41,35,22,23,35,48,53,61,42,81,97,40,55,59,27,43,41,24,43,37,38,36,17,26,27,33,40,27,18,27,41,44,60,54,31,14,69,130,76,39,55,48,39,62,63,45,38,29,24,96,125,92,115,104,47,25,35,38,30,23,27,26,26,26,24,42,51,72,89,103,97,99,149,168,137,124,124,84,55,32,26,28,33,53,60,71,81,78,59,37,35,39,48,58,61,41,59,89,105,120,108,87,80,76,79,106,156,199,216,217,220,118,7,2,8,12,14,12,14,13,14,15,14,14,203,206,206,206,203,202,202,201,203,205,201,201,202,201,202,201,203,202,203,203,205,203,202,203,200,202,202,202,202,202,200,200,203,202,204,203,203,202,202,204,205,202,204,202,201,201,201,202,200,200,198,199,201,200,202,200,199,201,198,203,201,203,200,200,201,201,203,201,201,200,202,200,200,203,202,202,200,199,201,202,201,199,200,198,199,199,198,199,198,199,200,199,199,200,196,200,200,198,196,197,198,199,199,200,199,197,199,199,200,201,199,200,201,198,199,199,198,200,200,196,200,198,196,197,196,197,196,198,196,198,198,196,197,195,198,197,197,196,196,197,198,197,194,196,193,194,197,195,195,197,195,194,195,196,194,192,195,193,191,196,197,194,194,195,195,193,192,192,192,194,195,196,194,193,194,195,194,194,195,192,197,198,194,196,193,195,197,193,194,194,195,196,195,194,195,195,193,195,194,192,192,193,194,196,194,196,194,193,195,193,193,193,193,195,195,191,194,195,193,194,193,193,193,192,195,193,193,195,193,194,193,192,193,193,193,193,194,189,192,194,192,194,194,194,192,191,193,191,190,191,193,192,190,189,194,192,187,190,193,192,191,189,190,191,193,195,191,190,190,189,191,191,191,192,191,193,193,191,191,189,192,190,189,193,193,191,191,190,191,194,184,179,196,194,189,192,181,178,173,175,182,187,192,196,191,191,195,200,195,131,136,168,165,185,188,165,171,194,208,189,181,200,175,184,197,191,198,191,179,202,195,123,125,158,125,73,46,139,232,188,131,144,163,168,176,184,168,138,190,200,95,161,163,139,210,121,110,119,68,76,57,61,56,19,29,57,43,45,66,54,45,145,241,250,231,156,191,232,250,194,95,55,53,104,110,115,68,68,106,98,87,83,75,46,30,16,29,57,83,112,122,118,116,101,101,118,106,101,105,87,104,109,117,141,78,39,49,40,43,34,34,78,128,160,163,154,137,141,151,100,53,36,18,12,19,22,20,22,17,37,59,55,84,131,160,91,32,13,69,163,134,141,120,59,36,33,38,94,159,112,112,141,79,30,21,12,19,25,20,25,36,43,40,43,27,13,17,14,23,47,39,18,14,17,25,72,73,77,128,51,57,60,77,141,76,77,99,60,60,39,15,17,15,32,31,36,32,17,13,30,36,45,56,51,36,52,62,110,94,28,55,55,57,70,86,123,148,177,200,202,224,222,225,212,194,187,183,181,184,170,142,144,139,125,119,122,131,122,125,133,127,131,141,155,158,162,179,179,170,172,165,164,162,162,172,184,188,170,177,174,185,188,170,169,147,170,159,67,57,61,46,49,39,36,44,31,40,38,21,45,34,22,26,13,21,42,45,36,19,20,22,45,70,53,32,80,97,45,73,78,43,82,69,49,77,51,46,34,17,24,29,47,36,15,14,15,24,54,63,31,18,11,41,119,85,42,56,45,27,39,41,31,27,26,24,98,132,94,121,107,54,24,19,27,25,30,23,27,45,57,46,34,37,66,116,125,131,116,90,60,53,59,74,97,82,56,27,24,35,43,61,65,77,78,61,42,29,44,60,55,55,90,133,141,124,108,92,76,76,112,171,206,230,243,229,225,224,116,6,1,8,11,12,12,14,13,15,16,15,16,203,206,205,205,203,202,202,200,201,201,204,204,202,200,201,204,201,199,202,201,202,205,203,200,203,200,203,206,202,204,202,202,205,204,205,205,203,206,204,201,203,202,200,200,199,201,204,202,200,199,199,200,200,202,199,200,201,200,201,198,199,201,199,200,200,201,201,199,201,200,199,200,202,199,200,200,201,201,200,204,201,200,202,201,201,198,199,201,201,199,199,199,199,200,200,197,200,198,199,198,198,198,198,199,199,199,198,199,198,200,197,197,200,200,199,198,198,199,199,198,199,196,196,197,197,198,195,195,197,196,196,197,196,195,196,196,195,195,194,198,196,193,195,194,196,195,195,195,192,193,193,193,194,195,194,196,194,193,193,193,192,193,194,194,193,194,193,193,194,192,193,195,194,193,193,193,194,193,195,194,193,195,194,193,193,191,194,195,194,194,191,194,193,191,194,192,194,193,191,199,196,191,191,196,194,193,191,193,196,194,194,193,191,191,193,194,192,194,196,195,193,191,192,192,193,189,191,194,193,193,193,194,193,195,192,192,191,192,193,195,194,192,194,192,194,195,193,191,191,193,192,193,193,194,192,192,188,187,194,194,189,189,192,190,193,191,192,194,190,188,189,191,190,191,193,195,193,191,194,190,190,190,190,191,191,191,189,193,192,193,194,182,194,188,174,174,170,179,184,188,193,195,192,195,191,192,193,203,177,122,145,163,173,181,165,158,183,204,207,190,179,196,178,183,196,188,194,193,182,193,192,125,130,194,153,118,94,171,236,160,137,155,187,159,152,190,167,162,211,151,64,152,162,182,208,96,121,130,97,98,80,77,41,9,32,92,65,71,59,37,135,224,249,249,157,173,224,235,249,124,48,61,90,124,108,74,62,111,129,94,74,77,50,26,14,24,46,80,104,116,124,112,107,103,113,144,118,98,99,92,105,96,104,124,81,78,70,41,39,32,35,31,48,95,131,153,146,151,161,163,145,105,64,34,23,12,21,17,22,53,50,40,48,91,139,105,57,19,26,55,24,50,108,80,37,33,43,118,154,110,116,131,68,26,30,23,37,37,38,42,42,52,34,24,29,22,17,19,33,33,25,24,21,15,42,69,59,100,124,54,36,38,29,48,30,37,41,28,60,39,12,17,17,21,48,44,27,25,32,46,70,88,93,86,89,137,98,156,149,120,177,163,191,198,201,227,218,213,204,169,173,162,155,143,128,109,119,135,136,134,134,129,132,129,138,138,141,146,146,154,156,151,139,148,142,165,181,178,184,186,178,170,169,170,161,156,154,128,136,143,153,162,158,168,167,180,174,152,174,158,139,117,85,63,57,57,49,58,42,33,32,32,31,26,41,31,24,28,26,24,34,44,40,45,49,109,103,36,118,105,125,221,119,141,166,75,53,29,23,30,30,31,30,24,18,23,35,42,38,39,38,12,80,141,78,51,60,33,29,35,29,30,27,26,28,100,136,103,115,113,60,27,18,24,28,29,27,40,80,85,80,60,36,117,150,133,108,56,44,24,24,26,42,95,102,62,40,28,29,32,46,53,56,53,44,104,151,148,140,131,137,150,148,118,95,82,73,99,155,210,239,246,246,238,229,224,223,115,4,2,7,11,12,12,13,12,15,15,15,15,201,206,205,200,204,201,202,202,203,200,200,203,201,199,200,201,201,202,201,203,203,202,203,203,201,204,206,206,206,205,205,203,204,206,203,202,204,202,202,199,198,198,199,199,202,202,199,201,200,199,201,200,200,200,202,201,199,200,198,200,199,200,200,201,202,200,200,198,199,200,201,201,201,202,200,201,202,203,200,200,199,200,204,200,202,201,201,202,201,200,199,199,201,204,200,200,200,199,200,198,198,199,199,198,196,199,198,196,198,198,196,198,198,198,199,198,198,198,198,197,196,198,197,196,198,196,194,196,195,198,198,198,197,196,195,196,196,193,195,196,194,191,193,198,195,193,196,195,197,196,194,194,194,196,195,193,194,193,193,195,193,193,195,193,195,193,192,194,194,193,193,194,193,195,193,191,193,191,193,190,193,195,190,194,193,194,194,191,193,192,193,194,191,190,194,194,191,193,196,194,195,192,193,193,189,193,192,194,193,191,193,193,192,193,193,192,191,192,193,193,192,192,194,193,192,192,192,191,193,194,194,193,192,194,193,191,194,192,193,193,190,191,192,192,193,193,192,192,192,191,192,191,193,193,194,190,188,193,191,191,188,189,193,192,193,193,189,192,193,189,190,195,193,193,192,193,198,193,193,192,193,190,189,190,190,195,193,192,196,199,194,173,172,169,163,179,184,192,193,193,196,194,195,194,190,193,195,196,155,105,122,123,143,153,122,139,189,201,205,191,177,199,177,185,199,191,198,193,188,189,189,137,134,150,119,97,81,146,196,187,163,158,155,128,174,198,177,173,171,112,60,159,180,198,166,73,130,135,111,124,115,73,23,21,129,204,118,78,56,106,216,246,239,177,161,224,236,232,179,60,75,98,105,116,64,73,107,134,118,67,71,54,25,12,18,43,77,105,118,128,119,105,102,98,112,156,141,101,94,84,96,100,116,122,83,98,89,52,44,31,37,36,32,35,49,89,126,140,150,153,159,171,160,110,64,40,27,23,17,22,24,22,25,34,86,115,87,49,21,24,26,77,114,79,42,20,46,138,153,104,122,118,53,25,47,51,60,64,60,59,54,43,24,12,23,34,31,35,23,15,23,29,26,48,51,26,30,97,130,52,37,50,54,55,44,61,45,35,52,44,26,21,32,55,61,69,71,65,88,123,132,151,168,176,193,200,186,205,189,202,226,196,200,184,174,169,149,138,125,123,118,123,139,144,150,140,145,146,141,147,150,144,132,130,131,129,127,120,110,104,98,99,90,89,89,99,105,108,124,136,143,145,156,152,142,155,159,162,165,162,166,155,153,147,137,143,140,151,185,186,185,184,169,168,146,123,125,102,84,67,47,56,77,71,43,27,21,26,33,40,44,31,17,28,65,126,105,33,89,93,132,190,104,140,149,65,60,36,36,37,20,21,23,29,28,42,34,21,23,45,59,63,122,157,74,46,61,29,24,37,25,30,30,26,29,93,126,99,121,115,75,33,22,34,35,34,35,43,60,95,123,74,93,174,150,92,51,32,32,21,21,25,67,135,113,71,42,26,22,29,50,50,76,114,177,214,204,196,174,160,141,124,99,74,64,84,147,206,239,248,249,239,236,231,225,222,219,115,4,1,7,11,13,12,13,13,14,14,15,14,204,206,206,206,203,200,203,205,202,203,202,200,201,202,202,202,203,201,203,203,206,205,204,205,208,208,205,206,206,207,209,205,206,205,205,205,202,204,203,203,200,200,203,200,201,200,200,200,198,201,201,201,200,202,205,203,202,203,206,204,205,204,203,204,201,205,201,203,202,200,202,202,203,203,203,204,202,202,201,203,202,200,203,201,203,201,201,202,199,200,201,201,203,201,201,200,200,200,199,199,200,198,200,199,198,197,199,199,194,196,197,196,196,196,194,197,198,196,193,194,196,197,196,194,195,197,197,195,195,195,195,195,195,195,197,194,194,195,194,195,194,193,196,193,194,193,193,195,195,194,194,194,193,193,193,195,192,193,194,194,195,195,195,195,194,194,193,194,196,194,194,198,192,192,195,193,192,193,193,193,193,193,193,193,193,193,192,194,191,192,193,193,196,194,196,193,196,195,191,195,193,193,191,191,194,192,193,192,192,193,191,193,193,194,194,191,193,191,189,193,191,194,195,193,195,194,195,194,194,192,193,194,193,194,192,193,194,191,192,194,193,191,193,193,191,192,193,192,192,193,193,192,191,192,192,188,189,192,193,189,189,193,195,193,191,193,192,191,194,192,188,193,193,191,192,194,196,195,192,189,193,193,194,193,193,195,198,196,192,184,174,162,169,178,181,192,193,194,195,194,191,191,191,193,190,196,197,193,145,107,135,146,163,156,124,150,199,203,203,193,176,200,184,184,203,193,200,198,191,186,192,146,99,96,71,68,46,65,123,185,160,123,153,146,194,191,154,164,174,137,63,121,163,173,149,95,125,124,109,127,99,48,7,58,177,190,115,72,128,200,238,206,155,179,210,247,234,140,73,38,104,115,105,71,57,109,127,120,77,66,50,25,20,19,39,79,105,121,125,117,107,104,97,100,95,136,140,103,101,84,112,131,139,123,87,87,59,53,49,44,45,27,36,36,31,40,54,100,133,151,154,165,169,163,149,119,86,45,31,22,14,14,21,18,31,75,102,84,76,71,108,168,130,60,33,18,54,151,139,103,127,107,46,19,30,38,45,42,41,46,46,43,20,15,13,30,48,29,19,16,18,26,45,50,24,21,12,53,110,61,38,57,58,56,47,49,42,34,53,51,50,62,80,105,125,117,130,154,187,205,199,200,196,201,195,184,170,169,147,155,148,115,121,107,114,128,133,146,136,136,140,145,147,145,137,108,101,81,63,53,45,41,33,29,33,39,33,34,22,24,30,29,29,27,24,28,33,27,29,37,39,37,39,47,53,84,88,103,100,128,141,147,152,156,150,147,141,137,139,142,154,154,169,177,177,172,168,178,160,159,143,141,147,116,87,60,52,41,61,60,42,27,22,16,60,137,103,37,49,41,34,56,38,44,48,36,55,48,43,29,17,24,19,29,46,33,19,19,15,37,79,77,118,152,66,52,64,27,41,50,31,35,35,24,32,103,127,102,110,112,78,37,29,41,54,56,37,37,20,52,121,104,155,177,94,58,35,27,23,20,41,61,141,175,103,61,33,32,20,97,207,214,215,189,172,173,159,148,132,105,84,77,67,78,133,197,239,247,250,244,234,229,226,226,226,219,219,117,5,1,7,11,12,11,14,12,14,15,15,14,203,208,206,203,207,203,203,203,205,206,204,203,203,202,204,203,205,205,204,206,206,206,206,208,207,210,209,209,208,206,207,207,206,206,205,205,205,204,203,204,204,205,205,200,202,201,201,206,201,202,203,203,205,205,206,204,204,206,207,207,207,205,209,208,207,206,207,207,206,208,208,206,206,206,206,207,204,205,204,204,205,203,205,202,205,204,202,203,202,202,205,201,200,202,198,200,200,200,200,198,201,201,200,200,198,200,199,199,200,197,198,199,198,198,198,196,197,198,198,198,195,198,195,194,197,194,197,194,194,195,193,194,194,195,194,194,193,194,194,195,193,195,194,194,194,193,195,193,194,193,193,193,192,193,193,192,196,193,191,194,194,194,191,194,194,191,193,193,193,193,194,195,193,193,194,193,194,191,193,191,191,194,193,194,192,192,193,194,194,194,193,193,194,194,196,194,193,192,194,193,195,193,195,196,193,198,194,194,194,194,195,191,194,194,192,195,194,194,194,194,194,193,193,194,195,194,194,196,193,191,195,196,193,196,196,194,196,195,193,192,197,196,196,196,191,194,193,194,193,193,193,192,193,194,193,183,191,193,189,190,191,194,192,191,192,191,192,194,193,194,190,190,193,191,193,192,194,193,193,193,192,194,196,198,198,195,187,177,173,173,177,177,181,193,193,199,198,193,196,195,191,190,192,192,192,201,198,193,145,134,181,187,203,174,152,185,208,207,205,196,179,207,189,187,203,196,202,201,194,179,176,110,73,92,88,74,56,61,84,136,136,139,162,174,193,135,148,182,186,180,52,72,160,179,182,123,121,123,103,121,89,26,8,45,114,123,66,127,232,240,214,146,166,229,234,246,139,59,58,54,116,108,67,67,102,136,111,84,76,44,21,15,23,38,66,106,122,116,113,109,101,110,118,118,69,72,108,99,100,104,137,142,141,127,71,47,47,57,55,57,46,23,19,26,35,35,38,38,60,110,134,151,167,154,156,163,168,148,93,56,37,31,21,12,24,32,57,85,112,146,166,139,70,33,26,16,73,160,124,108,131,91,38,10,16,16,25,25,21,31,38,45,27,15,27,29,30,36,27,16,19,38,37,35,24,23,15,56,118,63,35,33,21,27,23,29,22,40,73,80,95,121,160,182,184,193,200,212,222,213,198,187,177,162,142,136,126,122,109,122,125,117,125,121,127,133,137,125,95,83,61,41,46,39,37,24,18,21,22,19,23,24,15,17,19,21,21,29,25,32,30,24,17,19,27,26,31,25,30,32,27,21,16,24,20,24,23,20,28,29,36,49,59,76,93,104,117,118,125,133,142,151,159,162,148,151,159,159,167,172,174,168,178,174,156,159,134,139,118,85,81,60,38,45,57,108,110,43,38,41,48,45,33,49,38,30,53,41,33,37,28,23,30,43,33,24,24,19,19,40,61,60,126,170,65,63,84,50,94,89,53,72,65,27,46,126,132,112,110,108,90,40,40,66,69,57,50,47,19,70,145,128,158,126,50,41,27,22,18,29,45,71,115,102,63,47,28,28,28,149,245,242,220,160,127,116,117,115,104,93,67,83,140,200,242,249,250,249,236,233,231,225,224,222,223,223,222,117,4,1,7,11,11,11,14,12,15,15,14,14,207,207,206,205,206,205,205,204,200,204,206,204,205,202,203,203,202,204,202,204,205,204,208,205,205,209,208,206,207,203,206,203,202,203,207,206,206,209,205,204,207,205,204,203,200,205,204,203,201,202,202,201,204,204,204,205,204,200,202,204,207,207,205,208,207,206,205,207,205,207,209,206,208,206,205,206,205,205,203,205,203,205,207,204,205,203,205,202,201,202,200,199,201,200,200,200,199,202,199,196,198,198,199,198,200,199,198,198,197,198,198,197,199,198,198,197,197,198,198,197,197,196,199,198,196,196,195,194,194,194,193,195,195,196,194,192,195,195,194,193,194,193,193,193,195,194,192,194,190,191,194,191,194,194,194,195,190,192,193,192,193,192,193,192,191,189,191,194,194,191,191,195,193,192,195,193,193,193,193,193,194,192,192,193,193,196,193,192,192,195,196,194,195,193,192,190,194,196,192,194,194,194,193,191,195,193,195,193,193,195,194,195,193,192,194,191,194,194,193,198,191,194,194,195,194,191,197,191,192,192,191,194,191,194,194,193,196,193,196,197,195,193,191,196,194,193,194,192,194,194,194,194,193,196,189,188,196,191,189,190,191,192,192,194,191,192,195,191,193,193,193,190,191,194,190,194,193,193,198,194,194,195,194,192,184,175,174,177,181,189,199,196,190,194,193,200,200,196,197,194,196,198,196,196,196,204,193,178,135,139,188,198,201,173,174,200,210,210,208,199,179,204,194,186,207,199,204,203,197,174,159,102,57,80,95,95,73,78,92,149,173,135,169,184,164,144,181,184,200,188,55,131,204,180,203,133,122,134,90,107,68,62,71,88,127,91,60,148,248,219,147,160,220,250,246,159,52,63,88,83,110,74,57,99,134,131,84,78,44,19,14,23,49,63,95,115,115,117,103,101,107,125,125,110,38,41,88,85,106,97,116,110,105,103,44,45,76,81,95,86,48,31,19,19,45,44,37,34,30,37,59,102,144,157,150,152,159,171,176,149,119,76,49,39,27,23,23,36,53,56,56,51,31,25,27,13,79,159,113,113,137,86,29,11,22,30,32,39,54,60,62,53,35,32,31,18,21,27,39,34,33,31,19,23,38,38,47,122,128,53,38,33,34,43,52,68,66,89,134,165,184,198,208,193,181,184,172,174,170,160,141,139,137,134,133,139,146,112,142,143,124,110,98,70,56,38,29,36,26,27,21,23,31,26,24,16,17,15,18,20,20,23,18,18,15,18,18,23,26,28,23,15,21,16,18,24,24,32,32,21,19,22,19,15,20,21,21,23,23,19,20,27,28,31,26,24,32,53,60,72,93,107,126,130,144,147,146,157,155,155,142,141,162,165,161,168,169,172,173,153,156,130,101,110,98,123,89,32,46,50,60,61,61,63,55,39,49,34,22,34,40,51,40,29,21,22,29,29,42,39,26,29,105,157,53,99,136,110,208,150,140,166,80,36,65,146,140,120,107,107,98,39,52,63,45,35,45,43,23,128,169,135,132,62,38,39,21,22,19,21,32,43,52,50,39,36,24,28,20,125,193,159,160,118,99,99,111,122,121,128,165,219,250,250,251,251,241,231,231,229,222,223,225,223,225,225,222,116,4,1,6,11,12,11,13,13,13,14,14,14,203,207,208,205,205,204,204,205,203,204,203,202,204,206,204,202,203,200,202,202,202,205,207,205,205,205,204,205,205,204,205,207,206,206,207,208,210,208,206,206,207,206,206,206,206,204,203,203,200,204,203,204,202,201,202,202,204,203,202,204,205,203,205,204,204,204,206,205,204,205,204,205,207,205,205,206,202,206,205,205,207,202,205,204,204,205,202,204,203,202,203,201,202,200,201,202,199,199,202,199,198,199,198,198,196,201,197,197,199,198,199,196,198,199,198,197,197,196,196,195,195,197,196,197,198,196,196,196,196,198,195,196,195,193,197,198,196,196,197,195,191,193,194,195,193,192,195,194,195,194,194,195,192,193,196,193,196,193,191,191,193,193,190,194,193,192,194,193,192,194,194,193,193,192,191,191,194,194,193,192,195,195,195,193,194,193,193,194,192,196,192,194,191,193,194,193,195,191,194,191,193,191,193,193,191,194,192,193,193,195,195,193,194,193,193,194,193,193,192,193,194,193,193,193,193,194,191,194,193,193,196,192,191,195,195,194,191,191,194,189,194,194,192,194,191,194,193,193,193,191,191,192,194,192,187,186,193,188,186,191,192,193,193,194,191,189,191,193,192,191,190,188,190,191,193,193,195,195,197,198,191,190,180,172,173,180,187,192,198,200,202,200,196,197,194,198,200,197,195,198,198,198,202,200,203,203,173,141,106,118,160,170,167,154,186,204,200,208,204,201,182,206,195,184,207,200,202,206,200,162,160,139,64,78,109,100,76,71,105,184,204,146,166,168,159,177,202,152,184,167,67,209,216,162,212,125,116,129,81,84,53,109,155,185,181,102,59,122,205,128,136,214,246,248,174,68,57,90,113,80,63,53,88,133,128,96,70,44,21,11,27,56,78,100,117,122,116,110,105,111,117,135,139,95,24,54,92,89,104,87,103,121,131,107,66,89,108,97,106,89,60,44,19,21,51,62,56,32,21,28,32,33,59,98,129,142,143,146,153,168,173,161,140,110,75,49,34,24,24,18,19,23,19,23,23,22,92,150,107,122,128,72,25,14,26,34,32,50,98,97,68,52,43,41,27,16,17,18,43,59,49,36,38,59,68,106,118,139,115,50,77,97,125,146,158,185,193,205,205,192,186,178,165,151,137,127,137,128,133,137,137,136,139,141,132,124,107,92,60,51,41,25,21,23,37,33,33,31,16,20,27,25,27,18,16,19,16,21,15,21,21,16,22,19,18,19,17,19,19,25,30,20,19,17,14,22,24,23,27,21,21,18,20,19,16,22,19,21,21,20,19,21,25,34,35,24,27,32,32,27,28,30,35,45,59,89,111,124,131,141,147,142,150,145,134,139,132,142,147,154,160,159,155,163,146,152,99,36,46,35,43,48,41,45,39,36,51,35,21,26,51,61,48,29,21,21,27,49,46,26,23,11,64,130,63,80,86,84,147,103,112,123,70,35,81,155,139,126,107,108,102,45,51,59,39,24,28,26,68,165,155,107,73,37,31,34,29,29,24,21,27,27,29,30,31,35,51,75,106,160,137,107,114,95,98,115,141,137,153,226,249,249,251,251,249,241,233,229,227,225,223,222,224,227,228,227,224,115,4,1,6,11,12,11,12,11,14,14,15,14,204,206,204,203,206,203,201,202,202,204,206,201,203,203,201,201,202,204,202,202,204,204,207,206,204,205,205,204,205,205,206,205,206,204,207,205,206,207,206,206,205,204,203,207,202,205,204,205,206,203,205,204,201,201,204,205,205,205,206,202,203,205,202,204,202,202,207,204,202,205,205,203,203,204,203,202,203,203,203,203,203,205,202,201,201,199,202,202,202,202,200,201,201,199,198,198,198,200,199,199,199,197,200,197,197,197,196,199,197,198,198,196,198,193,198,196,194,197,196,197,198,198,197,197,195,194,197,197,196,194,196,196,194,197,195,194,197,193,196,195,194,194,192,194,195,194,193,198,196,194,196,192,193,193,192,193,193,192,191,193,191,193,193,191,194,193,193,194,191,191,194,195,193,191,194,193,191,191,194,192,192,193,193,193,192,193,193,191,191,192,192,190,191,192,193,190,189,192,192,193,191,191,192,192,192,190,193,192,192,193,191,192,190,192,194,191,194,191,189,195,191,192,192,191,194,189,192,193,191,193,192,193,191,191,192,193,193,191,192,191,190,190,191,191,190,193,193,190,191,191,193,190,190,191,183,189,193,187,190,192,190,189,190,189,190,190,192,188,190,190,189,189,188,194,194,194,195,194,191,188,182,174,177,181,189,195,200,200,201,202,203,201,199,199,193,200,199,196,199,199,199,199,200,199,204,196,171,130,101,122,152,155,131,150,203,205,195,201,203,202,180,204,196,181,205,199,202,205,189,143,163,194,141,106,110,116,98,89,110,200,204,119,141,160,160,205,178,120,202,162,84,190,168,166,226,122,109,127,83,84,48,151,206,220,186,83,50,79,115,93,175,239,246,203,73,58,79,117,112,57,59,89,130,133,93,80,50,26,15,27,59,87,111,118,125,121,106,106,109,124,118,139,133,53,17,93,110,81,100,80,120,130,116,121,83,87,77,70,85,66,49,36,15,24,62,84,76,50,39,32,36,32,25,32,47,86,103,103,130,131,150,148,163,167,157,152,113,77,53,37,28,23,25,23,26,24,110,145,97,128,115,59,22,8,19,19,35,92,115,82,63,53,31,34,36,32,45,54,76,85,84,88,95,119,131,135,146,151,148,152,184,190,206,212,205,199,191,181,154,139,139,135,125,125,136,137,121,147,145,125,100,93,71,50,40,35,34,53,29,32,21,15,18,32,51,48,39,38,15,19,24,19,16,17,19,18,22,16,17,17,15,18,16,19,20,20,23,34,50,49,46,36,25,22,18,26,41,46,45,36,29,22,15,16,18,18,16,20,18,19,19,17,21,26,32,31,33,25,19,39,41,31,28,20,24,31,36,39,46,63,84,90,123,129,134,137,131,136,130,122,129,128,148,153,148,163,123,108,106,67,56,48,35,35,33,39,60,40,22,39,42,49,52,42,28,28,39,38,40,29,25,20,103,143,60,51,51,31,47,37,43,48,39,29,52,127,137,122,99,106,100,47,54,60,52,43,35,61,129,155,91,57,45,22,23,29,29,32,28,34,38,34,43,54,67,105,145,170,167,160,113,93,125,109,107,127,122,92,128,207,241,242,251,251,236,228,229,223,223,223,217,219,223,226,226,223,224,116,4,1,7,11,11,11,14,13,14,15,15,14,204,207,204,203,206,205,206,203,203,205,204,205,204,203,201,203,204,201,203,206,208,206,206,206,206,206,204,207,207,204,204,206,206,207,205,205,206,205,208,206,205,204,205,208,205,203,204,205,205,206,205,203,204,205,204,204,204,203,204,203,206,205,204,204,205,206,203,204,203,206,205,204,204,205,203,202,201,202,203,203,206,202,201,203,204,202,199,200,200,198,198,199,201,198,198,200,200,199,200,199,201,201,199,201,198,202,199,200,200,195,201,198,198,198,198,198,198,197,198,197,196,197,195,196,194,195,198,197,198,198,195,196,195,195,196,195,194,194,196,194,197,196,195,196,194,195,195,193,195,195,193,196,193,194,193,192,196,193,192,195,194,193,196,194,191,193,193,194,195,196,192,194,196,193,192,192,194,194,194,194,193,194,193,191,192,193,195,193,191,193,191,191,191,193,193,191,193,189,193,190,190,193,191,190,191,191,191,193,194,192,191,193,192,192,191,193,193,194,190,193,194,189,191,190,191,192,191,194,194,190,191,192,189,192,190,192,193,191,193,194,193,192,193,191,190,191,189,190,190,191,192,193,193,189,191,193,193,196,196,196,193,194,195,194,192,192,191,193,193,195,196,191,193,193,197,195,190,183,177,184,185,188,193,198,201,202,202,199,201,200,198,200,200,193,192,199,201,200,199,199,198,197,200,201,207,189,181,153,132,177,198,172,134,174,216,205,197,196,205,205,181,200,193,178,203,201,199,208,182,139,182,222,176,117,110,136,121,104,107,175,175,137,163,144,169,202,147,153,239,152,59,135,152,200,242,131,111,127,98,90,43,161,220,226,143,13,10,66,117,126,208,245,203,87,43,77,101,117,90,62,104,125,127,100,81,62,23,17,31,61,93,107,122,122,122,113,99,108,115,113,98,111,90,14,40,141,103,79,97,79,105,76,58,105,87,66,67,55,44,36,36,24,14,34,51,65,74,75,68,46,36,32,30,30,27,29,44,57,87,108,123,131,139,137,144,173,188,187,166,133,114,85,54,50,35,39,125,135,96,122,99,42,14,14,11,23,52,89,90,51,42,48,48,55,61,71,86,110,138,134,141,160,179,181,175,181,179,182,174,178,179,160,157,152,149,145,144,141,129,134,134,133,131,122,122,103,94,72,46,33,24,25,18,17,20,25,22,29,30,24,14,16,36,39,41,48,47,38,21,16,16,14,20,17,19,17,19,30,29,34,30,22,19,16,21,30,53,69,63,60,64,56,50,35,36,68,69,64,66,60,57,43,24,15,19,21,15,16,23,25,18,21,20,18,20,21,24,18,33,75,74,53,49,29,17,21,27,29,26,30,34,36,49,46,71,97,112,123,130,147,146,143,137,133,128,147,139,156,170,136,134,107,74,76,60,59,79,57,67,63,37,32,45,57,62,56,30,19,30,41,46,72,152,152,59,42,45,44,49,37,47,42,29,26,31,96,133,124,102,98,106,61,38,48,63,71,76,95,96,75,47,35,29,24,18,23,28,35,44,55,72,91,129,155,160,165,163,154,134,133,123,137,158,112,92,83,51,31,24,64,135,180,230,242,224,224,224,222,222,220,223,221,221,223,223,221,222,116,4,1,8,11,12,12,14,13,14,14,15,14,200,204,203,203,205,203,203,203,203,202,203,201,203,204,203,204,201,202,203,205,206,203,206,203,204,205,204,207,205,205,206,206,205,205,208,205,205,206,205,204,205,204,205,205,204,205,202,206,206,206,206,203,205,207,204,203,205,202,205,204,206,205,203,206,204,202,203,202,205,204,205,204,200,204,205,205,207,203,204,203,200,201,201,205,201,201,201,200,201,199,203,201,201,200,198,200,199,203,200,200,202,198,203,201,200,203,199,199,200,201,200,198,200,199,200,199,199,198,198,198,198,195,195,196,198,199,195,197,196,196,198,196,197,197,194,196,197,195,198,195,194,196,194,195,194,193,196,196,195,194,195,195,194,195,192,192,195,194,194,194,193,194,192,191,194,192,192,194,192,193,193,194,193,192,194,195,193,191,193,192,194,195,193,195,194,195,194,193,192,191,196,193,193,192,192,194,193,193,191,193,193,190,191,190,192,190,193,194,190,190,191,191,190,191,191,193,193,190,192,194,191,191,192,190,192,191,190,190,189,191,193,192,190,193,191,191,192,192,193,194,193,192,190,190,192,193,192,192,194,193,192,194,197,193,199,199,193,196,200,200,200,200,198,201,200,198,198,199,199,198,199,196,193,188,187,184,184,187,188,193,196,200,204,199,199,200,200,199,199,198,199,198,199,189,184,200,200,198,199,195,196,196,198,203,199,174,173,148,159,202,208,181,152,200,219,206,198,189,201,204,179,200,196,175,198,198,203,212,179,156,196,207,165,115,118,136,110,84,66,125,163,175,174,125,153,186,146,194,230,93,61,141,168,224,235,134,110,121,112,79,22,144,239,239,127,5,36,149,173,167,241,209,95,56,67,107,107,83,53,66,131,123,88,87,59,32,18,23,53,78,113,124,121,122,111,106,106,109,109,94,66,62,59,15,56,155,101,82,98,78,115,126,118,113,83,105,95,57,37,22,23,21,30,49,43,33,33,49,55,36,29,21,24,32,29,27,21,26,33,49,74,101,127,123,123,139,152,169,176,177,184,177,162,147,118,120,155,128,95,115,83,37,27,20,42,59,89,78,71,41,44,59,77,123,127,146,143,181,189,192,189,195,190,181,178,171,156,144,140,122,112,112,124,138,145,148,149,142,133,113,93,82,66,51,33,24,16,18,22,21,16,17,17,16,16,16,23,19,19,21,17,31,38,34,44,62,54,29,21,18,15,15,16,18,21,21,40,59,69,65,56,45,23,21,29,69,77,78,96,98,91,78,78,58,85,100,107,112,92,74,83,81,44,19,14,19,34,79,53,49,35,24,19,20,16,15,17,33,76,87,69,71,54,26,20,16,27,34,30,38,29,24,22,18,21,25,37,41,69,106,124,129,129,133,139,137,127,141,139,133,150,148,160,160,133,127,131,122,133,105,69,58,31,90,100,60,37,24,26,39,85,70,125,146,57,48,51,53,59,50,55,46,42,29,26,96,145,133,98,105,115,65,29,22,45,60,55,57,47,35,34,34,26,19,33,40,56,69,107,154,165,177,174,164,150,135,129,133,142,146,126,123,152,116,61,38,33,24,35,31,19,47,145,216,224,223,223,221,225,225,222,224,225,223,223,223,222,116,4,1,6,11,13,12,12,11,15,15,14,14,200,203,201,203,204,201,201,199,202,201,199,203,202,203,201,203,201,201,204,203,204,200,206,203,203,204,203,205,204,205,203,206,206,205,205,207,206,205,206,204,207,205,203,204,203,206,206,204,205,204,202,203,203,205,203,202,203,203,203,201,205,205,205,205,203,203,202,204,203,202,200,202,202,202,203,202,204,204,204,201,202,202,201,206,203,200,203,202,202,202,203,200,200,202,203,203,203,202,202,199,202,202,200,201,200,203,200,200,203,198,204,201,200,200,198,199,200,198,198,198,199,201,199,199,198,198,199,198,200,197,196,198,199,198,197,197,196,196,196,195,197,194,194,197,195,197,196,195,197,196,198,196,192,196,194,194,196,195,195,195,194,193,192,193,193,196,192,191,193,193,194,194,195,194,193,191,195,193,189,193,194,191,193,193,194,195,193,191,193,192,192,194,193,194,191,192,193,192,195,193,194,194,194,194,192,191,193,194,196,192,192,191,191,194,191,193,192,190,191,192,193,192,191,193,196,193,195,191,192,195,194,193,189,193,194,195,194,195,193,194,194,193,193,193,196,195,193,194,194,196,195,206,202,201,221,201,196,203,199,204,199,199,203,200,206,204,201,205,200,199,199,195,191,185,188,190,192,195,199,202,199,201,203,198,195,198,198,196,196,197,198,195,200,185,177,196,197,199,198,196,196,197,199,199,182,139,127,106,133,184,176,153,152,205,218,208,203,184,194,203,182,199,193,177,201,198,202,208,162,169,188,152,159,128,115,119,92,84,67,91,154,213,147,90,176,193,164,206,181,68,53,98,158,218,231,141,115,123,93,61,12,113,234,252,169,118,185,252,212,206,225,101,60,73,100,114,76,67,76,87,117,87,83,63,25,21,22,47,77,109,131,127,125,116,105,116,116,113,113,95,48,21,43,15,60,154,92,84,96,84,141,143,147,122,98,145,100,66,49,29,53,55,57,51,44,39,23,24,31,20,16,17,19,32,40,33,19,25,30,28,30,39,55,81,110,125,129,137,140,134,145,159,175,192,189,179,165,117,104,120,86,65,84,118,149,156,144,154,164,161,174,170,179,185,182,179,179,181,186,174,160,149,139,134,130,115,113,116,114,127,134,138,151,151,130,110,86,59,45,34,26,30,34,24,19,12,14,17,13,19,16,16,17,15,17,13,17,20,20,17,27,36,24,39,81,93,51,25,22,16,15,17,15,19,19,35,79,82,51,52,77,83,59,30,82,109,91,165,145,107,100,102,102,108,134,146,147,104,139,87,98,136,62,30,22,55,120,148,124,86,71,39,14,24,17,16,23,51,67,66,88,77,44,24,17,16,19,28,33,23,22,19,19,16,19,23,22,25,19,26,35,46,61,76,89,103,111,122,135,123,131,138,146,161,146,154,150,150,165,156,162,146,141,135,131,89,60,49,49,90,71,41,118,143,57,38,25,26,38,36,45,45,34,32,30,98,152,130,99,92,117,80,33,17,16,27,28,33,25,29,38,39,48,63,85,118,148,169,182,177,160,148,136,136,135,141,147,141,125,94,50,64,135,106,50,33,24,29,35,34,48,12,64,195,214,227,226,220,227,221,221,222,223,223,225,223,223,115,4,1,6,11,12,11,14,13,14,14,14,15,200,202,198,199,202,199,199,200,202,200,200,201,202,198,198,199,200,200,201,203,202,202,204,202,201,202,201,204,202,201,203,204,204,204,204,203,204,203,206,204,204,204,204,200,202,203,202,206,202,203,203,202,202,201,201,203,205,202,203,203,203,203,203,202,201,204,201,200,202,202,201,202,202,203,204,203,202,201,201,203,205,205,200,202,202,205,205,200,202,201,201,200,203,201,204,204,200,202,199,201,202,201,202,202,200,200,200,199,201,200,198,199,200,200,200,201,199,199,200,199,200,198,199,198,198,198,197,200,198,199,199,195,197,198,196,198,198,198,198,197,197,199,196,196,195,195,197,193,196,196,196,195,194,194,195,195,194,193,192,194,194,194,194,195,195,192,194,195,195,193,193,193,193,195,195,196,191,193,194,192,194,193,192,193,192,194,192,193,193,190,194,189,192,192,193,193,191,194,192,192,194,194,192,192,194,193,193,192,195,196,194,192,193,196,195,194,195,193,193,196,195,194,195,195,195,196,195,198,198,196,195,197,196,195,195,194,195,196,195,196,196,197,196,195,199,196,195,198,196,198,195,205,183,162,189,193,200,202,199,200,198,201,200,203,204,203,200,199,196,191,191,188,191,194,199,196,197,200,197,200,198,198,200,194,196,196,196,198,197,196,196,195,199,186,171,195,200,195,199,197,197,198,201,198,170,124,116,103,131,164,131,125,143,194,220,209,202,186,187,203,180,195,199,176,198,196,203,189,139,158,154,148,169,130,98,87,94,99,97,115,174,206,140,131,194,183,158,202,162,73,41,55,153,221,231,155,110,96,73,41,10,125,233,252,202,206,252,252,205,185,123,57,81,100,105,76,56,101,110,89,86,76,55,30,12,23,47,87,118,126,134,130,118,119,99,130,130,112,114,98,60,27,38,17,50,141,98,88,95,93,81,49,101,93,110,152,80,49,57,87,110,82,61,46,45,32,23,22,23,23,19,17,20,34,50,46,37,35,33,31,23,19,22,26,37,56,84,108,124,132,141,136,148,159,165,162,138,113,111,112,85,117,175,202,222,214,167,204,192,198,189,173,165,153,149,132,147,128,126,122,111,113,117,113,124,127,123,125,126,122,105,88,61,42,33,28,24,27,26,30,36,32,25,15,15,14,13,13,13,18,19,17,16,15,16,18,19,20,16,21,30,29,26,49,89,80,37,22,18,14,14,21,16,20,19,54,108,73,47,49,63,108,84,75,137,122,107,174,124,46,53,105,147,119,141,136,89,92,128,98,74,125,98,43,85,160,164,109,72,105,125,71,27,14,17,16,39,47,52,99,112,90,41,19,22,10,23,21,20,17,17,25,18,17,21,24,20,20,23,19,19,31,34,36,33,36,45,66,81,95,116,116,136,144,147,142,135,145,148,163,180,184,167,168,153,155,164,122,153,136,94,84,146,125,59,39,29,23,27,23,32,30,29,31,32,90,139,133,91,84,114,84,34,22,16,22,39,48,57,73,84,110,134,152,178,175,173,157,143,132,131,148,150,152,150,133,111,96,58,23,17,108,153,75,50,36,24,25,31,39,55,19,57,193,214,225,231,219,223,220,220,219,220,222,225,223,222,115,4,1,7,10,12,12,13,13,14,15,14,14,200,204,199,199,200,199,198,201,202,199,201,201,199,200,198,199,200,199,201,198,203,201,201,200,200,204,200,204,202,201,202,203,203,200,204,206,203,202,203,203,204,201,201,203,200,202,201,203,204,201,200,201,201,201,203,202,206,202,202,203,200,201,200,200,201,201,201,200,199,204,203,201,202,201,203,201,203,201,203,201,200,202,201,203,200,201,202,200,201,199,202,201,198,201,201,199,199,201,202,201,200,199,200,200,199,198,200,200,204,199,200,198,197,200,196,200,199,197,199,199,196,197,199,198,198,200,199,196,198,196,200,198,196,199,196,197,198,199,197,196,196,195,196,194,196,195,195,195,194,193,193,195,195,197,194,193,196,192,195,196,195,195,192,196,193,197,194,193,196,194,194,195,193,194,194,193,194,192,193,195,194,191,195,196,194,195,190,191,195,195,192,192,194,194,192,195,194,194,197,193,191,193,193,194,195,194,193,194,194,193,195,193,195,197,193,194,196,196,196,196,197,199,198,201,200,197,201,198,200,200,195,198,196,199,197,196,195,197,196,196,198,195,196,198,197,196,196,196,194,199,197,203,136,79,153,191,199,203,198,208,203,204,203,196,200,194,190,191,189,191,193,198,198,197,199,198,199,198,198,194,195,194,196,199,196,196,196,197,197,196,198,197,202,190,170,194,200,198,200,196,196,200,207,196,178,162,155,148,184,179,122,129,143,169,212,215,201,191,185,200,182,192,198,173,196,199,202,171,145,159,155,168,182,143,93,96,109,105,92,107,177,226,166,168,175,130,177,214,142,76,50,98,218,243,243,160,74,66,55,20,45,178,235,232,160,179,248,248,142,76,60,64,110,110,72,53,94,132,108,55,42,47,34,16,18,50,98,130,137,133,128,120,115,103,82,91,97,98,107,106,83,41,39,12,45,154,111,99,112,52,59,38,58,96,90,103,29,29,95,108,87,44,38,36,35,32,25,27,24,24,21,26,19,28,39,36,43,37,26,22,18,16,17,17,21,21,24,35,51,77,97,110,130,142,146,156,147,141,149,122,107,152,184,197,194,175,167,156,150,137,133,127,120,111,115,120,119,134,141,128,133,140,125,114,101,92,73,50,33,30,25,24,16,17,18,18,21,13,24,27,24,24,19,19,16,12,20,17,15,18,17,17,16,15,17,16,14,16,19,17,22,24,30,44,62,55,27,16,15,15,15,20,16,19,16,56,118,108,93,79,49,104,110,105,156,102,90,144,105,103,90,148,125,63,106,117,124,101,155,87,71,147,122,97,148,150,86,66,27,82,161,97,33,14,10,24,43,42,43,101,126,78,32,21,17,13,18,21,21,17,19,20,16,18,21,19,20,18,24,21,16,24,30,36,34,23,31,35,39,41,44,55,76,100,111,122,123,137,141,154,168,162,163,160,165,170,182,179,168,171,143,135,162,125,78,80,63,60,60,52,54,51,55,57,47,103,156,143,107,86,117,87,42,43,46,73,102,136,153,172,174,169,171,163,156,136,129,141,152,148,153,166,145,121,93,63,56,44,41,11,79,212,148,53,43,33,30,26,42,45,57,22,72,214,210,222,230,217,223,220,223,222,220,222,222,217,220,116,3,0,7,11,12,12,13,11,14,15,14,14,206,207,205,203,205,201,200,202,201,202,199,200,202,200,201,199,199,198,201,202,198,199,203,200,199,201,200,203,199,200,201,199,202,203,202,202,202,202,202,203,202,201,207,202,202,202,200,200,200,201,200,200,202,202,199,201,201,200,201,201,199,201,202,202,202,201,202,201,203,202,201,200,200,201,199,203,200,201,203,199,201,199,200,204,201,201,203,200,200,200,203,200,203,199,201,201,200,202,200,200,199,200,201,200,199,203,200,199,200,199,200,198,199,198,196,199,199,200,198,198,198,199,197,195,198,196,198,198,196,196,195,196,198,198,196,196,194,195,197,194,195,193,194,196,191,195,194,194,195,193,196,195,193,195,194,195,196,194,195,194,194,194,192,196,194,193,193,191,193,192,194,196,193,193,191,193,193,190,195,194,194,196,193,192,195,193,196,196,194,195,195,194,195,192,192,193,195,195,191,196,194,193,195,195,196,194,195,195,198,196,195,195,193,196,198,196,197,196,195,198,198,198,198,198,198,199,198,200,200,198,198,198,196,198,196,196,196,197,196,195,196,193,195,196,198,198,198,198,195,198,198,200,142,120,180,200,217,214,216,226,209,204,196,191,192,191,193,196,199,201,203,204,205,205,206,203,200,201,201,202,200,200,201,201,204,200,200,202,202,203,202,203,206,204,179,194,209,207,210,207,206,210,209,185,170,165,158,165,208,192,143,172,169,142,189,221,216,209,195,205,191,201,210,185,204,210,199,175,181,173,159,153,171,192,128,119,123,84,77,82,163,214,162,160,134,160,220,191,108,63,50,147,249,249,239,112,39,43,35,20,97,231,227,206,126,109,205,167,51,38,61,95,117,73,59,95,124,134,81,47,46,24,19,25,35,88,129,141,138,129,121,118,108,86,42,33,53,62,93,103,93,57,46,39,66,161,127,96,66,48,67,15,77,109,72,30,8,66,131,98,49,39,60,53,37,39,29,24,28,26,25,27,24,26,24,19,24,19,16,16,16,15,18,21,19,19,24,20,22,28,41,40,50,57,79,97,110,148,156,151,153,171,188,192,201,184,166,165,153,151,143,141,150,128,127,118,81,119,96,78,69,53,45,33,32,31,20,22,15,15,21,18,19,17,15,14,16,16,17,21,16,21,17,14,22,18,16,22,17,14,19,19,19,18,17,16,17,16,21,15,27,36,43,60,59,56,40,35,24,14,18,17,19,22,16,31,89,123,110,91,48,83,92,98,126,75,89,100,103,123,118,88,49,22,53,77,113,149,131,81,134,159,142,106,155,98,79,143,95,162,199,101,21,9,13,14,33,48,86,94,78,48,27,15,14,16,16,17,17,23,21,15,21,22,19,20,21,19,18,21,18,19,21,36,38,32,44,36,33,24,21,23,34,34,47,56,65,91,111,130,136,153,162,162,151,154,157,150,156,147,151,152,159,154,130,181,166,166,156,141,143,137,141,141,143,169,179,171,127,91,113,92,61,108,135,160,192,182,166,155,140,138,130,129,142,139,159,170,155,128,113,98,69,55,38,39,32,39,36,30,194,241,115,41,41,26,29,33,42,37,60,29,71,214,210,220,230,216,222,215,218,218,219,222,223,220,219,115,4,0,6,10,13,12,13,12,13,14,15,14,205,210,207,206,206,203,204,205,205,200,201,203,201,203,199,201,200,198,201,197,202,200,200,202,200,205,201,200,200,200,194,198,202,200,199,199,201,200,201,201,200,200,201,202,201,202,201,200,201,202,203,204,203,202,201,200,201,199,200,201,200,198,200,201,202,199,200,200,200,202,199,201,201,198,199,200,199,198,201,200,200,202,199,199,199,200,201,200,199,199,200,200,197,199,202,200,200,199,198,198,198,199,199,199,196,198,199,196,199,196,197,198,197,199,198,198,198,196,198,196,196,198,196,198,195,197,193,195,199,193,195,195,195,198,194,196,195,193,195,194,194,193,194,194,193,194,194,194,193,195,195,194,193,192,193,193,195,190,191,190,191,193,190,191,191,192,192,192,193,193,193,191,192,192,194,193,193,194,193,195,194,193,196,193,193,195,196,198,195,194,192,195,195,195,194,194,196,194,195,194,193,198,198,194,195,195,195,195,196,194,196,193,194,196,194,196,194,195,197,199,197,198,197,199,198,198,201,196,199,198,196,198,198,198,197,198,199,201,201,203,203,205,205,206,207,208,210,208,212,211,213,221,193,207,235,231,233,218,211,218,214,210,210,211,217,217,224,226,228,227,225,228,227,227,227,226,223,222,227,225,226,226,226,225,226,226,223,226,227,227,227,228,229,227,207,211,226,226,229,225,221,231,210,166,137,128,127,151,197,167,160,217,199,138,159,225,239,236,225,225,212,223,232,208,229,235,205,198,212,177,158,158,214,239,139,96,103,93,86,73,119,142,112,142,154,219,248,133,65,61,39,107,235,239,151,46,6,32,31,43,166,246,213,205,115,92,163,74,27,71,93,124,79,54,97,123,133,87,79,63,28,16,17,48,87,124,128,122,127,124,122,121,113,81,69,77,60,44,89,104,66,76,96,63,48,130,102,46,60,48,45,17,105,165,72,36,63,139,162,84,44,89,104,69,54,53,37,22,24,24,23,27,19,25,21,23,23,19,20,13,17,17,16,16,18,18,19,16,19,26,30,34,27,22,26,33,37,44,59,51,66,81,92,114,121,121,122,116,101,93,83,73,63,50,43,40,33,25,24,30,23,21,26,27,26,19,17,21,17,14,17,17,17,17,22,20,16,18,17,21,22,16,18,15,17,19,19,18,15,21,16,19,21,18,19,18,24,21,25,19,34,85,101,108,105,85,86,75,49,24,17,15,17,22,16,17,34,40,71,103,48,67,74,90,104,64,110,101,65,56,41,44,40,38,44,38,64,63,89,139,165,144,122,89,131,68,81,182,111,169,157,38,23,13,18,23,33,71,141,114,63,55,30,26,19,15,19,22,21,18,21,21,17,23,20,20,20,20,21,20,23,21,20,28,29,31,38,25,21,22,15,19,22,24,21,27,27,32,44,58,76,97,118,127,134,141,146,142,139,147,141,142,146,148,167,180,183,182,185,181,184,193,189,188,181,169,175,165,123,103,125,101,101,138,145,161,150,142,126,131,141,133,139,134,141,137,127,114,79,48,34,38,41,38,33,27,22,33,23,94,248,233,78,45,44,19,29,29,36,39,56,27,69,202,210,225,229,214,224,220,219,220,217,221,224,222,223,113,4,1,6,11,13,12,13,13,14,14,14,14,206,205,206,205,205,205,206,205,202,203,199,201,200,198,201,199,202,201,200,200,200,203,201,202,203,203,203,200,204,203,195,201,202,200,204,200,202,204,202,204,201,200,203,202,201,204,201,200,202,203,206,206,205,203,204,202,201,202,200,201,200,202,200,203,200,199,202,200,202,200,203,198,198,202,198,202,199,201,201,200,203,201,198,199,198,197,198,200,200,199,198,198,200,196,197,198,200,199,197,198,198,198,199,196,195,200,198,198,196,195,197,196,198,197,196,198,196,197,196,196,195,198,198,195,196,194,195,198,195,195,196,194,194,194,193,194,195,193,195,195,195,194,193,195,192,195,194,193,194,196,193,190,191,193,190,192,193,190,191,192,191,193,191,194,194,191,193,194,195,190,192,194,193,196,194,193,196,196,195,196,194,194,194,195,195,193,196,192,193,194,194,197,197,195,195,193,192,196,193,195,196,194,197,194,195,195,197,195,193,196,194,196,196,194,194,194,196,194,194,196,193,193,196,199,198,199,198,199,195,195,196,198,197,199,203,208,220,229,230,229,233,234,233,237,237,234,240,237,242,239,234,244,222,243,249,241,230,160,134,193,229,246,245,244,248,248,249,249,248,249,249,249,248,248,243,250,237,229,242,243,246,242,242,240,239,245,229,229,238,240,241,240,238,232,224,209,209,215,222,226,212,225,200,160,133,127,129,136,159,131,170,229,198,139,123,188,226,235,226,218,212,215,226,212,214,225,188,197,201,136,159,176,229,226,108,96,110,101,113,95,85,61,80,139,141,234,208,70,65,72,31,30,122,139,66,11,14,38,29,92,214,246,217,235,139,103,121,59,67,88,112,88,59,92,128,128,91,80,57,30,17,21,45,89,127,132,128,121,120,128,132,124,88,79,102,84,69,43,93,72,48,67,74,48,27,63,77,69,62,25,29,62,144,177,85,100,133,165,157,73,50,121,108,78,74,63,47,21,23,21,24,27,22,27,22,24,24,19,21,20,15,17,21,15,15,19,16,13,15,16,25,36,37,23,27,31,25,31,19,19,24,20,19,23,25,25,28,29,34,33,32,22,26,29,31,19,18,24,29,39,29,30,24,18,27,19,16,18,14,17,17,17,19,17,15,19,19,19,19,18,17,16,20,19,17,18,19,17,19,19,17,19,17,20,21,19,25,17,29,16,94,196,201,187,170,169,171,160,90,25,11,15,21,15,19,25,15,33,84,96,38,57,71,90,118,75,101,72,40,44,36,45,37,35,44,47,53,52,69,111,147,124,120,84,106,55,55,131,53,52,42,17,27,14,16,27,74,129,162,129,91,90,57,63,35,19,23,16,22,22,22,22,22,21,21,23,19,20,15,19,19,22,22,15,18,23,20,17,22,20,16,16,27,29,24,24,26,22,26,34,37,41,39,46,61,84,95,106,113,120,132,141,150,147,145,149,148,147,142,139,147,141,141,145,136,128,135,145,131,117,144,134,112,140,139,143,141,126,132,133,134,124,109,94,76,52,49,47,41,37,28,36,39,36,25,25,23,23,28,115,248,181,66,55,35,25,27,31,48,46,62,26,73,206,211,224,226,214,220,218,222,218,218,222,222,221,222,116,4,1,8,12,12,11,14,13,14,14,15,15,204,204,203,205,205,205,203,201,202,200,200,200,199,202,198,199,200,200,201,199,203,200,201,204,202,202,204,204,220,234,206,200,202,200,204,203,207,203,202,205,203,202,205,203,201,202,205,203,201,205,206,206,205,206,206,205,206,204,205,204,203,205,207,202,205,206,207,206,204,205,201,203,203,201,203,204,203,204,202,200,200,199,201,200,200,200,199,198,200,200,201,200,198,200,200,202,201,198,199,200,199,198,199,197,196,199,199,196,198,197,196,196,196,200,198,195,195,194,198,196,197,196,197,196,192,197,195,197,195,193,195,193,192,194,194,194,193,191,195,194,193,193,193,192,194,195,193,193,192,193,192,192,193,193,190,190,194,192,196,194,194,195,193,195,194,192,193,193,192,194,196,195,196,192,195,196,195,194,196,198,194,196,198,196,198,194,195,193,195,196,196,197,195,197,195,195,197,196,194,196,193,195,197,195,197,197,196,194,194,195,195,196,197,196,194,196,195,196,195,196,196,197,195,199,198,198,197,197,202,198,195,199,195,200,203,195,200,205,205,205,204,203,200,200,198,193,194,192,187,174,162,203,187,165,158,151,139,83,73,119,158,167,164,162,163,161,160,155,155,153,151,156,151,149,146,168,167,132,144,148,147,149,141,144,149,161,149,127,133,137,139,141,140,139,161,145,129,133,153,170,128,134,125,125,125,120,122,130,149,120,177,231,199,139,88,106,125,135,135,130,137,131,131,128,136,137,123,139,113,71,105,124,141,128,83,114,133,116,126,113,99,49,54,90,81,145,113,36,51,60,36,27,41,41,43,29,43,46,53,165,245,246,232,242,168,116,103,84,99,100,68,62,87,122,127,93,87,54,33,17,23,49,92,138,128,129,130,126,133,125,130,105,66,48,70,76,71,80,57,42,31,45,49,38,31,70,91,62,36,30,128,116,67,80,86,125,128,130,113,56,39,78,76,64,67,65,46,17,19,23,27,28,23,27,27,21,25,23,21,22,17,19,16,19,17,13,17,16,14,17,17,28,37,29,35,31,22,15,16,19,17,21,16,20,18,19,16,21,19,21,31,28,31,29,20,16,19,37,61,59,46,43,19,14,23,15,18,14,16,19,19,23,15,17,19,16,17,17,18,18,21,18,16,19,17,18,16,17,21,20,19,22,17,21,20,16,25,15,26,18,63,160,164,170,166,166,178,149,78,20,12,16,15,19,26,22,34,80,104,61,29,71,66,107,145,83,55,40,39,37,42,58,56,60,66,71,59,55,55,58,75,131,158,98,102,54,46,107,72,43,14,14,27,17,21,29,106,161,162,141,137,136,139,135,71,24,13,18,17,18,23,19,20,21,19,22,21,19,17,16,22,18,19,21,19,21,17,17,19,19,17,19,19,19,22,18,19,20,22,29,36,37,30,27,41,39,37,40,39,45,51,71,91,103,111,106,112,120,122,122,126,132,134,133,133,125,136,145,126,139,163,139,123,128,124,127,107,95,76,66,55,41,43,33,30,27,27,30,30,38,39,41,36,26,23,29,17,27,24,79,144,80,58,60,33,31,33,37,46,55,53,36,140,226,220,228,224,216,222,217,219,217,219,221,221,220,221,116,4,1,8,13,12,12,15,13,14,15,15,15,202,202,201,202,202,200,201,200,200,204,201,203,200,198,201,199,201,200,202,200,198,201,200,202,202,200,206,203,240,252,205,201,202,198,206,200,203,205,203,200,204,203,203,200,201,205,200,203,204,205,206,206,205,205,206,204,205,205,201,206,205,203,206,206,205,206,206,204,206,204,204,205,202,203,205,205,203,203,202,204,202,200,201,203,201,202,202,200,202,200,200,200,201,199,200,199,200,198,198,202,199,198,200,199,200,201,197,198,197,197,198,198,197,195,197,196,198,198,196,196,195,196,196,194,195,196,193,196,195,193,195,191,194,195,197,194,194,195,194,194,192,194,194,194,192,191,192,193,193,193,192,193,193,192,193,196,194,196,196,196,194,194,193,193,195,196,193,193,194,194,195,193,195,196,194,194,195,196,196,194,197,196,196,196,195,193,193,196,194,195,193,196,195,194,195,193,194,195,193,195,193,193,195,195,196,193,194,195,194,193,194,196,193,193,193,194,196,193,195,195,196,196,195,196,195,197,198,196,198,198,198,198,193,199,190,120,79,80,71,69,66,62,56,55,53,46,46,36,33,29,29,105,81,14,12,10,18,12,10,16,10,15,14,14,14,11,12,11,12,12,11,12,10,14,8,34,52,15,9,12,10,14,11,15,9,37,40,10,13,12,13,11,15,10,49,62,25,19,34,76,33,35,57,60,65,71,78,111,153,149,199,212,180,143,85,51,15,6,12,12,28,23,10,15,12,32,34,32,36,16,44,36,29,21,30,110,132,121,129,119,107,70,52,49,15,42,35,18,65,64,59,57,43,25,31,44,66,48,78,184,241,249,221,234,180,134,122,94,108,68,61,80,110,127,93,89,57,29,21,26,46,96,139,131,125,128,126,133,129,109,100,71,41,35,63,83,91,55,45,25,32,73,54,35,38,74,69,33,50,127,224,137,22,35,75,105,75,76,81,58,40,30,39,56,56,48,34,23,24,24,33,31,23,28,26,27,27,22,26,23,25,29,19,19,21,19,16,16,18,17,21,19,18,22,26,20,17,19,19,18,18,18,16,20,19,15,17,18,17,19,23,27,20,20,20,16,30,53,69,59,53,45,25,16,15,18,18,18,19,18,16,19,21,16,16,21,22,17,22,19,21,21,16,20,19,18,24,16,19,24,17,19,19,22,19,20,24,16,21,28,35,63,66,63,62,56,53,44,38,21,15,16,19,22,27,34,69,117,86,40,46,63,65,114,141,65,43,34,24,24,42,77,95,127,123,65,36,40,57,54,47,113,165,106,92,53,41,87,97,57,20,13,20,20,17,27,81,142,162,150,160,124,173,158,72,23,12,14,20,18,21,22,19,22,21,21,19,20,22,22,19,17,20,19,19,23,16,21,23,19,22,18,16,15,20,21,19,19,16,19,34,41,40,41,34,33,33,27,28,30,24,27,37,36,41,42,51,61,65,79,85,88,98,102,107,99,101,89,88,104,100,86,59,76,50,49,52,39,33,29,29,25,21,24,25,23,22,22,26,30,30,33,25,27,29,27,21,24,29,27,40,45,55,50,28,34,42,41,41,44,21,71,210,235,226,223,221,220,220,220,218,217,220,221,223,220,221,116,6,2,8,12,13,12,15,14,16,16,16,15,204,205,201,202,204,203,205,200,202,202,201,200,200,202,201,201,203,201,201,201,201,199,201,201,202,201,207,198,241,252,200,204,203,197,205,199,205,202,201,205,201,201,202,200,202,201,202,203,203,205,204,204,205,206,206,206,204,203,205,205,204,204,204,203,206,203,205,205,202,204,205,205,202,202,206,205,203,205,201,204,204,202,202,201,203,202,202,201,203,199,203,201,202,202,198,200,199,201,199,200,199,200,200,199,201,198,200,201,199,199,201,201,197,200,196,198,199,197,198,196,199,195,197,197,196,198,196,196,194,197,198,198,196,195,196,196,198,195,195,197,196,194,196,198,195,196,196,197,196,196,196,193,195,197,194,198,198,194,198,195,196,198,198,196,195,195,194,194,194,197,196,194,196,194,194,196,198,195,194,195,195,196,196,194,195,195,192,193,193,194,194,194,193,194,192,196,195,191,193,194,193,194,195,194,194,193,195,194,194,195,194,195,194,194,193,194,192,192,193,196,195,195,196,195,196,199,197,196,198,198,195,196,196,199,185,115,79,76,72,70,68,68,63,68,65,61,62,62,59,63,61,77,87,71,56,57,59,61,68,60,54,45,46,45,42,47,46,46,46,46,46,46,45,45,37,59,103,69,43,40,38,44,35,38,35,49,85,61,35,29,29,32,31,23,69,78,23,12,17,67,59,58,67,66,56,46,65,95,153,184,207,179,130,94,62,51,22,11,15,9,27,27,12,16,13,14,15,20,25,17,30,35,24,12,27,71,85,110,130,123,103,67,51,34,3,35,43,26,74,88,75,55,28,19,32,50,70,35,65,181,205,177,168,170,149,146,128,96,59,46,88,120,121,92,84,54,29,18,28,54,94,135,132,128,120,119,121,131,117,68,76,51,48,62,72,102,67,41,15,33,118,148,69,39,61,55,22,48,104,141,198,78,36,69,65,79,60,67,55,59,51,25,18,33,38,33,42,41,33,24,28,31,24,31,31,28,28,23,25,29,23,30,27,22,24,21,22,15,20,19,15,20,19,18,17,15,21,17,16,18,15,21,20,17,17,17,19,17,17,17,16,19,18,18,20,27,40,42,61,70,55,37,20,15,15,17,19,16,20,21,19,19,19,17,17,21,17,17,19,21,19,17,20,24,21,17,19,18,16,24,23,18,20,22,23,18,18,24,27,30,35,51,51,41,29,29,30,22,27,21,12,21,23,21,33,53,104,121,65,52,61,57,66,90,84,45,39,23,21,25,28,61,118,131,72,32,19,26,32,61,68,95,128,112,93,41,40,66,103,88,28,18,17,19,22,23,45,70,83,81,91,84,58,49,37,19,17,16,23,25,18,26,27,19,20,19,21,22,17,21,20,17,21,24,24,17,20,22,17,19,20,18,21,19,19,21,19,19,16,19,21,31,34,34,27,23,34,65,76,57,45,32,22,26,29,29,33,37,39,30,27,32,34,34,33,33,37,37,33,36,44,36,30,33,31,38,38,26,22,21,25,25,25,22,26,25,28,27,23,26,21,23,24,25,24,25,24,28,27,25,31,30,39,43,46,47,49,49,46,32,25,134,240,237,222,219,218,219,220,218,217,219,219,220,222,220,220,115,6,2,8,12,13,12,15,15,15,15,16,16,199,203,203,203,203,200,204,203,200,204,199,200,203,203,202,201,203,202,201,202,201,202,201,200,201,201,209,195,243,252,196,205,203,197,205,200,207,202,202,203,202,203,203,203,203,203,203,205,204,203,203,204,202,203,207,203,204,206,202,206,204,205,205,204,204,206,206,205,206,202,204,205,203,204,203,201,202,202,203,203,203,205,200,201,202,201,200,200,202,202,202,197,200,200,200,200,199,200,198,200,202,201,200,198,199,199,200,202,201,201,199,199,201,199,200,198,197,199,198,199,200,198,200,198,197,199,196,200,197,198,200,197,199,198,199,196,198,199,197,198,197,199,196,196,200,196,196,198,195,196,196,198,198,197,196,195,195,195,195,196,196,196,195,194,197,198,195,196,196,195,196,193,196,196,193,194,193,194,195,193,194,194,194,196,195,193,198,198,193,194,194,196,193,193,195,192,193,193,194,193,193,195,194,193,193,195,194,194,194,192,193,194,194,193,193,193,194,192,192,196,196,196,197,196,196,196,195,196,197,197,195,196,196,199,205,191,193,200,199,205,206,203,205,210,208,209,210,211,211,211,191,171,189,218,225,216,218,217,217,221,219,220,215,218,216,217,219,219,222,218,221,222,220,218,210,220,219,214,215,214,216,214,216,212,207,210,209,212,210,207,208,205,200,198,205,176,156,169,172,191,170,161,148,130,139,120,105,117,160,190,186,157,90,44,36,98,169,183,176,176,174,153,171,171,129,154,160,138,110,102,163,153,145,127,113,104,81,124,134,115,91,46,53,98,91,101,87,59,74,67,55,33,17,12,36,57,61,29,100,198,117,79,92,79,98,130,125,63,29,64,125,126,94,86,49,29,17,21,61,103,125,131,128,123,114,116,120,126,101,38,30,33,44,60,95,90,44,24,48,116,194,208,127,82,52,13,63,162,142,49,71,65,68,78,60,71,47,56,52,49,61,34,23,23,34,46,55,56,44,28,27,32,32,29,17,28,32,22,28,28,27,25,20,24,27,23,24,18,18,19,17,19,18,19,17,18,17,16,17,20,19,19,19,22,15,19,19,14,21,18,18,21,18,15,22,33,34,59,97,90,61,31,21,21,17,17,15,19,18,19,20,21,17,17,19,20,17,19,18,20,21,17,19,22,21,21,20,20,19,19,22,21,21,20,22,24,24,19,28,31,36,50,45,37,34,29,29,19,19,20,19,22,23,24,29,59,128,113,68,83,81,59,62,74,46,33,32,21,34,28,54,126,145,97,41,26,30,25,29,43,71,88,97,111,62,51,71,69,116,111,51,19,12,19,18,25,48,52,49,41,33,27,29,27,17,24,19,16,21,21,24,26,25,25,21,22,24,24,27,24,27,21,23,21,20,25,20,19,19,17,19,23,21,19,23,21,21,17,18,24,24,21,19,22,21,35,80,122,107,79,57,29,19,29,33,31,38,31,24,22,20,22,23,24,23,22,23,22,20,25,28,40,38,37,42,34,27,21,25,20,16,25,25,24,24,22,22,24,28,28,21,28,25,22,24,30,28,25,47,36,28,36,35,48,68,68,59,61,54,123,167,231,248,225,222,221,217,215,214,215,215,215,217,220,218,215,219,117,6,2,8,12,13,12,14,12,14,15,16,15,199,203,201,201,201,200,203,202,205,203,201,203,202,201,201,200,204,200,203,204,200,203,202,202,204,200,206,193,244,252,191,203,200,197,206,200,206,203,199,202,200,202,205,202,204,203,203,205,203,201,203,203,201,204,201,203,204,203,205,205,203,206,204,204,206,202,205,204,204,204,205,204,202,202,203,203,202,202,203,202,200,201,203,202,200,201,200,200,201,199,200,197,201,198,200,200,198,200,198,199,199,199,199,199,200,198,199,199,199,197,198,199,198,201,199,199,198,194,198,196,200,198,197,197,196,199,196,199,198,198,198,196,195,196,196,197,198,195,196,197,196,196,198,196,196,196,196,195,198,196,194,196,198,200,195,198,196,195,194,195,196,194,195,196,195,194,196,196,195,196,194,193,196,194,196,195,192,194,193,194,193,193,194,193,191,194,194,196,193,194,196,193,194,196,193,192,194,194,195,193,191,195,195,192,193,193,194,193,195,194,189,193,193,195,193,192,195,192,192,195,191,195,197,196,196,195,194,195,196,195,193,193,194,199,206,209,224,230,235,235,235,239,238,241,239,240,242,242,240,242,239,214,205,239,252,252,246,244,249,246,251,252,248,246,248,251,251,250,249,252,250,248,250,251,246,246,249,240,247,249,250,251,248,252,245,249,247,241,251,248,249,247,248,244,251,232,218,242,252,252,213,200,188,187,205,168,132,128,165,189,160,112,67,32,55,174,241,250,252,252,250,221,252,242,205,248,238,196,174,203,252,230,210,179,201,179,112,132,123,93,56,31,102,219,199,183,163,91,77,40,27,35,22,27,41,63,59,54,181,200,88,83,89,49,40,78,75,34,58,116,125,92,81,52,25,16,29,53,107,134,122,118,118,118,119,123,125,130,94,39,33,35,35,45,61,61,31,48,111,177,202,178,100,40,15,10,104,212,128,21,44,62,92,72,64,71,44,49,43,50,49,41,31,40,58,69,68,59,61,39,27,25,29,31,26,29,33,28,27,35,31,25,26,25,27,27,27,25,24,19,19,21,20,18,21,23,14,20,18,18,21,18,19,17,20,19,18,18,15,15,20,18,18,20,29,37,30,81,129,98,55,31,20,17,14,19,20,15,23,19,16,20,21,19,17,22,21,21,18,21,23,16,20,19,23,20,20,19,18,22,22,20,20,23,22,21,20,21,39,33,28,47,50,48,44,46,30,25,23,15,20,21,21,24,30,67,132,117,67,75,101,79,56,56,41,27,26,22,34,27,89,160,150,102,67,71,44,24,31,23,59,82,94,67,58,113,118,68,117,138,63,21,12,17,17,29,43,51,51,30,29,30,24,30,23,21,23,20,21,23,30,27,26,27,23,21,22,30,25,27,24,24,31,20,23,27,23,19,19,19,17,21,17,18,21,24,17,17,18,23,24,18,18,17,34,78,115,104,77,71,61,33,16,22,24,24,33,27,17,18,22,21,18,24,23,21,21,20,20,20,24,33,41,33,35,29,22,23,21,23,23,22,28,24,22,23,27,27,25,27,25,24,22,27,23,32,25,65,104,53,27,34,48,57,76,77,68,69,99,217,246,246,243,216,225,217,214,214,212,210,213,214,214,217,217,214,218,116,5,1,8,12,12,13,16,13,14,15,15,15,200,204,201,203,205,201,202,202,200,200,200,200,199,199,198,201,200,200,202,202,202,201,204,202,206,203,205,193,244,245,180,199,200,202,204,199,207,203,204,203,199,201,200,200,203,203,203,201,203,203,201,204,204,202,200,203,203,203,201,202,203,203,203,203,201,201,200,200,204,202,204,203,202,203,203,203,205,202,202,201,200,202,199,201,201,199,199,198,199,199,200,199,200,199,200,198,198,200,198,199,198,198,198,198,199,198,195,198,195,196,199,196,199,198,198,195,196,199,197,199,197,195,198,198,195,196,194,197,194,195,198,195,196,195,196,197,197,196,196,197,195,195,194,193,195,194,198,195,193,196,196,196,196,193,194,195,198,195,195,196,192,197,194,193,195,193,192,192,193,192,197,195,193,195,191,194,196,194,192,191,194,193,193,194,193,192,194,194,194,192,192,194,194,193,192,193,193,194,194,192,193,192,193,194,193,191,190,191,192,193,192,193,194,192,193,194,193,193,192,191,193,193,193,194,194,196,195,194,195,194,196,196,195,194,196,199,200,205,204,206,206,205,207,208,208,208,207,208,209,211,213,203,184,189,214,224,217,210,211,212,212,212,212,212,212,213,213,214,214,212,213,212,213,214,212,214,211,213,214,213,211,212,214,211,212,211,213,213,211,212,214,214,212,215,217,204,199,210,228,213,178,177,175,190,213,167,124,148,182,185,141,82,39,39,122,212,234,229,221,225,222,197,221,197,186,227,173,160,181,221,249,174,151,177,226,192,122,133,96,60,72,52,135,244,208,212,165,83,55,10,63,98,55,39,60,66,63,154,249,219,107,115,105,41,17,27,49,67,109,97,78,73,51,27,17,29,49,103,131,122,118,122,120,116,125,128,127,117,111,73,40,29,34,37,39,34,31,97,162,178,161,92,33,38,56,18,66,141,82,52,76,65,77,76,66,66,40,39,36,39,54,33,30,44,70,84,73,60,58,56,29,32,53,49,33,25,30,25,30,34,27,27,21,29,29,25,25,27,24,21,22,23,25,21,21,19,18,22,18,22,19,20,20,24,23,17,18,23,19,16,21,20,21,19,26,35,27,53,83,66,39,18,14,17,15,18,19,22,22,22,21,17,21,21,22,21,21,21,21,23,19,20,24,17,23,24,22,20,19,22,22,19,22,25,22,22,23,24,41,45,39,59,75,73,81,84,57,31,21,18,17,20,23,23,23,56,146,123,87,109,127,110,63,51,31,22,31,43,67,55,71,83,59,71,79,72,42,30,44,33,39,74,85,66,96,157,98,54,126,149,76,20,12,14,23,28,37,51,55,38,41,44,29,27,18,25,23,24,25,21,27,26,27,27,23,24,29,29,24,25,25,26,27,23,26,27,23,23,22,19,24,20,18,21,25,24,17,22,21,21,19,17,24,28,59,88,77,76,80,80,50,27,21,15,17,18,25,21,22,23,19,20,22,19,24,23,19,22,21,24,23,24,27,26,21,21,25,25,27,23,24,24,23,27,27,24,24,27,27,26,27,26,33,25,27,26,41,150,125,46,39,38,55,66,85,78,71,76,78,200,245,237,230,213,218,216,212,211,211,214,213,214,214,216,217,215,216,115,6,2,8,12,13,12,15,14,15,15,15,16,205,207,205,202,203,201,199,202,203,202,201,202,201,200,203,202,201,202,201,201,202,203,203,203,206,205,206,193,234,219,167,195,195,199,205,200,203,202,205,206,203,201,203,200,204,203,200,203,203,201,203,203,202,203,203,200,201,201,200,200,202,202,203,201,201,201,201,203,201,202,203,203,203,202,202,201,202,202,202,201,198,200,202,199,200,200,200,202,200,198,198,198,200,197,199,199,199,200,198,198,198,201,199,196,198,198,198,198,199,199,198,196,195,197,197,197,198,196,197,193,194,193,193,195,196,195,191,195,195,195,196,195,196,195,195,195,196,194,193,194,196,195,196,195,196,194,193,193,195,197,196,196,192,193,194,193,194,194,196,193,194,194,193,193,191,191,192,192,191,194,193,192,195,193,193,193,195,195,191,193,193,194,193,195,192,192,192,193,192,192,193,191,193,194,192,193,195,194,192,193,193,190,194,193,194,194,192,193,194,194,191,194,194,195,193,193,194,193,194,194,193,193,194,192,194,196,196,195,194,195,195,195,194,194,194,195,196,195,197,195,198,198,198,200,198,199,196,199,199,199,204,199,188,166,178,207,208,204,199,198,200,200,198,199,200,199,200,199,197,199,200,198,199,199,200,198,199,199,200,200,198,198,198,196,197,198,198,198,199,198,197,199,195,198,200,186,182,187,204,170,128,143,158,191,207,152,124,173,207,180,130,63,30,102,198,231,217,204,196,203,197,184,194,147,162,187,147,159,179,225,177,72,108,193,230,186,141,109,89,81,74,66,127,196,156,126,100,65,30,2,72,130,82,79,71,63,167,237,246,182,74,84,39,16,19,28,84,105,90,57,49,49,27,19,30,57,102,125,128,113,117,121,110,118,129,119,108,119,124,98,63,57,53,38,31,38,92,156,179,162,103,100,110,127,130,64,44,69,71,86,79,62,77,65,57,50,42,37,33,39,39,31,30,49,64,74,69,54,57,73,87,106,105,73,36,30,28,27,34,33,29,24,27,26,23,29,27,29,30,23,25,27,27,31,25,19,21,21,24,27,22,20,24,24,22,22,20,19,18,17,19,19,18,18,24,39,44,66,77,57,37,23,14,15,15,20,19,21,28,19,22,23,18,19,20,22,21,21,23,21,19,25,25,21,22,22,22,19,20,23,23,20,27,24,25,28,27,30,48,53,61,120,135,141,155,134,92,53,25,13,17,20,19,29,19,49,128,136,145,139,152,143,95,53,22,23,34,81,115,98,82,61,34,25,52,63,71,83,60,35,32,64,98,75,117,159,101,94,148,165,75,18,15,9,21,31,39,57,71,66,81,79,51,29,24,23,22,22,22,23,26,27,24,30,22,26,29,27,30,27,24,25,25,25,28,29,26,29,24,22,27,24,21,27,23,21,23,21,19,19,21,23,24,41,72,53,67,131,137,95,39,24,21,17,23,16,18,22,19,23,17,23,23,19,21,18,27,20,17,27,19,22,18,21,29,21,23,24,26,24,26,26,27,24,26,26,25,25,29,30,30,28,29,29,29,17,116,200,96,69,88,66,73,74,93,71,76,76,43,161,219,214,226,211,215,211,212,216,213,213,216,214,214,216,216,216,216,115,6,2,7,11,14,12,15,14,14,15,15,15,201,203,202,202,201,203,202,204,203,200,204,203,205,204,203,204,200,201,201,203,205,201,201,202,206,204,203,195,235,233,184,194,193,203,201,201,203,201,203,202,203,204,205,201,201,203,204,201,201,201,200,203,202,202,201,201,202,201,200,200,201,200,199,201,200,200,202,203,201,201,205,202,204,203,201,202,200,200,201,200,201,200,201,199,200,200,199,200,199,200,199,199,200,199,199,198,197,199,198,199,199,199,200,197,198,197,197,199,196,196,198,199,197,196,197,197,198,196,195,196,195,196,195,196,195,196,194,197,196,194,195,192,194,194,193,193,193,195,196,194,196,196,196,200,195,197,198,196,199,197,196,195,195,194,196,192,193,194,192,195,193,193,194,193,196,193,193,194,194,193,195,193,191,193,193,193,192,192,196,196,195,193,193,194,192,192,195,193,193,193,192,192,191,193,191,193,194,193,193,192,192,192,192,193,192,195,196,193,195,196,193,193,194,194,193,194,194,194,193,193,195,192,194,194,193,193,190,193,196,193,193,194,194,193,195,198,191,194,194,195,197,196,199,198,196,197,198,198,199,196,200,202,198,174,163,189,203,207,202,194,197,198,198,198,195,195,198,198,195,196,198,196,198,196,196,195,196,196,199,198,193,194,197,195,194,196,191,195,196,194,196,193,194,196,196,186,184,182,186,160,115,111,128,178,182,130,146,195,215,184,119,55,90,191,230,225,206,197,193,199,194,182,176,138,176,187,162,176,194,199,103,44,133,186,178,177,139,112,106,81,81,77,127,152,78,62,49,42,41,3,34,72,76,81,57,149,240,246,206,91,66,37,5,16,11,45,108,86,55,43,39,31,17,31,56,95,127,120,118,118,107,100,105,128,124,103,108,111,128,118,113,100,53,24,42,108,166,199,161,116,128,150,160,154,131,66,50,57,53,74,72,70,73,55,43,45,42,45,46,55,56,37,43,53,67,64,53,67,102,122,130,145,144,107,59,32,26,29,30,35,28,27,24,28,25,28,34,32,28,26,24,28,31,33,34,27,23,22,29,27,24,21,24,23,24,25,15,19,21,18,21,20,21,19,47,80,95,117,95,62,63,60,34,21,21,19,19,26,22,19,24,23,21,19,17,19,21,26,21,19,23,21,19,22,27,23,24,19,18,26,24,21,30,31,27,28,22,32,53,79,124,169,199,162,145,141,105,72,30,12,16,19,15,20,25,28,89,113,124,145,133,152,110,42,21,27,16,96,166,145,122,87,36,53,97,105,139,105,39,23,24,71,95,94,124,148,145,141,174,149,49,15,18,12,15,34,54,107,123,139,169,137,85,46,22,28,25,24,24,16,22,27,24,27,24,27,25,31,33,26,27,27,30,27,29,26,24,25,26,21,27,26,26,30,24,23,24,26,18,23,21,19,24,48,60,38,66,130,125,67,35,25,17,16,18,19,22,21,20,20,19,26,22,19,22,22,24,23,20,22,21,21,23,27,22,24,26,20,27,27,24,25,24,29,28,24,24,27,30,29,27,29,32,33,19,83,223,168,89,160,129,83,57,49,52,47,71,63,27,143,205,206,227,211,214,208,210,213,210,215,212,215,217,216,217,214,219,116,5,1,8,12,12,12,14,12,14,15,14,14,200,200,201,201,203,202,199,204,201,202,203,203,204,200,203,202,203,203,200,200,201,203,200,203,205,204,207,203,252,252,208,201,200,204,200,199,203,200,202,201,202,202,203,201,201,202,201,203,200,200,200,201,201,200,203,202,201,202,202,201,200,199,200,201,200,200,199,200,198,199,201,199,200,200,201,201,199,199,200,202,200,201,200,199,199,200,201,198,199,200,201,200,196,199,199,199,200,198,198,194,196,198,199,198,196,199,196,196,196,196,196,197,198,196,197,196,196,198,198,195,198,194,197,196,195,198,195,197,193,192,193,195,195,194,195,196,198,197,197,198,198,197,196,196,200,198,196,198,196,196,194,195,197,195,198,194,195,196,194,194,196,196,197,197,196,195,198,198,194,198,193,196,197,193,196,193,194,193,193,196,191,193,195,194,194,195,195,193,192,191,193,194,192,195,191,192,191,192,194,192,195,190,192,193,191,196,191,191,194,194,193,193,194,194,191,191,193,191,192,192,193,193,193,194,196,193,194,193,193,191,194,195,192,196,196,194,194,194,196,197,196,198,198,196,198,198,196,196,196,196,199,198,203,194,177,179,194,204,205,202,195,196,196,198,196,195,197,196,196,195,195,194,193,196,195,193,196,195,196,201,197,196,196,192,196,194,193,193,193,194,193,195,191,191,196,185,193,183,170,159,135,119,132,167,132,100,155,208,217,201,134,126,189,224,227,207,193,198,193,197,192,174,166,157,192,177,151,159,184,187,93,54,97,100,124,177,164,123,109,71,79,114,127,92,37,27,35,78,49,23,38,50,54,66,160,227,244,195,96,92,97,65,14,12,33,92,95,49,50,37,22,22,26,53,79,106,117,118,125,102,96,96,105,115,107,108,108,113,122,117,111,79,36,20,79,169,188,156,81,81,118,130,120,78,56,51,44,44,45,57,72,61,56,41,41,45,46,46,61,81,63,30,41,56,75,112,103,111,122,123,122,122,129,103,70,35,22,31,26,31,34,26,29,25,27,33,29,36,25,25,30,25,37,29,33,29,19,31,34,30,29,25,29,27,24,25,19,19,18,23,23,17,29,18,110,193,171,171,146,134,145,124,83,30,10,17,18,22,24,23,22,22,21,21,21,19,18,29,21,20,27,20,22,22,24,26,26,22,21,30,29,27,29,23,29,29,31,33,66,71,112,174,137,134,131,114,89,55,40,18,14,18,18,19,19,27,73,85,94,84,110,148,74,34,26,21,31,129,148,93,113,105,53,107,148,145,146,77,26,26,15,61,97,107,122,130,128,136,139,93,36,21,17,17,24,50,117,176,212,172,173,167,116,72,33,31,21,17,24,18,21,23,29,22,21,25,27,34,30,33,25,27,34,28,26,27,27,26,21,24,28,28,28,29,22,24,30,24,28,18,25,25,22,45,56,54,76,88,65,47,27,17,21,15,19,21,21,19,19,27,19,23,20,20,25,20,21,24,26,21,20,25,26,21,27,26,24,24,31,24,21,27,27,28,30,27,25,29,28,31,29,27,38,18,84,236,239,119,97,177,149,88,68,40,41,38,66,50,25,150,212,211,227,211,211,210,214,214,211,215,215,213,215,218,217,216,218,115,5,2,8,11,12,12,15,13,14,15,15,15,200,200,202,200,201,200,201,202,200,201,203,202,202,201,201,202,199,200,201,200,203,200,200,203,203,203,203,206,252,252,230,209,203,202,201,201,202,200,200,201,200,200,200,198,201,199,199,200,199,199,200,200,200,200,200,200,201,201,200,200,202,200,199,202,199,198,200,200,200,198,198,197,199,199,200,200,199,197,198,200,199,199,199,198,200,199,198,198,197,199,199,197,200,196,198,200,196,198,196,196,199,198,196,196,198,198,198,196,195,196,196,197,195,196,195,195,198,195,196,196,196,195,195,197,195,196,194,194,195,194,196,194,195,197,196,198,199,196,198,197,199,197,199,202,196,197,196,196,196,194,198,195,194,197,197,195,195,198,196,196,199,196,197,195,196,198,196,197,195,194,195,195,195,197,195,195,198,195,193,193,193,191,194,194,196,193,193,194,193,195,193,193,193,195,193,193,196,190,193,194,193,194,192,194,192,193,193,193,194,191,192,193,193,195,192,191,192,192,192,191,193,191,193,193,193,194,192,196,194,193,193,193,196,195,195,195,193,199,197,196,199,197,196,197,197,195,196,195,196,198,194,198,201,199,197,184,185,199,204,205,203,198,199,202,199,196,194,196,198,195,196,195,195,194,196,194,193,192,195,199,195,194,195,193,192,196,193,193,195,194,195,197,195,192,195,186,201,184,160,164,151,164,184,180,116,105,181,210,215,194,179,197,219,226,206,197,192,194,196,197,198,170,162,182,180,114,105,162,213,184,72,43,35,63,160,207,170,119,99,69,78,100,95,48,4,67,114,93,68,48,78,62,61,174,228,245,191,96,88,107,132,84,37,66,121,132,84,49,40,23,13,37,47,79,114,108,110,113,119,106,96,101,101,108,105,109,115,110,108,92,71,52,24,28,131,188,134,76,8,42,104,103,87,46,80,76,41,42,46,57,61,51,47,43,38,47,42,33,45,64,46,34,57,103,155,155,133,117,110,113,107,110,94,59,36,23,22,34,27,30,35,31,31,27,29,29,31,28,29,28,28,32,34,36,33,29,23,32,44,36,36,35,33,33,32,29,19,21,22,23,22,16,25,24,95,191,191,205,180,180,189,191,122,25,8,12,15,20,22,22,19,21,25,22,26,22,21,24,20,29,24,26,26,24,25,29,25,20,24,24,29,26,24,27,27,35,26,32,70,75,89,103,139,118,110,110,71,64,43,21,16,12,19,20,25,24,74,107,104,109,119,104,55,45,29,22,33,101,98,47,101,129,60,95,146,83,53,40,22,29,16,61,83,114,114,107,127,92,121,109,35,19,19,14,32,50,118,206,160,150,146,122,108,61,40,43,23,15,23,21,27,22,25,28,24,26,34,39,28,30,27,27,31,29,34,27,29,27,25,28,24,33,29,25,29,28,27,26,29,29,23,21,25,57,72,93,118,77,57,47,33,24,19,18,21,24,20,19,22,23,23,24,21,24,24,22,23,21,25,18,27,25,22,28,23,26,25,28,31,24,25,29,25,28,29,31,29,29,28,30,31,32,28,58,212,242,182,67,72,134,104,74,76,49,39,45,62,25,38,190,226,219,224,209,216,213,217,216,213,213,213,215,213,214,218,214,218,116,5,1,7,12,13,12,15,13,14,15,15,15,202,202,201,200,201,199,199,204,203,200,202,200,200,199,202,201,199,202,201,202,200,201,202,200,201,201,201,200,253,253,243,213,198,202,200,201,201,198,203,201,200,200,200,201,201,200,200,202,200,200,198,199,200,201,200,199,201,200,201,199,198,203,200,200,200,200,200,200,200,200,200,200,200,200,200,200,199,199,201,198,197,198,199,200,197,198,199,199,199,197,197,197,198,200,196,196,198,199,199,197,198,198,197,196,196,201,199,198,198,198,198,196,198,195,196,199,196,198,196,196,198,196,200,196,196,199,197,198,198,199,198,199,198,196,196,195,194,194,196,195,196,196,197,198,195,195,196,196,196,196,194,197,198,194,197,196,196,198,195,196,196,197,198,196,196,194,193,195,193,196,193,193,194,193,197,195,198,195,193,196,194,196,193,193,195,193,195,194,193,191,192,196,193,193,191,194,194,192,195,191,193,193,191,193,193,194,191,193,191,192,193,189,192,191,193,193,193,193,194,193,193,191,192,193,193,192,194,196,195,195,195,195,196,197,197,196,196,195,195,198,195,198,196,198,201,197,198,197,198,197,197,196,200,200,201,192,181,191,200,208,205,204,201,207,206,198,199,196,197,195,196,198,196,199,196,194,197,193,193,197,196,193,194,193,194,195,193,192,195,195,192,195,194,192,196,187,200,177,148,159,166,181,204,191,129,151,205,208,202,184,188,211,217,207,199,197,192,193,192,202,198,155,172,200,125,60,131,215,230,139,47,23,9,112,210,213,141,67,70,54,73,74,59,21,34,203,192,87,57,74,66,84,196,244,243,157,90,84,103,116,74,116,132,144,134,102,89,39,20,20,40,65,75,117,118,107,113,108,108,104,113,99,103,112,110,111,104,100,90,81,50,36,18,60,152,139,69,14,15,83,141,129,72,76,114,78,42,38,47,51,42,46,48,39,41,40,33,34,29,44,66,83,138,150,164,157,114,103,99,102,82,53,43,25,19,17,17,27,32,29,30,31,30,28,29,31,27,31,31,26,29,27,35,38,33,32,29,31,39,44,31,36,41,32,41,30,24,29,24,26,24,23,23,24,60,111,129,148,135,139,144,102,53,22,11,15,23,17,26,21,28,27,24,27,24,27,19,28,29,26,27,30,31,27,26,27,20,18,27,29,29,26,27,30,31,25,33,29,71,58,80,114,95,118,92,81,62,63,49,17,19,16,21,22,25,25,53,110,133,116,96,108,81,46,26,20,31,73,95,97,155,144,117,152,135,78,61,42,25,25,29,70,88,112,93,126,151,150,188,101,17,20,9,20,24,46,101,133,171,115,108,118,84,67,42,41,22,14,23,20,27,18,24,26,26,31,32,34,24,31,35,24,28,32,28,30,34,29,30,29,30,29,22,32,26,26,30,27,30,29,20,33,29,85,150,150,160,117,86,87,71,56,32,19,18,17,20,26,24,24,19,26,22,19,30,26,23,23,24,24,24,27,26,23,27,22,25,30,22,29,27,28,24,29,29,24,31,29,29,33,32,44,41,104,231,211,103,66,64,72,59,61,66,45,41,44,39,12,123,233,232,221,217,213,214,213,214,214,211,214,211,213,216,215,218,214,218,115,5,1,7,12,13,12,14,13,14,15,16,15,198,199,201,200,201,199,200,198,198,201,198,198,199,199,201,199,201,197,200,201,200,203,200,202,200,203,200,198,241,250,206,197,197,195,198,200,199,199,199,198,200,200,199,199,199,199,199,200,197,198,201,200,199,200,200,200,199,200,201,200,201,197,198,201,200,201,200,200,199,199,200,200,199,200,198,197,200,199,199,198,197,199,199,198,198,199,198,198,198,200,196,199,200,196,198,199,198,196,198,198,199,198,195,196,196,196,197,195,198,197,196,199,198,200,198,195,198,196,199,196,197,198,200,198,194,198,200,198,197,198,198,197,198,199,194,198,198,197,198,195,199,196,196,198,195,195,195,196,196,194,197,193,194,197,194,195,194,196,195,194,197,195,197,196,194,194,194,192,192,194,194,195,194,193,192,193,192,195,194,194,194,191,195,191,194,195,195,194,192,195,192,193,193,194,192,192,193,190,194,193,193,191,190,193,189,191,191,191,191,193,193,189,190,193,192,190,192,192,195,193,191,192,192,195,194,193,192,196,195,193,196,198,196,195,196,196,196,198,197,195,197,196,199,200,200,199,199,198,197,200,199,197,198,199,200,199,188,181,193,205,206,199,188,197,204,200,200,199,198,198,197,195,197,198,198,197,196,197,196,195,196,194,196,197,194,195,193,193,193,192,194,196,196,192,193,189,191,153,117,139,169,191,207,171,126,172,216,198,189,189,197,203,200,199,194,196,193,191,197,200,190,151,184,184,101,65,181,236,157,81,48,9,37,174,239,179,107,46,40,45,56,50,25,19,89,193,136,68,73,41,97,204,245,245,141,80,82,105,107,44,84,144,143,122,98,83,46,25,25,35,79,125,117,117,113,114,114,95,102,111,112,101,105,117,110,106,94,81,101,83,41,26,16,85,119,64,8,37,97,119,110,65,76,73,87,74,44,44,36,43,40,39,45,38,35,34,29,32,33,43,108,154,159,162,139,118,116,103,71,50,36,35,42,45,32,15,16,22,28,38,33,29,29,28,33,24,30,29,31,29,28,34,32,37,33,29,29,33,42,39,41,35,38,39,35,34,29,25,25,24,30,29,24,33,49,59,57,51,36,37,35,31,27,17,17,15,19,24,17,25,30,25,31,25,23,31,29,31,29,27,27,27,28,25,33,30,25,22,30,26,30,31,22,29,29,33,27,36,58,69,77,66,105,64,62,65,48,65,41,26,19,14,19,24,29,30,29,58,83,69,102,147,92,53,36,27,24,39,65,107,120,96,90,98,82,94,102,50,22,25,44,97,102,139,94,90,144,151,141,42,14,17,16,19,36,39,79,151,106,112,101,84,87,49,50,45,22,22,20,23,24,28,29,22,28,27,33,31,29,34,25,31,35,31,27,31,33,29,27,24,32,29,26,28,27,29,30,29,28,27,22,33,31,96,166,183,189,173,144,168,167,135,50,15,15,20,21,26,27,21,26,24,24,22,29,22,27,24,21,27,22,28,22,25,28,19,26,31,24,27,29,28,29,24,22,32,24,28,34,32,41,47,74,141,177,134,96,98,75,49,51,46,54,49,41,40,29,114,235,242,228,215,217,214,214,211,212,211,214,212,214,214,216,216,219,213,215,117,4,2,8,12,12,12,15,13,15,16,15,15,196,199,196,199,199,198,198,198,199,195,199,199,198,200,199,198,199,198,196,197,198,198,198,198,200,198,200,193,249,240,173,187,192,194,197,198,200,193,199,199,200,200,198,199,198,198,196,199,198,200,198,198,200,200,200,198,198,197,198,198,198,201,199,199,199,197,199,197,198,200,200,200,199,197,197,195,199,199,196,198,197,197,198,199,198,198,198,198,197,198,198,196,199,196,194,197,198,198,198,197,198,198,197,198,196,196,196,196,197,195,198,196,199,199,197,200,195,196,198,198,197,193,199,197,197,199,197,198,198,200,198,199,198,198,199,200,199,199,200,198,199,198,196,196,195,198,198,196,194,196,197,195,196,194,192,194,195,194,193,195,194,196,193,192,193,191,193,193,191,193,193,193,193,190,194,190,191,192,191,194,193,191,193,193,193,189,191,191,192,192,191,193,191,193,191,191,192,189,193,190,191,192,188,190,191,192,193,193,191,189,194,190,194,191,191,191,189,193,191,193,195,193,194,196,195,196,199,198,199,195,193,196,197,195,196,199,196,196,197,194,197,198,196,198,197,198,199,199,199,200,200,195,197,198,199,198,197,184,181,196,201,196,162,170,197,199,201,197,199,199,197,200,195,198,198,194,198,196,195,196,196,196,195,194,194,194,196,193,195,195,193,194,192,192,197,191,181,146,110,113,150,186,191,149,116,179,203,189,197,193,201,198,196,197,191,194,191,195,194,203,174,144,194,185,87,64,128,126,60,47,90,55,98,237,235,166,121,55,52,34,34,40,12,39,92,101,59,53,45,113,214,249,246,146,82,86,107,93,57,79,108,122,84,90,79,33,29,15,38,78,130,143,117,120,113,109,104,105,111,106,105,103,116,114,103,94,83,92,113,103,56,24,30,73,61,21,43,157,170,84,97,100,94,81,84,99,53,25,31,34,43,45,35,33,35,30,26,30,33,31,103,155,152,128,126,112,78,53,27,31,35,39,53,54,47,34,21,19,21,32,39,32,32,30,26,32,32,27,32,29,30,35,30,33,33,30,31,36,42,45,37,36,44,42,35,35,25,27,27,25,28,27,26,31,45,49,50,40,26,31,25,23,29,19,16,18,20,22,21,22,33,34,27,32,25,26,31,29,29,27,32,27,27,27,27,32,24,28,30,26,32,27,24,32,33,29,30,33,58,59,74,83,45,79,55,35,47,54,57,23,19,21,18,24,29,25,31,71,89,115,149,163,130,66,39,27,22,27,27,33,66,59,55,56,47,51,44,21,24,22,73,120,118,171,110,65,56,56,57,26,16,21,15,27,22,34,79,66,100,58,61,63,35,59,48,40,22,25,23,17,30,26,29,26,26,30,29,33,32,30,26,31,32,33,33,30,32,25,25,24,31,34,23,30,29,28,30,29,29,29,27,29,29,69,123,144,167,162,176,174,168,129,42,21,12,16,18,21,26,23,26,23,28,33,29,24,28,24,24,27,24,25,28,24,24,26,23,30,27,27,25,29,29,29,29,27,29,30,36,36,39,48,117,170,161,130,128,140,111,93,81,63,55,42,37,27,125,229,251,249,219,220,219,214,212,211,212,213,213,216,214,214,217,216,218,215,220,115,4,2,8,11,12,12,14,12,15,16,15,15,195,197,198,198,196,195,199,199,198,199,198,199,201,197,198,197,199,198,199,200,196,198,195,195,198,200,202,195,250,237,171,201,199,198,199,197,199,198,200,201,201,200,199,200,199,200,200,200,199,199,200,200,199,199,199,199,201,198,199,200,201,198,198,200,196,198,199,200,199,197,199,198,200,199,200,200,198,198,198,198,198,196,199,197,194,199,197,198,196,198,198,199,199,198,197,197,198,199,199,198,199,199,199,198,199,198,200,197,198,200,198,199,196,198,197,195,200,196,198,198,198,197,196,198,196,198,199,197,198,198,198,197,198,200,198,201,198,198,199,198,198,198,199,199,196,197,197,196,195,193,195,194,196,195,195,196,193,193,193,194,196,192,193,191,191,193,192,190,191,194,194,192,193,192,191,193,192,192,186,190,191,191,191,188,193,190,190,189,190,192,189,192,193,193,191,193,191,190,193,190,191,191,190,193,190,190,193,190,191,193,195,191,191,194,193,194,192,192,193,194,192,197,195,192,198,196,198,199,196,198,198,196,196,194,195,196,197,198,196,195,193,195,194,192,197,196,197,196,198,198,197,198,199,198,199,199,200,195,179,184,199,200,174,171,196,200,201,198,194,198,199,198,198,195,196,199,196,196,197,195,196,195,196,192,195,197,196,196,196,193,191,191,193,196,201,191,169,158,143,120,159,169,154,126,122,183,190,195,203,201,200,193,196,197,192,196,193,192,201,198,165,160,214,164,70,36,16,57,63,126,179,95,159,231,222,180,117,55,39,9,55,149,64,98,196,116,34,37,141,227,243,242,139,84,87,114,84,56,97,113,146,108,61,60,39,24,19,39,79,124,139,123,120,112,103,100,104,115,108,104,104,106,116,104,89,83,88,104,118,124,103,73,54,48,16,63,117,201,160,69,142,147,134,110,120,108,48,30,25,31,35,40,39,31,27,33,30,29,24,25,56,84,94,93,69,43,36,31,28,28,36,34,33,37,50,56,33,17,23,24,35,35,32,32,27,30,23,29,33,28,31,35,34,33,36,33,29,36,47,45,36,41,43,45,41,35,30,25,29,28,24,31,26,27,38,50,49,39,36,35,29,25,24,20,17,16,24,25,22,27,33,30,35,37,33,33,29,32,35,33,31,34,29,27,35,29,32,33,27,35,34,28,30,31,27,31,31,37,57,87,108,69,112,100,53,64,52,74,69,49,26,14,24,24,27,28,73,156,166,152,159,160,118,66,45,36,21,30,27,45,107,89,77,107,84,36,19,22,29,29,110,107,125,182,150,143,84,60,58,16,19,16,15,22,29,48,66,88,45,67,66,33,52,45,49,45,22,21,21,22,27,26,29,30,28,25,32,29,27,30,25,32,29,30,32,31,27,31,33,26,35,29,29,29,25,33,28,26,33,29,26,31,37,50,66,66,75,93,100,97,60,48,34,17,16,18,22,27,22,31,27,29,34,37,36,31,27,25,28,27,32,34,27,25,29,26,27,27,28,32,28,29,28,28,31,28,31,34,33,33,36,48,93,131,144,149,150,154,147,146,139,108,69,44,24,18,159,233,249,238,224,227,224,219,219,217,215,215,216,214,218,216,215,216,218,216,215,115,5,2,7,11,13,12,14,13,14,14,15,15,196,200,198,198,198,196,197,199,200,198,199,199,198,198,199,195,198,198,196,199,198,198,198,199,199,197,202,194,249,238,175,214,203,198,200,197,201,196,201,199,199,201,199,200,199,199,200,199,198,198,198,200,199,199,198,199,201,200,201,200,198,199,200,198,198,198,197,198,200,199,200,198,198,198,200,199,201,198,198,200,198,198,197,198,198,197,199,198,197,198,197,197,198,197,199,200,197,198,196,196,199,198,198,198,198,200,199,198,198,197,199,196,199,197,196,198,198,198,196,194,195,195,198,197,197,198,198,198,197,196,195,197,197,198,197,195,198,198,198,196,196,197,198,198,196,197,196,195,196,194,195,195,194,196,198,193,192,195,195,194,192,193,193,193,192,192,196,191,191,194,192,194,193,191,195,192,192,192,192,191,192,191,191,192,191,191,193,193,191,192,191,194,194,192,192,191,194,192,193,191,192,191,193,193,192,192,189,191,191,191,193,191,194,192,191,194,193,196,195,194,195,194,196,197,194,194,196,195,195,198,196,197,194,193,194,194,193,193,195,193,194,196,193,193,195,197,195,198,197,195,196,196,198,198,196,199,199,198,192,177,186,203,200,200,200,200,203,201,201,197,195,198,197,198,196,196,195,194,196,194,195,194,193,195,196,194,193,193,192,192,195,193,192,197,203,189,159,151,160,156,176,156,128,134,148,195,200,204,220,207,199,194,195,196,194,195,194,196,204,192,171,207,208,122,58,33,18,116,188,207,215,148,219,223,174,168,96,36,21,9,169,225,81,139,179,85,64,157,235,245,214,112,89,85,109,71,53,99,132,141,109,94,57,30,24,21,38,84,132,134,125,116,116,110,95,110,112,109,104,104,110,98,106,92,75,92,102,121,126,124,112,74,39,13,73,155,100,97,71,60,117,113,112,113,110,76,48,31,19,34,36,33,31,34,29,24,31,29,28,31,28,39,43,37,34,22,29,25,17,27,33,32,31,43,54,59,55,38,27,20,31,43,37,31,29,29,29,33,30,32,32,27,30,33,33,35,31,37,46,42,37,38,38,43,46,34,29,27,25,28,31,26,29,27,40,55,59,58,58,61,44,31,18,16,18,17,21,25,24,27,31,31,36,33,31,29,32,35,33,33,33,31,35,30,34,37,31,34,35,35,28,29,31,27,31,31,36,31,78,154,163,148,161,168,146,165,165,166,171,103,31,10,15,21,24,39,154,199,128,99,63,52,51,47,60,52,36,34,24,101,171,158,181,182,111,36,14,29,29,88,117,81,81,97,126,152,149,134,72,26,24,12,24,24,36,71,108,76,115,104,66,71,49,77,76,69,34,18,20,19,33,26,28,31,26,33,31,31,33,30,32,32,27,31,30,32,29,26,33,32,33,27,25,31,30,27,26,29,29,28,29,27,41,53,54,57,48,34,31,29,26,31,24,17,18,17,24,24,25,27,29,32,35,42,35,35,33,26,29,28,33,36,35,29,24,32,29,27,32,27,34,34,32,33,31,34,32,40,33,85,124,66,66,67,84,110,122,141,141,160,165,144,78,45,24,37,203,234,249,246,235,244,238,235,235,229,228,224,219,218,213,214,215,214,218,213,214,114,5,2,7,11,12,12,13,12,14,14,15,15,194,201,200,200,199,199,198,200,198,198,199,198,200,197,197,199,199,196,198,200,196,200,198,199,202,198,200,196,247,232,153,202,202,197,201,197,203,199,199,198,196,197,196,200,198,198,199,199,199,196,199,198,199,200,198,200,200,200,201,199,200,200,198,202,201,200,200,199,200,200,201,198,198,198,200,202,197,200,200,198,198,196,201,199,197,202,196,198,198,198,199,197,198,198,198,199,198,198,197,198,198,196,198,198,199,197,198,198,196,199,198,199,198,200,199,196,199,195,198,196,196,198,198,198,198,199,198,195,197,198,199,196,198,197,196,198,194,195,194,196,196,194,194,193,194,195,196,196,196,195,194,193,196,194,194,194,192,194,192,193,195,191,193,191,192,192,192,194,193,193,191,192,193,190,193,194,192,192,192,194,194,190,191,189,192,193,194,193,191,193,195,194,193,192,193,193,193,191,192,192,193,194,191,191,190,191,193,190,191,191,194,190,189,191,192,195,193,193,193,194,193,194,193,192,196,194,193,193,193,195,195,197,195,193,195,194,192,192,195,194,194,194,194,195,192,193,195,194,195,192,193,194,193,195,193,192,197,198,198,186,177,191,199,204,204,196,198,202,202,200,198,194,194,195,193,193,193,194,194,195,194,194,196,191,193,194,192,193,191,192,193,193,193,201,201,184,148,118,149,168,175,151,106,124,170,221,195,193,222,224,198,194,193,194,194,194,194,198,210,188,158,217,169,73,91,73,34,126,128,140,148,164,243,200,113,107,50,19,5,27,142,129,54,95,71,63,175,245,244,190,90,86,89,99,59,63,109,137,141,90,97,75,31,21,23,43,84,131,134,120,116,119,119,96,105,118,105,103,105,111,106,97,95,79,84,106,121,120,110,103,57,27,13,29,148,196,63,21,51,70,79,95,114,113,87,40,35,67,45,29,26,33,31,36,38,25,24,28,27,26,26,29,19,27,31,23,24,18,26,24,27,35,41,52,57,63,60,60,50,32,39,57,66,59,57,57,59,57,53,54,57,54,57,61,56,58,56,57,60,66,66,57,63,68,61,55,52,46,71,41,42,44,44,42,54,89,97,88,112,100,66,49,24,14,21,21,22,20,21,24,31,34,31,36,31,31,29,26,36,29,31,37,30,28,27,29,27,25,30,28,30,29,27,27,35,30,36,29,67,164,186,141,149,147,159,171,161,181,127,103,24,15,12,15,30,40,137,149,73,45,46,54,56,72,76,64,60,51,29,136,153,99,115,128,117,34,26,31,74,107,96,100,59,41,42,64,78,118,116,46,21,13,30,24,66,151,166,170,180,193,171,170,178,171,190,136,40,15,14,15,27,27,29,33,36,33,33,33,37,34,39,44,34,34,33,28,34,33,36,31,38,31,27,28,27,29,28,34,31,32,34,29,36,44,53,57,41,27,26,28,24,29,20,20,23,24,25,24,28,29,29,36,41,40,45,38,29,32,30,28,29,42,38,25,29,29,32,30,35,29,34,39,33,33,35,35,41,36,64,197,215,137,105,64,57,78,78,85,115,146,151,120,67,96,127,160,237,222,220,239,235,238,236,237,233,235,233,232,227,219,220,212,215,217,216,214,216,116,4,1,7,11,12,11,15,12,14,14,14,15,197,200,198,196,199,199,198,196,196,198,198,199,198,198,198,197,200,197,196,196,196,199,199,199,200,198,200,196,246,214,105,170,188,191,203,195,200,196,198,201,197,198,198,200,198,198,200,199,200,199,199,200,198,201,200,200,199,199,201,200,201,200,199,200,198,200,200,198,198,198,200,199,200,200,198,198,198,197,197,197,199,199,198,200,198,197,199,198,197,198,198,197,198,198,196,195,197,197,198,198,198,198,198,199,197,196,200,199,198,199,199,198,200,198,198,198,196,196,199,197,199,198,198,199,195,196,196,195,196,198,198,197,195,195,196,197,196,196,197,195,194,195,196,195,192,193,195,194,196,194,194,194,193,194,192,192,193,193,193,191,191,195,194,195,195,191,196,193,191,192,193,192,192,191,194,194,191,193,193,192,193,191,193,191,191,193,193,191,192,194,193,193,191,193,194,191,193,192,193,192,191,193,192,192,190,190,189,191,196,194,192,192,193,191,189,193,191,191,191,191,193,191,191,192,191,191,194,193,194,191,193,193,192,194,193,195,191,192,195,195,193,192,193,194,193,193,193,193,193,192,193,191,194,192,194,195,191,194,196,195,181,177,189,198,199,199,194,195,200,201,200,197,197,195,193,191,192,191,195,193,194,196,191,193,196,192,194,200,194,202,205,198,198,207,200,174,138,98,134,156,163,141,85,108,188,235,189,119,146,202,200,195,196,191,194,192,196,203,202,157,121,132,77,89,166,117,37,37,55,34,74,158,237,151,24,7,14,29,7,54,110,61,111,165,129,182,240,242,167,86,83,96,89,51,72,111,145,140,90,76,81,61,26,19,47,91,127,136,122,116,117,114,106,101,105,104,96,105,114,105,97,88,86,97,97,116,121,112,76,36,50,65,89,92,136,130,42,76,68,78,86,94,116,68,21,23,111,125,60,24,19,31,29,30,29,25,23,25,33,26,25,31,21,27,24,24,24,24,29,22,22,33,34,42,60,70,71,64,76,57,65,139,167,175,173,171,171,178,178,178,173,165,171,164,168,168,163,157,161,180,191,186,180,185,169,155,155,147,127,131,137,140,142,91,146,188,171,173,151,151,134,87,46,41,47,53,50,50,58,57,67,82,78,78,82,74,74,71,77,80,76,80,76,75,72,68,65,61,66,63,61,61,62,63,62,61,63,57,101,166,128,47,27,41,41,48,49,49,57,41,27,14,12,17,29,25,68,104,67,71,69,89,107,109,95,64,81,57,74,122,66,33,14,53,109,61,34,76,83,100,133,108,101,81,57,57,28,88,143,65,20,11,27,19,92,189,201,170,167,170,177,185,179,186,188,133,33,13,16,12,22,24,28,34,31,33,29,31,34,28,42,38,31,31,33,31,30,33,32,33,34,27,24,31,29,29,25,33,23,26,31,21,29,37,48,52,45,37,43,39,23,27,18,24,25,20,27,25,29,30,32,34,47,45,42,44,28,33,29,30,35,34,39,31,32,32,32,31,32,31,33,36,36,33,36,41,39,41,51,156,189,141,141,101,87,101,81,60,49,80,83,53,37,121,179,182,190,144,160,179,181,186,178,180,185,200,214,224,229,231,227,221,219,217,217,212,214,115,4,1,7,11,12,11,15,13,14,14,15,14,194,196,196,196,199,194,193,198,197,197,197,198,196,193,199,198,198,195,196,199,194,200,197,197,199,197,201,199,249,217,91,160,183,187,203,192,196,193,199,200,198,199,195,198,198,199,199,198,199,198,200,202,200,199,198,200,200,198,201,199,199,199,196,199,200,196,198,199,199,197,198,197,198,198,195,199,196,196,194,196,198,195,198,199,196,197,193,195,195,195,196,197,198,198,199,198,196,196,198,198,194,199,196,196,198,195,198,196,199,198,194,199,195,195,195,194,196,196,196,194,194,195,196,196,196,195,195,193,194,194,195,195,196,193,195,195,194,196,194,195,195,196,195,193,194,193,193,195,193,192,193,193,193,191,194,191,190,193,193,193,192,192,191,192,191,192,195,192,194,195,193,193,193,193,194,193,194,192,191,192,193,191,192,193,192,192,192,193,195,193,193,195,194,193,191,194,195,193,194,195,194,193,193,192,191,192,195,192,191,191,193,191,191,193,190,193,191,192,192,191,191,191,191,191,190,189,194,193,193,193,193,193,193,193,194,194,195,193,193,191,191,193,191,193,194,196,194,193,194,190,195,191,192,194,190,193,194,192,194,195,194,177,177,193,196,196,194,191,194,198,201,198,198,198,197,198,193,195,191,191,193,191,196,193,198,196,207,213,204,220,211,197,206,212,181,164,139,105,118,130,132,123,95,90,165,234,212,115,112,193,198,207,197,193,194,193,208,204,183,113,54,20,55,158,230,130,56,70,102,87,85,144,193,104,15,8,122,145,59,199,132,102,185,220,246,249,229,163,92,84,104,87,55,86,114,146,118,82,62,38,53,46,29,54,94,130,132,120,113,119,110,108,112,95,98,97,101,116,112,99,83,83,96,105,108,97,109,70,38,95,145,165,157,116,76,50,51,86,81,72,87,76,77,25,20,87,180,160,56,28,18,21,27,27,24,26,25,24,26,27,25,25,23,27,24,21,24,27,30,26,26,26,29,33,44,71,82,75,71,71,67,118,146,150,160,160,165,168,175,178,170,161,170,169,159,165,156,145,152,163,184,185,177,162,120,154,150,150,101,153,156,172,154,109,131,184,188,145,154,141,123,107,83,88,89,91,95,94,107,115,143,169,170,167,159,173,166,172,190,179,183,185,192,193,179,178,167,174,165,164,172,175,182,181,183,184,193,188,193,159,81,40,32,27,33,38,35,36,39,27,21,14,15,16,19,21,43,57,97,149,137,122,103,117,128,112,99,79,76,80,46,37,29,38,82,60,66,60,91,128,148,165,146,139,120,101,81,140,133,39,25,10,21,22,89,188,152,83,43,46,60,62,65,61,61,57,30,27,26,27,32,31,42,45,46,45,51,46,44,43,41,47,41,38,41,40,40,41,38,38,41,38,39,40,33,37,41,32,36,35,33,33,34,47,60,79,70,71,84,66,40,27,21,22,19,21,29,22,30,33,35,44,50,41,43,44,29,30,29,30,29,34,37,32,34,31,31,32,34,33,33,33,36,39,39,41,49,49,55,133,137,104,97,42,47,108,108,51,25,30,43,25,25,116,150,144,141,110,118,124,125,127,124,126,130,148,163,193,217,227,236,230,230,227,222,218,215,114,5,2,7,11,13,12,13,12,14,15,15,15,195,200,196,196,196,197,196,196,197,198,194,196,196,195,197,193,195,195,193,195,198,194,197,195,196,199,201,211,251,217,123,185,189,193,200,193,199,195,196,200,198,198,199,199,199,197,198,196,198,197,198,201,197,198,199,200,199,201,201,198,199,196,197,199,199,198,198,198,197,198,199,197,199,198,199,199,196,197,198,198,198,198,197,197,195,197,197,195,198,196,197,197,196,198,196,198,199,196,197,194,195,196,195,197,195,198,198,195,196,198,196,195,197,196,196,198,198,195,197,195,195,194,193,198,195,196,194,194,195,195,195,196,199,195,195,196,195,195,195,197,195,194,193,193,195,196,193,193,195,194,194,194,192,194,193,195,196,192,194,194,194,193,191,193,191,190,193,193,191,192,192,190,193,194,191,193,192,193,192,189,193,192,193,191,191,193,191,191,193,193,190,192,193,194,193,194,194,193,196,194,195,193,191,194,191,193,194,189,193,194,192,192,192,193,194,193,192,192,191,191,190,192,191,192,193,192,194,188,191,191,192,193,193,193,191,194,192,195,193,195,192,191,196,194,195,195,193,193,194,194,194,192,195,193,193,193,192,195,193,196,194,190,177,179,193,197,193,192,193,193,195,198,201,199,198,199,196,195,195,193,192,193,201,198,211,219,224,229,170,178,200,194,209,179,144,129,113,116,121,132,115,100,108,75,123,214,226,132,115,203,216,213,212,203,205,202,217,198,136,94,26,22,95,216,247,131,64,84,151,150,105,133,152,163,157,121,227,160,109,210,136,107,110,161,241,204,106,84,87,99,63,70,107,125,148,111,73,56,41,25,38,44,63,108,131,133,112,113,118,115,108,108,116,96,98,103,101,106,102,92,86,98,101,113,105,88,62,13,69,148,169,161,123,74,37,31,40,60,60,79,75,52,39,55,100,142,184,118,44,32,17,17,20,23,23,23,21,27,29,32,31,27,21,23,28,22,25,26,23,25,27,21,27,36,35,52,74,81,78,74,66,60,43,37,34,34,36,49,55,51,46,48,50,53,50,42,62,58,60,57,48,50,39,45,54,60,46,47,48,42,47,46,51,31,98,119,127,139,103,112,75,68,64,34,36,37,48,58,60,64,69,89,90,88,81,91,87,90,118,114,111,105,134,130,117,116,103,124,120,130,122,130,150,131,143,147,159,172,153,104,94,92,62,55,57,60,57,64,51,19,17,16,20,22,18,19,43,86,120,136,99,106,160,175,177,180,154,84,76,86,57,60,57,57,66,46,68,132,170,209,203,159,136,118,137,137,148,141,51,9,25,12,16,26,46,85,79,62,46,36,36,38,40,40,40,50,55,55,54,56,63,82,105,118,122,131,136,134,129,119,123,111,104,108,103,99,96,101,94,87,90,88,80,81,77,81,76,71,70,68,76,76,67,90,145,148,152,164,155,131,67,36,28,26,34,30,32,33,37,48,57,66,73,63,57,59,49,45,44,49,49,51,51,44,41,38,38,44,41,39,36,36,45,47,41,50,57,87,132,165,137,77,46,20,22,95,96,44,21,21,43,56,100,150,138,133,133,118,128,116,112,117,114,120,121,122,123,150,170,182,212,224,228,231,229,220,214,113,5,2,7,11,14,12,13,12,15,15,14,15,198,198,198,197,198,197,199,198,195,197,195,198,198,194,198,196,197,198,194,196,195,199,198,195,200,198,199,210,252,211,142,210,198,200,200,194,200,195,200,198,199,199,198,198,196,198,198,198,198,198,200,200,200,199,198,201,197,196,200,198,200,198,197,198,196,199,199,198,198,198,198,196,197,198,199,200,198,199,196,196,198,198,196,199,199,198,198,197,199,197,196,199,196,196,197,196,199,196,196,197,194,198,198,197,198,195,198,196,196,196,195,197,195,198,195,194,194,196,196,196,194,195,198,193,193,193,196,196,193,196,198,194,196,192,195,195,192,197,192,192,195,196,194,196,194,193,194,194,193,192,194,194,192,192,195,193,194,194,192,195,194,193,193,192,195,191,191,192,192,192,191,191,189,192,194,190,194,191,190,192,192,193,193,193,193,191,191,190,192,192,191,192,192,194,193,191,194,193,193,193,192,196,191,191,194,192,194,192,193,193,194,192,193,193,193,196,196,195,195,195,191,193,194,193,195,190,196,195,193,191,190,193,193,193,192,193,192,191,194,193,192,194,193,193,191,191,193,193,191,193,192,193,194,192,193,193,194,192,194,195,194,196,187,178,184,197,200,194,192,192,194,194,200,201,198,196,194,198,197,197,200,199,214,195,220,234,232,201,110,174,215,198,226,178,124,102,71,103,177,154,97,83,83,65,69,110,148,136,134,220,235,221,216,220,214,207,218,159,150,163,94,96,113,197,209,91,59,36,79,97,65,113,188,252,253,163,131,88,89,125,67,65,26,79,155,101,94,93,88,58,71,117,136,135,103,79,53,35,27,24,55,70,114,141,129,124,107,114,110,107,109,109,109,94,102,101,95,97,83,75,79,94,112,122,120,71,23,5,74,132,129,111,56,51,52,32,37,48,62,69,54,40,50,99,132,132,125,71,30,39,32,16,23,22,26,26,24,29,26,38,32,25,25,21,24,25,28,26,23,25,25,24,30,27,34,41,61,78,77,79,76,71,53,30,32,27,30,44,45,41,42,46,42,44,37,49,76,84,98,59,34,32,42,78,78,72,60,37,39,36,33,45,34,48,85,109,135,90,93,71,67,68,44,26,18,23,55,80,77,83,53,62,84,72,71,77,42,27,66,78,71,41,60,69,53,72,30,34,72,84,57,31,51,55,63,55,37,35,36,27,94,138,119,118,122,113,116,104,46,16,13,16,19,18,29,23,77,141,114,81,117,174,139,115,103,105,141,128,67,69,79,91,91,101,119,153,150,200,245,198,158,101,76,76,79,91,112,122,59,19,19,12,16,18,24,43,72,89,78,62,54,55,61,60,55,53,56,63,61,69,92,128,95,143,149,142,145,141,147,146,141,132,142,156,151,151,145,144,151,149,144,132,139,143,133,141,142,144,148,87,155,160,150,152,174,212,176,165,170,140,105,67,51,52,57,60,59,56,64,99,128,143,136,125,132,123,113,112,122,125,123,131,118,102,101,98,109,107,94,87,81,94,94,74,41,60,126,153,155,164,147,111,77,54,101,148,125,93,92,136,153,145,157,160,141,140,137,132,136,127,125,122,124,131,129,132,119,120,127,138,168,178,189,211,225,221,219,115,4,1,7,11,11,11,14,12,13,14,14,14,194,200,197,197,197,196,195,197,196,198,197,198,196,194,197,193,197,200,197,197,198,197,198,198,198,199,195,203,251,209,162,214,194,202,196,197,200,193,198,200,197,198,199,198,198,198,198,198,198,197,198,199,198,197,199,201,197,199,199,198,197,197,200,199,197,195,196,197,198,198,199,197,196,199,198,198,198,196,196,196,197,195,197,198,198,195,194,198,196,195,195,198,198,199,198,194,195,193,196,196,196,197,193,198,194,196,196,197,197,194,196,196,196,195,196,194,194,194,195,196,194,194,197,195,194,193,192,195,194,192,192,192,194,192,193,195,194,191,191,194,193,194,193,193,194,195,194,193,193,193,194,194,192,193,193,191,192,191,194,192,194,194,193,194,191,191,192,193,193,191,191,190,190,192,189,191,190,191,195,190,194,189,189,192,194,194,192,194,192,193,193,193,193,192,193,192,193,191,193,191,193,196,195,192,191,194,194,190,193,190,189,193,191,190,193,193,194,193,194,195,194,193,189,192,193,191,195,193,193,194,191,193,192,191,193,193,192,191,193,192,192,191,193,191,191,193,191,192,193,189,191,191,191,192,193,194,191,193,191,193,196,193,196,185,177,189,194,197,193,191,195,192,195,198,198,198,195,193,189,191,196,199,211,147,139,191,150,122,138,232,244,188,220,171,117,94,71,107,180,182,92,51,60,46,37,21,51,74,118,207,221,204,200,209,207,199,178,121,165,209,138,107,100,131,139,93,66,35,35,27,17,84,205,247,242,154,76,81,199,150,53,61,4,31,85,88,116,83,53,86,115,146,129,83,60,40,37,23,23,44,76,115,133,134,121,110,110,112,106,110,113,108,110,102,104,101,90,83,73,71,87,110,120,129,76,26,21,6,66,114,122,89,77,101,60,31,39,51,54,47,44,41,54,110,103,84,91,47,29,43,43,29,17,17,21,23,23,30,32,38,33,25,26,24,24,29,24,25,27,27,24,26,25,27,32,36,47,69,80,84,88,81,69,51,39,35,30,37,43,48,45,42,41,39,38,76,130,121,113,63,28,73,118,121,89,77,54,36,40,48,44,39,41,29,81,74,109,99,62,76,46,63,53,16,20,14,55,110,113,116,71,81,125,125,124,128,66,25,94,133,102,19,49,94,95,87,18,37,111,160,57,57,132,98,115,81,29,22,30,18,97,152,109,129,136,130,145,101,29,10,15,14,22,21,23,24,78,152,81,94,173,95,60,57,53,23,36,104,77,65,100,139,168,171,218,182,148,213,139,38,20,16,64,71,73,48,85,169,90,25,17,11,26,18,24,35,87,133,123,111,104,106,111,89,42,31,39,41,49,44,38,55,65,66,74,61,68,56,50,71,69,59,53,59,66,65,64,58,59,72,74,64,62,76,66,60,58,74,86,80,102,111,88,116,160,143,150,136,129,118,81,77,58,53,59,60,70,70,92,122,133,133,128,128,131,141,134,141,160,145,145,153,150,159,159,163,168,160,170,174,168,159,152,76,50,129,153,158,140,137,135,140,175,171,182,186,153,151,159,170,171,157,149,137,128,142,139,133,137,127,128,130,126,131,136,130,120,120,115,113,125,128,134,170,212,223,223,115,4,1,7,11,10,11,12,11,13,14,14,13,197,198,197,198,199,195,195,197,196,198,195,196,197,195,198,196,195,196,196,198,198,200,196,197,197,199,198,206,252,217,185,215,187,200,195,194,200,197,197,196,197,198,198,196,198,198,198,198,198,196,196,197,199,199,198,200,198,199,199,197,199,197,197,200,198,198,197,195,198,195,198,198,200,198,196,196,193,198,197,196,197,196,196,196,193,196,197,198,197,197,196,196,195,196,196,194,196,194,196,198,196,195,198,196,195,197,194,196,195,194,196,194,197,195,193,196,196,196,193,194,194,193,196,194,194,194,194,196,194,196,196,193,196,195,197,195,192,193,194,195,192,193,194,193,189,191,192,193,193,194,195,195,193,192,192,191,191,192,192,191,191,192,194,191,190,190,190,191,192,192,192,190,188,192,189,189,193,191,190,191,190,189,191,191,194,192,191,192,192,193,192,194,192,191,191,191,192,191,191,194,191,192,191,191,193,189,192,191,189,189,190,192,193,190,190,192,191,191,191,193,189,191,192,192,193,191,192,191,193,191,190,191,190,191,192,195,192,190,190,189,191,191,191,190,193,191,193,191,191,195,189,190,191,190,192,192,193,192,194,195,192,196,193,193,181,174,188,196,197,194,191,192,192,190,196,200,199,200,191,190,186,187,192,61,60,100,42,19,61,130,161,103,148,122,110,99,78,108,150,148,98,49,76,86,66,73,27,29,64,88,162,151,150,165,165,169,146,103,121,129,62,84,66,79,120,104,69,9,45,25,28,126,184,245,235,173,107,174,253,158,86,48,2,65,105,105,89,57,97,119,137,116,72,55,37,30,16,26,59,87,112,121,132,116,107,113,101,103,107,117,116,107,115,110,107,90,87,81,87,99,109,128,118,78,34,18,20,29,100,102,87,100,102,122,72,37,43,41,46,40,45,43,48,84,87,73,69,56,27,26,37,35,26,12,18,19,22,33,32,32,27,27,26,27,26,24,25,27,29,26,25,25,27,27,31,37,54,79,84,91,114,106,86,72,54,41,37,39,44,47,44,41,39,43,42,85,115,114,104,40,39,109,125,132,104,73,49,27,41,41,40,37,36,46,47,87,74,41,63,32,46,59,40,24,20,12,44,110,113,125,68,68,122,107,128,150,71,18,103,137,93,20,38,106,114,118,63,50,124,160,57,103,164,108,141,96,29,32,45,25,119,164,117,116,110,119,124,74,19,14,14,13,20,23,24,23,67,138,66,122,157,64,77,71,97,51,11,87,91,61,73,124,174,182,144,72,110,169,54,8,66,72,92,75,89,71,64,140,107,41,20,12,18,21,28,36,106,152,144,133,134,139,150,97,37,43,46,53,61,52,58,66,72,93,97,89,92,43,36,105,102,63,30,30,64,79,72,43,39,74,83,64,60,71,75,54,39,63,78,51,33,29,56,87,99,136,121,110,117,88,81,59,36,34,28,36,43,41,46,55,66,74,63,53,57,70,68,65,61,72,76,108,74,69,79,83,92,96,96,109,94,96,74,49,103,150,152,138,129,129,134,158,192,194,185,171,156,160,150,151,145,129,127,124,122,131,128,125,132,127,131,128,122,125,122,128,127,126,122,115,127,122,110,136,184,212,224,116,5,2,6,11,12,12,14,13,15,15,15,15,196,200,198,198,198,199,199,199,199,199,198,200,196,194,196,197,198,198,198,196,197,198,196,197,196,198,191,204,251,224,194,210,188,203,193,198,196,193,198,197,196,196,197,195,195,195,196,198,197,198,198,195,199,196,195,198,196,198,197,197,196,196,198,194,196,197,198,198,197,195,196,199,196,197,197,198,194,194,198,197,196,196,196,197,196,195,199,196,195,196,195,196,193,195,195,194,195,195,198,195,194,197,193,193,195,194,197,196,194,196,192,195,197,196,193,194,194,193,194,192,193,194,193,193,194,193,191,193,193,194,193,191,194,192,191,194,193,191,193,195,193,194,194,193,190,191,192,191,190,193,191,193,193,194,191,192,195,190,194,189,191,191,188,192,191,190,189,192,192,190,191,190,191,193,192,192,190,191,190,188,191,191,190,190,191,190,189,193,189,190,193,190,193,191,191,191,190,192,191,190,192,192,189,190,190,190,191,188,191,190,191,193,190,188,192,193,189,190,189,191,190,189,188,189,189,190,190,190,190,191,193,193,192,190,191,191,191,193,193,189,190,191,190,189,191,191,190,194,192,189,191,189,192,192,191,194,190,194,192,191,195,191,193,194,188,179,174,187,197,193,193,192,191,193,190,191,198,201,200,200,189,193,197,82,14,87,56,5,5,10,7,43,89,72,96,89,71,99,84,108,111,98,128,93,73,75,53,40,31,50,109,111,103,113,125,146,140,99,84,64,29,61,53,56,117,101,53,21,43,14,74,203,178,142,123,97,87,160,234,106,54,23,24,146,117,67,82,97,140,137,90,66,57,81,65,24,36,64,123,128,106,121,110,110,109,98,90,102,117,111,106,114,114,105,95,81,84,83,99,116,125,116,67,31,14,23,46,98,135,88,76,92,93,102,77,43,30,34,40,42,40,39,37,59,71,72,63,50,38,18,25,29,18,17,16,22,24,24,27,32,28,26,23,24,25,24,28,33,28,28,27,26,34,27,31,41,74,91,84,100,109,81,64,75,73,44,33,39,41,45,45,42,43,39,35,55,96,104,92,38,40,121,142,130,126,117,56,31,30,39,43,36,51,49,105,63,54,81,36,54,39,61,68,33,27,14,35,100,99,81,35,51,113,107,138,120,34,4,84,130,90,19,51,112,115,131,115,99,126,133,24,57,92,79,116,58,21,36,40,36,151,172,108,108,85,98,106,63,24,12,15,12,24,25,27,20,63,137,72,120,149,59,43,55,137,93,21,104,104,65,36,39,69,48,24,60,176,176,19,49,129,42,105,134,105,92,41,130,126,42,24,13,16,20,27,37,122,163,134,123,119,133,140,80,22,44,62,58,74,73,83,74,83,121,131,129,132,41,47,159,131,55,19,31,97,110,77,33,42,114,125,67,78,121,117,97,88,136,104,45,26,35,50,77,107,95,112,93,92,78,69,65,36,32,24,34,43,44,50,61,94,95,65,57,74,84,52,35,66,79,81,68,39,41,41,44,46,34,36,37,39,38,42,88,142,158,130,130,131,142,146,151,174,167,167,162,152,150,145,143,137,130,132,134,129,129,122,121,126,121,128,127,117,117,118,124,128,124,122,128,141,139,120,117,149,168,196,118,6,3,8,11,13,12,14,13,14,14,15,15,198,200,198,199,199,198,199,199,200,200,199,199,200,197,199,199,198,199,197,196,196,199,195,200,192,193,179,169,248,212,186,208,184,202,194,196,196,192,199,198,195,195,196,195,196,195,196,198,196,198,199,196,197,196,198,195,194,196,198,196,197,197,196,197,193,196,196,196,198,194,199,196,196,195,195,199,197,198,196,195,196,194,195,197,196,198,194,196,194,195,197,193,198,196,194,194,196,196,194,193,193,193,195,193,192,196,193,196,195,194,198,194,196,196,195,195,193,193,193,196,193,193,196,194,195,198,194,193,191,193,191,190,192,193,194,192,191,191,191,192,192,192,192,194,193,193,193,191,193,195,192,192,194,192,191,193,193,194,192,192,193,192,193,193,192,191,193,193,193,190,191,193,190,191,191,190,190,191,193,191,190,191,192,190,188,191,193,190,191,190,190,189,189,192,190,191,191,192,190,191,191,190,191,191,193,191,192,190,190,190,190,190,190,191,191,191,191,190,190,189,190,190,190,190,188,190,190,192,189,189,191,192,193,190,192,191,190,191,190,192,191,190,191,191,194,190,191,190,191,193,192,193,192,191,192,192,191,191,192,190,188,190,190,193,194,189,176,174,190,196,195,195,192,191,194,192,191,198,199,204,198,212,250,137,80,149,133,62,3,9,5,25,92,70,80,73,77,113,84,92,141,147,107,59,53,72,35,46,44,57,112,141,135,128,141,156,152,155,154,113,97,97,81,105,137,94,51,93,63,8,69,126,93,107,74,44,29,141,191,49,24,4,39,122,77,86,114,131,141,82,67,72,103,147,84,48,72,108,143,114,105,109,102,106,103,90,77,94,109,106,109,114,106,84,83,84,88,89,110,131,108,64,20,19,27,61,108,127,155,157,132,87,84,114,75,33,23,28,29,38,36,28,29,39,59,60,61,55,42,22,26,39,36,31,19,16,24,25,23,27,26,24,22,26,29,23,27,29,31,31,26,25,26,33,30,37,64,77,76,74,66,50,48,63,70,46,35,36,39,46,40,42,37,39,37,59,99,105,89,36,39,101,103,92,127,110,54,28,32,39,45,39,42,110,144,155,168,150,145,129,131,147,157,117,30,9,30,87,75,43,11,53,125,116,134,103,34,11,84,129,86,18,53,116,130,128,122,143,147,133,21,24,65,85,133,71,19,31,33,53,174,163,102,92,75,82,87,69,29,14,15,19,29,24,25,24,57,144,100,107,112,51,69,141,226,96,15,111,130,89,59,44,78,39,60,204,241,155,11,60,117,62,120,115,113,90,134,179,114,51,15,14,16,27,24,41,141,152,110,95,88,102,105,61,22,34,50,67,77,77,76,56,65,92,71,99,121,34,54,153,100,42,22,28,98,95,69,24,28,102,114,46,63,123,128,129,142,152,85,33,28,29,57,71,63,86,59,63,65,34,60,55,40,30,27,39,39,47,46,68,105,114,113,112,120,64,39,92,105,124,101,52,42,39,51,51,39,37,32,45,35,45,113,148,173,152,127,133,144,153,152,150,151,149,157,154,145,155,147,142,151,147,147,150,146,146,141,136,139,136,139,132,128,123,121,135,130,117,117,130,146,146,130,121,128,130,157,111,12,2,9,11,14,12,15,13,13,15,15,14,193,200,199,199,199,197,198,195,199,203,198,199,200,199,198,198,199,198,197,198,196,198,195,200,197,204,183,157,214,172,178,203,188,203,191,198,197,195,201,196,195,199,197,196,196,196,196,195,196,196,198,198,199,194,195,198,197,197,196,194,196,196,198,198,197,197,197,196,193,198,198,198,196,196,198,198,195,195,198,195,197,197,193,196,197,195,195,195,197,196,195,195,193,193,195,195,194,192,194,195,193,194,196,195,194,193,195,193,192,194,193,193,193,194,192,193,193,193,196,192,193,196,194,193,193,193,195,195,192,194,192,192,194,193,195,192,191,193,193,195,191,191,192,191,193,192,193,193,192,191,190,194,192,191,191,192,193,189,192,191,191,192,190,192,193,191,190,192,189,187,192,190,189,191,190,192,188,190,193,190,191,190,191,187,189,191,191,190,189,189,188,189,190,190,188,189,187,193,190,189,191,188,190,191,190,191,194,192,190,189,190,191,191,189,189,190,190,191,188,193,191,190,191,189,191,191,189,188,189,188,192,191,189,191,193,193,192,192,192,189,189,191,190,191,194,191,190,191,190,191,193,190,191,191,192,193,189,191,192,189,189,193,191,194,190,193,187,172,179,190,198,194,191,191,191,193,192,193,192,196,196,232,253,177,148,216,204,190,155,117,62,47,83,74,92,56,78,159,120,98,109,143,116,29,24,39,30,43,35,39,106,113,110,98,93,95,162,205,170,159,125,153,150,128,141,59,88,207,100,33,77,39,66,163,123,41,57,185,152,15,6,15,114,131,97,130,134,120,95,70,76,132,153,164,90,55,84,98,127,99,92,97,98,103,93,82,71,80,94,103,107,97,77,83,96,92,97,98,124,110,53,23,14,28,68,114,125,129,147,144,120,89,112,109,61,45,21,17,29,34,35,33,27,30,37,42,51,56,44,27,37,59,63,46,24,16,21,17,26,26,23,26,23,26,30,24,26,29,23,29,28,27,27,29,32,34,47,55,57,63,61,56,58,60,39,22,32,41,45,40,37,43,39,40,33,71,104,98,88,42,16,41,61,63,71,54,42,34,33,41,41,44,35,113,204,190,201,199,196,212,205,212,209,147,37,7,22,74,78,46,14,44,116,98,103,119,58,24,96,132,83,14,38,117,123,80,83,127,158,144,20,57,99,98,134,66,15,33,32,81,186,132,78,79,60,74,81,71,37,14,14,19,24,26,27,19,40,101,111,98,74,76,122,230,215,57,8,90,164,153,113,98,119,136,203,248,202,119,10,54,111,93,125,85,66,150,173,183,108,10,22,14,13,33,27,59,148,133,85,69,56,67,72,57,27,17,43,78,83,83,90,79,87,54,30,83,113,34,57,139,84,44,22,28,90,92,61,25,37,105,102,31,79,113,75,96,113,131,69,30,24,30,54,69,88,45,62,68,40,43,57,59,47,36,24,28,35,46,58,49,57,98,110,85,61,31,35,88,116,114,75,36,33,42,57,53,46,35,42,48,63,135,170,160,156,148,130,145,148,144,145,136,137,139,150,149,149,165,155,146,150,148,150,153,152,158,155,157,162,156,160,153,141,142,137,141,131,116,118,128,135,139,134,121,120,113,139,107,14,2,10,10,14,12,13,12,13,15,15,14,195,198,194,198,199,198,196,196,198,198,199,197,198,197,199,199,198,200,198,199,197,198,196,200,200,225,228,167,153,135,169,197,191,200,193,198,200,198,198,197,196,198,199,198,197,198,196,196,195,196,198,196,198,196,195,195,197,199,198,196,197,194,195,197,195,196,197,196,196,196,199,196,196,197,197,198,197,197,196,196,195,194,195,197,196,195,195,196,194,194,194,191,196,196,193,193,194,194,192,194,194,194,195,193,190,194,193,193,195,193,193,193,195,193,192,193,195,193,194,194,190,194,194,192,192,192,191,194,193,190,194,193,192,193,194,193,192,193,194,194,193,191,191,192,190,190,193,190,190,193,190,189,191,190,192,191,189,193,191,191,189,191,190,188,190,188,191,191,191,192,192,190,190,192,190,190,192,189,191,189,189,191,190,190,190,190,189,188,189,190,190,190,191,192,189,191,190,188,188,187,191,188,191,196,189,200,205,194,193,189,191,191,188,190,190,190,190,191,191,192,192,190,191,191,190,192,191,193,191,190,190,191,190,189,192,190,192,192,190,191,193,193,192,191,193,191,191,190,189,190,191,190,192,189,189,191,193,191,194,193,188,191,190,190,192,193,195,186,177,180,193,199,195,195,193,192,194,194,194,187,191,234,234,162,163,218,229,236,244,249,200,126,95,69,91,56,73,155,120,75,55,91,119,81,52,60,33,43,43,27,99,33,39,21,18,50,149,173,125,113,90,128,127,80,102,31,103,194,99,170,177,35,131,211,118,60,125,214,122,23,18,109,212,165,127,139,113,93,81,54,87,122,150,165,81,59,76,90,113,89,85,104,110,108,93,85,78,89,94,95,97,83,83,99,107,102,103,113,94,50,22,20,27,77,116,123,131,133,129,87,77,98,95,84,39,49,39,31,30,31,36,33,31,26,29,35,48,51,35,27,48,72,66,57,37,14,15,22,24,22,21,23,25,28,32,27,28,26,27,29,31,30,30,24,34,67,81,86,92,91,91,95,95,75,62,56,71,83,112,77,73,73,74,73,68,97,104,97,91,56,46,51,70,76,66,55,51,53,55,56,63,53,72,150,157,117,97,89,107,111,108,103,113,78,33,33,41,71,114,67,29,76,108,87,97,117,95,79,121,129,92,19,70,116,101,50,25,92,137,144,41,35,98,116,93,29,25,37,44,139,178,84,66,61,64,70,71,81,44,25,24,17,29,32,27,23,19,44,66,96,101,95,131,165,95,15,14,18,84,147,156,170,137,178,171,167,119,50,6,13,32,77,116,74,84,121,161,108,24,27,15,21,16,27,35,83,160,103,54,55,49,57,70,66,32,15,33,74,83,101,126,113,77,21,23,96,112,29,70,146,75,57,43,54,121,93,66,45,64,128,99,28,95,117,45,30,57,104,63,31,26,35,86,121,119,129,120,98,81,69,97,103,88,55,26,26,35,73,92,57,72,98,104,63,56,40,39,107,108,93,48,29,40,42,65,66,45,39,36,81,147,173,142,87,103,129,130,153,154,144,132,126,130,135,146,150,155,162,151,146,142,139,144,155,155,155,156,155,159,155,155,151,147,147,142,141,131,122,129,136,137,132,120,107,112,118,144,105,15,3,10,10,14,12,15,13,14,15,14,15,191,196,195,195,195,194,196,195,198,195,198,198,196,195,197,198,196,199,198,197,197,199,196,196,193,223,230,149,132,133,170,198,193,200,193,199,197,196,199,197,196,196,196,196,198,197,196,197,198,195,197,198,198,198,197,198,199,197,199,197,198,197,196,197,196,196,197,198,198,198,196,198,197,198,199,198,197,197,196,194,195,196,196,194,194,195,196,194,193,194,195,195,194,192,195,195,195,196,193,194,192,192,193,194,192,193,196,193,193,195,195,193,195,195,192,193,191,191,193,191,192,189,190,193,194,192,192,193,193,193,193,193,192,191,193,194,192,194,191,193,193,191,193,190,190,189,193,192,192,191,192,191,189,191,189,191,192,190,190,191,189,191,191,189,191,190,189,190,189,190,190,188,189,191,190,189,189,189,191,189,188,191,191,188,191,190,188,187,188,191,189,191,191,193,196,196,195,193,191,192,195,197,198,204,195,190,208,202,192,191,189,192,193,189,191,190,191,190,188,191,187,190,191,190,189,192,190,190,193,189,190,191,189,190,188,186,190,190,191,190,188,191,191,191,190,190,191,189,192,193,192,191,190,191,191,191,191,190,192,193,191,193,191,191,192,195,194,197,187,178,189,199,206,199,199,200,201,203,199,188,204,252,212,142,159,206,217,229,241,251,217,145,97,68,93,91,81,112,136,110,57,49,97,141,151,95,31,49,93,116,126,118,107,93,80,92,139,132,108,69,63,95,77,69,81,75,71,86,104,197,129,46,165,165,57,68,175,204,125,55,91,211,219,152,137,102,84,76,31,33,46,70,136,154,69,58,69,86,112,73,95,106,114,120,100,96,89,101,102,98,92,91,101,113,118,104,110,78,37,19,22,38,77,120,131,117,121,122,117,101,98,95,62,13,25,105,98,49,22,22,29,27,28,29,23,28,47,55,37,30,61,77,66,60,51,46,42,29,18,19,22,22,24,30,26,27,29,26,30,30,27,27,32,28,50,139,179,178,168,161,166,162,179,169,155,161,164,174,173,175,182,180,179,170,163,160,139,136,148,151,170,174,162,162,180,161,139,143,153,166,165,162,175,184,147,93,83,78,83,93,88,100,112,122,137,145,160,174,159,173,167,165,158,138,150,157,150,145,164,159,150,136,144,141,134,122,104,122,141,151,108,126,133,128,125,94,111,127,152,199,164,108,116,110,115,118,115,123,92,75,70,64,76,75,75,67,68,53,57,98,116,129,121,81,29,38,56,27,48,86,121,137,135,125,118,109,43,28,39,31,26,54,100,89,91,117,74,35,39,31,38,33,39,48,54,141,169,70,52,55,60,70,71,76,49,24,28,71,52,54,110,87,41,14,57,121,89,30,103,134,92,111,89,97,129,96,95,78,75,134,85,23,99,107,47,12,34,98,61,40,17,44,132,160,181,158,169,159,152,162,165,171,159,86,32,18,35,114,123,103,99,104,100,111,135,86,76,100,87,71,33,26,41,63,81,63,47,38,41,106,183,174,110,40,64,122,138,166,158,142,135,132,135,137,146,139,134,139,139,143,136,136,141,148,147,146,145,144,143,139,144,143,141,145,144,145,145,144,146,148,139,129,118,111,119,129,149,104,14,2,10,10,14,12,13,13,13,13,15,14,192,198,194,195,196,194,194,194,195,197,195,193,199,195,195,195,193,196,195,196,196,196,195,198,189,200,198,170,193,176,181,196,192,200,195,198,197,195,196,199,197,197,198,196,195,198,196,196,196,195,199,196,198,197,198,198,195,196,196,196,197,198,200,199,197,197,198,196,196,197,195,197,198,197,198,196,197,194,194,196,196,198,196,196,194,194,193,195,196,195,196,193,195,196,193,194,194,193,191,193,195,193,192,193,193,193,194,193,192,193,192,193,193,192,192,193,192,190,192,192,190,191,192,192,193,194,194,196,193,194,192,191,192,190,190,189,191,193,190,188,189,190,192,191,189,191,192,190,189,191,190,190,192,191,192,193,194,193,191,196,192,192,190,190,192,188,190,193,189,187,188,187,186,190,186,184,190,189,190,188,187,187,188,189,190,191,192,189,191,193,190,195,196,206,212,212,211,204,205,205,209,212,214,210,157,148,182,202,201,191,191,190,190,190,189,187,189,191,187,192,190,188,190,190,193,189,190,191,190,190,188,191,190,189,192,190,192,189,189,193,188,189,190,188,189,190,190,191,189,191,191,191,191,191,192,187,191,191,193,192,190,193,196,198,203,206,201,202,207,197,190,198,211,211,207,208,213,212,211,193,229,234,169,130,143,181,208,217,234,221,168,118,87,59,77,86,101,111,119,147,127,93,93,149,168,126,59,35,104,165,201,214,216,212,183,164,137,130,116,75,114,97,107,171,124,157,136,59,38,81,53,71,179,107,8,87,178,136,60,76,179,245,171,121,101,79,59,36,14,39,55,71,140,126,48,55,71,86,99,69,95,102,97,111,102,89,89,100,85,87,105,102,108,116,117,93,76,39,13,24,48,79,108,129,129,111,112,112,105,73,93,63,56,16,103,196,127,46,28,11,21,24,24,18,24,32,62,75,37,36,63,76,77,71,62,56,49,35,22,19,20,24,26,28,29,28,29,28,29,33,31,28,30,31,53,126,151,103,81,81,83,88,90,113,91,76,78,81,113,105,109,119,106,102,106,97,78,84,101,113,129,130,130,133,135,128,128,125,128,130,130,127,134,119,90,92,95,93,100,101,107,120,137,144,149,160,162,167,109,151,154,153,139,134,139,136,127,128,132,134,100,139,160,154,159,147,147,155,132,146,141,174,159,148,160,161,190,183,177,181,156,145,163,161,169,165,167,181,174,184,179,175,178,178,183,179,169,162,153,139,144,153,142,141,130,145,159,140,144,141,150,149,146,147,139,121,117,121,100,124,120,114,134,133,133,113,103,91,113,131,118,147,154,160,172,209,179,112,115,121,127,129,137,132,96,73,91,106,63,81,106,81,92,68,124,132,101,83,127,128,105,130,105,97,111,96,111,74,84,123,79,59,100,91,39,33,79,104,64,51,36,66,143,159,143,113,108,116,121,124,126,134,137,71,29,27,29,73,99,92,95,57,54,113,139,98,56,75,63,41,34,27,46,70,65,56,49,36,27,61,131,149,114,74,113,137,137,158,146,137,136,141,136,139,141,131,126,124,128,139,145,142,131,130,127,125,125,130,136,136,141,141,145,151,148,152,152,148,147,142,136,136,146,153,151,132,142,105,13,1,10,10,13,11,14,13,14,15,14,14,193,196,194,195,196,194,195,194,195,195,195,195,194,195,195,196,194,195,196,193,196,197,193,196,193,203,203,212,249,213,191,202,192,203,194,198,196,195,200,197,197,199,199,200,198,195,196,196,196,195,197,196,195,197,196,195,199,195,197,198,194,196,196,197,196,194,195,195,197,195,195,194,193,197,196,194,197,194,194,194,193,195,193,194,194,197,195,193,192,193,193,193,194,192,194,193,193,191,192,193,194,196,193,193,192,192,192,193,193,191,191,192,191,191,191,190,190,190,192,193,193,191,194,196,198,203,208,204,203,199,192,193,190,191,194,192,190,191,190,193,190,190,192,191,191,188,192,190,191,191,190,189,190,194,193,205,206,204,205,205,200,191,188,191,189,189,191,189,190,191,192,191,189,194,186,188,191,188,192,191,190,194,193,191,191,194,200,200,207,210,204,207,198,196,195,196,218,216,214,211,201,209,201,181,133,101,155,204,205,204,195,190,193,189,192,190,190,191,189,191,190,191,191,192,190,192,191,190,190,187,190,190,191,190,187,190,190,190,189,187,191,191,191,191,188,190,191,189,190,190,192,192,191,191,189,188,191,195,196,196,198,205,202,200,210,214,204,205,200,200,188,174,196,202,206,194,195,211,204,194,252,224,130,126,116,141,155,169,199,183,122,93,68,49,67,105,120,118,124,108,131,141,144,145,152,151,80,18,12,22,84,141,167,197,180,171,144,143,110,86,168,133,148,171,139,232,170,58,118,107,57,120,194,72,9,131,161,110,67,125,213,185,111,87,81,53,28,18,17,95,81,72,145,94,42,62,70,99,121,92,105,85,92,102,87,91,81,73,62,81,107,118,122,105,105,70,29,22,18,64,106,113,123,119,128,114,107,98,73,66,88,93,80,118,188,201,96,42,31,9,16,18,27,20,18,27,54,60,35,62,91,103,84,62,53,42,32,25,19,18,18,27,27,22,24,25,31,28,31,29,29,30,34,28,44,117,120,112,97,79,85,71,71,66,69,63,59,67,60,55,65,62,57,61,61,60,56,59,63,60,63,54,57,55,51,57,54,62,56,49,50,57,55,48,49,54,58,57,66,66,54,57,60,56,49,53,55,52,46,51,44,46,48,49,46,40,47,41,46,39,41,48,60,63,57,52,51,55,50,53,50,70,61,67,69,66,81,67,75,77,61,76,83,87,93,86,92,105,98,105,96,97,102,98,117,110,109,113,112,109,92,102,112,120,124,148,145,134,153,128,118,105,106,108,102,118,120,141,139,144,149,136,125,112,121,123,123,143,158,161,173,185,196,176,159,170,148,133,150,146,158,162,165,170,165,168,179,173,144,142,154,148,149,156,158,158,143,147,157,136,133,142,132,136,138,129,145,127,125,147,133,136,152,153,146,150,166,166,136,153,151,165,180,155,127,76,77,77,83,92,87,95,105,80,63,58,59,87,103,118,94,62,67,110,134,92,84,81,52,57,54,59,67,76,73,61,63,51,42,35,66,114,138,129,132,124,131,156,141,138,138,145,134,131,137,127,124,121,122,137,127,123,125,123,122,121,122,132,144,139,142,145,149,151,147,145,144,137,130,130,136,151,179,189,174,153,153,104,12,1,9,9,12,11,14,13,13,14,14,14,193,198,197,194,194,194,195,196,191,194,195,195,197,194,194,196,191,195,198,193,196,196,193,198,196,202,201,225,248,214,193,202,192,203,201,204,205,203,204,204,205,205,205,203,200,199,196,197,198,198,198,197,198,197,194,196,195,196,196,196,196,195,194,194,197,196,195,194,193,194,196,196,196,193,195,196,198,195,195,197,196,195,193,194,197,195,196,197,195,193,193,194,196,193,193,192,192,193,193,194,193,195,192,194,195,191,191,192,191,191,193,192,191,191,190,191,191,190,191,193,193,194,198,203,207,216,211,213,212,203,198,194,195,195,193,194,197,193,192,196,193,194,196,193,195,193,199,203,196,197,195,190,193,192,196,203,202,209,199,202,200,190,193,189,191,189,190,194,196,206,203,203,209,220,205,199,204,200,203,203,208,211,208,208,208,217,213,205,220,223,218,219,177,139,122,131,176,185,181,166,145,140,133,139,114,101,141,196,218,220,213,203,206,201,201,196,190,193,190,194,192,193,194,190,193,191,190,190,190,193,190,195,192,190,191,190,194,191,192,193,189,193,192,192,194,190,188,190,190,190,189,193,193,195,197,196,202,204,208,210,211,213,193,164,169,192,185,173,163,151,149,134,155,164,144,137,139,159,153,174,246,167,119,137,97,112,100,121,147,150,127,84,69,58,84,118,136,143,128,130,96,100,174,159,136,156,93,25,15,18,26,16,17,57,80,96,110,124,95,108,221,146,73,87,105,161,89,115,243,155,92,159,175,85,85,173,189,170,152,167,151,92,66,63,46,22,19,33,65,134,77,74,129,64,45,65,76,114,132,95,110,93,98,110,84,86,78,71,62,90,112,125,129,98,59,37,22,26,45,111,127,121,115,123,118,108,100,83,88,77,114,125,135,165,174,139,59,34,39,24,14,17,27,21,21,22,27,47,68,94,101,93,67,37,30,23,19,18,17,16,15,24,22,24,25,29,27,27,29,29,30,30,35,26,49,124,105,166,155,147,146,142,141,142,157,157,155,148,130,127,126,127,115,117,117,105,115,116,121,118,123,129,129,124,130,134,86,140,141,141,90,139,142,132,125,123,135,136,138,131,120,114,113,115,103,102,105,110,108,103,104,99,103,110,107,97,101,101,101,107,102,96,93,92,105,95,84,87,79,83,83,94,81,93,83,63,77,94,85,73,59,60,51,56,57,65,57,61,63,61,55,58,67,59,64,59,59,60,56,64,59,57,51,54,60,55,59,54,59,50,50,49,47,52,50,50,57,61,114,66,62,57,61,50,51,49,59,59,56,56,51,63,53,51,47,43,45,57,66,61,72,73,84,82,85,113,99,100,84,97,96,91,105,104,98,103,111,110,101,96,96,102,96,106,114,105,119,105,118,123,92,101,131,141,151,151,148,139,142,158,158,153,141,128,103,88,87,78,79,98,96,97,109,104,125,121,121,130,128,139,130,119,127,134,128,117,116,113,112,119,124,127,134,137,102,127,145,123,115,105,107,129,143,145,141,131,147,161,160,162,159,157,131,124,122,112,118,115,115,118,104,116,132,141,140,132,127,138,141,139,145,146,150,153,149,150,146,144,145,142,149,161,185,191,185,169,172,110,9,1,9,10,13,12,13,12,13,14,14,14,191,198,196,196,193,191,195,194,196,196,194,195,194,198,197,193,193,190,195,193,195,198,193,197,196,199,194,223,247,207,182,198,191,203,215,224,222,224,229,227,228,227,221,217,211,202,196,199,198,198,196,194,197,194,194,194,196,197,194,196,197,196,196,194,195,194,194,196,195,193,194,194,196,199,192,195,198,194,197,196,195,198,197,195,196,192,193,195,193,196,195,193,195,194,194,191,190,193,195,191,193,192,189,193,193,193,190,194,194,192,193,192,192,193,193,193,192,193,191,192,193,196,193,191,177,166,174,168,194,206,198,202,203,202,203,211,207,205,202,206,200,205,208,201,204,200,223,216,210,209,198,194,193,192,163,160,162,155,147,173,196,192,193,193,191,190,199,198,214,225,211,207,214,243,218,205,218,213,219,216,217,220,220,219,217,220,205,155,150,177,184,189,150,114,92,78,108,117,119,111,86,80,81,90,118,114,129,177,184,214,229,223,225,218,217,208,207,204,197,199,207,203,199,198,197,202,198,195,204,202,200,203,201,200,201,206,203,201,203,204,205,198,192,192,190,192,193,191,193,191,193,196,200,204,205,209,208,198,186,183,188,191,153,110,118,148,146,137,127,115,122,116,115,105,84,73,80,97,84,163,224,111,115,125,97,117,102,127,116,133,163,113,70,63,81,112,145,158,118,127,115,59,135,149,93,144,122,47,28,24,34,31,25,42,16,9,36,58,88,150,205,155,95,57,80,69,33,166,236,101,103,156,162,155,164,208,179,191,192,139,71,56,55,37,26,31,54,90,136,152,63,78,110,33,51,76,83,106,88,79,104,77,86,91,61,58,64,81,87,102,113,130,112,46,27,17,29,62,93,123,117,112,123,110,112,101,81,103,101,92,113,122,118,108,110,83,45,31,43,48,24,14,21,24,22,25,31,47,76,84,67,51,39,27,23,17,15,15,23,22,19,21,19,28,27,24,27,30,32,28,31,29,33,34,36,63,70,65,60,48,65,66,65,74,78,77,72,66,65,64,72,75,72,80,75,76,73,75,84,88,95,94,107,97,108,113,106,120,118,125,116,128,122,114,112,108,120,120,127,115,114,119,119,122,111,124,118,117,127,118,122,118,116,130,131,134,128,124,144,142,141,134,130,139,133,132,132,120,125,128,139,143,129,146,129,126,125,132,136,117,114,109,106,108,114,113,116,117,118,121,106,118,118,123,137,134,134,125,132,141,135,127,121,123,120,116,114,101,95,95,107,113,115,111,107,109,103,106,105,112,112,108,113,106,103,100,102,96,93,83,80,76,79,84,72,74,68,74,73,57,59,61,63,63,62,60,66,62,70,69,65,69,65,71,61,69,63,57,60,55,65,59,52,60,58,56,57,56,63,61,48,57,46,54,59,63,62,57,62,61,61,55,60,57,58,59,61,56,60,64,62,69,71,74,83,85,89,91,83,88,80,79,86,81,80,81,95,101,104,119,110,108,115,120,118,123,137,132,129,122,118,106,104,106,108,103,115,120,116,132,130,122,99,104,106,101,107,107,103,102,97,112,135,141,141,137,135,129,136,135,137,141,138,142,146,158,165,160,163,158,152,157,173,181,179,163,167,110,10,2,9,10,14,12,13,12,14,15,14,14,196,198,194,195,198,196,198,197,196,198,198,193,198,197,196,198,196,195,196,196,198,197,192,196,196,203,192,227,248,195,175,194,185,191,217,229,231,234,234,237,243,241,239,233,217,201,198,199,196,193,195,196,197,196,195,198,195,197,198,197,196,196,196,195,197,194,194,195,195,198,195,193,194,194,193,195,196,196,196,194,198,195,194,196,196,194,191,193,194,193,192,193,196,191,195,195,194,195,192,194,195,193,192,194,193,191,196,194,192,191,193,194,191,192,190,191,191,191,193,193,193,192,183,139,99,106,98,110,171,206,208,214,216,213,218,216,215,211,220,222,207,217,215,209,201,195,211,210,204,210,207,198,208,188,148,134,114,125,111,158,200,196,205,196,200,200,203,185,177,181,153,157,188,204,171,175,196,190,179,167,160,167,173,164,151,168,142,81,83,96,112,130,116,122,117,96,87,84,96,98,97,89,87,111,135,116,106,108,110,168,188,182,188,193,206,207,211,216,205,196,201,204,203,199,193,204,205,199,211,202,189,201,208,203,198,201,206,197,191,195,199,205,199,201,202,202,204,205,201,200,200,198,193,186,192,193,179,155,133,130,136,142,125,106,109,131,131,126,121,116,119,109,100,91,76,55,59,71,78,195,193,85,113,99,100,112,83,166,152,126,180,124,79,57,75,106,120,133,95,101,92,78,104,105,89,107,129,112,97,76,95,74,48,22,11,15,61,81,95,108,142,199,113,105,173,75,36,158,166,55,116,132,127,181,200,190,156,181,189,149,82,61,51,42,59,72,81,107,136,116,69,96,80,25,56,83,86,101,77,59,89,51,36,71,46,41,49,70,83,97,90,84,49,19,21,39,75,110,127,127,116,114,105,106,97,93,100,106,113,91,118,117,99,82,57,63,53,31,27,45,31,14,17,22,25,25,32,34,50,51,32,33,33,21,22,19,21,35,44,39,22,19,23,23,25,29,28,27,30,33,29,32,35,28,30,28,32,37,39,52,59,55,50,37,22,21,20,23,33,43,44,43,45,44,43,38,45,41,45,43,44,47,42,43,40,42,65,41,38,35,70,37,39,36,39,35,34,38,40,44,42,44,44,41,42,43,40,42,47,47,45,41,45,46,44,48,47,40,46,52,53,51,44,47,48,46,44,50,50,45,48,51,47,51,51,51,53,49,55,50,54,54,54,58,59,62,60,65,57,59,73,68,69,79,92,91,98,85,97,98,97,100,103,101,87,100,99,88,94,98,104,110,114,116,114,119,115,116,110,121,119,122,130,123,137,125,84,139,136,123,127,132,128,137,125,130,135,122,113,102,107,98,100,106,96,99,108,120,116,132,130,125,133,125,128,125,124,131,127,122,88,121,124,117,117,119,84,120,115,108,78,89,95,98,107,110,97,113,122,104,95,101,107,110,94,91,98,96,101,103,97,98,94,86,86,79,86,76,55,68,65,66,70,71,74,79,88,83,90,88,78,82,80,83,107,67,119,66,69,65,61,53,52,55,50,49,53,47,46,56,59,59,73,106,124,121,120,103,91,91,83,92,98,109,132,143,133,133,136,130,127,126,129,130,127,139,147,153,160,152,149,142,158,165,163,139,138,107,13,2,11,10,14,11,14,12,14,15,15,15,196,200,195,198,199,197,200,197,197,200,199,198,196,198,198,195,199,197,197,196,197,201,195,199,199,202,195,234,248,206,177,188,151,105,122,128,125,133,132,141,145,149,166,197,208,198,197,200,197,195,198,198,195,195,197,198,196,194,197,196,194,193,196,196,196,197,195,197,194,196,195,193,198,195,193,196,196,195,195,195,194,196,194,194,196,195,195,196,196,194,194,194,196,191,193,193,191,196,195,193,197,198,196,195,195,194,195,196,192,191,193,194,194,195,193,196,194,196,199,204,198,208,189,129,112,113,108,105,163,205,201,200,205,185,162,177,179,178,199,207,162,166,211,213,179,121,131,141,156,205,217,211,223,200,144,127,124,139,130,170,209,210,216,210,210,213,206,133,91,86,69,117,167,158,119,114,133,122,105,83,80,81,92,87,77,96,92,78,69,68,71,83,88,117,122,96,89,89,107,114,122,126,122,122,121,92,75,81,62,81,85,85,100,108,132,148,175,202,182,145,156,171,179,169,169,191,181,155,176,166,141,162,179,174,154,172,178,154,143,149,176,203,205,212,220,217,210,205,211,207,201,179,156,152,150,149,134,122,106,101,110,106,91,93,109,119,124,128,123,114,120,113,112,111,127,91,76,72,105,232,157,70,98,88,101,110,112,197,165,110,175,147,77,56,63,74,47,65,108,118,104,75,101,101,79,86,105,177,171,130,122,114,78,100,97,92,208,147,111,132,117,126,81,171,189,87,85,171,140,65,141,83,89,186,153,97,104,184,208,189,91,57,51,63,90,74,63,67,88,74,55,81,59,29,60,79,89,99,79,59,72,48,42,63,46,42,57,64,70,80,60,39,22,29,52,76,119,132,132,135,120,101,102,96,89,107,101,111,107,99,99,63,68,59,56,65,62,42,15,20,22,19,15,15,23,22,29,31,31,26,27,29,24,25,24,25,24,40,44,30,24,23,16,24,27,29,30,29,32,29,31,31,31,32,33,29,31,40,63,72,75,56,32,25,15,22,24,33,48,49,46,52,48,44,47,43,46,44,41,47,42,41,49,43,39,41,38,40,37,42,39,38,39,38,41,34,39,33,36,40,35,47,39,44,42,37,44,39,46,42,40,48,46,46,46,48,53,46,45,46,46,48,45,44,44,42,46,37,38,36,41,44,34,42,37,37,36,40,41,34,40,36,37,40,39,41,41,43,40,44,41,41,37,42,43,40,42,39,38,39,39,39,41,39,43,41,37,45,41,44,43,37,45,44,41,44,40,48,42,42,46,39,44,41,41,48,44,42,43,41,47,45,47,49,49,55,50,51,49,47,46,47,51,53,50,49,59,60,67,71,72,82,81,81,83,95,92,100,99,90,96,95,104,103,101,99,105,112,113,113,103,116,111,112,119,119,129,130,128,126,128,123,134,131,117,124,116,120,134,137,126,128,122,115,120,114,111,105,99,107,109,110,122,119,122,125,123,122,117,118,117,117,120,121,107,104,118,113,118,113,101,84,84,72,74,73,68,70,55,63,63,73,105,149,160,154,139,112,98,94,84,76,87,110,141,149,150,142,144,135,128,130,120,109,98,104,111,118,138,142,136,126,136,147,147,120,122,100,16,2,11,11,14,11,14,13,15,15,15,14,190,194,193,198,200,198,200,200,203,203,205,204,203,202,196,198,197,196,199,194,196,196,196,198,196,202,198,243,250,207,180,194,117,18,5,11,9,14,15,19,16,17,57,150,199,200,201,202,200,197,202,198,195,198,199,201,200,198,198,196,198,196,195,197,199,198,197,195,196,196,195,198,198,196,198,199,194,195,199,198,197,196,195,196,196,195,196,196,197,197,196,196,195,194,194,195,195,197,195,196,200,196,200,204,200,200,199,198,197,195,196,194,195,193,198,203,203,213,215,215,210,202,196,156,139,159,141,133,162,171,146,150,158,105,95,115,120,114,136,141,79,108,189,211,130,62,66,67,106,174,210,194,192,161,121,112,100,123,130,175,193,193,203,194,200,203,191,103,49,57,101,159,165,131,102,87,100,92,76,69,69,69,77,85,87,104,101,91,98,92,94,100,98,115,113,91,93,109,116,120,130,125,110,87,84,71,66,77,67,71,54,64,80,75,97,120,142,165,155,123,132,149,152,152,149,161,147,119,146,145,129,145,146,141,141,154,153,138,142,145,159,181,190,188,186,184,164,167,184,184,171,156,150,130,124,115,118,124,102,103,111,143,123,109,101,90,104,102,107,104,141,163,161,164,182,159,104,88,166,241,118,81,102,94,145,173,174,229,188,94,146,158,107,55,61,77,35,53,134,211,169,83,89,113,97,89,82,149,196,187,159,149,125,174,178,148,213,151,164,158,54,28,56,168,137,68,121,178,134,114,166,92,118,163,95,29,26,133,167,156,89,44,41,38,61,59,51,55,67,57,52,78,51,39,66,80,96,83,57,46,71,54,47,73,46,52,63,65,69,48,33,23,32,70,100,124,135,132,135,118,108,101,87,87,98,110,101,93,99,75,41,18,33,51,43,59,66,45,21,28,36,38,28,12,18,21,22,26,30,25,19,26,26,29,28,29,29,37,32,21,24,29,24,19,29,24,31,33,24,31,30,30,33,30,36,33,45,69,85,79,56,35,22,19,18,21,42,54,55,61,56,50,54,53,53,50,47,46,44,40,44,47,42,39,41,43,40,49,44,45,46,36,39,33,31,33,34,29,31,39,37,41,41,39,43,43,49,51,46,51,52,49,52,54,53,55,53,55,55,52,51,54,54,55,50,49,49,40,38,44,43,42,41,42,41,37,39,39,36,35,39,39,35,41,41,43,38,38,43,43,40,45,45,38,43,39,41,46,44,43,47,44,45,46,45,44,44,44,43,45,43,46,45,39,42,41,39,44,38,39,45,41,44,37,41,39,34,38,35,36,35,33,36,40,38,41,36,34,41,37,41,39,37,41,43,41,37,41,41,41,41,43,40,35,44,37,36,40,39,43,41,39,44,42,43,36,41,43,43,41,68,73,41,42,46,45,81,44,48,48,48,48,48,48,51,49,53,54,85,60,58,57,59,79,60,57,59,59,61,67,68,73,80,86,85,90,84,78,90,99,99,96,106,110,96,106,84,110,111,126,110,101,104,103,115,101,103,102,97,101,108,113,125,152,157,153,147,126,115,109,106,114,125,149,156,160,159,153,146,142,141,125,112,107,98,96,90,97,117,124,131,120,121,123,129,115,113,100,17,3,12,10,16,13,14,13,14,15,15,15,184,190,186,199,201,201,205,202,203,205,208,210,208,205,198,194,193,193,194,193,197,195,196,197,198,198,201,249,250,205,181,199,120,15,4,10,17,27,25,29,40,32,50,158,214,214,222,218,210,213,212,206,203,208,217,215,214,209,204,204,203,202,201,204,202,202,203,202,200,200,201,203,205,204,208,210,205,207,213,206,206,205,200,202,201,200,203,198,203,207,203,202,201,200,204,201,199,203,199,211,214,214,221,214,211,214,209,205,200,193,189,183,184,188,199,206,206,212,209,212,164,121,137,118,123,131,105,106,119,103,86,87,90,71,69,92,100,77,86,107,69,86,167,166,91,67,56,45,74,118,150,123,129,120,99,93,85,93,96,142,155,145,151,141,144,150,155,101,75,95,158,207,165,113,101,117,116,107,108,104,109,107,119,121,123,131,122,123,120,128,130,133,139,135,127,123,132,140,132,112,103,100,86,97,106,100,110,116,105,102,93,116,133,131,138,151,149,138,149,158,157,168,171,161,162,160,155,149,160,162,163,168,155,160,168,184,178,165,170,169,162,163,131,103,98,91,81,98,138,151,165,162,163,151,129,112,119,126,112,120,164,205,185,151,119,88,75,73,75,88,159,198,198,196,225,184,119,117,223,227,88,120,108,110,182,224,242,244,191,84,119,163,131,69,64,125,108,101,163,212,162,87,105,105,109,86,32,100,188,229,221,230,177,212,217,106,140,120,103,127,114,46,72,165,98,81,141,137,130,162,184,196,175,124,82,22,4,53,51,69,49,30,32,17,39,45,44,35,42,38,61,94,55,56,81,92,94,75,63,55,74,61,55,75,54,47,58,55,45,27,23,34,74,118,121,122,123,128,116,99,95,92,96,95,104,117,93,83,72,37,19,15,42,46,38,57,65,47,22,39,53,54,49,21,17,18,22,26,26,28,23,25,24,28,26,27,29,37,38,34,38,37,43,23,19,29,28,31,28,34,28,29,25,33,39,54,76,86,82,48,25,18,18,18,22,41,56,55,51,59,57,55,57,51,52,56,54,51,48,47,46,49,50,44,44,46,48,48,47,45,25,24,21,15,23,21,18,16,25,33,33,35,36,42,46,48,52,56,51,46,54,55,55,53,53,55,55,58,53,54,54,55,54,54,55,53,53,45,46,42,43,44,39,46,36,32,26,22,28,26,26,26,26,35,36,37,38,40,40,39,43,43,37,43,46,50,51,53,55,53,57,50,55,57,53,56,52,52,49,54,57,53,56,50,50,45,44,45,40,43,38,39,41,39,43,37,37,35,36,39,39,37,38,36,36,34,32,34,32,36,42,41,42,41,42,46,40,43,41,42,42,41,39,39,38,37,42,41,40,36,43,43,38,41,44,41,38,41,41,40,42,40,39,39,42,42,41,35,39,42,37,41,45,42,44,49,52,54,44,48,49,48,53,49,50,44,43,45,46,49,48,46,53,52,57,56,54,55,59,61,57,51,38,38,39,35,37,39,38,49,56,57,87,102,113,98,83,98,103,112,108,108,99,105,110,116,127,119,120,125,132,141,149,153,154,154,160,155,145,143,143,136,122,119,132,118,105,95,93,108,112,111,106,105,110,97,114,99,18,4,12,11,15,13,15,13,15,15,15,15,191,188,181,188,187,188,190,183,187,177,180,192,196,200,195,188,189,189,188,187,192,191,190,192,192,193,195,253,249,188,168,213,128,22,33,30,27,31,33,36,46,46,88,195,227,220,222,225,219,207,214,208,203,205,210,212,212,214,214,212,210,217,222,217,211,213,214,215,208,211,215,216,214,214,222,220,208,216,218,217,221,220,215,215,217,214,213,212,221,220,218,222,214,219,218,217,215,215,203,211,214,192,208,214,215,220,214,213,196,184,184,187,194,187,191,179,137,132,144,147,97,66,83,78,90,95,80,88,92,88,76,72,77,82,95,99,100,101,108,105,89,97,119,96,67,75,79,72,88,96,107,117,132,107,97,118,147,152,134,151,145,133,114,100,101,109,126,115,125,148,188,200,173,134,136,134,128,122,138,141,126,117,118,124,130,122,120,131,133,130,133,133,131,144,148,152,153,148,125,98,94,99,97,119,159,149,152,149,120,128,130,151,164,135,128,143,131,115,147,171,168,171,172,170,170,177,178,168,181,184,188,199,178,169,181,194,178,150,141,151,162,132,94,65,69,81,83,137,168,182,189,179,184,162,148,131,127,134,123,138,163,207,219,220,206,189,175,166,160,157,187,194,181,202,220,184,95,121,226,171,98,155,115,122,184,235,236,197,177,108,115,146,149,94,69,118,161,180,130,120,129,111,106,85,81,24,4,58,90,160,200,196,151,166,190,125,132,102,69,178,160,78,154,178,78,108,142,108,174,172,163,224,178,108,65,12,12,35,32,63,44,28,29,17,31,27,32,27,31,40,74,87,57,66,65,71,66,55,64,59,50,38,39,55,46,39,42,35,28,31,54,66,98,116,105,97,101,105,103,88,85,89,95,101,110,117,105,75,35,16,24,41,59,70,51,54,60,35,26,49,63,61,57,32,13,17,18,21,21,24,24,20,24,28,24,21,32,37,39,51,48,42,47,34,24,24,24,29,34,31,29,30,35,44,58,74,77,70,44,24,21,17,16,15,35,54,54,57,57,54,57,56,55,53,53,60,50,51,56,53,54,57,51,48,48,50,48,50,48,35,21,16,21,17,16,15,18,19,17,25,36,39,38,45,49,51,54,55,51,50,51,55,56,56,54,58,57,54,54,56,57,54,53,55,54,55,53,47,44,40,44,44,40,38,23,21,17,19,17,17,19,18,23,22,32,33,39,36,39,43,40,37,30,46,56,57,59,59,61,60,61,62,56,55,63,55,55,56,51,57,55,54,60,49,45,49,44,41,35,42,44,39,41,44,39,39,38,34,46,38,37,34,20,23,21,23,25,21,25,27,35,41,42,44,45,44,49,47,46,49,47,42,39,39,40,39,42,41,38,41,39,42,50,42,38,42,44,45,43,41,43,47,37,44,46,39,42,36,40,35,35,39,36,41,49,55,55,53,48,54,54,53,58,54,53,55,51,55,54,53,58,54,55,61,60,63,59,64,69,61,53,36,23,23,27,28,37,36,27,47,51,54,71,82,87,74,77,100,115,106,98,85,64,67,68,88,98,83,113,127,131,131,126,130,133,150,154,144,136,139,139,129,132,149,152,138,123,117,104,109,117,109,105,98,108,101,111,97,19,4,12,11,16,13,15,16,14,15,15,15,194,180,158,155,145,141,151,151,141,129,128,143,162,184,192,191,192,190,187,188,192,192,195,196,194,196,177,234,242,143,165,215,131,28,33,29,25,30,28,41,42,51,82,176,178,136,150,166,157,153,176,177,152,120,134,146,165,188,194,210,200,202,210,210,200,199,213,216,199,199,215,216,194,171,187,207,185,179,193,187,202,214,209,208,216,220,214,196,192,205,207,213,214,208,210,212,209,208,166,158,139,113,141,152,146,154,176,210,216,219,244,251,245,202,168,108,67,77,84,99,87,82,98,101,118,129,121,129,117,108,117,100,103,107,113,117,110,119,123,125,110,91,94,78,85,118,120,141,139,132,143,151,146,93,94,146,207,213,176,177,162,143,126,108,118,127,151,150,151,195,201,187,177,154,135,126,112,110,136,134,113,95,98,113,116,114,104,109,111,117,110,100,109,121,131,146,140,125,113,100,105,105,97,131,145,127,136,135,120,126,132,152,136,108,102,100,93,93,147,180,159,150,134,138,157,151,144,128,145,170,173,169,145,127,141,150,119,108,107,120,144,127,101,95,127,160,174,196,195,184,179,162,161,146,132,129,136,135,132,133,140,170,209,232,249,249,251,251,249,243,230,200,168,142,146,118,79,120,182,126,94,144,116,118,143,193,236,175,162,120,108,120,149,129,72,67,149,197,116,102,122,131,147,113,84,42,14,55,19,62,96,113,120,114,204,162,181,158,127,205,124,99,198,151,65,128,110,104,191,124,78,162,131,57,36,6,39,122,121,91,62,61,74,63,59,49,41,48,57,61,77,69,46,44,42,48,47,34,46,47,37,32,31,33,26,31,28,22,37,65,91,94,118,108,81,88,85,105,94,77,75,80,106,111,120,122,73,32,21,22,52,68,97,112,74,59,55,31,37,66,69,62,66,51,43,39,24,17,19,23,21,27,22,24,28,21,26,29,30,44,55,50,46,40,27,21,25,33,28,33,31,29,44,63,77,74,54,32,24,19,17,16,17,40,53,54,57,60,61,57,55,55,57,53,54,53,57,55,57,56,50,54,55,55,51,50,46,47,44,29,22,18,17,21,17,20,20,17,19,23,39,42,43,44,50,55,52,56,50,48,57,55,53,54,57,59,53,54,53,59,58,52,53,51,56,53,51,51,50,46,44,46,44,32,18,20,18,21,18,17,19,16,18,25,29,32,34,33,44,46,39,30,39,58,54,57,61,63,61,60,61,60,60,57,60,61,59,54,56,49,52,53,51,52,41,50,45,37,42,41,43,44,38,42,40,38,44,41,45,45,26,23,19,16,21,17,22,18,20,26,31,36,39,46,41,46,50,52,51,53,50,46,41,42,43,41,44,41,43,46,42,43,44,39,43,46,44,43,45,43,41,45,41,42,46,38,30,27,21,24,29,24,27,34,35,42,48,49,52,50,53,56,56,55,55,58,56,59,56,55,59,60,63,63,62,61,61,66,63,50,38,26,29,28,27,35,51,56,65,79,55,53,51,39,57,60,59,83,87,68,53,47,36,62,74,68,77,77,107,119,121,118,106,112,125,132,136,124,121,116,102,107,117,129,142,120,118,129,122,134,134,126,115,124,133,111,130,103,16,4,11,12,15,13,15,15,15,16,16,15,183,160,138,123,108,112,127,143,145,127,118,116,139,181,203,209,206,201,203,204,211,210,212,215,214,214,196,238,224,133,155,205,109,27,46,22,29,33,29,36,42,51,72,130,95,50,60,81,78,91,144,128,89,63,70,89,114,139,156,168,136,116,125,137,129,147,158,162,139,130,149,148,116,87,123,154,113,112,121,114,132,141,134,131,156,178,157,104,111,129,141,158,128,126,134,141,153,160,105,83,87,64,90,78,74,98,133,208,246,249,250,250,250,207,135,96,93,108,107,119,116,122,143,145,159,157,143,150,135,130,139,120,119,128,115,113,112,122,128,128,128,104,115,127,148,182,183,182,163,143,149,160,139,88,90,137,184,178,162,170,151,149,141,131,132,129,145,153,187,192,170,157,143,130,109,102,93,79,107,114,107,99,92,97,101,95,95,103,98,103,99,86,94,112,115,117,120,128,115,104,120,114,99,107,110,99,109,114,96,94,108,131,125,101,97,94,88,97,150,170,143,119,99,100,111,108,103,80,112,141,128,118,95,92,102,105,101,110,125,137,147,136,142,133,141,160,151,165,137,118,118,111,113,107,105,108,112,117,115,107,109,117,128,141,144,146,154,167,178,195,211,216,212,200,202,166,92,151,234,152,92,106,101,127,186,228,229,175,155,141,144,109,133,152,97,31,132,219,128,97,122,147,186,246,241,214,180,159,99,112,141,159,182,137,148,130,192,170,150,203,80,125,207,104,86,151,136,139,153,80,22,63,73,44,29,9,62,142,152,122,90,124,138,125,107,83,77,73,76,83,90,67,41,34,30,42,44,39,41,39,53,53,43,33,29,25,21,39,74,109,111,109,129,112,88,89,97,100,88,91,88,97,114,113,111,74,29,20,19,45,85,97,112,121,114,111,73,23,30,64,80,93,89,69,63,55,33,19,14,22,17,19,27,21,27,24,25,23,25,40,45,53,52,42,36,27,24,26,29,33,30,47,66,77,69,48,28,24,18,17,15,22,53,55,56,57,56,57,55,57,54,59,56,54,60,57,52,56,59,57,57,51,50,50,48,50,49,44,40,24,20,17,18,18,18,21,21,22,17,31,40,44,45,45,49,51,54,50,52,55,51,49,49,49,55,51,52,55,58,57,53,56,55,59,54,53,55,53,56,56,51,50,42,25,21,21,16,21,21,19,19,22,19,22,31,36,44,49,51,45,25,28,48,61,60,61,63,59,57,56,59,59,60,61,57,58,55,51,55,50,48,52,46,48,48,44,41,38,44,42,40,42,42,46,43,48,46,46,48,39,22,19,17,19,16,21,22,17,21,16,37,37,40,44,42,46,49,53,54,50,46,51,48,45,42,43,48,42,46,43,44,44,41,45,42,48,44,45,47,45,47,45,45,42,33,26,20,16,20,22,18,22,25,24,27,36,42,41,44,49,52,55,51,51,50,53,57,55,55,55,56,61,63,60,60,62,63,61,56,42,28,29,29,29,27,37,58,99,142,118,69,67,84,75,74,70,63,69,60,38,38,29,33,84,96,103,111,104,128,129,126,118,112,116,123,118,106,108,106,105,97,92,87,108,106,83,83,109,130,128,125,123,124,134,137,121,141,102,15,4,11,12,15,14,16,15,15,16,16,15,166,149,125,112,114,115,132,162,173,171,159,155,164,193,211,210,208,205,203,202,205,206,206,207,214,219,229,247,237,139,119,138,57,25,39,24,29,32,31,35,38,56,75,122,100,64,68,78,85,104,142,132,113,99,122,106,111,126,120,120,76,57,53,62,71,78,88,100,83,62,67,80,78,61,71,94,77,76,80,69,71,78,72,65,89,101,88,66,63,74,89,86,65,59,59,84,103,109,93,83,77,81,95,92,95,99,103,151,178,178,173,174,181,144,113,108,117,115,119,132,122,124,131,134,140,122,114,131,124,114,123,116,117,122,117,114,111,121,120,122,126,124,147,166,186,223,237,218,190,160,145,149,138,106,112,129,136,130,118,127,119,125,128,123,109,92,109,157,195,177,141,124,119,106,81,77,75,67,93,116,107,85,78,75,79,84,100,105,89,95,86,79,108,130,110,102,126,132,118,103,125,124,103,113,101,96,105,114,112,84,83,107,105,103,101,102,104,103,128,124,100,100,82,81,103,107,107,96,118,131,112,98,92,89,95,110,113,132,149,152,156,150,158,151,107,91,98,95,87,88,82,80,89,91,96,89,96,102,99,97,88,84,69,57,55,53,54,61,69,105,154,200,230,233,245,217,116,134,203,152,121,122,100,161,241,237,193,160,160,158,174,111,119,173,157,73,153,238,123,98,101,99,186,247,250,250,251,251,187,232,204,177,212,107,100,121,148,123,172,160,61,172,163,71,129,201,252,184,95,59,16,28,43,49,38,33,55,59,62,65,56,85,102,88,88,76,72,77,73,69,72,63,52,34,27,42,41,43,40,36,57,69,53,29,20,28,34,69,106,113,114,101,128,112,95,109,88,106,107,105,113,111,125,108,61,28,12,26,49,82,110,102,110,112,118,121,72,30,51,94,111,102,80,61,46,30,26,24,16,15,18,25,24,24,29,27,30,25,27,29,35,51,56,49,48,34,22,27,29,34,50,74,79,57,38,24,20,16,15,19,31,50,59,57,54,57,57,54,54,61,57,59,56,56,56,53,61,57,55,51,58,55,52,54,46,50,47,45,40,24,18,19,17,27,21,19,24,18,19,30,42,40,46,49,52,54,53,56,54,47,51,45,41,49,49,51,54,55,53,57,55,57,59,51,54,58,57,57,57,55,57,51,44,30,21,17,19,25,19,19,18,22,19,30,48,45,52,53,43,35,20,32,55,57,61,61,59,60,50,57,56,55,60,55,58,59,53,53,48,47,54,47,55,53,47,47,39,42,41,43,44,44,47,56,49,49,47,48,40,27,21,18,22,17,21,20,26,21,18,23,29,41,41,42,45,46,48,53,51,54,52,44,45,45,45,46,43,45,48,44,44,43,46,50,48,50,51,51,47,50,52,49,50,42,26,22,21,21,22,21,21,24,23,22,25,33,37,38,47,47,48,50,49,54,54,50,57,56,54,49,53,62,57,60,56,59,57,54,48,31,29,33,33,33,26,38,87,125,149,137,104,126,129,123,134,123,98,98,84,86,100,90,101,120,139,148,152,135,128,131,132,128,121,125,137,134,122,117,119,118,111,101,102,111,100,87,89,116,110,101,107,93,87,105,116,109,128,96,17,5,11,12,14,14,16,14,15,16,15,16,144,122,116,117,120,109,110,130,143,155,171,175,180,174,166,167,164,155,141,126,127,120,116,117,142,154,183,245,236,136,67,62,9,18,36,21,31,29,30,40,41,52,99,149,128,110,116,137,133,141,173,167,165,141,148,141,127,131,122,115,101,102,93,83,77,92,102,103,86,70,69,76,81,85,92,89,80,88,84,87,94,89,89,81,84,95,91,92,96,96,103,107,92,83,81,95,113,120,112,110,110,101,125,122,115,112,89,84,86,85,76,75,86,84,100,109,105,104,109,118,112,109,111,107,111,97,101,113,107,109,115,116,116,121,118,123,118,116,111,102,127,145,151,159,199,226,234,234,233,213,197,190,165,122,111,117,129,122,100,95,95,100,110,113,108,94,141,184,191,152,105,119,119,105,87,88,102,108,127,140,122,98,83,92,107,108,117,122,99,100,101,98,133,153,124,118,142,146,122,108,127,128,126,124,112,114,127,142,136,113,99,103,108,99,103,112,109,104,104,96,82,84,100,110,123,135,126,119,127,119,111,110,100,96,110,116,123,152,156,154,153,152,157,155,108,67,81,101,92,93,95,93,97,97,95,95,101,101,103,94,94,85,67,71,81,84,73,67,57,86,108,131,147,133,165,172,125,81,91,126,139,131,113,115,160,143,88,76,74,73,86,63,93,151,172,88,98,119,103,119,79,38,82,222,252,252,251,251,232,186,148,110,166,137,124,145,108,126,188,91,83,188,81,87,173,200,249,160,84,38,2,40,54,51,45,44,44,36,39,43,41,41,53,59,57,61,55,46,42,41,47,45,38,32,28,36,35,29,33,40,55,46,33,28,27,31,55,95,100,105,110,100,101,87,100,98,98,112,106,118,116,124,112,56,24,14,29,60,87,115,111,100,101,113,111,115,100,77,109,116,99,71,44,30,26,24,22,18,15,17,24,24,22,27,33,24,28,30,26,28,29,42,51,51,56,46,27,22,32,55,76,73,54,33,21,19,20,17,21,41,56,60,60,53,57,55,56,55,55,56,56,57,54,52,52,54,57,56,57,55,53,53,55,55,49,51,45,43,37,20,22,18,17,24,20,17,19,24,21,31,39,41,46,47,47,57,59,53,55,53,38,31,37,47,52,50,60,55,53,56,54,56,51,56,56,59,56,53,58,54,53,53,39,19,21,18,19,19,22,22,18,19,24,38,42,50,49,46,31,22,20,29,57,60,61,58,56,57,54,57,60,51,59,59,53,54,51,53,53,50,52,53,50,47,40,38,44,44,41,41,45,50,48,50,51,50,52,45,41,28,21,22,19,19,21,20,20,19,21,20,34,43,42,46,49,49,49,54,57,49,46,53,46,46,45,44,45,41,47,46,47,52,51,48,50,51,51,58,55,59,57,55,51,36,27,20,25,21,20,21,21,22,21,23,22,35,39,39,47,48,43,47,48,50,52,49,52,48,50,54,57,55,55,59,56,56,59,53,40,30,29,36,33,27,26,66,112,127,131,119,120,126,128,136,144,139,116,119,130,138,148,144,149,153,146,145,138,124,124,120,123,125,125,136,153,155,138,129,125,125,125,123,127,137,122,121,117,116,114,96,97,75,64,86,84,85,110,88,20,5,12,12,15,13,15,15,15,16,15,15,117,109,117,117,117,106,93,87,79,90,107,129,133,114,102,110,119,110,85,64,53,44,46,46,69,75,131,239,239,128,55,58,12,13,33,34,31,25,36,44,43,52,93,154,153,149,149,165,165,164,174,168,163,115,137,143,128,128,121,126,129,140,139,145,141,146,149,146,127,92,91,104,114,113,112,116,102,105,106,108,117,109,107,111,116,105,114,126,122,125,135,132,125,130,120,126,129,127,119,116,118,128,141,123,108,102,102,98,90,77,69,84,94,95,105,105,108,110,110,112,106,110,114,109,110,110,107,111,109,112,119,113,115,116,116,122,125,126,124,114,122,140,137,165,203,214,214,227,252,252,246,236,200,149,120,113,128,129,106,97,94,98,118,139,147,153,195,215,180,136,132,144,142,143,150,156,170,170,163,178,168,163,166,168,169,141,136,152,141,151,155,137,139,144,129,134,152,147,134,127,139,141,139,136,125,127,125,137,142,127,127,112,99,103,111,121,120,109,110,103,98,119,125,123,145,146,126,112,109,107,106,112,104,117,146,133,125,156,159,154,151,149,153,161,130,74,87,107,114,112,106,106,105,103,101,98,109,110,104,108,118,151,134,93,106,107,100,109,99,98,92,91,84,45,93,141,151,107,45,51,66,72,48,37,66,69,53,46,35,33,21,11,26,71,146,101,41,11,84,158,83,35,9,28,127,205,219,244,177,131,139,126,203,156,148,154,95,167,143,46,133,166,133,183,129,73,139,103,45,22,20,61,53,41,32,29,30,30,32,29,27,29,34,33,37,34,33,30,23,29,27,27,25,20,27,27,21,27,27,31,37,41,30,40,42,47,69,90,104,108,112,94,95,93,87,99,100,110,115,116,125,99,50,21,15,32,67,99,98,106,109,93,95,107,116,112,101,94,111,76,40,36,26,26,17,19,15,21,23,21,22,21,24,27,26,29,28,25,33,28,31,31,46,57,55,62,40,29,52,75,71,44,29,21,21,15,18,29,42,55,64,61,57,55,53,58,58,56,55,55,55,60,51,55,57,55,54,57,59,55,59,55,56,51,51,50,47,37,24,26,23,17,21,22,16,21,19,19,21,34,42,39,43,48,52,55,54,54,55,44,35,34,37,53,57,59,61,57,57,55,56,57,57,57,55,57,57,55,55,59,57,47,25,19,25,21,18,21,19,19,19,18,24,35,44,49,51,34,22,24,17,44,58,59,63,55,59,59,57,60,52,56,56,51,61,53,58,60,54,59,53,52,51,47,46,43,42,41,43,42,48,53,50,49,47,48,50,49,36,21,22,23,20,19,24,25,18,24,22,24,36,45,45,45,51,51,52,50,53,55,50,54,47,48,44,45,49,46,51,50,53,57,53,45,49,49,56,57,57,57,52,55,46,34,20,24,28,17,20,27,22,23,26,24,34,36,38,44,48,47,50,50,49,50,54,51,46,49,54,53,55,60,57,62,60,58,61,52,33,34,38,39,36,21,39,104,129,113,117,110,107,99,94,107,99,107,92,101,124,134,141,131,146,149,128,111,119,119,122,124,119,122,133,141,148,142,125,131,134,127,122,121,143,150,128,121,114,122,119,108,111,87,84,83,78,81,106,94,19,5,13,12,16,13,15,15,15,16,15,15,95,110,121,122,122,110,103,80,67,63,72,83,92,97,96,127,164,165,144,130,110,95,96,92,88,80,141,245,246,142,102,126,50,19,29,31,33,26,32,34,39,50,93,157,170,180,180,178,178,175,162,145,139,110,134,150,129,122,118,127,136,141,150,160,166,172,170,174,159,128,111,116,121,118,114,112,118,113,107,114,111,107,116,117,113,101,105,126,123,130,135,136,137,143,134,126,122,114,113,112,119,132,141,121,98,95,105,102,103,95,86,105,111,107,107,101,103,112,115,117,112,115,114,111,116,117,110,111,119,111,118,121,110,110,116,128,126,126,136,139,131,129,129,136,151,150,144,152,179,199,216,221,213,197,182,146,149,125,109,112,114,114,124,136,171,226,226,215,179,161,171,176,183,186,192,209,205,194,171,173,164,198,228,222,179,138,140,168,184,182,169,132,111,111,103,105,128,139,135,130,134,125,122,127,131,124,113,113,110,112,127,115,100,104,117,128,124,117,112,118,125,125,131,130,143,148,112,100,98,102,122,118,120,159,215,190,139,155,152,157,153,146,158,161,147,96,73,98,118,125,109,106,108,131,137,111,126,127,125,126,173,251,234,123,97,105,110,140,135,108,101,113,91,108,186,200,207,173,65,18,10,20,10,21,120,169,139,133,132,140,110,45,21,27,122,151,154,115,156,188,117,96,63,23,20,78,95,152,172,153,198,190,212,146,160,139,108,180,97,102,184,192,198,171,92,18,25,46,45,44,49,57,38,26,23,23,27,29,25,24,27,27,27,27,30,29,26,31,24,22,22,21,26,19,22,21,20,25,28,26,29,44,51,61,53,53,68,95,108,106,105,91,98,90,98,112,105,119,117,118,96,45,21,12,39,71,108,109,87,101,92,90,71,80,101,107,103,90,96,55,33,38,51,35,17,24,36,48,34,19,18,18,27,29,29,29,28,29,28,31,24,27,38,51,60,66,59,50,59,61,39,24,19,17,18,22,30,51,58,67,64,57,59,58,55,55,57,55,59,53,57,56,55,60,60,57,55,57,53,54,53,52,56,51,52,46,42,33,19,28,26,21,18,23,24,23,25,18,29,43,39,46,50,49,55,54,54,52,44,38,30,32,42,56,63,59,57,59,63,59,57,59,59,55,53,56,56,61,59,57,53,33,22,27,21,22,23,23,21,19,26,20,32,42,47,48,43,27,21,21,22,49,63,60,64,61,60,61,59,59,57,60,57,53,59,60,60,61,57,59,55,57,57,49,48,44,45,42,48,47,48,56,51,51,54,53,48,38,28,19,19,23,16,18,20,22,18,19,20,25,41,43,43,49,50,52,50,51,51,50,55,54,50,50,51,51,52,48,56,58,55,54,48,49,51,53,53,54,55,55,55,53,38,25,21,25,23,18,25,23,18,27,27,24,30,40,41,43,49,45,52,54,48,54,53,55,55,55,56,59,59,60,63,65,60,63,61,43,39,38,44,39,38,22,69,142,131,120,128,117,98,68,66,71,71,90,67,78,115,128,128,122,132,136,123,121,117,115,134,140,130,130,146,143,140,133,116,129,122,114,118,122,149,151,123,121,120,122,130,126,133,114,110,109,95,106,117,91,19,4,13,12,15,14,15,14,15,15,15,15,92,106,112,116,122,114,108,96,78,75,73,79,89,86,94,144,206,218,204,196,181,162,150,132,129,124,181,248,249,159,156,211,104,37,33,30,34,29,36,28,37,45,79,133,139,167,169,165,164,153,139,116,120,118,139,139,120,115,120,129,141,132,125,134,137,148,139,155,159,128,117,110,103,93,99,111,102,105,106,106,109,108,108,111,111,89,93,113,125,131,128,124,129,132,118,114,116,115,118,120,127,129,113,107,111,108,105,107,123,112,104,112,108,107,111,111,121,126,131,142,131,121,115,108,104,113,112,111,115,104,120,114,107,112,117,131,121,107,119,143,149,136,101,91,96,84,84,89,108,136,164,183,198,224,237,222,182,148,125,113,111,111,104,100,163,241,230,181,159,160,174,180,186,191,184,191,199,178,139,140,125,160,202,195,142,103,118,156,181,161,135,108,98,93,86,110,120,117,118,120,120,94,101,121,121,119,90,82,84,84,112,114,125,133,131,135,129,112,116,122,113,112,113,110,129,118,89,90,106,135,152,150,152,190,247,226,150,154,152,155,159,145,153,157,159,116,71,83,105,120,118,113,145,181,177,137,135,134,122,126,162,239,226,126,94,101,132,167,170,117,78,96,92,118,176,233,231,161,131,88,77,63,55,57,127,208,177,182,158,196,192,96,81,24,94,172,179,172,206,201,126,116,105,89,83,57,57,156,157,158,165,150,199,153,169,114,139,182,125,196,222,221,189,118,68,10,12,30,63,71,51,34,23,24,23,27,29,25,29,30,27,28,27,29,28,28,25,24,24,18,23,22,26,20,23,25,24,24,25,29,29,47,67,78,56,53,83,109,105,91,97,77,81,98,103,115,108,117,120,77,42,16,15,39,72,114,125,110,98,96,99,104,71,71,96,103,113,102,107,88,86,107,103,68,26,25,49,51,29,22,24,22,25,24,31,28,28,29,24,26,27,29,31,46,61,66,76,61,46,27,15,19,18,19,26,39,55,57,63,68,61,59,62,57,56,60,57,59,59,60,66,61,59,61,57,63,63,57,56,53,53,55,57,51,45,41,45,30,23,21,22,25,19,21,24,27,23,27,39,43,42,50,54,53,50,51,54,39,39,36,25,35,40,51,58,60,60,60,61,57,61,62,57,55,56,55,55,55,56,57,45,27,23,24,25,27,18,19,23,21,20,30,39,42,46,46,42,28,31,24,37,63,63,64,65,61,57,57,57,61,55,56,57,58,65,63,58,54,58,57,56,60,57,54,50,45,51,51,55,53,50,54,51,52,48,51,51,37,26,19,23,22,19,23,19,17,25,24,20,35,43,43,48,49,50,53,57,55,57,55,51,53,60,62,54,56,58,57,54,48,53,46,49,55,52,53,52,55,56,57,51,42,31,22,25,30,21,22,24,19,24,22,21,23,31,46,47,51,52,56,54,53,60,60,60,56,64,68,68,66,64,65,66,65,60,59,53,44,41,45,38,46,44,44,125,160,141,140,148,133,115,76,68,85,84,105,77,96,128,127,136,127,137,136,131,135,131,132,145,148,141,135,132,113,125,130,108,123,118,125,134,137,151,148,143,147,141,134,132,137,142,127,125,124,121,110,109,84,19,6,12,12,14,13,15,15,15,16,15,15,91,97,103,118,122,119,119,104,93,82,84,77,78,85,71,83,128,143,140,145,152,147,137,134,148,176,229,251,248,142,170,244,158,62,25,17,33,34,44,49,39,46,67,91,88,118,123,104,108,101,86,89,95,94,108,110,99,97,100,107,101,100,98,88,93,83,91,109,117,104,95,93,81,92,98,96,99,108,110,108,110,112,109,114,115,100,103,122,134,127,111,113,112,110,107,112,122,123,121,122,140,124,105,120,127,128,127,124,131,127,119,126,129,120,125,133,134,141,148,149,137,129,118,110,99,105,109,100,110,94,110,111,103,115,117,136,117,103,103,120,131,131,104,83,84,75,77,78,84,102,124,137,139,151,178,210,207,208,181,162,116,98,77,63,136,192,169,122,124,129,137,145,134,132,122,132,132,130,117,121,93,98,127,130,107,95,106,122,136,130,113,105,101,118,125,129,121,101,102,104,109,86,105,113,101,88,72,73,62,72,85,98,126,137,135,130,118,117,125,122,110,109,116,104,99,97,89,117,141,165,178,170,168,184,220,188,147,165,156,160,160,142,149,150,168,147,83,71,79,105,113,137,171,181,184,156,135,112,104,98,95,165,149,122,107,121,152,171,183,145,78,76,81,107,127,139,160,169,221,156,153,166,118,95,95,162,199,165,132,152,174,130,71,10,50,139,145,123,189,165,74,83,75,79,90,78,97,120,139,136,88,141,185,162,137,83,151,159,168,235,193,229,170,59,38,2,31,60,57,54,32,25,32,35,29,27,29,28,30,33,29,25,32,24,27,26,24,24,19,24,22,19,25,24,25,24,26,25,27,32,42,66,74,80,60,79,106,98,92,85,84,64,82,102,103,105,101,110,75,30,18,21,49,84,116,124,127,100,95,107,103,156,114,88,100,107,117,112,119,105,124,139,145,101,39,33,34,33,26,30,29,21,24,29,29,35,34,31,24,23,28,27,26,43,61,68,81,79,46,21,19,17,17,27,50,55,61,67,62,66,63,61,60,61,63,61,65,61,62,64,62,62,62,60,53,55,55,51,57,54,54,55,55,53,45,46,39,23,17,20,23,21,19,23,26,24,22,30,43,45,51,53,52,51,53,54,49,44,36,33,29,34,43,52,57,60,56,55,60,58,57,60,60,57,58,53,49,52,54,50,41,28,19,23,21,19,17,19,22,21,23,31,42,45,48,44,39,37,34,27,46,66,62,61,65,59,55,56,53,55,54,54,54,60,56,59,60,61,56,55,61,57,60,61,50,52,60,55,53,55,60,54,50,53,53,47,49,35,19,23,17,19,25,19,19,25,24,21,30,37,46,46,47,57,57,52,54,57,55,52,58,64,57,56,55,58,59,55,49,45,44,44,48,54,53,57,61,57,55,51,50,35,25,26,21,25,22,24,21,17,22,23,27,25,43,57,54,58,61,61,63,60,55,61,60,61,69,66,63,63,61,61,60,62,59,55,49,41,42,33,37,48,57,91,145,142,127,139,141,141,127,111,121,122,114,143,127,116,122,130,135,112,132,145,131,139,132,132,151,149,136,113,101,84,105,125,111,118,120,137,143,130,129,132,144,162,153,141,130,121,128,113,116,111,105,104,102,86,19,4,12,11,15,13,15,14,15,15,15,15,110,113,112,122,124,126,129,118,110,108,107,103,104,100,95,94,92,87,67,76,101,98,98,103,142,195,232,252,246,105,128,237,157,39,27,18,28,37,63,78,48,45,67,83,72,91,81,66,77,67,74,77,83,82,87,92,90,92,89,89,93,86,86,87,83,87,78,85,97,92,96,97,102,101,96,96,96,104,104,98,101,111,111,124,127,109,119,124,133,119,100,106,107,107,101,105,124,120,117,120,136,136,129,136,121,125,129,120,131,135,141,144,135,117,117,117,128,136,138,147,131,134,131,124,113,114,129,125,131,110,121,116,107,116,113,122,116,110,99,118,151,171,147,123,110,96,98,94,97,92,103,115,103,85,110,178,212,229,233,218,182,146,78,47,86,142,107,67,93,99,104,106,108,110,96,87,91,100,96,96,88,87,89,105,107,109,112,97,110,117,113,109,124,136,127,131,100,82,88,98,113,97,108,102,84,78,63,78,69,61,74,66,86,112,122,127,120,124,134,120,113,123,120,116,112,116,133,152,175,184,186,184,171,164,168,160,155,173,165,166,168,153,150,139,167,176,106,68,68,93,113,129,150,150,147,139,130,118,105,91,96,107,109,119,145,160,173,174,186,174,114,86,94,121,125,131,107,189,240,185,245,216,152,104,109,185,199,172,148,161,135,81,66,9,45,159,165,135,181,131,42,42,38,60,66,69,89,122,144,106,113,171,171,139,76,105,148,127,189,157,85,147,113,27,7,28,67,57,38,34,33,31,27,39,42,35,32,30,27,28,28,29,23,27,25,24,30,21,22,23,20,22,23,21,29,26,24,26,27,43,59,84,79,63,57,87,98,98,91,73,78,63,89,110,98,97,97,62,29,17,27,59,98,122,122,116,110,92,97,89,104,129,88,97,105,106,117,105,110,108,117,120,129,133,86,42,33,37,42,45,43,36,22,25,33,31,32,30,26,28,27,30,34,51,67,73,97,103,66,42,51,39,41,51,56,61,63,66,65,62,61,60,60,57,57,60,58,63,59,56,60,53,55,54,50,54,47,49,55,51,46,53,53,52,48,45,37,19,19,23,22,23,19,19,18,19,30,40,50,49,47,57,50,53,57,55,50,42,39,34,38,37,44,54,54,57,57,61,58,57,64,53,56,61,59,55,52,51,50,48,32,20,23,19,17,22,19,21,22,20,27,42,46,49,48,38,29,30,29,33,53,63,59,61,61,53,55,54,50,49,54,56,57,57,58,57,55,60,62,59,53,59,63,59,61,59,57,59,61,58,56,55,53,53,49,49,46,40,20,18,25,19,21,15,17,24,27,27,33,43,44,53,57,57,57,57,57,55,55,58,62,58,59,58,55,50,53,50,41,39,40,47,49,52,60,60,60,61,59,51,49,33,18,22,24,25,24,22,16,21,27,27,27,39,51,55,59,61,59,62,61,57,60,57,62,63,60,64,59,66,62,58,62,59,60,57,45,47,44,43,77,77,74,107,122,105,106,120,114,114,124,127,152,142,143,165,127,127,122,117,117,102,132,139,118,120,116,122,139,143,137,124,117,107,118,131,121,113,99,107,113,105,100,105,125,145,145,131,119,98,93,90,98,112,103,95,108,86,21,6,11,12,15,13,15,14,15,15,15,15,116,123,117,119,112,117,134,136,138,139,142,141,145,150,151,167,160,137,113,100,107,104,103,101,111,139,196,250,237,79,88,180,74,31,33,17,25,26,65,82,46,45,75,89,89,93,79,78,85,86,89,84,84,87,89,92,93,88,87,94,88,93,98,91,96,90,90,90,93,96,107,131,126,111,98,94,94,91,92,87,96,105,108,123,122,106,109,112,115,112,97,104,106,101,101,107,118,117,117,115,125,128,134,136,103,103,106,104,117,135,136,123,125,108,105,110,122,133,140,139,131,136,126,135,131,139,150,155,160,134,137,119,106,112,105,109,107,107,113,159,189,189,166,139,113,116,141,129,111,108,126,134,113,116,112,156,181,197,232,234,238,215,127,101,132,133,113,69,66,63,71,117,140,128,121,125,104,90,99,108,96,103,118,117,116,121,108,93,105,105,104,107,112,123,108,102,81,81,111,127,123,103,118,112,93,79,74,74,71,73,64,57,58,71,107,128,122,133,128,107,102,112,125,129,127,133,156,172,179,182,178,171,170,164,167,160,159,171,170,169,165,166,163,121,126,156,124,102,96,110,111,112,118,111,112,103,107,120,129,117,107,111,102,109,143,170,181,162,169,169,130,117,125,129,129,137,128,225,222,191,251,213,163,103,128,214,201,136,126,137,103,68,50,5,66,205,173,143,189,100,26,66,78,122,129,125,142,142,140,89,146,178,146,126,102,179,196,154,148,78,5,37,34,32,57,55,56,35,29,28,23,37,28,31,43,43,39,33,29,26,33,25,30,27,23,29,20,24,24,24,24,22,26,29,31,23,24,27,35,53,70,100,88,60,41,75,97,87,78,70,86,82,104,120,109,89,50,27,25,29,62,98,111,120,116,104,106,84,90,90,83,85,35,56,68,85,97,94,100,84,110,103,123,141,116,75,38,48,57,57,56,51,33,20,31,38,26,27,29,25,26,44,61,58,55,74,103,94,54,62,79,60,59,59,62,67,68,69,62,56,55,59,55,59,56,53,56,53,53,50,57,56,51,55,54,55,53,47,59,54,51,57,55,50,54,53,30,19,22,24,20,19,23,20,22,22,25,48,54,53,54,53,54,56,56,55,48,42,42,33,38,47,46,50,58,57,55,56,57,59,59,54,53,58,58,53,52,53,50,41,22,22,19,21,21,24,21,20,22,21,31,44,50,53,42,36,31,29,32,29,52,59,58,59,60,55,55,55,53,49,50,52,54,59,54,56,46,48,55,59,59,54,54,56,59,55,59,55,56,57,57,57,53,53,52,55,51,28,19,21,20,19,21,24,19,18,21,24,39,48,50,58,57,53,59,61,57,55,60,59,58,58,57,60,57,60,57,44,46,43,47,54,47,60,57,60,57,57,57,52,47,25,20,21,22,27,21,15,23,19,21,26,24,43,59,59,56,61,61,58,50,49,57,61,60,61,61,58,59,63,63,60,57,57,60,54,42,58,101,140,162,140,124,116,102,83,96,100,78,84,95,120,132,131,134,132,105,97,92,92,113,111,127,111,97,113,112,122,139,152,150,145,146,135,127,119,129,122,82,78,97,103,99,101,108,125,117,111,108,92,83,81,92,102,101,95,107,95,19,5,13,11,14,13,14,14,14,15,15,15,116,118,105,112,104,110,130,130,136,141,144,137,143,146,168,205,204,191,153,129,123,118,119,113,120,116,171,250,233,78,69,84,11,37,35,16,29,26,39,45,46,57,89,113,103,111,98,92,100,101,101,89,93,94,98,105,95,92,89,92,96,88,96,94,91,92,93,100,96,91,107,133,125,115,110,100,89,88,101,105,98,106,115,116,101,90,111,111,113,108,101,116,113,115,111,111,117,111,110,102,106,106,110,120,97,100,108,107,110,111,111,111,113,105,116,121,128,137,126,127,113,103,101,110,116,113,118,124,127,115,116,103,100,103,100,104,98,104,112,141,157,150,128,110,106,128,163,137,110,115,132,134,112,106,122,143,159,169,176,178,191,242,218,208,167,159,189,139,131,81,57,112,151,157,155,159,137,125,128,137,133,122,129,132,117,120,114,85,89,94,80,81,90,92,83,78,77,97,124,130,108,108,119,100,98,91,98,99,84,81,75,62,51,60,95,99,106,120,126,107,101,121,113,112,120,142,165,171,177,172,168,170,169,168,167,168,167,166,169,167,164,166,178,114,75,123,131,130,125,138,116,103,102,91,94,77,78,101,123,106,108,107,98,106,113,132,143,139,133,125,120,129,133,120,127,118,158,252,201,215,251,221,166,123,201,221,136,79,110,184,137,49,42,1,78,184,131,152,198,101,42,95,132,166,159,155,146,177,141,104,170,145,154,170,172,230,245,158,92,42,1,19,36,65,71,46,35,24,24,29,26,31,34,33,34,40,44,37,30,29,26,26,26,24,24,23,23,23,23,26,24,29,29,25,28,27,25,27,43,54,82,108,100,77,54,76,77,78,77,81,94,89,119,128,102,51,17,17,37,66,105,116,114,108,106,108,95,85,92,78,118,103,17,14,30,69,87,90,69,53,88,98,119,131,102,80,59,53,54,62,67,63,47,27,22,32,33,33,33,26,46,57,54,41,31,55,61,55,39,59,81,68,63,59,62,71,70,61,56,56,55,56,57,53,58,55,49,54,53,55,53,50,56,60,57,59,59,57,55,54,59,54,55,57,51,42,28,20,17,21,23,21,21,16,24,19,32,51,52,52,50,54,53,50,54,53,49,39,32,29,35,54,54,55,50,53,57,55,56,52,54,51,56,57,47,52,55,53,48,36,21,18,24,26,21,21,24,23,23,24,34,50,52,51,39,35,34,37,29,36,52,55,52,59,59,53,53,50,52,45,48,49,51,48,50,45,30,38,47,53,58,53,53,51,54,57,50,54,54,58,55,53,57,52,50,52,42,25,17,22,20,21,24,22,20,15,24,32,46,52,55,63,53,57,57,59,56,55,62,59,61,65,65,65,59,61,55,45,46,45,50,49,54,59,59,62,62,60,58,53,37,19,19,22,23,19,17,22,25,19,20,26,37,56,60,59,57,65,62,50,48,51,58,58,59,59,56,59,60,60,60,63,63,55,61,49,43,116,162,184,187,160,148,113,85,79,81,87,73,77,90,102,121,109,106,100,88,77,53,71,99,113,117,100,89,107,125,134,141,153,148,131,136,133,114,108,147,148,106,101,108,114,114,113,105,112,113,108,113,108,106,96,89,80,81,84,101,89,19,5,13,12,16,13,15,15,15,16,15,16,155,141,112,120,116,110,111,107,111,113,118,109,108,112,120,158,169,166,135,122,122,118,116,122,139,133,196,250,228,71,72,64,4,49,28,26,31,23,36,41,44,55,103,122,113,119,112,111,110,116,116,112,108,110,116,123,124,117,115,110,113,115,115,107,107,110,119,125,113,113,107,117,117,116,111,95,96,103,113,103,107,112,109,110,89,95,117,120,114,114,118,126,125,121,126,125,122,112,90,91,94,80,92,110,99,106,117,113,101,107,104,109,116,107,117,114,108,107,115,111,92,90,92,95,82,83,84,91,105,104,108,98,102,98,97,99,92,86,88,111,110,111,108,111,97,114,147,124,108,99,92,89,69,69,86,99,149,193,190,139,149,234,201,157,88,98,202,230,225,179,149,150,145,117,131,139,141,146,148,137,128,118,122,124,105,110,106,90,102,98,95,91,95,102,82,77,80,84,111,116,91,93,97,84,79,99,135,131,120,108,88,79,68,81,98,95,95,124,137,120,125,118,99,116,143,158,171,171,163,170,171,170,169,164,168,168,168,170,168,166,163,166,173,141,94,112,127,122,128,136,131,117,109,111,103,78,67,78,96,93,95,97,102,105,96,84,94,107,97,99,102,117,128,112,129,123,202,252,212,242,251,243,174,160,225,192,85,64,169,211,177,62,32,7,93,204,143,181,210,125,80,101,153,155,136,162,163,179,120,144,171,150,181,196,185,165,188,136,71,19,5,49,61,64,43,31,33,29,29,27,32,33,31,33,32,33,43,44,43,26,28,33,24,24,23,24,23,26,23,29,27,27,30,21,26,26,29,39,51,61,82,105,103,86,66,71,75,87,91,99,98,97,129,93,41,19,16,42,78,106,119,114,101,107,104,105,99,83,96,99,140,146,40,14,31,69,95,84,78,74,93,98,124,114,67,70,96,97,57,57,67,68,61,39,26,23,33,33,35,51,60,48,30,22,20,35,59,55,50,81,83,65,59,62,63,69,69,61,53,50,53,56,55,52,55,49,55,57,55,51,50,54,54,58,53,61,59,55,60,56,53,53,53,54,53,44,26,19,21,18,19,17,17,20,22,28,39,50,50,49,49,47,53,51,57,57,47,38,29,35,45,50,50,56,53,56,55,51,56,59,55,53,53,56,53,54,53,50,44,30,18,19,19,16,21,19,18,21,18,29,48,51,50,50,40,35,39,36,31,41,55,49,51,53,51,45,35,36,39,39,37,36,32,33,28,26,25,25,34,43,56,53,50,56,53,53,51,53,57,49,56,57,50,52,54,46,37,24,15,19,16,24,24,19,19,22,20,31,54,57,54,59,57,57,56,57,59,60,58,56,62,61,61,63,57,61,50,43,46,42,53,56,54,59,57,61,62,59,57,45,23,21,17,19,21,19,25,21,20,21,23,34,50,60,59,60,64,63,60,47,41,54,57,57,57,58,59,54,60,62,57,59,66,60,63,43,58,126,159,156,146,131,105,81,56,44,84,107,78,77,99,129,122,115,110,105,113,91,83,79,93,107,105,95,98,122,130,141,145,150,136,118,110,106,110,118,165,172,138,124,109,104,105,105,108,122,121,119,125,128,116,99,93,74,71,59,69,75,21,7,12,12,15,13,14,14,14,14,15,15,198,162,136,132,130,120,119,111,101,120,135,117,121,119,100,114,117,120,115,122,118,115,116,127,151,150,214,249,229,71,69,49,13,52,21,35,31,26,43,40,45,54,104,120,105,113,115,117,115,125,126,115,118,110,115,128,132,137,123,118,120,123,128,115,110,109,119,128,127,125,115,109,105,105,115,112,104,109,115,112,110,117,116,109,100,104,125,129,130,123,121,132,130,129,130,121,125,113,100,95,81,80,93,104,97,98,103,103,90,85,89,95,104,98,113,97,88,92,103,122,105,113,116,115,110,105,112,119,128,128,120,118,113,108,114,102,103,101,104,112,109,122,122,111,100,104,115,111,110,85,62,77,78,63,55,51,97,157,177,151,143,151,127,97,24,28,121,181,237,237,221,201,156,109,84,78,84,113,121,107,102,90,102,106,91,98,112,112,119,127,105,99,101,95,111,101,77,85,101,107,84,98,103,77,81,84,121,134,127,117,96,88,77,91,126,121,145,159,146,131,123,129,126,147,163,161,158,156,148,152,160,160,159,160,160,159,163,163,162,157,153,157,173,158,126,114,103,118,127,135,128,113,106,102,89,72,84,92,106,91,77,84,86,87,68,60,62,80,91,87,92,107,105,112,118,139,242,252,228,227,244,240,156,157,220,155,47,32,97,191,175,69,36,21,145,253,185,159,180,143,107,115,182,158,151,179,182,139,96,172,174,168,180,164,110,42,86,83,34,19,36,66,56,38,29,27,32,36,35,35,34,33,33,32,32,31,28,46,46,35,32,21,31,25,23,32,25,25,26,31,23,24,28,24,23,27,38,50,57,80,101,95,81,63,69,91,94,94,93,117,112,88,70,30,18,24,50,87,111,117,109,100,105,98,105,104,88,97,106,94,160,177,90,66,52,74,87,88,93,97,107,106,115,96,78,92,113,136,94,49,63,67,71,54,31,21,24,43,52,51,39,23,21,22,16,24,41,60,67,91,84,63,67,61,69,70,61,54,55,49,55,55,56,53,48,55,55,52,53,55,55,55,55,58,63,55,56,59,53,55,53,53,53,54,49,39,24,17,20,17,18,18,19,19,32,36,39,49,44,48,41,39,42,42,46,43,38,29,35,31,36,48,44,49,55,58,57,56,57,55,54,50,53,56,53,51,49,47,44,27,19,19,16,19,19,21,19,16,27,41,46,53,50,45,43,34,37,32,32,54,59,57,54,55,37,19,18,16,17,15,20,21,17,19,17,20,20,15,32,50,55,53,50,52,48,52,53,50,51,51,53,54,55,47,51,46,26,21,21,17,16,19,22,18,19,23,29,43,60,51,52,57,54,59,53,57,55,59,59,50,57,57,57,61,61,50,42,38,38,40,46,56,49,56,59,57,61,58,54,38,26,17,17,22,19,21,21,27,21,19,26,44,63,65,61,66,60,57,56,41,46,55,60,64,59,63,61,62,66,63,69,70,64,65,55,38,92,146,139,126,114,86,73,68,51,62,113,125,69,63,96,105,112,115,118,105,100,90,93,93,86,84,83,101,112,118,120,127,136,145,126,101,101,101,102,99,137,156,133,110,76,64,65,77,92,104,111,116,126,122,114,95,91,83,53,39,34,43,24,8,14,13,14,13,14,15,15,15,15,15,171,155,144,131,126,128,131,126,115,122,141,130,137,139,115,109,101,117,121,131,128,120,126,135,147,147,216,250,228,74,70,43,12,50,19,29,30,29,44,43,47,56,101,117,92,96,108,111,108,119,115,108,108,101,100,103,113,117,113,98,96,110,115,111,101,89,93,108,112,109,102,109,105,110,121,117,119,114,117,118,122,118,112,118,107,122,136,133,130,115,121,126,124,126,126,120,109,113,103,98,92,84,111,117,97,90,87,87,77,80,71,73,93,94,104,101,95,93,110,115,131,145,145,151,145,147,147,150,134,133,130,126,125,115,103,107,121,122,117,115,118,122,126,122,112,107,102,102,109,102,101,121,111,79,56,51,56,78,93,104,126,121,117,147,127,51,35,112,153,164,220,195,194,181,138,104,83,88,89,74,83,73,86,110,103,115,122,118,123,130,113,94,72,79,115,104,92,84,104,121,108,125,127,118,96,79,82,99,120,113,123,123,96,120,139,158,163,131,120,117,131,149,149,162,165,149,148,146,135,143,152,155,157,152,151,152,148,147,148,148,145,152,170,162,140,122,108,105,111,111,115,100,95,105,74,69,99,122,110,68,56,51,53,61,53,44,55,79,88,92,88,94,84,89,107,179,234,252,229,213,240,226,163,188,221,148,40,6,12,46,84,40,31,40,159,245,140,104,122,132,111,144,203,163,161,176,174,100,137,210,193,182,154,118,70,14,8,27,39,53,68,48,26,29,27,27,29,24,40,39,35,36,29,33,29,29,29,25,44,47,34,24,22,26,24,27,27,28,27,24,24,26,27,27,28,31,49,54,70,93,99,73,62,72,82,101,103,107,106,118,84,49,27,15,33,65,95,121,116,105,101,104,101,99,102,92,94,104,113,95,150,176,94,92,75,76,80,86,104,102,103,105,113,98,98,111,113,122,88,59,54,63,75,64,48,21,31,49,44,38,21,17,24,17,21,30,53,57,68,90,74,63,66,71,78,68,55,56,55,55,56,61,54,53,57,59,60,59,53,57,61,55,57,59,56,59,58,60,53,53,55,53,55,50,50,32,16,24,13,19,20,16,21,24,40,44,43,45,42,40,37,39,33,25,27,27,24,27,25,24,22,33,36,43,56,54,54,53,54,54,54,55,51,50,49,54,50,44,39,26,15,20,21,16,18,18,18,29,32,32,39,47,48,41,34,39,36,27,39,50,44,50,53,48,30,11,17,19,14,16,16,15,15,19,21,17,19,22,47,57,57,58,52,51,48,55,49,48,50,50,53,49,51,51,45,41,25,16,18,18,16,19,22,15,22,33,39,50,53,54,55,56,58,51,53,57,53,58,53,47,54,52,51,53,54,47,35,31,42,43,46,53,49,56,55,53,53,53,47,31,23,21,16,17,18,19,21,18,20,25,39,56,61,63,68,65,62,56,45,36,48,63,60,64,66,63,75,71,68,71,68,65,52,44,34,72,143,134,119,118,100,72,74,92,108,124,143,127,66,43,55,58,55,75,74,47,42,38,72,89,75,70,60,69,78,84,81,87,108,125,113,100,110,95,67,47,83,113,94,71,48,34,44,53,57,72,81,97,98,102,88,77,83,64,55,34,21,29,17,12,15,14,15,15,15,15,16,15,16,15,132,155,167,126,115,124,141,140,108,113,117,98,123,132,117,117,109,122,116,120,122,119,130,134,122,124,210,251,226,71,74,43,13,48,24,37,26,30,43,46,53,58,112,122,97,101,101,108,109,118,114,100,115,107,105,101,96,102,101,101,103,110,113,113,107,94,92,106,103,107,107,102,116,121,122,117,113,122,120,118,120,117,117,116,115,130,149,138,118,118,113,111,111,117,119,113,107,95,108,117,100,101,121,115,108,108,102,112,100,97,94,96,113,103,107,109,122,105,94,106,114,128,123,135,139,127,130,129,130,122,121,127,96,85,82,80,119,111,97,96,96,109,120,127,124,117,109,106,114,116,128,128,102,94,88,91,79,57,69,80,88,116,163,190,169,80,46,99,128,128,144,162,212,232,207,162,115,87,65,63,75,81,98,116,126,128,130,124,121,125,116,121,106,96,117,107,106,114,123,134,128,139,149,128,102,80,75,81,123,150,153,137,120,133,145,141,119,82,76,114,146,172,176,185,185,177,178,180,165,178,190,180,184,181,172,160,159,163,176,185,177,190,193,165,156,148,115,93,95,107,115,113,121,129,99,90,110,118,112,75,56,63,60,68,65,56,49,60,78,77,83,84,77,84,106,219,251,251,234,218,250,236,187,200,225,168,61,9,2,12,46,41,39,75,129,141,89,43,64,104,128,195,193,171,171,177,157,142,221,245,169,158,157,97,57,9,11,49,72,69,42,35,31,26,27,29,33,35,27,37,44,39,33,29,31,28,31,29,25,39,42,35,29,23,32,27,30,29,22,28,24,27,30,28,36,48,55,65,88,103,78,64,83,88,98,103,110,121,120,97,48,28,23,28,69,101,120,111,96,103,95,94,105,97,108,100,89,99,97,74,124,149,93,95,83,87,88,96,106,101,105,104,109,110,109,105,93,111,90,49,47,54,79,77,64,42,35,39,25,23,19,18,18,23,38,56,66,73,78,62,41,58,84,89,80,67,70,67,67,67,69,69,57,47,69,72,70,63,61,67,65,66,63,59,60,58,62,63,57,57,55,55,57,54,47,29,21,17,16,20,23,19,16,31,45,41,46,42,37,40,30,41,41,28,31,24,26,27,19,22,26,25,31,41,48,42,54,62,59,60,57,58,57,57,58,57,55,53,39,20,18,17,19,19,19,22,20,20,19,20,25,28,33,40,39,34,33,27,24,28,29,25,37,39,22,15,17,17,15,17,15,17,17,18,19,19,16,29,51,47,51,50,46,44,41,53,50,54,59,54,51,47,49,48,46,30,20,19,17,20,19,18,22,19,32,42,41,51,52,54,56,55,55,58,57,51,48,47,47,49,45,41,49,41,48,42,39,39,39,44,37,44,43,52,54,50,49,45,37,21,22,21,15,22,16,21,23,19,28,42,55,61,65,62,60,63,57,48,42,43,55,69,64,73,78,75,79,74,71,55,51,47,34,34,31,96,122,108,136,132,99,85,106,130,149,153,145,131,96,77,59,30,48,67,52,30,28,42,110,140,102,87,86,92,87,75,72,87,99,122,115,112,111,74,53,33,53,84,70,77,83,66,63,63,53,50,63,71,76,78,68,62,68,53,47,66,79,76,23,6,14,12,16,15,16,15,15,16,16,16,142,174,188,127,110,134,170,168,120,122,121,101,122,130,128,126,120,139,124,122,125,124,120,126,118,125,211,253,224,66,86,48,14,49,22,32,27,27,44,50,55,63,120,135,108,101,103,117,111,126,126,109,118,110,105,102,104,114,120,116,105,109,117,120,113,109,105,111,119,116,113,109,110,117,116,107,104,103,110,112,116,114,107,111,104,131,141,128,118,110,124,111,105,109,113,119,105,93,111,124,113,103,105,94,99,112,112,114,105,113,113,117,130,116,114,118,131,114,92,89,96,96,88,102,108,92,84,94,109,105,111,108,77,78,83,97,104,89,81,97,101,105,106,91,108,112,110,100,93,103,97,86,77,102,121,118,122,115,99,102,85,106,134,139,121,69,59,114,127,98,96,120,132,176,185,181,180,172,137,107,109,101,99,121,126,127,123,125,122,116,128,137,131,128,128,111,123,136,114,117,117,129,135,111,98,89,73,83,113,134,142,107,92,117,108,104,89,59,56,91,163,200,192,207,220,219,224,219,198,201,220,214,209,207,182,170,182,205,221,228,205,213,220,163,134,135,105,92,95,92,115,124,131,127,100,96,89,106,129,116,107,99,89,95,74,54,53,41,66,60,70,114,114,98,126,234,252,252,250,232,251,232,180,185,185,172,112,36,19,100,164,61,53,110,104,62,39,12,18,101,167,214,188,179,188,187,174,201,250,174,94,123,114,52,18,6,59,96,72,48,30,32,36,31,28,32,33,30,34,33,39,43,36,34,28,27,31,29,25,29,40,41,39,27,28,30,33,26,23,33,22,24,31,30,42,57,60,83,95,78,77,83,96,96,101,117,112,114,83,35,19,25,51,65,99,108,99,99,97,98,77,92,103,100,108,84,79,65,51,39,108,141,84,97,76,93,100,88,102,103,98,103,111,104,98,93,92,102,101,83,59,52,74,84,76,59,41,17,17,23,17,18,27,44,55,60,65,76,84,64,61,92,93,94,79,78,84,75,83,77,70,63,53,55,76,89,83,78,70,73,76,73,71,65,66,66,64,60,63,61,57,63,57,55,46,34,24,19,16,18,19,17,23,28,44,41,37,34,32,30,29,39,29,35,32,19,20,26,27,24,23,24,26,37,35,32,54,59,62,63,64,61,55,62,62,56,55,50,37,20,19,16,16,21,18,22,20,21,20,27,22,20,36,41,37,39,33,23,21,20,20,30,41,31,21,16,16,17,14,19,19,19,15,17,19,19,23,23,31,34,31,37,32,28,29,38,54,55,54,56,54,52,48,52,46,31,18,16,19,21,21,17,21,24,34,42,45,55,59,53,56,53,53,55,53,53,49,47,41,41,43,44,44,38,43,40,40,34,32,38,41,47,43,44,50,49,46,44,35,19,18,18,19,22,14,22,24,30,47,49,58,61,60,57,62,60,51,45,39,49,66,74,72,81,78,77,84,65,44,36,37,33,32,28,36,89,109,112,145,132,91,104,146,146,147,133,116,129,124,110,109,116,133,143,121,107,112,133,186,184,139,141,152,164,156,129,123,125,137,148,139,136,123,95,76,46,83,125,98,99,107,90,88,83,77,57,49,67,77,89,71,65,66,45,72,108,157,113,12,5,10,13,14,12,15,15,15,15,16,16,122,169,180,132,126,149,188,184,147,160,150,128,148,148,153,149,146,165,149,148,151,147,141,146,130,145,224,253,218,69,105,56,15,51,21,36,24,29,44,51,54,59,122,131,117,113,105,118,113,119,120,112,115,103,106,115,120,128,120,110,103,107,113,115,109,109,110,121,119,119,121,112,120,118,117,103,89,101,103,113,112,104,103,91,98,111,130,134,129,141,135,124,112,121,123,123,124,112,122,123,110,96,85,78,90,99,89,88,75,83,88,94,112,102,107,105,118,116,97,100,107,111,103,104,90,79,80,88,94,90,96,95,81,93,100,105,104,85,85,101,102,97,84,70,77,95,103,89,77,71,77,69,65,92,97,107,123,126,120,114,94,78,76,93,101,74,74,91,91,102,114,94,92,113,139,167,219,245,216,174,166,132,105,124,121,111,109,107,103,113,125,118,116,129,136,130,136,127,91,84,88,110,135,103,92,97,74,60,77,98,122,118,85,91,96,87,66,70,57,67,143,183,178,208,231,230,232,218,170,174,210,211,200,193,184,171,193,213,214,214,172,201,217,131,90,103,101,104,105,98,97,110,124,97,66,66,61,83,125,139,136,105,79,85,70,57,40,44,85,80,99,128,139,108,149,253,253,253,235,214,240,203,171,147,178,183,150,132,134,227,201,71,54,79,54,7,26,28,28,110,192,200,189,194,211,201,179,199,187,119,57,38,45,18,26,67,85,63,41,36,30,31,35,39,37,28,27,33,36,35,32,39,44,43,28,23,27,24,31,26,27,40,46,41,29,29,24,24,25,28,26,29,39,41,56,59,83,90,73,88,96,92,93,94,107,116,108,66,28,19,28,57,83,97,112,106,100,99,100,104,92,92,98,100,105,90,84,67,61,51,112,141,84,92,80,99,88,85,100,91,97,92,94,93,89,94,93,105,107,112,100,64,64,75,83,73,45,20,14,18,13,33,50,57,61,63,77,89,101,97,96,103,110,101,74,63,69,64,66,67,55,46,43,48,62,69,83,81,74,67,72,66,67,73,64,69,66,68,66,65,63,59,59,58,42,29,22,14,18,17,21,23,16,22,33,34,28,24,29,33,34,29,24,22,30,27,19,26,23,29,25,22,27,29,38,37,42,48,53,59,57,54,44,41,52,42,37,39,28,20,19,16,21,20,21,21,18,20,24,29,26,29,33,39,39,34,31,27,22,24,31,33,31,25,18,14,19,21,19,21,21,22,20,23,19,16,21,25,27,29,23,25,24,21,25,30,48,64,56,55,58,54,51,46,46,27,19,21,17,22,18,22,22,21,45,51,51,56,56,58,57,56,58,59,56,50,51,45,43,48,50,51,47,43,44,35,38,39,40,40,43,53,48,50,51,45,48,41,27,25,23,17,22,25,19,19,26,40,50,54,57,60,62,64,66,63,48,45,46,54,69,79,72,76,78,61,60,45,35,26,24,31,31,29,48,120,112,92,111,95,92,120,139,127,115,103,89,103,99,101,153,182,194,193,181,171,170,193,201,179,145,141,173,184,179,156,136,141,141,156,153,155,154,129,112,77,121,171,117,89,92,88,113,124,106,90,91,98,114,122,103,99,103,94,112,152,187,115,10,3,10,12,14,12,15,14,14,14,15,15,92,124,141,118,128,151,166,162,149,162,150,145,155,150,160,152,150,163,149,151,163,161,152,157,142,161,235,253,206,64,117,54,13,46,17,37,16,30,42,46,58,64,113,122,115,111,101,113,95,96,107,108,123,118,116,115,119,125,116,110,102,106,113,111,108,107,110,113,113,117,116,121,128,124,118,108,112,115,120,113,103,101,104,111,108,127,143,148,147,148,148,135,140,152,142,149,147,141,149,137,133,128,119,108,110,109,104,98,81,71,69,83,92,101,105,102,125,116,117,136,136,140,116,99,97,92,92,95,94,81,91,92,95,88,76,97,96,83,80,92,83,81,74,43,64,90,106,98,74,71,78,72,72,85,88,89,104,113,118,120,107,104,95,103,118,104,94,78,75,92,121,121,109,125,110,125,153,185,188,185,207,174,121,120,123,118,92,79,69,83,114,105,107,123,133,124,123,115,101,105,87,83,116,110,92,87,81,58,54,67,118,140,107,90,80,80,74,65,55,59,121,167,159,192,239,240,237,206,160,159,193,212,193,186,184,181,189,195,192,189,155,168,161,98,84,106,95,109,116,113,117,109,119,77,39,42,55,94,98,103,97,69,63,69,57,45,32,56,162,157,95,77,126,129,188,234,240,212,165,175,195,172,129,160,160,113,114,73,120,160,127,37,27,66,56,22,87,94,63,133,171,184,205,207,230,223,142,127,146,74,33,14,12,50,85,74,48,32,34,34,27,35,34,37,40,38,34,33,34,30,33,36,37,45,36,30,25,27,32,27,27,29,45,44,37,28,23,29,24,30,35,36,36,48,57,64,68,68,73,90,100,99,99,95,112,103,50,23,18,41,70,86,108,99,103,105,101,108,102,100,97,105,104,102,108,100,104,99,104,77,139,152,86,103,82,104,95,77,93,96,100,88,85,93,94,93,94,105,117,118,123,96,70,71,80,76,57,29,16,20,41,49,55,58,62,74,80,100,104,96,110,125,119,98,52,40,48,48,50,46,48,46,42,42,44,55,62,57,57,53,48,51,56,51,47,48,57,53,53,51,45,53,46,37,32,23,20,18,15,19,17,19,20,22,25,24,25,31,34,29,35,34,25,27,32,24,25,25,24,30,26,30,32,37,40,34,39,46,42,34,41,40,30,26,30,30,31,29,19,21,20,18,22,21,20,17,22,26,22,27,25,23,31,35,35,35,31,28,28,29,38,31,24,19,17,18,18,18,25,23,20,24,19,22,18,19,21,25,29,24,25,22,23,26,28,29,45,56,57,57,53,54,55,53,37,22,24,19,17,21,19,20,19,33,38,47,51,53,61,55,57,55,60,60,51,51,47,43,49,47,51,51,48,46,38,36,41,37,45,46,49,57,51,52,55,48,45,39,29,21,22,17,19,18,22,24,22,44,53,60,66,60,66,66,62,63,49,46,50,56,60,66,61,73,69,50,43,30,29,27,27,27,33,31,93,128,93,66,60,69,86,93,101,78,75,84,71,73,73,104,152,165,171,174,178,160,155,171,158,148,132,131,146,135,139,130,119,114,104,128,145,143,141,137,134,104,152,190,122,86,85,107,143,146,141,135,141,151,140,136,124,134,150,147,164,171,181,111,9,3,9,11,13,12,15,13,15,16,15,15,83,94,93,93,104,123,140,139,136,142,131,139,145,132,144,135,129,137,125,136,143,150,147,148,142,164,237,252,196,63,96,37,18,46,24,39,19,30,40,46,56,67,117,126,114,110,100,98,94,98,105,109,121,123,105,100,105,111,107,103,113,108,112,121,108,109,105,110,110,106,106,119,129,118,118,114,110,120,117,110,104,110,117,122,133,142,159,155,144,154,151,150,151,154,156,157,158,156,166,171,177,174,159,149,149,159,160,158,142,126,125,125,135,139,135,137,143,124,127,152,151,134,110,108,103,101,98,103,103,84,92,99,94,81,57,83,89,85,80,82,78,67,65,50,58,90,111,118,114,92,95,83,86,104,99,112,110,113,117,119,129,128,117,118,117,93,88,86,70,79,102,114,128,146,136,93,89,112,131,163,218,198,133,123,112,112,84,62,55,70,112,115,113,113,121,119,113,116,121,117,103,98,97,94,102,109,95,100,95,80,85,108,83,64,71,80,90,77,55,66,137,176,152,157,214,251,249,201,155,166,198,217,207,191,183,171,179,182,190,186,151,138,118,89,98,114,87,101,120,123,108,101,121,92,74,72,69,94,93,85,83,53,61,111,115,145,131,154,232,191,171,141,131,172,227,236,181,138,104,129,173,128,136,157,84,99,127,81,34,64,67,28,19,69,152,119,122,101,94,170,179,206,231,212,197,194,101,61,78,46,13,10,70,88,63,48,36,29,30,37,27,34,34,31,44,39,33,34,40,31,34,35,31,35,34,39,27,28,30,25,31,31,34,41,41,35,27,25,33,35,34,38,50,55,62,64,81,78,74,106,104,105,113,106,74,31,20,28,50,86,101,107,101,90,101,99,102,101,97,98,98,105,109,105,101,103,110,105,111,86,161,172,92,104,84,112,100,94,97,86,100,91,101,94,96,100,96,109,111,124,133,97,75,73,80,83,68,49,29,42,57,59,59,69,73,80,97,99,95,103,119,127,129,81,46,45,42,47,46,43,46,48,43,40,41,42,46,47,46,48,39,34,42,43,39,45,43,43,40,35,44,39,34,34,30,21,22,18,24,17,22,24,21,29,18,24,29,29,26,27,30,26,27,35,37,23,26,21,25,32,27,33,40,37,36,43,41,44,43,32,32,37,29,30,27,35,42,28,19,18,23,21,19,19,18,22,19,23,27,24,26,30,29,26,27,30,25,27,28,38,41,28,28,17,15,18,17,17,17,26,21,19,17,18,22,19,21,28,31,32,31,25,29,29,36,34,37,47,41,43,40,44,49,46,34,21,22,17,17,24,16,20,21,23,34,35,44,54,59,57,56,57,56,55,53,50,41,38,46,46,51,53,44,39,37,39,44,40,42,42,43,51,51,51,53,55,50,36,23,23,21,16,21,22,16,22,29,40,57,66,67,69,63,63,64,57,54,50,51,54,50,53,52,68,68,45,37,28,29,27,25,28,30,40,89,124,103,83,67,72,70,61,66,48,39,50,63,90,111,113,122,125,130,141,154,146,136,142,123,123,114,107,98,83,115,122,111,93,76,101,121,115,108,117,136,133,166,188,130,100,112,122,138,140,135,146,156,153,141,128,138,150,160,164,169,163,157,106,13,2,11,10,15,13,15,14,15,16,16,15,92,93,87,81,80,89,106,120,133,136,133,139,135,127,133,134,124,124,124,126,139,148,143,148,140,167,239,253,190,54,77,20,22,44,19,40,22,32,36,48,58,71,126,130,124,112,99,102,104,117,106,100,101,101,102,89,92,101,98,111,108,108,116,107,105,102,106,102,105,102,95,115,110,103,106,105,101,101,106,102,105,109,115,120,122,122,128,138,130,135,130,127,134,140,130,123,116,120,137,147,170,167,152,140,138,156,169,177,169,167,168,173,180,181,174,158,135,100,112,130,132,131,100,101,106,89,90,94,86,73,81,78,84,73,63,87,99,106,91,77,65,87,77,61,87,101,123,126,113,107,96,93,101,108,114,120,112,107,115,105,117,119,100,111,98,84,76,81,81,69,87,97,96,111,119,113,97,101,107,127,160,146,127,107,93,90,75,85,82,105,123,94,87,109,153,163,127,114,109,127,129,119,118,120,119,118,93,117,110,103,82,63,57,70,76,89,111,112,81,74,117,160,147,100,131,193,179,115,106,147,198,214,191,182,169,154,141,141,152,157,147,140,116,98,103,106,93,105,113,101,83,82,120,111,98,92,83,110,103,113,105,73,109,163,208,252,251,251,253,253,253,213,182,211,234,213,160,143,132,158,156,122,152,139,107,170,184,122,103,115,119,63,16,84,164,139,142,113,161,208,209,245,227,145,125,150,69,34,21,10,33,67,81,59,43,33,33,32,36,30,28,37,29,30,31,37,42,35,37,33,29,27,30,33,38,44,35,34,31,31,38,29,24,28,41,42,35,34,36,39,36,44,61,65,76,90,96,99,98,116,118,118,92,46,31,18,27,65,98,112,107,94,93,93,99,99,100,100,94,85,77,97,95,102,101,97,106,99,109,78,177,174,78,102,81,105,100,92,91,76,94,96,103,97,96,95,85,105,123,119,83,40,41,67,89,91,86,66,61,74,62,63,60,68,76,87,94,89,103,77,61,56,47,50,51,49,46,48,50,51,52,48,46,46,41,39,43,49,48,41,38,33,42,40,43,41,40,37,40,43,41,38,40,35,30,23,19,24,17,24,21,22,25,22,24,23,26,35,34,24,22,27,34,40,39,31,24,17,31,42,40,36,38,42,39,39,37,43,49,42,33,29,24,29,32,35,46,35,24,22,19,21,22,27,26,27,24,23,30,28,27,27,30,32,33,33,22,27,38,41,42,27,19,18,22,23,20,21,21,29,29,20,18,21,22,21,23,29,36,35,29,28,30,32,42,37,23,36,32,30,34,28,32,32,26,25,17,17,18,19,19,19,23,23,27,28,31,38,37,51,50,48,55,53,56,44,36,34,38,46,48,48,42,44,38,35,36,33,34,33,40,46,45,43,49,50,48,34,17,21,18,18,19,20,22,22,30,46,49,57,64,60,59,60,61,59,54,53,51,57,56,56,52,61,61,43,36,29,32,29,27,29,28,39,60,91,101,104,88,73,69,62,85,61,26,51,82,104,105,90,76,76,88,94,131,141,141,139,99,100,102,83,78,83,114,111,106,96,73,94,106,96,85,93,122,118,144,156,118,119,120,110,109,104,111,117,125,123,115,116,124,136,136,143,156,148,141,99,16,3,11,11,15,13,15,15,15,16,15,15,95,97,93,93,84,77,76,107,122,136,146,146,137,132,143,137,132,132,129,142,147,155,145,143,136,166,239,253,192,84,102,21,29,46,17,33,24,36,34,47,62,64,110,124,118,116,103,106,112,116,103,88,89,101,105,99,105,110,112,108,112,99,103,115,99,103,101,96,99,101,91,102,111,96,91,98,94,100,104,92,95,98,101,110,100,92,97,108,106,98,101,97,109,111,92,66,51,71,93,107,117,117,125,118,106,115,133,149,151,154,157,165,176,168,143,118,87,61,83,95,94,108,99,91,88,78,67,66,64,54,50,48,61,61,64,97,113,120,100,95,87,89,93,93,117,115,103,84,70,80,105,116,127,130,121,117,103,114,125,103,95,117,94,93,98,83,91,88,79,77,79,87,81,87,118,122,111,107,104,106,109,95,90,96,87,88,94,119,109,93,105,90,89,99,143,152,116,110,119,131,154,153,134,139,131,101,84,98,99,95,85,88,82,84,85,70,104,124,116,96,87,110,99,56,34,70,75,52,70,120,181,183,155,156,162,145,127,99,103,111,118,134,122,112,93,98,98,105,103,83,64,75,100,91,84,70,84,106,103,105,110,93,112,184,216,250,251,251,244,215,244,168,128,220,239,193,199,165,168,173,122,128,161,201,187,191,154,116,118,149,141,41,2,51,146,161,132,160,226,220,232,228,133,84,63,85,38,5,29,41,71,66,54,39,38,33,33,36,26,38,36,34,32,31,35,27,36,42,33,32,33,29,33,29,36,38,41,39,36,33,29,29,23,30,31,45,50,39,39,40,49,57,69,90,104,98,99,102,114,135,123,77,38,16,23,40,78,107,112,110,92,96,94,95,102,96,92,101,97,79,75,83,99,101,99,98,104,99,103,84,187,184,75,87,74,102,87,78,71,60,90,90,105,96,98,90,77,117,100,49,26,18,26,51,76,102,101,66,72,82,54,63,70,72,83,81,90,104,83,48,22,12,23,36,48,46,48,50,50,51,48,50,50,52,48,44,50,49,52,53,46,45,44,44,41,37,39,44,47,41,39,42,39,33,30,22,19,16,19,24,17,21,23,19,28,26,29,38,34,25,19,29,43,43,47,34,32,28,40,66,47,39,37,39,39,38,39,45,50,46,35,25,26,30,34,38,47,36,28,20,19,19,21,29,24,26,30,28,29,28,26,35,33,33,34,22,23,32,45,44,43,35,22,27,29,33,33,33,30,30,39,44,42,32,21,24,28,33,30,31,35,29,36,39,42,38,31,28,22,32,26,27,29,29,23,19,23,17,23,20,20,24,24,27,30,28,32,32,34,41,37,46,49,46,39,30,25,31,37,43,46,41,39,35,33,26,29,31,22,27,34,44,39,42,43,40,35,26,23,22,22,19,21,22,21,28,37,40,48,54,56,59,56,61,62,55,57,54,54,58,56,55,52,56,57,45,39,34,28,33,35,28,33,42,47,61,68,69,64,67,54,63,83,52,50,80,57,57,61,40,38,42,35,42,106,139,130,111,79,98,100,92,92,96,111,100,107,92,83,104,105,93,80,84,98,96,94,116,116,116,125,108,93,94,100,96,105,103,101,97,104,105,96,115,137,138,139,100,18,3,12,12,16,13,16,15,15,16,16,16,89,101,103,108,117,108,103,108,110,128,145,148,140,140,142,137,136,139,147,150,153,157,141,135,126,160,237,251,189,107,127,34,30,36,25,35,22,35,34,45,59,69,94,95,101,107,102,96,98,109,98,87,86,104,108,92,116,127,119,119,100,100,105,103,107,97,97,89,99,98,86,107,120,112,97,97,89,93,98,86,94,97,100,108,107,95,96,101,86,83,87,97,105,96,78,66,53,62,69,69,87,89,113,117,110,115,113,120,114,121,122,132,141,109,80,63,43,36,56,50,59,94,85,81,85,66,66,58,57,53,53,42,53,68,61,84,101,99,89,100,83,89,101,109,132,87,68,65,42,72,86,125,137,114,120,119,118,117,129,110,101,108,87,93,80,89,98,102,101,75,92,104,104,110,137,143,114,112,125,123,108,78,66,87,114,107,104,114,90,64,78,92,89,78,99,89,84,89,101,118,132,134,117,121,116,93,93,106,122,117,113,128,122,108,103,92,123,153,158,145,93,75,57,45,42,55,65,62,90,122,157,160,146,152,165,157,128,104,97,92,97,105,106,103,93,93,93,95,94,83,77,89,89,68,49,60,92,113,104,84,79,69,90,145,145,174,148,134,109,79,137,79,78,160,176,196,235,199,190,159,145,151,206,242,218,160,87,89,106,125,108,41,12,68,161,162,147,208,246,187,188,172,96,57,10,16,24,29,56,69,60,42,31,43,41,34,38,33,30,35,40,33,33,39,35,29,30,37,38,34,29,29,27,27,36,34,43,48,37,35,27,24,31,30,39,37,45,56,45,49,59,83,97,103,108,97,98,111,122,107,58,29,24,26,56,89,110,112,110,105,90,92,91,92,98,95,101,103,94,87,84,97,97,108,105,95,105,95,107,89,191,200,91,84,68,89,76,68,74,73,94,98,106,104,96,110,97,67,35,21,22,14,19,37,66,84,66,40,62,71,57,60,66,77,74,95,103,101,71,34,18,17,18,32,41,46,51,50,44,45,46,47,48,47,47,49,43,50,49,49,49,44,50,51,48,42,41,44,42,43,42,42,42,35,24,23,17,16,20,22,18,23,24,26,25,31,33,35,38,31,30,36,39,46,47,40,30,27,50,57,48,40,42,42,43,43,35,50,50,41,36,25,29,30,33,43,48,34,21,19,22,16,18,23,22,29,33,28,27,25,32,29,28,35,33,26,30,41,41,46,49,36,24,31,39,34,38,39,34,38,53,66,53,31,17,27,29,31,40,28,33,38,36,37,41,39,34,28,24,29,25,25,32,37,21,20,22,21,21,19,23,24,25,29,30,35,33,32,38,32,36,47,43,37,29,26,28,26,29,38,39,39,34,24,27,29,31,28,24,33,28,33,35,28,39,30,28,23,19,27,22,24,21,23,26,33,45,46,47,48,59,59,55,57,55,57,55,55,53,51,59,53,50,47,49,46,39,34,29,35,39,39,44,43,49,51,45,48,55,55,41,34,76,70,63,71,44,39,30,55,63,46,37,43,105,128,115,98,93,106,100,99,106,104,98,99,111,97,98,110,106,95,84,93,102,87,77,101,110,126,137,126,115,106,104,103,111,103,107,93,86,94,81,103,118,118,124,92,20,4,13,12,16,14,16,15,15,16,16,16,105,121,130,153,189,186,170,172,137,117,132,143,136,129,131,126,128,135,145,150,148,158,139,134,120,161,239,248,169,96,128,22,38,38,22,38,16,32,35,43,60,68,83,78,89,100,79,81,75,77,84,79,84,93,89,84,98,114,123,112,114,98,98,109,100,95,93,97,101,105,95,121,138,123,109,91,85,95,98,84,87,97,101,116,121,114,113,102,92,101,110,93,76,78,91,98,97,87,73,85,85,77,101,110,120,128,114,105,88,100,118,131,121,81,60,55,44,35,43,53,70,91,88,73,81,80,62,63,65,70,72,57,63,65,60,65,71,74,73,77,77,99,108,103,103,61,43,62,63,73,74,85,104,100,107,104,111,117,102,89,88,115,101,89,107,108,101,111,132,120,109,113,94,113,152,148,125,128,134,119,107,89,68,92,107,103,98,99,83,66,81,75,78,63,61,65,54,66,76,71,87,93,81,105,114,125,134,152,152,153,151,141,160,154,161,163,182,195,178,142,96,86,67,63,74,85,77,76,103,122,135,134,136,145,146,131,119,103,108,104,92,93,91,105,86,95,92,83,93,90,116,119,78,61,72,93,116,96,88,87,84,61,39,90,63,56,27,58,68,25,94,34,100,196,148,196,240,205,224,210,195,207,202,241,198,139,119,116,132,150,141,99,100,100,137,163,166,228,229,127,146,139,49,28,3,29,49,61,69,48,47,34,36,41,38,37,29,34,39,32,39,43,35,37,33,32,30,30,39,35,33,33,33,33,39,35,36,36,39,38,31,37,44,51,42,39,45,53,50,55,96,115,110,97,95,105,107,98,65,35,22,24,46,71,94,111,119,101,102,101,87,93,83,87,103,101,106,100,88,96,100,98,104,100,102,103,108,106,99,88,160,209,136,72,51,78,84,89,88,96,103,103,115,107,119,97,46,26,21,23,18,15,30,53,63,64,63,46,69,67,50,62,63,80,103,86,92,98,55,27,19,14,22,34,41,42,47,48,43,47,45,45,45,43,47,39,44,44,43,47,42,45,48,51,50,51,49,43,42,46,41,40,43,37,27,20,24,16,22,22,21,27,36,33,29,30,33,37,37,34,36,43,46,47,47,42,34,30,46,55,42,41,44,39,41,45,41,50,53,44,36,27,27,24,35,51,47,23,24,22,21,19,15,16,21,31,28,27,24,22,33,29,31,36,29,32,40,45,44,46,47,44,32,35,40,36,38,35,35,42,44,61,75,47,24,27,32,33,36,34,32,39,44,38,36,38,33,28,33,29,26,26,36,39,20,20,19,17,21,21,24,22,23,25,30,34,28,36,33,35,39,42,40,31,25,26,19,23,30,27,30,29,30,23,29,33,29,27,29,26,25,29,28,31,29,28,24,20,22,22,22,24,22,24,23,34,43,43,46,44,50,49,54,53,48,53,53,50,52,53,51,50,55,48,43,42,33,36,29,41,43,33,44,43,44,45,35,43,44,46,34,69,112,89,79,55,23,41,63,101,114,98,93,82,121,133,119,122,112,113,103,106,101,85,90,104,124,103,110,114,95,86,75,93,101,85,74,91,105,115,137,131,126,120,110,105,109,109,111,95,91,103,91,102,103,92,93,77,23,5,15,13,15,15,15,16,15,16,16,16,152,156,160,188,235,224,189,186,142,115,108,114,111,107,112,104,114,110,109,118,128,145,141,142,135,179,246,247,150,62,87,21,50,41,24,34,22,33,31,51,62,72,104,94,89,85,73,74,66,62,66,74,74,73,64,55,78,95,102,113,108,102,98,108,103,92,97,93,106,103,99,121,128,125,103,96,89,104,104,88,95,98,103,114,112,112,107,101,99,99,97,71,65,70,87,124,124,113,88,71,74,57,71,84,98,110,108,108,91,107,129,130,102,87,78,69,64,58,76,75,76,100,92,92,91,79,78,69,83,97,108,83,65,69,59,80,73,53,57,69,78,103,104,77,71,59,77,74,65,66,58,77,86,87,86,88,122,118,78,55,80,113,133,122,98,99,91,123,156,146,118,89,83,104,156,156,137,133,127,106,101,98,73,82,85,88,91,81,71,61,71,62,67,61,52,57,60,90,79,66,64,68,83,127,150,155,169,170,179,159,148,166,165,158,160,139,139,146,134,100,73,72,74,77,64,74,81,62,78,105,112,112,109,122,122,102,95,89,94,91,83,85,84,100,99,101,91,86,86,85,105,117,83,77,91,100,103,75,84,98,92,66,60,92,74,69,26,99,135,45,129,76,136,203,114,162,195,206,228,218,227,179,146,169,155,148,153,147,156,157,148,145,142,129,149,187,177,184,181,98,98,77,29,15,17,67,70,59,48,37,50,50,39,38,40,37,32,39,36,37,38,36,31,37,32,24,34,29,34,38,39,41,37,38,36,33,29,31,37,39,48,67,69,53,42,49,59,69,62,54,77,105,87,73,91,103,81,42,24,19,30,56,86,98,99,107,99,97,101,92,83,90,83,88,101,103,104,89,100,105,98,103,95,102,108,103,101,99,96,77,118,195,173,77,45,84,89,94,91,95,107,108,115,113,72,39,21,20,20,16,25,46,71,81,75,62,51,49,84,74,59,68,81,98,81,83,89,82,42,14,15,19,23,35,36,38,44,45,46,43,38,44,45,40,42,39,36,37,40,44,44,49,47,48,53,49,48,44,45,44,44,46,46,34,25,30,33,33,25,34,33,27,42,33,33,35,36,44,43,39,37,46,47,45,43,37,33,35,53,53,40,47,46,44,41,39,45,44,44,44,31,31,22,30,42,40,41,24,22,19,23,21,21,26,24,32,27,28,29,27,38,26,32,38,27,36,42,43,45,49,50,45,39,37,39,40,39,38,38,39,29,46,79,67,40,32,27,31,36,33,37,37,42,46,40,36,37,37,38,31,28,27,25,31,21,17,17,19,19,22,23,24,31,33,27,31,28,33,36,40,45,36,31,28,21,22,22,26,37,37,38,37,29,24,35,31,33,35,28,33,31,33,41,39,33,28,26,24,18,20,21,16,20,23,21,31,34,34,41,39,44,44,42,40,41,45,33,45,45,42,45,42,44,36,39,46,41,37,34,39,36,35,42,32,32,27,28,39,57,53,88,138,134,109,105,72,38,87,90,112,122,119,125,121,142,130,124,117,101,110,112,103,76,72,99,110,112,98,105,98,77,62,49,76,91,88,76,91,89,83,110,120,127,112,94,101,110,113,125,99,84,105,97,111,104,97,91,68,24,6,14,14,14,13,15,16,16,16,16,16,188,178,139,143,198,167,113,120,103,88,86,85,88,86,89,84,87,74,78,84,91,123,130,147,136,187,246,246,139,47,79,14,47,39,29,40,20,34,34,51,63,78,117,104,104,97,76,100,94,78,84,87,76,55,49,49,63,74,86,91,106,101,97,103,107,95,95,106,107,104,92,107,109,108,112,107,110,118,117,110,115,113,105,100,93,92,98,83,80,81,73,71,77,79,99,116,117,110,78,57,39,50,63,49,59,81,92,98,101,108,122,112,95,89,95,95,77,83,93,92,93,98,97,98,88,76,89,95,99,105,101,88,89,83,87,98,82,67,69,68,68,92,96,69,82,97,89,83,63,55,50,59,71,72,89,98,109,121,81,53,85,125,123,87,97,95,94,124,139,142,113,89,89,127,165,159,146,132,112,94,94,100,98,78,67,86,81,74,63,53,60,59,71,57,48,44,67,101,87,64,73,82,90,141,155,155,141,141,150,147,163,161,145,95,68,58,55,71,63,45,43,57,69,107,65,64,81,57,92,109,126,120,96,101,100,88,90,84,84,86,75,83,80,87,77,85,91,84,87,74,78,94,79,73,78,79,87,81,82,72,80,92,107,125,81,65,65,154,217,90,119,160,203,169,108,193,220,188,191,184,150,107,73,81,70,121,169,146,163,170,162,172,180,176,187,222,151,106,122,53,41,22,21,48,50,69,57,47,33,41,51,47,43,41,40,40,38,35,41,35,39,36,36,37,30,32,28,29,37,43,46,42,36,35,36,26,32,33,32,59,82,91,64,45,53,68,103,114,85,71,56,59,83,90,81,55,37,22,26,46,69,97,113,110,103,103,104,99,95,83,79,91,86,83,96,100,101,90,95,107,100,97,97,101,108,103,94,99,84,80,97,158,201,111,61,76,85,96,84,103,107,110,103,49,25,21,20,19,15,35,61,87,105,116,105,103,71,59,104,82,87,105,93,87,78,79,85,66,33,19,13,18,34,36,35,35,39,41,34,39,33,36,42,39,36,42,48,46,52,53,51,53,48,48,47,49,45,47,44,46,44,41,46,31,29,34,34,34,32,31,33,34,37,37,36,39,41,44,44,40,42,43,49,47,50,40,29,37,56,55,37,42,42,42,39,40,46,41,37,32,33,27,27,34,39,44,34,26,27,19,23,23,24,32,33,29,28,29,24,28,39,25,34,41,32,41,42,45,44,47,47,48,37,35,38,33,39,38,34,35,32,33,61,76,64,40,29,33,34,38,37,40,45,37,38,36,36,36,39,36,31,32,33,24,18,23,21,21,25,24,20,30,33,28,31,31,30,42,39,39,34,27,35,29,22,19,19,27,44,50,49,39,21,30,35,35,39,32,38,38,36,48,45,44,41,39,28,19,19,20,20,19,23,19,33,32,30,36,34,40,43,49,45,38,39,41,38,30,37,36,37,36,35,35,35,40,38,37,33,36,37,35,40,30,23,26,25,53,72,90,137,149,122,127,145,113,119,118,84,98,105,125,143,137,128,113,117,109,93,105,107,83,64,94,125,116,103,80,89,98,83,81,77,93,95,83,67,75,66,59,87,96,113,96,75,85,103,119,125,101,81,87,90,121,137,133,121,80,22,4,13,14,15,13,15,15,16,16,16,16,133,132,87,65,91,73,74,84,74,88,78,77,83,79,82,64,72,72,71,75,86,98,110,129,119,182,245,246,138,60,84,21,46,30,25,36,18,34,34,47,67,84,117,109,105,92,73,107,109,103,93,92,87,75,56,52,59,66,73,79,83,86,89,104,103,90,98,102,111,103,94,93,95,103,105,119,117,122,115,112,118,108,97,92,78,87,84,78,78,64,72,90,103,94,107,99,85,100,72,64,67,71,79,63,64,77,81,93,97,102,104,91,83,88,100,103,100,93,109,115,101,101,90,76,63,63,85,95,99,82,78,73,87,91,81,99,89,78,71,67,65,91,96,89,94,96,97,85,77,59,56,65,66,98,95,88,87,92,97,87,112,120,120,121,107,99,102,117,118,127,131,109,118,141,157,157,153,133,123,99,95,101,96,106,68,87,83,69,71,66,75,61,66,71,49,53,60,63,68,75,101,105,83,91,108,98,70,66,92,110,125,120,92,54,37,20,28,32,27,33,26,67,77,149,144,71,74,107,130,139,134,136,111,89,94,87,92,97,100,97,84,78,90,75,61,79,76,85,76,77,70,79,88,62,55,52,62,77,66,44,51,91,125,110,71,70,88,159,190,118,148,213,226,129,131,240,232,199,133,97,129,76,10,11,8,90,151,124,145,150,159,169,168,164,168,184,103,52,57,19,18,35,61,68,47,47,47,46,42,44,46,43,46,44,43,38,36,39,36,33,34,39,39,31,30,35,34,38,45,36,39,40,38,30,26,34,30,31,36,76,105,89,69,52,41,72,118,132,113,115,87,88,110,91,49,28,26,36,58,87,108,107,110,105,98,105,105,110,101,86,93,96,88,97,97,89,101,97,90,95,98,103,97,88,97,99,93,91,83,81,74,114,200,160,77,74,94,103,97,114,116,75,59,21,19,20,17,28,48,73,99,110,108,110,122,127,118,83,79,97,112,100,84,83,77,92,92,54,26,16,15,16,32,35,31,35,32,36,36,36,42,39,32,41,49,44,53,49,52,54,49,53,46,50,45,46,50,43,47,43,45,42,41,32,27,32,27,24,24,26,22,33,44,35,34,37,35,42,46,46,47,45,45,48,45,37,29,40,57,50,39,32,29,35,33,39,37,27,27,24,27,29,36,45,43,38,30,22,23,23,27,26,24,30,27,27,24,27,38,42,39,43,50,41,41,43,38,44,42,39,41,40,35,41,36,31,32,30,25,30,30,29,34,66,84,51,31,30,31,34,40,38,37,39,31,38,39,30,32,34,24,28,33,25,18,27,27,24,27,28,33,29,30,32,35,33,39,43,39,33,23,29,33,24,29,23,18,31,46,49,44,31,21,21,34,34,33,33,31,31,36,49,48,44,41,36,27,18,26,25,24,26,25,27,33,41,35,32,36,32,38,50,52,48,44,46,41,36,34,39,36,30,35,32,34,34,32,37,31,33,39,32,34,27,23,23,26,49,74,101,125,118,115,124,134,132,122,129,108,105,109,128,139,124,107,106,121,107,86,100,96,76,80,106,123,115,104,86,106,123,116,124,103,103,76,54,61,68,55,45,57,69,76,74,70,81,99,111,122,105,90,89,91,133,159,161,147,91,17,3,12,12,15,14,15,14,15,16,16,16,58,59,41,36,43,54,72,89,87,86,89,89,77,71,70,66,78,70,81,87,88,100,98,107,102,179,244,245,135,66,97,13,46,27,19,35,18,38,36,45,65,85,114,99,90,75,65,88,105,96,80,86,98,96,90,69,60,59,53,61,67,76,83,94,100,87,95,106,109,101,93,96,97,102,101,92,90,92,79,83,101,105,107,107,88,81,83,74,81,77,75,96,112,99,107,81,66,87,83,95,97,95,102,89,92,97,102,98,97,87,81,77,68,74,79,78,91,115,121,124,114,109,105,96,81,66,77,85,79,69,83,87,84,78,64,86,104,93,89,63,65,87,97,84,76,96,109,110,82,72,72,81,107,122,111,77,70,92,108,121,125,128,140,134,105,92,104,107,96,108,136,142,146,152,152,150,152,155,144,109,83,77,111,99,71,89,75,61,66,69,76,74,68,72,68,63,60,63,72,69,83,95,67,63,53,49,46,37,47,72,108,104,89,71,45,37,38,49,47,49,71,89,90,173,134,57,81,102,141,119,107,117,96,101,103,112,113,115,116,100,93,92,84,78,71,84,75,74,77,62,60,65,89,74,60,46,41,59,56,39,40,69,91,83,69,84,82,80,127,171,237,243,156,92,179,252,229,146,35,69,149,83,25,26,24,95,125,64,53,55,66,84,74,75,68,71,50,23,40,25,45,69,62,56,39,41,44,50,48,46,49,50,49,49,45,37,42,37,36,35,37,38,33,35,36,38,41,42,44,37,34,38,39,36,32,36,33,33,45,83,100,89,69,57,42,41,87,120,137,140,134,132,82,38,24,28,46,73,101,111,113,108,102,97,101,101,100,101,99,97,99,106,103,105,103,91,104,100,98,99,97,104,93,89,94,93,89,96,90,83,80,87,180,190,103,73,91,108,111,107,53,23,23,12,23,21,38,62,89,110,108,109,107,115,113,123,99,89,106,98,94,86,81,82,79,83,85,49,22,15,15,23,33,39,43,46,42,44,43,44,46,36,33,39,45,43,47,49,49,49,47,45,43,51,46,42,45,44,42,39,39,40,36,26,31,34,26,18,19,17,19,38,39,37,36,29,29,41,43,44,42,40,40,43,42,31,28,43,57,51,42,32,23,40,45,41,30,23,27,24,24,31,39,36,43,41,24,22,24,20,23,20,19,23,20,27,18,27,38,44,45,33,44,39,42,39,39,44,46,41,34,42,35,43,35,27,31,25,27,33,31,36,33,41,80,81,46,27,27,29,32,34,38,35,34,36,36,30,29,34,30,33,28,18,21,24,26,22,27,23,25,24,27,38,37,34,37,34,26,29,29,38,38,24,24,28,31,40,51,45,39,27,20,23,21,21,26,22,24,26,38,48,45,42,39,34,23,19,22,28,27,27,25,31,33,39,45,39,38,31,28,39,44,51,54,50,51,47,46,46,38,34,39,33,33,33,32,33,31,32,31,36,34,25,19,18,36,51,65,83,87,91,94,105,104,92,111,105,109,111,97,117,118,100,91,92,102,87,70,86,106,101,91,100,111,108,103,93,105,121,120,127,100,85,54,45,77,83,56,27,40,45,61,77,83,93,103,121,122,117,122,105,104,146,171,167,147,94,17,3,12,12,15,14,15,14,15,17,16,16,56,54,47,43,47,48,74,85,75,89,93,88,71,69,78,72,81,85,94,93,105,110,111,116,104,185,242,242,124,58,72,10,46,31,24,33,23,32,36,44,66,83,101,95,80,70,62,77,92,95,65,71,76,78,92,79,71,57,57,60,54,66,80,87,89,87,96,104,107,95,99,113,105,97,84,69,63,74,74,79,94,114,125,122,94,87,96,87,99,97,82,108,118,102,101,74,76,105,96,98,99,98,106,99,84,100,103,102,89,71,77,63,63,68,46,55,89,109,128,133,124,134,133,141,129,105,99,93,83,81,103,100,98,86,63,92,111,102,81,63,60,87,98,91,91,97,113,112,78,63,71,94,111,117,94,70,89,109,121,106,109,111,119,121,82,87,97,79,74,83,112,136,141,144,139,129,137,144,141,99,71,75,115,129,71,82,60,49,53,58,64,73,76,75,64,74,101,106,94,64,69,95,66,56,73,80,65,49,59,99,132,120,116,98,99,87,77,86,75,104,116,117,90,113,113,68,92,118,111,100,92,91,91,93,97,99,110,96,88,87,89,86,87,61,77,84,69,83,74,71,60,61,78,63,73,60,42,56,59,57,48,42,61,56,44,69,54,86,186,239,241,150,65,92,172,243,189,80,8,29,114,107,102,103,99,124,72,11,10,12,13,17,17,20,14,31,43,34,50,62,69,54,52,51,44,34,45,51,47,53,48,51,50,45,41,38,43,34,33,39,33,33,39,39,41,43,46,45,37,36,33,32,36,36,43,41,35,42,46,71,89,83,68,55,49,42,47,81,141,147,108,65,26,20,34,60,89,101,105,110,102,97,100,94,98,98,93,98,92,88,83,97,100,104,103,96,103,104,102,103,96,100,97,91,99,111,101,98,108,105,96,89,152,210,141,84,92,107,78,38,21,19,16,16,26,51,72,96,109,104,108,111,111,107,105,98,97,100,94,81,84,78,78,80,74,88,75,39,19,15,14,32,40,46,48,43,49,46,45,45,40,38,38,45,45,39,46,43,44,46,39,41,44,40,39,38,32,36,40,33,36,36,32,29,32,32,15,19,19,17,21,34,36,27,31,26,37,31,28,35,33,32,25,27,31,28,30,45,51,43,40,34,32,36,29,32,33,27,26,25,32,28,29,41,42,39,26,21,25,19,21,18,18,21,21,21,21,32,34,39,32,29,39,38,45,40,43,49,43,38,34,37,34,30,32,31,31,37,36,32,35,39,35,33,63,97,73,34,31,29,29,34,32,34,31,31,33,31,36,34,31,29,22,20,21,18,23,20,22,21,24,33,29,34,35,34,31,24,25,26,40,44,32,25,27,34,43,49,48,42,38,27,16,23,25,19,23,21,24,29,37,42,43,48,42,29,18,22,24,24,25,21,21,29,29,37,39,39,42,39,30,27,29,42,50,53,53,54,51,49,40,39,44,40,40,37,35,36,35,38,37,33,31,24,21,17,37,50,52,60,62,77,77,77,82,83,79,74,87,104,103,112,99,71,66,79,89,69,73,114,131,120,103,102,118,101,91,72,61,67,72,97,81,65,49,39,65,52,41,32,40,88,94,94,94,99,108,110,113,115,126,115,111,136,148,137,115,78,21,5,13,13,16,15,14,15,16,16,16,16,79,90,77,64,54,69,79,81,85,87,100,87,65,61,67,82,97,104,110,106,125,130,120,130,125,196,243,243,117,48,68,8,51,29,26,36,22,31,35,50,66,86,92,96,84,67,65,71,94,87,77,56,39,51,65,73,78,74,66,72,64,70,82,95,110,84,89,104,106,105,105,111,99,90,84,75,84,97,87,99,113,128,140,116,108,112,109,107,122,139,106,112,127,94,95,79,102,117,90,87,100,102,128,131,83,72,89,103,94,79,87,84,87,96,77,67,100,128,136,137,132,142,130,147,150,134,133,128,127,114,120,106,109,137,103,106,123,98,94,87,96,113,126,114,109,95,130,164,86,73,69,84,109,90,83,70,67,93,96,86,93,91,112,107,84,103,99,88,98,96,103,114,126,134,131,127,123,122,112,94,85,78,126,153,111,84,77,61,52,53,55,51,59,80,59,74,114,116,110,85,79,100,87,119,128,106,104,78,78,97,120,124,129,140,135,132,109,107,119,120,122,113,84,97,104,114,139,124,119,116,108,101,91,93,76,66,57,78,92,79,76,77,92,79,74,79,74,99,89,81,83,58,61,58,82,86,74,81,73,73,59,53,69,54,53,60,41,142,251,241,144,45,30,45,80,163,125,29,2,25,76,98,120,117,106,114,88,53,79,73,41,41,36,75,77,61,57,57,92,76,53,44,42,48,44,44,44,49,51,53,47,50,52,39,43,39,36,36,32,37,36,40,42,42,48,45,45,42,31,36,37,36,37,42,47,47,42,47,61,77,87,78,64,57,54,53,39,51,100,88,45,28,23,45,72,95,101,98,100,97,97,93,97,98,91,96,97,104,98,74,69,78,94,99,96,97,99,100,101,101,103,98,93,105,127,137,89,77,105,152,128,57,91,177,184,108,61,42,24,22,13,19,20,31,64,86,107,102,112,117,109,108,102,100,99,108,94,86,83,74,79,74,73,81,86,81,66,36,16,20,16,33,41,39,40,37,36,39,38,37,42,36,37,42,36,39,42,46,41,39,42,41,37,30,28,34,35,34,35,33,36,35,29,33,46,38,18,16,18,18,27,32,29,33,32,33,31,27,32,27,29,29,29,31,33,39,45,43,45,44,42,43,43,39,33,39,37,33,30,36,39,27,32,37,43,36,19,21,23,19,20,18,21,21,26,29,27,35,36,33,29,30,39,37,42,39,34,39,33,27,29,30,24,23,31,29,31,32,36,33,27,29,27,29,39,68,85,61,32,29,29,32,38,32,31,29,31,35,34,32,33,28,18,22,22,17,19,22,20,23,26,25,32,29,30,29,23,24,21,35,47,40,38,34,34,45,47,53,47,40,34,22,19,21,25,19,24,23,22,36,39,42,46,45,37,26,24,22,16,21,24,20,23,23,35,39,38,45,42,39,34,30,23,29,35,41,46,50,49,51,47,42,44,46,50,44,44,39,43,37,35,36,29,26,24,21,33,44,42,51,62,66,59,59,67,74,72,49,70,108,115,112,92,60,68,84,86,96,101,110,130,135,121,112,110,96,87,67,47,42,50,72,64,52,49,41,37,42,39,31,75,121,118,93,98,113,106,98,84,87,101,100,97,102,97,87,83,66,23,7,13,13,16,15,15,15,16,16,16,16,87,103,105,88,86,96,97,92,90,96,101,83,49,53,69,74,92,97,101,108,134,135,127,137,133,200,243,243,103,53,68,10,50,24,29,27,27,34,37,50,72,86,87,75,66,62,66,72,91,100,91,81,34,30,43,59,72,67,70,77,76,71,81,103,139,109,81,98,116,110,110,113,97,101,97,94,110,119,118,122,121,134,136,113,114,111,108,100,129,157,104,107,112,93,78,62,102,106,88,85,91,100,161,184,75,48,78,125,122,81,99,91,112,127,104,104,127,134,139,117,109,139,87,120,154,142,147,143,146,128,123,96,128,187,130,111,125,108,109,111,119,129,113,104,100,80,151,194,76,61,80,97,115,92,89,77,61,63,87,117,177,159,113,112,120,154,152,132,140,151,139,141,151,160,160,158,154,134,138,111,90,89,110,141,122,113,92,68,57,54,55,57,46,63,77,92,110,96,83,78,82,94,95,147,130,128,140,124,108,103,117,130,131,131,130,126,127,122,123,131,127,111,84,99,125,112,123,122,105,115,118,103,98,107,104,87,84,100,91,80,69,81,101,86,71,71,79,92,88,80,70,56,66,68,60,98,120,89,63,59,51,77,113,78,75,76,18,97,187,200,178,32,2,43,39,82,54,7,13,51,89,87,99,95,89,102,93,102,109,103,99,84,67,153,125,112,105,88,89,50,38,37,39,45,43,42,42,45,50,56,53,52,46,39,39,32,39,39,34,37,39,37,46,50,45,39,39,40,40,39,31,40,43,42,49,48,49,63,86,92,92,80,57,49,55,53,51,49,53,49,35,40,54,88,93,108,101,97,96,91,98,98,93,91,95,96,102,101,101,94,75,85,93,97,96,98,101,100,99,101,98,104,121,139,121,113,53,37,95,138,81,21,17,113,177,93,41,18,12,18,16,17,39,71,90,104,109,111,113,112,105,103,99,104,106,91,74,66,70,73,75,69,76,79,84,89,59,27,18,15,18,34,37,38,42,39,38,39,39,37,33,34,41,37,37,36,39,39,42,46,35,28,32,30,27,34,34,34,38,33,34,30,23,46,57,46,34,29,22,25,31,31,35,39,37,37,38,33,37,32,25,33,35,38,44,41,40,42,40,42,45,46,46,46,44,45,42,39,46,44,44,35,32,39,45,31,19,23,19,24,25,22,23,30,30,23,34,31,37,35,25,31,29,29,28,34,26,25,27,21,24,24,22,29,29,27,24,24,26,27,32,24,22,20,24,36,65,83,52,24,33,32,32,33,29,36,30,33,34,29,32,31,17,19,19,18,21,16,21,17,27,32,23,32,26,20,19,19,25,46,48,44,43,39,42,46,43,51,41,39,32,23,24,22,24,23,27,25,24,31,36,42,47,45,34,25,18,23,23,21,20,22,21,27,38,37,39,41,44,40,39,36,27,21,27,31,37,46,45,47,50,49,47,44,45,47,46,46,44,43,46,39,32,19,20,29,33,41,43,54,59,53,58,48,60,83,93,77,62,105,110,106,94,58,70,77,94,112,92,90,99,124,111,95,105,111,124,118,114,107,102,103,89,92,87,84,95,86,92,81,109,152,121,107,118,131,117,101,97,96,107,104,103,90,84,90,100,83,22,7,14,12,15,15,16,15,15,16,16,16,89,117,112,103,110,122,114,105,104,104,111,90,57,44,48,49,66,74,87,104,139,134,121,139,132,197,240,240,103,57,54,9,49,24,32,27,22,35,35,54,80,104,75,55,44,42,57,64,87,84,124,123,46,30,39,53,65,67,66,55,54,61,60,96,179,120,77,108,131,129,105,115,105,112,110,101,120,127,116,124,102,108,129,82,87,84,69,74,110,153,101,83,85,71,69,61,88,91,79,69,81,81,158,192,69,49,86,145,125,67,66,63,85,100,106,110,118,116,120,89,94,127,53,78,142,143,134,116,114,104,112,84,117,200,121,83,104,98,101,85,92,95,89,66,73,63,135,182,54,61,84,107,118,83,124,188,148,100,110,159,237,171,130,135,149,197,181,146,146,165,184,179,178,201,205,205,190,160,152,126,100,93,115,122,97,95,83,53,62,73,70,61,45,57,67,99,113,84,75,71,83,96,90,116,127,133,146,132,128,122,124,127,120,119,116,119,125,119,123,128,123,109,101,111,122,111,105,111,104,103,105,103,123,137,140,128,106,110,92,80,72,81,103,87,81,77,86,89,83,90,83,65,84,76,53,89,116,106,89,52,32,59,107,105,111,76,22,34,49,142,156,24,12,80,105,98,37,16,69,103,113,108,98,98,95,93,102,95,108,103,108,95,90,134,102,98,89,39,39,45,42,38,47,45,43,46,51,52,52,60,55,46,46,39,44,37,36,39,35,38,44,46,47,45,41,41,42,36,37,41,40,40,44,44,46,58,62,74,93,96,99,83,65,54,49,53,55,51,49,45,60,88,112,117,102,106,101,90,93,101,100,93,91,98,94,98,102,99,103,98,106,103,98,98,102,105,101,101,100,101,108,125,134,128,96,52,34,44,42,57,50,35,17,40,125,116,65,24,6,17,23,49,73,92,108,106,109,97,110,108,90,96,98,98,83,72,63,63,66,71,74,69,80,83,88,80,49,23,16,16,21,37,36,39,41,36,35,35,38,37,37,37,36,39,32,35,39,39,43,38,34,30,33,35,28,30,35,31,34,31,32,27,29,46,50,39,29,33,19,24,33,30,38,37,39,47,44,36,34,31,36,34,35,45,44,45,34,32,33,38,44,45,49,45,47,44,44,46,47,42,42,42,39,38,36,27,21,26,29,28,32,30,28,29,28,21,24,30,23,29,28,23,24,22,28,30,22,27,28,26,26,22,24,29,29,22,19,24,26,20,23,24,21,26,24,23,42,83,77,42,27,24,31,31,34,36,33,33,31,37,30,24,21,23,22,21,23,19,21,23,30,28,22,27,30,21,18,23,33,46,48,44,46,45,44,45,40,42,39,35,34,23,20,28,30,30,33,31,31,40,40,40,47,42,29,25,21,24,20,18,20,21,27,31,38,38,34,41,43,45,45,42,31,25,26,23,26,31,40,43,46,50,44,48,43,46,49,44,47,45,43,45,30,21,23,25,37,42,46,53,52,52,51,52,67,92,112,92,72,88,85,85,81,55,50,67,90,87,89,61,52,84,90,98,127,147,159,169,170,167,159,156,152,154,158,153,153,156,157,144,157,162,141,140,154,149,135,139,137,136,137,136,132,120,131,127,143,107,13,4,11,12,15,13,15,13,15,16,16,15,89,105,102,98,118,133,134,131,126,127,135,129,87,65,51,27,42,67,89,105,141,136,114,128,132,200,241,241,81,45,45,4,44,22,31,29,29,40,37,55,94,137,124,96,59,39,68,67,85,94,150,155,83,77,58,56,64,74,85,47,32,47,59,99,179,120,78,107,134,115,94,107,95,103,99,96,106,95,84,82,65,103,125,60,61,63,58,60,107,160,111,96,89,84,93,97,115,103,96,89,98,97,168,192,73,61,90,147,125,63,74,74,96,99,104,106,106,108,102,89,120,173,71,52,99,108,106,95,95,88,100,86,122,169,97,83,92,92,102,87,89,102,90,78,101,95,182,210,84,85,90,107,116,104,188,252,221,116,113,155,165,144,142,141,158,153,123,120,125,146,160,159,153,170,172,152,128,90,93,92,79,88,81,76,59,51,47,56,61,66,55,56,50,55,41,69,81,76,81,72,71,70,53,78,83,101,122,114,113,116,120,118,95,99,113,117,120,121,116,104,98,100,110,113,125,138,130,136,136,122,132,132,134,143,141,119,109,122,116,104,84,77,80,79,79,79,89,78,81,80,73,62,75,86,61,122,143,122,116,82,51,83,127,122,113,78,72,78,43,66,133,59,24,115,147,139,101,96,117,126,125,108,114,106,108,114,107,128,124,132,126,106,103,100,69,49,45,34,38,47,53,48,46,53,52,57,57,50,55,57,53,59,50,42,41,34,41,38,45,47,43,42,34,39,40,35,36,37,38,44,43,39,42,50,67,85,77,62,72,85,102,130,119,71,48,51,53,51,55,49,66,118,128,116,101,93,88,87,92,87,94,90,92,97,98,98,92,100,102,105,104,101,94,101,99,104,104,96,96,108,111,94,69,53,60,30,36,43,34,37,40,46,17,69,152,117,92,39,12,30,45,78,98,110,109,91,93,92,83,86,83,84,81,75,66,64,65,61,66,67,79,76,81,86,87,75,42,21,20,16,34,38,36,36,39,41,32,36,38,41,38,37,37,33,41,41,43,36,34,37,29,30,30,27,33,38,31,27,32,28,27,31,36,50,35,22,24,22,19,23,32,37,37,44,38,39,45,39,43,36,44,47,39,44,44,41,29,27,27,30,43,44,44,47,42,41,46,42,45,47,42,44,40,39,39,29,26,24,20,21,22,21,26,24,33,28,27,33,29,32,31,31,36,33,34,34,34,31,26,30,33,25,24,31,23,22,20,21,27,19,25,22,22,24,20,22,27,58,84,67,40,27,26,36,29,30,36,30,30,34,33,20,21,23,22,27,27,24,28,26,29,33,26,31,29,20,19,25,42,44,35,42,45,40,44,42,41,42,42,34,30,29,29,32,32,33,36,34,36,38,38,40,39,33,26,29,23,24,22,21,27,24,35,33,31,39,38,40,39,39,42,37,38,40,24,22,28,21,31,36,36,46,44,49,48,42,50,48,46,48,48,37,29,21,19,27,42,48,54,59,57,49,47,48,57,63,69,71,51,67,64,66,69,55,48,40,46,57,71,50,25,38,52,103,170,174,178,188,188,190,197,208,199,192,192,190,194,190,191,176,177,182,171,180,170,162,160,168,172,162,169,160,157,158,163,158,163,110,12,2,10,11,15,12,15,15,15,16,16,15,80,91,90,92,109,140,148,150,139,130,153,163,155,139,115,49,42,80,93,108,147,131,105,114,120,199,241,241,73,54,46,8,48,17,36,29,23,33,37,54,97,169,199,209,150,90,101,102,141,183,233,231,167,134,106,83,74,91,122,104,68,72,81,128,207,135,89,100,115,110,76,96,94,102,107,100,103,91,80,84,83,137,171,93,69,74,90,84,116,175,149,147,152,151,159,162,162,157,165,158,174,161,214,216,111,97,102,152,145,134,154,151,162,158,159,156,159,152,152,155,188,229,136,70,56,75,85,83,92,73,87,75,112,160,135,147,153,156,158,154,161,158,157,148,180,181,232,249,154,134,106,125,127,100,173,240,172,107,114,120,123,93,105,95,101,102,60,61,69,79,71,53,46,48,44,31,17,10,12,12,14,14,13,14,13,13,14,15,14,14,14,14,15,15,14,13,14,14,17,20,21,31,20,14,17,40,77,88,103,101,102,98,94,110,121,125,121,118,105,105,107,100,126,117,125,159,155,153,142,141,145,134,127,131,135,120,128,140,131,118,93,81,72,66,67,82,89,77,65,48,40,42,75,76,59,139,151,111,104,91,110,133,151,117,97,98,112,122,81,87,155,103,44,111,134,118,112,117,126,109,116,116,108,115,114,110,120,121,128,134,125,114,116,110,62,65,92,79,68,56,62,57,48,53,54,51,55,57,53,57,53,61,51,41,44,45,53,50,49,41,37,45,39,38,32,31,37,41,46,41,46,47,69,105,109,111,97,84,63,87,139,157,145,67,41,45,48,59,56,50,54,87,121,117,94,84,79,78,84,84,88,87,84,98,91,93,92,96,104,98,93,92,99,100,107,103,96,100,97,92,59,32,31,32,37,34,31,36,32,39,41,39,33,152,216,132,104,51,26,54,74,106,111,98,89,79,73,60,61,71,80,72,66,67,66,66,63,72,73,75,90,81,87,92,77,52,19,21,19,21,31,38,42,34,41,40,46,46,38,37,40,38,33,39,41,41,36,35,32,28,31,30,31,27,31,35,29,32,33,28,29,29,46,47,24,22,22,16,22,26,31,37,39,40,41,41,41,44,45,43,45,40,39,43,46,43,34,37,32,32,34,38,47,44,43,41,45,47,39,43,42,39,42,41,38,24,22,20,21,26,17,21,28,28,35,31,37,38,37,39,33,36,41,39,39,36,38,41,33,32,31,28,35,32,32,28,21,24,21,22,27,23,23,20,24,23,22,36,65,84,61,30,26,33,34,28,33,35,35,35,30,21,20,26,24,19,20,24,26,26,29,28,33,27,19,21,21,30,39,41,40,42,39,39,42,43,39,41,43,32,33,29,24,26,26,30,32,28,36,41,38,38,38,32,23,24,23,22,23,29,24,29,31,31,34,36,38,40,41,41,41,37,34,33,30,24,21,26,23,27,29,34,40,44,51,50,50,51,48,50,56,42,23,22,21,31,50,60,62,67,55,51,51,44,51,48,45,46,44,47,50,58,57,57,53,53,63,65,96,79,39,22,23,120,196,194,205,205,206,217,229,238,220,206,213,212,214,217,212,199,200,202,199,198,190,187,186,199,199,194,203,198,188,180,188,184,193,115,7,2,8,11,14,13,15,13,15,15,15,15,82,90,83,97,102,130,139,138,130,130,160,194,241,250,223,112,79,105,100,102,141,130,105,110,116,193,244,243,87,110,81,12,51,22,32,29,25,36,34,47,79,155,218,250,217,125,140,136,213,247,252,252,185,167,147,139,137,142,178,157,138,152,155,188,246,184,120,110,133,132,117,134,135,147,149,145,152,150,141,155,160,206,235,154,127,162,205,162,157,179,176,194,203,196,195,194,193,203,207,212,213,188,199,186,152,148,136,170,185,195,209,203,206,203,204,201,196,197,204,194,191,194,164,131,74,66,71,74,68,58,71,86,136,179,186,206,202,206,208,199,199,206,193,197,214,195,200,185,174,168,106,120,107,65,92,129,136,110,93,93,64,21,11,7,22,28,13,23,31,38,44,45,49,133,79,60,33,16,15,15,14,14,13,14,14,13,15,14,14,14,13,13,14,14,13,14,13,13,14,14,15,18,20,18,15,16,38,67,81,85,92,118,123,129,133,125,125,107,89,103,108,117,146,137,132,154,150,137,124,117,117,111,112,105,121,134,129,126,118,113,107,115,101,78,79,95,94,75,71,60,42,48,78,79,47,89,111,110,107,88,109,131,136,111,89,89,105,89,76,68,154,153,51,88,96,66,64,72,98,98,92,92,102,99,92,92,87,93,80,92,95,90,87,84,59,66,113,120,116,101,105,86,57,47,43,50,49,53,46,44,51,55,47,47,50,48,51,50,41,39,44,41,35,33,34,38,37,41,45,47,57,80,137,150,114,99,96,119,108,117,138,107,60,27,35,47,48,54,57,55,44,57,81,97,91,81,75,72,84,83,89,83,88,90,91,93,88,97,95,96,93,95,106,107,101,97,100,96,88,77,64,41,31,32,32,35,33,31,35,35,35,37,34,162,183,118,136,51,42,83,101,114,78,67,73,64,62,63,64,73,71,63,62,67,70,76,71,70,90,104,96,102,95,65,38,23,17,15,17,22,36,41,45,35,36,41,45,48,41,38,37,39,36,36,40,34,32,31,32,29,33,29,27,31,28,35,32,30,28,32,31,27,49,36,17,25,17,18,19,25,36,36,43,39,41,43,42,40,44,41,41,46,40,42,40,37,39,41,39,36,35,44,46,43,43,47,46,38,45,43,39,44,41,38,39,24,20,24,14,21,21,21,23,31,41,36,36,39,37,36,37,37,41,35,40,37,35,38,38,36,36,34,33,31,30,25,23,28,22,19,27,24,22,23,24,22,21,27,41,73,81,54,26,24,33,33,33,33,39,35,27,19,19,22,19,21,22,21,24,24,29,33,24,26,19,29,31,33,44,36,39,38,44,41,41,42,37,41,39,39,41,31,29,30,24,29,29,28,34,42,40,39,37,29,23,24,23,24,23,18,22,25,31,33,36,41,40,37,39,43,34,40,41,39,38,25,29,23,24,27,25,25,34,34,46,50,49,50,50,50,53,44,24,19,20,38,65,76,73,57,53,52,55,54,48,40,35,44,39,46,45,42,45,57,76,95,108,113,147,105,63,36,53,176,213,208,217,206,216,226,228,225,208,207,214,208,210,218,217,205,202,202,204,206,203,203,204,216,220,215,226,217,208,205,207,213,216,113,5,2,8,11,12,12,15,13,15,15,15,15,73,89,78,90,94,117,125,127,124,130,189,237,249,249,250,160,116,128,117,124,146,151,159,160,141,198,247,242,104,169,126,31,53,14,32,27,25,32,35,53,69,105,155,238,209,141,159,152,229,252,247,221,173,177,179,194,192,194,201,195,194,201,191,201,228,188,156,147,160,184,183,190,190,192,200,196,196,194,195,200,193,210,212,183,201,243,252,235,199,208,197,203,202,194,206,225,218,198,208,195,190,128,92,97,129,183,175,189,192,199,209,200,201,195,196,196,191,191,188,170,105,81,130,172,124,78,59,59,63,54,85,124,168,194,203,206,205,207,196,189,193,189,200,196,199,148,87,85,137,176,113,94,83,65,83,122,137,102,95,81,36,15,24,24,57,84,110,155,179,213,242,249,253,253,249,237,227,227,224,224,217,214,212,210,210,204,200,205,211,202,196,199,194,191,180,173,170,166,166,156,160,165,174,171,128,112,97,67,80,86,102,123,127,109,107,122,113,105,88,81,87,95,139,141,134,144,121,119,111,96,103,95,102,103,121,131,111,108,126,133,125,131,122,104,90,96,84,68,71,71,73,77,93,74,47,63,83,100,88,64,86,117,121,89,73,73,63,57,53,50,126,163,61,45,69,34,22,32,59,65,69,81,84,81,73,73,76,65,56,55,56,58,53,56,42,36,43,70,102,98,113,116,104,92,76,84,96,87,70,64,56,50,42,50,49,41,43,38,46,39,39,41,34,39,39,44,43,42,50,54,72,83,141,145,96,103,105,137,135,108,60,20,24,27,66,64,54,55,51,60,54,46,46,55,65,67,71,83,91,90,95,92,93,95,93,99,97,97,99,97,90,101,106,95,106,95,83,82,79,95,97,80,47,30,31,35,34,32,32,32,31,42,29,78,91,76,136,65,33,62,60,66,51,57,60,57,72,73,74,75,66,50,61,71,71,76,71,81,93,92,95,84,53,30,19,21,14,17,23,31,42,39,39,38,39,37,38,39,45,38,36,42,39,39,35,33,36,29,26,33,35,30,34,34,31,31,31,32,29,33,31,36,32,20,21,24,19,16,18,24,35,36,38,39,39,39,39,38,44,42,39,42,41,39,39,39,42,45,40,43,42,42,45,42,45,44,43,43,44,41,36,41,44,38,34,24,17,19,22,17,16,18,26,37,39,37,42,41,37,38,36,39,39,39,37,38,37,36,38,37,32,29,29,29,25,28,24,29,22,28,33,22,27,27,22,25,26,29,29,46,78,69,40,22,29,33,33,35,33,33,21,15,22,19,19,24,19,21,20,24,35,27,24,25,22,21,27,37,39,39,36,39,39,39,48,44,40,40,39,45,38,35,36,29,33,28,29,35,39,43,36,38,37,28,25,20,20,22,20,21,22,23,29,37,34,38,36,39,42,37,41,39,43,40,38,33,30,25,22,25,19,25,33,29,33,39,44,42,44,53,50,40,21,25,24,44,90,97,95,85,87,99,99,83,79,78,84,89,95,79,37,32,36,50,62,83,98,103,128,105,90,66,136,229,213,211,213,208,221,221,201,198,199,208,214,193,193,208,204,199,192,198,211,206,207,206,205,218,218,218,225,219,208,205,219,225,222,113,5,2,8,12,13,13,14,13,15,16,15,15,66,79,75,106,104,121,127,129,131,137,204,240,249,249,250,139,105,148,152,160,174,201,245,245,196,214,250,226,105,193,126,22,45,14,35,26,24,36,29,52,72,116,132,147,148,130,160,144,179,217,183,156,154,181,180,190,199,198,200,194,194,192,168,118,106,138,178,185,194,202,201,208,198,197,194,191,192,193,192,188,164,85,103,144,223,251,252,234,189,201,177,182,185,188,226,250,209,190,187,174,129,41,24,22,80,169,200,213,192,191,188,187,188,182,187,178,184,176,165,110,40,36,73,158,168,108,58,60,74,92,128,157,189,197,197,196,190,191,184,179,177,192,194,184,161,77,50,50,92,165,148,117,89,89,100,132,148,118,112,169,231,243,252,252,252,252,252,252,253,253,252,252,252,252,252,252,252,252,252,252,253,253,252,252,253,253,253,253,252,252,253,253,253,253,253,253,253,253,252,252,253,253,253,253,252,252,229,168,112,98,101,117,110,89,96,112,110,106,109,92,64,78,111,125,131,127,110,109,119,117,98,119,134,110,127,131,112,131,143,134,110,110,123,105,92,81,59,59,57,79,90,81,77,54,46,69,76,71,68,48,75,103,105,94,76,77,86,67,58,44,96,168,78,23,33,26,41,19,32,56,66,81,63,62,61,49,62,71,71,55,47,50,40,53,44,35,36,37,62,77,89,110,134,130,113,131,158,159,148,134,78,30,34,45,68,47,48,42,45,43,40,41,44,42,41,43,50,64,81,95,74,48,119,137,108,128,120,112,65,28,20,20,36,70,117,105,73,53,50,55,48,39,29,38,58,78,91,93,93,92,97,98,101,104,106,96,96,104,98,101,96,103,103,105,92,75,70,79,87,100,113,113,91,51,34,34,32,35,31,34,33,33,33,53,45,39,85,54,31,29,32,52,51,56,53,70,80,81,77,71,71,68,67,72,76,74,74,84,83,77,62,42,23,17,21,15,19,24,43,49,41,43,37,41,42,35,34,36,35,37,43,38,39,34,34,33,28,29,29,32,30,35,34,31,33,33,32,26,34,33,29,37,25,20,24,17,21,21,19,29,35,36,34,38,37,38,36,36,47,39,41,43,43,42,41,41,37,42,41,39,43,43,42,39,43,48,39,43,42,41,47,43,39,35,27,21,22,17,22,21,15,26,30,35,38,39,39,37,37,34,41,38,35,39,36,38,35,40,35,32,34,23,23,29,30,29,33,34,40,36,33,33,37,35,30,37,33,29,29,28,57,81,66,38,24,25,29,30,35,27,18,21,19,17,17,19,21,19,23,28,24,26,25,22,18,18,37,41,39,38,39,35,38,36,39,36,41,40,35,41,38,40,38,30,28,31,35,38,37,39,40,36,33,25,19,20,18,21,21,19,20,21,30,34,27,33,34,28,32,31,36,39,34,35,33,33,33,29,25,22,24,23,22,27,29,34,37,42,43,47,45,40,26,61,153,171,147,129,118,122,141,146,139,141,137,143,157,156,160,148,107,86,58,51,55,68,74,80,113,113,98,113,197,230,207,211,205,208,218,200,185,191,200,211,203,183,186,199,200,199,196,202,204,202,207,204,200,210,208,205,216,201,193,202,214,214,208,114,5,0,8,11,13,12,13,12,15,15,14,15,62,82,72,118,123,124,141,137,148,151,195,239,245,245,236,114,110,145,161,182,201,243,252,252,247,240,252,220,108,165,74,13,39,14,32,23,26,33,35,53,70,112,105,77,66,82,134,128,117,137,141,146,177,179,171,181,184,190,182,179,171,166,100,22,11,59,155,199,209,199,180,178,177,169,164,164,167,178,171,160,73,1,10,55,178,246,248,214,166,152,142,169,174,185,210,212,186,181,179,139,53,9,17,20,36,104,182,217,201,184,181,184,182,181,180,183,185,174,124,68,40,45,61,103,167,155,89,64,81,122,159,182,197,188,189,187,183,179,171,170,180,190,179,155,86,32,34,37,46,95,139,111,63,35,41,83,105,96,173,250,252,252,253,253,252,252,253,253,252,252,253,253,253,253,252,252,253,253,252,252,252,252,253,253,252,252,252,252,252,252,252,252,253,253,253,253,252,252,252,252,252,252,252,252,252,252,251,251,210,130,96,105,109,90,108,137,115,118,120,109,100,103,117,121,129,126,114,115,118,117,125,139,137,97,117,137,129,151,114,79,68,84,117,105,98,79,42,59,81,80,62,53,56,51,42,62,74,74,71,53,44,57,95,95,82,93,92,79,72,74,73,152,99,10,45,59,67,55,33,42,83,86,103,102,48,33,57,93,88,73,71,51,51,67,42,46,39,35,57,78,94,101,132,137,119,121,139,143,121,87,54,36,38,75,95,92,63,45,53,42,46,49,48,48,44,61,90,110,110,111,108,63,111,134,125,130,64,35,20,15,34,71,105,115,129,130,122,59,53,73,34,16,21,60,90,95,100,99,97,97,105,104,103,100,96,97,98,106,96,96,98,106,103,83,77,79,83,86,92,98,112,124,125,98,50,31,30,36,36,34,35,31,27,54,58,41,99,68,27,21,19,45,53,62,67,75,83,78,69,73,83,73,76,75,71,82,81,91,76,49,34,17,17,17,19,23,29,48,59,53,39,38,37,42,40,36,40,39,36,41,39,36,32,35,37,32,35,36,33,33,35,34,37,33,30,32,31,34,36,32,33,43,37,28,27,27,24,23,22,22,29,26,28,32,29,38,41,43,42,41,41,43,40,42,44,43,37,42,47,44,45,41,44,42,40,44,42,43,42,42,42,41,42,34,22,25,19,18,21,21,22,25,30,30,29,34,38,35,36,37,38,39,36,41,39,36,43,37,33,25,20,21,26,29,33,29,24,38,36,30,26,27,31,27,37,32,33,31,26,34,39,62,81,64,34,24,30,29,29,22,17,18,18,23,18,19,19,22,26,27,22,21,22,18,24,29,32,32,31,31,37,39,37,36,34,36,35,35,39,38,36,37,36,32,34,34,33,41,34,39,39,36,28,21,24,16,16,19,16,25,19,29,36,37,31,27,29,18,23,24,23,27,24,26,24,33,33,27,27,27,24,23,27,25,26,29,32,27,42,45,50,27,51,212,250,250,223,171,154,141,150,149,152,160,163,170,170,171,178,169,160,143,107,89,77,81,84,100,123,116,111,158,219,216,203,206,197,194,193,183,177,193,199,201,194,182,190,201,201,204,196,189,197,197,203,196,195,206,198,198,200,190,182,196,210,200,200,114,6,1,8,11,13,12,13,12,15,14,13,13,83,96,92,134,123,124,139,142,149,147,196,237,248,248,235,130,122,144,144,169,200,250,252,252,252,252,252,221,134,175,68,17,40,8,35,27,29,36,36,57,76,108,73,48,59,71,115,125,141,163,166,177,179,185,177,168,178,176,179,164,163,123,49,16,17,33,90,170,206,193,164,155,157,155,157,150,165,168,165,119,51,34,37,31,77,170,215,195,164,149,160,184,171,180,176,169,174,173,163,82,27,34,32,33,29,43,130,200,205,194,179,176,185,181,184,180,175,146,79,61,76,89,84,68,125,176,136,94,102,146,171,195,189,173,179,172,178,177,172,173,184,175,159,102,33,21,34,31,24,22,62,99,74,58,26,23,33,37,54,120,235,248,252,237,223,209,205,188,179,156,138,112,135,231,252,252,251,251,252,252,252,252,252,252,252,252,251,251,252,252,252,252,252,252,252,252,252,252,253,253,252,252,253,253,253,253,252,252,252,173,92,74,87,93,104,130,119,116,104,111,127,140,140,127,132,132,120,125,123,108,105,118,105,70,114,137,137,127,84,59,73,94,100,100,107,102,75,80,84,87,72,56,62,75,60,50,60,69,77,52,45,27,55,87,94,106,98,91,91,105,84,150,124,30,85,140,144,115,83,76,108,157,169,122,46,24,71,93,88,77,84,87,83,105,91,82,68,65,78,92,110,109,111,115,98,73,64,56,54,63,83,89,93,101,88,91,78,89,88,66,63,53,53,50,53,81,119,129,113,110,119,104,131,131,84,51,7,17,21,55,95,117,125,112,113,122,132,107,117,86,28,9,34,99,114,105,105,101,101,106,105,108,99,92,102,104,107,104,89,97,110,97,87,81,77,81,93,98,94,106,110,119,114,107,50,23,30,32,33,39,33,36,28,63,68,76,196,114,17,19,14,47,59,76,78,78,77,69,71,70,78,82,79,73,82,89,88,71,42,23,16,17,19,18,25,40,57,65,59,48,38,37,41,42,42,41,39,39,42,33,36,37,33,36,33,37,36,33,31,36,37,40,39,33,36,33,33,37,42,37,39,57,51,52,49,37,39,38,30,27,30,19,29,34,27,33,37,41,38,41,40,36,41,41,41,37,43,45,42,41,39,45,43,36,42,44,44,42,43,39,39,42,42,29,19,25,26,29,32,29,28,30,27,35,30,32,41,33,33,38,39,41,38,35,39,35,37,37,29,17,17,34,32,21,28,24,27,28,27,29,22,25,21,26,29,27,25,32,31,29,36,41,73,78,54,30,24,29,29,24,18,19,20,23,18,18,21,27,25,27,21,21,25,18,23,24,30,30,31,32,36,34,34,36,34,40,31,38,39,33,37,34,37,36,34,40,36,35,36,32,38,31,27,26,22,16,20,21,21,23,20,35,30,33,32,24,27,27,25,21,24,23,24,24,25,31,35,37,29,27,26,25,31,26,25,27,24,31,37,35,45,18,79,211,247,238,229,217,196,167,147,129,128,141,143,149,154,158,163,162,170,171,152,141,122,103,113,130,116,96,122,201,230,215,217,215,201,187,181,183,192,200,198,203,204,199,202,208,210,209,194,197,201,199,198,188,196,203,194,195,198,190,191,199,203,198,205,113,5,1,7,11,13,12,12,12,14,15,14,14,98,130,113,144,126,115,128,113,125,122,189,244,252,252,238,200,149,125,118,124,161,167,199,244,251,251,252,217,144,186,76,21,37,16,38,24,28,39,39,63,98,153,130,91,101,104,134,150,170,190,186,188,183,184,178,169,167,175,172,165,132,70,59,67,71,60,47,108,184,199,180,165,163,174,171,170,166,160,122,86,112,130,131,79,41,109,200,189,170,172,205,198,160,165,163,165,168,155,89,43,56,89,105,101,60,34,75,152,200,205,188,180,191,189,189,177,150,90,62,93,100,111,114,74,88,152,174,141,139,165,186,194,181,175,175,176,179,183,181,186,186,171,129,102,107,105,113,104,96,120,160,230,251,251,209,144,70,25,16,10,90,141,145,146,177,237,252,239,229,189,135,115,199,253,252,252,251,251,252,252,252,252,252,252,252,252,248,249,252,252,251,251,252,252,251,251,253,253,251,251,249,249,212,137,186,246,245,245,252,169,79,60,60,83,96,127,126,120,100,101,125,131,136,125,108,115,126,124,128,111,81,95,101,74,118,142,120,127,97,89,95,97,96,97,117,113,99,101,123,121,104,100,93,83,66,66,54,63,77,49,42,27,46,80,96,94,89,105,105,131,110,150,153,53,101,144,146,133,89,114,174,194,169,111,46,33,61,79,68,73,92,79,95,147,146,133,122,111,94,74,85,96,89,66,63,70,66,71,97,131,150,139,129,109,75,71,83,95,107,103,98,88,89,97,73,73,98,110,120,119,134,131,145,109,28,12,12,53,84,110,126,119,122,118,119,120,123,105,131,82,20,12,47,120,116,111,110,104,106,105,105,96,96,100,108,108,101,94,99,106,97,78,67,78,84,90,93,98,103,109,113,113,85,38,20,26,24,29,35,35,32,37,29,55,69,92,212,146,39,22,27,66,75,81,75,80,73,68,74,72,76,74,84,85,89,89,57,33,19,18,21,17,19,32,51,63,70,69,61,48,38,36,34,39,44,36,40,40,36,36,29,35,34,31,34,35,33,33,34,36,39,42,38,37,38,33,39,36,40,40,42,69,70,70,66,64,66,54,49,28,23,26,22,27,23,30,33,39,41,38,39,37,41,38,39,43,38,41,40,37,46,41,38,40,42,46,43,44,39,39,41,40,39,27,26,37,39,38,36,40,38,32,34,34,31,36,34,36,37,35,37,37,37,37,36,31,37,36,25,17,27,36,24,21,23,26,27,25,24,19,21,19,23,23,18,25,24,24,31,27,35,33,44,81,75,51,25,21,29,19,21,30,28,29,31,25,30,29,23,21,17,23,17,22,22,27,29,29,33,30,35,35,33,30,33,35,29,32,34,36,34,30,35,32,35,38,33,34,33,34,29,31,29,17,22,22,22,19,23,26,27,40,32,33,29,24,31,27,28,29,30,29,30,34,24,33,37,36,32,27,24,27,27,25,29,23,29,29,34,26,44,124,196,229,194,185,191,198,208,203,200,162,132,121,114,123,128,137,143,144,157,165,165,165,150,136,147,146,102,95,166,231,235,234,242,236,225,213,216,221,229,230,222,232,235,230,227,232,229,224,221,220,220,213,213,211,214,210,206,208,205,201,204,208,211,210,211,113,4,0,7,10,12,12,14,12,14,14,14,14,92,120,117,150,125,108,107,103,88,53,111,215,249,249,242,215,104,89,79,87,83,50,56,107,139,192,245,187,144,175,61,41,34,19,36,21,34,34,39,69,123,199,184,150,137,136,162,177,193,197,188,184,183,183,180,166,168,166,167,141,74,52,101,130,120,88,47,56,118,191,207,188,183,181,193,178,163,124,49,86,155,187,175,124,86,117,198,169,161,188,201,186,137,141,149,144,153,86,24,24,91,171,162,168,108,37,57,96,171,206,189,176,182,189,180,151,100,52,61,108,90,103,130,77,66,96,146,138,128,171,192,194,181,177,186,185,190,193,196,186,173,116,92,149,152,162,185,225,252,252,252,252,252,252,253,253,243,215,160,122,170,210,241,252,252,252,252,252,252,252,252,252,253,253,251,251,252,252,252,252,250,249,252,251,252,252,250,252,252,252,252,252,252,252,253,253,251,251,252,252,229,121,48,38,86,168,141,158,173,110,88,70,103,117,116,137,141,138,104,108,113,93,98,99,86,77,84,98,112,97,92,125,116,114,148,123,114,118,120,112,102,95,89,99,110,99,110,122,115,114,108,94,79,76,66,109,97,58,71,61,58,50,83,92,87,89,97,103,106,129,122,156,177,66,63,112,99,92,102,174,194,153,147,110,48,28,54,75,69,72,73,78,113,158,151,154,150,126,119,80,80,98,99,87,93,120,122,122,136,165,172,137,118,113,89,93,89,92,116,112,98,107,124,113,91,63,55,92,129,145,133,103,76,59,75,92,103,120,130,146,138,139,139,130,127,121,125,113,117,84,53,29,74,122,109,113,111,108,110,99,101,99,101,105,105,99,86,97,104,89,95,84,80,84,87,95,99,101,87,113,93,47,45,16,16,16,22,37,38,29,37,34,34,39,52,66,143,167,97,42,59,96,86,79,71,76,76,73,74,75,71,73,88,83,67,43,27,17,16,18,17,26,42,58,66,64,69,70,61,48,28,34,39,34,41,39,36,33,32,34,37,37,31,35,35,34,33,31,35,37,36,42,35,36,40,38,40,36,36,41,45,73,84,77,80,81,87,74,58,35,17,21,24,23,25,24,29,35,34,34,39,40,38,36,38,39,40,38,39,41,45,44,44,41,41,43,42,41,36,40,39,38,39,23,27,36,36,39,34,31,31,30,33,37,28,35,38,30,37,38,38,36,37,38,36,30,35,33,25,18,28,24,17,18,22,25,23,23,25,19,21,27,27,27,23,23,20,24,27,33,28,28,32,59,86,72,41,24,22,19,26,35,33,38,36,37,36,31,27,19,19,22,22,21,23,27,32,32,29,28,35,35,32,32,32,33,33,36,30,31,35,37,34,32,34,36,31,32,30,28,36,29,22,26,33,31,31,27,34,31,31,32,32,35,31,29,29,36,34,35,34,32,36,35,39,36,37,35,30,30,24,27,25,27,25,29,24,26,32,33,184,246,250,250,216,198,183,171,180,198,215,205,179,155,131,113,108,113,122,129,142,149,152,159,155,152,162,148,120,147,188,210,210,213,226,226,221,222,225,229,238,234,231,237,239,238,233,236,237,235,229,230,233,230,236,230,230,224,223,225,208,207,217,222,226,218,217,113,2,0,6,10,12,10,13,12,14,14,14,13,90,110,105,152,130,118,113,107,111,75,60,100,162,211,229,143,50,51,55,69,68,42,39,56,57,135,219,128,115,151,52,37,32,20,34,19,34,36,39,60,98,171,174,147,154,156,179,191,196,194,184,182,176,181,177,171,162,156,146,77,36,58,133,152,143,115,66,39,54,154,194,184,156,150,162,152,142,65,19,84,162,183,190,184,165,152,160,142,149,174,178,150,125,149,146,127,112,41,9,45,108,177,177,191,127,46,55,50,114,181,175,123,120,136,146,116,58,44,71,111,87,94,120,72,65,69,98,111,109,169,208,197,188,188,195,200,200,210,193,173,122,37,45,113,171,242,252,252,252,252,253,253,253,253,252,252,253,253,252,252,252,252,252,252,252,252,250,250,252,252,252,252,253,253,252,252,250,252,251,250,245,249,252,252,251,251,252,252,252,252,252,252,251,251,253,253,251,251,249,237,94,36,41,45,77,89,83,127,153,68,81,108,131,154,131,139,123,125,107,91,79,70,88,87,61,49,67,81,95,110,113,140,118,104,135,122,110,122,139,123,88,83,91,96,99,100,118,118,108,108,95,74,73,47,51,109,77,55,62,97,133,121,126,116,104,125,148,142,137,143,139,157,188,102,52,94,130,137,136,192,150,115,135,87,46,35,77,92,62,66,71,89,125,149,152,148,143,145,151,132,108,105,105,85,89,120,131,130,131,151,150,112,116,123,100,52,87,148,146,131,87,79,104,118,89,83,77,82,127,111,63,17,7,84,146,128,123,134,139,140,144,142,129,119,117,112,106,100,112,92,97,99,102,121,98,105,111,103,101,101,107,108,105,96,106,83,79,103,106,105,103,105,107,109,100,102,107,108,96,68,35,20,19,12,20,35,53,71,56,37,33,33,33,33,36,25,83,174,170,85,59,78,81,83,67,77,77,71,74,73,79,83,69,49,33,17,16,24,15,18,36,49,66,69,67,72,71,69,64,43,31,36,36,30,35,36,34,34,33,41,38,36,33,34,34,38,34,30,36,38,39,39,36,35,39,41,40,39,45,39,40,74,77,76,77,79,85,72,65,39,19,19,21,21,28,29,25,29,26,32,34,31,35,37,36,40,35,38,39,36,48,44,39,44,38,44,42,38,39,36,37,42,34,28,25,29,36,37,33,31,33,35,29,33,33,35,35,31,39,34,37,38,34,39,32,35,36,33,30,18,28,24,18,18,18,22,21,17,21,24,29,40,42,39,29,26,20,24,29,29,30,27,29,36,69,89,63,35,16,14,18,30,34,37,37,36,37,29,26,18,19,22,19,21,24,27,27,26,32,30,33,31,30,32,30,35,28,32,32,32,33,31,34,32,37,36,32,35,34,31,29,25,21,21,39,39,38,33,34,42,37,35,27,32,31,31,33,31,39,33,34,36,35,38,33,34,32,35,36,31,31,27,26,34,24,32,27,34,20,70,245,246,250,250,247,245,218,196,178,170,184,185,197,203,194,168,143,127,122,145,153,145,149,157,162,158,165,165,156,170,169,163,143,130,136,133,143,154,152,158,167,166,170,173,182,183,178,187,184,189,184,188,199,201,207,199,203,204,208,210,191,193,202,212,221,219,219,113,3,0,7,11,12,12,13,12,15,15,14,14,90,108,95,139,123,113,122,157,192,190,163,110,113,108,87,39,37,76,79,98,93,74,84,93,83,138,184,85,78,97,24,36,29,28,36,16,34,34,41,62,86,124,127,134,150,171,193,190,191,187,182,176,175,174,174,168,147,142,84,25,34,64,128,144,145,135,132,113,60,103,152,150,115,97,125,139,74,29,29,92,151,165,189,222,223,190,190,125,116,167,164,125,122,179,184,132,123,74,33,43,101,181,176,203,124,44,61,37,73,136,148,98,77,112,128,71,49,60,76,107,59,84,113,71,74,106,185,222,194,181,206,201,190,185,185,180,168,181,177,145,89,91,181,250,253,253,252,252,253,253,250,248,252,252,252,252,252,252,253,253,253,253,252,252,250,247,250,252,252,252,252,252,252,252,251,240,229,252,252,249,248,250,252,252,251,251,252,252,252,252,252,252,252,251,253,254,250,250,222,122,71,64,83,91,73,70,61,113,128,78,90,83,125,134,116,107,90,84,70,71,68,76,101,110,88,81,96,111,119,120,148,140,89,78,130,137,123,100,100,104,95,97,97,98,91,95,116,129,120,106,98,83,84,69,31,43,46,77,102,134,129,161,156,123,120,137,146,137,131,134,121,117,163,126,45,66,103,113,84,88,84,53,67,66,49,32,60,87,51,57,84,112,126,145,138,126,139,145,189,168,138,98,64,60,55,78,77,92,105,117,117,120,130,113,73,16,53,144,159,132,96,57,64,89,115,117,97,83,57,30,12,13,45,98,127,100,81,83,99,110,96,103,96,84,81,74,86,81,85,81,83,89,98,105,98,112,104,100,105,99,99,97,102,113,123,117,75,88,122,103,69,63,100,116,115,98,110,87,43,25,15,19,19,20,46,69,91,110,97,63,39,27,34,31,30,34,65,155,187,97,38,53,69,76,70,74,74,71,77,79,76,57,35,23,21,17,22,21,27,46,57,64,66,69,77,69,69,70,61,40,33,35,34,34,35,36,35,36,36,41,39,35,37,38,38,37,34,36,37,37,40,36,39,42,43,43,45,44,38,35,46,71,66,63,64,60,69,61,55,39,17,23,15,23,31,28,29,24,28,30,31,27,27,34,29,30,34,32,32,33,37,36,39,39,41,45,38,44,44,33,34,35,34,24,21,34,39,39,38,35,39,33,31,32,25,31,27,33,34,33,38,36,36,37,41,29,30,36,27,23,33,22,20,21,15,23,19,22,22,17,27,30,34,39,31,27,23,24,26,32,27,27,31,28,41,74,86,58,22,13,16,24,37,36,33,36,36,35,27,18,20,23,18,21,25,25,27,25,24,24,28,27,28,29,30,33,29,29,31,32,30,32,31,32,33,33,33,30,32,31,24,25,23,21,35,39,36,41,43,40,37,37,27,29,36,26,33,33,26,33,29,34,36,29,34,32,34,37,35,33,31,31,26,30,30,33,21,40,15,92,246,246,249,249,248,249,243,232,216,196,180,172,185,205,220,229,237,212,174,177,177,173,176,178,180,177,183,181,174,171,162,141,97,51,21,12,20,28,27,36,39,39,42,46,55,58,61,63,71,75,79,83,93,101,108,109,117,129,140,142,130,136,152,164,182,178,191,114,7,3,9,12,14,13,14,12,15,16,16,16,102,119,93,123,101,101,137,211,243,243,240,180,140,87,42,46,71,106,131,154,146,109,123,131,118,172,191,85,66,63,23,41,32,30,31,25,30,35,46,64,95,132,129,130,162,182,188,181,178,173,173,173,168,162,165,155,127,76,24,12,33,53,78,87,105,129,205,189,63,76,111,136,126,117,141,75,28,23,44,117,162,173,185,227,204,165,181,109,96,173,194,125,118,230,250,165,158,134,69,40,90,177,171,198,118,40,61,39,53,80,151,151,141,148,83,54,70,81,87,110,83,98,113,87,93,174,251,251,227,147,168,173,159,133,107,92,82,103,113,139,211,249,253,253,252,252,252,252,250,250,250,251,252,252,252,252,252,252,252,252,252,252,251,249,250,251,252,252,252,252,252,252,252,252,251,244,230,252,252,252,251,251,252,252,252,252,252,252,252,252,253,253,251,251,249,237,190,165,95,57,57,54,55,71,81,68,57,77,96,97,112,94,98,103,85,83,83,79,74,95,80,88,130,130,103,102,99,104,115,109,126,134,111,121,140,135,99,57,76,81,93,107,103,102,103,103,110,121,106,110,89,77,112,93,78,43,45,109,105,101,98,92,89,73,63,72,72,52,57,48,48,36,95,142,59,10,19,35,36,37,35,44,48,49,56,38,59,76,54,56,89,141,149,131,120,125,132,150,185,168,167,150,131,115,104,97,67,74,92,105,104,128,131,108,95,24,20,73,100,113,103,81,63,92,134,117,61,31,17,9,48,97,108,104,97,66,54,41,51,62,65,66,65,67,59,57,62,65,69,62,66,68,79,96,92,98,99,103,108,94,101,112,132,148,152,97,42,67,86,61,32,27,42,86,105,83,49,31,23,16,15,20,37,57,76,99,122,131,142,118,66,34,38,40,36,59,72,103,146,133,74,40,62,74,75,78,73,78,77,73,49,23,20,17,18,21,19,37,52,65,66,66,73,67,71,72,68,67,61,41,39,39,32,37,35,38,36,38,45,44,45,42,35,37,38,35,42,41,41,35,38,40,39,42,45,47,37,43,44,42,47,60,54,42,42,40,43,42,41,24,23,24,20,29,24,25,24,28,27,32,32,24,32,27,28,32,34,26,27,33,29,31,34,35,33,36,38,40,38,31,32,34,27,20,27,34,36,40,34,33,34,34,31,26,27,27,31,32,29,29,39,35,29,35,33,33,31,27,21,19,35,24,18,21,17,22,23,22,19,22,26,24,32,31,29,25,14,22,25,29,33,27,27,30,24,49,83,79,45,18,14,25,37,40,39,37,43,34,29,23,18,19,22,20,22,29,24,27,27,24,31,28,26,27,26,32,31,25,29,29,29,31,27,29,32,31,30,24,29,27,28,24,18,23,35,38,46,47,43,42,41,39,26,26,29,30,33,28,29,32,29,33,32,32,35,34,35,33,35,36,32,28,33,27,26,35,22,41,15,122,247,247,249,246,241,247,245,247,241,233,220,206,196,186,213,234,252,252,176,166,171,169,175,169,163,158,163,166,155,151,144,127,87,39,15,9,11,12,14,13,14,15,14,14,14,15,14,15,15,15,15,15,15,15,15,16,23,29,38,44,38,51,61,76,92,91,110,94,19,5,13,12,15,12,16,15,15,16,16,16,113,134,105,126,97,84,113,188,222,222,192,115,124,88,59,96,108,104,129,154,153,123,139,155,145,193,197,85,74,63,31,50,29,27,32,28,34,32,50,74,115,155,147,156,178,183,182,174,169,170,174,169,160,160,164,152,98,50,21,32,56,49,49,44,61,91,181,155,41,83,109,113,134,109,98,32,13,30,69,140,182,181,201,226,168,152,182,120,111,183,229,155,132,236,252,208,169,174,122,63,102,170,126,126,69,39,66,43,59,58,115,169,163,110,64,57,83,102,84,110,101,118,113,110,131,147,204,202,132,72,91,99,90,76,60,65,79,98,123,187,251,252,252,252,252,252,252,252,250,251,252,252,252,252,252,252,252,252,252,252,252,252,250,251,252,252,252,252,252,252,252,252,252,252,252,252,252,252,253,253,252,252,253,253,252,252,253,253,253,253,253,253,252,252,252,252,160,88,43,41,59,31,47,65,86,68,61,102,105,127,156,136,119,107,86,92,120,109,103,92,89,132,139,117,91,87,77,83,96,93,117,119,126,124,133,132,99,58,59,69,66,87,88,96,97,88,109,113,104,109,79,78,114,112,97,66,41,66,63,48,41,38,40,39,38,41,36,25,36,25,31,13,36,139,78,15,16,35,42,37,44,41,48,53,63,48,56,92,66,79,112,149,162,135,147,160,163,164,168,168,178,151,131,115,112,134,111,101,104,118,97,95,112,107,90,29,6,12,11,66,78,83,92,118,91,45,20,14,24,78,120,140,153,125,76,42,24,21,30,35,37,42,39,38,43,41,42,38,42,45,43,54,71,84,81,87,94,103,105,101,115,139,143,113,76,48,39,44,42,37,32,32,29,39,57,55,26,21,16,15,22,49,64,89,110,128,128,129,136,133,108,91,104,64,39,66,61,75,105,150,125,56,50,63,79,86,81,73,48,35,27,21,19,17,21,29,50,63,68,70,69,77,75,72,68,59,61,57,51,44,40,39,39,38,30,39,38,38,43,43,41,40,39,37,34,38,43,37,42,42,37,40,39,43,42,42,46,46,39,41,50,52,39,34,33,26,28,29,31,24,20,27,19,26,25,19,21,28,25,24,29,34,31,24,29,33,28,27,27,29,27,28,29,29,25,29,30,30,29,28,34,26,21,19,22,25,24,31,29,29,27,24,23,20,29,21,21,27,25,25,26,28,25,25,28,26,27,23,17,31,35,21,20,17,16,20,22,23,22,18,26,27,22,24,33,26,20,24,27,29,26,26,22,26,28,28,55,85,76,34,15,22,35,35,35,40,27,35,33,19,19,21,21,20,29,26,24,29,27,27,24,24,29,26,26,24,30,28,26,31,25,28,24,27,32,27,29,25,27,27,23,24,19,19,31,40,41,44,47,42,40,38,31,27,27,28,31,29,29,33,32,33,35,33,29,33,35,28,33,33,37,35,29,27,29,32,29,38,23,168,248,246,249,243,243,245,240,245,245,245,245,238,230,211,211,185,210,182,86,83,90,90,90,87,84,79,81,83,82,75,93,101,101,133,139,131,121,102,88,89,111,96,87,73,75,63,60,61,53,44,39,38,48,49,43,37,26,16,19,25,24,22,20,26,29,31,34,43,24,10,15,14,16,15,15,16,16,16,16,16,152,166,148,162,133,108,89,101,103,96,74,50,82,83,103,155,144,103,81,93,110,100,134,157,148,194,196,89,77,65,30,46,30,31,33,24,35,34,46,83,141,205,215,216,224,218,220,217,213,208,209,209,211,208,226,174,91,116,109,102,101,92,104,90,94,98,122,114,83,135,159,110,94,85,64,37,45,65,96,167,190,183,234,237,164,164,196,142,137,195,213,158,118,188,246,191,155,172,159,117,138,166,66,62,58,69,97,74,79,57,73,92,87,81,71,76,98,123,97,87,84,96,100,125,119,84,107,95,54,26,36,53,76,92,103,153,205,242,250,250,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,253,253,250,250,247,244,144,76,46,79,111,93,86,92,81,51,59,106,134,155,159,150,139,114,108,107,103,89,83,98,103,114,117,92,81,91,76,78,79,96,125,119,103,92,124,136,101,61,39,37,38,51,86,60,42,56,105,122,110,114,74,93,114,105,94,38,35,50,39,46,43,51,45,42,37,64,84,49,51,52,69,53,53,143,126,67,86,98,78,66,50,54,72,78,63,40,73,112,132,164,156,162,146,134,158,160,160,147,141,87,53,31,11,10,17,57,69,71,79,81,44,38,47,56,76,41,14,17,14,57,104,92,95,57,15,10,18,66,107,131,132,132,141,110,48,16,24,24,29,32,31,33,32,33,27,28,31,29,28,25,31,50,63,74,73,84,89,88,94,102,111,84,52,38,31,36,38,36,34,32,36,35,34,34,35,36,27,19,14,33,63,79,105,122,126,131,124,118,113,119,146,162,152,73,29,58,57,60,74,125,162,106,49,37,66,86,64,42,22,17,21,15,25,24,45,64,65,74,74,75,73,75,74,62,64,61,52,59,55,36,38,35,39,41,39,45,46,44,42,46,46,41,40,39,40,40,43,39,43,46,43,43,40,42,43,42,44,42,37,43,53,56,35,20,25,21,26,24,27,21,20,22,21,18,17,27,20,21,25,21,24,27,28,23,32,24,26,27,24,24,29,27,26,24,23,25,24,23,26,28,27,23,20,20,20,23,21,24,22,20,22,23,26,18,19,19,20,22,24,28,24,19,19,25,21,23,21,21,18,28,38,17,15,21,19,23,20,20,21,20,23,19,23,23,22,24,21,23,23,27,24,27,25,24,24,22,29,62,86,66,36,17,22,28,29,27,29,26,23,24,22,24,21,22,27,28,24,22,22,21,26,30,21,26,27,25,29,27,27,25,27,28,23,26,27,23,28,27,26,21,22,26,19,18,26,34,34,36,35,34,34,31,27,31,24,30,37,32,30,34,33,26,35,31,31,33,32,32,31,36,32,30,36,26,28,30,32,30,35,208,251,251,248,246,247,242,238,240,239,244,246,249,248,233,223,128,61,39,6,11,12,16,24,29,35,28,23,29,21,19,65,141,191,230,244,239,235,225,217,216,216,224,216,202,207,195,193,196,182,179,171,178,181,181,177,156,139,131,138,140,122,110,94,81,69,56,50,45,21,7,11,12,15,14,14,15,14,15,15,15,198,206,190,209,203,185,137,120,108,92,80,103,151,135,148,182,160,128,103,96,99,82,93,118,127,184,185,84,71,44,25,46,27,33,32,28,33,33,50,71,145,234,247,247,248,245,240,243,223,208,242,249,235,233,232,133,109,187,120,135,128,111,130,122,131,121,129,125,118,181,229,120,51,69,101,125,131,151,173,214,234,229,234,208,177,141,142,120,123,177,134,87,75,84,152,118,103,141,146,136,160,186,113,88,77,97,115,89,106,80,73,59,63,73,92,85,105,133,104,98,83,99,96,88,88,105,141,112,79,59,58,68,104,131,155,205,232,242,250,250,252,252,252,252,252,252,253,253,252,252,252,252,252,252,252,252,252,252,253,253,253,253,252,252,253,253,252,252,252,252,252,252,253,253,251,251,244,159,165,175,170,163,160,158,123,152,164,156,146,160,157,150,152,145,148,132,55,65,79,104,124,92,94,89,78,67,69,116,139,146,131,120,120,120,101,82,66,57,91,92,92,98,89,98,94,92,73,69,88,95,120,108,78,71,101,102,96,61,34,34,31,55,68,48,42,49,107,119,112,110,73,93,101,82,71,34,33,39,39,39,33,73,74,54,49,112,153,125,134,138,145,130,108,171,172,117,141,136,105,117,92,110,140,67,37,23,77,131,122,168,148,123,111,74,81,69,68,64,19,7,12,11,16,16,14,14,27,51,69,89,93,97,110,155,203,229,243,247,248,178,253,155,51,10,5,47,101,132,140,130,117,113,124,92,57,49,61,84,84,89,89,77,70,71,69,52,63,58,32,27,43,64,57,65,71,80,79,85,93,79,56,25,30,38,29,32,37,33,33,31,31,36,36,41,35,35,36,30,44,65,91,120,121,118,108,103,90,85,92,101,110,114,110,51,29,57,66,70,66,92,150,155,95,42,47,49,31,23,16,18,20,21,40,54,66,72,72,73,78,81,72,70,71,71,101,122,123,113,91,63,90,63,75,76,72,73,69,78,82,79,98,71,66,66,70,66,69,76,74,76,81,69,71,98,63,66,67,63,62,62,60,52,25,19,17,18,21,21,23,21,22,23,21,24,24,21,25,25,24,28,23,28,27,21,30,38,38,41,27,19,26,22,24,26,25,25,25,31,23,22,26,22,19,17,22,22,22,21,20,25,19,24,24,19,21,20,24,23,21,23,19,20,19,24,23,21,27,21,17,29,34,22,20,17,15,25,18,19,24,17,17,22,21,20,18,19,21,24,25,19,20,22,27,23,24,23,22,30,66,93,66,28,15,17,23,27,24,22,22,21,18,18,20,24,27,23,20,24,22,21,24,27,22,23,26,24,26,24,27,24,28,24,21,25,23,27,26,22,26,26,20,19,18,18,24,21,28,29,25,26,27,29,27,24,26,29,25,26,33,29,26,28,27,30,31,27,30,31,29,31,32,29,30,27,31,23,42,16,55,237,253,253,251,251,249,248,246,243,245,246,248,249,246,238,214,109,8,6,10,11,22,26,25,40,53,51,49,47,45,42,98,180,213,240,241,231,227,218,217,211,209,217,216,219,227,218,219,224,225,231,229,230,231,229,236,224,212,215,225,223,215,202,191,173,164,147,143,102,12,2,10,10,13,12,15,13,13,14,13,13,172,178,158,169,186,182,160,170,174,159,137,160,181,146,134,151,170,164,136,119,108,82,78,91,111,172,164,61,37,22,11,37,30,31,29,24,32,39,52,58,92,157,182,185,171,145,162,158,114,126,175,193,161,123,109,66,83,137,103,113,104,92,98,78,94,88,84,94,90,160,236,96,22,88,147,196,218,241,251,251,242,182,157,161,174,122,106,98,101,155,96,59,53,50,73,45,69,96,86,79,136,153,98,99,80,89,101,95,111,105,91,89,81,71,83,81,87,105,97,98,101,97,87,75,60,75,121,114,110,85,63,72,79,128,173,194,155,119,168,214,194,208,236,237,253,253,250,250,253,253,252,252,252,252,252,252,253,253,252,252,253,253,253,253,253,253,252,252,253,253,253,253,252,252,190,212,232,33,24,38,32,34,41,95,31,27,49,26,37,39,34,34,53,46,58,60,55,75,89,101,84,81,70,73,80,81,98,113,122,120,118,105,110,117,105,89,88,92,86,89,93,106,107,93,72,79,80,83,89,89,97,93,69,45,54,76,75,60,58,48,31,56,80,40,35,46,107,118,110,110,62,77,74,68,69,44,58,55,47,35,21,66,80,50,54,127,143,99,116,112,132,122,83,129,148,122,124,95,70,101,108,122,123,81,45,26,70,68,37,72,40,18,21,9,15,12,13,14,13,14,20,27,73,108,138,182,210,237,251,251,252,252,252,252,252,252,252,252,246,246,245,101,21,46,93,143,153,152,115,108,109,119,120,98,93,102,135,149,158,161,157,157,150,150,149,136,146,101,35,24,62,84,62,67,64,77,86,77,74,74,63,44,36,27,26,34,38,34,33,33,35,43,36,41,38,35,35,41,62,81,90,89,82,82,73,62,61,71,66,42,40,34,42,34,37,63,69,82,68,71,110,170,156,62,24,15,18,20,14,19,25,46,65,70,72,73,76,78,72,73,69,66,66,75,117,157,171,187,179,160,147,143,150,151,152,148,151,151,160,159,151,165,159,158,160,166,161,156,169,158,164,160,155,156,145,152,160,154,148,119,76,40,16,12,12,12,16,16,21,27,22,27,29,31,35,29,34,46,48,46,46,41,37,41,53,82,87,77,48,31,36,33,35,29,33,32,31,29,28,29,26,22,23,22,19,22,17,16,23,19,17,18,18,21,19,21,23,21,19,24,24,20,22,28,24,18,18,24,18,30,39,24,20,17,15,24,23,22,20,19,21,18,19,23,19,23,23,19,22,24,23,22,22,25,23,21,25,21,33,84,92,57,23,15,19,21,19,18,26,19,23,24,24,24,22,19,26,24,21,24,21,22,27,27,24,24,23,21,21,28,27,22,27,26,21,24,24,23,24,19,19,22,20,19,23,19,19,24,21,25,24,25,21,21,25,23,27,22,24,29,29,28,27,25,27,29,29,29,24,31,32,29,31,27,38,26,33,15,88,240,252,252,252,252,252,252,249,249,249,249,249,249,249,216,206,163,116,73,16,10,11,11,18,45,68,77,87,87,83,74,98,130,154,193,203,198,186,173,175,171,172,171,168,180,191,188,188,193,200,207,203,208,201,203,209,200,202,202,208,214,211,208,204,201,203,198,199,112,5,2,8,10,12,11,13,12,14,15,14,14,130,134,97,95,119,110,104,141,171,150,113,143,141,89,73,98,125,131,126,120,123,107,92,99,119,170,155,57,50,35,16,31,30,48,42,37,42,45,57,56,71,73,61,63,66,80,86,82,43,53,81,73,37,21,35,27,49,61,65,87,78,76,78,53,59,39,51,61,44,123,215,87,24,92,153,205,226,243,235,184,106,65,81,137,219,214,213,179,175,207,167,155,134,101,103,97,86,85,76,57,49,56,62,74,74,77,79,79,97,98,94,86,78,66,86,81,79,84,89,98,81,86,70,62,45,46,76,74,73,67,67,52,53,81,115,139,62,7,11,12,17,27,44,43,63,87,116,118,92,93,98,89,85,109,113,76,69,84,82,53,77,79,40,47,53,48,60,61,93,178,90,110,52,27,6,128,210,27,13,15,16,10,96,207,48,19,43,19,27,15,20,30,62,50,57,86,80,110,104,104,88,57,95,77,54,65,64,76,98,124,130,130,116,105,103,117,125,115,119,94,97,105,93,83,55,73,80,55,68,69,73,76,61,45,48,50,61,63,57,61,35,64,88,43,19,51,100,117,127,104,65,91,72,68,79,81,105,113,128,111,104,121,117,73,76,134,116,60,54,47,79,85,47,47,66,75,88,84,63,62,39,61,59,51,56,35,43,30,12,35,28,27,29,25,57,125,130,167,145,240,252,252,252,252,253,253,253,253,252,252,248,248,243,243,228,200,139,172,116,93,79,43,71,118,139,162,151,117,100,107,104,120,116,100,96,106,130,136,135,140,132,132,129,132,132,139,139,69,24,15,57,94,86,92,70,77,85,91,94,91,101,84,60,32,31,37,29,33,36,37,36,41,38,34,40,37,35,45,51,63,60,62,63,60,56,68,71,64,65,37,28,27,35,40,37,63,75,83,74,74,84,130,175,106,23,9,15,18,19,37,58,61,72,74,75,75,73,73,67,73,60,65,75,64,77,97,108,129,112,126,96,108,117,118,118,122,125,121,127,126,100,125,115,124,128,124,110,115,102,111,119,108,126,104,117,99,122,109,96,75,50,34,21,14,12,16,18,21,23,24,32,28,34,37,34,37,54,69,72,75,65,65,66,74,120,158,171,148,85,53,57,72,67,55,58,50,37,30,35,34,26,33,26,21,24,18,15,19,19,15,17,23,21,27,23,23,27,23,27,22,25,27,25,25,24,21,26,21,24,26,24,21,21,17,19,24,19,20,21,21,21,24,20,19,23,24,22,19,23,28,24,23,26,23,23,21,23,21,17,44,84,82,45,18,17,15,20,20,19,23,24,25,26,29,26,28,28,24,26,27,24,26,35,38,30,28,23,27,26,24,25,21,23,21,24,23,23,27,24,22,19,16,21,21,23,22,22,19,24,24,18,24,28,26,31,26,29,31,22,29,27,30,24,25,34,30,33,26,29,31,32,31,30,28,30,31,39,14,97,226,172,206,229,245,251,251,249,249,249,249,249,249,241,200,223,237,242,229,191,151,109,49,56,33,52,34,44,50,45,42,38,48,68,109,143,151,153,154,165,171,176,171,160,172,183,178,173,179,182,181,176,181,174,176,171,162,171,168,174,176,178,178,177,181,184,184,192,113,7,1,8,11,13,11,14,13,15,15,15,14,120,118,88,83,108,75,62,97,132,102,71,123,119,71,37,57,67,68,106,143,181,177,162,165,170,207,196,139,149,134,121,131,122,122,107,84,60,62,74,78,92,87,89,98,101,97,117,109,63,59,53,33,27,44,53,47,61,56,49,75,59,80,89,75,77,55,81,87,61,124,208,117,80,123,137,163,164,148,92,60,41,32,63,143,253,253,253,253,242,240,248,249,180,107,118,134,113,84,86,80,72,57,39,57,66,78,86,81,84,77,78,81,69,74,97,92,82,84,88,89,87,103,110,119,92,62,47,54,87,87,90,91,63,65,56,53,42,66,59,13,22,19,19,17,20,24,72,83,24,18,25,23,20,36,50,24,16,24,32,19,53,86,34,13,22,18,28,28,59,165,76,8,16,26,13,141,243,34,24,29,21,31,131,253,83,33,76,42,53,32,30,46,87,70,81,112,120,110,81,69,35,44,60,46,43,32,45,78,105,112,115,103,99,92,97,112,115,122,101,94,76,100,116,97,82,86,79,47,52,59,52,50,59,51,53,63,44,48,57,63,45,59,80,58,69,74,146,165,149,118,87,116,109,124,130,133,165,171,194,195,190,198,175,132,93,121,108,46,39,16,22,32,14,15,12,13,15,15,14,14,14,15,23,40,65,78,116,135,162,210,220,247,253,253,252,252,253,253,252,252,252,252,248,248,238,234,232,208,169,150,118,111,99,79,43,6,12,15,13,15,47,79,141,158,142,132,110,102,101,101,104,120,100,103,106,105,120,120,122,119,112,113,109,111,117,118,86,28,10,10,70,105,108,98,77,90,98,101,103,103,103,120,108,66,36,32,37,36,39,33,32,34,36,37,32,37,40,30,34,50,61,59,59,66,74,70,69,67,60,40,34,29,36,37,39,69,79,87,74,64,45,35,125,165,77,18,12,28,50,58,66,67,67,79,79,73,73,74,69,65,62,79,93,67,65,63,39,30,29,29,27,23,29,34,32,32,35,29,31,33,32,32,27,25,29,29,26,21,31,27,28,28,25,36,32,28,28,28,24,27,23,21,19,18,17,19,25,22,22,24,21,26,22,24,25,23,35,34,35,44,36,42,36,39,39,74,131,121,69,37,47,45,42,45,38,35,31,29,28,23,25,27,22,21,21,24,18,20,19,17,23,24,25,23,21,25,29,24,21,25,22,23,27,23,23,24,22,21,23,21,17,16,19,19,17,20,21,25,19,20,24,23,24,17,21,25,18,22,23,24,27,22,27,24,23,28,23,25,26,25,56,88,77,39,17,15,19,22,27,25,26,33,33,34,28,31,35,25,29,32,27,32,40,42,42,31,31,32,31,35,30,28,30,29,31,31,24,28,28,23,26,23,19,17,22,21,18,19,19,21,23,27,31,35,35,32,34,34,35,32,36,38,35,37,38,39,35,40,34,35,38,35,41,41,43,39,46,33,61,106,74,92,142,200,238,217,202,204,212,212,214,212,195,182,203,234,251,251,249,239,211,157,111,85,56,37,29,29,32,25,24,21,24,43,63,91,108,131,157,171,187,178,171,184,188,187,181,187,191,186,183,178,165,166,155,155,169,164,165,157,150,155,161,164,166,163,182,113,8,2,9,11,14,12,15,13,14,15,15,15,122,113,81,65,98,80,74,85,119,84,61,127,152,132,76,71,54,56,110,168,232,240,220,222,231,244,238,207,230,211,193,214,206,224,215,164,127,123,121,109,121,129,149,163,148,116,125,132,97,99,89,65,53,65,88,95,118,113,94,109,77,101,124,102,126,132,164,159,139,181,237,198,181,201,184,172,156,141,118,139,127,106,130,158,224,243,251,222,217,208,246,236,123,106,108,122,83,55,69,79,96,74,53,51,57,78,87,78,64,55,66,84,70,74,94,86,91,83,91,89,92,148,178,212,169,90,59,65,95,122,155,134,92,81,88,110,144,208,179,117,183,217,205,179,154,125,250,246,150,152,174,180,172,209,205,156,179,174,169,160,228,250,171,166,159,136,166,173,207,253,178,141,135,139,184,253,243,57,97,132,151,119,190,253,110,115,128,104,120,94,99,103,129,103,103,133,103,75,48,42,59,48,48,33,26,64,90,97,99,95,100,118,119,111,107,92,84,90,100,92,98,121,117,101,105,134,110,53,46,63,66,61,54,46,59,57,47,43,43,49,77,109,107,130,145,149,186,165,142,93,81,128,142,167,164,171,180,181,198,198,196,192,174,144,109,79,27,3,12,11,13,13,16,17,20,23,33,56,126,132,160,199,222,241,252,252,253,253,252,252,253,253,252,252,246,246,245,245,220,193,184,168,148,109,119,21,27,18,18,22,12,16,23,27,20,8,20,39,20,14,91,141,160,153,125,105,86,90,108,113,118,129,96,89,107,106,115,112,121,114,107,113,108,114,115,118,77,9,27,66,111,103,91,98,91,107,96,98,95,100,109,115,115,99,49,30,32,44,36,28,37,33,37,43,44,50,47,28,34,57,56,63,72,76,74,64,64,67,63,41,31,31,31,40,39,62,76,73,62,33,26,17,29,129,143,60,31,51,62,74,76,70,75,70,73,77,70,67,66,78,86,88,85,61,62,58,33,16,19,20,21,20,20,22,22,23,21,19,22,19,18,21,24,21,20,22,19,21,20,20,22,17,22,22,18,20,19,22,22,17,23,18,21,21,17,20,18,22,25,17,17,16,22,23,18,20,20,26,21,22,23,21,18,22,20,26,80,100,71,27,12,14,21,21,24,24,19,21,20,19,20,23,19,18,20,23,19,17,18,19,21,19,16,19,19,16,23,18,16,20,18,21,19,18,20,19,17,21,16,21,19,16,22,18,18,23,18,24,24,24,25,21,27,27,20,23,22,19,24,19,21,21,21,23,23,20,21,21,21,21,24,64,91,76,42,14,17,22,27,29,22,26,25,24,29,29,27,27,24,33,24,32,38,42,40,32,30,26,34,29,29,34,31,28,30,30,29,28,25,23,26,25,20,24,24,15,21,21,21,21,29,35,33,34,34,33,35,33,35,40,39,41,42,38,41,42,43,42,40,44,41,47,44,44,51,50,52,51,57,57,44,52,57,98,148,154,143,148,151,154,156,152,152,141,148,162,197,232,222,229,225,185,171,172,141,127,117,123,120,116,110,115,101,89,92,98,115,131,153,170,172,156,152,162,171,177,187,202,201,198,195,175,154,160,158,165,175,165,163,151,145,152,162,159,159,161,181,113,9,4,10,10,13,11,15,13,14,15,15,14,151,116,78,49,81,74,102,117,152,122,85,163,230,218,135,146,124,104,116,142,217,214,179,205,223,217,199,175,193,177,138,143,155,207,232,153,95,101,112,122,130,136,155,160,167,160,159,145,140,142,128,128,100,89,119,143,189,182,164,194,156,165,184,184,225,238,247,247,250,250,252,252,251,251,251,251,239,240,252,252,250,250,253,250,223,194,210,216,227,197,245,224,84,95,131,139,96,52,57,66,90,78,60,69,74,75,71,60,55,58,79,89,75,75,85,82,83,82,83,74,82,134,149,179,127,80,69,70,88,106,122,101,83,94,123,159,160,222,191,129,210,233,245,226,200,214,253,253,218,214,225,214,218,237,234,196,219,240,222,229,249,249,206,206,236,248,236,223,204,248,214,208,242,233,210,247,198,65,141,186,188,121,173,248,118,118,122,103,119,98,101,92,127,121,110,121,96,85,97,116,126,106,66,44,83,122,126,91,77,115,135,140,118,95,89,64,40,57,79,126,121,104,98,76,109,148,122,66,36,46,76,76,57,41,51,56,46,55,66,84,126,150,155,170,171,157,166,130,82,72,85,107,110,127,147,146,149,169,165,140,149,141,116,127,125,104,57,47,102,115,136,167,200,235,250,250,252,252,253,253,252,252,252,252,250,250,243,241,230,224,196,187,171,112,94,72,58,66,25,3,14,27,42,44,17,8,13,15,21,30,19,21,19,24,51,46,42,65,44,31,73,104,146,116,93,81,74,100,116,126,150,154,84,72,83,103,112,112,116,105,112,114,115,113,116,112,85,90,116,117,113,95,104,101,97,107,96,98,99,108,111,118,104,53,30,28,44,42,34,36,34,34,34,38,43,53,51,37,45,59,69,84,73,71,69,65,75,66,64,47,29,29,35,38,36,60,61,44,27,22,15,18,14,41,153,146,74,57,63,83,80,75,64,62,64,72,70,72,92,88,90,80,69,54,41,37,25,19,13,17,19,15,27,20,21,21,18,22,16,21,18,18,24,21,19,19,16,22,22,17,21,17,20,20,17,20,16,18,21,19,21,19,19,18,21,17,16,21,17,20,20,21,20,21,19,20,19,17,23,18,23,20,15,23,19,21,63,96,80,30,11,17,16,20,18,18,21,21,21,19,20,23,19,22,20,19,19,20,19,16,23,20,19,18,21,23,17,19,19,19,15,18,21,19,16,17,20,17,23,19,16,18,18,19,20,22,18,27,24,24,25,20,24,19,22,25,18,21,19,19,21,19,18,18,21,21,21,20,21,18,18,30,68,96,69,29,15,18,22,18,17,24,21,21,20,20,21,23,17,23,18,23,45,45,40,21,21,23,19,22,22,24,24,21,22,24,26,27,25,18,21,27,21,22,18,18,21,18,20,23,24,27,27,23,24,27,33,27,28,30,27,33,31,33,24,34,33,30,33,32,35,36,34,35,39,39,39,47,40,41,46,51,30,20,109,151,148,159,156,160,156,149,152,147,142,142,155,172,177,183,183,181,193,212,213,206,211,212,206,197,205,206,196,188,181,179,170,174,183,176,165,146,141,140,146,163,179,195,196,201,199,175,157,165,165,173,176,165,167,164,153,161,172,167,169,176,191,113,8,3,10,12,12,12,14,13,14,15,15,14,192,149,135,124,128,98,132,162,236,189,118,205,251,250,200,239,238,220,179,159,209,211,200,238,244,206,195,174,208,190,137,132,115,191,235,139,107,120,134,151,138,144,141,142,191,200,178,173,170,171,163,157,145,141,160,167,218,206,186,251,221,226,252,252,253,253,252,252,252,252,252,252,253,253,252,252,252,252,252,252,252,252,253,251,221,232,250,244,253,184,250,193,32,100,117,144,118,87,89,86,116,92,69,71,83,77,67,72,74,80,84,93,90,101,101,73,73,78,86,74,59,79,63,60,63,56,51,71,76,66,74,72,85,116,119,98,74,181,130,49,74,91,110,105,137,172,225,247,153,180,152,92,66,130,154,66,131,158,173,226,246,174,121,148,201,246,194,163,131,180,230,206,199,172,126,204,123,69,120,95,113,59,124,167,67,97,63,70,90,66,64,61,110,112,112,126,105,122,142,139,119,83,93,102,105,127,117,93,110,130,140,118,81,78,57,47,42,21,37,92,99,83,61,40,76,98,74,55,33,40,46,57,58,43,45,55,33,52,129,141,160,174,150,161,147,130,138,100,73,72,95,92,76,92,116,133,151,167,146,134,152,152,178,216,237,248,250,250,252,252,253,253,253,253,252,252,251,251,239,239,240,245,210,158,128,112,104,60,33,19,3,16,31,14,10,12,12,27,18,9,18,25,29,45,55,43,31,39,46,44,42,45,42,100,134,52,41,81,56,40,39,71,95,75,78,77,96,123,138,141,152,147,80,68,86,102,110,112,111,107,109,113,112,109,109,110,108,116,105,97,113,103,111,107,100,107,93,111,112,117,103,59,59,23,32,38,32,38,33,31,35,36,36,37,42,48,46,43,64,79,73,75,71,68,75,70,78,77,64,45,32,33,34,35,37,52,46,33,35,30,23,24,42,56,104,154,134,80,51,66,79,73,66,61,65,70,85,92,89,76,78,72,60,46,26,14,18,21,15,16,20,17,18,18,19,20,16,19,19,20,23,18,22,22,21,21,17,20,17,17,19,19,19,19,21,24,17,21,18,18,20,19,21,17,17,21,16,17,15,22,22,16,20,17,21,21,17,21,19,18,18,17,21,20,21,14,48,101,87,39,13,12,18,21,17,16,19,18,18,21,19,19,15,18,18,15,18,17,20,22,17,20,18,18,17,20,18,17,21,19,20,19,19,20,18,17,17,18,20,19,17,17,21,18,19,24,19,26,23,27,27,24,28,24,22,21,24,24,19,19,17,21,23,15,19,19,20,16,17,19,23,22,38,77,87,63,23,14,19,16,21,22,18,17,24,19,22,19,20,23,16,29,40,46,29,17,19,20,19,21,22,18,23,21,23,20,20,22,19,24,18,18,20,19,22,19,19,22,20,19,23,18,21,20,19,29,19,19,21,18,24,22,23,24,21,21,23,25,21,23,22,27,22,24,27,23,29,26,23,31,28,32,23,27,131,190,190,199,190,197,191,183,183,179,178,164,162,167,156,164,176,179,191,185,192,206,214,220,203,199,212,222,220,213,215,218,211,204,201,191,186,182,171,165,165,171,178,175,177,193,202,188,184,192,184,184,183,183,183,179,180,189,195,186,194,196,200,114,7,2,7,11,13,12,15,13,14,15,15,15,226,212,231,233,227,165,181,199,248,220,119,203,244,242,220,252,243,243,218,178,222,244,253,253,252,252,251,251,252,252,243,237,181,223,241,190,174,183,174,186,146,134,138,145,193,174,170,175,179,160,119,146,163,184,187,153,171,143,143,217,159,175,238,225,230,206,220,201,226,250,250,251,252,252,252,252,252,252,252,252,252,244,240,178,157,192,208,230,218,171,247,184,45,78,72,103,104,107,141,122,128,103,64,65,70,64,83,96,94,84,77,92,107,121,109,77,70,74,84,79,68,83,71,77,61,61,48,63,88,75,92,77,110,131,146,127,74,155,125,91,101,72,75,90,123,132,217,250,207,219,214,113,63,132,174,115,144,158,155,218,240,203,172,148,199,248,214,215,169,205,247,227,221,208,137,169,133,90,98,93,104,69,101,93,72,96,70,101,103,87,88,86,127,106,104,131,106,117,132,92,68,51,75,71,73,99,107,131,129,122,113,89,64,73,95,91,57,24,13,44,58,55,57,38,48,48,41,42,39,46,37,61,77,76,71,33,17,39,118,152,149,141,117,125,113,117,135,123,119,126,141,154,157,174,213,232,242,250,250,242,252,252,252,252,252,252,252,252,250,250,241,237,216,194,161,137,145,115,65,38,19,105,129,30,4,8,10,14,13,25,10,19,42,17,16,39,40,56,59,47,41,54,47,71,118,91,46,37,42,61,67,89,128,153,168,86,48,83,51,40,40,31,52,59,101,113,128,147,144,145,135,108,66,80,98,114,107,109,119,107,110,107,118,114,106,106,101,98,74,89,110,105,108,99,100,111,113,117,116,84,38,21,30,42,45,45,33,22,37,34,36,39,33,37,34,45,57,69,74,73,66,62,67,67,69,75,77,81,73,41,31,33,31,38,33,45,54,45,55,56,56,63,70,77,80,107,162,136,66,56,60,70,68,65,73,85,89,80,75,63,69,72,61,38,21,16,16,19,17,24,21,19,23,17,20,22,19,21,18,15,21,20,17,18,16,24,18,17,21,15,23,21,21,15,22,26,15,22,21,19,18,20,20,17,20,15,17,20,17,21,18,19,20,19,21,21,17,16,23,18,20,19,19,19,20,19,35,88,91,46,18,15,19,18,17,19,18,13,21,19,21,24,19,22,17,23,18,21,23,19,21,16,18,17,17,20,20,20,19,19,18,16,16,17,17,18,20,17,17,19,17,17,19,20,22,26,23,28,31,27,29,30,26,23,26,25,25,26,22,19,18,20,20,15,18,19,17,20,19,18,17,21,19,41,80,88,59,20,14,17,17,20,15,22,20,17,21,20,24,20,18,26,42,45,25,17,22,18,22,19,21,21,20,20,21,21,18,21,18,18,18,18,19,19,20,20,21,19,17,23,22,19,20,19,21,21,19,23,21,23,18,19,23,20,22,22,19,21,22,24,22,21,22,25,22,23,21,26,24,22,22,31,21,65,183,200,190,190,181,191,182,178,166,165,179,181,171,150,141,149,150,163,181,178,179,174,174,175,158,154,187,204,194,186,187,198,199,192,190,187,194,197,199,200,199,193,181,173,157,177,203,204,217,223,208,201,205,212,209,206,205,212,218,208,208,201,208,114,5,2,6,11,13,12,14,13,14,15,15,14,204,223,234,233,224,158,165,165,244,181,83,169,222,159,134,197,202,194,121,92,121,178,221,250,244,210,250,239,251,251,240,227,131,197,236,134,119,127,136,171,93,101,120,130,170,131,131,172,176,113,76,112,151,177,188,122,97,92,96,160,81,70,100,101,139,84,98,92,150,203,197,198,164,243,254,254,252,252,252,252,252,231,236,141,75,120,116,148,191,178,251,181,40,81,60,85,76,96,133,112,124,101,83,75,66,60,77,104,88,78,73,85,83,89,92,72,57,61,73,67,56,98,111,91,84,78,62,55,71,99,111,83,79,84,93,96,61,145,90,92,117,65,85,88,125,108,162,217,148,176,160,65,58,105,155,103,110,98,94,156,160,144,131,118,176,208,184,204,159,201,244,184,194,183,147,179,114,90,99,91,112,74,102,86,74,96,68,101,97,95,103,110,128,96,104,126,100,102,101,74,62,54,57,44,42,71,106,118,114,77,70,79,75,99,114,111,105,86,49,31,26,44,47,38,58,41,47,56,42,46,20,54,84,67,69,53,30,71,132,134,142,145,145,168,173,198,230,237,250,250,252,252,252,252,252,252,252,252,245,244,239,242,252,240,229,212,204,203,177,128,105,85,39,13,8,9,14,18,9,16,12,67,98,20,8,10,26,41,38,43,34,46,67,51,71,112,87,119,95,53,55,63,49,76,139,131,66,32,48,89,86,87,95,95,93,53,40,49,43,39,23,41,69,96,137,132,128,144,143,136,131,105,68,85,106,115,112,116,113,114,110,111,113,95,93,84,92,103,87,98,107,101,105,94,97,117,119,99,45,26,21,23,60,56,69,54,23,21,24,40,35,37,39,37,49,66,74,74,71,73,60,64,76,71,69,67,77,74,66,50,31,34,36,37,35,48,57,45,63,79,88,93,86,78,72,81,131,172,117,53,46,57,78,87,88,83,75,72,69,60,71,68,54,41,21,17,16,21,18,19,24,17,22,18,16,21,19,19,19,21,19,19,21,22,18,19,17,21,22,17,20,16,19,21,21,19,19,19,22,19,19,19,18,21,19,20,18,18,17,16,21,19,21,18,22,20,17,19,21,21,16,19,24,19,19,21,25,75,104,49,19,16,15,17,19,22,17,16,17,19,19,20,24,20,17,19,19,18,18,20,18,20,21,18,21,19,17,22,18,17,19,18,17,17,24,17,19,21,17,18,19,18,19,23,23,30,28,29,30,33,33,26,31,29,27,27,27,28,21,15,19,19,18,19,23,19,17,19,21,19,21,20,20,22,48,91,90,46,17,17,17,21,20,19,21,19,22,21,18,17,21,27,48,46,26,21,17,17,25,21,16,18,17,18,21,21,19,20,18,18,21,19,20,20,19,19,19,21,19,18,19,17,22,20,24,19,16,19,17,22,21,19,23,22,18,22,19,22,20,18,23,23,24,19,21,23,21,24,19,23,31,23,40,131,190,168,157,146,140,155,145,131,127,159,205,205,188,174,164,155,141,170,219,229,227,198,165,151,128,113,133,148,151,131,128,139,136,141,143,144,152,163,170,181,178,162,150,128,131,148,181,198,216,224,205,203,212,226,224,215,216,212,215,196,185,184,193,114,6,1,9,12,12,12,14,12,15,15,15,15,97,124,137,147,119,72,91,97,199,162,94,169,189,108,42,60,57,58,46,55,78,117,156,219,193,145,193,172,218,211,167,166,70,151,221,64,39,61,88,137,65,66,87,106,134,88,111,155,152,110,81,107,113,127,150,111,103,92,118,204,124,59,49,77,111,60,75,63,123,147,112,148,134,182,253,253,252,252,252,252,249,249,244,128,50,80,63,98,132,163,240,141,28,78,67,87,59,60,96,92,109,118,112,95,79,68,76,87,79,76,77,69,61,56,73,67,61,61,56,64,58,81,93,84,64,73,76,53,48,58,86,64,53,41,67,69,59,85,39,39,55,61,71,78,97,72,69,78,51,36,34,27,31,46,57,33,43,40,38,71,50,33,41,47,71,93,98,128,107,122,146,110,134,142,105,101,66,57,56,57,70,56,63,51,62,59,42,64,46,58,100,119,126,96,96,96,64,68,87,103,103,65,48,42,45,50,59,87,104,95,81,76,76,73,81,111,118,97,81,48,13,9,15,15,25,42,61,72,72,63,57,74,87,109,134,148,184,230,249,249,252,252,252,252,252,252,253,253,253,253,253,253,250,244,240,229,201,222,165,88,54,116,188,180,154,131,147,155,131,79,61,44,23,27,12,10,12,16,18,12,16,57,95,94,58,24,45,57,34,50,35,90,132,84,73,83,115,136,137,84,37,46,53,46,71,86,58,44,41,56,59,53,53,53,76,57,36,65,62,41,33,107,146,143,159,134,128,134,129,137,141,125,111,105,109,120,113,112,115,111,114,98,89,73,75,74,73,98,97,105,101,104,111,95,105,98,59,36,17,22,16,49,85,95,102,39,15,23,30,39,44,36,41,64,73,77,71,73,77,75,69,72,75,69,79,82,83,79,69,47,32,32,37,37,29,51,54,47,66,81,96,88,79,80,72,66,90,137,163,114,48,57,80,93,81,71,76,70,68,63,66,64,56,36,22,18,18,22,20,24,19,19,21,18,21,21,21,18,18,19,20,18,17,18,18,19,21,19,18,23,19,18,22,19,21,19,20,21,19,20,21,22,19,17,22,24,16,20,20,20,23,19,18,20,20,19,20,17,21,19,17,21,19,23,17,21,19,57,114,66,21,15,14,18,15,20,24,21,20,18,18,21,21,19,20,22,15,17,21,17,21,21,21,24,18,19,17,20,23,17,19,17,18,17,19,19,22,24,18,18,18,19,22,23,23,29,23,28,32,25,28,29,29,28,26,24,29,30,23,20,18,16,21,18,19,20,18,21,21,18,15,17,21,20,28,55,91,82,41,17,15,17,18,20,22,16,19,22,18,18,20,25,48,41,19,22,17,19,24,18,18,22,20,20,20,24,21,21,20,16,20,19,21,21,19,18,22,22,16,21,20,23,21,19,19,21,21,19,21,17,20,19,21,23,20,20,21,23,24,20,19,19,19,18,26,23,21,19,22,18,26,21,61,150,164,155,145,125,125,132,124,147,169,217,252,241,244,244,235,220,185,222,252,252,250,241,203,186,119,72,72,83,119,118,93,101,101,108,125,117,106,92,94,111,100,82,73,77,65,84,114,127,152,153,141,141,152,159,164,167,172,167,168,151,151,149,162,113,10,4,10,11,15,13,15,14,15,16,16,15,54,55,60,72,69,66,111,130,229,190,125,212,239,160,91,102,96,104,118,143,151,180,183,228,181,135,196,163,217,200,163,166,75,179,222,55,59,59,82,134,65,70,76,93,98,66,88,118,141,127,103,99,92,110,162,163,141,118,153,250,160,85,69,105,144,100,125,83,54,39,67,146,186,210,242,251,250,250,251,251,251,251,246,125,78,124,100,96,71,92,160,42,36,73,65,86,49,64,63,72,101,105,115,96,97,79,66,89,69,77,92,101,66,48,44,56,62,55,64,74,74,85,85,81,78,66,69,57,67,78,69,83,91,64,108,159,130,168,154,77,69,110,102,71,84,69,79,90,97,138,170,143,181,176,208,168,144,113,141,184,116,107,71,42,72,53,56,83,68,80,77,61,85,88,72,52,16,36,41,35,46,42,46,46,41,46,42,43,31,61,121,135,115,92,77,60,45,76,107,106,89,54,51,49,49,52,30,60,110,109,93,64,36,23,43,63,49,51,63,49,15,15,16,36,87,113,155,178,195,231,249,249,252,252,253,253,253,253,252,252,252,252,252,252,246,240,237,230,214,203,200,208,195,156,160,181,150,172,108,8,4,52,157,133,80,65,67,72,55,40,41,37,41,77,89,101,130,147,115,44,65,129,147,152,84,24,71,72,42,60,41,86,132,76,39,42,71,83,80,64,45,39,49,53,47,56,77,71,39,45,52,67,97,97,123,89,39,87,88,41,90,170,174,156,151,134,131,137,112,91,124,145,113,112,110,112,113,115,115,96,78,74,78,74,72,69,81,77,84,98,96,109,122,103,59,31,19,19,19,27,40,90,94,70,77,35,20,19,29,54,52,49,58,71,79,80,73,70,77,76,77,76,74,80,83,77,84,64,42,25,19,27,29,35,33,49,57,48,67,79,86,82,76,78,74,65,51,83,152,166,108,62,57,74,75,70,69,66,66,66,66,63,52,35,19,19,18,19,22,22,24,22,23,19,21,21,16,19,17,21,17,17,20,16,21,21,19,22,20,21,21,21,18,24,20,18,19,17,21,18,21,19,18,21,18,23,18,16,19,18,21,19,21,21,24,21,21,21,16,21,17,20,22,22,22,18,17,54,112,72,29,17,12,18,17,19,16,23,18,21,19,20,19,22,23,23,23,16,23,19,19,23,17,19,21,20,19,17,22,18,17,20,20,15,19,16,26,23,17,24,23,24,25,24,33,33,29,27,26,28,29,33,33,27,27,33,28,27,26,17,17,19,19,22,16,19,22,18,21,16,21,19,20,22,19,31,62,91,76,33,16,18,18,19,19,21,23,19,17,23,19,22,53,38,21,22,18,24,19,19,22,17,16,18,19,21,17,23,21,20,19,21,25,17,18,21,21,21,18,21,23,21,21,22,24,16,25,25,17,22,18,22,20,21,19,19,24,19,19,23,23,21,21,18,22,23,20,20,26,22,27,22,48,110,122,111,116,91,105,109,117,159,182,204,216,212,225,226,227,207,166,217,245,245,249,208,206,171,88,51,47,73,140,140,127,129,118,139,149,139,114,88,49,37,31,23,37,55,39,34,41,47,52,54,33,32,33,36,44,58,75,66,121,165,184,173,135,97,17,3,12,11,15,12,14,14,16,16,16,16,106,113,118,141,141,136,166,166,250,206,131,217,235,191,125,154,148,152,170,185,198,221,221,250,177,134,200,186,237,214,189,193,95,197,221,57,66,60,85,132,71,53,39,36,51,34,48,78,115,144,120,95,98,150,225,217,167,117,155,247,168,81,71,123,150,141,152,105,39,1,26,131,250,237,229,253,245,245,197,181,172,168,204,123,134,180,130,105,69,87,136,70,58,54,43,78,66,69,62,57,74,82,103,98,90,86,83,103,87,67,112,118,77,51,31,35,50,53,65,81,87,86,89,89,75,72,61,50,92,120,86,113,139,84,144,174,202,252,253,192,107,145,137,86,72,72,125,234,252,252,252,252,253,253,252,252,252,250,249,252,250,250,225,193,151,45,24,39,29,40,42,29,39,42,43,43,39,37,41,41,38,41,39,37,40,34,32,43,25,59,116,130,109,75,81,68,54,76,71,57,42,22,22,26,28,20,12,12,19,44,50,51,53,55,69,86,95,106,153,177,190,221,246,249,253,253,252,252,253,253,253,253,252,252,247,247,243,241,223,206,196,182,162,171,164,134,145,174,179,154,147,149,145,110,118,122,80,112,65,5,6,18,67,66,57,60,59,83,77,94,151,147,145,159,182,212,217,231,214,95,39,93,127,129,74,32,63,69,46,55,37,41,61,51,49,44,46,36,71,102,46,57,90,97,114,104,108,83,50,46,42,68,100,98,103,82,42,63,62,31,91,169,160,145,134,129,136,141,89,38,99,125,99,106,105,105,108,100,87,65,59,62,69,71,78,81,83,91,89,92,100,97,68,39,27,17,17,23,33,78,118,99,53,33,36,27,23,17,32,55,60,63,60,72,80,80,75,74,77,79,69,80,88,85,89,63,49,29,19,19,16,22,34,41,40,59,66,77,87,85,87,78,78,76,66,67,60,53,98,157,163,99,39,48,61,69,71,65,64,76,92,72,51,26,18,22,14,22,24,21,23,23,22,20,23,21,16,19,20,21,19,19,22,20,19,19,20,19,18,20,21,18,21,20,22,21,21,25,19,21,22,21,18,19,22,18,22,21,19,20,19,24,19,21,21,21,24,18,23,17,17,19,17,23,21,22,18,29,98,87,34,19,14,16,21,18,22,19,19,21,20,20,18,21,20,17,20,23,23,21,22,17,23,22,19,24,17,21,20,18,20,18,19,18,19,21,22,22,18,18,23,20,19,26,38,36,25,26,26,32,29,29,34,28,29,29,27,24,23,22,16,18,18,22,24,19,17,22,25,21,22,17,19,21,21,22,32,77,99,68,29,15,20,16,17,21,21,19,19,23,18,23,49,36,19,20,16,24,20,19,23,20,19,23,21,20,24,18,21,22,19,20,23,21,24,16,19,21,21,25,21,19,20,20,21,18,21,20,21,21,19,19,22,22,19,19,23,20,19,23,21,23,26,20,24,23,20,21,17,23,27,21,41,56,64,66,63,57,57,56,56,86,79,88,89,88,98,88,103,91,62,107,131,124,123,104,128,96,52,53,54,85,139,156,129,128,113,121,137,131,135,105,44,29,35,33,41,46,41,39,40,43,34,19,13,17,21,21,24,18,17,13,122,227,237,225,147,71,16,3,14,12,16,14,13,16,17,16,16,16,122,124,132,154,156,149,163,168,250,188,107,195,232,162,92,111,88,108,150,169,200,231,233,252,195,163,218,200,245,244,227,199,96,206,218,66,75,51,85,130,63,48,29,24,25,25,39,35,89,163,132,94,91,117,197,175,103,69,95,173,101,51,47,91,130,133,158,121,77,29,51,118,205,213,210,244,214,212,170,79,21,44,104,97,139,149,104,91,95,162,234,124,79,66,65,79,66,72,56,61,71,81,120,97,76,77,86,132,137,85,106,145,93,93,72,51,49,55,77,69,84,79,80,88,61,68,62,54,108,90,105,118,122,75,97,141,177,246,252,213,70,114,128,96,95,94,129,198,247,247,250,250,249,249,252,252,252,244,235,231,203,241,249,222,128,31,6,10,21,35,39,39,38,31,41,43,37,39,36,43,42,38,37,41,53,59,55,41,34,23,30,59,61,75,107,87,31,8,11,12,13,14,15,19,20,24,24,35,123,105,148,196,219,236,252,252,253,253,252,252,253,253,252,252,252,252,246,246,236,222,184,155,144,133,112,95,86,102,121,125,145,138,116,120,120,94,85,118,124,104,71,52,60,53,71,62,37,76,56,22,23,11,67,137,179,191,191,194,167,178,214,200,173,188,181,169,156,168,151,56,14,21,49,63,43,42,44,51,50,46,41,43,69,59,57,61,43,20,91,151,76,64,90,126,149,112,96,75,54,52,44,43,53,51,54,56,42,53,50,32,94,158,153,139,132,131,133,141,83,50,110,125,97,108,112,109,89,69,63,54,63,73,78,80,80,82,96,97,100,93,63,37,24,23,19,20,33,46,54,69,77,67,43,28,22,16,25,32,39,51,57,61,64,72,79,84,75,76,77,75,83,88,88,75,48,32,22,22,19,17,27,45,58,66,68,84,90,93,99,79,75,74,69,71,65,69,73,64,61,92,152,148,96,53,37,53,69,73,72,102,95,68,56,27,19,16,22,24,19,24,20,24,21,22,20,25,24,18,24,17,19,21,21,21,18,18,19,21,20,21,24,15,20,23,19,21,23,17,21,21,20,24,21,21,19,25,19,20,23,17,24,20,17,20,21,21,23,20,22,21,19,19,17,21,19,18,25,25,77,101,51,20,12,16,22,18,21,22,19,23,20,22,25,20,18,21,21,18,22,23,25,22,19,23,20,18,21,19,21,21,19,17,17,22,17,23,24,19,19,23,20,22,21,19,30,35,31,31,30,26,31,31,30,29,29,32,28,29,21,17,20,18,20,19,20,24,23,21,19,20,19,20,24,23,18,19,20,35,79,88,61,29,15,19,17,19,18,25,18,21,23,29,47,31,22,23,17,21,23,19,21,21,21,19,22,23,21,22,21,23,21,21,21,24,21,21,23,22,23,21,21,24,24,18,25,22,19,23,17,24,20,21,19,21,19,21,24,21,21,21,21,25,24,21,25,23,23,23,23,22,24,25,31,50,53,59,57,53,56,48,36,35,26,16,18,15,18,15,50,45,16,46,43,34,39,38,63,47,46,66,65,77,105,96,77,69,46,57,69,71,91,84,50,45,44,43,46,50,48,44,39,37,20,23,66,115,141,145,129,93,47,29,171,232,241,240,158,69,12,5,14,13,14,14,15,15,16,17,16,15,131,117,80,76,111,125,158,169,238,164,105,208,241,172,100,84,29,74,130,164,197,192,205,248,206,174,202,213,237,236,194,130,86,202,205,86,85,52,90,128,81,87,84,64,63,54,56,62,92,130,117,88,49,49,72,43,27,17,19,33,18,21,17,42,57,90,118,116,110,122,146,148,173,151,155,169,120,146,166,114,48,27,48,73,120,87,57,76,87,160,197,96,68,83,85,104,110,95,59,60,81,96,106,87,72,60,76,146,148,110,142,167,133,141,115,92,74,65,84,75,105,102,87,102,76,71,69,88,134,107,80,106,83,52,95,106,127,183,236,131,19,71,101,139,163,154,144,99,61,131,169,159,153,177,253,253,252,240,235,228,174,141,142,162,90,15,9,12,30,38,39,44,38,39,40,35,39,37,42,49,40,37,38,33,83,120,93,47,12,13,11,24,67,89,114,91,41,36,37,42,74,115,151,194,219,239,251,251,253,253,252,252,252,252,252,252,250,250,246,246,227,197,180,190,174,147,139,107,98,86,27,4,10,30,56,41,28,53,76,90,93,78,60,42,51,42,23,17,28,44,48,66,87,114,153,131,125,170,118,60,43,36,136,215,240,237,203,175,111,98,121,88,83,85,67,60,58,68,87,91,109,110,115,129,136,123,106,110,44,50,49,54,110,74,72,66,62,42,60,124,73,37,35,66,81,59,60,55,63,64,63,73,64,66,76,71,44,50,67,46,99,149,149,145,133,141,139,137,61,60,128,111,101,108,98,80,69,55,67,61,74,84,83,89,83,88,97,92,67,39,23,21,18,28,35,38,51,63,74,56,79,106,72,28,15,26,56,62,43,39,38,51,49,63,69,70,79,79,84,90,91,78,59,35,25,18,16,17,23,39,65,81,92,96,102,105,84,91,84,72,75,66,76,77,75,74,68,67,59,52,93,163,173,113,43,34,67,78,83,79,71,56,46,29,16,23,22,24,24,22,26,19,23,27,21,20,23,18,18,23,21,22,18,19,20,19,24,20,19,21,24,19,20,24,21,21,21,20,22,23,19,22,23,17,21,19,18,20,18,21,24,21,21,19,19,24,20,19,23,20,25,19,21,19,19,19,24,24,60,106,66,24,12,15,19,22,21,19,18,19,24,19,22,23,19,21,21,22,17,22,21,19,26,19,21,21,19,20,22,21,19,20,19,19,19,19,22,21,18,17,19,19,22,22,32,29,28,32,27,28,32,27,30,30,30,30,24,24,22,20,22,19,18,23,21,19,26,21,20,21,21,17,20,20,22,20,20,23,41,83,91,60,26,15,18,23,21,23,19,22,26,32,49,30,20,26,16,21,23,21,26,21,22,23,17,26,21,23,26,19,23,18,23,25,21,21,19,24,21,21,23,23,22,16,22,24,21,23,25,23,23,24,21,21,23,23,24,23,24,21,21,23,26,25,24,22,24,26,22,23,21,21,36,53,63,62,61,51,38,31,17,15,15,16,16,16,16,19,65,51,32,57,31,17,25,28,57,43,58,81,72,77,56,50,49,54,37,39,56,48,56,69,55,51,73,84,91,103,98,71,50,39,36,81,139,206,227,226,218,191,132,109,228,238,241,234,178,99,9,4,13,11,14,13,16,14,16,17,15,15,214,175,89,50,101,142,191,214,249,191,156,241,251,233,188,155,76,90,141,171,165,127,159,221,162,120,160,185,229,188,111,97,92,194,174,101,119,81,116,126,104,138,133,123,123,120,130,125,134,141,100,89,71,57,68,54,54,56,62,72,54,66,70,64,69,77,86,97,129,159,181,163,155,132,111,120,83,84,107,108,93,69,53,53,69,57,53,61,61,98,114,46,63,90,98,129,153,122,59,62,71,77,94,68,67,73,86,132,134,103,131,178,122,137,136,123,113,85,86,87,129,124,92,111,63,51,55,110,161,98,72,29,39,65,55,56,45,54,27,8,13,18,31,77,106,96,77,31,4,23,34,28,25,26,77,105,110,112,159,177,131,91,90,148,95,10,6,10,14,21,36,39,32,34,32,32,38,36,50,59,41,36,29,12,83,139,134,109,81,87,92,122,158,174,192,205,220,245,250,250,252,252,253,253,252,252,253,253,249,249,243,237,212,177,151,150,141,105,89,91,42,7,8,17,19,29,68,60,74,73,20,5,31,77,72,38,21,27,42,47,49,50,54,51,46,57,28,17,12,63,106,119,140,160,182,141,130,167,128,71,46,26,93,139,119,97,75,67,57,59,69,61,66,108,105,81,88,117,141,164,179,184,188,183,120,156,125,125,69,73,65,42,73,55,65,56,55,59,39,50,61,63,36,55,75,61,59,62,79,98,97,98,99,95,101,92,57,48,69,45,67,112,125,129,126,131,143,144,84,87,113,79,82,78,72,72,57,60,64,71,81,78,89,85,87,87,64,43,31,26,29,37,33,38,56,80,108,124,130,136,134,145,124,57,50,75,81,60,38,33,40,48,47,59,62,70,74,78,85,80,59,37,24,16,20,16,22,41,54,77,90,99,108,101,96,86,84,81,76,73,69,75,77,77,70,64,69,66,63,63,55,95,170,179,119,58,48,74,68,39,29,32,31,22,22,23,22,24,23,30,19,23,30,23,23,17,25,19,17,25,18,21,23,23,24,20,21,22,26,22,17,23,23,19,19,20,23,19,18,23,20,24,22,21,20,22,19,20,20,22,21,19,21,21,24,18,23,18,22,21,17,21,22,21,22,23,18,25,38,83,69,26,22,14,19,19,20,17,19,27,19,23,19,19,22,20,24,18,20,23,22,22,20,20,22,20,23,24,21,18,19,18,18,20,18,17,24,20,19,20,18,22,17,24,31,29,29,26,29,27,28,28,32,29,29,35,30,27,17,19,22,16,22,18,18,24,21,24,24,21,22,21,23,20,22,22,20,22,22,52,90,89,53,22,18,17,21,23,18,22,21,39,53,25,21,25,19,19,21,21,24,21,24,22,20,24,18,22,22,25,21,19,24,18,24,21,21,22,22,28,22,22,22,21,25,20,18,26,23,24,22,21,26,22,24,19,22,24,24,28,23,21,24,26,24,21,23,22,21,26,24,23,37,61,69,71,57,31,22,22,26,32,27,25,25,19,18,27,84,70,57,66,32,30,36,36,50,39,58,83,80,72,60,56,57,52,34,42,55,49,62,64,49,98,165,200,222,236,230,208,182,167,154,172,201,234,229,227,205,187,154,147,241,245,238,229,205,111,6,3,9,12,14,12,14,14,15,15,15,16,246,245,174,122,167,205,252,252,251,244,232,249,253,253,246,244,159,139,152,165,160,127,143,173,127,116,135,139,160,133,109,117,126,166,150,130,147,122,146,139,114,154,165,172,203,216,235,235,223,212,195,195,198,208,216,204,213,214,219,225,220,223,219,224,212,201,196,192,202,212,216,203,181,169,184,186,135,81,54,55,79,80,47,44,72,63,88,92,78,127,143,89,103,132,141,155,146,146,131,90,54,41,61,66,72,71,73,116,98,87,129,137,111,116,123,154,133,105,112,92,97,82,55,79,49,32,34,62,101,48,42,44,57,75,52,60,58,42,33,22,22,31,30,36,30,24,30,28,27,26,28,27,26,24,27,24,29,41,61,74,57,55,81,121,67,14,20,14,16,17,20,22,17,20,25,24,29,19,73,83,32,37,41,92,187,213,219,232,245,247,250,250,252,248,252,252,248,248,246,245,233,199,223,215,174,160,134,122,118,107,92,71,53,29,10,18,24,15,13,16,15,13,13,13,14,14,15,14,14,14,14,12,18,25,29,55,78,88,92,102,97,93,94,71,73,57,35,15,11,27,21,25,37,44,36,14,17,27,24,15,20,14,49,73,61,55,53,101,130,151,159,134,135,145,106,52,50,61,80,85,78,78,77,71,66,64,52,59,42,47,45,39,48,55,67,69,103,80,19,76,143,89,45,104,104,53,47,46,53,51,56,61,48,53,52,47,45,44,42,41,42,62,95,98,108,125,140,148,110,91,83,76,94,89,82,71,65,72,80,73,85,89,89,91,66,48,30,31,42,41,42,50,65,88,104,127,143,148,149,140,143,139,124,105,85,88,77,59,46,29,36,41,46,73,76,74,77,69,60,42,25,18,21,16,24,33,53,69,76,96,101,99,98,89,87,84,66,77,77,78,85,75,73,64,64,64,67,70,66,65,60,49,95,170,182,144,81,41,40,34,23,22,24,21,24,21,24,30,24,24,22,24,28,25,23,23,25,22,21,22,20,22,21,21,26,19,19,22,21,26,20,24,22,17,19,21,19,20,24,23,18,22,24,19,21,18,20,22,23,20,17,23,24,19,23,24,20,21,23,19,23,20,22,26,19,24,18,21,26,71,77,33,22,15,19,21,19,23,22,19,18,18,19,19,21,27,20,19,21,19,24,19,22,19,18,25,21,17,21,19,19,18,18,21,16,19,24,16,23,23,17,21,19,24,33,27,27,26,27,29,30,29,31,34,26,29,32,23,19,17,19,20,20,21,20,21,24,21,21,19,19,22,25,22,17,22,19,24,24,21,56,95,87,52,23,19,21,16,23,21,22,38,36,21,24,17,18,22,22,26,24,19,19,19,20,23,19,26,23,23,22,18,24,26,18,22,21,21,23,22,20,21,23,23,22,23,21,20,21,21,26,20,23,18,20,25,23,25,21,22,20,24,29,20,21,24,20,21,27,29,21,23,44,69,84,89,96,116,150,171,181,189,184,184,176,154,136,161,192,157,157,161,126,107,121,141,126,72,66,89,84,94,78,67,61,55,41,41,56,51,65,61,55,120,184,216,246,246,247,247,241,230,218,222,241,245,215,198,172,155,129,135,220,238,228,233,224,111,4,1,7,11,14,11,13,14,15,15,15,15,247,247,246,197,236,245,249,249,252,240,218,250,252,252,248,248,188,138,159,151,148,136,147,152,131,142,149,137,143,129,120,142,150,169,168,184,198,192,209,184,177,206,230,248,252,252,252,252,253,253,253,253,252,252,252,252,252,252,252,252,253,253,252,252,252,252,253,253,252,252,252,252,246,251,253,253,224,124,81,69,76,100,94,90,118,120,153,165,152,176,186,124,146,200,209,217,193,190,140,115,59,37,43,57,74,70,63,92,99,132,152,166,127,107,118,140,156,124,104,83,70,66,72,122,92,76,68,66,77,60,122,127,162,194,159,214,247,247,247,230,225,228,239,239,224,216,236,237,239,243,230,229,228,220,217,142,181,181,193,188,184,185,193,210,196,208,227,230,232,228,235,235,229,230,151,223,217,147,247,247,245,247,252,252,253,253,252,252,253,253,253,253,252,241,223,213,194,191,181,184,156,49,65,128,139,135,101,72,48,27,14,10,14,13,14,19,26,36,51,68,71,66,63,68,68,125,73,76,71,84,47,13,29,53,93,132,134,113,97,82,66,42,35,37,30,33,27,23,28,29,26,25,24,34,33,26,36,67,87,41,20,18,101,153,130,103,65,66,75,85,89,73,67,54,15,8,13,11,14,12,13,13,13,14,75,26,55,81,46,34,41,38,54,92,141,144,152,102,36,116,160,78,61,93,65,42,46,39,45,49,47,52,42,44,46,37,43,34,34,38,49,70,80,87,95,103,120,129,94,76,66,67,98,98,90,78,71,77,84,91,98,98,71,47,42,38,40,36,37,51,71,88,110,122,129,137,137,134,126,124,117,99,90,76,73,72,72,66,42,38,41,42,48,66,79,79,65,41,27,22,21,22,19,29,56,67,80,80,80,94,90,97,95,81,75,72,78,78,87,92,68,74,67,63,71,73,81,66,71,77,60,66,60,96,180,210,163,62,26,33,24,20,19,24,22,24,27,23,31,29,27,29,27,23,28,23,21,26,21,20,21,24,27,19,21,24,21,18,21,27,29,29,24,21,22,19,23,27,22,22,22,19,17,22,24,21,24,21,21,19,24,24,22,25,23,19,23,18,22,21,19,23,20,24,19,18,20,22,16,66,81,33,25,16,17,23,17,23,21,16,22,21,21,23,21,21,22,17,21,21,21,26,19,19,23,16,22,22,19,20,20,22,16,18,19,19,26,19,23,20,16,17,23,23,32,34,22,26,30,32,30,29,29,33,28,27,29,25,22,18,19,18,20,24,18,19,21,21,18,18,22,19,21,19,19,21,20,24,19,21,31,62,99,84,45,24,18,18,24,21,19,35,34,24,19,21,22,18,22,23,25,22,24,22,20,25,22,23,19,19,21,21,29,21,20,21,22,26,23,22,23,27,21,22,23,22,23,23,22,19,17,17,24,27,23,24,24,24,24,17,25,25,22,28,19,24,23,22,25,21,27,31,44,64,70,113,188,245,252,252,252,252,253,253,252,252,252,252,251,251,250,250,236,228,248,248,239,155,94,108,89,88,76,50,56,58,47,38,53,56,71,65,77,134,149,152,189,242,245,243,234,235,237,241,245,241,204,196,183,170,137,155,226,228,227,243,237,108,1,1,8,11,10,12,13,12,14,14,15,14,246,246,249,218,244,246,240,240,240,129,118,197,207,237,237,236,144,108,143,134,139,132,144,155,147,159,159,155,160,161,165,184,212,243,250,250,252,252,252,252,253,253,252,252,253,253,253,253,253,253,253,253,253,253,252,252,253,253,253,253,253,253,253,253,252,252,253,253,252,252,252,252,252,252,253,253,247,192,144,119,125,157,162,155,165,167,210,229,183,171,136,100,156,202,223,237,210,185,118,76,80,79,74,54,79,86,93,124,105,106,133,147,134,134,94,111,137,110,82,71,70,77,101,192,174,134,136,123,142,111,155,164,207,242,215,252,252,252,253,253,252,252,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,253,253,253,253,252,252,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,253,253,252,252,253,253,252,252,253,253,251,76,100,237,233,218,158,121,69,48,78,99,141,173,204,237,251,251,252,252,253,253,252,252,252,252,252,252,252,250,196,61,96,223,216,177,97,63,67,57,61,74,101,141,181,213,249,249,252,252,253,253,252,252,253,253,252,252,251,209,76,23,110,165,110,62,11,17,24,22,46,59,83,113,140,162,177,167,168,164,148,165,138,121,125,136,120,106,64,30,49,37,39,97,124,103,103,79,46,69,78,49,50,56,51,60,65,71,80,85,105,112,117,120,101,92,71,30,63,66,36,69,75,71,77,85,98,90,81,61,55,66,80,82,81,74,69,81,95,97,76,51,41,36,33,29,33,50,73,83,97,112,116,122,127,125,121,120,119,102,85,66,69,66,59,73,75,63,48,36,45,45,42,66,63,49,34,17,21,19,18,31,51,66,80,83,90,95,81,91,81,78,77,67,71,73,78,84,79,76,72,64,63,68,76,70,74,75,74,77,73,74,78,68,74,162,177,63,27,27,20,26,18,21,20,20,25,24,24,26,25,25,29,26,25,24,24,24,25,22,19,24,22,23,20,23,23,18,28,34,19,32,36,21,21,24,29,24,20,24,21,21,26,26,28,19,21,24,19,24,23,21,24,21,25,21,21,26,21,23,19,18,21,21,21,22,19,21,24,46,85,47,18,21,17,20,20,24,19,21,27,21,18,20,22,21,18,23,25,20,24,23,21,20,19,21,22,21,19,17,20,19,21,21,19,21,22,18,23,19,21,23,21,31,30,30,28,25,30,26,31,35,31,27,23,30,29,27,25,18,19,19,20,19,20,19,20,22,24,21,18,21,20,21,21,21,21,22,19,25,25,23,67,100,80,43,20,19,18,17,27,45,33,19,22,21,19,23,23,25,23,21,28,23,23,24,21,22,22,24,26,20,18,25,20,24,24,23,23,21,22,23,22,18,21,23,21,24,21,21,24,22,23,23,24,24,25,22,17,22,22,25,23,23,23,19,27,23,25,20,23,25,29,45,33,112,234,246,250,250,253,253,253,253,253,253,253,253,253,253,253,253,251,251,253,253,247,190,128,116,60,41,21,14,39,57,52,48,51,67,85,87,112,141,120,83,91,140,181,191,213,237,249,249,248,248,238,242,238,230,217,231,248,244,241,244,236,105,1,1,7,11,10,12,13,12,14,14,13,13,247,247,242,191,207,244,227,193,111,37,68,150,159,163,173,174,109,105,140,136,153,144,150,165,165,169,163,165,183,218,242,251,238,236,230,210,236,244,241,241,231,230,217,213,213,209,215,213,209,208,206,207,210,216,151,217,211,211,210,200,200,198,181,179,156,173,201,199,202,211,215,200,189,208,208,194,160,148,131,112,126,158,144,128,146,124,167,156,124,104,74,67,101,114,122,134,118,88,68,37,78,106,103,64,74,98,109,137,90,71,87,90,108,97,51,82,94,67,74,71,71,70,81,149,136,113,113,85,130,109,116,121,136,144,129,229,237,236,207,174,232,196,138,211,227,184,234,217,235,239,234,241,242,250,252,252,252,252,253,253,252,252,253,253,252,252,253,253,252,252,250,250,253,253,252,252,253,253,252,252,252,252,253,253,252,252,252,252,252,252,252,252,252,252,252,252,246,246,231,229,204,42,71,176,195,214,199,216,239,252,253,253,252,252,252,252,253,253,253,253,252,252,252,252,252,252,249,249,231,226,146,17,86,193,204,189,174,200,237,249,253,253,252,252,253,253,253,253,253,253,252,252,253,253,252,252,246,245,251,150,35,12,81,132,110,146,178,213,239,247,252,252,253,253,253,253,253,253,252,252,252,252,249,194,158,147,98,85,46,41,52,33,41,49,55,54,53,57,60,78,77,81,95,88,86,104,102,99,110,118,133,144,151,141,79,104,68,32,71,53,32,59,68,87,99,100,90,79,83,78,83,95,97,92,81,86,86,85,75,55,40,29,23,23,34,51,68,84,95,105,113,108,115,114,114,112,107,99,78,63,44,49,66,61,66,72,77,67,45,42,43,37,41,44,46,34,24,30,22,32,48,64,81,89,90,92,93,87,80,88,78,66,69,73,77,75,66,66,57,59,53,63,73,64,69,69,70,69,76,80,78,81,61,44,23,23,58,50,33,29,23,22,24,26,20,21,21,25,33,22,26,30,24,22,25,27,28,26,23,23,21,19,19,25,22,22,22,29,33,23,23,26,20,20,26,24,23,24,22,20,19,24,26,28,20,21,26,18,23,23,31,19,25,25,20,25,19,26,23,21,22,17,20,23,21,23,20,24,27,37,84,62,26,21,21,24,21,22,24,20,21,26,21,24,23,24,22,21,27,22,21,26,27,24,23,21,19,17,19,24,16,16,22,21,17,20,22,19,19,20,17,17,22,24,34,32,29,31,27,30,30,32,32,27,29,31,29,28,24,22,19,21,19,20,20,20,18,19,21,23,22,20,20,19,23,21,19,24,25,22,22,19,36,76,100,76,34,20,19,17,29,45,27,19,24,20,21,19,25,19,23,24,19,24,21,24,24,23,21,24,20,22,23,22,27,25,25,17,23,25,17,23,21,22,21,20,21,18,22,23,21,22,19,24,26,21,23,24,21,21,27,23,23,27,24,21,23,25,23,24,23,29,25,30,19,111,217,230,242,229,218,217,223,225,219,226,237,235,222,240,246,224,227,249,252,252,226,179,156,112,59,53,34,16,39,57,63,55,69,91,102,104,112,125,98,66,50,79,152,174,221,243,251,251,251,251,253,253,252,252,253,253,250,250,244,244,236,105,2,1,7,11,12,11,12,12,14,14,15,14,244,244,243,147,100,125,113,133,87,17,79,170,224,237,215,200,152,134,151,160,191,172,153,165,163,168,178,199,223,252,246,246,219,150,101,35,65,151,122,99,71,35,21,7,14,12,14,21,16,12,15,15,15,23,31,29,36,39,39,26,23,29,36,50,23,12,14,19,45,79,81,29,6,33,71,64,65,78,85,87,83,77,54,53,74,74,71,55,30,51,52,45,53,41,43,45,31,45,38,49,68,78,93,81,76,81,102,108,89,88,98,94,80,76,53,66,71,79,86,76,70,56,50,65,46,45,37,37,78,61,69,60,48,51,37,83,81,30,12,19,40,23,24,68,89,112,140,147,169,165,172,203,208,235,250,250,249,249,248,248,242,238,222,208,199,192,184,183,179,173,118,67,107,162,178,193,174,136,169,208,215,209,203,202,189,176,165,151,137,131,128,117,110,103,85,75,64,66,66,81,82,15,30,74,117,196,228,243,245,245,246,246,245,245,241,238,228,216,200,189,182,169,152,131,116,105,100,70,46,58,48,4,44,177,236,250,251,251,252,252,251,251,250,250,248,248,244,244,234,227,214,191,167,160,136,105,69,38,90,63,15,1,91,238,253,253,252,252,252,252,252,252,252,252,253,253,251,251,244,242,229,227,173,81,48,46,61,71,49,32,40,37,90,125,88,51,47,63,71,85,84,82,89,76,77,77,56,53,48,46,51,47,52,53,50,63,61,46,71,59,44,86,129,143,131,122,104,84,84,80,110,128,130,116,94,79,54,42,32,23,27,24,35,51,66,84,94,108,120,112,108,112,111,104,100,96,77,69,57,45,55,51,74,77,63,74,74,68,49,44,46,39,33,39,51,33,44,64,66,82,95,104,108,101,97,99,84,87,79,79,74,77,81,84,81,61,57,52,57,53,52,58,63,66,66,70,75,73,81,81,64,47,29,18,18,16,18,33,42,30,16,26,25,27,26,20,23,24,26,23,26,23,24,22,21,36,39,24,24,27,21,25,22,24,25,19,31,31,19,20,24,22,21,24,29,28,21,19,26,19,24,23,22,23,24,20,20,29,24,23,23,28,23,17,25,23,21,24,21,25,20,21,28,19,22,22,37,42,20,27,85,76,32,19,28,41,23,25,31,29,24,21,31,28,23,19,23,27,22,22,24,29,24,27,24,17,21,24,20,17,19,18,18,18,19,20,21,16,22,21,21,18,19,32,31,31,31,27,27,31,30,28,27,33,31,25,28,25,23,18,17,23,19,20,22,18,22,21,21,23,19,20,21,21,25,17,22,26,21,23,21,26,21,34,84,100,76,35,19,19,27,45,24,19,22,20,22,22,24,27,19,21,24,21,22,23,24,24,25,18,22,26,22,26,22,23,27,24,19,26,22,19,23,25,26,20,23,24,19,22,20,22,24,21,23,24,23,21,24,19,24,23,28,21,23,27,24,21,20,25,24,19,24,29,22,134,204,190,195,180,178,188,196,199,202,212,221,214,203,212,204,189,195,189,203,193,160,172,158,129,135,158,153,118,82,81,73,63,76,94,93,95,100,103,114,101,67,126,218,210,235,247,250,250,251,251,252,252,252,252,253,253,248,230,221,244,237,107,4,1,6,10,12,12,12,13,14,14,15,15,183,200,197,130,113,121,127,142,97,63,117,173,214,250,250,250,237,184,188,201,235,191,133,148,146,163,197,225,232,216,224,221,144,103,89,22,23,59,32,15,16,14,15,13,14,14,14,15,14,14,15,15,25,14,15,16,16,17,23,22,24,34,52,96,42,9,19,11,39,74,66,25,7,34,68,56,61,88,104,92,63,54,29,39,67,70,63,39,42,47,51,54,50,46,54,59,50,53,53,62,66,73,79,73,77,71,89,114,93,79,80,77,81,71,63,79,72,73,83,60,45,30,39,61,26,19,18,27,74,45,41,42,31,33,33,61,34,17,22,38,89,54,48,104,141,199,227,238,249,247,245,245,239,238,226,213,204,188,175,157,139,116,103,92,91,93,105,115,122,110,39,1,24,85,102,94,46,2,32,44,45,45,38,35,27,21,16,22,19,15,15,14,15,16,21,44,59,79,88,109,122,34,23,15,36,113,111,103,87,84,86,78,70,59,46,33,34,27,27,24,12,15,15,24,43,46,54,21,46,105,61,8,46,134,173,158,153,154,139,128,117,122,86,71,62,44,37,32,24,14,12,15,15,16,29,48,21,18,96,65,34,11,74,203,217,154,176,134,187,176,171,149,129,131,119,113,126,70,51,38,19,11,12,25,36,10,81,132,63,35,48,43,110,150,100,49,29,37,47,47,47,47,43,46,43,48,45,50,47,51,72,76,92,101,97,137,129,84,140,83,49,62,116,111,86,67,58,40,46,59,80,104,90,65,40,31,24,19,22,22,34,48,66,82,93,105,114,120,114,115,113,107,105,95,85,69,60,61,56,60,60,60,82,78,74,79,81,75,45,43,50,40,33,44,52,29,69,107,114,114,108,116,105,93,94,94,80,74,78,78,76,84,77,66,64,53,55,57,54,59,58,60,67,69,71,67,75,87,75,53,34,19,18,20,17,16,16,25,39,37,30,21,23,25,26,19,21,29,22,25,28,19,24,26,22,24,22,24,22,21,27,23,23,24,25,33,27,24,20,22,25,21,29,24,32,45,32,21,22,25,27,21,22,24,24,29,22,22,29,24,23,24,27,22,24,24,21,25,24,21,25,31,33,23,19,22,27,25,24,21,68,87,38,23,30,29,19,25,37,21,19,26,29,25,16,23,24,16,21,24,23,34,28,22,25,24,22,23,24,20,17,19,22,17,19,24,18,21,23,16,17,21,29,30,29,29,24,30,29,31,31,25,30,32,24,31,27,26,21,19,24,19,18,21,26,22,17,22,22,23,18,21,24,22,25,22,27,24,19,21,30,23,19,25,43,88,98,70,33,14,33,40,29,23,17,21,22,21,26,23,24,24,22,21,22,24,23,22,22,23,21,19,24,23,22,27,25,21,26,22,22,25,22,24,21,19,23,21,25,20,23,19,24,30,21,24,18,28,29,21,25,19,26,25,21,29,21,24,21,22,27,22,30,22,32,151,176,153,160,154,162,165,173,177,174,181,193,189,184,189,187,173,157,135,132,141,130,144,122,89,126,185,205,151,87,64,42,38,57,64,56,52,61,89,111,104,103,200,249,232,231,246,249,249,250,250,249,249,251,251,253,253,237,183,191,241,238,107,3,2,7,11,12,12,14,12,13,14,14,14,230,243,250,250,250,250,249,249,237,201,227,243,252,252,252,252,251,251,246,245,240,180,125,143,144,169,222,250,223,181,166,198,196,196,193,132,125,139,105,112,129,148,122,80,96,77,94,116,102,93,92,101,88,93,96,109,114,107,121,112,128,141,149,175,97,78,99,84,95,109,108,58,45,85,98,68,88,110,120,117,90,78,57,60,72,84,77,62,67,71,67,64,51,59,76,98,87,74,55,57,77,69,65,60,56,51,69,87,81,57,41,35,24,33,31,47,48,46,57,39,35,24,46,77,54,64,35,59,129,103,127,141,149,153,130,170,165,125,118,134,183,141,134,198,229,245,242,234,220,178,169,155,118,129,131,126,116,106,104,94,85,82,79,78,76,65,58,56,59,61,28,4,24,60,49,33,19,9,15,14,16,14,14,22,38,57,71,83,87,90,87,88,88,92,94,99,97,98,95,103,112,47,15,14,17,42,23,14,14,12,15,14,15,15,17,34,53,72,87,81,76,84,101,109,103,102,110,67,73,131,97,41,16,39,32,10,17,14,15,15,15,15,14,16,18,31,36,43,48,55,67,48,8,46,100,117,72,33,107,102,57,33,14,54,41,34,22,11,21,12,16,16,14,14,14,15,14,16,13,28,30,20,19,53,59,23,81,108,62,48,49,35,53,61,51,49,39,43,50,56,58,66,80,84,99,109,120,141,144,155,155,148,146,136,132,122,116,96,75,46,37,31,33,52,49,48,31,30,46,46,43,41,38,27,26,23,20,25,38,55,69,79,97,99,105,116,108,114,114,112,104,98,88,68,69,59,50,65,68,65,65,72,93,86,79,101,86,51,38,34,38,41,34,43,50,30,78,106,106,113,98,96,96,91,87,84,68,83,82,87,81,69,73,59,57,46,53,57,62,64,66,69,67,71,70,82,74,50,37,21,17,22,16,18,21,17,16,33,39,38,33,22,24,21,23,22,22,21,27,24,21,22,28,24,24,23,22,29,24,24,19,24,27,23,28,28,24,28,25,27,24,23,29,22,36,41,24,22,21,29,24,21,24,25,24,24,22,23,26,24,27,24,22,24,26,23,25,23,28,28,30,33,30,33,23,22,25,20,27,20,46,92,50,28,22,15,24,27,22,23,19,24,24,21,24,23,21,21,22,31,32,26,30,29,25,24,24,32,27,17,20,19,19,19,17,24,19,19,22,24,17,24,26,27,32,30,31,31,25,31,33,32,28,27,35,28,23,26,21,23,20,19,24,19,19,21,23,20,22,23,18,19,17,19,21,20,16,23,22,19,21,23,22,19,24,44,89,97,66,27,33,39,21,20,20,18,23,24,23,25,24,24,25,22,23,23,23,23,22,23,20,22,20,23,24,26,24,23,24,17,23,22,22,25,17,21,25,25,20,24,22,21,24,20,24,22,22,27,21,22,26,21,23,30,26,23,26,22,22,23,24,23,29,22,41,116,118,120,129,122,130,133,133,128,120,121,128,137,138,142,145,134,128,106,113,117,99,102,61,27,61,107,131,89,46,73,66,25,21,24,16,16,26,49,66,71,113,226,248,210,228,245,248,248,250,250,249,249,251,251,251,251,214,173,184,217,231,110,3,2,8,12,12,13,14,13,14,15,15,15,252,252,252,252,252,252,250,250,249,241,251,251,252,252,254,254,251,251,248,248,240,163,115,141,146,185,248,250,246,179,137,219,252,252,248,215,202,243,196,190,235,239,173,139,174,165,185,234,242,225,217,225,160,226,246,253,246,234,249,249,252,252,251,251,167,137,179,156,134,128,119,59,39,91,107,83,96,125,142,137,126,107,66,57,63,70,74,78,75,62,55,59,65,54,71,90,76,55,51,51,65,66,53,42,41,38,61,70,63,61,51,38,25,35,32,35,55,85,108,113,130,99,114,156,135,138,109,145,227,196,207,206,214,200,156,205,178,122,97,102,105,85,126,162,170,177,155,131,122,39,25,48,32,84,83,82,92,82,90,89,97,91,83,78,63,42,21,12,14,15,22,17,19,66,81,105,56,16,97,123,126,125,128,142,144,157,158,158,156,149,144,141,120,105,90,72,47,35,33,25,52,35,39,26,37,112,119,125,115,120,124,125,127,128,145,151,171,171,171,155,139,125,111,95,70,61,67,39,64,81,45,47,29,58,66,77,91,86,85,95,95,111,122,127,145,151,151,136,133,117,110,86,36,66,78,78,35,41,100,60,42,37,17,57,66,78,59,46,50,38,14,19,23,21,19,26,39,35,34,43,44,41,45,51,49,36,54,50,44,49,45,41,44,50,67,88,103,128,134,147,153,155,154,150,150,147,141,133,91,100,81,56,45,36,29,28,27,27,23,21,41,33,44,76,61,47,36,30,41,36,36,35,21,21,20,29,38,58,76,89,101,103,114,105,107,107,107,109,104,98,80,73,60,55,58,56,62,63,66,67,78,94,105,105,88,65,42,24,21,21,23,40,38,65,63,66,100,106,104,102,88,92,84,75,71,74,79,84,84,77,65,55,55,56,53,57,57,55,64,65,67,75,70,68,65,52,40,22,18,19,17,17,19,21,19,26,29,39,38,34,33,28,39,25,21,27,24,24,25,24,21,25,22,24,23,22,31,24,22,28,24,24,27,27,26,24,21,26,21,24,27,20,24,20,24,24,17,24,24,23,25,23,22,23,25,24,24,24,23,23,28,24,23,29,22,23,24,25,23,29,27,24,38,33,23,25,22,23,24,25,41,91,66,28,18,20,26,21,27,20,23,25,25,26,23,29,19,22,29,27,29,25,31,28,21,24,32,28,19,22,18,16,19,19,18,21,21,19,18,20,16,21,24,32,29,30,30,31,33,29,30,34,30,29,32,28,24,26,19,21,24,18,23,19,21,21,19,19,19,20,22,22,18,24,21,16,23,24,22,19,21,19,23,25,18,23,50,94,100,60,45,33,20,21,19,23,19,21,23,21,19,22,21,24,22,21,24,19,22,26,24,24,25,19,25,24,22,24,22,29,22,20,22,19,21,19,22,21,25,21,24,24,22,24,22,27,20,21,26,21,23,22,21,26,24,20,23,23,25,19,25,25,24,21,48,119,122,135,130,122,137,134,124,122,125,125,125,122,124,130,136,131,132,128,125,123,112,105,96,89,88,105,114,91,108,138,116,79,63,80,94,94,92,88,91,98,146,235,243,218,248,248,251,251,251,251,251,251,251,251,249,244,216,202,202,199,200,112,6,1,7,11,14,12,15,13,15,15,15,15,148,157,179,189,200,175,163,186,135,94,138,173,184,159,150,192,217,229,245,240,233,153,115,133,118,133,160,210,199,131,95,127,195,210,218,117,112,162,110,104,145,150,95,94,151,118,152,214,217,188,169,199,180,194,210,225,206,170,227,234,245,245,246,246,120,108,157,113,118,126,122,51,21,92,108,73,67,90,118,128,119,100,66,55,44,53,67,82,87,58,51,57,50,51,44,40,23,39,37,39,50,47,54,48,45,36,89,103,117,151,151,174,143,128,123,104,139,192,217,229,245,175,158,169,127,170,120,124,219,161,113,107,130,120,102,113,63,51,47,14,29,14,61,111,71,90,97,105,100,27,26,26,31,51,29,29,25,27,32,51,68,75,88,96,106,110,113,114,117,129,103,38,47,142,182,199,127,56,130,178,171,144,126,109,94,92,84,83,80,79,81,91,87,93,98,94,96,90,100,100,125,96,53,39,79,180,201,200,190,179,171,151,132,125,123,125,119,101,98,85,71,63,54,53,56,62,75,47,61,96,63,44,25,96,175,179,183,164,155,156,141,135,130,128,130,118,105,88,64,53,57,45,22,51,79,92,56,54,143,136,66,26,27,155,186,173,131,101,125,83,80,63,68,66,45,46,38,39,49,55,44,50,52,53,55,86,122,112,110,123,116,61,78,128,153,159,150,154,131,119,111,86,62,47,44,42,29,29,21,24,22,17,22,16,17,17,14,16,24,24,44,44,53,69,57,57,29,30,36,24,28,24,21,28,35,52,69,83,111,115,111,107,108,108,107,107,99,103,90,70,60,61,58,64,66,57,69,72,70,78,87,95,109,78,44,28,17,24,21,18,40,55,66,94,104,101,112,106,103,103,79,79,75,70,78,78,75,74,69,63,57,50,56,58,60,53,57,60,65,66,68,74,67,57,37,20,20,21,15,17,17,19,23,27,27,29,40,47,37,37,39,76,94,37,23,23,20,26,22,21,26,24,24,24,27,27,21,22,24,22,21,31,28,24,25,21,21,31,24,22,26,20,24,20,22,23,24,26,22,29,29,26,24,25,25,21,26,26,21,27,24,28,27,25,22,23,22,23,33,24,23,24,24,21,19,22,27,24,28,29,27,79,80,33,23,18,24,22,25,23,21,24,21,26,31,20,22,21,24,25,24,25,26,23,24,27,24,22,21,22,19,21,19,20,21,23,22,20,21,15,21,23,27,32,30,35,27,24,28,23,31,33,34,32,30,32,31,29,19,20,21,17,22,26,23,23,21,19,22,22,21,20,21,18,20,24,22,19,20,21,22,24,23,19,22,21,24,61,109,103,77,37,18,27,17,21,20,21,21,19,23,19,24,22,22,25,24,21,22,23,24,24,27,23,19,23,23,27,23,23,24,24,24,24,23,22,22,18,25,22,24,27,21,20,24,23,19,24,22,21,22,23,27,22,25,26,22,24,23,23,25,19,27,22,69,125,111,134,130,137,160,147,140,157,173,181,166,160,169,171,184,182,195,182,181,189,179,205,219,202,202,211,207,196,198,204,189,176,186,204,201,201,200,193,171,170,198,230,246,246,251,251,251,251,250,250,249,249,251,251,247,240,242,242,224,196,188,109,7,1,7,10,12,11,12,12,14,14,14,14,17,17,27,45,46,21,18,33,26,33,76,88,83,75,92,146,181,193,197,200,218,150,118,129,115,93,26,14,43,64,38,9,18,20,16,13,19,29,18,32,46,45,25,32,38,24,41,52,48,36,28,33,36,32,33,54,50,25,32,47,86,96,115,118,41,49,52,56,101,122,166,89,55,123,118,85,55,57,59,60,73,79,89,105,54,57,62,82,94,67,48,50,59,70,83,49,42,71,96,53,61,76,105,134,132,98,158,189,191,244,246,242,195,165,147,153,195,237,240,217,226,188,188,148,73,103,66,80,155,118,73,60,76,72,77,91,55,53,51,37,25,17,48,54,27,34,49,64,66,14,31,38,22,66,53,80,44,51,81,95,110,128,143,158,166,171,129,173,169,168,127,31,36,128,147,148,69,39,107,103,86,71,61,54,39,37,33,36,53,74,97,114,131,143,139,148,155,157,157,146,146,92,36,25,73,141,131,119,87,76,61,43,38,26,36,26,20,19,31,54,78,93,113,126,134,151,148,78,98,137,73,52,19,91,124,139,118,85,66,47,27,24,19,15,16,16,24,34,66,83,107,82,44,127,165,172,86,77,174,164,96,38,23,102,130,118,63,32,44,46,51,44,41,46,58,69,73,107,134,167,147,142,145,160,164,169,177,161,158,187,167,77,44,63,62,51,42,39,27,29,26,19,22,16,18,21,23,16,21,21,21,23,18,22,19,19,29,29,39,38,39,38,59,98,85,57,27,18,25,20,21,23,33,50,54,61,64,73,91,99,100,100,108,97,103,99,92,77,59,54,57,67,67,66,71,69,72,79,79,86,88,75,54,38,25,21,20,16,30,55,73,86,100,101,101,104,105,93,98,75,66,80,79,90,83,74,61,59,61,55,54,50,55,60,63,59,56,62,67,66,66,56,38,27,19,17,18,18,19,20,21,25,27,34,44,44,47,39,41,41,38,64,132,71,29,18,21,20,25,23,20,25,20,21,31,25,39,37,24,24,24,29,23,22,24,22,27,26,23,24,22,17,25,24,22,24,28,24,29,29,24,24,24,27,25,26,21,22,28,24,22,25,27,19,24,23,22,28,24,25,24,18,23,22,26,29,21,22,22,27,23,67,85,40,22,16,23,23,23,23,27,21,25,27,22,21,22,22,24,27,20,20,24,26,26,24,23,18,19,23,21,19,19,19,21,20,21,22,22,21,17,22,23,29,36,33,30,24,31,32,31,29,30,30,29,28,33,31,20,19,23,21,18,21,20,21,20,23,21,18,25,21,19,22,21,18,21,19,20,23,19,17,19,24,27,22,18,30,62,110,99,53,26,18,22,22,23,21,19,24,22,19,26,24,18,24,19,24,21,22,23,22,23,24,25,21,24,21,22,20,24,25,22,23,21,23,17,20,27,18,21,22,21,26,23,25,19,24,22,23,26,21,27,29,26,24,26,24,23,18,20,24,24,22,52,76,56,66,82,105,118,105,100,128,148,154,145,141,143,162,181,190,201,192,200,205,212,244,250,248,245,250,239,217,204,209,213,208,218,229,210,198,210,204,187,178,186,208,217,215,226,221,207,206,204,203,202,203,212,210,217,220,242,238,208,190,187,112,5,2,9,10,13,12,14,12,14,15,14,14,25,42,77,82,88,47,46,81,47,93,155,165,177,179,183,174,148,118,126,169,213,154,130,139,129,101,25,16,43,74,45,32,36,21,19,16,62,63,43,63,70,66,46,47,39,24,50,48,29,17,28,38,29,44,35,57,56,33,34,16,27,42,77,96,58,107,112,89,155,208,246,198,110,177,211,198,134,68,60,54,63,99,149,163,110,93,83,91,94,57,61,75,113,158,195,169,132,174,174,164,112,89,145,217,232,155,189,177,143,185,179,168,101,71,86,110,155,182,162,146,198,240,252,178,53,105,105,138,229,202,119,93,85,66,80,86,73,74,76,57,45,39,44,70,60,71,60,78,78,29,74,60,81,141,135,149,143,147,146,141,131,124,122,122,117,116,111,98,91,79,47,18,23,37,35,45,27,25,73,87,100,108,120,124,125,128,126,132,129,135,139,145,137,143,139,124,121,97,93,75,64,41,24,32,32,37,26,32,42,57,73,89,104,107,111,116,116,116,127,127,146,166,171,171,158,152,131,61,63,72,47,60,29,35,54,28,32,35,46,62,79,92,97,100,101,109,120,136,158,171,181,142,73,106,134,117,51,19,63,71,50,46,17,40,60,66,71,73,109,120,132,118,127,130,142,166,163,166,182,191,168,150,146,152,125,93,71,47,53,71,53,34,35,22,19,27,16,19,21,17,21,21,23,20,17,24,20,21,22,19,24,26,29,35,35,36,34,32,28,25,29,22,57,91,66,37,17,15,15,22,33,44,57,59,59,63,62,64,68,76,86,87,88,93,85,75,58,58,57,55,57,61,66,72,71,77,80,83,88,77,53,30,24,19,24,21,24,42,62,84,96,93,98,97,91,93,86,79,76,65,65,85,80,69,71,54,55,53,58,56,53,59,65,62,61,61,62,67,63,55,39,26,16,16,22,18,15,19,30,28,27,38,40,42,48,44,40,44,37,34,28,36,100,112,64,23,14,21,22,22,22,24,24,22,25,29,35,32,19,32,29,20,24,22,24,27,22,20,26,24,23,22,22,23,27,25,23,24,22,30,23,23,31,29,22,23,27,27,26,27,24,25,25,20,26,24,24,27,23,23,22,23,24,25,25,26,23,24,21,20,19,53,93,53,23,16,22,26,24,22,24,26,21,27,20,19,25,24,23,21,22,22,29,29,22,21,21,22,19,24,25,18,17,21,21,21,21,20,20,20,19,21,28,30,27,34,28,32,31,27,32,25,30,30,27,34,29,28,23,18,22,18,17,22,19,24,19,21,23,20,23,20,20,19,20,20,21,25,19,23,20,17,20,22,23,21,20,25,28,65,106,83,50,22,15,23,22,21,26,20,19,24,24,24,21,22,21,21,22,26,23,23,22,20,24,26,21,23,24,21,22,29,21,24,24,21,23,23,25,21,23,21,23,24,21,23,25,23,21,27,24,21,24,23,23,25,28,22,22,17,23,28,27,29,53,89,58,42,36,38,55,47,42,45,61,68,59,69,72,88,112,113,129,119,137,150,149,189,203,186,189,182,155,138,141,165,179,173,169,162,132,130,133,134,122,109,113,118,130,118,116,112,99,104,103,107,115,125,135,143,152,158,178,172,155,145,157,110,12,3,11,11,15,13,16,14,15,16,16,15,94,118,155,145,141,87,95,131,94,162,231,245,252,252,216,164,128,119,141,184,219,160,145,155,146,118,31,29,92,136,71,47,65,45,46,68,97,82,78,95,97,90,66,73,66,61,87,80,58,48,61,66,70,81,81,100,99,91,76,48,71,77,127,139,95,144,117,114,200,242,249,174,83,200,241,228,120,90,106,120,134,134,167,155,112,137,134,130,127,64,48,94,147,202,249,193,177,240,233,211,111,49,107,171,204,138,104,71,38,94,45,55,31,38,59,69,68,69,71,94,155,161,186,131,71,110,93,109,186,163,86,67,75,92,121,124,119,123,129,125,65,13,54,136,153,146,145,147,139,72,93,89,103,173,152,115,130,105,94,66,44,27,18,21,27,36,104,73,85,108,107,39,17,87,134,142,77,51,132,175,188,184,182,121,170,166,151,130,109,82,67,55,49,55,54,63,67,64,80,95,128,118,58,20,56,112,138,150,151,168,182,190,199,189,186,168,154,136,110,91,87,88,88,88,80,82,68,33,47,65,47,51,36,45,87,114,140,146,160,172,180,190,183,178,165,141,125,109,111,106,110,69,29,59,54,62,29,35,69,73,101,62,39,53,98,164,170,176,192,182,178,162,156,148,133,104,103,83,97,47,36,33,27,32,28,21,24,20,22,17,16,37,35,23,16,20,21,18,22,23,22,21,24,24,24,27,23,23,27,26,31,40,33,32,30,20,23,24,26,31,24,22,33,42,31,27,21,16,23,45,64,73,71,66,66,64,62,66,68,75,78,80,90,80,66,48,48,66,61,60,61,57,69,71,79,83,87,78,49,35,27,22,18,20,22,43,62,67,82,90,96,94,95,87,88,85,77,71,79,71,75,75,64,60,54,57,55,51,54,54,62,67,58,61,69,66,65,53,40,31,19,19,17,17,21,21,22,24,32,41,40,38,42,44,39,45,44,44,41,30,28,19,34,96,102,48,19,15,24,19,24,27,20,26,21,22,28,21,26,29,23,21,25,22,22,27,24,19,19,26,22,21,28,25,22,29,22,22,31,29,23,23,28,27,22,26,28,22,27,25,26,25,20,22,23,29,23,22,21,25,27,22,33,22,20,29,23,25,21,22,21,35,85,65,27,19,16,22,23,24,24,24,24,23,21,21,24,25,22,21,24,31,30,20,26,22,20,23,21,25,18,24,23,17,21,21,26,24,19,18,22,23,30,27,31,36,31,28,27,32,31,34,29,29,30,29,32,25,20,21,19,20,23,20,22,23,17,21,24,21,17,19,24,21,20,23,24,25,21,20,22,21,19,22,24,23,19,25,19,43,86,98,79,46,23,17,20,21,23,20,24,24,21,21,21,24,19,22,24,21,24,19,22,20,24,24,22,20,19,26,18,21,24,23,26,21,18,22,24,20,22,22,26,28,24,26,22,22,22,22,25,23,26,24,22,27,21,24,26,22,26,32,34,29,87,114,61,33,27,24,47,49,42,39,47,55,44,59,52,50,67,69,79,55,82,86,73,100,101,98,105,91,84,81,98,143,155,137,119,112,90,83,86,66,66,68,61,60,61,53,52,53,47,46,36,44,63,71,72,65,79,86,100,98,90,94,110,97,21,5,14,12,17,15,17,16,16,17,17,16,98,132,165,145,139,88,87,109,81,162,241,248,252,252,247,239,234,230,232,236,235,158,145,153,151,123,33,22,88,141,78,72,85,64,57,66,92,89,108,122,111,94,63,49,53,69,92,79,60,58,56,60,67,68,77,87,75,81,71,68,79,74,79,77,78,96,79,86,153,204,224,130,72,145,149,126,113,120,153,181,176,169,123,95,105,146,149,139,128,73,54,62,95,125,159,132,134,186,175,171,114,64,76,113,136,88,61,19,17,31,20,55,47,46,69,69,47,21,14,37,34,13,16,33,55,36,24,61,57,48,51,47,79,131,174,181,186,198,200,183,61,8,22,83,99,86,87,82,72,31,64,48,64,92,58,65,61,69,62,55,48,45,50,59,84,106,122,139,145,165,159,80,28,102,177,160,66,33,99,115,113,103,94,88,72,66,57,53,48,41,33,29,27,35,61,87,110,128,153,163,186,161,77,32,81,156,152,142,130,134,132,127,120,92,78,65,56,43,31,19,14,15,15,36,63,105,105,48,101,112,57,57,21,74,169,185,174,153,157,151,134,124,106,89,62,52,40,23,21,11,34,46,68,121,152,171,135,164,193,178,156,118,61,73,132,132,133,100,80,69,49,38,31,34,27,25,21,26,18,17,22,21,19,18,21,17,20,19,18,21,22,42,42,34,23,21,24,21,21,23,27,22,22,25,21,25,34,31,38,39,33,31,32,37,39,46,35,21,27,35,34,33,34,23,24,33,33,49,71,89,94,84,76,70,65,65,63,66,70,71,75,78,78,76,60,63,61,61,66,61,65,64,71,76,81,73,49,35,28,20,24,22,25,41,59,76,80,89,92,92,91,87,92,84,79,80,75,76,84,82,71,64,60,57,61,55,51,54,58,61,64,66,60,68,64,53,42,27,22,15,18,21,19,22,21,27,36,39,43,45,44,43,40,47,46,40,41,42,43,37,29,26,24,40,97,95,36,18,16,25,23,24,27,18,25,24,25,26,29,27,21,21,25,23,23,22,21,24,24,24,24,21,30,28,23,24,24,29,24,27,25,22,39,30,26,27,25,30,23,24,19,23,26,22,26,24,30,21,24,25,21,25,25,24,23,23,21,23,24,26,26,27,78,75,33,22,18,19,25,29,23,17,26,28,25,30,22,29,22,18,32,21,22,28,21,23,22,27,22,18,27,31,27,19,24,28,23,20,22,17,20,28,24,30,32,31,31,29,30,28,29,30,27,27,33,31,25,30,21,23,22,21,24,19,19,22,21,21,23,19,24,23,21,20,21,23,21,19,21,21,22,21,19,21,23,18,23,22,22,47,51,79,105,81,44,19,18,27,19,19,24,21,23,25,29,20,25,29,22,23,21,24,24,23,24,22,21,26,22,23,24,22,24,24,21,19,22,24,21,21,26,23,24,21,23,25,24,28,21,25,24,21,29,25,26,23,23,24,22,24,29,55,40,53,150,171,132,107,61,31,35,40,45,39,35,46,48,67,54,53,70,65,69,56,66,62,50,60,48,59,100,91,68,62,93,136,129,113,103,107,91,88,81,61,63,61,62,54,53,50,60,64,46,41,35,50,55,53,46,35,45,42,49,48,53,57,60,63,25,9,16,15,17,15,16,17,16,17,16,17,83,107,139,137,145,105,92,108,80,154,233,248,253,253,253,253,252,252,243,243,230,148,139,150,151,130,71,45,90,133,91,113,119,84,78,95,124,114,124,125,109,101,59,41,39,65,101,99,67,50,61,59,57,62,83,103,89,95,82,87,120,81,50,36,75,113,72,95,166,221,247,119,71,169,131,93,119,166,138,131,164,206,192,98,61,86,93,101,86,62,74,70,51,48,62,52,70,84,91,141,129,100,90,87,91,67,55,36,46,53,42,68,59,59,66,71,71,53,39,39,63,57,45,53,78,74,52,77,96,68,83,85,66,56,89,120,113,113,136,92,31,36,70,108,87,64,69,71,71,24,42,42,61,105,102,130,136,146,153,149,162,161,160,160,152,143,129,113,106,107,104,44,6,56,89,61,35,36,38,46,48,48,45,49,53,115,93,104,120,123,135,134,130,134,129,136,134,131,138,127,133,105,51,32,51,47,31,30,35,33,36,33,21,22,39,57,77,88,95,103,110,87,102,110,120,139,122,84,102,103,59,47,38,31,59,66,53,47,51,50,37,36,45,66,107,135,140,141,137,129,141,152,165,184,188,192,157,147,132,112,103,55,47,38,74,34,28,29,26,17,23,20,19,19,15,25,21,21,22,17,22,21,19,23,26,25,23,23,23,26,27,40,39,33,35,29,40,51,57,62,66,67,46,38,37,42,47,49,48,50,61,71,85,98,116,105,67,20,23,39,42,49,44,37,29,40,59,66,76,83,82,79,69,64,67,60,64,67,71,65,59,63,51,56,65,57,61,60,62,67,62,69,73,69,54,37,30,25,19,19,22,40,62,73,79,94,98,94,98,88,86,84,84,82,81,79,76,75,80,68,56,57,57,57,56,57,56,63,66,59,64,66,64,53,39,26,21,22,20,17,19,22,24,32,35,45,42,46,49,45,42,40,43,43,41,45,38,42,47,44,41,33,35,32,63,106,81,33,17,23,23,29,24,20,24,24,30,32,23,21,25,26,21,30,22,20,22,21,24,24,27,26,24,27,24,24,28,22,27,27,23,26,23,27,24,24,27,24,24,20,21,27,25,19,21,24,19,23,23,24,24,30,30,27,23,21,25,19,30,28,23,25,61,89,45,23,19,21,26,21,21,22,21,31,35,23,22,28,27,26,21,27,34,29,22,20,22,21,22,18,27,29,18,24,25,29,25,19,22,17,21,25,31,28,27,28,27,34,30,29,30,31,30,33,32,31,29,23,21,22,21,19,22,20,25,19,23,24,15,21,22,20,24,22,19,21,17,24,19,20,23,18,23,18,18,19,27,24,24,49,41,37,89,108,79,43,20,16,20,22,19,26,22,23,24,21,29,24,22,21,24,24,21,22,22,21,23,25,22,23,26,22,23,21,20,23,23,23,22,29,22,19,21,22,23,22,24,20,20,27,25,25,26,24,24,22,23,21,26,24,48,53,72,175,218,218,215,193,167,117,62,22,30,35,33,36,43,44,50,56,59,59,65,49,62,62,65,71,37,75,125,101,97,75,85,100,85,92,86,82,68,81,85,56,70,71,74,67,55,61,65,66,56,55,58,60,53,59,71,57,49,46,43,46,51,48,50,54,23,10,15,14,15,15,16,16,16,17,16,16,147,137,148,149,181,171,166,172,137,181,234,246,253,253,252,252,252,252,237,232,208,127,133,148,157,171,156,108,127,156,101,113,97,71,61,74,101,80,81,77,86,96,64,45,38,56,91,96,54,33,45,52,51,48,76,84,75,95,67,101,115,60,41,18,73,98,64,84,154,206,177,69,66,160,105,61,109,120,80,71,100,188,154,60,48,38,53,63,54,59,87,82,74,52,46,43,27,26,26,86,98,59,66,61,64,79,74,39,50,56,39,64,57,72,90,72,76,85,101,104,103,103,103,112,95,89,73,86,90,88,109,63,34,18,21,46,24,12,15,16,61,176,245,251,222,158,161,190,202,113,95,69,120,194,180,190,169,172,164,152,151,139,118,92,75,53,27,11,9,16,33,47,30,34,100,119,63,63,148,177,177,177,174,174,171,179,179,180,184,174,169,154,142,125,105,77,56,41,35,32,48,50,38,35,31,84,112,136,145,152,163,168,166,155,169,184,194,197,191,183,174,152,129,106,77,66,63,34,63,55,41,58,39,42,45,81,118,120,122,119,121,127,144,170,183,189,189,179,177,146,135,117,91,80,60,62,34,31,22,19,29,14,33,39,29,17,24,24,23,22,21,21,18,21,23,26,22,24,24,24,24,24,25,24,29,48,63,69,55,36,41,44,46,69,78,106,133,138,134,119,103,76,59,54,62,84,101,107,108,103,113,128,136,146,154,131,65,28,20,32,56,69,62,44,37,39,45,49,44,41,57,66,63,63,53,57,63,65,63,55,49,57,59,50,56,61,59,62,63,60,66,59,50,36,27,27,20,20,28,41,58,74,77,86,95,95,95,90,86,88,79,71,79,77,87,80,66,58,62,55,50,57,49,53,59,59,64,66,66,64,61,54,41,28,21,21,16,15,21,25,23,33,37,39,45,45,48,46,45,44,42,44,43,40,44,43,44,49,41,43,46,42,40,40,41,78,107,81,35,21,27,24,41,35,23,27,27,34,39,28,23,21,22,27,27,20,24,24,22,27,25,27,27,25,27,29,24,30,29,26,25,19,24,25,30,31,27,26,27,22,22,29,22,25,23,23,28,23,24,25,23,36,33,21,22,19,36,39,28,18,31,29,44,93,58,24,22,21,30,29,23,27,28,25,24,25,26,26,27,28,23,22,30,21,20,26,23,26,20,18,25,22,25,24,31,27,24,25,17,20,24,27,28,27,33,30,33,29,31,34,28,32,33,27,30,34,29,30,21,23,23,19,22,21,29,23,20,19,22,25,19,22,20,20,25,25,23,23,24,18,21,22,19,21,25,24,24,22,30,62,42,24,40,93,114,78,38,15,19,22,23,20,24,22,21,23,28,20,23,25,27,26,17,26,23,20,22,27,22,22,25,21,21,22,18,22,26,24,24,21,22,23,24,23,24,26,26,24,22,30,26,22,25,28,24,22,27,26,25,41,59,73,152,204,201,204,200,204,211,198,114,56,46,34,35,33,34,37,41,46,49,49,54,56,59,61,60,61,45,61,94,82,101,78,50,50,48,70,63,56,48,53,60,59,66,62,68,66,59,61,61,63,61,58,59,60,50,69,90,80,65,45,51,50,48,50,47,48,24,12,16,15,17,16,16,16,16,16,17,17,230,224,197,186,241,251,248,248,198,207,247,248,253,252,248,246,237,239,227,220,190,117,128,141,163,204,224,162,150,139,70,90,74,47,42,52,64,56,68,84,104,109,91,70,57,64,73,77,60,52,62,68,71,69,74,67,65,79,71,95,105,69,45,15,54,77,59,61,76,90,81,33,42,80,55,66,74,76,62,56,67,89,81,50,39,43,47,53,41,57,78,77,79,84,88,76,72,59,43,74,83,101,136,160,139,114,92,57,55,50,36,59,78,86,94,82,82,84,104,98,82,72,74,82,65,47,29,42,15,36,41,26,26,33,62,99,92,73,100,130,190,248,234,233,201,131,142,185,238,132,49,32,54,97,84,80,71,64,54,45,38,27,22,29,25,29,27,25,31,58,90,59,22,68,143,154,84,69,131,124,155,131,122,118,108,111,109,96,119,69,61,47,42,49,46,43,33,32,37,38,91,93,53,34,44,158,182,182,176,171,167,158,148,98,140,145,143,124,111,96,82,67,61,63,54,47,44,57,93,116,130,142,102,71,68,191,206,206,179,149,138,91,129,102,91,77,49,46,41,29,23,24,24,26,23,22,22,21,23,16,24,18,33,46,25,22,21,18,21,24,20,23,27,31,34,33,30,30,33,36,33,39,36,33,71,102,129,137,85,61,71,64,95,135,129,123,123,101,74,54,64,74,81,105,101,117,116,122,123,115,132,124,121,103,63,41,33,27,35,35,35,40,43,39,29,39,38,41,44,37,58,61,60,63,63,60,60,60,60,59,57,78,74,57,55,54,63,61,61,59,47,38,27,24,22,22,30,38,61,77,79,89,92,97,91,85,92,81,79,78,78,83,84,73,62,62,54,54,55,49,50,57,55,56,61,61,57,59,62,51,37,29,19,21,19,20,20,21,31,30,41,43,44,46,44,49,45,42,44,44,43,46,41,41,47,45,37,46,43,41,45,46,41,38,49,46,85,115,64,33,17,41,70,30,22,22,24,39,35,26,23,23,26,26,25,20,23,23,19,29,27,26,27,24,24,26,26,26,27,24,22,27,25,25,30,34,36,31,27,30,26,20,29,28,23,25,25,29,24,22,23,26,24,24,21,33,54,37,23,23,32,31,32,90,75,33,17,33,40,20,27,31,33,25,25,36,28,25,28,21,20,24,18,19,27,27,22,22,20,24,23,25,23,23,27,29,26,23,23,19,22,24,28,29,31,29,29,28,31,31,27,31,29,29,31,26,29,31,23,19,20,19,24,23,23,26,20,20,28,21,19,22,21,24,20,22,26,21,22,24,25,20,23,26,21,21,24,18,41,74,36,18,28,42,101,106,76,37,15,21,18,22,24,20,21,23,21,24,22,21,27,23,21,22,26,21,26,26,21,19,22,22,22,24,22,19,20,21,22,27,26,25,24,21,26,24,21,23,18,25,27,21,25,33,45,47,53,60,53,78,123,139,149,143,113,117,117,115,157,181,152,140,116,77,51,33,35,37,35,40,45,40,51,57,60,57,49,57,57,59,50,41,70,61,49,53,41,53,57,61,54,45,47,49,55,53,58,57,57,55,53,59,61,65,55,57,63,65,73,67,65,56,48,49,46,44,48,48,24,12,16,15,16,16,16,17,16,16,16,17,224,242,204,181,240,245,241,224,150,186,243,249,253,249,244,243,237,241,227,222,186,110,125,129,145,185,216,151,128,116,59,101,94,66,50,68,104,95,115,143,157,160,152,131,118,120,134,136,114,111,121,131,139,132,134,126,120,137,120,135,122,128,118,51,93,120,118,95,84,110,112,83,84,113,94,113,141,127,113,114,103,101,72,57,51,45,83,69,53,47,61,69,65,68,97,129,132,129,104,139,157,180,246,244,203,136,71,61,59,46,39,65,68,62,69,63,66,63,77,53,43,49,36,49,41,31,20,24,22,39,89,109,147,163,194,235,238,239,229,224,218,209,160,97,63,33,29,84,113,63,41,17,32,60,71,100,96,109,111,110,113,124,143,153,158,159,159,150,152,151,148,100,24,28,99,81,37,41,57,49,28,24,26,33,47,50,49,64,74,81,83,79,105,122,133,138,134,139,138,141,142,125,84,31,46,104,98,78,63,43,29,21,23,33,40,56,61,61,58,61,80,103,142,158,153,143,145,151,155,159,166,148,104,60,70,117,119,102,59,32,34,30,26,25,23,21,19,21,23,17,20,25,15,23,23,21,23,20,26,28,26,27,48,48,41,43,40,39,50,51,54,51,52,86,103,83,53,48,79,109,107,83,46,36,118,162,141,118,85,59,56,55,71,81,53,47,47,57,71,78,85,81,91,101,108,103,105,112,117,117,111,84,50,36,29,28,34,48,85,94,65,42,27,31,25,33,46,42,41,42,59,63,65,68,65,64,67,68,74,73,69,85,83,59,63,66,57,57,48,39,27,19,26,24,24,39,60,76,78,84,92,91,95,84,86,87,79,77,69,83,84,78,79,56,48,55,58,51,55,54,57,60,59,62,57,60,62,53,40,28,22,19,18,19,23,20,28,36,38,47,47,38,46,49,47,46,46,40,39,46,44,49,48,48,48,46,45,46,42,41,45,41,46,46,44,40,38,86,106,66,30,22,34,22,24,24,21,27,21,22,24,29,24,24,29,24,24,25,24,25,21,26,27,26,29,24,23,26,22,28,23,26,34,25,28,30,29,24,22,30,23,26,25,24,25,22,25,24,23,24,25,22,22,29,26,25,27,24,22,27,27,24,27,72,82,37,24,21,25,24,28,29,24,23,21,27,24,24,24,23,20,19,26,26,26,23,21,25,24,24,26,26,24,25,27,24,30,27,22,18,18,29,28,31,27,26,30,26,30,30,27,31,27,27,33,33,31,28,23,19,20,22,19,19,19,19,22,21,26,20,23,21,17,25,23,23,26,20,22,22,26,20,22,29,24,17,29,21,55,81,28,27,25,24,47,90,111,70,36,17,15,23,20,22,18,21,27,22,22,25,24,21,22,24,24,20,23,24,21,24,24,22,23,22,21,21,21,26,23,19,24,20,22,29,27,20,24,24,24,24,22,27,28,61,90,101,128,117,100,146,169,143,145,105,74,73,50,57,94,142,153,165,160,122,107,53,32,35,35,39,41,49,47,53,58,53,42,52,60,53,47,37,56,48,51,61,47,45,53,67,59,43,43,46,44,45,46,49,55,53,51,49,54,55,54,57,48,49,43,48,56,50,54,45,47,46,45,48,23,12,16,16,16,16,16,17,16,17,16,17,184,227,168,125,169,192,181,118,65,141,213,241,252,249,249,248,243,243,227,222,181,107,123,125,125,133,113,71,132,134,90,127,129,114,77,99,128,134,160,173,182,176,179,176,168,174,177,182,174,177,177,177,180,174,182,173,174,175,164,158,133,161,134,71,124,166,163,146,139,152,155,145,148,159,154,158,155,164,160,152,144,127,99,99,78,74,100,87,64,51,57,63,60,39,52,71,92,102,82,109,100,121,165,130,115,91,52,63,66,33,42,39,47,44,59,57,45,43,52,41,28,45,56,65,65,89,105,137,174,205,233,247,240,238,230,222,198,177,164,111,78,57,19,9,19,27,22,28,59,76,112,100,128,168,174,201,208,210,213,214,214,219,228,223,144,184,163,146,131,114,108,61,9,24,34,36,20,42,75,107,103,120,144,155,164,175,175,116,120,208,210,206,137,205,187,175,152,136,123,110,96,77,37,27,42,29,46,71,88,99,109,126,142,158,157,164,159,148,94,159,152,143,154,152,139,115,90,88,56,36,40,37,27,43,44,24,25,27,29,22,19,15,22,21,21,19,20,23,19,23,22,21,29,25,19,26,23,33,43,55,68,76,116,130,141,171,149,153,189,191,192,163,149,189,208,175,98,89,153,189,181,149,81,91,209,219,127,100,76,50,47,27,44,49,59,81,84,94,104,103,96,98,100,103,98,110,105,104,77,66,41,27,32,34,34,48,84,131,154,155,164,115,34,22,55,63,42,39,44,46,67,71,73,68,63,67,74,79,77,90,86,93,72,73,76,61,53,36,27,20,21,24,23,39,53,61,69,81,84,83,81,83,81,82,80,78,75,69,76,79,72,61,55,50,51,54,54,59,60,62,59,60,57,63,64,53,42,24,21,19,19,17,19,19,27,36,39,42,49,49,45,46,44,43,46,45,43,46,46,45,49,49,46,48,46,48,45,45,44,41,44,44,48,46,42,41,31,37,98,109,54,21,17,24,29,24,21,24,25,27,25,22,24,27,29,27,24,25,26,26,24,25,21,29,32,27,28,28,27,25,23,29,28,23,29,31,22,28,29,27,24,27,22,24,27,22,32,22,21,23,23,24,21,29,27,27,26,25,31,22,22,28,21,56,91,51,23,15,23,25,20,29,24,19,27,27,23,27,26,18,20,21,23,21,20,24,21,27,22,24,29,18,21,24,25,29,30,27,24,24,19,24,29,27,28,30,27,29,32,26,32,31,27,29,29,32,32,35,27,18,21,19,21,21,22,23,19,21,21,21,20,21,21,22,22,19,27,22,20,25,22,22,22,22,23,26,23,22,71,71,25,29,22,16,23,45,97,102,67,37,22,22,21,21,19,24,25,25,25,22,24,24,19,22,28,24,21,24,21,23,24,26,24,17,26,25,20,18,24,22,20,25,23,28,24,20,26,23,24,24,24,24,39,99,123,113,113,93,101,132,126,116,118,111,96,91,76,65,80,108,122,131,123,117,108,55,32,30,36,43,51,50,48,47,41,41,41,57,59,53,56,59,63,48,55,66,51,57,60,69,56,48,50,48,50,43,52,48,51,50,41,43,61,64,39,38,41,39,44,36,43,33,36,45,36,47,50,43,23,12,17,16,16,16,16,17,17,17,17,16,200,228,166,135,167,185,210,142,87,160,214,244,252,251,252,248,245,248,227,226,181,109,129,124,121,93,48,51,137,169,144,166,171,134,92,128,163,174,179,184,187,181,185,181,186,187,195,191,199,203,198,199,199,193,194,198,191,182,174,153,113,143,110,56,98,134,171,164,146,159,160,170,181,179,175,163,140,143,157,157,148,139,137,145,104,82,105,84,80,70,54,59,52,37,24,14,24,33,34,49,28,18,31,36,66,90,84,110,78,16,12,14,42,74,83,85,63,72,98,96,117,125,150,174,185,223,233,240,236,233,222,196,178,143,121,98,60,40,51,43,21,21,21,20,88,127,65,50,62,86,135,160,194,188,174,170,141,130,122,124,125,127,134,121,97,88,98,104,129,127,129,116,51,26,84,114,75,98,162,183,191,201,206,194,188,174,147,134,128,131,120,122,130,108,98,89,84,90,93,104,106,83,70,49,51,103,150,181,185,178,187,184,184,176,156,144,112,112,110,91,69,42,37,32,34,30,21,20,18,18,24,19,23,41,33,25,19,22,24,22,28,19,27,29,27,32,27,37,41,49,60,61,61,68,65,67,55,76,136,161,183,189,193,190,200,208,188,203,217,209,219,165,162,201,197,159,116,135,165,189,186,167,78,76,181,135,73,50,37,54,49,47,89,113,109,116,121,121,123,127,116,108,114,109,103,87,72,48,34,30,33,39,38,72,107,129,145,146,155,153,169,130,37,35,90,101,63,42,47,56,74,70,68,63,65,74,69,81,81,94,104,103,93,76,63,40,25,25,22,27,25,33,51,62,68,72,76,82,82,78,76,69,72,66,67,72,76,78,68,61,55,49,52,53,57,55,59,57,57,65,65,68,63,60,46,29,23,17,21,22,17,23,25,35,41,43,50,48,45,47,46,47,43,48,49,41,51,48,48,49,45,49,48,46,46,43,43,42,45,48,41,41,43,43,40,43,42,31,54,105,98,48,19,21,22,19,19,27,29,29,31,24,24,24,21,22,25,25,25,26,24,24,31,27,24,30,29,20,18,26,28,32,30,26,22,27,25,28,24,31,39,24,26,23,26,27,25,24,22,21,25,30,25,32,28,28,29,25,36,35,29,32,28,41,92,67,27,20,18,24,22,23,29,30,27,30,29,27,22,24,25,22,18,24,22,22,24,20,26,25,31,31,22,25,24,28,28,27,33,20,16,30,27,25,29,30,33,26,29,29,32,33,30,29,28,37,32,25,24,19,18,21,18,20,20,19,24,17,25,24,20,20,23,23,19,24,21,22,22,23,22,19,21,22,16,25,29,20,81,66,19,28,18,18,18,23,53,99,103,67,34,17,21,22,22,20,24,23,18,21,23,23,20,21,24,22,27,22,19,25,22,22,21,25,24,21,21,24,22,25,27,21,26,23,23,23,25,21,23,25,31,26,39,98,113,94,84,74,88,97,78,65,93,104,113,117,98,88,74,89,85,81,78,75,83,65,65,66,61,77,94,92,90,87,92,105,106,104,97,81,88,90,91,77,69,79,81,73,75,81,60,50,58,54,49,50,57,56,74,87,87,87,86,90,85,84,77,84,95,97,94,87,89,72,56,64,67,55,22,9,16,14,16,16,16,16,16,17,16,16,196,221,178,171,200,227,243,168,132,200,244,250,252,252,252,249,243,243,224,226,171,112,132,125,139,114,93,111,150,126,104,125,116,94,48,92,124,123,137,129,132,125,131,132,126,132,129,140,138,142,142,142,144,134,143,141,148,153,163,125,63,76,50,13,44,71,79,85,77,92,96,110,134,131,129,110,109,113,115,124,122,112,98,122,102,87,90,80,79,65,46,41,53,46,54,53,60,81,80,118,84,65,90,84,128,125,99,141,92,24,40,57,141,158,169,170,162,185,201,199,205,201,201,211,206,208,164,177,138,112,73,45,39,29,46,39,18,10,61,74,40,36,19,55,156,187,132,50,15,42,81,134,145,124,98,60,46,34,27,36,55,65,69,76,102,119,125,152,179,171,178,166,47,63,160,188,111,121,165,162,146,127,118,82,57,39,18,12,17,17,22,38,129,89,115,140,158,179,193,193,178,159,112,66,84,121,132,122,94,78,68,53,37,39,35,31,30,28,61,24,24,18,25,19,19,26,20,21,26,21,24,30,28,42,43,37,27,27,33,26,28,30,39,44,55,66,107,155,181,210,219,214,218,218,217,206,132,152,216,224,227,222,214,208,201,202,200,198,201,193,200,155,129,168,174,179,178,185,196,198,148,136,65,33,78,97,92,93,85,83,94,109,136,135,125,134,129,128,134,141,125,105,87,69,38,30,34,31,37,35,61,87,122,138,142,142,135,136,129,128,89,25,16,21,75,97,76,77,64,67,75,66,71,65,64,66,75,81,78,92,110,105,73,45,28,23,22,23,27,29,50,57,62,68,68,69,73,77,71,66,71,66,71,66,64,69,74,63,53,48,47,48,51,55,56,59,63,66,64,61,71,61,46,35,20,15,21,21,21,17,26,37,42,43,44,49,48,46,46,45,45,46,49,48,49,53,51,53,45,46,41,44,49,45,48,39,44,47,40,42,41,38,41,39,42,44,39,43,45,69,112,96,41,16,19,21,25,33,24,37,38,27,28,24,25,24,24,26,20,26,27,31,29,28,28,23,24,24,27,27,30,33,35,24,21,27,27,27,27,45,37,22,26,27,24,23,21,25,29,23,30,33,26,27,24,30,27,29,36,32,29,27,28,32,82,80,39,29,19,23,23,21,31,31,24,27,29,27,22,28,26,22,23,17,22,23,19,30,26,28,42,27,19,23,22,28,30,26,29,26,24,32,30,28,31,31,26,30,33,33,30,30,31,29,30,35,34,29,23,24,20,19,22,22,22,24,24,23,23,27,24,23,22,27,24,20,26,23,27,23,22,28,23,22,21,29,21,35,85,49,21,27,16,22,22,23,27,50,100,103,69,32,15,21,24,23,24,25,23,23,21,27,24,26,25,22,22,18,25,27,24,22,20,23,23,22,25,27,22,21,23,27,21,24,26,21,26,19,23,31,30,22,34,85,124,125,105,108,113,113,92,82,103,108,114,114,106,101,93,100,93,79,90,79,98,108,125,125,124,133,141,139,137,139,147,159,154,146,130,113,120,111,109,108,101,102,94,100,108,101,85,78,82,83,87,87,91,95,120,140,135,138,134,141,150,138,150,148,155,164,167,151,141,133,119,122,128,99,17,3,13,12,16,13,15,15,16,15,15,15,142,190,191,216,243,243,246,163,128,205,248,252,252,252,253,248,244,245,224,226,162,108,131,128,149,151,174,199,144,36,2,21,35,47,36,30,27,33,37,36,37,34,36,30,36,37,36,35,39,40,39,39,39,36,40,38,40,69,111,87,37,41,32,39,35,25,30,29,30,29,26,25,36,35,35,40,46,50,42,56,60,46,43,52,65,67,67,49,36,29,24,45,93,150,178,173,171,188,191,193,155,162,191,181,183,168,154,184,175,184,210,226,243,240,236,231,220,217,200,168,160,128,113,104,74,66,62,48,42,54,29,8,19,36,52,50,39,32,50,61,42,37,37,105,229,250,225,114,14,10,27,30,36,32,27,29,31,36,34,40,50,42,45,57,73,73,77,78,88,78,76,75,33,26,71,81,46,41,50,48,38,24,22,21,25,50,74,94,118,131,134,144,150,158,160,154,150,169,160,135,116,78,64,45,35,35,28,30,28,27,28,23,23,25,24,28,20,21,29,24,27,27,27,23,26,27,26,28,27,30,29,27,37,41,35,34,27,22,22,32,44,52,56,66,104,139,169,175,175,178,179,190,188,186,191,166,92,113,161,163,165,154,160,149,147,157,145,155,151,146,157,100,98,129,141,159,160,170,155,136,92,65,31,46,108,117,125,92,70,66,61,83,104,104,99,93,84,75,73,73,50,31,33,34,39,37,47,61,82,107,119,134,133,139,132,132,134,130,125,89,32,5,65,90,56,80,79,83,81,67,67,61,71,64,71,77,83,96,85,100,83,51,39,21,23,19,26,36,50,64,64,65,66,70,70,73,73,69,68,68,63,68,73,68,75,66,50,51,49,49,52,50,56,58,63,66,66,74,71,62,55,28,23,22,19,24,14,21,31,36,44,45,44,49,50,47,51,48,48,53,47,52,47,47,54,50,52,45,46,47,44,40,46,48,42,45,47,46,46,45,41,39,45,39,41,45,45,47,46,41,79,118,86,39,19,20,25,22,24,36,30,21,25,27,27,29,25,23,30,27,25,25,31,24,27,25,24,34,22,30,33,27,31,26,32,31,24,30,27,28,24,25,27,23,26,28,25,24,23,30,32,36,39,27,25,24,31,29,44,53,31,25,24,29,71,92,75,47,17,18,26,27,21,27,36,25,23,22,24,31,25,35,28,24,25,21,29,25,27,23,24,22,21,27,27,30,28,24,29,28,25,27,31,31,34,32,26,31,30,31,31,33,29,29,29,35,34,26,29,22,19,21,23,22,22,19,23,22,22,29,24,23,26,22,20,24,25,22,24,20,20,23,24,27,19,28,20,46,89,38,27,27,13,23,20,23,21,25,55,101,104,67,35,21,19,22,24,23,20,24,23,24,21,22,25,22,23,19,20,25,24,21,24,21,22,25,25,24,23,21,19,24,25,27,17,26,26,23,28,24,26,21,31,59,94,108,116,137,153,166,151,141,147,139,142,135,137,138,128,134,122,117,117,130,144,144,154,148,146,155,155,146,141,144,148,159,157,148,142,125,124,117,113,117,111,111,112,121,128,125,117,113,111,110,114,119,122,127,150,161,162,166,172,181,180,181,177,177,180,183,186,181,179,161,160,166,173,111,10,2,9,11,14,11,14,14,15,15,15,15,110,184,236,249,246,246,252,147,117,207,241,251,252,252,252,248,246,243,225,225,155,117,138,134,147,118,155,177,101,20,6,27,36,49,38,27,23,21,34,25,29,27,21,27,25,30,24,27,26,27,29,27,29,25,30,46,53,71,70,45,53,52,37,50,52,41,45,51,42,35,30,21,29,27,30,33,39,44,27,23,26,26,28,47,50,56,58,47,51,64,109,106,171,245,251,228,231,235,211,198,177,205,226,209,206,197,205,232,229,227,226,217,211,192,173,152,128,117,94,92,122,101,69,36,8,11,22,13,23,69,59,22,44,54,63,48,33,38,40,40,39,36,54,160,249,249,252,202,46,5,13,14,21,18,26,23,23,32,32,28,36,37,42,44,46,46,39,35,27,31,42,47,45,46,44,48,61,79,78,84,99,116,122,114,126,139,150,157,97,160,150,122,93,75,55,39,29,36,42,36,30,22,27,39,44,25,20,27,25,29,29,27,25,31,29,30,33,32,34,31,28,26,30,34,27,27,31,27,29,32,40,38,21,24,25,23,23,20,23,21,24,30,30,34,29,33,39,39,38,35,31,34,33,31,36,40,46,44,22,26,35,35,36,39,38,30,35,38,38,34,39,42,46,45,41,63,52,43,35,26,29,30,47,47,37,37,39,44,46,47,40,32,35,36,39,45,41,42,42,45,43,50,50,54,72,88,101,118,132,133,129,122,125,123,125,125,125,118,113,108,88,101,130,108,60,68,61,73,65,63,66,64,73,73,87,90,94,105,77,45,35,28,21,24,25,36,60,76,80,77,80,76,76,71,74,71,65,62,65,68,71,67,69,68,65,58,50,52,50,47,50,56,65,66,66,62,75,72,51,34,27,22,19,19,18,23,29,36,47,44,46,50,46,53,51,53,50,57,53,51,53,52,53,50,51,48,50,48,46,45,46,46,48,44,46,41,39,49,39,43,46,39,42,43,49,46,44,42,39,39,43,80,109,81,38,17,23,26,25,21,18,23,25,28,27,28,24,24,28,31,28,28,30,29,29,19,30,34,27,27,33,33,29,33,33,29,23,32,29,22,30,27,29,29,31,29,24,29,30,34,27,35,38,21,27,29,31,34,49,45,22,23,31,32,52,93,80,36,14,19,25,23,22,33,30,19,26,28,27,22,23,30,22,20,24,26,22,26,23,24,28,23,29,25,26,29,29,30,31,23,26,32,27,32,29,34,27,32,33,28,28,29,33,32,31,31,33,31,19,23,21,21,24,22,23,25,22,19,20,23,24,21,17,24,21,19,24,23,24,22,23,20,18,28,21,25,19,57,84,29,23,20,17,24,20,26,22,21,28,56,101,99,68,34,15,23,21,22,21,23,22,26,25,23,23,19,25,22,22,22,22,23,26,23,16,22,24,23,23,23,24,26,29,25,22,22,24,20,27,27,24,23,31,59,59,55,77,113,162,197,194,183,178,175,173,168,172,183,178,166,154,155,159,177,173,171,169,162,167,175,172,160,153,152,152,159,155,153,142,126,130,125,126,133,117,122,126,139,130,136,139,137,139,139,132,134,139,148,156,169,165,170,177,179,182,176,173,174,177,178,177,175,179,166,167,173,183,114,8,2,9,10,13,12,14,12,15,15,15,15,92,163,203,244,233,230,239,100,114,202,237,251,252,251,252,246,247,240,223,224,143,125,154,159,167,76,52,55,17,24,27,22,37,42,41,33,29,31,27,34,33,29,32,28,34,30,28,36,29,29,33,33,32,29,35,44,66,98,66,36,38,34,34,44,60,55,52,66,72,62,44,48,55,47,55,52,43,39,41,71,87,94,105,120,139,139,156,170,175,196,205,217,235,247,242,212,193,177,155,137,139,159,167,153,152,142,133,148,128,114,89,78,63,45,38,19,13,26,50,78,137,122,67,46,27,23,22,24,25,50,41,32,49,44,46,54,38,32,49,51,43,38,91,166,225,239,243,228,101,24,21,18,30,35,41,43,45,46,53,50,47,50,55,57,57,57,63,61,84,100,110,113,65,62,96,126,157,156,148,149,157,156,131,111,95,78,57,42,43,36,34,33,24,27,21,24,24,24,22,23,26,23,32,40,45,33,27,33,32,37,31,29,30,32,27,35,34,29,26,29,32,20,29,26,19,25,29,22,23,33,51,41,22,22,22,18,23,24,20,23,22,22,23,24,24,22,24,23,24,25,18,22,23,21,19,29,48,38,20,20,25,24,22,24,25,21,25,29,31,36,44,39,38,37,26,31,28,23,19,22,22,28,50,39,24,19,29,25,29,47,42,39,39,39,54,66,66,61,58,63,75,93,100,115,123,117,126,122,119,120,115,121,113,120,115,113,117,113,132,141,125,103,92,69,61,70,59,62,65,67,72,67,84,85,95,94,70,49,34,25,24,23,27,44,61,77,98,101,94,102,97,89,81,79,81,78,68,59,71,76,67,66,66,66,63,56,51,48,47,53,56,63,73,69,73,67,52,37,25,19,17,17,20,20,31,38,41,48,44,50,46,50,55,54,56,57,57,56,55,49,52,53,49,49,44,48,46,47,48,42,49,50,45,39,43,47,39,39,43,45,41,45,46,44,39,41,41,39,44,40,40,34,73,110,78,35,23,23,20,22,23,30,26,27,31,26,29,25,29,30,28,27,30,30,27,29,31,32,26,30,32,31,37,28,28,29,22,29,29,30,34,24,31,38,30,27,27,32,28,29,30,29,27,23,31,28,28,27,29,24,25,33,28,29,36,84,79,31,21,24,27,19,27,24,23,26,29,29,23,23,22,27,25,26,29,21,25,28,24,27,24,28,24,28,29,25,25,27,33,27,25,28,29,28,34,31,31,35,31,33,31,32,27,30,30,34,36,22,21,20,21,23,22,22,23,22,23,23,22,27,21,20,24,22,24,26,25,27,25,23,22,23,24,21,23,24,17,74,74,20,30,27,19,23,21,25,21,23,26,26,56,97,96,62,29,21,18,24,24,19,23,21,26,22,24,24,20,27,22,26,25,23,27,23,26,23,22,23,24,24,21,22,30,23,21,26,24,24,22,24,28,24,52,81,98,122,112,77,117,186,201,206,200,187,185,184,193,194,188,187,172,181,193,195,195,186,189,186,185,188,188,181,178,180,177,179,177,171,162,151,162,160,155,155,155,162,164,167,160,162,170,171,175,168,165,160,160,158,161,171,173,174,174,179,175,170,169,169,173,171,171,171,174,168,173,176,184,114,8,1,9,10,14,12,15,13,15,15,14,15,105,142,147,173,150,146,165,81,126,201,236,251,253,253,251,245,245,238,223,214,124,133,169,193,203,64,5,9,19,37,28,24,35,36,33,36,33,28,33,30,36,40,41,44,36,42,42,39,42,40,41,39,43,48,44,49,43,68,92,45,35,42,45,49,50,56,40,66,85,88,96,143,133,139,155,166,182,195,191,199,207,213,211,213,210,208,214,222,216,215,212,203,152,180,164,146,139,141,147,130,109,111,117,128,123,71,23,65,25,17,12,19,30,32,36,36,25,35,49,53,69,51,53,47,28,71,146,154,93,51,19,25,50,34,45,57,39,33,50,63,51,81,141,158,159,157,182,169,104,60,29,38,54,51,57,58,67,76,83,100,101,124,143,155,165,162,152,146,142,140,136,123,72,47,56,76,84,60,49,48,45,38,34,34,28,21,23,25,25,23,29,27,18,24,28,27,27,29,33,26,34,33,37,46,40,32,28,27,28,32,26,25,28,26,24,28,25,24,27,24,25,26,23,24,25,22,22,22,24,34,51,44,23,22,22,24,24,23,23,24,22,22,24,23,27,22,21,23,25,22,17,23,25,23,21,29,46,42,24,19,20,24,25,30,32,37,40,41,38,33,27,24,24,24,24,22,19,27,25,18,21,33,53,40,21,23,27,22,52,60,39,47,52,48,83,106,103,101,94,108,120,122,116,122,123,112,108,113,110,105,105,105,110,111,115,117,125,120,114,90,65,53,54,56,59,64,58,63,62,77,85,83,89,90,75,53,33,23,24,23,31,46,64,91,100,108,105,100,97,97,98,87,89,78,80,73,74,77,70,70,68,64,75,75,66,61,53,57,50,60,61,59,63,59,53,35,27,20,18,24,17,24,32,36,44,51,53,47,52,55,57,54,53,57,55,55,57,58,55,46,42,48,48,46,44,43,50,47,49,45,45,44,44,44,43,45,41,41,44,42,41,43,40,34,43,40,42,39,41,41,42,39,31,78,111,75,33,13,23,27,29,26,27,31,25,27,25,24,27,28,27,33,26,30,36,26,26,29,33,31,30,34,31,26,24,27,24,29,30,30,27,22,28,29,23,29,27,23,29,29,29,33,32,25,31,29,27,29,24,29,28,30,23,31,31,70,94,46,23,21,17,28,25,22,27,33,28,32,41,31,25,24,26,25,29,28,24,26,26,26,27,21,24,24,30,31,29,33,27,24,24,32,29,28,34,32,32,35,31,30,28,31,34,27,29,31,32,32,24,18,23,22,22,24,20,22,25,19,19,21,25,23,17,24,21,19,23,24,21,21,25,19,26,23,23,27,27,85,66,19,29,19,22,21,21,23,19,28,26,29,29,55,101,92,60,32,17,19,22,24,21,23,22,24,23,25,24,19,22,23,23,24,24,25,19,21,25,20,24,23,28,24,22,24,24,30,21,24,25,19,44,84,89,99,142,181,170,82,55,123,171,193,193,197,196,184,169,165,200,212,190,189,187,184,185,189,186,184,187,186,183,184,187,186,185,190,185,184,180,181,189,184,188,188,194,188,184,189,177,187,189,188,189,186,186,181,178,175,176,182,177,177,181,178,178,178,177,177,177,177,174,171,174,170,176,174,184,115,9,2,8,10,14,12,12,12,13,15,15,14,145,171,168,189,158,158,158,76,135,203,243,250,253,253,252,245,244,232,218,198,89,105,155,199,222,69,6,11,21,45,38,36,32,32,36,34,36,34,32,32,39,44,47,54,54,53,57,59,57,57,63,56,53,57,56,51,59,105,111,87,86,93,116,133,141,151,169,192,204,208,224,239,242,241,240,235,235,233,226,227,224,208,191,170,159,138,137,134,112,101,87,66,46,36,39,84,137,190,213,160,120,99,108,118,106,55,3,11,24,38,40,41,47,51,52,38,37,49,59,56,53,47,42,39,77,174,251,251,201,116,24,11,59,75,68,51,36,36,55,48,67,152,192,128,82,76,77,93,62,75,98,117,141,134,147,152,153,159,143,140,135,126,125,121,120,110,79,51,39,36,33,33,30,35,34,19,28,27,21,29,23,25,21,27,29,24,29,19,27,26,26,31,31,40,33,40,48,51,57,56,54,49,46,45,42,37,37,46,45,45,49,47,44,48,47,44,50,43,50,50,46,47,51,49,50,51,49,51,29,44,58,45,46,46,48,41,40,40,37,43,42,48,56,53,50,48,47,47,46,42,48,43,48,50,46,61,63,54,34,27,28,41,45,47,49,48,46,43,39,34,40,39,36,38,35,34,29,31,30,32,30,44,56,39,29,21,42,57,54,52,31,41,50,55,91,103,96,84,87,98,104,104,97,111,111,103,104,102,104,101,101,104,103,114,121,120,102,73,57,36,33,47,55,53,58,61,61,65,68,81,84,73,65,44,33,26,23,24,23,51,66,87,113,107,102,101,100,100,96,93,84,86,83,83,83,89,82,62,65,65,71,78,79,76,69,64,59,54,61,62,62,60,49,37,29,22,22,20,18,23,26,38,44,46,54,54,56,56,53,57,54,58,56,51,55,48,53,51,46,44,44,47,47,49,43,44,49,45,41,47,44,43,41,39,42,42,44,45,48,43,41,41,43,39,41,45,38,42,41,44,47,35,33,32,82,109,70,34,21,24,28,63,50,23,22,27,26,26,29,28,28,23,30,27,29,27,25,37,30,34,30,27,25,20,29,26,28,30,29,33,25,27,24,19,25,33,34,31,27,29,25,29,33,31,32,26,27,29,28,34,29,29,29,27,23,50,93,63,30,22,19,27,27,27,31,25,23,42,37,19,22,29,29,30,26,25,28,22,28,29,20,27,29,24,32,36,31,29,27,28,29,32,32,24,33,33,34,31,33,33,30,27,29,33,30,31,35,26,25,25,19,20,23,19,21,27,22,23,21,21,21,21,19,24,22,20,24,16,23,19,24,24,21,20,24,18,44,93,48,20,27,17,24,21,23,23,19,21,28,29,22,32,64,100,101,63,29,17,22,22,25,24,21,21,23,28,17,23,27,22,23,21,28,25,21,21,24,24,22,25,19,27,23,22,23,23,26,26,25,34,141,182,111,66,63,121,130,41,7,23,77,139,167,190,195,191,160,166,239,250,221,187,178,176,179,177,177,175,178,179,179,181,178,167,171,186,185,184,186,185,192,191,194,196,189,171,181,188,177,184,186,186,189,189,192,193,191,187,186,186,184,185,183,185,185,184,186,184,185,183,184,180,181,180,184,183,188,116,7,1,9,10,13,12,14,11,14,15,15,14,177,194,198,213,184,181,145,75,136,202,245,251,253,252,251,246,243,233,217,181,52,64,139,192,214,65,9,14,26,55,49,53,47,38,38,37,40,43,43,46,54,53,64,61,55,99,64,77,91,107,122,137,148,162,179,178,182,212,223,214,234,243,242,242,237,236,234,232,227,225,220,219,214,206,199,182,169,150,128,122,118,114,116,116,112,84,74,72,46,39,24,12,10,14,11,31,103,145,155,125,107,99,97,105,106,65,5,16,50,49,38,40,46,56,50,35,38,63,103,110,111,98,87,94,149,242,250,250,247,145,36,16,75,107,93,60,31,38,75,87,118,171,151,90,63,35,32,26,45,101,129,91,125,112,118,97,83,53,39,39,33,33,67,27,23,24,27,26,19,26,24,26,29,35,33,22,25,27,23,27,31,29,33,37,36,39,43,45,49,51,56,53,49,75,104,131,152,172,182,190,203,190,186,184,179,179,187,197,192,187,182,148,194,200,198,198,193,192,188,153,184,182,186,194,194,199,200,198,190,158,191,183,193,193,187,116,75,82,85,135,154,174,187,184,183,174,181,182,175,169,178,196,206,202,188,178,152,91,27,9,33,76,141,159,155,164,160,163,153,149,153,149,154,147,145,139,118,106,95,108,123,137,129,97,72,85,96,113,103,68,42,46,38,48,80,79,81,70,70,86,87,82,80,87,94,94,101,98,95,98,105,108,109,85,85,63,45,37,34,34,36,48,50,58,61,61,68,69,62,65,59,46,30,24,19,22,28,47,63,86,101,112,102,94,91,93,98,97,91,84,80,77,81,83,87,77,63,69,66,78,74,73,81,72,65,61,63,69,70,63,52,37,27,20,22,18,19,21,27,41,47,51,50,50,52,56,59,53,58,49,47,54,53,51,48,50,46,45,44,47,47,46,44,43,49,48,44,43,44,43,44,43,42,41,43,46,45,45,42,40,39,41,43,42,42,36,45,42,42,45,40,36,36,27,32,81,109,69,27,17,33,56,35,20,26,24,25,31,26,27,29,27,28,29,25,28,33,29,29,35,29,26,29,26,24,28,33,32,27,29,27,21,23,29,32,31,50,48,27,26,26,33,30,27,29,28,32,27,29,37,27,26,30,30,26,39,89,75,34,23,17,25,24,29,32,24,24,26,25,21,29,29,31,28,29,24,20,32,23,29,39,31,25,24,35,34,34,31,32,24,28,33,29,31,31,35,29,33,34,30,33,32,33,29,32,34,31,29,22,23,19,19,22,22,27,22,20,20,23,24,22,24,22,22,19,24,26,19,21,27,24,20,30,22,27,21,54,92,34,24,26,16,24,22,26,21,21,22,24,24,21,26,29,63,105,93,59,29,19,21,22,24,17,24,24,22,25,21,23,17,19,22,23,28,26,21,23,24,24,23,25,19,21,29,22,24,27,30,25,53,180,178,101,92,57,46,40,14,17,12,37,102,126,154,192,206,178,185,248,247,190,179,177,173,176,176,171,172,173,178,177,179,173,157,165,182,179,177,178,179,186,186,186,181,170,152,168,184,178,180,180,179,183,188,187,189,193,188,182,185,184,185,188,186,185,186,183,185,184,184,186,185,189,185,190,187,193,115,7,1,8,10,13,12,14,13,14,15,14,14,185,195,189,191,167,171,146,81,139,201,249,251,253,253,252,246,244,237,222,172,43,66,141,201,207,49,8,15,20,53,57,59,53,45,53,43,48,62,66,82,95,113,127,142,152,163,181,188,200,214,224,231,236,236,235,230,228,230,222,224,223,218,215,201,184,168,149,131,111,92,81,76,65,48,38,29,32,21,17,14,28,65,115,152,153,94,69,93,71,57,59,59,33,17,14,18,74,127,130,113,110,119,133,121,95,33,3,44,74,66,35,32,51,56,55,35,25,59,103,100,96,87,95,128,189,250,248,246,229,145,33,9,66,133,157,125,36,55,155,151,125,78,55,82,78,55,27,28,31,28,35,41,31,27,30,33,22,22,26,20,26,18,27,24,21,27,25,28,23,27,28,29,36,39,38,34,32,39,45,50,50,57,58,51,69,92,107,130,159,176,194,204,206,214,219,225,225,229,236,235,240,235,234,241,237,236,237,234,224,221,223,230,242,246,238,237,238,238,240,234,235,238,242,241,244,243,249,250,249,249,245,236,211,203,162,110,116,151,213,236,240,245,236,237,236,241,244,244,248,245,246,241,228,191,152,127,96,101,136,172,200,222,239,241,231,230,231,235,229,232,238,237,234,220,222,231,202,171,162,175,198,217,216,186,167,151,140,122,119,109,83,81,53,61,85,71,72,61,66,81,77,71,66,76,83,90,95,93,94,94,103,101,71,52,40,34,39,36,36,36,43,52,51,58,59,61,67,63,53,42,27,26,25,19,34,45,69,80,96,116,96,93,95,91,89,87,87,84,80,81,84,84,80,74,68,65,66,71,81,74,71,66,67,67,65,69,70,66,59,44,21,20,25,22,19,19,26,40,53,53,52,57,49,52,55,51,55,46,53,51,46,52,55,53,42,47,49,43,46,42,46,49,49,47,46,44,41,41,43,43,37,44,45,44,47,40,46,41,37,41,39,39,39,38,43,45,43,43,43,38,38,41,34,26,22,32,87,106,72,35,18,26,23,27,25,23,27,26,26,27,28,29,26,26,25,24,26,29,30,31,29,24,27,24,24,30,29,27,27,27,22,29,28,29,30,29,43,30,25,28,30,27,27,27,21,29,32,26,30,28,30,29,24,24,29,29,69,87,45,27,24,24,25,25,22,24,23,22,24,29,33,34,27,30,30,27,22,21,22,34,38,20,20,29,29,32,29,34,39,30,43,46,34,33,32,31,33,30,35,35,28,28,33,33,30,29,31,31,22,22,23,19,22,22,19,24,20,24,24,22,24,22,22,19,23,23,23,18,23,27,19,21,23,20,29,17,66,81,25,30,24,19,24,19,28,26,21,27,24,22,20,25,26,32,66,99,89,60,32,21,17,23,25,19,20,23,21,23,30,23,21,23,24,23,22,24,28,22,24,29,27,23,22,25,23,27,18,30,22,88,174,139,163,175,145,125,94,59,30,37,87,122,117,115,159,204,178,182,215,173,145,149,157,159,164,167,163,164,168,170,169,172,167,151,157,169,164,164,166,165,166,162,162,162,154,141,159,174,167,176,174,171,173,175,171,167,175,177,174,177,176,174,177,183,183,180,178,179,181,179,183,184,185,186,188,185,191,115,7,1,8,11,13,11,14,13,15,15,14,15,175,177,171,170,154,165,123,69,124,191,242,251,253,253,251,250,244,241,230,178,59,93,165,218,222,75,56,73,84,125,127,151,149,152,166,171,184,203,212,223,232,234,234,227,222,168,215,212,209,202,195,189,178,160,153,127,127,112,77,109,112,68,65,42,19,9,13,13,19,16,18,34,33,32,26,19,26,28,26,29,36,43,96,127,113,81,61,79,81,92,109,128,141,145,104,66,105,139,133,110,99,100,110,112,110,53,1,57,130,135,129,139,147,147,113,54,23,38,56,49,36,48,92,134,169,201,216,208,190,109,22,4,76,128,135,102,35,59,137,116,59,29,21,59,61,33,23,15,24,27,21,27,27,26,25,17,33,27,26,30,27,33,26,37,46,43,43,31,39,46,47,48,56,59,54,53,66,90,112,146,167,189,203,207,213,214,214,220,232,234,236,230,220,211,199,189,173,172,179,183,188,188,190,194,193,190,185,180,176,180,184,187,184,171,168,179,188,192,190,188,193,186,186,185,191,206,205,214,211,206,166,91,68,77,131,154,179,212,211,219,212,209,211,216,224,220,210,191,174,145,117,87,60,59,78,118,152,184,208,223,224,220,221,202,193,196,186,182,183,185,193,195,193,186,192,201,175,141,128,133,142,161,181,171,162,139,122,103,97,103,90,88,91,107,119,84,85,88,77,89,77,68,72,67,80,79,80,89,87,86,78,69,49,45,43,43,38,38,44,43,72,89,63,58,59,59,52,36,33,24,23,26,30,50,63,83,105,106,100,104,92,89,84,81,78,83,78,78,79,87,84,74,77,61,72,69,77,87,71,77,61,65,66,75,79,65,62,45,27,26,21,19,21,19,27,43,51,55,60,49,49,51,50,50,44,47,53,47,48,53,51,48,50,46,44,48,44,46,48,46,50,46,45,44,43,39,45,45,38,43,43,44,43,43,42,42,44,40,39,38,38,39,43,41,43,39,39,39,40,39,41,37,35,33,24,26,32,82,108,69,31,16,21,27,26,27,26,24,27,28,27,32,24,26,28,24,29,27,29,31,26,25,23,25,32,29,26,26,22,30,25,27,28,26,27,26,29,23,27,33,23,26,27,27,29,28,35,25,24,29,27,27,29,31,27,29,54,89,61,27,25,24,27,24,23,23,26,25,33,29,35,41,27,24,24,27,21,22,30,26,17,17,24,27,29,36,33,30,39,39,44,46,41,41,38,27,29,27,33,31,32,36,33,28,30,29,31,29,21,23,27,25,21,24,19,21,24,18,22,22,24,23,20,24,22,17,21,23,20,22,21,24,24,20,21,26,83,67,21,30,20,16,23,24,26,21,24,24,24,22,22,23,21,28,33,61,102,93,61,33,15,21,23,21,20,19,25,23,23,25,24,24,22,26,22,22,23,25,24,26,26,22,25,23,25,26,31,19,57,175,207,213,230,217,215,214,200,166,155,140,112,113,92,51,89,141,134,145,171,157,147,154,152,141,146,156,150,149,146,145,143,146,139,120,128,141,139,139,139,137,133,133,133,139,139,127,144,146,143,153,152,152,148,153,141,139,155,157,158,162,162,163,166,169,170,169,167,171,173,177,176,174,180,182,188,178,186,116,8,2,9,11,13,11,14,12,13,14,14,13,143,150,142,148,141,144,104,48,98,165,227,252,250,250,250,250,246,246,242,212,119,151,195,246,243,155,177,192,207,217,205,218,221,223,222,220,220,218,216,216,215,221,206,184,162,141,132,122,110,98,91,88,83,76,70,62,79,69,14,8,11,18,33,39,45,39,40,48,48,46,43,50,51,44,41,38,33,35,39,41,52,47,45,39,33,24,32,43,36,54,87,138,198,249,228,156,148,168,197,193,188,190,200,204,229,147,19,79,182,223,220,212,208,187,168,69,28,61,85,77,60,78,122,138,95,101,116,129,113,70,36,14,30,49,60,61,39,39,54,51,57,38,49,62,27,25,24,21,29,29,32,37,38,43,45,47,55,60,59,60,74,78,56,66,132,137,86,74,88,118,143,160,173,184,197,204,208,217,225,232,238,241,240,232,230,216,209,203,198,194,186,186,175,171,174,170,164,158,155,156,164,159,154,155,150,150,151,156,165,169,164,150,141,134,141,155,162,168,170,173,175,169,166,180,193,186,160,120,102,97,63,55,82,138,179,199,203,196,193,185,188,197,201,192,171,140,106,68,41,39,67,97,124,152,169,183,190,201,198,191,183,174,171,165,168,165,160,154,155,165,166,174,174,170,182,189,165,125,112,114,97,99,111,109,100,93,86,77,94,92,77,91,114,141,136,102,103,107,108,100,84,81,81,74,76,78,78,81,72,71,67,61,57,50,56,49,51,48,42,54,101,89,60,59,46,41,29,26,21,24,37,58,76,84,98,98,100,95,87,90,81,83,81,77,75,76,82,72,75,77,71,61,59,55,64,81,80,80,75,67,72,73,81,81,73,51,33,22,19,21,16,21,27,38,53,53,54,54,51,51,47,49,46,51,45,47,53,46,50,50,49,47,48,45,47,50,45,48,51,44,50,48,42,45,43,43,40,42,44,43,46,42,42,42,39,37,40,38,37,37,41,49,42,41,36,41,43,37,43,38,35,42,39,35,27,24,26,35,86,108,69,29,16,25,30,26,26,27,30,24,24,29,31,29,25,32,30,27,26,31,23,25,28,28,32,22,29,28,25,29,30,27,28,27,24,26,32,27,25,31,48,33,28,30,27,30,26,29,25,27,21,23,32,28,25,26,38,87,76,35,23,20,26,20,24,24,24,31,27,31,35,31,23,25,27,26,23,19,22,28,22,24,24,26,28,34,29,36,31,25,35,32,45,44,41,34,25,33,32,28,34,29,34,29,33,32,26,23,25,26,21,24,26,23,19,21,19,27,21,19,24,24,24,20,22,21,19,20,22,25,23,25,17,26,24,34,90,53,18,31,18,17,27,24,17,23,23,22,22,21,24,23,23,27,23,29,70,105,92,64,31,14,19,20,24,24,19,24,23,20,24,23,24,23,20,20,25,22,19,25,20,24,24,23,28,28,29,24,139,230,238,240,202,170,165,195,204,192,199,177,119,96,63,13,9,30,53,97,163,194,222,233,212,177,177,189,178,169,160,156,153,153,155,142,141,150,153,154,152,148,142,145,146,149,154,146,146,132,113,124,129,135,136,135,122,117,133,139,139,145,143,146,149,147,148,151,151,157,161,160,166,165,169,171,177,165,168,113,10,2,10,10,13,12,15,13,14,15,15,15,148,152,150,154,155,162,132,109,146,190,243,247,248,248,247,247,243,243,242,216,166,180,184,207,193,156,192,190,192,193,179,174,163,159,152,144,133,122,102,77,68,84,95,98,106,104,106,105,104,112,111,116,125,127,134,129,150,134,75,39,6,19,32,59,91,96,94,100,98,93,101,82,71,69,71,58,50,42,37,55,83,97,89,76,66,53,33,36,26,33,70,142,228,251,251,231,229,248,252,252,253,253,252,252,245,198,28,45,144,166,143,111,93,85,54,32,35,60,142,183,176,145,110,90,55,35,22,17,17,17,34,30,50,59,53,51,27,42,46,49,54,53,62,52,52,42,47,49,59,53,54,75,89,120,140,150,173,181,206,205,232,189,92,126,214,226,224,213,224,231,241,235,237,229,226,221,215,221,212,208,199,194,186,185,184,179,182,179,181,177,172,173,173,177,178,179,174,163,161,158,161,163,159,162,157,160,170,166,173,175,165,163,156,153,156,163,171,172,176,177,191,187,187,180,163,152,73,35,35,86,151,160,185,202,210,208,199,193,170,152,132,105,111,66,41,43,66,99,130,156,175,185,199,208,204,193,181,179,175,172,169,164,165,165,169,174,166,162,167,163,160,165,160,159,170,179,170,130,108,112,101,74,75,41,19,49,69,61,52,62,57,84,127,147,147,117,124,135,132,131,110,105,98,90,92,88,90,87,88,83,89,84,76,74,72,66,66,64,53,57,63,53,41,38,27,27,27,25,43,60,75,92,105,98,89,88,90,91,89,83,85,84,83,74,72,77,81,75,71,64,57,53,54,55,75,73,75,76,73,83,77,84,74,49,39,22,20,19,14,19,27,42,51,53,54,57,53,49,53,46,45,46,49,46,50,53,48,49,49,49,48,48,49,44,48,51,49,47,40,47,45,42,46,44,44,43,45,43,42,45,40,43,45,36,37,40,38,39,42,44,45,39,37,41,40,41,37,37,37,38,44,41,41,37,30,24,28,24,31,81,103,71,29,33,36,22,28,27,24,25,26,28,27,25,32,28,29,32,24,27,24,27,30,25,33,24,26,26,26,33,28,25,26,26,27,33,28,31,28,48,56,24,26,27,34,29,28,31,22,29,24,28,22,26,28,23,31,71,93,46,19,18,21,25,22,21,30,33,27,29,26,24,28,30,26,21,24,26,21,21,27,29,24,26,29,29,29,37,31,20,30,34,30,41,44,33,25,29,36,27,33,32,27,33,31,30,29,27,24,17,22,24,22,27,22,24,21,21,25,23,22,20,24,25,24,20,23,19,26,25,18,28,19,26,21,45,92,39,24,30,17,27,24,22,25,23,24,24,19,23,24,21,23,25,23,21,32,71,110,97,61,26,17,26,27,20,20,21,22,26,26,23,22,25,22,22,18,27,23,22,23,25,25,27,23,29,22,57,180,227,211,170,131,100,114,168,175,170,196,195,153,149,104,18,7,13,13,42,114,213,253,253,251,230,232,248,238,233,223,220,219,227,229,226,231,232,233,234,230,227,224,224,225,228,229,215,212,190,165,177,181,182,181,177,159,151,160,158,158,159,154,151,149,146,150,147,146,148,149,150,153,151,154,155,165,146,145,108,13,2,9,10,13,10,14,13,13,13,14,14,191,194,192,193,194,205,196,193,202,206,221,222,206,194,182,176,181,174,170,153,131,133,123,127,108,90,124,121,122,121,119,122,114,119,120,122,121,115,99,35,5,13,46,105,142,155,158,159,166,175,183,191,196,195,206,196,205,192,153,139,50,20,58,105,155,151,144,137,131,129,125,122,113,110,123,122,114,64,29,77,133,179,186,162,151,98,85,77,59,81,104,171,238,248,241,227,227,233,253,253,248,247,226,196,194,93,6,20,71,75,47,38,36,35,37,26,25,71,171,206,168,102,59,69,63,49,17,12,9,26,33,37,89,107,88,62,31,38,64,62,59,62,57,59,101,139,153,174,194,200,211,216,220,221,227,231,232,232,232,229,240,156,100,174,210,214,210,223,226,224,217,207,193,185,185,174,168,171,169,169,169,170,171,164,169,170,174,167,166,171,168,175,169,174,173,171,170,168,167,165,163,160,169,170,167,167,166,169,170,165,167,174,170,159,171,171,175,187,180,189,190,187,131,76,68,78,95,102,145,194,212,230,232,222,203,176,156,123,80,51,24,18,47,77,101,115,131,153,174,191,198,196,194,188,180,177,173,176,165,169,173,161,168,171,174,169,167,167,168,168,155,158,155,155,167,174,166,144,125,108,96,86,70,24,4,27,47,39,33,36,31,81,133,154,147,127,132,139,137,123,111,106,97,90,93,90,93,100,98,100,105,107,96,87,75,78,92,74,58,46,36,29,28,27,25,31,46,64,71,82,89,92,90,89,82,76,81,80,79,86,78,81,78,81,79,76,83,75,65,51,50,57,66,69,76,78,74,75,76,85,79,60,40,21,19,19,21,18,25,38,48,51,56,54,50,49,51,49,46,49,42,48,53,48,49,46,48,49,45,45,46,43,45,46,46,46,45,40,43,44,42,43,40,44,46,46,45,39,38,41,41,39,37,38,44,43,46,39,39,38,36,40,37,39,39,38,38,42,42,40,42,39,36,35,33,26,23,28,26,30,80,108,75,46,25,19,24,24,30,31,27,28,26,24,32,32,24,26,27,25,29,29,23,26,27,25,28,27,39,28,22,29,25,26,29,32,28,23,29,29,29,23,29,33,29,31,27,26,28,24,26,24,24,31,26,24,24,54,97,60,26,17,25,26,31,32,27,29,28,33,25,27,31,27,35,29,23,23,20,27,26,27,25,21,31,32,31,34,27,21,29,41,39,30,35,34,29,33,30,29,29,32,29,23,29,29,28,30,21,21,24,19,21,23,23,24,22,24,20,20,21,24,27,21,19,22,24,23,24,23,24,23,19,25,18,65,86,25,26,25,18,25,25,22,23,23,19,23,20,27,24,23,26,20,23,21,25,30,69,113,96,60,31,18,21,19,24,24,19,23,24,22,21,22,24,22,28,23,24,23,23,29,22,23,21,31,23,75,210,229,184,186,173,137,145,178,171,166,182,184,181,194,162,112,122,91,39,21,96,194,248,248,250,232,249,252,252,252,252,252,252,252,253,253,253,253,253,253,253,253,253,253,252,252,253,253,251,251,249,249,249,249,246,246,239,237,238,233,231,229,223,214,206,195,195,189,184,182,174,171,171,161,158,155,157,136,130,103,14,1,9,9,12,10,12,11,12,12,13,13,178,178,171,167,158,165,154,158,163,140,123,97,83,71,53,40,48,63,76,90,112,134,143,166,150,124,139,137,143,145,146,155,153,159,165,169,170,169,160,81,9,22,73,142,184,186,182,167,162,164,159,159,154,149,147,136,137,127,121,97,21,23,60,98,128,122,122,128,132,138,149,150,167,171,183,192,198,94,21,77,155,210,222,210,175,134,112,95,105,132,132,165,208,223,201,162,136,134,178,168,133,107,84,69,59,23,7,53,99,118,110,111,124,124,74,24,17,65,148,118,73,61,57,103,104,87,80,83,105,139,41,30,139,154,122,86,67,102,107,112,83,34,36,110,168,239,240,244,245,242,231,219,207,201,197,188,189,188,184,186,188,90,78,177,184,181,174,168,163,158,167,162,165,163,162,159,153,156,150,151,156,160,163,159,158,161,160,154,154,160,162,160,158,158,157,150,151,160,164,169,164,159,157,145,139,137,135,138,148,152,152,160,173,182,179,186,185,182,167,106,103,81,45,54,78,151,186,211,222,220,203,171,150,119,87,51,28,22,44,89,117,134,124,165,161,151,145,152,162,166,169,179,176,165,168,166,158,164,164,155,157,159,163,162,156,165,170,170,160,147,151,159,156,150,160,162,163,154,137,106,72,95,112,57,8,32,59,59,72,78,68,96,122,121,116,98,112,105,95,93,83,92,89,94,89,87,101,107,111,107,114,103,105,95,85,89,72,55,33,24,24,25,29,39,50,63,76,79,92,85,81,80,79,77,73,76,74,76,77,71,78,82,86,81,74,77,65,63,53,50,64,71,77,76,84,81,76,79,68,61,44,25,16,19,26,20,25,38,48,55,50,51,52,51,49,46,47,50,42,50,49,47,49,49,46,46,45,45,47,48,50,48,44,45,46,45,40,46,48,40,44,45,47,43,44,44,40,41,41,43,38,42,43,44,45,43,45,39,41,39,36,43,37,40,39,39,45,40,38,41,35,37,37,35,33,24,26,21,24,28,30,82,104,78,35,21,27,21,26,29,25,24,32,30,24,30,31,22,24,29,29,26,25,27,27,24,28,34,26,24,27,28,33,26,29,27,24,25,21,28,28,27,34,32,32,33,27,27,27,29,23,26,25,25,22,24,30,39,86,74,33,23,20,29,31,36,31,24,33,29,26,24,18,37,49,29,18,25,28,24,27,27,27,29,28,29,32,29,24,24,28,34,37,34,35,33,31,35,27,30,29,30,32,27,28,31,31,28,24,19,21,23,23,24,26,23,20,21,21,23,29,21,19,23,23,20,23,23,22,22,23,22,22,24,26,78,71,18,27,22,17,22,23,26,24,16,23,28,20,23,21,22,23,23,25,26,28,22,33,80,114,97,66,29,15,23,18,23,23,24,25,23,24,23,25,21,30,29,20,24,19,24,23,24,27,24,40,126,218,213,185,215,225,184,166,184,182,171,178,182,177,196,189,166,191,172,130,147,171,178,162,159,179,170,215,250,251,251,249,249,253,253,252,252,253,253,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,250,247,243,239,234,226,219,217,213,199,194,186,183,157,148,107,10,1,8,8,11,10,12,10,11,13,13,12,105,109,101,98,87,90,71,65,93,103,106,96,82,86,56,18,13,24,69,117,152,181,208,249,227,173,178,170,176,176,179,181,173,177,173,176,171,169,163,83,16,28,73,127,145,134,118,102,90,81,74,69,64,62,62,53,61,60,69,59,12,27,73,130,168,168,171,179,188,193,198,204,207,213,217,228,220,113,28,69,159,236,244,239,211,123,93,89,110,155,127,103,112,127,120,77,36,38,86,118,127,126,133,150,182,74,9,112,190,205,183,168,161,152,104,41,19,58,127,87,52,66,64,108,106,129,186,237,249,249,111,38,129,144,125,72,97,158,145,152,120,36,21,113,212,220,212,209,205,208,200,181,170,169,164,156,163,167,159,165,136,40,92,162,152,163,145,137,122,127,135,126,132,126,142,154,160,164,144,149,152,151,143,144,152,151,154,141,150,151,144,136,131,149,150,134,137,162,175,193,185,177,176,153,155,147,131,147,162,159,147,165,196,184,193,185,165,134,51,12,46,119,155,179,207,212,206,174,140,111,61,31,14,28,62,92,105,120,156,191,185,184,199,193,184,154,161,179,199,203,184,178,157,160,162,140,156,168,158,145,156,164,167,160,159,170,174,167,140,152,163,167,152,157,176,162,162,156,143,108,78,92,117,81,48,76,96,86,106,118,93,90,82,80,79,78,80,84,92,86,83,97,97,102,102,97,108,108,107,116,121,120,105,90,62,39,34,22,26,31,30,42,55,67,77,87,89,92,79,80,82,78,79,71,78,81,75,74,75,86,87,88,78,76,66,60,61,56,60,66,81,81,79,80,77,87,85,66,42,27,20,21,24,24,27,42,49,50,51,50,49,49,51,51,49,49,46,48,49,50,48,45,48,49,41,48,48,49,50,47,52,47,42,41,44,46,44,46,44,43,47,50,43,42,42,45,39,41,41,42,41,44,46,43,39,37,38,39,40,36,37,40,37,38,45,42,36,39,36,41,36,33,38,37,38,31,26,22,26,28,25,29,79,108,75,42,21,22,24,24,26,29,26,26,23,27,31,25,25,29,28,23,27,28,22,27,30,28,24,27,27,24,27,31,29,22,32,26,23,26,33,32,29,46,43,25,25,30,24,24,23,22,25,24,27,27,29,25,68,91,48,26,21,30,33,32,29,27,28,27,28,24,26,28,32,24,21,24,20,39,39,23,27,36,29,27,35,29,25,21,27,32,33,31,37,35,23,31,32,30,34,30,33,29,29,24,26,27,19,21,20,22,21,24,24,20,27,23,23,20,27,28,22,23,21,22,23,22,23,23,27,22,25,17,34,87,54,23,26,15,24,21,22,23,18,21,23,25,18,25,23,25,24,20,27,21,23,21,24,32,71,113,99,62,32,21,22,24,23,23,21,19,27,25,18,25,23,22,24,23,21,24,24,24,27,35,92,119,153,142,109,179,208,169,154,152,162,163,167,183,188,203,197,182,206,216,219,240,237,170,76,24,43,77,133,175,207,202,174,175,219,244,252,252,252,252,252,252,253,253,252,252,252,252,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,252,252,252,252,252,249,244,242,241,234,233,226,226,207,198,114,4,1,6,9,10,9,12,10,13,13,13,13,106,113,113,118,109,118,63,39,91,132,152,147,141,141,91,22,16,34,91,148,173,191,227,245,231,170,163,152,154,152,145,144,129,122,119,113,108,94,79,29,13,39,61,95,96,84,79,71,78,79,87,94,96,105,116,117,124,126,149,128,27,30,94,168,215,216,221,225,227,230,228,224,226,216,203,195,188,65,1,35,89,145,142,126,89,59,58,50,82,122,98,59,42,35,38,24,14,43,126,210,253,253,250,250,246,141,23,110,181,191,150,118,108,97,56,31,26,51,136,150,128,132,146,123,66,137,219,253,242,237,86,16,102,93,56,42,116,199,167,170,129,18,16,113,192,200,191,194,191,189,188,179,171,178,166,152,167,167,167,160,100,50,127,168,155,167,163,163,145,155,165,160,165,152,158,166,180,182,161,165,174,176,162,158,165,177,169,155,164,169,170,153,157,170,181,186,171,183,199,229,229,177,168,165,199,223,199,185,200,227,222,206,200,190,143,106,96,127,174,156,175,211,214,207,168,137,108,66,40,40,78,127,143,160,183,218,241,222,207,218,228,201,190,199,229,207,179,199,234,234,178,159,157,196,229,210,177,181,212,212,190,179,202,216,193,178,188,217,212,186,184,217,224,191,183,163,165,160,139,128,98,99,112,82,73,95,96,109,120,115,103,93,86,84,93,76,95,100,101,106,97,105,99,104,102,103,113,106,110,125,117,82,50,33,30,24,27,29,35,51,54,65,76,86,91,88,88,80,80,78,81,78,78,79,70,82,80,79,84,86,87,78,74,64,57,59,57,67,74,76,96,89,89,89,85,78,42,29,21,17,17,21,29,38,51,51,54,58,51,50,53,51,50,51,48,45,50,52,49,49,44,47,44,41,46,44,49,49,47,47,47,44,44,42,43,46,46,44,43,46,47,44,41,41,40,40,41,39,45,47,45,43,37,40,39,39,42,35,41,37,41,44,39,42,34,33,38,36,38,36,30,35,37,35,39,32,29,25,27,24,21,27,31,84,114,84,52,24,19,26,26,27,25,26,26,24,28,33,32,25,26,29,30,22,26,29,25,23,25,27,25,28,25,26,26,30,29,24,28,31,32,25,29,35,33,22,28,28,19,27,22,25,29,28,39,29,26,25,50,93,66,37,26,24,29,28,29,29,32,22,28,24,22,33,22,21,26,22,29,36,29,22,29,35,35,32,29,33,24,22,25,29,32,31,37,32,31,28,32,28,35,31,28,28,31,27,29,26,22,22,17,21,26,23,22,21,27,27,19,21,22,26,21,23,22,20,26,18,25,24,23,21,27,19,50,94,43,23,28,16,25,19,21,26,18,24,22,19,22,25,23,24,25,25,22,23,23,24,21,21,33,75,111,95,63,29,16,22,19,23,23,21,24,23,24,21,24,24,19,26,24,24,27,25,30,89,92,44,80,72,55,122,149,130,116,99,116,141,161,185,200,220,211,184,211,229,229,246,231,200,122,14,9,27,48,83,130,142,114,122,179,233,238,236,224,217,236,241,248,248,252,252,252,252,252,252,253,253,252,252,252,252,252,252,253,253,252,252,252,252,251,251,250,250,252,252,248,240,236,243,245,250,253,248,250,247,227,112,3,1,5,9,10,10,12,12,13,13,13,13,150,160,160,165,154,155,87,56,116,151,172,168,162,166,122,45,33,53,100,151,161,170,188,226,169,97,102,93,95,89,88,79,71,71,66,66,66,65,58,23,18,62,97,141,157,159,168,174,187,192,200,206,208,214,219,215,218,210,218,175,60,39,99,188,238,245,239,235,227,216,194,180,159,134,110,90,72,17,18,35,25,23,15,16,11,16,39,69,74,73,71,79,55,21,35,64,107,146,188,249,252,252,246,246,240,98,2,56,98,96,61,47,40,30,28,34,25,53,152,209,187,197,206,208,84,46,157,179,159,107,8,19,81,83,73,44,132,237,194,165,120,16,13,106,183,188,184,164,119,115,145,164,184,233,249,200,176,188,228,228,103,87,214,248,215,190,185,224,210,178,179,207,249,225,181,181,227,251,193,170,204,249,236,176,181,240,251,205,179,210,251,244,177,197,250,250,189,157,165,203,135,85,125,149,243,250,205,186,242,248,222,141,148,93,45,56,133,248,250,190,141,126,92,50,21,65,184,220,147,152,234,250,247,203,235,253,252,199,185,252,251,193,168,224,250,204,149,165,210,170,92,122,160,245,250,213,162,208,250,225,153,175,249,248,164,161,230,251,216,166,211,251,248,181,162,140,158,155,138,131,122,118,103,66,55,91,103,107,118,118,117,114,111,109,105,106,106,106,112,110,100,108,99,100,109,118,129,114,105,82,33,23,30,24,33,42,51,62,70,76,74,83,88,81,82,82,82,82,77,81,82,79,78,74,83,88,84,86,80,69,71,62,50,64,58,67,69,72,82,89,105,103,98,79,46,33,19,19,19,18,29,37,51,59,51,59,53,55,53,46,51,49,46,49,53,46,44,49,47,46,43,45,48,45,46,52,46,44,44,49,40,41,46,40,43,44,53,45,41,42,42,41,40,45,39,41,40,47,45,41,43,41,39,39,42,38,39,41,38,39,39,38,41,34,40,38,35,40,32,34,36,39,38,37,38,32,33,27,23,28,29,27,29,27,73,117,86,51,21,23,31,27,23,24,23,29,33,32,27,26,30,26,28,28,26,24,29,28,24,33,28,21,29,28,27,24,25,30,31,34,28,25,24,26,22,23,27,24,27,24,27,25,39,49,28,29,24,24,35,81,93,42,24,19,27,32,28,25,22,29,25,27,29,23,22,26,26,25,26,30,20,24,30,30,31,29,32,27,27,22,23,29,35,34,32,33,33,27,27,35,33,29,31,29,33,28,30,29,23,19,20,21,19,24,20,21,26,26,23,17,24,19,22,28,19,27,21,22,23,18,27,20,27,23,67,86,28,24,24,19,22,16,25,23,20,24,22,21,24,23,17,26,21,21,23,23,26,21,24,25,21,33,71,114,98,60,36,18,19,21,23,24,19,24,26,23,24,22,21,28,26,25,26,31,95,138,82,27,78,85,48,91,111,125,136,105,141,168,165,189,203,214,191,165,192,208,208,227,232,240,201,104,116,94,43,28,48,82,118,155,178,174,150,156,140,127,149,155,166,216,249,229,160,186,207,195,198,208,218,242,251,251,249,249,252,252,252,252,252,252,250,250,250,250,251,250,237,221,222,236,250,252,252,252,252,252,237,109,2,0,6,9,12,12,11,11,13,14,14,13,182,181,178,175,169,165,91,74,129,162,171,160,149,153,109,36,42,54,92,120,112,108,123,136,75,46,70,71,83,87,89,103,110,118,130,139,155,160,167,78,28,84,135,205,229,237,244,240,248,245,248,248,248,247,247,242,238,227,219,162,44,25,65,108,191,178,159,135,120,97,74,62,56,56,59,69,67,23,28,43,44,50,33,29,48,78,90,99,73,51,64,89,92,59,84,156,206,227,184,188,221,202,168,138,107,20,14,53,76,89,83,94,98,95,49,61,57,87,128,140,114,134,183,191,35,14,87,92,90,67,9,34,115,128,99,60,118,196,190,204,163,33,7,93,169,177,153,100,43,54,103,131,166,245,251,165,158,199,250,210,72,137,249,250,201,165,188,235,191,149,157,204,250,204,155,165,197,213,165,152,188,208,190,156,171,237,253,175,159,200,251,217,163,184,236,231,147,120,123,102,74,100,138,158,217,227,163,146,162,169,95,60,63,72,34,60,76,115,117,61,36,54,125,84,107,160,247,250,200,193,237,249,208,165,177,214,215,163,154,198,205,172,163,187,207,166,141,144,192,110,88,144,148,206,226,170,139,165,197,170,148,158,188,177,150,156,198,218,173,152,174,216,196,157,151,133,146,147,132,135,135,136,109,70,61,54,57,97,106,103,110,107,108,111,109,108,115,108,110,118,114,117,102,115,118,103,86,56,30,25,25,35,39,47,67,81,93,93,88,83,81,79,85,83,79,80,83,83,88,84,86,83,84,90,81,84,76,77,66,63,63,51,56,56,64,73,71,92,96,100,108,85,55,32,19,21,21,19,24,35,51,60,59,58,55,53,53,53,53,49,46,49,50,51,47,45,45,44,45,44,44,47,45,47,44,43,51,46,44,45,43,46,43,45,46,44,39,44,46,40,45,45,45,43,43,45,42,42,43,42,39,42,37,41,35,36,38,44,41,37,38,34,39,36,35,36,33,36,40,39,41,38,31,36,33,35,33,29,23,35,58,31,21,27,23,70,109,86,57,27,20,24,20,26,24,37,35,24,30,24,20,24,24,28,34,21,28,24,26,27,23,26,31,23,24,27,25,36,31,28,26,24,30,23,27,28,26,47,35,24,22,29,44,30,27,23,22,24,34,73,92,61,27,23,24,29,27,26,28,22,27,29,27,26,30,30,26,24,27,29,19,24,26,28,29,29,33,29,27,21,34,31,34,33,33,33,30,26,29,33,30,31,29,31,29,30,33,29,26,19,16,22,24,22,27,19,22,23,21,27,23,18,19,19,24,24,21,23,23,21,18,25,22,23,84,69,19,26,20,19,23,18,24,21,16,25,23,22,24,19,23,24,21,22,24,27,22,24,22,24,21,23,37,72,106,98,62,32,19,19,23,26,22,19,27,21,25,22,22,28,20,24,25,87,162,171,109,89,149,134,97,121,141,177,193,170,199,191,179,205,205,200,181,172,194,194,199,222,226,242,217,175,211,173,84,29,9,36,104,141,154,137,133,167,143,128,137,131,127,176,234,151,52,74,115,117,120,140,150,204,243,216,183,154,182,206,213,224,202,234,253,253,249,249,251,250,237,213,220,245,252,252,251,251,252,252,231,111,4,1,6,10,11,11,13,12,14,13,14,14,160,159,148,137,128,127,66,59,108,117,121,100,92,89,37,10,41,57,88,112,107,114,120,134,110,110,142,150,171,184,195,200,208,218,222,224,225,227,222,104,21,71,137,208,229,232,233,223,221,214,205,198,193,181,171,157,148,128,108,62,16,21,17,23,23,15,14,12,13,15,14,31,64,91,123,150,147,55,38,110,192,238,235,229,222,185,95,59,70,64,71,71,37,39,68,110,126,103,70,62,85,101,107,117,112,27,24,118,178,186,155,151,148,127,95,95,127,122,104,81,41,57,97,77,9,55,146,180,208,190,46,29,118,125,100,38,83,146,142,208,206,49,3,83,166,188,176,153,126,130,146,141,144,155,146,135,142,165,192,134,50,120,190,178,162,152,152,168,153,147,155,171,194,154,147,150,160,162,139,148,150,152,139,143,148,155,155,141,144,142,159,147,143,154,174,165,139,153,158,178,163,182,212,194,213,181,109,98,73,52,47,55,92,74,38,25,15,51,78,102,154,174,188,188,189,198,206,188,159,160,173,177,147,145,152,160,151,142,151,165,159,152,163,167,167,145,153,165,180,159,155,173,146,159,162,137,148,161,171,157,160,169,169,162,159,171,177,171,153,161,157,160,142,150,161,124,145,145,139,138,139,141,131,111,83,66,66,94,99,97,100,93,95,95,89,104,110,103,105,111,108,101,88,66,45,22,19,27,30,36,39,73,104,107,111,99,91,88,84,82,82,89,88,83,82,82,83,83,85,84,81,86,86,78,70,61,66,66,57,62,66,64,64,63,64,83,83,93,111,96,63,34,30,21,18,18,24,39,45,56,60,61,59,55,51,52,53,54,50,47,53,50,47,46,47,47,49,44,42,47,48,49,48,47,39,41,45,45,41,43,45,46,47,43,41,41,41,41,42,43,44,45,42,46,44,37,39,37,41,39,42,37,35,44,40,38,41,42,39,36,39,38,37,37,33,39,43,36,35,39,34,33,34,32,35,34,33,27,23,29,39,25,21,24,24,23,60,108,93,57,26,16,30,26,32,34,22,23,27,24,24,31,27,24,24,29,29,24,24,23,26,27,27,26,24,29,39,30,23,26,21,26,28,22,32,25,36,38,24,26,25,29,22,26,26,25,25,32,34,45,95,75,33,27,24,24,26,23,27,29,25,25,30,28,27,25,24,28,25,29,25,16,26,30,31,31,32,32,27,22,30,34,30,32,36,36,34,24,35,34,26,33,29,26,33,36,34,25,24,19,21,19,17,22,22,23,22,24,22,24,24,23,26,23,20,22,24,25,21,20,20,27,27,42,91,53,20,26,17,19,21,26,21,26,22,19,23,22,23,18,25,23,16,26,21,22,24,21,27,24,21,23,24,33,69,105,99,57,32,19,20,22,18,29,29,23,26,23,27,23,24,25,62,120,137,120,101,129,150,125,106,130,150,171,181,170,181,178,179,195,198,200,194,185,194,184,172,183,178,188,172,152,195,155,119,123,54,20,38,57,89,97,133,171,163,160,163,154,140,174,241,152,29,77,134,132,134,143,143,184,199,154,113,61,100,141,149,162,117,167,245,246,247,247,252,252,242,221,221,242,249,249,249,234,231,206,182,107,9,1,9,10,12,11,13,12,13,14,14,13,88,89,84,70,68,63,26,39,78,92,95,87,82,89,41,17,53,83,146,176,190,205,229,245,201,185,199,201,220,225,226,227,225,226,222,218,213,212,196,80,23,54,92,162,173,164,146,127,118,100,94,83,73,70,59,46,31,16,11,15,28,39,33,42,49,42,37,22,18,13,25,44,69,83,93,105,87,44,47,116,204,251,245,243,232,146,39,32,75,98,96,40,12,22,45,73,57,49,38,44,79,139,174,194,172,32,20,103,158,150,103,93,80,63,30,70,137,147,157,144,117,127,198,167,45,107,228,252,242,228,76,30,95,88,56,33,108,169,146,165,165,49,1,41,110,162,203,218,205,196,190,175,162,163,150,144,170,166,157,67,50,150,174,168,154,156,163,162,150,163,163,160,161,139,152,155,160,158,150,165,167,165,150,155,158,159,150,145,158,154,155,144,151,159,159,159,159,186,208,218,217,207,178,158,127,54,25,15,41,45,19,33,55,88,116,136,155,174,184,191,205,213,205,193,180,177,175,157,152,165,167,162,151,160,164,166,166,158,158,155,138,130,134,143,146,147,157,165,182,182,178,168,156,162,155,150,154,161,161,156,160,162,170,165,159,144,163,162,159,164,158,158,156,162,163,134,139,151,143,131,133,142,130,128,129,110,105,111,100,93,96,89,84,78,77,77,78,76,71,49,41,35,19,24,19,27,33,68,71,104,121,134,124,112,101,93,91,87,86,80,85,89,88,81,78,84,85,78,80,77,83,80,73,65,57,59,60,66,63,66,70,66,76,75,89,95,91,102,66,35,31,18,20,25,23,31,43,58,57,58,62,57,57,52,53,49,47,55,50,52,51,42,43,47,49,44,44,45,47,47,46,44,43,45,40,43,41,46,45,47,46,39,43,39,42,44,39,38,46,43,46,42,41,45,38,39,40,38,37,40,41,42,41,44,42,39,37,37,34,35,41,36,39,34,33,43,40,37,33,34,37,39,28,29,33,29,39,32,25,21,23,22,24,26,19,33,25,52,108,95,62,35,20,31,25,23,28,22,24,23,29,31,24,31,25,28,21,19,32,26,31,27,24,27,30,39,25,19,25,29,27,23,27,27,27,24,21,28,24,29,34,21,26,24,24,29,24,34,36,36,80,90,46,23,20,25,29,27,26,26,30,26,27,30,24,23,27,24,29,27,25,25,21,30,33,27,31,34,23,23,26,30,32,36,34,31,34,33,29,32,33,27,28,35,29,30,38,29,18,21,25,19,19,20,19,20,23,23,21,24,21,24,24,23,21,21,26,24,19,24,17,28,19,57,89,31,21,25,18,24,21,23,21,19,22,22,25,24,22,23,24,24,22,25,24,24,19,24,23,18,26,25,27,22,33,63,104,98,67,37,21,23,20,21,23,24,21,20,26,25,27,61,92,83,62,48,64,87,63,56,63,74,95,83,95,89,110,120,117,137,142,147,143,133,130,118,124,137,148,162,149,141,163,142,168,191,116,96,101,46,46,74,97,135,154,173,187,185,175,208,248,168,52,113,163,163,160,166,155,193,213,162,110,66,120,153,156,158,91,146,242,243,247,247,252,252,245,228,213,214,225,220,201,185,179,160,144,97,15,2,10,10,14,12,14,14,14,15,14,15,88,94,101,107,107,111,57,72,135,162,177,170,169,177,99,36,63,100,163,212,219,236,245,245,226,190,188,186,193,186,184,177,166,162,151,145,131,127,111,29,13,30,63,103,91,76,59,49,39,35,39,46,53,63,73,79,76,66,49,20,17,39,72,98,96,101,98,103,110,101,99,84,74,68,54,40,22,13,34,67,125,146,125,119,109,105,88,80,90,110,150,106,16,24,89,132,131,120,110,100,110,138,132,102,59,15,14,29,46,47,33,34,27,24,15,46,138,153,170,212,191,228,245,245,102,96,208,232,218,130,14,37,98,109,85,44,142,230,212,190,131,19,12,25,26,57,121,190,212,197,193,184,179,182,171,185,181,183,122,34,98,176,181,170,164,165,163,170,166,166,166,163,167,157,164,161,163,162,168,172,166,165,165,170,166,168,165,167,166,165,170,163,162,160,178,188,184,201,202,198,151,83,39,7,6,10,12,16,42,94,130,152,178,188,198,208,211,212,202,191,179,176,173,163,163,162,167,168,167,162,165,172,171,171,173,170,166,166,157,155,151,143,138,143,155,155,159,163,167,165,164,162,162,168,171,170,171,165,155,155,157,149,141,139,128,118,129,135,139,151,149,154,153,163,166,133,132,140,139,130,125,128,130,127,126,124,117,123,110,109,107,95,86,75,69,66,63,53,46,39,37,46,55,61,83,93,99,121,125,132,129,121,107,91,94,86,83,77,80,77,81,81,77,80,80,89,88,79,86,80,73,74,61,62,66,61,76,72,66,75,72,71,84,95,109,103,72,39,23,27,24,26,26,30,44,50,59,59,55,59,57,57,56,55,46,50,53,53,51,48,53,43,42,47,44,50,47,45,45,43,45,42,46,45,42,45,46,46,42,43,40,46,43,43,41,37,42,44,44,42,44,43,43,38,42,40,33,39,38,38,46,37,39,39,34,42,35,40,40,36,37,36,41,38,34,37,35,36,37,33,36,30,33,34,33,36,37,32,28,28,23,25,26,22,27,27,30,31,48,96,107,81,44,22,23,25,23,27,22,27,30,26,24,23,28,27,25,30,30,24,25,26,26,31,36,28,21,23,23,27,22,25,29,26,28,23,25,30,27,33,24,30,33,24,27,29,30,28,36,36,54,92,67,34,21,21,30,22,26,34,32,27,26,26,27,28,24,29,32,29,28,24,23,28,34,33,29,29,27,22,29,36,33,30,33,29,31,30,34,29,33,33,26,33,31,29,36,27,16,23,23,19,22,18,21,21,19,21,20,29,23,22,20,24,23,19,24,24,25,21,26,23,23,77,77,27,23,22,17,22,21,24,23,20,21,21,23,22,24,24,22,28,22,24,27,22,26,26,22,24,23,21,25,21,25,32,60,104,102,68,40,24,17,22,23,21,27,19,23,29,66,101,77,59,50,49,53,60,48,35,48,50,49,46,48,39,41,51,45,43,49,53,52,41,78,146,168,194,214,222,224,221,238,197,198,203,132,188,191,105,78,55,81,93,118,177,208,216,200,225,247,169,60,116,147,153,159,168,164,202,226,170,112,86,148,173,167,170,96,148,244,244,247,247,252,252,245,237,202,179,182,176,159,145,144,139,144,100,16,2,10,10,14,12,15,13,14,15,15,15,170,178,183,184,190,174,98,110,167,197,210,204,198,195,103,42,55,93,162,193,206,221,236,239,185,141,135,121,115,105,93,84,78,73,66,58,49,42,24,9,29,48,78,116,112,113,119,122,123,128,137,140,146,149,155,156,164,169,167,108,29,51,124,194,215,203,198,180,172,155,134,113,99,86,73,55,28,10,30,83,117,131,130,139,162,184,205,108,107,147,195,173,30,27,111,165,165,150,127,104,107,111,82,27,7,11,33,63,81,76,90,115,130,151,89,88,172,158,139,166,144,160,228,168,19,37,88,75,35,10,8,33,78,83,61,50,137,221,233,234,175,31,9,104,131,52,31,90,155,196,207,202,198,195,200,196,190,133,41,70,156,188,178,168,168,162,169,173,168,170,164,166,171,168,172,167,166,168,166,166,161,160,163,167,166,169,170,171,176,175,184,184,185,200,208,210,177,116,79,57,16,6,11,14,39,88,137,163,188,198,203,216,212,208,201,190,186,181,171,160,158,159,171,173,167,166,164,168,166,165,166,168,170,168,163,160,163,168,173,177,180,179,174,176,175,174,170,163,167,164,165,169,168,169,167,169,167,162,162,160,159,150,161,160,153,145,152,155,139,149,149,147,156,168,170,135,109,123,134,137,138,125,119,120,117,108,103,112,118,123,125,120,117,103,98,107,99,92,96,95,107,109,114,130,124,134,126,121,120,105,109,102,95,93,87,76,68,71,71,76,77,76,77,85,84,81,79,69,73,66,63,68,61,63,62,65,67,78,70,73,73,79,102,91,71,47,25,27,23,24,29,34,47,53,53,60,63,59,53,52,55,55,51,48,50,53,53,55,47,48,46,45,47,44,50,45,46,43,37,47,46,45,47,46,45,48,44,39,45,43,40,44,47,40,44,44,46,46,36,45,38,35,41,36,42,41,38,43,39,41,36,38,34,37,39,35,36,41,36,36,43,38,33,37,36,38,33,33,34,33,38,35,35,35,37,35,33,30,35,30,24,24,24,24,21,26,24,24,24,31,83,106,75,50,24,22,19,21,30,25,26,27,24,27,29,22,27,28,24,27,27,24,28,34,27,27,24,28,22,27,28,21,28,23,25,27,27,34,29,22,35,30,24,27,27,26,29,27,29,31,37,81,84,45,17,18,27,23,27,32,33,26,28,26,25,31,30,23,31,45,24,21,20,27,34,31,30,28,24,26,26,33,36,32,33,29,30,32,31,31,29,32,32,29,33,41,34,22,22,24,18,22,24,21,21,24,21,20,25,21,21,21,22,19,22,24,26,23,23,24,22,16,34,89,56,19,27,16,19,21,22,21,25,24,21,21,22,24,21,24,24,23,24,18,25,25,24,24,20,21,23,23,23,24,21,21,33,65,115,113,71,46,18,24,26,19,25,25,24,66,108,82,55,50,53,54,53,59,42,36,46,48,53,51,54,47,51,63,48,38,41,29,22,87,185,227,244,251,253,253,252,252,253,244,244,199,106,168,181,118,84,65,56,33,61,131,172,191,172,203,247,144,53,98,104,114,128,150,150,193,220,152,93,90,158,172,169,168,100,160,244,244,247,247,252,252,247,245,217,178,176,171,154,141,141,138,154,109,14,2,10,10,14,12,14,14,14,15,14,14,198,197,196,193,194,174,97,107,156,183,194,178,174,170,83,27,45,71,124,148,148,154,170,156,89,74,72,61,63,56,57,61,67,76,80,92,98,108,79,16,37,87,143,193,197,205,206,208,209,211,212,214,200,189,192,184,190,189,202,126,22,37,103,152,164,150,131,125,117,110,105,113,116,129,141,155,114,31,59,128,196,237,246,247,227,212,156,109,135,183,218,155,35,22,78,115,111,86,72,65,77,91,75,64,26,9,69,143,193,203,208,227,233,245,162,121,202,202,174,156,74,45,48,12,8,19,21,26,29,39,31,34,46,59,54,33,103,158,166,224,211,53,15,125,203,174,66,16,53,107,155,184,189,186,174,154,80,24,66,144,192,184,171,170,170,170,169,168,170,171,167,164,168,167,171,165,171,169,163,171,170,168,167,166,171,178,182,188,199,212,215,212,202,166,134,110,39,1,11,23,65,105,143,171,192,214,221,226,216,207,194,184,184,178,174,174,174,174,166,160,155,159,162,166,174,169,169,171,167,171,171,175,174,166,166,160,165,171,169,173,177,177,174,170,169,166,165,167,170,173,177,180,172,154,144,137,135,137,137,150,168,179,184,185,181,176,181,181,174,171,166,170,170,168,169,133,104,108,131,149,138,121,118,116,118,114,105,110,109,124,125,130,133,116,127,129,130,126,123,124,126,121,122,118,113,113,108,107,100,102,101,92,87,86,80,74,69,64,78,72,78,81,82,85,71,69,65,60,67,63,66,66,66,68,73,64,78,86,81,82,87,82,59,39,28,26,25,21,32,44,44,55,59,61,59,56,64,54,54,52,52,59,53,49,47,53,47,50,50,45,45,44,50,47,42,47,47,49,46,43,45,46,43,49,42,40,43,44,43,41,40,42,46,49,45,40,38,40,41,38,38,39,39,36,42,39,43,38,41,39,39,38,34,39,38,39,33,39,36,35,34,34,37,33,36,34,32,34,39,33,32,36,34,33,32,34,30,35,36,28,21,25,25,23,25,21,26,28,23,39,41,78,114,90,61,31,25,29,25,24,28,23,26,31,25,27,23,25,27,29,29,29,30,23,27,21,25,24,27,34,26,28,23,21,29,29,29,27,23,23,24,29,21,25,29,32,31,31,32,28,30,63,88,61,32,21,21,31,31,28,33,27,30,31,27,32,28,21,29,29,21,23,26,27,29,31,34,24,29,24,21,36,34,31,33,35,33,28,29,29,33,35,34,30,29,35,37,27,21,20,19,22,17,21,17,22,21,22,23,19,24,21,23,22,21,21,23,21,22,23,23,16,54,87,39,24,25,19,17,22,27,18,16,24,27,22,24,21,22,20,23,25,20,25,24,25,24,23,24,23,23,22,24,22,24,21,21,34,60,111,110,72,44,24,22,21,25,23,69,105,77,55,44,50,53,49,53,60,52,38,46,55,52,55,57,47,56,60,49,37,42,37,103,205,227,237,230,249,249,249,250,250,253,253,249,215,91,101,109,97,115,94,34,6,12,20,51,78,66,125,200,89,46,85,64,86,96,106,107,157,211,136,68,94,151,169,164,164,105,160,245,245,247,247,252,252,249,249,239,195,182,181,173,160,152,148,163,110,12,2,10,12,14,13,15,13,14,15,15,14,181,180,178,168,165,121,81,85,125,136,136,114,105,93,30,29,39,55,90,90,86,80,87,78,70,86,102,117,129,143,153,162,171,178,183,191,194,207,152,45,42,85,154,209,216,215,208,206,206,200,192,184,162,149,141,123,115,104,97,44,13,41,65,97,103,97,107,112,131,146,155,170,185,203,206,218,142,50,55,139,203,251,236,220,139,68,63,46,103,134,131,82,5,19,44,65,51,46,48,57,91,131,152,170,99,18,84,177,224,232,226,222,211,207,112,89,179,211,222,211,114,65,81,23,7,76,139,162,172,137,40,49,88,74,60,42,117,163,142,165,170,42,15,121,190,197,173,98,22,12,38,64,74,73,55,15,27,86,145,194,191,171,167,164,162,160,159,165,162,166,167,164,164,165,165,164,171,172,171,173,177,181,186,198,205,211,218,217,211,191,159,117,61,83,8,25,63,100,141,173,190,208,217,217,217,205,195,186,181,177,172,174,170,170,175,172,172,171,168,168,166,160,166,167,167,171,166,169,171,171,171,171,170,166,162,165,168,166,161,154,150,155,156,158,160,162,157,152,164,169,171,178,173,165,154,151,148,146,148,160,170,175,184,181,177,173,180,184,182,181,178,175,165,154,159,141,116,105,109,121,118,108,109,112,114,113,114,112,116,124,125,123,122,122,124,125,120,118,120,116,114,110,108,109,100,106,105,107,103,97,104,87,86,80,75,74,65,71,73,79,84,81,74,71,65,67,65,59,67,67,71,68,66,73,76,78,78,85,92,80,60,39,23,22,24,24,29,40,47,57,62,57,57,56,55,57,57,53,50,51,54,54,50,49,49,50,47,49,49,47,54,50,48,43,44,46,47,45,41,46,46,52,43,44,44,44,43,41,42,40,45,48,49,41,37,39,40,42,40,41,38,36,43,41,43,40,35,41,37,36,40,33,36,38,40,39,35,34,32,35,33,39,32,28,39,35,33,35,37,33,33,35,33,28,33,31,31,32,33,34,24,24,25,24,26,26,26,24,29,34,32,47,82,134,111,68,41,21,21,20,22,24,26,22,27,25,23,27,27,31,24,31,24,23,27,26,25,31,37,29,22,27,28,22,24,27,27,27,23,23,25,28,33,29,35,39,28,28,30,28,24,45,87,84,39,20,28,34,29,29,29,26,29,26,31,27,19,20,24,19,25,36,27,24,30,32,24,33,25,24,33,28,33,33,32,34,29,32,32,26,36,36,31,30,35,33,37,29,22,20,19,24,24,17,24,21,22,21,22,24,23,24,28,19,24,24,21,27,21,20,21,23,71,78,28,25,20,16,24,18,22,29,19,24,24,19,24,22,23,24,20,24,23,22,26,22,21,22,22,22,22,24,25,25,21,25,27,22,32,61,108,112,68,40,24,19,23,66,107,78,49,48,50,51,46,46,59,61,49,40,41,48,53,55,52,50,53,62,46,35,43,47,139,189,195,216,206,208,200,194,210,234,250,250,249,214,123,107,88,100,149,156,86,32,11,12,12,17,11,54,147,70,42,77,75,93,77,63,63,141,203,121,56,74,125,143,142,147,96,160,247,247,248,248,253,253,252,252,247,210,189,189,184,177,170,160,173,111,11,3,10,12,14,13,15,13,15,15,15,15,124,114,96,90,89,61,26,59,78,79,77,56,53,36,10,36,61,86,122,139,141,144,151,156,159,182,195,200,212,211,216,219,216,215,212,213,211,215,164,47,32,72,125,170,166,155,135,124,118,105,95,88,79,71,70,61,60,59,57,20,16,60,108,158,169,172,182,189,198,202,204,213,208,213,208,208,142,34,39,97,160,196,170,79,44,57,51,59,66,64,57,25,8,35,77,98,103,102,103,119,155,206,215,232,132,27,82,145,187,170,142,125,95,68,12,49,139,164,200,236,181,164,190,89,42,132,205,225,205,135,49,68,107,84,47,52,150,225,199,177,122,10,21,121,187,191,186,185,157,105,44,36,44,50,71,102,135,170,199,193,179,171,166,160,153,152,150,150,154,159,162,163,167,166,170,171,180,184,186,198,197,201,202,191,178,156,131,101,64,19,5,11,24,47,98,153,178,201,214,210,205,196,187,183,176,172,167,165,162,165,167,164,169,167,169,168,163,163,157,162,165,163,170,171,171,166,166,172,170,173,170,163,152,146,158,162,169,173,163,155,154,157,160,160,163,165,157,148,145,149,153,164,168,169,168,171,177,175,179,176,165,155,155,162,163,164,166,173,174,172,171,174,175,173,177,163,134,105,83,86,96,96,96,103,106,99,105,116,118,127,124,120,122,118,122,122,120,118,118,112,114,109,111,114,106,106,105,103,99,99,97,88,83,84,81,77,76,67,82,81,78,72,60,66,61,69,66,65,66,69,69,77,84,73,81,87,92,73,56,38,27,19,21,29,34,38,50,55,59,61,56,60,59,59,54,53,58,44,51,54,49,55,53,49,45,48,49,48,52,53,49,46,46,44,43,43,48,47,45,47,43,46,44,43,47,43,43,40,43,46,46,41,41,44,43,39,39,35,42,42,44,44,35,43,37,36,37,38,39,34,39,41,34,41,37,32,38,33,34,35,31,36,32,29,39,33,41,37,25,34,31,32,31,31,34,34,33,35,33,31,29,25,22,25,24,22,29,26,27,27,25,37,26,42,99,107,80,51,26,19,29,28,26,20,27,32,21,25,27,29,27,23,27,24,27,28,28,33,28,24,22,26,29,25,28,28,27,27,23,24,25,27,32,33,48,39,20,28,22,24,26,30,70,92,57,28,28,24,30,27,29,29,23,31,27,21,22,24,22,25,37,34,26,29,27,30,31,28,26,22,31,31,33,31,33,36,34,36,27,28,37,30,28,29,34,35,33,31,16,20,19,22,20,20,23,19,23,18,21,21,20,21,19,26,24,27,22,24,24,21,19,32,92,63,16,27,23,16,21,24,19,24,23,25,23,24,26,18,24,24,29,26,20,22,22,26,23,19,23,25,27,24,27,23,23,26,26,27,24,29,59,108,107,71,47,21,75,111,76,47,33,50,48,45,50,49,53,61,47,33,45,44,50,55,55,50,54,63,53,40,39,41,82,122,163,215,212,208,200,185,186,202,209,211,219,171,144,136,104,111,191,235,207,197,188,167,125,80,59,124,172,51,33,92,105,122,84,53,62,143,203,121,50,52,68,84,87,100,73,149,247,247,249,249,252,252,250,253,248,224,195,190,193,182,179,176,176,110,10,1,10,12,14,12,15,13,15,15,15,15,60,63,53,52,42,19,22,53,94,114,128,128,132,100,25,43,81,115,187,200,208,204,212,210,208,210,209,210,206,204,199,191,188,184,174,163,153,151,87,16,26,47,90,108,89,76,66,60,64,64,66,82,89,105,117,128,145,155,150,54,16,73,138,201,208,206,210,205,202,193,189,181,160,146,129,113,50,4,32,74,110,115,100,75,71,108,122,107,136,146,144,85,2,55,126,169,158,143,132,130,165,200,203,198,93,2,50,103,115,88,67,61,67,75,29,71,154,160,152,191,170,171,201,89,33,90,150,137,95,55,22,35,35,39,27,37,138,220,231,226,159,21,21,121,183,190,187,191,196,199,182,169,165,174,184,190,202,198,182,176,168,167,168,157,156,160,159,160,160,163,170,174,178,186,192,195,198,191,177,157,136,114,85,56,24,5,11,10,14,11,30,111,166,190,205,211,207,196,188,179,172,172,171,164,163,165,169,162,159,162,162,165,163,161,162,159,160,158,156,159,159,161,167,171,171,171,167,170,170,165,165,162,161,160,162,169,170,167,163,164,168,170,168,168,166,168,171,165,165,165,166,171,169,170,168,169,172,174,174,174,165,160,155,159,169,173,180,175,162,151,159,173,179,176,180,174,145,104,80,81,87,94,93,88,94,87,96,111,118,126,123,125,119,115,122,122,121,119,115,111,119,112,110,111,108,107,101,102,96,103,104,93,96,88,93,77,79,83,77,76,69,64,66,63,61,72,71,65,75,69,73,80,87,92,71,78,49,38,24,22,26,27,34,39,52,57,55,60,60,61,60,51,54,55,48,51,57,57,52,50,49,49,50,51,49,47,51,55,50,47,43,45,45,47,42,42,46,47,43,45,46,43,42,42,44,38,44,45,42,41,38,36,43,41,39,40,41,41,49,39,36,45,37,36,39,38,37,39,35,37,38,40,39,31,36,36,34,36,31,38,33,29,39,36,36,30,34,33,32,33,34,36,33,33,34,35,35,38,29,35,33,27,28,21,26,22,26,27,23,24,24,26,27,30,36,81,110,84,58,37,21,21,24,20,30,25,25,29,26,31,29,23,24,28,27,33,31,23,25,28,24,29,27,31,28,19,27,28,25,31,27,29,30,28,33,26,24,23,25,32,25,26,49,88,77,37,19,26,31,25,24,27,30,27,24,24,23,22,25,31,31,32,28,32,24,27,34,27,27,26,29,29,30,36,33,33,32,28,31,31,34,36,31,27,27,31,35,33,19,19,23,19,23,22,21,24,21,20,21,20,17,22,24,23,28,24,21,28,22,20,17,59,92,40,19,27,16,23,26,22,22,21,24,26,21,22,24,21,18,26,26,21,24,24,25,21,23,22,21,26,17,23,26,21,27,25,23,28,25,23,31,57,97,108,79,77,118,67,39,37,42,49,46,44,49,48,50,57,42,35,43,48,51,53,54,51,52,61,50,34,37,37,57,61,100,156,172,201,212,212,212,216,210,196,182,133,121,115,97,118,199,246,251,251,253,253,248,247,230,246,223,88,83,123,124,141,112,91,110,188,232,157,83,52,50,51,42,63,60,149,247,247,249,249,251,251,216,237,248,227,205,191,193,180,179,179,179,111,10,1,10,12,14,12,14,13,14,15,15,14,124,135,133,139,128,60,47,101,146,187,200,188,197,146,46,42,78,134,183,201,203,194,192,185,177,171,161,149,145,136,121,111,103,97,90,81,63,46,17,12,39,57,90,110,104,114,121,136,147,157,168,178,187,193,201,204,209,215,195,78,19,67,133,186,189,172,163,147,131,117,102,91,68,61,49,35,17,15,50,100,152,167,168,127,116,142,147,147,187,226,236,134,23,59,135,164,152,123,103,89,104,121,101,84,22,10,58,99,127,117,137,157,176,179,103,128,206,203,169,143,100,112,122,46,23,44,59,69,48,32,27,21,29,30,31,28,78,150,169,227,193,33,17,118,184,184,174,175,182,188,196,203,201,199,193,190,183,177,169,163,165,163,162,164,164,170,172,177,185,190,198,198,191,181,169,146,127,97,64,42,17,23,29,36,47,46,41,31,39,82,146,191,214,199,194,189,176,178,169,167,171,173,174,176,172,174,173,169,168,170,169,167,168,165,167,165,163,166,166,169,170,169,170,168,167,168,167,166,165,163,165,169,169,168,170,169,167,168,167,170,171,167,168,169,165,172,173,177,179,177,175,171,165,165,159,159,158,155,164,167,169,170,172,174,174,179,174,168,159,157,162,165,165,165,169,174,148,110,92,81,84,95,96,81,77,69,79,99,115,126,120,117,119,116,115,118,114,116,116,110,109,106,108,105,101,101,101,102,103,111,111,100,89,79,67,73,80,76,81,73,74,79,73,74,71,72,67,76,79,78,79,92,93,66,52,34,22,22,29,33,34,43,52,56,60,60,59,57,56,57,54,58,59,54,50,53,55,55,54,47,53,49,45,51,54,52,45,48,49,49,48,45,49,46,47,44,48,46,45,46,42,42,40,43,47,43,43,41,39,39,43,43,36,43,43,41,42,41,40,37,34,40,36,41,38,35,45,34,43,39,37,38,35,41,33,38,39,31,36,33,33,39,33,33,33,33,35,33,36,31,28,33,34,35,36,38,29,33,33,29,35,29,27,22,25,22,30,23,20,27,24,28,29,29,27,34,69,113,104,73,48,24,22,25,28,22,23,31,24,28,23,28,28,27,29,21,27,29,29,24,27,32,29,26,28,21,20,24,26,33,30,26,31,32,23,24,30,29,28,30,28,29,33,76,92,47,27,25,25,27,27,26,22,27,25,25,27,23,30,24,28,32,26,34,31,34,29,26,23,25,34,33,31,33,37,34,30,33,34,31,33,35,28,29,28,29,30,25,24,21,22,24,23,22,25,23,22,22,24,18,20,24,19,22,22,23,22,21,23,20,28,81,80,27,22,23,15,18,23,23,25,23,19,24,22,20,23,22,22,22,24,21,26,22,19,24,24,24,25,24,18,27,27,21,29,25,28,29,19,26,32,31,53,99,133,130,83,33,39,42,38,43,46,54,55,60,60,52,49,51,48,48,53,49,49,51,52,54,42,35,38,43,46,47,39,58,79,114,149,162,183,202,209,213,185,125,100,88,83,89,141,193,219,244,246,250,253,253,253,253,244,181,198,188,136,134,126,134,181,242,246,226,171,140,130,127,112,120,110,179,250,250,250,250,246,229,168,208,246,235,216,188,196,180,175,183,172,108,11,0,9,10,13,12,14,13,15,15,15,14,190,199,186,190,172,82,64,108,161,194,198,189,185,136,41,33,59,105,144,157,146,127,121,109,96,87,81,72,64,60,60,57,55,61,66,70,78,84,33,14,55,97,159,186,187,195,201,203,210,208,207,210,209,210,206,201,193,191,170,56,12,43,85,131,117,101,84,75,66,56,62,70,79,94,113,132,63,18,89,162,235,253,249,191,137,147,141,137,177,215,227,130,16,33,88,113,94,71,57,50,71,81,85,87,26,17,101,167,200,199,207,210,213,196,108,129,205,232,207,178,87,45,61,18,25,29,33,34,30,36,22,33,36,38,48,34,78,139,132,169,158,23,19,109,173,181,167,165,163,169,171,172,171,170,166,165,166,164,166,169,168,171,174,173,179,191,193,192,190,176,160,135,103,77,49,27,23,46,70,92,115,136,156,169,183,190,190,178,122,158,158,153,148,146,148,152,155,155,155,157,153,156,160,160,163,163,169,169,170,168,170,171,165,163,165,163,161,160,163,168,167,165,165,164,160,163,163,162,165,165,167,166,166,170,170,169,171,172,170,174,173,168,162,162,164,164,165,159,159,161,158,158,155,148,153,160,165,163,165,171,169,171,171,174,173,169,167,162,168,168,165,165,159,162,164,168,165,132,104,94,74,92,102,82,77,55,61,73,97,114,115,119,112,116,117,114,113,114,107,107,104,102,107,103,106,114,110,108,105,103,97,77,66,55,73,72,69,83,77,77,77,79,75,74,77,78,83,76,87,89,61,58,56,29,37,24,25,37,38,51,55,53,62,60,60,58,58,56,54,57,54,55,55,57,53,51,55,52,51,50,47,52,53,51,50,48,50,48,46,47,48,47,50,49,47,42,45,46,42,49,44,44,45,43,48,43,44,46,40,39,34,39,40,37,45,42,36,39,40,38,36,39,39,39,39,37,37,37,39,32,37,38,32,36,32,33,34,29,35,36,34,34,33,31,33,35,35,32,34,34,35,39,31,29,33,30,29,30,29,33,27,29,29,25,23,19,24,26,22,21,29,26,31,27,47,52,24,44,101,115,84,55,33,26,23,20,27,30,23,21,29,25,29,28,28,29,24,27,27,26,23,27,29,23,23,26,27,28,29,33,30,27,29,27,29,27,24,31,38,33,22,31,33,50,89,67,32,21,21,26,24,27,25,24,27,24,26,24,29,35,29,28,27,34,36,33,33,29,26,24,31,32,36,32,30,38,35,29,29,33,29,35,28,29,34,28,27,25,21,19,21,21,19,22,20,17,22,18,21,21,21,20,22,23,23,22,23,23,27,19,40,96,57,21,24,21,18,20,24,24,22,24,20,22,24,24,21,19,27,26,22,22,22,21,23,21,26,27,21,26,22,25,20,23,34,16,24,23,26,24,24,26,50,118,124,108,79,29,54,39,36,34,44,49,54,64,57,62,64,61,50,53,54,55,53,48,53,51,41,42,42,40,43,46,44,48,35,32,86,62,89,127,147,165,158,122,115,101,83,73,118,171,188,187,164,162,174,190,201,211,203,193,220,185,102,63,57,94,182,242,251,251,248,243,242,240,230,229,207,241,252,252,252,252,246,235,186,229,252,250,230,195,201,184,175,185,172,105,12,0,10,10,13,12,14,13,14,15,15,14,186,188,179,181,146,63,62,94,136,173,163,152,142,84,31,31,46,78,98,95,77,65,63,59,61,68,73,81,98,109,121,136,145,151,162,172,175,182,93,25,63,116,182,212,212,213,205,205,195,188,178,164,156,146,132,117,96,90,63,14,18,32,71,98,89,92,103,116,130,142,156,174,182,200,206,214,127,39,94,168,250,251,248,208,137,139,130,113,113,122,119,45,13,29,47,64,52,53,69,88,139,177,193,190,68,31,112,182,218,196,186,167,147,112,49,90,157,183,213,222,130,47,27,13,26,26,19,23,23,22,34,43,65,75,69,99,164,213,197,175,108,7,25,119,178,176,162,165,165,164,164,162,164,166,167,167,167,176,177,180,190,191,198,196,177,161,139,116,90,55,36,25,33,56,83,111,138,160,172,185,193,199,195,192,193,188,189,183,184,176,159,148,144,138,144,143,139,143,143,143,145,141,142,149,145,145,151,153,152,154,155,158,151,150,155,154,158,159,159,160,158,161,165,159,158,159,160,160,163,165,160,162,167,169,169,170,171,173,168,170,168,164,166,169,166,167,162,160,161,163,166,167,163,167,171,178,177,172,174,171,169,164,162,165,165,165,168,165,162,169,174,174,174,169,171,180,183,168,122,88,78,87,87,83,90,77,65,59,78,108,111,113,117,115,116,116,116,114,112,108,107,113,114,111,113,117,107,97,82,67,63,65,69,73,78,78,81,81,85,75,73,81,70,76,86,83,94,97,69,54,35,27,26,27,36,36,48,53,59,61,60,61,60,62,64,61,53,53,55,56,52,55,55,49,49,52,52,53,55,49,51,53,50,46,48,53,44,47,49,53,51,46,47,46,46,41,38,46,46,47,40,45,47,41,44,42,40,39,36,41,39,41,44,42,42,39,38,40,33,39,40,34,37,39,39,37,38,37,33,31,32,36,35,36,32,37,34,33,39,32,34,34,33,35,32,32,35,36,34,35,33,33,29,30,31,27,33,33,29,32,30,30,34,26,25,25,28,27,22,27,23,24,31,27,30,28,21,27,28,65,110,100,72,45,24,24,27,25,24,23,25,26,27,25,24,29,27,19,23,31,32,29,26,25,27,31,25,32,29,29,26,29,29,25,33,24,25,33,29,27,26,26,28,33,76,87,52,26,16,25,24,24,23,29,23,26,31,25,37,33,27,24,30,39,32,31,35,35,24,25,31,33,34,26,38,33,31,33,28,33,31,33,29,33,29,29,31,29,17,21,22,19,19,21,21,22,23,20,21,24,23,25,19,19,23,17,24,21,23,18,61,94,36,24,30,17,15,19,26,21,22,28,23,19,25,23,20,21,24,21,23,21,20,27,22,27,22,23,26,21,22,21,28,33,34,25,25,27,22,27,25,52,110,98,61,74,78,78,69,42,39,38,53,48,45,56,52,57,57,54,55,50,54,55,56,54,51,55,52,50,49,44,46,49,50,52,50,46,45,33,28,35,53,89,110,124,141,126,101,103,141,180,199,188,164,150,139,146,150,151,148,150,161,130,52,15,9,27,118,193,240,250,252,252,252,252,252,252,253,253,252,252,252,252,248,246,232,251,250,250,242,199,210,192,180,190,167,102,13,1,10,10,13,11,13,13,14,15,15,14,152,149,136,113,83,46,53,73,99,125,98,81,63,27,18,41,62,92,106,111,116,122,132,143,153,165,172,179,190,197,208,208,211,212,211,214,213,203,102,29,53,99,162,188,184,169,157,143,128,113,99,92,83,73,66,59,49,61,45,10,29,73,133,168,167,178,187,204,208,213,219,219,222,221,219,213,108,19,55,114,183,210,191,137,118,132,124,89,27,5,10,18,32,38,49,51,41,41,71,121,174,196,200,185,71,28,89,143,165,136,113,84,57,33,24,91,135,142,159,213,132,10,12,15,28,33,30,24,21,28,29,60,118,162,165,152,175,212,215,211,127,4,34,125,176,175,166,168,166,166,164,165,168,167,177,186,192,198,192,182,167,147,130,105,74,51,27,28,48,66,97,125,147,168,184,198,200,199,196,188,181,174,167,161,160,159,162,165,169,173,166,159,156,156,155,153,155,154,155,154,154,157,153,153,154,151,152,149,149,149,149,148,151,151,150,151,155,158,155,157,159,158,156,156,155,156,156,157,158,160,162,167,169,166,168,166,168,169,169,171,167,168,167,167,167,168,168,171,177,174,174,176,173,174,177,173,172,170,168,166,162,162,161,160,160,163,165,165,162,152,157,171,172,173,169,168,169,169,146,122,89,78,64,63,106,96,96,98,118,134,128,133,127,120,120,118,122,122,117,121,112,117,121,108,100,90,83,74,72,61,68,79,85,86,90,81,85,90,83,75,79,85,86,93,86,76,55,40,29,29,29,35,38,43,52,54,64,63,61,66,60,59,57,53,59,55,55,53,55,56,51,51,51,53,51,50,53,55,54,53,48,50,52,50,45,49,49,49,51,49,53,44,46,46,42,42,39,40,49,49,42,40,43,45,45,43,43,41,37,40,42,39,43,38,39,40,39,33,37,41,42,41,38,33,34,36,33,37,41,34,32,37,39,37,39,39,37,33,38,32,29,37,33,34,36,33,38,35,33,31,29,37,29,34,30,21,35,30,30,37,28,25,27,31,32,21,22,26,22,27,29,24,27,29,28,28,27,27,24,28,53,101,111,88,62,31,20,17,27,26,29,32,24,26,27,30,21,27,32,27,31,23,22,28,26,30,33,33,35,26,22,32,30,24,26,32,24,24,28,24,34,27,26,29,48,89,71,33,24,23,24,28,25,28,21,28,29,22,27,25,25,25,28,34,30,33,35,29,25,29,32,31,35,36,35,37,31,29,30,34,35,34,31,31,29,32,31,24,23,24,18,21,22,20,23,18,20,21,22,19,16,25,19,23,22,18,23,23,26,27,85,74,22,30,19,19,21,19,23,18,22,24,24,24,22,21,19,22,22,21,26,26,23,22,23,24,25,20,27,25,19,26,24,34,29,21,31,25,27,28,67,110,86,54,12,61,128,125,100,76,100,100,107,86,66,57,37,74,72,41,64,45,44,52,51,49,43,55,51,54,46,46,48,51,56,54,55,51,55,47,46,42,27,53,87,131,168,165,130,120,134,181,223,232,219,204,195,189,188,182,171,167,160,144,108,74,48,49,79,86,130,185,224,248,252,252,252,252,252,252,253,253,252,252,251,251,252,252,252,252,247,208,216,203,187,198,168,96,15,1,10,10,12,12,13,13,14,15,15,14,78,69,58,43,23,21,44,55,72,94,91,89,93,43,13,53,97,148,181,196,201,203,210,214,212,213,212,212,213,211,213,207,201,198,191,182,181,163,63,20,39,73,118,118,108,91,79,77,70,72,81,95,105,120,134,148,155,169,139,33,38,103,170,216,218,222,220,215,213,201,192,179,163,143,129,112,30,1,33,49,76,75,93,101,109,132,123,105,59,20,14,21,34,46,60,76,71,79,91,100,136,131,122,91,27,33,61,92,94,81,71,56,48,29,32,123,188,178,145,156,90,9,15,15,27,34,34,27,28,26,28,51,86,105,100,99,104,128,160,208,149,16,21,105,167,179,174,176,174,181,189,192,193,187,181,170,150,128,105,79,54,27,21,41,63,92,116,141,159,178,194,200,200,199,194,185,179,175,168,167,166,163,162,160,162,166,170,166,168,169,168,164,162,160,158,160,167,165,161,158,158,160,160,160,155,157,160,156,155,154,155,157,158,158,159,157,159,159,154,157,157,157,158,154,157,159,156,155,158,160,163,165,166,165,162,163,164,160,162,168,165,166,169,167,165,168,166,167,170,165,162,162,158,161,161,165,167,170,169,164,163,167,171,171,167,167,171,174,167,159,159,159,158,152,153,152,155,166,194,219,192,122,54,65,96,95,118,124,126,134,128,132,132,125,123,119,116,115,110,106,104,98,89,74,69,75,84,92,83,75,83,94,92,96,93,86,93,91,87,75,95,105,84,63,43,28,26,29,30,41,45,54,55,56,61,61,62,60,65,60,56,60,59,55,57,55,54,57,54,54,53,50,51,53,53,51,55,54,53,50,50,50,50,47,46,50,51,49,47,48,46,46,48,44,46,45,46,46,43,46,42,40,44,44,46,43,43,46,38,40,38,35,41,35,39,39,34,42,42,39,40,42,36,33,36,35,35,34,33,34,37,37,38,39,35,32,34,38,33,36,34,33,33,34,38,30,31,31,30,33,28,33,32,24,34,34,34,35,32,29,24,31,30,31,30,26,25,24,26,22,22,24,24,27,30,32,23,40,69,36,21,30,78,123,100,76,46,28,24,29,33,29,29,27,24,27,24,36,29,24,26,22,27,30,32,31,27,33,27,27,27,31,33,22,27,32,25,27,31,30,27,36,40,27,31,72,87,55,27,17,22,25,23,27,27,27,26,26,31,25,26,25,29,32,29,33,34,27,23,29,31,33,36,33,36,33,29,29,29,33,34,36,36,32,29,34,33,29,19,19,16,21,27,19,23,21,22,22,23,23,20,22,22,23,22,22,17,25,23,51,99,53,23,27,15,19,26,19,25,23,23,29,19,28,23,17,26,23,21,21,26,24,23,20,22,24,22,21,26,24,23,24,23,26,25,27,25,24,33,80,113,77,55,16,42,164,185,160,127,107,126,149,155,119,91,69,82,106,106,101,83,54,48,50,46,35,43,42,44,45,39,42,37,44,52,47,49,53,58,55,51,47,43,54,71,128,180,171,152,147,146,189,250,251,250,250,246,243,242,242,238,226,222,219,208,204,186,162,117,60,77,96,115,144,184,229,242,246,246,248,246,246,251,251,253,253,252,252,253,253,249,209,214,208,197,207,178,100,12,1,9,9,13,12,13,13,14,14,14,14,57,59,61,67,23,27,50,70,116,138,171,178,169,99,41,65,106,161,198,208,211,206,210,203,198,194,188,186,177,168,158,148,139,126,116,102,85,53,11,18,42,68,93,93,98,104,107,122,141,154,168,180,189,196,205,211,208,228,169,57,41,87,161,198,194,178,156,145,126,113,103,89,79,59,56,44,12,16,53,93,116,110,125,134,136,138,139,135,128,146,121,51,34,77,137,157,146,133,121,109,95,78,62,49,27,31,47,57,60,50,45,35,33,30,49,145,217,232,190,155,61,6,25,9,26,34,26,27,29,32,24,46,69,75,83,92,103,136,146,186,160,32,56,136,178,195,191,197,194,191,182,162,141,112,89,64,35,21,31,51,72,101,129,147,165,183,195,200,202,196,190,181,175,171,165,165,164,163,162,159,156,158,159,157,157,160,160,160,162,163,159,159,156,152,155,152,159,160,158,155,151,150,153,158,155,153,160,158,156,159,157,155,158,157,155,156,154,151,147,151,151,156,157,157,160,159,159,155,155,156,153,154,156,158,156,157,159,155,158,161,157,159,160,159,159,156,150,150,147,142,148,149,148,151,155,160,166,165,170,165,165,163,165,173,166,168,168,170,174,165,165,160,151,145,153,155,152,152,195,252,252,209,89,59,94,105,123,113,99,97,92,101,108,106,102,91,84,83,81,83,86,77,78,69,76,91,98,92,96,88,92,97,87,93,93,90,101,101,103,89,69,50,33,33,24,29,40,46,54,55,58,61,63,60,61,64,60,58,57,61,56,57,55,55,57,55,53,53,56,51,49,53,53,48,54,54,52,53,51,49,46,49,50,47,50,47,45,49,46,44,49,46,46,44,47,46,39,42,40,42,45,41,44,44,45,45,35,45,37,34,40,36,42,39,35,37,41,41,35,38,37,36,36,37,35,34,38,33,35,43,36,33,34,29,35,37,34,34,34,36,34,35,35,36,31,30,36,31,29,29,31,26,31,36,33,35,27,29,31,26,27,28,28,33,33,28,23,24,26,26,24,22,26,31,24,26,25,53,60,25,21,30,25,50,71,115,96,65,44,29,20,24,20,23,28,24,31,27,24,27,24,22,29,33,32,27,34,32,28,33,26,27,29,29,27,24,28,32,36,25,29,36,29,27,23,50,88,71,39,22,20,26,26,27,23,24,24,30,31,26,28,21,25,31,34,35,32,29,22,27,33,35,33,33,36,33,32,29,27,31,32,35,33,28,34,30,35,31,17,20,20,24,22,21,23,16,25,26,24,20,19,20,19,26,19,21,22,23,18,72,94,29,25,27,15,22,18,27,22,19,21,20,25,22,26,24,22,23,19,24,22,18,24,28,22,23,23,27,24,22,24,26,26,24,26,23,30,45,94,111,71,48,21,28,137,219,201,168,151,139,116,116,133,125,115,100,115,117,119,123,113,103,99,99,101,89,94,100,111,114,102,86,60,44,55,49,35,33,36,50,47,42,41,47,41,68,113,132,147,161,160,206,252,252,252,252,253,253,253,253,252,252,250,250,250,250,246,239,204,165,159,116,59,33,55,89,113,132,158,185,198,206,212,220,231,239,244,246,250,250,232,171,188,191,183,200,180,103,12,1,10,10,13,12,12,12,13,14,14,14,145,151,154,108,57,53,46,78,132,178,198,199,195,105,38,61,98,147,179,184,171,161,153,147,134,123,115,107,99,86,78,69,62,59,58,49,58,47,10,29,66,112,162,175,177,181,186,189,196,197,194,196,193,196,194,191,187,190,129,23,22,57,101,127,102,84,72,65,67,73,84,95,107,120,141,136,37,25,102,159,208,200,207,191,149,145,138,137,157,184,163,73,39,79,135,153,125,109,99,98,92,77,62,38,23,25,23,30,34,28,22,19,21,29,37,100,173,207,221,210,80,7,22,11,26,27,27,26,24,31,26,69,131,149,151,150,150,148,138,146,100,57,135,200,204,185,164,143,121,97,68,42,24,27,44,69,96,117,148,164,179,187,198,199,190,187,178,181,175,170,163,160,165,163,161,159,163,166,158,152,152,155,160,159,159,159,157,156,157,150,152,157,158,154,149,153,159,159,160,154,150,149,157,159,159,158,156,157,156,159,157,158,158,152,147,152,150,144,144,145,150,151,155,152,153,153,151,147,150,150,145,144,142,151,149,151,152,143,148,150,144,143,139,139,138,131,131,132,139,141,141,147,146,144,144,137,133,132,131,132,127,123,128,134,139,143,146,147,145,148,148,145,143,139,151,155,147,136,163,232,253,253,135,66,67,93,121,104,99,95,89,86,91,97,88,74,77,85,76,69,75,88,96,93,98,103,103,103,96,98,100,101,84,86,101,93,94,80,54,36,32,30,33,39,45,51,55,64,67,65,60,62,64,59,57,58,60,54,50,54,60,54,54,57,55,53,56,51,47,56,50,57,55,49,51,51,49,52,53,49,50,49,49,48,49,45,47,46,46,53,47,46,45,48,43,37,46,44,39,44,42,44,41,46,44,39,42,36,35,41,35,36,39,40,39,42,41,36,33,36,40,36,34,31,37,41,39,37,37,33,34,38,34,34,33,30,32,35,33,36,37,26,34,33,31,31,29,34,31,32,31,33,35,30,29,24,29,31,31,25,27,32,32,33,33,30,29,27,26,23,25,25,39,40,22,28,27,27,27,26,28,28,28,24,31,73,113,118,93,56,36,27,21,22,27,25,24,26,26,27,27,25,31,32,32,33,27,29,27,34,36,28,26,26,31,25,26,34,29,21,31,29,23,21,28,31,69,91,57,27,15,28,28,27,27,24,30,26,27,23,25,21,25,36,30,29,33,31,25,29,34,32,38,35,37,31,29,33,26,34,30,30,29,32,30,33,33,29,22,20,20,22,23,19,22,20,25,19,20,24,21,22,24,21,20,25,20,22,40,97,62,20,30,14,20,22,21,23,22,23,22,27,24,22,23,23,28,20,23,26,21,22,22,25,21,24,21,20,28,24,24,24,24,25,26,31,68,115,104,62,41,20,35,118,197,210,183,176,170,150,117,100,109,124,129,117,121,118,120,135,139,139,134,132,137,139,149,159,165,160,158,147,133,139,119,95,73,51,48,36,46,40,35,44,26,28,47,65,96,120,132,182,248,249,251,251,253,253,253,253,253,253,252,252,253,253,252,252,251,251,246,210,141,68,25,26,19,46,108,158,175,174,171,165,171,180,177,185,195,191,157,116,142,147,141,158,152,99,15,1,11,10,14,12,13,12,13,14,15,15,185,186,175,102,60,49,42,71,109,152,173,174,160,71,24,43,69,112,128,116,104,87,82,72,66,63,59,61,70,81,85,98,110,118,137,143,159,128,42,35,86,144,197,210,208,200,198,201,193,184,176,165,160,147,132,118,92,91,45,6,22,42,86,98,97,105,114,134,147,161,173,182,193,198,210,186,78,39,98,162,208,195,199,168,130,132,133,120,101,103,75,35,32,45,78,89,83,78,61,54,50,37,29,27,21,23,32,29,27,26,24,28,27,38,33,73,124,146,192,227,101,7,17,10,25,29,24,25,16,19,27,40,69,74,72,71,63,63,45,47,30,44,138,155,127,92,50,30,28,34,57,81,107,130,155,179,186,193,201,200,191,185,178,171,167,155,154,151,157,155,158,160,164,158,151,162,160,163,162,157,154,156,159,159,158,159,162,160,154,153,148,151,158,160,161,153,157,155,154,152,151,149,150,153,150,151,153,152,154,156,157,153,158,156,152,154,150,156,154,150,150,148,146,142,149,150,152,153,147,149,152,150,150,149,149,148,146,146,146,147,143,139,142,136,138,139,136,145,145,137,139,137,138,136,135,125,106,109,111,107,109,109,115,122,122,124,118,112,113,121,124,129,135,135,147,139,126,122,152,199,249,250,158,77,52,79,107,99,104,98,100,104,105,108,98,96,101,95,87,89,86,98,100,101,106,108,107,98,104,106,107,107,87,78,60,38,38,30,27,36,43,45,52,53,57,60,63,68,63,64,57,62,60,53,58,56,56,57,55,52,59,57,50,57,50,55,53,51,53,49,54,50,50,53,50,53,47,53,51,48,57,47,49,46,48,49,41,47,48,48,53,50,49,39,43,44,43,42,43,44,43,45,43,44,45,34,37,42,36,38,40,40,39,39,35,38,38,34,41,37,38,37,35,39,39,35,33,33,38,36,31,35,33,34,37,35,34,32,29,34,37,33,31,34,29,33,33,27,29,31,35,27,29,34,29,28,29,30,26,29,30,33,28,31,27,27,26,25,21,23,25,29,46,33,23,22,25,27,27,33,25,41,53,27,24,26,46,102,134,113,75,50,27,24,30,25,25,22,25,24,25,31,30,34,30,30,26,25,36,31,27,28,26,27,26,29,32,21,24,25,31,29,27,26,29,53,61,88,77,33,27,23,23,26,19,22,27,23,26,32,24,24,32,30,38,30,34,28,25,29,31,39,27,37,35,31,35,29,28,35,33,31,33,28,36,34,33,27,22,18,17,21,21,24,18,19,22,20,19,24,21,24,21,21,28,17,24,19,61,92,35,25,28,16,23,23,19,22,20,24,24,21,22,24,23,29,23,18,27,23,23,22,23,22,22,23,24,27,21,23,29,26,21,25,33,80,124,83,54,38,19,33,107,183,200,179,172,181,177,160,139,131,116,115,126,131,142,130,145,146,152,155,145,142,148,156,161,160,163,158,161,161,169,165,155,136,130,120,108,108,98,92,82,79,67,53,36,35,45,53,67,108,170,214,235,245,250,250,252,252,252,252,253,253,253,253,252,252,253,253,252,252,238,178,143,93,53,61,96,153,189,202,203,190,175,165,157,156,157,149,130,118,139,134,120,122,117,87,20,2,10,9,14,13,14,13,13,15,14,14,176,169,146,72,37,46,46,66,86,114,110,101,72,17,27,47,64,90,86,81,80,84,93,104,113,125,141,151,164,174,183,191,200,200,209,203,213,163,57,45,84,145,188,198,192,183,174,160,145,126,109,96,85,73,63,55,45,49,28,8,52,100,153,181,183,191,196,202,207,206,206,204,199,195,195,162,59,22,67,116,140,129,131,122,113,125,127,109,71,56,47,26,27,32,42,45,39,36,23,24,21,19,18,23,30,26,33,32,29,46,55,62,71,78,97,144,177,152,153,178,76,7,14,8,24,32,25,19,17,17,21,30,27,29,30,25,32,50,48,40,39,36,44,43,49,66,83,103,126,146,164,184,195,200,202,196,183,173,176,169,167,162,160,154,148,148,139,146,149,145,150,153,153,156,154,152,153,157,158,156,160,154,152,155,150,153,162,157,156,151,150,150,153,154,155,155,150,148,148,150,153,150,151,150,150,152,146,147,151,152,145,147,153,155,153,154,156,157,158,150,149,145,147,141,143,149,150,153,150,149,148,151,155,151,153,154,151,145,152,152,149,153,152,155,164,161,157,160,155,153,149,147,148,149,154,144,137,141,145,157,155,152,148,144,143,141,135,130,137,141,139,139,141,143,137,116,106,108,160,191,204,204,158,136,99,102,98,87,108,98,99,106,111,115,108,107,105,103,97,95,97,93,95,101,105,115,110,90,100,96,79,66,36,29,30,33,41,39,48,53,55,53,60,65,56,58,60,65,60,54,58,55,57,57,59,53,56,60,55,56,53,54,53,49,54,57,52,51,55,58,51,55,54,46,53,49,49,57,52,48,49,48,49,49,51,46,47,46,49,48,47,49,46,49,43,41,44,37,42,44,43,42,39,38,42,41,36,42,37,42,39,36,36,38,40,39,39,36,38,39,36,42,42,36,37,34,35,30,32,36,36,32,33,36,36,34,31,38,33,30,31,36,31,26,33,35,34,33,29,30,34,35,31,31,31,26,31,29,29,33,31,26,29,31,25,28,32,29,21,23,27,25,24,25,24,26,22,29,32,26,25,48,46,21,23,35,30,29,54,103,135,118,79,48,29,20,17,22,21,24,33,31,32,28,29,29,27,36,35,27,24,29,31,24,33,31,28,27,27,25,29,28,29,24,30,53,39,61,86,61,29,21,26,24,24,30,29,23,33,31,22,27,33,35,33,36,34,27,24,26,34,35,32,33,34,35,30,28,32,34,31,37,33,27,35,29,35,28,20,22,18,21,21,28,21,18,23,20,17,23,20,19,21,19,21,21,20,29,90,69,19,27,22,17,22,22,24,26,22,23,19,23,22,20,22,24,28,26,25,22,21,22,26,21,21,23,23,27,24,25,19,24,27,53,113,116,79,46,32,27,41,92,160,195,181,168,174,182,173,168,171,155,131,111,112,139,166,170,173,174,170,164,154,154,158,162,165,162,162,158,156,160,163,160,150,149,158,158,148,148,144,118,112,110,91,81,58,41,36,27,17,25,58,89,117,132,149,181,217,241,246,248,251,251,253,253,253,253,252,252,252,252,252,252,248,201,156,124,113,164,211,237,246,226,198,186,177,174,177,163,156,161,180,171,148,134,112,83,20,1,10,9,14,12,13,14,13,14,14,13,107,112,86,46,40,45,51,79,89,94,78,65,45,15,39,66,99,137,143,162,167,173,187,192,197,201,204,211,211,210,207,206,208,199,192,177,183,126,30,29,57,102,144,144,134,113,101,86,69,63,63,64,76,93,103,126,133,157,89,17,66,120,189,213,210,209,203,202,193,185,170,154,140,122,119,80,15,27,56,89,107,103,136,128,122,128,129,102,48,32,31,24,22,22,25,26,21,24,18,23,22,17,25,24,30,39,30,31,42,83,152,114,177,181,186,212,225,191,156,129,39,14,17,10,24,29,28,21,20,24,30,35,42,35,35,42,43,44,45,38,30,24,76,123,142,160,168,183,192,188,187,185,179,173,166,162,151,150,152,147,148,149,147,146,147,148,153,147,152,148,145,145,150,149,143,150,145,152,150,150,158,151,152,152,153,152,149,151,153,158,153,152,147,144,151,151,149,142,146,152,154,152,158,155,153,155,152,157,153,145,148,149,153,158,153,147,146,154,152,149,148,149,151,149,151,145,149,156,150,148,145,143,153,156,157,156,148,155,154,152,157,146,151,157,159,153,148,145,150,158,155,144,147,153,155,153,142,158,167,168,168,160,155,155,163,160,158,138,159,160,158,151,146,150,145,122,98,84,118,171,192,164,129,141,128,127,109,94,94,75,73,84,101,105,102,115,106,104,102,94,103,91,98,108,112,87,101,73,52,41,34,35,32,40,45,45,53,51,55,59,63,63,63,57,59,61,57,60,52,53,56,57,59,55,57,59,59,59,54,55,52,49,54,50,56,55,54,54,49,53,51,53,52,54,53,51,55,52,49,51,48,44,51,49,45,51,49,47,46,49,47,42,47,45,47,40,41,47,40,40,42,39,39,41,38,39,41,42,42,39,40,36,37,40,32,38,39,39,42,38,43,34,34,36,32,35,35,36,34,33,37,30,32,33,39,31,29,31,31,35,34,35,32,32,31,33,29,29,36,28,28,30,34,31,25,31,33,29,26,26,30,28,25,28,26,26,27,29,29,27,24,22,26,25,23,27,25,25,27,27,24,24,24,29,34,31,23,23,27,33,72,125,134,105,74,47,29,19,29,29,27,28,31,34,27,28,34,33,28,28,31,23,28,28,26,33,26,28,29,27,32,30,28,24,24,30,21,37,81,84,49,19,21,27,25,25,23,25,30,29,24,29,36,35,35,32,37,25,23,31,36,35,34,34,36,38,32,35,32,32,31,32,33,32,31,33,38,23,22,21,18,18,20,26,20,21,22,20,19,26,21,19,20,24,19,19,21,52,89,39,20,28,18,19,21,21,27,20,21,23,18,24,18,23,23,23,24,22,24,24,24,21,24,24,25,26,24,25,22,21,25,30,78,114,92,64,43,32,31,50,89,134,181,193,181,175,176,179,177,179,174,165,149,131,102,109,145,164,178,180,178,173,162,156,155,158,162,162,163,162,156,157,155,151,152,150,153,149,148,150,141,132,130,114,100,78,62,53,53,44,28,28,51,80,86,82,77,84,109,124,138,157,182,200,206,218,226,231,247,248,253,253,253,253,253,248,222,193,189,218,237,238,229,211,191,194,194,200,208,191,186,197,217,212,194,172,147,96,14,1,10,9,13,12,13,13,13,15,14,14,47,39,27,27,42,47,59,100,125,152,147,154,115,35,44,79,136,183,200,210,208,210,210,205,204,199,194,190,179,169,159,154,139,125,110,87,81,40,8,26,26,59,80,78,79,71,81,92,109,128,142,160,178,189,200,210,208,214,127,33,59,112,182,204,188,175,159,144,133,120,113,105,91,91,79,50,25,29,56,101,118,130,168,163,140,137,132,91,25,8,18,21,25,21,24,21,22,29,19,21,22,19,26,19,24,33,31,29,39,83,131,134,120,121,127,142,179,189,180,137,40,14,16,10,24,31,30,35,39,34,42,51,51,48,54,59,62,61,57,50,53,105,160,182,185,173,164,158,162,156,148,148,140,131,134,139,139,131,135,133,132,138,145,150,152,152,149,153,152,143,144,147,143,145,141,147,145,142,143,136,141,144,147,148,146,141,140,139,150,154,150,150,147,139,146,151,150,150,148,153,155,154,158,161,160,156,150,156,159,154,154,160,159,152,146,141,144,147,155,150,154,152,150,153,155,153,148,155,155,160,155,149,160,160,163,159,150,148,154,155,155,155,143,141,142,139,128,131,141,141,136,133,140,142,136,129,137,141,132,130,132,138,139,150,158,151,146,138,132,127,137,133,123,137,148,137,104,60,50,73,105,92,61,87,109,123,108,94,92,65,42,43,49,61,82,99,103,100,99,93,91,90,84,86,71,50,40,33,36,39,41,47,48,56,55,56,55,54,59,59,66,66,66,62,52,59,56,52,58,54,54,57,57,57,51,54,57,54,53,52,53,55,54,57,56,49,54,56,49,55,54,50,51,51,56,51,48,51,54,45,48,44,45,51,51,53,45,43,44,44,44,49,49,44,51,45,34,41,44,36,42,42,39,39,41,46,38,43,36,41,40,35,42,39,38,37,38,37,36,39,35,36,34,37,37,33,37,34,35,41,34,28,37,33,35,38,30,27,33,34,34,29,34,35,27,34,32,27,34,35,27,30,32,31,34,31,24,27,25,28,32,24,28,25,30,29,27,29,32,27,27,31,26,24,23,23,30,26,22,24,29,27,26,37,35,24,30,29,26,22,28,40,84,137,139,116,75,48,33,25,32,23,31,29,28,34,30,27,24,27,30,29,28,27,23,29,28,29,29,28,28,22,25,22,21,31,26,29,60,91,78,37,18,31,24,19,27,22,26,25,26,24,34,37,29,35,29,27,31,28,36,41,32,36,34,35,33,34,33,33,37,32,30,34,35,30,34,31,21,22,20,23,19,20,23,21,21,16,27,23,15,21,21,23,21,22,24,77,70,22,27,18,18,22,27,21,19,27,22,24,20,24,24,24,21,18,22,21,27,27,21,21,26,23,27,24,22,26,24,27,58,108,108,76,50,27,32,32,63,95,124,173,193,190,181,172,170,174,172,171,162,153,164,155,120,98,92,118,145,163,178,166,151,146,149,161,161,155,159,163,162,159,155,157,156,153,152,152,151,152,154,148,151,144,129,110,87,87,75,73,73,84,122,139,140,121,97,80,76,81,82,102,122,125,128,132,136,139,155,182,201,204,204,222,237,227,228,231,245,250,243,186,147,156,159,174,189,220,236,222,219,223,241,243,233,216,194,111,7,1,8,10,13,12,14,12,14,15,14,14,98,95,66,48,40,45,55,92,127,169,185,199,141,47,51,77,134,178,193,196,186,179,173,163,147,136,123,110,103,89,76,71,63,54,46,42,47,22,19,41,41,66,97,118,141,159,178,189,202,213,219,222,222,230,218,219,213,205,110,27,39,79,139,146,129,111,94,84,84,93,98,102,114,115,111,63,19,24,31,47,69,120,184,172,145,135,132,90,21,16,16,19,24,29,25,22,23,23,24,21,21,25,25,19,24,36,35,29,37,48,66,117,51,44,56,98,134,168,211,196,84,21,14,24,44,40,46,50,41,36,37,39,38,67,110,126,114,116,153,170,174,176,173,156,147,146,142,142,139,134,133,131,128,126,122,134,139,138,135,134,140,139,141,150,151,148,145,142,148,138,139,144,142,142,141,145,139,135,137,137,139,142,141,138,143,145,140,136,137,141,134,139,143,144,146,145,147,147,150,151,148,139,141,148,146,141,135,143,154,155,153,146,155,156,149,153,158,161,152,149,150,146,146,147,148,143,145,147,155,161,157,152,155,152,153,146,141,146,141,142,150,149,146,144,142,139,145,146,142,142,146,145,151,152,148,150,150,145,136,137,146,145,150,158,160,147,132,141,126,129,134,131,124,145,160,145,128,92,59,31,61,36,63,76,82,98,94,105,145,175,122,34,6,10,24,46,55,62,65,60,63,50,43,60,42,45,45,49,46,51,51,55,61,60,57,56,56,53,61,63,61,61,62,57,54,60,60,60,60,55,57,53,51,59,52,57,56,55,60,55,57,55,54,48,54,57,50,53,53,53,50,49,59,56,53,52,52,50,50,51,49,48,46,46,46,50,46,42,45,44,47,49,48,44,43,43,38,42,43,40,42,43,45,41,39,41,39,37,39,38,35,38,37,36,42,39,33,35,37,36,35,37,32,36,34,32,38,37,36,35,30,32,31,38,35,29,32,35,35,32,34,31,34,31,25,36,27,30,31,28,33,32,33,28,30,31,24,28,30,30,26,24,27,28,29,28,27,30,25,25,30,23,24,26,24,27,23,25,24,27,25,32,35,29,33,35,31,24,24,23,27,29,27,40,86,138,138,110,81,52,29,26,23,21,29,28,24,29,29,28,27,30,32,24,24,32,26,27,30,27,29,21,20,27,27,28,31,29,40,76,88,67,31,21,23,21,22,21,23,22,24,24,32,38,34,34,32,29,27,26,39,34,35,39,36,39,27,29,35,34,35,28,33,29,29,35,35,25,26,32,16,21,23,23,18,20,20,18,23,20,19,22,25,19,27,19,41,78,42,21,29,19,15,27,19,22,25,18,24,24,23,20,22,21,21,25,26,22,25,24,20,23,26,22,25,25,21,26,44,92,113,88,58,35,28,34,32,60,102,119,156,194,191,184,168,157,155,167,162,160,153,149,168,166,143,112,94,90,106,137,170,169,160,159,166,169,168,160,158,168,166,169,165,160,161,159,160,155,156,157,157,163,169,165,164,145,129,116,99,100,100,116,139,150,153,153,132,113,115,116,125,136,134,129,122,129,128,115,123,129,141,141,133,175,171,170,177,188,210,206,177,107,74,122,139,148,174,224,251,249,245,244,248,248,249,249,230,112,4,1,6,11,13,12,12,12,14,15,14,14,173,145,92,64,43,37,48,62,86,138,165,180,124,45,37,56,108,139,142,131,116,106,96,81,74,66,55,54,57,61,66,83,90,108,122,128,147,76,17,47,72,119,146,177,204,211,218,208,210,208,203,192,182,167,157,154,136,117,54,19,35,47,87,89,85,84,78,80,86,76,72,60,47,48,39,32,25,29,29,33,43,87,150,144,127,127,129,90,35,13,14,21,26,28,22,26,17,24,26,22,24,24,25,23,27,34,31,31,52,81,124,128,127,134,131,139,155,159,184,184,94,46,34,31,42,33,32,29,24,25,22,29,30,76,153,166,170,173,187,191,181,173,152,144,147,153,157,153,145,141,141,147,152,147,146,150,153,149,147,146,143,144,143,141,143,141,141,141,146,143,137,139,146,144,139,145,141,138,139,141,142,142,144,141,143,147,143,137,137,134,133,139,148,152,153,151,148,146,147,149,147,142,134,134,142,142,141,141,139,142,141,142,144,140,143,150,156,155,154,148,144,145,145,143,147,147,145,149,145,146,139,136,136,127,136,135,140,142,141,137,140,153,153,160,149,146,153,151,153,152,150,149,153,159,160,162,166,159,156,152,152,148,141,141,145,145,141,149,156,152,156,143,139,150,151,143,109,81,59,49,45,44,56,71,77,70,68,113,221,252,245,188,131,75,25,4,11,15,24,27,39,43,44,48,50,53,56,59,63,61,64,63,65,59,50,59,53,51,57,62,59,56,65,61,59,56,56,55,56,56,55,55,52,53,51,56,54,57,57,57,55,50,54,53,48,54,53,50,53,51,55,48,51,54,49,53,52,50,47,50,51,47,46,43,44,49,49,49,40,44,47,45,46,44,44,43,41,42,44,43,42,41,40,43,42,37,39,41,37,35,40,40,37,41,37,35,33,38,36,34,39,36,36,37,38,37,33,32,36,33,33,36,33,29,38,30,33,35,27,33,30,27,35,35,29,31,27,33,37,27,31,29,28,27,29,30,30,26,29,29,28,31,32,27,21,29,25,26,29,29,26,27,24,24,26,27,25,22,24,29,24,28,30,26,31,23,27,27,23,29,27,25,22,27,26,44,86,134,143,129,101,59,42,29,21,24,24,25,23,31,31,29,27,23,28,24,29,33,27,28,26,26,24,22,29,33,27,26,29,38,79,83,49,29,19,20,27,22,26,21,18,27,33,39,36,32,32,27,27,28,32,37,37,31,34,42,31,29,37,37,29,31,30,31,33,33,34,24,22,22,21,20,19,24,21,23,20,20,23,22,24,25,21,19,26,24,59,64,26,23,24,16,22,24,21,23,25,24,20,27,26,24,19,24,24,22,23,19,21,23,24,27,24,21,25,23,32,83,115,95,66,41,28,27,26,38,72,91,112,141,184,193,178,171,160,154,153,158,160,161,159,149,162,174,165,155,129,115,95,113,160,179,189,185,178,177,169,164,170,173,175,176,174,172,166,167,161,162,165,164,167,165,173,169,171,162,146,132,113,109,114,125,139,146,153,155,150,145,148,155,162,173,165,150,148,159,160,148,139,137,141,133,127,139,146,139,132,133,143,139,125,72,51,92,96,104,137,202,245,248,252,239,248,248,252,252,239,110,3,0,6,11,13,12,12,12,14,15,14,14,157,122,71,54,42,46,47,63,73,102,129,142,76,26,45,45,80,85,83,69,56,62,65,71,84,100,106,121,137,147,160,170,176,180,188,193,187,98,46,59,94,147,171,191,184,168,158,142,132,126,132,107,88,107,73,63,56,39,28,33,30,51,72,69,69,46,42,42,33,28,25,21,24,22,24,29,30,36,28,39,41,65,120,114,114,123,125,89,25,15,14,17,28,28,23,25,27,28,25,22,19,21,24,25,25,35,37,36,43,98,150,160,155,151,152,137,110,79,62,46,42,31,17,21,33,36,33,37,34,32,29,31,32,55,132,170,170,170,166,156,156,159,155,155,145,150,155,151,153,154,153,152,155,152,150,154,154,157,154,147,149,146,143,145,138,140,144,141,146,146,147,145,139,134,141,143,143,141,143,143,139,148,146,149,150,144,151,145,145,148,150,152,152,156,151,154,157,155,158,158,155,150,143,147,147,148,149,133,132,135,140,128,121,129,125,122,122,134,138,134,136,131,134,136,134,137,141,125,120,129,133,130,129,134,146,146,143,151,152,146,145,146,145,149,147,146,145,146,142,136,136,134,139,141,136,132,124,123,124,118,115,107,100,97,105,110,119,127,126,129,124,122,111,105,106,93,74,45,47,50,52,32,30,51,49,53,45,63,173,250,252,252,251,248,175,95,24,4,10,17,28,44,55,61,59,66,75,74,72,76,67,57,60,57,53,54,50,50,53,57,63,59,55,61,57,54,53,53,55,56,57,53,59,54,52,59,57,52,57,50,53,52,53,51,47,54,57,56,49,49,53,55,52,49,50,50,53,50,48,51,48,44,47,46,45,44,47,51,47,41,44,45,46,49,44,44,43,44,38,42,39,36,40,38,41,35,35,37,36,38,38,39,34,34,33,39,36,34,42,37,42,36,39,41,29,33,35,34,37,33,31,35,33,30,33,34,30,30,29,33,31,29,31,31,30,31,35,35,27,29,29,28,27,27,33,28,24,30,29,29,29,23,25,25,28,29,22,26,26,27,31,27,29,27,22,22,28,29,21,24,28,22,28,27,27,24,30,24,24,33,35,27,24,27,29,37,36,80,85,142,152,147,111,90,47,27,25,29,18,24,27,24,30,26,29,25,26,31,29,24,25,25,21,26,32,29,28,25,25,24,51,89,75,44,27,19,23,23,24,22,19,25,35,49,38,32,36,27,24,30,41,33,36,30,35,39,28,29,29,36,34,29,33,32,35,34,33,31,24,19,21,28,19,23,22,18,21,19,19,20,20,22,19,21,19,39,69,40,22,27,20,18,18,23,22,21,24,24,23,22,21,25,28,21,19,21,20,24,25,23,24,21,23,25,34,66,115,113,78,52,33,25,28,25,39,80,99,104,120,161,186,181,167,165,159,156,161,169,157,155,150,145,161,172,168,181,177,152,114,108,128,158,180,187,176,165,160,160,164,171,171,176,169,168,173,171,171,172,177,175,169,171,173,174,177,170,153,137,130,123,125,139,142,151,160,167,160,157,159,151,159,174,163,152,146,150,152,148,145,134,141,137,129,128,136,131,129,132,145,143,138,104,91,85,51,49,72,134,184,206,206,206,236,244,251,251,236,111,3,0,6,10,12,12,12,12,14,14,15,14,121,83,52,51,44,37,46,53,61,76,73,66,40,41,60,66,94,110,120,120,124,133,148,158,172,179,178,190,191,191,192,193,191,188,185,179,166,78,34,50,78,132,147,145,117,95,84,74,73,69,72,57,60,61,53,60,54,40,23,23,21,23,33,27,28,23,21,26,25,19,22,27,20,20,22,28,36,44,47,53,47,80,129,125,126,128,119,68,24,17,12,20,20,24,29,27,23,24,22,24,19,23,22,21,28,33,39,31,39,49,57,62,57,57,46,41,35,31,27,22,25,28,26,29,45,64,78,87,82,81,86,94,99,124,154,157,163,162,160,152,148,151,149,145,134,128,134,132,131,137,137,134,136,136,142,142,146,151,150,147,147,149,152,152,155,147,148,147,145,147,143,143,141,139,145,147,140,148,152,152,147,145,147,152,155,147,148,145,147,152,147,139,137,141,134,137,145,141,147,146,133,131,139,139,137,140,136,137,141,141,136,136,141,139,131,125,119,125,138,142,142,141,141,140,140,135,131,130,132,132,123,129,145,151,156,150,148,146,143,135,134,132,131,134,123,122,122,128,137,129,125,122,111,111,111,95,100,110,112,114,122,127,120,117,114,112,108,113,120,104,105,98,96,94,76,77,75,65,59,60,61,63,53,38,41,41,39,27,93,181,203,251,252,252,252,241,167,90,27,11,21,33,50,62,72,81,84,77,78,72,64,54,48,53,54,52,53,48,53,60,64,66,56,55,56,55,57,52,57,57,55,59,59,55,54,56,55,55,51,56,53,53,51,53,55,53,49,51,57,51,54,53,49,50,52,54,49,49,47,46,53,45,50,47,44,53,46,48,46,43,46,47,47,43,45,42,45,40,36,39,34,39,37,35,42,34,38,39,41,36,35,38,34,42,31,33,39,33,40,39,39,35,32,36,36,31,32,32,29,29,35,41,35,31,36,34,31,33,31,30,29,30,29,33,36,31,31,29,27,32,31,32,27,28,32,25,27,28,24,27,26,27,27,25,27,27,24,31,26,23,27,21,21,26,24,22,24,24,27,29,25,30,21,27,32,24,31,26,27,31,28,29,28,37,28,27,34,32,25,36,73,134,160,147,132,87,55,39,29,24,22,26,23,26,27,31,27,21,29,22,25,24,29,25,26,29,23,26,27,29,30,65,95,71,30,22,20,24,41,27,23,27,38,45,38,38,32,27,26,30,36,40,34,32,36,38,28,30,33,29,36,32,30,34,34,35,32,27,24,18,22,24,19,20,21,22,22,19,21,18,20,24,21,22,28,54,46,27,21,24,23,18,22,18,19,24,25,21,24,24,21,28,23,22,24,22,27,31,29,20,22,20,31,68,109,118,84,55,36,24,32,24,23,43,87,108,106,115,154,190,184,171,171,163,157,164,169,165,156,153,158,162,168,171,170,173,173,169,143,112,98,109,145,167,166,171,162,159,169,168,165,168,166,170,170,174,177,178,181,177,176,171,171,169,177,176,165,153,150,153,152,152,158,162,160,166,157,151,151,141,151,169,175,163,142,136,136,144,143,139,135,134,130,121,118,119,128,145,162,165,169,155,150,134,83,61,47,74,106,125,130,136,183,201,210,218,215,116,5,1,7,10,13,13,13,12,14,15,15,15,47,31,29,42,32,36,37,46,58,63,81,84,52,50,65,94,155,174,193,186,188,193,191,191,190,190,184,183,179,178,174,172,164,149,138,132,100,36,31,39,64,97,100,105,94,94,100,100,92,87,53,66,51,38,35,33,28,27,27,25,15,25,24,18,29,23,24,22,19,27,23,27,21,19,23,30,38,46,78,98,116,154,181,155,145,141,131,73,14,16,15,18,19,25,28,20,24,30,25,27,23,26,25,28,37,46,43,42,40,34,31,29,34,23,22,25,27,30,27,29,34,45,55,72,122,164,194,207,194,191,199,198,192,184,174,160,150,148,142,142,146,142,139,131,127,126,128,134,129,134,131,123,125,122,127,133,132,132,136,134,140,139,133,142,148,149,146,145,143,134,135,142,147,153,153,149,150,151,152,147,136,144,140,139,142,139,139,139,140,137,131,127,127,130,121,118,122,119,131,129,121,122,123,129,128,133,139,139,143,141,138,141,144,141,130,125,120,122,129,133,148,137,128,128,131,139,144,141,137,130,126,130,141,137,129,133,128,118,124,131,141,137,131,126,117,113,103,112,121,120,130,125,120,127,132,139,139,139,141,146,158,168,167,161,150,128,119,116,102,90,90,95,96,88,93,97,75,80,78,77,84,80,81,70,43,37,39,34,79,136,134,179,249,253,252,252,252,235,154,83,22,5,24,31,55,69,71,71,72,64,60,47,47,54,49,49,49,50,54,60,63,57,55,61,53,54,61,53,59,57,57,56,57,59,49,51,50,54,56,53,57,54,55,50,48,52,50,52,49,48,51,50,52,53,52,50,53,49,42,49,47,46,50,51,51,48,45,44,46,42,42,42,45,44,44,42,40,43,35,38,38,37,37,37,38,39,40,39,39,40,37,35,36,36,36,35,35,37,39,31,35,33,35,39,32,33,33,34,34,36,36,29,35,33,33,37,29,30,29,30,32,33,35,26,29,32,24,29,32,31,31,27,28,30,29,26,25,25,27,31,28,19,29,25,27,30,24,31,24,22,27,21,21,24,22,24,25,21,26,28,27,25,27,27,25,29,22,24,28,29,22,36,41,33,37,31,29,28,22,25,27,37,56,99,151,156,145,127,84,55,68,33,25,22,27,25,24,23,27,26,27,29,30,27,25,29,30,33,31,26,29,35,71,84,48,32,16,35,51,22,20,25,38,45,41,42,34,21,27,34,34,40,31,34,38,29,30,33,29,35,28,30,33,28,40,37,34,26,18,16,22,20,19,27,24,21,21,20,19,24,24,23,24,22,52,58,29,19,23,24,21,21,22,21,22,22,23,23,21,22,19,19,22,27,24,23,24,23,24,24,39,66,116,124,89,60,37,30,29,26,21,25,49,84,113,110,104,134,185,194,177,174,175,163,163,168,168,166,157,168,175,172,174,172,165,165,164,170,162,140,116,93,103,132,163,179,179,177,173,171,167,167,170,171,167,168,166,166,170,171,173,169,165,168,174,175,171,158,155,151,154,147,156,158,155,159,157,150,144,142,156,177,183,171,153,149,153,159,157,146,149,150,149,129,121,125,132,145,163,169,175,178,186,178,152,139,115,105,100,85,63,65,111,133,160,171,186,119,8,2,8,11,12,10,15,13,13,14,14,14,17,34,40,43,36,42,38,44,58,77,124,141,78,48,72,116,165,191,203,194,194,190,183,175,168,164,152,144,136,128,122,113,107,92,79,66,39,25,39,36,68,107,117,118,113,106,85,61,39,32,33,25,22,19,18,21,26,23,29,27,23,27,21,23,25,26,22,20,24,24,27,24,18,30,28,27,31,57,116,160,182,196,194,154,140,142,139,84,24,11,12,16,26,31,27,30,31,32,40,36,36,42,37,36,41,37,35,27,24,25,24,22,26,24,23,26,36,41,46,59,77,124,161,190,210,211,206,198,183,182,181,181,178,164,162,157,149,149,147,146,137,130,132,130,135,136,141,139,142,139,134,141,130,122,124,120,118,117,129,138,136,125,117,124,132,130,133,137,142,146,146,145,136,134,134,145,141,133,127,117,128,130,127,124,125,131,134,130,130,129,127,139,140,143,134,122,127,131,132,129,129,125,132,128,119,125,132,131,131,126,111,117,130,137,132,130,130,113,107,120,129,125,121,136,142,141,148,142,141,136,128,131,137,130,111,109,116,125,139,137,137,126,122,118,114,129,116,100,97,113,127,129,118,131,141,132,142,126,121,137,151,148,134,142,133,115,100,83,84,79,85,81,83,97,95,89,74,75,81,77,83,90,74,70,66,48,45,36,92,122,82,98,129,188,252,252,252,252,252,240,159,85,23,6,12,27,38,45,60,66,66,57,60,59,49,46,50,51,52,56,60,61,60,61,60,57,59,57,57,55,55,58,49,55,59,52,53,55,53,55,57,53,53,49,53,48,52,50,48,50,52,51,47,54,50,52,53,49,46,43,46,49,49,40,47,46,43,49,43,44,45,48,41,41,44,46,40,39,42,34,37,40,38,37,37,38,40,37,38,37,35,35,36,36,37,40,35,34,38,33,32,34,35,34,34,34,36,35,29,38,33,30,33,30,33,29,35,35,33,29,29,31,31,33,29,29,32,29,30,34,32,35,28,27,28,25,29,25,26,25,26,25,32,27,28,30,21,26,22,32,26,26,23,23,24,27,22,22,23,24,27,24,24,26,27,29,26,23,29,27,33,43,38,38,46,32,21,25,29,31,23,23,28,32,45,78,127,154,161,153,128,96,62,46,29,19,23,21,22,24,31,27,24,31,29,28,30,32,27,26,26,26,42,76,81,48,23,31,29,18,24,28,38,46,39,37,42,33,31,44,39,34,28,36,38,29,33,27,36,33,31,30,27,37,35,33,38,25,17,21,21,17,24,23,18,23,19,21,24,21,20,24,21,34,59,37,23,23,22,22,16,23,23,18,24,23,18,23,23,24,22,23,28,25,21,21,22,24,48,83,119,115,89,58,34,32,29,27,23,22,37,64,98,118,115,103,119,168,190,174,162,169,170,165,165,167,167,163,165,167,170,170,164,164,165,162,162,165,165,160,149,113,92,101,134,170,182,181,177,168,165,175,176,169,165,157,151,151,157,166,164,160,158,162,174,173,165,150,139,139,137,142,147,148,153,165,166,155,152,152,164,173,178,173,162,170,169,162,155,155,160,168,167,148,146,147,142,154,169,170,179,188,199,200,193,191,175,164,142,113,64,28,51,72,104,130,152,111,15,2,10,12,13,11,14,12,13,15,15,15,49,49,45,48,45,45,45,46,51,63,107,117,59,46,61,94,148,167,174,160,152,137,125,113,110,92,92,87,69,73,70,66,72,74,74,73,46,30,26,28,41,51,53,44,40,36,24,27,19,20,22,16,22,22,22,22,22,24,29,26,24,26,26,24,22,23,24,22,23,22,21,27,24,21,24,24,27,46,85,118,130,140,142,118,125,134,135,86,23,16,17,28,32,39,39,39,43,41,39,35,34,29,29,21,28,30,29,31,19,23,25,19,28,34,36,54,71,108,122,137,184,203,201,202,184,169,163,161,159,156,158,153,152,149,149,152,149,150,149,146,146,138,137,137,141,139,137,146,137,127,118,127,139,136,131,125,125,122,131,140,142,134,125,133,133,131,129,133,147,146,135,125,109,111,122,120,111,98,103,118,120,121,123,123,110,130,127,135,143,146,143,142,148,145,135,132,135,125,136,125,120,125,128,124,116,124,123,128,117,105,107,120,128,132,126,122,112,94,101,108,119,117,122,115,119,132,135,136,137,120,112,120,124,131,121,123,130,129,136,127,115,101,95,109,124,127,118,113,99,91,95,89,87,92,99,90,77,82,88,104,99,85,98,111,116,118,118,102,94,91,89,97,96,90,92,89,63,76,86,81,70,82,64,67,70,64,60,42,54,94,94,92,86,106,150,205,253,253,252,252,252,242,142,81,17,2,9,13,34,48,59,58,60,59,50,49,53,54,50,57,64,59,59,66,61,58,56,57,57,52,57,53,54,52,52,59,55,57,52,54,54,50,54,54,49,53,50,51,56,50,49,52,49,50,55,53,49,50,50,49,48,45,49,46,43,46,48,44,41,49,46,41,44,37,42,42,39,39,40,41,37,41,36,33,37,43,39,33,39,39,35,35,36,38,34,35,38,36,33,35,35,34,34,34,42,28,32,38,31,37,29,24,36,30,30,37,29,33,34,30,29,30,33,27,36,29,28,34,27,33,28,25,30,23,28,29,24,24,29,27,21,31,27,25,27,21,27,24,24,26,23,23,27,23,23,25,22,22,23,18,22,24,24,28,23,24,24,23,26,30,35,36,33,35,34,23,29,30,29,29,27,30,26,28,32,32,34,48,76,130,100,162,154,138,104,67,48,35,27,29,23,22,26,26,31,35,28,23,29,30,24,29,27,49,81,73,45,23,20,23,29,29,38,43,38,39,37,39,46,39,40,41,38,38,34,33,35,36,36,34,29,31,29,34,36,32,37,25,21,16,20,23,20,23,19,21,24,19,19,21,21,23,27,53,49,24,21,23,19,21,21,17,21,23,22,19,24,21,22,24,23,23,24,30,21,37,67,97,123,120,89,50,41,27,28,29,18,24,25,41,79,108,124,115,105,113,155,187,172,160,161,167,167,166,172,167,162,160,160,165,164,160,160,157,162,168,162,163,167,168,168,150,130,105,102,127,156,175,180,174,166,168,163,160,158,158,151,146,156,166,163,160,157,165,176,175,165,150,146,141,143,143,146,151,156,168,169,162,153,156,163,169,167,158,154,163,163,158,153,148,154,165,170,159,155,149,151,158,170,176,182,191,202,208,204,205,197,192,185,162,114,70,56,42,65,78,113,105,17,4,11,10,15,12,14,13,13,15,14,15,63,63,41,50,48,42,51,49,52,52,72,75,47,42,49,69,105,120,113,97,81,76,75,78,75,78,83,84,89,96,88,80,78,65,50,44,34,26,25,19,23,23,25,26,22,24,22,24,24,20,19,19,23,23,24,21,21,23,24,29,24,26,25,20,26,23,21,23,20,24,22,22,21,22,20,21,27,40,64,71,87,106,122,127,142,146,139,87,33,29,24,37,45,36,40,34,30,31,27,24,24,25,24,17,27,30,28,28,29,36,40,41,53,63,88,134,170,183,195,201,198,194,174,156,158,155,149,153,151,152,159,162,158,156,155,148,143,139,139,143,141,143,146,140,138,127,130,128,135,130,113,116,128,147,141,136,132,123,132,135,131,127,120,124,131,123,115,120,110,95,94,87,95,110,112,111,94,94,105,110,108,103,100,94,91,83,93,105,116,117,101,103,105,112,100,98,107,112,121,113,117,113,119,131,131,128,130,127,112,110,114,124,125,121,120,113,114,108,104,110,106,102,92,93,109,112,106,112,119,107,105,108,112,121,115,117,117,114,124,116,116,110,114,127,128,116,97,98,103,97,84,81,82,101,92,74,99,102,98,98,104,120,128,138,130,123,128,125,111,105,125,120,107,96,91,94,96,88,80,76,74,80,70,67,69,66,68,53,52,67,93,143,170,162,136,142,171,219,253,253,252,252,252,248,181,110,44,7,6,11,19,26,46,52,53,55,55,56,57,61,67,61,55,57,56,57,57,59,60,54,50,59,53,54,57,53,51,51,55,49,53,53,51,53,53,53,50,54,47,50,52,48,46,52,52,50,49,50,49,44,47,45,47,42,46,43,46,50,45,45,42,39,43,41,42,43,38,42,38,35,38,34,36,35,35,36,35,34,39,41,36,36,35,34,34,35,33,33,34,35,36,35,35,38,34,30,36,33,31,32,31,30,32,33,29,35,33,27,35,29,32,27,28,33,28,33,33,28,31,29,27,29,25,29,28,28,28,27,28,29,29,24,24,27,27,25,23,31,22,24,24,23,29,22,20,21,23,27,25,19,24,27,21,26,27,23,25,47,42,22,29,26,32,31,28,35,30,26,33,36,31,29,24,28,31,25,26,29,30,36,43,71,115,155,162,159,148,134,108,68,52,43,36,29,29,29,28,22,27,26,26,27,27,23,46,78,69,46,22,18,24,26,33,34,36,36,37,33,36,48,50,51,42,39,39,35,34,30,36,34,29,34,34,31,31,34,39,26,20,20,21,19,19,23,19,22,21,23,24,23,22,23,53,53,31,21,21,20,19,20,24,18,20,27,20,21,24,19,22,23,20,31,39,61,89,131,146,107,71,47,35,31,27,26,23,21,24,30,53,80,116,111,100,104,108,151,185,186,171,159,161,165,163,163,170,166,156,152,159,161,158,165,162,163,171,163,160,165,169,173,179,179,169,139,105,95,113,153,183,182,171,168,160,156,159,163,162,162,163,169,172,169,169,174,179,177,170,160,150,151,148,150,153,154,161,162,166,163,158,160,167,170,168,167,162,162,155,151,148,143,143,155,168,157,148,151,151,153,163,165,166,176,189,198,200,200,200,195,188,189,166,136,110,71,59,49,76,87,21,4,12,10,15,13,13,14,14,14,14,14,50,50,48,49,51,56,56,56,58,56,59,49,37,43,46,66,86,92,96,97,100,100,120,106,106,102,97,58,57,48,34,27,32,27,24,20,27,27,21,20,22,26,21,22,27,27,23,21,25,24,18,26,20,21,25,18,20,24,26,26,24,28,24,21,25,26,24,24,21,22,23,21,23,17,22,22,31,50,83,106,118,133,137,136,130,122,95,63,40,28,23,28,33,31,27,22,27,27,24,26,19,21,24,20,27,30,29,34,41,69,109,141,160,168,182,203,212,198,191,184,180,179,165,162,160,159,167,154,148,154,155,159,160,156,147,141,139,142,141,135,134,139,141,133,124,109,105,118,136,137,128,130,137,144,140,140,130,120,131,117,128,124,103,114,119,128,128,123,108,98,103,120,127,131,127,132,130,123,127,107,125,105,122,115,112,77,98,103,83,83,80,82,99,94,91,113,95,115,105,119,136,126,122,126,128,110,103,101,92,114,103,115,114,108,112,116,134,110,99,92,79,86,94,98,99,92,89,97,108,100,102,106,111,118,104,105,113,101,88,99,114,114,116,116,111,81,71,86,106,86,91,94,101,100,107,126,130,126,120,130,146,148,150,137,112,103,98,103,110,111,122,119,104,96,107,105,99,89,91,105,103,91,74,66,65,65,64,57,64,63,66,109,168,123,110,110,125,136,167,177,253,253,252,252,252,252,202,133,69,15,4,9,14,35,49,56,53,56,57,62,63,57,61,57,57,57,56,55,56,55,56,57,57,56,53,52,53,53,52,51,54,56,54,53,53,50,48,52,52,47,45,49,51,54,54,45,47,49,47,45,45,45,41,43,46,46,45,41,43,38,40,42,43,43,37,39,38,38,39,36,35,38,39,36,36,36,34,35,34,35,39,35,34,32,33,33,33,37,35,35,36,35,33,29,37,32,33,29,27,39,29,32,33,29,34,33,30,33,29,29,33,28,30,33,29,28,28,29,29,31,27,27,31,25,27,29,24,27,28,29,26,23,23,24,30,23,29,26,22,25,26,24,20,23,23,22,22,21,20,25,21,23,21,22,27,21,35,49,34,25,23,21,27,24,33,30,32,33,29,34,26,24,30,26,30,29,28,25,23,37,29,27,34,41,55,99,100,167,167,159,151,144,137,114,106,60,45,37,31,25,24,32,22,26,29,47,77,67,45,30,22,24,37,39,33,45,51,77,126,137,98,56,47,45,40,34,26,31,31,34,31,31,33,26,33,29,36,27,23,30,22,24,18,20,22,22,22,19,21,19,21,42,56,35,22,23,22,22,19,19,19,22,20,19,25,22,19,27,36,43,63,88,128,153,130,88,54,42,31,29,30,20,21,21,22,25,39,60,83,106,109,92,104,99,130,175,182,177,157,155,155,155,158,157,159,159,155,153,157,158,160,163,167,163,161,160,159,165,159,157,173,178,178,160,134,105,88,113,154,175,178,177,169,169,165,169,169,165,173,174,175,170,166,173,177,178,170,161,143,139,139,148,152,162,169,169,169,167,163,166,168,169,181,185,173,162,156,154,155,156,156,167,179,162,150,152,151,158,157,155,158,160,179,195,196,198,198,193,187,187,184,171,155,121,102,84,88,87,20,2,11,10,14,12,13,12,13,15,13,13,56,55,53,66,64,72,77,80,82,71,57,39,30,34,45,69,105,115,103,98,92,76,55,46,38,32,26,26,24,21,18,21,29,21,23,23,24,31,24,22,23,23,22,24,25,21,21,22,23,24,20,22,19,21,20,19,22,22,22,22,22,26,29,27,24,22,27,24,25,24,23,27,23,23,23,28,33,53,78,100,108,83,71,52,51,58,46,40,27,19,16,24,27,29,28,23,22,23,23,24,23,22,24,23,27,25,29,37,65,152,212,227,217,201,186,175,174,161,153,154,149,154,154,146,147,136,128,131,133,140,146,145,137,139,120,107,110,117,129,131,129,128,133,123,119,117,109,108,120,137,141,138,140,137,131,139,125,113,126,128,129,127,124,136,136,131,143,148,136,138,145,153,155,145,148,144,144,140,141,149,144,147,152,146,145,128,113,112,114,127,126,129,130,136,135,137,130,127,143,149,153,139,127,120,105,103,99,108,121,125,124,117,102,95,105,118,116,99,93,105,118,123,119,118,125,114,116,127,130,136,147,146,150,148,145,152,144,140,128,125,131,111,113,126,114,105,101,113,117,118,122,118,117,124,122,112,104,93,96,103,101,100,105,96,75,75,79,82,92,103,97,78,74,81,91,96,97,91,89,91,82,66,57,55,57,55,55,56,57,54,41,35,39,57,75,116,144,112,86,93,141,204,253,253,252,252,252,252,239,165,96,34,5,9,22,36,51,51,53,59,61,63,64,57,61,61,55,59,60,56,56,55,50,59,59,48,56,54,52,55,54,53,53,53,50,51,48,53,49,53,52,50,50,47,48,49,49,45,46,45,43,45,45,45,47,40,39,44,42,44,43,41,40,42,43,41,38,35,36,40,38,33,38,36,39,38,34,35,28,38,34,34,36,34,33,37,38,35,37,31,29,38,34,32,30,30,35,31,34,33,33,33,32,32,34,34,27,32,29,32,33,29,33,27,28,29,29,26,28,28,21,31,27,25,31,26,23,26,27,28,25,26,27,20,34,28,23,26,24,24,19,26,26,19,25,24,24,25,24,22,19,19,23,25,22,17,27,27,22,29,27,24,21,26,30,27,25,30,25,23,26,27,26,32,38,30,27,25,31,31,27,28,27,30,27,31,31,42,62,86,125,153,170,166,173,165,156,147,141,132,117,93,71,66,59,60,92,125,117,94,65,92,124,135,155,167,196,209,212,198,133,64,53,51,47,37,23,30,35,30,28,32,29,29,33,33,28,31,44,41,39,36,27,28,21,16,21,19,22,21,40,51,38,23,16,22,20,16,24,16,21,24,22,35,39,52,66,82,122,137,149,129,84,66,43,34,30,27,29,21,19,21,21,27,36,50,70,85,94,98,96,97,100,117,157,177,170,159,157,157,159,157,157,156,162,167,160,163,165,164,164,167,165,158,164,160,161,159,152,154,161,167,169,169,171,147,107,92,111,137,163,182,178,176,174,174,170,166,169,170,169,170,165,169,177,179,171,162,155,155,154,159,169,174,177,177,176,171,168,165,165,175,184,184,170,158,157,162,168,163,165,178,178,162,150,154,154,158,165,159,151,156,181,198,205,207,203,191,178,184,182,181,168,155,152,132,136,99,15,1,9,9,13,12,13,12,13,14,14,14,73,89,95,109,118,123,124,87,123,103,61,30,22,25,27,35,37,36,36,27,30,21,24,23,18,21,21,21,19,22,21,20,22,21,22,24,24,27,26,28,22,24,24,23,27,26,28,22,20,24,21,26,22,20,21,19,24,18,21,27,26,26,23,28,24,27,28,26,31,29,29,33,33,33,34,37,39,40,39,41,39,32,24,31,30,37,40,30,22,17,22,23,30,32,24,25,25,23,21,27,25,19,21,23,22,19,21,20,51,121,164,174,160,134,138,109,112,107,110,114,120,125,125,120,110,120,115,103,109,117,129,132,131,137,134,118,125,117,131,128,137,136,124,122,131,134,128,107,101,115,116,119,110,114,118,125,106,96,111,129,129,134,123,117,107,94,86,94,105,115,110,113,113,99,101,87,85,80,91,107,122,100,117,108,105,105,97,97,120,122,125,122,120,128,120,127,127,101,128,125,93,91,98,96,87,87,124,114,118,118,131,112,88,84,86,104,95,105,128,141,143,136,116,112,117,99,118,118,123,128,128,121,121,134,124,121,136,137,128,135,130,119,95,114,126,115,117,132,132,133,122,117,133,121,113,110,106,107,97,95,84,77,83,84,68,91,89,84,96,89,87,93,98,105,88,94,86,75,72,74,69,59,53,61,61,62,63,57,67,60,46,48,30,46,68,105,145,171,122,130,101,123,159,202,245,253,252,252,252,252,250,187,90,35,6,10,29,34,45,60,60,59,61,61,60,60,56,54,57,53,53,51,56,60,54,55,54,57,54,54,53,51,51,53,53,45,47,52,51,51,54,49,42,49,46,48,49,45,50,44,48,42,43,45,39,43,40,43,45,44,40,40,42,40,40,37,32,39,35,33,35,37,39,36,35,33,32,35,32,34,33,33,39,38,34,35,32,29,33,30,36,28,30,31,31,35,32,32,32,30,27,36,31,31,33,27,30,31,27,33,30,21,32,30,26,28,26,27,28,27,29,25,27,26,27,24,27,30,27,27,27,27,26,22,28,27,23,18,22,21,24,28,23,27,23,23,29,19,20,26,20,25,22,21,24,21,24,24,25,24,23,26,27,24,22,26,26,25,23,31,24,27,27,31,34,22,33,32,27,38,30,24,29,27,26,32,23,26,31,31,34,42,52,60,81,116,137,152,169,172,177,174,173,178,167,158,155,177,199,183,184,201,198,202,193,191,172,153,136,101,51,31,35,35,42,35,32,29,31,34,29,29,28,30,30,31,32,34,41,48,51,55,49,48,40,40,41,32,34,53,65,41,31,53,52,34,35,39,45,75,55,68,88,88,142,146,146,137,100,68,53,75,34,33,27,29,22,19,24,22,21,22,30,46,56,76,86,92,96,97,99,100,113,145,177,177,165,167,168,169,169,168,164,165,173,170,165,162,165,164,162,162,162,160,160,163,163,159,155,159,162,160,164,172,183,177,147,118,93,97,136,171,184,184,175,174,171,171,173,165,169,167,167,169,169,171,164,162,170,168,165,169,171,181,181,177,177,175,170,168,168,170,176,171,162,159,161,167,169,172,170,170,171,160,155,155,152,160,162,160,162,173,197,214,217,210,203,190,178,182,183,174,169,163,165,159,157,107,13,1,9,9,12,10,13,13,14,14,12,13,123,137,134,134,122,113,96,73,47,33,33,22,21,22,18,23,21,24,21,22,23,19,23,22,23,23,26,22,24,21,18,21,22,24,20,21,21,22,27,29,24,25,21,24,27,21,24,23,24,22,22,22,19,22,20,19,22,24,21,28,30,27,27,31,34,35,34,35,41,41,40,35,39,35,29,32,27,28,24,27,29,23,23,24,22,22,21,28,25,21,24,23,33,36,31,33,35,33,34,30,28,24,26,25,22,24,29,34,46,74,96,123,130,122,98,81,89,89,92,104,116,113,100,95,103,108,108,119,112,105,115,120,129,147,142,141,141,137,154,158,165,163,155,144,144,144,133,132,111,101,93,81,90,83,84,101,90,86,101,105,112,109,92,96,100,84,77,77,90,93,79,81,77,63,67,47,45,55,49,48,45,47,55,62,70,76,77,77,76,81,76,88,87,89,92,84,81,65,46,33,36,47,63,68,80,97,105,111,103,99,93,88,77,73,87,95,104,109,110,118,113,94,84,78,76,77,90,112,114,102,93,83,82,80,76,87,77,72,78,93,103,79,81,106,113,108,96,97,100,106,107,101,100,103,103,108,119,111,96,87,76,62,69,75,65,70,77,78,76,71,81,81,80,79,79,84,68,57,57,63,61,59,70,69,68,68,76,78,62,66,64,62,65,57,70,87,128,160,186,191,168,137,104,109,130,167,216,251,252,252,252,252,204,139,86,49,32,17,19,23,33,33,41,49,51,56,56,60,56,53,55,53,58,61,54,60,54,50,50,49,57,53,50,53,46,53,52,48,50,47,49,50,46,46,47,51,49,47,52,44,40,43,41,44,45,46,41,42,45,42,43,44,39,39,41,39,37,38,39,37,36,34,42,33,37,34,34,29,32,40,35,40,36,32,34,38,34,35,32,29,33,35,32,30,38,32,26,32,32,30,34,29,29,30,30,37,31,28,29,30,28,25,32,29,26,26,30,33,27,23,24,27,23,29,28,24,28,23,23,24,30,28,21,27,25,21,24,27,26,22,25,27,23,22,21,21,24,19,24,22,27,28,20,29,24,20,22,26,22,24,24,24,22,25,28,25,29,22,24,26,24,30,27,29,24,27,36,25,54,50,26,28,26,28,30,32,23,25,29,29,33,29,32,27,29,25,28,34,43,52,56,69,71,84,102,107,124,131,122,131,145,148,139,101,71,36,35,36,19,42,65,37,24,22,35,39,23,35,28,25,33,29,33,28,27,33,33,31,23,33,45,49,60,63,62,68,71,66,103,121,101,98,86,92,96,103,117,136,141,130,144,146,141,120,83,63,52,40,41,29,27,27,22,27,19,22,24,21,21,25,25,42,61,71,72,86,95,82,100,102,94,112,149,179,181,170,167,173,174,170,173,167,163,164,163,165,158,156,162,160,163,164,162,163,163,160,163,163,165,165,158,160,162,168,177,175,170,153,119,89,99,135,162,179,181,173,171,169,169,165,162,170,168,164,160,157,158,165,168,166,169,170,178,180,176,178,176,180,177,171,171,172,171,168,166,164,169,170,177,177,170,167,166,168,170,162,154,154,158,159,157,170,193,207,213,208,198,185,175,179,178,174,166,159,168,158,164,111,11,2,9,9,14,10,13,13,13,14,13,14,75,61,41,35,28,25,18,39,17,22,24,19,20,21,20,21,22,26,25,19,20,28,29,28,29,30,28,24,23,18,19,21,22,21,19,22,22,18,26,23,24,24,23,22,26,24,21,27,23,24,22,24,24,22,24,21,24,25,25,35,40,40,37,39,44,37,39,41,34,33,34,29,26,22,21,24,22,22,31,26,24,24,22,22,22,29,23,22,29,22,28,30,41,54,68,86,91,90,86,78,69,64,63,69,66,105,100,93,95,105,122,135,137,112,109,95,96,89,93,103,108,97,84,72,78,98,102,103,96,89,96,105,105,103,94,101,118,112,116,108,134,134,134,129,120,120,118,113,115,112,103,100,104,100,99,101,101,102,115,109,106,112,103,122,137,141,138,133,126,110,128,129,107,100,122,130,136,131,98,68,69,69,75,91,96,105,107,94,98,93,94,101,92,83,102,75,68,83,53,87,94,105,111,102,112,102,88,112,102,86,87,92,90,96,107,122,106,104,91,94,96,96,87,77,82,82,99,108,108,106,103,101,95,97,90,80,67,78,69,87,90,87,102,112,106,92,79,71,85,88,98,97,82,86,94,92,83,97,90,77,81,74,80,72,70,68,68,87,77,76,94,79,84,81,72,80,62,67,66,63,68,64,64,74,63,69,75,76,64,54,60,63,62,70,55,33,85,87,139,162,199,176,153,124,102,107,122,160,212,251,252,252,240,210,172,141,104,66,43,25,15,11,13,14,24,35,42,53,56,57,60,59,56,57,51,55,56,53,54,54,56,48,50,54,51,52,50,50,50,51,48,49,46,49,47,48,50,49,42,43,43,39,44,44,43,46,43,45,44,43,41,41,42,42,36,39,40,37,40,33,35,35,37,39,35,37,35,30,32,35,39,35,32,33,32,35,28,30,36,31,37,34,33,29,29,37,31,29,29,32,27,31,30,29,31,23,32,29,25,32,29,30,24,28,29,32,26,31,28,23,28,27,29,23,27,25,27,24,23,28,26,24,25,28,24,23,21,21,24,23,25,20,20,22,22,22,19,22,22,21,21,25,24,22,22,22,24,17,19,20,24,24,24,29,24,22,24,22,24,24,24,25,28,29,28,31,24,39,59,39,27,29,27,32,29,27,27,30,29,30,26,28,31,30,22,24,30,29,28,28,28,27,23,29,32,28,36,35,29,32,51,63,74,67,53,46,47,23,47,145,122,47,26,27,41,36,29,30,32,33,30,31,31,31,30,32,31,27,19,22,25,24,39,46,41,60,71,63,76,88,93,95,100,70,72,91,91,81,76,80,53,52,39,68,33,29,31,29,23,22,24,21,19,24,22,21,24,27,30,37,53,64,74,82,76,90,90,81,98,92,102,143,181,182,165,157,154,153,160,159,153,149,146,147,151,150,157,157,159,163,160,163,160,162,161,160,165,163,168,165,159,159,158,156,159,167,172,171,147,111,93,94,124,158,167,171,166,165,166,159,164,167,165,163,160,157,156,163,169,169,167,169,167,164,163,162,165,167,165,168,172,173,176,171,169,170,175,177,174,171,164,167,173,173,174,168,159,159,160,154,155,167,173,191,193,194,192,184,178,180,178,171,170,160,162,157,159,111,13,1,9,9,12,11,13,11,12,13,14,14,18,20,21,20,18,16,17,19,19,22,21,24,24,25,22,23,21,23,26,22,26,26,29,40,44,44,47,47,35,26,21,18,23,20,17,18,23,19,25,24,26,27,19,24,27,24,24,27,21,24,29,26,29,28,30,34,34,36,39,38,41,37,34,35,34,37,29,27,26,23,23,22,21,23,22,23,21,27,29,22,28,25,27,25,22,28,24,23,29,37,52,61,92,135,174,202,207,206,207,216,205,191,188,192,195,189,169,147,137,135,128,122,109,105,114,118,124,124,121,117,125,128,113,99,92,97,94,96,86,81,100,104,101,85,76,89,97,90,75,57,60,70,74,79,95,96,94,106,112,123,126,123,125,116,118,136,142,141,132,116,119,117,115,136,146,145,137,124,119,122,131,135,131,129,136,139,151,156,136,111,104,105,110,116,128,137,131,115,122,139,137,144,137,138,124,95,99,116,137,153,158,165,156,131,138,140,130,120,109,105,101,102,105,118,124,106,97,92,79,79,81,78,91,98,99,92,95,105,103,113,114,107,101,88,94,104,101,96,91,102,116,115,110,107,93,91,92,81,79,87,100,103,101,98,90,87,85,84,89,95,100,89,94,87,81,91,96,99,100,111,108,90,89,87,84,84,76,83,80,81,78,67,59,57,63,61,66,70,63,65,63,70,68,61,59,48,57,59,75,103,119,147,160,173,179,152,117,105,117,141,182,196,185,193,194,190,181,161,151,150,146,111,66,27,7,10,12,15,26,41,51,49,52,58,53,55,55,50,57,54,52,51,53,52,50,54,48,47,53,46,50,47,46,48,46,49,45,48,45,44,48,39,45,41,48,48,40,47,38,43,41,39,39,37,43,44,42,36,33,34,31,35,34,34,34,28,39,37,34,31,31,33,36,34,35,30,32,36,36,36,30,29,30,31,29,33,29,25,29,32,32,31,30,27,28,31,27,29,31,23,30,31,29,29,27,31,22,24,29,24,26,27,27,24,25,27,28,23,22,26,23,23,24,26,24,24,24,25,24,19,25,22,20,23,22,21,21,25,24,22,22,21,24,21,19,25,24,23,20,22,22,18,24,23,23,22,27,24,24,28,26,27,24,33,25,24,27,22,30,27,24,32,29,37,39,29,29,27,32,30,27,31,30,24,30,29,22,24,27,29,26,27,31,30,28,30,25,23,25,26,26,40,63,68,77,77,95,110,169,226,150,53,38,41,48,44,42,48,47,49,46,48,43,36,40,38,44,33,22,17,22,24,21,31,37,45,39,32,33,29,35,40,29,34,34,29,33,30,29,27,27,25,21,26,19,21,23,18,25,23,22,25,25,20,24,24,25,41,50,63,65,79,84,72,77,86,88,87,95,92,132,177,184,174,157,155,148,147,152,145,146,145,144,152,152,160,165,160,164,161,161,163,160,161,159,156,160,160,160,152,147,151,148,152,156,160,170,168,163,145,121,96,92,119,140,162,176,169,160,147,151,154,157,159,158,155,155,162,165,162,162,158,149,149,148,141,134,142,150,158,164,163,167,165,158,162,169,166,165,166,162,160,162,159,159,155,155,162,155,152,152,159,168,175,188,195,194,190,188,190,184,180,176,169,169,156,166,113,11,2,9,10,12,11,13,12,12,14,14,13,21,21,17,19,19,21,17,17,20,20,21,24,25,21,21,27,20,18,21,24,27,29,33,49,57,61,69,66,52,35,20,17,21,17,21,16,21,25,22,26,27,29,26,22,29,29,29,27,30,36,35,37,39,42,39,42,33,34,35,29,38,28,29,25,22,24,21,23,24,20,24,23,20,18,21,23,24,25,24,24,29,29,23,25,24,24,29,26,44,64,67,137,96,162,132,144,153,153,163,155,194,139,158,149,138,143,125,118,117,111,101,88,98,105,124,141,145,147,133,129,128,122,123,120,117,108,114,122,109,101,95,93,92,96,90,92,96,97,74,78,75,84,93,91,98,101,101,104,110,116,118,104,115,117,123,131,136,123,128,119,112,116,110,118,116,117,106,97,90,88,98,95,88,85,92,105,114,134,128,112,125,127,122,114,110,111,108,98,110,123,131,123,136,131,109,96,108,118,130,119,121,125,118,104,110,113,111,96,95,93,93,93,84,90,105,87,84,84,78,84,78,90,108,116,108,96,100,100,99,105,100,93,98,84,95,92,99,103,110,110,114,116,120,110,102,109,101,94,90,95,94,97,94,88,84,83,78,89,82,82,83,86,93,91,93,95,89,91,94,83,93,80,86,83,81,78,74,81,87,81,71,71,59,60,61,61,61,64,65,66,69,59,64,66,56,54,52,44,35,61,51,108,142,175,193,184,171,138,101,94,123,125,122,137,148,169,180,191,208,249,252,252,231,180,133,87,37,8,9,11,13,21,35,43,49,56,57,58,55,52,51,51,54,52,50,49,47,52,47,49,50,48,46,46,44,48,48,44,47,45,45,43,43,42,41,42,46,42,39,39,37,41,37,44,42,36,33,37,35,33,37,35,39,37,33,33,33,33,34,31,31,32,34,35,36,34,31,35,31,30,31,31,29,31,33,29,32,32,27,28,29,33,27,29,29,27,28,29,27,31,33,23,30,29,26,24,27,25,22,26,28,25,26,27,22,28,29,25,22,24,23,26,28,23,26,23,22,22,21,22,20,20,27,24,25,25,19,24,24,21,20,17,20,25,23,24,24,19,24,22,21,19,18,23,22,22,26,24,25,29,28,27,24,26,25,28,24,23,23,27,30,29,26,39,35,27,35,33,27,29,27,29,25,22,27,27,27,27,28,27,27,24,24,30,27,23,25,29,24,25,24,36,40,45,59,68,69,68,123,125,68,46,41,39,51,75,109,98,98,112,103,90,78,81,89,90,73,48,27,26,32,30,38,37,39,37,21,23,21,19,25,24,27,25,23,22,22,19,21,22,24,19,19,24,21,20,21,29,23,23,24,22,24,25,28,30,42,55,62,66,68,78,76,69,74,95,89,83,96,113,161,184,181,168,159,162,155,152,160,158,160,158,153,159,164,166,167,163,164,161,165,163,160,160,156,155,156,155,147,143,141,143,148,154,157,156,158,161,162,158,152,121,93,88,103,147,166,171,166,152,150,153,153,152,156,158,155,156,154,154,152,150,151,146,145,139,137,141,143,151,154,149,145,145,149,155,159,159,162,161,160,157,158,156,153,152,155,160,158,157,159,164,165,171,183,194,200,199,198,191,188,184,181,172,171,164,169,113,11,2,9,10,14,10,14,13,14,14,14,15,21,21,18,21,17,21,19,19,18,18,21,20,27,23,19,21,21,21,19,23,25,27,41,51,60,63,76,77,54,32,19,22,17,21,22,19,23,19,31,33,30,33,33,37,35,39,37,35,39,40,38,37,36,31,31,26,23,19,19,27,30,25,22,21,22,26,26,22,23,21,22,22,21,23,19,21,29,21,21,30,23,25,29,23,29,31,30,33,49,63,71,79,74,65,66,62,48,44,54,67,66,64,54,39,43,60,74,90,103,97,102,106,95,96,97,90,89,83,81,63,62,84,94,105,110,118,123,130,135,121,92,84,103,112,104,101,100,108,118,111,110,124,123,109,103,107,102,95,90,76,70,77,89,90,91,90,86,94,114,115,99,89,76,75,79,78,79,81,73,78,87,83,78,76,74,78,87,94,98,97,107,104,101,106,92,86,84,78,80,86,88,90,86,91,86,83,83,78,83,71,73,76,71,77,74,69,72,72,83,81,82,87,86,89,89,88,88,94,90,93,101,101,110,111,114,110,101,101,94,97,93,89,101,97,99,99,105,108,106,109,110,107,103,106,108,105,102,108,111,104,105,101,95,89,96,97,96,103,101,101,97,88,101,97,98,102,102,102,89,89,80,73,86,83,81,75,72,78,80,84,85,79,71,68,63,64,72,71,68,61,63,70,69,68,64,61,59,59,49,41,50,68,83,90,103,131,137,128,139,159,171,130,79,74,95,114,132,140,162,228,253,253,252,252,252,252,201,155,103,60,21,6,11,12,17,19,40,45,48,53,50,53,50,49,51,55,54,57,49,46,48,46,47,48,46,44,47,54,49,45,43,44,44,43,41,42,43,40,41,39,39,39,35,43,35,32,38,32,39,32,32,37,33,35,36,37,36,29,34,28,29,35,33,36,36,32,33,29,33,35,29,32,30,29,31,36,30,31,31,29,34,31,29,25,26,29,29,30,31,24,29,28,27,28,23,27,26,26,27,25,30,24,25,24,28,28,19,26,29,20,24,29,19,27,19,25,24,19,24,21,25,22,24,24,24,22,20,21,23,23,23,19,23,23,21,19,20,23,18,24,22,19,22,21,21,24,22,25,26,20,27,23,24,27,28,25,18,27,24,26,27,27,27,27,29,28,29,31,35,30,25,29,24,29,31,26,30,25,30,26,31,26,23,30,27,25,24,27,27,27,31,38,38,32,40,39,29,27,43,49,34,36,34,33,48,106,155,141,143,151,145,132,120,134,149,150,141,91,36,27,31,29,32,28,21,24,22,23,23,19,24,20,22,28,26,21,21,23,22,22,19,23,23,19,23,22,22,26,24,20,24,31,31,39,42,47,55,52,66,72,70,84,71,70,96,89,84,93,122,164,184,177,168,167,166,165,157,162,166,162,163,162,158,162,166,165,165,163,165,162,165,160,160,162,161,160,163,158,155,155,153,157,156,165,164,160,160,157,166,165,161,153,126,102,86,105,142,162,175,172,168,162,158,154,156,162,159,155,159,154,150,149,148,155,157,155,152,155,155,158,156,149,147,151,151,157,161,161,164,163,162,160,162,156,155,157,158,166,163,165,165,167,172,175,179,187,194,194,192,189,188,184,181,174,173,165,173,114,10,2,8,10,12,10,13,12,13,14,13,13,18,21,21,19,21,17,17,19,18,19,18,19,24,21,21,21,26,21,19,23,21,29,34,50,57,63,71,66,51,30,22,22,20,20,19,25,30,30,34,38,42,43,39,39,44,38,37,32,33,28,23,23,22,26,19,22,24,19,24,27,27,26,23,24,23,23,24,24,23,25,23,24,21,20,23,17,19,24,20,18,25,27,33,34,30,43,42,36,39,36,49,50,58,57,52,50,44,48,46,59,71,73,84,93,96,97,88,118,133,104,132,113,103,94,89,66,70,83,81,81,87,90,116,120,131,137,136,131,129,132,98,87,96,124,131,109,106,130,118,108,97,96,84,74,75,84,83,64,56,57,62,77,60,53,49,56,78,55,57,91,72,59,61,85,71,77,97,89,81,94,78,72,75,82,83,83,87,95,88,79,83,86,95,105,109,105,83,80,76,71,78,86,83,80,85,91,83,76,69,67,68,81,77,89,88,84,93,96,105,98,99,97,99,101,99,101,105,113,104,104,107,106,108,106,107,103,110,106,105,106,109,114,109,103,105,105,124,120,119,117,111,107,105,108,97,109,106,116,114,115,121,118,122,121,117,120,115,112,118,108,107,99,97,100,95,101,94,93,91,82,90,87,89,81,81,83,71,81,96,99,83,76,63,62,66,73,78,66,73,73,77,71,74,69,71,69,67,66,62,55,46,46,39,38,36,34,39,73,110,152,177,130,84,66,67,68,59,71,94,154,230,252,253,253,252,252,252,252,252,220,175,128,83,46,12,9,11,12,21,33,43,45,50,50,54,50,52,52,46,46,45,46,45,47,43,44,48,55,55,42,42,42,39,44,42,38,41,39,42,39,34,38,39,39,37,37,38,37,36,38,34,35,39,35,30,31,35,38,35,33,34,36,34,31,29,34,34,25,31,30,32,33,33,36,27,32,32,34,34,27,24,29,29,26,30,29,24,23,27,23,30,32,24,29,27,30,25,26,26,25,22,26,29,20,22,24,24,26,29,25,25,23,21,24,26,20,24,21,20,29,24,22,23,22,19,24,21,21,21,23,24,16,26,22,19,24,19,19,23,22,19,19,24,19,21,19,19,24,24,27,23,27,22,28,23,29,54,35,25,23,25,26,28,37,32,32,30,36,42,26,27,33,27,27,24,28,31,26,23,28,28,26,24,24,30,28,28,25,24,29,29,38,40,39,36,36,31,23,30,30,29,33,32,39,46,81,142,136,137,142,138,134,124,126,130,153,157,111,37,14,20,19,24,20,23,27,18,24,27,21,27,19,20,24,20,22,24,23,24,25,24,24,21,21,24,20,21,26,25,22,31,36,39,48,48,48,55,65,75,68,71,78,75,88,91,81,92,122,167,184,177,170,162,160,162,165,162,164,168,168,171,166,157,160,159,161,165,160,163,163,166,160,160,163,160,162,166,164,166,167,163,168,166,167,164,163,167,165,165,165,169,164,155,137,103,88,97,131,166,180,179,173,168,163,162,160,158,159,158,151,148,152,158,158,159,164,165,158,161,166,160,158,159,159,157,157,161,162,160,162,167,162,162,162,165,162,162,171,171,168,169,172,170,176,179,186,186,184,188,181,181,176,173,170,172,162,170,116,10,2,9,10,12,11,14,12,13,14,14,14,23,20,19,19,19,20,19,18,22,18,20,18,18,23,21,24,21,25,20,22,24,24,35,44,48,56,59,63,49,29,29,24,27,27,31,31,36,36,34,42,39,34,36,32,29,29,27,23,22,24,23,20,23,23,24,23,20,23,22,26,27,25,24,24,24,22,23,24,24,25,22,21,23,18,16,17,21,21,19,19,23,29,38,37,30,38,37,39,39,31,37,49,52,49,49,52,45,44,52,63,76,90,92,105,120,109,98,87,91,93,87,105,100,91,81,71,81,85,101,102,102,96,87,91,83,87,88,74,69,84,92,82,80,83,96,107,104,108,109,98,82,68,76,80,80,86,80,85,86,92,96,90,84,76,80,80,85,83,83,79,82,94,98,106,106,114,120,121,116,113,107,95,95,100,103,111,105,107,106,97,99,93,96,104,110,113,101,97,95,92,98,97,94,100,101,104,104,108,110,99,98,98,97,107,104,107,104,100,110,104,108,108,105,103,103,112,113,114,112,107,110,112,111,105,108,107,104,111,116,119,116,109,116,116,118,120,128,131,123,120,112,116,120,113,111,107,113,122,116,117,122,120,123,117,118,111,108,115,104,99,99,102,106,97,96,93,94,92,89,103,105,97,98,95,104,110,103,96,95,90,87,76,69,71,64,67,65,67,80,77,71,67,72,65,62,69,61,57,56,55,54,52,49,41,38,34,33,30,46,80,107,110,101,89,77,60,46,43,48,70,123,159,193,225,238,253,253,253,252,252,253,253,247,211,170,122,86,48,17,8,11,11,18,36,34,41,47,44,35,42,47,39,49,46,47,48,45,50,49,50,40,43,41,41,41,38,46,39,34,33,38,39,34,36,37,41,34,37,38,36,35,31,34,32,31,33,36,29,37,31,30,33,27,32,32,32,34,32,30,35,31,32,27,33,29,33,28,26,32,24,34,25,29,29,25,27,27,26,27,33,23,22,29,30,25,25,24,23,21,25,24,24,26,23,26,27,28,23,22,25,21,19,21,29,19,24,26,21,27,16,19,21,21,20,22,23,21,24,23,22,19,22,23,22,21,26,22,19,23,20,21,19,16,21,22,21,22,19,18,23,24,30,27,30,22,50,61,25,24,24,24,23,30,27,33,32,30,34,23,30,28,28,27,34,33,26,28,28,33,32,26,25,28,27,28,29,33,37,33,27,32,37,39,42,38,37,29,24,28,35,35,35,29,41,45,68,119,125,124,126,132,129,119,101,115,141,156,119,39,14,14,14,23,24,24,22,19,26,23,21,23,24,17,21,24,18,26,22,23,23,23,24,19,24,20,25,27,19,28,29,42,44,42,52,50,56,62,69,78,64,76,90,76,87,88,90,113,151,181,173,166,156,146,145,153,163,164,167,171,169,172,167,158,162,156,163,162,160,162,159,162,160,161,162,162,163,166,167,167,165,167,168,168,168,166,170,168,166,168,167,165,166,169,166,139,104,84,91,128,159,179,181,178,173,165,164,155,153,155,151,153,157,163,162,162,161,160,160,158,160,158,157,158,153,150,153,161,160,158,158,155,155,159,163,170,168,165,169,167,168,166,164,167,169,173,177,176,175,174,170,172,166,169,168,170,166,174,115,10,0,9,10,12,11,13,12,13,14,14,14,21,22,16,19,17,17,20,19,18,16,22,20,21,24,19,21,21,23,27,23,20,25,31,36,39,43,47,49,42,31,42,30,31,37,28,36,26,32,28,24,31,24,28,23,19,23,24,22,20,24,23,19,23,22,22,21,20,21,22,24,30,28,23,29,26,19,23,24,24,23,17,23,23,17,18,21,22,17,21,18,20,29,29,31,29,27,27,31,32,31,32,32,33,35,38,39,38,36,36,37,37,40,42,40,39,45,67,37,39,39,33,65,41,40,42,40,39,47,49,53,57,78,59,46,33,38,46,35,38,34,74,35,71,71,59,69,91,84,90,89,88,83,93,92,98,96,93,96,94,109,112,105,104,106,105,102,111,100,111,105,103,99,110,111,116,110,105,117,117,112,112,101,109,112,112,104,105,109,106,107,103,99,98,97,103,113,107,119,116,118,121,115,107,107,113,118,110,110,107,115,115,116,119,112,108,105,107,112,113,115,116,111,118,117,114,119,122,123,122,125,120,124,117,114,122,116,118,110,114,111,110,116,113,122,122,116,117,115,116,116,120,114,124,124,118,116,114,117,118,117,110,107,110,107,105,112,116,114,110,110,116,111,109,107,101,102,97,97,102,99,102,98,104,110,107,108,95,89,91,85,89,86,75,79,72,79,69,67,77,71,70,73,70,69,64,63,58,52,52,55,55,53,49,51,53,57,63,48,49,63,82,85,81,83,85,79,69,62,51,48,68,121,103,125,147,178,196,218,252,252,252,252,253,253,252,252,250,195,152,122,84,44,14,9,56,12,24,36,36,40,47,46,42,44,46,46,42,45,49,46,44,44,42,42,42,38,40,41,39,39,36,39,35,40,39,38,35,31,34,42,36,35,38,25,38,35,33,32,29,37,27,27,29,26,32,29,29,33,30,33,35,28,30,35,29,30,28,28,30,27,29,29,29,27,27,25,27,24,30,29,24,25,33,26,24,23,22,26,17,22,24,24,24,26,29,26,24,27,24,20,23,26,25,20,24,24,21,24,21,19,21,19,22,21,23,27,23,24,18,19,24,19,23,23,22,21,21,24,21,18,23,18,19,22,19,24,18,19,21,21,22,20,24,28,24,31,35,21,24,25,24,28,20,29,23,30,29,21,28,27,29,30,38,34,28,28,30,34,35,24,29,26,25,31,27,34,40,41,27,22,33,38,43,42,34,36,27,23,28,32,32,31,37,36,44,53,94,105,101,107,115,113,102,97,106,130,147,111,39,15,17,16,25,24,20,24,20,19,22,25,24,21,24,26,19,23,22,24,27,23,19,23,21,20,23,24,22,29,34,39,44,42,47,51,54,57,66,72,66,67,84,89,79,83,88,108,149,164,167,165,160,148,138,139,151,162,156,160,161,162,168,163,162,164,157,159,157,154,158,159,160,157,162,163,160,164,162,159,157,159,157,159,158,158,163,160,163,160,159,162,164,164,168,169,159,144,114,85,91,119,154,177,181,177,168,164,160,157,161,156,159,162,162,165,160,162,160,155,160,160,157,160,156,156,157,154,153,152,151,150,152,153,154,156,163,160,159,160,157,159,157,156,158,163,160,167,165,163,164,159,164,163,165,164,168,161,174,116,10,1,8,10,13,11,13,13,13,13,14,13,18,17,21,18,19,19,19,17,21,20,21,23,24,24,24,24,22,25,28,29,30,29,32,34,32,33,39,34,37,33,29,34,24,21,27,24,22,21,28,26,23,22,19,25,24,25,27,24,21,23,21,21,23,22,24,19,24,21,24,23,26,28,21,23,20,20,24,20,17,18,20,17,19,17,20,18,20,18,18,21,22,31,40,49,46,48,53,59,61,59,57,54,55,52,48,43,41,43,40,45,51,52,46,42,42,42,46,51,42,41,38,41,47,37,40,45,46,48,52,53,59,69,85,81,75,83,83,87,80,79,78,74,82,79,88,85,88,103,99,98,98,100,106,105,108,110,108,108,102,105,120,128,123,117,116,108,111,113,118,120,110,116,122,125,119,113,100,98,106,112,112,107,117,117,114,111,108,114,111,105,105,110,112,111,113,118,117,118,127,117,127,120,117,125,119,120,114,116,114,114,118,123,120,122,112,113,121,118,125,118,120,122,120,121,120,122,124,125,127,122,123,124,126,127,125,125,117,111,111,111,110,107,112,117,117,111,107,108,111,113,116,123,129,123,125,124,115,112,100,103,112,115,119,119,118,117,118,119,118,114,117,114,106,100,104,98,94,92,99,100,87,94,100,98,87,74,86,92,89,79,74,73,74,87,88,83,75,68,75,90,83,73,74,81,77,77,66,56,59,53,57,57,55,60,69,81,90,72,69,77,72,67,60,58,72,93,97,99,96,114,132,111,107,101,88,104,111,126,155,187,218,239,253,253,252,252,252,252,252,252,244,213,184,153,114,90,37,6,10,11,23,33,38,42,39,44,44,42,45,51,47,45,40,40,39,35,37,39,33,36,39,37,41,42,46,41,31,37,35,33,35,33,36,36,33,29,28,32,36,33,29,34,28,27,28,31,33,24,35,33,22,33,31,30,31,28,31,33,32,21,29,29,26,26,22,29,23,23,34,22,20,26,19,29,24,23,24,17,29,27,24,26,25,25,21,24,22,29,22,24,25,21,28,25,24,24,22,19,21,23,22,20,24,19,22,23,19,24,19,20,22,21,22,20,20,19,23,21,16,23,19,23,24,20,21,21,21,22,22,21,19,20,23,19,22,24,27,29,26,23,23,26,24,25,25,26,20,30,32,25,27,33,39,28,23,31,34,32,37,31,24,31,28,29,31,29,32,29,24,26,33,39,41,37,36,37,27,21,27,32,33,34,35,35,39,48,63,85,90,95,98,89,93,85,104,107,127,107,36,24,17,17,24,22,23,20,23,21,22,25,18,28,22,23,24,17,26,21,21,23,24,22,20,27,24,29,25,33,42,35,47,44,49,60,56,59,64,71,73,72,91,81,76,92,103,149,171,163,167,163,163,163,152,153,159,154,147,147,155,157,163,162,160,166,159,160,159,155,157,157,154,153,163,162,160,165,160,157,160,151,148,148,144,144,147,148,150,155,152,151,153,158,160,166,171,171,155,122,93,83,116,145,169,175,172,171,162,165,162,159,162,164,159,160,162,159,160,162,161,163,165,169,165,164,167,161,155,153,158,155,152,155,155,154,155,154,150,148,144,147,150,151,153,152,153,158,157,159,160,158,162,163,165,165,167,162,170,113,12,2,8,10,13,11,13,12,14,15,13,14,22,20,17,23,18,19,19,22,20,21,24,20,31,28,28,34,33,33,34,36,37,38,36,35,35,31,32,29,24,25,24,20,22,19,17,21,22,24,24,20,25,22,23,29,23,23,27,22,24,23,22,20,19,23,19,21,21,18,24,21,22,24,18,22,19,17,21,21,19,21,18,20,17,17,18,19,20,16,24,23,21,31,45,54,63,63,74,89,62,64,64,60,59,57,63,57,58,61,66,73,66,53,55,46,42,37,53,52,49,46,53,59,57,52,57,57,62,70,54,47,47,72,97,109,103,108,113,104,114,122,134,130,129,107,113,115,107,111,107,108,107,110,109,112,118,114,117,113,107,113,115,118,117,113,115,108,107,107,112,117,118,117,124,131,127,116,116,113,117,124,124,124,121,122,128,122,118,118,116,110,115,115,123,126,120,123,113,117,117,118,119,118,129,125,132,124,122,124,116,122,118,113,128,126,126,121,119,126,122,124,123,119,126,130,128,127,125,123,122,118,114,125,125,122,125,122,122,117,119,121,117,120,119,121,122,123,122,121,123,123,127,113,119,120,117,122,119,114,113,120,118,122,121,118,119,121,122,117,110,111,108,116,108,109,105,103,92,99,98,89,88,86,89,93,84,84,87,86,92,85,92,94,82,85,93,77,72,66,76,73,66,66,70,76,77,75,68,62,60,59,63,63,63,63,70,62,79,70,74,81,84,71,59,59,71,81,81,87,98,104,117,93,111,105,107,105,88,71,76,89,111,137,168,194,211,235,252,252,253,253,252,252,252,252,199,228,154,120,108,64,25,8,9,17,24,28,34,41,40,46,50,43,36,40,40,38,38,39,39,36,39,41,34,36,37,42,37,34,35,34,39,33,34,34,33,35,30,33,29,30,31,30,33,24,29,29,24,30,25,28,30,25,31,34,30,29,32,29,25,26,24,27,24,24,27,21,32,27,29,22,21,19,24,26,26,25,27,28,23,27,26,27,21,26,26,19,24,21,21,25,27,20,22,22,19,20,23,21,25,19,23,26,18,20,22,22,21,19,19,22,21,22,23,19,20,21,22,19,21,21,22,20,17,17,21,21,18,19,21,22,19,24,23,18,22,18,24,24,25,27,24,29,26,23,26,24,27,34,30,25,27,25,28,27,32,32,31,42,42,34,29,28,29,29,32,32,28,29,25,32,35,39,44,36,38,35,25,24,31,32,39,32,34,38,42,43,67,104,112,114,108,97,95,95,109,110,142,126,50,23,12,16,21,20,20,19,24,24,18,23,24,20,21,26,21,20,20,21,24,20,21,25,21,24,30,29,31,33,33,39,51,46,57,58,59,62,74,75,75,81,83,88,93,107,144,171,174,162,159,163,169,165,163,165,163,161,150,153,159,158,159,159,162,167,166,168,162,159,156,160,162,157,163,164,164,163,158,159,164,158,157,156,145,148,151,146,152,156,156,155,148,153,162,167,177,179,174,160,137,103,87,104,135,165,177,179,168,162,165,161,165,161,159,160,159,168,169,167,171,172,169,173,170,171,171,163,160,156,161,166,157,158,155,155,163,158,154,150,149,148,152,155,152,151,153,161,159,158,162,160,160,162,164,160,163,153,169,115,10,1,9,10,12,10,14,12,12,14,14,14,21,20,22,21,23,20,24,27,22,29,29,27,33,35,33,34,36,33,34,30,29,27,21,27,26,21,23,22,26,20,17,21,18,19,18,18,18,22,21,26,24,22,23,23,23,17,23,22,21,24,19,19,18,18,21,18,17,17,24,20,19,21,17,17,17,17,17,17,19,19,19,18,16,17,17,18,19,19,19,18,18,29,39,53,57,50,45,42,49,53,56,55,52,54,58,57,63,67,70,69,62,57,59,54,50,50,49,50,51,49,51,59,61,62,64,66,71,65,57,51,51,53,66,81,90,98,86,77,83,105,116,121,122,125,132,120,117,117,114,118,116,117,112,107,116,113,106,111,113,112,117,117,111,112,113,110,112,110,117,122,114,120,117,117,120,117,108,115,128,137,138,122,116,115,122,127,123,116,115,116,124,124,126,124,120,119,118,112,110,109,116,112,110,110,111,125,109,111,117,118,123,118,110,112,112,115,118,123,131,127,129,127,131,132,134,128,117,124,124,115,121,121,125,124,119,122,122,122,125,127,129,129,136,134,128,135,126,124,130,122,115,106,111,118,119,122,125,129,134,127,116,116,115,118,124,127,127,127,125,124,120,122,126,121,119,110,117,109,103,89,74,76,81,93,89,86,91,81,83,85,93,105,90,83,76,70,69,67,64,60,60,64,72,74,71,74,72,71,71,68,68,65,58,53,54,59,66,65,74,77,76,74,61,46,54,64,57,57,59,78,80,62,84,112,138,149,134,115,89,66,53,53,64,83,108,136,162,184,205,220,237,246,253,253,253,253,220,248,250,211,177,149,147,114,55,30,12,9,14,21,37,38,35,32,36,33,37,42,34,34,36,38,32,33,39,41,33,34,39,34,35,32,32,35,32,32,32,31,29,37,32,27,29,29,25,27,28,27,24,27,29,29,29,29,30,24,23,28,32,27,27,24,24,31,26,27,26,22,25,25,29,27,26,23,27,29,25,23,28,23,19,26,23,24,25,24,23,25,24,23,21,23,28,21,19,21,25,26,23,22,22,19,20,21,20,21,21,22,23,21,23,20,19,24,19,19,20,19,23,20,18,22,22,22,21,19,18,18,21,20,19,21,19,20,19,23,22,16,19,24,22,24,27,24,27,24,37,35,29,36,32,26,30,31,31,31,42,53,36,33,29,32,27,27,31,27,27,26,32,29,35,39,41,41,39,42,27,24,32,34,38,34,37,39,42,43,77,137,145,135,135,118,113,109,127,134,167,150,63,27,11,14,21,22,24,18,24,25,19,23,24,23,19,19,25,21,23,24,22,21,21,23,27,27,22,27,34,35,37,43,50,54,56,56,63,68,78,79,78,87,96,92,105,152,173,179,169,157,160,160,165,169,162,162,160,164,168,165,165,160,159,164,167,170,166,167,163,160,155,161,165,163,171,165,160,155,149,148,159,162,166,163,160,160,156,153,158,165,164,159,157,160,161,161,165,172,173,176,170,136,104,84,101,132,163,182,174,169,166,166,167,160,158,159,162,164,165,169,167,166,164,161,159,162,164,158,154,159,166,165,161,159,156,162,170,163,163,163,158,160,164,165,163,165,168,174,175,170,165,165,161,160,162,154,154,148,163,114,11,1,9,10,12,11,13,11,13,14,14,14,24,22,25,27,33,26,32,33,27,34,33,34,33,31,33,27,26,24,27,23,19,22,19,19,22,19,23,22,20,18,23,23,18,21,18,17,20,24,27,20,25,24,19,22,22,24,19,18,20,19,18,17,20,20,17,21,19,19,20,22,21,16,18,18,19,18,18,17,20,21,19,21,18,20,18,18,19,18,19,18,19,33,37,51,54,45,48,57,56,57,54,54,63,57,74,65,66,66,89,89,89,85,76,63,76,66,59,63,66,72,70,83,89,86,87,88,103,99,90,82,68,72,79,77,79,86,85,86,71,95,76,88,112,108,128,124,125,124,127,126,125,123,119,120,119,118,119,115,122,121,128,131,126,118,115,121,125,123,125,125,118,122,119,115,117,122,122,118,126,127,129,120,118,124,121,123,126,120,122,127,125,129,128,121,118,122,120,123,124,120,118,118,115,106,108,107,122,127,121,127,126,111,107,110,113,110,115,125,126,130,122,115,123,126,128,121,118,129,126,118,117,121,123,124,124,120,125,124,123,129,125,134,134,128,112,109,114,109,111,110,120,123,123,121,118,116,120,117,121,128,120,124,120,130,127,128,135,129,131,132,131,134,141,138,134,138,136,129,120,102,92,93,96,96,90,82,77,72,63,63,87,95,94,96,78,83,89,83,82,79,84,73,92,93,88,76,68,73,70,78,79,64,59,54,60,57,57,60,57,61,61,54,48,49,42,46,54,52,51,66,71,64,92,120,139,156,148,156,154,143,127,108,90,91,84,89,101,127,118,130,150,165,189,215,236,210,217,253,253,253,252,252,252,241,221,194,153,115,82,112,26,9,16,19,24,36,33,30,35,37,33,33,32,34,36,36,39,35,34,27,34,32,33,32,30,34,33,33,28,28,25,27,30,30,27,24,26,27,30,26,24,24,21,29,23,25,27,23,25,27,29,24,28,26,24,24,26,27,22,25,28,24,30,25,25,25,20,24,22,22,27,24,23,27,28,24,24,24,24,23,26,22,24,20,21,27,20,19,20,17,22,24,21,19,19,22,21,22,20,21,21,18,22,20,21,21,17,19,21,22,21,21,21,23,21,21,21,19,24,19,21,19,20,19,18,21,20,21,19,23,24,24,25,25,23,36,33,24,30,29,29,29,35,27,30,31,34,35,29,35,32,33,35,31,26,27,30,33,31,24,30,41,42,45,41,40,27,23,29,40,40,32,40,36,38,41,63,113,116,123,116,98,98,103,112,109,146,130,55,27,15,16,24,21,20,22,24,21,17,24,20,21,21,20,25,23,22,19,20,25,23,23,29,29,27,25,31,39,40,45,56,54,56,62,64,70,80,80,86,108,89,105,149,174,180,172,171,164,159,165,172,162,156,157,156,164,165,162,163,155,158,165,155,160,163,161,156,155,152,156,160,160,166,162,160,149,136,141,159,162,165,166,161,165,160,157,165,169,163,159,148,146,145,141,148,151,157,163,172,163,137,103,84,98,127,163,177,177,173,170,173,163,162,161,158,162,155,158,156,151,154,151,151,155,156,154,158,161,167,166,159,162,162,164,167,163,164,160,163,163,168,170,168,171,175,180,177,172,170,162,159,160,162,156,157,148,163,113,12,1,8,10,13,11,12,12,12,14,13,13,30,32,37,33,35,36,31,31,30,28,25,23,25,27,22,18,19,20,17,18,22,21,18,22,19,20,21,20,23,19,21,21,18,19,20,19,22,24,24,22,18,22,22,21,18,21,19,16,21,18,16,17,18,17,17,21,21,26,24,19,23,19,19,19,17,24,21,21,19,21,21,19,20,20,19,15,22,19,19,20,21,30,36,49,45,46,54,61,71,67,76,83,79,79,89,109,112,105,106,107,109,104,89,68,62,51,46,51,54,66,75,95,106,103,104,102,118,124,120,99,93,91,93,92,87,106,107,96,87,87,77,89,100,111,120,121,130,118,125,129,121,132,131,126,126,129,133,128,130,127,125,128,127,131,127,126,129,121,125,134,128,125,128,129,131,134,125,120,117,117,121,120,126,125,121,120,114,110,117,119,125,125,124,122,119,124,132,134,134,134,134,130,125,116,115,131,131,131,133,125,128,125,120,128,123,120,118,116,125,120,122,115,112,125,133,132,136,143,127,116,118,124,123,122,131,129,127,124,130,130,130,129,125,127,113,120,118,118,125,121,132,133,131,129,124,124,125,125,130,127,127,133,130,128,118,112,107,113,122,127,133,136,139,137,138,130,128,129,131,118,111,110,108,104,87,84,95,93,101,89,90,98,100,110,105,103,109,101,97,95,85,92,91,87,92,83,75,77,75,74,74,73,61,59,59,61,67,57,53,52,50,51,46,46,48,47,54,51,49,67,81,80,95,106,126,146,143,163,185,186,182,173,164,147,130,122,113,92,83,75,69,72,90,130,157,148,184,234,241,251,239,242,252,252,252,252,252,249,238,226,198,147,113,78,49,19,6,15,22,27,36,31,31,36,33,34,36,34,31,29,33,30,32,30,29,38,26,30,29,28,32,27,29,27,26,25,26,24,26,24,27,23,25,25,22,28,25,29,27,24,25,24,22,24,24,29,24,25,27,24,27,22,26,24,22,25,24,24,24,26,24,22,28,25,22,22,20,23,27,25,23,23,19,21,24,23,21,18,22,23,20,21,20,22,25,24,21,20,22,21,16,23,23,16,22,20,17,26,17,21,24,20,21,21,23,20,23,18,20,22,17,20,24,20,18,22,23,19,19,19,19,19,22,21,25,33,25,27,32,25,29,33,29,29,31,33,31,29,31,38,37,48,44,27,31,31,34,33,29,28,34,43,42,35,40,39,27,24,26,41,44,35,42,30,34,46,48,61,67,67,61,54,52,58,54,56,74,73,55,26,15,17,18,25,21,19,24,26,22,22,23,21,25,21,20,22,21,22,24,22,23,24,28,28,28,26,26,39,43,52,54,53,65,65,65,74,83,85,96,98,106,147,173,177,166,159,165,163,162,160,162,156,152,153,149,153,155,154,153,155,157,155,148,151,157,155,153,157,151,158,158,154,163,160,157,155,153,156,165,163,165,161,163,171,163,165,169,167,164,153,147,141,139,137,139,143,147,161,166,169,163,139,113,89,89,125,157,173,173,169,171,165,160,163,160,159,155,155,152,152,155,154,150,155,155,162,167,164,168,165,161,162,158,161,163,153,156,153,156,164,165,164,163,168,169,170,168,165,163,157,156,161,164,155,163,158,167,114,11,2,8,10,13,11,13,12,13,13,13,13,36,29,30,29,28,27,24,22,16,19,23,19,23,19,20,22,21,27,20,18,21,21,20,17,25,24,21,22,24,24,17,23,20,22,19,18,24,19,19,17,20,20,17,18,18,18,18,19,17,19,19,16,19,17,17,20,19,19,22,20,17,19,16,18,18,19,22,17,19,20,21,18,17,22,18,17,18,19,20,18,21,27,27,33,31,33,42,42,39,48,47,57,57,60,62,59,57,64,69,71,70,67,61,49,48,38,38,44,39,39,44,55,64,65,67,71,66,82,63,61,50,69,68,49,45,50,64,55,48,43,47,51,63,89,89,81,75,80,88,98,96,101,104,110,109,117,128,128,128,120,118,111,119,128,128,129,121,123,129,133,129,124,125,124,119,118,116,117,113,108,117,115,115,122,125,126,128,122,126,135,128,127,124,125,128,133,138,134,136,129,124,128,123,127,131,134,139,130,129,125,126,124,128,138,136,134,129,127,125,133,129,124,128,130,134,134,137,126,122,128,127,131,131,129,131,124,122,123,124,129,131,128,128,137,134,132,138,136,142,139,134,129,125,131,131,125,124,132,133,129,133,132,128,124,114,117,122,117,123,127,127,126,123,118,110,110,110,110,108,113,113,111,103,108,108,110,116,115,117,106,106,110,107,104,103,107,99,88,81,83,83,92,92,75,75,75,74,65,57,66,67,67,71,66,68,69,73,60,61,57,54,53,48,50,51,60,62,53,55,73,85,81,108,87,81,85,105,100,102,102,98,95,89,100,109,110,113,106,97,89,93,52,51,66,64,57,107,131,138,159,169,179,194,218,234,245,252,252,252,252,253,253,242,215,201,171,136,114,117,55,22,15,14,25,28,25,29,26,31,27,29,34,29,29,29,26,27,31,31,28,29,27,29,30,29,28,24,27,24,28,27,26,27,29,27,32,25,27,24,24,29,21,26,20,28,25,25,27,24,24,20,26,24,21,28,24,20,27,24,21,24,24,19,25,24,23,28,23,20,24,24,22,23,23,17,23,20,22,22,19,24,23,21,19,23,23,20,18,18,20,20,22,19,19,19,19,22,20,23,21,18,22,23,21,21,22,22,15,24,27,19,23,22,19,22,21,19,19,22,24,20,17,20,19,22,23,22,25,26,28,26,28,29,34,34,30,36,36,35,39,40,49,39,28,29,30,43,42,27,31,33,39,44,43,40,39,28,26,33,39,43,37,42,40,41,41,42,51,46,51,41,33,41,36,38,38,48,49,41,27,17,18,18,21,24,21,21,22,26,24,19,18,21,22,21,26,22,19,23,19,24,22,23,27,26,29,35,35,40,56,55,57,70,67,68,86,86,86,97,113,150,174,175,163,156,158,164,158,155,156,160,153,157,156,151,150,147,151,157,153,155,161,156,159,160,158,162,165,162,163,157,153,162,155,154,161,160,163,168,162,160,157,157,162,154,156,159,162,160,160,152,153,158,151,153,156,163,160,164,165,166,162,153,128,84,83,112,140,156,165,165,159,159,160,162,164,164,162,157,160,163,159,155,159,162,168,169,160,160,158,158,160,159,158,159,156,153,155,158,160,163,159,159,160,163,165,161,162,160,158,160,160,163,162,167,161,171,115,11,2,10,10,12,11,14,12,13,14,14,13,19,25,22,17,25,20,16,17,17,20,22,26,21,21,22,21,23,20,21,20,22,22,21,22,19,21,20,23,24,19,19,18,21,23,16,17,17,16,19,17,20,21,15,23,22,16,21,18,19,18,17,19,16,21,19,17,18,20,18,17,19,17,18,18,18,18,17,21,20,20,19,16,17,17,20,19,16,21,21,18,19,21,20,19,21,19,19,21,22,28,26,32,35,39,35,33,30,31,30,34,35,36,36,33,44,40,42,44,36,38,38,34,41,39,45,46,42,43,39,48,51,53,50,49,46,47,43,37,37,38,43,46,54,60,59,62,66,54,59,74,79,87,93,101,108,114,125,124,124,121,116,113,112,128,127,122,128,129,128,127,125,120,125,116,104,105,111,122,122,113,116,120,119,124,131,130,129,128,134,134,135,132,130,132,130,131,127,122,121,126,126,125,128,129,126,129,129,134,137,125,128,128,131,141,146,140,137,136,134,132,132,133,135,139,133,123,128,130,122,122,131,138,134,133,135,133,128,124,127,129,136,134,132,142,141,143,139,137,139,134,135,131,128,130,123,120,121,128,135,130,129,131,125,128,130,135,140,142,141,133,129,125,117,109,110,107,105,111,113,110,116,113,108,117,112,119,114,99,110,106,110,107,89,89,89,92,86,74,79,73,74,88,88,75,81,84,72,73,63,63,67,65,69,74,74,73,67,59,70,69,64,66,62,58,51,54,53,51,60,65,80,70,69,69,59,65,51,47,48,43,57,60,63,71,80,88,103,119,130,136,132,121,91,87,52,25,50,38,36,59,64,81,99,129,151,163,186,207,220,241,252,252,252,252,252,252,251,247,223,200,186,157,127,95,59,32,25,15,26,28,26,28,24,31,24,23,24,25,26,27,29,30,28,25,30,26,27,26,29,29,27,29,32,32,23,28,25,25,25,23,26,23,25,28,23,24,25,27,23,20,29,23,21,23,23,26,24,24,23,22,25,24,18,22,24,22,21,23,25,24,23,19,20,21,22,21,22,24,24,21,22,22,16,20,18,21,23,18,21,21,20,19,22,18,23,19,18,23,23,21,22,22,19,21,18,21,24,20,20,23,18,19,19,22,25,20,16,20,21,19,21,17,22,21,18,19,22,24,23,27,23,26,32,29,36,46,45,38,39,35,34,32,31,35,27,29,52,38,19,32,38,39,45,42,44,36,29,28,34,47,42,39,41,35,40,41,46,50,45,46,34,29,32,35,33,39,40,39,40,30,22,17,19,18,22,22,18,22,24,21,20,21,21,19,24,22,17,23,19,19,25,25,24,30,27,27,36,35,46,56,59,64,74,77,86,95,91,92,115,159,181,179,172,162,163,162,155,150,153,156,158,159,163,163,163,164,165,162,160,160,168,168,167,165,164,163,166,165,155,163,157,161,168,164,169,171,165,160,162,159,160,152,152,155,150,152,151,155,159,160,161,164,167,164,165,163,166,166,165,163,162,169,173,156,117,83,77,97,131,158,168,168,166,166,165,170,168,168,167,166,165,166,161,162,162,163,166,162,162,159,160,165,160,164,166,164,166,165,163,161,162,162,161,166,165,164,165,162,162,162,159,158,163,162,167,157,166,115,11,0,9,10,12,11,13,11,12,13,14,13,19,21,20,16,17,17,18,19,22,16,21,23,19,21,23,20,15,23,21,22,21,16,19,19,18,17,19,19,17,17,19,18,17,19,18,17,19,17,16,20,19,17,17,16,19,19,18,17,16,17,18,18,20,19,17,15,21,19,18,21,16,19,16,19,21,19,22,22,23,20,19,18,19,19,18,21,19,18,19,22,19,17,21,18,19,22,19,22,20,19,21,21,26,24,27,24,23,29,29,31,32,36,44,43,49,54,52,57,52,50,46,39,43,42,56,52,47,47,45,52,61,66,70,69,61,59,50,54,55,52,65,66,82,75,78,84,85,92,77,106,99,128,128,123,130,130,128,131,128,132,128,125,125,124,128,130,128,131,126,124,129,129,130,130,129,126,127,133,134,125,127,130,131,130,128,124,124,125,128,130,131,129,131,132,120,118,114,111,126,128,127,130,124,125,126,128,135,131,128,126,126,128,131,133,137,137,136,131,126,129,123,124,130,133,130,126,132,131,131,139,134,133,135,141,141,136,133,138,139,135,135,132,133,141,146,139,136,132,132,135,140,138,135,133,135,139,134,142,141,144,133,131,134,133,130,133,139,136,132,137,122,123,123,125,115,114,125,127,123,120,119,122,118,120,111,107,101,93,107,113,110,104,90,82,80,89,84,82,80,77,79,90,84,81,92,77,85,87,75,68,69,74,66,65,73,72,74,68,70,78,74,75,65,69,61,55,56,59,60,56,64,56,60,56,61,52,50,44,44,40,71,49,52,74,71,67,99,103,127,128,127,143,146,141,86,57,87,86,90,68,58,55,48,53,69,87,99,122,147,168,197,215,226,232,236,245,246,248,251,251,252,249,236,218,205,200,155,123,117,87,66,43,27,24,26,25,24,28,28,25,29,24,27,24,28,28,24,28,28,30,27,33,26,26,30,24,29,27,29,24,27,21,26,29,21,23,21,24,26,25,24,25,26,26,24,23,23,24,21,21,23,19,23,28,21,29,20,22,21,19,24,20,21,20,21,29,24,20,21,19,19,23,17,23,21,18,24,19,18,17,24,26,19,23,22,17,18,21,18,18,21,21,25,22,21,23,21,22,24,20,24,21,19,22,19,22,17,21,19,18,20,20,21,17,24,22,15,21,22,19,23,26,28,30,44,50,36,35,39,32,36,40,33,33,26,29,35,26,27,34,34,43,45,40,49,43,24,29,41,47,39,43,41,37,45,42,45,46,43,42,39,35,33,34,33,36,39,45,42,26,25,19,20,21,21,24,16,20,19,21,22,18,23,19,22,20,23,26,21,21,22,25,24,24,30,28,36,40,46,52,53,69,79,84,100,91,89,123,167,183,182,174,164,166,161,159,160,152,154,159,162,158,162,163,171,171,163,165,163,160,159,159,155,155,160,160,157,154,156,160,159,163,168,165,169,168,158,159,165,165,164,157,162,167,161,156,153,155,152,151,157,162,165,160,155,157,158,163,165,159,159,160,176,172,147,124,95,77,90,129,159,175,179,174,171,164,164,166,167,167,167,168,166,163,163,166,164,166,165,160,163,163,165,162,164,169,169,165,162,165,168,167,167,171,171,169,167,169,170,165,161,156,164,161,166,159,166,114,12,2,8,10,13,11,14,13,14,14,12,14,22,23,19,20,18,21,21,18,19,18,22,19,20,21,19,22,19,21,18,17,20,17,17,18,19,19,20,17,16,17,17,19,17,19,19,18,17,16,17,16,17,16,19,21,16,18,19,17,20,18,17,17,18,19,17,19,19,17,19,18,16,18,18,22,19,21,23,22,20,20,20,16,18,17,18,19,17,19,18,19,22,26,28,24,23,28,26,24,27,23,23,30,32,37,43,42,45,47,50,52,50,49,54,55,61,65,71,81,74,67,65,70,72,73,71,62,60,55,50,55,50,47,57,69,74,65,63,68,80,88,91,93,96,92,87,89,84,83,89,101,112,117,122,124,127,128,128,126,127,136,136,132,125,126,122,123,127,125,128,130,130,134,147,142,140,139,137,141,143,137,131,130,132,132,131,128,129,125,125,125,127,127,127,121,121,123,120,127,132,136,121,120,131,128,136,135,128,130,122,123,131,128,132,129,122,118,119,126,120,123,127,131,131,135,141,135,144,146,143,144,141,141,141,139,141,141,143,150,145,144,146,148,154,153,144,134,136,139,146,142,143,149,148,154,153,153,153,153,155,155,152,155,157,155,148,140,143,147,147,147,141,139,139,136,128,141,153,148,145,133,126,130,132,134,127,124,116,121,128,121,120,107,97,92,84,91,85,74,78,81,85,77,80,67,59,66,72,77,69,79,83,76,85,79,78,89,84,76,67,54,64,67,70,68,53,62,55,54,49,51,55,54,48,48,51,48,46,48,50,44,49,43,45,46,48,51,56,86,105,110,112,124,149,159,117,117,158,165,167,162,130,101,71,69,76,64,58,47,48,67,89,113,136,152,162,183,196,206,224,231,235,236,235,250,251,237,229,227,234,227,215,197,183,179,129,87,66,57,36,20,30,25,26,21,20,29,27,26,25,22,22,27,26,28,29,29,24,24,28,22,24,21,24,27,30,23,21,26,25,29,27,21,23,23,21,22,25,22,23,25,21,28,22,24,23,17,24,18,19,21,21,22,21,21,24,20,18,21,24,17,19,19,19,23,21,21,21,23,19,21,19,19,22,20,23,20,22,24,16,21,24,19,20,22,19,23,21,19,18,19,18,20,21,19,21,18,24,18,19,21,21,20,17,22,23,23,19,18,18,20,23,22,24,25,34,36,32,33,34,39,38,37,32,29,29,31,31,25,34,34,30,45,47,44,45,41,33,26,40,46,44,42,39,44,44,42,43,46,42,39,45,35,35,34,29,34,38,44,36,24,23,23,21,22,23,23,19,18,21,21,21,22,22,22,23,24,23,23,23,22,21,20,29,28,28,29,30,41,43,53,62,78,84,84,90,82,105,155,173,171,165,162,167,160,160,166,167,166,164,160,155,149,155,158,163,162,156,152,153,152,151,149,150,147,148,156,157,161,161,159,155,153,155,156,160,159,154,158,160,162,162,159,163,167,165,160,161,160,151,150,157,162,165,155,147,147,152,154,158,155,150,159,168,166,162,160,135,102,86,93,125,153,174,174,164,160,148,149,155,156,160,160,160,160,158,161,160,164,171,166,164,166,164,166,163,160,160,155,158,158,163,165,159,166,168,169,174,171,171,166,164,161,166,163,163,159,165,113,12,2,8,10,13,10,12,12,13,13,14,15,23,20,18,19,18,17,20,20,18,19,19,17,19,18,20,19,17,21,16,18,17,18,20,17,17,17,17,18,18,18,16,16,17,16,17,17,19,17,17,20,16,18,20,17,18,19,19,19,18,18,18,17,19,19,17,19,18,22,17,19,18,20,24,17,21,22,19,19,21,20,21,17,16,18,15,18,21,18,22,18,23,33,29,32,37,33,32,32,34,33,35,34,35,43,39,48,49,39,56,56,49,45,51,57,58,49,55,66,77,83,55,57,73,65,59,59,72,66,65,59,52,47,48,46,50,47,40,55,66,68,73,76,87,89,84,77,63,55,80,92,106,107,114,103,98,104,107,108,111,118,119,127,133,131,130,115,117,124,116,117,116,108,110,117,131,127,118,120,131,137,141,141,132,129,133,135,130,119,121,128,132,130,126,118,117,135,139,138,140,138,137,139,141,142,137,134,139,143,139,143,145,142,133,118,114,125,132,133,131,137,140,145,146,145,141,137,140,135,139,142,144,147,141,145,146,145,146,151,155,151,149,151,148,136,134,139,139,135,136,136,138,132,141,136,134,133,140,142,145,142,143,146,161,174,160,160,142,164,145,144,142,142,141,136,145,146,145,137,130,122,119,117,112,115,112,108,113,117,120,109,106,100,85,100,83,95,83,73,79,79,63,61,57,59,67,67,65,71,67,83,99,84,81,84,75,83,72,78,64,72,81,68,53,57,57,57,54,53,54,57,51,55,59,54,58,57,56,55,50,52,49,54,65,58,61,58,60,83,81,78,94,97,110,129,99,118,142,146,162,162,134,116,113,113,120,105,95,69,50,38,72,46,43,52,66,92,110,121,141,149,158,174,194,205,210,217,214,224,232,239,242,247,251,234,218,202,195,193,171,149,139,134,101,52,25,24,30,28,28,23,43,26,24,24,30,23,27,29,23,23,23,23,24,26,23,25,24,24,28,25,23,21,24,23,21,21,24,27,23,27,21,23,23,21,23,22,22,22,24,24,21,21,21,19,21,19,19,21,22,19,20,20,15,24,19,21,28,17,22,19,21,21,19,21,18,23,19,22,19,17,22,21,17,19,25,18,23,25,22,21,18,19,22,17,22,19,21,21,18,21,20,24,21,21,22,24,20,22,22,19,24,19,19,26,26,27,28,34,36,34,40,38,34,33,29,33,29,33,34,32,40,45,49,46,45,37,27,29,44,46,48,44,44,43,42,46,39,48,47,39,40,35,36,36,28,34,33,33,35,25,19,17,21,19,17,20,20,19,22,23,21,24,20,21,19,19,20,22,23,21,21,24,25,27,32,30,33,39,44,47,67,81,87,89,87,120,145,155,159,150,148,156,154,155,162,161,162,164,155,153,156,145,155,158,164,157,151,151,150,146,147,151,160,149,148,151,156,159,160,160,155,153,149,145,147,151,152,152,153,152,152,151,155,155,155,158,162,163,160,162,166,170,163,154,150,156,155,154,149,149,154,153,160,160,156,160,159,141,117,92,83,112,144,159,168,160,146,139,145,148,150,151,155,158,152,153,155,163,159,161,168,160,161,162,158,158,156,155,157,160,160,158,155,158,161,167,171,170,167,165,165,161,167,164,165,156,168,116,11,2,9,10,12,11,13,11,12,14,14,14,20,20,20,21,18,17,17,19,19,16,17,18,19,18,16,17,17,18,17,18,17,18,18,16,17,18,17,16,17,16,17,20,18,17,16,18,17,16,16,16,17,16,18,18,21,19,17,17,17,17,17,21,17,19,20,16,17,18,19,18,19,18,19,19,17,21,19,20,18,18,19,19,19,17,17,20,20,17,20,20,19,25,21,26,25,25,29,30,33,24,24,28,29,31,32,28,30,29,30,29,27,22,30,33,30,34,30,28,27,31,37,36,38,37,44,54,61,57,49,49,50,44,44,38,32,32,31,25,37,47,50,61,70,81,82,82,81,77,82,92,104,95,91,89,83,87,94,104,101,103,110,122,128,128,132,133,129,136,134,129,116,95,97,108,123,113,104,113,117,118,124,127,126,126,126,136,128,122,133,131,136,137,132,133,137,139,141,142,147,148,146,153,151,141,139,144,149,154,157,157,157,148,144,138,135,139,129,134,134,128,124,122,130,128,127,121,120,129,127,121,118,104,102,111,126,129,126,129,129,129,114,108,114,125,136,137,135,132,128,113,109,115,110,111,107,105,103,100,105,108,110,113,125,127,125,129,132,134,132,131,125,113,120,134,130,122,116,107,107,107,108,107,102,102,96,94,100,110,104,93,101,94,90,110,115,103,86,77,90,86,77,64,79,90,84,77,68,68,88,104,96,86,73,63,64,65,64,90,92,83,75,59,63,57,60,60,53,59,57,57,55,58,61,60,65,70,67,66,58,62,70,84,91,68,71,78,72,76,73,68,75,65,77,91,69,82,77,80,102,89,86,99,116,148,147,135,137,131,118,105,88,65,57,43,42,53,43,34,32,39,57,79,97,113,136,148,157,160,174,192,198,202,205,214,208,209,221,229,229,237,241,228,200,178,162,147,126,108,87,89,84,29,23,34,24,27,22,25,23,27,27,21,23,24,28,25,22,21,25,25,21,25,22,22,26,24,21,25,26,22,24,23,21,17,24,22,21,24,24,23,22,16,18,21,21,22,19,21,20,21,21,19,25,22,21,21,16,20,21,22,17,19,21,16,22,21,18,20,22,23,16,19,21,16,19,22,17,25,22,17,22,19,22,19,22,18,22,20,20,21,20,22,21,25,18,22,22,19,21,21,21,20,22,21,21,23,28,35,33,37,39,38,40,34,36,35,29,37,43,40,41,54,49,46,44,39,27,31,48,46,42,45,45,43,42,39,44,51,47,45,36,38,39,35,32,33,36,34,34,29,19,20,17,20,22,19,24,20,19,22,23,20,21,23,19,25,17,22,26,20,21,23,26,23,27,33,31,46,48,48,66,77,95,113,141,164,165,165,162,159,162,161,155,154,154,151,151,159,162,157,159,159,160,157,165,172,166,161,156,148,146,156,160,145,147,155,148,148,151,154,160,160,160,154,153,159,155,159,155,150,147,151,158,155,150,150,155,165,166,165,168,161,161,160,162,166,163,158,153,154,155,155,157,156,155,160,162,169,154,123,97,83,105,131,158,171,156,149,153,155,157,155,159,160,158,160,154,155,154,152,158,156,156,158,157,157,157,155,157,161,162,160,155,157,156,162,169,169,170,164,163,160,162,162,165,164,169,113,11,2,9,10,12,11,14,12,12,14,13,13,20,16,17,20,18,16,18,17,16,18,17,16,17,16,17,17,17,17,18,19,17,16,17,16,16,16,17,17,17,17,16,17,17,17,16,17,17,16,18,17,16,22,18,18,20,16,18,19,16,17,17,18,18,16,17,17,18,17,17,18,20,16,21,20,17,22,17,21,17,20,21,18,19,17,20,17,21,20,18,21,19,19,24,27,32,30,28,31,30,28,26,29,26,26,28,33,26,19,25,27,26,26,23,24,24,26,28,24,30,33,33,39,37,48,60,61,64,63,55,53,52,52,46,45,42,41,38,44,44,49,57,53,54,60,73,65,71,100,99,76,80,63,54,75,89,102,114,118,117,117,117,124,113,110,112,120,138,146,139,136,139,132,128,127,120,116,118,118,117,108,105,108,120,113,129,130,128,137,137,136,131,130,130,123,130,131,125,130,123,121,111,103,106,110,113,117,108,109,114,107,110,116,125,129,132,136,123,122,122,122,118,120,131,122,135,128,122,112,97,76,71,73,87,109,123,130,130,117,107,106,109,119,122,130,133,130,134,128,123,123,121,116,118,122,116,108,77,86,99,100,102,106,105,94,84,88,91,83,91,97,99,104,118,120,114,110,104,113,112,114,122,115,99,98,89,96,102,107,102,100,116,112,109,117,104,99,96,88,115,101,95,82,83,94,92,101,101,116,98,101,86,73,55,48,60,74,82,96,81,62,59,61,57,62,55,56,59,53,52,57,50,52,55,54,54,50,53,53,54,51,53,63,67,65,70,63,55,55,55,60,61,53,46,53,51,47,53,39,42,35,61,60,94,115,111,104,129,143,117,132,117,109,131,110,101,91,74,62,53,60,63,63,31,69,46,53,67,73,87,111,123,130,145,153,165,174,183,189,203,212,200,212,210,205,210,211,209,214,218,202,144,107,110,97,88,61,39,40,45,32,27,26,21,23,23,22,19,21,22,20,22,26,25,23,22,22,23,21,23,22,21,21,22,26,23,22,25,20,20,26,17,23,21,22,26,15,23,22,18,23,20,19,24,16,20,21,19,19,21,20,18,18,19,16,17,24,21,19,20,19,21,18,21,23,18,21,18,19,21,22,21,16,20,19,22,19,21,24,15,21,20,19,19,19,22,21,21,18,21,19,24,22,19,21,19,22,27,26,27,37,35,36,41,34,32,37,34,44,39,30,45,49,50,53,45,36,24,26,45,46,48,46,42,42,43,42,43,55,46,43,40,35,35,35,37,32,35,37,35,24,23,16,19,22,18,20,18,21,20,21,21,19,21,19,22,23,22,25,22,22,24,24,26,23,25,30,37,43,43,48,74,85,112,165,158,170,176,167,175,172,168,167,166,163,163,159,158,165,159,165,163,162,164,158,166,164,166,160,161,160,152,161,159,151,159,157,146,144,153,159,161,160,166,168,163,171,168,167,166,157,160,162,165,160,156,151,156,162,156,155,152,151,154,154,160,160,160,159,153,157,158,155,157,162,159,160,169,174,173,155,127,98,83,98,133,160,169,167,162,160,161,157,157,157,158,160,149,149,145,146,150,146,153,154,152,155,160,159,159,159,163,159,155,159,162,168,171,170,168,165,160,155,156,155,165,157,168,115,11,1,8,10,12,11,13,12,14,13,13,13,18,17,17,17,16,17,16,17,17,17,17,16,17,16,17,16,17,17,17,16,16,16,17,18,16,17,18,16,18,18,16,19,16,17,17,16,17,16,19,19,16,18,20,18,17,18,19,17,18,18,17,16,19,20,17,19,19,17,17,17,20,20,17,19,17,17,18,18,20,19,18,18,17,18,18,18,22,18,20,20,19,24,23,29,30,33,33,30,32,26,28,22,24,26,27,26,25,27,29,29,36,32,34,31,30,33,26,34,39,39,44,42,40,45,51,61,81,97,87,62,50,49,51,49,45,44,40,35,39,38,37,40,37,40,38,44,53,50,51,50,52,63,75,87,103,118,137,141,134,136,137,132,123,122,125,128,127,131,127,133,146,150,149,137,134,130,134,141,133,126,117,117,118,119,128,134,137,141,143,131,123,119,110,108,115,123,122,118,117,98,81,78,84,91,93,87,71,66,71,60,68,88,104,113,120,128,115,128,133,118,118,125,137,142,143,137,137,132,113,101,115,129,145,153,153,148,132,124,130,139,139,136,132,131,130,126,122,130,142,142,143,142,141,150,143,131,125,130,137,127,125,133,138,129,119,117,110,112,120,118,117,122,139,146,138,125,119,120,127,128,125,122,116,116,103,103,104,104,101,105,112,107,107,103,85,81,87,95,110,104,106,96,83,95,100,123,128,111,100,74,65,63,59,62,63,76,91,87,60,59,62,53,58,57,61,60,51,55,57,56,54,54,51,48,53,53,54,55,54,66,61,62,63,52,56,51,48,51,53,50,53,49,46,46,44,49,44,41,44,41,50,62,65,79,69,64,73,88,95,96,121,129,128,116,117,120,110,104,98,109,98,78,67,59,62,45,46,37,27,34,39,50,61,71,86,99,112,126,128,142,151,160,173,184,197,197,203,214,199,200,191,179,190,186,179,178,183,173,132,100,91,83,70,54,36,31,40,40,26,23,24,22,21,21,21,18,24,20,21,22,19,24,26,20,22,20,18,23,20,20,24,21,19,19,20,22,16,21,23,22,22,19,25,17,23,25,19,24,19,20,20,19,19,21,19,18,23,21,19,22,16,27,21,16,24,18,22,19,19,21,20,23,22,21,17,22,22,18,19,22,19,23,22,22,21,19,19,22,19,21,21,18,27,20,15,21,18,27,25,27,34,39,39,39,35,32,37,35,30,34,43,50,57,55,53,37,23,30,45,45,43,45,42,46,46,45,46,45,42,41,33,33,32,32,37,28,36,32,32,22,18,23,17,22,17,20,21,19,22,19,23,21,20,22,19,22,22,20,24,21,22,26,26,25,23,34,39,41,39,53,68,75,122,152,157,167,168,176,175,169,165,163,168,166,159,159,159,157,155,147,150,160,160,160,159,154,151,154,165,165,165,170,166,163,166,165,157,162,169,166,162,160,159,159,161,167,166,168,168,162,165,165,163,163,158,159,159,160,151,150,154,147,152,152,149,155,153,153,156,157,159,156,156,161,162,163,163,170,172,170,158,134,108,83,99,131,152,170,171,168,162,153,152,152,154,159,147,143,140,145,151,148,153,152,151,156,158,159,157,161,162,158,159,163,162,167,170,167,170,163,163,159,159,158,160,158,165,112,13,2,9,10,13,11,13,12,13,13,13,13,17,17,18,17,18,18,17,20,16,17,18,16,17,17,17,16,19,18,18,17,16,17,20,19,16,19,16,19,17,16,17,18,17,19,21,17,17,16,19,20,17,21,17,17,19,20,19,16,19,19,16,19,19,16,17,17,18,18,18,18,16,17,17,19,19,18,19,18,17,18,19,16,18,19,17,17,19,24,19,21,23,25,27,22,21,30,23,23,23,21,22,20,20,21,19,23,25,23,27,21,28,28,22,26,25,23,24,23,24,31,34,34,35,35,39,37,44,49,53,58,61,56,52,43,46,36,42,41,32,37,40,42,42,47,47,40,45,47,47,46,47,59,81,100,106,119,134,139,143,142,139,130,136,136,130,127,125,127,127,139,147,141,141,146,148,146,147,143,143,143,143,141,128,131,140,137,135,145,136,131,120,118,124,118,125,123,125,119,131,112,109,116,139,135,130,121,114,103,113,112,116,123,119,109,111,119,113,107,106,121,123,126,139,145,149,148,139,133,143,152,155,162,161,151,139,132,135,131,125,123,127,143,139,139,137,125,131,132,130,135,147,147,153,149,149,154,145,148,137,135,140,143,144,144,139,137,141,142,145,145,139,144,153,147,139,133,127,135,133,134,116,111,116,122,119,119,117,114,106,105,109,102,104,95,80,81,81,97,79,97,106,106,103,106,105,101,104,81,81,66,70,71,71,75,71,66,63,56,61,65,58,54,56,63,65,59,51,53,59,59,57,57,57,59,58,59,71,71,59,83,75,71,59,49,56,56,53,59,52,57,56,51,53,49,48,49,53,48,51,53,53,61,55,58,50,50,55,75,76,76,87,92,47,60,87,84,99,102,107,123,122,92,107,116,103,92,89,74,67,63,63,57,55,46,41,40,42,46,64,60,70,95,115,130,125,143,153,158,171,178,174,170,185,189,195,199,200,190,184,184,187,180,136,168,171,175,154,100,72,76,89,44,43,36,39,47,37,27,25,24,23,17,15,17,23,20,19,19,19,19,16,21,20,20,18,15,21,22,21,18,24,23,19,19,22,23,20,19,20,17,23,22,17,20,20,21,19,19,23,20,22,21,17,23,19,21,19,17,22,18,17,19,21,19,19,21,19,22,17,24,18,19,19,19,26,18,21,20,24,21,21,24,18,21,21,20,22,22,23,27,27,33,37,37,36,25,31,32,25,36,46,49,49,54,51,40,31,40,56,50,49,46,45,44,47,49,44,35,33,36,34,29,34,31,33,29,29,34,29,23,23,20,19,21,18,23,20,22,20,22,19,17,20,22,21,19,25,21,24,27,25,24,25,24,31,34,36,36,42,53,66,73,83,105,133,158,171,178,159,160,168,165,165,162,154,150,146,144,146,147,155,157,152,159,160,155,156,153,162,163,164,166,165,163,169,170,169,177,177,169,161,163,155,147,149,161,160,166,170,161,160,157,155,155,153,160,163,161,159,163,159,155,158,152,153,153,156,157,153,156,155,155,156,159,159,157,156,158,163,161,164,162,143,113,91,98,124,152,171,174,170,156,152,157,155,157,152,148,144,147,152,152,158,154,151,152,154,158,157,157,160,157,159,162,161,164,165,163,165,164,165,164,163,162,165,151,160,113,12,1,9,9,12,11,14,12,12,14,13,13,19,17,17,18,16,17,18,16,17,17,17,17,16,16,16,16,17,17,18,17,17,19,17,17,16,18,19,17,18,16,17,21,17,17,18,17,19,19,16,20,17,18,20,20,21,15,17,17,19,17,17,19,17,19,16,17,19,17,20,17,19,19,18,19,20,22,18,17,22,18,20,17,19,19,16,22,19,21,19,19,23,25,26,31,31,36,30,26,30,26,30,24,24,23,18,26,21,24,22,22,29,24,28,26,24,34,27,30,34,26,28,29,34,37,46,46,45,45,50,53,50,54,53,55,58,58,55,57,51,46,48,51,59,57,50,55,54,60,56,47,44,63,87,98,110,107,101,99,101,105,115,121,121,118,111,113,118,119,120,123,121,117,116,116,120,121,132,141,134,136,141,147,139,129,139,136,133,134,127,116,120,134,141,139,127,117,116,124,128,128,139,153,156,148,144,151,152,147,164,170,169,163,140,115,114,120,119,124,130,132,133,137,136,143,149,145,141,134,141,144,151,148,139,139,136,133,131,121,122,136,134,142,140,132,133,125,127,132,142,144,143,143,133,135,141,137,141,141,140,136,134,138,139,142,137,141,140,149,154,143,137,134,135,127,126,134,135,131,139,131,119,123,118,125,129,133,130,125,121,117,108,93,101,99,107,110,104,104,100,100,113,120,115,116,107,99,80,68,71,78,84,72,83,96,76,69,66,59,55,55,63,66,65,55,55,59,56,62,54,52,57,59,55,60,61,53,56,60,59,59,55,52,52,50,47,55,54,52,50,48,52,50,55,50,53,54,52,53,52,62,54,48,51,48,44,51,72,76,74,76,76,77,69,79,95,97,94,101,104,113,101,93,99,110,116,114,123,122,137,136,125,108,108,103,84,77,55,49,27,29,34,30,28,36,51,54,72,83,89,110,124,138,154,160,154,149,157,165,177,187,185,175,181,202,208,205,187,177,174,174,163,148,162,176,169,127,89,81,72,58,47,43,49,67,53,27,29,25,27,23,19,22,18,21,18,23,20,15,17,21,19,16,21,22,18,19,19,18,24,19,21,18,17,21,21,26,18,21,19,22,23,18,21,17,17,22,17,19,19,15,22,20,17,22,19,19,16,19,23,21,17,22,21,18,19,19,24,22,20,21,17,20,20,20,24,21,18,21,24,20,27,27,34,38,33,31,29,32,34,41,43,45,48,44,51,48,48,68,69,57,54,45,47,45,46,48,44,41,30,33,28,27,30,30,36,29,32,31,32,22,19,19,19,21,18,20,16,24,22,20,22,20,22,23,21,22,22,22,25,26,21,22,27,26,28,35,36,37,46,69,81,66,73,84,110,146,163,160,145,153,160,162,164,159,156,153,153,153,158,157,157,155,146,160,167,160,157,153,158,158,159,158,150,155,160,164,167,172,167,160,165,165,158,151,154,162,164,167,164,160,165,157,157,164,163,168,160,159,160,161,166,159,163,165,163,163,163,159,151,157,160,158,156,156,157,158,152,152,152,154,154,162,168,144,120,94,87,112,143,165,174,165,160,157,159,160,159,161,151,156,155,154,153,149,151,155,156,156,155,156,157,156,160,160,160,160,157,156,155,155,162,165,171,167,164,154,159,112,12,1,9,9,12,10,14,12,12,14,13,13,15,16,16,16,16,16,17,17,17,17,17,17,16,16,18,16,17,16,17,18,15,18,17,17,18,16,16,17,18,18,17,17,17,17,16,23,19,18,21,21,20,22,22,17,17,17,17,17,17,17,17,16,19,18,20,18,17,17,17,20,17,19,19,18,22,19,21,20,17,21,18,16,17,17,20,20,17,19,17,23,24,29,25,34,35,37,39,35,36,34,34,28,29,30,29,28,26,26,33,32,31,30,34,36,38,41,40,43,35,32,34,38,49,45,49,50,48,55,53,50,46,51,51,65,83,82,70,70,63,60,57,50,76,65,63,67,76,70,75,67,70,91,101,122,124,118,107,91,96,94,96,95,106,108,110,107,105,111,111,115,107,96,94,99,121,106,118,133,131,127,131,132,124,128,127,131,128,135,122,126,128,135,144,136,142,137,136,141,141,133,130,138,134,131,135,142,141,146,153,156,158,143,126,114,136,149,147,145,157,162,159,146,133,132,137,137,131,137,144,139,137,142,141,139,139,138,139,140,148,147,146,152,131,132,131,127,122,132,136,132,131,124,115,123,129,134,131,147,140,141,139,141,141,139,131,137,141,137,137,137,134,129,128,132,134,130,129,130,135,142,136,129,137,129,126,120,117,110,111,113,110,100,100,98,105,111,110,111,108,113,117,119,120,105,98,88,79,76,85,79,83,85,86,69,69,65,69,62,64,63,57,62,63,61,60,57,62,67,59,64,63,61,63,63,62,59,53,54,61,53,59,53,49,49,53,61,52,50,56,51,48,49,51,59,57,55,57,47,50,58,46,51,44,46,51,54,67,68,72,75,107,100,71,96,98,97,99,94,99,83,75,77,83,98,100,97,107,116,128,131,101,114,116,119,92,94,103,89,71,67,54,41,43,36,47,47,25,21,25,29,30,42,84,70,77,77,98,106,125,140,143,146,149,164,166,175,176,177,177,173,162,185,208,189,170,146,141,152,148,137,144,153,175,163,111,85,90,98,84,58,57,65,39,25,29,57,32,28,23,24,22,18,19,21,21,19,18,22,19,20,19,15,21,18,19,19,15,18,18,19,22,20,21,21,16,17,19,22,21,21,21,17,17,20,17,18,17,17,22,18,19,18,20,21,17,22,19,17,19,19,27,23,17,24,19,22,26,20,22,20,30,26,30,37,34,34,31,29,34,34,40,48,47,44,47,51,48,65,73,66,61,46,46,42,44,50,44,41,33,30,35,31,32,36,29,38,34,28,32,22,20,18,17,17,20,21,20,21,19,22,18,20,18,25,26,17,27,23,23,27,22,26,30,29,33,37,44,56,66,81,92,85,78,80,94,125,131,137,147,159,157,151,150,147,156,161,170,165,168,166,162,157,153,163,161,158,143,134,140,151,146,155,150,146,150,153,149,150,156,159,161,160,155,165,173,170,166,163,158,159,161,160,165,169,168,166,163,161,162,162,160,164,169,170,171,169,168,164,155,156,161,161,162,162,159,160,157,157,162,156,162,163,166,163,150,126,89,87,103,134,162,171,174,169,166,166,163,165,162,160,162,156,157,152,154,159,162,160,154,156,156,158,159,162,163,158,151,149,148,146,153,159,163,163,166,151,159,113,12,1,8,10,13,11,12,12,13,13,14,13,17,17,17,17,17,18,17,17,18,17,16,19,18,17,18,19,18,18,17,16,17,16,16,17,17,19,16,17,18,18,18,19,19,19,17,23,22,21,21,19,24,19,20,17,17,19,17,17,19,19,19,18,16,16,17,17,16,20,18,17,19,18,19,19,20,18,19,23,19,17,18,18,18,18,19,19,17,19,18,18,21,26,23,25,26,26,26,24,31,31,34,40,41,39,34,35,33,36,38,37,49,47,49,49,46,52,49,56,50,45,48,46,56,57,51,51,53,57,65,63,57,55,53,71,86,88,79,79,83,75,74,76,92,103,106,116,124,111,97,103,113,123,126,133,141,139,129,127,128,119,108,105,114,126,125,124,110,106,115,118,118,115,120,133,129,120,120,125,123,111,106,105,106,109,116,113,111,118,118,120,119,114,104,107,112,119,128,139,138,134,131,118,105,104,112,114,116,108,108,106,103,97,99,110,129,134,130,137,135,139,136,138,130,123,129,134,137,136,134,130,129,129,141,136,133,134,139,138,136,146,132,132,124,118,119,117,118,120,129,130,125,116,114,117,124,123,123,130,131,132,134,131,139,138,139,140,129,132,134,130,128,124,122,116,114,117,115,111,114,119,128,133,129,117,100,98,95,100,106,114,118,111,109,98,100,97,87,95,114,122,117,114,98,95,81,73,83,85,80,73,76,82,78,71,71,71,71,75,77,75,74,64,62,64,63,61,63,72,67,63,65,63,63,63,65,57,56,58,61,61,60,61,53,54,53,53,60,56,53,54,53,57,61,61,61,54,59,49,50,55,42,49,50,51,54,52,57,54,56,61,61,69,61,50,45,51,53,46,51,51,52,55,52,62,69,59,59,66,77,70,63,77,91,94,90,101,113,119,110,103,95,93,76,59,67,66,62,46,41,42,37,35,33,34,35,36,41,33,30,39,59,70,89,100,105,119,121,120,128,137,135,139,141,139,136,131,137,152,139,150,168,150,142,144,133,139,145,154,153,139,155,130,100,96,95,105,93,87,86,81,60,55,59,46,39,37,31,27,25,24,27,23,20,17,16,21,18,19,17,17,21,21,19,20,21,19,16,19,19,19,22,19,17,18,19,16,18,16,17,22,19,24,21,20,20,22,21,19,21,19,19,20,24,23,22,22,26,23,26,25,25,29,31,36,34,40,28,29,34,34,43,47,42,45,54,47,36,46,59,63,60,54,49,44,48,50,46,42,36,29,35,31,29,35,29,35,31,33,30,18,22,19,19,23,21,24,21,21,21,17,24,22,20,23,18,22,25,20,25,30,27,30,37,42,53,54,57,73,76,86,82,77,82,70,84,97,110,135,147,161,150,142,139,137,157,170,176,169,164,162,154,158,160,162,157,152,153,148,151,150,154,162,153,151,154,148,139,146,155,158,159,157,153,152,157,155,155,151,148,155,159,162,167,168,162,165,167,164,164,160,161,157,159,163,160,163,162,159,151,153,158,157,162,166,168,169,165,163,166,168,163,164,170,166,165,156,133,103,92,98,127,156,173,179,172,167,169,171,160,161,158,163,160,152,159,160,162,156,153,159,158,159,159,159,161,157,156,153,151,151,153,160,164,158,159,153,159,111,14,0,9,10,13,11,13,12,13,14,14,13,19,18,16,17,20,18,19,18,17,17,18,18,19,19,18,19,20,17,16,17,19,16,16,17,19,18,16,19,17,19,21,17,21,24,20,20,21,21,23,17,18,19,20,21,20,17,17,18,16,19,17,17,18,17,18,17,17,17,18,21,17,18,20,17,19,18,19,18,19,19,18,20,15,19,19,17,17,17,17,22,30,30,38,40,34,31,26,31,42,49,53,55,51,44,43,37,38,43,44,50,61,64,57,59,61,54,54,52,45,36,38,42,38,36,42,44,41,47,50,53,52,47,43,50,51,51,58,53,44,41,41,46,62,57,54,57,87,112,94,128,117,115,99,90,90,112,117,116,123,114,106,102,105,105,115,111,108,110,105,119,125,122,124,130,131,131,119,121,119,110,107,107,112,112,120,103,92,100,117,109,104,118,105,103,104,119,122,132,139,131,122,109,107,104,103,105,104,109,113,109,113,107,110,117,128,112,106,105,103,104,110,108,108,112,121,128,132,131,116,116,125,131,132,127,125,123,121,118,122,116,112,121,124,126,118,123,127,132,139,140,139,133,122,126,134,128,126,125,128,126,126,126,128,131,132,141,139,128,119,113,107,100,105,115,105,111,108,99,97,104,118,120,116,121,116,105,117,115,116,109,108,106,104,97,110,101,99,102,101,100,95,103,89,96,93,85,95,101,89,70,69,70,75,83,87,83,68,60,61,64,60,52,52,53,60,58,64,61,49,54,57,55,62,55,56,58,49,57,56,55,53,45,54,51,51,54,51,55,54,51,57,54,56,57,55,63,52,45,51,51,54,44,49,53,51,54,51,53,53,51,53,47,44,49,48,48,48,43,39,46,42,47,47,51,57,50,56,59,61,69,53,59,60,75,73,78,85,93,92,84,84,98,82,68,110,78,74,99,87,76,69,64,63,51,45,45,38,36,39,38,39,39,41,42,39,42,36,39,45,49,63,62,73,85,93,96,102,112,112,123,124,110,113,115,134,138,123,137,142,133,142,137,127,135,139,133,145,141,145,112,136,149,139,120,111,93,84,84,70,71,67,53,47,41,38,39,34,31,29,33,29,40,46,24,35,18,24,22,19,21,17,21,17,16,19,21,19,18,18,20,20,22,20,17,20,22,24,19,21,21,22,22,19,26,19,22,26,22,27,29,28,29,29,29,36,39,42,36,33,31,41,47,42,49,49,45,30,38,48,56,60,57,57,50,54,51,52,51,32,33,35,30,34,33,32,35,31,31,35,24,18,18,17,17,22,20,21,21,18,22,20,25,22,24,25,27,28,28,39,39,40,45,52,56,63,68,74,71,75,78,74,76,72,69,79,85,93,110,123,132,129,128,135,147,156,159,160,158,157,158,151,155,155,151,148,159,157,158,166,161,164,172,167,166,173,159,154,170,168,166,162,158,150,137,137,139,151,150,148,157,160,155,157,159,159,157,155,158,163,158,154,150,156,159,152,154,152,150,148,151,150,152,157,158,164,166,160,159,164,163,161,165,168,165,168,172,155,142,115,97,106,122,153,171,175,174,168,165,157,153,158,158,162,159,159,164,162,159,156,155,153,154,155,155,160,163,160,159,161,161,163,161,164,158,158,146,157,113,12,1,9,10,12,10,13,12,12,14,13,13,17,17,18,18,16,17,19,23,16,18,19,18,19,20,16,18,20,17,19,17,18,17,18,17,17,18,17,19,19,16,19,20,19,23,16,21,21,21,25,20,20,17,19,19,16,17,17,19,16,18,22,18,17,18,16,17,17,17,18,17,18,18,17,20,19,18,17,17,19,20,18,18,20,17,19,20,19,19,20,19,33,34,33,40,39,37,36,43,49,57,55,56,43,40,42,29,30,29,24,30,32,33,32,27,29,26,29,29,28,26,21,26,28,30,34,38,46,41,37,48,44,45,44,46,39,40,39,29,34,33,30,37,31,34,39,39,43,60,77,93,91,79,77,66,69,91,106,108,109,107,102,98,101,108,108,112,112,120,124,117,116,117,122,127,120,119,120,115,119,117,122,116,116,117,126,130,122,122,120,119,118,131,136,132,125,123,118,119,121,122,126,118,116,117,121,120,122,121,130,130,133,129,125,131,130,125,125,126,122,122,103,107,107,106,117,122,127,128,125,125,133,134,136,130,128,133,128,118,110,108,113,125,125,125,125,130,136,139,136,135,137,137,138,137,145,142,139,137,135,132,133,128,129,117,116,119,113,117,110,107,114,118,122,123,120,113,119,112,104,117,116,113,123,120,117,119,116,122,117,96,93,98,96,95,107,109,101,95,85,78,74,96,107,108,106,101,110,107,90,79,73,67,76,74,76,71,55,53,50,52,55,47,53,53,53,57,52,51,55,55,56,48,53,54,48,56,56,57,55,52,53,51,53,54,48,51,55,53,53,54,50,53,51,51,57,55,50,43,54,48,46,50,42,49,49,48,49,47,50,48,53,57,55,58,58,57,58,56,58,53,54,48,49,50,54,50,49,65,74,75,57,55,69,63,59,57,61,56,72,68,69,91,74,62,66,71,65,51,59,61,78,100,113,107,102,92,84,80,69,59,48,42,39,42,42,39,39,36,36,39,40,47,45,45,50,44,44,61,44,63,72,78,92,84,104,100,99,113,116,114,123,127,136,130,126,141,126,129,126,124,124,130,127,128,131,122,121,120,128,135,137,132,129,119,113,117,112,110,92,91,91,74,87,76,64,66,57,49,40,42,43,39,36,29,29,27,29,23,16,25,19,20,19,17,19,19,21,17,20,24,23,19,19,17,23,21,21,26,24,26,26,28,28,27,33,40,37,34,30,26,40,43,47,52,53,47,33,40,51,54,55,56,63,59,57,55,46,45,40,39,35,31,34,27,32,37,29,34,35,24,19,17,20,19,19,24,21,27,25,27,32,29,33,37,41,43,41,45,50,53,55,60,59,56,65,61,65,63,64,68,70,83,80,71,77,77,83,89,101,107,109,142,162,160,157,144,138,144,153,159,153,151,151,142,135,134,141,150,152,151,152,158,155,158,167,166,163,160,160,165,166,160,149,141,144,154,161,158,156,162,153,149,154,152,156,151,148,153,157,152,152,151,154,160,150,152,155,152,149,154,156,151,148,146,151,152,151,154,158,156,154,156,156,156,158,165,165,155,143,124,104,97,114,145,162,173,172,161,152,143,147,155,159,164,162,160,160,158,159,162,161,159,160,158,162,162,161,165,161,165,168,164,165,158,158,149,159,111,12,1,9,10,13,12,13,12,13,14,14,14,16,16,18,17,17,17,18,18,17,16,17,17,17,17,18,17,16,18,17,17,19,19,17,16,17,17,18,20,19,18,18,18,18,21,23,19,21,23,23,24,20,18,17,16,20,21,16,18,17,19,19,15,19,18,16,18,17,20,18,16,16,18,17,20,20,16,18,16,19,19,19,22,17,18,20,16,19,17,19,29,30,38,42,45,43,43,48,51,57,59,55,48,44,42,46,46,36,36,33,32,29,31,27,26,31,26,27,26,27,28,31,37,37,37,42,41,45,47,36,37,42,46,38,37,34,35,41,35,35,36,36,33,30,36,38,42,48,48,57,68,78,81,87,92,90,106,113,117,108,106,110,104,114,114,110,121,116,116,122,126,122,126,128,135,115,132,125,125,128,123,118,118,113,115,122,127,124,117,120,125,126,128,128,141,132,124,115,108,109,118,109,111,116,118,118,119,120,114,110,116,123,119,124,119,114,122,119,131,131,128,123,116,120,112,130,141,142,130,132,128,130,125,133,135,138,140,144,129,123,130,131,133,127,128,127,128,130,120,119,127,137,154,152,144,144,142,141,140,132,124,127,125,122,114,110,114,124,131,133,136,134,137,138,127,123,131,129,128,127,115,114,123,121,118,111,94,93,97,94,92,100,107,97,100,98,84,75,76,79,80,78,95,95,102,102,97,97,105,100,90,101,80,77,57,57,64,66,70,63,60,53,56,58,55,48,55,49,55,57,53,59,52,56,55,54,63,54,54,57,53,53,53,54,54,55,57,49,56,56,54,54,51,56,53,47,50,52,48,53,46,49,49,51,47,46,53,44,52,59,46,50,56,56,59,56,55,58,59,61,71,63,54,44,48,51,48,57,56,57,60,56,71,67,54,57,63,54,61,58,57,51,66,50,49,43,42,53,50,53,64,73,92,102,98,103,96,106,113,76,99,90,86,83,79,74,69,53,55,51,42,52,48,53,45,37,44,46,47,39,45,47,45,66,55,50,51,56,58,60,66,80,85,90,88,89,103,93,107,107,93,97,108,105,101,118,107,114,122,132,131,131,131,137,120,133,151,156,148,112,118,132,100,116,139,103,141,107,139,123,119,126,123,96,91,100,90,96,74,53,57,57,49,38,31,54,44,46,25,22,26,27,20,18,26,20,21,22,22,19,18,25,24,24,26,28,33,33,29,27,21,29,40,43,48,56,51,27,38,55,57,57,54,56,57,64,53,50,46,31,34,35,33,29,25,33,29,26,35,36,25,21,26,27,27,31,36,38,38,39,42,43,45,51,45,52,53,50,53,51,53,51,56,57,49,51,49,53,59,65,74,77,82,76,74,71,71,78,90,102,106,122,158,172,172,155,133,129,135,145,153,155,158,159,149,139,141,144,156,145,153,143,144,153,152,151,150,147,142,141,151,156,154,151,154,159,157,167,162,160,163,152,152,153,155,158,157,158,155,156,154,153,149,154,159,152,160,165,163,164,165,164,157,154,147,146,152,151,151,157,156,155,157,153,152,154,158,151,151,147,140,128,94,84,104,133,157,164,165,156,146,148,153,163,165,162,161,160,163,160,162,162,159,159,158,160,160,155,157,160,162,166,164,162,158,163,153,162,113,12,1,8,10,13,11,12,12,13,14,13,13,17,15,19,19,16,17,16,17,17,19,18,18,20,17,17,17,17,18,16,17,19,17,16,18,18,19,18,20,17,19,19,19,21,19,22,24,22,22,18,21,21,18,18,17,17,16,18,20,17,15,19,17,18,19,16,18,17,17,17,16,19,19,16,19,20,20,19,17,20,22,18,21,17,20,20,19,21,15,22,24,27,27,29,34,39,36,31,37,36,40,44,41,31,29,34,39,36,35,34,32,29,27,29,21,27,27,28,27,24,34,29,27,31,32,30,29,28,32,27,23,23,24,28,21,30,30,30,34,33,29,34,32,31,37,36,40,40,53,59,59,63,78,92,95,110,117,119,117,111,117,117,112,115,112,114,109,103,111,113,123,122,119,115,102,106,105,109,110,111,110,114,108,107,113,111,115,111,113,113,110,117,122,114,120,120,123,119,110,112,117,127,126,121,114,111,112,113,111,109,107,105,106,103,97,98,94,105,114,115,120,113,123,121,116,116,117,126,127,122,119,113,101,103,103,102,124,127,122,123,127,132,134,133,123,121,119,116,117,118,124,141,149,146,130,127,136,137,129,120,118,123,120,125,120,128,141,139,145,136,128,123,114,118,125,128,125,125,134,128,125,119,118,118,104,111,110,105,103,98,103,112,110,103,100,92,84,72,67,83,89,82,81,74,78,98,86,82,97,100,97,100,92,82,74,80,86,93,88,77,73,66,63,67,63,57,58,55,65,66,59,62,53,58,63,57,55,53,55,51,52,56,53,55,52,53,54,56,57,59,58,50,55,55,48,55,54,52,55,55,55,53,52,55,52,51,54,56,49,51,46,48,50,44,50,50,50,50,50,49,54,56,47,46,47,53,49,49,50,43,46,44,51,55,45,46,46,48,39,49,49,47,50,46,56,46,46,52,69,63,59,72,64,66,60,60,59,56,66,69,67,69,75,81,95,104,99,98,79,76,78,80,89,80,77,73,61,55,59,45,54,48,42,50,37,48,41,46,48,37,41,41,47,46,46,53,52,50,56,55,61,58,66,61,74,93,82,91,102,107,101,108,112,103,106,123,125,126,132,114,125,120,108,128,127,128,131,141,152,145,135,141,151,151,149,145,149,161,142,130,141,133,131,117,116,104,87,89,83,90,77,52,51,42,27,28,41,59,55,61,62,21,8,15,24,24,23,31,29,24,19,23,31,36,43,53,41,23,40,59,59,61,55,51,54,59,57,53,44,29,32,33,29,28,29,37,29,29,35,36,37,41,42,43,44,50,48,50,57,50,53,58,51,50,49,47,47,45,45,37,48,45,41,46,42,50,53,59,57,66,81,82,82,70,73,72,66,81,87,113,118,132,167,176,179,170,146,129,131,139,147,149,150,161,165,158,159,159,163,158,157,166,159,155,152,151,144,135,133,143,155,150,145,153,161,157,157,154,147,154,159,156,157,160,162,160,157,158,158,161,163,160,153,158,165,161,165,169,167,168,171,168,165,160,157,158,158,158,155,158,162,160,161,160,156,159,162,157,152,148,153,153,125,101,88,97,127,151,170,174,169,165,164,165,168,165,165,163,166,162,162,162,161,165,162,162,154,151,152,151,158,161,160,160,156,161,155,162,112,13,1,8,9,13,11,12,12,14,14,13,13,20,17,18,18,17,19,17,17,20,21,21,21,19,19,21,19,19,18,19,21,20,20,22,19,18,17,19,21,18,17,18,20,18,19,25,19,22,21,21,22,16,19,18,16,18,17,19,17,17,18,18,17,17,19,16,16,17,18,19,19,20,18,19,21,16,18,22,20,19,17,20,19,19,19,18,21,17,21,23,19,18,22,22,27,23,19,22,18,20,24,29,27,24,24,27,25,23,30,23,20,29,20,21,23,22,20,24,29,24,23,24,25,29,24,26,30,34,35,35,30,30,29,27,39,43,46,48,42,43,41,48,45,44,49,48,46,44,46,50,66,76,85,90,92,86,86,96,107,102,118,115,113,105,103,101,97,105,103,103,102,99,102,93,84,83,89,96,107,104,104,110,105,106,105,117,123,120,122,118,113,112,118,112,113,110,114,116,116,124,114,127,118,107,109,115,115,119,119,119,127,113,103,116,114,116,120,109,114,107,112,116,113,118,111,116,115,108,108,101,103,100,95,92,88,95,97,99,95,102,109,110,112,107,110,111,117,117,118,113,110,112,115,116,111,121,122,123,123,118,119,118,117,132,122,131,140,137,129,120,111,108,113,111,114,118,120,113,114,112,106,106,104,110,115,122,122,116,105,108,100,105,102,105,112,101,97,95,87,91,86,79,72,66,74,81,87,95,84,85,97,95,93,108,112,112,104,91,89,95,87,96,94,93,96,95,85,87,82,83,61,59,58,57,63,61,53,53,50,51,49,53,57,52,51,52,52,56,54,53,50,52,50,51,55,50,51,57,49,46,46,50,53,53,55,49,50,50,51,52,49,49,45,39,42,47,48,43,45,41,43,42,43,43,42,43,44,44,43,46,43,47,45,45,41,40,41,40,42,46,40,50,59,56,55,60,60,51,48,46,46,48,47,46,48,47,43,43,53,48,53,51,56,57,64,74,75,68,68,64,73,86,93,100,92,87,72,65,73,68,68,62,57,59,52,48,51,46,42,47,45,43,38,43,39,43,44,35,42,42,38,45,43,43,46,50,49,44,53,55,47,57,63,61,71,66,79,86,87,83,97,92,96,104,113,118,105,117,138,116,113,116,122,138,125,124,134,144,145,125,133,138,150,157,157,141,137,138,163,153,143,137,145,154,160,177,198,218,224,223,207,184,184,170,158,139,136,91,39,30,46,66,39,9,11,16,26,22,39,53,58,63,57,50,47,51,54,56,43,27,30,31,30,35,37,35,33,31,36,42,53,57,49,57,54,53,53,46,51,47,43,45,44,39,36,38,36,34,32,36,39,39,46,38,46,53,49,61,55,63,70,66,67,59,55,53,56,59,64,92,100,112,139,145,153,156,150,149,144,147,151,154,154,159,163,159,161,159,163,165,165,177,165,153,151,147,140,135,138,150,159,159,153,156,165,162,154,154,150,150,155,154,161,164,163,156,153,155,160,171,174,170,162,160,163,161,160,160,160,158,163,170,165,163,165,164,165,157,154,162,163,164,159,159,161,161,164,160,159,157,162,166,157,142,118,95,94,119,149,170,172,171,162,159,160,159,152,153,163,160,159,165,165,166,160,160,160,151,150,156,159,162,158,156,155,162,152,160,112,12,1,9,10,12,10,13,12,13,13,13,12,24,28,31,29,29,29,31,34,29,34,31,33,38,38,42,36,40,41,38,39,36,39,36,39,40,32,28,21,18,19,18,19,17,21,22,17,19,24,22,19,20,17,17,17,17,17,15,18,18,17,17,17,18,16,20,20,16,19,17,17,21,17,17,17,18,17,19,20,21,17,19,21,15,19,21,18,21,23,21,22,28,27,26,24,23,26,22,24,27,31,31,31,39,28,29,34,32,33,31,33,32,29,30,30,30,29,25,27,23,23,27,25,29,27,33,38,35,42,40,33,32,26,34,33,34,44,47,51,48,53,61,52,57,55,53,56,51,55,65,77,89,96,92,84,80,82,90,96,91,101,110,108,106,104,116,107,110,108,97,106,103,105,109,105,108,108,109,113,112,107,108,97,92,101,111,119,115,120,122,119,121,119,117,124,122,117,108,104,108,110,109,105,108,115,118,120,118,111,114,117,119,121,122,128,136,129,120,117,117,117,112,115,116,120,119,118,118,111,105,103,107,102,102,101,87,85,77,85,100,101,100,96,98,101,106,109,114,114,112,106,101,99,103,104,115,118,111,121,127,130,130,133,141,128,126,127,114,114,120,122,126,122,115,113,108,112,103,96,93,92,97,95,103,105,108,112,108,110,101,103,108,105,105,102,103,112,109,101,93,89,97,91,84,70,75,76,72,72,72,83,89,88,89,91,89,84,80,84,90,93,99,97,100,107,104,106,76,68,81,64,62,60,59,59,55,55,54,56,55,56,59,57,55,51,58,54,59,58,53,55,50,48,53,53,57,54,53,54,45,53,51,49,53,48,49,47,47,49,49,43,44,44,44,46,48,46,42,45,52,49,52,47,44,45,43,46,50,44,43,48,49,49,43,43,47,44,45,43,47,51,60,61,59,66,71,63,54,49,46,47,41,38,43,45,41,44,49,47,44,51,48,40,46,52,51,47,61,64,62,62,67,71,66,73,64,59,68,70,66,68,77,83,70,64,67,66,66,66,66,59,55,53,56,55,51,48,51,49,47,47,42,40,40,41,40,39,39,36,40,41,41,39,43,41,36,49,48,50,50,53,57,65,74,74,77,63,87,98,82,95,93,99,113,112,106,109,136,125,122,141,136,149,152,157,159,166,167,154,153,163,188,217,232,234,237,249,251,251,250,250,250,250,253,253,251,251,243,229,233,245,249,237,185,141,94,18,2,11,9,17,37,53,53,53,55,54,57,42,29,35,27,30,37,39,38,33,34,30,39,48,51,51,46,41,44,44,42,39,37,36,33,31,34,31,33,36,34,33,36,40,39,43,44,44,52,50,50,50,54,60,55,49,51,48,46,49,56,52,67,81,92,108,110,125,138,157,168,158,155,157,160,160,161,159,157,160,159,166,168,174,175,155,150,149,145,141,141,147,154,160,156,153,157,168,171,169,164,157,152,144,149,158,159,160,152,152,152,155,163,166,164,159,160,163,165,163,154,148,153,159,158,164,161,160,168,163,159,157,163,166,162,163,162,158,158,159,158,157,159,165,165,159,162,145,123,101,88,108,134,153,160,160,158,154,151,148,144,149,152,156,163,166,166,166,168,163,160,160,160,165,165,163,160,155,158,149,160,113,12,0,10,10,12,10,13,12,12,14,13,12,59,58,62,59,58,63,62,66,63,66,71,71,76,79,87,89,93,105,107,113,111,110,109,108,92,69,54,42,31,28,19,16,18,16,23,18,19,21,17,22,20,16,20,16,16,16,16,17,17,17,16,17,16,17,17,18,19,17,17,17,17,18,21,18,18,19,17,15,22,20,20,21,19,16,20,19,20,21,21,26,26,22,25,29,31,22,28,32,29,39,46,47,42,41,39,36,43,42,36,34,30,30,31,27,24,24,22,18,23,21,24,18,21,29,28,28,23,29,26,25,30,27,31,29,24,35,38,37,47,45,56,45,38,38,41,42,45,50,62,69,75,75,46,49,67,72,77,84,83,89,87,78,83,96,104,103,108,101,105,107,105,116,118,118,119,125,122,124,121,117,120,118,118,114,110,112,112,117,121,114,119,118,115,127,120,114,112,107,111,114,117,118,113,110,113,110,108,113,115,125,123,119,125,122,125,128,120,128,120,121,118,116,125,114,117,117,122,125,126,124,120,115,109,102,89,89,103,113,123,127,122,113,108,113,116,127,122,126,124,126,131,129,123,118,113,116,109,117,131,137,140,142,130,121,114,108,109,114,121,121,114,122,128,125,120,114,106,105,110,106,107,96,93,101,100,98,103,104,91,92,90,93,97,105,99,101,99,92,92,79,83,73,66,77,81,66,71,83,59,84,81,51,49,50,58,58,65,54,61,68,80,61,60,63,77,87,59,53,59,59,71,59,56,60,61,61,64,59,55,57,65,69,57,59,62,59,63,50,55,55,52,57,54,57,55,53,57,54,50,46,51,53,52,50,51,49,49,49,48,42,43,48,42,44,44,43,49,48,51,57,55,51,53,54,50,54,48,48,52,46,55,54,44,50,46,36,48,43,40,50,46,40,46,45,55,46,43,41,44,46,42,41,44,42,37,45,46,45,48,50,44,39,40,46,56,57,61,57,57,57,54,47,49,63,51,51,57,59,63,63,65,66,69,65,68,69,71,77,77,76,69,70,67,65,72,63,63,63,60,61,53,55,49,48,52,51,45,46,47,44,43,42,41,37,34,35,39,39,37,36,39,42,53,43,53,45,45,55,48,57,59,59,68,73,75,77,91,93,91,101,124,125,136,139,127,134,145,156,179,209,235,247,251,245,248,251,252,252,252,252,252,252,252,252,252,252,253,253,253,253,252,252,252,252,244,187,75,26,5,5,13,9,21,37,47,57,54,48,38,34,35,29,39,35,31,35,35,38,34,39,41,38,34,32,37,37,32,32,34,33,30,32,31,29,35,35,38,39,39,39,41,37,37,51,45,42,48,45,56,58,54,57,49,49,49,53,53,56,66,78,89,81,93,113,137,160,165,155,156,147,151,157,167,160,155,153,156,163,174,167,173,158,151,154,155,151,157,153,153,162,159,154,161,169,179,174,166,158,146,147,150,152,155,154,155,156,160,157,158,157,155,153,155,164,168,164,154,152,153,155,158,158,158,160,163,160,155,155,158,160,161,158,160,157,155,153,153,156,157,155,153,156,152,156,152,124,96,81,103,127,148,162,165,162,159,150,149,149,150,154,157,161,160,165,164,161,159,158,158,158,162,160,162,155,159,152,157,112,12,1,9,9,13,11,13,12,13,13,13,13,131,126,125,124,131,134,134,139,136,144,145,156,160,162,171,171,177,174,174,171,162,162,148,150,139,98,66,45,33,25,22,18,13,17,17,15,19,18,17,22,16,18,18,14,17,18,15,15,15,16,16,15,18,16,17,17,17,18,20,19,16,17,17,18,17,19,18,17,18,16,18,17,17,18,19,18,19,18,18,21,18,19,23,24,21,22,20,27,36,35,42,40,34,31,28,27,29,31,26,25,20,24,29,23,22,23,26,24,23,22,22,31,27,27,29,22,32,33,29,33,41,41,37,40,38,44,47,52,52,53,57,59,55,47,44,38,33,40,54,57,64,62,64,66,65,71,72,83,87,86,81,78,80,87,101,95,94,93,102,110,111,110,113,119,120,126,122,120,118,118,124,126,121,112,107,105,106,113,115,111,111,113,110,110,113,116,114,122,126,122,125,112,113,114,111,112,107,110,117,113,117,114,109,115,115,117,121,118,118,118,121,123,119,113,107,112,118,125,128,131,130,128,131,124,120,127,129,128,125,125,120,120,124,125,125,127,125,124,123,120,125,123,117,114,117,113,107,110,109,109,109,115,117,111,109,99,94,110,117,113,114,116,113,115,109,107,112,117,118,111,113,106,106,97,92,95,97,95,84,84,78,88,101,100,99,99,105,98,91,75,81,83,87,94,99,95,99,102,94,92,82,77,78,76,75,75,62,58,59,50,65,54,49,59,57,73,60,54,56,54,66,59,54,61,66,66,60,54,51,60,57,56,57,57,57,53,57,54,50,61,52,50,54,50,53,52,52,55,53,53,56,53,52,49,51,49,51,47,52,50,49,50,49,47,46,45,48,51,46,47,47,48,46,49,43,45,45,42,46,43,47,51,44,44,45,46,42,38,46,42,43,42,37,41,42,40,44,43,36,45,45,39,41,49,42,35,45,40,42,45,39,42,43,45,47,48,46,42,46,52,50,41,38,49,44,46,46,52,57,49,48,50,53,51,51,46,53,56,53,54,53,54,55,55,56,54,55,60,63,64,66,69,60,64,68,59,56,56,62,60,56,58,46,47,45,49,47,50,49,40,43,41,45,46,43,40,41,44,39,45,41,43,47,46,48,48,53,46,56,55,53,66,72,83,88,96,115,136,160,180,194,205,208,210,217,226,233,240,253,253,253,253,252,252,252,252,253,253,253,253,252,252,252,252,252,252,214,178,150,117,55,7,6,11,10,19,39,52,46,39,34,32,32,40,39,35,33,34,36,29,34,31,27,36,29,34,36,33,33,33,35,34,36,35,42,39,37,42,34,39,41,36,41,43,42,39,49,53,55,57,57,57,57,55,54,60,54,60,69,71,73,75,85,104,130,145,141,133,134,140,141,150,159,163,154,146,151,154,151,153,152,146,150,151,154,158,153,153,152,153,159,158,159,160,162,165,153,149,150,149,159,160,155,157,156,163,168,163,157,155,152,155,162,163,169,169,163,163,160,158,156,159,162,159,161,159,153,154,152,151,152,157,155,151,151,150,157,159,160,159,156,155,160,159,163,155,131,108,92,101,125,150,172,178,174,166,160,160,155,152,156,155,149,153,155,153,156,151,148,150,155,159,159,154,156,150,159,112,13,2,9,10,13,11,12,12,13,13,13,13,137,135,139,138,145,146,146,151,147,157,165,162,165,163,162,154,148,139,112,106,95,75,57,45,43,42,41,30,26,21,18,16,15,18,15,17,21,17,17,18,20,17,16,19,20,16,15,16,17,17,16,17,17,17,16,17,19,19,16,16,18,17,19,19,17,17,17,17,18,16,20,21,17,17,20,20,19,22,24,24,27,33,27,24,29,26,28,30,36,38,38,33,30,30,31,33,31,33,39,36,31,37,32,27,31,33,34,32,27,35,44,40,35,35,39,40,38,34,46,46,47,46,43,35,41,45,47,58,61,68,67,65,52,55,47,45,50,54,69,65,88,92,93,101,94,96,95,97,99,103,101,107,112,111,112,112,105,107,109,119,117,116,110,112,110,113,113,112,116,113,115,121,109,107,105,107,113,113,113,107,112,115,118,118,117,122,122,120,129,120,114,112,112,117,121,118,114,120,117,115,115,112,119,117,122,122,112,118,115,119,118,114,117,109,113,115,117,122,124,123,123,122,120,118,118,118,118,115,116,117,110,123,126,123,124,117,118,120,111,114,115,112,115,122,117,115,108,104,103,103,106,106,108,101,101,99,102,109,115,119,106,105,111,104,107,114,115,121,117,116,118,117,114,107,107,105,100,92,99,89,92,97,102,102,96,113,113,98,102,95,97,94,93,92,99,120,123,116,113,109,105,107,91,100,79,79,83,78,71,56,67,70,69,76,64,73,71,59,56,56,58,53,54,64,55,61,59,50,57,56,56,55,49,59,56,55,54,47,54,54,51,52,48,50,52,52,49,51,57,56,57,51,46,53,53,49,55,53,52,54,51,53,48,47,49,48,48,51,48,48,48,41,42,41,43,44,40,44,44,45,44,38,48,43,39,41,42,41,45,41,39,43,42,40,42,39,40,44,42,41,44,44,46,40,44,42,39,39,39,42,42,46,41,41,39,45,47,42,39,39,45,41,39,39,41,41,38,39,40,43,43,36,42,41,39,42,40,47,42,45,43,46,50,42,52,47,53,57,57,60,56,61,62,59,64,61,63,63,65,65,59,69,62,61,69,70,76,68,57,59,56,54,57,60,57,49,56,58,50,45,44,49,50,47,44,44,45,47,44,48,50,45,49,45,42,53,54,61,68,83,89,94,103,106,128,137,141,160,176,187,204,224,239,251,252,252,253,243,253,253,234,252,252,252,253,253,251,251,252,252,234,134,125,25,1,9,10,19,33,37,38,34,40,38,38,38,36,35,35,35,29,38,35,35,41,33,34,37,39,37,37,37,31,35,35,34,37,34,36,36,36,33,35,35,35,41,42,45,51,51,48,54,57,56,52,56,54,53,58,54,60,55,74,80,96,96,110,130,142,150,136,140,137,146,142,130,133,129,121,127,129,128,146,132,144,146,147,143,149,153,154,151,151,146,149,146,143,147,144,148,155,155,160,159,162,150,156,157,162,159,146,146,151,160,164,165,166,161,156,151,150,152,159,162,159,160,154,158,162,155,155,159,160,158,152,150,153,152,155,160,158,160,161,163,167,162,160,138,113,96,89,120,147,169,182,174,170,162,159,156,153,154,148,152,156,156,155,151,149,148,148,153,158,153,158,150,159,113,12,1,10,10,12,10,13,12,13,14,13,13,125,126,128,132,134,129,128,126,126,133,120,118,100,89,79,65,54,41,35,32,31,34,27,26,29,29,27,21,24,17,18,18,16,17,21,21,21,19,19,17,17,21,16,15,18,17,18,16,16,19,17,17,17,17,19,18,18,17,20,20,17,19,20,17,17,17,16,18,19,23,16,16,18,18,22,22,21,26,29,34,43,37,36,37,40,40,47,46,48,46,44,43,41,43,37,36,42,39,33,36,34,30,28,28,33,31,31,31,34,34,40,44,40,38,31,33,36,33,40,46,47,44,39,35,41,44,41,45,47,53,51,53,48,53,56,59,62,60,86,101,114,121,121,119,120,123,118,116,115,122,126,126,121,114,124,121,122,121,117,115,119,118,118,112,110,113,112,118,118,117,122,122,121,116,118,123,119,117,118,117,117,122,126,122,115,118,122,131,126,118,122,122,128,124,123,122,121,127,125,119,129,130,130,131,125,131,129,128,130,119,120,123,128,127,125,125,124,125,125,129,122,114,115,116,122,121,122,123,122,119,114,124,122,117,118,120,119,125,129,127,123,127,138,136,134,129,123,127,129,130,122,123,113,111,116,118,124,127,125,117,113,111,113,118,117,117,120,124,119,114,117,114,121,118,120,123,117,114,116,117,115,120,113,105,103,103,98,93,106,105,103,95,87,79,87,88,92,89,91,103,98,95,94,89,74,85,94,94,89,60,72,68,64,68,65,71,61,62,60,63,68,67,66,60,65,58,49,56,51,57,57,55,56,55,63,53,56,54,53,60,49,55,55,55,61,50,51,51,55,57,50,51,49,47,51,50,52,51,51,56,50,47,48,46,52,51,46,49,49,48,44,44,46,46,48,48,45,43,42,43,44,40,40,45,43,40,43,44,42,38,46,52,45,42,40,44,45,46,44,42,42,46,47,44,43,50,47,42,42,44,44,42,39,44,48,46,48,47,46,46,47,44,43,45,40,41,39,42,42,40,40,39,41,39,42,44,46,44,42,47,43,46,39,43,46,46,45,47,54,45,45,50,47,57,60,57,57,51,57,58,57,67,56,53,59,58,69,63,60,61,60,63,57,61,61,62,72,73,61,60,67,66,65,67,60,59,56,53,55,50,54,53,53,61,57,53,51,45,46,43,53,53,42,48,50,45,46,53,65,67,85,103,118,138,151,186,185,156,187,228,239,250,253,253,253,253,253,253,252,252,252,252,242,221,156,84,21,2,9,11,24,24,26,31,33,32,24,29,34,30,36,37,34,42,37,36,34,41,40,33,40,37,33,34,33,31,38,37,33,30,30,33,31,31,39,39,36,38,44,48,44,51,51,50,53,57,53,51,49,53,54,53,62,58,69,84,105,127,154,158,136,139,140,149,145,131,134,127,123,130,139,143,150,150,154,158,153,150,148,148,149,146,147,143,144,146,141,147,146,144,148,150,145,148,149,139,145,152,150,150,140,139,148,153,160,152,152,157,157,160,158,157,162,159,161,162,156,162,163,162,160,166,167,160,157,152,155,151,157,158,159,158,160,165,165,168,163,160,139,114,91,84,109,137,163,169,170,170,164,156,152,153,154,161,163,161,163,162,159,155,156,153,158,157,159,154,164,114,12,0,9,10,13,10,13,12,12,14,13,13,118,123,119,113,110,96,78,76,73,49,46,37,39,31,32,35,29,29,24,29,22,27,28,23,24,22,28,27,20,26,22,17,21,21,21,18,21,19,19,19,16,17,20,18,16,18,18,18,16,16,17,20,20,16,17,21,19,18,19,15,18,19,16,21,19,20,19,16,21,17,18,21,17,16,18,20,20,25,21,28,30,28,32,33,28,28,38,22,31,33,28,37,29,29,31,28,29,29,27,28,33,37,31,29,31,31,32,39,38,44,44,40,39,29,35,33,29,31,27,24,27,37,25,29,34,28,34,33,42,40,46,49,44,47,46,53,56,59,68,81,88,102,111,122,117,115,122,119,125,122,113,117,112,118,114,112,115,118,123,124,128,126,124,125,123,123,122,122,125,124,125,123,120,125,121,125,127,122,122,119,126,130,132,127,119,126,128,124,128,126,128,128,128,128,126,122,124,123,118,113,126,122,128,124,122,123,124,122,119,119,126,133,128,127,131,127,128,122,121,120,122,119,118,124,128,128,125,118,122,123,119,123,117,124,127,125,129,130,127,125,122,128,127,125,128,129,125,128,128,128,126,121,126,127,133,127,134,130,116,113,107,116,118,124,119,118,121,117,114,98,106,107,109,113,116,113,106,114,112,115,106,106,95,90,89,89,78,90,91,79,75,91,83,75,68,68,80,81,93,99,75,95,93,80,70,82,92,74,78,60,70,55,51,59,51,60,50,54,57,62,60,66,58,61,64,55,56,53,55,55,58,54,55,57,53,58,66,56,59,57,57,59,54,61,56,53,55,53,54,56,51,49,51,49,55,49,50,50,55,56,44,53,50,45,53,48,44,49,47,43,48,51,44,45,53,51,47,45,45,44,44,42,45,44,45,43,39,44,45,46,43,46,47,44,44,44,47,48,44,40,43,47,41,45,47,43,45,43,46,48,48,45,42,44,43,45,46,45,43,41,43,41,48,44,42,45,42,40,38,45,42,40,46,38,39,46,42,43,45,43,41,48,44,40,42,39,42,43,47,45,36,47,47,43,49,45,50,40,42,47,42,54,47,42,40,48,50,47,55,51,53,48,45,51,54,54,64,65,65,66,59,62,65,67,68,62,61,62,61,64,71,75,81,76,76,77,70,77,61,66,90,95,80,68,78,76,58,66,71,59,55,46,46,48,57,91,78,37,56,132,156,174,213,250,251,250,240,253,253,253,253,253,252,252,246,246,208,145,59,5,5,10,12,14,12,14,14,22,36,34,35,41,36,36,38,36,39,32,34,34,30,39,33,27,33,33,34,33,30,27,32,27,31,33,34,39,34,37,39,44,51,49,43,49,49,57,62,59,57,52,56,65,70,65,76,85,89,92,125,134,141,135,145,134,120,125,133,148,152,163,164,169,164,166,171,169,154,158,156,157,154,157,163,156,160,156,150,149,150,149,145,142,145,144,144,142,151,157,156,153,157,153,150,154,149,144,150,154,158,163,163,155,158,161,159,159,152,153,160,158,162,166,161,152,153,158,158,162,156,157,161,159,163,166,168,167,165,160,156,148,121,101,88,101,130,154,171,171,170,164,156,153,156,165,165,166,165,167,165,163,162,160,161,159,166,159,165,113,12,1,9,10,12,10,13,12,13,13,13,13,92,86,69,53,44,38,35,31,28,28,23,28,25,23,26,27,26,22,23,25,26,27,27,26,24,22,25,28,26,19,20,21,18,21,22,20,22,22,15,19,21,15,21,20,17,17,17,17,19,18,17,18,19,16,21,19,17,19,15,17,18,21,18,18,19,18,18,19,22,17,21,19,18,19,17,24,23,20,21,27,23,21,26,22,19,20,29,22,22,24,29,26,27,31,27,33,30,28,27,25,34,36,29,33,34,33,34,34,41,36,37,31,26,36,35,38,31,33,39,32,36,34,34,26,27,31,34,36,42,44,41,40,45,47,48,54,56,54,54,52,63,84,102,105,98,96,100,106,108,107,109,115,113,116,117,113,113,119,125,123,120,113,117,117,122,120,123,125,125,123,121,120,122,127,121,123,127,125,124,122,122,123,130,129,122,122,115,117,121,124,127,123,119,118,118,116,117,117,111,105,107,104,102,105,101,103,106,116,117,117,133,130,130,131,136,131,126,122,113,118,114,110,114,113,116,115,114,114,113,105,112,122,121,126,124,123,123,124,122,118,112,114,120,117,125,122,122,116,107,111,106,122,123,125,127,126,127,114,117,115,117,124,123,121,112,101,101,109,110,108,113,123,126,119,111,113,112,108,116,102,91,93,82,91,93,93,95,97,94,89,94,97,90,84,81,76,83,90,89,89,99,101,94,73,53,57,54,57,71,53,54,42,45,56,50,52,50,56,48,52,56,50,61,61,67,62,51,57,53,60,55,60,60,55,59,48,57,55,54,55,48,53,49,56,56,51,56,51,55,54,54,54,61,56,50,60,54,52,54,53,54,50,49,54,48,50,50,50,49,48,53,48,51,50,48,47,48,49,45,44,44,43,48,47,39,45,42,40,44,43,43,36,46,40,40,47,42,44,41,42,38,44,43,43,44,42,45,41,43,46,44,45,44,40,40,42,41,35,38,46,39,41,43,40,43,40,38,41,42,39,39,41,41,44,44,40,40,40,40,44,37,46,40,39,43,40,40,36,41,44,41,41,43,36,38,41,35,42,38,39,40,39,37,34,41,36,36,50,46,42,45,43,43,41,46,48,51,55,55,50,49,58,53,54,57,66,71,71,63,61,74,76,78,68,73,77,72,74,79,92,110,115,116,116,127,131,137,153,154,142,127,107,112,108,107,120,72,22,27,64,92,107,125,160,201,175,182,250,253,253,252,252,253,253,252,252,250,250,229,205,200,190,195,200,194,193,178,204,171,45,29,39,28,35,29,30,36,29,31,36,32,32,29,29,32,30,33,30,30,24,24,27,33,31,31,37,37,37,35,39,47,50,45,44,50,59,64,64,62,61,62,71,83,76,90,89,84,101,117,116,114,124,117,105,97,117,140,152,160,163,169,171,174,163,161,165,155,155,163,164,160,167,170,169,165,157,151,151,153,158,155,154,153,155,153,148,160,162,162,177,176,170,165,155,152,146,150,151,154,163,159,158,159,163,160,160,155,153,158,158,158,159,155,153,152,155,161,161,164,160,158,154,160,168,166,166,163,164,160,160,156,132,108,89,96,125,154,170,170,165,162,157,158,168,170,169,163,165,165,159,163,155,160,159,162,158,163,113,12,1,9,10,13,10,13,12,13,13,13,13,34,31,35,24,24,27,19,24,20,22,21,19,25,21,21,29,28,24,29,24,22,23,28,27,25,27,24,27,29,19,17,18,17,17,21,21,17,18,17,19,17,17,18,18,19,19,17,20,19,17,18,19,18,16,19,17,17,17,17,20,19,17,19,23,16,18,19,19,18,21,22,18,18,21,19,20,22,20,24,20,21,26,21,27,30,27,33,38,35,33,34,36,39,41,35,37,36,35,28,29,29,30,32,36,29,26,35,27,31,29,36,35,29,36,33,35,46,44,46,49,45,45,37,38,34,36,40,38,45,47,53,55,60,67,72,76,75,75,85,89,103,108,120,103,104,99,104,110,99,107,101,112,112,118,117,112,122,118,118,117,110,107,111,115,118,118,121,122,117,114,116,118,119,124,118,115,109,108,110,105,108,108,103,100,101,98,107,109,108,112,119,123,117,108,114,116,121,116,110,109,104,110,112,112,115,116,110,112,110,127,127,122,122,131,132,130,132,122,124,124,125,124,120,122,122,123,122,115,108,109,115,127,122,113,118,125,124,125,124,117,123,127,127,129,126,125,120,106,103,112,115,125,128,120,122,118,115,116,123,129,126,126,118,106,101,109,110,114,122,125,129,120,114,115,125,125,112,110,103,104,108,116,127,122,119,121,113,119,113,105,107,111,115,100,97,83,78,87,91,100,85,83,84,71,50,55,64,78,72,72,63,49,51,61,56,57,57,65,56,56,55,50,55,66,62,56,60,56,58,57,60,56,57,60,54,56,52,57,58,53,56,53,57,57,50,56,58,63,55,55,57,66,60,53,62,56,60,55,52,53,53,55,47,54,51,50,54,49,52,51,50,53,55,54,47,49,49,44,46,49,43,49,42,44,47,44,43,39,43,46,42,43,44,35,43,41,39,44,38,42,40,45,48,41,43,41,44,39,39,42,42,41,41,43,41,39,40,43,40,38,38,42,40,40,42,42,39,39,38,38,44,44,39,37,38,39,42,40,36,42,44,35,39,42,39,37,40,36,36,42,39,42,38,38,41,38,39,35,40,42,33,37,40,33,34,39,34,40,35,36,43,35,40,38,41,43,41,46,48,45,48,54,52,54,59,63,62,54,53,60,60,65,66,59,66,69,79,75,75,89,101,121,125,124,135,136,152,156,155,153,133,159,182,169,165,154,131,98,120,129,141,125,97,97,139,76,117,210,251,252,252,252,253,253,252,252,253,253,252,252,252,252,252,252,252,252,251,251,226,67,32,33,25,31,26,29,28,27,29,29,29,29,26,27,25,28,24,29,27,22,27,24,24,33,24,30,34,29,36,37,37,42,42,42,43,52,60,62,61,55,57,63,75,70,93,85,109,122,112,107,109,121,110,123,112,122,131,146,149,147,152,158,155,150,151,155,160,170,179,173,163,160,162,158,155,151,151,152,158,165,161,162,163,157,157,151,155,153,158,175,174,170,162,159,156,152,155,154,153,155,159,159,161,165,162,161,159,159,159,154,154,149,155,152,154,155,157,161,161,161,153,149,157,155,158,160,160,159,157,164,162,162,136,106,87,93,121,143,162,166,166,164,160,167,165,163,158,156,160,159,156,156,160,158,160,151,160,113,12,0,9,9,12,10,13,11,12,14,13,13,21,19,24,22,22,20,20,24,23,24,21,24,25,23,24,23,27,25,24,24,22,26,25,29,27,21,25,23,20,24,21,15,21,20,16,18,17,19,20,17,17,19,17,18,17,17,21,16,17,16,17,18,17,20,17,17,18,18,19,19,21,18,20,19,18,19,19,20,19,19,20,18,22,23,16,22,18,23,23,22,29,26,29,33,34,36,39,33,32,26,27,30,34,33,40,39,37,43,39,41,45,49,43,43,42,35,37,41,43,44,45,46,43,40,43,48,48,45,46,47,43,42,50,49,47,45,49,40,42,50,53,61,73,85,96,100,98,99,95,103,101,98,96,101,117,118,121,117,126,122,117,116,111,115,113,117,114,114,118,117,110,104,110,120,125,120,122,118,113,116,116,116,116,119,117,114,110,105,103,110,112,108,111,113,113,117,119,119,118,119,123,122,117,115,115,119,127,127,124,120,125,125,124,127,125,120,116,110,107,113,121,113,111,114,115,114,109,114,122,130,132,130,127,129,131,125,129,127,123,122,123,127,119,122,125,122,125,127,125,125,129,131,128,127,131,127,128,123,118,129,127,131,127,126,119,118,117,117,129,119,125,124,108,110,117,113,116,115,116,113,105,109,111,116,115,115,112,108,116,118,119,130,129,132,128,123,125,131,122,111,112,111,110,110,109,97,90,88,77,71,68,67,80,77,85,95,96,98,93,93,84,65,66,66,69,70,73,80,58,58,51,45,61,58,59,59,62,61,60,63,57,57,55,55,60,57,59,57,58,62,61,58,54,57,55,56,55,54,53,50,56,53,57,53,46,53,49,50,51,48,55,50,48,53,49,47,47,52,50,51,52,47,49,49,47,49,49,47,46,48,43,44,45,47,44,44,45,45,47,45,45,46,42,41,48,42,41,44,45,42,43,46,43,43,40,42,44,43,41,38,39,43,41,43,49,46,46,45,40,44,44,46,43,45,44,42,40,40,45,41,45,40,39,42,38,40,40,34,40,44,37,41,38,34,38,36,38,38,36,37,39,43,41,41,45,44,44,38,38,41,38,41,40,34,41,41,35,40,36,33,36,39,38,42,42,42,41,46,50,40,46,49,46,51,53,55,49,51,54,53,51,55,58,54,62,70,75,68,67,81,91,100,113,113,115,116,106,106,105,113,122,137,162,150,168,172,142,143,160,188,189,150,113,109,104,57,72,133,148,178,217,248,253,253,252,252,253,253,252,252,253,253,252,252,253,253,252,252,226,44,21,21,12,31,21,27,27,30,28,29,27,27,24,27,27,21,30,27,27,23,21,27,28,27,29,28,28,33,30,32,34,33,31,36,39,42,50,52,52,54,57,54,64,62,67,79,97,117,105,104,115,134,161,150,133,137,132,136,143,137,133,130,139,153,166,168,165,170,171,166,154,153,157,162,160,157,158,159,159,162,163,161,159,160,151,154,154,145,154,160,160,153,153,157,155,152,156,150,148,150,151,156,156,160,157,162,159,155,159,155,150,151,150,155,155,151,154,153,160,165,161,150,148,150,144,151,155,155,154,158,162,156,152,132,101,86,88,108,137,160,170,170,164,162,160,158,153,157,158,157,163,158,162,160,160,154,160,113,13,1,9,10,12,11,14,12,12,14,13,13,19,17,20,19,24,22,22,22,24,23,18,22,22,24,25,23,25,27,24,25,25,22,27,27,26,26,29,24,19,22,16,18,18,19,22,19,21,16,19,22,17,17,17,19,16,17,17,19,21,17,16,18,17,19,18,17,20,20,17,20,21,17,21,18,18,19,18,22,18,17,16,19,18,19,19,18,22,21,22,27,32,29,25,29,31,29,28,25,26,19,25,28,33,33,30,27,29,37,35,37,36,36,37,33,38,38,31,46,50,46,45,39,35,43,40,40,39,36,35,36,38,41,39,47,46,45,47,46,48,42,39,42,59,71,80,91,94,100,95,98,103,102,108,111,119,113,121,108,124,125,121,123,125,120,124,120,116,114,110,116,119,121,114,122,125,116,120,123,122,124,125,124,121,127,122,123,121,123,122,118,125,130,131,129,130,127,130,126,123,123,119,123,120,115,122,116,118,116,124,127,118,120,119,118,120,121,126,123,123,120,122,118,116,118,115,119,124,116,120,119,122,122,122,122,122,126,128,128,124,125,122,126,126,120,123,125,125,125,122,121,123,122,124,128,125,129,135,131,131,130,132,127,125,122,119,120,121,122,119,112,108,110,115,117,117,116,107,98,103,106,107,111,110,103,101,101,105,111,110,106,101,98,104,106,105,108,103,104,93,90,90,92,96,105,107,112,108,102,92,79,85,82,92,97,102,99,103,98,88,97,94,95,100,104,103,100,93,95,80,89,80,74,79,61,65,61,66,59,54,60,56,53,56,55,56,60,55,56,57,62,60,56,59,59,54,54,57,54,54,49,53,53,50,50,48,51,51,51,49,50,53,55,50,50,49,46,52,49,47,47,50,48,48,48,42,47,44,46,50,46,46,47,49,47,49,43,46,47,37,48,43,44,49,41,48,43,44,41,40,44,41,43,42,43,48,41,45,42,36,40,41,39,44,42,41,48,39,38,44,41,42,47,46,43,39,39,42,42,41,39,46,41,38,41,39,42,41,41,42,40,39,42,36,36,37,39,41,37,40,39,39,38,43,38,40,44,37,36,39,38,41,45,41,36,36,39,36,33,36,37,38,36,38,40,40,39,39,40,37,44,45,42,42,45,46,44,47,50,43,46,47,52,50,49,57,53,61,57,58,68,64,66,83,85,59,63,73,82,101,98,100,105,124,126,142,132,108,122,140,165,160,149,172,192,146,101,97,116,105,103,117,141,178,208,233,249,253,253,252,250,253,253,252,252,253,253,252,252,209,38,16,13,12,29,17,30,27,24,26,27,25,31,29,25,26,22,28,23,24,27,24,22,21,26,29,29,35,32,28,35,39,32,29,33,35,42,39,47,52,54,57,52,57,59,51,60,74,89,98,114,130,145,164,138,122,125,130,145,156,151,142,137,143,153,166,160,154,160,164,146,154,151,158,164,165,164,165,161,158,155,153,158,161,160,161,164,160,153,152,155,152,147,146,146,150,148,146,146,143,141,142,146,148,150,151,156,157,155,157,157,152,152,155,158,158,156,156,157,160,160,160,156,154,151,151,151,151,155,155,155,153,157,151,150,133,114,93,81,103,130,154,164,164,162,156,154,150,152,156,161,163,163,165,161,169,159,166,113,11,1,9,10,13,11,12,12,13,13,13,13,21,20,22,21,24,24,18,24,22,24,20,19,22,24,26,24,26,24,24,28,23,26,27,23,23,24,27,20,21,19,17,18,17,19,19,19,16,21,19,20,18,18,19,18,18,17,17,17,17,19,19,18,20,19,18,17,16,17,16,21,18,17,21,19,21,21,18,18,17,20,18,19,22,19,18,18,19,23,22,28,29,23,24,27,23,27,28,23,22,22,25,22,22,29,25,27,31,31,30,30,27,29,27,21,27,28,24,29,30,36,35,36,32,29,29,30,34,30,35,42,36,41,38,38,45,42,47,45,42,42,42,39,47,55,61,76,81,93,93,99,106,111,118,117,116,102,99,99,107,112,122,130,130,128,124,125,117,125,127,127,130,130,122,120,122,128,128,123,120,127,129,122,121,128,130,128,131,132,128,123,125,127,124,126,125,122,125,122,122,125,123,122,126,126,128,127,122,121,124,124,130,134,125,124,129,134,138,137,139,137,134,129,128,137,136,134,131,132,134,128,131,129,128,128,124,121,125,122,122,129,129,129,121,124,130,124,128,129,128,125,125,124,126,137,134,130,135,133,129,137,134,130,126,122,121,126,127,122,124,122,118,116,120,122,128,117,116,114,117,117,115,126,118,115,114,117,113,109,104,97,93,98,104,103,104,98,88,86,86,89,81,84,93,101,108,111,112,113,109,107,105,97,103,104,107,102,105,102,86,103,118,116,123,118,118,104,102,106,102,120,110,107,102,80,74,61,57,55,59,55,54,59,56,59,54,63,57,55,67,64,56,53,56,53,55,57,53,57,55,51,53,52,54,50,53,57,51,55,52,51,53,50,51,48,53,52,51,50,48,54,50,47,49,45,47,49,51,50,40,49,51,46,50,46,47,48,47,47,42,40,43,46,47,43,45,40,43,45,42,45,42,43,40,45,44,42,44,44,43,40,39,43,36,43,45,42,44,39,46,42,37,45,42,37,43,42,36,39,41,38,41,37,37,39,42,39,37,42,36,40,40,39,40,38,41,41,42,36,39,40,39,41,35,37,39,36,35,38,36,41,41,42,39,39,44,38,39,36,39,38,34,38,38,39,34,37,42,35,35,40,43,36,42,39,39,46,42,53,44,40,49,45,48,48,53,45,46,56,51,55,54,49,57,61,49,48,53,72,72,86,82,72,89,95,114,92,97,108,112,132,125,152,190,209,179,125,135,161,145,114,86,74,79,98,127,148,189,204,190,198,202,198,197,199,204,210,222,251,179,19,15,9,18,29,21,23,26,29,27,28,29,26,21,25,27,31,29,24,24,22,26,27,23,32,31,30,32,35,34,32,39,32,36,35,36,41,39,50,51,51,51,52,53,51,54,61,61,75,94,120,133,139,139,113,103,110,129,152,165,165,157,155,155,155,151,145,146,155,161,162,159,151,152,162,166,167,168,160,156,148,153,153,152,161,156,161,159,160,164,159,160,154,152,151,147,141,147,148,155,155,147,148,146,143,139,143,146,148,150,153,155,160,160,157,156,158,163,160,162,164,165,161,162,162,155,158,160,158,155,159,157,159,160,160,158,151,128,98,85,95,124,148,159,167,165,159,152,153,152,155,157,156,161,160,164,159,169,114,11,1,9,10,13,11,12,12,13,14,13,13,28,21,20,20,24,21,19,23,24,24,23,21,22,27,23,24,24,26,24,21,21,24,28,26,31,28,26,24,19,18,18,18,19,20,19,19,17,21,20,16,19,23,16,20,21,17,19,17,19,16,19,20,17,18,17,18,19,18,21,19,16,21,18,19,20,19,21,18,18,21,19,16,19,22,17,20,22,24,25,24,21,25,24,25,28,27,27,27,26,29,26,23,26,22,24,28,26,24,29,24,27,27,24,25,25,31,28,27,36,35,35,34,32,31,28,29,31,35,37,37,38,38,42,48,42,41,48,43,50,42,40,46,44,51,55,53,64,69,79,89,90,93,102,105,112,108,116,122,127,129,128,126,123,115,111,111,116,120,117,124,126,127,130,124,126,119,118,120,121,125,124,124,130,129,132,133,128,129,126,118,115,123,128,130,134,128,128,131,132,137,128,133,128,121,131,127,129,134,126,137,133,131,114,120,128,136,132,137,134,140,143,138,131,141,136,139,145,146,146,139,132,133,131,130,125,127,124,123,128,129,121,129,136,135,137,130,131,133,134,139,138,136,137,139,129,130,136,132,135,132,136,129,131,139,133,136,135,141,145,147,141,132,127,132,128,128,127,127,119,125,128,131,122,112,114,113,118,118,117,113,121,129,133,133,122,113,118,103,125,110,101,110,107,112,106,95,89,84,99,106,102,92,100,110,107,105,112,108,84,107,93,81,70,84,85,84,89,81,84,95,88,115,82,62,69,67,71,82,71,81,86,78,69,60,62,62,61,61,53,53,49,55,54,51,54,52,51,47,61,55,53,54,52,57,56,54,58,53,51,53,51,53,51,55,51,51,49,50,49,50,52,45,48,46,45,51,44,42,47,47,46,46,48,43,48,46,44,51,41,38,43,43,46,44,45,45,42,44,45,42,41,46,44,42,38,45,46,38,43,42,40,42,42,41,44,42,40,44,45,39,37,41,36,39,46,41,40,37,34,39,38,36,41,37,42,40,38,41,34,40,39,36,39,39,37,38,40,35,38,39,37,39,34,42,38,38,38,35,37,39,40,41,39,37,39,41,38,39,38,33,37,38,37,39,33,36,38,37,35,40,38,36,41,39,41,42,44,45,39,46,41,34,49,45,43,43,36,42,45,47,47,44,46,51,51,53,57,53,66,67,62,57,59,73,77,60,74,78,82,101,101,126,153,169,149,119,134,171,165,164,142,125,92,128,150,131,147,139,138,125,99,90,97,98,101,107,126,177,93,16,24,9,21,24,19,26,20,27,28,25,24,24,24,24,27,23,26,24,26,24,25,24,24,29,25,30,32,33,38,38,32,34,34,38,41,39,43,47,47,49,49,51,51,55,62,60,62,64,87,92,99,102,115,84,100,110,116,125,139,151,162,171,163,157,152,156,154,165,169,165,159,151,150,156,158,156,156,156,150,142,141,143,149,150,154,151,153,159,160,162,161,158,155,151,147,147,155,163,165,167,162,159,153,144,142,139,142,146,148,152,155,157,155,151,149,153,157,157,154,153,159,162,162,161,156,156,162,158,153,153,153,155,155,163,155,159,147,129,108,79,91,112,142,162,170,171,166,162,158,152,148,148,151,152,159,152,167,115,11,1,9,10,12,11,14,12,13,14,14,13,22,24,17,21,21,19,23,22,21,24,18,21,22,22,23,23,25,23,22,21,25,27,24,21,30,22,22,25,24,23,16,16,17,19,19,16,20,20,19,19,19,19,19,16,17,17,17,17,23,21,16,20,20,21,19,17,22,21,19,19,19,16,24,23,17,19,18,23,19,22,21,19,20,19,24,19,20,21,21,25,19,24,19,20,28,21,24,26,20,24,24,23,25,22,21,22,24,23,22,26,23,26,29,20,27,25,27,33,27,27,32,29,29,29,29,26,31,35,37,33,29,34,33,38,40,39,41,43,37,37,42,47,54,55,53,57,57,66,71,80,81,79,93,103,116,124,131,129,132,129,124,118,122,124,120,121,118,121,121,123,124,125,125,127,122,123,126,131,126,127,131,129,129,128,128,125,127,124,122,127,133,136,134,135,133,137,141,131,130,132,127,124,122,122,130,134,134,129,125,118,114,114,106,116,127,132,124,111,118,127,131,129,128,133,130,134,136,140,142,132,133,128,133,136,130,133,129,130,130,126,122,123,127,131,133,132,129,132,132,131,132,128,122,120,122,122,126,130,132,136,127,126,137,141,139,141,141,138,135,131,135,132,130,128,128,127,125,113,118,121,122,125,117,117,114,115,114,126,128,125,128,132,127,130,134,120,116,118,111,107,106,108,118,117,105,98,95,87,78,87,90,88,94,90,87,92,98,83,75,65,59,50,50,59,54,64,64,57,57,60,55,60,51,39,61,68,83,79,77,92,84,72,68,68,64,61,56,57,59,52,60,58,55,59,56,52,51,56,58,54,54,54,52,56,57,49,56,55,51,51,54,56,51,59,56,49,54,52,48,49,52,51,56,45,50,50,50,48,48,51,44,44,46,46,45,46,46,48,43,49,45,41,48,45,45,44,45,42,45,42,39,44,44,46,42,42,39,42,44,39,44,41,39,40,38,39,42,41,41,41,41,41,42,39,45,42,41,46,41,41,41,39,42,45,43,45,44,40,36,37,46,42,39,37,34,37,41,37,36,39,40,39,38,38,40,42,39,40,38,37,37,39,41,36,39,37,40,34,36,39,33,39,34,37,37,34,37,37,37,35,38,35,37,40,38,43,34,40,43,40,41,36,38,36,42,41,35,39,36,42,44,40,43,46,43,51,57,57,56,52,47,50,56,55,53,49,60,54,61,81,83,99,113,125,118,110,117,141,200,223,205,190,180,187,203,196,184,147,105,74,62,74,84,78,70,42,45,77,39,28,29,10,23,22,23,25,17,24,21,26,25,22,25,27,27,17,29,28,27,24,22,31,27,24,26,30,35,32,39,39,34,37,38,39,44,45,46,53,47,48,52,54,57,57,60,65,67,64,69,85,83,86,104,98,110,122,110,102,115,137,159,170,165,159,157,158,160,169,170,163,155,151,153,155,156,149,147,153,152,140,148,152,152,155,153,154,154,159,158,153,153,153,154,159,157,153,162,164,165,167,165,161,159,160,157,154,153,159,155,156,155,155,157,146,146,151,158,154,148,150,150,154,152,157,161,159,161,160,154,152,149,151,154,153,156,158,159,155,136,111,91,87,110,139,162,173,178,174,168,166,158,151,151,146,148,148,162,112,13,1,10,10,12,11,14,12,13,14,14,13,22,20,17,18,19,23,22,22,21,21,22,24,23,23,24,21,24,27,22,23,26,24,23,23,28,20,23,24,18,17,20,18,17,19,16,17,16,21,20,17,19,16,17,16,17,21,19,18,18,19,19,17,19,19,18,18,21,19,18,18,18,19,18,17,20,23,20,22,19,18,20,19,21,17,21,21,24,21,22,20,19,24,22,24,24,22,23,21,24,27,22,22,24,20,22,24,22,30,24,24,27,24,26,24,25,24,25,30,27,24,29,24,29,31,30,29,28,31,33,27,27,27,34,37,36,40,41,40,44,47,51,55,61,57,61,61,59,55,51,54,57,67,100,111,113,108,106,99,95,98,106,115,126,129,129,127,127,124,135,137,133,127,128,128,124,128,130,128,128,130,127,121,125,128,126,123,125,128,125,130,135,139,129,123,126,122,126,125,125,129,128,131,127,132,136,127,125,118,127,120,113,116,115,111,124,118,115,124,118,117,120,118,130,123,124,126,125,125,133,137,136,132,134,135,132,137,138,125,121,126,119,122,114,128,126,125,129,129,125,116,117,112,111,113,117,124,126,128,127,125,128,129,134,140,136,136,128,122,115,110,105,105,117,107,114,120,114,110,119,116,118,121,119,124,111,123,115,114,114,116,122,127,128,126,118,111,119,123,112,119,123,110,122,115,103,122,134,120,106,102,88,92,96,103,93,84,83,65,68,76,83,61,55,70,60,66,67,40,51,61,39,64,59,51,41,65,45,78,72,58,52,47,55,56,64,49,51,61,53,57,57,53,54,61,60,59,64,59,60,55,57,56,53,55,53,55,52,55,55,52,55,53,53,49,48,54,51,55,53,55,51,49,59,53,55,51,49,53,53,44,45,53,51,53,48,43,48,48,46,51,52,42,44,48,46,41,47,45,41,45,44,46,43,41,43,42,44,39,47,42,40,47,38,42,39,41,45,44,43,46,39,40,44,40,44,40,46,42,38,42,42,41,38,41,42,39,45,42,36,39,39,41,45,36,40,43,35,40,39,36,40,39,33,40,40,39,38,37,40,37,38,37,34,42,34,37,36,34,38,35,36,39,34,37,36,37,39,32,36,39,35,33,39,37,38,39,38,42,35,39,39,37,41,34,35,41,33,40,36,37,40,41,43,41,48,57,57,46,48,48,51,54,50,55,61,67,67,55,62,80,84,102,107,98,97,104,104,120,170,192,187,179,171,171,172,161,148,130,121,121,127,140,137,146,129,103,109,108,43,18,27,9,22,26,17,23,23,23,20,23,25,20,26,23,24,26,22,29,24,25,27,23,23,30,32,29,32,35,39,35,36,41,40,43,47,47,49,48,51,56,58,63,62,66,63,73,66,65,80,94,101,102,119,114,125,147,134,121,134,145,154,141,136,130,135,141,140,147,150,149,147,143,147,152,153,152,154,155,160,146,163,165,167,162,162,160,161,162,160,152,153,159,164,163,158,157,162,162,162,162,154,155,155,155,159,159,158,163,165,160,159,160,160,153,151,155,156,158,155,155,153,150,150,151,151,154,155,155,156,155,147,145,145,152,151,157,151,155,147,135,118,95,90,101,135,157,170,175,174,171,162,150,148,145,146,147,161,112,13,1,9,10,14,11,13,12,13,14,13,12,22,22,16,24,22,19,22,18,24,24,22,21,23,22,24,24,19,24,23,21,26,24,23,24,23,24,22,20,22,19,17,17,18,17,17,17,17,16,19,18,18,17,17,21,18,20,19,18,19,17,17,18,17,18,19,18,18,19,21,17,22,21,20,21,17,20,19,21,17,19,22,19,22,19,17,20,21,27,25,27,27,31,34,28,26,27,31,33,38,35,27,30,32,25,30,29,25,26,25,31,30,35,31,29,36,27,27,29,25,28,27,31,33,29,34,33,29,40,43,41,39,36,41,38,43,48,45,48,53,56,57,57,69,72,69,78,71,70,56,53,55,57,77,76,75,73,73,76,85,100,112,119,126,128,128,127,128,126,129,127,127,127,123,132,130,128,125,128,126,124,124,119,125,125,127,126,125,126,123,131,131,124,125,124,122,123,123,124,128,130,131,135,132,130,128,122,118,118,127,128,130,135,134,136,127,125,124,129,133,128,134,139,138,136,134,130,117,116,122,121,134,132,131,130,129,133,128,127,132,133,129,127,127,126,123,126,129,126,120,118,124,126,125,130,140,145,141,140,127,125,126,129,133,130,132,135,131,123,122,117,122,118,110,113,113,115,126,124,123,122,119,114,113,114,109,112,107,108,111,118,114,117,110,101,106,104,118,128,126,129,139,136,126,113,101,111,130,125,119,120,103,100,116,114,101,105,113,116,128,127,114,98,96,92,76,78,82,83,81,77,81,84,81,77,86,73,78,84,76,67,61,65,67,74,69,60,57,55,53,48,55,47,59,61,55,65,66,68,65,50,53,52,47,49,51,49,51,54,51,53,48,54,50,52,50,45,50,48,56,49,55,53,49,54,51,51,49,52,51,48,51,49,48,47,52,47,47,50,45,46,48,45,44,41,48,46,43,46,46,45,45,46,39,45,48,42,40,49,43,40,51,44,45,40,42,43,40,46,44,44,42,44,46,44,42,40,45,42,38,43,44,39,44,41,39,44,42,42,43,41,41,43,46,42,39,42,38,37,41,37,37,41,40,38,38,39,39,39,40,42,40,36,37,36,38,39,37,39,38,34,36,33,36,42,39,39,38,41,39,38,36,35,44,41,40,39,39,42,34,39,41,42,40,41,37,37,36,36,41,40,44,44,47,49,55,63,59,48,54,58,60,51,46,64,78,79,67,56,61,74,82,103,98,71,76,93,87,92,124,140,146,143,133,124,116,109,103,107,134,165,187,196,177,168,181,184,183,156,64,30,25,13,28,22,27,25,23,31,22,27,26,25,27,26,26,23,28,27,29,29,29,32,29,30,28,31,39,35,39,44,40,41,44,48,53,53,55,57,59,60,62,67,71,68,61,63,66,67,93,123,120,121,133,125,132,149,151,142,154,161,150,139,127,117,121,128,122,133,139,141,142,137,142,149,154,159,161,160,156,153,159,159,159,160,158,159,158,165,167,164,163,163,163,162,165,165,162,162,162,160,162,158,156,156,154,152,151,160,159,159,156,157,163,159,158,160,162,165,163,162,164,160,152,147,146,146,150,156,155,154,150,147,145,144,152,154,151,154,152,153,149,118,92,83,95,122,146,167,173,165,163,152,150,144,147,146,155,111,14,1,9,10,14,11,13,12,13,14,14,14,19,21,23,21,25,19,23,20,21,27,18,23,24,17,24,24,23,23,24,25,21,28,23,18,27,24,21,25,21,19,17,21,19,18,20,18,18,17,19,18,22,20,17,17,22,19,16,19,18,18,21,17,16,18,19,22,19,17,19,18,19,22,19,17,23,21,20,22,17,20,20,22,22,19,21,21,23,27,26,29,31,32,35,28,30,39,43,43,38,41,46,45,44,37,31,30,35,30,31,41,35,38,38,38,34,35,36,28,33,32,34,37,36,39,39,42,36,49,53,47,45,46,45,48,54,59,61,60,54,56,59,70,86,96,110,100,87,99,78,84,80,81,86,99,94,97,89,91,100,109,118,119,122,120,120,117,115,112,107,120,122,121,125,123,127,125,123,127,125,128,126,125,120,121,123,121,128,130,136,135,131,132,131,132,132,131,131,129,131,134,134,134,130,133,130,124,125,128,138,136,136,141,141,142,145,144,136,138,140,138,142,143,148,152,145,129,116,115,125,128,127,131,136,133,124,125,136,137,134,132,135,126,126,127,125,131,133,131,129,125,130,130,138,137,134,134,129,127,125,125,128,133,129,141,134,127,122,127,127,133,137,134,138,139,141,141,138,136,135,136,136,134,120,121,122,123,131,128,129,120,112,107,99,91,93,106,109,107,102,102,111,110,106,95,92,98,94,102,119,119,112,118,122,100,86,96,107,107,113,122,122,94,102,98,90,89,88,93,94,84,81,79,84,87,108,78,74,74,90,77,80,80,69,66,73,57,55,53,55,58,55,63,57,49,49,50,54,53,54,49,52,54,51,54,50,47,47,47,50,49,55,50,49,53,51,50,49,52,50,54,52,55,52,47,49,47,47,45,48,47,47,47,44,50,47,44,46,45,46,46,42,44,43,44,45,46,48,49,43,44,49,47,45,43,45,42,44,43,45,44,41,44,40,44,43,44,40,41,46,41,42,44,47,43,43,41,40,39,41,42,40,42,44,42,42,41,37,39,41,45,42,42,43,42,38,39,40,39,42,39,40,38,37,38,39,42,36,43,39,36,43,37,41,36,35,42,35,42,37,36,37,37,36,32,38,38,36,39,43,38,38,42,38,36,41,36,34,36,38,41,39,38,39,39,35,37,37,36,42,40,44,45,41,48,51,50,51,48,52,53,54,46,48,56,54,59,55,43,52,50,55,73,61,54,44,69,62,57,75,98,106,90,78,60,97,107,102,92,104,122,142,149,129,125,138,167,173,160,92,41,24,17,30,24,27,29,23,29,25,30,27,29,29,22,32,29,26,29,22,30,33,27,29,29,36,35,34,40,41,41,44,41,44,47,49,50,56,57,56,60,62,60,66,63,59,55,55,70,93,114,120,123,124,110,111,139,145,129,144,141,149,141,139,143,140,141,149,153,152,151,148,146,146,155,166,161,160,163,152,151,162,161,158,156,151,150,149,153,159,155,152,153,154,154,151,154,151,157,156,158,164,162,159,152,148,150,149,151,153,150,151,156,157,157,157,163,160,160,161,159,156,158,159,152,147,145,147,149,146,143,146,142,141,145,147,149,145,152,154,151,150,141,122,97,82,89,115,139,157,161,160,153,147,144,144,139,152,109,14,1,10,10,12,11,14,12,12,14,13,13,20,22,19,22,23,22,23,21,24,22,21,18,26,27,22,25,24,24,22,22,22,25,22,24,24,23,24,24,22,18,20,19,18,17,17,19,19,19,20,19,16,19,20,19,15,21,22,17,19,17,18,17,19,18,19,19,16,19,21,16,20,21,18,20,19,18,21,27,17,20,24,19,21,17,24,25,23,17,24,30,27,35,31,32,33,38,40,34,37,38,38,42,43,45,41,33,41,37,35,36,32,36,35,34,36,32,30,34,32,33,36,39,37,39,39,36,40,35,38,43,44,45,44,45,49,49,52,51,51,54,57,55,71,84,95,108,89,93,93,103,113,117,132,125,130,130,125,120,122,118,118,121,118,116,115,110,107,104,106,120,127,126,116,114,110,110,108,118,124,121,123,122,127,130,130,131,133,135,133,139,135,129,128,128,129,127,125,125,130,128,127,128,130,127,127,125,128,137,137,135,124,118,123,131,136,141,139,136,134,133,139,142,143,141,132,129,136,137,142,134,128,127,133,132,133,137,133,128,129,128,127,126,128,133,133,141,141,140,141,139,141,144,140,129,122,121,118,125,129,140,142,137,137,141,141,140,139,135,130,124,125,127,125,135,143,145,147,137,145,144,139,127,124,133,128,129,131,133,112,95,100,101,95,101,111,105,93,90,86,81,86,83,87,80,86,98,92,90,101,118,118,128,130,100,75,64,71,72,75,77,75,83,96,90,93,89,83,86,89,84,72,63,75,81,75,70,60,59,60,50,52,53,50,57,55,53,48,50,54,58,70,62,61,57,53,61,49,63,69,64,68,61,63,64,59,58,63,63,55,50,60,62,59,63,54,54,51,53,55,53,54,49,52,49,47,52,52,45,46,49,48,51,50,47,49,51,49,50,48,50,40,46,53,47,53,51,47,49,47,48,47,47,50,45,44,49,43,45,45,45,48,43,42,47,47,43,44,44,46,44,42,40,42,44,46,40,41,41,42,41,39,40,43,42,38,42,42,41,37,41,42,39,41,36,39,44,43,38,38,40,37,39,36,40,39,40,37,44,44,37,39,39,43,37,35,39,40,39,38,40,36,34,42,34,38,39,38,40,38,39,38,44,43,36,40,35,33,39,36,44,32,35,41,37,43,41,40,39,42,44,48,46,41,47,49,48,53,50,47,49,52,54,54,52,42,45,46,44,51,43,46,50,53,53,51,56,49,36,46,73,81,56,45,66,97,115,108,83,77,77,95,101,85,82,83,115,144,149,108,47,22,19,27,28,29,28,25,30,28,30,32,28,29,28,29,29,27,27,29,29,32,29,36,32,32,37,38,40,34,37,44,44,44,49,48,47,57,52,50,57,59,61,62,67,57,56,59,72,90,102,116,122,116,101,117,132,127,113,116,116,114,132,162,173,164,161,165,173,162,159,153,148,151,160,164,159,158,158,158,162,166,164,160,157,151,152,153,155,157,154,147,147,147,142,144,143,142,147,148,152,160,162,157,153,153,148,146,147,148,149,146,148,151,151,153,153,153,155,153,151,152,153,156,155,152,150,149,150,139,141,143,145,147,146,142,138,142,144,146,144,147,149,137,125,101,89,90,105,132,148,162,158,160,152,147,143,150,108,14,1,10,10,12,11,14,13,13,14,13,13,21,19,22,22,19,19,23,18,20,23,23,23,21,22,24,24,21,21,23,22,25,23,28,24,21,25,27,24,19,19,16,16,19,18,18,17,18,19,19,18,19,20,21,19,20,20,16,17,17,19,19,19,21,18,18,19,17,20,21,20,21,19,21,19,19,22,20,20,25,18,19,23,20,19,23,21,24,24,21,25,27,29,28,24,27,25,23,26,31,24,25,30,29,33,31,32,30,33,33,29,31,35,34,35,28,33,38,32,36,36,36,42,37,42,42,41,41,37,37,41,42,42,43,47,45,47,51,48,48,49,50,56,67,69,75,79,78,85,73,77,101,103,113,106,115,122,126,126,135,132,124,123,122,124,127,131,123,134,135,137,131,128,128,119,117,118,120,125,118,120,116,121,130,131,133,132,130,130,130,130,128,128,129,126,123,125,129,128,131,130,123,120,123,130,127,125,128,131,130,122,118,119,122,132,133,123,126,127,121,117,129,129,129,130,132,140,136,133,131,127,124,129,134,137,133,132,132,131,130,129,127,127,135,141,139,140,135,141,143,144,159,155,153,145,135,143,146,146,149,147,142,144,143,146,138,146,140,139,139,141,139,134,131,130,132,134,139,136,137,125,115,106,103,116,126,120,121,120,111,113,113,119,119,122,112,96,94,111,122,123,113,113,107,99,103,116,101,101,109,111,114,119,116,101,106,111,89,103,95,99,88,88,90,86,90,85,62,76,71,79,48,70,74,74,45,42,64,65,55,47,55,64,68,63,70,70,68,72,78,83,77,83,78,75,80,76,68,100,87,81,80,69,83,77,81,92,90,84,73,62,65,75,74,69,60,50,56,57,51,51,55,51,47,49,52,50,51,51,51,49,55,51,46,53,48,54,53,55,55,51,53,50,54,54,51,52,53,53,51,46,42,51,48,53,50,36,46,44,46,44,43,46,45,47,41,46,43,42,44,41,44,44,45,42,42,39,40,39,40,43,41,39,33,44,42,42,43,38,38,38,37,40,41,39,40,39,41,38,37,41,37,40,43,34,40,39,35,42,36,34,46,37,36,36,33,44,38,38,39,40,38,37,41,33,38,39,39,39,37,40,42,38,38,40,35,35,42,38,35,36,37,35,37,39,37,38,41,46,41,41,44,43,50,48,50,46,50,48,52,69,68,69,63,52,45,46,53,54,52,48,54,55,52,60,66,67,65,46,60,64,88,81,71,79,87,98,104,95,103,93,97,94,92,82,66,83,102,115,94,44,28,26,27,28,33,31,29,30,29,34,30,30,30,24,28,32,33,31,26,35,29,29,33,33,37,30,37,36,33,42,37,42,44,37,46,44,48,47,50,55,55,57,57,66,54,53,55,64,78,96,109,123,121,117,128,134,131,124,118,116,123,139,160,164,163,171,172,161,148,148,143,146,153,154,155,149,158,159,158,160,162,161,160,163,157,150,152,151,154,154,150,150,152,149,154,155,152,156,155,152,159,159,163,163,159,154,145,143,154,152,145,146,143,142,148,151,153,155,152,152,148,144,149,149,150,151,150,152,141,141,148,147,150,148,144,143,142,144,144,143,136,137,141,143,131,112,95,83,102,125,144,155,161,159,154,146,154,108,14,1,9,10,13,11,13,12,13,14,13,14,21,21,17,23,22,20,21,21,19,20,18,21,25,24,27,20,24,21,21,29,23,24,24,26,22,23,26,21,20,17,17,18,17,19,21,17,20,17,17,20,18,19,19,20,19,19,19,18,21,20,17,19,19,19,20,19,18,21,24,20,17,22,23,20,22,18,22,22,21,21,17,24,22,20,29,21,22,22,23,24,23,23,21,25,21,24,27,27,27,22,23,27,32,24,26,32,30,29,27,31,30,30,34,34,34,29,29,32,35,37,39,42,40,43,45,43,46,46,44,43,45,44,44,46,46,46,48,45,46,46,46,53,66,59,63,69,74,91,87,95,93,91,93,87,95,103,119,134,141,141,137,137,128,132,141,139,141,141,137,137,132,130,130,127,134,144,144,142,138,136,139,134,132,127,127,133,132,133,128,128,124,124,130,134,140,139,143,142,146,144,134,132,134,137,136,128,126,122,115,116,116,123,132,136,135,131,135,130,131,131,129,125,121,129,134,135,130,119,127,135,141,143,144,134,135,141,145,141,130,133,136,137,141,149,141,137,134,141,148,140,142,144,147,146,146,151,144,146,142,134,134,130,131,132,131,131,132,136,141,144,144,141,134,128,127,125,132,127,133,130,125,122,123,127,120,125,119,125,133,143,144,138,141,144,136,126,124,139,141,136,143,148,141,122,123,134,129,114,98,100,101,100,94,90,100,98,105,108,122,124,103,97,93,93,98,93,95,99,94,83,91,89,82,77,73,81,79,79,79,72,73,76,86,95,101,94,89,90,92,81,77,76,81,64,64,83,76,77,57,61,72,71,82,72,78,89,90,88,86,67,64,67,68,69,61,57,60,58,47,54,53,49,51,49,55,46,53,54,50,54,49,57,50,54,53,48,53,51,55,52,52,50,54,50,53,48,50,53,46,48,46,43,46,45,44,42,42,46,48,49,46,45,42,46,43,46,43,39,46,41,46,46,42,42,41,43,43,40,42,40,42,37,41,41,39,39,39,41,37,42,37,41,45,39,46,41,37,40,41,39,36,40,39,41,36,37,41,37,41,38,41,38,37,42,33,37,41,41,39,33,39,38,36,35,41,39,35,41,36,41,44,37,40,41,39,39,39,41,36,36,38,41,39,37,40,41,39,42,41,40,39,44,51,44,46,47,47,45,59,62,56,65,59,55,48,49,61,65,54,59,60,53,66,71,81,95,81,73,73,71,95,102,93,84,69,78,96,104,106,92,93,85,87,86,59,60,65,83,80,45,35,27,26,32,32,29,30,34,32,30,32,30,33,29,32,34,30,32,31,29,31,31,31,32,32,35,37,36,36,39,41,42,41,43,44,41,47,47,48,51,50,57,57,61,56,53,53,62,70,68,99,116,120,127,132,125,130,139,129,125,138,149,150,149,157,164,153,146,137,145,152,155,154,152,151,147,152,158,160,156,155,154,158,165,161,155,153,158,159,164,162,159,157,153,162,169,163,158,155,154,152,155,160,163,168,168,162,152,155,156,150,151,148,150,153,153,157,160,162,162,160,157,155,150,149,145,146,151,148,151,149,150,150,149,151,150,146,149,145,139,139,137,139,146,146,136,114,88,85,94,122,142,161,162,162,153,158,112,14,1,9,10,13,11,13,12,13,14,14,14,18,21,20,26,24,22,21,23,24,17,22,22,22,22,21,24,25,22,24,24,24,27,24,28,24,24,24,18,22,19,19,18,21,22,17,19,17,19,17,19,18,19,22,19,18,20,19,22,21,18,21,19,20,19,20,18,20,21,19,21,21,19,23,23,19,23,19,26,22,21,25,18,21,22,23,21,24,23,23,21,21,22,22,27,24,26,30,25,29,27,33,30,29,32,31,35,33,39,34,28,39,38,33,35,36,39,39,44,45,39,54,46,50,54,47,50,52,52,56,54,50,57,49,56,53,45,55,51,55,53,67,70,77,76,88,84,82,89,110,111,107,92,92,90,91,86,99,110,117,120,123,125,122,127,125,127,128,127,130,131,125,123,133,135,143,147,145,139,128,137,136,130,133,125,127,130,124,122,123,123,118,127,120,133,136,147,151,153,153,152,138,149,134,139,134,133,130,129,123,127,128,131,140,141,137,139,131,136,134,131,136,128,131,131,131,130,125,132,139,139,139,142,149,145,141,147,145,137,141,137,132,129,132,125,129,124,128,130,122,124,121,122,130,132,136,125,128,127,122,120,106,107,109,119,124,125,119,115,127,135,126,144,135,118,115,122,132,113,127,134,131,132,128,121,122,119,120,123,131,151,131,122,131,130,131,126,120,124,126,136,139,133,128,126,126,120,113,99,95,94,91,89,80,79,61,87,90,86,85,102,83,85,82,88,114,78,106,98,104,108,102,94,68,65,78,72,84,67,73,69,66,66,70,80,89,75,70,72,73,50,51,60,53,54,52,67,53,49,55,57,58,60,66,62,61,65,72,70,70,57,59,64,65,70,62,62,58,64,71,61,57,49,50,53,52,56,48,49,53,45,50,49,50,50,45,49,44,48,50,48,50,49,43,51,47,42,47,45,48,45,43,46,42,46,44,44,49,43,43,47,49,44,44,44,42,46,41,47,44,39,46,39,44,44,39,45,44,41,40,42,38,46,38,39,42,36,41,38,42,42,38,42,38,40,40,38,44,37,41,42,33,39,39,36,41,39,37,40,35,41,38,39,41,37,40,40,42,34,41,44,37,40,40,42,41,41,39,39,43,40,44,44,39,41,38,44,41,39,41,38,43,39,42,40,40,40,37,42,40,39,41,40,42,46,43,44,46,45,44,45,47,52,49,47,50,55,63,60,54,57,61,59,67,67,74,88,71,63,66,69,80,80,81,85,68,65,73,78,84,55,54,60,61,63,54,50,55,66,65,48,37,24,32,34,29,29,35,29,31,36,30,32,31,29,29,28,30,34,33,30,30,34,32,31,35,29,38,33,30,42,38,41,42,39,44,45,46,48,48,50,55,54,59,60,64,58,59,73,79,68,77,80,96,107,105,104,111,132,146,122,132,141,133,134,133,148,145,147,147,153,157,163,154,149,148,145,147,151,153,152,153,148,154,159,159,159,156,148,152,155,153,157,154,156,159,162,162,155,151,148,151,148,152,153,154,155,158,154,152,150,147,151,151,150,153,152,158,155,160,157,158,159,159,158,155,152,147,151,147,149,152,145,148,150,149,147,147,149,147,148,141,144,142,143,145,139,129,106,86,75,90,115,141,152,160,160,163,111,14,1,10,10,12,11,14,12,13,14,14,13,21,22,20,19,19,19,26,23,19,23,21,24,19,27,26,19,24,27,21,19,24,29,25,23,22,23,24,21,23,19,18,18,17,20,20,17,19,18,17,21,17,23,21,21,20,16,21,19,17,20,22,18,18,19,21,20,21,19,23,23,22,21,21,23,21,21,19,19,24,21,23,22,19,25,27,18,24,22,21,24,23,22,25,30,28,34,29,28,37,30,22,27,30,31,38,27,34,33,29,34,37,37,38,42,43,46,45,47,50,49,52,53,57,56,53,55,51,56,56,51,48,46,51,53,57,54,61,67,79,88,94,92,92,97,97,91,81,77,80,93,89,89,84,80,81,79,87,92,100,105,114,132,133,131,132,125,130,124,125,127,127,126,130,134,130,129,120,120,117,114,122,122,125,122,121,126,122,131,128,134,135,131,137,134,133,127,131,133,131,134,140,141,132,127,127,134,136,136,136,137,141,139,143,139,134,131,131,128,121,127,132,131,128,127,122,125,132,131,137,135,142,145,147,149,139,130,130,135,134,131,123,120,126,138,137,131,125,123,119,115,117,128,137,131,127,122,124,133,127,118,117,118,126,129,122,118,110,109,116,130,145,150,139,125,125,113,105,97,105,112,110,108,110,106,100,117,125,136,137,129,115,105,107,106,124,124,117,124,122,131,131,124,114,93,87,92,96,98,100,98,96,94,81,84,92,88,83,68,76,80,71,75,69,78,71,54,57,60,73,78,96,83,55,55,49,62,57,53,65,64,66,70,69,75,72,64,67,64,61,60,61,57,58,58,57,58,54,62,57,55,62,60,60,57,53,56,57,55,55,50,56,54,61,65,61,60,61,64,61,62,55,59,56,55,53,54,53,47,56,50,49,49,48,54,51,46,46,45,44,46,52,42,46,49,47,45,46,46,48,50,46,51,50,45,47,48,50,44,44,49,48,45,48,47,44,44,42,46,42,41,46,43,39,44,46,45,45,47,40,38,46,43,39,42,39,42,45,39,43,44,39,41,41,38,43,39,40,46,37,42,41,43,41,42,45,39,44,41,45,43,38,45,43,43,45,46,48,42,44,46,48,47,46,43,45,45,43,47,43,40,38,42,42,40,39,43,39,39,38,37,39,39,42,40,36,39,39,41,45,42,44,43,40,43,46,44,45,49,50,48,55,60,48,47,54,61,61,55,54,57,57,53,57,62,62,65,58,53,62,67,66,57,73,93,80,64,58,60,71,63,59,45,48,61,53,57,51,51,53,50,41,32,34,32,34,35,35,35,31,34,33,32,32,29,37,35,33,34,32,35,29,34,36,32,32,33,39,33,37,41,36,44,44,43,43,42,47,49,48,52,53,57,62,65,67,64,63,80,86,67,61,67,78,90,98,87,96,125,130,118,127,139,127,127,139,146,145,154,162,164,171,170,153,146,143,143,145,145,145,145,152,148,149,148,145,146,147,150,149,153,154,150,152,151,157,165,165,160,159,159,158,154,149,146,140,141,151,150,151,148,148,154,155,152,150,155,157,157,153,150,153,150,155,160,163,154,150,153,153,158,156,155,156,153,152,148,139,141,142,139,144,147,146,147,143,144,141,132,115,91,81,79,103,126,149,152,167,117,12,2,10,10,13,11,14,12,13,14,14,13,22,19,21,23,20,22,20,21,22,20,25,23,22,25,25,24,21,25,24,23,21,24,25,22,25,26,25,19,19,19,21,20,18,18,17,21,19,19,18,20,20,17,17,21,20,17,19,22,19,19,23,19,17,21,19,18,21,22,17,21,21,18,21,21,20,20,21,24,24,19,23,26,24,20,21,21,20,23,21,26,29,24,24,27,22,29,31,27,29,29,32,26,25,26,30,27,25,34,29,36,31,29,34,41,39,42,48,48,50,54,55,50,61,55,56,53,46,49,44,49,57,56,49,48,49,55,73,88,85,99,86,87,83,85,84,82,76,75,82,83,79,95,76,97,93,100,104,111,116,120,123,128,131,138,140,139,134,132,129,129,126,123,127,126,129,125,125,122,121,129,134,134,139,138,139,137,134,131,137,135,141,141,141,140,125,129,128,118,118,129,117,137,122,112,108,120,129,122,129,135,137,132,131,129,127,118,121,135,131,137,132,141,140,137,140,141,134,133,141,142,135,127,120,134,129,121,136,138,139,129,123,132,130,133,132,147,132,137,134,137,137,139,135,132,130,131,139,142,145,137,130,128,122,122,123,124,122,119,120,125,132,137,132,133,128,121,117,116,125,124,127,127,114,122,136,133,133,142,132,122,114,109,123,129,127,125,131,117,123,122,129,133,127,108,116,122,126,128,129,120,136,137,132,120,122,117,112,96,96,94,91,97,92,77,80,68,69,70,74,94,95,93,84,72,67,80,77,76,93,102,101,95,91,81,91,83,84,86,65,79,56,59,66,69,76,77,69,66,59,55,57,60,55,58,60,54,53,51,54,54,51,51,55,63,58,59,53,52,54,57,61,59,60,58,55,60,60,55,57,53,51,51,53,52,52,53,49,48,50,47,48,47,46,49,48,47,47,47,42,50,49,47,53,47,47,47,46,47,46,49,45,46,48,48,43,45,43,43,45,44,47,41,43,45,45,45,45,45,40,39,43,41,43,43,43,42,40,40,41,42,39,44,43,36,43,49,43,40,44,42,42,44,40,44,40,40,40,40,43,41,42,46,43,42,46,44,46,45,43,42,43,46,47,42,47,44,41,39,39,41,42,41,40,41,41,41,43,37,37,38,37,39,39,41,41,36,39,41,36,40,41,37,41,44,47,48,53,48,52,54,53,57,46,48,53,53,58,56,50,51,53,51,52,58,59,59,52,49,64,70,66,68,84,90,87,85,75,91,86,76,84,56,48,63,57,57,47,51,48,39,43,36,32,34,29,34,36,31,36,33,35,36,35,35,31,29,39,39,35,30,33,34,39,36,36,38,37,37,37,42,39,42,48,41,43,43,45,51,49,54,50,53,57,56,61,60,60,63,63,62,59,57,65,86,101,95,94,127,144,130,140,152,138,137,145,149,155,160,162,168,174,169,161,154,149,144,158,158,162,158,160,160,158,146,139,138,152,157,159,158,152,139,137,144,157,162,168,165,162,164,159,157,158,157,148,144,140,145,150,151,149,155,155,148,147,150,152,149,143,141,139,145,148,152,152,154,153,152,157,157,154,154,158,153,150,144,141,146,141,139,139,140,142,142,146,147,152,154,145,126,96,79,83,93,125,143,162,114,14,2,10,11,13,11,13,12,13,14,14,14,19,19,17,21,20,19,24,19,21,19,22,24,27,31,24,22,25,21,27,19,24,28,25,26,25,25,19,20,20,19,20,16,17,18,16,24,18,19,23,17,23,18,17,22,22,18,16,22,19,20,23,17,21,20,21,19,20,22,17,19,19,19,25,24,18,21,19,24,24,17,27,21,19,27,22,23,24,29,27,28,35,24,25,28,24,26,27,27,27,32,24,27,32,26,29,29,30,31,32,31,27,33,30,35,35,41,46,38,50,45,44,51,49,53,56,56,59,63,64,70,76,79,81,78,77,81,87,84,87,72,74,82,77,79,79,84,80,80,86,91,103,114,121,121,123,125,122,126,128,125,120,115,115,111,118,119,125,125,124,125,126,131,126,131,137,139,151,152,145,142,146,146,153,155,145,141,130,131,125,123,124,122,130,129,129,127,128,133,132,128,135,140,141,133,120,117,126,131,131,135,135,128,127,131,132,133,137,145,144,149,145,139,146,147,151,148,151,149,145,139,134,131,122,129,129,132,141,142,137,136,130,125,135,126,122,107,107,126,134,133,119,118,123,127,141,142,146,146,144,142,138,136,125,131,136,143,137,123,128,122,121,123,125,129,130,129,127,139,144,135,137,132,134,139,145,148,136,131,120,120,129,148,146,130,131,128,124,116,114,131,140,138,141,142,150,153,154,150,132,136,146,155,148,146,141,135,133,120,117,108,112,119,113,103,96,89,101,84,71,80,90,100,87,84,80,87,93,97,101,99,88,80,78,69,72,80,87,74,65,60,55,59,61,71,71,61,71,68,63,71,66,59,52,61,70,73,63,59,57,58,53,53,59,54,50,50,56,48,51,52,56,59,48,57,56,51,51,55,50,48,54,51,53,45,53,54,52,51,49,51,51,51,49,49,45,51,52,45,46,48,45,48,44,45,47,43,43,48,46,44,45,41,45,43,45,44,42,43,42,49,47,43,42,42,47,46,46,45,44,44,48,43,43,43,43,43,40,42,43,44,46,49,43,42,43,41,41,41,34,40,41,41,42,37,40,38,42,41,41,41,39,46,40,41,41,43,41,42,42,40,43,43,43,41,44,43,40,42,41,40,40,36,42,41,36,43,38,39,38,36,42,36,40,38,35,43,41,40,42,36,40,42,43,45,46,54,46,46,42,46,49,40,44,44,41,45,47,45,50,45,51,49,52,57,61,63,53,55,67,72,64,60,77,81,89,101,88,90,94,84,72,48,55,68,53,48,49,48,44,45,43,38,29,32,38,35,37,36,37,31,38,36,35,42,34,33,36,35,37,39,32,34,40,35,37,33,38,41,39,39,43,45,45,43,45,47,46,50,52,55,57,51,53,55,53,55,57,53,54,52,59,53,52,80,97,89,92,129,154,138,148,160,145,143,146,154,164,165,161,159,171,179,172,166,160,166,174,170,173,177,175,170,166,160,152,154,161,165,170,166,154,136,137,148,157,159,162,161,159,159,161,164,167,169,157,151,151,150,153,154,154,154,155,151,146,150,152,152,144,136,142,145,146,147,153,153,151,151,150,152,152,155,154,155,154,151,152,152,150,147,141,141,141,142,145,144,149,152,157,151,135,109,91,81,95,112,143,112,16,2,10,10,14,12,13,12,13,14,14,14,19,19,21,19,19,21,21,21,20,23,24,19,25,23,22,25,22,24,23,21,23,25,25,19,24,26,21,24,20,19,16,19,21,16,17,20,18,19,21,18,19,18,22,22,21,20,21,20,18,22,20,20,22,22,24,22,22,27,22,22,25,21,21,21,24,22,27,25,20,21,20,28,19,22,28,26,22,22,23,24,30,24,27,25,23,32,28,28,31,29,27,29,31,33,35,29,34,36,36,36,30,33,41,38,45,47,47,48,51,50,45,49,47,53,60,77,103,114,77,117,108,105,103,103,78,108,74,70,75,65,95,85,117,96,94,97,96,96,108,121,131,132,130,128,123,128,123,120,120,121,121,120,126,121,117,121,122,125,126,130,133,140,143,145,145,146,142,147,142,134,127,119,116,124,126,130,135,130,128,119,118,122,130,133,128,132,141,135,146,148,156,162,141,157,153,152,143,136,141,146,141,139,137,138,149,151,153,155,151,148,138,146,150,143,137,132,136,142,143,137,139,141,143,139,130,128,132,130,131,131,130,135,139,113,128,120,131,133,134,135,132,137,141,141,142,141,143,141,137,139,138,136,136,141,152,140,141,150,148,145,143,138,134,131,132,136,141,140,128,123,115,121,119,132,147,136,121,117,111,117,125,125,122,117,118,122,119,124,131,117,115,116,129,136,137,128,122,122,127,122,120,123,117,114,108,104,104,93,96,91,93,90,90,94,98,88,96,85,69,70,68,66,72,77,77,80,82,82,72,78,69,44,65,50,52,48,50,53,53,56,50,60,60,52,57,60,62,69,66,63,66,59,54,59,64,68,76,66,64,61,57,60,55,54,51,46,53,56,52,57,54,57,53,47,51,48,50,53,50,49,49,46,48,50,51,52,47,53,50,47,53,53,51,47,50,52,48,48,50,47,42,45,43,41,41,46,48,42,42,46,45,45,44,43,44,43,45,42,46,40,47,46,44,43,43,45,44,46,45,49,47,43,44,44,41,41,47,44,42,41,45,44,42,42,40,42,42,40,42,42,39,41,41,41,39,42,42,39,37,38,41,36,39,41,37,39,39,36,39,42,38,36,42,42,42,38,38,41,36,35,38,37,33,38,37,36,39,36,42,42,41,40,37,41,34,42,43,39,42,48,47,41,43,43,44,47,44,44,47,41,42,46,42,46,42,50,53,45,46,45,51,56,62,61,57,57,57,58,66,63,66,72,84,76,77,79,57,56,63,58,56,54,56,61,56,55,45,45,42,35,42,42,34,32,34,36,37,35,34,36,39,34,39,36,35,35,33,41,37,38,36,39,36,34,37,38,37,39,41,43,43,44,46,43,48,45,48,50,49,55,56,52,53,53,51,53,53,55,60,51,58,61,55,71,82,75,78,116,137,121,134,147,140,143,147,151,164,163,150,145,157,174,177,171,161,163,164,167,173,170,173,160,164,163,160,160,159,160,165,163,157,151,148,149,156,158,157,153,152,148,150,161,159,158,157,151,149,152,154,155,153,149,154,155,153,153,151,153,147,142,146,148,149,151,148,151,150,141,148,149,147,151,149,151,153,152,150,151,150,145,146,141,140,142,137,143,139,144,147,147,147,132,119,90,82,87,111,103,19,2,12,10,14,11,14,12,12,14,14,13,20,17,23,19,19,19,19,22,24,21,18,25,24,22,22,24,24,18,22,22,23,24,24,30,28,21,23,24,20,20,16,17,19,19,17,19,18,19,21,19,23,15,19,22,19,22,17,23,22,16,21,22,19,22,22,24,20,19,27,21,25,24,26,24,20,29,22,24,24,25,22,24,27,24,29,29,24,25,26,27,26,24,29,25,25,25,29,26,28,33,29,31,31,36,31,23,35,30,30,30,34,34,33,36,42,45,45,48,48,51,52,54,53,59,66,70,70,69,76,71,71,66,65,68,69,62,53,51,59,55,65,79,87,102,101,105,110,115,133,139,141,134,134,131,131,129,125,122,121,124,124,131,136,136,138,133,133,128,132,135,136,143,138,143,143,133,130,125,113,111,114,114,121,129,127,135,132,130,130,133,134,137,141,136,140,141,135,137,138,136,136,132,135,142,149,147,151,149,141,136,131,130,127,131,133,131,131,135,143,153,152,155,155,142,134,127,129,127,122,124,130,130,127,127,130,137,129,130,129,136,140,131,145,136,141,144,150,149,136,138,141,144,144,140,136,130,140,141,138,141,143,142,133,135,132,137,144,147,152,147,137,126,132,137,146,153,148,145,136,126,135,141,148,150,145,144,143,145,145,136,117,101,94,99,101,107,119,128,118,101,93,88,93,111,118,96,84,100,108,105,92,92,87,85,80,80,84,73,85,81,69,67,74,88,102,92,97,97,81,76,61,56,60,73,81,78,75,78,69,66,61,51,61,48,46,43,47,55,57,55,57,59,55,58,59,50,50,54,55,64,66,63,57,59,54,62,66,58,57,57,59,61,60,57,54,57,59,57,62,56,61,59,54,55,54,61,63,63,65,59,60,59,60,60,53,47,53,54,52,53,52,51,51,53,51,51,50,53,45,49,48,45,50,47,44,46,47,43,48,48,45,46,47,44,43,43,42,43,42,44,44,44,39,41,45,44,46,36,39,42,41,45,42,43,44,45,43,43,40,41,47,42,44,42,47,41,41,47,42,44,42,47,45,43,42,39,43,43,41,39,35,44,43,40,39,39,39,37,41,39,41,40,37,39,43,40,43,41,33,40,39,41,42,37,41,42,40,40,40,47,43,43,46,41,38,45,49,41,47,45,46,47,44,49,50,46,47,53,56,49,49,47,45,47,55,62,63,57,52,55,55,59,69,61,50,60,57,58,63,54,69,87,93,76,66,59,50,57,56,55,64,60,56,56,49,56,48,42,48,50,45,42,39,37,32,35,40,33,35,38,34,37,39,40,33,36,41,45,40,38,37,37,38,34,42,37,37,40,46,46,39,44,47,47,44,49,48,41,56,54,55,56,53,55,55,57,51,61,69,54,71,66,63,66,57,59,67,105,123,108,118,130,136,145,141,145,160,160,139,131,144,155,168,172,159,149,151,154,160,158,152,150,152,159,156,157,155,153,160,159,160,153,149,154,154,150,149,146,142,143,147,143,141,146,145,151,151,154,158,159,160,157,162,165,160,156,155,154,152,150,149,152,150,148,153,155,158,155,152,153,154,154,149,149,151,151,151,153,151,150,151,145,146,142,142,144,141,139,142,142,144,144,136,117,92,75,86,86,23,3,12,10,14,12,14,13,12,14,14,14,20,19,24,21,19,22,17,20,23,19,21,27,24,22,25,29,24,19,21,24,27,24,25,23,25,31,24,24,19,17,20,19,21,18,21,19,17,24,19,19,22,17,24,19,19,27,20,19,19,23,27,17,24,23,23,22,24,24,17,21,24,23,26,23,22,25,22,23,23,25,21,24,25,28,26,23,32,29,26,29,26,25,36,35,34,33,30,36,33,33,34,33,31,33,33,32,38,33,32,38,35,36,35,42,45,41,45,47,50,52,55,51,52,55,56,55,53,54,56,53,53,48,55,52,67,51,44,44,41,51,44,51,60,76,94,96,113,122,124,132,136,136,146,149,157,160,144,148,145,150,150,145,144,139,141,143,139,139,139,137,131,134,127,130,129,127,132,122,122,122,124,133,141,142,135,130,135,141,148,148,148,144,141,141,141,141,135,130,127,122,120,121,126,131,135,139,139,135,136,141,140,141,135,129,122,116,118,122,131,142,147,148,145,142,141,143,143,141,132,130,139,138,136,139,144,143,145,141,142,144,143,139,132,131,133,139,144,137,130,127,121,124,128,123,131,134,142,148,145,148,142,135,130,127,127,131,129,135,139,131,127,128,125,129,138,131,142,145,151,154,150,148,133,128,141,145,134,127,127,124,119,112,118,118,113,122,128,122,111,114,120,114,104,98,86,96,84,77,92,92,99,99,90,86,92,101,101,92,107,108,103,110,94,106,95,82,82,79,109,89,71,60,52,51,59,62,65,71,73,69,68,77,77,72,60,65,74,75,70,64,62,65,63,62,59,55,60,60,56,56,58,55,50,63,61,54,60,54,59,52,58,60,61,61,54,59,60,57,55,55,59,52,50,54,50,55,56,57,62,59,55,55,61,56,53,54,51,49,51,55,55,53,51,51,47,51,51,47,50,52,47,51,50,47,49,50,49,48,45,46,47,47,46,41,46,43,40,45,46,41,45,44,42,42,42,43,40,45,40,41,44,38,48,42,40,42,39,45,42,42,44,39,40,39,40,40,39,38,43,44,41,40,46,41,38,40,40,42,39,39,43,42,41,45,42,38,41,39,37,39,43,39,41,42,44,45,36,42,41,42,46,46,46,43,41,50,46,43,44,44,47,45,42,43,44,43,44,43,43,46,44,49,49,45,50,50,50,49,50,49,53,54,50,58,51,59,61,54,55,47,52,58,57,50,50,49,56,55,50,51,59,70,76,66,59,50,56,53,56,67,63,57,56,61,58,59,57,47,48,45,42,39,36,37,38,34,33,37,37,37,36,38,38,39,35,41,38,38,39,37,41,35,33,42,38,36,39,43,41,39,42,41,42,42,45,47,45,44,47,49,45,47,57,47,51,54,55,58,73,64,76,72,80,80,74,73,94,128,130,113,122,125,132,139,134,141,160,167,153,138,141,141,164,171,155,147,144,147,151,148,146,149,138,148,152,146,151,154,157,153,148,153,151,149,148,147,141,141,144,144,145,149,143,143,148,148,150,156,159,160,162,158,157,163,163,157,157,160,152,149,148,148,149,150,151,153,154,154,156,153,151,155,150,151,150,148,153,153,151,153,152,148,148,151,146,146,139,145,142,143,150,143,145,131,115,96,80,71,23,4,12,11,15,13,13,14,13,14,14,14,21,23,21,21,20,23,22,21,22,24,26,19,24,26,24,23,24,27,23,28,27,23,24,29,23,29,29,24,23,15,19,17,19,20,17,22,21,19,18,20,20,18,23,20,22,21,19,24,18,21,25,23,23,23,21,19,24,21,22,25,24,24,23,26,23,21,22,27,30,21,25,22,21,27,27,26,25,31,30,34,38,39,39,45,41,37,39,36,39,36,37,42,38,41,40,42,39,39,45,36,41,47,44,41,46,45,45,50,50,51,53,49,56,50,44,44,45,53,50,47,44,43,53,64,79,68,74,68,63,59,48,50,54,77,95,101,103,99,104,114,122,122,127,131,141,151,157,150,146,137,128,130,126,137,139,135,134,131,139,135,128,129,122,125,130,130,132,132,132,134,136,139,137,137,137,137,137,142,147,147,142,141,141,143,139,137,136,133,136,132,132,136,137,137,131,129,128,131,130,135,134,134,128,128,129,122,120,117,122,127,129,133,129,130,133,137,144,146,147,139,146,150,151,149,148,150,140,140,137,142,136,129,141,137,145,146,145,139,128,130,131,137,140,142,143,141,141,137,143,147,141,131,124,132,130,124,122,121,126,141,142,128,122,117,113,110,114,120,118,113,110,102,97,97,96,93,89,90,83,86,103,128,138,130,124,124,124,127,129,142,150,135,128,127,125,111,99,98,104,113,116,114,100,95,111,125,117,108,126,133,129,121,109,104,84,64,56,50,74,95,82,72,71,64,64,59,64,76,73,72,73,68,95,90,88,88,83,77,64,59,54,61,59,65,73,79,73,63,51,54,60,63,64,66,58,54,55,47,54,54,51,55,63,53,52,57,52,51,56,55,47,50,53,53,53,53,53,51,57,53,51,50,54,50,51,55,49,52,53,53,48,52,57,49,48,51,49,53,52,50,50,48,50,54,53,49,47,52,46,49,51,48,44,42,48,53,43,46,50,48,47,48,49,48,46,47,44,41,48,45,43,42,44,39,43,41,40,45,43,40,39,39,36,44,37,41,42,41,44,39,45,45,45,47,43,42,46,42,48,46,43,39,38,46,39,42,41,35,43,38,39,38,38,45,41,43,40,42,49,43,44,46,44,45,40,44,45,46,43,44,45,46,43,40,46,43,44,45,45,47,41,45,49,48,49,50,49,49,48,51,52,53,53,54,50,44,54,54,48,48,45,49,48,52,47,47,47,48,51,46,48,51,58,54,55,57,50,53,59,60,55,54,53,59,60,58,53,53,48,44,41,38,37,37,38,34,33,35,39,35,37,39,36,39,37,36,39,41,36,36,39,36,39,39,39,39,39,42,41,39,41,44,42,44,48,42,42,42,45,49,44,46,49,53,53,55,49,60,81,78,84,78,91,100,102,121,126,137,138,128,141,132,131,136,132,146,166,179,171,150,144,148,164,171,159,150,152,151,153,156,153,151,145,151,147,141,150,153,157,150,148,153,151,151,158,153,148,148,148,152,160,164,160,160,159,161,156,156,155,158,163,155,155,158,157,161,166,163,160,164,160,155,153,152,158,154,159,162,158,155,152,150,148,149,149,146,145,144,148,146,145,145,144,146,146,146,146,148,150,146,147,150,148,142,138,123,99,77,21,3,11,11,14,12,12,13,13,14,14,13,23,23,22,23,21,21,20,23,23,25,26,22,25,24,24,24,26,29,26,26,24,25,26,24,28,22,23,29,21,21,24,19,19,19,21,23,19,21,22,19,21,24,26,24,19,24,24,22,24,20,23,25,21,24,25,23,23,26,27,24,25,29,27,25,28,28,30,27,31,27,25,29,29,26,29,24,27,31,26,33,31,34,32,32,37,33,36,40,42,41,39,43,50,41,46,46,49,53,45,51,55,53,53,65,73,74,84,78,76,81,92,110,112,119,127,131,134,137,130,126,115,110,108,88,93,80,80,83,89,104,105,107,96,98,96,91,94,92,96,94,99,99,102,98,89,97,101,107,114,118,128,134,136,139,135,137,131,135,142,140,148,148,149,145,137,137,139,139,136,139,141,141,137,134,141,136,137,139,136,135,138,136,136,141,147,153,151,139,133,136,134,139,143,142,134,137,143,150,155,150,148,148,147,146,152,150,146,151,155,154,148,146,148,154,158,160,152,145,145,136,138,125,124,127,120,125,121,131,134,137,139,139,137,130,131,129,129,127,134,146,147,155,157,157,156,155,158,140,126,122,132,145,151,155,153,153,151,147,131,121,131,134,141,152,150,145,145,155,156,149,141,134,145,154,149,142,140,144,142,136,122,120,126,124,131,131,131,134,121,106,115,119,124,132,120,108,115,132,137,129,130,131,132,134,127,124,127,131,130,122,112,96,96,113,120,125,138,128,112,83,69,82,93,101,100,101,102,97,102,106,95,81,85,76,66,53,49,63,65,72,71,66,59,70,66,61,65,66,76,81,78,78,72,71,63,64,79,80,87,79,72,62,73,69,55,55,50,51,55,50,51,50,49,57,51,53,52,45,48,47,51,48,48,53,53,47,51,55,41,46,53,47,49,49,45,49,49,51,53,49,51,54,51,51,48,51,52,52,53,53,55,56,53,47,49,49,55,53,53,46,50,50,50,50,53,49,45,50,51,44,49,47,44,46,45,46,41,48,44,44,45,46,44,43,43,43,45,43,46,44,45,42,48,45,45,51,42,43,42,43,46,45,44,45,43,41,41,46,43,43,49,45,42,42,46,49,45,45,45,46,45,41,44,47,43,43,46,44,44,45,46,47,42,45,42,38,46,45,47,45,50,47,41,44,49,45,46,46,46,49,43,47,46,42,44,47,45,46,48,47,48,47,45,48,48,48,49,51,47,48,54,50,54,51,56,53,50,59,53,55,56,58,56,49,51,48,49,49,50,45,39,43,42,38,41,41,41,39,39,40,40,37,36,38,42,37,40,41,38,42,38,42,39,40,44,37,42,39,44,46,41,40,46,51,41,45,48,45,50,43,48,48,49,54,56,63,61,74,69,62,77,78,87,91,99,119,113,101,93,113,146,135,139,147,153,162,162,163,168,172,174,159,151,157,165,165,163,164,160,160,160,160,158,162,162,162,170,163,165,164,155,159,152,145,152,158,162,159,152,155,155,152,151,150,156,160,158,158,150,148,155,157,152,144,142,146,148,151,148,149,148,146,152,150,151,150,152,152,149,149,148,149,147,154,155,147,146,143,141,139,134,141,139,139,137,129,130,131,131,134,136,139,144,141,145,139,143,105,15,1,10,10,13,12,14,12,13,14,14,13,23,23,22,23,21,21,20,23,23,25,26,22,25,24,24,24,26,29,26,26,24,25,26,24,28,22,23,29,21,21,24,19,19,19,21,23,19,21,22,19,21,24,26,24,19,24,24,22,24,20,23,25,21,24,25,23,23,26,27,24,25,29,27,25,28,28,30,27,31,27,25,29,29,26,29,24,27,31,26,33,31,34,32,32,37,33,36,40,42,41,39,43,50,41,46,46,49,53,45,51,55,53,53,65,73,74,84,78,76,81,92,110,112,119,127,131,134,137,130,126,115,110,108,88,93,80,80,83,89,104,105,107,96,98,96,91,94,92,96,94,99,99,102,98,89,97,101,107,114,118,128,134,136,139,135,137,131,135,142,140,148,148,149,145,137,137,139,139,136,139,141,141,137,134,141,136,137,139,136,135,138,136,136,141,147,153,151,139,133,136,134,139,143,142,134,137,143,150,155,150,148,148,147,146,152,150,146,151,155,154,148,146,148,154,158,160,152,145,145,136,138,125,124,127,120,125,121,131,134,137,139,139,137,130,131,129,129,127,134,146,147,155,157,157,156,155,158,140,126,122,132,145,151,155,153,153,151,147,131,121,131,134,141,152,150,145,145,155,156,149,141,134,145,154,149,142,140,144,142,136,122,120,126,124,131,131,131,134,121,106,115,119,124,132,120,108,115,132,137,129,130,131,132,134,127,124,127,131,130,122,112,96,96,113,120,125,138,128,112,83,69,82,93,101,100,101,102,97,102,106,95,81,85,76,66,53,49,63,65,72,71,66,59,70,66,61,65,66,76,81,78,78,72,71,63,64,79,80,87,79,72,62,73,69,55,55,50,51,55,50,51,50,49,57,51,53,52,45,48,47,51,48,48,53,53,47,51,55,41,46,53,47,49,49,45,49,49,51,53,49,51,54,51,51,48,51,52,52,53,53,55,56,53,47,49,49,55,53,53,46,50,50,50,50,53,49,45,50,51,44,49,47,44,46,45,46,41,48,44,44,45,46,44,43,43,43,45,43,46,44,45,42,48,45,45,51,42,43,42,43,46,45,44,45,43,41,41,46,43,43,49,45,42,42,46,49,45,45,45,46,45,41,44,47,43,43,46,44,44,45,46,47,42,45,42,38,46,45,47,45,50,47,41,44,49,45,46,46,46,49,43,47,46,42,44,47,45,46,48,47,48,47,45,48,48,48,49,51,47,48,54,50,54,51,56,53,50,59,53,55,56,58,56,49,51,48,49,49,50,45,39,43,42,38,41,41,41,39,39,40,40,37,36,38,42,37,40,41,38,42,38,42,39,40,44,37,42,39,44,46,41,40,46,51,41,45,48,45,50,43,48,48,49,54,56,63,61,74,69,62,77,78,87,91,99,119,113,101,93,113,146,135,139,147,153,162,162,163,168,172,174,159,151,157,165,165,163,164,160,160,160,160,158,162,162,162,170,163,165,164,155,159,152,145,152,158,162,159,152,155,155,152,151,150,156,160,158,158,150,148,155,157,152,144,142,146,148,151,148,149,148,146,152,150,151,150,152,152,149,149,148,149,147,154,155,147,146,143,141,139,134,141,139,139,137,129,130,131,131,134,136,139,144,141,145,139,143,105,15,1,10,10,13,12,14,12,13,14,14,13,29,24,24,21,24,25,23,29,24,26,26,29,29,27,25,27,30,24,27,24,27,26,29,28,26,27,24,29,24,23,25,23,24,21,21,25,23,22,24,26,26,21,25,24,27,27,21,27,26,22,29,27,29,30,23,29,32,27,26,29,30,29,27,33,29,27,35,27,27,29,30,28,33,32,30,34,29,29,28,37,30,28,32,33,36,33,33,35,36,35,41,42,49,46,49,52,49,55,59,57,57,64,63,63,73,82,97,87,98,108,102,109,101,107,111,107,110,115,112,111,105,101,110,110,122,111,117,122,124,128,119,128,128,131,134,134,140,138,141,134,129,126,124,128,132,141,144,155,160,152,144,141,144,144,145,147,147,149,149,147,145,141,136,135,138,139,139,138,138,141,139,139,141,139,145,145,141,144,142,147,146,142,138,127,127,128,136,142,150,153,142,140,139,144,147,148,149,148,138,129,133,141,144,137,135,127,123,128,132,135,141,147,145,150,143,134,133,139,136,134,143,141,141,139,136,147,147,148,145,142,145,146,149,153,148,146,145,140,148,146,144,146,141,141,139,141,148,147,145,139,134,133,138,143,135,128,127,142,153,142,133,119,112,114,123,124,119,129,126,126,133,136,131,129,127,128,121,144,111,15,2,10,10,14,11,14,12,12,14,13,13,15,13,13,13,13,14,13,14,14,14,14,14,14,14,14,14,15,15,14,14,14,14,15,14,15,14,14,15,15,15,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15,15,15,15,15,16,16,15,15,15,15,16,15,15,15,16,15,16,17,16,16,16,15,16,15,16,16,16,16,15,16,15,16,16,15,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,17,16,17,17,16,16,16,16,16,16,16,16,16,16,16,16,17,17,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,17,16,16,16,16,17,16,16,16,17,17,16,16,16,17,17,16,17,16,17,16,16,16,16,17,17,17,17,16,17,16,17,17,16,16,16,17,16,17,17,16,17,17,17,16,16,17,16,17,17,16,17,16,17,16,16,16,16,17,16,16,16,16,17,16,16,16,16,17,17,17,16,16,17,16,17,16,17,16,16,17,17,16,16,17,17,16,17,17,17,17,17,16,17,17,16,17,17,16,16,16,17,17,16,17,17,16,16,16,16,16,17,17,16,16,16,17,16,16,16,16,17,16,17,16,16,17,16,17,17,17,16,16,17,16,17,16,16,16,17,17,16,17,17,17,17,16,16,17,17,17,16,17,16,17,17,16,17,16,17,17,17,16,16,17,17,17,17,16,16,16,17,17,16,16,17,17,17,17,16,17,17,17,17,16,16,17,17,17,16,16,17,17,17,16,16,17,17,17,17,16,16,17,17,16,16,16,16,17,16,16,17,16,17,16,17,16,16,17,16,17,17,16,17,16,17,16,17,17,17,17,17,16,16,17,17,17,17,16,17,16,16,17,16,17,16,17,17,16,17,16,17,16,16,16,16,17,17,17,17,29,24,24,21,24,25,23,29,24,26,26,29,29,27,25,27,30,24,27,24,27,26,29,28,26,27,24,29,24,23,25,23,24,21,21,25,23,22,24,26,26,21,25,24,27,27,21,27,26,22,29,27,29,30,23,29,32,27,26,29,30,29,27,33,29,27,35,27,27,29,30,28,33,32,30,34,29,29,28,37,30,28,32,33,36,33,33,35,36,35,41,42,49,46,49,52,49,55,59,57,57,64,63,63,73,82,97,87,98,108,102,109,101,107,111,107,110,115,112,111,105,101,110,110,122,111,117,122,124,128,119,128,128,131,134,134,140,138,141,134,129,126,124,128,132,141,144,155,160,152,144,141,144,144,145,147,147,149,149,147,145,141,136,135,138,139,139,138,138,141,139,139,141,139,145,145,141,144,142,147,146,142,138,127,127,128,136,142,150,153,142,140,139,144,147,148,149,148,138,129,133,141,144,137,135,127,123,128,132,135,141,147,145,150,143,134,133,139,136,134,143,141,141,139,136,147,147,148,145,142,145,146,149,153,148,146,145,140,148,146,144,146,141,141,139,141,148,147,145,139,134,133,138,143,135,128,127,142,153,142,133,119,112,114,123,124,119,129,126,126,133,136,131,129,127,128,121,144,111,15,2,10,10,14,11,14,12,12,14,13,13,15,13,13,13,13,14,13,14,14,14,14,14,14,14,14,14,15,15,14,14,14,14,15,14,15,14,14,15,15,15,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15,15,15,15,15,16,16,15,15,15,15,16,15,15,15,16,15,16,17,16,16,16,15,16,15,16,16,16,16,15,16,15,16,16,15,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,17,16,17,17,16,16,16,16,16,16,16,16,16,16,16,16,17,17,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,16,16,16,17,16,16,16,16,16,16,17,16,16,16,16,17,16,16,16,17,17,16,16,16,17,17,16,17,16,17,16,16,16,16,17,17,17,17,16,17,16,17,17,16,16,16,17,16,17,17,16,17,17,17,16,16,17,16,17,17,16,17,16,17,16,16,16,16,17,16,16,16,16,17,16,16,16,16,17,17,17,16,16,17,16,17,16,17,16,16,17,17,16,16,17,17,16,17,17,17,17,17,16,17,17,16,17,17,16,16,16,17,17,16,17,17,16,16,16,16,16,17,17,16,16,16,17,16,16,16,16,17,16,17,16,16,17,16,17,17,17,16,16,17,16,17,16,16,16,17,17,16,17,17,17,17,16,16,17,17,17,16,17,16,17,17,16,17,16,17,17,17,16,16,17,17,17,17,16,16,16,17,17,16,16,17,17,17,17,16,17,17,17,17,16,16,17,17,17,16,16,17,17,17,16,16,17,17,17,17,16,16,17,17,16,16,16,16,17,16,16,17,16,17,16,17,16,16,17,16,17,17,16,17,16,17,16,17,17,17,17,17,16,16,17,17,17,17,16,17,16,16,17,16,17,16,17,17,16,17,16,17,16,16,16,16,17,17,17,17, \ No newline at end of file diff --git a/media/libvpx/libvpx/codereview.settings b/media/libvpx/libvpx/codereview.settings new file mode 100644 index 0000000000..ccba2eeed2 --- /dev/null +++ b/media/libvpx/libvpx/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: chromium-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure new file mode 100755 index 0000000000..b212e0709d --- /dev/null +++ b/media/libvpx/libvpx/configure @@ -0,0 +1,831 @@ +#!/bin/sh +## +## configure +## +## This script is the front-end to the build system. It provides a similar +## interface to standard configure scripts with some extra bits for dealing +## with toolchains that differ from the standard POSIX interface and +## for extracting subsets of the source tree. In theory, reusable parts +## of this script were intended to live in build/make/configure.sh, +## but in practice, the line is pretty blurry. +## +## This build system is based in part on the FFmpeg configure script. +## + +#source_path="`dirname \"$0\"`" +source_path=${0%/*} +. "${source_path}/build/make/configure.sh" + +show_help(){ + show_help_pre + cat << EOF +Advanced options: + ${toggle_libs} libraries + ${toggle_examples} examples + ${toggle_tools} tools + ${toggle_docs} documentation + ${toggle_unit_tests} unit tests + ${toggle_decode_perf_tests} build decoder perf tests with unit tests + ${toggle_encode_perf_tests} build encoder perf tests with unit tests + --cpu=CPU tune for the specified CPU (ARM: cortex-a8, X86: sse3) + --libc=PATH path to alternate libc + --size-limit=WxH max size to allow in the decoder + --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] + ${toggle_codec_srcs} in/exclude codec library source code + ${toggle_debug_libs} in/exclude debug version of libraries + ${toggle_static_msvcrt} use static MSVCRT (VS builds only) + ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles + ${toggle_better_hw_compatibility} + enable encoder to produce streams with better + hardware decoder compatibility + ${toggle_vp8} VP8 codec support + ${toggle_vp9} VP9 codec support + ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) + ${toggle_postproc} postprocessing + ${toggle_vp9_postproc} vp9 specific postprocessing + ${toggle_multithread} multithreaded encoding and decoding + ${toggle_spatial_resampling} spatial sampling (scaling) support + ${toggle_realtime_only} enable this option while building for real-time encoding + ${toggle_onthefly_bitpacking} enable on-the-fly bitpacking in real-time encoding + ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses + ${toggle_coefficient_range_checking} + enable decoder to check if intermediate + transform coefficients are in valid range + ${toggle_runtime_cpu_detect} runtime cpu detection + ${toggle_shared} shared library support + ${toggle_static} static library support + ${toggle_small} favor smaller size over speed + ${toggle_postproc_visualizer} macro block / block level visualizers + ${toggle_multi_res_encoding} enable multiple-resolution encoding + ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser + ${toggle_vp9_temporal_denoising} + enable vp9 temporal denoising + ${toggle_webm_io} enable input from and output to WebM container + ${toggle_libyuv} enable libyuv + +Codecs: + Codecs can be selectively enabled or disabled individually, or by family: + --disable- + is equivalent to: + --disable--encoder + --disable--decoder + + Codecs available in this distribution: +EOF +#restore editor state ' + + family=""; + last_family=""; + c=""; + str=""; + for c in ${CODECS}; do + family=${c%_*} + if [ "${family}" != "${last_family}" ]; then + [ -z "${str}" ] || echo "${str}" + str="$(printf ' %10s:' ${family})" + fi + str="${str} $(printf '%10s' ${c#*_})" + last_family=${family} + done + echo "${str}" + show_help_post +} + +## +## BEGIN APPLICATION SPECIFIC CONFIGURATION +## + +# all_platforms is a list of all supported target platforms. Maintain +# alphabetically by architecture, generic-gnu last. +all_platforms="${all_platforms} arm64-android-gcc" +all_platforms="${all_platforms} arm64-darwin-gcc" +all_platforms="${all_platforms} arm64-darwin20-gcc" +all_platforms="${all_platforms} arm64-darwin21-gcc" +all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" +all_platforms="${all_platforms} arm64-linux-gcc" +all_platforms="${all_platforms} arm64-win64-gcc" +all_platforms="${all_platforms} arm64-win64-vs15" +all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" +all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" +all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 +all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 +all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 +all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 +all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 +all_platforms="${all_platforms} armv7-win32-gcc" +all_platforms="${all_platforms} armv7-win32-vs14" +all_platforms="${all_platforms} armv7-win32-vs15" +all_platforms="${all_platforms} armv7-win32-vs16" +all_platforms="${all_platforms} armv7-win32-vs17" +all_platforms="${all_platforms} armv7s-darwin-gcc" +all_platforms="${all_platforms} armv8-linux-gcc" +all_platforms="${all_platforms} loongarch32-linux-gcc" +all_platforms="${all_platforms} loongarch64-linux-gcc" +all_platforms="${all_platforms} mips32-linux-gcc" +all_platforms="${all_platforms} mips64-linux-gcc" +all_platforms="${all_platforms} ppc64le-linux-gcc" +all_platforms="${all_platforms} sparc-solaris-gcc" +all_platforms="${all_platforms} x86-android-gcc" +all_platforms="${all_platforms} x86-darwin8-gcc" +all_platforms="${all_platforms} x86-darwin8-icc" +all_platforms="${all_platforms} x86-darwin9-gcc" +all_platforms="${all_platforms} x86-darwin9-icc" +all_platforms="${all_platforms} x86-darwin10-gcc" +all_platforms="${all_platforms} x86-darwin11-gcc" +all_platforms="${all_platforms} x86-darwin12-gcc" +all_platforms="${all_platforms} x86-darwin13-gcc" +all_platforms="${all_platforms} x86-darwin14-gcc" +all_platforms="${all_platforms} x86-darwin15-gcc" +all_platforms="${all_platforms} x86-darwin16-gcc" +all_platforms="${all_platforms} x86-darwin17-gcc" +all_platforms="${all_platforms} x86-iphonesimulator-gcc" +all_platforms="${all_platforms} x86-linux-gcc" +all_platforms="${all_platforms} x86-linux-icc" +all_platforms="${all_platforms} x86-os2-gcc" +all_platforms="${all_platforms} x86-solaris-gcc" +all_platforms="${all_platforms} x86-win32-gcc" +all_platforms="${all_platforms} x86-win32-vs14" +all_platforms="${all_platforms} x86-win32-vs15" +all_platforms="${all_platforms} x86-win32-vs16" +all_platforms="${all_platforms} x86-win32-vs17" +all_platforms="${all_platforms} x86_64-android-gcc" +all_platforms="${all_platforms} x86_64-darwin9-gcc" +all_platforms="${all_platforms} x86_64-darwin10-gcc" +all_platforms="${all_platforms} x86_64-darwin11-gcc" +all_platforms="${all_platforms} x86_64-darwin12-gcc" +all_platforms="${all_platforms} x86_64-darwin13-gcc" +all_platforms="${all_platforms} x86_64-darwin14-gcc" +all_platforms="${all_platforms} x86_64-darwin15-gcc" +all_platforms="${all_platforms} x86_64-darwin16-gcc" +all_platforms="${all_platforms} x86_64-darwin17-gcc" +all_platforms="${all_platforms} x86_64-darwin18-gcc" +all_platforms="${all_platforms} x86_64-darwin19-gcc" +all_platforms="${all_platforms} x86_64-darwin20-gcc" +all_platforms="${all_platforms} x86_64-darwin21-gcc" +all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" +all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" +all_platforms="${all_platforms} x86_64-linux-gcc" +all_platforms="${all_platforms} x86_64-linux-icc" +all_platforms="${all_platforms} x86_64-solaris-gcc" +all_platforms="${all_platforms} x86_64-win64-gcc" +all_platforms="${all_platforms} x86_64-win64-vs14" +all_platforms="${all_platforms} x86_64-win64-vs15" +all_platforms="${all_platforms} x86_64-win64-vs16" +all_platforms="${all_platforms} x86_64-win64-vs17" +all_platforms="${all_platforms} generic-gnu" + +# all_targets is a list of all targets that can be configured +# note that these should be in dependency order for now. +all_targets="libs examples tools docs" + +# all targets available are enabled, by default. +for t in ${all_targets}; do + [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} +done + +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + +if ! perl --version >/dev/null; then + die "Perl is required to build" +fi + +if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then + # test to see if source_path already configured + if [ -f "${source_path}/vpx_config.h" ]; then + die "source directory already configured; run 'make distclean' there first" + fi +fi + +# check installed doxygen version +doxy_version=$(doxygen --version 2>/dev/null) +doxy_major=${doxy_version%%.*} +if [ ${doxy_major:-0} -ge 1 ]; then + doxy_version=${doxy_version#*.} + doxy_minor=${doxy_version%%.*} + doxy_patch=${doxy_version##*.} + + [ $doxy_major -gt 1 ] && enable_feature doxygen + [ $doxy_minor -gt 5 ] && enable_feature doxygen + [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen +fi + +# disable codecs when their source directory does not exist +[ -d "${source_path}/vp8" ] || disable_codec vp8 +[ -d "${source_path}/vp9" ] || disable_codec vp9 + +# install everything except the sources, by default. sources will have +# to be enabled when doing dist builds, since that's no longer a common +# case. +enabled doxygen && enable_feature install_docs +enable_feature install_bins +enable_feature install_libs + +enable_feature static +enable_feature optimizations +enable_feature dependency_tracking +enable_feature spatial_resampling +enable_feature multithread +enable_feature os_support +enable_feature temporal_denoising + +CODECS=" + vp8_encoder + vp8_decoder + vp9_encoder + vp9_decoder +" +CODEC_FAMILIES=" + vp8 + vp9 +" + +ARCH_LIST=" + arm + aarch64 + mips + x86 + x86_64 + ppc + loongarch +" + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm + sve +" + +ARCH_EXT_LIST_X86=" + mmx + sse + sse2 + sse3 + ssse3 + sse4_1 + avx + avx2 + avx512 +" + +ARCH_EXT_LIST_LOONGSON=" + mmi + lsx + lasx +" + +ARCH_EXT_LIST=" + neon_asm + ${ARCH_EXT_LIST_AARCH64} + + mips32 + dspr2 + msa + mips64 + + ${ARCH_EXT_LIST_X86} + + vsx + + ${ARCH_EXT_LIST_LOONGSON} +" +HAVE_LIST=" + ${ARCH_EXT_LIST} + vpx_ports + pthread_h + unistd_h +" +EXPERIMENT_LIST=" + fp_mb_stats + emulate_hardware + non_greedy_mv + rate_ctrl + collect_component_timing +" +CONFIG_LIST=" + dependency_tracking + external_build + install_docs + install_bins + install_libs + install_srcs + debug + gprof + gcov + rvct + gcc + msvs + pic + big_endian + + codec_srcs + debug_libs + + dequant_tokens + dc_recon + runtime_cpu_detect + postproc + vp9_postproc + multithread + internal_stats + ${CODECS} + ${CODEC_FAMILIES} + encoders + decoders + static_msvcrt + spatial_resampling + realtime_only + onthefly_bitpacking + error_concealment + shared + static + small + postproc_visualizer + os_support + unit_tests + webm_io + libyuv + decode_perf_tests + encode_perf_tests + multi_res_encoding + temporal_denoising + vp9_temporal_denoising + coefficient_range_checking + vp9_highbitdepth + better_hw_compatibility + experimental + size_limit + always_adjust_bpm + bitstream_debug + mismatch_debug + ${EXPERIMENT_LIST} +" +CMDLINE_SELECT=" + dependency_tracking + external_build + extra_warnings + werror + install_docs + install_bins + install_libs + install_srcs + debug + gprof + gcov + pic + optimizations + ccache + runtime_cpu_detect + thumb + + libs + examples + tools + docs + libc + as + size_limit + codec_srcs + debug_libs + + dequant_tokens + dc_recon + postproc + vp9_postproc + multithread + internal_stats + ${CODECS} + ${CODEC_FAMILIES} + static_msvcrt + spatial_resampling + realtime_only + onthefly_bitpacking + error_concealment + shared + static + small + postproc_visualizer + unit_tests + webm_io + libyuv + decode_perf_tests + encode_perf_tests + multi_res_encoding + temporal_denoising + vp9_temporal_denoising + coefficient_range_checking + better_hw_compatibility + vp9_highbitdepth + experimental + always_adjust_bpm + bitstream_debug + mismatch_debug +" + +process_cmdline() { + for opt do + optval="${opt#*=}" + case "$opt" in + --disable-codecs) + for c in ${CODEC_FAMILIES}; do disable_codec $c; done + ;; + --enable-?*|--disable-?*) + eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` + if is_in ${option} ${EXPERIMENT_LIST}; then + if enabled experimental; then + ${action}_feature $option + else + log_echo "Ignoring $opt -- not in experimental mode." + fi + elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then + ${action}_codec ${option} + else + process_common_cmdline $opt + fi + ;; + *) process_common_cmdline "$opt" + ;; + esac + done +} + +post_process_cmdline() { + if enabled coefficient_range_checking; then + echo "coefficient-range-checking is for decoders only, disabling encoders:" + soft_disable vp8_encoder + soft_disable vp9_encoder + fi + + c="" + + # Enable all detected codecs, if they haven't been disabled + for c in ${CODECS}; do soft_enable $c; done + + # Enable the codec family if any component of that family is enabled + for c in ${CODECS}; do + enabled $c && enable_feature ${c%_*} + done + + # Set the {en,de}coders variable if any algorithm in that class is enabled + for c in ${CODECS}; do + enabled ${c} && enable_feature ${c##*_}s + done +} + + +process_targets() { + enabled child || write_common_config_banner + write_common_target_config_h ${BUILD_PFX}vpx_config.h + write_common_config_targets + enabled win_arm64_neon_h_workaround && write_win_arm64_neon_h_workaround ${BUILD_PFX}arm_neon.h + + # Calculate the default distribution name, based on the enabled features + cf="" + DIST_DIR=vpx + for cf in $CODEC_FAMILIES; do + if enabled ${cf}_encoder && enabled ${cf}_decoder; then + DIST_DIR="${DIST_DIR}-${cf}" + elif enabled ${cf}_encoder; then + DIST_DIR="${DIST_DIR}-${cf}cx" + elif enabled ${cf}_decoder; then + DIST_DIR="${DIST_DIR}-${cf}dx" + fi + done + enabled debug_libs && DIST_DIR="${DIST_DIR}-debug" + enabled codec_srcs && DIST_DIR="${DIST_DIR}-src" + ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost" + ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt" + ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs" + DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}" + case "${tgt_os}" in + win*) enabled static_msvcrt && DIST_DIR="${DIST_DIR}mt" || DIST_DIR="${DIST_DIR}md" + DIST_DIR="${DIST_DIR}-${tgt_cc}" + ;; + esac + if [ -f "${source_path}/build/make/version.sh" ]; then + ver=`"$source_path/build/make/version.sh" --bare "$source_path"` + DIST_DIR="${DIST_DIR}-${ver}" + VERSION_STRING=${ver} + ver=${ver%%-*} + VERSION_PATCH=${ver##*.} + ver=${ver%.*} + VERSION_MINOR=${ver##*.} + ver=${ver#v} + VERSION_MAJOR=${ver%.*} + fi + enabled child || cat <> config.mk + +PREFIX=${prefix} +ifeq (\$(MAKECMDGOALS),dist) +DIST_DIR?=${DIST_DIR} +else +DIST_DIR?=\$(DESTDIR)${prefix} +endif +LIBSUBDIR=${libdir##${prefix}/} + +VERSION_STRING=${VERSION_STRING} + +VERSION_MAJOR=${VERSION_MAJOR} +VERSION_MINOR=${VERSION_MINOR} +VERSION_PATCH=${VERSION_PATCH} + +CONFIGURE_ARGS=${CONFIGURE_ARGS} +EOF + enabled child || echo "CONFIGURE_ARGS?=${CONFIGURE_ARGS}" >> config.mk + + # + # Write makefiles for all enabled targets + # + for tgt in libs examples tools docs solution; do + tgt_fn="$tgt-$toolchain.mk" + + if enabled $tgt; then + echo "Creating makefiles for ${toolchain} ${tgt}" + write_common_target_config_mk $tgt_fn ${BUILD_PFX}vpx_config.h + #write_${tgt}_config + fi + done + +} + +process_detect() { + if enabled shared; then + # Can only build shared libs on a subset of platforms. Doing this check + # here rather than at option parse time because the target auto-detect + # magic happens after the command line has been parsed. + case "${tgt_os}" in + linux|os2|solaris|darwin*|iphonesimulator*) + # Supported platforms + ;; + *) + if enabled gnu; then + echo "--enable-shared is only supported on ELF; assuming this is OK" + else + die "--enable-shared only supported on ELF, OS/2, and Darwin for now" + fi + ;; + esac + fi + if [ -z "$CC" ] || enabled external_build; then + echo "Bypassing toolchain for environment detection." + enable_feature external_build + check_header() { + log fake_check_header "$@" + header=$1 + shift + var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` + disable_feature $var + # Headers common to all environments + case $header in + stdio.h) + true; + ;; + *) + result=false + for d in "$@"; do + [ -f "${d##-I}/$header" ] && result=true && break + done + ${result:-true} + esac && enable_feature $var + + # Specialize windows and POSIX environments. + case $toolchain in + *-win*-*) + # Don't check for any headers in Windows builds. + false + ;; + *) + case $header in + pthread.h) true;; + unistd.h) true;; + *) false;; + esac && enable_feature $var + esac + enabled $var + } + check_ld() { + true + } + check_lib() { + true + } + fi + check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}" + check_ld < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF + check_header unistd.h # for sysconf(3) and friends. + + check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi +} + +process_toolchain() { + process_common_toolchain + + # Enable some useful compiler flags + if enabled gcc; then + enabled werror && check_add_cflags -Werror + check_add_cflags -Wall + check_add_cflags -Wdisabled-optimization + check_add_cflags -Wextra-semi + check_add_cflags -Wextra-semi-stmt + check_add_cflags -Wfloat-conversion + check_add_cflags -Wformat=2 + check_add_cflags -Wparentheses-equality + check_add_cflags -Wpointer-arith + check_add_cflags -Wtype-limits + check_add_cflags -Wcast-qual + check_add_cflags -Wvla + check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wmissing-declarations + check_add_cflags -Wmissing-prototypes + check_add_cflags -Wshadow + check_add_cflags -Wuninitialized + check_add_cflags -Wunreachable-code-aggressive + check_add_cflags -Wunused + check_add_cflags -Wextra + # check_add_cflags also adds to cxxflags. gtest does not do well with + # these flags so add them explicitly to CFLAGS only. + check_cflags -Wundef && add_cflags_only -Wundef + check_cflags -Wframe-larger-than=52000 && \ + add_cflags_only -Wframe-larger-than=52000 + if enabled mips || [ -z "${INLINE}" ]; then + enabled extra_warnings || check_add_cflags -Wno-unused-function + fi + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99 + # Avoid this warning for third_party C++ sources. Some reorganization + # would be needed to apply this only to test/*.cc. + check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if enabled arm; then + check_add_cxxflags -Wno-psabi + fi + + # Enforce C++11 compatibility. + check_add_cxxflags -Wc++14-extensions + check_add_cxxflags -Wc++17-extensions + check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wnon-virtual-dtor + + # disable some warnings specific to libyuv / libwebm. + check_cxxflags -Wno-missing-declarations \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" + check_cxxflags -Wno-missing-prototypes \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-pass-failed \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" + check_cxxflags -Wno-shadow \ + && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow" + check_cxxflags -Wno-unused-parameter \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" + fi + + if enabled icc; then + enabled werror && check_add_cflags -Werror + check_add_cflags -Wall + check_add_cflags -Wpointer-arith + + # ICC has a number of floating point optimizations that we disable + # in favor of deterministic output WRT to other compilers + add_cflags -fp-model precise + fi + + # Enable extra, harmless warnings. These might provide additional insight + # to what the compiler is doing and why, but in general, but they shouldn't + # be treated as fatal, even if we're treating warnings as errors. + GCC_EXTRA_WARNINGS=" + -Wdisabled-optimization + -Winline + " + enabled gcc && EXTRA_WARNINGS="${GCC_EXTRA_WARNINGS}" + RVCT_EXTRA_WARNINGS=" + --remarks + " + enabled rvct && EXTRA_WARNINGS="${RVCT_EXTRA_WARNINGS}" + if enabled extra_warnings; then + for w in ${EXTRA_WARNINGS}; do + check_add_cflags ${w} + enabled gcc && enabled werror && check_add_cflags -Wno-error=${w} + done + fi + + # ccache only really works on gcc toolchains + enabled gcc || soft_disable ccache + if enabled mips; then + enable_feature dequant_tokens + enable_feature dc_recon + fi + + if enabled internal_stats; then + enable_feature vp9_postproc + fi + + # Enable the postbuild target if building for visual studio. + case "$tgt_cc" in + vs*) enable_feature msvs + enable_feature solution + vs_version=${tgt_cc##vs} + VCPROJ_SFX=vcxproj + gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh + enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" + all_targets="${all_targets} solution" + INLINE="__inline" + ;; + esac + + # Other toolchain specific defaults + case $toolchain in x86*) soft_enable postproc;; esac + + if enabled postproc_visualizer; then + enabled postproc || die "postproc_visualizer requires postproc to be enabled" + fi + + # Enable unit tests by default if we have a working C++ compiler. + case "$toolchain" in + *-vs*) + soft_enable unit_tests + soft_enable webm_io + soft_enable libyuv + ;; + *-android-*) + check_add_cxxflags -std=gnu++11 && soft_enable webm_io + soft_enable libyuv + # GTestLog must be modified to use Android logging utilities. + ;; + *-darwin-*) + check_add_cxxflags -std=gnu++11 + # iOS/ARM builds do not work with gtest. This does not match + # x86 targets. + ;; + *-iphonesimulator-*) + check_add_cxxflags -std=gnu++11 && soft_enable webm_io + soft_enable libyuv + ;; + *-win*) + # Some mingw toolchains don't have pthread available by default. + # Treat these more like visual studio where threading in gtest + # would be disabled for the same reason. + check_add_cxxflags -std=gnu++11 && soft_enable unit_tests \ + && soft_enable webm_io + check_cxx "$@" <> ${BUILD_PFX}vpx_config.c +#include "vpx/vpx_codec.h" +static const char* const cfg = "$CONFIGURE_ARGS"; +const char *vpx_codec_build_config(void) {return cfg;} +EOF diff --git a/media/libvpx/libvpx/docs.mk b/media/libvpx/libvpx/docs.mk new file mode 100644 index 0000000000..889d18251f --- /dev/null +++ b/media/libvpx/libvpx/docs.mk @@ -0,0 +1,48 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +INSTALL_MAPS += docs/% docs/% +INSTALL_MAPS += src/% % +INSTALL_MAPS += % % + +# Static documentation authored in doxygen +CODEC_DOX := mainpage.dox \ + keywords.dox \ + usage.dox \ + usage_cx.dox \ + usage_dx.dox \ + +# Other doxy files sourced in Markdown +TXT_DOX = $(call enabled,TXT_DOX) + +EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc +EXAMPLE_PATH += $(SRC_PATH_BARE)/examples + +doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy) +doxyfile: libs.doxy_template libs.doxy + @echo " [CREATE] $@" + @cat $^ > $@ + @echo "STRIP_FROM_PATH += $(SRC_PATH_BARE) $(BUILD_ROOT)" >> $@ + @echo "INPUT += $(addprefix $(SRC_PATH_BARE)/,$(CODEC_DOX))" >> $@; + @echo "INPUT += $(TXT_DOX)" >> $@; + @echo "EXAMPLE_PATH += $(EXAMPLE_PATH)" >> $@ + +CLEAN-OBJS += doxyfile $(wildcard docs/html/*) +docs/html/index.html: doxyfile $(CODEC_DOX) $(TXT_DOX) + @echo " [DOXYGEN] $<" + @doxygen $< +DOCS-yes += docs/html/index.html + +DIST-DOCS-yes = $(wildcard docs/html/*) +DIST-DOCS-$(CONFIG_CODEC_SRCS) += $(addprefix src/,$(CODEC_DOX)) +DIST-DOCS-$(CONFIG_CODEC_SRCS) += src/libs.doxy_template +DIST-DOCS-yes += CHANGELOG +DIST-DOCS-yes += README diff --git a/media/libvpx/libvpx/examples.mk b/media/libvpx/libvpx/examples.mk new file mode 100644 index 0000000000..22726a3d41 --- /dev/null +++ b/media/libvpx/libvpx/examples.mk @@ -0,0 +1,423 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/convert.h \ + third_party/libyuv/include/libyuv/convert_argb.h \ + third_party/libyuv/include/libyuv/convert_from.h \ + third_party/libyuv/include/libyuv/cpu_id.h \ + third_party/libyuv/include/libyuv/planar_functions.h \ + third_party/libyuv/include/libyuv/rotate.h \ + third_party/libyuv/include/libyuv/row.h \ + third_party/libyuv/include/libyuv/scale.h \ + third_party/libyuv/include/libyuv/scale_row.h \ + third_party/libyuv/source/cpu_id.cc \ + third_party/libyuv/source/planar_functions.cc \ + third_party/libyuv/source/row_any.cc \ + third_party/libyuv/source/row_common.cc \ + third_party/libyuv/source/row_gcc.cc \ + third_party/libyuv/source/row_msa.cc \ + third_party/libyuv/source/row_neon.cc \ + third_party/libyuv/source/row_neon64.cc \ + third_party/libyuv/source/row_win.cc \ + third_party/libyuv/source/scale.cc \ + third_party/libyuv/source/scale_any.cc \ + third_party/libyuv/source/scale_common.cc \ + third_party/libyuv/source/scale_gcc.cc \ + third_party/libyuv/source/scale_msa.cc \ + third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc \ + third_party/libyuv/source/scale_win.cc \ + +LIBWEBM_COMMON_SRCS += third_party/libwebm/common/hdr_util.cc \ + third_party/libwebm/common/hdr_util.h \ + third_party/libwebm/common/webmids.h + +LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer/mkvmuxer.cc \ + third_party/libwebm/mkvmuxer/mkvmuxerutil.cc \ + third_party/libwebm/mkvmuxer/mkvwriter.cc \ + third_party/libwebm/mkvmuxer/mkvmuxer.h \ + third_party/libwebm/mkvmuxer/mkvmuxertypes.h \ + third_party/libwebm/mkvmuxer/mkvmuxerutil.h \ + third_party/libwebm/mkvparser/mkvparser.h \ + third_party/libwebm/mkvmuxer/mkvwriter.h + +LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \ + third_party/libwebm/mkvparser/mkvreader.cc \ + third_party/libwebm/mkvparser/mkvparser.h \ + third_party/libwebm/mkvparser/mkvreader.h + +# Add compile flags and include path for libwebm sources. +ifeq ($(CONFIG_WEBM_IO),yes) + CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS + $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) + INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm +endif + + +# List of examples to build. UTILS are tools meant for distribution +# while EXAMPLES demonstrate specific portions of the API. +UTILS-$(CONFIG_DECODERS) += vpxdec.c +vpxdec.SRCS += md5_utils.c md5_utils.h +vpxdec.SRCS += vpx_ports/compiler_attributes.h +vpxdec.SRCS += vpx_ports/mem_ops.h +vpxdec.SRCS += vpx_ports/mem_ops_aligned.h +vpxdec.SRCS += vpx_ports/msvc.h +vpxdec.SRCS += vpx_ports/vpx_timer.h +vpxdec.SRCS += vpx/vpx_integer.h +vpxdec.SRCS += args.c args.h +vpxdec.SRCS += ivfdec.c ivfdec.h +vpxdec.SRCS += y4minput.c y4minput.h +vpxdec.SRCS += tools_common.c tools_common.h +vpxdec.SRCS += y4menc.c y4menc.h +ifeq ($(CONFIG_LIBYUV),yes) + vpxdec.SRCS += $(LIBYUV_SRCS) + $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} +endif +ifeq ($(CONFIG_WEBM_IO),yes) + vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) + vpxdec.SRCS += webmdec.cc webmdec.h +endif +vpxdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950 +vpxdec.DESCRIPTION = Full featured decoder +UTILS-$(CONFIG_ENCODERS) += vpxenc.c +vpxenc.SRCS += args.c args.h y4minput.c y4minput.h vpxenc.h +vpxenc.SRCS += ivfdec.c ivfdec.h +vpxenc.SRCS += ivfenc.c ivfenc.h +vpxenc.SRCS += rate_hist.c rate_hist.h +vpxenc.SRCS += tools_common.c tools_common.h +vpxenc.SRCS += warnings.c warnings.h +vpxenc.SRCS += vpx_ports/mem_ops.h +vpxenc.SRCS += vpx_ports/mem_ops_aligned.h +vpxenc.SRCS += vpx_ports/msvc.h +vpxenc.SRCS += vpx_ports/vpx_timer.h +vpxenc.SRCS += vpxstats.c vpxstats.h +ifeq ($(CONFIG_LIBYUV),yes) + vpxenc.SRCS += $(LIBYUV_SRCS) +endif +ifeq ($(CONFIG_WEBM_IO),yes) + vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS) + vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS) + vpxenc.SRCS += $(LIBWEBM_PARSER_SRCS) + vpxenc.SRCS += webmenc.cc webmenc.h +endif +vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1 +vpxenc.DESCRIPTION = Full featured encoder + +EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_spatial_svc_encoder.c +vp9_spatial_svc_encoder.SRCS += args.c args.h +vp9_spatial_svc_encoder.SRCS += ivfenc.c ivfenc.h +vp9_spatial_svc_encoder.SRCS += y4minput.c y4minput.h +vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h +vp9_spatial_svc_encoder.SRCS += video_common.h +vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c +vp9_spatial_svc_encoder.SRCS += vpx_ports/msvc.h +vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h +vp9_spatial_svc_encoder.SRCS += examples/svc_encodeframe.c +vp9_spatial_svc_encoder.SRCS += examples/svc_context.h +vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D +vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder + +ifneq ($(CONFIG_SHARED),yes) +EXAMPLES-$(CONFIG_VP9_ENCODER) += resize_util.c +endif + +EXAMPLES-$(CONFIG_ENCODERS) += vpx_temporal_svc_encoder.c +vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h +vpx_temporal_svc_encoder.SRCS += y4minput.c y4minput.h +vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h +vpx_temporal_svc_encoder.SRCS += video_common.h +vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c +vpx_temporal_svc_encoder.SRCS += vpx_ports/msvc.h +vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947 +vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder +EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c +simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC +simple_decoder.SRCS += ivfdec.h ivfdec.c +simple_decoder.SRCS += y4minput.c y4minput.h +simple_decoder.SRCS += tools_common.h tools_common.c +simple_decoder.SRCS += video_common.h +simple_decoder.SRCS += video_reader.h video_reader.c +simple_decoder.SRCS += vpx_ports/mem_ops.h +simple_decoder.SRCS += vpx_ports/mem_ops_aligned.h +simple_decoder.SRCS += vpx_ports/msvc.h +simple_decoder.DESCRIPTION = Simplified decoder loop +EXAMPLES-$(CONFIG_DECODERS) += postproc.c +postproc.SRCS += ivfdec.h ivfdec.c +postproc.SRCS += y4minput.c y4minput.h +postproc.SRCS += tools_common.h tools_common.c +postproc.SRCS += video_common.h +postproc.SRCS += video_reader.h video_reader.c +postproc.SRCS += vpx_ports/mem_ops.h +postproc.SRCS += vpx_ports/mem_ops_aligned.h +postproc.SRCS += vpx_ports/msvc.h +postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7 +postproc.DESCRIPTION = Decoder postprocessor control +EXAMPLES-$(CONFIG_DECODERS) += decode_to_md5.c +decode_to_md5.SRCS += md5_utils.h md5_utils.c +decode_to_md5.SRCS += ivfdec.h ivfdec.c +decode_to_md5.SRCS += y4minput.c y4minput.h +decode_to_md5.SRCS += tools_common.h tools_common.c +decode_to_md5.SRCS += video_common.h +decode_to_md5.SRCS += video_reader.h video_reader.c +decode_to_md5.SRCS += vpx_ports/compiler_attributes.h +decode_to_md5.SRCS += vpx_ports/mem_ops.h +decode_to_md5.SRCS += vpx_ports/mem_ops_aligned.h +decode_to_md5.SRCS += vpx_ports/msvc.h +decode_to_md5.GUID = 59120B9B-2735-4BFE-B022-146CA340FE42 +decode_to_md5.DESCRIPTION = Frame by frame MD5 checksum +EXAMPLES-$(CONFIG_ENCODERS) += simple_encoder.c +simple_encoder.SRCS += ivfenc.h ivfenc.c +simple_encoder.SRCS += y4minput.c y4minput.h +simple_encoder.SRCS += tools_common.h tools_common.c +simple_encoder.SRCS += video_common.h +simple_encoder.SRCS += video_writer.h video_writer.c +simple_encoder.SRCS += vpx_ports/msvc.h +simple_encoder.GUID = 4607D299-8A71-4D2C-9B1D-071899B6FBFD +simple_encoder.DESCRIPTION = Simplified encoder loop +EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_lossless_encoder.c +vp9_lossless_encoder.SRCS += ivfenc.h ivfenc.c +vp9_lossless_encoder.SRCS += y4minput.c y4minput.h +vp9_lossless_encoder.SRCS += tools_common.h tools_common.c +vp9_lossless_encoder.SRCS += video_common.h +vp9_lossless_encoder.SRCS += video_writer.h video_writer.c +vp9_lossless_encoder.SRCS += vpx_ports/msvc.h +vp9_lossless_encoder.GUID = B63C7C88-5348-46DC-A5A6-CC151EF93366 +vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder +EXAMPLES-$(CONFIG_ENCODERS) += twopass_encoder.c +twopass_encoder.SRCS += ivfenc.h ivfenc.c +twopass_encoder.SRCS += y4minput.c y4minput.h +twopass_encoder.SRCS += tools_common.h tools_common.c +twopass_encoder.SRCS += video_common.h +twopass_encoder.SRCS += video_writer.h video_writer.c +twopass_encoder.SRCS += vpx_ports/msvc.h +twopass_encoder.GUID = 73494FA6-4AF9-4763-8FBB-265C92402FD8 +twopass_encoder.DESCRIPTION = Two-pass encoder loop +EXAMPLES-$(CONFIG_DECODERS) += decode_with_drops.c +decode_with_drops.SRCS += ivfdec.h ivfdec.c +decode_with_drops.SRCS += y4minput.c y4minput.h +decode_with_drops.SRCS += tools_common.h tools_common.c +decode_with_drops.SRCS += video_common.h +decode_with_drops.SRCS += video_reader.h video_reader.c +decode_with_drops.SRCS += vpx_ports/mem_ops.h +decode_with_drops.SRCS += vpx_ports/mem_ops_aligned.h +decode_with_drops.SRCS += vpx_ports/msvc.h +decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26 +decode_with_drops.DESCRIPTION = Drops frames while decoding +EXAMPLES-$(CONFIG_ENCODERS) += set_maps.c +set_maps.SRCS += ivfenc.h ivfenc.c +set_maps.SRCS += y4minput.c y4minput.h +set_maps.SRCS += tools_common.h tools_common.c +set_maps.SRCS += video_common.h +set_maps.SRCS += video_writer.h video_writer.c +set_maps.SRCS += vpx_ports/msvc.h +set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F +set_maps.DESCRIPTION = Set active and ROI maps +EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c +vp8cx_set_ref.SRCS += ivfenc.h ivfenc.c +vp8cx_set_ref.SRCS += y4minput.c y4minput.h +vp8cx_set_ref.SRCS += tools_common.h tools_common.c +vp8cx_set_ref.SRCS += video_common.h +vp8cx_set_ref.SRCS += video_writer.h video_writer.c +vp8cx_set_ref.SRCS += vpx_ports/msvc.h +vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A +vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame + +ifeq ($(CONFIG_VP9_ENCODER),yes) +ifeq ($(CONFIG_DECODERS),yes) +EXAMPLES-yes += vp9cx_set_ref.c +vp9cx_set_ref.SRCS += ivfenc.h ivfenc.c +vp9cx_set_ref.SRCS += y4minput.c y4minput.h +vp9cx_set_ref.SRCS += tools_common.h tools_common.c +vp9cx_set_ref.SRCS += video_common.h +vp9cx_set_ref.SRCS += video_writer.h video_writer.c +vp9cx_set_ref.GUID = 65D7F14A-2EE6-4293-B958-AB5107A03B55 +vp9cx_set_ref.DESCRIPTION = VP9 set encoder reference frame +endif +endif + +ifeq ($(CONFIG_MULTI_RES_ENCODING),yes) +ifeq ($(CONFIG_LIBYUV),yes) +EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c +vp8_multi_resolution_encoder.SRCS += y4minput.c y4minput.h +vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c +vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c +vp8_multi_resolution_encoder.SRCS += vpx_ports/msvc.h +vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding +endif +endif + +# Handle extra library flags depending on codec configuration + +# We should not link to math library (libm) on RVCT +# when building for bare-metal targets +ifeq ($(CONFIG_OS_SUPPORT), yes) +CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m +CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m +else + ifeq ($(CONFIG_GCC), yes) + CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m + CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m + endif +endif +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Examples need different flags based on whether we're building +# from an installed tree or a version controlled tree. Determine +# the proper paths. +ifeq ($(HAVE_ALT_TREE_LAYOUT),yes) + LIB_PATH-yes := $(SRC_PATH_BARE)/../lib + INC_PATH-yes := $(SRC_PATH_BARE)/../include +else + LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.) + INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8 + INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8 + INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9 + INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9 +endif +INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include +LIB_PATH := $(call enabled,LIB_PATH) +INC_PATH := $(call enabled,INC_PATH) +INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH)) +INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH)) + + +# Expand list of selected examples to build (as specified above) +UTILS = $(call enabled,UTILS) +EXAMPLES = $(addprefix examples/,$(call enabled,EXAMPLES)) +ALL_EXAMPLES = $(UTILS) $(EXAMPLES) +UTIL_SRCS = $(foreach ex,$(UTILS),$($(ex:.c=).SRCS)) +ALL_SRCS = $(foreach ex,$(ALL_EXAMPLES),$($(notdir $(ex:.c=)).SRCS)) +CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS)) + + +# Expand all example sources into a variable containing all sources +# for that example (not just them main one specified in UTILS/EXAMPLES) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(ALL_EXAMPLES),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk)) + + +# Create build/install dependencies for all examples. The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX))) +INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +INSTALL-SRCS-yes += $(UTIL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX))) + + +# Instantiate linker template for all examples. +CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx) +ifneq ($(filter darwin%,$(TGT_OS)),) +SHARED_LIB_SUF=.dylib +else +ifneq ($(filter os2%,$(TGT_OS)),) +SHARED_LIB_SUF=_dll.a +else +SHARED_LIB_SUF=.so +endif +endif +CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a) +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ + -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\ + ))) + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Set up additional MSVS environment +ifeq ($(CONFIG_MSVS),yes) +CODEC_LIB=$(if $(CONFIG_SHARED),vpx,$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)) +# This variable uses deferred expansion intentionally, since the results of +# $(wildcard) may change during the course of the Make. +VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d)))) +INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),bin/$(p)/% $(p)/Release/%) +endif + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in ALL_EXAMPLES. This has the unfortunate side effect that +# touching the source files trigger a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this. +define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^ +endef +ALL_EXAMPLES_BASENAME := $(notdir $(ALL_EXAMPLES)) +PROJECTS-$(CONFIG_MSVS) += $(ALL_EXAMPLES_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(> $@ + @echo "*/" >> $@ + +samples.dox: examples.mk + @echo " [DOXY] $@" + @echo "/*!\page samples Sample Code" > $@ + @echo " This SDK includes a number of sample applications."\ + "Each sample documents a feature of the SDK in both prose"\ + "and the associated C code."\ + "The following samples are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\ + echo " - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo >> $@ + @echo " In addition, the SDK contains a number of utilities."\ + "Since these utilities are built upon the concepts described"\ + "in the sample code listed above, they are not documented in"\ + "pieces like the samples are. Their source is included here"\ + "for reference. The following utilities are included:" >> $@ + @$(foreach ex,$(sort $(UTILS:.c=)),\ + echo " - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox) +DOCS-yes += examples.doxy samples.dox +examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox) + @echo "INPUT += $^" > $@ + @echo "ENABLED_SECTIONS += samples" >> $@ diff --git a/media/libvpx/libvpx/examples/decode_to_md5.c b/media/libvpx/libvpx/examples/decode_to_md5.c new file mode 100644 index 0000000000..51959f37df --- /dev/null +++ b/media/libvpx/libvpx/examples/decode_to_md5.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Frame-by-frame MD5 Checksum +// =========================== +// +// This example builds upon the simple decoder loop to show how checksums +// of the decoded output can be generated. These are used for validating +// decoder implementations against the reference implementation, for example. +// +// MD5 algorithm +// ------------- +// The Message-Digest 5 (MD5) is a well known hash function. We have provided +// an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest +// Algorithm for your use. Our implmentation only changes the interface of this +// reference code. You must include the `md5_utils.h` header for access to these +// functions. +// +// Processing The Decoded Data +// --------------------------- +// Each row of the image is passed to the MD5 accumulator. First the Y plane +// is processed, then U, then V. It is important to honor the image's `stride` +// values. + +#include +#include +#include + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +#include "../md5_utils.h" +#include "../tools_common.h" +#include "../video_reader.h" +#include "./vpx_config.h" + +static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) { + int plane, y; + MD5Context md5; + + MD5Init(&md5); + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = plane ? (img->d_w + 1) >> 1 : img->d_w; + const int h = plane ? (img->d_h + 1) >> 1 : img->d_h; + + for (y = 0; y < h; ++y) { + MD5Update(&md5, buf, w); + buf += stride; + } + } + + MD5Final(digest, &md5); +} + +static void print_md5(FILE *stream, unsigned char digest[16]) { + int i; + + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + vpx_codec_ctx_t codec; + VpxVideoReader *reader = NULL; + const VpxVideoInfo *info = NULL; + const VpxInterface *decoder = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = vpx_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = vpx_video_reader_get_info(reader); + + decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); + + if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder"); + + while (vpx_video_reader_read_frame(reader)) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); + if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) + die_codec(&codec, "Failed to decode frame"); + + while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) { + unsigned char digest[16]; + + get_image_md5(img, digest); + print_md5(outfile, digest); + fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_reader_close(reader); + + fclose(outfile); + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/decode_with_drops.c b/media/libvpx/libvpx/examples/decode_with_drops.c new file mode 100644 index 0000000000..03c79a4561 --- /dev/null +++ b/media/libvpx/libvpx/examples/decode_with_drops.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Decode With Drops Example +// ========================= +// +// This is an example utility which drops a series of frames, as specified +// on the command line. This is useful for observing the error recovery +// features of the codec. +// +// Usage +// ----- +// This example adds a single argument to the `simple_decoder` example, +// which specifies the range or pattern of frames to drop. The parameter is +// parsed as follows: +// +// Dropping A Range Of Frames +// -------------------------- +// To drop a range of frames, specify the starting frame and the ending +// frame to drop, separated by a dash. The following command will drop +// frames 5 through 10 (base 1). +// +// $ ./decode_with_drops in.ivf out.i420 5-10 +// +// +// Dropping A Pattern Of Frames +// ---------------------------- +// To drop a pattern of frames, specify the number of frames to drop and +// the number of frames after which to repeat the pattern, separated by +// a forward-slash. The following command will drop 3 of 7 frames. +// Specifically, it will decode 4 frames, then drop 3 frames, and then +// repeat. +// +// $ ./decode_with_drops in.ivf out.i420 3/7 +// +// +// Extra Variables +// --------------- +// This example maintains the pattern passed on the command line in the +// `n`, `m`, and `is_range` variables: +// +// +// Making The Drop Decision +// ------------------------ +// The example decides whether to drop the frame based on the current +// frame number, immediately before decoding the frame. + +#include +#include +#include + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +#include "../tools_common.h" +#include "../video_reader.h" +#include "./vpx_config.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + vpx_codec_ctx_t codec; + const VpxInterface *decoder = NULL; + VpxVideoReader *reader = NULL; + const VpxVideoInfo *info = NULL; + int n = 0; + int m = 0; + int is_range = 0; + char *nptr = NULL; + + exec_name = argv[0]; + + if (argc != 4) die("Invalid number of arguments."); + + reader = vpx_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + n = (int)strtol(argv[3], &nptr, 0); + m = (int)strtol(nptr + 1, NULL, 0); + is_range = (*nptr == '-'); + if (!n || !m || (*nptr != '-' && *nptr != '/')) + die("Couldn't parse pattern %s.\n", argv[3]); + + info = vpx_video_reader_get_info(reader); + + decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); + + if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die("Failed to initialize decoder."); + + while (vpx_video_reader_read_frame(reader)) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img = NULL; + size_t frame_size = 0; + int skip; + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); + if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) + die_codec(&codec, "Failed to decode frame."); + + ++frame_cnt; + + skip = (is_range && frame_cnt >= n && frame_cnt <= m) || + (!is_range && m - (frame_cnt - 1) % m <= n); + + if (!skip) { + putc('.', stdout); + + while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) + vpx_img_write(img, outfile); + } else { + putc('X', stdout); + } + + fflush(stdout); + } + + printf("Processed %d frames.\n", frame_cnt); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + vpx_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/postproc.c b/media/libvpx/libvpx/examples/postproc.c new file mode 100644 index 0000000000..b53c15ea15 --- /dev/null +++ b/media/libvpx/libvpx/examples/postproc.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Postprocessing Decoder +// ====================== +// +// This example adds postprocessing to the simple decoder loop. +// +// Initializing Postprocessing +// --------------------------- +// You must inform the codec that you might request postprocessing at +// initialization time. This is done by passing the VPX_CODEC_USE_POSTPROC +// flag to `vpx_codec_dec_init`. If the codec does not support +// postprocessing, this call will return VPX_CODEC_INCAPABLE. For +// demonstration purposes, we also fall back to default initialization if +// the codec does not provide support. +// +// Using Adaptive Postprocessing +// ----------------------------- +// VP6 provides "adaptive postprocessing." It will automatically select the +// best postprocessing filter on a frame by frame basis based on the amount +// of time remaining before the user's specified deadline expires. The +// special value 0 indicates that the codec should take as long as +// necessary to provide the best quality frame. This example gives the +// codec 15ms (15000us) to return a frame. Remember that this is a soft +// deadline, and the codec may exceed it doing its regular processing. In +// these cases, no additional postprocessing will be done. +// +// Codec Specific Postprocessing Controls +// -------------------------------------- +// Some codecs provide fine grained controls over their built-in +// postprocessors. VP8 is one example. The following sample code toggles +// postprocessing on and off every 15 frames. + +#include +#include +#include + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +#include "../tools_common.h" +#include "../video_reader.h" +#include "./vpx_config.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + vpx_codec_ctx_t codec; + vpx_codec_err_t res; + VpxVideoReader *reader = NULL; + const VpxInterface *decoder = NULL; + const VpxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = vpx_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing", argv[2]); + + info = vpx_video_reader_get_info(reader); + + decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); + + res = vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, + VPX_CODEC_USE_POSTPROC); + if (res == VPX_CODEC_INCAPABLE) + die("Postproc not supported by this decoder."); + + if (res) die("Failed to initialize decoder."); + + while (vpx_video_reader_read_frame(reader)) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); + + ++frame_cnt; + + if (frame_cnt % 30 == 1) { + vp8_postproc_cfg_t pp = { 0, 0, 0 }; + + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) + die_codec(&codec, "Failed to turn off postproc."); + } else if (frame_cnt % 30 == 16) { + vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, + 0 }; + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) + die_codec(&codec, "Failed to turn on postproc."); + } + + // Decode the frame with 15ms deadline + if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000)) + die_codec(&codec, "Failed to decode frame"); + + while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) { + vpx_img_write(img, outfile); + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + vpx_video_reader_close(reader); + + fclose(outfile); + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c new file mode 100644 index 0000000000..5fb63e1660 --- /dev/null +++ b/media/libvpx/libvpx/examples/resize_util.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "../tools_common.h" +#include "../vp9/encoder/vp9_resize.h" + +static const char *exec_name = NULL; + +static void usage() { + printf("Usage:\n"); + printf("%s x x ", + exec_name); + printf(" []\n"); +} + +void usage_exit(void) { + usage(); + exit(EXIT_FAILURE); +} + +static int parse_dim(char *v, int *width, int *height) { + char *x = strchr(v, 'x'); + if (x == NULL) x = strchr(v, 'X'); + if (x == NULL) return 0; + *width = atoi(v); + *height = atoi(&x[1]); + if (*width <= 0 || *height <= 0) + return 0; + else + return 1; +} + +int main(int argc, char *argv[]) { + char *fin, *fout; + FILE *fpin, *fpout; + uint8_t *inbuf, *outbuf; + uint8_t *inbuf_u, *outbuf_u; + uint8_t *inbuf_v, *outbuf_v; + int f, frames; + int width, height, target_width, target_height; + int failed = 0; + + exec_name = argv[0]; + + if (argc < 5) { + printf("Incorrect parameters:\n"); + usage(); + return 1; + } + + fin = argv[1]; + fout = argv[4]; + if (!parse_dim(argv[2], &width, &height)) { + printf("Incorrect parameters: %s\n", argv[2]); + usage(); + return 1; + } + if (!parse_dim(argv[3], &target_width, &target_height)) { + printf("Incorrect parameters: %s\n", argv[3]); + usage(); + return 1; + } + + fpin = fopen(fin, "rb"); + if (fpin == NULL) { + printf("Can't open file %s to read\n", fin); + usage(); + return 1; + } + fpout = fopen(fout, "wb"); + if (fpout == NULL) { + fclose(fpin); + printf("Can't open file %s to write\n", fout); + usage(); + return 1; + } + if (argc >= 6) + frames = atoi(argv[5]); + else + frames = INT_MAX; + + printf("Input size: %dx%d\n", width, height); + printf("Target size: %dx%d, Frames: ", target_width, target_height); + if (frames == INT_MAX) + printf("All\n"); + else + printf("%d\n", frames); + + inbuf = (uint8_t *)malloc(width * height * 3 / 2); + outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); + if (!(inbuf && outbuf)) { + printf("Failed to allocate buffers.\n"); + failed = 1; + goto Error; + } + inbuf_u = inbuf + width * height; + inbuf_v = inbuf_u + width * height / 4; + outbuf_u = outbuf + target_width * target_height; + outbuf_v = outbuf_u + target_width * target_height / 4; + f = 0; + while (f < frames) { + if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break; + vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height, + width, outbuf, target_width, outbuf_u, outbuf_v, + target_width / 2, target_height, target_width); + fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); + f++; + } + printf("%d frames processed\n", f); +Error: + fclose(fpin); + fclose(fpout); + + free(inbuf); + free(outbuf); + return failed; +} diff --git a/media/libvpx/libvpx/examples/set_maps.c b/media/libvpx/libvpx/examples/set_maps.c new file mode 100644 index 0000000000..867e473aea --- /dev/null +++ b/media/libvpx/libvpx/examples/set_maps.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// VP8 Set Active and ROI Maps +// =========================== +// +// This is an example demonstrating how to control the VP8 encoder's +// ROI and Active maps. +// +// ROI (Reigon of Interest) maps are a way for the application to assign +// each macroblock in the image to a region, and then set quantizer and +// filtering parameters on that image. +// +// Active maps are a way for the application to specify on a +// macroblock-by-macroblock basis whether there is any activity in that +// macroblock. +// +// +// Configuration +// ------------- +// An ROI map is set on frame 22. If the width of the image in macroblocks +// is evenly divisble by 4, then the output will appear to have distinct +// columns, where the quantizer, loopfilter, and static threshold differ +// from column to column. +// +// An active map is set on frame 33. If the width of the image in macroblocks +// is evenly divisble by 4, then the output will appear to have distinct +// columns, where one column will have motion and the next will not. +// +// The active map is cleared on frame 44. +// +// Observing The Effects +// --------------------- +// Use the `simple_decoder` example to decode this sample, and observe +// the change in the image at frames 22, 33, and 44. + +#include +#include +#include +#include + +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void set_roi_map(const vpx_codec_enc_cfg_t *cfg, + vpx_codec_ctx_t *codec) { + unsigned int i; + vpx_roi_map_t roi; + memset(&roi, 0, sizeof(roi)); + + roi.rows = (cfg->g_h + 15) / 16; + roi.cols = (cfg->g_w + 15) / 16; + + roi.delta_q[0] = 0; + roi.delta_q[1] = -2; + roi.delta_q[2] = -4; + roi.delta_q[3] = -6; + + roi.delta_lf[0] = 0; + roi.delta_lf[1] = 1; + roi.delta_lf[2] = 2; + roi.delta_lf[3] = 3; + + roi.static_threshold[0] = 1500; + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = 500; + roi.static_threshold[3] = 0; + + roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols); + for (i = 0; i < roi.rows * roi.cols; ++i) roi.roi_map[i] = i % 4; + + if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(codec, "Failed to set ROI map"); + + free(roi.roi_map); +} + +static void set_active_map(const vpx_codec_enc_cfg_t *cfg, + vpx_codec_ctx_t *codec) { + unsigned int i; + vpx_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; + + if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + +static void unset_active_map(const vpx_codec_enc_cfg_t *cfg, + vpx_codec_ctx_t *codec) { + vpx_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + map.active_map = NULL; + + if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); +} + +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, VpxVideoWriter *writer) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + int frame_count = 0; + vpx_image_t raw; + vpx_codec_err_t res; + VpxVideoInfo info; + VpxVideoWriter *writer = NULL; + const VpxInterface *encoder = NULL; + const int fps = 2; // TODO(dkovalev) add command line argument + const double bits_per_pixel_per_frame = 0.067; + + exec_name = argv[0]; + if (argc != 6) die("Invalid number of arguments"); + + memset(&info, 0, sizeof(info)); + + encoder = get_vpx_encoder_by_name(argv[1]); + if (encoder == NULL) { + die("Unsupported codec."); + } + assert(encoder != NULL); + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[2], NULL, 0); + info.frame_height = (int)strtol(argv[3], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); + cfg.g_lag_in_frames = 0; + + writer = vpx_video_writer_open(argv[5], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[5]); + + if (!(infile = fopen(argv[4], "rb"))) + die("Failed to open %s for reading.", argv[4]); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die("Failed to initialize encoder"); + + // Encode frames. + while (vpx_img_read(&raw, infile)) { + ++frame_count; + + if (frame_count == 22 && encoder->fourcc == VP8_FOURCC) { + set_roi_map(&cfg, &codec); + } else if (frame_count == 33) { + set_active_map(&cfg, &codec); + } else if (frame_count == 44) { + unset_active_map(&cfg, &codec); + } + + encode_frame(&codec, &raw, frame_count, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + vpx_img_free(&raw); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/simple_decoder.c b/media/libvpx/libvpx/examples/simple_decoder.c new file mode 100644 index 0000000000..d089e826d5 --- /dev/null +++ b/media/libvpx/libvpx/examples/simple_decoder.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Simple Decoder +// ============== +// +// This is an example of a simple decoder loop. It takes an input file +// containing the compressed data (in IVF format), passes it through the +// decoder, and writes the decompressed frames to disk. Other decoder +// examples build upon this one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `vpx_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// vp8. +// +// Initializing The Codec +// ---------------------- +// The libvpx decoder is initialized by the call to vpx_codec_dec_init(). +// Determining the codec interface to use is handled by VpxVideoReader and the +// functions prefixed with vpx_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's a VPx file and which VPx +// codec is contained within the file. +// Note the NULL pointer passed to vpx_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `vpx_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. The `deadline` parameter is left at zero for this +// example. This parameter is generally only used when doing adaptive post +// processing. +// +// Codecs may produce a variable number of output frames for every call to +// `vpx_codec_decode`. These frames are retrieved by the +// `vpx_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `vpx_codec_decode` is called. +// `vpx_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the encoded data to disk. It is +// important to honor the image's `stride` values. +// +// Cleanup +// ------- +// The `vpx_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, vpx_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include +#include +#include + +#include "vpx/vpx_decoder.h" + +#include "../tools_common.h" +#include "../video_reader.h" +#include "./vpx_config.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + vpx_codec_ctx_t codec; + VpxVideoReader *reader = NULL; + const VpxInterface *decoder = NULL; + const VpxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = vpx_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = vpx_video_reader_get_info(reader); + + decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); + + if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die("Failed to initialize decoder."); + + while (vpx_video_reader_read_frame(reader)) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); + if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) { + vpx_img_write(img, outfile); + ++frame_cnt; + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + vpx_video_reader_close(reader); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/simple_encoder.c b/media/libvpx/libvpx/examples/simple_encoder.c new file mode 100644 index 0000000000..dffdd6d7da --- /dev/null +++ b/media/libvpx/libvpx/examples/simple_encoder.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Simple Encoder +// ============== +// +// This is an example of a simple encoder loop. It takes an input file in +// YV12 format, passes it through the encoder, and writes the compressed +// frames to disk in IVF format. Other decoder examples build upon this +// one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For encoders, you only have to include `vpx_encoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// vp8. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vacinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Initializing The Codec +// ---------------------- +// The encoder is initialized by the following code. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. The deadline is set to VPX_DL_REALTIME to +// make the example run as quickly as possible. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the VPX_EFLAG_FORCE_KF bit of the +// flags passed to `vpx_codec_control()`. In this example, we force a +// keyframe every frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// VPX_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `VPX_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write a IVF frame header, followed by the raw data. +// +// Cleanup +// ------- +// The `vpx_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exeptions, vpx_codec functions return an enumerated error status, +// with the value `0` indicating success. +// +// Error Resiliency Features +// ------------------------- +// Error resiliency is controlled by the g_error_resilient member of the +// configuration structure. Use the `decode_with_drops` example to decode with +// frames 5-10 dropped. Compare the output for a file encoded with this example +// versus one encoded with the `simple_encoder` example. + +#include +#include +#include + +#include "vpx/vpx_encoder.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + " \n" + "See comments in simple_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps. +int main(int argc, char **argv) { + FILE *infile = NULL; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + int frame_count = 0; + vpx_image_t raw; + vpx_codec_err_t res; + VpxVideoInfo info = { 0, 0, 0, { 0, 0 } }; + VpxVideoWriter *writer = NULL; + const VpxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *keyframe_interval_arg = NULL; + + exec_name = argv[0]; + + if (argc != 9) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile_arg = argv[4]; + outfile_arg = argv[5]; + keyframe_interval_arg = argv[6]; + max_frames = (int)strtol(argv[8], NULL, 0); + + encoder = get_vpx_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = (vpx_codec_er_flags_t)strtoul(argv[7], NULL, 0); + + writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die("Failed to initialize encoder"); + + // Encode frames. + while (vpx_img_read(&raw, infile)) { + int flags = 0; + if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) + flags |= VPX_EFLAG_FORCE_KF; + encode_frame(&codec, &raw, frame_count++, flags, writer); + frames_encoded++; + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + vpx_img_free(&raw); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/svc_context.h b/media/libvpx/libvpx/examples/svc_context.h new file mode 100644 index 0000000000..c5779ce8a9 --- /dev/null +++ b/media/libvpx/libvpx/examples/svc_context.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/** + * SvcContext - input parameters and state to encode a multi-layered + * spatial SVC frame + */ + +#ifndef VPX_EXAMPLES_SVC_CONTEXT_H_ +#define VPX_EXAMPLES_SVC_CONTEXT_H_ + +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum SVC_LOG_LEVEL { + SVC_LOG_ERROR, + SVC_LOG_INFO, + SVC_LOG_DEBUG +} SVC_LOG_LEVEL; + +typedef struct { + // public interface to svc_command options + int spatial_layers; // number of spatial layers + int temporal_layers; // number of temporal layers + int temporal_layering_mode; + SVC_LOG_LEVEL log_level; // amount of information to display + int output_rc_stat; // for outputting rc stats + int speed; // speed setting for codec + int threads; + int aqmode; // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on. + // private storage for vpx_svc_encode + void *internal; +} SvcContext; + +#define OPTION_BUFFER_SIZE 1024 +#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v + +typedef struct SvcInternal { + char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options + + // values extracted from option, quantizers + vpx_svc_extra_cfg_t svc_params; + int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; + + // accumulated statistics + double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V + uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; + uint32_t bytes_sum[VPX_SS_MAX_LAYERS]; + + // codec encoding values + int width; // width of highest layer + int height; // height of highest layer + int kf_dist; // distance between keyframes + + // state variables + int psnr_pkt_received; + int layer; + int use_multiple_frame_contexts; + + vpx_codec_ctx_t *codec_ctx; +} SvcInternal_t; + +/** + * Set SVC options + * options are supplied as a single string separated by spaces + * Format: encoding-mode= + * layers= + * scaling-factors=/,/,... + * quantizers=,,... + */ +vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options); + +/** + * initialize SVC encoding + */ +vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg); +/** + * encode a frame of video with multiple layers + */ +vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, + struct vpx_image *rawimg, vpx_codec_pts_t pts, + int64_t duration, int deadline); + +/** + * finished with svc encoding, release allocated resources + */ +void vpx_svc_release(SvcContext *svc_ctx); + +/** + * dump accumulated statistics and reset accumulated values + */ +void vpx_svc_dump_statistics(SvcContext *svc_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_EXAMPLES_SVC_CONTEXT_H_ diff --git a/media/libvpx/libvpx/examples/svc_encodeframe.c b/media/libvpx/libvpx/examples/svc_encodeframe.c new file mode 100644 index 0000000000..1dd731765c --- /dev/null +++ b/media/libvpx/libvpx/examples/svc_encodeframe.c @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/** + * @file + * VP9 SVC encoding support via libvpx + */ + +#include +#include +#include +#include +#include +#include +#include +#define VPX_DISABLE_CTRL_TYPECHECKS 1 +#include "../tools_common.h" +#include "./vpx_config.h" +#include "./svc_context.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __MINGW32__ +#define strtok_r strtok_s +#ifndef MINGW_HAS_SECURE_API +// proto from /usr/x86_64-w64-mingw32/include/sec_api/string_s.h +_CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context); +#endif /* MINGW_HAS_SECURE_API */ +#endif /* __MINGW32__ */ + +#ifdef _MSC_VER +#define strdup _strdup +#define strtok_r strtok_s +#endif + +#define SVC_REFERENCE_FRAMES 8 +#define SUPERFRAME_SLOTS (8) +#define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2) + +#define MAX_QUANTIZER 63 + +static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, + 16 }; + +static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, + 16, 16 }; + +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + +typedef enum { + QUANTIZER = 0, + BITRATE, + SCALE_FACTOR, + AUTO_ALT_REF, + ALL_OPTION_TYPES +} LAYER_OPTION_TYPE; + +static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX, + 1 }; + +static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 }; + +// One encoded frame +typedef struct FrameData { + void *buf; // compressed data buffer + size_t size; // length of compressed data + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + struct FrameData *next; +} FrameData; + +static SvcInternal_t *get_svc_internal(SvcContext *svc_ctx) { + if (svc_ctx == NULL) return NULL; + if (svc_ctx->internal == NULL) { + SvcInternal_t *const si = (SvcInternal_t *)malloc(sizeof(*si)); + if (si != NULL) { + memset(si, 0, sizeof(*si)); + } + svc_ctx->internal = si; + } + return (SvcInternal_t *)svc_ctx->internal; +} + +static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { + if (svc_ctx == NULL) return NULL; + return (const SvcInternal_t *)svc_ctx->internal; +} + +static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx, + SVC_LOG_LEVEL level, + const char *fmt, ...) { + char buf[512]; + int retval = 0; + va_list ap; + + if (level > svc_ctx->log_level) { + return retval; + } + + va_start(ap, fmt); + retval = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + printf("%s", buf); + + return retval; +} + +static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, + int *value0, int *value1) { + if (type == SCALE_FACTOR) { + *value0 = (int)strtol(input, &input, 10); + if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; + *value1 = (int)strtol(input, &input, 10); + + if (*value0 < option_min_values[SCALE_FACTOR] || + *value1 < option_min_values[SCALE_FACTOR] || + *value0 > option_max_values[SCALE_FACTOR] || + *value1 > option_max_values[SCALE_FACTOR] || + *value0 > *value1) // num shouldn't be greater than den + return VPX_CODEC_INVALID_PARAM; + } else { + *value0 = atoi(input); + if (*value0 < option_min_values[type] || *value0 > option_max_values[type]) + return VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, + LAYER_OPTION_TYPE type, + const char *input, + int *option0, + int *option1) { + int i; + vpx_codec_err_t res = VPX_CODEC_OK; + char *input_string; + char *token; + const char *delim = ","; + char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; + + if (input == NULL || option0 == NULL || + (option1 == NULL && type == SCALE_FACTOR)) + return VPX_CODEC_INVALID_PARAM; + + input_string = strdup(input); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; + token = strtok_r(input_string, delim, &save_ptr); + for (i = 0; i < num_layers; ++i) { + if (token != NULL) { + res = extract_option(type, token, option0 + i, option1 + i); + if (res != VPX_CODEC_OK) break; + token = strtok_r(NULL, delim, &save_ptr); + } else { + break; + } + } + if (res == VPX_CODEC_OK && i != num_layers) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "svc: layer params type: %d %d values required, " + "but only %d specified\n", + type, num_layers, i); + res = VPX_CODEC_INVALID_PARAM; + } + free(input_string); + return res; +} + +/** + * Parse SVC encoding options + * Format: encoding-mode=,layers= + * scale-factors=/,/,... + * quantizers=,,... + * svc_mode = [i|ip|alt_ip|gf] + */ +static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { + char *input_string; + char *option_name; + char *option_value; + char *input_ptr = NULL; + SvcInternal_t *const si = get_svc_internal(svc_ctx); + vpx_codec_err_t res = VPX_CODEC_OK; + int i, alt_ref_enabled = 0; + + if (options == NULL) return VPX_CODEC_OK; + input_string = strdup(options); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; + + // parse option name + option_name = strtok_r(input_string, "=", &input_ptr); + while (option_name != NULL) { + // parse option value + option_value = strtok_r(NULL, " ", &input_ptr); + if (option_value == NULL) { + svc_log(svc_ctx, SVC_LOG_ERROR, "option missing value: %s\n", + option_name); + res = VPX_CODEC_INVALID_PARAM; + break; + } + if (strcmp("spatial-layers", option_name) == 0) { + svc_ctx->spatial_layers = atoi(option_value); + } else if (strcmp("temporal-layers", option_name) == 0) { + svc_ctx->temporal_layers = atoi(option_value); + } else if (strcmp("scale-factors", option_name) == 0) { + res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR, option_value, + si->svc_params.scaling_factor_num, + si->svc_params.scaling_factor_den); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("max-quantizers", option_name) == 0) { + res = + parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value, + si->svc_params.max_quantizers, NULL); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("min-quantizers", option_name) == 0) { + res = + parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value, + si->svc_params.min_quantizers, NULL); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("auto-alt-refs", option_name) == 0) { + res = parse_layer_options_from_string(svc_ctx, AUTO_ALT_REF, option_value, + si->enable_auto_alt_ref, NULL); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("bitrates", option_name) == 0) { + res = parse_layer_options_from_string(svc_ctx, BITRATE, option_value, + si->bitrates, NULL); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("multi-frame-contexts", option_name) == 0) { + si->use_multiple_frame_contexts = atoi(option_value); + } else { + svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name); + res = VPX_CODEC_INVALID_PARAM; + break; + } + option_name = strtok_r(NULL, "=", &input_ptr); + } + free(input_string); + + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + if (si->svc_params.max_quantizers[i] > MAX_QUANTIZER || + si->svc_params.max_quantizers[i] < 0 || + si->svc_params.min_quantizers[i] > si->svc_params.max_quantizers[i] || + si->svc_params.min_quantizers[i] < 0) + res = VPX_CODEC_INVALID_PARAM; + } + + if (si->use_multiple_frame_contexts && + (svc_ctx->spatial_layers > 3 || + svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4)) + res = VPX_CODEC_INVALID_PARAM; + + for (i = 0; i < svc_ctx->spatial_layers; ++i) + alt_ref_enabled += si->enable_auto_alt_ref[i]; + if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could" + "enabled auto alt reference frame, but %d layers are enabled\n", + REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled); + res = VPX_CODEC_INVALID_PARAM; + } + + return res; +} + +vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { + SvcInternal_t *const si = get_svc_internal(svc_ctx); + if (svc_ctx == NULL || options == NULL || si == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + strncpy(si->options, options, sizeof(si->options) - 1); + si->options[sizeof(si->options) - 1] = '\0'; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t assign_layer_bitrates( + const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) { + int i; + const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); + int sl, tl, spatial_layer_target; + + if (svc_ctx->temporal_layering_mode != 0) { + if (si->bitrates[0] != 0) { + unsigned int total_bitrate = 0; + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += + (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = + si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; + } + } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; + } else { + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + if (si->svc_params.scaling_factor_den[sl] > 0) { + alloc_ratio[sl] = (float)(pow(2, sl)); + total += alloc_ratio[sl]; + } + } + + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + enc_cfg->ss_target_bitrate[sl] = spatial_layer_target = + (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / + total); + if (svc_ctx->temporal_layering_mode == 3) { + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] = + (spatial_layer_target * 6) / 10; // 60% + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] = + (spatial_layer_target * 8) / 10; // 80% + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] = + spatial_layer_target; + } else if (svc_ctx->temporal_layering_mode == 2 || + svc_ctx->temporal_layering_mode == 1) { + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] = + spatial_layer_target * 2 / 3; + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] = + spatial_layer_target; + } else { + // User should explicitly assign bitrates in this case. + assert(0); + } + } + } + } else { + if (si->bitrates[0] != 0) { + unsigned int total_bitrate = 0; + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; + enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; + } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; + } else { + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + if (si->svc_params.scaling_factor_den[i] > 0) { + alloc_ratio[i] = (float)(si->svc_params.scaling_factor_num[i] * 1.0 / + si->svc_params.scaling_factor_den[i]); + + alloc_ratio[i] *= alloc_ratio[i]; + total += alloc_ratio[i]; + } + } + for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + if (total > 0) { + enc_cfg->layer_target_bitrate[i] = + (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[i] / + total); + } + } + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *enc_cfg) { + vpx_codec_err_t res; + int sl, tl; + SvcInternal_t *const si = get_svc_internal(svc_ctx); + if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL || + enc_cfg == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (si == NULL) return VPX_CODEC_MEM_ERROR; + + si->codec_ctx = codec_ctx; + + si->width = enc_cfg->g_w; + si->height = enc_cfg->g_h; + + si->kf_dist = enc_cfg->kf_max_dist; + + if (svc_ctx->spatial_layers == 0) + svc_ctx->spatial_layers = VPX_SS_DEFAULT_LAYERS; + if (svc_ctx->spatial_layers < 1 || + svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) { + svc_log(svc_ctx, SVC_LOG_ERROR, "spatial layers: invalid value: %d\n", + svc_ctx->spatial_layers); + return VPX_CODEC_INVALID_PARAM; + } + + // Note: temporal_layering_mode only applies to one-pass CBR + // si->svc_params.temporal_layering_mode = svc_ctx->temporal_layering_mode; + if (svc_ctx->temporal_layering_mode == 3) { + svc_ctx->temporal_layers = 3; + } else if (svc_ctx->temporal_layering_mode == 2 || + svc_ctx->temporal_layering_mode == 1) { + svc_ctx->temporal_layers = 2; + } + + for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) { + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; + si->svc_params.speed_per_layer[sl] = svc_ctx->speed; + } + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + if (svc_ctx->spatial_layers == 1) { + si->svc_params.scaling_factor_num[0] = 1; + si->svc_params.scaling_factor_den[0] = 1; + } + } + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + const int i = sl * svc_ctx->temporal_layers + tl; + si->svc_params.max_quantizers[i] = MAX_QUANTIZER; + si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } + } + } + + // Parse aggregate command line options. Options must start with + // "layers=xx" then followed by other options + res = parse_options(svc_ctx, si->options); + if (res != VPX_CODEC_OK) return res; + + if (svc_ctx->spatial_layers < 1) svc_ctx->spatial_layers = 1; + if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) + svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS; + + if (svc_ctx->temporal_layers < 1) svc_ctx->temporal_layers = 1; + if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS) + svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; + + if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) { + svc_log( + svc_ctx, SVC_LOG_ERROR, + "spatial layers * temporal layers (%d) exceeds the maximum number of " + "allowed layers of %d\n", + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); + return VPX_CODEC_INVALID_PARAM; + } + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } + + if (svc_ctx->temporal_layers > 1) { + int i; + for (i = 0; i < svc_ctx->temporal_layers; ++i) { + enc_cfg->ts_target_bitrate[i] = + enc_cfg->rc_target_bitrate / svc_ctx->temporal_layers; + enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i); + } + } + + if (svc_ctx->threads) enc_cfg->g_threads = svc_ctx->threads; + + // Modify encoder configuration + enc_cfg->ss_number_layers = svc_ctx->spatial_layers; + enc_cfg->ts_number_layers = svc_ctx->temporal_layers; + + if (enc_cfg->rc_end_usage == VPX_CBR) { + enc_cfg->rc_resize_allowed = 0; + enc_cfg->rc_min_quantizer = 2; + enc_cfg->rc_max_quantizer = 56; + enc_cfg->rc_undershoot_pct = 50; + enc_cfg->rc_overshoot_pct = 50; + enc_cfg->rc_buf_initial_sz = 500; + enc_cfg->rc_buf_optimal_sz = 600; + enc_cfg->rc_buf_sz = 1000; + } + + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + const int i = sl * svc_ctx->temporal_layers + tl; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer; + si->svc_params.min_quantizers[i] = enc_cfg->rc_min_quantizer; + } + } + } + + if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) + enc_cfg->g_error_resilient = 1; + + // Initialize codec + res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n"); + return res; + } + if (svc_ctx->spatial_layers > 1 || svc_ctx->temporal_layers > 1) { + vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1); + vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &si->svc_params); + } + return VPX_CODEC_OK; +} + +/** + * Encode a frame into multiple layers + * Create a superframe containing the individual layers + */ +vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, + struct vpx_image *rawimg, vpx_codec_pts_t pts, + int64_t duration, int deadline) { + vpx_codec_err_t res; + vpx_codec_iter_t iter; + const vpx_codec_cx_pkt_t *cx_pkt; + SvcInternal_t *const si = get_svc_internal(svc_ctx); + if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + + res = + vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, deadline); + if (res != VPX_CODEC_OK) { + return res; + } + // save compressed data + iter = NULL; + while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { + switch (cx_pkt->kind) { + case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break; + default: break; + } + } + + return VPX_CODEC_OK; +} + +static double calc_psnr(double d) { + if (d == 0) return 100; + return -10.0 * log(d) / log(10.0); +} + +// dump accumulated statistics and reset accumulated values +void vpx_svc_dump_statistics(SvcContext *svc_ctx) { + int number_of_frames; + int i, j; + uint32_t bytes_total = 0; + double scale[COMPONENTS]; + double psnr[COMPONENTS]; + double mse[COMPONENTS]; + double y_scale; + + SvcInternal_t *const si = get_svc_internal(svc_ctx); + if (svc_ctx == NULL || si == NULL) return; + + number_of_frames = si->psnr_pkt_received; + if (number_of_frames <= 0) return; + + svc_log(svc_ctx, SVC_LOG_INFO, "\n"); + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n", + i, si->psnr_sum[i][0] / number_of_frames, + si->psnr_sum[i][1] / number_of_frames, + si->psnr_sum[i][2] / number_of_frames, + si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); + // the following psnr calculation is deduced from ffmpeg.c#print_report + y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames; + scale[1] = y_scale; + scale[2] = scale[3] = y_scale / 4; // U or V + scale[0] = y_scale * 1.5; // total + + for (j = 0; j < COMPONENTS; j++) { + psnr[j] = calc_psnr(si->sse_sum[i][j] / scale[j]); + mse[j] = si->sse_sum[i][j] * 255.0 * 255.0 / scale[j]; + } + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Overall PSNR=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, psnr[0], + psnr[1], psnr[2], psnr[3]); + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Overall MSE=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, mse[0], + mse[1], mse[2], mse[3]); + + bytes_total += si->bytes_sum[i]; + // Clear sums for next time. + si->bytes_sum[i] = 0; + for (j = 0; j < COMPONENTS; ++j) { + si->psnr_sum[i][j] = 0; + si->sse_sum[i][j] = 0; + } + } + + // only display statistics once + si->psnr_pkt_received = 0; + + svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total); +} + +void vpx_svc_release(SvcContext *svc_ctx) { + SvcInternal_t *si; + if (svc_ctx == NULL) return; + // do not use get_svc_internal as it will unnecessarily allocate an + // SvcInternal_t if it was not already allocated + si = (SvcInternal_t *)svc_ctx->internal; + if (si != NULL) { + free(si); + svc_ctx->internal = NULL; + } +} diff --git a/media/libvpx/libvpx/examples/twopass_encoder.c b/media/libvpx/libvpx/examples/twopass_encoder.c new file mode 100644 index 0000000000..07a10d9cf3 --- /dev/null +++ b/media/libvpx/libvpx/examples/twopass_encoder.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Two Pass Encoder +// ================ +// +// This is an example of a two pass encoder loop. It takes an input file in +// YV12 format, passes it through the encoder twice, and writes the compressed +// frames to disk in IVF format. It builds upon the simple_encoder example. +// +// Twopass Variables +// ----------------- +// Twopass mode needs to track the current pass number and the buffer of +// statistics packets. +// +// Updating The Configuration +// --------------------------------- +// In two pass mode, the configuration has to be updated on each pass. The +// statistics buffer is passed on the last pass. +// +// Encoding A Frame +// ---------------- +// Encoding a frame in two pass mode is identical to the simple encoder +// example. To increase the quality while sacrificing encoding speed, +// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY. +// +// Processing Statistics Packets +// ----------------------------- +// Each packet of type `VPX_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write a IVF frame header, followed by the raw data. +// +// +// Pass Progress Reporting +// ----------------------------- +// It's sometimes helpful to see when each pass completes. +// +// +// Clean-up +// ----------------------------- +// Destruction of the encoder instance must be done on each pass. The +// raw image should be destroyed at the end as usual. + +#include +#include +#include + +#include "vpx/vpx_encoder.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + "\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, + vpx_fixed_buf_t *stats) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); + + while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_STATS_PKT) { + const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; + const size_t pkt_size = pkt->data.twopass_stats.sz; + stats->buf = realloc(stats->buf, stats->sz + pkt_size); + if (!stats->buf) die("Failed to reallocate stats buffer."); + memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); + stats->sz += pkt_size; + } + } + + return got_pkts; +} + +static int encode_frame(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, + VpxVideoWriter *writer) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to encode frame."); + + while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) + die_codec(ctx, "Failed to write compressed frame."); + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile, + const VpxInterface *encoder, + const vpx_codec_enc_cfg_t *cfg, int max_frames) { + vpx_codec_ctx_t codec; + int frame_count = 0; + vpx_fixed_buf_t stats = { NULL, 0 }; + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) + die("Failed to initialize encoder"); + + // Calculate frame statistics. + while (vpx_img_read(raw, infile)) { + ++frame_count; + get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, + &stats); + if (max_frames > 0 && frame_count >= max_frames) break; + } + + // Flush encoder. + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, + &stats)) { + } + + printf("Pass 0 complete. Processed %d frames.\n", frame_count); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + return stats; +} + +static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name, + const VpxInterface *encoder, const vpx_codec_enc_cfg_t *cfg, + int max_frames) { + VpxVideoInfo info = { encoder->fourcc, + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den } }; + VpxVideoWriter *writer = NULL; + vpx_codec_ctx_t codec; + int frame_count = 0; + + writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing", outfile_name); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) + die("Failed to initialize encoder"); + + // Encode frames. + while (vpx_img_read(raw, infile)) { + ++frame_count; + encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer); + + if (max_frames > 0 && frame_count >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) { + } + + printf("\n"); + + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + printf("Pass 1 complete. Processed %d frames.\n", frame_count); +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + int w, h; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + vpx_image_t raw; + vpx_codec_err_t res; + vpx_fixed_buf_t stats; + + const VpxInterface *encoder = NULL; + const int fps = 30; // TODO(dkovalev) add command line argument + const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument + const char *const codec_arg = argv[1]; + const char *const width_arg = argv[2]; + const char *const height_arg = argv[3]; + const char *const infile_arg = argv[4]; + const char *const outfile_arg = argv[5]; + int max_frames = 0; + exec_name = argv[0]; + + if (argc != 7) die("Invalid number of arguments."); + + max_frames = (int)strtol(argv[6], NULL, 0); + + encoder = get_vpx_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); + + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + die("Invalid frame size: %dx%d", w, h); + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) + die("Failed to allocate image (%dx%d)", w, h); + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + // Configuration + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = w; + cfg.g_h = h; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = fps; + cfg.rc_target_bitrate = bitrate; + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading", infile_arg); + + // Pass 0 + cfg.g_pass = VPX_RC_FIRST_PASS; + stats = pass0(&raw, infile, encoder, &cfg, max_frames); + + // Pass 1 + rewind(infile); + cfg.g_pass = VPX_RC_LAST_PASS; + cfg.rc_twopass_stats_in = stats; + pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames); + free(stats.buf); + + vpx_img_free(&raw); + fclose(infile); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c new file mode 100644 index 0000000000..62d96de557 --- /dev/null +++ b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating multi-resolution encoding in VP8. + * High-resolution input video is down-sampled to lower-resolutions. The + * encoder then encodes the video and outputs multiple bitstreams with + * different resolutions. + * + * This test also allows for settings temporal layers for each spatial layer. + * Different number of temporal layers per spatial stream may be used. + * Currently up to 3 temporal layers per spatial stream (encoder) are supported + * in this test. + */ + +#include "./vpx_config.h" + +#include +#include +#include +#include +#include +#include +#include +#include "vpx_ports/vpx_timer.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx_ports/mem_ops.h" +#include "../tools_common.h" +#define interface (vpx_codec_vp8_cx()) +#define fourcc 0x30385056 + +void usage_exit(void) { exit(EXIT_FAILURE); } + +/* + * The input video frame is downsampled several times to generate a multi-level + * hierarchical structure. NUM_ENCODERS is defined as the number of encoding + * levels required. For example, if the size of input video is 1280x720, + * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 + * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and + * 320x180(level 2) respectively. + */ + +/* Number of encoders (spatial resolutions) used in this test. */ +#define NUM_ENCODERS 3 + +/* Maximum number of temporal layers allowed for this test. */ +#define MAX_NUM_TEMPORAL_LAYERS 3 + +/* This example uses the scaler function in libyuv. */ +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/scale.h" +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +int (*read_frame_p)(FILE *f, vpx_image_t *img); + +static int mulres_read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w * img->h * 3 / 2; + nbytes = fread(img->planes[0], 1, to_read, f); + if (nbytes != to_read) { + res = 0; + if (nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + } + return res; +} + +static int mulres_read_frame_by_row(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + int plane; + + for (plane = 0; plane < 3; plane++) { + unsigned char *ptr; + int w = (plane ? (1 + img->d_w) / 2 : img->d_w); + int h = (plane ? (1 + img->d_h) / 2 : img->d_h); + int r; + + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) { + case 1: + ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V + : VPX_PLANE_U]; + break; + case 2: + ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U + : VPX_PLANE_V]; + break; + default: ptr = img->planes[plane]; + } + + for (r = 0; r < h; r++) { + to_read = w; + + nbytes = fread(ptr, 1, to_read, f); + if (nbytes != to_read) { + res = 0; + if (nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + break; + } + + ptr += img->stride[plane]; + } + if (!res) break; + } + + return res; +} + +static void write_ivf_file_header(FILE *outfile, const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, fourcc); /* headersize */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) { + char header[12]; + vpx_codec_pts_t pts; + + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, (int)pkt->data.frame.sz); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} + +/* Temporal scaling parameters */ +/* This sets all the temporal layer parameters given |num_temporal_layers|, + * including the target bit allocation across temporal layers. Bit allocation + * parameters will be passed in as user parameters in another version. + */ +static void set_temporal_layer_pattern(int num_temporal_layers, + vpx_codec_enc_cfg_t *cfg, int bitrate, + int *layer_flags) { + assert(num_temporal_layers <= MAX_NUM_TEMPORAL_LAYERS); + switch (num_temporal_layers) { + case 1: { + /* 1-layer */ + cfg->ts_number_layers = 1; + cfg->ts_periodicity = 1; + cfg->ts_rate_decimator[0] = 1; + cfg->ts_layer_id[0] = 0; + cfg->ts_target_bitrate[0] = bitrate; + + // Update L only. + layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + break; + } + + case 2: { + /* 2-layers, with sync point at first frame of layer 1. */ + cfg->ts_number_layers = 2; + cfg->ts_periodicity = 2; + cfg->ts_rate_decimator[0] = 2; + cfg->ts_rate_decimator[1] = 1; + cfg->ts_layer_id[0] = 0; + cfg->ts_layer_id[1] = 1; + // Use 60/40 bit allocation as example. + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); + cfg->ts_target_bitrate[1] = bitrate; + + /* 0=L, 1=GF */ + // ARF is used as predictor for all frames, and is only updated on + // key frame. Sync point every 8 frames. + + // Layer 0: predict from L and ARF, update L and G. + layer_flags[0] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF; + + // Layer 1: sync point: predict from L and ARF, and update G. + layer_flags[1] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + + // Layer 0, predict from L and ARF, update L. + layer_flags[2] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + + // Layer 1: predict from L, G and ARF, and update G. + layer_flags[3] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + + // Layer 0 + layer_flags[4] = layer_flags[2]; + + // Layer 1 + layer_flags[5] = layer_flags[3]; + + // Layer 0 + layer_flags[6] = layer_flags[4]; + + // Layer 1 + layer_flags[7] = layer_flags[5]; + break; + } + + case 3: + default: { + // 3-layers structure where ARF is used as predictor for all frames, + // and is only updated on key frame. + // Sync points for layer 1 and 2 every 8 frames. + cfg->ts_number_layers = 3; + cfg->ts_periodicity = 4; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + cfg->ts_layer_id[0] = 0; + cfg->ts_layer_id[1] = 2; + cfg->ts_layer_id[2] = 1; + cfg->ts_layer_id[3] = 2; + // Use 45/20/35 bit allocation as example. + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); + cfg->ts_target_bitrate[2] = bitrate; + + /* 0=L, 1=GF, 2=ARF */ + + // Layer 0: predict from L and ARF; update L and G. + layer_flags[0] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + + // Layer 2: sync point: predict from L and ARF; update none. + layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + + // Layer 1: sync point: predict from L and ARF; update G. + layer_flags[2] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + + // Layer 2: predict from L, G, ARF; update none. + layer_flags[3] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY; + + // Layer 0: predict from L and ARF; update L. + layer_flags[4] = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + + // Layer 2: predict from L, G, ARF; update none. + layer_flags[5] = layer_flags[3]; + + // Layer 1: predict from L, G, ARF; update G. + layer_flags[6] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + + // Layer 2: predict from L, G, ARF; update none. + layer_flags[7] = layer_flags[3]; + break; + } + } +} + +/* The periodicity of the pattern given the number of temporal layers. */ +static int periodicity_to_num_layers[MAX_NUM_TEMPORAL_LAYERS] = { 1, 8, 8 }; + +int main(int argc, char **argv) { + FILE *infile, *outfile[NUM_ENCODERS]; + FILE *downsampled_input[NUM_ENCODERS - 1]; + char filename[50]; + vpx_codec_ctx_t codec[NUM_ENCODERS]; + vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; + int frame_cnt = 0; + vpx_image_t raw[NUM_ENCODERS]; + vpx_codec_err_t res[NUM_ENCODERS]; + + int i; + int width; + int height; + int length_frame; + int frame_avail; + int got_data; + int flags = 0; + int layer_id = 0; + + int layer_flags[VPX_TS_MAX_PERIODICITY * NUM_ENCODERS] = { 0 }; + int flag_periodicity; + + /*Currently, only realtime mode is supported in multi-resolution encoding.*/ + int arg_deadline = VPX_DL_REALTIME; + + /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + don't need to know PSNR, which will skip PSNR calculation and save + encoding time. */ + int show_psnr = 0; + int key_frame_insert = 0; + uint64_t psnr_sse_total[NUM_ENCODERS] = { 0 }; + uint64_t psnr_samples_total[NUM_ENCODERS] = { 0 }; + double psnr_totals[NUM_ENCODERS][4] = { { 0, 0 } }; + int psnr_count[NUM_ENCODERS] = { 0 }; + + int64_t cx_time = 0; + + /* Set the required target bitrates for each resolution level. + * If target bitrate for highest-resolution level is set to 0, + * (i.e. target_bitrate[0]=0), we skip encoding at that level. + */ + unsigned int target_bitrate[NUM_ENCODERS] = { 1000, 500, 100 }; + + /* Enter the frame rate of the input video */ + int framerate = 30; + + /* Set down-sampling factor for each resolution level. + dsf[0] controls down sampling from level 0 to level 1; + dsf[1] controls down sampling from level 1 to level 2; + dsf[2] is not used. */ + vpx_rational_t dsf[NUM_ENCODERS] = { { 2, 1 }, { 2, 1 }, { 1, 1 } }; + + /* Set the number of temporal layers for each encoder/resolution level, + * starting from highest resoln down to lowest resoln. */ + unsigned int num_temporal_layers[NUM_ENCODERS] = { 3, 3, 3 }; + + if (argc != (7 + 3 * NUM_ENCODERS)) + die("Usage: %s " + " \n", + argv[0]); + + printf("Using %s\n", vpx_codec_iface_name(interface)); + + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); + + if (width < 16 || width % 2 || height < 16 || height % 2) + die("Invalid resolution: %dx%d", width, height); + + /* Open input video file for encoding */ + if (!(infile = fopen(argv[4], "rb"))) + die("Failed to open %s for reading", argv[4]); + + /* Open output file for each encoder to output bitstreams */ + for (i = 0; i < NUM_ENCODERS; i++) { + if (!target_bitrate[i]) { + outfile[i] = NULL; + continue; + } + + if (!(outfile[i] = fopen(argv[i + 5], "wb"))) + die("Failed to open %s for writing", argv[i + 4]); + } + + // Bitrates per spatial layer: overwrite default rates above. + for (i = 0; i < NUM_ENCODERS; i++) { + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + } + + // Temporal layers per spatial layers: overwrite default settings above. + for (i = 0; i < NUM_ENCODERS; i++) { + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) + die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", + num_temporal_layers[i]); + } + + /* Open file to write out each spatially downsampled input stream. */ + for (i = 0; i < NUM_ENCODERS - 1; i++) { + // Highest resoln is encoder 0. + if (sprintf(filename, "ds%d.yuv", NUM_ENCODERS - i) < 0) { + return EXIT_FAILURE; + } + downsampled_input[i] = fopen(filename, "wb"); + } + + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + + /* Populate default encoder configuration */ + for (i = 0; i < NUM_ENCODERS; i++) { + res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); + if (res[i]) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); + return EXIT_FAILURE; + } + } + + /* + * Update the default configuration according to needs of the application. + */ + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + /* Disable automatic keyframe placement */ + /* Note: These 3 settings are copied to all levels. But, except the lowest + * resolution level, all other levels are set to VPX_KF_DISABLED internally. + */ + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + /* Other-resolution encoder settings */ + for (i = 1; i < NUM_ENCODERS; i++) { + memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + + cfg[i].rc_target_bitrate = target_bitrate[i]; + + /* Note: Width & height of other-resolution encoders are calculated + * from the highest-resolution encoder's size and the corresponding + * down_sampling_factor. + */ + { + unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1; + unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1; + cfg[i].g_w = iw / dsf[i - 1].num; + cfg[i].g_h = ih / dsf[i - 1].num; + } + + /* Make width & height to be multiplier of 2. */ + // Should support odd size ??? + if ((cfg[i].g_w) % 2) cfg[i].g_w++; + if ((cfg[i].g_h) % 2) cfg[i].g_h++; + } + + // Set the number of threads per encode/spatial layer. + // (1, 1, 1) means no encoder threading. + cfg[0].g_threads = 1; + cfg[1].g_threads = 1; + cfg[2].g_threads = 1; + + /* Allocate image for each encoder */ + for (i = 0; i < NUM_ENCODERS; i++) + if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) + die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h); + + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) + read_frame_p = mulres_read_frame; + else + read_frame_p = mulres_read_frame_by_row; + + for (i = 0; i < NUM_ENCODERS; i++) + if (outfile[i]) write_ivf_file_header(outfile[i], &cfg[i], 0); + + /* Temporal layers settings */ + for (i = 0; i < NUM_ENCODERS; i++) { + set_temporal_layer_pattern(num_temporal_layers[i], &cfg[i], + cfg[i].rc_target_bitrate, + &layer_flags[i * VPX_TS_MAX_PERIODICITY]); + } + + /* Initialize multi-encoder */ + if (vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, + (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + /* The extra encoding configuration parameters can be set as follows. */ + /* Set encoding speed */ + for (i = 0; i < NUM_ENCODERS; i++) { + int speed = -6; + /* Lower speed for the lowest resolution. */ + if (i == NUM_ENCODERS - 1) speed = -4; + if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed)) + die_codec(&codec[i], "Failed to set cpu_used"); + } + + /* Set static threshold = 1 for all encoders */ + for (i = 0; i < NUM_ENCODERS; i++) { + if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1)) + die_codec(&codec[i], "Failed to set static threshold"); + } + + /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */ + /* Enable denoising for the highest-resolution encoder. */ + if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1)) + die_codec(&codec[0], "Failed to set noise_sensitivity"); + if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1)) + die_codec(&codec[1], "Failed to set noise_sensitivity"); + for (i = 2; i < NUM_ENCODERS; i++) { + if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0)) + die_codec(&codec[i], "Failed to set noise_sensitivity"); + } + + /* Set the number of token partitions */ + for (i = 0; i < NUM_ENCODERS; i++) { + if (vpx_codec_control(&codec[i], VP8E_SET_TOKEN_PARTITIONS, 1)) + die_codec(&codec[i], "Failed to set static threshold"); + } + + /* Set the max intra target bitrate */ + for (i = 0; i < NUM_ENCODERS; i++) { + unsigned int max_intra_size_pct = + (int)(((double)cfg[0].rc_buf_optimal_sz * 0.5) * framerate / 10); + if (vpx_codec_control(&codec[i], VP8E_SET_MAX_INTRA_BITRATE_PCT, + max_intra_size_pct)) + die_codec(&codec[i], "Failed to set static threshold"); + // printf("%d %d \n",i,max_intra_size_pct); + } + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + struct vpx_usec_timer timer; + vpx_codec_iter_t iter[NUM_ENCODERS] = { NULL }; + const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; + + flags = 0; + frame_avail = read_frame_p(infile, &raw[0]); + + if (frame_avail) { + for (i = 1; i < NUM_ENCODERS; i++) { + /*Scale the image down a number of times by downsampling factor*/ + /* FilterMode 1 or 2 give better psnr than FilterMode 0. */ + I420Scale( + raw[i - 1].planes[VPX_PLANE_Y], raw[i - 1].stride[VPX_PLANE_Y], + raw[i - 1].planes[VPX_PLANE_U], raw[i - 1].stride[VPX_PLANE_U], + raw[i - 1].planes[VPX_PLANE_V], raw[i - 1].stride[VPX_PLANE_V], + raw[i - 1].d_w, raw[i - 1].d_h, raw[i].planes[VPX_PLANE_Y], + raw[i].stride[VPX_PLANE_Y], raw[i].planes[VPX_PLANE_U], + raw[i].stride[VPX_PLANE_U], raw[i].planes[VPX_PLANE_V], + raw[i].stride[VPX_PLANE_V], raw[i].d_w, raw[i].d_h, 1); + /* Write out down-sampled input. */ + length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; + if (fwrite(raw[i].planes[0], 1, length_frame, + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { + return EXIT_FAILURE; + } + } + } + + /* Set the flags (reference and update) for all the encoders.*/ + for (i = 0; i < NUM_ENCODERS; i++) { + layer_id = cfg[i].ts_layer_id[frame_cnt % cfg[i].ts_periodicity]; + flags = 0; + flag_periodicity = periodicity_to_num_layers[num_temporal_layers[i] - 1]; + flags = layer_flags[i * VPX_TS_MAX_PERIODICITY + + frame_cnt % flag_periodicity]; + // Key frame flag for first frame. + if (frame_cnt == 0) { + flags |= VPX_EFLAG_FORCE_KF; + } + if (frame_cnt > 0 && frame_cnt == key_frame_insert) { + flags = VPX_EFLAG_FORCE_KF; + } + + vpx_codec_control(&codec[i], VP8E_SET_FRAME_FLAGS, flags); + vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id); + } + + /* Encode each frame at multi-levels */ + /* Note the flags must be set to 0 in the encode call if they are set + for each frame with the vpx_codec_control(), as done above. */ + vpx_usec_timer_start(&timer); + if (vpx_codec_encode(&codec[0], frame_avail ? &raw[0] : NULL, frame_cnt, 1, + 0, arg_deadline)) { + die_codec(&codec[0], "Failed to encode frame"); + } + vpx_usec_timer_mark(&timer); + cx_time += vpx_usec_timer_elapsed(&timer); + + for (i = NUM_ENCODERS - 1; i >= 0; i--) { + got_data = 0; + while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) { + got_data = 1; + switch (pkt[i]->kind) { + case VPX_CODEC_CX_FRAME_PKT: + write_ivf_frame_header(outfile[i], pkt[i]); + (void)fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz, + outfile[i]); + break; + case VPX_CODEC_PSNR_PKT: + if (show_psnr) { + int j; + + psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; + psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; + for (j = 0; j < 4; j++) { + psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; + } + psnr_count[i]++; + } + + break; + default: break; + } + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", + frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), + 1000000 * (double)frame_cnt / (double)cx_time); + + fclose(infile); + + printf("Processed %ld frames.\n", (long int)frame_cnt - 1); + for (i = 0; i < NUM_ENCODERS; i++) { + /* Calculate PSNR and print it out */ + if ((show_psnr) && (psnr_count[i] > 0)) { + int j; + double ovpsnr = + sse_to_psnr(psnr_samples_total[i], 255.0, psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) { + fprintf(stderr, " %.3lf", psnr_totals[i][j] / psnr_count[i]); + } + } + + if (vpx_codec_destroy(&codec[i])) + die_codec(&codec[i], "Failed to destroy codec"); + + vpx_img_free(&raw[i]); + + if (!outfile[i]) continue; + + /* Try to rewrite the file header with the actual frame count */ + if (!fseek(outfile[i], 0, SEEK_SET)) + write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); + fclose(outfile[i]); + } + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vp8cx_set_ref.c b/media/libvpx/libvpx/examples/vp8cx_set_ref.c new file mode 100644 index 0000000000..ca528f9e90 --- /dev/null +++ b/media/libvpx/libvpx/examples/vp8cx_set_ref.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// VP8 Set Reference Frame +// ======================= +// +// This is an example demonstrating how to overwrite the VP8 encoder's +// internal reference frame. In the sample we set the last frame to the +// current frame. If this is done at a cut scene it will avoid a keyframe. +// This technique could be used to bounce between two cameras. +// +// Note that the decoder would also have to set the reference frame to the +// same value on the same frame, or the video will become corrupt. +// +// Usage +// ----- +// This example adds a single argument to the `simple_encoder` example, +// which specifies the frame number to update the reference frame on. +// The parameter is parsed as follows: +// +// +// Extra Variables +// --------------- +// This example maintains the frame number passed on the command line +// in the `update_frame_num` variable. +// +// +// Configuration +// ------------- +// +// The reference frame is updated on the frame specified on the command +// line. +// +// Observing The Effects +// --------------------- +// Use the `simple_encoder` example to encode a sample with a cut scene. +// Determine the frame number of the cut scene by looking for a generated +// key-frame (indicated by a 'K'). Supply that frame number as an argument +// to this example, and observe that no key-frame is generated. + +#include +#include +#include + +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vp8/common/common.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, VpxVideoWriter *writer) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + int frame_count = 0; + vpx_image_t raw; + vpx_codec_err_t res; + VpxVideoInfo info; + VpxVideoWriter *writer = NULL; + const VpxInterface *encoder = NULL; + int update_frame_num = 0; + const int fps = 30; // TODO(dkovalev) add command line argument + const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument + + vp8_zero(codec); + vp8_zero(cfg); + vp8_zero(info); + + exec_name = argv[0]; + + if (argc != 6) die("Invalid number of arguments"); + + // TODO(dkovalev): add vp9 support and rename the file accordingly + encoder = get_vpx_encoder_by_name("vp8"); + if (!encoder) die("Unsupported codec."); + + update_frame_num = atoi(argv[5]); + if (!update_frame_num) die("Couldn't parse frame number '%s'\n", argv[5]); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + + writer = vpx_video_writer_open(argv[4], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[4]); + + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading.", argv[3]); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die("Failed to initialize encoder"); + + // Encode frames. + while (vpx_img_read(&raw, infile)) { + if (frame_count + 1 == update_frame_num) { + vpx_ref_frame_t ref; + ref.frame_type = VP8_LAST_FRAME; + ref.img = raw; + if (vpx_codec_control(&codec, VP8_SET_REFERENCE, &ref)) + die_codec(&codec, "Failed to set reference frame"); + } + + encode_frame(&codec, &raw, frame_count++, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + vpx_img_free(&raw); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vp9_lossless_encoder.c b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c new file mode 100644 index 0000000000..c4eb3a8b17 --- /dev/null +++ b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vp9/common/vp9_common.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "vp9_lossless_encoder: Example demonstrating VP9 lossless " + "encoding feature. Supports raw input only.\n"); + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + int frame_count = 0; + vpx_image_t raw; + vpx_codec_err_t res; + VpxVideoInfo info; + VpxVideoWriter *writer = NULL; + const VpxInterface *encoder = NULL; + const int fps = 30; + + vp9_zero(info); + + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments"); + + encoder = get_vpx_encoder_by_name("vp9"); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + + writer = vpx_video_writer_open(argv[4], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[4]); + + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading.", argv[3]); + + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die("Failed to initialize encoder"); + + if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1)) + die_codec(&codec, "Failed to use lossless mode"); + + // Encode frames. + while (vpx_img_read(&raw, infile)) { + encode_frame(&codec, &raw, frame_count++, 0, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + vpx_img_free(&raw); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + vpx_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c new file mode 100644 index 0000000000..998e4fb20d --- /dev/null +++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c @@ -0,0 +1,1216 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating how to implement a multi-layer + * VP9 encoding scheme based on spatial scalability for video applications + * that benefit from a scalable bitstream. + */ + +#include +#include +#include +#include +#include + +#include "../args.h" +#include "../tools_common.h" +#include "../video_writer.h" + +#include "../vpx_ports/vpx_timer.h" +#include "./svc_context.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "../vpxstats.h" +#include "vp9/encoder/vp9_encoder.h" +#include "./y4minput.h" + +#define OUTPUT_FRAME_STATS 0 +#define OUTPUT_RC_STATS 1 + +#define SIMULCAST_MODE 0 + +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t skip_frames_arg = + ARG_DEF("s", "skip-frames", 1, "input frames to skip"); +static const arg_def_t frames_arg = + ARG_DEF("f", "frames", 1, "number of frames to encode"); +static const arg_def_t threads_arg = + ARG_DEF("th", "threads", 1, "number of threads to use"); +#if OUTPUT_RC_STATS +static const arg_def_t output_rc_stats_arg = + ARG_DEF("rcstat", "output_rc_stats", 1, "output rc stats"); +#endif +static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width"); +static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height"); +static const arg_def_t timebase_arg = + ARG_DEF("t", "timebase", 1, "timebase (num/den)"); +static const arg_def_t bitrate_arg = ARG_DEF( + "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second"); +static const arg_def_t spatial_layers_arg = + ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers"); +static const arg_def_t temporal_layers_arg = + ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers"); +static const arg_def_t temporal_layering_mode_arg = + ARG_DEF("tlm", "temporal-layering-mode", 1, + "temporal layering scheme." + "VP9E_TEMPORAL_LAYERING_MODE"); +static const arg_def_t kf_dist_arg = + ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); +static const arg_def_t scale_factors_arg = + ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)"); +static const arg_def_t min_q_arg = + ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); +static const arg_def_t max_q_arg = + ARG_DEF(NULL, "max-q", 1, "Maximum quantizer"); +static const arg_def_t min_bitrate_arg = + ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate"); +static const arg_def_t max_bitrate_arg = + ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate"); +static const arg_def_t lag_in_frame_arg = + ARG_DEF(NULL, "lag-in-frames", 1, + "Number of frame to input before " + "generating any outputs"); +static const arg_def_t rc_end_usage_arg = + ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q"); +static const arg_def_t speed_arg = + ARG_DEF("sp", "speed", 1, "speed configuration"); +static const arg_def_t aqmode_arg = + ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); +static const arg_def_t bitrates_arg = + ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); +static const arg_def_t dropframe_thresh_arg = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const struct arg_enum_list tune_content_enum[] = { + { "default", VP9E_CONTENT_DEFAULT }, + { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, + { NULL, 0 } +}; + +static const arg_def_t tune_content_arg = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); +static const arg_def_t inter_layer_pred_arg = ARG_DEF( + NULL, "inter-layer-pred", 1, "0 - 3: On, Off, Key-frames, Constrained"); + +#if CONFIG_VP9_HIGHBITDEPTH +static const struct arg_enum_list bitdepth_enum[] = { + { "8", VPX_BITS_8 }, { "10", VPX_BITS_10 }, { "12", VPX_BITS_12 }, { NULL, 0 } +}; + +static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( + "d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ", bitdepth_enum); +#endif // CONFIG_VP9_HIGHBITDEPTH + +static const arg_def_t *svc_args[] = { &frames_arg, + &outputfile, + &width_arg, + &height_arg, + &timebase_arg, + &bitrate_arg, + &skip_frames_arg, + &spatial_layers_arg, + &kf_dist_arg, + &scale_factors_arg, + &min_q_arg, + &max_q_arg, + &min_bitrate_arg, + &max_bitrate_arg, + &temporal_layers_arg, + &temporal_layering_mode_arg, + &lag_in_frame_arg, + &threads_arg, + &aqmode_arg, +#if OUTPUT_RC_STATS + &output_rc_stats_arg, +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + &bitdepth_arg, +#endif + &speed_arg, + &rc_end_usage_arg, + &bitrates_arg, + &dropframe_thresh_arg, + &tune_content_arg, + &inter_layer_pred_arg, + NULL }; + +static const uint32_t default_frames_to_skip = 0; +static const uint32_t default_frames_to_code = 60 * 60; +static const uint32_t default_width = 1920; +static const uint32_t default_height = 1080; +static const uint32_t default_timebase_num = 1; +static const uint32_t default_timebase_den = 60; +static const uint32_t default_bitrate = 1000; +static const uint32_t default_spatial_layers = 5; +static const uint32_t default_temporal_layers = 1; +static const uint32_t default_kf_dist = 100; +static const uint32_t default_temporal_layering_mode = 0; +static const uint32_t default_output_rc_stats = 0; +static const int32_t default_speed = -1; // -1 means use library default. +static const uint32_t default_threads = 0; // zero means use library default. + +typedef struct { + const char *output_filename; + uint32_t frames_to_code; + uint32_t frames_to_skip; + struct VpxInputContext input_ctx; + stats_io_t rc_stats; + int tune_content; + int inter_layer_pred; +} AppInput; + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s input_filename -o output_filename\n", + exec_name); + fprintf(stderr, "Options:\n"); + arg_show_usage(stderr, svc_args); + exit(EXIT_FAILURE); +} + +static void parse_command_line(int argc, const char **argv_, + AppInput *app_input, SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *enc_cfg) { + struct arg arg; + char **argv = NULL; + char **argi = NULL; + char **argj = NULL; + vpx_codec_err_t res; + unsigned int min_bitrate = 0; + unsigned int max_bitrate = 0; + char string_options[1024] = { 0 }; + + // initialize SvcContext with parameters that will be passed to vpx_svc_init + svc_ctx->log_level = SVC_LOG_DEBUG; + svc_ctx->spatial_layers = default_spatial_layers; + svc_ctx->temporal_layers = default_temporal_layers; + svc_ctx->temporal_layering_mode = default_temporal_layering_mode; +#if OUTPUT_RC_STATS + svc_ctx->output_rc_stat = default_output_rc_stats; +#endif + svc_ctx->speed = default_speed; + svc_ctx->threads = default_threads; + + // start with default encoder configuration + res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0); + if (res) { + die("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + } + // update enc_cfg with app default values + enc_cfg->g_w = default_width; + enc_cfg->g_h = default_height; + enc_cfg->g_timebase.num = default_timebase_num; + enc_cfg->g_timebase.den = default_timebase_den; + enc_cfg->rc_target_bitrate = default_bitrate; + enc_cfg->kf_min_dist = default_kf_dist; + enc_cfg->kf_max_dist = default_kf_dist; + enc_cfg->rc_end_usage = VPX_CQ; + + // initialize AppInput with default values + app_input->frames_to_code = default_frames_to_code; + app_input->frames_to_skip = default_frames_to_skip; + + // process command line options + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + exit(EXIT_FAILURE); + } + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &frames_arg, argi)) { + app_input->frames_to_code = arg_parse_uint(&arg); + } else if (arg_match(&arg, &outputfile, argi)) { + app_input->output_filename = arg.val; + } else if (arg_match(&arg, &width_arg, argi)) { + enc_cfg->g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &height_arg, argi)) { + enc_cfg->g_h = arg_parse_uint(&arg); + } else if (arg_match(&arg, &timebase_arg, argi)) { + enc_cfg->g_timebase = arg_parse_rational(&arg); + } else if (arg_match(&arg, &bitrate_arg, argi)) { + enc_cfg->rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &skip_frames_arg, argi)) { + app_input->frames_to_skip = arg_parse_uint(&arg); + } else if (arg_match(&arg, &spatial_layers_arg, argi)) { + svc_ctx->spatial_layers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &temporal_layers_arg, argi)) { + svc_ctx->temporal_layers = arg_parse_uint(&arg); +#if OUTPUT_RC_STATS + } else if (arg_match(&arg, &output_rc_stats_arg, argi)) { + svc_ctx->output_rc_stat = arg_parse_uint(&arg); +#endif + } else if (arg_match(&arg, &speed_arg, argi)) { + svc_ctx->speed = arg_parse_uint(&arg); + if (svc_ctx->speed > 9) { + warn("Mapping speed %d to speed 9.\n", svc_ctx->speed); + } + } else if (arg_match(&arg, &aqmode_arg, argi)) { + svc_ctx->aqmode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &threads_arg, argi)) { + svc_ctx->threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) { + svc_ctx->temporal_layering_mode = enc_cfg->temporal_layering_mode = + arg_parse_int(&arg); + if (svc_ctx->temporal_layering_mode) { + enc_cfg->g_error_resilient = 1; + } + } else if (arg_match(&arg, &kf_dist_arg, argi)) { + enc_cfg->kf_min_dist = arg_parse_uint(&arg); + enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; + } else if (arg_match(&arg, &scale_factors_arg, argi)) { + strncat(string_options, " scale-factors=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); + } else if (arg_match(&arg, &bitrates_arg, argi)) { + strncat(string_options, " bitrates=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); + } else if (arg_match(&arg, &min_q_arg, argi)) { + strncat(string_options, " min-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); + } else if (arg_match(&arg, &max_q_arg, argi)) { + strncat(string_options, " max-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); + } else if (arg_match(&arg, &min_bitrate_arg, argi)) { + min_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_bitrate_arg, argi)) { + max_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lag_in_frame_arg, argi)) { + enc_cfg->g_lag_in_frames = arg_parse_uint(&arg); + } else if (arg_match(&arg, &rc_end_usage_arg, argi)) { + enc_cfg->rc_end_usage = arg_parse_uint(&arg); +#if CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &bitdepth_arg, argi)) { + enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg); + switch (enc_cfg->g_bit_depth) { + case VPX_BITS_8: + enc_cfg->g_input_bit_depth = 8; + enc_cfg->g_profile = 0; + break; + case VPX_BITS_10: + enc_cfg->g_input_bit_depth = 10; + enc_cfg->g_profile = 2; + break; + case VPX_BITS_12: + enc_cfg->g_input_bit_depth = 12; + enc_cfg->g_profile = 2; + break; + default: + die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { + enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &tune_content_arg, argi)) { + app_input->tune_content = arg_parse_uint(&arg); + } else if (arg_match(&arg, &inter_layer_pred_arg, argi)) { + app_input->inter_layer_pred = arg_parse_uint(&arg); + } else { + ++argj; + } + } + + // There will be a space in front of the string options + if (strlen(string_options) > 0) + vpx_svc_set_options(svc_ctx, string_options + 1); + + enc_cfg->g_pass = VPX_RC_ONE_PASS; + + if (enc_cfg->rc_target_bitrate > 0) { + if (min_bitrate > 0) { + enc_cfg->rc_2pass_vbr_minsection_pct = + min_bitrate * 100 / enc_cfg->rc_target_bitrate; + } + if (max_bitrate > 0) { + enc_cfg->rc_2pass_vbr_maxsection_pct = + max_bitrate * 100 / enc_cfg->rc_target_bitrate; + } + } + + // Check for unrecognized options + for (argi = argv; *argi; ++argi) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + if (argv[0] == NULL) { + usage_exit(); + } + app_input->input_ctx.filename = argv[0]; + free(argv); + + open_input_file(&app_input->input_ctx); + if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { + enc_cfg->g_w = app_input->input_ctx.width; + enc_cfg->g_h = app_input->input_ctx.height; + enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator; + enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator; + } + + if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || + enc_cfg->g_h % 2) + die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); + + printf( + "Codec %s\nframes: %d, skip: %d\n" + "layers: %d\n" + "width %d, height: %d,\n" + "num: %d, den: %d, bitrate: %d,\n" + "gop size: %d\n", + vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code, + app_input->frames_to_skip, svc_ctx->spatial_layers, enc_cfg->g_w, + enc_cfg->g_h, enc_cfg->g_timebase.num, enc_cfg->g_timebase.den, + enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist); +} + +#if OUTPUT_RC_STATS +// For rate control encoding stats. +struct RateControlStats { + // Number of input frames per layer. + int layer_input_frames[VPX_MAX_LAYERS]; + // Total (cumulative) number of encoded frames per layer. + int layer_tot_enc_frames[VPX_MAX_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[VPX_MAX_LAYERS]; + // Framerate per layer (cumulative). + double layer_framerate[VPX_MAX_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[VPX_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[VPX_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[VPX_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative). + double layer_encoding_bitrate[VPX_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-time encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; +}; + +// Note: these rate control stats assume only 1 key frame in the +// sequence (i.e., first frame only). +static void set_rate_control_stats(struct RateControlStats *rc, + vpx_codec_enc_cfg_t *cfg) { + unsigned int sl, tl; + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. + const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + + for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + for (tl = 0; tl < cfg->ts_number_layers; ++tl) { + const int layer = sl * cfg->ts_number_layers + tl; + if (cfg->ts_number_layers == 1) + rc->layer_framerate[layer] = framerate; + else + rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; + if (tl > 0) { + rc->layer_pfb[layer] = + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / + (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); + } else { + rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / + rc->layer_framerate[layer]; + } + rc->layer_input_frames[layer] = 0; + rc->layer_enc_frames[layer] = 0; + rc->layer_tot_enc_frames[layer] = 0; + rc->layer_encoding_bitrate[layer] = 0.0; + rc->layer_avg_frame_size[layer] = 0.0; + rc->layer_avg_rate_mismatch[layer] = 0.0; + } + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; +} + +static void printout_rate_control_summary(struct RateControlStats *rc, + vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + unsigned int sl, tl; + double perc_fluctuation = 0.0; + int tot_num_frames = 0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for sl%d tl%d layer(s):\n\n", + cfg->ss_number_layers, cfg->ts_number_layers); + for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + tot_num_frames = 0; + for (tl = 0; tl < cfg->ts_number_layers; ++tl) { + const int layer = sl * cfg->ts_number_layers + tl; + const int num_dropped = + (tl > 0) + ? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) + : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - + 1); + tot_num_frames += rc->layer_input_frames[layer]; + rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] * + rc->layer_encoding_bitrate[layer] / + tot_num_frames; + rc->layer_avg_frame_size[layer] = + rc->layer_avg_frame_size[layer] / rc->layer_enc_frames[layer]; + rc->layer_avg_rate_mismatch[layer] = 100.0 * + rc->layer_avg_rate_mismatch[layer] / + rc->layer_enc_frames[layer]; + printf("For layer#: sl%d tl%d \n", sl, tl); + printf("Bitrate (target vs actual): %d %f.0 kbps\n", + cfg->layer_target_bitrate[layer], + rc->layer_encoding_bitrate[layer]); + printf("Average frame size (target vs actual): %f %f bits\n", + rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]); + printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[layer]); + printf( + "Number of input frames, encoded (non-key) frames, " + "and percent dropped frames: %d %d %f.0 \n", + rc->layer_input_frames[layer], rc->layer_enc_frames[layer], + 100.0 * num_dropped / rc->layer_input_frames[layer]); + printf("\n"); + } + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames: \n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f \n", + rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt, + tot_num_frames); +} + +static vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, uint64_t sizes[8], + int *count) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + uint8_t marker; + + marker = *(data + data_sz - 1); + *count = 0; + + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = *(data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; +} +#endif + +// Example pattern for spatial layers and 2 temporal layers used in the +// bypass/flexible mode. The pattern corresponds to the pattern +// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in +// non-flexible mode. +static void set_frame_flags_bypass_mode_ex0( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. + if (!tl) { + if (!sl) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + // Non-zero spatial. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } + } +} + +// Example pattern for 2 spatial layers and 2 temporal layers used in the +// bypass/flexible mode, except only 1 spatial layer when temporal_layer_id = 1. +static void set_frame_flags_bypass_mode_ex1( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + if (tl == 0) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[1] = 0; + ref_frame_config->gld_fb_idx[1] = 1; + } else { + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 0; + } + ref_frame_config->alt_fb_idx[1] = 0; + + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 0; + ref_frame_config->alt_fb_idx[0] = 0; + } + if (tl == 1) { + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 1; + ref_frame_config->alt_fb_idx[0] = 2; + + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 2; + ref_frame_config->alt_fb_idx[1] = 3; + } + // Set the reference and update flags. + if (tl == 0) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[0] = 1; + ref_frame_config->reference_golden[0] = 0; + ref_frame_config->reference_alt_ref[0] = 0; + ref_frame_config->update_buffer_slot[0] |= + 1 << ref_frame_config->lst_fb_idx[0]; + + if (is_key_frame) { + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->gld_fb_idx[1]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 1; + ref_frame_config->reference_alt_ref[1] = 1; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->lst_fb_idx[1]; + } + } + if (tl == 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + } +} + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE +static void test_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + vpx_image_t enc_img, dec_img; + struct vp9_ref_frame ref_enc, ref_dec; + if (*mismatch_seen) return; + /* Get the internal reference frame */ + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}\n", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} +#endif + +#if OUTPUT_RC_STATS +static void svc_output_rc_stats( + vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *enc_cfg, + vpx_svc_layer_id_t *layer_id, const vpx_codec_cx_pkt_t *cx_pkt, + struct RateControlStats *rc, VpxVideoWriter **outfile, + const uint32_t frame_cnt, const double framerate) { + int num_layers_encoded = 0; + unsigned int sl, tl; + uint64_t sizes[8]; + uint64_t sizes_parsed[8]; + int count = 0; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + vp9_zero(sizes); + vp9_zero(sizes_parsed); + vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id); + parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, + sizes_parsed, &count); + if (enc_cfg->ss_number_layers == 1) { + sizes[0] = cx_pkt->data.frame.sz; + } else { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sizes[sl] = 0; + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + unsigned int sl2; + uint64_t tot_size = 0; +#if SIMULCAST_MODE + for (sl2 = 0; sl2 < sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + vpx_video_writer_write_frame(outfile[sl], + (uint8_t *)(cx_pkt->data.frame.buf) + tot_size, + (size_t)(sizes[sl]), cx_pkt->data.frame.pts); +#else + for (sl2 = 0; sl2 <= sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + if (tot_size > 0) + vpx_video_writer_write_frame(outfile[sl], cx_pkt->data.frame.buf, + (size_t)(tot_size), cx_pkt->data.frame.pts); +#endif // SIMULCAST_MODE + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + for (tl = layer_id->temporal_layer_id; tl < enc_cfg->ts_number_layers; + ++tl) { + const int layer = sl * enc_cfg->ts_number_layers + tl; + ++rc->layer_tot_enc_frames[layer]; + rc->layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; + // Keep count of rate control stats per layer, for non-key + // frames. + if (tl == (unsigned int)layer_id->temporal_layer_id && + !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc->layer_avg_frame_size[layer] += 8.0 * sizes[sl]; + rc->layer_avg_rate_mismatch[layer] += + fabs(8.0 * sizes[sl] - rc->layer_pfb[layer]) / + rc->layer_pfb[layer]; + ++rc->layer_enc_frames[layer]; + } + } + } + } + + // Update for short-time encoding bitrate states, for moving + // window of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (frame_cnt > (unsigned int)rc->window_size) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) + sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; + } + if (frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate / rc->window_size) * (sum_bitrate / rc->window_size); + } + } + + // Second shifted window. + if (frame_cnt > (unsigned int)(rc->window_size + rc->window_size / 2)) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; + } + + if (frame_cnt > (unsigned int)(2 * rc->window_size) && + frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate2 / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate2 / rc->window_size) * (sum_bitrate2 / rc->window_size); + } + } +} +#endif + +int main(int argc, const char **argv) { + AppInput app_input; + VpxVideoWriter *writer = NULL; + VpxVideoInfo info; + vpx_codec_ctx_t encoder; + vpx_codec_enc_cfg_t enc_cfg; + SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; + uint32_t i; + uint32_t frame_cnt = 0; + vpx_image_t raw; + vpx_codec_err_t res; + int pts = 0; /* PTS starts at 0 */ + int frame_duration = 1; /* 1 timebase tick per frame */ + int end_of_stream = 0; +#if OUTPUT_FRAME_STATS + int frames_received = 0; +#endif +#if OUTPUT_RC_STATS + VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; + struct RateControlStats rc; + vpx_svc_layer_id_t layer_id; + vpx_svc_ref_frame_config_t ref_frame_config; + unsigned int sl; + double framerate = 30.0; +#endif + struct vpx_usec_timer timer; + int64_t cx_time = 0; +#if CONFIG_INTERNAL_STATS + FILE *f = fopen("opsnr.stt", "a"); +#endif +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + int mismatch_seen = 0; + vpx_codec_ctx_t decoder; +#endif + memset(&svc_ctx, 0, sizeof(svc_ctx)); + memset(&app_input, 0, sizeof(AppInput)); + memset(&info, 0, sizeof(VpxVideoInfo)); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&rc, 0, sizeof(struct RateControlStats)); + exec_name = argv[0]; + + /* Setup default input stream settings */ + app_input.input_ctx.framerate.numerator = 30; + app_input.input_ctx.framerate.denominator = 1; + app_input.input_ctx.only_i420 = 1; + app_input.input_ctx.bit_depth = 0; + + parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg); + + // Y4M reader handles its own allocation. + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { +// Allocate image buffer +#if CONFIG_VP9_HIGHBITDEPTH + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, + enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } +#else + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + // Initialize codec + if (vpx_svc_init(&svc_ctx, &encoder, vpx_codec_vp9_cx(), &enc_cfg) != + VPX_CODEC_OK) + die("Failed to initialize encoder\n"); +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_dec_init( + &decoder, get_vpx_decoder_by_name("vp9")->codec_interface(), NULL, 0)) + die("Failed to initialize decoder\n"); +#endif + +#if OUTPUT_RC_STATS + rc.window_count = 1; + rc.window_size = 15; // Silence a static analysis warning. + rc.avg_st_encoding_bitrate = 0.0; + rc.variance_st_encoding_bitrate = 0.0; + if (svc_ctx.output_rc_stat) { + set_rate_control_stats(&rc, &enc_cfg); + framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num; + } +#endif + + info.codec_fourcc = VP9_FOURCC; + info.frame_width = enc_cfg.g_w; + info.frame_height = enc_cfg.g_h; + info.time_base.numerator = enc_cfg.g_timebase.num; + info.time_base.denominator = enc_cfg.g_timebase.den; + + writer = + vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); + if (!writer) + die("Failed to open %s for writing\n", app_input.output_filename); + +#if OUTPUT_RC_STATS + // Write out spatial layer stream. + // TODO(marpan/jianj): allow for writing each spatial and temporal stream. + if (svc_ctx.output_rc_stat) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + char file_name[PATH_MAX]; + + snprintf(file_name, sizeof(file_name), "%s_s%d.ivf", + app_input.output_filename, sl); + outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[sl]) die("Failed to open %s for writing", file_name); + } + } +#endif + + // skip initial frames + for (i = 0; i < app_input.frames_to_skip; ++i) + read_frame(&app_input.input_ctx, &raw); + + if (svc_ctx.speed != -1) + vpx_codec_control(&encoder, VP8E_SET_CPUUSED, svc_ctx.speed); + if (svc_ctx.threads) { + vpx_codec_control(&encoder, VP9E_SET_TILE_COLUMNS, + get_msb(svc_ctx.threads)); + if (svc_ctx.threads > 1) + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 0); + } + if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) + vpx_codec_control(&encoder, VP9E_SET_AQ_MODE, 3); + if (svc_ctx.speed >= 5) + vpx_codec_control(&encoder, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&encoder, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + + vpx_codec_control(&encoder, VP9E_SET_SVC_INTER_LAYER_PRED, + app_input.inter_layer_pred); + + vpx_codec_control(&encoder, VP9E_SET_NOISE_SENSITIVITY, 0); + + vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); + + vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); + vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 0); + + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&encoder, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + + // Encode frames + while (!end_of_stream) { + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *cx_pkt; + // Example patterns for bypass/flexible mode: + // example_pattern = 0: 2 temporal layers, and spatial_layers = 1,2,3. Exact + // to fixed SVC patterns. example_pattern = 1: 2 spatial and 2 temporal + // layers, with SL0 only has TL0, and SL1 has both TL0 and TL1. This example + // uses the extended API. + int example_pattern = 0; + if (frame_cnt >= app_input.frames_to_code || + !read_frame(&app_input.input_ctx, &raw)) { + // We need one extra vpx_svc_encode call at end of stream to flush + // encoder and get remaining data + end_of_stream = 1; + } + + // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates) + // and the buffer indices for each spatial layer of the current + // (super)frame to be encoded. The spatial and temporal layer_id for the + // current frame also needs to be set. + // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS" + // mode to "VP9E_LAYERING_MODE_BYPASS". + if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + layer_id.spatial_layer_id = 0; + // Example for 2 temporal layers. + if (frame_cnt % 2 == 0) { + layer_id.temporal_layer_id = 0; + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 0; + } else { + layer_id.temporal_layer_id = 1; + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 1; + } + if (example_pattern == 1) { + // example_pattern 1 is hard-coded for 2 spatial and 2 temporal layers. + assert(svc_ctx.spatial_layers == 2); + assert(svc_ctx.temporal_layers == 2); + if (frame_cnt % 2 == 0) { + // Spatial layer 0 and 1 are encoded. + layer_id.temporal_layer_id_per_spatial[0] = 0; + layer_id.temporal_layer_id_per_spatial[1] = 0; + layer_id.spatial_layer_id = 0; + } else { + // Only spatial layer 1 is encoded here. + layer_id.temporal_layer_id_per_spatial[1] = 1; + layer_id.spatial_layer_id = 1; + } + } + vpx_codec_control(&encoder, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_model() for case of periodic key frames. + if (example_pattern == 0) { + set_frame_flags_bypass_mode_ex0(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } else if (example_pattern == 1) { + set_frame_flags_bypass_mode_ex1(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } + ref_frame_config.duration[0] = frame_duration * 1; + ref_frame_config.duration[1] = frame_duration * 1; + + vpx_codec_control(&encoder, VP9E_SET_SVC_REF_FRAME_CONFIG, + &ref_frame_config); + // Keep track of input frames, to account for frame drops in rate control + // stats/metrics. + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + + layer_id.temporal_layer_id]; + } + } else { + // For the fixed pattern SVC, temporal layer is given by superframe count. + unsigned int tl = 0; + if (enc_cfg.ts_number_layers == 2) + tl = (frame_cnt % 2 != 0); + else if (enc_cfg.ts_number_layers == 3) { + if (frame_cnt % 2 != 0) tl = 2; + if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1; + } + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl]; + } + + vpx_usec_timer_start(&timer); + res = vpx_svc_encode( + &svc_ctx, &encoder, (end_of_stream ? NULL : &raw), pts, frame_duration, + svc_ctx.speed >= 5 ? VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY); + vpx_usec_timer_mark(&timer); + cx_time += vpx_usec_timer_elapsed(&timer); + + fflush(stdout); + if (res != VPX_CODEC_OK) { + die_codec(&encoder, "Failed to encode frame"); + } + + while ((cx_pkt = vpx_codec_get_cx_data(&encoder, &iter)) != NULL) { + switch (cx_pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: { + SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; + if (cx_pkt->data.frame.sz > 0) { + vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, + cx_pkt->data.frame.sz, + cx_pkt->data.frame.pts); +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + svc_output_rc_stats(&encoder, &enc_cfg, &layer_id, cx_pkt, &rc, + outfile, frame_cnt, framerate); + } +#endif + } +#if OUTPUT_FRAME_STATS + printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, + !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), + (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); + ++frames_received; +#endif + if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) + si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, + (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) + die_codec(&decoder, "Failed to decode frame."); +#endif + break; + } + case VPX_CODEC_STATS_PKT: { + stats_write(&app_input.rc_stats, cx_pkt->data.twopass_stats.buf, + cx_pkt->data.twopass_stats.sz); + break; + } + default: { + break; + } + } + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); + // Don't look for mismatch on top spatial and top temporal layers as they + // are non reference frames. + if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) { + test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); + } +#endif + } + + if (!end_of_stream) { + ++frame_cnt; + pts += frame_duration; + } + } + + printf("Processed %d frames\n", frame_cnt); + + close_input_file(&app_input.input_ctx); + +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + printout_rate_control_summary(&rc, &enc_cfg, frame_cnt); + printf("\n"); + } +#endif + if (vpx_codec_destroy(&encoder)) + die_codec(&encoder, "Failed to destroy codec"); + if (writer) { + vpx_video_writer_close(writer); + } +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + vpx_video_writer_close(outfile[sl]); + } + } +#endif +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); +#endif + printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", + frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), + 1000000 * (double)frame_cnt / (double)cx_time); + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } + // display average size, psnr + vpx_svc_dump_statistics(&svc_ctx); + vpx_svc_release(&svc_ctx); + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c new file mode 100644 index 0000000000..1a0823153b --- /dev/null +++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// VP9 Set Reference Frame +// ============================ +// +// This is an example demonstrating how to overwrite the VP9 encoder's +// internal reference frame. In the sample we set the last frame to the +// current frame. This technique could be used to bounce between two cameras. +// +// The decoder would also have to set the reference frame to the same value +// on the same frame, or the video will become corrupt. The 'test_decode' +// variable is set to 1 in this example that tests if the encoder and decoder +// results are matching. +// +// Usage +// ----- +// This example encodes a raw video. And the last argument passed in specifies +// the frame number to update the reference frame on. For example, run +// examples/vp9cx_set_ref 352 288 in.yuv out.ivf 4 30 +// The parameter is parsed as follows: +// +// +// Extra Variables +// --------------- +// This example maintains the frame number passed on the command line +// in the `update_frame_num` variable. +// +// +// Configuration +// ------------- +// +// The reference frame is updated on the frame specified on the command +// line. +// +// Observing The Effects +// --------------------- +// The encoder and decoder results should be matching when the same reference +// frame setting operation is done in both encoder and decoder. Otherwise, +// the encoder/decoder mismatch would be seen. + +#include +#include +#include + +#include "vpx/vp8cx.h" +#include "vpx/vpx_decoder.h" +#include "vpx/vpx_encoder.h" +#include "vp9/common/vp9_common.h" + +#include "./tools_common.h" +#include "./video_writer.h" + +static const char *exec_name; + +void usage_exit() { + fprintf(stderr, + "Usage: %s " + " \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + unsigned int frame_out, int *mismatch_seen) { + vpx_image_t enc_img, dec_img; + struct vp9_ref_frame ref_enc, ref_dec; + + if (*mismatch_seen) return; + + ref_enc.idx = 0; + ref_dec.idx = 0; + if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc)) + die_codec(encoder, "Failed to get encoder reference frame"); + enc_img = ref_enc.img; + if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec)) + die_codec(decoder, "Failed to get decoder reference frame"); + dec_img = ref_dec.img; + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + + *mismatch_seen = 1; + + find_mismatch(&enc_img, &dec_img, y, u, v); + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} + +static int encode_frame(vpx_codec_ctx_t *ecodec, vpx_image_t *img, + unsigned int frame_in, VpxVideoWriter *writer, + int test_decode, vpx_codec_ctx_t *dcodec, + unsigned int *frame_out, int *mismatch_seen) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + int got_data; + const vpx_codec_err_t res = + vpx_codec_encode(ecodec, img, frame_in, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); + + got_data = 0; + + while ((pkt = vpx_codec_get_cx_data(ecodec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; + + if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { + *frame_out += 1; + } + + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(ecodec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + got_data = 1; + + // Decode 1 frame. + if (test_decode) { + if (vpx_codec_decode(dcodec, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL, 0)) + die_codec(dcodec, "Failed to decode frame."); + } + } + } + + // Mismatch checking + if (got_data && test_decode) { + testing_decode(ecodec, dcodec, *frame_out, mismatch_seen); + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + // Encoder + vpx_codec_ctx_t ecodec; + vpx_codec_enc_cfg_t cfg; + unsigned int frame_in = 0; + vpx_image_t raw; + vpx_codec_err_t res; + VpxVideoInfo info; + VpxVideoWriter *writer = NULL; + const VpxInterface *encoder = NULL; + + // Test encoder/decoder mismatch. + int test_decode = 1; + // Decoder + vpx_codec_ctx_t dcodec; + unsigned int frame_out = 0; + + // The frame number to set reference frame on + unsigned int update_frame_num = 0; + int mismatch_seen = 0; + + const int fps = 30; + const int bitrate = 500; + + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; + unsigned int limit = 0; + + vp9_zero(ecodec); + vp9_zero(cfg); + vp9_zero(info); + + exec_name = argv[0]; + + if (argc < 6) die("Invalid number of arguments"); + + width_arg = argv[1]; + height_arg = argv[2]; + infile_arg = argv[3]; + outfile_arg = argv[4]; + update_frame_num_arg = argv[5]; + + encoder = get_vpx_encoder_by_name("vp9"); + if (!encoder) die("Unsupported codec."); + + update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0); + // In VP9, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are + // allocated while calling vpx_codec_encode(), thus, setting reference for + // 1st frame isn't supported. + if (update_frame_num <= 1) { + die("Couldn't parse frame number '%s'\n", update_frame_num_arg); + } + + if (argc > 6) { + limit = (unsigned int)strtoul(argv[6], NULL, 0); + if (update_frame_num > limit) + die("Update frame number couldn't larger than limit\n"); + } + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&ecodec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_lag_in_frames = 3; + + writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0)) + die("Failed to initialize encoder"); + + // Disable alt_ref. + if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0)) + die_codec(&ecodec, "Failed to set enable auto alt ref"); + + if (test_decode) { + const VpxInterface *decoder = get_vpx_decoder_by_name("vp9"); + if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0)) + die_codec(&dcodec, "Failed to initialize decoder."); + } + + // Encode frames. + while (vpx_img_read(&raw, infile)) { + if (limit && frame_in >= limit) break; + if (update_frame_num > 1 && frame_out + 1 == update_frame_num) { + vpx_ref_frame_t ref; + ref.frame_type = VP8_LAST_FRAME; + ref.img = raw; + // Set reference frame in encoder. + if (vpx_codec_control(&ecodec, VP8_SET_REFERENCE, &ref)) + die_codec(&ecodec, "Failed to set reference frame"); + printf(" "); + + // If set_reference in decoder is commented out, the enc/dec mismatch + // would be seen. + if (test_decode) { + if (vpx_codec_control(&dcodec, VP8_SET_REFERENCE, &ref)) + die_codec(&dcodec, "Failed to set reference frame"); + } + } + + encode_frame(&ecodec, &raw, frame_in, writer, test_decode, &dcodec, + &frame_out, &mismatch_seen); + frame_in++; + if (mismatch_seen) break; + } + + // Flush encoder. + if (!mismatch_seen) + while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec, + &frame_out, &mismatch_seen)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_out); + + if (test_decode) { + if (!mismatch_seen) + printf("Encoder/decoder results are matching.\n"); + else + printf("Encoder/decoder results are NOT matching.\n"); + } + + if (test_decode) + if (vpx_codec_destroy(&dcodec)) + die_codec(&dcodec, "Failed to destroy decoder"); + + vpx_img_free(&raw); + if (vpx_codec_destroy(&ecodec)) + die_codec(&ecodec, "Failed to destroy encoder."); + + vpx_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc new file mode 100644 index 0000000000..5eba9d74da --- /dev/null +++ b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Fuzzer for libvpx decoders + * ========================== + * Requirements + * -------------- + * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker + * option. + + * Steps to build + * -------------- + * Clone libvpx repository + $git clone https://chromium.googlesource.com/webm/libvpx + + * Create a directory in parallel to libvpx and change directory + $mkdir vpx_dec_fuzzer + $cd vpx_dec_fuzzer/ + + * Enable sanitizers (Supported: address integer memory thread undefined) + $source ../libvpx/tools/set_analyzer_env.sh address + + * Configure libvpx. + * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid + * Out of memory errors when running generated fuzzer binary + $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \ + --extra-cflags="-fsanitize=fuzzer-no-link \ + -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \ + --disable-webm-io --enable-debug --disable-vp8-encoder \ + --disable-vp9-encoder --disable-examples + + * Build libvpx + $make -j32 + + * Build vp9 fuzzer + $ $CXX $CXXFLAGS -std=gnu++11 -DDECODER=vp9 \ + -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \ + ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \ + ./libvpx.a -Wl,--end-group + + * DECODER should be defined as vp9 or vp8 to enable vp9/vp8 + * + * create a corpus directory and copy some ivf files there. + * Based on which codec (vp8/vp9) is being tested, it is recommended to + * have corresponding ivf files in corpus directory + * Empty corpus directoy also is acceptable, though not recommended + $mkdir CORPUS && cp some-files CORPUS + + * Run fuzzing: + $./vpx_dec_fuzzer_vp9 CORPUS + + * References: + * http://llvm.org/docs/LibFuzzer.html + * https://github.com/google/oss-fuzz + */ + +#include +#include +#include +#include +#include +#include + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_ports/mem_ops.h" + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name) +#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx() + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= IVF_FILE_HDR_SZ) { + return 0; + } + + vpx_codec_ctx_t codec; + // Set thread count in the range [1, 64]. + const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + vpx_codec_dec_cfg_t cfg = { threads, 0, 0 }; + if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, 0)) { + return 0; + } + + if (threads > 1) { + const int enable = (data[IVF_FILE_HDR_SZ] & 0xa0) != 0; + const vpx_codec_err_t err = + vpx_codec_control(&codec, VP9D_SET_LOOP_FILTER_OPT, enable); + static_cast(err); + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const vpx_codec_err_t err = + vpx_codec_decode(&codec, data, frame_size, nullptr, 0); + static_cast(err); + vpx_codec_iter_t iter = nullptr; + vpx_image_t *img = nullptr; + while ((img = vpx_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + vpx_codec_destroy(&codec); + return 0; +} diff --git a/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c new file mode 100644 index 0000000000..a80027822a --- /dev/null +++ b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c @@ -0,0 +1,1069 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This is an example demonstrating how to implement a multi-layer VPx +// encoding scheme based on temporal scalability for video applications +// that benefit from a scalable bitstream. + +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "./y4minput.h" +#include "../vpx_ports/vpx_timer.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vpx_ports/bitops.h" + +#include "../tools_common.h" +#include "../video_writer.h" + +#define ROI_MAP 0 + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) + +static const char *exec_name; + +void usage_exit(void) { exit(EXIT_FAILURE); } + +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers +}; + +static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; + +// For rate control encoding stats. +struct RateControlMetrics { + // Number of input frames per layer. + int layer_input_frames[VPX_TS_MAX_LAYERS]; + // Total (cumulative) number of encoded frames per layer. + int layer_tot_enc_frames[VPX_TS_MAX_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[VPX_TS_MAX_LAYERS]; + // Framerate per layer layer (cumulative). + double layer_framerate[VPX_TS_MAX_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[VPX_TS_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[VPX_TS_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative). + double layer_encoding_bitrate[VPX_TS_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-timee encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; + int layer_target_bitrate[VPX_MAX_LAYERS]; +}; + +// Note: these rate control metrics assume only 1 key frame in the +// sequence (i.e., first frame only). So for temporal pattern# 7 +// (which has key frame for every frame on base layer), the metrics +// computation will be off/wrong. +// TODO(marpan): Update these metrics to account for multiple key frames +// in the stream. +static void set_rate_control_metrics(struct RateControlMetrics *rc, + vpx_codec_enc_cfg_t *cfg) { + int i = 0; + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. + const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + const int ts_number_layers = cfg->ts_number_layers; + rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0]; + rc->layer_pfb[0] = + 1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0]; + for (i = 0; i < ts_number_layers; ++i) { + if (i > 0) { + rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + } + rc->layer_input_frames[i] = 0; + rc->layer_enc_frames[i] = 0; + rc->layer_tot_enc_frames[i] = 0; + rc->layer_encoding_bitrate[i] = 0.0; + rc->layer_avg_frame_size[i] = 0.0; + rc->layer_avg_rate_mismatch[i] = 0.0; + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; + // Target bandwidth for the whole stream. + // Set to layer_target_bitrate for highest layer (total bitrate). + cfg->rc_target_bitrate = rc->layer_target_bitrate[ts_number_layers - 1]; +} + +static void printout_rate_control_summary(struct RateControlMetrics *rc, + vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + unsigned int i = 0; + int tot_num_frames = 0; + double perc_fluctuation = 0.0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for %d layer(s):\n\n", + cfg->ts_number_layers); + for (i = 0; i < cfg->ts_number_layers; ++i) { + const int num_dropped = + (i > 0) ? (rc->layer_input_frames[i] - rc->layer_enc_frames[i]) + : (rc->layer_input_frames[i] - rc->layer_enc_frames[i] - 1); + tot_num_frames += rc->layer_input_frames[i]; + rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[i] * + rc->layer_encoding_bitrate[i] / + tot_num_frames; + rc->layer_avg_frame_size[i] = + rc->layer_avg_frame_size[i] / rc->layer_enc_frames[i]; + rc->layer_avg_rate_mismatch[i] = + 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[i]; + printf("For layer#: %d \n", i); + printf("Bitrate (target vs actual): %d %f \n", rc->layer_target_bitrate[i], + rc->layer_encoding_bitrate[i]); + printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i], + rc->layer_avg_frame_size[i]); + printf("Average rate_mismatch: %f \n", rc->layer_avg_rate_mismatch[i]); + printf( + "Number of input frames, encoded (non-key) frames, " + "and perc dropped frames: %d %d %f \n", + rc->layer_input_frames[i], rc->layer_enc_frames[i], + 100.0 * num_dropped / rc->layer_input_frames[i]); + printf("\n"); + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames: \n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f \n", + rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + if ((frame_cnt - 1) != tot_num_frames) + die("Error: Number of input frames not equal to output! \n"); +} + +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { + unsigned int i, j; + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); + roi->delta_q[1] = -63; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); + + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // previous frame. + zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Froce using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using alfref but not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} + +static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi, + int *skip_map, int *prev_mask_map, int frame_num) { + const int block_size = 8; + unsigned int i, j; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + zero(roi->skip); + zero(roi->delta_q); + zero(roi->delta_lf); + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + // Use segment 3 for skip. + roi->skip[3] = 1; + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + const int idx = i * roi->cols + j; + // Use segment 3 for skip. + // prev_mask_map keeps track of blocks that have been stably on segment 3 + // for the past 10 frames. Only skip when the block is on segment 3 in + // both current map and prev_mask_map. + if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3; + // Reset it every 10 frames so it doesn't propagate for too many frames. + if (frame_num % 10 == 0) + prev_mask_map[idx] = skip_map[idx]; + else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0) + prev_mask_map[idx] = 0; + } + } +} +#endif + +// Temporal scaling parameters: +// NOTE: The 3 prediction frames cannot be used interchangeably due to +// differences in the way they are handled throughout the code. The +// frames should be allocated to layers in the order LAST, GF, ARF. +// Other combinations work, but may produce slightly inferior results. +static void set_temporal_layer_pattern(int layering_mode, + vpx_codec_enc_cfg_t *cfg, + int *layer_flags, + int *flag_periodicity) { + switch (layering_mode) { + case 0: { + // 1-layer. + int ids[1] = { 0 }; + cfg->ts_periodicity = 1; + *flag_periodicity = 1; + cfg->ts_number_layers = 1; + cfg->ts_rate_decimator[0] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // Update L only. + layer_flags[0] = + VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + break; + } + case 1: { + // 2-layers, 2-frame period. + int ids[2] = { 0, 1 }; + cfg->ts_periodicity = 2; + *flag_periodicity = 2; + cfg->ts_number_layers = 2; + cfg->ts_rate_decimator[0] = 2; + cfg->ts_rate_decimator[1] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); +#if 1 + // 0=L, 1=GF, Intra-layer prediction enabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF; + layer_flags[1] = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF; +#else + // 0=L, 1=GF, Intra-layer prediction disabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF; + layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_LAST; +#endif + break; + } + case 2: { + // 2-layers, 3-frame period. + int ids[3] = { 0, 1, 1 }; + cfg->ts_periodicity = 3; + *flag_periodicity = 3; + cfg->ts_number_layers = 2; + cfg->ts_rate_decimator[0] = 3; + cfg->ts_rate_decimator[1] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, Intra-layer prediction enabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[1] = layer_flags[2] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST; + break; + } + case 3: { + // 3-layers, 6-frame period. + int ids[6] = { 0, 2, 2, 1, 2, 2 }; + cfg->ts_periodicity = 6; + *flag_periodicity = 6; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 6; + cfg->ts_rate_decimator[1] = 3; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[3] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + layer_flags[1] = layer_flags[2] = layer_flags[4] = layer_flags[5] = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + break; + } + case 4: { + // 3-layers, 4-frame period. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 4; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + layer_flags[1] = layer_flags[3] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + break; + } + case 5: { + // 3-layers, 4-frame period. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 4; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled in layer 1, disabled + // in layer 2. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[2] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + layer_flags[1] = layer_flags[3] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + break; + } + case 6: { + // 3-layers, 4-frame period. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 4; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[2] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + layer_flags[1] = layer_flags[3] = + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + break; + } + case 7: { + // NOTE: Probably of academic interest only. + // 5-layers, 16-frame period. + int ids[16] = { 0, 4, 3, 4, 2, 4, 3, 4, 1, 4, 3, 4, 2, 4, 3, 4 }; + cfg->ts_periodicity = 16; + *flag_periodicity = 16; + cfg->ts_number_layers = 5; + cfg->ts_rate_decimator[0] = 16; + cfg->ts_rate_decimator[1] = 8; + cfg->ts_rate_decimator[2] = 4; + cfg->ts_rate_decimator[3] = 2; + cfg->ts_rate_decimator[4] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + layer_flags[0] = VPX_EFLAG_FORCE_KF; + layer_flags[1] = layer_flags[3] = layer_flags[5] = layer_flags[7] = + layer_flags[9] = layer_flags[11] = layer_flags[13] = layer_flags[15] = + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[2] = layer_flags[6] = layer_flags[10] = layer_flags[14] = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF; + layer_flags[4] = layer_flags[12] = + VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_UPD_ARF; + layer_flags[8] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF; + break; + } + case 8: { + // 2-layers, with sync point at first frame of layer 1. + int ids[2] = { 0, 1 }; + cfg->ts_periodicity = 2; + *flag_periodicity = 8; + cfg->ts_number_layers = 2; + cfg->ts_rate_decimator[0] = 2; + cfg->ts_rate_decimator[1] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF. + // ARF is used as predictor for all frames, and is only updated on + // key frame. Sync point every 8 frames. + + // Layer 0: predict from L and ARF, update L and G. + layer_flags[0] = + VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF; + // Layer 1: sync point: predict from L and ARF, and update G. + layer_flags[1] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + // Layer 0, predict from L and ARF, update L. + layer_flags[2] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + // Layer 1: predict from L, G and ARF, and update G. + layer_flags[3] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + // Layer 0. + layer_flags[4] = layer_flags[2]; + // Layer 1. + layer_flags[5] = layer_flags[3]; + // Layer 0. + layer_flags[6] = layer_flags[4]; + // Layer 1. + layer_flags[7] = layer_flags[5]; + break; + } + case 9: { + // 3-layers: Sync points for layer 1 and 2 every 8 frames. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 8; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF. + layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + layer_flags[3] = layer_flags[5] = + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + layer_flags[4] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + layer_flags[6] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + layer_flags[7] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_ENTROPY; + break; + } + case 10: { + // 3-layers structure where ARF is used as predictor for all frames, + // and is only updated on key frame. + // Sync points for layer 1 and 2 every 8 frames. + + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 8; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF. + // Layer 0: predict from L and ARF; update L and G. + layer_flags[0] = + VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + // Layer 2: sync point: predict from L and ARF; update none. + layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + // Layer 1: sync point: predict from L and ARF; update G. + layer_flags[2] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: predict from L, G, ARF; update none. + layer_flags[3] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY; + // Layer 0: predict from L and ARF; update L. + layer_flags[4] = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + // Layer 2: predict from L, G, ARF; update none. + layer_flags[5] = layer_flags[3]; + // Layer 1: predict from L, G, ARF; update G. + layer_flags[6] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: predict from L, G, ARF; update none. + layer_flags[7] = layer_flags[3]; + break; + } + case 11: { + // 3-layers structure with one reference frame. + // This works same as temporal_layering_mode 3. + // This was added to compare with vp9_spatial_svc_encoder. + + // 3-layers, 4-frame period. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 4; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled. + layer_flags[0] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + layer_flags[3] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + break; + } + case 12: + default: { + // 3-layers structure as in case 10, but no sync/refresh points for + // layer 1 and 2. + int ids[4] = { 0, 2, 1, 2 }; + cfg->ts_periodicity = 4; + *flag_periodicity = 8; + cfg->ts_number_layers = 3; + cfg->ts_rate_decimator[0] = 4; + cfg->ts_rate_decimator[1] = 2; + cfg->ts_rate_decimator[2] = 1; + memcpy(cfg->ts_layer_id, ids, sizeof(ids)); + // 0=L, 1=GF, 2=ARF. + // Layer 0: predict from L and ARF; update L. + layer_flags[0] = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + layer_flags[4] = layer_flags[0]; + // Layer 1: predict from L, G, ARF; update G. + layer_flags[2] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + layer_flags[6] = layer_flags[2]; + // Layer 2: predict from L, G, ARF; update none. + layer_flags[1] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY; + layer_flags[3] = layer_flags[1]; + layer_flags[5] = layer_flags[1]; + layer_flags[7] = layer_flags[1]; + break; + } + } +} + +#if ROI_MAP +static void read_mask(FILE *mask_file, int *seg_map) { + int mask_rows, mask_cols, i, j; + int *map_start = seg_map; + fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows); + for (i = 0; i < mask_rows; i++) { + for (j = 0; j < mask_cols; j++) { + fscanf(mask_file, "%d ", &seg_map[j]); + // reverse the bit + seg_map[j] = 1 - seg_map[j]; + } + seg_map += mask_cols; + } + seg_map = map_start; +} +#endif + +int main(int argc, char **argv) { + VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + int frame_cnt = 0; + vpx_image_t raw; + vpx_codec_err_t res; + unsigned int width; + unsigned int height; + uint32_t error_resilient = 0; + int speed; + int frame_avail; + int got_data; + int flags = 0; + unsigned int i; + int pts = 0; // PTS starts at 0. + int frame_duration = 1; // 1 timebase tick per frame. + int layering_mode = 0; + int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; + int flag_periodicity = 1; +#if ROI_MAP + vpx_roi_map_t roi; +#endif + vpx_svc_layer_id_t layer_id; + const VpxInterface *encoder = NULL; + struct VpxInputContext input_ctx; + struct RateControlMetrics rc; + int64_t cx_time = 0; + const int min_args_base = 13; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_bit_depth_t bit_depth = VPX_BITS_8; + int input_bit_depth = 8; + const int min_args = min_args_base + 1; +#else + const int min_args = min_args_base; +#endif // CONFIG_VP9_HIGHBITDEPTH + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + double framerate = 30.0; +#if ROI_MAP + FILE *mask_file = NULL; + int block_size = 8; + int mask_rows = 0; + int mask_cols = 0; + int *mask_map; + int *prev_mask_map; +#endif + zero(rc.layer_target_bitrate); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&input_ctx, 0, sizeof(input_ctx)); + /* Setup default input stream settings */ + input_ctx.framerate.numerator = 30; + input_ctx.framerate.denominator = 1; + input_ctx.only_i420 = 1; + input_ctx.bit_depth = 0; + + exec_name = argv[0]; + // Check usage and arguments. + if (argc < min_args) { +#if CONFIG_VP9_HIGHBITDEPTH + die("Usage: %s " + " " + " " + " ... \n", + argv[0]); +#else + die("Usage: %s " + " " + " " + " ... \n", + argv[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + encoder = get_vpx_encoder_by_name(argv[3]); + if (!encoder) die("Unsupported codec."); + + printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); + + width = (unsigned int)strtoul(argv[4], NULL, 0); + height = (unsigned int)strtoul(argv[5], NULL, 0); + if (width < 16 || width % 2 || height < 16 || height % 2) { + die("Invalid resolution: %d x %d", width, height); + } + + layering_mode = (int)strtol(argv[12], NULL, 0); + if (layering_mode < 0 || layering_mode > 13) { + die("Invalid layering mode (0..12) %s", argv[12]); + } + +#if ROI_MAP + if (argc != min_args + mode_to_num_layers[layering_mode] + 1) { + die("Invalid number of arguments"); + } +#else + if (argc != min_args + mode_to_num_layers[layering_mode]) { + die("Invalid number of arguments"); + } +#endif + + input_ctx.filename = argv[1]; + open_input_file(&input_ctx); + +#if CONFIG_VP9_HIGHBITDEPTH + switch (strtol(argv[argc - 1], NULL, 0)) { + case 8: + bit_depth = VPX_BITS_8; + input_bit_depth = 8; + break; + case 10: + bit_depth = VPX_BITS_10; + input_bit_depth = 10; + break; + case 12: + bit_depth = VPX_BITS_12; + input_bit_depth = 12; + break; + default: die("Invalid bit depth (8, 10, 12) %s", argv[argc - 1]); + } + + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc( + &raw, + bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, + width, height, 32)) { + die("Failed to allocate image (%dx%d)", width, height); + } + } +#else + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { + die("Failed to allocate image (%dx%d)", width, height); + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Populate encoder configuration. + res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + return EXIT_FAILURE; + } + + // Update the default configuration with our settings. + cfg.g_w = width; + cfg.g_h = height; + +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth != VPX_BITS_8) { + cfg.g_bit_depth = bit_depth; + cfg.g_input_bit_depth = input_bit_depth; + cfg.g_profile = 2; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Timebase format e.g. 30fps: numerator=1, demoninator = 30. + cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0); + cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0); + + speed = (int)strtol(argv[8], NULL, 0); + if (speed < 0) { + die("Invalid speed setting: must be positive"); + } + if (strncmp(encoder->name, "vp9", 3) == 0 && speed > 9) { + warn("Mapping speed %d to speed 9.\n", speed); + } + + for (i = min_args_base; + (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { + rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0); + if (strncmp(encoder->name, "vp8", 3) == 0) + cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; + else if (strncmp(encoder->name, "vp9", 3) == 0) + cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; + } + + // Real time parameters. + cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0); + cfg.rc_end_usage = VPX_CBR; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 56; + if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; + cfg.rc_undershoot_pct = 50; + cfg.rc_overshoot_pct = 50; + cfg.rc_buf_initial_sz = 600; + cfg.rc_buf_optimal_sz = 600; + cfg.rc_buf_sz = 1000; + + // Disable dynamic resizing by default. + cfg.rc_resize_allowed = 0; + + // Use 1 thread as default. + cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0); + + error_resilient = (uint32_t)strtoul(argv[10], NULL, 0); + if (error_resilient != 0 && error_resilient != 1) { + die("Invalid value for error resilient (0, 1): %d.", error_resilient); + } + // Enable error resilient mode. + cfg.g_error_resilient = error_resilient; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + + // Disable automatic keyframe placement. + cfg.kf_min_dist = cfg.kf_max_dist = 3000; + + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + set_temporal_layer_pattern(layering_mode, &cfg, layer_flags, + &flag_periodicity); + + set_rate_control_metrics(&rc, &cfg); + + if (input_ctx.file_type == FILE_TYPE_Y4M) { + if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) { + die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); + } + if (input_ctx.framerate.numerator != cfg.g_timebase.den || + input_ctx.framerate.denominator != cfg.g_timebase.num) { + die("Incorrect framerate: numerator %d denominator %d", + cfg.g_timebase.num, cfg.g_timebase.den); + } + } + + framerate = cfg.g_timebase.den / cfg.g_timebase.num; + // Open an output file for each stream. + for (i = 0; i < cfg.ts_number_layers; ++i) { + char file_name[PATH_MAX]; + VpxVideoInfo info; + info.codec_fourcc = encoder->fourcc; + info.frame_width = cfg.g_w; + info.frame_height = cfg.g_h; + info.time_base.numerator = cfg.g_timebase.num; + info.time_base.denominator = cfg.g_timebase.den; + + snprintf(file_name, sizeof(file_name), "%s_%d.ivf", argv[2], i); + outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[i]) die("Failed to open %s for writing", file_name); + + assert(outfile[i] != NULL); + } + // No spatial layers in this encoder. + cfg.ss_number_layers = 1; + +// Initialize codec. +#if CONFIG_VP9_HIGHBITDEPTH + if (vpx_codec_enc_init( + &codec, encoder->codec_interface(), &cfg, + bit_depth == VPX_BITS_8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH)) +#else + if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) +#endif // CONFIG_VP9_HIGHBITDEPTH + die("Failed to initialize encoder"); + +#if ROI_MAP + mask_rows = (cfg.g_h + block_size - 1) / block_size; + mask_cols = (cfg.g_w + block_size - 1) / block_size; + mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); + prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); +#endif + + if (strncmp(encoder->name, "vp8", 3) == 0) { + vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); + vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { + vpx_svc_extra_cfg_t svc_params; + memset(&svc_params, 0, sizeof(svc_params)); + vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0); + vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); + vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); + vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); + vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); + vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0); + + if (cfg.g_threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0)) + die_codec(&codec, "Failed to set SVC"); + for (i = 0; i < cfg.ts_number_layers; ++i) { + svc_params.max_quantizers[i] = cfg.rc_max_quantizer; + svc_params.min_quantizers[i] = cfg.rc_min_quantizer; + } + svc_params.scaling_factor_num[0] = cfg.g_h; + svc_params.scaling_factor_den[0] = cfg.g_h; + vpx_codec_control(&codec, VP9E_SET_SVC_PARAMETERS, &svc_params); + } + if (strncmp(encoder->name, "vp8", 3) == 0) { + vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0); + } + vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1); + // This controls the maximum target size of the key frame. + // For generating smaller key frames, use a smaller max_intra_size_pct + // value, like 100 or 200. + { + const int max_intra_size_pct = 1000; + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, + max_intra_size_pct); + } + + frame_avail = 1; + while (frame_avail || got_data) { + struct vpx_usec_timer timer; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt; +#if ROI_MAP + char mask_file_name[255]; +#endif + // Update the temporal layer_id. No spatial layers in this test. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; + layer_id.temporal_layer_id_per_spatial[0] = layer_id.temporal_layer_id; + if (strncmp(encoder->name, "vp9", 3) == 0) { + vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); + } else if (strncmp(encoder->name, "vp8", 3) == 0) { + vpx_codec_control(&codec, VP8E_SET_TEMPORAL_LAYER_ID, + layer_id.temporal_layer_id); + } + flags = layer_flags[frame_cnt % flag_periodicity]; + if (layering_mode == 0) flags = 0; +#if ROI_MAP + snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt", + argv[argc - 1], frame_cnt); + mask_file = fopen(mask_file_name, "r"); + if (mask_file != NULL) { + read_mask(mask_file, mask_map); + fclose(mask_file); + // set_roi_map(encoder->name, &cfg, &roi); + set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + } +#endif + frame_avail = read_frame(&input_ctx, &raw); + if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; + vpx_usec_timer_start(&timer); + if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags, + VPX_DL_REALTIME)) { + die_codec(&codec, "Failed to encode frame"); + } + vpx_usec_timer_mark(&timer); + cx_time += vpx_usec_timer_elapsed(&timer); + // Reset KF flag. + if (layering_mode != 7) { + layer_flags[0] &= ~VPX_EFLAG_FORCE_KF; + } + got_data = 0; + while ((pkt = vpx_codec_get_cx_data(&codec, &iter))) { + got_data = 1; + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: + for (i = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; + i < cfg.ts_number_layers; ++i) { + vpx_video_writer_write_frame(outfile[i], pkt->data.frame.buf, + pkt->data.frame.sz, pts); + ++rc.layer_tot_enc_frames[i]; + rc.layer_encoding_bitrate[i] += 8.0 * pkt->data.frame.sz; + // Keep count of rate control stats per layer (for non-key frames). + if (i == cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity] && + !(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc.layer_avg_frame_size[i] += 8.0 * pkt->data.frame.sz; + rc.layer_avg_rate_mismatch[i] += + fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[i]) / + rc.layer_pfb[i]; + ++rc.layer_enc_frames[i]; + } + } + // Update for short-time encoding bitrate states, for moving window + // of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (rc.window_size == 0) rc.window_size = 15; + if (frame_cnt > rc.window_size) { + sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; + if (frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate / rc.window_size) * + (sum_bitrate / rc.window_size); + sum_bitrate = 0.0; + } + } + // Second shifted window. + if (frame_cnt > rc.window_size + rc.window_size / 2) { + sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate; + if (frame_cnt > 2 * rc.window_size && + frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate2 / rc.window_size) * + (sum_bitrate2 / rc.window_size); + sum_bitrate2 = 0.0; + } + } + break; + default: break; + } + } + ++frame_cnt; + pts += frame_duration; + } +#if ROI_MAP + free(mask_map); + free(prev_mask_map); +#endif + close_input_file(&input_ctx); + printout_rate_control_summary(&rc, &cfg, frame_cnt); + printf("\n"); + printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", + frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), + 1000000 * (double)frame_cnt / (double)cx_time); + + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + // Try to rewrite the output file headers with the actual frame count. + for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); + + if (input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } + +#if ROI_MAP + free(roi.roi_map); +#endif + return EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/ivfdec.c b/media/libvpx/libvpx/ivfdec.c new file mode 100644 index 0000000000..3e179bc6ed --- /dev/null +++ b/media/libvpx/libvpx/ivfdec.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx_ports/mem_ops.h" + +#include "./ivfdec.h" + +static const char *IVF_SIGNATURE = "DKIF"; + +static void fix_framerate(int *num, int *den) { + // Some versions of vpxenc used 1/(2*fps) for the timebase, so + // we can guess the framerate using only the timebase in this + // case. Other files would require reading ahead to guess the + // timebase, like we do for webm. + if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) { + // Correct for the factor of 2 applied to the timebase in the encoder. + if (*num & 1) + *den *= 2; + else + *num /= 2; + } else { + // Don't know FPS for sure, and don't have readahead code + // (yet?), so just default to 30fps. + *num = 30; + *den = 1; + } +} + +int file_is_ivf(struct VpxInputContext *input_ctx) { + char raw_hdr[32]; + int is_ivf = 0; + + if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) { + if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) { + is_ivf = 1; + + if (mem_get_le16(raw_hdr + 4) != 0) { + fprintf(stderr, + "Error: Unrecognized IVF version! This file may not" + " decode properly."); + } + + input_ctx->fourcc = mem_get_le32(raw_hdr + 8); + input_ctx->width = mem_get_le16(raw_hdr + 12); + input_ctx->height = mem_get_le16(raw_hdr + 14); + input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16); + input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20); + fix_framerate(&input_ctx->framerate.numerator, + &input_ctx->framerate.denominator); + } + } + + if (!is_ivf) { + rewind(input_ctx->file); + input_ctx->detect.buf_read = 0; + } else { + input_ctx->detect.position = 4; + } + return is_ivf; +} + +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_header[IVF_FRAME_HDR_SZ] = { 0 }; + size_t frame_size = 0; + + if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { + if (!feof(infile)) warn("Failed to read frame size"); + } else { + frame_size = mem_get_le32(raw_header); + + if (frame_size > 256 * 1024 * 1024) { + warn("Read invalid frame size (%u)", (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size > *buffer_size) { + uint8_t *new_buffer = realloc(*buffer, 2 * frame_size); + + if (new_buffer) { + *buffer = new_buffer; + *buffer_size = 2 * frame_size; + } else { + warn("Failed to allocate compressed data buffer"); + frame_size = 0; + } + } + } + + if (!feof(infile)) { + if (fread(*buffer, 1, frame_size, infile) != frame_size) { + warn("Failed to read full frame"); + return 1; + } + + *bytes_read = frame_size; + return 0; + } + + return 1; +} diff --git a/media/libvpx/libvpx/ivfdec.h b/media/libvpx/libvpx/ivfdec.h new file mode 100644 index 0000000000..847cd79f3f --- /dev/null +++ b/media/libvpx/libvpx/ivfdec.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_IVFDEC_H_ +#define VPX_IVFDEC_H_ + +#include "./tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int file_is_ivf(struct VpxInputContext *input); + +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // VPX_IVFDEC_H_ diff --git a/media/libvpx/libvpx/ivfenc.c b/media/libvpx/libvpx/ivfenc.c new file mode 100644 index 0000000000..2e8e04283a --- /dev/null +++ b/media/libvpx/libvpx/ivfenc.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./ivfenc.h" + +#include "vpx/vpx_encoder.h" +#include "vpx_ports/mem_ops.h" + +void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc, + int frame_cnt, int frame_width, + int frame_height, + vpx_rational_t timebase) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, fourcc); // fourcc + mem_put_le16(header + 12, frame_width); // width + mem_put_le16(header + 14, frame_height); // height + mem_put_le32(header + 16, timebase.den); // rate + mem_put_le32(header + 20, timebase.num); // scale + mem_put_le32(header + 24, frame_cnt); // length + mem_put_le32(header + 28, 0); // unused + + fwrite(header, 1, 32, outfile); +} + +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { + ivf_write_file_header_with_video_info(outfile, fourcc, frame_cnt, cfg->g_w, + cfg->g_h, cfg->g_timebase); +} + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { + char header[12]; + + mem_put_le32(header, (int)frame_size); + mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); + mem_put_le32(header + 8, (int)(pts >> 32)); + fwrite(header, 1, 12, outfile); +} + +void ivf_write_frame_size(FILE *outfile, size_t frame_size) { + char header[4]; + + mem_put_le32(header, (int)frame_size); + fwrite(header, 1, 4, outfile); +} diff --git a/media/libvpx/libvpx/ivfenc.h b/media/libvpx/libvpx/ivfenc.h new file mode 100644 index 0000000000..27b6910805 --- /dev/null +++ b/media/libvpx/libvpx/ivfenc.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_IVFENC_H_ +#define VPX_IVFENC_H_ + +#include "./tools_common.h" + +#include "vpx/vpx_encoder.h" + +struct vpx_codec_enc_cfg; +struct vpx_codec_cx_pkt; + +#ifdef __cplusplus +extern "C" { +#endif + +void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc, + int frame_cnt, int frame_width, + int frame_height, + vpx_rational_t timebase); + +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + uint32_t fourcc, int frame_cnt); + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size); + +void ivf_write_frame_size(FILE *outfile, size_t frame_size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // VPX_IVFENC_H_ diff --git a/media/libvpx/libvpx/keywords.dox b/media/libvpx/libvpx/keywords.dox new file mode 100644 index 0000000000..56f5368900 --- /dev/null +++ b/media/libvpx/libvpx/keywords.dox @@ -0,0 +1,51 @@ +/*!\page rfc2119 RFC2119 Keywords + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL + NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and + "OPTIONAL" in this document are to be interpreted as described in + RFC 2119. + +Specifically, the following definitions are used: + +\section MUST +\anchor REQUIRED +\anchor SHALL + This word, or the terms "REQUIRED" or "SHALL", mean that the + definition is an absolute requirement of the specification. + +\section MUSTNOT MUST NOT +\anchor SHALLNOT + This phrase, or the phrase "SHALL NOT", mean that the + definition is an absolute prohibition of the specification. + +\section SHOULD +\anchor RECOMMENDED + This word, or the adjective "RECOMMENDED", mean that there + may exist valid reasons in particular circumstances to ignore a + particular item, but the full implications must be understood and + carefully weighed before choosing a different course. + +\section SHOULDNOT SHOULD NOT +\anchor NOTRECOMMENDED + This phrase, or the phrase "NOT RECOMMENDED" mean that + there may exist valid reasons in particular circumstances when the + particular behavior is acceptable or even useful, but the full + implications should be understood and the case carefully weighed + before implementing any behavior described with this label. + +\section MAY +\anchor OPTIONAL + This word, or the adjective "OPTIONAL", mean that an item is + truly optional. One vendor may choose to include the item because a + particular marketplace requires it or because the vendor feels that + it enhances the product while another vendor may omit the same item. + An implementation which does not include a particular option \ref MUST be + prepared to interoperate with another implementation which does + include the option, though perhaps with reduced functionality. In the + same vein an implementation which does include a particular option + \ref MUST be prepared to interoperate with another implementation which + does not include the option (except, of course, for the feature the + option provides.) + + +*/ diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template new file mode 100644 index 0000000000..1ee442af3e --- /dev/null +++ b/media/libvpx/libvpx/libs.doxy_template @@ -0,0 +1,1260 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +# Doxyfile 1.5.4 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file that +# follow. The default is UTF-8 which is also the encoding used for all text before +# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into +# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of +# possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = "WebM Codec SDK" + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, +# Italian, Japanese, Japanese-en (Japanese with English messages), Korean, +# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, +# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to java_doc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful is your file systems +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a java_doc-style +# comment as the brief description. If set to NO, the java_doc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for Java. +# For instance, namespaces will be presented as packages, qualified scopes +# will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to +# include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the defqault) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is +# documented as struct with the name of the typedef. So +# typedef struct type_s {} type_t, will appear in the documentation as a struct +# with name type_t. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named type_s. This can typically +# be useful for C code where the coding convention is that all structs are +# typedef'ed and only the typedef is referenced never the struct's name. + +TYPEDEF_HIDES_STRUCT = NO + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be extracted +# and appear in the documentation as a namespace called 'anonymous_namespace{file}', +# where file will be replaced with the base name of the file that contains the anonymous +# namespace. By default anonymous namespace are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = NO + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from the +# version control system). Doxygen will invoke the program by executing (via +# popen()) the command , where is the value of +# the FILE_VERSION_FILTER tag, and is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = YES + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be abled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = + +# This tag can be used to specify the character encoding of the source files that +# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default +# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding. +# See http://www.gnu.org/software/libiconv for the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to turn specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the output. +# The symbol name can be a fully qualified name, a word, or if the wildcard * is used, +# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain image that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command , where +# is the value of the INPUT_FILTER tag, and is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH +# then you must also enable this option. If you don't then doxygen will produce +# a warning and turn it on anyway + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES (the default) +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES (the default) +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentstion. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# java_script and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# java_script, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +#--------------------------------------------------------------------------- +# configuration options related to the la_te_x output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the la_te_x docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the la_te_x command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for la_te_x. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# la_te_x documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = YES + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = letter + +# The EXTRA_PACKAGES tag can be to specify one or more names of la_te_x +# packages that should be included in the la_te_x output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal la_te_x header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the la_te_x that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated la_te_x files. This will instruct la_te_x to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the auto_gen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an auto_gen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and la_te_x code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = YES + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = *.h + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. +# The default value is: YES. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will +# generate a call dependency graph for every global function or class method. +# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable call graphs for selected +# functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will +# generate a caller dependency graph for every global function or class method. +# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable caller graphs for selected +# functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MAX_DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that doxygen if the number +# of direct children of the root node in a graph is already larger than +# MAX_DOT_GRAPH_NOTES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, which results in a white background. +# Warning: Depending on the platform used, enabling this option may lead to +# badly anti-aliased labels on the edges of a graph (i.e. they become hard to +# read). + +DOT_TRANSPARENT = YES + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored. + +SEARCHENGINE = NO diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk new file mode 100644 index 0000000000..ff1c569c3b --- /dev/null +++ b/media/libvpx/libvpx/libs.mk @@ -0,0 +1,801 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +# ARM assembly files are written in RVCT-style. We use some make magic to +# filter those files to allow GCC compilation +ifeq ($(VPX_ARCH_ARM),yes) + ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm) +else + ASM:=.asm +endif + +# +# Rule to generate runtime cpu detection files +# +define rtcd_h_template +$$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2) + @echo " [CREATE] $$@" + $$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \ + --sym=$(1) \ + --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \ + $$(RTCD_OPTIONS) $$^ > $$@ +CLEAN-OBJS += $$(BUILD_PFX)$(1).h +RTCD += $$(BUILD_PFX)$(1).h +endef + +CODEC_SRCS-yes += CHANGELOG +CODEC_SRCS-yes += libs.mk + +include $(SRC_PATH_BARE)/vpx/vpx_codec.mk +CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS)) +CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS)) + +include $(SRC_PATH_BARE)/vpx_mem/vpx_mem.mk +CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS)) + +include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk +CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS)) + +include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk +CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS)) + +include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk +CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS)) + +include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk +CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS)) + +ifeq ($(CONFIG_VP8),yes) + VP8_PREFIX=vp8/ + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk +endif + +ifeq ($(CONFIG_VP8_ENCODER),yes) + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS)) + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SECTIONS += vp8 vp8_encoder +endif + +ifeq ($(CONFIG_VP8_DECODER),yes) + include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk + CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS)) + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% + CODEC_DOC_SECTIONS += vp8 vp8_decoder +endif + +ifeq ($(CONFIG_VP9),yes) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk +endif + +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk + CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) + CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h + CODEC_SRCS-yes += vpx/vpx_ext_ratectrl.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h + CODEC_DOC_SECTIONS += vp9 vp9_encoder +endif + +RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h +RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk + RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc + RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h +endif +ifeq ($(CONFIG_VP8_ENCODER),yes) + VP8_PREFIX=vp8/ + RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h +endif + +ifeq ($(CONFIG_VP9_DECODER),yes) + VP9_PREFIX=vp9/ + include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk + CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS)) + CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS)) + CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h + INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h + INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h + CODEC_DOC_SECTIONS += vp9 vp9_decoder +endif + +ifeq ($(CONFIG_ENCODERS),yes) + CODEC_DOC_SECTIONS += encoder +endif +ifeq ($(CONFIG_DECODERS),yes) + CODEC_DOC_SECTIONS += decoder +endif + +ifeq ($(CONFIG_MSVS),yes) +CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) +GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) +RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd) +# This variable uses deferred expansion intentionally, since the results of +# $(wildcard) may change during the course of the Make. +VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d)))) +endif + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/vpx/% +INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/vpx_ports/% +INSTALL_MAPS += $(LIBSUBDIR)/% % +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +ifeq ($(CONFIG_MSVS),yes) +INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/% $(p)/Release/%) +INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/% $(p)/Debug/%) +endif + +CODEC_SRCS-yes += build/make/version.sh +CODEC_SRCS-yes += build/make/rtcd.pl +CODEC_SRCS-yes += vpx_ports/emmintrin_compat.h +CODEC_SRCS-yes += vpx_ports/mem_ops.h +CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h +CODEC_SRCS-yes += vpx_ports/vpx_once.h +CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c +INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm +endif +CODEC_EXPORTS-yes += vpx/exports_com +CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc +CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec + +INSTALL-LIBS-yes += include/vpx/vpx_codec.h +INSTALL-LIBS-yes += include/vpx/vpx_frame_buffer.h +INSTALL-LIBS-yes += include/vpx/vpx_image.h +INSTALL-LIBS-yes += include/vpx/vpx_integer.h +INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h +ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +ifeq ($(CONFIG_MSVS),yes) +INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) +INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB)d.lib) +INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.dll) +INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp) +endif +else +INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a +INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a +endif + +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) + SIMPLE_ENCODE_SRCS := $(call enabled,CODEC_SRCS) + SIMPLE_ENCODE_SRCS += $(VP9_PREFIX)simple_encode.cc + SIMPLE_ENCODE_SRCS += $(VP9_PREFIX)simple_encode.h + SIMPLE_ENCODE_SRCS += ivfenc.h + SIMPLE_ENCODE_SRCS += ivfenc.c + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)simple_encode.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)simple_encode.h +endif + +CODEC_SRCS=$(call enabled,CODEC_SRCS) + +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) + + +# Generate a list of all enabled sources, in particular for exporting to gyp +# based build systems. +libvpx_srcs.txt: + @echo " [CREATE] $@" + @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ +CLEAN-OBJS += libvpx_srcs.txt + +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ + +ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +ifeq ($(CONFIG_MSVS),yes) + +vpx.def: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\ + --name=vpx\ + --out=$@ $^ +CLEAN-OBJS += vpx.def + +vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) + +vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + $(if $(CONFIG_SHARED),--dll,--lib) \ + --target=$(TOOLCHAIN) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --name=vpx \ + --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \ + --module-def=vpx.def \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --out=$@ $(CFLAGS) \ + --as=$(AS) \ + $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + +PROJECTS-yes += vpx.$(VCPROJ_SFX) + +vpx.$(VCPROJ_SFX): vpx_config.asm +vpx.$(VCPROJ_SFX): $(RTCD) + +vpxrc.$(VCPROJ_SFX): \ + VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) + +vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + $(if $(CONFIG_SHARED),--dll,--lib) \ + --target=$(TOOLCHAIN) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --name=vpxrc \ + --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --out=$@ $(CFLAGS) \ + --as=$(AS) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.cc vp9/%.h vpx/% \ + vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + +PROJECTS-yes += vpxrc.$(VCPROJ_SFX) + +vpxrc.$(VCPROJ_SFX): vpx_config.asm +vpxrc.$(VCPROJ_SFX): $(RTCD) + +endif # ifeq ($(CONFIG_MSVS),yes) +else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) +OBJS-yes += $(LIBVPX_OBJS) +LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a +$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) + +# Updating version info. +# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +# For libtool: c=, a=, r= +# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is +# passed to libtool. +# +# libvpx library file is generated as libvpx.so... +# MAJOR = c-a, MINOR = a, PATCH = r +# +# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current +# SO_VERSION_* then follow the rules in the link to detemine the new version +# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 +SO_VERSION_MAJOR := 8 +SO_VERSION_MINOR := 0 +SO_VERSION_PATCH := 1 +ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) +LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib +SHARED_LIB_SUF := .dylib +EXPORT_FILE := libvpx.syms +LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ + libvpx.dylib ) +else +ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS)) +LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib +SHARED_LIB_SUF := .dylib +EXPORT_FILE := libvpx.syms +LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, libvpx.dylib) +else +ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS)) +LIBVPX_SO := libvpx$(SO_VERSION_MAJOR).dll +SHARED_LIB_SUF := _dll.a +EXPORT_FILE := libvpx.def +LIBVPX_SO_SYMLINKS := +LIBVPX_SO_IMPLIB := libvpx_dll.a +else +LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH) +SHARED_LIB_SUF := .so +EXPORT_FILE := libvpx.ver +LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ + libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \ + libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR)) +endif +endif +endif + +LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\ + $(notdir $(LIBVPX_SO_SYMLINKS)) \ + $(if $(LIBVPX_SO_IMPLIB), $(BUILD_PFX)$(LIBVPX_SO_IMPLIB)) +$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE) +$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm +$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) +$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) + +libvpx.def: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ + $(qexec)echo "DATA MULTIPLE NONSHARED" >> $@ + $(qexec)echo "EXPORTS" >> $@ + $(qexec)awk '!/vpx_svc_*/ {print "_"$$2}' $^ >>$@ +CLEAN-OBJS += libvpx.def + +libvpx_dll.a: $(LIBVPX_SO) + @echo " [IMPLIB] $@" + $(qexec)emximp -o $@ $< +CLEAN-OBJS += libvpx_dll.a + +define libvpx_symlink_template +$(1): $(2) + @echo " [LN] $(2) $$@" + $(qexec)mkdir -p $$(dir $$@) + $(qexec)ln -sf $(2) $$@ +endef + +$(eval $(call libvpx_symlink_template,\ + $(addprefix $(BUILD_PFX),$(notdir $(LIBVPX_SO_SYMLINKS))),\ + $(BUILD_PFX)$(LIBVPX_SO))) +$(eval $(call libvpx_symlink_template,\ + $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\ + $(LIBVPX_SO))) + + +INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS) +INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO) +INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBVPX_SO_IMPLIB),$(LIBSUBDIR)/$(LIBVPX_SO_IMPLIB)) + + +LIBS-yes += vpx.pc +vpx.pc: config.mk libs.mk + @echo " [CREATE] $@" + $(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@ + $(qexec)echo 'prefix=$(PREFIX)' >> $@ + $(qexec)echo 'exec_prefix=$${prefix}' >> $@ + $(qexec)echo 'libdir=$${prefix}/$(LIBSUBDIR)' >> $@ + $(qexec)echo 'includedir=$${prefix}/include' >> $@ + $(qexec)echo '' >> $@ + $(qexec)echo 'Name: vpx' >> $@ + $(qexec)echo 'Description: WebM Project VPx codec implementation' >> $@ + $(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@ + $(qexec)echo 'Requires:' >> $@ + $(qexec)echo 'Conflicts:' >> $@ + $(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@ +ifeq ($(HAVE_PTHREAD_H),yes) + $(qexec)echo 'Libs.private: -lm -lpthread' >> $@ +else + $(qexec)echo 'Libs.private: -lm' >> $@ +endif + $(qexec)echo 'Cflags: -I$${includedir}' >> $@ +INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc +INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc +CLEAN-OBJS += vpx.pc + +ifeq ($(CONFIG_ENCODERS),yes) + RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) + OBJS-yes += $(RC_RTC_OBJS) + LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a + $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS) +endif + +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes) + SIMPLE_ENCODE_OBJS=$(call objs,$(SIMPLE_ENCODE_SRCS)) + OBJS-yes += $(SIMPLE_ENCODE_OBJS) + LIBS-yes += $(BUILD_PFX)libsimple_encode.a $(BUILD_PFX)libsimple_encode_g.a + $(BUILD_PFX)libsimple_encode_g.a: $(SIMPLE_ENCODE_OBJS) +endif + +endif # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) + +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + +# +# Rule to make assembler configuration file from C configuration file +# +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) +# YASM +$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h + @echo " [CREATE] $@" + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ + | awk '{print $$2 " equ " $$3}' > $@ +else +ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) +$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h + @echo " [CREATE] $@" + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ + | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ + @echo " END" $(ADS2GAS) >> $@ +CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm +endif + +# +# Add assembler dependencies for configuration. +# +$(filter %.S.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm + + +$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h) +CLEAN-OBJS += $(BUILD_PFX)vpx_version.h + +# +# Add include path for libwebm sources. +# +ifeq ($(CONFIG_WEBM_IO),yes) + CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/libwebm +endif + +## +## libvpx test directives +## +ifeq ($(CONFIG_UNIT_TESTS),yes) +LIBVPX_TEST_DATA_PATH ?= . + +include $(SRC_PATH_BARE)/test/test.mk + +# addprefix_clean behaves like addprefix if the target doesn't start with "../" +# However, if the target starts with "../", instead of adding prefix, +# it will remove "../". +# Using addprefix_clean, we can avoid two different targets building the +# same file, i.e. +# test/../ivfenc.c.d: ivfenc.o +# ivfenc.c.d: ivfenc.o +# Note that the other way to solve this problem is using "realpath". +# The "realpath" is supported by make 3.81 or later. +addprefix_clean=$(patsubst $(1)../%,%,$(addprefix $(1), $(2))) +LIBVPX_TEST_SRCS=$(call addprefix_clean,test/,$(call enabled,LIBVPX_TEST_SRCS)) + +LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) +LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ + $(call enabled,LIBVPX_TEST_DATA)) +libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) + +TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) +TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) +TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) + +ifeq ($(CONFIG_ENCODERS),yes) +RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) +RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,RC_INTERFACE_TEST_SRCS)) +RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS))) +endif + +SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX) +SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,SIMPLE_ENCODE_TEST_SRCS)) +SIMPLE_ENCODE_TEST_OBJS := $(sort $(call objs,$(SIMPLE_ENCODE_TEST_SRCS))) + +libvpx_test_srcs.txt: + @echo " [CREATE] $@" + @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ +CLEAN-OBJS += libvpx_test_srcs.txt + +# Attempt to download the file using curl, retrying once if it fails for a +# partial file (18). +$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 + @echo " [DOWNLOAD] $@" + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl -S -s --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; ret=$$?; \ + case "$$ret" in \ + 18) $$curl -C - ;; \ + *) exit $$ret ;; \ + esac \ + ) + +testdata: $(LIBVPX_TEST_DATA) + $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ + [ -x "$$(which shasum)" ] && sha1sum=shasum;\ + [ -x "$$(which sha1)" ] && sha1sum=sha1;\ + if [ -n "$${sha1sum}" ]; then\ + set -e;\ + echo "Checking test data:";\ + for f in $(call enabled,LIBVPX_TEST_DATA); do\ + grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ + (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\ + done; \ + else\ + echo "Skipping test data integrity check, sha1sum not found.";\ + fi + +ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +ifeq ($(CONFIG_MSVS),yes) + +gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --lib \ + --target=$(TOOLCHAIN) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --name=gtest \ + --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ + -D_VARIADIC_MAX=10 \ + --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src" + +PROJECTS-$(CONFIG_MSVS) += gtest.$(VCPROJ_SFX) + +test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=test_libvpx \ + -D_VARIADIC_MAX=10 \ + --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ + $(if $(CONFIG_WEBM_IO),-I"$(SRC_PATH_BARE)/third_party/libwebm") \ + -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ + +PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX) + +LIBVPX_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BIN))) + +ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),) +PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX) +test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=test_intra_pred_speed \ + -D_VARIADIC_MAX=10 \ + --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ + -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ +endif # TEST_INTRA_PRED_SPEED + +ifeq ($(CONFIG_ENCODERS),yes) +ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) +PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) +test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ + vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=test_rc_interface \ + -D_VARIADIC_MAX=10 \ + --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \ + --ver=$(CONFIG_VS_VERSION) \ + --as=$(AS) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ + -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^ +endif # RC_INTERFACE_TEST +endif # CONFIG_ENCODERS +endif # CONFIG_MSVS +else + +include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk +GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS)) +GTEST_OBJS=$(call objs,$(GTEST_SRCS)) +ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS)) +# Disabling pthreads globally will cause issues on darwin and possibly elsewhere +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0 +endif +GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src +GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/include +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(GTEST_OBJS) +LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a +$(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS) + +LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS))) +$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(LIBVPX_TEST_OBJS) +BINS-yes += $(LIBVPX_TEST_BIN) + +CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx) +CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a) +TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a +$(LIBVPX_TEST_BIN): $(TEST_LIBS) +$(eval $(call linkerxx_template,$(LIBVPX_TEST_BIN), \ + $(LIBVPX_TEST_OBJS) \ + -L. -lvpx -lgtest $(extralibs) -lm)) + +ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),) +$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS) +BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN) + +$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS) +$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ + $(TEST_INTRA_PRED_SPEED_OBJS) \ + -L. -lvpx -lgtest $(extralibs) -lm)) +endif # TEST_INTRA_PRED_SPEED + +ifeq ($(CONFIG_ENCODERS),yes) +ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) +$(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ + CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(RC_INTERFACE_TEST_OBJS) +BINS-yes += $(RC_INTERFACE_TEST_BIN) + +$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a +$(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ + $(RC_INTERFACE_TEST_OBJS) \ + -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm)) +endif # RC_INTERFACE_TEST +endif # CONFIG_ENCODERS + +ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),) +$(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \ + CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(SIMPLE_ENCODE_TEST_OBJS) +BINS-yes += $(SIMPLE_ENCODE_TEST_BIN) + +$(SIMPLE_ENCODE_TEST_BIN): $(TEST_LIBS) libsimple_encode.a +$(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \ + $(SIMPLE_ENCODE_TEST_OBJS) \ + -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm)) +endif # SIMPLE_ENCODE_TEST + +endif # CONFIG_EXTERNAL_BUILD + +# Install test sources only if codec source is included +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ + $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f)) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS) + +define test_shard_template +test: test_shard.$(1) +test-no-data-check: test_shard_ndc.$(1) +test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) + @set -e; \ + export GTEST_SHARD_INDEX=$(1); \ + export GTEST_TOTAL_SHARDS=$(2); \ + $(LIBVPX_TEST_BIN) +test_shard.$(1): testdata +.PHONY: test_shard.$(1) test_shard_ndc.$(1) +endef + +NUM_SHARDS := 10 +SHARDS := 0 1 2 3 4 5 6 7 8 9 +$(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS)))) + +endif # CONFIG_UNIT_TESTS + +## +## documentation directives +## +CLEAN-OBJS += libs.doxy +DOCS-yes += libs.doxy +libs.doxy: $(CODEC_DOC_SRCS) + @echo " [CREATE] $@" + @rm -f $@ + @echo "INPUT += $^" >> $@ + @echo "INCLUDE_PATH += ." >> $@; + @echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@ + +## Generate rtcd.h for all objects +ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes) +$(OBJS-yes:.o=.d): $(RTCD) +else +$(OBJS-yes): $(RTCD) +endif + +## Update the global src list +SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS) +SRCS += $(RC_INTERFACE_TEST_SRCS) + +## +## vpxdec/vpxenc tests. +## +ifeq ($(CONFIG_UNIT_TESTS),yes) +TEST_BIN_PATH = . +ifeq ($(CONFIG_MSVS),yes) +# MSVC will build both Debug and Release configurations of tools in a +# sub directory named for the current target. Assume the user wants to +# run the Release tools, and assign TEST_BIN_PATH accordingly. +# TODO(tomfinegan): Is this adequate for ARM? +# TODO(tomfinegan): Support running the debug versions of tools? +TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH)) +endif +utiltest utiltest-no-data-check: + $(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ + --bin-path $(TEST_BIN_PATH) + $(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ + --bin-path $(TEST_BIN_PATH) +utiltest: testdata +else +utiltest utiltest-no-data-check: + @echo Unit tests must be enabled to make the utiltest target. +endif + +## +## Example tests. +## +ifeq ($(CONFIG_UNIT_TESTS),yes) +# All non-MSVC targets output example targets in a sub dir named examples. +EXAMPLES_BIN_PATH = examples +ifeq ($(CONFIG_MSVS),yes) +# MSVC will build both Debug and Release configurations of the examples in a +# sub directory named for the current target. Assume the user wants to +# run the Release tools, and assign EXAMPLES_BIN_PATH accordingly. +# TODO(tomfinegan): Is this adequate for ARM? +# TODO(tomfinegan): Support running the debug versions of tools? +EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release +endif +exampletest exampletest-no-data-check: examples + $(qexec)$(SRC_PATH_BARE)/test/examples.sh \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ + --bin-path $(EXAMPLES_BIN_PATH) +exampletest: testdata +else +exampletest exampletest-no-data-check: + @echo Unit tests must be enabled to make the exampletest target. +endif diff --git a/media/libvpx/libvpx/mainpage.dox b/media/libvpx/libvpx/mainpage.dox new file mode 100644 index 0000000000..4b0dff0871 --- /dev/null +++ b/media/libvpx/libvpx/mainpage.dox @@ -0,0 +1,55 @@ +/*!\mainpage WebM Codec SDK + + \section main_contents Page Contents + - \ref main_intro + - \ref main_startpoints + - \ref main_support + + \section main_intro Introduction + Welcome to the WebM Codec SDK. This SDK allows you to integrate your + applications with the VP8 and VP9 video codecs, high quality, royalty free, + open source codecs deployed on billions of computers and devices worldwide. + + This distribution of the WebM Codec SDK includes the following support: + + \if vp8_encoder + - \ref vp8_encoder + \endif + \if vp8_decoder + - \ref vp8_decoder + \endif + + + \section main_startpoints Starting Points + - Consult the \ref changelog for a complete list of improvements in this + release. + - The \ref readme contains instructions on recompiling the sample applications. + - Read the \ref usage "usage" for a narrative on codec usage. + \if samples + - Read the \ref samples "sample code" for examples of how to interact with the + codec. + \endif + - \ref codec reference + \if encoder + - \ref encoder reference + \endif + \if decoder + - \ref decoder reference + \endif + + \section main_support Support Options & FAQ + The WebM project is an open source project supported by its community. For + questions about this SDK, please mail the apps-devel@webmproject.org list. + To contribute, see http://www.webmproject.org/code/contribute and mail + codec-devel@webmproject.org. +*/ + +/*!\page changelog CHANGELOG + \verbinclude CHANGELOG +*/ + +/*!\page readme README + \verbinclude README +*/ + +/*!\defgroup codecs Supported Codecs */ diff --git a/media/libvpx/libvpx/md5_utils.c b/media/libvpx/libvpx/md5_utils.c new file mode 100644 index 0000000000..abd8d43c39 --- /dev/null +++ b/media/libvpx/libvpx/md5_utils.c @@ -0,0 +1,237 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' header + * definitions + * - Ian Jackson . + * Still in the public domain. + */ + +#include /* for memcpy() */ + +#include "md5_utils.h" +#include "vpx_ports/compiler_attributes.h" + +static void byteSwap(UWORD32 *buf, unsigned words) { + md5byte *p; + + /* Only swap bytes for big endian machines */ + int i = 1; + + if (*(char *)&i == 1) return; + + p = (md5byte *)buf; + + do { + *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | + ((unsigned)p[1] << 8 | p[0]); + p += 4; + } while (--words); +} + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(struct MD5Context *ctx) { + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bytes[0] = 0; + ctx->bytes[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { + UWORD32 t; + + /* Update byte count */ + + t = ctx->bytes[0]; + + if ((ctx->bytes[0] = t + len) < t) + ctx->bytes[1]++; /* Carry from low to high */ + + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + + if (t > len) { + memcpy((md5byte *)ctx->in + 64 - t, buf, len); + return; + } + + /* First chunk is an odd size */ + memcpy((md5byte *)ctx->in + 64 - t, buf, t); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += t; + len -= t; + + /* Process data in 64-byte chunks */ + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. */ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. + */ +VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { + UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#endif diff --git a/media/libvpx/libvpx/md5_utils.h b/media/libvpx/libvpx/md5_utils.h new file mode 100644 index 0000000000..e0d5a2d1fb --- /dev/null +++ b/media/libvpx/libvpx/md5_utils.h @@ -0,0 +1,49 @@ +/* + * This is the header file for the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' + * header definitions + * - Ian Jackson . + * Still in the public domain. + */ + +#ifndef VPX_MD5_UTILS_H_ +#define VPX_MD5_UTILS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define md5byte unsigned char +#define UWORD32 unsigned int + +typedef struct MD5Context MD5Context; +struct MD5Context { + UWORD32 buf[4]; + UWORD32 bytes[2]; + UWORD32 in[16]; +}; + +void MD5Init(struct MD5Context *context); +void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len); +void MD5Final(unsigned char digest[16], struct MD5Context *context); +void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_MD5_UTILS_H_ diff --git a/media/libvpx/libvpx/rate_hist.c b/media/libvpx/libvpx/rate_hist.c new file mode 100644 index 0000000000..947950d481 --- /dev/null +++ b/media/libvpx/libvpx/rate_hist.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./rate_hist.h" + +#define RATE_BINS 100 +#define HIST_BAR_MAX 40 + +struct hist_bucket { + int low; + int high; + int count; +}; + +struct rate_hist { + int64_t *pts; + int *sz; + int samples; + int frames; + struct hist_bucket bucket[RATE_BINS]; + int total; +}; + +struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, + const vpx_rational_t *fps) { + int i; + struct rate_hist *hist = calloc(1, sizeof(*hist)); + + if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || + fps->den == 0) { + destroy_rate_histogram(hist); + return NULL; + } + + // Determine the number of samples in the buffer. Use the file's framerate + // to determine the number of frames in rc_buf_sz milliseconds, with an + // adjustment (5/4) to account for alt-refs + hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; + + // prevent division by zero + if (hist->samples == 0) hist->samples = 1; + + hist->frames = 0; + hist->total = 0; + + hist->pts = calloc(hist->samples, sizeof(*hist->pts)); + hist->sz = calloc(hist->samples, sizeof(*hist->sz)); + for (i = 0; i < RATE_BINS; i++) { + hist->bucket[i].low = INT_MAX; + hist->bucket[i].high = 0; + hist->bucket[i].count = 0; + } + + return hist; +} + +void destroy_rate_histogram(struct rate_hist *hist) { + if (hist) { + free(hist->pts); + free(hist->sz); + free(hist); + } +} + +void update_rate_histogram(struct rate_hist *hist, + const vpx_codec_enc_cfg_t *cfg, + const vpx_codec_cx_pkt_t *pkt) { + int i; + int64_t then = 0; + int64_t avg_bitrate = 0; + int64_t sum_sz = 0; + const int64_t now = pkt->data.frame.pts * 1000 * + (uint64_t)cfg->g_timebase.num / + (uint64_t)cfg->g_timebase.den; + + int idx; + + if (hist == NULL || cfg == NULL || pkt == NULL) return; + + idx = hist->frames++ % hist->samples; + hist->pts[idx] = now; + hist->sz[idx] = (int)pkt->data.frame.sz; + + if (now < cfg->rc_buf_initial_sz) return; + + if (!cfg->rc_target_bitrate) return; + + then = now; + + /* Sum the size over the past rc_buf_sz ms */ + for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) { + const int i_idx = (i - 1) % hist->samples; + + then = hist->pts[i_idx]; + if (now - then > cfg->rc_buf_sz) break; + sum_sz += hist->sz[i_idx]; + } + + if (now == then) return; + + avg_bitrate = sum_sz * 8 * 1000 / (now - then); + idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000)); + if (idx < 0) idx = 0; + if (idx > RATE_BINS - 1) idx = RATE_BINS - 1; + if (hist->bucket[idx].low > avg_bitrate) + hist->bucket[idx].low = (int)avg_bitrate; + if (hist->bucket[idx].high < avg_bitrate) + hist->bucket[idx].high = (int)avg_bitrate; + hist->bucket[idx].count++; + hist->total++; +} + +static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, + int *num_buckets) { + int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; + int buckets; + int i; + + assert(bucket != NULL); + assert(num_buckets != NULL); + + buckets = *num_buckets; + + /* Find the extrema for this list of buckets */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + + /* If we have too many buckets, merge the smallest with an adjacent + * bucket. + */ + while (buckets > max_buckets) { + int last_bucket = buckets - 1; + + /* merge the small bucket with an adjacent one. */ + if (small_bucket == 0) + merge_bucket = 1; + else if (small_bucket == last_bucket) + merge_bucket = last_bucket - 1; + else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count) + merge_bucket = small_bucket - 1; + else + merge_bucket = small_bucket + 1; + + assert(abs(merge_bucket - small_bucket) <= 1); + assert(small_bucket < buckets); + assert(big_bucket < buckets); + assert(merge_bucket < buckets); + + if (merge_bucket < small_bucket) { + bucket[merge_bucket].high = bucket[small_bucket].high; + bucket[merge_bucket].count += bucket[small_bucket].count; + } else { + bucket[small_bucket].high = bucket[merge_bucket].high; + bucket[small_bucket].count += bucket[merge_bucket].count; + merge_bucket = small_bucket; + } + + assert(bucket[merge_bucket].low != bucket[merge_bucket].high); + + buckets--; + + /* Remove the merge_bucket from the list, and find the new small + * and big buckets while we're at it + */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (i > merge_bucket) bucket[i] = bucket[i + 1]; + + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + } + + *num_buckets = buckets; + return bucket[big_bucket].count; +} + +static void show_histogram(const struct hist_bucket *bucket, int buckets, + int total, int scale) { + int width1, width2; + int i; + + if (!buckets) return; + assert(bucket != NULL); + assert(buckets > 0); + + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { + case 1: + case 2: + width1 = 4; + width2 = 2; + break; + case 3: + width1 = 5; + width2 = 3; + break; + case 4: + width1 = 6; + width2 = 4; + break; + case 5: + width1 = 7; + width2 = 5; + break; + case 6: + width1 = 8; + width2 = 6; + break; + case 7: + width1 = 9; + width2 = 7; + break; + default: + width1 = 12; + width2 = 10; + break; + } + + for (i = 0; i < buckets; i++) { + int len; + int j; + float pct; + + pct = (float)(100.0 * bucket[i].count / total); + len = HIST_BAR_MAX * bucket[i].count / scale; + if (len < 1) len = 1; + assert(len <= HIST_BAR_MAX); + + if (bucket[i].low == bucket[i].high) + fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, ""); + else + fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2, + bucket[i].high); + + for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); + fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); + } +} + +void show_q_histogram(const int counts[64], int max_buckets) { + struct hist_bucket bucket[64]; + int buckets = 0; + int total = 0; + int scale; + int i; + + for (i = 0; i < 64; i++) { + if (counts[i]) { + bucket[buckets].low = bucket[buckets].high = i; + bucket[buckets].count = counts[i]; + buckets++; + total += counts[i]; + } + } + + fprintf(stderr, "\nQuantizer Selection:\n"); + scale = merge_hist_buckets(bucket, max_buckets, &buckets); + show_histogram(bucket, buckets, total, scale); +} + +void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, + int max_buckets) { + int i, scale; + int buckets = 0; + + if (hist == NULL || cfg == NULL) return; + + for (i = 0; i < RATE_BINS; i++) { + if (hist->bucket[i].low == INT_MAX) continue; + hist->bucket[buckets++] = hist->bucket[i]; + } + + fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz); + scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets); + show_histogram(hist->bucket, buckets, hist->total, scale); +} diff --git a/media/libvpx/libvpx/rate_hist.h b/media/libvpx/libvpx/rate_hist.h new file mode 100644 index 0000000000..d6a4c68519 --- /dev/null +++ b/media/libvpx/libvpx/rate_hist.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_RATE_HIST_H_ +#define VPX_RATE_HIST_H_ + +#include "vpx/vpx_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rate_hist; + +struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, + const vpx_rational_t *fps); + +void destroy_rate_histogram(struct rate_hist *hist); + +void update_rate_histogram(struct rate_hist *hist, + const vpx_codec_enc_cfg_t *cfg, + const vpx_codec_cx_pkt_t *pkt); + +void show_q_histogram(const int counts[64], int max_buckets); + +void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, + int max_buckets); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_RATE_HIST_H_ diff --git a/media/libvpx/libvpx/solution.mk b/media/libvpx/libvpx/solution.mk new file mode 100644 index 0000000000..145adc0dda --- /dev/null +++ b/media/libvpx/libvpx/solution.mk @@ -0,0 +1,31 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# libvpx reverse dependencies (targets that depend on libvpx) +VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest) +VPX_RDEPS=$(foreach vcp,\ + $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):vpx) + +vpx.sln: $(wildcard *.$(VCPROJ_SFX)) + @echo " [CREATE] $@" + $(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \ + $(if $(filter vpx.$(VCPROJ_SFX),$^),$(VPX_RDEPS)) \ + --dep=test_libvpx:gtest \ + --ver=$(CONFIG_VS_VERSION)\ + --out=$@ $^ +vpx.sln.mk: vpx.sln + @true + +PROJECTS-yes += vpx.sln vpx.sln.mk +-include vpx.sln.mk + +# Always install this file, as it is an unconditional post-build rule. +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL-SRCS-yes += $(target).mk diff --git a/media/libvpx/libvpx/test/acm_random.h b/media/libvpx/libvpx/test/acm_random.h new file mode 100644 index 0000000000..e3520c47de --- /dev/null +++ b/media/libvpx/libvpx/test/acm_random.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_ACM_RANDOM_H_ +#define VPX_TEST_ACM_RANDOM_H_ + +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "vpx/vpx_integer.h" + +namespace libvpx_test { + +class ACMRandom { + public: + ACMRandom() : random_(DeterministicSeed()) {} + + explicit ACMRandom(int seed) : random_(seed) {} + + void Reset(int seed) { random_.Reseed(seed); } + uint16_t Rand16() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + return (value >> 15) & 0xffff; + } + + int32_t Rand20Signed() { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast(value) - 524288; + } + + int16_t Rand16Signed() { + // Use 16 bits: values between 32767 and -32768. + return static_cast(random_.Generate(65536)); + } + + uint16_t Rand12() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 19) & 0xfff; + } + + uint8_t Rand8() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 23) & 0xff; + } + + uint8_t Rand8Extremes() { + // Returns a random value near 0 or near 255, to better exercise + // saturation behavior. + const uint8_t r = Rand8(); + return static_cast((r < 128) ? r << 4 : r >> 4); + } + + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + + int PseudoUniform(int range) { return random_.Generate(range); } + + int operator()(int n) { return PseudoUniform(n); } + + static int DeterministicSeed() { return 0xbaba; } + + private: + testing::internal::Random random_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_ACM_RANDOM_H_ diff --git a/media/libvpx/libvpx/test/active_map_refresh_test.cc b/media/libvpx/libvpx/test/active_map_refresh_test.cc new file mode 100644 index 0000000000..ad067346a7 --- /dev/null +++ b/media/libvpx/libvpx/test/active_map_refresh_test.cc @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { + +// Check if any pixel in a 16x16 macroblock varies between frames. +int CheckMb(const vpx_image_t ¤t, const vpx_image_t &previous, int mb_r, + int mb_c) { + for (int plane = 0; plane < 3; plane++) { + int r = 16 * mb_r; + int c0 = 16 * mb_c; + int r_top = std::min(r + 16, static_cast(current.d_h)); + int c_top = std::min(c0 + 16, static_cast(current.d_w)); + r = std::max(r, 0); + c0 = std::max(c0, 0); + if (plane > 0 && current.x_chroma_shift) { + c_top = (c_top + 1) >> 1; + c0 >>= 1; + } + if (plane > 0 && current.y_chroma_shift) { + r_top = (r_top + 1) >> 1; + r >>= 1; + } + for (; r < r_top; ++r) { + for (int c = c0; c < c_top; ++c) { + if (current.planes[plane][current.stride[plane] * r + c] != + previous.planes[plane][previous.stride[plane] * r + c]) { + return 1; + } + } + } + } + return 0; +} + +void GenerateMap(int mb_rows, int mb_cols, const vpx_image_t ¤t, + const vpx_image_t &previous, uint8_t *map) { + for (int mb_r = 0; mb_r < mb_rows; ++mb_r) { + for (int mb_c = 0; mb_c < mb_cols; ++mb_c) { + map[mb_r * mb_cols + mb_c] = CheckMb(current, previous, mb_r, mb_c); + } + } +} + +const int kAqModeCyclicRefresh = 3; + +class ActiveMapRefreshTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {} + ~ActiveMapRefreshTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + cpu_used_ = GET_PARAM(2); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + ::libvpx_test::Y4mVideoSource *y4m_video = + static_cast(video); + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh); + } else if (video->frame() >= 2 && video->img()) { + vpx_image_t *current = video->img(); + vpx_image_t *previous = y4m_holder_->img(); + ASSERT_NE(previous, nullptr); + vpx_active_map_t map = vpx_active_map_t(); + const int width = static_cast(current->d_w); + const int height = static_cast(current->d_h); + const int mb_width = (width + 15) / 16; + const int mb_height = (height + 15) / 16; + uint8_t *active_map = new uint8_t[mb_width * mb_height]; + GenerateMap(mb_height, mb_width, *current, *previous, active_map); + map.cols = mb_width; + map.rows = mb_height; + map.active_map = active_map; + encoder->Control(VP8E_SET_ACTIVEMAP, &map); + delete[] active_map; + } + if (video->img()) { + y4m_video->SwapBuffers(y4m_holder_); + } + } + + int cpu_used_; + ::libvpx_test::Y4mVideoSource *y4m_holder_; +}; + +TEST_P(ActiveMapRefreshTest, Test) { + cfg_.g_lag_in_frames = 0; + cfg_.g_profile = 1; + cfg_.rc_target_bitrate = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 30; + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.rc_end_usage = VPX_CBR; + cfg_.kf_max_dist = 90000; + + ::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, 30); + ::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 30); + video_holder.Begin(); + y4m_holder_ = &video_holder; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(ActiveMapRefreshTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 6)); +} // namespace diff --git a/media/libvpx/libvpx/test/active_map_test.cc b/media/libvpx/libvpx/test/active_map_test.cc new file mode 100644 index 0000000000..d222c00b74 --- /dev/null +++ b/media/libvpx/libvpx/test/active_map_test.cc @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class ActiveMapTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith3Params { + protected: + static const int kWidth = 208; + static const int kHeight = 144; + + ActiveMapTest() : EncoderTest(GET_PARAM(0)) {} + ~ActiveMapTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + cpu_used_ = GET_PARAM(2); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3)); + } else if (video->frame() == 3) { + vpx_active_map_t map = vpx_active_map_t(); + /* clang-format off */ + uint8_t active_map[9 * 13] = { + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, + 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, + 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, + 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, + }; + /* clang-format on */ + map.cols = (kWidth + 15) / 16; + map.rows = (kHeight + 15) / 16; + ASSERT_EQ(map.cols, 13u); + ASSERT_EQ(map.rows, 9u); + map.active_map = active_map; + encoder->Control(VP8E_SET_ACTIVEMAP, &map); + } else if (video->frame() == 15) { + vpx_active_map_t map = vpx_active_map_t(); + map.cols = (kWidth + 15) / 16; + map.rows = (kHeight + 15) / 16; + map.active_map = nullptr; + encoder->Control(VP8E_SET_ACTIVEMAP, &map); + } + } + + int cpu_used_; +}; + +TEST_P(ActiveMapTest, Test) { + // Validate that this non multiple of 64 wide clip encodes + cfg_.g_lag_in_frames = 0; + cfg_.rc_target_bitrate = 400; + cfg_.rc_resize_allowed = 0; + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.rc_end_usage = VPX_CBR; + cfg_.kf_max_dist = 90000; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30, 1, + 0, 20); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 10), ::testing::Values(0, 3)); +} // namespace diff --git a/media/libvpx/libvpx/test/add_noise_test.cc b/media/libvpx/libvpx/test/add_noise_test.cc new file mode 100644 index 0000000000..4fc4e81e63 --- /dev/null +++ b/media/libvpx/libvpx/test/add_noise_test.cc @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +static const int kNoiseSize = 3072; + +typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise, + int blackclamp, int whiteclamp, int width, + int height, int pitch); + +typedef std::tuple AddNoiseTestFPParam; + +class AddNoiseTest : public ::testing::Test, + public ::testing::WithParamInterface { + public: + void TearDown() override { libvpx_test::ClearSystemState(); } + ~AddNoiseTest() override = default; +}; + +double stddev6(char a, char b, char c, char d, char e, char f) { + const double n = (a + b + c + d + e + f) / 6.0; + const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) + + (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) / + 6.0; + return sqrt(v); +} + +TEST_P(AddNoiseTest, CheckNoiseAdded) { + const int width = 64; + const int height = 64; + const int image_size = width * height; + int8_t noise[kNoiseSize]; + const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize); + uint8_t *const s = + reinterpret_cast(vpx_calloc(image_size, sizeof(*s))); + ASSERT_NE(s, nullptr); + memset(s, 99, image_size * sizeof(*s)); + + ASM_REGISTER_STATE_CHECK( + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); + + // Check to make sure we don't end up having either the same or no added + // noise either vertically or horizontally. + for (int i = 0; i < image_size - 6 * width - 6; ++i) { + const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99, + s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99); + const double vd = stddev6(s[i] - 99, s[i + width] - 99, + s[i + 2 * width] - 99, s[i + 3 * width] - 99, + s[i + 4 * width] - 99, s[i + 5 * width] - 99); + + EXPECT_NE(hd, 0); + EXPECT_NE(vd, 0); + } + + // Initialize pixels in the image to 255 and check for roll over. + memset(s, 255, image_size); + + ASM_REGISTER_STATE_CHECK( + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); + + // Check to make sure don't roll over. + for (int i = 0; i < image_size; ++i) { + EXPECT_GT(static_cast(s[i]), clamp) << "i = " << i; + } + + // Initialize pixels in the image to 0 and check for roll under. + memset(s, 0, image_size); + + ASM_REGISTER_STATE_CHECK( + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); + + // Check to make sure don't roll under. + for (int i = 0; i < image_size; ++i) { + EXPECT_LT(static_cast(s[i]), 255 - clamp) << "i = " << i; + } + + vpx_free(s); +} + +TEST_P(AddNoiseTest, CheckCvsAssembly) { + const int width = 64; + const int height = 64; + const int image_size = width * height; + int8_t noise[kNoiseSize]; + const int clamp = vpx_setup_noise(4.4, noise, kNoiseSize); + + uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, 1)); + uint8_t *const d = reinterpret_cast(vpx_calloc(image_size, 1)); + ASSERT_NE(s, nullptr); + ASSERT_NE(d, nullptr); + + memset(s, 99, image_size); + memset(d, 99, image_size); + + srand(0); + ASM_REGISTER_STATE_CHECK( + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); + srand(0); + ASM_REGISTER_STATE_CHECK( + vpx_plane_add_noise_c(d, noise, clamp, clamp, width, height, width)); + + for (int i = 0; i < image_size; ++i) { + EXPECT_EQ(static_cast(s[i]), static_cast(d[i])) << "i = " << i; + } + + vpx_free(d); + vpx_free(s); +} + +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c), + make_tuple(4.4, vpx_plane_add_noise_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2), + make_tuple(4.4, vpx_plane_add_noise_sse2))); +#endif + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa), + make_tuple(4.4, vpx_plane_add_noise_msa))); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc new file mode 100644 index 0000000000..3b1a26ed16 --- /dev/null +++ b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class AltRefAqSegmentTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {} + ~AltRefAqSegmentTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + aq_mode_ = 0; + alt_ref_aq_mode_ = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + int set_cpu_used_; + int aq_mode_; + int alt_ref_aq_mode_; +}; + +// Validate that this ALT_REF_AQ/AQ segmentation mode +// (ALT_REF_AQ=0, AQ=0/no_aq) +// encodes and decodes without a mismatch. +TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ0) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 0; + alt_ref_aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this ALT_REF_AQ/AQ segmentation mode +// (ALT_REF_AQ=0, AQ=1/variance_aq) +// encodes and decodes without a mismatch. +TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ1) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 1; + alt_ref_aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this ALT_REF_AQ/AQ segmentation mode +// (ALT_REF_AQ=0, AQ=2/complexity_aq) +// encodes and decodes without a mismatch. +TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ2) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 2; + alt_ref_aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this ALT_REF_AQ/AQ segmentation mode +// (ALT_REF_AQ=0, AQ=3/cyclicrefresh_aq) +// encodes and decodes without a mismatch. +TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ3) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 3; + alt_ref_aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this ALT_REF_AQ/AQ segmentation mode +// (ALT_REF_AQ=0, AQ=4/equator360_aq) +// encodes and decodes without a mismatch. +TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ4) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 4; + alt_ref_aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(AltRefAqSegmentTest, + ::testing::Values(::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood), + ::testing::Range(2, 5)); +} // namespace diff --git a/media/libvpx/libvpx/test/altref_test.cc b/media/libvpx/libvpx/test/altref_test.cc new file mode 100644 index 0000000000..903230fde9 --- /dev/null +++ b/media/libvpx/libvpx/test/altref_test.cc @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +namespace { + +#if CONFIG_VP8_ENCODER + +// lookahead range: [kLookAheadMin, kLookAheadMax). +const int kLookAheadMin = 5; +const int kLookAheadMax = 26; + +class AltRefTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {} + ~AltRefTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_CPUUSED, 3); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_; + } + + int altref_count() const { return altref_count_; } + + private: + int altref_count_; +}; + +TEST_P(AltRefTest, MonotonicTimestamps) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 1000; + cfg_.g_lag_in_frames = GET_PARAM(1); + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GE(altref_count(), 1); +} + +VP8_INSTANTIATE_TEST_SUITE(AltRefTest, + ::testing::Range(kLookAheadMin, kLookAheadMax)); + +#endif // CONFIG_VP8_ENCODER + +class AltRefForcedKeyTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + AltRefForcedKeyTestLarge() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {} + ~AltRefForcedKeyTestLarge() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + cfg_.rc_end_usage = VPX_VBR; + cfg_.g_threads = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); +#if CONFIG_VP9_ENCODER + // override test default for tile columns if necessary. + if (GET_PARAM(0) == &libvpx_test::kVP9) { + encoder->Control(VP9E_SET_TILE_COLUMNS, 6); + } +#endif + } + frame_flags_ = + (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (frame_num_ == forced_kf_frame_num_) { + ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) + << "Frame #" << frame_num_ << " isn't a keyframe!"; + } + ++frame_num_; + } + + ::libvpx_test::TestMode encoding_mode_; + int cpu_used_; + unsigned int forced_kf_frame_num_; + unsigned int frame_num_; +}; + +TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) { + const vpx_rational timebase = { 1, 30 }; + const int lag_values[] = { 3, 15, 25, -1 }; + + forced_kf_frame_num_ = 1; + for (int i = 0; lag_values[i] != -1; ++i) { + frame_num_ = 0; + cfg_.g_lag_in_frames = lag_values[i]; + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } +} + +TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) { + const vpx_rational timebase = { 1, 30 }; + const int lag_values[] = { 3, 15, 25, -1 }; + + for (int i = 0; lag_values[i] != -1; ++i) { + frame_num_ = 0; + forced_kf_frame_num_ = lag_values[i] - 1; + cfg_.g_lag_in_frames = lag_values[i]; + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } +} + +VP8_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); + +VP9_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); +} // namespace diff --git a/media/libvpx/libvpx/test/android/Android.mk b/media/libvpx/libvpx/test/android/Android.mk new file mode 100644 index 0000000000..9a7533ebba --- /dev/null +++ b/media/libvpx/libvpx/test/android/Android.mk @@ -0,0 +1,67 @@ +# Copyright (c) 2013 The WebM project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. +# +# This make file builds vpx_test app for android. +# The test app itself runs on the command line through adb shell +# The paths are really messed up as the libvpx make file +# expects to be made from a parent directory. + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT +CUR_WD := $(call my-dir) +BINDINGS_DIR := $(CUR_WD)/../../.. +LOCAL_PATH := $(CUR_WD)/../../.. + +#libwebm +include $(CLEAR_VARS) +include $(BINDINGS_DIR)/libvpx/third_party/libwebm/Android.mk +LOCAL_PATH := $(CUR_WD)/../../.. + +#libvpx +include $(CLEAR_VARS) +LOCAL_STATIC_LIBRARIES := libwebm +include $(BINDINGS_DIR)/libvpx/build/make/Android.mk +LOCAL_PATH := $(CUR_WD)/../.. + +#libgtest +include $(CLEAR_VARS) +LOCAL_ARM_MODE := arm +LOCAL_CPP_EXTENSION := .cc +LOCAL_MODULE := gtest +LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ +LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS +include $(BUILD_STATIC_LIBRARY) + +#libvpx_test +include $(CLEAR_VARS) +LOCAL_ARM_MODE := arm +LOCAL_MODULE := libvpx_test +LOCAL_STATIC_LIBRARIES := gtest libwebm + +ifeq ($(ENABLE_SHARED),1) + LOCAL_SHARED_LIBRARIES := vpx +else + LOCAL_STATIC_LIBRARIES += vpx +endif + +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS +include $(LOCAL_PATH)/test/test.mk +LOCAL_C_INCLUDES := $(BINDINGS_DIR) +FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes))) +LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC)) +# some test files depend on *_rtcd.h, ensure they're generated first. +$(eval $(call rtcd_dep_template)) +include $(BUILD_EXECUTABLE) +endif # NDK_ROOT diff --git a/media/libvpx/libvpx/test/android/README b/media/libvpx/libvpx/test/android/README new file mode 100644 index 0000000000..0cd30779d4 --- /dev/null +++ b/media/libvpx/libvpx/test/android/README @@ -0,0 +1,33 @@ +Android.mk will build vpx unittests on android. +1) Configure libvpx from the parent directory: +./libvpx/configure --target=armv7-android-gcc --enable-external-build \ + --enable-postproc --disable-install-srcs --enable-multi-res-encoding \ + --enable-temporal-denoising --disable-unit-tests --disable-install-docs \ + --disable-examples --disable-runtime-cpu-detect + +2) From the parent directory, invoke ndk-build: +NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \ + APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \ + APP_STL=c++_static + +Note: Both adb and ndk-build are available at: + https://developer.android.com/studio#downloads + https://developer.android.com/ndk/downloads + +3) Run get_files.py to download the test files: +python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \ + -u https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx + +4) Transfer files to device using adb. Ensure you have proper permissions for +the target + +adb push /path/to/test_files /data/local/tmp +adb push /path/to/built_libs /data/local/tmp + +NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a + +5) Run tests: +adb shell +(on device) +cd /data/local/tmp +LD_LIBRARY_PATH=. ./vpx_test diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py new file mode 100644 index 0000000000..1c69740d2b --- /dev/null +++ b/media/libvpx/libvpx/test/android/get_files.py @@ -0,0 +1,118 @@ +# Copyright (c) 2013 The WebM project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. +# +# This simple script pulls test files from the webm homepage +# It is intelligent enough to only pull files if +# 1) File / test_data folder does not exist +# 2) SHA mismatch + +import pycurl +import csv +import hashlib +import re +import os.path +import time +import itertools +import sys +import getopt + +#globals +url = '' +file_list_path = '' +local_resource_path = '' + +# Helper functions: +# A simple function which returns the sha hash of a file in hex +def get_file_sha(filename): + try: + sha_hash = hashlib.sha1() + with open(filename, 'rb') as file: + buf = file.read(HASH_CHUNK) + while len(buf) > 0: + sha_hash.update(buf) + buf = file.read(HASH_CHUNK) + return sha_hash.hexdigest() + except IOError: + print "Error reading " + filename + +# Downloads a file from a url, and then checks the sha against the passed +# in sha +def download_and_check_sha(url, filename, sha): + path = os.path.join(local_resource_path, filename) + fp = open(path, "wb") + curl = pycurl.Curl() + curl.setopt(pycurl.URL, url + "/" + filename) + curl.setopt(pycurl.WRITEDATA, fp) + curl.perform() + curl.close() + fp.close() + return get_file_sha(path) == sha + +#constants +ftp_retries = 3 + +SHA_COL = 0 +NAME_COL = 1 +EXPECTED_COL = 2 +HASH_CHUNK = 65536 + +# Main script +try: + opts, args = \ + getopt.getopt(sys.argv[1:], \ + "u:i:o:", ["url=", "input_csv=", "output_dir="]) +except: + print 'get_files.py -u -i -o ' + sys.exit(2) + +for opt, arg in opts: + if opt == '-u': + url = arg + elif opt in ("-i", "--input_csv"): + file_list_path = os.path.join(arg) + elif opt in ("-o", "--output_dir"): + local_resource_path = os.path.join(arg) + +if len(sys.argv) != 7: + print "Expects two paths and a url!" + exit(1) + +if not os.path.isdir(local_resource_path): + os.makedirs(local_resource_path) + +file_list_csv = open(file_list_path, "rb") + +# Our 'csv' file uses multiple spaces as a delimiter, python's +# csv class only uses single character delimiters, so we convert them below +file_list_reader = csv.reader((re.sub(' +', ' ', line) \ + for line in file_list_csv), delimiter = ' ') + +file_shas = [] +file_names = [] + +for row in file_list_reader: + if len(row) != EXPECTED_COL: + continue + file_shas.append(row[SHA_COL]) + file_names.append(row[NAME_COL]) + +file_list_csv.close() + +# Download files, only if they don't already exist and have correct shas +for filename, sha in itertools.izip(file_names, file_shas): + path = os.path.join(local_resource_path, filename) + if os.path.isfile(path) \ + and get_file_sha(path) == sha: + print path + ' exists, skipping' + continue + for retry in range(0, ftp_retries): + print "Downloading " + path + if not download_and_check_sha(url, filename, sha): + print "Sha does not match, retrying..." + else: + break diff --git a/media/libvpx/libvpx/test/android/scrape_gtest_log.py b/media/libvpx/libvpx/test/android/scrape_gtest_log.py new file mode 100644 index 0000000000..487845c270 --- /dev/null +++ b/media/libvpx/libvpx/test/android/scrape_gtest_log.py @@ -0,0 +1,57 @@ +# Copyright (c) 2014 The WebM project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +"""Standalone script which parses a gtest log for json. + +Json is returned returns as an array. This script is used by the libvpx +waterfall to gather json results mixed in with gtest logs. This is +dubious software engineering. +""" + +import getopt +import json +import os +import re +import sys + + +def main(): + if len(sys.argv) != 3: + print "Expects a file to write json to!" + exit(1) + + try: + opts, _ = \ + getopt.getopt(sys.argv[1:], \ + 'o:', ['output-json=']) + except getopt.GetOptError: + print 'scrape_gtest_log.py -o ' + sys.exit(2) + + output_json = '' + for opt, arg in opts: + if opt in ('-o', '--output-json'): + output_json = os.path.join(arg) + + blob = sys.stdin.read() + json_string = '[' + ','.join('{' + x + '}' for x in + re.findall(r'{([^}]*.?)}', blob)) + ']' + print blob + + output = json.dumps(json.loads(json_string), indent=4, sort_keys=True) + print output + + path = os.path.dirname(output_json) + if path and not os.path.exists(path): + os.makedirs(path) + + outfile = open(output_json, 'w') + outfile.write(output) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/media/libvpx/libvpx/test/aq_segment_test.cc b/media/libvpx/libvpx/test/aq_segment_test.cc new file mode 100644 index 0000000000..955e1dafc0 --- /dev/null +++ b/media/libvpx/libvpx/test/aq_segment_test.cc @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class AqSegmentTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} + ~AqSegmentTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + aq_mode_ = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + int set_cpu_used_; + int aq_mode_; +}; + +// Validate that this AQ segmentation mode (AQ=1, variance_ap) +// encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatchAQ1) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 1; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this AQ segmentation mode (AQ=2, complexity_aq) +// encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatchAQ2) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq) +// encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + + aq_mode_ = 3; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(AqSegmentTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood), + ::testing::Range(3, 9)); +} // namespace diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc new file mode 100644 index 0000000000..ede9c0ba8c --- /dev/null +++ b/media/libvpx/libvpx/test/avg_test.cc @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; + +namespace { + +template +class AverageTestBase : public ::testing::Test { + public: + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(nullptr), + source_stride_(0), bit_depth_(8) {} + + void TearDown() override { + vpx_free(source_data_); + source_data_ = nullptr; + libvpx_test::ClearSystemState(); + } + + protected: + // Handle blocks up to 4 blocks 64x64 with stride up to 128 + static const int kDataAlignment = 16; + static const int kDataBlockSize = 64 * 128; + + void SetUp() override { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_NE(source_data_, nullptr); + source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + // Sum Pixels + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 8; ++h) { + for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; + } + return ((average + 32) >> 6); + } + + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 4; ++h) { + for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; + } + return ((average + 8) >> 4); + } + + void FillConstant(Pixel fill_constant) { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = fill_constant; + } + } + + void FillRandom() { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1); + } + } + + int width_, height_; + Pixel *source_data_; + int source_stride_; + int bit_depth_; + + ACMRandom rnd_; +}; +typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); + +typedef std::tuple AvgFunc; + +class AverageTest : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + ASM_REGISTER_STATE_CHECK( + GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_)); + unsigned int actual = + GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; + +#if CONFIG_VP9_HIGHBITDEPTH +class AverageTestHBD : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + ASM_REGISTER_STATE_CHECK(GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_)); + unsigned int actual = GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA +typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height); + +typedef std::tuple IntProRowParam; + +class IntProRowTest : public AverageTestBase, + public ::testing::WithParamInterface { + public: + IntProRowTest() + : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr), + hbuf_c_(nullptr) { + asm_func_ = GET_PARAM(1); + c_func_ = GET_PARAM(2); + } + + protected: + void SetUp() override { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_NE(source_data_, nullptr); + + hbuf_asm_ = reinterpret_cast( + vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); + hbuf_c_ = reinterpret_cast( + vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16)); + } + + void TearDown() override { + vpx_free(source_data_); + source_data_ = nullptr; + vpx_free(hbuf_c_); + hbuf_c_ = nullptr; + vpx_free(hbuf_asm_); + hbuf_asm_ = nullptr; + } + + void RunComparison() { + ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_)); + ASM_REGISTER_STATE_CHECK( + asm_func_(hbuf_asm_, source_data_, width_, height_)); + EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16)) + << "Output mismatch"; + } + + private: + IntProRowFunc asm_func_; + IntProRowFunc c_func_; + int16_t *hbuf_asm_; + int16_t *hbuf_c_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest); + +typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); + +typedef std::tuple IntProColParam; + +class IntProColTest : public AverageTestBase, + public ::testing::WithParamInterface { + public: + IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) { + asm_func_ = GET_PARAM(1); + c_func_ = GET_PARAM(2); + } + + protected: + void RunComparison() { + ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_)); + ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_)); + EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch"; + } + + private: + IntProColFunc asm_func_; + IntProColFunc c_func_; + int16_t sum_asm_; + int16_t sum_c_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest); +#endif // HAVE_NEON || HAVE_SSE2 || HAVE_MSA + +typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); +typedef std::tuple SatdTestParam; + +class SatdTest : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + void SetUp() override { + satd_size_ = GET_PARAM(0); + satd_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast( + vpx_memalign(16, sizeof(*src_) * satd_size_)); + ASSERT_NE(src_, nullptr); + } + + void TearDown() override { + libvpx_test::ClearSystemState(); + vpx_free(src_); + } + + void FillConstant(const tran_low_t val) { + for (int i = 0; i < satd_size_; ++i) src_[i] = val; + } + + virtual void FillRandom() = 0; + + void Check(const int expected) { + int total; + ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_)); + EXPECT_EQ(expected, total); + } + + tran_low_t *GetCoeff() const { return src_; } + + int satd_size_; + ACMRandom rnd_; + tran_low_t *src_; + + private: + SatdFunc satd_func_; +}; + +class SatdLowbdTest : public SatdTest { + protected: + void FillRandom() override { + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16Signed(); + src_[i] = (tran_low_t)tmp; + } + } +}; + +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size); +typedef std::tuple BlockErrorTestFPParam; + +class BlockErrorTestFP + : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + void SetUp() override { + txfm_size_ = GET_PARAM(0); + block_error_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + coeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*coeff_) * txfm_size_)); + dqcoeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_)); + ASSERT_NE(coeff_, nullptr); + ASSERT_NE(dqcoeff_, nullptr); + } + + void TearDown() override { + libvpx_test::ClearSystemState(); + vpx_free(coeff_); + vpx_free(dqcoeff_); + } + + void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) { + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val; + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val; + } + + void FillRandom() { + // Just two fixed seeds + rnd_.Reset(0xb0b9); + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1; + rnd_.Reset(0xb0c8); + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1; + } + + void Check(const int64_t expected) { + int64_t total; + ASM_REGISTER_STATE_CHECK( + total = block_error_func_(coeff_, dqcoeff_, txfm_size_)); + EXPECT_EQ(expected, total); + } + + tran_low_t *GetCoeff() const { return coeff_; } + + tran_low_t *GetDQCoeff() const { return dqcoeff_; } + + int txfm_size_; + + private: + tran_low_t *coeff_; + tran_low_t *dqcoeff_; + BlockErrorFunc block_error_func_; + ACMRandom rnd_; +}; + +TEST_P(AverageTest, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTest, MaxValue) { + FillConstant(255); + CheckAverages(); +} + +TEST_P(AverageTest, Random) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#if CONFIG_VP9_HIGHBITDEPTH +TEST_P(AverageTestHBD, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTestHBD, MaxValue) { + FillConstant((1 << VPX_BITS_12) - 1); + CheckAverages(); +} + +TEST_P(AverageTestHBD, Random) { + bit_depth_ = VPX_BITS_12; + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA +TEST_P(IntProRowTest, MinValue) { + FillConstant(0); + RunComparison(); +} + +TEST_P(IntProRowTest, MaxValue) { + FillConstant(255); + RunComparison(); +} + +TEST_P(IntProRowTest, Random) { + FillRandom(); + RunComparison(); +} + +TEST_P(IntProColTest, MinValue) { + FillConstant(0); + RunComparison(); +} + +TEST_P(IntProColTest, MaxValue) { + FillConstant(255); + RunComparison(); +} + +TEST_P(IntProColTest, Random) { + FillRandom(); + RunComparison(); +} +#endif + +TEST_P(SatdLowbdTest, MinValue) { + const int kMin = -32640; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdLowbdTest, MaxValue) { + const int kMax = 32640; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdLowbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: expected = 261036; break; + case 64: expected = 991732; break; + case 256: expected = 4136358; break; + case 1024: expected = 16677592; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdLowbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +#if CONFIG_VP9_HIGHBITDEPTH +class SatdHighbdTest : public SatdTest { + protected: + void FillRandom() override { + for (int i = 0; i < satd_size_; ++i) { + src_[i] = rnd_.Rand20Signed(); + } + } +}; + +TEST_P(SatdHighbdTest, MinValue) { + const int kMin = -524280; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdHighbdTest, MaxValue) { + const int kMax = 524280; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdHighbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: expected = 5249712; break; + case 64: expected = 18362120; break; + case 256: expected = 66100520; break; + case 1024: expected = 266094734; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdHighbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +TEST_P(BlockErrorTestFP, MinValue) { + const int64_t kMin = -32640; + const int64_t expected = kMin * kMin * txfm_size_; + FillConstant(kMin, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, MaxValue) { + const int64_t kMax = 32640; + const int64_t expected = kMax * kMax * txfm_size_; + FillConstant(kMax, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, Random) { + int64_t expected; + switch (txfm_size_) { + case 16: expected = 2051681432; break; + case 64: expected = 11075114379; break; + case 256: expected = 44386271116; break; + case 1024: expected = 184774996089; break; + default: + FAIL() << "Invalid satd size (" << txfm_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + tran_low_t *dqcoeff = GetDQCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, AverageTest, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon))); +#endif // HAVE_NEON + +INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P(C, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); + +INSTANTIATE_TEST_SUITE_P( + C, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_c), + make_tuple(64, &vp9_block_error_fp_c), + make_tuple(256, &vp9_block_error_fp_c), + make_tuple(1024, &vp9_block_error_fp_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AverageTest, + ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2), + make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2), + make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2), + make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2), + make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2), + make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_sse2, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_sse2, + &vpx_int_pro_col_c))); + +INSTANTIATE_TEST_SUITE_P(SSE2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_sse2), + make_tuple(64, &vpx_satd_sse2), + make_tuple(256, &vpx_satd_sse2), + make_tuple(1024, &vpx_satd_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2), + make_tuple(64, &vp9_block_error_fp_sse2), + make_tuple(256, &vp9_block_error_fp_sse2), + make_tuple(1024, &vp9_block_error_fp_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), + make_tuple(64, &vpx_highbd_satd_avx2), + make_tuple(256, &vpx_highbd_satd_avx2), + make_tuple(1024, &vpx_highbd_satd_avx2))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + AVX2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTest, + ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon), + make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon), + make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), + make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_neon, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + NEON, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_neon, + &vpx_int_pro_col_c))); + +INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_neon), + make_tuple(64, &vpx_satd_neon), + make_tuple(256, &vpx_satd_neon), + make_tuple(1024, &vpx_satd_neon))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon), + make_tuple(64, &vpx_highbd_satd_neon), + make_tuple(256, &vpx_highbd_satd_neon), + make_tuple(1024, &vpx_highbd_satd_neon))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + NEON, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), + make_tuple(64, &vp9_block_error_fp_neon), + make_tuple(256, &vp9_block_error_fp_neon), + make_tuple(1024, &vp9_block_error_fp_neon))); +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, AverageTest, + ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa), + make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa), + make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa), + make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa), + make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa), + make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_msa, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + MSA, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_msa, + &vpx_int_pro_col_c))); + +// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(MSA, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_msa), + make_tuple(64, &vpx_satd_msa), + make_tuple(256, &vpx_satd_msa), + make_tuple(1024, &vpx_satd_msa))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA + +} // namespace diff --git a/media/libvpx/libvpx/test/bench.cc b/media/libvpx/libvpx/test/bench.cc new file mode 100644 index 0000000000..4b883d8250 --- /dev/null +++ b/media/libvpx/libvpx/test/bench.cc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::RunNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + Run(); + } + vpx_usec_timer_mark(&timer); + times_[r] = static_cast(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::PrintMedian(const char *title) { + std::sort(times_, times_ + VPX_BENCH_ROBUST_ITER); + const int med = times_[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times_[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/media/libvpx/libvpx/test/bench.h b/media/libvpx/libvpx/test/bench.h new file mode 100644 index 0000000000..203e4d247e --- /dev/null +++ b/media/libvpx/libvpx/test/bench.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BENCH_H_ +#define VPX_TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + virtual ~AbstractBench() = default; + + void RunNTimes(int n); + void PrintMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. + virtual void Run() = 0; + + private: + int times_[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // VPX_TEST_BENCH_H_ diff --git a/media/libvpx/libvpx/test/blockiness_test.cc b/media/libvpx/libvpx/test/blockiness_test.cc new file mode 100644 index 0000000000..5a45bc0b7f --- /dev/null +++ b/media/libvpx/libvpx/test/blockiness_test.cc @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#if CONFIG_VP9_ENCODER +#include "./vp9_rtcd.h" +#endif + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_blockiness.h" + +using libvpx_test::ACMRandom; + +namespace { +class BlockinessTestBase : public ::testing::Test { + public: + BlockinessTestBase(int width, int height) : width_(width), height_(height) {} + + static void SetUpTestSuite() { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + reference_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + } + + static void TearDownTestSuite() { + vpx_free(source_data_); + source_data_ = nullptr; + vpx_free(reference_data_); + reference_data_ = nullptr; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + // Handle frames up to 640x480 + static const int kDataAlignment = 16; + static const int kDataBufferSize = 640 * 480; + + void SetUp() override { + source_stride_ = (width_ + 31) & ~31; + reference_stride_ = width_ * 2; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + void FillConstant(uint8_t *data, int stride, uint8_t fill_constant, int width, + int height) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data[h * stride + w] = fill_constant; + } + } + } + + void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) { + FillConstant(data, stride, fill_constant, width_, height_); + } + + void FillRandom(uint8_t *data, int stride, int width, int height) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data[h * stride + w] = rnd_.Rand8(); + } + } + } + + void FillRandom(uint8_t *data, int stride) { + FillRandom(data, stride, width_, height_); + } + + void FillRandomBlocky(uint8_t *data, int stride) { + for (int h = 0; h < height_; h += 4) { + for (int w = 0; w < width_; w += 4) { + FillRandom(data + h * stride + w, stride, 4, 4); + } + } + } + + void FillCheckerboard(uint8_t *data, int stride) { + for (int h = 0; h < height_; h += 4) { + for (int w = 0; w < width_; w += 4) { + if (((h / 4) ^ (w / 4)) & 1) { + FillConstant(data + h * stride + w, stride, 255, 4, 4); + } else { + FillConstant(data + h * stride + w, stride, 0, 4, 4); + } + } + } + } + + void Blur(uint8_t *data, int stride, int taps) { + int sum = 0; + int half_taps = taps / 2; + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < taps; ++w) { + sum += data[w + h * stride]; + } + for (int w = taps; w < width_; ++w) { + sum += data[w + h * stride] - data[w - taps + h * stride]; + data[w - half_taps + h * stride] = (sum + half_taps) / taps; + } + } + for (int w = 0; w < width_; ++w) { + for (int h = 0; h < taps; ++h) { + sum += data[h + w * stride]; + } + for (int h = taps; h < height_; ++h) { + sum += data[w + h * stride] - data[(h - taps) * stride + w]; + data[(h - half_taps) * stride + w] = (sum + half_taps) / taps; + } + } + } + int width_, height_; + static uint8_t *source_data_; + int source_stride_; + static uint8_t *reference_data_; + int reference_stride_; + + ACMRandom rnd_; +}; + +#if CONFIG_VP9_ENCODER +typedef std::tuple BlockinessParam; +class BlockinessVP9Test + : public BlockinessTestBase, + public ::testing::WithParamInterface { + public: + BlockinessVP9Test() : BlockinessTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + double GetBlockiness() const { + return vp9_get_blockiness(source_data_, source_stride_, reference_data_, + reference_stride_, width_, height_); + } +}; +#endif // CONFIG_VP9_ENCODER + +uint8_t *BlockinessTestBase::source_data_ = nullptr; +uint8_t *BlockinessTestBase::reference_data_ = nullptr; + +#if CONFIG_VP9_ENCODER +TEST_P(BlockinessVP9Test, SourceBlockierThanReference) { + // Source is blockier than reference. + FillRandomBlocky(source_data_, source_stride_); + FillConstant(reference_data_, reference_stride_, 128); + const double super_blocky = GetBlockiness(); + + EXPECT_DOUBLE_EQ(0.0, super_blocky) + << "Blocky source should produce 0 blockiness."; +} + +TEST_P(BlockinessVP9Test, ReferenceBlockierThanSource) { + // Source is blockier than reference. + FillConstant(source_data_, source_stride_, 128); + FillRandomBlocky(reference_data_, reference_stride_); + const double super_blocky = GetBlockiness(); + + EXPECT_GT(super_blocky, 0.0) + << "Blocky reference should score high for blockiness."; +} + +TEST_P(BlockinessVP9Test, BlurringDecreasesBlockiness) { + // Source is blockier than reference. + FillConstant(source_data_, source_stride_, 128); + FillRandomBlocky(reference_data_, reference_stride_); + const double super_blocky = GetBlockiness(); + + Blur(reference_data_, reference_stride_, 4); + const double less_blocky = GetBlockiness(); + + EXPECT_GT(super_blocky, less_blocky) + << "A straight blur should decrease blockiness."; +} + +TEST_P(BlockinessVP9Test, WorstCaseBlockiness) { + // Source is blockier than reference. + FillConstant(source_data_, source_stride_, 128); + FillCheckerboard(reference_data_, reference_stride_); + + const double super_blocky = GetBlockiness(); + + Blur(reference_data_, reference_stride_, 4); + const double less_blocky = GetBlockiness(); + + EXPECT_GT(super_blocky, less_blocky) + << "A straight blur should decrease blockiness."; +} +#endif // CONFIG_VP9_ENCODER + +using std::make_tuple; + +//------------------------------------------------------------------------------ +// C functions + +#if CONFIG_VP9_ENCODER +const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; +INSTANTIATE_TEST_SUITE_P(C, BlockinessVP9Test, + ::testing::ValuesIn(c_vp9_tests)); +#endif + +} // namespace diff --git a/media/libvpx/libvpx/test/borders_test.cc b/media/libvpx/libvpx/test/borders_test.cc new file mode 100644 index 0000000000..2726bd557d --- /dev/null +++ b/media/libvpx/libvpx/test/borders_test.cc @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class BordersTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + BordersTest() : EncoderTest(GET_PARAM(0)) {} + ~BordersTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 1); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { + } + } +}; + +TEST_P(BordersTest, TestEncodeHighBitrate) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 2000; + cfg_.rc_max_quantizer = 10; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +TEST_P(BordersTest, TestLowBitrate) { + // Validate that this clip encodes and decodes without a mismatch + // when passing in a very high min q. This pushes the encoder to producing + // lots of small partitions which might will test the other condition. + + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 200; + cfg_.rc_min_quantizer = 40; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(BordersTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/media/libvpx/libvpx/test/buffer.h b/media/libvpx/libvpx/test/buffer.h new file mode 100644 index 0000000000..023939cedf --- /dev/null +++ b/media/libvpx/libvpx/test/buffer.h @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BUFFER_H_ +#define VPX_TEST_BUFFER_H_ + +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace libvpx_test { + +template +class Buffer { + public: + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(nullptr) {} + + Buffer(int width, int height, int padding) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } + + T *TopLeftPixel() const; + + int stride() const { return stride_; } + + // Set the buffer (excluding padding) to 'value'. + void Set(const T value); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. + void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + + // Copy the contents of Buffer 'a' (excluding padding). + void CopyFrom(const Buffer &a); + + void DumpBuffer() const; + + // Highlight the differences between two buffers if they are the same size. + void PrintDifference(const Buffer &a) const; + + bool HasPadding() const; + + // Sets all the values in the buffer to 'padding_value'. + void SetPadding(const T padding_value); + + // Checks if all the values (excluding padding) are equal to 'value' if the + // Buffers are the same size. + bool CheckValues(const T value) const; + + // Check that padding matches the expected value or there is no padding. + bool CheckPadding() const; + + // Compare the non-padding portion of two buffers if they are the same size. + bool CheckValues(const Buffer &a) const; + + bool Init() { + if (raw_buffer_ != nullptr) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); + stride_ = left_padding_ + width_ + right_padding_; + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_NE(raw_buffer_, nullptr); + SetPadding(std::numeric_limits::max()); + return !::testing::Test::HasFailure(); + } + + private: + bool BufferSizesMatch(const Buffer &a) const; + + const int width_; + const int height_; + const int top_padding_; + const int left_padding_; + const int right_padding_; + const int bottom_padding_; + const unsigned int alignment_; + T padding_value_; + int stride_; + int raw_size_; + int num_elements_; + T *raw_buffer_; +}; + +template +T *Buffer::TopLeftPixel() const { + if (!raw_buffer_) return nullptr; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; +} + +template +void Buffer::Set(const T value) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = value; + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = (*rand_class.*rand_func)(); + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast(high) - low, + std::numeric_limits::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. + const int32_t value = + static_cast((*rand_class).RandRange(high - low)); + src[width] = static_cast(value + low); + } + src += stride_; + } +} + +template +void Buffer::CopyFrom(const Buffer &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + b_src[width] = a_src[width]; + } + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::DumpBuffer() const { + if (!raw_buffer_) return; + for (int height = 0; height < height_ + top_padding_ + bottom_padding_; + ++height) { + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); + } + printf("\n"); + } +} + +template +bool Buffer::HasPadding() const { + if (!raw_buffer_) return false; + return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; +} + +template +void Buffer::PrintDifference(const Buffer &a) const { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = TopLeftPixel(); + + printf("This buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", b_src[width]); + } else { + printf("%4d", b_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } + + a_src = a.TopLeftPixel(); + b_src = TopLeftPixel(); + + printf("Reference buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", a_src[width]); + } else { + printf("%4d", a_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::SetPadding(const T padding_value) { + if (!raw_buffer_) return; + padding_value_ = padding_value; + + T *src = raw_buffer_; + for (int i = 0; i < num_elements_; ++i) { + src[i] = padding_value; + } +} + +template +bool Buffer::CheckValues(const T value) const { + if (!raw_buffer_) return false; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (value != src[width]) { + return false; + } + } + src += stride_; + } + return true; +} + +template +bool Buffer::CheckPadding() const { + if (!raw_buffer_) return false; + if (!HasPadding()) return true; + + // Top padding. + T const *top = raw_buffer_; + for (int i = 0; i < stride_ * top_padding_; ++i) { + if (padding_value_ != top[i]) { + return false; + } + } + + // Left padding. + T const *left = TopLeftPixel() - left_padding_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < left_padding_; ++width) { + if (padding_value_ != left[width]) { + return false; + } + } + left += stride_; + } + + // Right padding. + T const *right = TopLeftPixel() + width_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < right_padding_; ++width) { + if (padding_value_ != right[width]) { + return false; + } + } + right += stride_; + } + + // Bottom padding + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { + if (padding_value_ != bottom[i]) { + return false; + } + } + + return true; +} + +template +bool Buffer::CheckValues(const Buffer &a) const { + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + return false; + } + } + a_src += a.stride(); + b_src += this->stride(); + } + return true; +} + +template +bool Buffer::BufferSizesMatch(const Buffer &a) const { + if (!raw_buffer_) return false; + if (a.width_ != this->width_ || a.height_ != this->height_) { + printf( + "Reference buffer of size %dx%d does not match this buffer which is " + "size %dx%d\n", + a.width_, a.height_, this->width_, this->height_); + return false; + } + + return true; +} +} // namespace libvpx_test +#endif // VPX_TEST_BUFFER_H_ diff --git a/media/libvpx/libvpx/test/byte_alignment_test.cc b/media/libvpx/libvpx/test/byte_alignment_test.cc new file mode 100644 index 0000000000..ba6fffc524 --- /dev/null +++ b/media/libvpx/libvpx/test/byte_alignment_test.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/md5_helper.h" +#include "test/util.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif + +namespace { + +#if CONFIG_WEBM_IO + +const int kLegacyByteAlignment = 0; +const int kLegacyYPlaneByteAlignment = 32; +const int kNumPlanesToCheck = 3; +const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5"; + +struct ByteAlignmentTestParam { + int byte_alignment; + vpx_codec_err_t expected_value; + bool decode_remaining; +}; + +const ByteAlignmentTestParam kBaTestParams[] = { + { kLegacyByteAlignment, VPX_CODEC_OK, true }, + { 32, VPX_CODEC_OK, true }, + { 64, VPX_CODEC_OK, true }, + { 128, VPX_CODEC_OK, true }, + { 256, VPX_CODEC_OK, true }, + { 512, VPX_CODEC_OK, true }, + { 1024, VPX_CODEC_OK, true }, + { 1, VPX_CODEC_INVALID_PARAM, false }, + { -2, VPX_CODEC_INVALID_PARAM, false }, + { 4, VPX_CODEC_INVALID_PARAM, false }, + { 16, VPX_CODEC_INVALID_PARAM, false }, + { 255, VPX_CODEC_INVALID_PARAM, false }, + { 2048, VPX_CODEC_INVALID_PARAM, false }, +}; + +// Class for testing byte alignment of reference buffers. +class ByteAlignmentTest + : public ::testing::TestWithParam { + protected: + ByteAlignmentTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} + + void SetUp() override { + video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); + ASSERT_NE(video_, nullptr); + video_->Init(); + video_->Begin(); + + const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_NE(decoder_, nullptr); + + OpenMd5File(kVP9Md5File); + } + + void TearDown() override { + if (md5_file_ != nullptr) fclose(md5_file_); + + delete decoder_; + delete video_; + } + + void SetByteAlignment(int byte_alignment, vpx_codec_err_t expected_value) { + decoder_->Control(VP9_SET_BYTE_ALIGNMENT, byte_alignment, expected_value); + } + + vpx_codec_err_t DecodeOneFrame(int byte_alignment_to_check) { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + CheckDecodedFrames(byte_alignment_to_check); + if (res == VPX_CODEC_OK) video_->Next(); + return res; + } + + vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) { + for (; video_->cxdata() != nullptr; video_->Next()) { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + if (res != VPX_CODEC_OK) return res; + CheckDecodedFrames(byte_alignment_to_check); + } + return VPX_CODEC_OK; + } + + private: + // Check if |data| is aligned to |byte_alignment_to_check|. + // |byte_alignment_to_check| must be a power of 2. + void CheckByteAlignment(const uint8_t *data, int byte_alignment_to_check) { + ASSERT_EQ(0u, reinterpret_cast(data) % byte_alignment_to_check); + } + + // Iterate through the planes of the decoded frames and check for + // alignment based off |byte_alignment_to_check|. + void CheckDecodedFrames(int byte_alignment_to_check) { + libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); + const vpx_image_t *img; + + // Get decompressed data + while ((img = dec_iter.Next()) != nullptr) { + if (byte_alignment_to_check == kLegacyByteAlignment) { + CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment); + } else { + for (int i = 0; i < kNumPlanesToCheck; ++i) { + CheckByteAlignment(img->planes[i], byte_alignment_to_check); + } + } + CheckMd5(*img); + } + } + + // TODO(fgalligan): Move the MD5 testing code into another class. + void OpenMd5File(const std::string &md5_file_name_) { + md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); + ASSERT_NE(md5_file_, nullptr) + << "MD5 file open failed. Filename: " << md5_file_name_; + } + + void CheckMd5(const vpx_image_t &img) { + ASSERT_NE(md5_file_, nullptr); + char expected_md5[33]; + char junk[128]; + + // Read correct md5 checksums. + const int res = fscanf(md5_file_, "%s %s", expected_md5, junk); + ASSERT_NE(EOF, res) << "Read md5 data failed"; + expected_md5[32] = '\0'; + + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + const char *const actual_md5 = md5_res.Get(); + + // Check md5 match. + ASSERT_STREQ(expected_md5, actual_md5) << "MD5 checksums don't match"; + } + + libvpx_test::WebMVideoSource *video_; + libvpx_test::VP9Decoder *decoder_; + FILE *md5_file_; +}; + +TEST_F(ByteAlignmentTest, SwitchByteAlignment) { + const int num_elements = 14; + const int byte_alignments[] = { 0, 32, 64, 128, 256, 512, 1024, + 0, 1024, 32, 512, 64, 256, 128 }; + + for (int i = 0; i < num_elements; ++i) { + SetByteAlignment(byte_alignments[i], VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame(byte_alignments[i])); + } + SetByteAlignment(byte_alignments[0], VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(byte_alignments[0])); +} + +TEST_P(ByteAlignmentTest, TestAlignment) { + const ByteAlignmentTestParam t = GetParam(); + SetByteAlignment(t.byte_alignment, t.expected_value); + if (t.decode_remaining) { + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment)); + } +} + +INSTANTIATE_TEST_SUITE_P(Alignments, ByteAlignmentTest, + ::testing::ValuesIn(kBaTestParams)); + +#endif // CONFIG_WEBM_IO + +} // namespace diff --git a/media/libvpx/libvpx/test/clear_system_state.h b/media/libvpx/libvpx/test/clear_system_state.h new file mode 100644 index 0000000000..ba3c0b386a --- /dev/null +++ b/media/libvpx/libvpx/test/clear_system_state.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_CLEAR_SYSTEM_STATE_H_ +#define VPX_TEST_CLEAR_SYSTEM_STATE_H_ + +#include "./vpx_config.h" +#include "vpx_ports/system_state.h" + +namespace libvpx_test { + +// Reset system to a known state. This function should be used for all non-API +// test cases. +inline void ClearSystemState() { vpx_clear_system_state(); } + +} // namespace libvpx_test +#endif // VPX_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h new file mode 100644 index 0000000000..c7e8f54847 --- /dev/null +++ b/media/libvpx/libvpx/test/codec_factory.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_CODEC_FACTORY_H_ +#define VPX_TEST_CODEC_FACTORY_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_decoder.h" +#include "vpx/vpx_encoder.h" +#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER +#include "vpx/vp8cx.h" +#endif +#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif + +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +namespace libvpx_test { + +const int kCodecFactoryParam = 0; + +class CodecFactory { + public: + CodecFactory() {} + + virtual ~CodecFactory() {} + + virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const = 0; + + virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const = 0; + + virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, + vpx_enc_deadline_t deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const = 0; + + virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const = 0; +}; + +/* Provide CodecTestWithParams classes for a variable number of parameters + * to avoid having to include a pointer to the CodecFactory in every test + * definition. + */ +template +class CodecTestWithParam + : public ::testing::TestWithParam< + std::tuple > {}; + +template +class CodecTestWith2Params + : public ::testing::TestWithParam< + std::tuple > {}; + +template +class CodecTestWith3Params + : public ::testing::TestWithParam< + std::tuple > {}; + +template +class CodecTestWith4Params + : public ::testing::TestWithParam< + std::tuple > {}; + +/* + * VP8 Codec Definitions + */ +#if CONFIG_VP8 +class VP8Decoder : public Decoder { + public: + explicit VP8Decoder(vpx_codec_dec_cfg_t cfg) : Decoder(cfg) {} + + VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag) + : Decoder(cfg, flag) {} + + protected: + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP8_DECODER + return &vpx_codec_vp8_dx_algo; +#else + return nullptr; +#endif + } +}; + +class VP8Encoder : public Encoder { + public: + VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, TwopassStatsStore *stats) + : Encoder(cfg, deadline, init_flags, stats) {} + + protected: + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP8_ENCODER + return &vpx_codec_vp8_cx_algo; +#else + return nullptr; +#endif + } +}; + +class VP8CodecFactory : public CodecFactory { + public: + VP8CodecFactory() : CodecFactory() {} + + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { + return CreateDecoder(cfg, 0); + } + + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { +#if CONFIG_VP8_DECODER + return new VP8Decoder(cfg, flags); +#else + (void)cfg; + (void)flags; + return nullptr; +#endif + } + + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { +#if CONFIG_VP8_ENCODER + return new VP8Encoder(cfg, deadline, init_flags, stats); +#else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; + return nullptr; +#endif + } + + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { +#if CONFIG_VP8_ENCODER + return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); +#else + (void)cfg; + (void)usage; + return VPX_CODEC_INCAPABLE; +#endif + } +}; + +const libvpx_test::VP8CodecFactory kVP8; + +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ + VP8, test, \ + ::testing::Combine( \ + ::testing::Values(static_cast( \ + &libvpx_test::kVP8)), \ + __VA_ARGS__)) +#else +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) +#endif // CONFIG_VP8 + +/* + * VP9 Codec Definitions + */ +#if CONFIG_VP9 +class VP9Decoder : public Decoder { + public: + explicit VP9Decoder(vpx_codec_dec_cfg_t cfg) : Decoder(cfg) {} + + VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag) + : Decoder(cfg, flag) {} + + protected: + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP9_DECODER + return &vpx_codec_vp9_dx_algo; +#else + return nullptr; +#endif + } +}; + +class VP9Encoder : public Encoder { + public: + VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, TwopassStatsStore *stats) + : Encoder(cfg, deadline, init_flags, stats) {} + + protected: + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP9_ENCODER + return &vpx_codec_vp9_cx_algo; +#else + return nullptr; +#endif + } +}; + +class VP9CodecFactory : public CodecFactory { + public: + VP9CodecFactory() : CodecFactory() {} + + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { + return CreateDecoder(cfg, 0); + } + + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { +#if CONFIG_VP9_DECODER + return new VP9Decoder(cfg, flags); +#else + (void)cfg; + (void)flags; + return nullptr; +#endif + } + + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { +#if CONFIG_VP9_ENCODER + return new VP9Encoder(cfg, deadline, init_flags, stats); +#else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; + return nullptr; +#endif + } + + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { +#if CONFIG_VP9_ENCODER + return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); +#else + (void)cfg; + (void)usage; + return VPX_CODEC_INCAPABLE; +#endif + } +}; + +const libvpx_test::VP9CodecFactory kVP9; + +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ + VP9, test, \ + ::testing::Combine( \ + ::testing::Values(static_cast( \ + &libvpx_test::kVP9)), \ + __VA_ARGS__)) +#else +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) +#endif // CONFIG_VP9 + +} // namespace libvpx_test +#endif // VPX_TEST_CODEC_FACTORY_H_ diff --git a/media/libvpx/libvpx/test/comp_avg_pred_test.cc b/media/libvpx/libvpx/test/comp_avg_pred_test.cc new file mode 100644 index 0000000000..3234cc9a25 --- /dev/null +++ b/media/libvpx/libvpx/test/comp_avg_pred_test.cc @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +template +Pixel avg_with_rounding(Pixel a, Pixel b) { + return (a + b + 1) >> 1; +} + +template +void reference_pred(const Buffer &pred, const Buffer &ref, + int width, int height, Buffer *avg) { + ASSERT_NE(avg->TopLeftPixel(), nullptr); + ASSERT_NE(pred.TopLeftPixel(), nullptr); + ASSERT_NE(ref.TopLeftPixel(), nullptr); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); + } + } +} + +using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +template +class AvgPredTest : public ::testing::TestWithParam { + public: + void SetUp() override { + avg_pred_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + void TestSizeCombinations(); + void TestCompareReferenceRandom(); + void TestSpeed(); + + protected: + AvgPredFunc avg_pred_func_; + ACMRandom rnd_; +}; + +template +void AvgPredTest::TestSizeCombinations() { + // This is called as part of the sub pixel variance. As such it must be one of + // the variance block sizes. + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + // The sse2 special-cases when ref width == stride, so make sure to test + // it. + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + // Only the reference buffer may have a stride not equal to width. + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_chk.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } + } + } +} + +template +void AvgPredTest::TestCompareReferenceRandom() { + const int width = 64; + const int height = 32; + Buffer ref = Buffer(width, height, 8); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_chk.Init()); + + for (int i = 0; i < 500; ++i) { + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } +} + +template +void AvgPredTest::TestSpeed() { + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 100000000 / (width * height); ++i) { + avg_pred_func_((uint8_t *)avg.TopLeftPixel(), + (uint8_t *)pred.TopLeftPixel(), width, height, + (uint8_t *)ref.TopLeftPixel(), ref.stride()); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n", + ref_padding, width, height, elapsed_time); + } + } + } +} + +using AvgPredTestLBD = AvgPredTest<8, uint8_t>; + +TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_vsx)); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_lsx)); +#endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h, + const uint16_t *c, int c_stride); + +template +void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride) { + fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride); +} + +using AvgPredTestHBD = AvgPredTest<12, uint16_t>; + +TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P( + C, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace diff --git a/media/libvpx/libvpx/test/config_test.cc b/media/libvpx/libvpx/test/config_test.cc new file mode 100644 index 0000000000..729b01151b --- /dev/null +++ b/media/libvpx/libvpx/test/config_test.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" + +namespace { + +class ConfigTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ConfigTest() + : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0), + frame_count_max_(0) {} + ~ConfigTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } + + void BeginPassHook(unsigned int /*pass*/) override { + frame_count_in_ = 0; + frame_count_out_ = 0; + } + + void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override { + ++frame_count_in_; + abort_ |= (frame_count_in_ >= frame_count_max_); + } + + void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override { + ++frame_count_out_; + } + + unsigned int frame_count_in_; + unsigned int frame_count_out_; + unsigned int frame_count_max_; +}; + +TEST_P(ConfigTest, LagIsDisabled) { + frame_count_max_ = 2; + cfg_.g_lag_in_frames = 15; + + libvpx_test::DummyVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + EXPECT_EQ(frame_count_in_, frame_count_out_); +} + +VP8_INSTANTIATE_TEST_SUITE(ConfigTest, ONE_PASS_TEST_MODES); +} // namespace diff --git a/media/libvpx/libvpx/test/consistency_test.cc b/media/libvpx/libvpx/test/consistency_test.cc new file mode 100644 index 0000000000..5e872e70a8 --- /dev/null +++ b/media/libvpx/libvpx/test/consistency_test.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#if CONFIG_VP9_ENCODER +#include "./vp9_rtcd.h" +#endif + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx_dsp/ssim.h" +#include "vpx_mem/vpx_mem.h" + +extern "C" double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, + uint8_t *img2, int img2_pitch, int width, + int height, Ssimv *sv2, Metrics *m, + int do_inconsistency); + +using libvpx_test::ACMRandom; + +namespace { +class ConsistencyTestBase : public ::testing::Test { + public: + ConsistencyTestBase(int width, int height) : width_(width), height_(height) {} + + static void SetUpTestSuite() { + source_data_[0] = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + reference_data_[0] = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + source_data_[1] = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + reference_data_[1] = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + ssim_array_ = new Ssimv[kDataBufferSize / 16]; + } + + static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); } + static void TearDownTestSuite() { + vpx_free(source_data_[0]); + source_data_[0] = nullptr; + vpx_free(reference_data_[0]); + reference_data_[0] = nullptr; + vpx_free(source_data_[1]); + source_data_[1] = nullptr; + vpx_free(reference_data_[1]); + reference_data_[1] = nullptr; + + delete[] ssim_array_; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + // Handle frames up to 640x480 + static const int kDataAlignment = 16; + static const int kDataBufferSize = 640 * 480; + + void SetUp() override { + source_stride_ = (width_ + 31) & ~31; + reference_stride_ = width_ * 2; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + void FillRandom(uint8_t *data, int stride, int width, int height) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data[h * stride + w] = rnd_.Rand8(); + } + } + } + + void FillRandom(uint8_t *data, int stride) { + FillRandom(data, stride, width_, height_); + } + + void Copy(uint8_t *reference, uint8_t *source) { + memcpy(reference, source, kDataBufferSize); + } + + void Blur(uint8_t *data, int stride, int taps) { + int sum = 0; + int half_taps = taps / 2; + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < taps; ++w) { + sum += data[w + h * stride]; + } + for (int w = taps; w < width_; ++w) { + sum += data[w + h * stride] - data[w - taps + h * stride]; + data[w - half_taps + h * stride] = (sum + half_taps) / taps; + } + } + for (int w = 0; w < width_; ++w) { + for (int h = 0; h < taps; ++h) { + sum += data[h + w * stride]; + } + for (int h = taps; h < height_; ++h) { + sum += data[w + h * stride] - data[(h - taps) * stride + w]; + data[(h - half_taps) * stride + w] = (sum + half_taps) / taps; + } + } + } + int width_, height_; + static uint8_t *source_data_[2]; + int source_stride_; + static uint8_t *reference_data_[2]; + int reference_stride_; + static Ssimv *ssim_array_; + Metrics metrics_; + + ACMRandom rnd_; +}; + +#if CONFIG_VP9_ENCODER +typedef std::tuple ConsistencyParam; +class ConsistencyVP9Test + : public ConsistencyTestBase, + public ::testing::WithParamInterface { + public: + ConsistencyVP9Test() : ConsistencyTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + double CheckConsistency(int frame) { + EXPECT_LT(frame, 2) << "Frame to check has to be less than 2."; + return vpx_get_ssim_metrics(source_data_[frame], source_stride_, + reference_data_[frame], reference_stride_, + width_, height_, ssim_array_, &metrics_, 1); + } +}; +#endif // CONFIG_VP9_ENCODER + +uint8_t *ConsistencyTestBase::source_data_[2] = { nullptr, nullptr }; +uint8_t *ConsistencyTestBase::reference_data_[2] = { nullptr, nullptr }; +Ssimv *ConsistencyTestBase::ssim_array_ = nullptr; + +#if CONFIG_VP9_ENCODER +TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { + FillRandom(source_data_[0], source_stride_); + Copy(source_data_[1], source_data_[0]); + Copy(reference_data_[0], source_data_[0]); + Blur(reference_data_[0], reference_stride_, 3); + Copy(reference_data_[1], source_data_[0]); + Blur(reference_data_[1], reference_stride_, 3); + + double inconsistency = CheckConsistency(1); + inconsistency = CheckConsistency(0); + EXPECT_EQ(inconsistency, 0.0) + << "Should have 0 inconsistency if they are exactly the same."; + + // If sources are not consistent reference frames inconsistency should + // be less than if the source is consistent. + FillRandom(source_data_[0], source_stride_); + FillRandom(source_data_[1], source_stride_); + FillRandom(reference_data_[0], reference_stride_); + FillRandom(reference_data_[1], reference_stride_); + CheckConsistency(0); + inconsistency = CheckConsistency(1); + + Copy(source_data_[1], source_data_[0]); + CheckConsistency(0); + double inconsistency2 = CheckConsistency(1); + EXPECT_LT(inconsistency, inconsistency2) + << "Should have less inconsistency if source itself is inconsistent."; + + // Less of a blur should be less inconsistent than more blur coming off a + // a frame with no blur. + ClearSsim(); + FillRandom(source_data_[0], source_stride_); + Copy(source_data_[1], source_data_[0]); + Copy(reference_data_[0], source_data_[0]); + Copy(reference_data_[1], source_data_[0]); + Blur(reference_data_[1], reference_stride_, 4); + CheckConsistency(0); + inconsistency = CheckConsistency(1); + ClearSsim(); + Copy(reference_data_[1], source_data_[0]); + Blur(reference_data_[1], reference_stride_, 8); + CheckConsistency(0); + inconsistency2 = CheckConsistency(1); + + EXPECT_LT(inconsistency, inconsistency2) + << "Stronger Blur should produce more inconsistency."; +} +#endif // CONFIG_VP9_ENCODER + +using std::make_tuple; + +//------------------------------------------------------------------------------ +// C functions + +#if CONFIG_VP9_ENCODER +const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; +INSTANTIATE_TEST_SUITE_P(C, ConsistencyVP9Test, + ::testing::ValuesIn(c_vp9_tests)); +#endif + +} // namespace diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc new file mode 100644 index 0000000000..ffd5c41c63 --- /dev/null +++ b/media/libvpx/libvpx/test/convolve_test.cc @@ -0,0 +1,1518 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +static const unsigned int kMaxDimension = 64; + +typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); + +typedef void (*WrapperFilterBlock2d8Func)( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd); + +struct ConvolveFunctions { + ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8, + ConvolveFunc h8_avg, ConvolveFunc v8, ConvolveFunc v8_avg, + ConvolveFunc hv8, ConvolveFunc hv8_avg, ConvolveFunc sh8, + ConvolveFunc sh8_avg, ConvolveFunc sv8, + ConvolveFunc sv8_avg, ConvolveFunc shv8, + ConvolveFunc shv8_avg, int bd) + : use_highbd_(bd) { + copy_[0] = copy; + copy_[1] = avg; + h8_[0] = h8; + h8_[1] = h8_avg; + v8_[0] = v8; + v8_[1] = v8_avg; + hv8_[0] = hv8; + hv8_[1] = hv8_avg; + sh8_[0] = sh8; + sh8_[1] = sh8_avg; + sv8_[0] = sv8; + sv8_[1] = sv8_avg; + shv8_[0] = shv8; + shv8_[1] = shv8_avg; + } + + ConvolveFunc copy_[2]; + ConvolveFunc h8_[2]; + ConvolveFunc v8_[2]; + ConvolveFunc hv8_[2]; + ConvolveFunc sh8_[2]; // scaled horiz + ConvolveFunc sv8_[2]; // scaled vert + ConvolveFunc shv8_[2]; // scaled horiz/vert + int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. +}; + +typedef std::tuple ConvolveParam; + +#define ALL_SIZES(convolve_fn) \ + make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \ + make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn), \ + make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn), \ + make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \ + make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \ + make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \ + make_tuple(64, 64, &convolve_fn) + +// Reference 8-tap subpixel filter, slightly modified to fit into this test. +#define VP9_FILTER_WEIGHT 128 +#define VP9_FILTER_SHIFT 7 +uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; } + +void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint8_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, unsigned int output_height) { + // Between passes, we use an intermediate buffer whose height is extended to + // have enough horizontally filtered values as input for the vertical pass. + // This buffer is allocated to be big enough for the largest block type we + // support. + const int kInterp_Extend = 4; + const unsigned int intermediate_height = + (kInterp_Extend - 1) + output_height + kInterp_Extend; + unsigned int i, j; + + // Size of intermediate_buffer is max_intermediate_height * filter_max_width, + // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height + // + kInterp_Extend + // = 3 + 16 + 4 + // = 23 + // and filter_max_width = 16 + // + uint8_t intermediate_buffer[71 * kMaxDimension]; + vp9_zero(intermediate_buffer); + const int intermediate_next_stride = + 1 - static_cast(intermediate_height * output_width); + + // Horizontal pass (src -> transposed intermediate). + uint8_t *output_ptr = intermediate_buffer; + const int src_next_row_stride = src_stride - output_width; + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); + ++src_ptr; + output_ptr += intermediate_height; + } + src_ptr += src_next_row_stride; + output_ptr += intermediate_next_stride; + } + + // Vertical pass (transposed intermediate -> dst). + src_ptr = intermediate_buffer; + const int dst_next_row_stride = dst_stride - output_width; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); + src_ptr += intermediate_height; + } + src_ptr += intermediate_next_stride; + dst_ptr += dst_next_row_stride; + } +} + +void block2d_average_c(uint8_t *src, unsigned int src_stride, + uint8_t *output_ptr, unsigned int output_stride, + unsigned int output_width, unsigned int output_height) { + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; + } + output_ptr += output_stride; + } +} + +void filter_average_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint8_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height) { + uint8_t tmp[kMaxDimension * kMaxDimension]; + + assert(output_width <= kMaxDimension); + assert(output_height <= kMaxDimension); + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, + output_width, output_height); + block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_filter_block2d_8_c(const uint16_t *src_ptr, + const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint16_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, int bd) { + // Between passes, we use an intermediate buffer whose height is extended to + // have enough horizontally filtered values as input for the vertical pass. + // This buffer is allocated to be big enough for the largest block type we + // support. + const int kInterp_Extend = 4; + const unsigned int intermediate_height = + (kInterp_Extend - 1) + output_height + kInterp_Extend; + + /* Size of intermediate_buffer is max_intermediate_height * filter_max_width, + * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height + * + kInterp_Extend + * = 3 + 16 + 4 + * = 23 + * and filter_max_width = 16 + */ + uint16_t intermediate_buffer[71 * kMaxDimension]; + const int intermediate_next_stride = + 1 - static_cast(intermediate_height * output_width); + + vp9_zero(intermediate_buffer); + + // Horizontal pass (src -> transposed intermediate). + { + uint16_t *output_ptr = intermediate_buffer; + const int src_next_row_stride = src_stride - output_width; + unsigned int i, j; + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + *output_ptr = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd); + ++src_ptr; + output_ptr += intermediate_height; + } + src_ptr += src_next_row_stride; + output_ptr += intermediate_next_stride; + } + } + + // Vertical pass (transposed intermediate -> dst). + { + src_ptr = intermediate_buffer; + const int dst_next_row_stride = dst_stride - output_width; + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + *dst_ptr++ = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd); + src_ptr += intermediate_height; + } + src_ptr += intermediate_next_stride; + dst_ptr += dst_next_row_stride; + } + } +} + +void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride, + uint16_t *output_ptr, unsigned int output_stride, + unsigned int output_width, + unsigned int output_height) { + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; + } + output_ptr += output_stride; + } +} + +void highbd_filter_average_block2d_8_c( + const uint16_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint16_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int bd) { + uint16_t tmp[kMaxDimension * kMaxDimension]; + + assert(output_width <= kMaxDimension); + assert(output_height <= kMaxDimension); + highbd_filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, + output_width, output_height, bd); + highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, + output_height); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void wrapper_filter_average_block2d_8_c( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, + hfilter, vfilter, + CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); +#endif +} + +void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint8_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, + vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, + output_width, output_height); +#endif +} + +class ConvolveTest : public ::testing::TestWithParam { + public: + static void SetUpTestSuite() { + // Force input_ to be unaligned, output to be 16 byte aligned. + input_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + + 1; + output_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kOutputBufferSize)); + output_ref_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kOutputBufferSize)); +#if CONFIG_VP9_HIGHBITDEPTH + input16_ = reinterpret_cast(vpx_memalign( + kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) + + 1; + output16_ = reinterpret_cast( + vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t))); + output16_ref_ = reinterpret_cast( + vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t))); +#endif + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + static void TearDownTestSuite() { + vpx_free(input_ - 1); + input_ = nullptr; + vpx_free(output_); + output_ = nullptr; + vpx_free(output_ref_); + output_ref_ = nullptr; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_free(input16_ - 1); + input16_ = nullptr; + vpx_free(output16_); + output16_ = nullptr; + vpx_free(output16_ref_); + output16_ref_ = nullptr; +#endif + } + + protected: + static const int kDataAlignment = 16; + static const int kOuterBlockSize = 256; + static const int kInputStride = kOuterBlockSize; + static const int kOutputStride = kOuterBlockSize; + static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize; + static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize; + + int Width() const { return GET_PARAM(0); } + int Height() const { return GET_PARAM(1); } + int BorderLeft() const { + const int center = (kOuterBlockSize - Width()) / 2; + return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1); + } + int BorderTop() const { return (kOuterBlockSize - Height()) / 2; } + + bool IsIndexInBorder(int i) { + return (i < BorderTop() * kOuterBlockSize || + i >= (BorderTop() + Height()) * kOuterBlockSize || + i % kOuterBlockSize < BorderLeft() || + i % kOuterBlockSize >= (BorderLeft() + Width())); + } + + void SetUp() override { + UUT_ = GET_PARAM(2); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ != 0) { + mask_ = (1 << UUT_->use_highbd_) - 1; + } else { + mask_ = 255; + } +#endif + /* Set up guard blocks for an inner block centered in the outer block */ + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + output_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = mask_; +#endif + } else { + output_[i] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = 0; +#endif + } + } + + ::libvpx_test::ACMRandom prng; + for (int i = 0; i < kInputBufferSize; ++i) { + if (i & 1) { + input_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = mask_; +#endif + } else { + input_[i] = prng.Rand8Extremes(); +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = prng.Rand16() & mask_; +#endif + } + } + } + + void SetConstantInput(int value) { + memset(input_, value, kInputBufferSize); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_memset16(input16_, value, kInputBufferSize); +#endif + } + + void CopyOutputToRef() { + memcpy(output_ref_, output_, kOutputBufferSize); +#if CONFIG_VP9_HIGHBITDEPTH + memcpy(output16_ref_, output16_, + kOutputBufferSize * sizeof(output16_ref_[0])); +#endif + } + + void CheckGuardBlocks() { + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } + } + } + + uint8_t *input() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return input_ + offset; + } else { + return CAST_TO_BYTEPTR(input16_ + offset); + } +#else + return input_ + offset; +#endif + } + + uint8_t *output() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return output_ + offset; + } else { + return CAST_TO_BYTEPTR(output16_ + offset); + } +#else + return output_ + offset; +#endif + } + + uint8_t *output_ref() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return output_ref_ + offset; + } else { + return CAST_TO_BYTEPTR(output16_ref_ + offset); + } +#else + return output_ref_ + offset; +#endif + } + + uint16_t lookup(uint8_t *list, int index) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return list[index]; + } else { + return CAST_TO_SHORTPTR(list)[index]; + } +#else + return list[index]; +#endif + } + + void assign_val(uint8_t *list, int index, uint16_t val) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + list[index] = (uint8_t)val; + } else { + CAST_TO_SHORTPTR(list)[index] = val; + } +#else + list[index] = (uint8_t)val; +#endif + } + + const ConvolveFunctions *UUT_; + static uint8_t *input_; + static uint8_t *output_; + static uint8_t *output_ref_; +#if CONFIG_VP9_HIGHBITDEPTH + static uint16_t *input16_; + static uint16_t *output16_; + static uint16_t *output16_ref_; + int mask_; +#endif +}; + +uint8_t *ConvolveTest::input_ = nullptr; +uint8_t *ConvolveTest::output_ = nullptr; +uint8_t *ConvolveTest::output_ref_ = nullptr; +#if CONFIG_VP9_HIGHBITDEPTH +uint16_t *ConvolveTest::input16_ = nullptr; +uint16_t *ConvolveTest::output16_ = nullptr; +uint16_t *ConvolveTest::output16_ref_ = nullptr; +#endif + +TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } + +TEST_P(ConvolveTest, DISABLED_Copy_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_copy_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[1](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, Copy) { + uint8_t *const in = input(); + uint8_t *const out = output(); + + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, + nullptr, 0, 0, 0, 0, Width(), + Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) + << "(" << x << "," << y << ")"; + } +} + +TEST_P(ConvolveTest, Avg) { + uint8_t *const in = input(); + uint8_t *const out = output(); + uint8_t *const out_ref = output_ref(); + CopyOutputToRef(); + + ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, + nullptr, 0, 0, 0, 0, Width(), + Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) + + lookup(out_ref, y * kOutputStride + x), + 1)) + << "(" << x << "," << y << ")"; + } +} + +TEST_P(ConvolveTest, CopyHoriz) { + uint8_t *const in = input(); + uint8_t *const out = output(); + + ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) + << "(" << x << "," << y << ")"; + } +} + +TEST_P(ConvolveTest, CopyVert) { + uint8_t *const in = input(); + uint8_t *const out = output(); + + ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) + << "(" << x << "," << y << ")"; + } +} + +TEST_P(ConvolveTest, Copy2D) { + uint8_t *const in = input(); + uint8_t *const out = output(); + + ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) + << "(" << x << "," << y << ")"; + } +} + +const int kNumFilterBanks = 5; +const int kNumFilters = 16; + +TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + vp9_filter_kernels[static_cast(filter_bank)]; + for (int i = 0; i < kNumFilters; i++) { + const int p0 = filters[i][0] + filters[i][1]; + const int p1 = filters[i][2] + filters[i][3]; + const int p2 = filters[i][4] + filters[i][5]; + const int p3 = filters[i][6] + filters[i][7]; + EXPECT_LE(p0, 128); + EXPECT_LE(p1, 128); + EXPECT_LE(p2, 128); + EXPECT_LE(p3, 128); + EXPECT_LE(p0 + p3, 128); + EXPECT_LE(p0 + p3 + p1, 128); + EXPECT_LE(p0 + p3 + p1 + p2, 128); + EXPECT_EQ(p0 + p1 + p2 + p3, 128); + } + } +} + +const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { + wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c +}; + +TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { + for (int i = 0; i < 2; ++i) { + uint8_t *const in = input(); + uint8_t *const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } +#else + uint8_t ref[kOutputStride * kMaxDimension]; +#endif + + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif + + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } + } + + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + vp9_filter_kernels[static_cast(filter_bank)]; + + for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { + for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { + wrapper_filter_block2d_8[i](in, kInputStride, filters[filter_x], + filters[filter_y], ref, kOutputStride, + Width(), Height(), UUT_->use_highbd_); + + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); + else + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[i](in, kInputStride, out, kOutputStride, nullptr, 0, + 0, 0, 0, Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" << filter_bank << "," << filter_x << "," + << filter_y << ")"; + } + } + } + } + } +} + +TEST_P(ConvolveTest, FilterExtremes) { + uint8_t *const in = input(); + uint8_t *const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } +#else + uint8_t ref[kOutputStride * kMaxDimension]; +#endif + + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } + } + + for (int axis = 0; axis < 2; axis++) { + int seed_val = 0; + while (seed_val < 256) { + for (int y = 0; y < 8; ++y) { + for (int x = 0; x < 8; ++x) { +#if CONFIG_VP9_HIGHBITDEPTH + assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1, + ((seed_val >> (axis ? y : x)) & 1) * mask_); +#else + assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1, + ((seed_val >> (axis ? y : x)) & 1) * 255); +#endif + if (axis) seed_val++; + } + if (axis) { + seed_val -= 8; + } else { + seed_val++; + } + } + if (axis) seed_val += 8; + + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + vp9_filter_kernels[static_cast(filter_bank)]; + for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { + for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { + wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], + filters[filter_y], ref, kOutputStride, + Width(), Height(), UUT_->use_highbd_); + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); + else + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, + 0, 0, 0, 0, Width(), Height())); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" << filter_bank << "," << filter_x << "," + << filter_y << ")"; + } + } + } + } + } + } +} + +/* This test exercises that enough rows and columns are filtered with every + possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + +TEST_P(ConvolveTest, CheckScalingFiltering) { + uint8_t *const in = input(); + uint8_t *const out = output(); + uint8_t ref[kOutputStride * kMaxDimension]; + + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } + + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. */ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } + } + } + } + } +} +#endif + +using std::make_tuple; + +#if CONFIG_VP9_HIGHBITDEPTH +#define WRAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast(src), src_stride, \ + reinterpret_cast(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ + } + +#if HAVE_SSE2 && VPX_ARCH_X86_64 +WRAP(convolve_copy_sse2, 8) +WRAP(convolve_avg_sse2, 8) +WRAP(convolve_copy_sse2, 10) +WRAP(convolve_avg_sse2, 10) +WRAP(convolve_copy_sse2, 12) +WRAP(convolve_avg_sse2, 12) +WRAP(convolve8_horiz_sse2, 8) +WRAP(convolve8_avg_horiz_sse2, 8) +WRAP(convolve8_vert_sse2, 8) +WRAP(convolve8_avg_vert_sse2, 8) +WRAP(convolve8_sse2, 8) +WRAP(convolve8_avg_sse2, 8) +WRAP(convolve8_horiz_sse2, 10) +WRAP(convolve8_avg_horiz_sse2, 10) +WRAP(convolve8_vert_sse2, 10) +WRAP(convolve8_avg_vert_sse2, 10) +WRAP(convolve8_sse2, 10) +WRAP(convolve8_avg_sse2, 10) +WRAP(convolve8_horiz_sse2, 12) +WRAP(convolve8_avg_horiz_sse2, 12) +WRAP(convolve8_vert_sse2, 12) +WRAP(convolve8_avg_vert_sse2, 12) +WRAP(convolve8_sse2, 12) +WRAP(convolve8_avg_sse2, 12) +#endif // HAVE_SSE2 && VPX_ARCH_X86_64 + +#if HAVE_AVX2 +WRAP(convolve_copy_avx2, 8) +WRAP(convolve_avg_avx2, 8) +WRAP(convolve8_horiz_avx2, 8) +WRAP(convolve8_avg_horiz_avx2, 8) +WRAP(convolve8_vert_avx2, 8) +WRAP(convolve8_avg_vert_avx2, 8) +WRAP(convolve8_avx2, 8) +WRAP(convolve8_avg_avx2, 8) + +WRAP(convolve_copy_avx2, 10) +WRAP(convolve_avg_avx2, 10) +WRAP(convolve8_avx2, 10) +WRAP(convolve8_horiz_avx2, 10) +WRAP(convolve8_vert_avx2, 10) +WRAP(convolve8_avg_avx2, 10) +WRAP(convolve8_avg_horiz_avx2, 10) +WRAP(convolve8_avg_vert_avx2, 10) + +WRAP(convolve_copy_avx2, 12) +WRAP(convolve_avg_avx2, 12) +WRAP(convolve8_avx2, 12) +WRAP(convolve8_horiz_avx2, 12) +WRAP(convolve8_vert_avx2, 12) +WRAP(convolve8_avg_avx2, 12) +WRAP(convolve8_avg_horiz_avx2, 12) +WRAP(convolve8_avg_vert_avx2, 12) +#endif // HAVE_AVX2 + +#if HAVE_NEON +WRAP(convolve_copy_neon, 8) +WRAP(convolve_avg_neon, 8) +WRAP(convolve_copy_neon, 10) +WRAP(convolve_avg_neon, 10) +WRAP(convolve_copy_neon, 12) +WRAP(convolve_avg_neon, 12) +WRAP(convolve8_horiz_neon, 8) +WRAP(convolve8_avg_horiz_neon, 8) +WRAP(convolve8_vert_neon, 8) +WRAP(convolve8_avg_vert_neon, 8) +WRAP(convolve8_neon, 8) +WRAP(convolve8_avg_neon, 8) +WRAP(convolve8_horiz_neon, 10) +WRAP(convolve8_avg_horiz_neon, 10) +WRAP(convolve8_vert_neon, 10) +WRAP(convolve8_avg_vert_neon, 10) +WRAP(convolve8_neon, 10) +WRAP(convolve8_avg_neon, 10) +WRAP(convolve8_horiz_neon, 12) +WRAP(convolve8_avg_horiz_neon, 12) +WRAP(convolve8_vert_neon, 12) +WRAP(convolve8_avg_vert_neon, 12) +WRAP(convolve8_neon, 12) +WRAP(convolve8_avg_neon, 12) +#endif // HAVE_NEON + +WRAP(convolve_copy_c, 8) +WRAP(convolve_avg_c, 8) +WRAP(convolve8_horiz_c, 8) +WRAP(convolve8_avg_horiz_c, 8) +WRAP(convolve8_vert_c, 8) +WRAP(convolve8_avg_vert_c, 8) +WRAP(convolve8_c, 8) +WRAP(convolve8_avg_c, 8) +WRAP(convolve_copy_c, 10) +WRAP(convolve_avg_c, 10) +WRAP(convolve8_horiz_c, 10) +WRAP(convolve8_avg_horiz_c, 10) +WRAP(convolve8_vert_c, 10) +WRAP(convolve8_avg_vert_c, 10) +WRAP(convolve8_c, 10) +WRAP(convolve8_avg_c, 10) +WRAP(convolve_copy_c, 12) +WRAP(convolve_avg_c, 12) +WRAP(convolve8_horiz_c, 12) +WRAP(convolve8_avg_horiz_c, 12) +WRAP(convolve8_vert_c, 12) +WRAP(convolve8_avg_vert_c, 12) +WRAP(convolve8_c, 12) +WRAP(convolve8_avg_c, 12) +#undef WRAP + +const ConvolveFunctions convolve8_c( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_c( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, 10); +const ConvolveFunctions convolve12_c( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, 12); +const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c), + ALL_SIZES(convolve10_c), + ALL_SIZES(convolve12_c) }; + +#else +const ConvolveFunctions convolve8_c( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_c, + vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c, + vpx_convolve8_c, vpx_convolve8_avg_c, vpx_scaled_horiz_c, + vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, + vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) }; +#endif +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_c)); + +#if HAVE_SSE2 && VPX_ARCH_X86_64 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sse2( + wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8, + wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8, + wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8, + wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, + wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8, + wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8, + wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8); +const ConvolveFunctions convolve10_sse2( + wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10, + wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10, + wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10, + wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, + wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10, + wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10, + wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10); +const ConvolveFunctions convolve12_sse2( + wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12, + wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12, + wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12, + wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, + wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12, + wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12, + wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12); +const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2), + ALL_SIZES(convolve10_sse2), + ALL_SIZES(convolve12_sse2) }; +#else +const ConvolveFunctions convolve8_sse2( + vpx_convolve_copy_sse2, vpx_convolve_avg_sse2, vpx_convolve8_horiz_sse2, + vpx_convolve8_avg_horiz_sse2, vpx_convolve8_vert_sse2, + vpx_convolve8_avg_vert_sse2, vpx_convolve8_sse2, vpx_convolve8_avg_sse2, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) }; +#endif // CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sse2)); +#endif + +#if HAVE_SSSE3 +const ConvolveFunctions convolve8_ssse3( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_ssse3, + vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_ssse3, + vpx_convolve8_avg_vert_ssse3, vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) }; +INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_ssse3)); +#endif + +#if HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_avx2( + wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8, + wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8, + wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8, + wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_avx2( + wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10, + wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10, + wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10, + wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, 10); +const ConvolveFunctions convolve12_avx2( + wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12, + wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12, + wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12, + wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, 12); +const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2), + ALL_SIZES(convolve10_avx2), + ALL_SIZES(convolve12_avx2) }; +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#else // !CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_avx2( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_neon( + wrap_convolve_copy_neon_8, wrap_convolve_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, 8); +const ConvolveFunctions convolve10_neon( + wrap_convolve_copy_neon_10, wrap_convolve_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, 10); +const ConvolveFunctions convolve12_neon( + wrap_convolve_copy_neon_12, wrap_convolve_avg_neon_12, + wrap_convolve8_horiz_neon_12, wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, + wrap_convolve8_horiz_neon_12, wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, 12); +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon), + ALL_SIZES(convolve10_neon), + ALL_SIZES(convolve12_neon) }; +#else +const ConvolveFunctions convolve8_neon( + vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, + vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, + vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; +#endif // CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon)); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +const ConvolveFunctions convolve8_neon_dotprod( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod, + vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod, + vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod, + vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES( + convolve8_neon_dotprod) }; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_NEON_I8MM +const ConvolveFunctions convolve8_neon_i8mm( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, + vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm, + vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm, + vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES( + convolve8_neon_i8mm) }; +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_i8mm)); +#endif // HAVE_NEON_I8MM + +#if HAVE_DSPR2 +const ConvolveFunctions convolve8_dspr2( + vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2, + vpx_convolve8_avg_horiz_dspr2, vpx_convolve8_vert_dspr2, + vpx_convolve8_avg_vert_dspr2, vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) }; +INSTANTIATE_TEST_SUITE_P(DSPR2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_dspr2)); +#endif // HAVE_DSPR2 + +#if HAVE_MSA +const ConvolveFunctions convolve8_msa( + vpx_convolve_copy_msa, vpx_convolve_avg_msa, vpx_convolve8_horiz_msa, + vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, + vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; +INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_msa)); +#endif // HAVE_MSA + +#if HAVE_LSX +const ConvolveFunctions convolve8_lsx( + vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, + vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, + vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; +INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_lsx)); +#endif // HAVE_LSX + +#if HAVE_VSX +const ConvolveFunctions convolve8_vsx( + vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, + vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx, + vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; +INSTANTIATE_TEST_SUITE_P(VSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_vsx)); +#endif // HAVE_VSX + +#if HAVE_MMI +const ConvolveFunctions convolve8_mmi( + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, + vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) }; +INSTANTIATE_TEST_SUITE_P(MMI, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_mmi)); +#endif // HAVE_MMI +} // namespace diff --git a/media/libvpx/libvpx/test/cpu_speed_test.cc b/media/libvpx/libvpx/test/cpu_speed_test.cc new file mode 100644 index 0000000000..22f4552963 --- /dev/null +++ b/media/libvpx/libvpx/test/cpu_speed_test.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { + +const int kMaxPSNR = 100; + +class CpuSpeedTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + CpuSpeedTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR), + tune_content_(VP9E_CONTENT_DEFAULT) {} + ~CpuSpeedTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + } + } + + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0]; + } + + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + double min_psnr_; + int tune_content_; +}; + +TEST_P(CpuSpeedTest, TestQ0) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 400; + cfg_.rc_max_quantizer = 0; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 20); + + init_flags_ = VPX_CODEC_USE_PSNR; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GE(min_psnr_, kMaxPSNR); +} + +TEST_P(CpuSpeedTest, TestScreencastQ0) { + ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25); + cfg_.g_timebase = video.timebase(); + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 400; + cfg_.rc_max_quantizer = 0; + cfg_.rc_min_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GE(min_psnr_, kMaxPSNR); +} + +TEST_P(CpuSpeedTest, TestTuneScreen) { + ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25); + cfg_.g_timebase = video.timebase(); + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 2000; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + tune_content_ = VP9E_CONTENT_SCREEN; + + init_flags_ = VPX_CODEC_USE_PSNR; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 12000; + cfg_.rc_max_quantizer = 10; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 20); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(CpuSpeedTest, TestLowBitrate) { + // Validate that this clip encodes and decodes without a mismatch + // when passing in a very high min q. This pushes the encoder to producing + // lots of small partitions which might will test the other condition. + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 200; + cfg_.rc_min_quantizer = 40; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 20); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(CpuSpeedTest, + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(0, 10)); +} // namespace diff --git a/media/libvpx/libvpx/test/cq_test.cc b/media/libvpx/libvpx/test/cq_test.cc new file mode 100644 index 0000000000..b74915a336 --- /dev/null +++ b/media/libvpx/libvpx/test/cq_test.cc @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +// CQ level range: [kCQLevelMin, kCQLevelMax). +const int kCQLevelMin = 4; +const int kCQLevelMax = 63; +const int kCQLevelStep = 8; +const unsigned int kCQTargetBitrate = 2000; + +class CQTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + public: + // maps the cqlevel to the bitrate produced. + typedef std::map BitrateMap; + + static void SetUpTestSuite() { bitrates_.clear(); } + + static void TearDownTestSuite() { + ASSERT_TRUE(!HasFailure()) + << "skipping bitrate validation due to earlier failure."; + uint32_t prev_actual_bitrate = kCQTargetBitrate; + for (BitrateMap::const_iterator iter = bitrates_.begin(); + iter != bitrates_.end(); ++iter) { + const uint32_t cq_actual_bitrate = iter->second; + EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate) + << "cq_level: " << iter->first + << ", bitrate should decrease with increase in CQ level."; + prev_actual_bitrate = cq_actual_bitrate; + } + } + + protected: + CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) { + init_flags_ = VPX_CODEC_USE_PSNR; + } + + ~CQTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + void BeginPassHook(unsigned int /*pass*/) override { + file_size_ = 0; + psnr_ = 0.0; + n_frames_ = 0; + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + if (cfg_.rc_end_usage == VPX_CQ) { + encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); + } + encoder->Control(VP8E_SET_CPUUSED, 3); + } + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0); + n_frames_++; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + file_size_ += pkt->data.frame.sz; + } + + double GetLinearPSNROverBitrate() const { + double avg_psnr = log10(psnr_ / n_frames_) * 10.0; + return pow(10.0, avg_psnr / 10.0) / file_size_; + } + + int cq_level() const { return cq_level_; } + size_t file_size() const { return file_size_; } + int n_frames() const { return n_frames_; } + + static BitrateMap bitrates_; + + private: + int cq_level_; + size_t file_size_; + double psnr_; + int n_frames_; +}; + +CQTest::BitrateMap CQTest::bitrates_; + +TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kCQTargetBitrate; + cfg_.g_lag_in_frames = 25; + + cfg_.rc_end_usage = VPX_CQ; + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double cq_psnr_lin = GetLinearPSNROverBitrate(); + const unsigned int cq_actual_bitrate = + static_cast(file_size()) * 8 * 30 / (n_frames() * 1000); + EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate); + bitrates_[cq_level()] = cq_actual_bitrate; + + // try targeting the approximate same bitrate with VBR mode + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_target_bitrate = cq_actual_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double vbr_psnr_lin = GetLinearPSNROverBitrate(); + EXPECT_GE(cq_psnr_lin, vbr_psnr_lin); +} + +VP8_INSTANTIATE_TEST_SUITE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax, + kCQLevelStep)); +} // namespace diff --git a/media/libvpx/libvpx/test/cx_set_ref.sh b/media/libvpx/libvpx/test/cx_set_ref.sh new file mode 100755 index 0000000000..0a3d50ce1f --- /dev/null +++ b/media/libvpx/libvpx/test/cx_set_ref.sh @@ -0,0 +1,60 @@ +#!/bin/sh +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx cx_set_ref example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to cx_set_ref_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +cx_set_ref_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs cx_set_ref and updates the reference frame before encoding frame 90. +# $1 is the codec name. +vpx_set_ref() { + local codec="$1" + local encoder="${LIBVPX_BIN_PATH}/${codec}cx_set_ref${VPX_TEST_EXE_SUFFIX}" + local output_file="${VPX_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf" + local ref_frame_num=90 + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ + "${ref_frame_num}" ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +cx_set_ref_vp8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + vpx_set_ref vp8 || return 1 + fi +} + +cx_set_ref_vp9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + vpx_set_ref vp9 || return 1 + fi +} + +cx_set_ref_tests="cx_set_ref_vp8 cx_set_ref_vp9" + +run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}" diff --git a/media/libvpx/libvpx/test/dct16x16_test.cc b/media/libvpx/libvpx/test/dct16x16_test.cc new file mode 100644 index 0000000000..8c4213ee16 --- /dev/null +++ b/media/libvpx/libvpx/test/dct16x16_test.cc @@ -0,0 +1,1029 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumCoeffs = 256; +const double C1 = 0.995184726672197; +const double C2 = 0.98078528040323; +const double C3 = 0.956940335732209; +const double C4 = 0.923879532511287; +const double C5 = 0.881921264348355; +const double C6 = 0.831469612302545; +const double C7 = 0.773010453362737; +const double C8 = 0.707106781186548; +const double C9 = 0.634393284163646; +const double C10 = 0.555570233019602; +const double C11 = 0.471396736825998; +const double C12 = 0.38268343236509; +const double C13 = 0.290284677254462; +const double C14 = 0.195090322016128; +const double C15 = 0.098017140329561; + +void butterfly_16x16_dct_1d(double input[16], double output[16]) { + double step[16]; + double intermediate[16]; + double temp1, temp2; + + // step 1 + step[0] = input[0] + input[15]; + step[1] = input[1] + input[14]; + step[2] = input[2] + input[13]; + step[3] = input[3] + input[12]; + step[4] = input[4] + input[11]; + step[5] = input[5] + input[10]; + step[6] = input[6] + input[9]; + step[7] = input[7] + input[8]; + step[8] = input[7] - input[8]; + step[9] = input[6] - input[9]; + step[10] = input[5] - input[10]; + step[11] = input[4] - input[11]; + step[12] = input[3] - input[12]; + step[13] = input[2] - input[13]; + step[14] = input[1] - input[14]; + step[15] = input[0] - input[15]; + + // step 2 + output[0] = step[0] + step[7]; + output[1] = step[1] + step[6]; + output[2] = step[2] + step[5]; + output[3] = step[3] + step[4]; + output[4] = step[3] - step[4]; + output[5] = step[2] - step[5]; + output[6] = step[1] - step[6]; + output[7] = step[0] - step[7]; + + temp1 = step[8] * C7; + temp2 = step[15] * C9; + output[8] = temp1 + temp2; + + temp1 = step[9] * C11; + temp2 = step[14] * C5; + output[9] = temp1 - temp2; + + temp1 = step[10] * C3; + temp2 = step[13] * C13; + output[10] = temp1 + temp2; + + temp1 = step[11] * C15; + temp2 = step[12] * C1; + output[11] = temp1 - temp2; + + temp1 = step[11] * C1; + temp2 = step[12] * C15; + output[12] = temp2 + temp1; + + temp1 = step[10] * C13; + temp2 = step[13] * C3; + output[13] = temp2 - temp1; + + temp1 = step[9] * C5; + temp2 = step[14] * C11; + output[14] = temp2 + temp1; + + temp1 = step[8] * C9; + temp2 = step[15] * C7; + output[15] = temp2 - temp1; + + // step 3 + step[0] = output[0] + output[3]; + step[1] = output[1] + output[2]; + step[2] = output[1] - output[2]; + step[3] = output[0] - output[3]; + + temp1 = output[4] * C14; + temp2 = output[7] * C2; + step[4] = temp1 + temp2; + + temp1 = output[5] * C10; + temp2 = output[6] * C6; + step[5] = temp1 + temp2; + + temp1 = output[5] * C6; + temp2 = output[6] * C10; + step[6] = temp2 - temp1; + + temp1 = output[4] * C2; + temp2 = output[7] * C14; + step[7] = temp2 - temp1; + + step[8] = output[8] + output[11]; + step[9] = output[9] + output[10]; + step[10] = output[9] - output[10]; + step[11] = output[8] - output[11]; + + step[12] = output[12] + output[15]; + step[13] = output[13] + output[14]; + step[14] = output[13] - output[14]; + step[15] = output[12] - output[15]; + + // step 4 + output[0] = (step[0] + step[1]); + output[8] = (step[0] - step[1]); + + temp1 = step[2] * C12; + temp2 = step[3] * C4; + temp1 = temp1 + temp2; + output[4] = 2 * (temp1 * C8); + + temp1 = step[2] * C4; + temp2 = step[3] * C12; + temp1 = temp2 - temp1; + output[12] = 2 * (temp1 * C8); + + output[2] = 2 * ((step[4] + step[5]) * C8); + output[14] = 2 * ((step[7] - step[6]) * C8); + + temp1 = step[4] - step[5]; + temp2 = step[6] + step[7]; + output[6] = (temp1 + temp2); + output[10] = (temp1 - temp2); + + intermediate[8] = step[8] + step[14]; + intermediate[9] = step[9] + step[15]; + + temp1 = intermediate[8] * C12; + temp2 = intermediate[9] * C4; + temp1 = temp1 - temp2; + output[3] = 2 * (temp1 * C8); + + temp1 = intermediate[8] * C4; + temp2 = intermediate[9] * C12; + temp1 = temp2 + temp1; + output[13] = 2 * (temp1 * C8); + + output[9] = 2 * ((step[10] + step[11]) * C8); + + intermediate[11] = step[10] - step[11]; + intermediate[12] = step[12] + step[13]; + intermediate[13] = step[12] - step[13]; + intermediate[14] = step[8] - step[14]; + intermediate[15] = step[9] - step[15]; + + output[15] = (intermediate[11] + intermediate[12]); + output[1] = -(intermediate[11] - intermediate[12]); + + output[7] = 2 * (intermediate[13] * C8); + + temp1 = intermediate[14] * C12; + temp2 = intermediate[15] * C4; + temp1 = temp1 - temp2; + output[11] = -2 * (temp1 * C8); + + temp1 = intermediate[14] * C4; + temp2 = intermediate[15] * C12; + temp1 = temp2 + temp1; + output[5] = 2 * (temp1 * C8); +} + +void reference_16x16_dct_2d(int16_t input[256], double output[256]) { + // First transform columns + for (int i = 0; i < 16; ++i) { + double temp_in[16], temp_out[16]; + for (int j = 0; j < 16; ++j) temp_in[j] = input[j * 16 + i]; + butterfly_16x16_dct_1d(temp_in, temp_out); + for (int j = 0; j < 16; ++j) output[j * 16 + i] = temp_out[j]; + } + // Then transform rows + for (int i = 0; i < 16; ++i) { + double temp_in[16], temp_out[16]; + for (int j = 0; j < 16; ++j) temp_in[j] = output[j + i * 16]; + butterfly_16x16_dct_1d(temp_in, temp_out); + // Scale by some magic number + for (int j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j] / 2; + } +} + +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +typedef std::tuple Dct16x16Param; +typedef std::tuple Ht16x16Param; +typedef std::tuple Idct16x16Param; + +void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, + int /*tx_type*/) { + vpx_fdct16x16_c(in, out, stride); +} + +void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, + int /*tx_type*/) { + vpx_idct16x16_256_add_c(in, dest, stride); +} + +void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + vp9_fht16x16_c(in, out, stride, tx_type); +} + +void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, + int tx_type) { + vp9_iht16x16_256_add_c(in, dest, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, + int /*tx_type*/) { + idct16x16_10(in, out, stride); +} + +void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, + int /*tx_type*/) { + idct16x16_12(in, out, stride); +} + +void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); +} + +void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); +} + +#if HAVE_SSE2 +void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); +} +#endif // HAVE_SSE2 +#endif // CONFIG_VP9_HIGHBITDEPTH + +class Trans16x16TestBase { + public: + virtual ~Trans16x16TestBase() = default; + + protected: + virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; + + virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0; + + void RunAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); +#endif + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + test_input_block[j] = src16[j] - dst16[j]; +#endif + } + } + + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, pitch_)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int32_t diff = dst[j] - src[j]; +#endif + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error) + << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; + + EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error) + << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]); + } + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; + } + if (i == 0) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; + } else if (i == 1) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; + } + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(input_extreme_block, output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) { + EXPECT_EQ(output_block[j], output_ref_block[j]); + EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) + << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; + } + } + } + + void RunQuantCheck(int dc_thred, int ac_thred) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 100000; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); + + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; + } + if (i == 0) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; + } + if (i == 1) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; + } + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); + + // clear reconstructed pixel buffers + memset(dst, 0, kNumCoeffs * sizeof(uint8_t)); + memset(ref, 0, kNumCoeffs * sizeof(uint8_t)); +#if CONFIG_VP9_HIGHBITDEPTH + memset(dst16, 0, kNumCoeffs * sizeof(uint16_t)); + memset(ref16, 0, kNumCoeffs * sizeof(uint16_t)); +#endif + + // quantization with maximum allowed step sizes + output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred; + for (int j = 1; j < kNumCoeffs; ++j) { + output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred; + } + if (bit_depth_ == VPX_BITS_8) { + inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_, + tx_type_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + if (bit_depth_ == VPX_BITS_8) { + for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref[j], dst[j]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref16[j], dst16[j]); +#endif + } + } + } + + void RunInvAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + in[j] = src16[j] - dst16[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + reference_16x16_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) { + coeff[j] = static_cast(round(out_r[j])); + } + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const uint32_t diff = dst[j] - src[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; + } + } + } + + void RunSpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int c_sum_time = 0; + int simd_sum_time = 0; + + DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]); + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + vpx_fdct16x16_c(input_block, output_ref_block, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_c)); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunFwdTxfm(input_block, output_block, pitch_); + } + + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_mod)); + + printf( + "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time, + simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + void CompareInvReference(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2)); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + ref_txfm(coeff, ref, pitch_); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 16x16 IDCT Comparison has error " + << error << " at index " << j; + } + } + } + + void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + int pitch_; + int tx_type_; + vpx_bit_depth_t bit_depth_; + int mask_; + FhtFunc fwd_txfm_ref; + IhtFunc inv_txfm_ref; +}; + +class Trans16x16DCT : public Trans16x16TestBase, + public ::testing::TestWithParam { + public: + ~Trans16x16DCT() override = default; + + void SetUp() override { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + pitch_ = 16; + fwd_txfm_ref = fdct16x16_ref; + inv_txfm_ref = idct16x16_ref; + mask_ = (1 << bit_depth_) - 1; +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth_) { + case VPX_BITS_10: inv_txfm_ref = idct16x16_10_ref; break; + case VPX_BITS_12: inv_txfm_ref = idct16x16_12_ref; break; + default: inv_txfm_ref = idct16x16_ref; break; + } +#else + inv_txfm_ref = idct16x16_ref; +#endif + } + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(Trans16x16DCT, AccuracyCheck) { RunAccuracyCheck(); } + +TEST_P(Trans16x16DCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(Trans16x16DCT, MemCheck) { RunMemCheck(); } + +TEST_P(Trans16x16DCT, QuantCheck) { + // Use maximally allowed quantization step sizes for DC and AC + // coefficients respectively. + RunQuantCheck(1336, 1828); +} + +TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } + +TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } + +class Trans16x16HT : public Trans16x16TestBase, + public ::testing::TestWithParam { + public: + ~Trans16x16HT() override = default; + + void SetUp() override { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + pitch_ = 16; + fwd_txfm_ref = fht16x16_ref; + inv_txfm_ref = iht16x16_ref; + mask_ = (1 << bit_depth_) - 1; +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth_) { + case VPX_BITS_10: inv_txfm_ref = iht16x16_10; break; + case VPX_BITS_12: inv_txfm_ref = iht16x16_12; break; + default: inv_txfm_ref = iht16x16_ref; break; + } +#else + inv_txfm_ref = iht16x16_ref; +#endif + } + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { + fwd_txfm_(in, out, stride, tx_type_); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride, tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(Trans16x16HT, AccuracyCheck) { RunAccuracyCheck(); } + +TEST_P(Trans16x16HT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(Trans16x16HT, MemCheck) { RunMemCheck(); } + +TEST_P(Trans16x16HT, QuantCheck) { + // The encoder skips any non-DC intra prediction modes, + // when the quantization step size goes beyond 988. + RunQuantCheck(429, 729); +} + +class InvTrans16x16DCT : public Trans16x16TestBase, + public ::testing::TestWithParam { + public: + ~InvTrans16x16DCT() override = default; + + void SetUp() override { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + thresh_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + pitch_ = 16; + mask_ = (1 << bit_depth_) - 1; + } + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, + int /*stride*/) override {} + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride); + } + + IdctFunc ref_txfm_; + IdctFunc inv_txfm_; + int thresh_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); + +TEST_P(InvTrans16x16DCT, CompareReference) { + CompareInvReference(ref_txfm_, thresh_); +} + +TEST_P(InvTrans16x16DCT, DISABLED_Speed) { + RunInvTrans16x16SpeedTest(ref_txfm_, thresh_); +} + +using std::make_tuple; + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P(C, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + C, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT, + ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_c, + 6225, VPX_BITS_8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_sse2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans16x16HT, + ::testing::Values(make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, + 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, + 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, + 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, + 3, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_256_add_10_sse2, 0, + VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_256_add_12_sse2, 0, + VPX_BITS_12), + make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3, + VPX_BITS_8))); +// Optimizations take effect at a threshold of 3155, so we use a value close to +// that to test both branches. +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple(&idct16x16_10_add_10_c, + &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10), + make_tuple(&idct16x16_10, &idct16x16_256_add_10_sse2, + 3167, VPX_BITS_10), + make_tuple(&idct16x16_10_add_12_c, + &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12), + make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, + 3167, VPX_BITS_12))); +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + MSA, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + MSA, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, + VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_lsx, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +} // namespace diff --git a/media/libvpx/libvpx/test/dct32x32_test.cc b/media/libvpx/libvpx/test/dct32x32_test.cc new file mode 100644 index 0000000000..6233b17a43 --- /dev/null +++ b/media/libvpx/libvpx/test/dct32x32_test.cc @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumCoeffs = 1024; +const double kPi = 3.141592653589793238462643383279502884; +void reference_32x32_dct_1d(const double in[32], double out[32]) { + const double kInvSqrt2 = 0.707106781186547524400844362104; + for (int k = 0; k < 32; k++) { + out[k] = 0.0; + for (int n = 0; n < 32; n++) { + out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0); + } + if (k == 0) out[k] = out[k] * kInvSqrt2; + } +} + +void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], + double output[kNumCoeffs]) { + // First transform columns + for (int i = 0; i < 32; ++i) { + double temp_in[32], temp_out[32]; + for (int j = 0; j < 32; ++j) temp_in[j] = input[j * 32 + i]; + reference_32x32_dct_1d(temp_in, temp_out); + for (int j = 0; j < 32; ++j) output[j * 32 + i] = temp_out[j]; + } + // Then transform rows + for (int i = 0; i < 32; ++i) { + double temp_in[32], temp_out[32]; + for (int j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + reference_32x32_dct_1d(temp_in, temp_out); + // Scale by some magic number + for (int j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j] / 4; + } +} + +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); + +typedef std::tuple + Trans32x32Param; + +typedef std::tuple + InvTrans32x32Param; + +#if CONFIG_VP9_HIGHBITDEPTH +void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +class Trans32x32Test : public AbstractBench, + public ::testing::TestWithParam { + public: + ~Trans32x32Test() override = default; + void SetUp() override { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + int version_; + vpx_bit_depth_t bit_depth_; + int mask_; + FwdTxfmFunc fwd_txfm_; + InvTxfmFunc inv_txfm_; + + int16_t *bench_in_; + tran_low_t *bench_out_; + void Run() override; +}; + +void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); } + +TEST_P(Trans32x32Test, AccuracyCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + test_input_block[j] = src16[j] - dst16[j]; +#endif + } + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32)); +#endif + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int32_t diff = dst[j] - src[j]; +#endif + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + if (version_ == 1) { + max_error /= 2; + total_error /= 45; + } + + EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error) + << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1"; + + EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error) + << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block"; +} + +TEST_P(Trans32x32Test, CoeffCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + + DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + const int stride = 32; + vpx_fdct32x32_c(input_block, output_ref_block, stride); + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride)); + + if (version_ == 0) { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + } +} + +TEST_P(Trans32x32Test, MemCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 2000; + + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_; + } + if (i == 0) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; + } else if (i == 1) { + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; + } + + const int stride = 32; + vpx_fdct32x32_c(input_extreme_block, output_ref_block, stride); + ASM_REGISTER_STATE_CHECK( + fwd_txfm_(input_extreme_block, output_block, stride)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) { + if (version_ == 0) { + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j])) + << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE"; + EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) + << "Error: 32x32 FDCT has coefficient larger than " + << "4*DCT_MAX_VALUE"; + } + } +} + +TEST_P(Trans32x32Test, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + bench_in_ = input_extreme_block; + bench_out_ = output_block; + + RunNTimes(INT16_MAX); + PrintMedian("32x32"); +} + +TEST_P(Trans32x32Test, InverseAccuracy) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-255, 255] + for (int j = 0; j < kNumCoeffs; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + in[j] = src16[j] - dst16[j]; +#endif + } + } + + reference_32x32_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) { + coeff[j] = static_cast(round(out_r[j])); + } + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32)); +#endif + } + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int diff = dst[j] - src[j]; +#endif + const int error = diff * diff; + EXPECT_GE(1, error) << "Error: 32x32 IDCT has error " << error + << " at index " << j; + } + } +} + +class InvTrans32x32Test : public ::testing::TestWithParam { + public: + ~InvTrans32x32Test() override = default; + void SetUp() override { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + eob_ = GET_PARAM(4); + thresh_ = GET_PARAM(4); + mask_ = (1 << bit_depth_) - 1; + pitch_ = 32; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { + ref_txfm_(out, dst, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride); + } + int version_; + vpx_bit_depth_t bit_depth_; + int mask_; + int eob_; + int thresh_; + + InvTxfmFunc ref_txfm_; + InvTxfmFunc inv_txfm_; + int pitch_; + + void RunInvTrans32x32SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob_) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh_); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + void CompareInvReference32x32() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 31; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + coeff[scan[j]] = rnd.Rand8Extremes(); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + RunRefTxfm(coeff, ref, pitch_); + RunInvTxfm(coeff, dst, pitch_); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error " + << error << " at index " << j; + } + } + } +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test); + +TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); } +TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); } + +using std::make_tuple; + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, Trans32x32Test, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_rd_c, &idct32x32_10, 1, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_rd_c, &idct32x32_12, 1, VPX_BITS_12), + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, + VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + C, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, + 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + C, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0, + VPX_BITS_8, 16, 6255))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, + &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, + &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_sse2, + &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0, + VPX_BITS_8, 16, 6225))); +#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + SSE2, Trans32x32Test, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1, + VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1, + VPX_BITS_12), + make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, + VPX_BITS_8))); +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, + &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_avx2, + &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0, + VPX_BITS_8, 16, 6225))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + MSA, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_msa, + &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_msa, + &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_vsx, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + LSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, + &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_lsx, + &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +} // namespace diff --git a/media/libvpx/libvpx/test/dct_partial_test.cc b/media/libvpx/libvpx/test/dct_partial_test.cc new file mode 100644 index 0000000000..ec6f543f71 --- /dev/null +++ b/media/libvpx/libvpx/test/dct_partial_test.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::make_tuple; +using std::tuple; + +namespace { +typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); + +typedef tuple + PartialFdctParam; + +tran_low_t partial_fdct_ref(const Buffer &in, int size) { + int64_t sum = 0; + if (in.TopLeftPixel() != nullptr) { + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + } else { + assert(0); + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + if (output_block.TopLeftPixel() != nullptr) { + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } else { + assert(0); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/media/libvpx/libvpx/test/dct_test.cc b/media/libvpx/libvpx/test/dct_test.cc new file mode 100644 index 0000000000..c3d3081c42 --- /dev/null +++ b/media/libvpx/libvpx/test/dct_test.cc @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::make_tuple; +using std::tuple; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*FhtFuncRef)(const Buffer &in, Buffer *out, + int size, int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); +typedef void (*IhtWithBdFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd); + +template +void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + (void)tx_type; + fn(in, out, stride); +} + +template +void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)tx_type; + (void)bd; + fn(in, out, stride); +} + +template +void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)bd; + fn(in, out, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIdctFunc)(const tran_low_t *in, uint16_t *out, int stride, + int bd); + +typedef void (*HighbdIhtFunc)(const tran_low_t *in, uint16_t *out, int stride, + int tx_type, int bd); + +template +void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + (void)tx_type; + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} + +template +void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +struct FuncInfo { + FhtFunc ft_func; + IhtWithBdFunc it_func; + int size; + int pixel_size; +}; + +/* forward transform, inverse transform, size, transform type, bit depth */ +typedef tuple DctParam; + +void fdct_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer &in, Buffer *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +class TransTestBase : public ::testing::TestWithParam { + public: + void SetUp() override { + rnd_.Reset(ACMRandom::DeterministicSeed()); + const int idx = GET_PARAM(0); + const FuncInfo *func_info = &(GET_PARAM(1)[idx]); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + fwd_txfm_ = func_info->ft_func; + inv_txfm_ = func_info->it_func; + size_ = func_info->size; + pixel_size_ = func_info->pixel_size; + max_pixel_value_ = (1 << bit_depth_) - 1; + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. + if (stride_ > 16) { + stride_ &= ~15; + } + + block_size_ = size_ * stride_; + + src_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_NE(src_, nullptr); + dst_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_NE(dst_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + src_ = nullptr; + vpx_free(dst_); + dst_ = nullptr; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + if (pixel_size_ == 1) { + for (int j = 0; j < block_size_; ++j) { + src_[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst_[j] = rnd_.Rand16() & max_pixel_value_; + } + } else { + ASSERT_EQ(pixel_size_, 2); + uint16_t *const src = reinterpret_cast(src_); + uint16_t *const dst = reinterpret_cast(dst_); + for (int j = 0; j < block_size_; ++j) { + src[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst[j] = rnd_.Rand16() & max_pixel_value_; + } + } + } + + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_); + } + + protected: + void RunAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer test_input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + ASSERT_NE(test_input_block.TopLeftPixel(), nullptr); + Buffer test_temp_block = + Buffer(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + InitMem(); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; + } + } + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_)); + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; + } + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast(limit), max_error) + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has an individual round trip error > " + << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has average round trip error > " + << limit << " per block"; + } + + void RunCoeffCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_extreme_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + ASSERT_NE(input_extreme_block.TopLeftPixel(), nullptr); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + ASSERT_NE(output_block.TopLeftPixel(), nullptr); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: " << size_ << "x" << size_ + << " transform has coefficient larger than 4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer in = Buffer(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer coeff = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + + for (int i = 0; i < count_test_block; ++i) { + InitMem(); + ASSERT_NE(in.TopLeftPixel(), nullptr); + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { + in.TopLeftPixel()[h * in.stride() + w] = + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + in.TopLeftPixel()[h * in.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; + } + } + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_)); + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; + } + const uint32_t error = diff * diff; + EXPECT_GE(static_cast(limit), error) + << "Error: " << size_ << "x" << size_ + << " inverse transform has error " << error << " at " << w << "," + << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + return; + } + } + } + } + } + + FhtFunc fwd_txfm_; + FhtFuncRef fwd_txfm_ref; + IhtWithBdFunc inv_txfm_; + ACMRandom rnd_; + uint8_t *src_; + uint8_t *dst_; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; + int stride_; + int pixel_size_; + int block_size_; +}; + +/* -------------------------------------------------------------------------- */ + +class TransDCT : public TransTestBase { + public: + TransDCT() { fwd_txfm_ref = fdct_ref; } +}; + +TEST_P(TransDCT, AccuracyCheck) { + int t = 1; + if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 2; + } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 7; + } + RunAccuracyCheck(t); +} + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +static const FuncInfo dct_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, 16, + 1 }, + { &fdct_wrapper, &idct_wrapper, 32, + 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_c_func_info) / + sizeof(dct_c_func_info[0]))), + ::testing::Values(dct_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 +static const FuncInfo dct_sse2_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE2, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_sse2_func_info) / + sizeof(dct_sse2_func_info[0]))), + ::testing::Values(dct_sse2_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +static const FuncInfo dct_ssse3_func_info = { + &fdct_wrapper, &idct_wrapper, 8, 1 +}; + +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_SUITE_P(SSSE3, TransDCT, + ::testing::Values(make_tuple(0, &dct_ssse3_func_info, + 0, VPX_BITS_8))); +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_avx2_func_info = { + &fdct_wrapper, &idct_wrapper, + 32, 1 +}; + +// TODO(johannkoenig): high bit depth fdct32x32. +INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, + ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_neon_func_info[] = { + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ +}; +#else +static const FuncInfo dct_neon_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + NEON, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_neon_func_info) / + sizeof(dct_neon_func_info[0]))), + ::testing::Values(dct_neon_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_msa_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + MSA, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_msa_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, + ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_lsx_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + LSX, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_lsx_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransHT : public TransTestBase { + public: + TransHT() { fwd_txfm_ref = fht_ref; } +}; + +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 2 : 1); +} + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +static const FuncInfo ht_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper, + 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_c_func_info) / + sizeof(ht_c_func_info[0]))), + ::testing::Values(ht_c_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON + +static const FuncInfo ht_neon_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper, + 4, 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 }, + { &vp9_highbd_fht16x16_neon, + &highbd_iht_wrapper, 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht4x4_neon, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht8x8_neon, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 }, + { &vp9_fht16x16_neon, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + NEON, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_neon_func_info) / + sizeof(ht_neon_func_info[0]))), + ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON + +#if HAVE_SSE2 + +static const FuncInfo ht_sse2_func_info[3] = { + { &vp9_fht4x4_sse2, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_sse2, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_sse2, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE2, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse2_func_info), + ::testing::Range(0, 4), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_sse4_1_func_info[3] = { + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, + 4, 2 }, + { vp9_highbd_fht8x8_c, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse4_1_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, + VPX_BITS_12))); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_vsx_func_info[3] = { + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_VSX +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransWHT : public TransTestBase { + public: + TransWHT() { fwd_txfm_ref = fwht_ref; } +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +static const FuncInfo wht_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransWHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(wht_c_func_info) / + sizeof(wht_c_func_info[0]))), + ::testing::Values(wht_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +static const FuncInfo wht_sse2_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE + +} // namespace diff --git a/media/libvpx/libvpx/test/decode_api_test.cc b/media/libvpx/libvpx/test/decode_api_test.cc new file mode 100644 index 0000000000..44e4397726 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_api_test.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "test/ivf_video_source.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +namespace { + +#define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) + +TEST(DecodeAPI, InvalidParams) { + static vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_DECODER + &vpx_codec_vp8_dx_algo, +#endif +#if CONFIG_VP9_DECODER + &vpx_codec_vp9_dx_algo, +#endif + }; + uint8_t buf[1] = { 0 }; + vpx_codec_ctx_t dec; + + EXPECT_EQ(vpx_codec_dec_init(nullptr, nullptr, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_dec_init(&dec, nullptr, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, 0, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, buf, 0, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, buf, NELEMENTS(buf), nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, NELEMENTS(buf), nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_destroy(nullptr), VPX_CODEC_INVALID_PARAM); + EXPECT_NE(vpx_codec_error(nullptr), nullptr); + EXPECT_EQ(vpx_codec_error_detail(nullptr), nullptr); + + for (int i = 0; i < NELEMENTS(kCodecs); ++i) { + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_dec_init(nullptr, kCodecs[i], nullptr, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], nullptr, 0)); + EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, + vpx_codec_decode(&dec, buf, NELEMENTS(buf), nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, nullptr, NELEMENTS(buf), nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, buf, 0, nullptr, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } +} + +#if CONFIG_VP8_DECODER +TEST(DecodeAPI, OptionalParams) { + vpx_codec_ctx_t dec; + +#if CONFIG_ERROR_CONCEALMENT + EXPECT_EQ(VPX_CODEC_OK, + vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr, + VPX_CODEC_USE_ERROR_CONCEALMENT)); +#else + EXPECT_EQ(VPX_CODEC_INCAPABLE, + vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr, + VPX_CODEC_USE_ERROR_CONCEALMENT)); +#endif // CONFIG_ERROR_CONCEALMENT +} +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER +// Test VP9 codec controls after a decode error to ensure the code doesn't +// misbehave. +void TestVp9Controls(vpx_codec_ctx_t *dec) { + static const int kControls[] = { VP8D_GET_LAST_REF_UPDATES, + VP8D_GET_FRAME_CORRUPTED, + VP9D_GET_DISPLAY_SIZE, VP9D_GET_FRAME_SIZE }; + int val[2]; + + for (int i = 0; i < NELEMENTS(kControls); ++i) { + const vpx_codec_err_t res = vpx_codec_control_(dec, kControls[i], val); + switch (kControls[i]) { + case VP8D_GET_FRAME_CORRUPTED: + EXPECT_EQ(VPX_CODEC_ERROR, res) << kControls[i]; + break; + default: EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i]; break; + } + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control_(dec, kControls[i], nullptr)); + } + + vp9_ref_frame_t ref; + ref.idx = 0; + EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP9_GET_REFERENCE, nullptr)); + + vpx_ref_frame_t ref_copy; + const int width = 352; + const int height = 288; + EXPECT_NE(vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1), + nullptr); + ref_copy.frame_type = VP8_LAST_FRAME; + EXPECT_EQ(VPX_CODEC_ERROR, + vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP8_COPY_REFERENCE, nullptr)); + vpx_img_free(&ref_copy.img); +} + +TEST(DecodeAPI, Vp9InvalidDecode) { + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + const char filename[] = + "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; + libvpx_test::IVFVideoSource video(filename); + video.Init(); + video.Begin(); + ASSERT_TRUE(!HasFailure()); + + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0)); + const uint32_t frame_size = static_cast(video.frame_size()); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(VPX_CODEC_MEM_ERROR, + vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0)); +#else + EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, + vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0)); +#endif + vpx_codec_iter_t iter = nullptr; + EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter)); + + TestVp9Controls(&dec); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); +} + +void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, + uint32_t peek_size) { + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get + // to decoder_peek_si_internal on frames of size < 8. + if (data_sz >= 8) { + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0)); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM + : VPX_CODEC_CORRUPT_FRAME, + vpx_codec_decode(&dec, data, data_sz, nullptr, 0)); + vpx_codec_iter_t iter = nullptr; + EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } + + // Verify behavior of vpx_codec_peek_stream_info. + vpx_codec_stream_info_t si; + si.sz = sizeof(si); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK, + vpx_codec_peek_stream_info(codec, data, data_sz, &si)); +} + +TEST(DecodeAPI, Vp9PeekStreamInfo) { + // The first 9 bytes are valid and the rest of the bytes are made up. Until + // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it + // should return VPX_CODEC_CORRUPT_FRAME. + const uint8_t data[32] = { + 0x85, 0xa4, 0xc1, 0xa1, 0x38, 0x81, 0xa3, 0x49, 0x83, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + + for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) { + TestPeekInfo(data, data_sz, 10); + } +} + +TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) { + // This profile 1 header requires 10.25 bytes, ensure + // vpx_codec_peek_stream_info doesn't over read. + const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53, + 0xe9, 0x30, 0x68, 0x53, 0x04 }; + + for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) { + TestPeekInfo(profile1_data, data_sz, 11); + } +} +#endif // CONFIG_VP9_DECODER + +TEST(DecodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_DECODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_DECODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + +} // namespace diff --git a/media/libvpx/libvpx/test/decode_corrupted.cc b/media/libvpx/libvpx/test/decode_corrupted.cc new file mode 100644 index 0000000000..58773d7b86 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_corrupted.cc @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/i420_video_source.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +class DecodeCorruptedFrameTest + : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam< + std::tuple > { + public: + DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} + + protected: + ~DecodeCorruptedFrameTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + + // Set small key frame distance such that we insert more key frames. + cfg_.kf_max_dist = 3; + dec_cfg_.threads = 1; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} + + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { + // Don't edit frame packet on key frame. + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; + + memcpy(&modified_pkt_, pkt, sizeof(*pkt)); + + // Halve the size so it's corrupted to decoder. + modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2; + + return &modified_pkt_; + } + + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { + EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); + return VPX_CODEC_MEM_ERROR != res_dec; + } + + vpx_codec_cx_pkt_t modified_pkt_; +}; + +TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) { + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif // CONFIG_VP9 + +#if CONFIG_VP8 +INSTANTIATE_TEST_SUITE_P( + VP8, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP8))); +#endif // CONFIG_VP8 + +} // namespace diff --git a/media/libvpx/libvpx/test/decode_perf_test.cc b/media/libvpx/libvpx/test/decode_perf_test.cc new file mode 100644 index 0000000000..383fd2d896 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_perf_test.cc @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/webm_video_source.h" +#include "vpx_ports/vpx_timer.h" +#include "./ivfenc.h" +#include "./vpx_version.h" + +using std::make_tuple; + +namespace { + +#define VIDEO_NAME 0 +#define THREADS 1 + +const double kUsecsInSec = 1000000.0; +const char kNewEncodeOutputFile[] = "new_encode.ivf"; + +/* + DecodePerfTest takes a tuple of filename + number of threads to decode with + */ +typedef std::tuple DecodePerfParam; + +const DecodePerfParam kVP9DecodePerfVectors[] = { + make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1), + make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2), + make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2), + make_tuple("vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm", 4), + make_tuple("vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm", 1), + make_tuple("vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm", 4), + make_tuple("vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm", 4), + make_tuple("vp90-2-sintel_426x182_tile_1x1_171kbps.webm", 1), + make_tuple("vp90-2-sintel_640x272_tile_1x2_318kbps.webm", 2), + make_tuple("vp90-2-sintel_854x364_tile_1x2_621kbps.webm", 2), + make_tuple("vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm", 4), + make_tuple("vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm", 4), + make_tuple("vp90-2-tos_426x178_tile_1x1_181kbps.webm", 1), + make_tuple("vp90-2-tos_640x266_tile_1x2_336kbps.webm", 2), + make_tuple("vp90-2-tos_854x356_tile_1x2_656kbps.webm", 2), + make_tuple("vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm", 2), + make_tuple("vp90-2-tos_1280x534_tile_1x4_1306kbps.webm", 4), + make_tuple("vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm", 4), + make_tuple("vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm", 4), +}; + +/* + In order to reflect real world performance as much as possible, Perf tests + *DO NOT* do any correctness checks. Please run them alongside correctness + tests to ensure proper codec integrity. Furthermore, in this test we + deliberately limit the amount of system calls we make to avoid OS + preemption. + + TODO(joshualitt) create a more detailed perf measurement test to collect + power/temp/min max frame decode times/etc + */ + +class DecodePerfTest : public ::testing::TestWithParam {}; + +TEST_P(DecodePerfTest, PerfTest) { + const char *const video_name = GET_PARAM(VIDEO_NAME); + const unsigned threads = GET_PARAM(THREADS); + + libvpx_test::WebMVideoSource video(video_name); + video.Init(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.threads = threads; + libvpx_test::VP9Decoder decoder(cfg, 0); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + for (video.Begin(); video.cxdata() != nullptr; video.Next()) { + decoder.DecodeFrame(video.cxdata(), video.frame_size()); + } + + vpx_usec_timer_mark(&t); + const double elapsed_secs = double(vpx_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned frames = video.frame_number(); + const double fps = double(frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", video_name); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +INSTANTIATE_TEST_SUITE_P(VP9, DecodePerfTest, + ::testing::ValuesIn(kVP9DecodePerfVectors)); + +class VP9NewEncodeDecodePerfTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + VP9NewEncodeDecodePerfTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), + outfile_(nullptr), out_frames_(0) {} + + ~VP9NewEncodeDecodePerfTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_end_usage = VPX_VBR; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, speed_); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 2); + } + } + + void BeginPassHook(unsigned int /*pass*/) override { + const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); + const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; + outfile_ = fopen(path_to_source.c_str(), "wb"); + ASSERT_NE(outfile_, nullptr); + } + + void EndPassHook() override { + if (outfile_ != nullptr) { + if (!fseek(outfile_, 0, SEEK_SET)) { + ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); + } + fclose(outfile_); + outfile_ = nullptr; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) { + ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); + } + + // Write frame header and data. + ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz); + ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), + pkt->data.frame.sz); + } + + bool DoDecode() const override { return false; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + private: + libvpx_test::TestMode encoding_mode_; + uint32_t speed_; + FILE *outfile_; + uint32_t out_frames_; +}; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { + SetUp(); + + // TODO(JBB): Make this work by going through the set of given files. + const int i = 0; + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + const char *video_name = kVP9EncodePerfTestVectors[i].name; + libvpx_test::I420VideoSource video( + video_name, kVP9EncodePerfTestVectors[i].width, + kVP9EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0, + kVP9EncodePerfTestVectors[i].frames); + set_speed(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const uint32_t threads = 4; + + libvpx_test::IVFVideoSource decode_video(kNewEncodeOutputFile); + decode_video.Init(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.threads = threads; + libvpx_test::VP9Decoder decoder(cfg, 0); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + for (decode_video.Begin(); decode_video.cxdata() != nullptr; + decode_video.Next()) { + decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); + } + + vpx_usec_timer_mark(&t); + const double elapsed_secs = + static_cast(vpx_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned decode_frames = decode_video.frame_number(); + const double fps = static_cast(decode_frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", decode_frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +VP9_INSTANTIATE_TEST_SUITE(VP9NewEncodeDecodePerfTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/media/libvpx/libvpx/test/decode_svc_test.cc b/media/libvpx/libvpx/test/decode_svc_test.cc new file mode 100644 index 0000000000..7098e7b270 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_svc_test.cc @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/test_vectors.h" +#include "test/util.h" + +namespace { + +const unsigned int kNumFrames = 19; + +class DecodeSvcTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} + ~DecodeSvcTest() override = default; + + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + if (video.frame_number() == 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); + } + + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { + ASSERT_EQ(img.d_w, width_); + ASSERT_EQ(img.d_h, height_); + total_frames_ = frame_number; + } + + int spatial_layer_; + unsigned int width_; + unsigned int height_; + unsigned int total_frames_; +}; + +// SVC test vector is 1280x720, with 3 spatial layers, and 20 frames. + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 0. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/4x1/4 resolution (320x180). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { + const std::string filename = GET_PARAM(1); + std::unique_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_NE(video.get(), nullptr); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 0; + width_ = 320; + height_ = 180; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 1. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/2x1/2 resolution (640x360). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { + const std::string filename = GET_PARAM(1); + std::unique_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_NE(video.get(), nullptr); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 1; + width_ = 640; + height_ = 360; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 2. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in the full resolution (1280x720). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { + const std::string filename = GET_PARAM(1); + std::unique_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_NE(video.get(), nullptr); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 2; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 10. Verify the resolution of each decoded frame and the total +// number of frames decoded. This is beyond the number of spatial layers, so +// the decoding should result in the full resolution (1280x720). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { + const std::string filename = GET_PARAM(1); + std::unique_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_NE(video.get(), nullptr); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 10; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +VP9_INSTANTIATE_TEST_SUITE( + DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc, + libvpx_test::kVP9TestVectorsSvc + + libvpx_test::kNumVP9TestVectorsSvc)); +} // namespace diff --git a/media/libvpx/libvpx/test/decode_test_driver.cc b/media/libvpx/libvpx/test/decode_test_driver.cc new file mode 100644 index 0000000000..773d673d37 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_test_driver.cc @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libvpx_test { + +const char kVP8Name[] = "WebM Project VP8"; + +vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size, + vpx_codec_stream_info_t *stream_info) { + return vpx_codec_peek_stream_info( + CodecInterface(), cxdata, static_cast(size), stream_info); +} + +vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) { + return DecodeFrame(cxdata, size, nullptr); +} + +vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv) { + vpx_codec_err_t res_dec; + InitOnce(); + API_REGISTER_STATE_CHECK( + res_dec = vpx_codec_decode( + &decoder_, cxdata, static_cast(size), user_priv, 0)); + return res_dec; +} + +bool Decoder::IsVP8() const { + const char *codec_name = GetDecoderName(); + return strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0; +} + +void DecoderTest::HandlePeekResult(Decoder *const decoder, + CompressedVideoSource *video, + const vpx_codec_err_t res_peek) { + const bool is_vp8 = decoder->IsVP8(); + if (is_vp8) { + /* Vp8's implementation of PeekStream returns an error if the frame you + * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first + * frame, which must be a keyframe. */ + if (video->frame_number() == 0) { + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); + } + } else { + /* The Vp9 implementation of PeekStream returns an error only if the + * data passed to it isn't a valid Vp9 chunk. */ + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); + } +} + +void DecoderTest::RunLoop(CompressedVideoSource *video, + const vpx_codec_dec_cfg_t &dec_cfg) { + Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_); + ASSERT_NE(decoder, nullptr); + bool end_of_file = false; + + // Decode frames. + for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file; + video->Next()) { + PreDecodeFrameHook(*video, decoder); + + vpx_codec_stream_info_t stream_info; + stream_info.sz = sizeof(stream_info); + + if (video->cxdata() != nullptr) { + const vpx_codec_err_t res_peek = decoder->PeekStream( + video->cxdata(), video->frame_size(), &stream_info); + HandlePeekResult(decoder, video, res_peek); + ASSERT_FALSE(::testing::Test::HasFailure()); + + vpx_codec_err_t res_dec = + decoder->DecodeFrame(video->cxdata(), video->frame_size()); + if (!HandleDecodeResult(res_dec, *video, decoder)) break; + } else { + // Signal end of the file to the decoder. + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + end_of_file = true; + } + + DxDataIterator dec_iter = decoder->GetDxData(); + const vpx_image_t *img = nullptr; + + // Get decompressed data + while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) { + DecompressedFrameHook(*img, video->frame_number()); + } + } + delete decoder; +} + +void DecoderTest::RunLoop(CompressedVideoSource *video) { + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); + RunLoop(video, dec_cfg); +} + +void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) { + memcpy(&cfg_, &dec_cfg, sizeof(cfg_)); +} + +void DecoderTest::set_flags(const vpx_codec_flags_t flags) { flags_ = flags; } + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/decode_test_driver.h b/media/libvpx/libvpx/test/decode_test_driver.h new file mode 100644 index 0000000000..f446ab4664 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_test_driver.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_DECODE_TEST_DRIVER_H_ +#define VPX_TEST_DECODE_TEST_DRIVER_H_ +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "vpx/vpx_decoder.h" + +namespace libvpx_test { + +class CodecFactory; +class CompressedVideoSource; + +// Provides an object to handle decoding output +class DxDataIterator { + public: + explicit DxDataIterator(vpx_codec_ctx_t *decoder) + : decoder_(decoder), iter_(nullptr) {} + + const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } + + private: + vpx_codec_ctx_t *decoder_; + vpx_codec_iter_t iter_; +}; + +// Provides a simplified interface to manage one video decoding. +// Similar to Encoder class, the exact services should be added +// as more tests are added. +class Decoder { + public: + explicit Decoder(vpx_codec_dec_cfg_t cfg) + : cfg_(cfg), flags_(0), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag) + : cfg_(cfg), flags_(flag), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + virtual ~Decoder() { vpx_codec_destroy(&decoder_); } + + vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size, + vpx_codec_stream_info_t *stream_info); + + vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size); + + vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv); + + DxDataIterator GetDxData() { return DxDataIterator(&decoder_); } + + void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, VPX_CODEC_OK); } + + void Control(int ctrl_id, const void *arg) { + InitOnce(); + const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); + } + + void Control(int ctrl_id, int arg, vpx_codec_err_t expected_value) { + InitOnce(); + const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg); + ASSERT_EQ(expected_value, res) << DecodeError(); + } + + const char *DecodeError() { + const char *detail = vpx_codec_error_detail(&decoder_); + return detail ? detail : vpx_codec_error(&decoder_); + } + + // Passes the external frame buffer information to libvpx. + vpx_codec_err_t SetFrameBufferFunctions( + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *user_priv) { + InitOnce(); + return vpx_codec_set_frame_buffer_functions(&decoder_, cb_get, cb_release, + user_priv); + } + + const char *GetDecoderName() const { + return vpx_codec_iface_name(CodecInterface()); + } + + bool IsVP8() const; + + vpx_codec_ctx_t *GetDecoder() { return &decoder_; } + + protected: + virtual vpx_codec_iface_t *CodecInterface() const = 0; + + void InitOnce() { + if (!init_done_) { + const vpx_codec_err_t res = + vpx_codec_dec_init(&decoder_, CodecInterface(), &cfg_, flags_); + ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError(); + init_done_ = true; + } + } + + vpx_codec_ctx_t decoder_; + vpx_codec_dec_cfg_t cfg_; + vpx_codec_flags_t flags_; + bool init_done_; +}; + +// Common test functionality for all Decoder tests. +class DecoderTest { + public: + // Main decoding loop + virtual void RunLoop(CompressedVideoSource *video); + virtual void RunLoop(CompressedVideoSource *video, + const vpx_codec_dec_cfg_t &dec_cfg); + + virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg); + virtual void set_flags(const vpx_codec_flags_t flags); + + // Hook to be called before decompressing every frame. + virtual void PreDecodeFrameHook(const CompressedVideoSource & /*video*/, + Decoder * /*decoder*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const CompressedVideoSource & /*video*/, + Decoder *decoder) { + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + return VPX_CODEC_OK == res_dec; + } + + // Hook to be called on every decompressed frame. + virtual void DecompressedFrameHook(const vpx_image_t & /*img*/, + const unsigned int /*frame_number*/) {} + + // Hook to be called on peek result + virtual void HandlePeekResult(Decoder *const decoder, + CompressedVideoSource *video, + const vpx_codec_err_t res_peek); + + protected: + explicit DecoderTest(const CodecFactory *codec) + : codec_(codec), cfg_(), flags_(0) {} + + virtual ~DecoderTest() {} + + const CodecFactory *codec_; + vpx_codec_dec_cfg_t cfg_; + vpx_codec_flags_t flags_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_DECODE_TEST_DRIVER_H_ diff --git a/media/libvpx/libvpx/test/decode_to_md5.sh b/media/libvpx/libvpx/test/decode_to_md5.sh new file mode 100755 index 0000000000..15eee39fac --- /dev/null +++ b/media/libvpx/libvpx/test/decode_to_md5.sh @@ -0,0 +1,73 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx decode_to_md5 example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_to_md5_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $VP8_IVF_FILE and $VP9_IVF_FILE are required. +decode_to_md5_verify_environment() { + if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is +# interpreted as codec name and used solely to name the output file. $3 is the +# expected md5 sum: It must match that of the final frame. +decode_to_md5() { + local decoder="${LIBVPX_BIN_PATH}/decode_to_md5${VPX_TEST_EXE_SUFFIX}" + local input_file="$1" + local codec="$2" + local expected_md5="$3" + local output_file="${VPX_TEST_OUTPUT_DIR}/decode_to_md5_${codec}" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 + + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" + [ "${actual_md5}" = "${expected_md5}" ] || return 1 +} + +decode_to_md5_vp8() { + # expected MD5 sum for the last frame. + local expected_md5="56794d911b02190212bca92f88ad60c6" + + if [ "$(vp8_decode_available)" = "yes" ]; then + decode_to_md5 "${VP8_IVF_FILE}" "vp8" "${expected_md5}" + fi +} + +decode_to_md5_vp9() { + # expected MD5 sum for the last frame. + local expected_md5="2952c0eae93f3dadd1aa84c50d3fd6d2" + + if [ "$(vp9_decode_available)" = "yes" ]; then + decode_to_md5 "${VP9_IVF_FILE}" "vp9" "${expected_md5}" + fi +} + +decode_to_md5_tests="decode_to_md5_vp8 + decode_to_md5_vp9" + +run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}" diff --git a/media/libvpx/libvpx/test/decode_with_drops.sh b/media/libvpx/libvpx/test/decode_with_drops.sh new file mode 100755 index 0000000000..2c826045b3 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_with_drops.sh @@ -0,0 +1,79 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx decode_with_drops example. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_with_drops_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $VP8_IVF_FILE and $VP9_IVF_FILE are required. +decode_with_drops_verify_environment() { + if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely +# to name the output file. $3 is the drop mode, and is passed directly to +# decode_with_drops. +decode_with_drops() { + local decoder="${LIBVPX_BIN_PATH}/decode_with_drops${VPX_TEST_EXE_SUFFIX}" + local input_file="$1" + local codec="$2" + local output_file="${VPX_TEST_OUTPUT_DIR}/decode_with_drops_${codec}" + local drop_mode="$3" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + "${drop_mode}" ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +# Decodes $VP8_IVF_FILE while dropping frames, twice: once in sequence mode, +# and once in pattern mode. +# Note: This test assumes that $VP8_IVF_FILE has exactly 29 frames, and could +# break if the file is modified. +decode_with_drops_vp8() { + if [ "$(vp8_decode_available)" = "yes" ]; then + # Test sequence mode: Drop frames 2-28. + decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28" || return 1 + + # Test pattern mode: Drop 3 of every 4 frames. + decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4" || return 1 + fi +} + +# Decodes $VP9_IVF_FILE while dropping frames, twice: once in sequence mode, +# and once in pattern mode. +# Note: This test assumes that $VP9_IVF_FILE has exactly 20 frames, and could +# break if the file is modified. +decode_with_drops_vp9() { + if [ "$(vp9_decode_available)" = "yes" ]; then + # Test sequence mode: Drop frames 2-28. + decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19" || return 1 + + # Test pattern mode: Drop 3 of every 4 frames. + decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4" || return 1 + fi +} + +decode_with_drops_tests="decode_with_drops_vp8 + decode_with_drops_vp9" + +run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}" diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc new file mode 100644 index 0000000000..508083673a --- /dev/null +++ b/media/libvpx/libvpx/test/encode_api_test.cc @@ -0,0 +1,949 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" + +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" +#include "vpx/vpx_tpl.h" + +namespace { + +vpx_codec_iface_t *kCodecIfaces[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif +}; + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +TEST(EncodeAPI, InvalidParams) { + uint8_t buf[1] = { 0 }; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf)); + + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init(nullptr, nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init(&enc, nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, nullptr, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, &img, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(nullptr)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, &cfg, 0)); + EXPECT_NE(vpx_codec_error(nullptr), nullptr); + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init(nullptr, iface, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init(&enc, iface, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(iface, &cfg, 1)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); + } +} + +TEST(EncodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_ENCODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_ENCODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). +TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +// A test that reproduces https://crbug.com/webm/1831. +TEST(EncodeAPI, RandomPixelsVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 2000; + cfg.g_w = 1280; + cfg.g_h = 720; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Generate random frame data and encode + uint8_t img[1280 * 720 * 3 / 2]; + libvpx_test::ACMRandom rng; + for (size_t i = 0; i < sizeof(img); ++i) { + img[i] = rng.Rand8(); + } + vpx_image_t img_wrapper; + ASSERT_EQ( + vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), + &img_wrapper); + ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} +#endif + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (const auto *iface : kCodecIfaces) { + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + +TEST(EncodeAPI, SetRoi) { + static struct { + vpx_codec_iface_t *iface; + int ctrl_id; + } kCodecs[] = { +#if CONFIG_VP8_ENCODER + { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP }, +#endif +#if CONFIG_VP9_ENCODER + { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP }, +#endif + }; + constexpr int kWidth = 64; + constexpr int kHeight = 64; + + for (const auto &codec : kCodecs) { + SCOPED_TRACE(vpx_codec_iface_name(codec.iface)); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK); + + vpx_roi_map_t roi = {}; + uint8_t roi_map[kWidth * kHeight] = {}; + if (IsVP9(codec.iface)) { + roi.rows = (cfg.g_w + 7) >> 3; + roi.cols = (cfg.g_h + 7) >> 3; + } else { + roi.rows = (cfg.g_w + 15) >> 4; + roi.cols = (cfg.g_h + 15) >> 4; + } + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + + roi.roi_map = roi_map; + // VP8 only. This value isn't range checked. + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = UINT_MAX / 2 + 1; + roi.static_threshold[3] = UINT_MAX; + + for (const auto delta : { -63, -1, 0, 1, 63 }) { + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + roi.delta_lf[i] = delta; + // VP9 only. + roi.skip[i] ^= 1; + roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + } + } + + vpx_codec_err_t expected_error; + for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) { + expected_error = VPX_CODEC_INVALID_PARAM; + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + // The max segment count for VP8 is 4, the remainder of the entries are + // ignored. + if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK; + + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_q[" << i << "]: " << delta; + roi.delta_q[i] = 0; + + roi.delta_lf[i] = delta; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_lf[" << i << "]: " << delta; + roi.delta_lf[i] = 0; + } + } + + // VP8 should ignore skip[] and ref_frame[] values. + expected_error = + IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK; + for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.skip[i] = skip; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "skip[" << i << "]: " << skip; + roi.skip[i] = 0; + } + } + + // VP9 allows negative values to be used to disable segmentation. + for (int ref_frame = -3; ref_frame < 0; ++ref_frame) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +void InitCodec(vpx_codec_iface_t &iface, int width, int height, + vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { + cfg->g_w = width; + cfg->g_h = height; + cfg->g_lag_in_frames = 0; + cfg->g_pass = VPX_RC_ONE_PASS; + ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK); +} + +// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration +// to |cfg|. +void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) { + libvpx_test::DummyVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.Begin(); + EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(enc); + + EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK) + << vpx_codec_error_detail(enc); +} + +TEST(EncodeAPI, ConfigChangeThreadCount) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 1000; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + +#if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; + constexpr auto *iface = &vpx_codec_vp9_cx_algo; + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // The following setting will cause avg_frame_bandwidth in rate control to be + // larger than INT_MAX + cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx + cfg.g_timebase.den = 1; + cfg.g_timebase.num = 10; + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) + << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " + << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; +} +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 + +vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { + vpx_image_t *image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + return image; +} + +// Emulates the WebCodecs VideoEncoder interface. +class VP9Encoder { + public: + explicit VP9Encoder(int speed) : speed_(speed) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + vpx_enc_deadline_t deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + vpx_enc_deadline_t deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + frame_index_++; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); + + // First step: encode, without forcing KF. + encoder.Encode(false); + // Second step: change config + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); + // Third step: encode, without forcing KF + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310477034. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, MultipleChangeConfigResize) { + VP9Encoder encoder(3); + + // Set initial config. + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config again. + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 3rd frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); + + // Encode 4th frame with same config, delta frame. + encoder.Encode(false); + + // Encode 5th frame with same config, key frame. + encoder.Encode(true); + + // Change config: change deadline to BEST. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); + + // Encode 6th frame with new config, set delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 7th frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 8th frame with new config, set key frame. + encoder.Encode(true); + + // Encode 9th frame with new config, set delta frame. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); +} + +// This is a test case from clusterfuzz: based on b/312517065. +TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(true); +} + +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311294795 +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311294795) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME); + // Encode more frames with new config. + encoder.Encode(false); + encoder.Encode(false); +} +#endif // CONFIG_VP9_ENCODER + +} // namespace diff --git a/media/libvpx/libvpx/test/encode_perf_test.cc b/media/libvpx/libvpx/test/encode_perf_test.cc new file mode 100644 index 0000000000..171ff8eeca --- /dev/null +++ b/media/libvpx/libvpx/test/encode_perf_test.cc @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "./vpx_version.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +const int kMaxPsnr = 100; +const double kUsecsInSec = 1000000.0; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484), + EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987), + EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718), + EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471), + EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", 640, 480, 200, + 300), + EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 }; +const int kEncodePerfTestThreads[] = { 1, 2, 4 }; + +#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) + +class VP9EncodePerfTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + VP9EncodePerfTest() + : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), + encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} + + ~VP9EncodePerfTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 1; + cfg_.g_threads = threads_; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + const int log2_tile_columns = 3; + encoder->Control(VP8E_SET_CPUUSED, speed_); + encoder->Control(VP9E_SET_TILE_COLUMNS, log2_tile_columns); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0); + } + } + + void BeginPassHook(unsigned int /*pass*/) override { + min_psnr_ = kMaxPsnr; + nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.psnr.psnr[0] < min_psnr_) { + min_psnr_ = pkt->data.psnr.psnr[0]; + } + } + + // for performance reasons don't decode + bool DoDecode() const override { return false; } + + double min_psnr() const { return min_psnr_; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + void set_threads(unsigned int threads) { threads_ = threads; } + + private: + double min_psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; + unsigned speed_; + unsigned int threads_; +}; + +TEST_P(VP9EncodePerfTest, PerfTest) { + for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) { + for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) { + for (size_t k = 0; k < NELEMENTS(kEncodePerfTestThreads); ++k) { + if (kVP9EncodePerfTestVectors[i].width < 512 && + kEncodePerfTestThreads[k] > 1) { + continue; + } else if (kVP9EncodePerfTestVectors[i].width < 1024 && + kEncodePerfTestThreads[k] > 2) { + continue; + } + + set_threads(kEncodePerfTestThreads[k]); + SetUp(); + + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + const unsigned frames = kVP9EncodePerfTestVectors[i].frames; + const char *video_name = kVP9EncodePerfTestVectors[i].name; + libvpx_test::I420VideoSource video( + video_name, kVP9EncodePerfTestVectors[i].width, + kVP9EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0, + kVP9EncodePerfTestVectors[i].frames); + set_speed(kEncodePerfTestSpeeds[j]); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + vpx_usec_timer_mark(&t); + const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec; + const double fps = frames / elapsed_secs; + const double minimum_psnr = min_psnr(); + std::string display_name(video_name); + if (kEncodePerfTestThreads[k] > 1) { + char thread_count[32]; + snprintf(thread_count, sizeof(thread_count), "_t-%d", + kEncodePerfTestThreads[k]); + display_name += thread_count; + } + + printf("{\n"); + printf("\t\"type\" : \"encode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", display_name.c_str()); + printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f,\n", fps); + printf("\t\"minPsnr\" : %f,\n", minimum_psnr); + printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]); + printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]); + printf("}\n"); + } + } + } +} + +VP9_INSTANTIATE_TEST_SUITE(VP9EncodePerfTest, + ::testing::Values(::libvpx_test::kRealTime)); +} // namespace diff --git a/media/libvpx/libvpx/test/encode_test_driver.cc b/media/libvpx/libvpx/test/encode_test_driver.cc new file mode 100644 index 0000000000..d3feeee34d --- /dev/null +++ b/media/libvpx/libvpx/test/encode_test_driver.cc @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libvpx_test { +void Encoder::InitEncoder(VideoSource *video) { + vpx_codec_err_t res; + const vpx_image_t *img = video->img(); + + if (video->img() && !encoder_.priv) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + cfg_.g_timebase = video->timebase(); + cfg_.rc_twopass_stats_in = stats_->buf(); + + res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + +#if CONFIG_VP9_ENCODER + if (CodecInterface() == &vpx_codec_vp9_cx_algo) { + // Default to 1 tile column for VP9. + const int log2_tile_columns = 0; + res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS, + log2_tile_columns); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } else +#endif + { +#if CONFIG_VP8_ENCODER + ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface()) + << "Unknown Codec Interface"; +#endif + } + } +} + +void Encoder::EncodeFrame(VideoSource *video, + const vpx_enc_frame_flags_t frame_flags) { + if (video->img()) { + EncodeFrameInternal(*video, frame_flags); + } else { + Flush(); + } + + // Handle twopass stats + CxDataIterator iter = GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != VPX_CODEC_STATS_PKT) continue; + + stats_->Append(*pkt); + } +} + +void Encoder::EncodeFrameInternal(const VideoSource &video, + const vpx_enc_frame_flags_t frame_flags) { + vpx_codec_err_t res; + const vpx_image_t *img = video.img(); + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = vpx_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + // Encode the frame + API_REGISTER_STATE_CHECK(res = vpx_codec_encode(&encoder_, img, video.pts(), + video.duration(), frame_flags, + deadline_)); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); +} + +void Encoder::Flush() { + const vpx_codec_err_t res = + vpx_codec_encode(&encoder_, nullptr, 0, 0, 0, deadline_); + if (!encoder_.priv) + ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError(); + else + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); +} + +void EncoderTest::InitializeConfig() { + const vpx_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0); + dec_cfg_ = vpx_codec_dec_cfg_t(); + ASSERT_EQ(VPX_CODEC_OK, res); +} + +void EncoderTest::SetMode(TestMode mode) { + switch (mode) { + case kRealTime: deadline_ = VPX_DL_REALTIME; break; + + case kOnePassGood: + case kTwoPassGood: deadline_ = VPX_DL_GOOD_QUALITY; break; + + case kOnePassBest: + case kTwoPassBest: deadline_ = VPX_DL_BEST_QUALITY; break; + + default: ASSERT_TRUE(false) << "Unexpected mode " << mode; + } + + if (mode == kTwoPassGood || mode == kTwoPassBest) { + passes_ = 2; + } else { + passes_ = 1; + } +} +// The function should return "true" most of the time, therefore no early +// break-out is implemented within the match checking process. +static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) { + bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) && + (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h); + + if (!match) return false; + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i; + for (i = 0; i < height_y; ++i) { + match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + width_y) == 0) && + match; + } + const unsigned int width_uv = (img1->d_w + 1) >> 1; + const unsigned int height_uv = (img1->d_h + 1) >> 1; + for (i = 0; i < height_uv; ++i) { + match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + width_uv) == 0) && + match; + } + for (i = 0; i < height_uv; ++i) { + match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + width_uv) == 0) && + match; + } + return match; +} + +void EncoderTest::MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + ASSERT_TRUE(0) << "Encode/Decode mismatch found"; +} + +void EncoderTest::RunLoop(VideoSource *video) { + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); + + stats_.Reset(); + + ASSERT_TRUE(passes_ == 1 || passes_ == 2); + for (unsigned int pass = 0; pass < passes_; pass++) { + vpx_codec_pts_t last_pts = 0; + + if (passes_ == 1) { + cfg_.g_pass = VPX_RC_ONE_PASS; + } else if (pass == 0) { + cfg_.g_pass = VPX_RC_FIRST_PASS; + } else { + cfg_.g_pass = VPX_RC_LAST_PASS; + } + + BeginPassHook(pass); + std::unique_ptr encoder( + codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_)); + ASSERT_NE(encoder.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(video->Begin()); + encoder->InitEncoder(video); + ASSERT_FALSE(::testing::Test::HasFatalFailure()); + + unsigned long dec_init_flags = 0; // NOLINT + // Use fragment decoder if encoder outputs partitions. + // NOTE: fragment decoder and partition encoder are only supported by VP8. + if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) { + dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS; + } + std::unique_ptr decoder( + codec_->CreateDecoder(dec_cfg, dec_init_flags)); + bool again; + for (again = true; again; video->Next()) { + again = (video->img() != nullptr); + + PreEncodeFrameHook(video); + PreEncodeFrameHook(video, encoder.get()); + encoder->EncodeFrame(video, frame_flags_); + + PostEncodeFrameHook(encoder.get()); + + CxDataIterator iter = encoder->GetCxData(); + + bool has_cxdata = false; + bool has_dxdata = false; + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + pkt = MutateEncoderOutputHook(pkt); + again = true; + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: + has_cxdata = true; + if (decoder != nullptr && DoDecode()) { + PreDecodeFrameHook(video, decoder.get()); + vpx_codec_err_t res_dec = decoder->DecodeFrame( + (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); + + if (!HandleDecodeResult(res_dec, *video, decoder.get())) break; + + has_dxdata = true; + } + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + + default: break; + } + } + + // Flush the decoder when there are no more fragments. + if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) { + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); + if (!HandleDecodeResult(res_dec, *video, decoder.get())) break; + } + + if (has_dxdata && has_cxdata) { + const vpx_image_t *img_enc = encoder->GetPreviewFrame(); + DxDataIterator dec_iter = decoder->GetDxData(); + const vpx_image_t *img_dec = dec_iter.Next(); + if (img_enc && img_dec) { + const bool res = compare_img(img_enc, img_dec); + if (!res) { // Mismatch + MismatchHook(img_enc, img_dec); + } + } + if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); + } + if (!Continue()) break; + } + + EndPassHook(); + + if (!Continue()) break; + } +} + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/encode_test_driver.h b/media/libvpx/libvpx/test/encode_test_driver.h new file mode 100644 index 0000000000..7dd80d6664 --- /dev/null +++ b/media/libvpx/libvpx/test/encode_test_driver.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_ENCODE_TEST_DRIVER_H_ +#define VPX_TEST_ENCODE_TEST_DRIVER_H_ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER +#include "vpx/vp8cx.h" +#endif +#include "vpx/vpx_tpl.h" + +namespace libvpx_test { + +class CodecFactory; +class VideoSource; + +enum TestMode { + kRealTime, + kOnePassGood, + kOnePassBest, + kTwoPassGood, + kTwoPassBest +}; +#define ALL_TEST_MODES \ + ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood, \ + ::libvpx_test::kOnePassBest, ::libvpx_test::kTwoPassGood, \ + ::libvpx_test::kTwoPassBest) + +#define ONE_PASS_TEST_MODES \ + ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood, \ + ::libvpx_test::kOnePassBest) + +#define TWO_PASS_TEST_MODES \ + ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kTwoPassBest) + +// Provides an object to handle the libvpx get_cx_data() iteration pattern +class CxDataIterator { + public: + explicit CxDataIterator(vpx_codec_ctx_t *encoder) + : encoder_(encoder), iter_(nullptr) {} + + const vpx_codec_cx_pkt_t *Next() { + return vpx_codec_get_cx_data(encoder_, &iter_); + } + + private: + vpx_codec_ctx_t *encoder_; + vpx_codec_iter_t iter_; +}; + +// Implements an in-memory store for libvpx twopass statistics +class TwopassStatsStore { + public: + void Append(const vpx_codec_cx_pkt_t &pkt) { + buffer_.append(reinterpret_cast(pkt.data.twopass_stats.buf), + pkt.data.twopass_stats.sz); + } + + vpx_fixed_buf_t buf() { + const vpx_fixed_buf_t buf = { &buffer_[0], buffer_.size() }; + return buf; + } + + void Reset() { buffer_.clear(); } + + protected: + std::string buffer_; +}; + +// Provides a simplified interface to manage one video encoding pass, given +// a configuration and video source. +// +// TODO(jkoleszar): The exact services it provides and the appropriate +// level of abstraction will be fleshed out as more tests are written. +class Encoder { + public: + Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, TwopassStatsStore *stats) + : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) { + memset(&encoder_, 0, sizeof(encoder_)); + } + + virtual ~Encoder() { vpx_codec_destroy(&encoder_); } + + CxDataIterator GetCxData() { return CxDataIterator(&encoder_); } + + void InitEncoder(VideoSource *video); + + const vpx_image_t *GetPreviewFrame() { + return vpx_codec_get_preview_frame(&encoder_); + } + // This is a thin wrapper around vpx_codec_encode(), so refer to + // vpx_encoder.h for its semantics. + void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags); + + // Convenience wrapper for EncodeFrame() + void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } + + void Control(int ctrl_id, int arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, int *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_scaling_mode *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_layer_id *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_parameters *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_spatial_layer_sync *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + +#if CONFIG_VP9_ENCODER + void Control(int ctrl_id, vpx_rc_funcs_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, VpxTplGopStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif // CONFIG_VP9_ENCODER + +#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER + void Control(int ctrl_id, vpx_active_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { + const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + cfg_ = *cfg; + } + + void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; } + + protected: + virtual vpx_codec_iface_t *CodecInterface() const = 0; + + const char *EncoderError() { + const char *detail = vpx_codec_error_detail(&encoder_); + return detail ? detail : vpx_codec_error(&encoder_); + } + + // Encode an image + void EncodeFrameInternal(const VideoSource &video, + vpx_enc_frame_flags_t frame_flags); + + // Flush the encoder on EOS + void Flush(); + + vpx_codec_ctx_t encoder_; + vpx_codec_enc_cfg_t cfg_; + vpx_enc_deadline_t deadline_; + unsigned long init_flags_; + TwopassStatsStore *stats_; +}; + +// Common test functionality for all Encoder tests. +// +// This class is a mixin which provides the main loop common to all +// encoder tests. It provides hooks which can be overridden by subclasses +// to implement each test's specific behavior, while centralizing the bulk +// of the boilerplate. Note that it doesn't inherit the gtest testing +// classes directly, so that tests can be parameterized differently. +class EncoderTest { + protected: + explicit EncoderTest(const CodecFactory *codec) + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) { + // Default to 1 thread. + cfg_.g_threads = 1; + } + + virtual ~EncoderTest() {} + + // Initialize the cfg_ member with the default configuration. + void InitializeConfig(); + + // Map the TestMode enum to the deadline_ and passes_ variables. + void SetMode(TestMode mode); + + // Set encoder flag. + void set_init_flags(unsigned long flag) { // NOLINT(runtime/int) + init_flags_ = flag; + } + + // Main loop + virtual void RunLoop(VideoSource *video); + + // Hook to be called at the beginning of a pass. + virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Hook to be called at the end of a pass. + virtual void EndPassHook() {} + + // Hook to be called before encoding a frame. + virtual void PreEncodeFrameHook(VideoSource * /*video*/) {} + virtual void PreEncodeFrameHook(VideoSource * /*video*/, + Encoder * /*encoder*/) {} + + virtual void PreDecodeFrameHook(VideoSource * /*video*/, + Decoder * /*decoder*/) {} + + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + + // Hook to be called on every compressed data packet. + virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to be called on every PSNR packet. + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to be called on every first pass stats packet. + virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to determine whether the encode loop should continue. + virtual bool Continue() const { + return !(::testing::Test::HasFatalFailure() || abort_); + } + + const CodecFactory *codec_; + // Hook to determine whether to decode frame after encoding + virtual bool DoDecode() const { return true; } + + // Hook to handle encode/decode mismatch + virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); + + // Hook to be called on every decompressed frame. + virtual void DecompressedFrameHook(const vpx_image_t & /*img*/, + vpx_codec_pts_t /*pts*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const VideoSource & /*video*/, + Decoder *decoder) { + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + return VPX_CODEC_OK == res_dec; + } + + // Hook that can modify the encoder's output data + virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) { + return pkt; + } + + bool abort_; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_dec_cfg_t dec_cfg_; + unsigned int passes_; + vpx_enc_deadline_t deadline_; + TwopassStatsStore stats_; + unsigned long init_flags_; + vpx_enc_frame_flags_t frame_flags_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_ENCODE_TEST_DRIVER_H_ diff --git a/media/libvpx/libvpx/test/error_resilience_test.cc b/media/libvpx/libvpx/test/error_resilience_test.cc new file mode 100644 index 0000000000..6b019b2bfb --- /dev/null +++ b/media/libvpx/libvpx/test/error_resilience_test.cc @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kMaxErrorFrames = 12; +const int kMaxDroppableFrames = 12; + +class ErrorResilienceTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + ErrorResilienceTestLarge() + : EncoderTest(GET_PARAM(0)), svc_support_(GET_PARAM(2)), psnr_(0.0), + nframes_(0), mismatch_psnr_(0.0), mismatch_nframes_(0), + encoding_mode_(GET_PARAM(1)) { + Reset(); + } + + ~ErrorResilienceTestLarge() override = default; + + void Reset() { + error_nframes_ = 0; + droppable_nframes_ = 0; + pattern_switch_ = 0; + } + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + } + + void BeginPassHook(unsigned int /*pass*/) override { + psnr_ = 0.0; + nframes_ = 0; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + // + // Frame flags and layer id for temporal layers. + // For two layers, test pattern is: + // 1 3 + // 0 2 ..... + // LAST is updated on base/layer 0, GOLDEN updated on layer 1. + // Non-zero pattern_switch parameter means pattern will switch to + // not using LAST for frame_num >= pattern_switch. + int SetFrameFlags(int frame_num, int num_temp_layers, int pattern_switch) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 0: predict from LAST and ARF, update LAST. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 0: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } else { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 1: predict from L, GF, and ARF, update GF. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + } else { + // Layer 1: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } + } + return frame_flags; + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video) override { + frame_flags_ &= + ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); + // For temporal layer case. + if (cfg_.ts_number_layers > 1) { + frame_flags_ = + SetFrameFlags(video->frame(), cfg_.ts_number_layers, pattern_switch_); + for (unsigned int i = 0; i < droppable_nframes_; ++i) { + if (droppable_frames_[i] == video->frame()) { + std::cout << "Encoding droppable frame: " << droppable_frames_[i] + << "\n"; + } + } + } else { + if (droppable_nframes_ > 0 && + (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { + for (unsigned int i = 0; i < droppable_nframes_; ++i) { + if (droppable_frames_[i] == video->frame()) { + std::cout << "Encoding droppable frame: " << droppable_frames_[i] + << "\n"; + frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF); + return; + } + } + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetAverageMismatchPsnr() const { + if (mismatch_nframes_) return mismatch_psnr_ / mismatch_nframes_; + return 0.0; + } + + bool DoDecode() const override { + if (error_nframes_ > 0 && + (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { + for (unsigned int i = 0; i < error_nframes_; ++i) { + if (error_frames_[i] == nframes_ - 1) { + std::cout << " Skipping decoding frame: " + << error_frames_[i] << "\n"; + return false; + } + } + } + return true; + } + + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n"; + } + + void SetErrorFrames(int num, unsigned int *list) { + if (num > kMaxErrorFrames) { + num = kMaxErrorFrames; + } else if (num < 0) { + num = 0; + } + error_nframes_ = num; + for (unsigned int i = 0; i < error_nframes_; ++i) { + error_frames_[i] = list[i]; + } + } + + void SetDroppableFrames(int num, unsigned int *list) { + if (num > kMaxDroppableFrames) { + num = kMaxDroppableFrames; + } else if (num < 0) { + num = 0; + } + droppable_nframes_ = num; + for (unsigned int i = 0; i < droppable_nframes_; ++i) { + droppable_frames_[i] = list[i]; + } + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + + void SetPatternSwitch(int frame_switch) { pattern_switch_ = frame_switch; } + + bool svc_support_; + + private: + double psnr_; + unsigned int nframes_; + unsigned int error_nframes_; + unsigned int droppable_nframes_; + unsigned int pattern_switch_; + double mismatch_psnr_; + unsigned int mismatch_nframes_; + unsigned int error_frames_[kMaxErrorFrames]; + unsigned int droppable_frames_[kMaxDroppableFrames]; + libvpx_test::TestMode encoding_mode_; +}; + +TEST_P(ErrorResilienceTestLarge, OnVersusOff) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 10; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 30); + + // Error resilient mode OFF. + cfg_.g_error_resilient = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_resilience_off = GetAveragePsnr(); + EXPECT_GT(psnr_resilience_off, 25.0); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_resilience_on = GetAveragePsnr(); + EXPECT_GT(psnr_resilience_on, 25.0); + + // Test that turning on error resilient mode hurts by 10% at most. + if (psnr_resilience_off > 0.0) { + const double psnr_ratio = psnr_resilience_on / psnr_resilience_off; + EXPECT_GE(psnr_ratio, 0.9); + EXPECT_LE(psnr_ratio, 1.1); + } +} + +// Check for successful decoding and no encoder/decoder mismatch +// if we lose (i.e., drop before decoding) a set of droppable +// frames (i.e., frames that don't update any reference buffers). +// Check both isolated and consecutive loss. +TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + // FIXME(debargha): Fix this to work for any lag. + // Currently this test only works for lag = 0 + cfg_.g_lag_in_frames = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 40); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + cfg_.kf_mode = VPX_KF_DISABLED; + + // Set an arbitrary set of error frames same as droppable frames. + // In addition to isolated loss/drop, add a long consecutive series + // (of size 9) of dropped frames. + unsigned int num_droppable_frames = 11; + unsigned int droppable_frame_list[] = { 5, 16, 22, 23, 24, 25, + 26, 27, 28, 29, 30 }; + SetDroppableFrames(num_droppable_frames, droppable_frame_list); + SetErrorFrames(num_droppable_frames, droppable_frame_list); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Test that no mismatches have been found + std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n"; + EXPECT_EQ(GetMismatchFrames(), (unsigned int)0); + + // Reset previously set of error/droppable frames. + Reset(); + +#if 0 + // TODO(jkoleszar): This test is disabled for the time being as too + // sensitive. It's not clear how to set a reasonable threshold for + // this behavior. + + // Now set an arbitrary set of error frames that are non-droppable + unsigned int num_error_frames = 3; + unsigned int error_frame_list[] = {3, 10, 20}; + SetErrorFrames(num_error_frames, error_frame_list); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Test that dropping an arbitrary set of inter frames does not hurt too much + // Note the Average Mismatch PSNR is the average of the PSNR between + // decoded frame and encoder's version of the same frame for all frames + // with mismatch. + const double psnr_resilience_mismatch = GetAverageMismatchPsnr(); + std::cout << " Mismatch PSNR: " + << psnr_resilience_mismatch << "\n"; + EXPECT_GT(psnr_resilience_mismatch, 20.0); +#endif +} + +// Check for successful decoding and no encoder/decoder mismatch +// if we lose (i.e., drop before decoding) the enhancement layer frames for a +// two layer temporal pattern. The base layer does not predict from the top +// layer, so successful decoding is expected. +TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) { + // This test doesn't run if SVC is not supported. + if (!svc_support_) return; + + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 0; + + cfg_.rc_end_usage = VPX_CBR; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 40); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(0); + + // The odd frames are the enhancement layer for 2 layer pattern, so set + // those frames as droppable. Drop the last 7 frames. + unsigned int num_droppable_frames = 7; + unsigned int droppable_frame_list[] = { 27, 29, 31, 33, 35, 37, 39 }; + SetDroppableFrames(num_droppable_frames, droppable_frame_list); + SetErrorFrames(num_droppable_frames, droppable_frame_list); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Test that no mismatches have been found + std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n"; + EXPECT_EQ(GetMismatchFrames(), (unsigned int)0); + + // Reset previously set of error/droppable frames. + Reset(); +} + +// Check for successful decoding and no encoder/decoder mismatch +// for a two layer temporal pattern, where at some point in the +// sequence, the LAST ref is not used anymore. +TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) { + // This test doesn't run if SVC is not supported. + if (!svc_support_) return; + + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 0; + + cfg_.rc_end_usage = VPX_CBR; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 100); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(60); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Test that no mismatches have been found + std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n"; + EXPECT_EQ(GetMismatchFrames(), (unsigned int)0); + + // Reset previously set of error/droppable frames. + Reset(); +} + +class ErrorResilienceTestLargeCodecControls + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ErrorResilienceTestLargeCodecControls() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) { + Reset(); + } + + ~ErrorResilienceTestLargeCodecControls() override = default; + + void Reset() { + last_pts_ = 0; + tot_frame_number_ = 0; + // For testing up to 3 layers. + for (int i = 0; i < 3; ++i) { + bits_total_[i] = 0; + } + duration_ = 0.0; + } + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + } + + // + // Frame flags and layer id for temporal layers. + // + + // For two layers, test pattern is: + // 1 3 + // 0 2 ..... + // For three layers, test pattern is: + // 1 3 5 7 + // 2 6 + // 0 4 .... + // LAST is always update on base/layer 0, GOLDEN is updated on layer 1, + // and ALTREF is updated on top layer for 3 layer pattern. + int SetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L, update L. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, update G. + frame_flags = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARG. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (cfg_.ts_number_layers > 1) { + int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); + int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers); + if (video->frame() > 0) { + encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id); + encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags); + } + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + return; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + if (duration > 1) { + // Update counter for total number of frames (#frames input to encoder). + // Needed for setting the proper layer_id below. + tot_frame_number_ += static_cast(duration - 1); + } + int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); + const size_t frame_size_in_bits = pkt->data.frame.sz * 8; + // Update the total encoded bits. For temporal layers, update the cumulative + // encoded bits per layer. + for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { + bits_total_[i] += frame_size_in_bits; + } + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + ++tot_frame_number_; + } + + void EndPassHook() override { + duration_ = (last_pts_ + 1) * timebase_; + if (cfg_.ts_number_layers > 1) { + for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); + ++layer) { + if (bits_total_[layer]) { + // Effective file datarate: + effective_datarate_[layer] = + (bits_total_[layer] / 1000.0) / duration_; + } + } + } + } + + double effective_datarate_[3]; + + private: + libvpx_test::TestMode encoding_mode_; + vpx_codec_pts_t last_pts_; + double timebase_; + int64_t bits_total_[3]; + double duration_; + int tot_frame_number_; +}; + +// Check two codec controls used for: +// (1) for setting temporal layer id, and (2) for settings encoder flags. +// This test invokes those controls for each frame, and verifies encoder/decoder +// mismatch and basic rate control response. +// TODO(marpan): Maybe move this test to datarate_test.cc. +TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_dropframe_thresh = 1; + cfg_.g_lag_in_frames = 0; + cfg_.kf_mode = VPX_KF_DISABLED; + cfg_.g_error_resilient = 1; + + // 3 Temporal layers. Framerate decimation (4, 2, 1). + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_periodicity = 4; + cfg_.ts_layer_id[0] = 0; + cfg_.ts_layer_id[1] = 2; + cfg_.ts_layer_id[2] = 1; + cfg_.ts_layer_id[3] = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 200); + for (int i = 200; i <= 800; i += 200) { + cfg_.rc_target_bitrate = i; + Reset(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } + } +} + +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLargeCodecControls, + ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); +} // namespace diff --git a/media/libvpx/libvpx/test/examples.sh b/media/libvpx/libvpx/test/examples.sh new file mode 100755 index 0000000000..629f04239c --- /dev/null +++ b/media/libvpx/libvpx/test/examples.sh @@ -0,0 +1,29 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file runs all of the tests for the libvpx examples. +## +. $(dirname $0)/tools_common.sh + +example_tests=$(ls $(dirname $0)/*.sh) + +# List of script names to exclude. +exclude_list="examples stress tools_common" + +# Filter out the scripts in $exclude_list. +for word in ${exclude_list}; do + example_tests=$(filter_strings "${example_tests}" "${word}" exclude) +done + +for test in ${example_tests}; do + # Source each test script so that exporting variables can be avoided. + VPX_TEST_NAME="$(basename ${test%.*})" + . "${test}" +done diff --git a/media/libvpx/libvpx/test/external_frame_buffer_test.cc b/media/libvpx/libvpx/test/external_frame_buffer_test.cc new file mode 100644 index 0000000000..7b9a836fbc --- /dev/null +++ b/media/libvpx/libvpx/test/external_frame_buffer_test.cc @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/test_vectors.h" +#include "test/util.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif + +namespace { + +const int kVideoNameParam = 1; + +struct ExternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +}; + +// Class to manipulate a list of external frame buffers. +class ExternalFrameBufferList { + public: + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {} + + virtual ~ExternalFrameBufferList() { + for (int i = 0; i < num_buffers_; ++i) { + delete[] ext_fb_list_[i].data; + } + delete[] ext_fb_list_; + } + + // Creates the list to hold the external buffers. Returns true on success. + bool CreateBufferList(int num_buffers) { + if (num_buffers < 0) return false; + + num_buffers_ = num_buffers; + ext_fb_list_ = new ExternalFrameBuffer[num_buffers_]; + EXPECT_NE(ext_fb_list_, nullptr); + memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_); + return true; + } + + // Searches the frame buffer list for a free frame buffer. Makes sure + // that the frame buffer is at least |min_size| in bytes. Marks that the + // frame buffer is in use by libvpx. Finally sets |fb| to point to the + // external frame buffer. Returns < 0 on an error. + int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { + EXPECT_NE(fb, nullptr); + const int idx = FindFreeBufferIndex(); + if (idx == num_buffers_) return -1; + + if (ext_fb_list_[idx].size < min_size) { + delete[] ext_fb_list_[idx].data; + ext_fb_list_[idx].data = new uint8_t[min_size]; + memset(ext_fb_list_[idx].data, 0, min_size); + ext_fb_list_[idx].size = min_size; + } + + SetFrameBuffer(idx, fb); + + num_used_buffers_++; + return 0; + } + + // Test function that will not allocate any data for the frame buffer. + // Returns < 0 on an error. + int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { + EXPECT_NE(fb, nullptr); + const int idx = FindFreeBufferIndex(); + if (idx == num_buffers_) return -1; + + if (ext_fb_list_[idx].size < min_size) { + delete[] ext_fb_list_[idx].data; + ext_fb_list_[idx].data = nullptr; + ext_fb_list_[idx].size = min_size; + } + + SetFrameBuffer(idx, fb); + return 0; + } + + // Marks the external frame buffer that |fb| is pointing to as free. + // Returns < 0 on an error. + int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) { + if (fb == nullptr) { + EXPECT_NE(fb, nullptr); + return -1; + } + ExternalFrameBuffer *const ext_fb = + reinterpret_cast(fb->priv); + if (ext_fb == nullptr) { + EXPECT_NE(ext_fb, nullptr); + return -1; + } + EXPECT_EQ(1, ext_fb->in_use); + ext_fb->in_use = 0; + num_used_buffers_--; + return 0; + } + + // Checks that the vpx_image_t data is contained within the external frame + // buffer private data passed back in the vpx_image_t. + void CheckImageFrameBuffer(const vpx_image_t *img) { + if (img->fb_priv != nullptr) { + const struct ExternalFrameBuffer *const ext_fb = + reinterpret_cast(img->fb_priv); + + ASSERT_TRUE(img->planes[0] >= ext_fb->data && + img->planes[0] < (ext_fb->data + ext_fb->size)); + } + } + + int num_used_buffers() const { return num_used_buffers_; } + + private: + // Returns the index of the first free frame buffer. Returns |num_buffers_| + // if there are no free frame buffers. + int FindFreeBufferIndex() { + int i; + // Find a free frame buffer. + for (i = 0; i < num_buffers_; ++i) { + if (!ext_fb_list_[i].in_use) break; + } + return i; + } + + // Sets |fb| to an external frame buffer. idx is the index into the frame + // buffer list. + void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) { + ASSERT_NE(fb, nullptr); + fb->data = ext_fb_list_[idx].data; + fb->size = ext_fb_list_[idx].size; + ASSERT_EQ(0, ext_fb_list_[idx].in_use); + ext_fb_list_[idx].in_use = 1; + fb->priv = &ext_fb_list_[idx]; + } + + int num_buffers_; + int num_used_buffers_; + ExternalFrameBuffer *ext_fb_list_; +}; + +#if CONFIG_WEBM_IO + +// Callback used by libvpx to request the application to return a frame +// buffer of at least |min_size| in bytes. +int get_vp9_frame_buffer(void *user_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferList *const fb_list = + reinterpret_cast(user_priv); + return fb_list->GetFreeFrameBuffer(min_size, fb); +} + +// Callback used by libvpx to tell the application that |fb| is not needed +// anymore. +int release_vp9_frame_buffer(void *user_priv, vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferList *const fb_list = + reinterpret_cast(user_priv); + return fb_list->ReturnFrameBuffer(fb); +} + +// Callback will not allocate data for frame buffer. +int get_vp9_zero_frame_buffer(void *user_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferList *const fb_list = + reinterpret_cast(user_priv); + return fb_list->GetZeroFrameBuffer(min_size, fb); +} + +// Callback will allocate one less byte than |min_size|. +int get_vp9_one_less_byte_frame_buffer(void *user_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferList *const fb_list = + reinterpret_cast(user_priv); + return fb_list->GetFreeFrameBuffer(min_size - 1, fb); +} + +// Callback will not release the external frame buffer. +int do_not_release_vp9_frame_buffer(void *user_priv, + vpx_codec_frame_buffer_t *fb) { + (void)user_priv; + (void)fb; + return 0; +} + +#endif // CONFIG_WEBM_IO + +// Class for testing passing in external frame buffers to libvpx. +class ExternalFrameBufferMD5Test + : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExternalFrameBufferMD5Test() + : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), + md5_file_(nullptr), num_buffers_(0) {} + + ~ExternalFrameBufferMD5Test() override { + if (md5_file_ != nullptr) fclose(md5_file_); + } + + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + if (num_buffers_ > 0 && video.frame_number() == 0) { + // Have libvpx use frame buffers we create. + ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); + ASSERT_EQ(VPX_CODEC_OK, + decoder->SetFrameBufferFunctions(GetVP9FrameBuffer, + ReleaseVP9FrameBuffer, this)); + } + } + + void OpenMD5File(const std::string &md5_file_name_) { + md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); + ASSERT_NE(md5_file_, nullptr) + << "Md5 file open failed. Filename: " << md5_file_name_; + } + + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { + ASSERT_NE(md5_file_, nullptr); + char expected_md5[33]; + char junk[128]; + + // Read correct md5 checksums. + const int res = fscanf(md5_file_, "%s %s", expected_md5, junk); + ASSERT_NE(EOF, res) << "Read md5 data failed"; + expected_md5[32] = '\0'; + + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + const char *const actual_md5 = md5_res.Get(); + + // Check md5 match. + ASSERT_STREQ(expected_md5, actual_md5) + << "Md5 checksums don't match: frame number = " << frame_number; + } + + // Callback to get a free external frame buffer. Return value < 0 is an + // error. + static int GetVP9FrameBuffer(void *user_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferMD5Test *const md5Test = + reinterpret_cast(user_priv); + return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb); + } + + // Callback to release an external frame buffer. Return value < 0 is an + // error. + static int ReleaseVP9FrameBuffer(void *user_priv, + vpx_codec_frame_buffer_t *fb) { + ExternalFrameBufferMD5Test *const md5Test = + reinterpret_cast(user_priv); + return md5Test->fb_list_.ReturnFrameBuffer(fb); + } + + void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; } + int num_buffers() const { return num_buffers_; } + + private: + FILE *md5_file_; + int num_buffers_; + ExternalFrameBufferList fb_list_; +}; + +#if CONFIG_WEBM_IO +const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; + +// Class for testing passing in external frame buffers to libvpx. +class ExternalFrameBufferTest : public ::testing::Test { + protected: + ExternalFrameBufferTest() + : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} + + void SetUp() override { + video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); + ASSERT_NE(video_, nullptr); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_NE(decoder_, nullptr); + } + + void TearDown() override { + delete decoder_; + decoder_ = nullptr; + delete video_; + video_ = nullptr; + } + + // Passes the external frame buffer information to libvpx. + vpx_codec_err_t SetFrameBufferFunctions( + int num_buffers, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release) { + if (num_buffers > 0) { + num_buffers_ = num_buffers; + EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_)); + } + + return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_); + } + + vpx_codec_err_t DecodeOneFrame() { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + CheckDecodedFrames(); + if (res == VPX_CODEC_OK) video_->Next(); + return res; + } + + vpx_codec_err_t DecodeRemainingFrames() { + for (; video_->cxdata() != nullptr; video_->Next()) { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + if (res != VPX_CODEC_OK) return res; + CheckDecodedFrames(); + } + return VPX_CODEC_OK; + } + + void CheckDecodedFrames() { + libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); + const vpx_image_t *img = nullptr; + + // Get decompressed data + while ((img = dec_iter.Next()) != nullptr) { + fb_list_.CheckImageFrameBuffer(img); + } + } + + libvpx_test::WebMVideoSource *video_; + libvpx_test::VP9Decoder *decoder_; + int num_buffers_; + ExternalFrameBufferList fb_list_; +}; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + void SetUp() override { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_NE(video_, nullptr); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_NE(decoder_, nullptr); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; +#endif // CONFIG_WEBM_IO + +// This test runs through the set of test vectors, and decodes them. +// Libvpx will call into the application to allocate a frame buffer when +// needed. The md5 checksums are computed for each frame in the video file. +// If md5 checksums match the correct md5 data, then the test is passed. +// Otherwise, the test failed. +TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { + const std::string filename = GET_PARAM(kVideoNameParam); + + // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS + + // #VPX_MAXIMUM_WORK_BUFFERS + four jitter buffers. + const int jitter_buffers = 4; + const int num_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers; + set_num_buffers(num_buffers); + +#if CONFIG_VP8_DECODER + // Tell compiler we are not using kVP8TestVectors. + (void)libvpx_test::kVP8TestVectors; +#endif + + // Open compressed video file. + std::unique_ptr video; + if (filename.substr(filename.length() - 3, 3) == "ivf") { + video.reset(new libvpx_test::IVFVideoSource(filename)); + } else { +#if CONFIG_WEBM_IO + video.reset(new libvpx_test::WebMVideoSource(filename)); +#else + fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n", + filename.c_str()); + return; +#endif + } + ASSERT_NE(video.get(), nullptr); + video->Init(); + + // Construct md5 file name. + const std::string md5_filename = filename + ".md5"; + OpenMD5File(md5_filename); + + // Decode frame, and check the md5 matching. + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +#if CONFIG_WEBM_IO +TEST_F(ExternalFrameBufferTest, MinFrameBuffers) { + // Minimum number of external frame buffers for VP9 is + // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS. + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { + // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS + + // #VPX_MAXIMUM_WORK_BUFFERS + eight jitter buffers. + const int jitter_buffers = 8; + const int num_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { + // Minimum number of external frame buffers for VP9 is + // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS. Most files will + // only use 5 frame buffers at one time. + const int num_buffers = 2; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. + ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NoRelease) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + do_not_release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NullRealloc) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_zero_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, SetFrameBufferFunctions( + num_buffers, get_vp9_one_less_byte_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, NullGetFunction) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ( + VPX_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, nullptr, release_vp9_frame_buffer)); +} + +TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ( + VPX_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, nullptr)); +} + +TEST_F(ExternalFrameBufferTest, SetAfterDecode) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(VPX_CODEC_ERROR, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); +} + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} +#endif // CONFIG_WEBM_IO + +VP9_INSTANTIATE_TEST_SUITE( + ExternalFrameBufferMD5Test, + ::testing::ValuesIn(libvpx_test::kVP9TestVectors, + libvpx_test::kVP9TestVectors + + libvpx_test::kNumVP9TestVectors)); +} // namespace diff --git a/media/libvpx/libvpx/test/fdct8x8_test.cc b/media/libvpx/libvpx/test/fdct8x8_test.cc new file mode 100644 index 0000000000..3cdf909d46 --- /dev/null +++ b/media/libvpx/libvpx/test/fdct8x8_test.cc @@ -0,0 +1,791 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumCoeffs = 64; +const double kPi = 3.141592653589793238462643383279502884; + +const int kSignBiasMaxDiff255 = 1500; +const int kSignBiasMaxDiff15 = 10000; + +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +typedef std::tuple Dct8x8Param; +typedef std::tuple Ht8x8Param; +typedef std::tuple Idct8x8Param; + +void reference_8x8_dct_1d(const double in[8], double out[8]) { + const double kInvSqrt2 = 0.707106781186547524400844362104; + for (int k = 0; k < 8; k++) { + out[k] = 0.0; + for (int n = 0; n < 8; n++) { + out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0); + } + if (k == 0) out[k] = out[k] * kInvSqrt2; + } +} + +void reference_8x8_dct_2d(const int16_t input[kNumCoeffs], + double output[kNumCoeffs]) { + // First transform columns + for (int i = 0; i < 8; ++i) { + double temp_in[8], temp_out[8]; + for (int j = 0; j < 8; ++j) temp_in[j] = input[j * 8 + i]; + reference_8x8_dct_1d(temp_in, temp_out); + for (int j = 0; j < 8; ++j) output[j * 8 + i] = temp_out[j]; + } + // Then transform rows + for (int i = 0; i < 8; ++i) { + double temp_in[8], temp_out[8]; + for (int j = 0; j < 8; ++j) temp_in[j] = output[j + i * 8]; + reference_8x8_dct_1d(temp_in, temp_out); + // Scale by some magic number + for (int j = 0; j < 8; ++j) output[j + i * 8] = temp_out[j] * 2; + } +} + +void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, + int /*tx_type*/) { + vpx_fdct8x8_c(in, out, stride); +} + +void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + vp9_fht8x8_c(in, out, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); +} + +void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); +} + +#if HAVE_SSE2 + +void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); +} + +void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); +} +#endif // HAVE_SSE2 +#endif // CONFIG_VP9_HIGHBITDEPTH + +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif +class FwdTrans8x8TestBase { + public: + virtual ~FwdTrans8x8TestBase() = default; + + protected: + virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; + virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0; + + void RunSignBiasCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, int16_t, test_input_block[64]); + DECLARE_ALIGNED(16, tran_low_t, test_output_block[64]); + int count_sign_block[64][2]; + const int count_test_block = 100000; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 64; ++j) { + test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) - + ((rnd.Rand16() >> (16 - bit_depth_)) & mask_); + } + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, pitch_)); + + for (int j = 0; j < 64; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + for (int j = 0; j < 64; ++j) { + const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); + const int max_diff = kSignBiasMaxDiff255; + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) + << "Error: 8x8 FDCT/FHT has a sign bias > " + << 1. * max_diff / count_test_block * 100 << "%" + << " for input range [-255, 255] at index " << j + << " count0: " << count_sign_block[j][0] + << " count1: " << count_sign_block[j][1] << " diff: " << diff; + } + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_ / 16, mask_ / 16]. + for (int j = 0; j < 64; ++j) { + test_input_block[j] = + ((rnd.Rand16() & mask_) >> 4) - ((rnd.Rand16() & mask_) >> 4); + } + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, pitch_)); + + for (int j = 0; j < 64; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + for (int j = 0; j < 64; ++j) { + const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); + const int max_diff = kSignBiasMaxDiff15; + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) + << "Error: 8x8 FDCT/FHT has a sign bias > " + << 1. * max_diff / count_test_block * 100 << "%" + << " for input range [-15, 15] at index " << j + << " count0: " << count_sign_block[j][0] + << " count1: " << count_sign_block[j][1] << " diff: " << diff; + } + } + + void RunRoundTripErrorCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int max_error = 0; + int total_error = 0; + const int count_test_block = 100000; + DECLARE_ALIGNED(16, int16_t, test_input_block[64]); + DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]); + DECLARE_ALIGNED(16, uint8_t, dst[64]); + DECLARE_ALIGNED(16, uint8_t, src[64]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[64]); + DECLARE_ALIGNED(16, uint16_t, src16[64]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < 64; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + test_input_block[j] = src16[j] - dst16[j]; +#endif + } + } + + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, pitch_)); + for (int j = 0; j < 64; ++j) { + if (test_temp_block[j] > 0) { + test_temp_block[j] += 2; + test_temp_block[j] /= 4; + test_temp_block[j] *= 4; + } else { + test_temp_block[j] -= 2; + test_temp_block[j] /= 4; + test_temp_block[j] *= 4; + } + } + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + + for (int j = 0; j < 64; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int diff = dst[j] - src[j]; +#endif + const int error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) + << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" + << " roundtrip error > 1"; + + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " + << "error > 1/5 per block"; + } + + void RunExtremalCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int max_error = 0; + int total_error = 0; + int total_coeff_error = 0; + const int count_test_block = 100000; + DECLARE_ALIGNED(16, int16_t, test_input_block[64]); + DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]); + DECLARE_ALIGNED(16, tran_low_t, ref_temp_block[64]); + DECLARE_ALIGNED(16, uint8_t, dst[64]); + DECLARE_ALIGNED(16, uint8_t, src[64]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[64]); + DECLARE_ALIGNED(16, uint16_t, src16[64]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < 64; ++j) { + if (bit_depth_ == VPX_BITS_8) { + if (i == 0) { + src[j] = 255; + dst[j] = 0; + } else if (i == 1) { + src[j] = 0; + dst[j] = 255; + } else { + src[j] = rnd.Rand8() % 2 ? 255 : 0; + dst[j] = rnd.Rand8() % 2 ? 255 : 0; + } + test_input_block[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + if (i == 0) { + src16[j] = mask_; + dst16[j] = 0; + } else if (i == 1) { + src16[j] = 0; + dst16[j] = mask_; + } else { + src16[j] = rnd.Rand8() % 2 ? mask_ : 0; + dst16[j] = rnd.Rand8() % 2 ? mask_ : 0; + } + test_input_block[j] = src16[j] - dst16[j]; +#endif + } + } + + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, pitch_)); + ASM_REGISTER_STATE_CHECK( + fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + + for (int j = 0; j < 64; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int diff = dst[j] - src[j]; +#endif + const int error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + + const int coeff_diff = test_temp_block[j] - ref_temp_block[j]; + total_coeff_error += abs(coeff_diff); + } + + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" + << " an individual roundtrip error > 1"; + + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" + << " roundtrip error > 1/5 per block"; + + ASSERT_EQ(0, total_coeff_error) + << "Error: Extremal 8x8 FDCT/FHT has" + << " overflow issues in the intermediate steps > 1"; + } + } + + void RunInvAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); +#endif + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + if (bit_depth_ == VPX_BITS_8) { + src[j] = rnd.Rand8() % 2 ? 255 : 0; + dst[j] = src[j] > 0 ? 0 : 255; + in[j] = src[j] - dst[j]; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16[j] = rnd.Rand8() % 2 ? mask_ : 0; + dst16[j] = src16[j] > 0 ? 0 : mask_; + in[j] = src16[j] - dst16[j]; +#endif + } + } + + reference_8x8_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) { + coeff[j] = static_cast(round(out_r[j])); + } + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; +#else + const int diff = dst[j] - src[j]; +#endif + const uint32_t error = diff * diff; + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; + } + } + } + + void RunFwdAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, coeff_r[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_; + } + + RunFwdTxfm(in, coeff, pitch_); + reference_8x8_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) { + coeff_r[j] = static_cast(round(out_r[j])); + } + + for (int j = 0; j < kNumCoeffs; ++j) { + const int32_t diff = coeff[j] - coeff_r[j]; + const uint32_t error = diff * diff; + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) + << "Error: 8x8 DCT has error " << error << " at index " << j; + } + } + } + + void CompareInvReference(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 12; + DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif + const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan; + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2)); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif + } + } + if (bit_depth_ == VPX_BITS_8) { + ref_txfm(coeff, ref, pitch_); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const int diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const int diff = dst[j] - ref[j]; +#endif + const uint32_t error = diff * diff; + ASSERT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; + } + } + } + int pitch_; + int tx_type_; + FhtFunc fwd_txfm_ref; + vpx_bit_depth_t bit_depth_; + int mask_; +}; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif + +class FwdTrans8x8DCT : public FwdTrans8x8TestBase, + public ::testing::TestWithParam { + public: + ~FwdTrans8x8DCT() override = default; + + void SetUp() override { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 8; + fwd_txfm_ref = fdct8x8_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(FwdTrans8x8DCT, SignBiasCheck) { RunSignBiasCheck(); } + +TEST_P(FwdTrans8x8DCT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); } + +TEST_P(FwdTrans8x8DCT, ExtremalCheck) { RunExtremalCheck(); } + +TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) { RunFwdAccuracyCheck(); } + +TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } + +class FwdTrans8x8HT : public FwdTrans8x8TestBase, + public ::testing::TestWithParam { + public: + ~FwdTrans8x8HT() override = default; + + void SetUp() override { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 8; + fwd_txfm_ref = fht8x8_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { + fwd_txfm_(in, out, stride, tx_type_); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride, tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(FwdTrans8x8HT, SignBiasCheck) { RunSignBiasCheck(); } + +TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); } + +TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } + +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +class InvTrans8x8DCT : public FwdTrans8x8TestBase, + public ::testing::TestWithParam { + public: + ~InvTrans8x8DCT() override = default; + + void SetUp() override { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + thresh_ = GET_PARAM(2); + pitch_ = 8; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { + inv_txfm_(out, dst, stride); + } + void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, + int /*stride*/) override {} + + IdctFunc ref_txfm_; + IdctFunc inv_txfm_; + int thresh_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans8x8DCT); + +TEST_P(InvTrans8x8DCT, CompareReference) { + CompareInvReference(ref_txfm_, thresh_); +} +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +using std::make_tuple; + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, FwdTrans8x8DCT, + ::testing::Values( + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12))); +#else +INSTANTIATE_TEST_SUITE_P(C, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + C, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(NEON, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_neon, + &vpx_idct8x8_64_add_neon, + 0, VPX_BITS_8))); + +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(SSE2, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + SSE2, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + SSE2, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_64_add_10_sse2, + 12, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_sse2, + &idct8x8_64_add_10_sse2, 12, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_64_add_12_sse2, + 12, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_sse2, + &idct8x8_64_add_12_sse2, 12, VPX_BITS_12))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); + +// Optimizations take effect at a threshold of 6201, so we use a value close to +// that to test both branches. +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans8x8DCT, + ::testing::Values( + make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225, + VPX_BITS_10), + make_tuple(&idct8x8_10, &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10), + make_tuple(&idct8x8_12_add_12_c, &idct8x8_12_add_12_sse2, 6225, + VPX_BITS_12), + make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12))); +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSSE3 && VPX_ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ + !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(SSSE3, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); +#endif + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(MSA, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_msa, + &vpx_idct8x8_64_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( + MSA, FwdTrans8x8HT, + ::testing::Values( + make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_lsx, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +} // namespace diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc new file mode 100644 index 0000000000..eea5647a78 --- /dev/null +++ b/media/libvpx/libvpx/test/frame_size_tests.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace { + +class EncoderWithExpectedError : public ::libvpx_test::Encoder { + public: + EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, // NOLINT + ::libvpx_test::TwopassStatsStore *stats) + : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {} + // This overrides with expected error code. + void EncodeFrame(::libvpx_test::VideoSource *video, + const unsigned long frame_flags, // NOLINT + const vpx_codec_err_t expected_err) { + if (video->img()) { + EncodeFrameInternal(*video, frame_flags, expected_err); + } else { + Flush(); + } + + // Handle twopass stats + ::libvpx_test::CxDataIterator iter = GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != VPX_CODEC_STATS_PKT) continue; + + stats_->Append(*pkt); + } + } + + protected: + void EncodeFrameInternal(const ::libvpx_test::VideoSource &video, + const unsigned long frame_flags, // NOLINT + const vpx_codec_err_t expected_err) { + vpx_codec_err_t res; + const vpx_image_t *img = video.img(); + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = vpx_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(res, VPX_CODEC_OK) << EncoderError(); + } + + // Encode the frame + API_REGISTER_STATE_CHECK(res = vpx_codec_encode(&encoder_, img, video.pts(), + video.duration(), + frame_flags, deadline_)); + ASSERT_EQ(expected_err, res) << EncoderError(); + } + + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP9_ENCODER + return &vpx_codec_vp9_cx_algo; +#else + return nullptr; +#endif + } +}; + +class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + VP9FrameSizeTestsLarge() + : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} + ~VP9FrameSizeTestsLarge() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { + EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); + return !::testing::Test::HasFailure(); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + + using ::libvpx_test::EncoderTest::RunLoop; + virtual void RunLoop(::libvpx_test::VideoSource *video, + const vpx_codec_err_t expected_err) { + stats_.Reset(); + + ASSERT_TRUE(passes_ == 1 || passes_ == 2); + for (unsigned int pass = 0; pass < passes_; pass++) { + vpx_codec_pts_t last_pts = 0; + + if (passes_ == 1) { + cfg_.g_pass = VPX_RC_ONE_PASS; + } else if (pass == 0) { + cfg_.g_pass = VPX_RC_FIRST_PASS; + } else { + cfg_.g_pass = VPX_RC_LAST_PASS; + } + + BeginPassHook(pass); + std::unique_ptr encoder( + new EncoderWithExpectedError(cfg_, deadline_, init_flags_, &stats_)); + ASSERT_NE(encoder.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(video->Begin()); + encoder->InitEncoder(video); + ASSERT_FALSE(::testing::Test::HasFatalFailure()); + for (bool again = true; again; video->Next()) { + again = (video->img() != nullptr); + + PreEncodeFrameHook(video, encoder.get()); + encoder->EncodeFrame(video, frame_flags_, expected_err); + + PostEncodeFrameHook(encoder.get()); + + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + pkt = MutateEncoderOutputHook(pkt); + again = true; + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + default: break; + } + } + + if (!Continue()) break; + } + + EndPassHook(); + + if (!Continue()) break; + } + } + + vpx_codec_err_t expected_res_; +}; + +TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) { + ::libvpx_test::RandomVideoSource video; + +#if CONFIG_SIZE_LIMIT + video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16); + video.set_limit(2); + expected_res_ = VPX_CODEC_MEM_ERROR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video, expected_res_)); +#endif +} + +TEST_F(VP9FrameSizeTestsLarge, ValidSizes) { + ::libvpx_test::RandomVideoSource video; + +#if CONFIG_SIZE_LIMIT + video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); + video.set_limit(2); + expected_res_ = VPX_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); +#else +// This test produces a pretty large single frame allocation, (roughly +// 25 megabits). The encoder allocates a good number of these frames +// one for each lag in frames (for 2 pass), and then one for each possible +// reference buffer (8) - we can end up with up to 30 buffers of roughly this +// size or almost 1 gig of memory. +// In total the allocations will exceed 2GiB which may cause a failure with +// mingw + wine, use a smaller size in that case. +#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__) + video.SetSize(4096, 3072); +#else + video.SetSize(4096, 4096); +#endif + video.set_limit(2); + expected_res_ = VPX_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); +#endif +} + +TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) { + ::libvpx_test::RandomVideoSource video; + + video.SetSize(1, 1); + video.set_limit(2); + expected_res_ = VPX_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); +} +} // namespace diff --git a/media/libvpx/libvpx/test/hadamard_test.cc b/media/libvpx/libvpx/test/hadamard_test.cc new file mode 100644 index 0000000000..b22bae87cc --- /dev/null +++ b/media/libvpx/libvpx/test/hadamard_test.cc @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_timer.h" + +#include "test/acm_random.h" +#include "test/register_state_check.h" + +namespace { + +using ::libvpx_test::ACMRandom; + +typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); + +void hadamard_loop(const tran_low_t *a, tran_low_t *out) { + tran_low_t b[8]; + for (int i = 0; i < 8; i += 2) { + b[i + 0] = a[i * 8] + a[(i + 1) * 8]; + b[i + 1] = a[i * 8] - a[(i + 1) * 8]; + } + tran_low_t c[8]; + for (int i = 0; i < 8; i += 4) { + c[i + 0] = b[i + 0] + b[i + 2]; + c[i + 1] = b[i + 1] + b[i + 3]; + c[i + 2] = b[i + 0] - b[i + 2]; + c[i + 3] = b[i + 1] - b[i + 3]; + } + out[0] = c[0] + c[4]; + out[7] = c[1] + c[5]; + out[3] = c[2] + c[6]; + out[4] = c[3] + c[7]; + out[2] = c[0] - c[4]; + out[6] = c[1] - c[5]; + out[1] = c[2] - c[6]; + out[5] = c[3] - c[7]; +} + +void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) { + tran_low_t input[64]; + tran_low_t buf[64]; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + input[i * 8 + j] = static_cast(a[i * a_stride + j]); + } + } + for (int i = 0; i < 8; ++i) hadamard_loop(input + i, buf + i * 8); + for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, b + i * 8); +} + +void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { + /* The source is a 16x16 block. The destination is rearranged to 8x32. + * Input is 9 bit. */ + reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0); + reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64); + reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128); + reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192); + + /* Overlay the 8x8 blocks and combine. */ + for (int i = 0; i < 64; ++i) { + /* 8x8 steps the range up to 15 bits. */ + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[64]; + const tran_low_t a2 = b[128]; + const tran_low_t a3 = b[192]; + + /* Prevent the result from escaping int16_t. */ + const tran_low_t b0 = (a0 + a1) >> 1; + const tran_low_t b1 = (a0 - a1) >> 1; + const tran_low_t b2 = (a2 + a3) >> 1; + const tran_low_t b3 = (a2 - a3) >> 1; + + /* Store a 16 bit value. */ + b[0] = b0 + b2; + b[64] = b1 + b3; + b[128] = b0 - b2; + b[192] = b1 - b3; + + ++b; + } +} + +void reference_hadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) { + reference_hadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0); + reference_hadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256); + reference_hadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512); + reference_hadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768); + + for (int i = 0; i < 256; ++i) { + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[256]; + const tran_low_t a2 = b[512]; + const tran_low_t a3 = b[768]; + + const tran_low_t b0 = (a0 + a1) >> 2; + const tran_low_t b1 = (a0 - a1) >> 2; + const tran_low_t b2 = (a2 + a3) >> 2; + const tran_low_t b3 = (a2 - a3) >> 2; + + b[0] = b0 + b2; + b[256] = b1 + b3; + b[512] = b0 - b2; + b[768] = b1 - b3; + + ++b; + } +} + +struct HadamardFuncWithSize { + HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {} + HadamardFunc func; + int block_size; +}; + +std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { + return os << "block size: " << hfs.block_size; +} + +class HadamardTestBase : public ::testing::TestWithParam { + public: + void SetUp() override { + h_func_ = GetParam().func; + bwh_ = GetParam().block_size; + block_size_ = bwh_ * bwh_; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). + virtual int16_t Rand() = 0; + + void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, + int bwh) { + if (bwh == 32) + reference_hadamard32x32(a, a_stride, b); + else if (bwh == 16) + reference_hadamard16x16(a, a_stride, b); + else + reference_hadamard8x8(a, a_stride, b); + } + + void CompareReferenceRandom() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < block_size_; ++i) a[i] = Rand(); + + ReferenceHadamard(a, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + + void VaryStride() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); + for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand(); + + tran_low_t b_ref[kMaxBlockSize]; + for (int i = 8; i < 64; i += 8) { + memset(b, 0, sizeof(b)); + memset(b_ref, 0, sizeof(b_ref)); + + ReferenceHadamard(a, i, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + + void SpeedTest(int times) { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]); + memset(input, 1, sizeof(input)); + memset(output, 0, sizeof(output)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < times; ++i) { + h_func_(input, bwh_, output); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times, + elapsed_time); + } + + protected: + int bwh_; + int block_size_; + HadamardFunc h_func_; + ACMRandom rnd_; +}; + +class HadamardLowbdTest : public HadamardTestBase { + protected: + // Use values between -255 (0xFF01) and 255 (0x00FF) + int16_t Rand() override { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; + } +}; + +TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } + +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + +TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } + +TEST_P(HadamardLowbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); +} + +INSTANTIATE_TEST_SUITE_P( + C, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_sse2, 32))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32))); +#endif // HAVE_AVX2 + +#if HAVE_SSSE3 && VPX_ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + SSSE3, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8))); +#endif // HAVE_SSSE3 && VPX_ARCH_X86_64 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32))); +#endif // HAVE_NEON + +// TODO(jingning): Remove highbitdepth flag when the SIMD functions are +// in place and turn on the unit test. +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16))); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16))); +#endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +class HadamardHighbdTest : public HadamardTestBase { + protected: + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + int16_t Rand() override { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; + } +}; + +TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } + +TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); } + +TEST_P(HadamardHighbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); +} + +INSTANTIATE_TEST_SUITE_P( + C, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32))); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2, + 32))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); +#endif + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace diff --git a/media/libvpx/libvpx/test/i420_video_source.h b/media/libvpx/libvpx/test/i420_video_source.h new file mode 100644 index 0000000000..97473b5c2f --- /dev/null +++ b/media/libvpx/libvpx/test/i420_video_source.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_I420_VIDEO_SOURCE_H_ +#define VPX_TEST_I420_VIDEO_SOURCE_H_ +#include +#include +#include + +#include "test/yuv_video_source.h" + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. +class I420VideoSource : public YUVVideoSource { + public: + I420VideoSource(const std::string &file_name, unsigned int width, + unsigned int height, int rate_numerator, int rate_denominator, + unsigned int start, int limit) + : YUVVideoSource(file_name, VPX_IMG_FMT_I420, width, height, + rate_numerator, rate_denominator, start, limit) {} +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_I420_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/idct8x8_test.cc b/media/libvpx/libvpx/test/idct8x8_test.cc new file mode 100644 index 0000000000..7951bb93c9 --- /dev/null +++ b/media/libvpx/libvpx/test/idct8x8_test.cc @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" // for round() + +using libvpx_test::ACMRandom; + +namespace { + +void reference_dct_1d(double input[8], double output[8]) { + const double kPi = 3.141592653589793238462643383279502884; + const double kInvSqrt2 = 0.707106781186547524400844362104; + for (int k = 0; k < 8; k++) { + output[k] = 0.0; + for (int n = 0; n < 8; n++) { + output[k] += input[n] * cos(kPi * (2 * n + 1) * k / 16.0); + } + if (k == 0) output[k] = output[k] * kInvSqrt2; + } +} + +void reference_dct_2d(int16_t input[64], double output[64]) { + // First transform columns + for (int i = 0; i < 8; ++i) { + double temp_in[8], temp_out[8]; + for (int j = 0; j < 8; ++j) temp_in[j] = input[j * 8 + i]; + reference_dct_1d(temp_in, temp_out); + for (int j = 0; j < 8; ++j) output[j * 8 + i] = temp_out[j]; + } + // Then transform rows + for (int i = 0; i < 8; ++i) { + double temp_in[8], temp_out[8]; + for (int j = 0; j < 8; ++j) temp_in[j] = output[j + i * 8]; + reference_dct_1d(temp_in, temp_out); + for (int j = 0; j < 8; ++j) output[j + i * 8] = temp_out[j]; + } + // Scale by some magic number + for (int i = 0; i < 64; ++i) output[i] *= 2; +} + +TEST(VP9Idct8x8Test, AccuracyCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + int16_t input[64]; + tran_low_t coeff[64]; + double output_r[64]; + uint8_t dst[64], src[64]; + + for (int j = 0; j < 64; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 64; ++j) input[j] = src[j] - dst[j]; + + reference_dct_2d(input, output_r); + for (int j = 0; j < 64; ++j) { + coeff[j] = static_cast(round(output_r[j])); + } + vpx_idct8x8_64_add_c(coeff, dst, 8); + for (int j = 0; j < 64; ++j) { + const int diff = dst[j] - src[j]; + const int error = diff * diff; + EXPECT_GE(1, error) << "Error: 8x8 FDCT/IDCT has error " << error + << " at index " << j; + } + } +} + +} // namespace diff --git a/media/libvpx/libvpx/test/idct_test.cc b/media/libvpx/libvpx/test/idct_test.cc new file mode 100644 index 0000000000..279e58e2aa --- /dev/null +++ b/media/libvpx/libvpx/test/idct_test.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx/vpx_integer.h" + +typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); +namespace { + +using libvpx_test::Buffer; + +class IDCTTest : public ::testing::TestWithParam { + protected: + void SetUp() override { + UUT = GetParam(); + + input = new Buffer(4, 4, 0); + ASSERT_NE(input, nullptr); + ASSERT_TRUE(input->Init()); + predict = new Buffer(4, 4, 3); + ASSERT_NE(predict, nullptr); + ASSERT_TRUE(predict->Init()); + output = new Buffer(4, 4, 3); + ASSERT_NE(output, nullptr); + ASSERT_TRUE(output->Init()); + } + + void TearDown() override { + delete input; + delete predict; + delete output; + libvpx_test::ClearSystemState(); + } + + IdctFunc UUT; + Buffer *input; + Buffer *predict; + Buffer *output; +}; + +TEST_P(IDCTTest, TestAllZeros) { + // When the input is '0' the output will be '0'. + input->Set(0); + predict->Set(0); + output->Set(0); + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + ASSERT_TRUE(input->CheckValues(0)); + ASSERT_TRUE(input->CheckPadding()); + ASSERT_TRUE(output->CheckValues(0)); + ASSERT_TRUE(output->CheckPadding()); +} + +TEST_P(IDCTTest, TestAllOnes) { + input->Set(0); + ASSERT_NE(input->TopLeftPixel(), nullptr); + // When the first element is '4' it will fill the output buffer with '1'. + input->TopLeftPixel()[0] = 4; + predict->Set(0); + output->Set(0); + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + ASSERT_TRUE(output->CheckValues(1)); + ASSERT_TRUE(output->CheckPadding()); +} + +TEST_P(IDCTTest, TestAddOne) { + // Set the transform output to '1' and make sure it gets added to the + // prediction buffer. + input->Set(0); + ASSERT_NE(input->TopLeftPixel(), nullptr); + input->TopLeftPixel()[0] = 4; + output->Set(0); + + uint8_t *pred = predict->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + pred[y * predict->stride() + x] = y * 4 + x; + } + } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t const *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]); + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); +} + +TEST_P(IDCTTest, TestWithData) { + // Test a single known input. + predict->Set(0); + + int16_t *in = input->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + in[y * input->stride() + x] = y * 4 + x; + } + } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + switch (y * 4 + x) { + case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break; + case 2: + case 5: + case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break; + case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break; + default: EXPECT_EQ(0, out[y * output->stride() + x]); + } + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); +} + +INSTANTIATE_TEST_SUITE_P(C, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif // HAVE_NEON + +#if HAVE_MMX +INSTANTIATE_TEST_SUITE_P(MMX, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmx)); +#endif // HAVE_MMX + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_msa)); +#endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI +} // namespace diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc new file mode 100644 index 0000000000..f66f00b5c1 --- /dev/null +++ b/media/libvpx/libvpx/test/init_vpx_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/init_vpx_test.h" + +#include "./vpx_config.h" + +#if !CONFIG_SHARED +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. + if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED + +namespace libvpx_test { +void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } + if (!(caps & HAS_SVE)) { + append_negative_gtest_filter(":SVE.*:SVE/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + + // Shared library builds don't support whitebox tests that exercise internal + // symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/init_vpx_test.h b/media/libvpx/libvpx/test/init_vpx_test.h new file mode 100644 index 0000000000..5e0dbb0e7e --- /dev/null +++ b/media/libvpx/libvpx/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif // TEST_INIT_VPX_TEST_H_ diff --git a/media/libvpx/libvpx/test/invalid_file_test.cc b/media/libvpx/libvpx/test/invalid_file_test.cc new file mode 100644 index 0000000000..c37dc0d486 --- /dev/null +++ b/media/libvpx/libvpx/test/invalid_file_test.cc @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/util.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif +#include "vpx_mem/vpx_mem.h" + +namespace { + +struct DecodeParam { + int threads; + const char *filename; +}; + +std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) { + return os << "threads: " << dp.threads << " file: " << dp.filename; +} + +class InvalidFileTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} + + ~InvalidFileTest() override { + if (res_file_ != nullptr) fclose(res_file_); + } + + void OpenResFile(const std::string &res_file_name_) { + res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); + ASSERT_NE(res_file_, nullptr) + << "Result file open failed. Filename: " << res_file_name_; + } + + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + EXPECT_NE(res_file_, nullptr); + int expected_res_dec; + + // Read integer result. + const int res = fscanf(res_file_, "%d", &expected_res_dec); + EXPECT_NE(res, EOF) << "Read result data failed"; + + // Check results match. + const DecodeParam input = GET_PARAM(1); + if (input.threads > 1) { + // The serial decode check is too strict for tile-threaded decoding as + // there is no guarantee on the decode order nor which specific error + // will take precedence. Currently a tile-level error is not forwarded so + // the frame will simply be marked corrupt. + EXPECT_TRUE(res_dec == expected_res_dec || + res_dec == VPX_CODEC_CORRUPT_FRAME) + << "Results don't match: frame number = " << video.frame_number() + << ". (" << decoder->DecodeError() + << "). Expected: " << expected_res_dec << " or " + << VPX_CODEC_CORRUPT_FRAME; + } else { + EXPECT_EQ(expected_res_dec, res_dec) + << "Results don't match: frame number = " << video.frame_number() + << ". (" << decoder->DecodeError() << ")"; + } + + return !HasFailure(); + } + + void RunTest() { + const DecodeParam input = GET_PARAM(1); + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.threads = input.threads; + const std::string filename = input.filename; + + // Open compressed video file. + std::unique_ptr video; + if (filename.substr(filename.length() - 3, 3) == "ivf") { + video.reset(new libvpx_test::IVFVideoSource(filename)); + } else if (filename.substr(filename.length() - 4, 4) == "webm") { +#if CONFIG_WEBM_IO + video.reset(new libvpx_test::WebMVideoSource(filename)); +#else + fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n", + filename.c_str()); + return; +#endif + } + ASSERT_NE(video.get(), nullptr); + video->Init(); + + // Construct result file name. The file holds a list of expected integer + // results, one for each decoded frame. Any result that doesn't match + // the files list will cause a test failure. + const std::string res_filename = filename + ".res"; + OpenResFile(res_filename); + + // Decode frame, and check the md5 matching. + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); + } + + private: + FILE *res_file_; +}; + +TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } + +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, + { 1, "invalid-bug-148271109.ivf" }, + { 1, "invalid-token-partition.ivf" }, + { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, +}; + +VP8_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER +const DecodeParam kVP9InvalidFileTests[] = { + { 1, "invalid-vp90-02-v2.webm" }, +#if CONFIG_VP9_HIGHBITDEPTH + { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." + "ivf" }, +#endif + { 1, "invalid-vp90-03-v3.webm" }, + { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, + { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf" }, +// This file will cause a large allocation which is expected to fail in 32-bit +// environments. Test x86 for coverage purposes as the allocation failure will +// be in platform agnostic code. +#if VPX_ARCH_X86 + { 1, "invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf" }, +#endif + { 1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf" }, + { 1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf" }, + { 1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf" }, + { 1, "invalid-vp91-2-mixedrefcsp-444to420.ivf" }, + { 1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf" }, + { 1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf" }, + { 1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf" }, + { 1, + "invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf" }, + { 1, "invalid-crbug-667044.webm" }, +}; + +VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP9InvalidFileTests)); +#endif // CONFIG_VP9_DECODER + +// This class will include test vectors that are expected to fail +// peek. However they are still expected to have no fatal failures. +class InvalidFileInvalidPeekTest : public InvalidFileTest { + protected: + InvalidFileInvalidPeekTest() : InvalidFileTest() {} + void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource * /*video*/, + const vpx_codec_err_t /*res_peek*/) override {} +}; + +TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } + +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidPeekTests[] = { + { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, +}; + +VP8_INSTANTIATE_TEST_SUITE(InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP8InvalidPeekTests)); +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER +const DecodeParam kVP9InvalidFileInvalidPeekTests[] = { + { 1, "invalid-vp90-01-v3.webm" }, +}; + +VP9_INSTANTIATE_TEST_SUITE( + InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests)); + +const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { + { 4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm" }, + { 4, + "invalid-" + "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf" }, + { 4, + "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" }, + { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, + { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, + { 2, "invalid-crbug-629481.webm" }, + { 3, "invalid-crbug-1558.ivf" }, + { 4, "invalid-crbug-1562.ivf" }, +}; + +INSTANTIATE_TEST_SUITE_P( + VP9MultiThreaded, InvalidFileTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::ValuesIn(kMultiThreadedVP9InvalidFileTests))); +#endif // CONFIG_VP9_DECODER +} // namespace diff --git a/media/libvpx/libvpx/test/ivf_video_source.h b/media/libvpx/libvpx/test/ivf_video_source.h new file mode 100644 index 0000000000..3ccac62b51 --- /dev/null +++ b/media/libvpx/libvpx/test/ivf_video_source.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_IVF_VIDEO_SOURCE_H_ +#define VPX_TEST_IVF_VIDEO_SOURCE_H_ +#include +#include +#include +#include +#include "test/video_source.h" + +namespace libvpx_test { +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; +const unsigned int kIvfFileHdrSize = 32; +const unsigned int kIvfFrameHdrSize = 12; + +static unsigned int MemGetLe32(const uint8_t *mem) { + return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]); +} + +// This class extends VideoSource to allow parsing of ivf files, +// so that we can do actual file decodes. +class IVFVideoSource : public CompressedVideoSource { + public: + explicit IVFVideoSource(const std::string &file_name) + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} + + ~IVFVideoSource() override { + delete[] compressed_frame_buf_; + + if (input_file_) fclose(input_file_); + } + + void Init() override { + // Allocate a buffer for read in the compressed video frame. + compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; + } + + void Begin() override { + input_file_ = OpenTestDataFile(file_name_); + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; + + // Read file header + uint8_t file_hdr[kIvfFileHdrSize]; + ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_)) + << "File header read failed."; + // Check file header + ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' && + file_hdr[2] == 'I' && file_hdr[3] == 'F') + << "Input is not an IVF file."; + + FillFrame(); + } + + void Next() override { + ++frame_; + FillFrame(); + } + + void FillFrame() { + ASSERT_NE(input_file_, nullptr); + uint8_t frame_hdr[kIvfFrameHdrSize]; + // Check frame header and read a frame from input_file. + if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != + kIvfFrameHdrSize) { + end_of_file_ = true; + } else { + end_of_file_ = false; + + frame_sz_ = MemGetLe32(frame_hdr); + ASSERT_LE(frame_sz_, kCodeBufferSize) + << "Frame is too big for allocated code buffer"; + ASSERT_EQ(frame_sz_, + fread(compressed_frame_buf_, 1, frame_sz_, input_file_)) + << "Failed to read complete frame"; + } + } + + const uint8_t *cxdata() const override { + return end_of_file_ ? nullptr : compressed_frame_buf_; + } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } + + protected: + std::string file_name_; + FILE *input_file_; + uint8_t *compressed_frame_buf_; + size_t frame_sz_; + unsigned int frame_; + bool end_of_file_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_IVF_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/keyframe_test.cc b/media/libvpx/libvpx/test/keyframe_test.cc new file mode 100644 index 0000000000..6a1c99cbe2 --- /dev/null +++ b/media/libvpx/libvpx/test/keyframe_test.cc @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" + +namespace { + +class KeyframeTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + KeyframeTest() : EncoderTest(GET_PARAM(0)) {} + ~KeyframeTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + kf_count_ = 0; + kf_count_max_ = INT_MAX; + kf_do_force_kf_ = false; + set_cpu_used_ = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (kf_do_force_kf_) { + frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF; + } + if (set_cpu_used_ && video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { + kf_pts_list_.push_back(pkt->data.frame.pts); + kf_count_++; + abort_ |= kf_count_ > kf_count_max_; + } + } + + bool kf_do_force_kf_; + int kf_count_; + int kf_count_max_; + std::vector kf_pts_list_; + int set_cpu_used_; +}; + +TEST_P(KeyframeTest, TestRandomVideoSource) { + // Validate that encoding the RandomVideoSource produces multiple keyframes. + // This validates the results of the TestDisableKeyframes test. + kf_count_max_ = 2; // early exit successful tests. + + ::libvpx_test::RandomVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // In realtime mode - auto placed keyframes are exceedingly rare, don't + // bother with this check if(GetParam() > 0) + if (GET_PARAM(1) > 0) { + EXPECT_GT(kf_count_, 1); + } +} + +TEST_P(KeyframeTest, TestDisableKeyframes) { + cfg_.kf_mode = VPX_KF_DISABLED; + kf_count_max_ = 1; // early exit failed tests. + + ::libvpx_test::RandomVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + EXPECT_EQ(1, kf_count_); +} + +TEST_P(KeyframeTest, TestForceKeyframe) { + cfg_.kf_mode = VPX_KF_DISABLED; + kf_do_force_kf_ = true; + + ::libvpx_test::DummyVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // verify that every third frame is a keyframe. + for (std::vector::const_iterator iter = kf_pts_list_.begin(); + iter != kf_pts_list_.end(); ++iter) { + ASSERT_EQ(0, *iter % 3) << "Unexpected keyframe at frame " << *iter; + } +} + +TEST_P(KeyframeTest, TestKeyframeMaxDistance) { + cfg_.kf_max_dist = 25; + + ::libvpx_test::DummyVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // verify that keyframe interval matches kf_max_dist + for (std::vector::const_iterator iter = kf_pts_list_.begin(); + iter != kf_pts_list_.end(); ++iter) { + ASSERT_EQ(0, *iter % 25) << "Unexpected keyframe at frame " << *iter; + } +} + +TEST_P(KeyframeTest, TestAutoKeyframe) { + cfg_.kf_mode = VPX_KF_AUTO; + kf_do_force_kf_ = false; + + // Force a deterministic speed step in Real Time mode, as the faster modes + // may not produce a keyframe like we expect. This is necessary when running + // on very slow environments (like Valgrind). The step -11 was determined + // experimentally as the fastest mode that still throws the keyframe. + if (deadline_ == VPX_DL_REALTIME) set_cpu_used_ = -11; + + // This clip has a cut scene every 30 frames -> Frame 0, 30, 60, 90, 120. + // I check only the first 40 frames to make sure there's a keyframe at frame + // 0 and 30. + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // In realtime mode - auto placed keyframes are exceedingly rare, don't + // bother with this check + if (GET_PARAM(1) > 0) { + EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes "; + } + + // Verify that keyframes match the file keyframes in the file. + for (std::vector::const_iterator iter = kf_pts_list_.begin(); + iter != kf_pts_list_.end(); ++iter) { + if (deadline_ == VPX_DL_REALTIME && *iter > 0) + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; + else + EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; + } +} + +VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + vpx_enc_deadline_t deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (vpx_enc_deadline_t deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + +} // namespace diff --git a/media/libvpx/libvpx/test/level_test.cc b/media/libvpx/libvpx/test/level_test.cc new file mode 100644 index 0000000000..36cfd645c9 --- /dev/null +++ b/media/libvpx/libvpx/test/level_test.cc @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { +class LevelTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + LevelTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), + level_(0) {} + ~LevelTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + } + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_target_bitrate = 400; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_); + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_gf_internal_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + encoder->Control(VP9E_GET_LEVEL, &level_); + ASSERT_LE(level_, 51); + ASSERT_GE(level_, 0); + } + + ::libvpx_test::TestMode encoding_mode_; + int cpu_used_; + int min_gf_internal_; + int target_level_; + int level_; +}; + +TEST_P(LevelTest, TestTargetLevel11Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 60); + target_level_ = 11; + cfg_.rc_target_bitrate = 150; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel20Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 60); + target_level_ = 20; + cfg_.rc_target_bitrate = 1200; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel31Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, + 1, 0, 60); + target_level_ = 31; + cfg_.rc_target_bitrate = 8000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +// Test for keeping level stats only +TEST_P(LevelTest, TestTargetLevel0) { + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + target_level_ = 0; + min_gf_internal_ = 4; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(11, level_); + + cfg_.rc_target_bitrate = 1600; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(20, level_); +} + +// Test for level control being turned off +TEST_P(LevelTest, TestTargetLevel255) { + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 30); + target_level_ = 255; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(LevelTest, TestTargetLevelApi) { + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); + cfg.rc_target_bitrate = 100; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0)); + for (int level = 0; level <= 256; ++level) { + if (level == 10 || level == 11 || level == 20 || level == 21 || + level == 30 || level == 31 || level == 40 || level == 41 || + level == 50 || level == 51 || level == 52 || level == 60 || + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) + EXPECT_EQ(VPX_CODEC_OK, + vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); + else + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); + } + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); +} + +VP9_INSTANTIATE_TEST_SUITE(LevelTest, + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); +} // namespace diff --git a/media/libvpx/libvpx/test/lpf_test.cc b/media/libvpx/libvpx/test/lpf_test.cc new file mode 100644 index 0000000000..ce0ddeae18 --- /dev/null +++ b/media/libvpx/libvpx/test/lpf_test.cc @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vpx/vpx_integer.h" + +using libvpx_test::ACMRandom; + +namespace { +// Horizontally and Vertically need 32x32: 8 Coeffs preceeding filtered section +// 16 Coefs within filtered section +// 8 Coeffs following filtered section +const int kNumCoeffs = 1024; + +const int number_of_iterations = 10000; + +#if CONFIG_VP9_HIGHBITDEPTH +typedef uint16_t Pixel; +#define PIXEL_WIDTH 16 + +typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd); +typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd); +#else +typedef uint8_t Pixel; +#define PIXEL_WIDTH 8 + +typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh); +typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1); +#endif // CONFIG_VP9_HIGHBITDEPTH + +typedef std::tuple loop8_param_t; +typedef std::tuple dualloop8_param_t; + +void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, + const int mask, const int32_t p, const int i) { + uint16_t tmp_s[kNumCoeffs]; + + for (int j = 0; j < kNumCoeffs;) { + const uint8_t val = rnd->Rand8(); + if (val & 0x80) { // 50% chance to choose a new value. + tmp_s[j] = rnd->Rand16(); + j++; + } else { // 50% chance to repeat previous value in row X times. + int k = 0; + while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { + if (j < 1) { + tmp_s[j] = rnd->Rand16(); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp_s[j] = static_cast(tmp_s[j - 1] + (limit - 1)); + } else { // Decrement by a value within the limit. + tmp_s[j] = static_cast(tmp_s[j - 1] - (limit - 1)); + } + j++; + } + } + } + + for (int j = 0; j < kNumCoeffs;) { + const uint8_t val = rnd->Rand8(); + if (val & 0x80) { + j++; + } else { // 50% chance to repeat previous value in column X times. + int k = 0; + while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { + if (j < 1) { + tmp_s[j] = rnd->Rand16(); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = static_cast( + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1)); + } else { // Decrement by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = static_cast( + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1)); + } + j++; + } + } + } + + for (int j = 0; j < kNumCoeffs; j++) { + if (i % 2) { + s[j] = tmp_s[j] & mask; + } else { + s[j] = tmp_s[p * (j % p) + j / p] & mask; + } + ref_s[j] = s[j]; + } +} + +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(3 * MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + +class Loop8Test6Param : public ::testing::TestWithParam { + public: + ~Loop8Test6Param() override = default; + void SetUp() override { + loopfilter_op_ = GET_PARAM(0); + ref_loopfilter_op_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + int bit_depth_; + int mask_; + loop_op_t loopfilter_op_; + loop_op_t ref_loopfilter_op_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); + +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \ + (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) +class Loop8Test9Param : public ::testing::TestWithParam { + public: + ~Loop8Test9Param() override = default; + void SetUp() override { + loopfilter_op_ = GET_PARAM(0); + ref_loopfilter_op_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + mask_ = (1 << bit_depth_) - 1; + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + int bit_depth_; + int mask_; + dual_loop_op_t loopfilter_op_; + dual_loop_op_t ref_loopfilter_op_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param); +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH)) + +TEST_P(Loop8Test6Param, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = number_of_iterations; + const int32_t p = kNumCoeffs / 32; + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, s[kNumCoeffs]); + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, ref_s[kNumCoeffs]); + int err_count_total = 0; + int first_failure = -1; + for (int i = 0; i < count_test_block; ++i) { + int err_count = 0; + uint8_t tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + InitInput(s, ref_s, &rnd, *limit, mask_, p, i); +#if CONFIG_VP9_HIGHBITDEPTH + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_); + ASM_REGISTER_STATE_CHECK( + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_)); +#else + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh); + ASM_REGISTER_STATE_CHECK( + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + err_count += ref_s[j] != s[j]; + } + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Loop8Test6Param, C output doesn't match SSE2 " + "loopfilter output. " + << "First failed at test case " << first_failure; +} + +TEST_P(Loop8Test6Param, ValueCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = number_of_iterations; + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, s[kNumCoeffs]); + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, ref_s[kNumCoeffs]); + int err_count_total = 0; + int first_failure = -1; + + // NOTE: The code in vp9_loopfilter.c:update_sharpness computes mblim as a + // function of sharpness_lvl and the loopfilter lvl as: + // block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + // ... + // memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + // SIMD_WIDTH); + // This means that the largest value for mblim will occur when sharpness_lvl + // is equal to 0, and lvl is equal to its greatest value (MAX_LOOP_FILTER). + // In this case block_inside_limit will be equal to MAX_LOOP_FILTER and + // therefore mblim will be equal to (2 * (lvl + 2) + block_inside_limit) = + // 2 * (MAX_LOOP_FILTER + 2) + MAX_LOOP_FILTER = 3 * MAX_LOOP_FILTER + 4 + + for (int i = 0; i < count_test_block; ++i) { + int err_count = 0; + uint8_t tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + int32_t p = kNumCoeffs / 32; + for (int j = 0; j < kNumCoeffs; ++j) { + s[j] = rnd.Rand16() & mask_; + ref_s[j] = s[j]; + } +#if CONFIG_VP9_HIGHBITDEPTH + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_); + ASM_REGISTER_STATE_CHECK( + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_)); +#else + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh); + ASM_REGISTER_STATE_CHECK( + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + err_count += ref_s[j] != s[j]; + } + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Loop8Test6Param, C output doesn't match SSE2 " + "loopfilter output. " + << "First failed at test case " << first_failure; +} + +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \ + (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)) +TEST_P(Loop8Test9Param, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = number_of_iterations; + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, s[kNumCoeffs]); + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, ref_s[kNumCoeffs]); + int err_count_total = 0; + int first_failure = -1; + for (int i = 0; i < count_test_block; ++i) { + int err_count = 0; + uint8_t tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + int32_t p = kNumCoeffs / 32; + const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1; + InitInput(s, ref_s, &rnd, limit, mask_, p, i); +#if CONFIG_VP9_HIGHBITDEPTH + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bit_depth_); + ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, + bit_depth_)); +#else + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1); + ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, + thresh0, blimit1, limit1, thresh1)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + err_count += ref_s[j] != s[j]; + } + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Loop8Test9Param, C output doesn't match SSE2 " + "loopfilter output. " + << "First failed at test case " << first_failure; +} + +TEST_P(Loop8Test9Param, ValueCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = number_of_iterations; + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, s[kNumCoeffs]); + DECLARE_ALIGNED(PIXEL_WIDTH, Pixel, ref_s[kNumCoeffs]); + int err_count_total = 0; + int first_failure = -1; + for (int i = 0; i < count_test_block; ++i) { + int err_count = 0; + uint8_t tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetOuterThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetInnerThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + tmp = GetHevThresh(&rnd); + DECLARE_ALIGNED(16, const uint8_t, + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; + int32_t p = kNumCoeffs / 32; // TODO(pdlf) can we have non-square here? + for (int j = 0; j < kNumCoeffs; ++j) { + s[j] = rnd.Rand16() & mask_; + ref_s[j] = s[j]; + } +#if CONFIG_VP9_HIGHBITDEPTH + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bit_depth_); + ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, + bit_depth_)); +#else + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1); + ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, + thresh0, blimit1, limit1, thresh1)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + err_count += ref_s[j] != s[j]; + } + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Loop8Test9Param, C output doesn't match SSE2" + "loopfilter output. " + << "First failed at test case " << first_failure; +} +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX && + // (!CONFIG_VP9_HIGHBITDEPTH)) + +using std::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSE2, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, + &vpx_highbd_lpf_horizontal_4_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_sse2, + &vpx_highbd_lpf_vertical_4_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_sse2, + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, + &vpx_highbd_lpf_horizontal_4_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_sse2, + &vpx_highbd_lpf_vertical_4_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_sse2, + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, + &vpx_highbd_lpf_horizontal_4_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_sse2, + &vpx_highbd_lpf_vertical_4_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_sse2, + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); +#else +INSTANTIATE_TEST_SUITE_P( + SSE2, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_sse2, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_sse2, &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_sse2, &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_sse2, &vpx_lpf_vertical_16_dual_c, + 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif + +#if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + AVX2, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2, + &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_avx2, + &vpx_lpf_horizontal_16_dual_c, 8))); +#endif + +#if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSE2, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2, + &vpx_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2, + &vpx_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2, + &vpx_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2, + &vpx_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2, + &vpx_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2, + &vpx_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2, + &vpx_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2, + &vpx_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2, + &vpx_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2, + &vpx_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2, + &vpx_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2, + &vpx_highbd_lpf_vertical_8_dual_c, 12))); +#else +INSTANTIATE_TEST_SUITE_P( + SSE2, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_sse2, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_sse2, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_sse2, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_sse2, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); +INSTANTIATE_TEST_SUITE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 12))); +#else +INSTANTIATE_TEST_SUITE_P( + NEON, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_neon, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c, + 8), + make_tuple(&vpx_lpf_horizontal_8_neon, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8))); +INSTANTIATE_TEST_SUITE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_neon, + &vpx_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_4_dual_neon, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_neon, + &vpx_lpf_vertical_4_dual_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + DSPR2, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dspr2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_dspr2, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_dspr2, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dspr2, &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_dspr2, &vpx_lpf_vertical_16_dual_c, + 8))); + +INSTANTIATE_TEST_SUITE_P( + DSPR2, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_dspr2, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_dspr2, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_dspr2, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_dspr2, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + MSA, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_msa, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8))); + +INSTANTIATE_TEST_SUITE_P( + MSA, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_msa, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_msa, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_msa, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_msa, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) + +#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_lsx, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, + 8))); + +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_lsx, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_lsx, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_lsx, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) + +} // namespace diff --git a/media/libvpx/libvpx/test/md5_helper.h b/media/libvpx/libvpx/test/md5_helper.h new file mode 100644 index 0000000000..9095d96a8a --- /dev/null +++ b/media/libvpx/libvpx/test/md5_helper.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_MD5_HELPER_H_ +#define VPX_TEST_MD5_HELPER_H_ + +#include "./md5_utils.h" +#include "vpx/vpx_decoder.h" + +namespace libvpx_test { +class MD5 { + public: + MD5() { MD5Init(&md5_); } + + void Add(const vpx_image_t *img) { + for (int plane = 0; plane < 3; ++plane) { + const uint8_t *buf = img->planes[plane]; + // Calculate the width and height to do the md5 check. For the chroma + // plane, we never want to round down and thus skip a pixel so if + // we are shifting by 1 (chroma_shift) we add 1 before doing the shift. + // This works only for chroma_shift of 0 and 1. + const int bytes_per_sample = + (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + const int h = + plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift + : img->d_h; + const int w = + (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift + : img->d_w) * + bytes_per_sample; + + for (int y = 0; y < h; ++y) { + MD5Update(&md5_, buf, w); + buf += img->stride[plane]; + } + } + } + + void Add(const uint8_t *data, size_t size) { + MD5Update(&md5_, data, static_cast(size)); + } + + const char *Get() { + static const char hex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + }; + uint8_t tmp[16]; + MD5Context ctx_tmp = md5_; + + MD5Final(tmp, &ctx_tmp); + for (int i = 0; i < 16; i++) { + res_[i * 2 + 0] = hex[tmp[i] >> 4]; + res_[i * 2 + 1] = hex[tmp[i] & 0xf]; + } + res_[32] = 0; + + return res_; + } + + protected: + char res_[33]; + MD5Context md5_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_MD5_HELPER_H_ diff --git a/media/libvpx/libvpx/test/minmax_test.cc b/media/libvpx/libvpx/test/minmax_test.cc new file mode 100644 index 0000000000..b495709063 --- /dev/null +++ b/media/libvpx/libvpx/test/minmax_test.cc @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +#include "test/acm_random.h" +#include "test/register_state_check.h" + +namespace { + +using ::libvpx_test::ACMRandom; + +typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max); + +class MinMaxTest : public ::testing::TestWithParam { + public: + void SetUp() override { + mm_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + MinMaxFunc mm_func_; + ACMRandom rnd_; +}; + +void reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 255; + int max = 0; + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(MinMaxTest, MinValue) { + for (int i = 0; i < 64; i++) { + uint8_t a[64], b[64]; + memset(a, 0, sizeof(a)); + memset(b, 255, sizeof(b)); + b[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(255, max); + EXPECT_EQ(i, min); + } +} + +TEST_P(MinMaxTest, MaxValue) { + for (int i = 0; i < 64; i++) { + uint8_t a[64], b[64]; + memset(a, 0, sizeof(a)); + memset(b, 0, sizeof(b)); + b[i] = i; // Set a maximum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } +} + +TEST_P(MinMaxTest, CompareReference) { + uint8_t a[64], b[64]; + for (int j = 0; j < 64; j++) { + a[j] = rnd_.Rand8(); + b[j] = rnd_.Rand8(); + } + + int min_ref, max_ref, min, max; + reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { + uint8_t a[8 * 64], b[8 * 64]; + for (int i = 0; i < 8 * 64; i++) { + a[i] = rnd_.Rand8(); + b[i] = rnd_.Rand8(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDMinMaxTest = MinMaxTest; + +void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 65535; + int max = 0; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(HBDMinMaxTest, MinValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(65535, max); + EXPECT_EQ(i, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, MaxValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, CompareReference) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int j = 0; j < 64; j++) { + CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16(); + } + + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + for (int i = 0; i < 8 * 64; i++) { + CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} +#endif + +INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_c)); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_sse2)); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif +#endif + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_msa)); +#endif + +} // namespace diff --git a/media/libvpx/libvpx/test/non_greedy_mv_test.cc b/media/libvpx/libvpx/test/non_greedy_mv_test.cc new file mode 100644 index 0000000000..927029de45 --- /dev/null +++ b/media/libvpx/libvpx/test/non_greedy_mv_test.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vp9/encoder/vp9_non_greedy_mv.h" +#include "./vpx_dsp_rtcd.h" + +namespace { + +static void read_in_mf(const char *filename, int *rows_ptr, int *cols_ptr, + MV **buffer_ptr) { + FILE *input = fopen(filename, "rb"); + int row, col; + int idx; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", rows_ptr, cols_ptr); + + *buffer_ptr = (MV *)malloc((*rows_ptr) * (*cols_ptr) * sizeof(MV)); + + for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) { + fscanf(input, "%d,%d;", &row, &col); + (*buffer_ptr)[idx].row = row; + (*buffer_ptr)[idx].col = col; + } + fclose(input); +} + +static void read_in_local_var(const char *filename, int *rows_ptr, + int *cols_ptr, + int (**M_ptr)[MF_LOCAL_STRUCTURE_SIZE]) { + FILE *input = fopen(filename, "rb"); + int M00, M01, M10, M11; + int idx; + int int_type; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", rows_ptr, cols_ptr); + + *M_ptr = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc( + (*rows_ptr) * (*cols_ptr) * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type)); + + for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) { + fscanf(input, "%d,%d,%d,%d;", &M00, &M01, &M10, &M11); + (*M_ptr)[idx][0] = M00; + (*M_ptr)[idx][1] = M01; + (*M_ptr)[idx][2] = M10; + (*M_ptr)[idx][3] = M11; + } + fclose(input); +} + +static void compare_mf(const MV *mf1, const MV *mf2, int rows, int cols, + float *mean_ptr, float *std_ptr) { + float float_type; + float *diffs = (float *)malloc(rows * cols * sizeof(float_type)); + int idx; + float accu = 0.0f; + for (idx = 0; idx < rows * cols; ++idx) { + MV mv1 = mf1[idx]; + MV mv2 = mf2[idx]; + float row_diff2 = (float)((mv1.row - mv2.row) * (mv1.row - mv2.row)); + float col_diff2 = (float)((mv1.col - mv2.col) * (mv1.col - mv2.col)); + diffs[idx] = sqrt(row_diff2 + col_diff2); + accu += diffs[idx]; + } + *mean_ptr = accu / rows / cols; + *std_ptr = 0; + for (idx = 0; idx < rows * cols; ++idx) { + *std_ptr += (diffs[idx] - (*mean_ptr)) * (diffs[idx] - (*mean_ptr)); + } + *std_ptr = sqrt(*std_ptr / rows / cols); + free(diffs); +} + +static void load_frame_info(const char *filename, + YV12_BUFFER_CONFIG *ref_frame_ptr) { + FILE *input = fopen(filename, "rb"); + int idx; + uint8_t data_type; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", &(ref_frame_ptr->y_height), + &(ref_frame_ptr->y_width)); + + ref_frame_ptr->y_buffer = (uint8_t *)malloc( + (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height) * sizeof(data_type)); + + for (idx = 0; idx < (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height); + ++idx) { + int value; + fscanf(input, "%d,", &value); + ref_frame_ptr->y_buffer[idx] = (uint8_t)value; + } + + ref_frame_ptr->y_stride = ref_frame_ptr->y_width; + fclose(input); +} + +static int compare_local_var(const int (*local_var1)[MF_LOCAL_STRUCTURE_SIZE], + const int (*local_var2)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols) { + int diff = 0; + int outter_idx, inner_idx; + for (outter_idx = 0; outter_idx < rows * cols; ++outter_idx) { + for (inner_idx = 0; inner_idx < MF_LOCAL_STRUCTURE_SIZE; ++inner_idx) { + diff += abs(local_var1[outter_idx][inner_idx] - + local_var2[outter_idx][inner_idx]); + } + } + return diff / rows / cols; +} + +TEST(non_greedy_mv, smooth_mf) { + const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt"; + const char *local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt"; + const char *estimation_file = "non_greedy_mv_test_files/estimation_16x16.txt"; + const char *ground_truth_file = + "non_greedy_mv_test_files/ground_truth_16x16.txt"; + BLOCK_SIZE bsize = BLOCK_32X32; + MV *search_mf = nullptr; + MV *smooth_mf = nullptr; + MV *estimation = nullptr; + MV *ground_truth = nullptr; + int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + int rows = 0, cols = 0; + + int alpha = 100, max_iter = 100; + + read_in_mf(search_mf_file, &rows, &cols, &search_mf); + read_in_local_var(local_var_file, &rows, &cols, &local_var); + read_in_mf(estimation_file, &rows, &cols, &estimation); + read_in_mf(ground_truth_file, &rows, &cols, &ground_truth); + + float sm_mean, sm_std; + float est_mean, est_std; + + smooth_mf = (MV *)malloc(rows * cols * sizeof(MV)); + vp9_get_smooth_motion_field(search_mf, local_var, rows, cols, bsize, alpha, + max_iter, smooth_mf); + + compare_mf(smooth_mf, ground_truth, rows, cols, &sm_mean, &sm_std); + compare_mf(smooth_mf, estimation, rows, cols, &est_mean, &est_std); + + EXPECT_LE(sm_mean, 3); + EXPECT_LE(est_mean, 2); + + free(search_mf); + free(local_var); + free(estimation); + free(ground_truth); + free(smooth_mf); +} + +TEST(non_greedy_mv, local_var) { + const char *ref_frame_file = "non_greedy_mv_test_files/ref_frame_16x16.txt"; + const char *cur_frame_file = "non_greedy_mv_test_files/cur_frame_16x16.txt"; + const char *gt_local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt"; + const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt"; + BLOCK_SIZE bsize = BLOCK_16X16; + int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + YV12_BUFFER_CONFIG ref_frame, cur_frame; + int rows, cols; + MV *search_mf; + int int_type; + int local_var_diff; + vp9_variance_fn_ptr_t fn; + + load_frame_info(ref_frame_file, &ref_frame); + load_frame_info(cur_frame_file, &cur_frame); + read_in_mf(search_mf_file, &rows, &cols, &search_mf); + + fn.sdf = vpx_sad16x16; + est_local_var = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc( + rows * cols * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type)); + vp9_get_local_structure(&cur_frame, &ref_frame, search_mf, &fn, rows, cols, + bsize, est_local_var); + read_in_local_var(gt_local_var_file, &rows, &cols, >_local_var); + + local_var_diff = compare_local_var(est_local_var, gt_local_var, rows, cols); + + EXPECT_LE(local_var_diff, 1); + + free(gt_local_var); + free(est_local_var); + free(ref_frame.y_buffer); +} +} // namespace diff --git a/media/libvpx/libvpx/test/partial_idct_test.cc b/media/libvpx/libvpx/test/partial_idct_test.cc new file mode 100644 index 0000000000..01e63eb691 --- /dev/null +++ b/media/libvpx/libvpx/test/partial_idct_test.cc @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_scan.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; + +namespace { + +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out, + int stride, int bd); + +template +void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + (void)bd; + fn(in, out, stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out, + int stride, int bd); +template +void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} +#endif + +typedef std::tuple + PartialInvTxfmParam; +const int kMaxNumCoeffs = 1024; +const int kCountTestBlock = 1000; + +class PartialIDctTest : public ::testing::TestWithParam { + public: + ~PartialIDctTest() override = default; + void SetUp() override { + rnd_.Reset(ACMRandom::DeterministicSeed()); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); + tx_size_ = GET_PARAM(3); + last_nonzero_ = GET_PARAM(4); + bit_depth_ = GET_PARAM(5); + pixel_size_ = GET_PARAM(6); + mask_ = (1 << bit_depth_) - 1; + + switch (tx_size_) { + case TX_4X4: size_ = 4; break; + case TX_8X8: size_ = 8; break; + case TX_16X16: size_ = 16; break; + case TX_32X32: size_ = 32; break; + default: FAIL() << "Wrong Size!"; + } + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. + if (stride_ > 16) { + stride_ &= ~15; + } + + input_block_size_ = size_ * size_; + output_block_size_ = size_ * stride_; + + input_block_ = reinterpret_cast( + vpx_memalign(16, sizeof(*input_block_) * input_block_size_)); + output_block_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); + output_block_ref_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); + } + + void TearDown() override { + vpx_free(input_block_); + input_block_ = nullptr; + vpx_free(output_block_); + output_block_ = nullptr; + vpx_free(output_block_ref_); + output_block_ref_ = nullptr; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + if (pixel_size_ == 1) { + for (int j = 0; j < output_block_size_; ++j) { + output_block_[j] = output_block_ref_[j] = rnd_.Rand16() & mask_; + } + } else { + ASSERT_EQ(2, pixel_size_); + uint16_t *const output = reinterpret_cast(output_block_); + uint16_t *const output_ref = + reinterpret_cast(output_block_ref_); + for (int j = 0; j < output_block_size_; ++j) { + output[j] = output_ref[j] = rnd_.Rand16() & mask_; + } + } + } + + void InitInput() { + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; + for (int j = 0; j < last_nonzero_; ++j) { + tran_low_t coeff = static_cast( + sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); + max_energy_leftover -= static_cast(coeff) * coeff; + if (max_energy_leftover < 0) { + max_energy_leftover = 0; + coeff = 0; + } + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coeff; + } + } + + void PrintDiff() { + if (memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) { + uint16_t ref, opt; + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + if (pixel_size_ == 1) { + ref = output_block_ref_[y * stride_ + x]; + opt = output_block_[y * stride_ + x]; + } else { + ref = reinterpret_cast( + output_block_ref_)[y * stride_ + x]; + opt = reinterpret_cast(output_block_)[y * stride_ + x]; + } + if (ref != opt) { + printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt); + } + } + } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } + } + } + + protected: + int last_nonzero_; + TX_SIZE tx_size_; + tran_low_t *input_block_; + uint8_t *output_block_; + uint8_t *output_block_ref_; + int size_; + int stride_; + int pixel_size_; + int input_block_size_; + int output_block_size_; + int bit_depth_; + int mask_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; + ACMRandom rnd_; +}; + +TEST_P(PartialIDctTest, RunQuantCheck) { + const int count_test_block = (size_ != 4) ? kCountTestBlock : 65536; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]); + + InitMem(); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + if (size_ != 4) { + if (i == 0) { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = mask_; + } + } else if (i == 1) { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = -mask_; + } + } else { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_; + } + } + } else { + // Try all possible combinations. + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = (i & (1 << k)) ? mask_ : -mask_; + } + } + + fwd_txfm_(input_extreme_block, output_ref_block, size_); + + // quantization with minimum allowed step sizes + input_block_[0] = (output_ref_block[0] / 4) * 4; + for (int k = 1; k < last_nonzero_; ++k) { + const int pos = vp9_default_scan_orders[tx_size_].scan[k]; + input_block_[pos] = (output_ref_block[pos] / 4) * 4; + } + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; + } +} + +TEST_P(PartialIDctTest, ResultsMatch) { + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); + InitInput(); + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; + } +} + +TEST_P(PartialIDctTest, AddOutputBlock) { + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); + for (int j = 0; j < last_nonzero_; ++j) { + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10; + } + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Transform results are not correctly added to output."; + } +} + +TEST_P(PartialIDctTest, SingleExtremeCoeff) { + const int16_t max_coeff = std::numeric_limits::max(); + const int16_t min_coeff = std::numeric_limits::min(); + for (int i = 0; i < last_nonzero_; ++i) { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + // Run once for min and once for max. + for (int j = 0; j < 2; ++j) { + const int coeff = j ? min_coeff : max_coeff; + + memset(output_block_, 0, pixel_size_ * output_block_size_); + memset(output_block_ref_, 0, pixel_size_ * output_block_size_); + input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Fails with single coeff of " << coeff << " at " << i + << "."; + } + } +} + +TEST_P(PartialIDctTest, DISABLED_Speed) { + // Keep runtime stable with transform size. + const int kCountSpeedTestBlock = 500000000 / input_block_size_; + InitMem(); + InitInput(); + + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + } + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_, + (pixel_size_ == 1) ? "bitdepth" : "high bitdepth", bit_depth_, + elapsed_time); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; +} + +using std::make_tuple; + +const PartialInvTxfmParam c_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(C, PartialIDctTest, + ::testing::ValuesIn(c_partial_idct_tests)); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON +const PartialInvTxfmParam neon_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, PartialIDctTest, + ::testing::ValuesIn(neon_partial_idct_tests)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +// 32x32_135_ is implemented using the 1024 version. +const PartialInvTxfmParam sse2_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, PartialIDctTest, + ::testing::ValuesIn(sse2_partial_idct_tests)); + +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +const PartialInvTxfmParam ssse3_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, PartialIDctTest, + ::testing::ValuesIn(ssse3_partial_idct_tests)); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam dspr2_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(DSPR2, PartialIDctTest, + ::testing::ValuesIn(dspr2_partial_idct_tests)); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +// 32x32_135_ is implemented using the 1024 version. +const PartialInvTxfmParam msa_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest, + ::testing::ValuesIn(msa_partial_idct_tests)); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam lsx_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), +}; + +INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest, + ::testing::ValuesIn(lsx_partial_idct_tests)); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE + +} // namespace diff --git a/media/libvpx/libvpx/test/postproc.sh b/media/libvpx/libvpx/test/postproc.sh new file mode 100755 index 0000000000..91ca9b26fe --- /dev/null +++ b/media/libvpx/libvpx/test/postproc.sh @@ -0,0 +1,63 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx postproc example code. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to postproc_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $VP8_IVF_FILE and $VP9_IVF_FILE are required. +postproc_verify_environment() { + if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs postproc using $1 as input file. $2 is the codec name, and is used +# solely to name the output file. +postproc() { + local decoder="${LIBVPX_BIN_PATH}/postproc${VPX_TEST_EXE_SUFFIX}" + local input_file="$1" + local codec="$2" + local output_file="${VPX_TEST_OUTPUT_DIR}/postproc_${codec}.raw" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +postproc_vp8() { + if [ "$(vp8_decode_available)" = "yes" ]; then + postproc "${VP8_IVF_FILE}" vp8 || return 1 + fi +} + +postproc_vp9() { + if [ "$(vpx_config_option_enabled CONFIG_VP9_POSTPROC)" = "yes" ]; then + if [ "$(vp9_decode_available)" = "yes" ]; then + postproc "${VP9_IVF_FILE}" vp9 || return 1 + fi + fi +} + +postproc_tests="postproc_vp8 + postproc_vp9" + +run_tests postproc_verify_environment "${postproc_tests}" diff --git a/media/libvpx/libvpx/test/pp_filter_test.cc b/media/libvpx/libvpx/test/pp_filter_test.cc new file mode 100644 index 0000000000..d2db8a7c7d --- /dev/null +++ b/media/libvpx/libvpx/test/pp_filter_test.cc @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; + +typedef void (*VpxPostProcDownAndAcrossMbRowFunc)( + unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, + int dst_pixels_per_line, int cols, unsigned char *flimit, int size); + +typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch, + int rows, int cols, int flimit); + +typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, + int cols, int flimit); + +namespace { +// Compute the filter level used in post proc from the loop filter strength +int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +class VpxPostProcDownAndAcrossMbRowTest + : public AbstractBench, + public ::testing::TestWithParam { + public: + VpxPostProcDownAndAcrossMbRowTest() + : mb_post_proc_down_and_across_(GetParam()) {} + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override; + + const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; + // Size of the underlying data block that will be filtered. + int block_width_; + int block_height_; + Buffer *src_image_; + Buffer *dst_image_; + uint8_t *flimits_; +}; + +void VpxPostProcDownAndAcrossMbRowTest::Run() { + mb_post_proc_down_and_across_( + src_image_->TopLeftPixel(), dst_image_->TopLeftPixel(), + src_image_->stride(), dst_image_->stride(), block_width_, flimits_, 16); +} + +// Test routine for the VPx post-processing function +// vpx_post_proc_down_and_across_mb_row_c. + +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { + // Size of the underlying data block that will be filtered. + block_width_ = 16; + block_height_ = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer src_image = Buffer(block_width_, block_height_, 2); + ASSERT_TRUE(src_image.Init()); + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. + dst_image.Set(99); + + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), + dst_image.stride(), block_width_, flimits_, 16)); + + static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 3, 4 }; + + uint8_t *pixel_ptr = dst_image.TopLeftPixel(); + for (int i = 0; i < block_height_; ++i) { + for (int j = 0; j < block_width_; ++j) { + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; + } + pixel_ptr += dst_image.stride(); + } + + vpx_free(flimits_); +} + +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { + // Size of the underlying data block that will be filtered. + // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V + // blocks are always a multiple of 8 wide and exactly 8 high. + block_width_ = 136; + block_height_ = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer src_image = + Buffer(block_width_, block_height_, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, there is 'above' padding as well + // so when the assembly code tries to read 16 bytes before the pointer it is + // not a problem. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); + Buffer dst_image_ref = + Buffer(block_width_, block_height_, 8); + ASSERT_TRUE(dst_image_ref.Init()); + + // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock + // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so + // it must be padded out. + const int flimits_width = block_width_ % 16 ? block_width_ + 8 : block_width_; + flimits_ = reinterpret_cast(vpx_memalign(16, flimits_width)); + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + // Initialize pixels in the input: + // block pixels to random values. + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(&rnd, &ACMRandom::Rand8); + + for (int blocks = 0; blocks < block_width_; blocks += 8) { + (void)memset(flimits_, 0, sizeof(*flimits_) * flimits_width); + + for (int f = 0; f < 255; f++) { + (void)memset(flimits_ + blocks, f, sizeof(*flimits_) * 8); + dst_image.Set(0); + dst_image_ref.Set(0); + + vpx_post_proc_down_and_across_mb_row_c( + src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), + src_image.stride(), dst_image_ref.stride(), block_width_, flimits_, + block_height_); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width_, flimits_, + block_height_)); + + ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); + } + } + + vpx_free(flimits_); +} + +TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { + // Size of the underlying data block that will be filtered. + block_width_ = 16; + block_height_ = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer src_image = Buffer(block_width_, block_height_, 2); + ASSERT_TRUE(src_image.Init()); + this->src_image_ = &src_image; + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + this->dst_image_ = &dst_image; + + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. + dst_image.Set(99); + + RunNTimes(INT16_MAX); + PrintMedian("16x16"); + + vpx_free(flimits_); +} + +class VpxMbPostProcAcrossIpTest + : public AbstractBench, + public ::testing::TestWithParam { + public: + VpxMbPostProcAcrossIpTest() + : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), + src_(Buffer(rows_, cols_, 8, 8, 17, 8)) {} + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override; + + void SetCols(unsigned char *s, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + s[c] = c; + } + s += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + GetParam()(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } + + const int rows_; + const int cols_; + const VpxMbPostProcAcrossIpFunc mb_post_proc_across_ip_; + Buffer src_; +}; + +void VpxMbPostProcAcrossIpTest::Run() { + mb_post_proc_across_ip_(src_.TopLeftPixel(), src_.stride(), rows_, cols_, + q2mbl(0)); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + Buffer expected_output = Buffer(cols_, rows_, 0); + ASSERT_TRUE(expected_output.Init()); + SetCols(expected_output.TopLeftPixel(), rows_, cols_, + expected_output.stride()); + + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(0), + expected_output.TopLeftPixel()); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + static const unsigned char kExpectedOutput[] = { + 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 + }; + + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(70), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + static const unsigned char kExpectedOutput[] = { + 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 + }; + + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), INT_MAX, + kExpectedOutput); + + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(100), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { + Buffer c_mem = Buffer(cols_, rows_, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); + Buffer asm_mem = Buffer(cols_, rows_, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); + + // When level >= 100, the filter behaves the same as the level = INT_MAX + // When level < 20, it behaves the same as the level = 0 + for (int level = 0; level < 100; level++) { + c_mem.SetPadding(10); + asm_mem.SetPadding(10); + SetCols(c_mem.TopLeftPixel(), rows_, cols_, c_mem.stride()); + SetCols(asm_mem.TopLeftPixel(), rows_, cols_, asm_mem.stride()); + + vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows_, + cols_, q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + asm_mem.TopLeftPixel(), asm_mem.stride(), rows_, cols_, q2mbl(level))); + + ASSERT_TRUE(asm_mem.CheckValues(c_mem)); + } +} + +TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + +class VpxMbPostProcDownTest + : public AbstractBench, + public ::testing::TestWithParam { + public: + VpxMbPostProcDownTest() + : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), + src_c_(Buffer(rows_, cols_, 8, 8, 8, 17)) {} + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override; + + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + memset(src_c, r, cols); + src_c += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + mb_post_proc_down_(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } + + const int rows_; + const int cols_; + const VpxMbPostProcDownFunc mb_post_proc_down_; + Buffer src_c_; +}; + +void VpxMbPostProcDownTest::Run() { + mb_post_proc_down_(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(0)); +} + +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + static const unsigned char kExpectedOutput[] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, + 4, 4, 3, 4, 4, 3, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 8, 9, 9, 8, 8, 8, 9, + 9, 8, 9, 9, 8, 8, 8, 9, 9, 10, 10, 9, 9, 9, 10, 10, 9, 10, 10, + 9, 9, 9, 10, 10, 10, 11, 10, 10, 10, 11, 10, 11, 10, 11, 10, 10, 10, 11, + 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), INT_MAX, + kExpectedOutput); + + src_c_.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(100), kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + static const unsigned char kExpectedOutput[] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(70), kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + std::unique_ptr expected_output( + new unsigned char[rows_ * cols_]); + ASSERT_NE(expected_output, nullptr); + SetRows(expected_output.get(), rows_, cols_, cols_); + + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), + expected_output.get()); +} + +TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + + ASSERT_TRUE(src_c_.Init()); + Buffer src_asm = Buffer(cols_, rows_, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); + + for (int level = 0; level < 100; level++) { + src_c_.SetPadding(10); + src_asm.SetPadding(10); + src_c_.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c_); + + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); + + src_c_.SetPadding(10); + src_asm.SetPadding(10); + src_c_.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c_); + + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); + } +} + +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + +INSTANTIATE_TEST_SUITE_P( + C, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); + +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_c)); + +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); + +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); + +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); + +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_msa)); + +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_msa)); +#endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + +} // namespace diff --git a/media/libvpx/libvpx/test/predict_test.cc b/media/libvpx/libvpx/test/predict_test.cc new file mode 100644 index 0000000000..474eab2cb5 --- /dev/null +++ b/media/libvpx/libvpx/test/predict_test.cc @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/msvc.h" + +namespace { + +using libvpx_test::ACMRandom; +using std::make_tuple; + +typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch); + +typedef std::tuple PredictParam; + +class PredictTestBase : public AbstractBench, + public ::testing::TestWithParam { + public: + PredictTestBase() + : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), + src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} + + void SetUp() override { + src_ = new uint8_t[kSrcSize]; + ASSERT_NE(src_, nullptr); + + // padded_dst_ provides a buffer of kBorderSize around the destination + // memory to facilitate detecting out of bounds writes. + dst_stride_ = kBorderSize + width_ + kBorderSize; + padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize); + padded_dst_ = + reinterpret_cast(vpx_memalign(16, padded_dst_size_)); + ASSERT_NE(padded_dst_, nullptr); + dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize; + + dst_c_ = new uint8_t[16 * 16]; + ASSERT_NE(dst_c_, nullptr); + + memset(src_, 0, kSrcSize); + memset(padded_dst_, 128, padded_dst_size_); + memset(dst_c_, 0, 16 * 16); + } + + void TearDown() override { + delete[] src_; + src_ = nullptr; + vpx_free(padded_dst_); + padded_dst_ = nullptr; + dst_ = nullptr; + delete[] dst_c_; + dst_c_ = nullptr; + libvpx_test::ClearSystemState(); + } + + protected: + // Make reference arrays big enough for 16x16 functions. Six-tap filters need + // 5 extra pixels outside of the macroblock. + static const int kSrcStride = 21; + static const int kSrcSize = kSrcStride * kSrcStride; + static const int kBorderSize = 16; + + int width_; + int height_; + PredictFunc predict_; + uint8_t *src_; + uint8_t *padded_dst_; + uint8_t *dst_; + int padded_dst_size_; + uint8_t *dst_c_; + int dst_stride_; + + bool CompareBuffers(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) const { + for (int height = 0; height < height_; ++height) { + EXPECT_EQ(0, memcmp(a + height * a_stride, b + height * b_stride, + sizeof(*a) * width_)) + << "Row " << height << " does not match."; + } + + return !HasFailure(); + } + + // Given a block of memory 'a' with size 'a_size', determine if all regions + // excepting block 'b' described by 'b_stride', 'b_height', and 'b_width' + // match pixel value 'c'. + bool CheckBorder(const uint8_t *a, int a_size, const uint8_t *b, int b_width, + int b_height, int b_stride, uint8_t c) const { + const uint8_t *a_end = a + a_size; + const int b_size = (b_stride * b_height) + b_width; + const uint8_t *b_end = b + b_size; + const int left_border = (b_stride - b_width) / 2; + const int right_border = left_border + ((b_stride - b_width) % 2); + + EXPECT_GE(b - left_border, a) << "'b' does not start within 'a'"; + EXPECT_LE(b_end + right_border, a_end) << "'b' does not end within 'a'"; + + // Top border. + for (int pixel = 0; pixel < b - a - left_border; ++pixel) { + EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in top border."; + } + + // Left border. + for (int height = 0; height < b_height; ++height) { + for (int width = left_border; width > 0; --width) { + EXPECT_EQ(c, b[height * b_stride - width]) + << "Mismatch at row " << height << " column " << left_border - width + << " in left border."; + } + } + + // Right border. + for (int height = 0; height < b_height; ++height) { + for (int width = b_width; width < b_width + right_border; ++width) { + EXPECT_EQ(c, b[height * b_stride + width]) + << "Mismatch at row " << height << " column " << width - b_width + << " in right border."; + } + } + + // Bottom border. + for (int pixel = static_cast(b - a + b_size); pixel < a_size; + ++pixel) { + EXPECT_EQ(c, a[pixel]) << "Mismatch at " << pixel << " in bottom border."; + } + + return !HasFailure(); + } + + void TestWithRandomData(PredictFunc reference) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // Run tests for almost all possible offsets. + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + // This represents a copy which is not required to be handled by this + // module. + continue; + } + + for (int i = 0; i < kSrcSize; ++i) { + src_[i] = rnd.Rand8(); + } + reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, + dst_c_, 16); + + ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], kSrcStride, + xoffset, yoffset, dst_, dst_stride_)); + + ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_, dst_stride_)); + ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_, width_, + height_, dst_stride_, 128)); + } + } + } + + void TestWithUnalignedDst(PredictFunc reference) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // Only the 4x4 need to be able to handle unaligned writes. + if (width_ == 4 && height_ == 4) { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + for (int i = 0; i < kSrcSize; ++i) { + src_[i] = rnd.Rand8(); + } + reference(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, + dst_c_, 16); + + for (int i = 1; i < 4; ++i) { + memset(padded_dst_, 128, padded_dst_size_); + + ASM_REGISTER_STATE_CHECK(predict_(&src_[kSrcStride * 2 + 2], + kSrcStride, xoffset, yoffset, + dst_ + i, dst_stride_ + i)); + + ASSERT_TRUE(CompareBuffers(dst_c_, 16, dst_ + i, dst_stride_ + i)); + ASSERT_TRUE(CheckBorder(padded_dst_, padded_dst_size_, dst_ + i, + width_, height_, dst_stride_ + i, 128)); + } + } + } + } + } + + void Run() override { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + + predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_, + dst_stride_); + } + } + } +}; // namespace + +class SixtapPredictTest : public PredictTestBase {}; + +TEST_P(SixtapPredictTest, TestWithRandomData) { + TestWithRandomData(vp8_sixtap_predict16x16_c); +} +TEST_P(SixtapPredictTest, TestWithUnalignedDst) { + TestWithUnalignedDst(vp8_sixtap_predict16x16_c); +} + +TEST_P(SixtapPredictTest, TestWithPresetData) { + // Test input + static const uint8_t kTestData[kSrcSize] = { + 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, + 177, 79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, + 233, 120, 48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, + 171, 32, 182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, + 99, 247, 124, 148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, + 83, 155, 91, 10, 166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, + 234, 4, 8, 103, 153, 167, 174, 187, 26, 193, 109, 64, 141, 90, 48, + 200, 174, 204, 36, 184, 114, 237, 43, 238, 242, 207, 86, 245, 182, 247, + 6, 161, 251, 14, 8, 148, 182, 182, 79, 208, 120, 188, 17, 6, 23, + 65, 206, 197, 13, 242, 126, 128, 224, 170, 110, 211, 121, 197, 200, 47, + 188, 207, 208, 184, 221, 216, 76, 148, 143, 156, 100, 8, 89, 117, 14, + 112, 183, 221, 54, 197, 208, 180, 69, 176, 94, 180, 131, 215, 121, 76, + 7, 54, 28, 216, 238, 249, 176, 58, 142, 64, 215, 242, 72, 49, 104, + 87, 161, 32, 52, 216, 230, 4, 141, 44, 181, 235, 224, 57, 195, 89, + 134, 203, 144, 162, 163, 126, 156, 84, 185, 42, 148, 145, 29, 221, 194, + 134, 52, 100, 166, 105, 60, 140, 110, 201, 184, 35, 181, 153, 93, 121, + 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77, 209, 76, 106, 174, + 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221, 223, 47, 118, 61, + 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170, 24, 226, 247, 131, + 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13, 93, 209, 131, + 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69, 49, 106, + 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215, 135, + 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36, + 119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, + 35, 93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, + 77, 67, 52, 53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, + 115, 161, 17, 83, 198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, + 201, 255, 91, 253, 52, 134, 60, 138, 131, 208, 251, 101, 48, 2, 227, + 228, 118, 132, 245, 202, 75, 91, 44, 160, 231, 47, 41, 50, 147, 220, + 74, 92, 219, 165, 89, 16 + }; + + // Expected results for xoffset = 2 and yoffset = 2. + static const int kExpectedDstStride = 16; + static const uint8_t kExpectedDst[256] = { + 117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, + 49, 38, 105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, + 177, 164, 79, 208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, + 154, 102, 102, 159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, + 186, 36, 231, 208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, + 201, 78, 149, 184, 100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, + 129, 49, 25, 133, 113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, + 78, 6, 55, 65, 240, 255, 245, 184, 72, 90, 100, 116, 131, 39, 60, + 234, 167, 33, 160, 88, 185, 200, 157, 159, 176, 127, 151, 138, 102, 168, + 106, 170, 86, 82, 219, 189, 76, 33, 115, 197, 106, 96, 198, 136, 97, + 141, 237, 151, 98, 137, 191, 185, 2, 57, 95, 142, 91, 255, 185, 97, + 137, 76, 162, 94, 173, 131, 193, 161, 81, 106, 72, 135, 222, 234, 137, + 66, 137, 106, 243, 210, 147, 95, 15, 137, 110, 85, 66, 16, 96, 167, + 147, 150, 173, 203, 140, 118, 196, 84, 147, 160, 19, 95, 101, 123, 74, + 132, 202, 82, 166, 12, 131, 166, 189, 170, 159, 85, 79, 66, 57, 152, + 132, 203, 194, 0, 1, 56, 146, 180, 224, 156, 28, 83, 181, 79, 76, + 80, 46, 160, 175, 59, 106, 43, 87, 75, 136, 85, 189, 46, 71, 200, + 90 + }; + + ASM_REGISTER_STATE_CHECK( + predict_(const_cast(kTestData) + kSrcStride * 2 + 2, + kSrcStride, 2, 2, dst_, dst_stride_)); + + ASSERT_TRUE( + CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_)); +} + +INSTANTIATE_TEST_SUITE_P( + C, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c), + make_tuple(8, 8, &vp8_sixtap_predict8x8_c), + make_tuple(8, 4, &vp8_sixtap_predict8x4_c), + make_tuple(4, 4, &vp8_sixtap_predict4x4_c))); +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), + make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), + make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), + make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); +#endif +#if HAVE_MMX +INSTANTIATE_TEST_SUITE_P( + MMX, SixtapPredictTest, + ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); +#endif +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2), + make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2), + make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3), + make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3), + make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3), + make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3))); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa), + make_tuple(8, 8, &vp8_sixtap_predict8x8_msa), + make_tuple(8, 4, &vp8_sixtap_predict8x4_msa), + make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); +#endif + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + +class BilinearPredictTest : public PredictTestBase {}; + +TEST_P(BilinearPredictTest, TestWithRandomData) { + TestWithRandomData(vp8_bilinear_predict16x16_c); +} +TEST_P(BilinearPredictTest, TestWithUnalignedDst) { + TestWithUnalignedDst(vp8_bilinear_predict16x16_c); +} +TEST_P(BilinearPredictTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 5000000 / (width_ * height_); + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", width_, height_); + PrintMedian(title); +} + +INSTANTIATE_TEST_SUITE_P( + C, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c), + make_tuple(8, 8, &vp8_bilinear_predict8x8_c), + make_tuple(8, 4, &vp8_bilinear_predict8x4_c), + make_tuple(4, 4, &vp8_bilinear_predict4x4_c))); +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon), + make_tuple(8, 8, &vp8_bilinear_predict8x8_neon), + make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), + make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); +#endif +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), + make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), + make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2), + make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3), + make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3))); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, BilinearPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa), + make_tuple(8, 8, &vp8_bilinear_predict8x8_msa), + make_tuple(8, 4, &vp8_bilinear_predict8x4_msa), + make_tuple(4, 4, &vp8_bilinear_predict4x4_msa))); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/quantize_test.cc b/media/libvpx/libvpx/test/quantize_test.cc new file mode 100644 index 0000000000..ab38f5c1b0 --- /dev/null +++ b/media/libvpx/libvpx/test/quantize_test.cc @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp8/common/blockd.h" +#include "vp8/common/onyx.h" +#include "vp8/encoder/block.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +const int kNumBlocks = 25; +const int kNumBlockEntries = 16; + +typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d); + +typedef std::tuple VP8QuantizeParam; + +using libvpx_test::ACMRandom; +using std::make_tuple; + +// Create and populate a VP8_COMP instance which has a complete set of +// quantization inputs as well as a second MACROBLOCKD for output. +class QuantizeTestBase { + public: + virtual ~QuantizeTestBase() { + vp8_remove_compressor(&vp8_comp_); + vp8_comp_ = nullptr; + vpx_free(macroblockd_dst_); + macroblockd_dst_ = nullptr; + libvpx_test::ClearSystemState(); + } + + protected: + void SetupCompressor() { + rnd_.Reset(ACMRandom::DeterministicSeed()); + + // The full configuration is necessary to generate the quantization tables. + VP8_CONFIG vp8_config; + memset(&vp8_config, 0, sizeof(vp8_config)); + + vp8_comp_ = vp8_create_compressor(&vp8_config); + + // Set the tables based on a quantizer of 0. + vp8_set_quantizer(vp8_comp_, 0); + + // Set up all the block/blockd pointers for the mb in vp8_comp_. + vp8cx_frame_init_quantizer(vp8_comp_); + + // Copy macroblockd from the reference to get pre-set-up dequant values. + macroblockd_dst_ = reinterpret_cast( + vpx_memalign(32, sizeof(*macroblockd_dst_))); + memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_)); + // Fix block pointers - currently they point to the blocks in the reference + // structure. + vp8_setup_block_dptrs(macroblockd_dst_); + } + + void UpdateQuantizer(int q) { + vp8_set_quantizer(vp8_comp_, q); + + memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_)); + vp8_setup_block_dptrs(macroblockd_dst_); + } + + void FillCoeffConstant(int16_t c) { + for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) { + vp8_comp_->mb.coeff[i] = c; + } + } + + void FillCoeffRandom() { + for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) { + vp8_comp_->mb.coeff[i] = rnd_.Rand8(); + } + } + + void CheckOutput() { + EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.qcoeff, macroblockd_dst_->qcoeff, + sizeof(*macroblockd_dst_->qcoeff) * kNumBlocks * + kNumBlockEntries)) + << "qcoeff mismatch"; + EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.dqcoeff, macroblockd_dst_->dqcoeff, + sizeof(*macroblockd_dst_->dqcoeff) * kNumBlocks * + kNumBlockEntries)) + << "dqcoeff mismatch"; + EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.eobs, macroblockd_dst_->eobs, + sizeof(*macroblockd_dst_->eobs) * kNumBlocks)) + << "eobs mismatch"; + } + + VP8_COMP *vp8_comp_; + MACROBLOCKD *macroblockd_dst_; + + private: + ACMRandom rnd_; +}; + +class QuantizeTest : public QuantizeTestBase, + public ::testing::TestWithParam, + public AbstractBench { + protected: + void SetUp() override { + SetupCompressor(); + asm_quant_ = GET_PARAM(0); + c_quant_ = GET_PARAM(1); + } + + void Run() override { + asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); + } + + void RunComparison() { + for (int i = 0; i < kNumBlocks; ++i) { + ASM_REGISTER_STATE_CHECK( + c_quant_(&vp8_comp_->mb.block[i], &vp8_comp_->mb.e_mbd.block[i])); + ASM_REGISTER_STATE_CHECK( + asm_quant_(&vp8_comp_->mb.block[i], ¯oblockd_dst_->block[i])); + } + + CheckOutput(); + } + + private: + VP8Quantize asm_quant_; + VP8Quantize c_quant_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(QuantizeTest); + +TEST_P(QuantizeTest, TestZeroInput) { + FillCoeffConstant(0); + RunComparison(); +} + +TEST_P(QuantizeTest, TestLargeNegativeInput) { + FillCoeffConstant(0); + // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues + // like BUG=883 where the constant being compared was incorrectly initialized. + vp8_comp_->mb.coeff[0] = -8191; + RunComparison(); +} + +TEST_P(QuantizeTest, TestRandomInput) { + FillCoeffRandom(); + RunComparison(); +} + +TEST_P(QuantizeTest, TestMultipleQ) { + for (int q = 0; q < QINDEX_RANGE; ++q) { + UpdateQuantizer(q); + FillCoeffRandom(); + RunComparison(); + } +} + +TEST_P(QuantizeTest, DISABLED_Speed) { + FillCoeffRandom(); + + RunNTimes(10000000); + PrintMedian("vp8 quantize"); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_sse2, &vp8_regular_quantize_b_c))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3, + &vp8_fast_quantize_b_c))); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, QuantizeTest, + ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1, + &vp8_regular_quantize_b_c))); +#endif // HAVE_SSE4_1 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, + &vp8_fast_quantize_b_c))); +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); +#endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, QuantizeTest, + ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx, + &vp8_regular_quantize_b_c))); +#endif // HAVE_LSX +} // namespace diff --git a/media/libvpx/libvpx/test/realtime_test.cc b/media/libvpx/libvpx/test/realtime_test.cc new file mode 100644 index 0000000000..a9870b3cbf --- /dev/null +++ b/media/libvpx/libvpx/test/realtime_test.cc @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 2; + +class RealtimeTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} + ~RealtimeTest() override = default; + + void SetUp() override { + InitializeConfig(); + cfg_.g_lag_in_frames = 0; + SetMode(::libvpx_test::kRealTime); + } + + void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY + // TODO(tomfinegan): We're changing the pass value here to make sure + // we get frames when real time mode is combined with |g_pass| set to + // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets + // the pass value based on the mode passed into EncoderTest::SetMode(), + // which overrides the one specified in SetUp() above. + cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0 && set_cpu_used_) { + encoder->Control(VP8E_SET_CPUUSED, 8); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override { + frame_packets_++; + } + + bool IsVP9() const { +#if CONFIG_VP9_ENCODER + return codec_ == &libvpx_test::kVP9; +#else + return false; +#endif + } + + void TestIntegerOverflow(unsigned int width, unsigned int height) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(width, height); + video.set_limit(20); + cfg_.rc_target_bitrate = UINT_MAX; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + + int frame_packets_; + bool set_cpu_used_ = true; +}; + +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); +} + +TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } + +TEST_P(RealtimeTest, IntegerOverflowLarge) { + if (IsVP9()) { +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 + TestIntegerOverflow(16384, 16384); +#else + TestIntegerOverflow(4096, 4096); +#endif + } else { + GTEST_SKIP() + << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" + << " Enable this test after bitstream errors & undefined sanitizer " + "warnings are fixed."; + // TestIntegerOverflow(16383, 16383); + } +} + +VP8_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); + +} // namespace diff --git a/media/libvpx/libvpx/test/register_state_check.h b/media/libvpx/libvpx/test/register_state_check.h new file mode 100644 index 0000000000..ede86ef52f --- /dev/null +++ b/media/libvpx/libvpx/test/register_state_check.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_REGISTER_STATE_CHECK_H_ +#define VPX_TEST_REGISTER_STATE_CHECK_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +// ASM_REGISTER_STATE_CHECK(asm_function) +// Minimally validates the environment pre & post function execution. This +// variant should be used with assembly functions which are not expected to +// fully restore the system state. See platform implementations of +// RegisterStateCheck for details. +// +// API_REGISTER_STATE_CHECK(api_function) +// Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any +// additional checks to ensure the environment is in a consistent state pre & +// post function execution. This variant should be used with API functions. +// See platform implementations of RegisterStateCheckXXX for details. +// + +#if defined(_WIN64) && VPX_ARCH_X86_64 + +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#include +#include + +inline bool operator==(const M128A &lhs, const M128A &rhs) { + return (lhs.Low == rhs.Low && lhs.High == rhs.High); +} + +namespace libvpx_test { + +// Compares the state of xmm[6-15] at construction with their state at +// destruction. These registers should be preserved by the callee on +// Windows x64. +class RegisterStateCheck { + public: + RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); } + ~RegisterStateCheck() { Check(); } + + private: + static bool StoreRegisters(CONTEXT *const context) { + const HANDLE this_thread = GetCurrentThread(); + EXPECT_NE(this_thread, nullptr); + context->ContextFlags = CONTEXT_FLOATING_POINT; + const bool context_saved = GetThreadContext(this_thread, context) == TRUE; + EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); + return context_saved; + } + + // Compares the register state. Returns true if the states match. + void Check() const { + ASSERT_TRUE(initialized_); + CONTEXT post_context; + ASSERT_TRUE(StoreRegisters(&post_context)); + + const M128A *xmm_pre = &pre_context_.Xmm6; + const M128A *xmm_post = &post_context.Xmm6; + for (int i = 6; i <= 15; ++i) { + EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!"; + ++xmm_pre; + ++xmm_post; + } + } + + bool initialized_; + CONTEXT pre_context_; +}; + +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + _ReadWriteBarrier(); \ + } while (false) + +} // namespace libvpx_test + +#elif defined(CONFIG_SHARED) && defined(HAVE_NEON_ASM) && \ + defined(CONFIG_VP9) && !CONFIG_SHARED && HAVE_NEON_ASM && CONFIG_VP9 + +extern "C" { +// Save the d8-d15 registers into store. +void vpx_push_neon(int64_t *store); +} + +namespace libvpx_test { + +// Compares the state of d8-d15 at construction with their state at +// destruction. These registers should be preserved by the callee on +// arm platform. +class RegisterStateCheck { + public: + RegisterStateCheck() { vpx_push_neon(pre_store_); } + ~RegisterStateCheck() { Check(); } + + private: + // Compares the register state. Returns true if the states match. + void Check() const { + int64_t post_store[8]; + vpx_push_neon(post_store); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; + } + } + + int64_t pre_store_[8]; +}; + +#if defined(__GNUC__) +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + __asm__ volatile("" ::: "memory"); \ + } while (false) +#else +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } while (false) +#endif + +} // namespace libvpx_test + +#else + +namespace libvpx_test { + +class RegisterStateCheck {}; +#define ASM_REGISTER_STATE_CHECK(statement) statement + +} // namespace libvpx_test + +#endif // _WIN64 && VPX_ARCH_X86_64 + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#if defined(__GNUC__) + +namespace libvpx_test { + +// Checks the FPU tag word pre/post execution to ensure emms has been called. +class RegisterStateCheckMMX { + public: + RegisterStateCheckMMX() { + __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_)); + } + ~RegisterStateCheckMMX() { Check(); } + + private: + // Checks the FPU tag word pre/post execution, returning false if not cleared + // to 0xffff. + void Check() const { + EXPECT_EQ(0xffff, pre_fpu_env_[4]) + << "FPU was in an inconsistent state prior to call"; + + uint16_t post_fpu_env[14]; + __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env)); + EXPECT_EQ(0xffff, post_fpu_env[4]) + << "FPU was left in an inconsistent state after call"; + } + + uint16_t pre_fpu_env_[14]; +}; + +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ + } while (false) + +} // namespace libvpx_test + +#endif // __GNUC__ +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + +#ifndef API_REGISTER_STATE_CHECK +#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK +#endif + +#endif // VPX_TEST_REGISTER_STATE_CHECK_H_ diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc new file mode 100644 index 0000000000..20ad2229b4 --- /dev/null +++ b/media/libvpx/libvpx/test/resize_test.cc @@ -0,0 +1,783 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" +#include "test/util.h" + +// Enable(1) or Disable(0) writing of the compressed bitstream. +#define WRITE_COMPRESSED_STREAM 0 + +namespace { + +#if WRITE_COMPRESSED_STREAM +static void mem_put_le16(char *const mem, const unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; +} + +static void mem_put_le32(char *const mem, const unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; + mem[2] = val >> 16; + mem[3] = val >> 24; +} + +static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg, + int frame_cnt, FILE *const outfile) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, 0x30395056); /* fourcc (vp9) */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_size(FILE *const outfile, const size_t size) { + char header[4]; + mem_put_le32(header, static_cast(size)); + (void)fwrite(header, 1, 4, outfile); +} + +static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt, + FILE *const outfile) { + char header[12]; + vpx_codec_pts_t pts; + + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, static_cast(pkt->data.frame.sz)); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} +#endif // WRITE_COMPRESSED_STREAM + +const unsigned int kInitialWidth = 320; +const unsigned int kInitialHeight = 240; + +struct FrameInfo { + FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h) + : pts(_pts), w(_w), h(_h) {} + + vpx_codec_pts_t pts; + unsigned int w; + unsigned int h; +}; + +void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, + unsigned int initial_h, unsigned int *w, + unsigned int *h, bool flag_codec, + bool smaller_width_larger_size_) { + *w = initial_w; + *h = initial_h; + + if (smaller_width_larger_size_) { + if (frame < 30) { + return; + } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; + return; + } + if (frame < 10) { + return; + } + if (frame < 20) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 30) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 40) { + return; + } + if (frame < 50) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 60) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 70) { + return; + } + if (frame < 80) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 90) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 100) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 110) { + return; + } + if (frame < 120) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 130) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 140) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 150) { + return; + } + if (frame < 160) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 170) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 180) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 190) { + return; + } + if (frame < 200) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 210) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 220) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 230) { + return; + } + if (frame < 240) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 250) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 260) { + return; + } + // Go down very low. + if (frame < 270) { + *w = initial_w / 4; + *h = initial_h / 4; + return; + } + if (flag_codec == 1) { + // Cases that only works for VP9. + // For VP9: Swap width and height of original. + if (frame < 320) { + return; + } + } +} + +class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { + public: + ResizingVideoSource() { + SetSize(kInitialWidth, kInitialHeight); + limit_ = 350; + smaller_width_larger_size_ = false; + } + bool flag_codec_; + bool smaller_width_larger_size_; + ~ResizingVideoSource() override = default; + + protected: + void Next() override { + ++frame_; + unsigned int width = 0; + unsigned int height = 0; + ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, + flag_codec_, smaller_width_larger_size_); + SetSize(width, height); + FillFrame(); + } +}; + +class ResizeTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ResizeTest() : EncoderTest(GET_PARAM(0)) {} + + ~ResizeTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + std::vector frame_info_list_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; +}; + +TEST_P(ResizeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = false; + video.smaller_width_larger_size_ = false; + cfg_.g_lag_in_frames = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + } +} + +const unsigned int kStepDownFrame = 3; +const unsigned int kStepUpFrame = 6; + +class ResizeInternalTest : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeInternalTest() + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} +#else + ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + ~ResizeInternalTest() override = default; + + void BeginPassHook(unsigned int /*pass*/) override { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("vp90-2-05-resize.ivf", "wb"); +#endif + } + + void EndPassHook() override { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = nullptr; + } +#endif + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (change_config_) { + int new_q = 60; + if (video->frame() == 0) { + struct vpx_scaling_mode mode = { VP8E_ONETWO, VP8E_ONETWO }; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + } + if (video->frame() == 1) { + struct vpx_scaling_mode mode = { VP8E_NORMAL, VP8E_NORMAL }; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q; + encoder->Config(&cfg_); + } + } else { + if (video->frame() == kStepDownFrame) { + struct vpx_scaling_mode mode = { VP8E_FOURFIVE, VP8E_THREEFIVE }; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + } + if (video->frame() == kStepUpFrame) { + struct vpx_scaling_mode mode = { VP8E_NORMAL, VP8E_NORMAL }; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + } + } + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); + } + +#if WRITE_COMPRESSED_STREAM + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; + bool change_config_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +TEST_P(ResizeInternalTest, TestInternalResizeWorks) { + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + init_flags_ = VPX_CODEC_USE_PSNR; + change_config_ = false; + + // q picked such that initial keyframe on this clip is ~30dB PSNR + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + + // If the number of frames being encoded is smaller than g_lag_in_frames + // the encoded frame is unavailable using the current API. Comparing + // frames to detect mismatch would then not be possible. Set + // g_lag_in_frames = 0 to get around this. + cfg_.g_lag_in_frames = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const vpx_codec_pts_t pts = info->pts; + if (pts >= kStepDownFrame && pts < kStepUpFrame) { + ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width"; + ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height"; + } else { + EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width"; + EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height"; + } + } +} + +TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) { + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_config_ = true; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +class ResizeRealtimeTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} + ~ResizeRealtimeTest() override = default; + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_AQ_MODE, 3); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + } + + if (change_bitrate_ && video->frame() == 120) { + change_bitrate_ = false; + cfg_.rc_target_bitrate = 500; + encoder->Config(&cfg_); + } + } + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + + void DefaultConfig() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.kf_mode = VPX_KF_AUTO; + cfg_.g_lag_in_frames = 0; + cfg_.kf_min_dist = cfg_.kf_max_dist = 3000; + // Enable dropped frames. + cfg_.rc_dropframe_thresh = 1; + // Enable error_resilience mode. + cfg_.g_error_resilient = 1; + // Enable dynamic resizing. + cfg_.rc_resize_allowed = 1; + // Run at low bitrate. + cfg_.rc_target_bitrate = 200; + } + + std::vector frame_info_list_; + int set_cpu_used_; + bool change_bitrate_; + double mismatch_psnr_; + int mismatch_nframes_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; +}; + +TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = true; + video.smaller_width_larger_size_ = false; + DefaultConfig(); + // Disable internal resize for this test. + cfg_.rc_resize_allowed = 0; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); + } +} + +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { + ResizingVideoSource video; + video.flag_codec_ = true; + video.smaller_width_larger_size_ = true; + DefaultConfig(); + // Disable internal resize for this test. + cfg_.rc_resize_allowed = 0; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); + } +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Run at low bitrate, with resize_allowed = 1, and verify that we get +// one resize down event. +TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 299); + DefaultConfig(); + cfg_.g_w = 640; + cfg_.g_h = 480; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + last_w = info->w; + last_h = info->h; + resize_count++; + } + } + +#if CONFIG_VP9_DECODER + // Verify that we get 1 resize down event in this test. + ASSERT_EQ(1, resize_count) << "Resizing should occur."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); +#endif +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Start at low target bitrate, raise the bitrate in the middle of the clip, +// scaling-up should occur after bitrate changed. +TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + DefaultConfig(); + cfg_.g_w = 640; + cfg_.g_h = 480; + change_bitrate_ = true; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 80; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); + if (info->w != last_w || info->h != last_h) { + resize_count++; + if (resize_count <= 2) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + } else if (resize_count > 2) { + // Verify that resize up occurs. + ASSERT_GT(info->w, last_w); + ASSERT_GT(info->h, last_h); + } + last_w = info->w; + last_h = info->h; + } + } + +#if CONFIG_VP9_DECODER + // Verify that we get 4 resize events in this test. + ASSERT_EQ(resize_count, 4) << "Resizing should occur twice."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); +#endif +} + +vpx_img_fmt_t CspForFrameNumber(int frame) { + if (frame < 10) return VPX_IMG_FMT_I420; + if (frame < 20) return VPX_IMG_FMT_I444; + return VPX_IMG_FMT_I420; +} + +class ResizeCspTest : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeCspTest() + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} +#else + ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + ~ResizeCspTest() override = default; + + void BeginPassHook(unsigned int /*pass*/) override { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb"); +#endif + } + + void EndPassHook() override { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = nullptr; + } +#endif + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 && + cfg_.g_profile != 1) { + cfg_.g_profile = 1; + encoder->Config(&cfg_); + } + if (CspForFrameNumber(video->frame()) == VPX_IMG_FMT_I420 && + cfg_.g_profile != 0) { + cfg_.g_profile = 0; + encoder->Config(&cfg_); + } + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); + } + +#if WRITE_COMPRESSED_STREAM + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { + public: + ResizingCspVideoSource() { + SetSize(kInitialWidth, kInitialHeight); + limit_ = 30; + } + + ~ResizingCspVideoSource() override = default; + + protected: + void Next() override { + ++frame_; + SetImageFormat(CspForFrameNumber(frame_)); + FillFrame(); + } +}; + +TEST_P(ResizeCspTest, TestResizeCspWorks) { + ResizingCspVideoSource video; + init_flags_ = VPX_CODEC_USE_PSNR; + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + cfg_.g_lag_in_frames = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, + ::testing::Values(::libvpx_test::kOnePassBest)); +VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_SUITE(ResizeCspTest, + ::testing::Values(::libvpx_test::kRealTime)); +} // namespace diff --git a/media/libvpx/libvpx/test/resize_util.sh b/media/libvpx/libvpx/test/resize_util.sh new file mode 100755 index 0000000000..a9b0f81e2b --- /dev/null +++ b/media/libvpx/libvpx/test/resize_util.sh @@ -0,0 +1,69 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx resize_util example code. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to resize_util_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +resize_util_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Resizes $YUV_RAW_INPUT using the resize_util example. $1 is the output +# dimensions that will be passed to resize_util. +resize_util() { + local resizer="${LIBVPX_BIN_PATH}/resize_util${VPX_TEST_EXE_SUFFIX}" + local output_file="${VPX_TEST_OUTPUT_DIR}/resize_util.raw" + local frames_to_resize="10" + local target_dimensions="$1" + + # resize_util is available only when CONFIG_SHARED is disabled. + if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then + if [ ! -x "${resizer}" ]; then + elog "${resizer} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${resizer}" "${YUV_RAW_INPUT}" \ + "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \ + "${target_dimensions}" "${output_file}" ${frames_to_resize} \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 + fi +} + +# Halves each dimension of $YUV_RAW_INPUT using resize_util(). +resize_down() { + local target_width=$((${YUV_RAW_INPUT_WIDTH} / 2)) + local target_height=$((${YUV_RAW_INPUT_HEIGHT} / 2)) + + resize_util "${target_width}x${target_height}" +} + +# Doubles each dimension of $YUV_RAW_INPUT using resize_util(). +resize_up() { + local target_width=$((${YUV_RAW_INPUT_WIDTH} * 2)) + local target_height=$((${YUV_RAW_INPUT_HEIGHT} * 2)) + + resize_util "${target_width}x${target_height}" +} + +resize_util_tests="resize_down + resize_up" + +run_tests resize_util_verify_environment "${resize_util_tests}" diff --git a/media/libvpx/libvpx/test/sad_test.cc b/media/libvpx/libvpx/test/sad_test.cc new file mode 100644 index 0000000000..3530e66050 --- /dev/null +++ b/media/libvpx/libvpx/test/sad_test.cc @@ -0,0 +1,2079 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/msvc.h" +#include "vpx_ports/vpx_timer.h" + +// const[expr] should be sufficient for DECLARE_ALIGNED but early +// implementations of c++11 appear to have some issues with it. +#define kDataAlignment 32 + +template +struct TestParams { + TestParams(int w, int h, Function f, int bd = -1) + : width(w), height(h), bit_depth(bd), func(f) {} + int width, height, bit_depth; + Function func; +}; + +typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams SadMxNParam; + +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams SadSkipMxNParam; + +typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred); +typedef TestParams SadMxNAvgParam; + +typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams SadMxNx4Param; + +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams SadSkipMxNx4Param; + +typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sad_array); + +using libvpx_test::ACMRandom; + +namespace { +template +class SADTestBase : public ::testing::TestWithParam { + public: + explicit SADTestBase(const ParamType ¶ms) : params_(params) {} + + void SetUp() override { + source_data8_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize)); + reference_data8_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize)); + second_pred8_ = + reinterpret_cast(vpx_memalign(kDataAlignment, 64 * 64)); + source_data16_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t))); + reference_data16_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t))); + second_pred16_ = reinterpret_cast( + vpx_memalign(kDataAlignment, 64 * 64 * sizeof(uint16_t))); + + if (params_.bit_depth == -1) { + use_high_bit_depth_ = false; + bit_depth_ = VPX_BITS_8; + source_data_ = source_data8_; + reference_data_ = reference_data8_; + second_pred_ = second_pred8_; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + use_high_bit_depth_ = true; + bit_depth_ = static_cast(params_.bit_depth); + source_data_ = CONVERT_TO_BYTEPTR(source_data16_); + reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_); + second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + mask_ = (1 << bit_depth_) - 1; + source_stride_ = (params_.width + 63) & ~63; + reference_stride_ = params_.width * 2; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + void TearDown() override { + vpx_free(source_data8_); + source_data8_ = nullptr; + vpx_free(reference_data8_); + reference_data8_ = nullptr; + vpx_free(second_pred8_); + second_pred8_ = nullptr; + vpx_free(source_data16_); + source_data16_ = nullptr; + vpx_free(reference_data16_); + reference_data16_ = nullptr; + vpx_free(second_pred16_); + second_pred16_ = nullptr; + + libvpx_test::ClearSystemState(); + } + + protected: + // Handle blocks up to 4 blocks 64x64 with stride up to 128 + // crbug.com/webm/1660 + static const int kDataBlockSize = 64 * 128; + static const int kDataBufferSize = 4 * kDataBlockSize; + + int GetBlockRefOffset(int block_idx) const { + return block_idx * kDataBlockSize; + } + + uint8_t *GetReferenceFromOffset(int ref_offset) const { + assert((params_.height - 1) * reference_stride_ + params_.width - 1 + + ref_offset < + kDataBufferSize); +#if CONFIG_VP9_HIGHBITDEPTH + if (use_high_bit_depth_) { + return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) + + ref_offset); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + return reference_data_ + ref_offset; + } + + uint8_t *GetReference(int block_idx) const { + return GetReferenceFromOffset(GetBlockRefOffset(block_idx)); + } + + // Sum of Absolute Differences. Given two blocks, calculate the absolute + // difference between two pixels in the same relative location; accumulate. + uint32_t ReferenceSAD(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; ++h) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad; + } + + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. + uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + + // Sum of Absolute Differences Average. Given two blocks, and a prediction + // calculate the absolute difference between one pixel and average of the + // corresponding and predicted pixels; accumulate. + unsigned int ReferenceSADavg(int block_idx) const { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint8_t *const second_pred8 = second_pred_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; ++h) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + const int tmp = second_pred8[h * params_.width + w] + + reference8[h * reference_stride_ + w]; + const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source8[h * source_stride_ + w] - comp_pred); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + const int tmp = second_pred16[h * params_.width + w] + + reference16[h * reference_stride_ + w]; + const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source16[h * source_stride_ + w] - comp_pred); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad; + } + + void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) const { + uint8_t *data8 = data; +#if CONFIG_VP9_HIGHBITDEPTH + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; ++h) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = static_cast(fill_constant); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + data16[h * stride + w] = fill_constant; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + } + + void FillRandomWH(uint8_t *data, int stride, int w, int h) { + uint8_t *data8 = data; +#if CONFIG_VP9_HIGHBITDEPTH + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + if (!use_high_bit_depth_) { + data8[r * stride + c] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + data16[r * stride + c] = rnd_.Rand16() & mask_; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + } + + void FillRandom(uint8_t *data, int stride) { + FillRandomWH(data, stride, params_.width, params_.height); + } + + uint32_t mask_; + vpx_bit_depth_t bit_depth_; + int source_stride_; + int reference_stride_; + bool use_high_bit_depth_; + + uint8_t *source_data_; + uint8_t *reference_data_; + uint8_t *second_pred_; + uint8_t *source_data8_; + uint8_t *reference_data8_; + uint8_t *second_pred8_; + uint16_t *source_data16_; + uint16_t *reference_data16_; + uint16_t *second_pred16_; + + ACMRandom rnd_; + ParamType params_; +}; + +class SADx4Test : public SADTestBase { + public: + SADx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSAD(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + +class SADSkipx4Test : public SADTestBase { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + +class SADTest : public AbstractBench, public SADTestBase { + public: + SADTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSAD(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + +class SADSkipTest : public AbstractBench, public SADTestBase { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + +class SADavgTest : public AbstractBench, public SADTestBase { + public: + SADavgTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD_avg(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_, + second_pred_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADavg(0); + const unsigned int exp_sad = SAD_avg(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_, second_pred_); + } +}; + +TEST_P(SADTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + +TEST_P(SADavgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, params_.width, 0); + CheckSAD(); +} +TEST_P(SADavgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, params_.width, 0); + CheckSAD(); +} + +TEST_P(SADavgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + +TEST_P(SADx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + +//------------------------------------------------------------------------------ +// C functions +const SadMxNParam c_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_c), + SadMxNParam(64, 32, &vpx_sad64x32_c), + SadMxNParam(32, 64, &vpx_sad32x64_c), + SadMxNParam(32, 32, &vpx_sad32x32_c), + SadMxNParam(32, 16, &vpx_sad32x16_c), + SadMxNParam(16, 32, &vpx_sad16x32_c), + SadMxNParam(16, 16, &vpx_sad16x16_c), + SadMxNParam(16, 8, &vpx_sad16x8_c), + SadMxNParam(8, 16, &vpx_sad8x16_c), + SadMxNParam(8, 8, &vpx_sad8x8_c), + SadMxNParam(8, 4, &vpx_sad8x4_c), + SadMxNParam(4, 8, &vpx_sad4x8_c), + SadMxNParam(4, 4, &vpx_sad4x4_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_c, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_c, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_c, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_c, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_c, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_c, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_c, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_c, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_c, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_c, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_c, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_c, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_c, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_c, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_c, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_c, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_c, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_c, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_c, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_c, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_c, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_c, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_c, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_c, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_c, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_c, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_c, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_c, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_c, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_c, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_c, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_c, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_c, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_c, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_c, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_c, 12), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); + +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + +const SadMxNAvgParam avg_c_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_c), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_c), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_c), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_c), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_c), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_c), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_c), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_c), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_c), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_c), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_c, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_c, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_c, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_c, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_c, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_c, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_c, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_c, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_c, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_c, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_c, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_c, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_c, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_c, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_c, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_c, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_c, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_c, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_c, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_c, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_c, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_c, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_c, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_c, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_c, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_c, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_c, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_c, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_c, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_c, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_c, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_c, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_c, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_c, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_c, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_c, 12), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); + +const SadMxNx4Param x4d_c_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_c), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_c), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_c), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_c), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_c), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_c), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_c), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_c), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_c), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_c), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_c), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_c), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_c, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_c, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_c, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_c, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_c, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_c, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_c, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_c, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_c, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_c, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_c, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_c, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_c, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_c, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_c, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_c, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_c, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_c, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_c, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_c, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_c, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_c, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_c, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_c, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_c, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_c, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_c, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_c, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_c, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_c, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_c, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_c, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_c, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_c, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_c, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_c, 12), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); + +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + +//------------------------------------------------------------------------------ +// ARM functions +#if HAVE_NEON +const SadMxNParam neon_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), + SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), + SadMxNParam(16, 16, &vpx_sad16x16_neon), + SadMxNParam(16, 8, &vpx_sad16x8_neon), + SadMxNParam(8, 16, &vpx_sad8x16_neon), + SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), + SadMxNParam(4, 4, &vpx_sad4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH + +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, + ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadMxNx4Param x4d_neon_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNx4Param x4d_neon_dotprod_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, + ::testing::ValuesIn(x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); + +#if HAVE_NEONE_DOTPROD +const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD +#endif // HAVE_NEON + +//------------------------------------------------------------------------------ +// x86 functions +#if HAVE_SSE2 +const SadMxNParam sse2_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_sse2), + SadMxNParam(64, 32, &vpx_sad64x32_sse2), + SadMxNParam(32, 64, &vpx_sad32x64_sse2), + SadMxNParam(32, 32, &vpx_sad32x32_sse2), + SadMxNParam(32, 16, &vpx_sad32x16_sse2), + SadMxNParam(16, 32, &vpx_sad16x32_sse2), + SadMxNParam(16, 16, &vpx_sad16x16_sse2), + SadMxNParam(16, 8, &vpx_sad16x8_sse2), + SadMxNParam(8, 16, &vpx_sad8x16_sse2), + SadMxNParam(8, 8, &vpx_sad8x8_sse2), + SadMxNParam(8, 4, &vpx_sad8x4_sse2), + SadMxNParam(4, 8, &vpx_sad4x8_sse2), + SadMxNParam(4, 4, &vpx_sad4x4_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_sse2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_sse2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_sse2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_sse2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_sse2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_sse2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_sse2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_sse2, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_sse2, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_sse2, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_sse2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_sse2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_sse2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_sse2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_sse2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_sse2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_sse2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_sse2, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_sse2, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_sse2, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_sse2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_sse2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_sse2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_sse2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_sse2, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_sse2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_sse2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_sse2, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_sse2, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_sse2, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); + +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + +const SadMxNAvgParam avg_sse2_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_sse2), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_sse2), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_sse2), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_sse2), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_sse2), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_sse2), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_sse2), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_sse2), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_sse2), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_sse2), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_sse2, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_sse2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_sse2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_sse2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_sse2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_sse2, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_sse2, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_sse2, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_sse2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_sse2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_sse2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_sse2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_sse2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_sse2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_sse2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_sse2, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_sse2, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_sse2, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_sse2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_sse2, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_sse2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_sse2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_sse2, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_sse2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_sse2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_sse2, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_sse2, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_sse2, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); + +const SadMxNx4Param x4d_sse2_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_sse2), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_sse2), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_sse2), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_sse2), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_sse2), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_sse2), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_sse2), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_sse2), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_sse2), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_sse2), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_sse2), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_sse2), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_sse2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_sse2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_sse2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_sse2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_sse2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_sse2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_sse2, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_sse2, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_sse2, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_sse2, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_sse2, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_sse2, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_sse2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_sse2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_sse2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_sse2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_sse2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_sse2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_sse2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_sse2, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_sse2, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_sse2, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_sse2, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_sse2, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_sse2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_sse2, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_sse2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_sse2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_sse2, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_sse2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_sse2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_sse2, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_sse2, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_sse2, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_sse2, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_sse2, 12), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); +#endif // HAVE_SSE2 + +#if HAVE_SSE3 +// Only functions are x3, which do not have tests. +#endif // HAVE_SSE3 + +#if HAVE_SSSE3 +// Only functions are x3, which do not have tests. +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const SadMxNParam avx2_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_avx2), + SadMxNParam(64, 32, &vpx_sad64x32_avx2), + SadMxNParam(32, 64, &vpx_sad32x64_avx2), + SadMxNParam(32, 32, &vpx_sad32x32_avx2), + SadMxNParam(32, 16, &vpx_sad32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); + +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + +const SadMxNAvgParam avg_avx2_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); + +const SadMxNx4Param x4d_avx2_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); + +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + +#endif // HAVE_AVX2 + +#if HAVE_AVX512 +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); +#endif // HAVE_AVX512 + +//------------------------------------------------------------------------------ +// MIPS functions +#if HAVE_MSA +const SadMxNParam msa_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_msa), + SadMxNParam(64, 32, &vpx_sad64x32_msa), + SadMxNParam(32, 64, &vpx_sad32x64_msa), + SadMxNParam(32, 32, &vpx_sad32x32_msa), + SadMxNParam(32, 16, &vpx_sad32x16_msa), + SadMxNParam(16, 32, &vpx_sad16x32_msa), + SadMxNParam(16, 16, &vpx_sad16x16_msa), + SadMxNParam(16, 8, &vpx_sad16x8_msa), + SadMxNParam(8, 16, &vpx_sad8x16_msa), + SadMxNParam(8, 8, &vpx_sad8x8_msa), + SadMxNParam(8, 4, &vpx_sad8x4_msa), + SadMxNParam(4, 8, &vpx_sad4x8_msa), + SadMxNParam(4, 4, &vpx_sad4x4_msa), +}; +INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); + +const SadMxNAvgParam avg_msa_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_msa), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_msa), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_msa), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_msa), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_msa), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_msa), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_msa), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_msa), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_msa), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_msa), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_msa), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_msa), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_msa), +}; +INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests)); + +const SadMxNx4Param x4d_msa_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_msa), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_msa), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_msa), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_msa), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_msa), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_msa), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_msa), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_msa), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_msa), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_msa), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_msa), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_msa), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_msa), +}; +INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); +#endif // HAVE_MSA + +//------------------------------------------------------------------------------ +// VSX functions +#if HAVE_VSX +const SadMxNParam vsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_vsx), + SadMxNParam(64, 32, &vpx_sad64x32_vsx), + SadMxNParam(32, 64, &vpx_sad32x64_vsx), + SadMxNParam(32, 32, &vpx_sad32x32_vsx), + SadMxNParam(32, 16, &vpx_sad32x16_vsx), + SadMxNParam(16, 32, &vpx_sad16x32_vsx), + SadMxNParam(16, 16, &vpx_sad16x16_vsx), + SadMxNParam(16, 8, &vpx_sad16x8_vsx), + SadMxNParam(8, 16, &vpx_sad8x16_vsx), + SadMxNParam(8, 8, &vpx_sad8x8_vsx), + SadMxNParam(8, 4, &vpx_sad8x4_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); +#endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI + +//------------------------------------------------------------------------------ +// loongarch functions +#if HAVE_LSX +const SadMxNParam lsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_lsx), + SadMxNParam(32, 32, &vpx_sad32x32_lsx), + SadMxNParam(16, 16, &vpx_sad16x16_lsx), + SadMxNParam(8, 8, &vpx_sad8x8_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); + +const SadMxNAvgParam avg_lsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); + +const SadMxNx4Param x4d_lsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); +#endif // HAVE_LSX + +} // namespace diff --git a/media/libvpx/libvpx/test/set_maps.sh b/media/libvpx/libvpx/test/set_maps.sh new file mode 100755 index 0000000000..f45dc51f49 --- /dev/null +++ b/media/libvpx/libvpx/test/set_maps.sh @@ -0,0 +1,59 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx set_maps example. To add new tests to this file, +## do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to set_maps_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in +# $LIBVPX_BIN_PATH. +set_maps_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(vpx_tool_path set_maps)" ]; then + elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +# Runs set_maps using the codec specified by $1. +set_maps() { + local encoder="$(vpx_tool_path set_maps)" + local codec="$1" + local output_file="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf" + + eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +set_maps_vp8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + set_maps vp8 || return 1 + fi +} + +set_maps_vp9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + set_maps vp9 || return 1 + fi +} + +set_maps_tests="set_maps_vp8 + set_maps_vp9" + +run_tests set_maps_verify_environment "${set_maps_tests}" diff --git a/media/libvpx/libvpx/test/set_roi.cc b/media/libvpx/libvpx/test/set_roi.cc new file mode 100644 index 0000000000..693410e391 --- /dev/null +++ b/media/libvpx/libvpx/test/set_roi.cc @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +using libvpx_test::ACMRandom; + +namespace { + +TEST(VP8RoiMapTest, ParameterCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; + int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; + unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 }; + + const int internalq_trans[] = { + 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127, + }; + + // Initialize elements of cpi with valid defaults. + VP8_COMP cpi; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; + cpi.cyclic_refresh_mode_enabled = 0; + cpi.mb.e_mbd.segmentation_enabled = 0; + cpi.mb.e_mbd.update_mb_segmentation_map = 0; + cpi.mb.e_mbd.update_mb_segmentation_data = 0; + cpi.common.mb_rows = 240 >> 4; + cpi.common.mb_cols = 320 >> 4; + const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols); + memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data)); + + // Segment map + cpi.segmentation_map = reinterpret_cast(vpx_calloc(mbs, 1)); + + // Allocate memory for the source memory map. + unsigned char *roi_map = + reinterpret_cast(vpx_calloc(mbs, 1)); + memset(&roi_map[mbs >> 2], 1, (mbs >> 2)); + memset(&roi_map[mbs >> 1], 2, (mbs >> 2)); + memset(&roi_map[mbs - (mbs >> 2)], 3, (mbs >> 2)); + + // Do a test call with valid parameters. + int roi_retval = + vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, + delta_q, delta_lf, threshold); + EXPECT_EQ(0, roi_retval) + << "vp8_set_roimap roi failed with default test parameters"; + + // Check that the values in the cpi structure get set as expected. + if (roi_retval == 0) { + // Check that the segment map got set. + const int mapcompare = memcmp(roi_map, cpi.segmentation_map, mbs); + EXPECT_EQ(0, mapcompare) << "segment map error"; + + // Check the q deltas (note the need to translate into + // the interanl range of 0-127. + for (int i = 0; i < MAX_MB_SEGMENTS; ++i) { + const int transq = internalq_trans[abs(delta_q[i])]; + if (abs(cpi.segment_feature_data[MB_LVL_ALT_Q][i]) != transq) { + EXPECT_EQ(transq, cpi.segment_feature_data[MB_LVL_ALT_Q][i]) + << "segment delta_q error"; + break; + } + } + + // Check the loop filter deltas + for (int i = 0; i < MAX_MB_SEGMENTS; ++i) { + if (cpi.segment_feature_data[MB_LVL_ALT_LF][i] != delta_lf[i]) { + EXPECT_EQ(delta_lf[i], cpi.segment_feature_data[MB_LVL_ALT_LF][i]) + << "segment delta_lf error"; + break; + } + } + + // Check the breakout thresholds + for (int i = 0; i < MAX_MB_SEGMENTS; ++i) { + unsigned int breakout = + static_cast(cpi.segment_encode_breakout[i]); + + if (threshold[i] != breakout) { + EXPECT_EQ(threshold[i], breakout) << "breakout threshold error"; + break; + } + } + + // Segmentation, and segmentation update flages should be set. + EXPECT_EQ(1, cpi.mb.e_mbd.segmentation_enabled) + << "segmentation_enabled error"; + EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_map) + << "update_mb_segmentation_map error"; + EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_data) + << "update_mb_segmentation_data error"; + + // Try a range of delta q and lf parameters (some legal, some not) + for (int i = 0; i < 1000; ++i) { + int rand_deltas[4]; + int deltas_valid; + rand_deltas[0] = rnd(160) - 80; + rand_deltas[1] = rnd(160) - 80; + rand_deltas[2] = rnd(160) - 80; + rand_deltas[3] = rnd(160) - 80; + + deltas_valid = + ((abs(rand_deltas[0]) <= 63) && (abs(rand_deltas[1]) <= 63) && + (abs(rand_deltas[2]) <= 63) && (abs(rand_deltas[3]) <= 63)) + ? 0 + : -1; + + // Test with random delta q values. + roi_retval = + vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, + rand_deltas, delta_lf, threshold); + EXPECT_EQ(deltas_valid, roi_retval) << "dq range check error"; + + // One delta_q error shown at a time + if (deltas_valid != roi_retval) break; + + // Test with random loop filter values. + roi_retval = + vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, + delta_q, rand_deltas, threshold); + EXPECT_EQ(deltas_valid, roi_retval) << "dlf range check error"; + + // One delta loop filter error shown at a time + if (deltas_valid != roi_retval) break; + } + + // Test invalid number of rows or colums. + roi_retval = + vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, + cpi.common.mb_cols, delta_q, delta_lf, threshold); + EXPECT_EQ(-1, roi_retval) << "MB rows bounds check error"; + + roi_retval = + vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, + cpi.common.mb_cols - 1, delta_q, delta_lf, threshold); + EXPECT_EQ(-1, roi_retval) << "MB cols bounds check error"; + } + + // Free allocated memory + if (cpi.segmentation_map) vpx_free(cpi.segmentation_map); + if (roi_map) vpx_free(roi_map); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/simple_decoder.sh b/media/libvpx/libvpx/test/simple_decoder.sh new file mode 100755 index 0000000000..65fc4828ed --- /dev/null +++ b/media/libvpx/libvpx/test/simple_decoder.sh @@ -0,0 +1,61 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx simple_decoder example code. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_decoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $VP8_IVF_FILE and $VP9_IVF_FILE are required. +simple_decoder_verify_environment() { + if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs simple_decoder using $1 as input file. $2 is the codec name, and is used +# solely to name the output file. +simple_decoder() { + local decoder="${LIBVPX_BIN_PATH}/simple_decoder${VPX_TEST_EXE_SUFFIX}" + local input_file="$1" + local codec="$2" + local output_file="${VPX_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +simple_decoder_vp8() { + if [ "$(vp8_decode_available)" = "yes" ]; then + simple_decoder "${VP8_IVF_FILE}" vp8 || return 1 + fi +} + +simple_decoder_vp9() { + if [ "$(vp9_decode_available)" = "yes" ]; then + simple_decoder "${VP9_IVF_FILE}" vp9 || return 1 + fi +} + +simple_decoder_tests="simple_decoder_vp8 + simple_decoder_vp9" + +run_tests simple_decoder_verify_environment "${simple_decoder_tests}" diff --git a/media/libvpx/libvpx/test/simple_encode_test.cc b/media/libvpx/libvpx/test/simple_encode_test.cc new file mode 100644 index 0000000000..01fc258566 --- /dev/null +++ b/media/libvpx/libvpx/test/simple_encode_test.cc @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/video_source.h" +#include "vp9/simple_encode.h" + +namespace vp9 { +namespace { + +double GetBitrateInKbps(size_t bit_size, int num_frames, int frame_rate_num, + int frame_rate_den) { + return static_cast(bit_size) / num_frames * frame_rate_num / + frame_rate_den / 1000.0; +} + +// Returns the number of unit in size of 4. +// For example, if size is 7, return 2. +int GetNumUnit4x4(int size) { return (size + 3) >> 2; } + +class SimpleEncodeTest : public ::testing::Test { + protected: + const int width_ = 352; + const int height_ = 288; + const int frame_rate_num_ = 30; + const int frame_rate_den_ = 1; + const int target_bitrate_ = 1000; + const int num_frames_ = 17; + const int target_level_ = LEVEL_UNKNOWN; + const std::string in_file_path_str_ = + libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv"; +}; + +TEST_F(SimpleEncodeTest, ComputeFirstPassStats) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + std::vector> frame_stats = + simple_encode.ObserveFirstPassStats(); + EXPECT_EQ(frame_stats.size(), static_cast(num_frames_)); + const size_t data_num = frame_stats[0].size(); + // Read ObserveFirstPassStats before changing FIRSTPASS_STATS. + EXPECT_EQ(data_num, static_cast(25)); + for (size_t i = 0; i < frame_stats.size(); ++i) { + EXPECT_EQ(frame_stats[i].size(), data_num); + // FIRSTPASS_STATS's first element is frame + EXPECT_EQ(frame_stats[i][0], i); + // FIRSTPASS_STATS's last element is count, and the count is 1 for single + // frame stats + EXPECT_EQ(frame_stats[i][data_num - 1], 1); + } +} + +TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + std::vector> fps_motion_vectors = + simple_encode.ObserveFirstPassMotionVectors(); + EXPECT_EQ(fps_motion_vectors.size(), static_cast(num_frames_)); + const size_t num_blocks = ((width_ + 15) >> 4) * ((height_ + 15) >> 4); + EXPECT_EQ(num_blocks, fps_motion_vectors[0].size()); + for (size_t i = 0; i < fps_motion_vectors.size(); ++i) { + EXPECT_EQ(num_blocks, fps_motion_vectors[i].size()); + for (size_t j = 0; j < num_blocks; ++j) { + const int mv_count = fps_motion_vectors[i][j].mv_count; + const int ref_count = + (fps_motion_vectors[i][j].ref_frame[0] != kRefFrameTypeNone) + + (fps_motion_vectors[i][j].ref_frame[1] != kRefFrameTypeNone); + EXPECT_EQ(mv_count, ref_count); + } + } +} + +TEST_F(SimpleEncodeTest, GetCodingFrameNum) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + EXPECT_EQ(num_coding_frames, 19); +} + +TEST_F(SimpleEncodeTest, EncodeFrame) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + int num_coding_frames = simple_encode.GetCodingFrameNum(); + EXPECT_GE(num_coding_frames, num_frames_); + simple_encode.StartEncode(); + size_t total_data_bit_size = 0; + int coded_show_frame_count = 0; + int frame_coding_index = 0; + while (coded_show_frame_count < num_frames_) { + const GroupOfPicture group_of_picture = + simple_encode.ObserveGroupOfPicture(); + const std::vector &encode_frame_list = + group_of_picture.encode_frame_list; + for (size_t group_index = 0; group_index < encode_frame_list.size(); + ++group_index) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + EXPECT_EQ(encode_frame_result.show_idx, + encode_frame_list[group_index].show_idx); + EXPECT_EQ(encode_frame_result.frame_type, + encode_frame_list[group_index].frame_type); + EXPECT_EQ(encode_frame_list[group_index].coding_index, + frame_coding_index); + EXPECT_GE(encode_frame_result.psnr, 34) + << "The psnr is supposed to be greater than 34 given the " + "target_bitrate 1000 kbps"; + EXPECT_EQ(encode_frame_result.ref_frame_info, + encode_frame_list[group_index].ref_frame_info); + total_data_bit_size += encode_frame_result.coding_data_bit_size; + ++frame_coding_index; + } + coded_show_frame_count += group_of_picture.show_frame_count; + } + const double bitrate = GetBitrateInKbps(total_data_bit_size, num_frames_, + frame_rate_num_, frame_rate_den_); + const double off_target_threshold = 150; + EXPECT_LE(fabs(target_bitrate_ - bitrate), off_target_threshold); + simple_encode.EndEncode(); +} + +TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + std::vector key_frame_map = simple_encode.ObserveKeyFrameMap(); + EXPECT_EQ(key_frame_map.size(), static_cast(num_frames_)); + simple_encode.StartEncode(); + int coded_show_frame_count = 0; + while (coded_show_frame_count < num_frames_) { + const GroupOfPicture group_of_picture = + simple_encode.ObserveGroupOfPicture(); + const std::vector &encode_frame_list = + group_of_picture.encode_frame_list; + for (size_t group_index = 0; group_index < encode_frame_list.size(); + ++group_index) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + if (encode_frame_result.frame_type == kFrameTypeKey) { + EXPECT_EQ(key_frame_map[encode_frame_result.show_idx], 1); + } else { + EXPECT_EQ(key_frame_map[encode_frame_result.show_idx], 0); + } + } + coded_show_frame_count += group_of_picture.show_frame_count; + } + simple_encode.EndEncode(); +} + +TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameInfo encode_frame_info = simple_encode.GetNextEncodeFrameInfo(); + int target_frame_bits; + switch (encode_frame_info.frame_type) { + case kFrameTypeInter: target_frame_bits = 20000; break; + case kFrameTypeKey: + case kFrameTypeAltRef: + case kFrameTypeGolden: target_frame_bits = 100000; break; + case kFrameTypeOverlay: target_frame_bits = 2000; break; + default: target_frame_bits = 20000; + } + + double percent_diff = 15; + if (encode_frame_info.frame_type == kFrameTypeOverlay) { + percent_diff = 100; + } + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrameWithTargetFrameBits( + &encode_frame_result, target_frame_bits, percent_diff); + const int recode_count = encode_frame_result.recode_count; + // TODO(angiebird): Replace 7 by RATE_CTRL_MAX_RECODE_NUM + EXPECT_LE(recode_count, 7); + EXPECT_GE(recode_count, 1); + + const double diff = fabs((double)encode_frame_result.coding_data_bit_size - + target_frame_bits); + EXPECT_LE(diff * 100 / target_frame_bits, percent_diff); + } + simple_encode.EndEncode(); +} + +TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + const int assigned_quantize_index = 100 + i; + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrameWithQuantizeIndex(&encode_frame_result, + assigned_quantize_index); + EXPECT_EQ(encode_frame_result.quantize_index, assigned_quantize_index); + } + simple_encode.EndEncode(); +} + +// This test encodes the video using EncodeFrame(), where quantize indexes +// are selected by vp9 rate control. +// Encode stats and the quantize_indexes are collected. +// Then the test encodes the video again using EncodeFrameWithQuantizeIndex() +// using the quantize indexes collected from the first run. +// Then test whether the encode stats of the two encoding runs match. +TEST_F(SimpleEncodeTest, EncodeConsistencyTest) { + std::vector quantize_index_list; + std::vector ref_sse_list; + std::vector ref_psnr_list; + std::vector ref_bit_size_list; + std::vector ref_frame_type_list; + std::vector ref_show_idx_list; + { + // The first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + quantize_index_list.push_back(encode_frame_result.quantize_index); + ref_sse_list.push_back(encode_frame_result.sse); + ref_psnr_list.push_back(encode_frame_result.psnr); + ref_bit_size_list.push_back(encode_frame_result.coding_data_bit_size); + ref_frame_type_list.push_back(encode_frame_result.frame_type); + ref_show_idx_list.push_back(encode_frame_result.show_idx); + } + simple_encode.EndEncode(); + } + { + // The second encode with quantize index got from the first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + EXPECT_EQ(static_cast(num_coding_frames), + quantize_index_list.size()); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrameWithQuantizeIndex(&encode_frame_result, + quantize_index_list[i]); + EXPECT_EQ(encode_frame_result.quantize_index, quantize_index_list[i]); + EXPECT_EQ(encode_frame_result.sse, ref_sse_list[i]); + EXPECT_DOUBLE_EQ(encode_frame_result.psnr, ref_psnr_list[i]); + EXPECT_EQ(encode_frame_result.coding_data_bit_size, ref_bit_size_list[i]); + EXPECT_EQ(encode_frame_result.frame_type, ref_frame_type_list[i]); + EXPECT_EQ(encode_frame_result.show_idx, ref_show_idx_list[i]); + } + simple_encode.EndEncode(); + } +} + +// Test the information (partition info and motion vector info) stored in +// encoder is the same between two encode runs. +TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) { + const int num_rows_4x4 = GetNumUnit4x4(width_); + const int num_cols_4x4 = GetNumUnit4x4(height_); + const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; + // The first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + std::vector partition_info_list(num_units_4x4 * + num_coding_frames); + std::vector motion_vector_info_list(num_units_4x4 * + num_coding_frames); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + for (int j = 0; j < num_rows_4x4 * num_cols_4x4; ++j) { + partition_info_list[i * num_units_4x4 + j] = + encode_frame_result.partition_info[j]; + motion_vector_info_list[i * num_units_4x4 + j] = + encode_frame_result.motion_vector_info[j]; + } + } + simple_encode.EndEncode(); + // The second encode. + SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode_2.ComputeFirstPassStats(); + const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); + simple_encode_2.StartEncode(); + for (int i = 0; i < num_coding_frames_2; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode_2.EncodeFrame(&encode_frame_result); + for (int j = 0; j < num_rows_4x4 * num_cols_4x4; ++j) { + EXPECT_EQ(encode_frame_result.partition_info[j].row, + partition_info_list[i * num_units_4x4 + j].row); + EXPECT_EQ(encode_frame_result.partition_info[j].column, + partition_info_list[i * num_units_4x4 + j].column); + EXPECT_EQ(encode_frame_result.partition_info[j].row_start, + partition_info_list[i * num_units_4x4 + j].row_start); + EXPECT_EQ(encode_frame_result.partition_info[j].column_start, + partition_info_list[i * num_units_4x4 + j].column_start); + EXPECT_EQ(encode_frame_result.partition_info[j].width, + partition_info_list[i * num_units_4x4 + j].width); + EXPECT_EQ(encode_frame_result.partition_info[j].height, + partition_info_list[i * num_units_4x4 + j].height); + + EXPECT_EQ(encode_frame_result.motion_vector_info[j].mv_count, + motion_vector_info_list[i * num_units_4x4 + j].mv_count); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].ref_frame[0], + motion_vector_info_list[i * num_units_4x4 + j].ref_frame[0]); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].ref_frame[1], + motion_vector_info_list[i * num_units_4x4 + j].ref_frame[1]); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].mv_row[0], + motion_vector_info_list[i * num_units_4x4 + j].mv_row[0]); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].mv_column[0], + motion_vector_info_list[i * num_units_4x4 + j].mv_column[0]); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].mv_row[1], + motion_vector_info_list[i * num_units_4x4 + j].mv_row[1]); + EXPECT_EQ(encode_frame_result.motion_vector_info[j].mv_column[1], + motion_vector_info_list[i * num_units_4x4 + j].mv_column[1]); + } + } + simple_encode_2.EndEncode(); +} + +// Test the information stored in encoder is the same between two encode runs. +TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) { + std::vector quantize_index_list; + const int num_rows_4x4 = GetNumUnit4x4(width_); + const int num_cols_4x4 = GetNumUnit4x4(height_); + const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; + // The first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + std::vector partition_info_list(num_units_4x4 * + num_coding_frames); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + quantize_index_list.push_back(encode_frame_result.quantize_index); + for (int j = 0; j < num_rows_4x4 * num_cols_4x4; ++j) { + partition_info_list[i * num_units_4x4 + j] = + encode_frame_result.partition_info[j]; + } + } + simple_encode.EndEncode(); + // The second encode. + SimpleEncode simple_encode_2(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode_2.ComputeFirstPassStats(); + const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum(); + simple_encode_2.StartEncode(); + for (int i = 0; i < num_coding_frames_2; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode_2.EncodeFrameWithQuantizeIndex(&encode_frame_result, + quantize_index_list[i]); + for (int j = 0; j < num_rows_4x4 * num_cols_4x4; ++j) { + EXPECT_EQ(encode_frame_result.partition_info[j].row, + partition_info_list[i * num_units_4x4 + j].row); + EXPECT_EQ(encode_frame_result.partition_info[j].column, + partition_info_list[i * num_units_4x4 + j].column); + EXPECT_EQ(encode_frame_result.partition_info[j].row_start, + partition_info_list[i * num_units_4x4 + j].row_start); + EXPECT_EQ(encode_frame_result.partition_info[j].column_start, + partition_info_list[i * num_units_4x4 + j].column_start); + EXPECT_EQ(encode_frame_result.partition_info[j].width, + partition_info_list[i * num_units_4x4 + j].width); + EXPECT_EQ(encode_frame_result.partition_info[j].height, + partition_info_list[i * num_units_4x4 + j].height); + } + } + simple_encode_2.EndEncode(); +} + +// Encode with default VP9 decision first. +// Get QPs and arf locations from the first encode. +// Set external arfs and QPs for the second encode. +// Expect to get matched results. +TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) { + std::vector quantize_index_list; + std::vector ref_sse_list; + std::vector ref_psnr_list; + std::vector ref_bit_size_list; + std::vector gop_map(num_frames_, 0); + { + // The first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + simple_encode.StartEncode(); + + int coded_show_frame_count = 0; + while (coded_show_frame_count < num_frames_) { + const GroupOfPicture group_of_picture = + simple_encode.ObserveGroupOfPicture(); + gop_map[coded_show_frame_count] |= kGopMapFlagStart; + if (group_of_picture.use_alt_ref) { + gop_map[coded_show_frame_count] |= kGopMapFlagUseAltRef; + } + const std::vector &encode_frame_list = + group_of_picture.encode_frame_list; + for (size_t group_index = 0; group_index < encode_frame_list.size(); + ++group_index) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + quantize_index_list.push_back(encode_frame_result.quantize_index); + ref_sse_list.push_back(encode_frame_result.sse); + ref_psnr_list.push_back(encode_frame_result.psnr); + ref_bit_size_list.push_back(encode_frame_result.coding_data_bit_size); + } + coded_show_frame_count += group_of_picture.show_frame_count; + } + simple_encode.EndEncode(); + } + { + // The second encode with quantize index got from the first encode. + // The external arfs are the same as the first encode. + SimpleEncode simple_encode(width_, height_, frame_rate_num_, + frame_rate_den_, target_bitrate_, num_frames_, + target_level_, in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + EXPECT_EQ(static_cast(num_coding_frames), + quantize_index_list.size()); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrameWithQuantizeIndex(&encode_frame_result, + quantize_index_list[i]); + EXPECT_EQ(encode_frame_result.quantize_index, quantize_index_list[i]); + EXPECT_EQ(encode_frame_result.sse, ref_sse_list[i]); + EXPECT_DOUBLE_EQ(encode_frame_result.psnr, ref_psnr_list[i]); + EXPECT_EQ(encode_frame_result.coding_data_bit_size, ref_bit_size_list[i]); + } + simple_encode.EndEncode(); + } +} + +TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + + std::vector gop_map(num_frames_, 0); + + // Should be the first gop group. + gop_map[0] = 0; + + // Second gop group with an alt ref. + gop_map[5] |= kGopMapFlagStart | kGopMapFlagUseAltRef; + + // Third gop group without an alt ref. + gop_map[10] |= kGopMapFlagStart; + + // Last gop group. + gop_map[14] |= kGopMapFlagStart | kGopMapFlagUseAltRef; + + simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size()); + + std::vector observed_gop_map = + simple_encode.ObserveExternalGroupOfPicturesMap(); + + // First gop group. + // There is always a key frame at show_idx 0 and key frame should always be + // the start of a gop. We expect ObserveExternalGroupOfPicturesMap() will + // insert an extra gop start here. + EXPECT_EQ(observed_gop_map[0], kGopMapFlagStart | kGopMapFlagUseAltRef); + + // Second gop group with an alt ref. + EXPECT_EQ(observed_gop_map[5], kGopMapFlagStart | kGopMapFlagUseAltRef); + + // Third gop group without an alt ref. + EXPECT_EQ(observed_gop_map[10], kGopMapFlagStart); + + // Last gop group. The last gop is not supposed to use an alt ref. We expect + // ObserveExternalGroupOfPicturesMap() will remove the alt ref flag here. + EXPECT_EQ(observed_gop_map[14], kGopMapFlagStart); + + int ref_gop_show_frame_count_list[4] = { 5, 5, 4, 3 }; + size_t ref_gop_coded_frame_count_list[4] = { 6, 6, 4, 3 }; + int gop_count = 0; + + simple_encode.StartEncode(); + int coded_show_frame_count = 0; + while (coded_show_frame_count < num_frames_) { + const GroupOfPicture group_of_picture = + simple_encode.ObserveGroupOfPicture(); + const std::vector &encode_frame_list = + group_of_picture.encode_frame_list; + EXPECT_EQ(encode_frame_list.size(), + ref_gop_coded_frame_count_list[gop_count]); + EXPECT_EQ(group_of_picture.show_frame_count, + ref_gop_show_frame_count_list[gop_count]); + for (size_t group_index = 0; group_index < encode_frame_list.size(); + ++group_index) { + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + } + coded_show_frame_count += group_of_picture.show_frame_count; + ++gop_count; + } + EXPECT_EQ(gop_count, 4); + simple_encode.EndEncode(); +} + +TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) { + // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo() + // matches the counterpart in encode_frame_result obtained from EncodeFrame() + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + simple_encode.ComputeFirstPassStats(); + const int num_coding_frames = simple_encode.GetCodingFrameNum(); + simple_encode.StartEncode(); + for (int i = 0; i < num_coding_frames; ++i) { + EncodeFrameInfo encode_frame_info = simple_encode.GetNextEncodeFrameInfo(); + EncodeFrameResult encode_frame_result; + simple_encode.EncodeFrame(&encode_frame_result); + EXPECT_EQ(encode_frame_info.show_idx, encode_frame_result.show_idx); + EXPECT_EQ(encode_frame_info.frame_type, encode_frame_result.frame_type); + } + simple_encode.EndEncode(); +} + +TEST_F(SimpleEncodeTest, GetFramePixelCount) { + SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_, + target_bitrate_, num_frames_, target_level_, + in_file_path_str_.c_str()); + EXPECT_EQ(simple_encode.GetFramePixelCount(), + static_cast(width_ * height_ * 3 / 2)); +} + +} // namespace +} // namespace vp9 + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/simple_encoder.sh b/media/libvpx/libvpx/test/simple_encoder.sh new file mode 100755 index 0000000000..dc7f46ff38 --- /dev/null +++ b/media/libvpx/libvpx/test/simple_encoder.sh @@ -0,0 +1,59 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx simple_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +simple_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs simple_encoder using the codec specified by $1 with a frame limit of 100. +simple_encoder() { + local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}" + local codec="$1" + local output_file="${VPX_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf" + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +simple_encoder_vp8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + simple_encoder vp8 || return 1 + fi +} + +simple_encoder_vp9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + simple_encoder vp9 || return 1 + fi +} + +simple_encoder_tests="simple_encoder_vp8 + simple_encoder_vp9" + +run_tests simple_encoder_verify_environment "${simple_encoder_tests}" diff --git a/media/libvpx/libvpx/test/stress.sh b/media/libvpx/libvpx/test/stress.sh new file mode 100755 index 0000000000..ba79a52ac3 --- /dev/null +++ b/media/libvpx/libvpx/test/stress.sh @@ -0,0 +1,183 @@ +#!/bin/sh +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS, +## default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass & +## (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS__DECODE_MAX_JOBS, +## default=30) decodes in parallel. + +. $(dirname $0)/tools_common.sh + +YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv" +VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm" +VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm" +DATA_URL="https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/" +SHA1_FILE="$(dirname $0)/test-data.sha1" + +# Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is +# cribbed from libs.mk. +[ -x "$(which sha1sum)" ] && sha1sum=sha1sum +[ -x "$(which shasum)" ] && sha1sum=shasum +[ -x "$(which sha1)" ] && sha1sum=sha1 + +# Download a file from the url and check its sha1sum. +download_and_check_file() { + # Get the file from the file path. + local root="${1#${LIBVPX_TEST_DATA_PATH}/}" + + # Download the file using curl. Trap to insure non partial file. + (trap "rm -f $1" INT TERM \ + && eval "curl --retry 1 -L -o $1 ${DATA_URL}${root} ${devnull}") + + # Check the sha1 sum of the file. + if [ -n "${sha1sum}" ]; then + set -e + grep ${root} ${SHA1_FILE} \ + | (cd ${LIBVPX_TEST_DATA_PATH}; ${sha1sum} -c); + fi +} + +# Environment check: Make sure input is available. +stress_verify_environment() { + if [ ! -e "${SHA1_FILE}" ] ; then + echo "Missing ${SHA1_FILE}" + return 1 + fi + for file in "${YUV}" "${VP8}" "${VP9}"; do + if [ ! -e "${file}" ] ; then + download_and_check_file "${file}" || return 1 + fi + done + if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(vpx_tool_path vpxenc)" ]; then + elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi + if [ -z "$(vpx_tool_path vpxdec)" ]; then + elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +# This function runs tests on libvpx that run multiple encodes and decodes +# in parallel in hopes of catching synchronization and/or threading issues. +stress() { + local decoder="$(vpx_tool_path vpxdec)" + local encoder="$(vpx_tool_path vpxenc)" + local codec="$1" + local webm="$2" + local decode_count="$3" + local threads="$4" + local enc_args="$5" + local pids="" + local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} + local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} + local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5} + + # Enable job control, so we can run multiple processes. + set -m + + # Start $onepass_max_jobs encode jobs in parallel. + for i in $(seq ${onepass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $twopass_max_jobs encode jobs in parallel. + for i in $(seq ${twopass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $rt_max_jobs rt encode jobs in parallel. + for i in $(seq ${rt_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \ + "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \ + "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \ + "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \ + "--overshoot-pct=15 --buf-sz=1000 --buf-initial-sz=500" \ + "--buf-optimal-sz=600 --max-intra-rate=900 --resize-allowed=0" \ + "--drop-frame=0 --passes=1 --rt --noise-sensitivity=4" \ + "-o ${VPX_TEST_OUTPUT_DIR}/${i}.rt.webm" ${devnull} & + pids="${pids} $!" + done + + # Start $decode_count decode jobs in parallel. + for i in $(seq "${decode_count}"); do + eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} & + pids="${pids} $!" + done + + # Wait for all parallel jobs to finish. + fail=0 + for job in "${pids}"; do + wait $job || fail=$(($fail + 1)) + done + return $fail +} + +vp8_stress_test() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + stress vp8 "${VP8}" "${vp8_max_jobs}" 4 + fi +} + +vp8_stress_test_token_parititions() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + for threads in 2 4 8; do + for token_partitions in 1 2 3; do + stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \ + "--token-parts=$token_partitions" + done + done + fi +} + +vp9_stress() { + local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} + + if [ "$(vp9_decode_available)" = "yes" -a \ + "$(vp9_encode_available)" = "yes" ]; then + stress vp9 "${VP9}" "${vp9_max_jobs}" "$@" + fi +} + +vp9_stress_test() { + for threads in 4 8 64; do + vp9_stress "$threads" "--row-mt=0" + done +} + +vp9_stress_test_row_mt() { + for threads in 4 8 64; do + vp9_stress "$threads" "--row-mt=1" + done +} + +run_tests stress_verify_environment \ + "vp8_stress_test vp8_stress_test_token_parititions + vp9_stress_test vp9_stress_test_row_mt" diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc new file mode 100644 index 0000000000..d3c76a34d2 --- /dev/null +++ b/media/libvpx/libvpx/test/sum_squares_test.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; + +namespace { +const int kNumIterations = 10000; + +typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size); +typedef std::tuple SumSquaresParam; + +class SumSquaresTest : public ::testing::TestWithParam { + public: + ~SumSquaresTest() override = default; + void SetUp() override { + ref_func_ = GET_PARAM(0); + tst_func_ = GET_PARAM(1); + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + SSI16Func ref_func_; + SSI16Func tst_func_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest); + +TEST_P(SumSquaresTest, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, int16_t, src[256 * 256]); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + + for (int k = 0; k < kNumIterations; k++) { + const int size = 4 << rnd(6); // Up to 128x128 + int stride = 4 << rnd(7); // Up to 256 stride + while (stride < size) { // Make sure it's valid + stride = 4 << rnd(7); + } + + for (int i = 0; i < size; ++i) { + for (int j = 0; j < size; ++j) { + src[i * stride + j] = rnd(2) ? rnd(limit) : -rnd(limit); + } + } + + const uint64_t res_ref = ref_func_(src, stride, size); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size)); + + ASSERT_EQ(res_ref, res_tst) << "Error: Sum Squares Test" + << " C output does not match optimized output."; + } +} + +TEST_P(SumSquaresTest, ExtremeValues) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, int16_t, src[256 * 256]); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + + for (int k = 0; k < kNumIterations; k++) { + const int size = 4 << rnd(6); // Up to 128x128 + int stride = 4 << rnd(7); // Up to 256 stride + while (stride < size) { // Make sure it's valid + stride = 4 << rnd(7); + } + + const int val = rnd(2) ? limit - 1 : -(limit - 1); + for (int i = 0; i < size; ++i) { + for (int j = 0; j < size; ++j) { + src[i * stride + j] = val; + } + } + + const uint64_t res_ref = ref_func_(src, stride, size); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size)); + + ASSERT_EQ(res_ref, res_tst) << "Error: Sum Squares Test" + << " C output does not match optimized output."; + } +} + +using std::make_tuple; + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); +#endif // HAVE_MSA + +typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +typedef std::tuple SSETestParam; + +class SSETest : public ::testing::TestWithParam { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast(src_); + uint16_t *ref16 = reinterpret_cast(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 +} // namespace diff --git a/media/libvpx/libvpx/test/superframe_test.cc b/media/libvpx/libvpx/test/superframe_test.cc new file mode 100644 index 0000000000..4c3aa1625a --- /dev/null +++ b/media/libvpx/libvpx/test/superframe_test.cc @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kTestMode = 0; + +typedef std::tuple SuperframeTestParam; + +class SuperframeTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + SuperframeTest() + : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} + ~SuperframeTest() override = default; + + void SetUp() override { + InitializeConfig(); + const SuperframeTestParam input = GET_PARAM(1); + const libvpx_test::TestMode mode = std::get(input); + SetMode(mode); + sf_count_ = 0; + sf_count_max_ = INT_MAX; + } + + void TearDown() override { delete[] modified_buf_; } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + } + } + + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; + + const uint8_t *buffer = reinterpret_cast(pkt->data.frame.buf); + const uint8_t marker = buffer[pkt->data.frame.sz - 1]; + const int frames = (marker & 0x7) + 1; + const int mag = ((marker >> 3) & 3) + 1; + const unsigned int index_sz = 2 + mag * frames; + if ((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz && + buffer[pkt->data.frame.sz - index_sz] == marker) { + // frame is a superframe. strip off the index. + if (modified_buf_) delete[] modified_buf_; + modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz]; + memcpy(modified_buf_, pkt->data.frame.buf, pkt->data.frame.sz - index_sz); + modified_pkt_ = *pkt; + modified_pkt_.data.frame.buf = modified_buf_; + modified_pkt_.data.frame.sz -= index_sz; + + sf_count_++; + last_sf_pts_ = pkt->data.frame.pts; + return &modified_pkt_; + } + + // Make sure we do a few frames after the last SF + abort_ |= + sf_count_ > sf_count_max_ && pkt->data.frame.pts - last_sf_pts_ >= 5; + return pkt; + } + + int sf_count_; + int sf_count_max_; + vpx_codec_cx_pkt_t modified_pkt_; + uint8_t *modified_buf_; + vpx_codec_pts_t last_sf_pts_; +}; + +TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) { + sf_count_max_ = 0; // early exit on successful test. + cfg_.g_lag_in_frames = 25; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(sf_count_, 1); +} + +VP9_INSTANTIATE_TEST_SUITE( + SuperframeTest, + ::testing::Combine(::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Values(0))); +} // namespace diff --git a/media/libvpx/libvpx/test/svc_datarate_test.cc b/media/libvpx/libvpx/test/svc_datarate_test.cc new file mode 100644 index 0000000000..aff4ace843 --- /dev/null +++ b/media/libvpx/libvpx/test/svc_datarate_test.cc @@ -0,0 +1,1796 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +class DatarateOnePassCbrSvc : public OnePassCbrSvc { + public: + explicit DatarateOnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : OnePassCbrSvc(codec) { + inter_layer_pred_mode_ = 0; + } + + protected: + ~DatarateOnePassCbrSvc() override = default; + + virtual void ResetModel() { + last_pts_ = 0; + duration_ = 0.0; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + update_pattern_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + change_bitrate_ = false; + last_pts_ref_ = 0; + middle_bitrate_ = 0; + top_bitrate_ = 0; + superframe_count_ = -1; + key_frame_spacing_ = 9999; + num_nonref_frames_ = 0; + layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; + insert_layer_sync_ = 0; + layer_sync_on_base_ = 0; + force_intra_only_frame_ = 0; + superframe_has_intra_only_ = 0; + use_post_encode_drop_ = 0; + denoiser_off_on_ = false; + denoiser_enable_layers_ = false; + num_resize_down_ = 0; + num_resize_up_ = 0; + for (int i = 0; i < VPX_MAX_LAYERS; i++) { + prev_frame_width[i] = 320; + prev_frame_height[i] = 240; + } + ksvc_flex_noupd_tlenh_ = false; + } + void BeginPassHook(unsigned int /*pass*/) override {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. + void set_frame_flags_bypass_mode(int tl, int num_spatial_layers, + int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config, + int noupdate_tlenh) { + for (int sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl - 1); + ref_frame_config->alt_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl); + } + if (!tl) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + // Non reference frame on top temporal top spatial. + ref_frame_config->update_buffer_slot[sl] = 0; + } + // Force no update on all spatial layers for temporal enhancement layer + // frames. + if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0; + } + } + } + + void CheckLayerRateTargeting(int num_spatial_layers, int num_temporal_layers, + double thresh_overshoot, + double thresh_undershoot) const { + for (int sl = 0; sl < num_spatial_layers; ++sl) + for (int tl = 0; tl < num_temporal_layers; ++tl) { + const int layer = sl * num_temporal_layers + tl; + ASSERT_GE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too " + "much!"; + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + + if (video->frame() == 0) { + if (force_intra_only_frame_) { + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tess to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + } + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + + if (layer_framedrop_) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = LAYER_DROP; + for (int i = 0; i < number_spatial_layers_; i++) + svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + + if (use_post_encode_drop_) { + encoder->Control(VP9E_SET_POSTENCODE_DROP, use_post_encode_drop_); + } + } + + if (denoiser_off_on_) { + encoder->Control(VP9E_SET_AQ_MODE, 3); + // Set inter_layer_pred to INTER_LAYER_PRED_OFF_NONKEY (K-SVC). + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, 2); + if (!denoiser_enable_layers_) { + if (video->frame() == 0) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + else if (video->frame() == 100) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } else { + // Cumulative bitrates for top spatial layers, for + // 3 temporal layers. + if (video->frame() == 0) { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + // Change layer bitrates to set top spatial layer to 0. + // This is for 3 spatial 3 temporal layers. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + for (int i = 0; i < 3; i++) + bitrate_sl3_[i] = cfg_.layer_target_bitrate[i + 6]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to non-zero on top spatial layer. + // This will trigger skip encoding of top spatial layer + // on key frame (period = 100). + for (int i = 0; i < 3; i++) + cfg_.layer_target_bitrate[i + 6] = bitrate_sl3_[i]; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[8]; + encoder->Config(&cfg_); + } else if (video->frame() == 120) { + // Enable denoiser and top spatial layer after key frame (period is + // 100). + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } + } + } + + if (ksvc_flex_noupd_tlenh_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config_, + 1); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config_, + 0); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + + if (change_bitrate_ && video->frame() == 200) { + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, + 0.78, 1.15); + + memset(file_datarate_, 0, sizeof(file_datarate_)); + memset(bits_total_, 0, sizeof(bits_total_)); + int64_t bits_in_buffer_model_tmp[VPX_MAX_LAYERS]; + last_pts_ref_ = last_pts_; + // Set new target bitarate. + cfg_.rc_target_bitrate = cfg_.rc_target_bitrate >> 1; + // Buffer level should not reset on dynamic bitrate change. + memcpy(bits_in_buffer_model_tmp, bits_in_buffer_model_, + sizeof(bits_in_buffer_model_)); + AssignLayerBitrates(); + memcpy(bits_in_buffer_model_, bits_in_buffer_model_tmp, + sizeof(bits_in_buffer_model_)); + + // Change config to update encoder with new bitrate configuration. + encoder->Config(&cfg_); + } + + if (dynamic_drop_layer_ && !single_layer_resize_) { + if (video->frame() == 0) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will trigger skip encoding of top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { + // Change layer bitrate on second layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 200) { + // Change layer bitrate on top layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2]; + encoder->Config(&cfg_); + } + } else if (dynamic_drop_layer_ && single_layer_resize_) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top spatial layers. + if (video->frame() == 2) { + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + // Set spatial layer 0 to a very low bitrate to trigger resize. + cfg_.layer_target_bitrate[0] = 30; + cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Set base spatial layer to very high to go back up to original size. + cfg_.layer_target_bitrate[0] = 400; + cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0]; + encoder->Config(&cfg_); + } + } else if (!dynamic_drop_layer_ && single_layer_resize_) { + if (video->frame() == 2) { + cfg_.layer_target_bitrate[0] = 30; + cfg_.layer_target_bitrate[1] = 50; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } else if (video->frame() == 160) { + cfg_.layer_target_bitrate[0] = 1500; + cfg_.layer_target_bitrate[1] = 2000; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } + } + if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF; + + if (insert_layer_sync_) { + vpx_svc_spatial_layer_sync_t svc_layer_sync; + svc_layer_sync.base_layer_intra_only = 0; + for (int i = 0; i < number_spatial_layers_; i++) + svc_layer_sync.spatial_layer_sync[i] = 0; + if (force_intra_only_frame_) { + superframe_has_intra_only_ = 0; + if (video->frame() == 0) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } else if (video->frame() == 100) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } + } else { + layer_sync_on_base_ = 0; + if (video->frame() == 150) { + svc_layer_sync.spatial_layer_sync[1] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 240) { + svc_layer_sync.spatial_layer_sync[2] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 320) { + svc_layer_sync.spatial_layer_sync[0] = 1; + layer_sync_on_base_ = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } + } + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + uint32_t sizes[8] = { 0 }; + uint32_t sizes_parsed[8] = { 0 }; + int count = 0; + int num_layers_encoded = 0; + last_pts_ = pkt->data.frame.pts; + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (key_frame) { + // For test that inserts layer sync frames: requesting a layer_sync on + // the base layer must force key frame. So if any key frame occurs after + // first superframe it must due to layer sync on base spatial layer. + if (superframe_count_ > 0 && insert_layer_sync_ && + !force_intra_only_frame_) { + ASSERT_EQ(layer_sync_on_base_, 1); + } + temporal_layer_id_ = 0; + superframe_count_ = 0; + } + parse_superframe_index(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, sizes_parsed, &count); + // Count may be less than number of spatial layers because of frame drops. + if (number_spatial_layers_ > 1) { + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + } + // For superframe with Intra-only count will be +1 larger + // because of no-show frame. + if (force_intra_only_frame_ && superframe_has_intra_only_) + ASSERT_EQ(count, num_layers_encoded + 1); + else + ASSERT_EQ(count, num_layers_encoded); + + // In the constrained frame drop mode, if a given spatial is dropped all + // upper layers must be dropped too. + if (!layer_framedrop_) { + int num_layers_dropped = 0; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (!pkt->data.frame.spatial_layer_encoded[sl]) { + // Check that all upper layers are dropped. + num_layers_dropped++; + for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) + ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); + } + } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; + } + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + // Fo short key frame spacing, buffer can underrun on individual frames. + if (!key_frame && tl > 0 && key_frame_spacing_ < 100) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } + + if (!single_layer_resize_) { + unsigned int scaled_width = top_sl_width_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_width % 2 != 0) scaled_width += 1; + ASSERT_EQ(pkt->data.frame.width[sl], scaled_width); + unsigned int scaled_height = top_sl_height_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_height % 2 != 0) scaled_height += 1; + ASSERT_EQ(pkt->data.frame.height[sl], scaled_height); + } else if (superframe_count_ > 0) { + if (pkt->data.frame.width[sl] < prev_frame_width[sl] && + pkt->data.frame.height[sl] < prev_frame_height[sl]) + num_resize_down_ += 1; + if (pkt->data.frame.width[sl] > prev_frame_width[sl] && + pkt->data.frame.height[sl] > prev_frame_height[sl]) + num_resize_up_ += 1; + } + prev_frame_width[sl] = pkt->data.frame.width[sl]; + prev_frame_height[sl] = pkt->data.frame.height[sl]; + } + } + + void EndPassHook() override { + if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + } + + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { + // TODO(marpan): Look into why an assert is triggered in compute_psnr + // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. + // Has to do with dropped frames in bypass/flexible svc mode. + if (!ksvc_flex_noupd_tlenh_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetNonRefFrames() { return num_nonref_frames_; } + + vpx_codec_pts_t last_pts_; + double timebase_; + int64_t bits_total_[VPX_MAX_LAYERS]; + double duration_; + double file_datarate_[VPX_MAX_LAYERS]; + size_t bits_in_last_frame_; + double mismatch_psnr_; + int denoiser_on_; + int tune_content_; + int spatial_layer_id_; + bool dynamic_drop_layer_; + bool single_layer_resize_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config_; + int update_pattern_; + bool change_bitrate_; + vpx_codec_pts_t last_pts_ref_; + int middle_bitrate_; + int top_bitrate_; + int key_frame_spacing_; + int layer_framedrop_; + int force_key_; + int force_key_test_; + int inter_layer_pred_mode_; + int insert_layer_sync_; + int layer_sync_on_base_; + int force_intra_only_frame_; + int superframe_has_intra_only_; + int use_post_encode_drop_; + int bitrate_sl3_[3]; + // Denoiser switched on the fly. + bool denoiser_off_on_; + // Top layer enabled on the fly. + bool denoiser_enable_layers_; + int num_resize_up_; + int num_resize_down_; + unsigned int prev_frame_width[VPX_MAX_LAYERS]; + unsigned int prev_frame_height[VPX_MAX_LAYERS]; + bool ksvc_flex_noupd_tlenh_; + + private: + void SetConfig(const int num_temporal_layer) override { + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } + + unsigned int num_nonref_frames_; + unsigned int mismatch_nframes_; +}; + +// Params: speed setting. +class DatarateOnePassCbrSvcSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcSingleBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for 4:4:4 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 352; + top_sl_height_ = 288; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers, for 4:2:2 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { + SetSvcConfig(2, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_HIGHBITDEPTH +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 10bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_10; + cfg_.g_input_bit_depth = VPX_BITS_10; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 12bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_12; + cfg_.g_input_bit_depth = VPX_BITS_12; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with force key frame after frame drop +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + // Change SVC pattern on the fly. + update_pattern_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on (on at frame = 100). Key frame period is set to +// 1000 so denoise is enabled on non-key. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnFixedLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 1000; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on, for dynamic layers. Start at 2 spatial layers +// and enable 3rd spatial layer at frame = 100. Use periodic key frame with +// period 100 so enabling of spatial layer occurs at key frame. Enable denoiser +// at frame > 100, after the key frame sync. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnEnableLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 100; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on +// the fly switching to 1 and then 2 and back to 3 spatial layers. This switch +// is done by setting spatial layer bitrates to 0, and then back to non-zero, +// during the sequence. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 0; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 2 spatial layers and on +// the fly switching to 1 spatial layer with dynamic resize enabled. +// The resizer will resize the single layer down and back up again, as the +// bitrate goes back up. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL_SingleLayerResize) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 0; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_resize_allowed = 1; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 15, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = true; + single_layer_resize_ = true; + base_speed_setting_ = speed_setting_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Expect at least one resize down and at least one resize back up. + EXPECT_GE(num_resize_down_, 1); + EXPECT_GE(num_resize_up_, 1); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// For pass CBR SVC with 1 spatial and 2 temporal layers with dynamic resize +// and denoiser enabled. The resizer will resize the single layer down and back +// up again, as the bitrate goes back up. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc1SL2TL_DenoiseResize) { + SetSvcConfig(1, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_resize_allowed = 1; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 12, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = true; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Expect at least one resize down and at least one resize back up. + EXPECT_GE(num_resize_down_, 1); + EXPECT_GE(num_resize_up_, 1); +} + +// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial +// downscale 5x5. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 3; + cfg_.temporal_layering_mode = 0; + svc_params_.scaling_factor_num[0] = 256; + svc_params_.scaling_factor_den[0] = 1280; + svc_params_.scaling_factor_num[1] = 1280; + svc_params_.scaling_factor_den[1] = 1280; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 999999; + cfg_.kf_min_dist = 0; + cfg_.ss_target_bitrate[0] = 300; + cfg_.ss_target_bitrate[1] = 1400; + cfg_.layer_target_bitrate[0] = 300; + cfg_.layer_target_bitrate[1] = 1400; + cfg_.rc_target_bitrate = 1700; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting and index for bitrate array. +class DatarateOnePassCbrSvcMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcMultiBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run CIF clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and +// 3 temporal layers. Run VGA clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_VBR; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.3); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting, layer framedrop control and index for bitrate array. +class DatarateOnePassCbrSvcFrameDropMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith3Params { + public: + DatarateOnePassCbrSvcFrameDropMultiBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64, + 1.45); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, for KSVC in flexible mode with no update of reference +// frames for all spatial layers on TL > 0 superframes. +// Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ksvc_flex_noupd_tlenh_ = true; + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +} + +// Params: speed setting, inter-layer prediction mode. +class DatarateOnePassCbrSvcInterLayerPredSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcInterLayerPredSingleBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(2); + ResetModel(); + } +}; + +// Check basic rate targeting with different inter-layer prediction modes for 1 +// pass CBR SVC: 3 spatial layers and 3 temporal layers. Run CIF clip with 1 +// thread. +TEST_P(DatarateOnePassCbrSvcInterLayerPredSingleBR, OnePassCbrSvc3SL3TL) { + // Disable test for inter-layer pred off for now since simulcast_mode fails. + if (inter_layer_pred_mode_ == INTER_LAYER_PRED_OFF) return; + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check rate targeting with different inter-layer prediction modes for 1 pass +// CBR SVC: 3 spatial layers and 3 temporal layers, changing the target bitrate +// at the middle of encoding. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLDynamicBitrateChange) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + change_bitrate_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting, noise sensitivity, index for bitrate array and inter +// layer pred mode. +class DatarateOnePassCbrSvcDenoiser + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith4Params { + public: + DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcDenoiser() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(3); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC with denoising. +// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads. +TEST_P(DatarateOnePassCbrSvcDenoiser, OnePassCbrSvc2SL3TLDenoiserOn) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 600, 800, 1000 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + // For SVC, noise_sen = 1 means denoising only the top spatial layer + // noise_sen = 2 means denoising the two top spatial layers. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + denoiser_on_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Params: speed setting, key frame dist. +class DatarateOnePassCbrSvcSmallKF + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcSmallKF() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2); + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2) + 32; + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run VGA clip with 1 thread, and place layer sync frames: +// one at middle layer first, then another one for top layer, and another +// insert for base spatial layer (which forces key frame). +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLSyncFrames) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 3 spatial layers, 1 temporal layer, with +// intra-only frame as sync frame on base spatial layer. +// Intra_only is inserted at start and in middle of sequence. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL1TLSyncWithIntraOnly) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + // Use intra_only frame for sync on base layer. + force_intra_only_frame_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 2 quality layers (same resolution different, +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + tune_content_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal. + cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting. +class DatarateOnePassCbrSvcPostencodeDrop + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Run SVC encoder for 2 quality layers (same resolution different, +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 200; + cfg_.rc_buf_optimal_sz = 200; + cfg_.rc_buf_sz = 400; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + top_sl_width_ = 352; + top_sl_height_ = 288; + ResetModel(); + base_speed_setting_ = speed_setting_; + tune_content_ = 1; + use_post_encode_drop_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal. + cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSingleBR, + ::testing::Range(5, 10)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcPostencodeDrop, + ::testing::Range(5, 6)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcInterLayerPredSingleBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcFrameDropMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 2), + ::testing::Range(0, 3)); + +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3), ::testing::Range(0, 4)); +#endif + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSmallKF, + ::testing::Range(5, 10), ::testing::Range(32, 36)); +} // namespace +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_end_to_end_test.cc b/media/libvpx/libvpx/test/svc_end_to_end_test.cc new file mode 100644 index 0000000000..b4337ae754 --- /dev/null +++ b/media/libvpx/libvpx/test/svc_end_to_end_test.cc @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +class ScalePartitionOnePassCbrSvc + : public OnePassCbrSvc, + public ::testing::TestWithParam { + public: + ScalePartitionOnePassCbrSvc() + : OnePassCbrSvc(GetParam()), mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + ~ScalePartitionOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + ++mismatch_nframes_; + } + + void SetConfig(const int /*num_temporal_layer*/) override {} + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + private: + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; +}; + +TEST_P(ScalePartitionOnePassCbrSvc, OnePassCbrSvc3SL3TL1080P) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video( + "slides_code_term_web_plot.1920_1080.yuv", 1920, 1080, 30, 1, 0, 100); + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: Inter layer prediction modes. +class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + SyncFrameOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), current_video_frame_(0), + frame_to_start_decode_(0), frame_to_sync_(0), + inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1), + decode_to_layer_after_sync_(-1), denoiser_on_(0), + intra_only_test_(false), loopfilter_off_(0), mismatch_nframes_(0), + num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_)); + } + + protected: + ~SyncFrameOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + bool DoDecode() const override { + return current_video_frame_ >= frame_to_start_decode_; + } + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode. + void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. + if (!tl) { + if (!sl) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + // Non-zero spatial. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't + // update any reference buffers). + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + current_video_frame_ = video->frame(); + PreEncodeFrameHookSetup(video, encoder); + if (video->frame() == 0) { + // Do not turn off inter-layer pred completely because simulcast mode + // fails. + if (inter_layer_pred_mode_ != INTER_LAYER_PRED_OFF) + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + if (intra_only_test_) + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tess to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } + if (flexible_mode_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, + &ref_frame_config_); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + if (video->frame() == frame_to_sync_) { + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); + } + } + +#if CONFIG_VP9_DECODER + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { + if (video->frame() < frame_to_sync_) { + if (decode_to_layer_before_sync_ >= 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, + decode_to_layer_before_sync_); + } else { + if (decode_to_layer_after_sync_ >= 0) { + int decode_to_layer = decode_to_layer_after_sync_; + // Overlay frame is additional layer for intra-only. + if (video->frame() == frame_to_sync_ && intra_only_test_ && + decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1) + decode_to_layer += 1; + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer); + } + } + } +#endif + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1] && + current_video_frame_ >= frame_to_sync_) + num_nonref_frames_++; + + if (intra_only_test_ && current_video_frame_ == frame_to_sync_) { + // Intra-only frame is only generated for spatial layers > 1 and <= 3, + // among other conditions (see constraint in set_intra_only_frame(). If + // intra-only is no allowed then encoder will insert key frame instead. + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (number_spatial_layers_ == 1 || number_spatial_layers_ > 3) + ASSERT_TRUE(key_frame); + else + ASSERT_FALSE(key_frame); + } + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + unsigned int current_video_frame_; + unsigned int frame_to_start_decode_; + unsigned int frame_to_sync_; + int inter_layer_pred_mode_; + int decode_to_layer_before_sync_; + int decode_to_layer_after_sync_; + int denoiser_on_; + bool intra_only_test_; + int loopfilter_off_; + vpx_svc_spatial_layer_sync_t svc_layer_sync_; + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; + bool flexible_mode_; + vpx_svc_ref_frame_config_t ref_frame_config_; + + private: + void SetConfig(const int num_temporal_layer) override { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } +}; + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Only start decoding on the sync layer. +// Full sync: insert key frame on base layer. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) { + SetSvcConfig(3, 3); + // Sync is on base layer so the frame to sync and the frame to start decoding + // is the same. + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = -1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA and VGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + denoiser_on_ = 1; + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Encode 3 spatial, 2 temporal layer in flexible mode but don't +// start decoding. During the sequence insert intra-only on base/qvga +// layer at frame 20 and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) { + SetSvcConfig(3, 2); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = true; + AssignLayerBitrates(); + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + +// Encode 3 spatial, 3 temporal layer but don't start decoding. +// During the sequence insert intra-only on base/qvga layer at frame 20 +// and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer and sync_layer on middle/VGA layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from sync frame, insert intra-only on base/qvga layer. Decode +// all layers. For 1 spatial layer, it inserts a key frame. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(1, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: Loopfilter modes. +class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + LoopfilterOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), loopfilter_off_(GET_PARAM(1)), + mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + ~LoopfilterOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + // Consider 3 cases: + if (loopfilter_off_ == 0) { + // loopfilter is on for all spatial layers on every superrframe. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 0; + } + } else if (loopfilter_off_ == 1) { + // loopfilter is off for non-reference frames for all spatial layers. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } else { + // loopfilter is off for all SL0 frames, and off only for non-reference + // frames for SL > 0. + svc_params_.loopfilter_ctrl[0] = 2; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) { + // For non-SVC mode use the single layer control. + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + ++mismatch_nframes_; + } + + void SetConfig(const int /*num_temporal_layer*/) override {} + + int GetMismatchFrames() const { return mismatch_nframes_; } + int GetNonRefFrames() const { return num_nonref_frames_; } + + int loopfilter_off_; + + private: + int mismatch_nframes_; + int num_nonref_frames_; +}; + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) { + SetSvcConfig(1, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { + SetSvcConfig(1, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(LoopfilterOnePassCbrSvc, ::testing::Range(0, 3)); + +INSTANTIATE_TEST_SUITE_P( + VP9, ScalePartitionOnePassCbrSvc, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); + +} // namespace +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_test.cc b/media/libvpx/libvpx/test/svc_test.cc new file mode 100644 index 0000000000..cbc0abe032 --- /dev/null +++ b/media/libvpx/libvpx/test/svc_test.cc @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/svc_test.h" + +namespace svc_test { +void OnePassCbrSvc::SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer) { + SetConfig(num_temporal_layer); + cfg_.ss_number_layers = num_spatial_layer; + cfg_.ts_number_layers = num_temporal_layer; + if (num_spatial_layer == 1) { + svc_params_.scaling_factor_num[0] = 288; + svc_params_.scaling_factor_den[0] = 288; + } else if (num_spatial_layer == 2) { + svc_params_.scaling_factor_num[0] = 144; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 288; + svc_params_.scaling_factor_den[1] = 288; + } else if (num_spatial_layer == 3) { + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + } + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; +} + +void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 63; + svc_params_.min_quantizers[i] = 0; + } + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + svc_params_.speed_per_layer[0] = base_speed_setting_; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.speed_per_layer[i] = speed_setting_; + } + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } + encoder->Control(VP8E_SET_CPUUSED, speed_setting_); + encoder->Control(VP9E_SET_AQ_MODE, 3); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); + } + + superframe_count_++; + temporal_layer_id_ = 0; + if (number_temporal_layers_ == 2) { + temporal_layer_id_ = (superframe_count_ % 2 != 0); + } else if (number_temporal_layers_ == 3) { + if (superframe_count_ % 2 != 0) temporal_layer_id_ = 2; + if (superframe_count_ > 1) { + if ((superframe_count_ - 2) % 4 == 0) temporal_layer_id_ = 1; + } + } + + frame_flags_ = 0; +} + +void OnePassCbrSvc::PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast(layer_target_avg_bandwidth_[layer]); + } + } +} + +void OnePassCbrSvc::AssignLayerBitrates() { + int sl, spatial_layer_target; + int spatial_layers = cfg_.ss_number_layers; + int temporal_layers = cfg_.ts_number_layers; + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; + for (sl = 0; sl < spatial_layers; ++sl) { + if (svc_params_.scaling_factor_den[sl] > 0) { + alloc_ratio[sl] = + static_cast((svc_params_.scaling_factor_num[sl] * 1.0 / + svc_params_.scaling_factor_den[sl])); + total += alloc_ratio[sl]; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + cfg_.ss_target_bitrate[sl] = spatial_layer_target = + static_cast(cfg_.rc_target_bitrate * alloc_ratio[sl] / + total); + const int index = sl * temporal_layers; + if (cfg_.temporal_layering_mode == 3) { + cfg_.layer_target_bitrate[index] = spatial_layer_target >> 1; + cfg_.layer_target_bitrate[index + 1] = + (spatial_layer_target >> 1) + (spatial_layer_target >> 2); + cfg_.layer_target_bitrate[index + 2] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode == 2) { + cfg_.layer_target_bitrate[index] = spatial_layer_target * 2 / 3; + cfg_.layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode <= 1) { + cfg_.layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth_[layer] = static_cast( + cfg_.layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model_[layer] = + cfg_.layer_target_bitrate[layer] * cfg_.rc_buf_initial_sz; + } + } +} +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_test.h b/media/libvpx/libvpx/test/svc_test.h new file mode 100644 index 0000000000..0026372de5 --- /dev/null +++ b/media/libvpx/libvpx/test/svc_test.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_SVC_TEST_H_ +#define VPX_TEST_SVC_TEST_H_ + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +class OnePassCbrSvc : public ::libvpx_test::EncoderTest { + public: + explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0), + superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0), + number_spatial_layers_(0) { + memset(&svc_params_, 0, sizeof(svc_params_)); + memset(bits_in_buffer_model_, 0, + sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS); + memset(layer_target_avg_bandwidth_, 0, + sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS); + } + + protected: + ~OnePassCbrSvc() override {} + + virtual void SetConfig(const int num_temporal_layer) = 0; + + virtual void SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer); + + virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder); + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; + + virtual void AssignLayerBitrates(); + + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} + + vpx_svc_extra_cfg_t svc_params_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; + int base_speed_setting_; + int speed_setting_; + int superframe_count_; + int temporal_layer_id_; + int number_temporal_layers_; + int number_spatial_layers_; +}; +} // namespace svc_test + +#endif // VPX_TEST_SVC_TEST_H_ diff --git a/media/libvpx/libvpx/test/test-data.mk b/media/libvpx/libvpx/test/test-data.mk new file mode 100644 index 0000000000..9eabffae3e --- /dev/null +++ b/media/libvpx/libvpx/test/test-data.mk @@ -0,0 +1,899 @@ +LIBVPX_TEST_SRCS-yes += test-data.mk + +# Encoder test source +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv + +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv + +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m + +# Test vectors +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm.md5 +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5 +endif # CONFIG_VP9_HIGHBITDEPTH + +# Invalid files for testing libvpx error checking. +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile + +ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) +# Encode / Decode test +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv +# BBB VP9 streams +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_640x360_tile_1x2_337kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_854x480_tile_1x2_651kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm +# Sintel VP9 streams +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_426x182_tile_1x1_171kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_640x272_tile_1x2_318kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_854x364_tile_1x2_621kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm +# TOS VP9 streams +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_426x178_tile_1x1_181kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_640x266_tile_1x2_336kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_656kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_1306kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm +endif # CONFIG_DECODE_PERF_TESTS + +ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes) +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv +endif # CONFIG_ENCODE_PERF_TESTS + +# sort and remove duplicates +LIBVPX_TEST_DATA-yes := $(sort $(LIBVPX_TEST_DATA-yes)) + +# VP9 dynamic resizing test (decoder) +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/media/libvpx/libvpx/test/test-data.sha1 b/media/libvpx/libvpx/test/test-data.sha1 new file mode 100644 index 0000000000..a9decc6b6b --- /dev/null +++ b/media/libvpx/libvpx/test/test-data.sha1 @@ -0,0 +1,873 @@ +3eaf216d9fc8b4b9bb8c3956311f49a85974806c *bus_352x288_420_f20_b8.yuv +d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv +b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv +76024eb753cdac6a5e5703aaea189d35c3c30ac7 *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf +7448d8798a4380162d4b56f9b452e2f6f9e24e7a *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res +83f50908c8dc0ef8760595447a2ff7727489542e *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf +456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res +c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf +456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res +fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res +d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm +8e2eff4af87d2b561cce2365713269e301457ef3 *invalid-vp90-02-v2.webm.res +df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm +4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res +d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm +3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res +9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m +0936b837708ae68c034719f8e07596021c2c214f *park_joy_90p_10_420_20f.y4m +5727a853c083c1099f837d27967bc1322d50ed4f *park_joy_90p_10_422_20f.y4m +e13489470ef8e8b2a871a5640d795a42a39be58d *park_joy_90p_10_444_20f.y4m +c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv +79b0dc1784635a7f291e21c4e8d66a29c496ab99 *park_joy_90p_12_420_20f.y4m +9cf22b0f809f7464c8b9058f0cfa9d905921cbd1 *park_joy_90p_12_422_20f.y4m +22b2a4abaecc4a9ade6bb503d25fb82367947e85 *park_joy_90p_12_444_20f.y4m +82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv +b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m +4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m +7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m +bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m +81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv +b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m +5184c46ddca8b1fadd16742e8500115bc8f749da *vp80-00-comprehensive-001.ivf +65bf1bbbced81b97bd030f376d1b7f61a224793f *vp80-00-comprehensive-002.ivf +906b4c1e99eb734504c504b3f1ad8052137ce672 *vp80-00-comprehensive-003.ivf +ec144b1af53af895db78355785650b96dd3f0ade *vp80-00-comprehensive-004.ivf +afc7091785c62f1c121c4554a2830c30704587d9 *vp80-00-comprehensive-005.ivf +42ea9d55c818145d06a9b633b8e85c6a6164fd3e *vp80-00-comprehensive-006.ivf +e5b3a73ab79fe024c14309d653d6bed92902ee3b *vp80-00-comprehensive-007.ivf +f3c50a58875930adfb84525c0ef59d7e4c08540c *vp80-00-comprehensive-008.ivf +4b2841fdb83db51ae322096ae468bbb9dc2c8362 *vp80-00-comprehensive-009.ivf +efbff736e3a91ab6a98c5bc2dce65d645944c7b1 *vp80-00-comprehensive-010.ivf +6b315102cae008d22a3d2c231be92cb704a222f8 *vp80-00-comprehensive-011.ivf +f3214a4fea14c2d5ec689936c1613f274c859ee8 *vp80-00-comprehensive-012.ivf +e4094e96d308c8a35b74c480a43d853c5294cd34 *vp80-00-comprehensive-013.ivf +5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a *vp80-00-comprehensive-014.ivf +e8467688ddf26b5000664f904faf0d70506aa653 *vp80-00-comprehensive-015.ivf +aab55582337dfd2a39ff54fb2576a91910d49337 *vp80-00-comprehensive-016.ivf +1ba24724f80203c9bae4f1d0f99d534721980016 *vp80-00-comprehensive-017.ivf +143a15512b46f436280ddb4d0e6411eb4af434f2 *vp80-00-comprehensive-018.ivf +c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b *vp80-01-intra-1400.ivf +f383955229afe3408453e316d11553d923ca60d5 *vp80-01-intra-1411.ivf +84e1f4343f174c9f3c83f834bac3196fb325bf2c *vp80-01-intra-1416.ivf +fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51 *vp80-01-intra-1417.ivf +71ea772d3e9d315b8cbecf41207b8a237c34853b *vp80-02-inter-1402.ivf +d85dbc4271525dcd128c503f936fe69091d1f8d0 *vp80-02-inter-1412.ivf +d4e5d3ad56511867d025f93724d090f92ba6ec3d *vp80-02-inter-1418.ivf +91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa *vp80-02-inter-1424.ivf +17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02 *vp80-03-segmentation-01.ivf +3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb *vp80-03-segmentation-02.ivf +c156778d5340967d4b369c490848076e92f1f875 *vp80-03-segmentation-03.ivf +d25dcff6c60e87a1af70945b8911b6b4998533b0 *vp80-03-segmentation-04.ivf +362baba2ce454c9db21218f35e81c27a5ed0b730 *vp80-03-segmentation-1401.ivf +d223ae7ee748ce07e74c4679bfd219e84aa9f4b0 *vp80-03-segmentation-1403.ivf +033adf7f3a13836a3f1cffcb87c1972900f2b5c6 *vp80-03-segmentation-1407.ivf +4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f *vp80-03-segmentation-1408.ivf +f37a62b197c2600d75e0ccfbb31b60efdedac251 *vp80-03-segmentation-1409.ivf +eb25bd7bfba5b2f6935018a930f42d123b1e7fcd *vp80-03-segmentation-1410.ivf +b9d5c436663a30c27cfff84b53a002e501258843 *vp80-03-segmentation-1413.ivf +6da92b9d1a180cc3a8afe348ab12258f5a37be1a *vp80-03-segmentation-1414.ivf +a4f5842602886bd669f115f93d8a35c035cb0948 *vp80-03-segmentation-1415.ivf +f295dceb8ef278b77251b3f9df8aee22e161d547 *vp80-03-segmentation-1425.ivf +198dbf9f36f733200e432664cc8c5752d59779de *vp80-03-segmentation-1426.ivf +7704804e32f5de976803929934a7fafe101ac7b0 *vp80-03-segmentation-1427.ivf +831ccd862ea95ca025d2f3bd8b88678752f5416d *vp80-03-segmentation-1432.ivf +b3c11978529289f9109f2766fcaba3ebc40e11ef *vp80-03-segmentation-1435.ivf +a835a731f5520ebfc1002c40121264d0020559ac *vp80-03-segmentation-1436.ivf +1d1732942f773bb2a5775fcb9689b1579ce28eab *vp80-03-segmentation-1437.ivf +db04799adfe089dfdf74dbd43cc05ede7161f99e *vp80-03-segmentation-1441.ivf +7caf39b3f20cfd52b998210878062e52a5edf1e6 *vp80-03-segmentation-1442.ivf +3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261 *vp80-04-partitions-1404.ivf +93cc323b6b6867f1b12dd48773424549c6960a6b *vp80-04-partitions-1405.ivf +047eedb14b865bdac8a3538e63801054e0295e9c *vp80-04-partitions-1406.ivf +0f1233bd2bc33f56ce5e495dbd455d122339f384 *vp80-05-sharpness-1428.ivf +51767fc136488a9535c2a4c38067c542ee2048df *vp80-05-sharpness-1429.ivf +9805aa107672de25d6fb8c35e20d06deca5efe18 *vp80-05-sharpness-1430.ivf +61db6b965f9c27aebe71b85bf2d5877e58e4bbdf *vp80-05-sharpness-1431.ivf +10420d266290d2923555f84af38eeb96edbd3ae8 *vp80-05-sharpness-1433.ivf +3ed24f9a80cddfdf75824ba95cdb4ff9286cb443 *vp80-05-sharpness-1434.ivf +c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163 *vp80-05-sharpness-1438.ivf +aff51d865c2621b60510459244ea83e958e4baed *vp80-05-sharpness-1439.ivf +da386e72b19b5485a6af199c5eb60ef25e510dd1 *vp80-05-sharpness-1440.ivf +6759a095203d96ccd267ce09b1b050b8cc4c2f1f *vp80-05-sharpness-1443.ivf +b95d3cc1d0df991e63e150a801710a72f20d9ba0 *vp80-06-smallsize.ivf +db55ec7fd02c864ba996ff060b25b1e08611330b *vp80-00-comprehensive-001.ivf.md5 +29db0ad011cba1e45f856d5623cd38dac3e3bf19 *vp80-00-comprehensive-002.ivf.md5 +e84f258f69e173e7d68f8f8c037a0a3766902182 *vp80-00-comprehensive-003.ivf.md5 +eb7912eaf69559a16fd82bc3f5fb1524cf4a4466 *vp80-00-comprehensive-004.ivf.md5 +4206f71c94894bd5b5b376f6c09b3817dbc65206 *vp80-00-comprehensive-005.ivf.md5 +4f89b356f6f2fecb928f330a10f804f00f5325f5 *vp80-00-comprehensive-006.ivf.md5 +2813236a32964dd8007e17648bcf035a20fcda6c *vp80-00-comprehensive-007.ivf.md5 +10746c72098f872803c900e17c5680e451f5f498 *vp80-00-comprehensive-008.ivf.md5 +39a23d0692ce64421a7bb7cdf6ccec5928d37fff *vp80-00-comprehensive-009.ivf.md5 +f6e3de8931a0cc659bda8fbc14050346955e72d4 *vp80-00-comprehensive-010.ivf.md5 +101683ec195b6e944f7cd1e468fc8921439363e6 *vp80-00-comprehensive-011.ivf.md5 +1f592751ce46d8688998fa0fa4fbdcda0fd4058c *vp80-00-comprehensive-012.ivf.md5 +6066176f90ca790251e795fca1a5797d59999841 *vp80-00-comprehensive-013.ivf.md5 +2656da94ba93691f23edc4d60b3a09e2be46c217 *vp80-00-comprehensive-014.ivf.md5 +c6e0d5f5d61460c8ac8edfa4e701f10312c03133 *vp80-00-comprehensive-015.ivf.md5 +ee60fee501d8493e34e8d6a1fe315b51ed09b24a *vp80-00-comprehensive-016.ivf.md5 +9f1914ceffcad4546c0a29de3ef591d8bea304dc *vp80-00-comprehensive-017.ivf.md5 +e0305178fe288a9fd8082b39e2d03181edb19054 *vp80-00-comprehensive-018.ivf.md5 +612494da2fa799cc9d76dcdd835ae6c7cb2e5c05 *vp80-01-intra-1400.ivf.md5 +48ea06097ac8269c5e8c2131d3d0639f431fcf0e *vp80-01-intra-1411.ivf.md5 +6e2ab4e7677ad0ba868083ca6bc387ee922b400c *vp80-01-intra-1416.ivf.md5 +eca0a90348959ce3854142f8d8641b13050e8349 *vp80-01-intra-1417.ivf.md5 +920feea203145d5c2258a91c4e6991934a79a99e *vp80-02-inter-1402.ivf.md5 +f71d97909fe2b3dd65be7e1f56c72237f0cef200 *vp80-02-inter-1412.ivf.md5 +e911254569a30bbb2a237ff8b79f69ed9da0672d *vp80-02-inter-1418.ivf.md5 +58c789c50c9bb9cc90580bed291164a0939d28ba *vp80-02-inter-1424.ivf.md5 +ff3e2f441327b9c20a0b37c524e0f5a48a36de7b *vp80-03-segmentation-01.ivf.md5 +0791f417f076a542ae66fbc3426ab4d94cbd6c75 *vp80-03-segmentation-02.ivf.md5 +722e50f1a6a91c34302d68681faffc1c26d1cc57 *vp80-03-segmentation-03.ivf.md5 +c701f1885bcfb27fb8e70cc65606b289172ef889 *vp80-03-segmentation-04.ivf.md5 +f79bc9ec189a2b4807632a3d0c5bf04a178b5300 *vp80-03-segmentation-1401.ivf.md5 +b9aa4c74c0219b639811c44760d0b24cd8bb436a *vp80-03-segmentation-1403.ivf.md5 +70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334 *vp80-03-segmentation-1407.ivf.md5 +265f962ee781531f9a93b9309461316fd32b2a1d *vp80-03-segmentation-1408.ivf.md5 +0c4ecbbd6dc042d30e626d951b65f460dd6cd563 *vp80-03-segmentation-1409.ivf.md5 +cf779af36a937f06570a0fca9db64ba133451dee *vp80-03-segmentation-1410.ivf.md5 +0e6c5036d51ab078842f133934926c598a9cff02 *vp80-03-segmentation-1413.ivf.md5 +eb3930aaf229116c80d507516c34759c3f6cdf69 *vp80-03-segmentation-1414.ivf.md5 +123d6c0f72ee87911c4ae7538e87b7d163b22d6c *vp80-03-segmentation-1415.ivf.md5 +e70551d1a38920e097a5d8782390b79ecaeb7505 *vp80-03-segmentation-1425.ivf.md5 +44e8f4117e46dbb302b2cfd81171cc1a1846e431 *vp80-03-segmentation-1426.ivf.md5 +52636e54aee5f95bbace37021bd67de5db767e9a *vp80-03-segmentation-1427.ivf.md5 +b1ad3eff20215c28e295b15ef3636ed926d59cba *vp80-03-segmentation-1432.ivf.md5 +24c22a552fa28a90e5978f67f57181cc2d7546d7 *vp80-03-segmentation-1435.ivf.md5 +96c49c390abfced18a7a8c9b9ea10af778e10edb *vp80-03-segmentation-1436.ivf.md5 +f95eb6214571434f1f73ab7833b9ccdf47588020 *vp80-03-segmentation-1437.ivf.md5 +1c0700ca27c9b0090a7747a4b0b4dc21d1843181 *vp80-03-segmentation-1441.ivf.md5 +81d4f23ca32667ee958bae579c8f5e97ba72eb97 *vp80-03-segmentation-1442.ivf.md5 +272efcef07a3a30fbca51bfd566063d8258ec0be *vp80-04-partitions-1404.ivf.md5 +66ed219ab812ac801b256d35cf495d193d4cf478 *vp80-04-partitions-1405.ivf.md5 +36083f37f56f502bd60ec5e07502ee9e6b8699b0 *vp80-04-partitions-1406.ivf.md5 +6ca909bf168a64c09415626294665dc1be3d1973 *vp80-05-sharpness-1428.ivf.md5 +1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a *vp80-05-sharpness-1429.ivf.md5 +71bcbe5357d36a19df5b07fbe3e27bffa8893f0a *vp80-05-sharpness-1430.ivf.md5 +89a09b1dffce2d55770a89e58d9925c70ef79bf8 *vp80-05-sharpness-1431.ivf.md5 +08444a18b4e6ba3450c0796dd728d48c399a2dc9 *vp80-05-sharpness-1433.ivf.md5 +6d6223719a90c13e848aa2a8a6642098cdb5977a *vp80-05-sharpness-1434.ivf.md5 +41d70bb5fa45bc88da1604a0af466930b8dd77b5 *vp80-05-sharpness-1438.ivf.md5 +086c56378df81b6cee264d7540a7b8f2b405c7a4 *vp80-05-sharpness-1439.ivf.md5 +d32dc2c4165eb266ea4c23c14a45459b363def32 *vp80-05-sharpness-1440.ivf.md5 +8c69dc3d8e563f56ffab5ad1e400d9e689dd23df *vp80-05-sharpness-1443.ivf.md5 +d6f246df012c241b5fa6c1345019a3703d85c419 *vp80-06-smallsize.ivf.md5 +ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 *vp90-2-00-quantizer-00.webm +ac5eda33407d0521c7afca43a63fd305c0cd9d13 *vp90-2-00-quantizer-00.webm.md5 +2ca0463f2cfb93d25d7dded174db70b7cb87cb48 *vp90-2-00-quantizer-01.webm +10d98884fc6d9a5f47a2057922b8e25dd48d7786 *vp90-2-00-quantizer-01.webm.md5 +d80a2920a5e0819d69dcba8fe260c01f820f8982 *vp90-2-00-quantizer-02.webm +c964c8e5e04165fabbf1c6ee8ee5121d35921965 *vp90-2-00-quantizer-02.webm.md5 +fdef046777b5b75c962b715d809dbe2ea331afb9 *vp90-2-00-quantizer-03.webm +f270bee0b0c7aa2bf4c5afe098556b4f3f890faf *vp90-2-00-quantizer-03.webm.md5 +66d98609e809394a6ac730787e6724e3badc075a *vp90-2-00-quantizer-04.webm +427433bfe121c4aea1095ec3124fdc174d200e3a *vp90-2-00-quantizer-04.webm.md5 +e6e42626d8cadf0b5be16313f69212981b96fee5 *vp90-2-00-quantizer-05.webm +c98f6a9a1af4cfd71416792827304266aad4bd46 *vp90-2-00-quantizer-05.webm.md5 +413ef09b721f5dcec1a96e937a97e5873c2e6db6 *vp90-2-00-quantizer-06.webm +5080e940a23805c82e578e21b57fc2c511e76376 *vp90-2-00-quantizer-06.webm.md5 +4a50a5f4ac717c30dfaae8bb46702e3542e867de *vp90-2-00-quantizer-07.webm +76c429a02b56762e10ee4db88729d8834b3a70f4 *vp90-2-00-quantizer-07.webm.md5 +d2f4e464780bf8b7e647efa18ac777a930e62bc0 *vp90-2-00-quantizer-08.webm +ab94aabf9316111b52d7c531962ed4123313b6ba *vp90-2-00-quantizer-08.webm.md5 +174bc58433936dd79550398d744f1072ce7f5693 *vp90-2-00-quantizer-09.webm +e1f7690cd83ccc56d045e17cce552544a5f03810 *vp90-2-00-quantizer-09.webm.md5 +52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-00-quantizer-10.webm +9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 *vp90-2-00-quantizer-10.webm.md5 +10031eecafde1e1d8e6323fe2b2a1d7e77a66869 *vp90-2-00-quantizer-11.webm +fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 *vp90-2-00-quantizer-11.webm.md5 +78e9f7bb77e8e348155bbdfa12790789d1d50c34 *vp90-2-00-quantizer-12.webm +0961d060cc8dd469c6dac8d7d75f927c0bb971b8 *vp90-2-00-quantizer-12.webm.md5 +133b77a3bbcef652552d74ffc46afbfe3b8a1cba *vp90-2-00-quantizer-13.webm +df29e5e0f95772af482f540d776f6b9dea4bfa29 *vp90-2-00-quantizer-13.webm.md5 +27323afdaf8987e025c27129c74c86502315a206 *vp90-2-00-quantizer-14.webm +ce96a2cc312942f0427a463f15a392870dd69764 *vp90-2-00-quantizer-14.webm.md5 +ab58d0b41037829f6bc993910999f4af0212aafd *vp90-2-00-quantizer-15.webm +40f700db606501aa7cb49049624cbdde6409b122 *vp90-2-00-quantizer-15.webm.md5 +cd948e66448aafb65998815ce37241f95d7c9ee7 *vp90-2-00-quantizer-16.webm +039b742d149c945ed79c7b9a6384352852a1c116 *vp90-2-00-quantizer-16.webm.md5 +62f56e663e13c576764e491cf08f19bd46a71999 *vp90-2-00-quantizer-17.webm +90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e *vp90-2-00-quantizer-17.webm.md5 +f26ecad7263cd66a614e53ba5d7c00df181affeb *vp90-2-00-quantizer-18.webm +cda0a1c0fca2ec2976ae55124a8a67305508bae6 *vp90-2-00-quantizer-18.webm.md5 +94bfc4c04fcfe139a63b98c569e8c14ba98c401f *vp90-2-00-quantizer-19.webm +5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f *vp90-2-00-quantizer-19.webm.md5 +0ee88e9318985e1e245de78c2c4a665885ab76a7 *vp90-2-00-quantizer-20.webm +4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 *vp90-2-00-quantizer-20.webm.md5 +6a995cb2b1db33da8087321df1e646f95c3e32d1 *vp90-2-00-quantizer-21.webm +e216b4a1eceac03efcc433759be54ab8ea87b24b *vp90-2-00-quantizer-21.webm.md5 +aa7722fc427e7180115f3c9cd96bb6b2768e7296 *vp90-2-00-quantizer-22.webm +1aa813bd45ae831bf5e79ace4d73dfd25989a07d *vp90-2-00-quantizer-22.webm.md5 +7677e5b929ed6d142041f19b8a9cd5822ee1504a *vp90-2-00-quantizer-23.webm +0de0af34abd843d5b37e58baf3ed96a6104b64c3 *vp90-2-00-quantizer-23.webm.md5 +b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 *vp90-2-00-quantizer-24.webm +db6033af2ba2f2bca62468fb4b8808e474f93923 *vp90-2-00-quantizer-24.webm.md5 +8135ba35587fd92cd4667be7896323d9b634401c *vp90-2-00-quantizer-25.webm +3499e00c2cc15876f61f07e3d3cfca54ebcd98fd *vp90-2-00-quantizer-25.webm.md5 +af0fa2907746db82d345f6d831fcc1b2862a29fb *vp90-2-00-quantizer-26.webm +cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 *vp90-2-00-quantizer-26.webm.md5 +bd0002e91323776beb5ff11e06edcf19fc08e9b9 *vp90-2-00-quantizer-27.webm +fe72154ef196067d6c272521012dd79706496cac *vp90-2-00-quantizer-27.webm.md5 +fc15eb606f81455ff03df16bf3432296b002c43c *vp90-2-00-quantizer-28.webm +40b2e24b542206a6bfd746ef199e49ccea07678a *vp90-2-00-quantizer-28.webm.md5 +3090bbf913cad0b2eddca7228f5ed51a58378b8d *vp90-2-00-quantizer-29.webm +eb59745e0912d8ed6c928268bcf265237c9ba93f *vp90-2-00-quantizer-29.webm.md5 +c615abdca9c25e1cb110d908edbedfb3b7c92b91 *vp90-2-00-quantizer-30.webm +ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 *vp90-2-00-quantizer-30.webm.md5 +037d9f242086cfb085518f6416259defa82d5fc2 *vp90-2-00-quantizer-31.webm +4654b40792572f0a790874c6347ef9196d86c1a7 *vp90-2-00-quantizer-31.webm.md5 +505899f3f3515044c5c8b3213d9b9d16f614619d *vp90-2-00-quantizer-32.webm +659a2e6dd02df323f62600626859006640b445df *vp90-2-00-quantizer-32.webm.md5 +8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc *vp90-2-00-quantizer-33.webm +5b175ef1120ddeba4feae1247bf381bbc4e816ce *vp90-2-00-quantizer-33.webm.md5 +4d283755d17e287b1d099a80604398f60d7fb6ea *vp90-2-00-quantizer-34.webm +22a739de95acfeb27524e3700b8f678a9ad744d8 *vp90-2-00-quantizer-34.webm.md5 +4296f56a892a412d3d4f64824718dd566c4e6459 *vp90-2-00-quantizer-35.webm +c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 *vp90-2-00-quantizer-35.webm.md5 +6f54e11da461e4410dd9075b015e2d9bc1d07dfb *vp90-2-00-quantizer-36.webm +0b3573f5addea4e3eb11a0b85f068299d5bdad78 *vp90-2-00-quantizer-36.webm.md5 +210581682a26c2c4375efc785c36e07539888bc2 *vp90-2-00-quantizer-37.webm +2b4fb6f8ba975237858e61cc8f560bcfc87cb38e *vp90-2-00-quantizer-37.webm.md5 +a15ef31283dfc4860f837fe200eb32a445f59629 *vp90-2-00-quantizer-38.webm +fb76771f3a795054b9936f70da7505c3ac585284 *vp90-2-00-quantizer-38.webm.md5 +1df8433a441412831daae6726df89fa70d21b14d *vp90-2-00-quantizer-39.webm +39e162c09a20e7e684868097766347014371fee6 *vp90-2-00-quantizer-39.webm.md5 +5330e4788ab9129dbb25a7a7d5411104521248b6 *vp90-2-00-quantizer-40.webm +872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 *vp90-2-00-quantizer-40.webm.md5 +d88d03b982889e399a78d7a06eeb1cf30e6c2da2 *vp90-2-00-quantizer-41.webm +5b4f7217e57fa2a221011d0b32f8d0409496b7b6 *vp90-2-00-quantizer-41.webm.md5 +9e16406e3e26955a6e17d455ef1ef64bbfa26e53 *vp90-2-00-quantizer-42.webm +0219d090cf37daabe19256ba8e932ba4874b92e4 *vp90-2-00-quantizer-42.webm.md5 +a9b15843486fb05f8cd15437ef279782a42b75db *vp90-2-00-quantizer-43.webm +3c9b0b4c607f9579a31726bfcf56729334ddc686 *vp90-2-00-quantizer-43.webm.md5 +1dbc931ac446c91eabe7213efff55b596cccf07c *vp90-2-00-quantizer-44.webm +73bc8f675103abaef3d9f73a2742b3bffd726d23 *vp90-2-00-quantizer-44.webm.md5 +7c6c1be15beb9d6201204b018966c8c4f9777efc *vp90-2-00-quantizer-45.webm +c907b29da821f790c6748de61f592689312e4e36 *vp90-2-00-quantizer-45.webm.md5 +07b434da1a467580f73b32177ee11b3e00f65a0d *vp90-2-00-quantizer-46.webm +7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 *vp90-2-00-quantizer-46.webm.md5 +233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 *vp90-2-00-quantizer-47.webm +527e0a9fb932efe915027ffe077f9e8d3a4fb139 *vp90-2-00-quantizer-47.webm.md5 +719613df7307e205c3fdb6acfb373849c5ab23c7 *vp90-2-00-quantizer-48.webm +65ab6c9d1b682c183b201c7ff42b90343ce3e304 *vp90-2-00-quantizer-48.webm.md5 +3bf04a598325ed0eabae1598ec7f718f715ec672 *vp90-2-00-quantizer-49.webm +ac68c4387ce11fcc998d8ba455ab9b2bb361d240 *vp90-2-00-quantizer-49.webm.md5 +d59238fb3a654931c9b65a11e7321b40d1f702e9 *vp90-2-00-quantizer-50.webm +d0576bfede46fd55659f028f2fd28554ceb3e6cc *vp90-2-00-quantizer-50.webm.md5 +3f579785101d4209360dd96f8c2ffe9beddf3bee *vp90-2-00-quantizer-51.webm +89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 *vp90-2-00-quantizer-51.webm.md5 +28be5836e2fedefe4babf12fc9b79e460ab0a0f4 *vp90-2-00-quantizer-52.webm +f3dd52b70c18345fee740220f35da9c4def2017a *vp90-2-00-quantizer-52.webm.md5 +488ad4058c17170665b6acd1021fade9a02771e4 *vp90-2-00-quantizer-53.webm +1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 *vp90-2-00-quantizer-53.webm.md5 +682978289cb28cc8c9d39bc797300e45d6039de7 *vp90-2-00-quantizer-54.webm +36c35353f2c03cb099bd710d9994de7d9ed88834 *vp90-2-00-quantizer-54.webm.md5 +c398ce49af762a48f10cc4da9fae0769aae5f226 *vp90-2-00-quantizer-55.webm +2cf3570542d984f167ab087f59493c7fb47e0ed2 *vp90-2-00-quantizer-55.webm.md5 +3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 *vp90-2-00-quantizer-56.webm +d3f93f8272b6de31cffb011a26f11abb514efb12 *vp90-2-00-quantizer-56.webm.md5 +f4e8e14b1f278801a7eb6f11734780a01b1668e9 *vp90-2-00-quantizer-57.webm +6478fdf1d7faf6db5f19dffc5e1363af358699ee *vp90-2-00-quantizer-57.webm.md5 +307dc264f57cc618fff211fa44d7f52767ed9660 *vp90-2-00-quantizer-58.webm +cf231d4a52d492fa692ea4194ec5eb7511fec54e *vp90-2-00-quantizer-58.webm.md5 +1fd7cd596170afce2de0b1441b7674bda5723440 *vp90-2-00-quantizer-59.webm +4681f7ef96f63e085c41bb1a964b0df7e67e0b38 *vp90-2-00-quantizer-59.webm.md5 +34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 *vp90-2-00-quantizer-60.webm +58691ef53b6b623810e2c57ded374c77535df935 *vp90-2-00-quantizer-60.webm.md5 +e6e812406aab81021bb16e772c1db03f75906cb6 *vp90-2-00-quantizer-61.webm +76436eace62f08ff92b61a0845e66667a027db1b *vp90-2-00-quantizer-61.webm.md5 +84d811bceed70c950a6a08e572a6e274866e72b1 *vp90-2-00-quantizer-62.webm +2d937cc011eeddd95222b960982da5cd18db580f *vp90-2-00-quantizer-62.webm.md5 +0912b295ba0ea09359315315ffd67d22d046f883 *vp90-2-00-quantizer-63.webm +5a829031055d70565f57dbcd47a6ac33619952b3 *vp90-2-00-quantizer-63.webm.md5 +0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 *vp90-2-01-sharpness-1.webm +5a0476be4448bae8f8ca17ea236c98793a755948 *vp90-2-01-sharpness-1.webm.md5 +51e02d7911810cdf5be8b68ac40aedab479a3179 *vp90-2-01-sharpness-2.webm +a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 *vp90-2-01-sharpness-2.webm.md5 +0603f8ad239c07a531d948187f4dafcaf51eda8d *vp90-2-01-sharpness-3.webm +3af8000a69c72fe77881e3176f026c2affb78cc7 *vp90-2-01-sharpness-3.webm.md5 +4ca4839f48146252fb261ed88838d80211804841 *vp90-2-01-sharpness-4.webm +08832a1494f84fa9edd40e080bcf2c0e80100c76 *vp90-2-01-sharpness-4.webm.md5 +95099dc8f9cbaf9b9a7dd65311923e441ff70731 *vp90-2-01-sharpness-5.webm +93ceee30c140f0b406726c0d896b9db6031c4c7f *vp90-2-01-sharpness-5.webm.md5 +ceb4116fb7b078d266d153233b6d62a255a34e4c *vp90-2-01-sharpness-6.webm +da83efe59e537ce538e8b03a6eac63cf25849c9a *vp90-2-01-sharpness-6.webm.md5 +b5f7cd19aece3880f9d616a778e5cc24c6b9b505 *vp90-2-01-sharpness-7.webm +2957408d20deac8633941a2169f801bae6f086e1 *vp90-2-01-sharpness-7.webm.md5 +ffc096c2ce1050450ad462b5fabd2a5220846319 *vp90-2-02-size-08x08.webm +e36d2ed6fa2746347710b750586aafa6a01ff3ae *vp90-2-02-size-08x08.webm.md5 +895b986f9fd55cd879472b31c6a06b82094418c8 *vp90-2-02-size-08x10.webm +079157a19137ccaebba606f2871f45a397347150 *vp90-2-02-size-08x10.webm.md5 +1c5992203e62a2b83040ccbecd748b604e19f4c0 *vp90-2-02-size-08x16.webm +9aa45ffdf2078f883bbed01450031b691819c144 *vp90-2-02-size-08x16.webm.md5 +d0a8953da1f85f484487408fee5da9e2a8391901 *vp90-2-02-size-08x18.webm +59a5cc17d354c6a23e5e959d666b1456a5d49c56 *vp90-2-02-size-08x18.webm.md5 +1b13461a9fc65cb041bacfe4ea6f02d363397d61 *vp90-2-02-size-08x32.webm +2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 *vp90-2-02-size-08x32.webm.md5 +2861f0a0daadb62295b0504a1fbe5b50c79a8f59 *vp90-2-02-size-08x34.webm +6b5812cfb8a82d378ea2913bf009e93668020147 *vp90-2-02-size-08x34.webm.md5 +02f948216d4246579dc53c47fe55d8fb264ba251 *vp90-2-02-size-08x64.webm +84b55fdee6d9aa820c7a8c62822446184b191767 *vp90-2-02-size-08x64.webm.md5 +4b011242cbf42516efd2b197baebb61dd34562c9 *vp90-2-02-size-08x66.webm +6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b *vp90-2-02-size-08x66.webm.md5 +4057796be9dd12df48ab607f502ae6aa70eeeab6 *vp90-2-02-size-10x08.webm +71c752c51aec9f48de286b93f4c20e9c11cad7d0 *vp90-2-02-size-10x08.webm.md5 +6583c853fa43fc53d51743eac5f3a43a359d45d0 *vp90-2-02-size-10x10.webm +1da524d24af1944b671d4d3f2b398d6e336584c3 *vp90-2-02-size-10x10.webm.md5 +ba442fc03ccd3a705c64c83b36f5ada67d198874 *vp90-2-02-size-10x16.webm +7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 *vp90-2-02-size-10x16.webm.md5 +cc92ed40eef14f52e4d080cb2c57939dd8326374 *vp90-2-02-size-10x18.webm +db5626275cc55ce970b91c995e74f6838d943aca *vp90-2-02-size-10x18.webm.md5 +3a93d501d22325e9fd4c9d8b82e2a432de33c351 *vp90-2-02-size-10x32.webm +5cae51b0c71cfc131651f345f87583eb2903afaf *vp90-2-02-size-10x32.webm.md5 +50d2f2b15a9a5178153db44a9e03aaf32b227f67 *vp90-2-02-size-10x34.webm +bb0efe058122641e7f73e94497dda2b9e6c21efd *vp90-2-02-size-10x34.webm.md5 +01624ec173e533e0b33fd9bdb91eb7360c7c9175 *vp90-2-02-size-10x64.webm +b9c0e3b054463546356acf5157f9be92fd34732f *vp90-2-02-size-10x64.webm.md5 +2942879baf1c09e96b14d0fc84806abfe129c706 *vp90-2-02-size-10x66.webm +bab5f539c2f91952e187456b4beafbb4c01e25ee *vp90-2-02-size-10x66.webm.md5 +88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 *vp90-2-02-size-16x08.webm +7f48a0fcf8c25963f3057d7f6669c5f2415834b8 *vp90-2-02-size-16x08.webm.md5 +59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f *vp90-2-02-size-16x10.webm +73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 *vp90-2-02-size-16x10.webm.md5 +066834fef9cf5b9a72932cf4dea5f253e14a976d *vp90-2-02-size-16x16.webm +faec542f52f37601cb9c480d887ae9355be99372 *vp90-2-02-size-16x16.webm.md5 +195307b4eb3192271ee4a935b0e48deef0c54cc2 *vp90-2-02-size-16x18.webm +5a92e19e624c0376321d4d0e22c0c91995bc23e1 *vp90-2-02-size-16x18.webm.md5 +14f3f884216d7ae16ec521f024a2f2d31bbf9c1a *vp90-2-02-size-16x32.webm +ea622d1c817dd174556f7ee7ccfe4942b34d4845 *vp90-2-02-size-16x32.webm.md5 +2e0501100578a5da9dd47e4beea160f945bdd1ba *vp90-2-02-size-16x34.webm +1b8645ef64239334921c5f56b24ce815e6070b05 *vp90-2-02-size-16x34.webm.md5 +89a6797fbebebe93215f367229a9152277f5dcfe *vp90-2-02-size-16x64.webm +a03d8c1179ca626a8856fb416d635dbf377979cd *vp90-2-02-size-16x64.webm.md5 +0f3a182e0750fcbae0b9eae80c7a53aabafdd18d *vp90-2-02-size-16x66.webm +8cb6736dc2d897c1283919a32068af377d66c59c *vp90-2-02-size-16x66.webm.md5 +68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 *vp90-2-02-size-18x08.webm +874c7fb505be9db3160c57cb405c4dbd5b990dc2 *vp90-2-02-size-18x08.webm.md5 +0546352dd78496d4dd86c3727ac2ff36c9e72032 *vp90-2-02-size-18x10.webm +1d80eb36557ea5f25a386495a36f93da0f25316b *vp90-2-02-size-18x10.webm.md5 +60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 *vp90-2-02-size-18x16.webm +1ab6cdd89a53662995d103546e6611c84f9292ab *vp90-2-02-size-18x16.webm.md5 +f9a8f5fb749d69fd555db6ca093b7f77800c7b4f *vp90-2-02-size-18x18.webm +ace8a66328f7802b15f9989c2720c029c6abd279 *vp90-2-02-size-18x18.webm.md5 +a197123a527ec25913a9bf52dc8c347749e00045 *vp90-2-02-size-18x32.webm +34fbd7036752232d1663e70d7f7cdc93f7129202 *vp90-2-02-size-18x32.webm.md5 +f219655a639a774a2c9c0a9f45c28dc0b5e75e24 *vp90-2-02-size-18x34.webm +2c4d622a9ea548791c1a07903d3702e9774388bb *vp90-2-02-size-18x34.webm.md5 +5308578da48c677d477a5404e19391d1303033c9 *vp90-2-02-size-18x64.webm +e7fd4462527bac38559518ba80e41847db880f15 *vp90-2-02-size-18x64.webm.md5 +e109a7e013bd179f97e378542e1e81689ed06802 *vp90-2-02-size-18x66.webm +45c04e422fb383c1f3be04beefaa4490e83bdb1a *vp90-2-02-size-18x66.webm.md5 +38844cae5d99caf445f7de33c3ae78494ce36c01 *vp90-2-02-size-32x08.webm +ad018be39e493ca2405225034b1a5b7a42af6f3a *vp90-2-02-size-32x08.webm.md5 +7b57eaad55906f9de9903c8657a3fcb2aaf792ea *vp90-2-02-size-32x10.webm +2294425d4e55d275af5e25a0beac9738a1b4ee73 *vp90-2-02-size-32x10.webm.md5 +f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 *vp90-2-02-size-32x16.webm +ae10981d93913f0ab1f28c1146255e01769aa8c0 *vp90-2-02-size-32x16.webm.md5 +08b23ad838b6cf1fbfe3ad7e7775d95573e815fc *vp90-2-02-size-32x18.webm +1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 *vp90-2-02-size-32x18.webm.md5 +d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 *vp90-2-02-size-32x32.webm +e39c067a8ee2da52a51641eb1cb7f8eba935eb6b *vp90-2-02-size-32x32.webm.md5 +529429920dc36bd899059fa75a767f02c8c60874 *vp90-2-02-size-32x34.webm +56888e7834f52b106e8911e3a7fc0f473b609995 *vp90-2-02-size-32x34.webm.md5 +38e848e160391c2b1a55040aadde613b9f4bf15e *vp90-2-02-size-32x64.webm +8950485fb3f68b0e8be234db860e4ec5f5490fd0 *vp90-2-02-size-32x64.webm.md5 +5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 *vp90-2-02-size-32x66.webm +225df9d7d72ec711b0b60f4aeb65311c97db054a *vp90-2-02-size-32x66.webm.md5 +695f929e2ce6fb11a1f180322d46c5cb1c97fa61 *vp90-2-02-size-34x08.webm +5bb4262030018dd01883965c6aa6070185924ef6 *vp90-2-02-size-34x08.webm.md5 +5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 *vp90-2-02-size-34x10.webm +71c100b437d3e8701632ae8d65c3555339b1c68f *vp90-2-02-size-34x10.webm.md5 +d0918923c987fba2d00193d83797b21289fe54aa *vp90-2-02-size-34x16.webm +5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d *vp90-2-02-size-34x16.webm.md5 +553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd *vp90-2-02-size-34x18.webm +a164c7f3c424987df2340496e6a8cf76e973f0f1 *vp90-2-02-size-34x18.webm.md5 +baf3e233634f150de81c18ba5d8848068e1c3c54 *vp90-2-02-size-34x32.webm +22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 *vp90-2-02-size-34x32.webm.md5 +6d50a533774a7167350e4a7ef43c94a5622179a2 *vp90-2-02-size-34x34.webm +0c099638e79c273546523e06704553e42eb00b00 *vp90-2-02-size-34x34.webm.md5 +698cdd0a5e895cc202c488675e682a8c537ede4f *vp90-2-02-size-34x64.webm +9317b63987cddab8389510a27b86f9f3d46e3fa5 *vp90-2-02-size-34x64.webm.md5 +4b5335ca06f082b6b69f584eb8e7886bdcafefd3 *vp90-2-02-size-34x66.webm +e18d68b35428f46a84a947c646804a51ef1d7cec *vp90-2-02-size-34x66.webm.md5 +a54ae7b494906ec928a876e8290e5574f2f9f6a2 *vp90-2-02-size-64x08.webm +87f9f7087b6489d45e9e4b38ede2c5aef4a4928f *vp90-2-02-size-64x08.webm.md5 +24522c70804a3c23d937df2d829ae63965b23f38 *vp90-2-02-size-64x10.webm +447ce03938ab53bffcb4a841ee0bfaa90462dcb9 *vp90-2-02-size-64x10.webm.md5 +2a5035d035d214ae614af8051930690ef623989b *vp90-2-02-size-64x16.webm +84e355761dd2e0361b904c84c52a0dd0384d89cf *vp90-2-02-size-64x16.webm.md5 +3a293ef4e270a19438e59b817fbe5f43eed4d36b *vp90-2-02-size-64x18.webm +666824e5ba746779eb46079e0631853dcc86d48b *vp90-2-02-size-64x18.webm.md5 +ed32fae837095c9e8fc95d223ec68101812932c2 *vp90-2-02-size-64x32.webm +97086eadedce1d0d9c072b585ba7b49aec69b1e7 *vp90-2-02-size-64x32.webm.md5 +696c7a7250bdfff594f4dfd88af34239092ecd00 *vp90-2-02-size-64x34.webm +253a1d38d452e7826b086846c6f872f829c276bb *vp90-2-02-size-64x34.webm.md5 +fc508e0e3c2e6872c60919a60b812c5232e9c2b0 *vp90-2-02-size-64x64.webm +2cd6ebeca0f82e9f505616825c07950371b905ab *vp90-2-02-size-64x64.webm.md5 +0f8a4fc1d6521187660425c283f08dff8c66e476 *vp90-2-02-size-64x66.webm +5806be11a1d346be235f88d3683e69f73746166c *vp90-2-02-size-64x66.webm.md5 +273b0c36e3658685cde250408a478116d7ae92f1 *vp90-2-02-size-66x08.webm +23c3cd0dca20a2f71f036e77ea92025ff4e7a298 *vp90-2-02-size-66x08.webm.md5 +4844c59c3306d1e671bb0568f00e344bf797e66e *vp90-2-02-size-66x10.webm +e041eaf6841d775f8fde8bbb4949d2733fdaab7f *vp90-2-02-size-66x10.webm.md5 +bdf3f1582b234fcd2805ffec59f9d716a2345302 *vp90-2-02-size-66x16.webm +2ec85ee18119e6798968571ea6e1b93ca386e3af *vp90-2-02-size-66x16.webm.md5 +0acce9af12b13b025d5274013da7ef6f568f075f *vp90-2-02-size-66x18.webm +77c4d53e2a5c96b70af9d575fe6811e0f5ee627b *vp90-2-02-size-66x18.webm.md5 +682b36a25774bbdedcd603f504d18eb63f0167d4 *vp90-2-02-size-66x32.webm +53728fae2a428f16d376a29f341a64ddca97996a *vp90-2-02-size-66x32.webm.md5 +e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 *vp90-2-02-size-66x34.webm +f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 *vp90-2-02-size-66x34.webm.md5 +4151b8c29452d5c2266397a7b9bf688899a2937b *vp90-2-02-size-66x64.webm +69486e7fd9e380b6c97a03d3e167affc79f73840 *vp90-2-02-size-66x64.webm.md5 +68784a1ecac776fe2a3f230345af32f06f123536 *vp90-2-02-size-66x66.webm +7f008c7f48d55e652fbd6bac405b51e0015c94f2 *vp90-2-02-size-66x66.webm.md5 +7e1bc449231ac1c5c2a11c9a6333b3e828763798 *vp90-2-03-size-196x196.webm +6788a561466dace32d500194bf042e19cccc35e1 *vp90-2-03-size-196x196.webm.md5 +a170c9a88ec1dd854c7a471ff55fb2a97ac31870 *vp90-2-03-size-196x198.webm +6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 *vp90-2-03-size-196x198.webm.md5 +68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f *vp90-2-03-size-196x200.webm +bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e *vp90-2-03-size-196x200.webm.md5 +fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 *vp90-2-03-size-196x202.webm +158ee72af578f39aad0c3b8f4cbed2fc78b57e0f *vp90-2-03-size-196x202.webm.md5 +dd28fb7247af534bdf5e6795a3ac429610489a0b *vp90-2-03-size-196x208.webm +7546be847efce2d1c0a23f807bfb03f91b764e1e *vp90-2-03-size-196x208.webm.md5 +41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 *vp90-2-03-size-196x210.webm +9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 *vp90-2-03-size-196x210.webm.md5 +5007bc618143437c009d6dde5fc2e86f72d37dc2 *vp90-2-03-size-196x224.webm +858361d8f79b44df5545feabbc9754ec9ede632f *vp90-2-03-size-196x224.webm.md5 +0bcbe357fbc776c3fa68e7117179574ed7564a44 *vp90-2-03-size-196x226.webm +72006a5f42031a43d70a2cd9fc1958962a86628f *vp90-2-03-size-196x226.webm.md5 +000239f048cceaac055558e97ef07078ebf65502 *vp90-2-03-size-198x196.webm +2d6841901b72000c5340f30be602853438c1b787 *vp90-2-03-size-198x196.webm.md5 +ae75b766306a6404c3b3b35a6b6d53633c14fbdb *vp90-2-03-size-198x198.webm +3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 *vp90-2-03-size-198x198.webm.md5 +95ffd573fa84ccef1cd59e1583e6054f56a5c83d *vp90-2-03-size-198x200.webm +5d537e3c9b9c54418c79677543454c4cda3de1af *vp90-2-03-size-198x200.webm.md5 +ecc845bf574375f469bc91bf5c75c79dc00073d6 *vp90-2-03-size-198x202.webm +1b59f5e111265615a7a459eeda8cc9045178d228 *vp90-2-03-size-198x202.webm.md5 +432fb27144fe421b9f51cf44d2750a26133ed585 *vp90-2-03-size-198x208.webm +a58a67f4fb357c73ca078aeecbc0f782975630b1 *vp90-2-03-size-198x208.webm.md5 +ff5058e7e6a47435046612afc8536f2040989e6f *vp90-2-03-size-198x210.webm +18d3be7935e52217e2e9400b6f2c681a9e45dc89 *vp90-2-03-size-198x210.webm.md5 +a0d55263c1ed2c03817454dd4ec4090d36dbc864 *vp90-2-03-size-198x224.webm +efa366a299817e2da51c00623b165aab9fbb8d91 *vp90-2-03-size-198x224.webm.md5 +ccd142fa2920fc85bb753f049160c1c353ad1574 *vp90-2-03-size-198x226.webm +534524a0b2dbff852e0b92ef09939db072f83243 *vp90-2-03-size-198x226.webm.md5 +0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 *vp90-2-03-size-200x196.webm +41795f548181717906e7a504ba551f06c32102ae *vp90-2-03-size-200x196.webm.md5 +f6c2dc54e0989d50f01333fe40c91661fcbf849a *vp90-2-03-size-200x198.webm +43df5d8c46a40089441392e6d096c588c1079a68 *vp90-2-03-size-200x198.webm.md5 +2f6e9df82e44fc145f0d9212dcccbed3de605e23 *vp90-2-03-size-200x200.webm +757b2ef96b82093255725bab9690bbafe27f3caf *vp90-2-03-size-200x200.webm.md5 +40c5ea60415642a4a2e75c0d127b06309baadfab *vp90-2-03-size-200x202.webm +3022c4a1c625b5dc04fdb1052d17d45b4171cfba *vp90-2-03-size-200x202.webm.md5 +6942ed5b27476bb8506d10e600d6ff60887780ca *vp90-2-03-size-200x208.webm +c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be *vp90-2-03-size-200x208.webm.md5 +71dbc99b83c49d1da45589b91eabb98e2f4a7b1e *vp90-2-03-size-200x210.webm +3f0b40da7eef7974b9bc326562f251feb67d9c7c *vp90-2-03-size-200x210.webm.md5 +6b6b8489081cfefb377cc5f18eb754ec2383f655 *vp90-2-03-size-200x224.webm +a259df2ac0e294492e3f9d4315baa34cab044f04 *vp90-2-03-size-200x224.webm.md5 +c9adc1c9bb07559349a0b054df4af56f7a6edbb9 *vp90-2-03-size-200x226.webm +714cec61e3575581e4f1a0e3921f4dfdbbd316c5 *vp90-2-03-size-200x226.webm.md5 +f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 *vp90-2-03-size-202x196.webm +5b8e2e50fcea2c43b12fc067b8a9cc117af77bda *vp90-2-03-size-202x196.webm.md5 +c7b66ea3da87613deb47ff24a111247d3c384fec *vp90-2-03-size-202x198.webm +517e91204b25586da943556f4adc5951c9be8bee *vp90-2-03-size-202x198.webm.md5 +935ef56b01cfdb4265a7e24696645209ccb20970 *vp90-2-03-size-202x200.webm +55b8ec4a2513183144a8e27564596c06c7576fce *vp90-2-03-size-202x200.webm.md5 +849acf75e4f1d8d90046704e1103a18c64f30e35 *vp90-2-03-size-202x202.webm +c79afc6660df2824e7df314e5bfd71f0d8acf76b *vp90-2-03-size-202x202.webm.md5 +17b3a4d55576b770626ccb856b9f1a6c8f6ae476 *vp90-2-03-size-202x208.webm +0b887ff30409c58f2ccdc3bfacd6be7c69f8997a *vp90-2-03-size-202x208.webm.md5 +032d0ade4230fb2eef6d19915a7a1c9aa4a52617 *vp90-2-03-size-202x210.webm +f78f8e79533c0c88dd2bfdcec9b1c07848568ece *vp90-2-03-size-202x210.webm.md5 +915a38c31fe425d5b93c837121cfa8082f5ea5bc *vp90-2-03-size-202x224.webm +bf52a104074d0c5942aa7a5b31e11db47e43d48e *vp90-2-03-size-202x224.webm.md5 +be5cfde35666fa435e47d544d9258215beb1cf29 *vp90-2-03-size-202x226.webm +2fa2f87502fda756b319389c8975204e130a2e3f *vp90-2-03-size-202x226.webm.md5 +15d908e97862b5b4bf295610df011fb9aa09909b *vp90-2-03-size-208x196.webm +50c60792305d6a99be376dd596a6ff979325e6cc *vp90-2-03-size-208x196.webm.md5 +a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 *vp90-2-03-size-208x198.webm +be85fb2c8d435a75484231356f07d06ebddd13cd *vp90-2-03-size-208x198.webm.md5 +05fd46deb7288e7253742091f56e54a9a441a187 *vp90-2-03-size-208x200.webm +74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c *vp90-2-03-size-208x200.webm.md5 +d8985c4b386513a7385a4b3639bf91e469f1378b *vp90-2-03-size-208x202.webm +0614a1e8d92048852adcf605a51333f5fabc7f03 *vp90-2-03-size-208x202.webm.md5 +28b002242238479165ba4fb87ee6b442c64b32e4 *vp90-2-03-size-208x208.webm +37de5aca59bb900228400b0e115d3229edb9dcc0 *vp90-2-03-size-208x208.webm.md5 +c545be0050c2fad7c68427dbf86c62a739e94ab3 *vp90-2-03-size-208x210.webm +d646eccb3cd578f94b54777e32b88898bef6e17a *vp90-2-03-size-208x210.webm.md5 +63a0cfe295b661026dd7b1bebb67acace1db766f *vp90-2-03-size-208x224.webm +85c0361d93bf85a335248fef2767ff43eeef23db *vp90-2-03-size-208x224.webm.md5 +f911cc718d66e4fe8a865226088939c9eb1b7825 *vp90-2-03-size-208x226.webm +a6d583a57876e7b7ec48625b2b2cdbcf70cab837 *vp90-2-03-size-208x226.webm.md5 +5bbb0f36da9a4683cf04e724124d8696332911bf *vp90-2-03-size-210x196.webm +a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d *vp90-2-03-size-210x196.webm.md5 +8db64d6f9ce36dd382013b42ae4e292deba697bc *vp90-2-03-size-210x198.webm +eda20f8268c7f4147bead4059e9c4897e09140a9 *vp90-2-03-size-210x198.webm.md5 +ce391505eeaf1d12406563101cd6b2dbbbb44bfc *vp90-2-03-size-210x200.webm +79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 *vp90-2-03-size-210x200.webm.md5 +852db6fdc206e72391fc69b807f1954934679949 *vp90-2-03-size-210x202.webm +f69414c5677ed2f2b8b37ae76429e509a92276a5 *vp90-2-03-size-210x202.webm.md5 +c424cc3edd2308da7d33f27acb36b54db5bf2595 *vp90-2-03-size-210x208.webm +27b18562faa1b3184256f4eae8114b539b3e9d3e *vp90-2-03-size-210x208.webm.md5 +dd029eba719d50a2851592fa8b9b2efe88904930 *vp90-2-03-size-210x210.webm +c853a1670465eaa04ca31b3511995f1b6ed4f58f *vp90-2-03-size-210x210.webm.md5 +d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 *vp90-2-03-size-210x224.webm +93b793e79d987065b39ad8e2e71244368435fc25 *vp90-2-03-size-210x224.webm.md5 +3d0825fe83bcc125be1f78145ff43ca6d7588784 *vp90-2-03-size-210x226.webm +5230f31a57ca3b5311698a12035d2644533b3ec4 *vp90-2-03-size-210x226.webm.md5 +6622f8bd9279e1ce45509a58a31a990052d45e14 *vp90-2-03-size-224x196.webm +65411da07f60113f2be05c807879072b161d561e *vp90-2-03-size-224x196.webm.md5 +6744ff2ee2c41eb08c62ff30880833b6d77b585b *vp90-2-03-size-224x198.webm +46ea3641d41acd4bff347b224646c060d5620385 *vp90-2-03-size-224x198.webm.md5 +8eb91f3416a1404705f370caecd74b2b458351b1 *vp90-2-03-size-224x200.webm +196aefb854c8b95b9330263d6690b7ee15693ecf *vp90-2-03-size-224x200.webm.md5 +256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 *vp90-2-03-size-224x202.webm +840ad8455dcf2be378c14b007e66fa642fc8196d *vp90-2-03-size-224x202.webm.md5 +db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 *vp90-2-03-size-224x208.webm +40b9801d5620467499ac70fa6b7c40aaa5e1c331 *vp90-2-03-size-224x208.webm.md5 +e37159e687fe1cb24cffddfae059301adbaf4212 *vp90-2-03-size-224x210.webm +1e4acd4b6334ae260c3eed08652d0ba8122073f2 *vp90-2-03-size-224x210.webm.md5 +0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 *vp90-2-03-size-224x224.webm +37db449ad86fb286c2c02d94aa8fe0379c05044a *vp90-2-03-size-224x224.webm.md5 +32ebbf903a7d7881bcfe59639f1d472371f3bf27 *vp90-2-03-size-224x226.webm +5cc3ac5dc9f6912491aa2ddac863f8187f34c569 *vp90-2-03-size-224x226.webm.md5 +9480ff5c2c32b1870ac760c87514912616e6cf01 *vp90-2-03-size-226x196.webm +fe83655c0f1888f0af7b047785f01ba7ca9f1324 *vp90-2-03-size-226x196.webm.md5 +09cad4221996315cdddad4e502dbfabf53ca1d6a *vp90-2-03-size-226x198.webm +e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 *vp90-2-03-size-226x198.webm.md5 +c34f49d55fe39e3f0b607e3cc95e30244225cecb *vp90-2-03-size-226x200.webm +abb83edc868a3523ccd4e5523fac2efbe7c3df1f *vp90-2-03-size-226x200.webm.md5 +d17bc08eedfc60c4c23d576a6c964a21bf854d1f *vp90-2-03-size-226x202.webm +1d22d2d0f375251c2d5a1acb4714bc35d963865b *vp90-2-03-size-226x202.webm.md5 +9bd537c4f92a25596ccd29fedfe181feac948b92 *vp90-2-03-size-226x208.webm +6feb0e7325386275719f3511ada9e248a2ae7df4 *vp90-2-03-size-226x208.webm.md5 +4487067f6cedd495b93696b44b37fe0a3e7eda14 *vp90-2-03-size-226x210.webm +49a8fa87945f47208168d541c068e78d878075d5 *vp90-2-03-size-226x210.webm.md5 +559fea2f8da42b33c1aa1dbc34d1d6781009847a *vp90-2-03-size-226x224.webm +83c6d8f2969b759e10e5c6542baca1265c874c29 *vp90-2-03-size-226x224.webm.md5 +fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce *vp90-2-03-size-226x226.webm +94ad19b8b699cea105e2ff18f0df2afd7242bcf7 *vp90-2-03-size-226x226.webm.md5 +52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-03-size-352x288.webm +3084d6d0a1eec22e85a394422fbc8faae58930a5 *vp90-2-03-size-352x288.webm.md5 +b6524e4084d15b5d0caaa3d3d1368db30cbee69c *vp90-2-03-deltaq.webm +65f45ec9a55537aac76104818278e0978f94a678 *vp90-2-03-deltaq.webm.md5 +4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba *vp90-2-05-resize.ivf +7f6d8879336239a43dbb6c9f13178cb11cf7ed09 *vp90-2-05-resize.ivf.md5 +bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe *vp90-2-06-bilinear.webm +f6235f937552e11d8eb331ec55da6b3aa596b9ac *vp90-2-06-bilinear.webm.md5 +0c83a1e414fde3bccd6dc451bbaee68e59974c76 *vp90-2-07-frame_parallel.webm +e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd *vp90-2-07-frame_parallel.webm.md5 +086c7edcffd699ae7d99d710fd7e53b18910ca5b *vp90-2-08-tile_1x2_frame_parallel.webm +e981ecaabb29a80e0cbc1f4002384965ce8e95bb *vp90-2-08-tile_1x2_frame_parallel.webm.md5 +ed79be026a6f28646c5825da1c12d1fbc70f96a4 *vp90-2-08-tile_1x2.webm +45b404e025841c9750895fc1a9f6bd384fe6a315 *vp90-2-08-tile_1x2.webm.md5 +cf8ea970c776797aae71dac8317ea926d9431cab *vp90-2-08-tile_1x4_frame_parallel.webm +a481fbea465010b57af5a19ebf6d4a5cfe5b9278 *vp90-2-08-tile_1x4_frame_parallel.webm.md5 +0203ec456277a01aec401e7fb6c72c9a7e5e3f9d *vp90-2-08-tile_1x4.webm +c9b237dfcc01c1b414fbcaa481d014a906ef7998 *vp90-2-08-tile_1x4.webm.md5 +20c75157e91ab41f82f70ffa73d5d01df8469287 *vp90-2-08-tile-4x4.webm +ae7451810247fd13975cc257aa0301ff17102255 *vp90-2-08-tile-4x4.webm.md5 +2ec6e15422ac7a61af072dc5f27fcaf1942ce116 *vp90-2-08-tile-4x1.webm +0094f5ee5e46345017c30e0aa4835b550212d853 *vp90-2-08-tile-4x1.webm.md5 +edea45dac4a3c2e5372339f8851d24c9bef803d6 *vp90-2-09-subpixel-00.ivf +5428efc4bf92191faedf4a727fcd1d94966a7abc *vp90-2-09-subpixel-00.ivf.md5 +8cdd435d89029987ee196896e21520e5f879f04d *vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm +091b373aa2ecb59aa5c647affd5bcafcc7547364 *vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm +87ee28032b0963a44b73a850fcc816a6dc83efbb *vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm +c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f *vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm +2064bdb22aa71c2691e0469fb62e8087a43f08f8 *vp90-2-bbb_426x240_tile_1x1_180kbps.webm +8080eda22694910162f0996e8a962612f381a57f *vp90-2-bbb_640x360_tile_1x2_337kbps.webm +a484b335c27ea189c0f0d77babea4a510ce12d50 *vp90-2-bbb_854x480_tile_1x2_651kbps.webm +3eacf1f006250be4cc5c92a7ef146e385ee62653 *vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm +217f089a16447490823127b36ce0d945522accfd *vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm +eedb3c641e60dacbe082491a16df529a5c9187df *vp90-2-sintel_426x182_tile_1x1_171kbps.webm +cb7e4955af183dff33bcba0c837f0922ab066400 *vp90-2-sintel_640x272_tile_1x2_318kbps.webm +48613f9380e2580002f8a09d6e412ea4e89a52b9 *vp90-2-sintel_854x364_tile_1x2_621kbps.webm +990a91f24dd284562d21d714ae773dff5452cad8 *vp90-2-tos_1280x534_tile_1x4_1306kbps.webm +aa402217577a659cfc670157735b4b8e9aa670fe *vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm +b6dd558c90bca466b4bcbd03b3371648186465a7 *vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm +1a9c2914ba932a38f0a143efc1ad0e318e78888b *vp90-2-tos_426x178_tile_1x1_181kbps.webm +a3d2b09f24debad4747a1b3066f572be4273bced *vp90-2-tos_640x266_tile_1x2_336kbps.webm +c64b03b5c090e6888cb39685c31f00a6b79fa45c *vp90-2-tos_854x356_tile_1x2_656kbps.webm +94b533dbcf94292001e27cc51fec87f9e8c90c0b *vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm +0e7cd4135b231c9cea8d76c19f9e84b6fd77acec *vp90-2-08-tile_1x8_frame_parallel.webm +c9b6850af28579b031791066457f4cb40df6e1c7 *vp90-2-08-tile_1x8_frame_parallel.webm.md5 +e448b6e83490bca0f8d58b4f4b1126a17baf4b0c *vp90-2-08-tile_1x8.webm +5e524165f0397e6141d914f4f0a66267d7658376 *vp90-2-08-tile_1x8.webm.md5 +a34e14923d6d17b1144254d8187d7f85b700a63c *vp90-2-02-size-lf-1920x1080.webm +e3b28ddcfaeb37fb4d132b93f92642a9ad17c22d *vp90-2-02-size-lf-1920x1080.webm.md5 +d48c5db1b0f8e60521a7c749696b8067886033a3 *vp90-2-09-aq2.webm +84c1599298aac78f2fc05ae2274575d10569dfa0 *vp90-2-09-aq2.webm.md5 +55fc55ed73d578ed60fad05692579873f8bad758 *vp90-2-09-lf_deltas.webm +54638c38009198c38c8f3b25c182b709b6c1fd2e *vp90-2-09-lf_deltas.webm.md5 +510d95f3beb3b51c572611fdaeeece12277dac30 *vp90-2-10-show-existing-frame.webm +14d631096f4bfa2d71f7f739aec1448fb3c33bad *vp90-2-10-show-existing-frame.webm.md5 +d2feea7728e8d2c615981d0f47427a4a5a45d881 *vp90-2-10-show-existing-frame2.webm +5f7c7811baa3e4f03be1dd78c33971b727846821 *vp90-2-10-show-existing-frame2.webm.md5 +b4318e75f73a6a08992c7326de2fb589c2a794c7 *vp90-2-11-size-351x287.webm +b3c48382cf7d0454e83a02497c229d27720f9e20 *vp90-2-11-size-351x287.webm.md5 +8e0096475ea2535bac71d3e2fc09e0c451c444df *vp90-2-11-size-351x288.webm +19e003804ec1dfc5464813b32339a15d5ba7b42f *vp90-2-11-size-351x288.webm.md5 +40cd1d6a188d7a88b21ebac1e573d3f270ab261e *vp90-2-11-size-352x287.webm +68f515abe3858fc1eded46c8e6b2f727d43b5331 *vp90-2-11-size-352x287.webm.md5 +9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_1.ivf +952eaac6eefa6f62179ed1db3e922fd42fecc624 *vp90-2-12-droppable_1.ivf.md5 +9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_2.ivf +92a756469fa438220524e7fa6ac1d38c89514d17 *vp90-2-12-droppable_2.ivf.md5 +c21e97e4ba486520118d78b01a5cb6e6dc33e190 *vp90-2-12-droppable_3.ivf +601abc9e4176c70f82ac0381365e9b151fdd24cd *vp90-2-12-droppable_3.ivf.md5 +61c640dad23cd4f7ad811b867e7b7e3521f4e3ba *vp90-2-13-largescaling.webm +bca1b02eebdb088fa3f389fe0e7571e75a71f523 *vp90-2-13-largescaling.webm.md5 +c740708fa390806eebaf669909c1285ab464f886 *vp90-2-14-resize-fp-tiles-1-2.webm +c7b85ffd8e11500f73f52e7dc5a47f57c393d47f *vp90-2-14-resize-fp-tiles-1-2.webm.md5 +ec8faa352a08f7033c60f29f80d505e2d7daa103 *vp90-2-14-resize-fp-tiles-1-4.webm +6852c783fb421bda5ded3d4c5a3ffc46de03fbc1 *vp90-2-14-resize-fp-tiles-1-4.webm.md5 +8af61853ac0d07c4cb5bf7c2016661ba350b3497 *vp90-2-14-resize-fp-tiles-1-8.webm +571353bac89fea60b5706073409aa3c0d42aefe9 *vp90-2-14-resize-fp-tiles-1-8.webm.md5 +b1c187ed69931496b82ec194017a79831bafceef *vp90-2-14-resize-fp-tiles-1-16.webm +1c199a41afe42ce303944d70089eaaa2263b4a09 *vp90-2-14-resize-fp-tiles-1-16.webm.md5 +8eaae5a6f2dff934610b0c7a917d7f583ba74aa5 *vp90-2-14-resize-fp-tiles-2-1.webm +db18fcf915f7ffaea6c39feab8bda6c1688af011 *vp90-2-14-resize-fp-tiles-2-1.webm.md5 +bc3046d138941e2a20e9ceec0ff6d25c25d12af3 *vp90-2-14-resize-fp-tiles-4-1.webm +393211b808030d09a79927b17a4374b2f68a60ae *vp90-2-14-resize-fp-tiles-4-1.webm.md5 +6e8f8e31721a0f7f68a2964e36e0e698c2e276b1 *vp90-2-14-resize-fp-tiles-8-1.webm +491fd3cd78fb0577bfe905bb64bbf64bd7d29140 *vp90-2-14-resize-fp-tiles-8-1.webm.md5 +cc5958da2a7edf739cd2cfeb18bd05e77903087e *vp90-2-14-resize-fp-tiles-16-1.webm +0b58daf55aaf9063bf5b4fb33393d18b417dc428 *vp90-2-14-resize-fp-tiles-16-1.webm.md5 +821eeecc9d8c6a316134dd42d1ff057787d8047b *vp90-2-14-resize-fp-tiles-2-4.webm +374c549f2839a3d0b732c4e3650700144037e76c *vp90-2-14-resize-fp-tiles-2-4.webm.md5 +dff8c8e49aacea9f4c7f22cb882da984e2a1b405 *vp90-2-14-resize-fp-tiles-2-8.webm +e5b8820a7c823b21297d6e889e57ec401882c210 *vp90-2-14-resize-fp-tiles-2-8.webm.md5 +77629e4b23e32896aadf6e994c78bd4ffa1c7797 *vp90-2-14-resize-fp-tiles-2-16.webm +1937f5df032664ac345d4613ad4417b4967b1230 *vp90-2-14-resize-fp-tiles-2-16.webm.md5 +380ba5702bb1ec7947697314ab0300b5c56a1665 *vp90-2-14-resize-fp-tiles-4-2.webm +fde7b30d2aa64c1e851a4852f655d79fc542cf66 *vp90-2-14-resize-fp-tiles-4-2.webm.md5 +dc784b258ffa2abc2ae693d11792acf0bb9cb74f *vp90-2-14-resize-fp-tiles-8-2.webm +edf26f0130aeee8342d49c2c8f0793ad008782d9 *vp90-2-14-resize-fp-tiles-8-2.webm.md5 +8e575789fd63ebf69e8eff1b9a4351a249a73bee *vp90-2-14-resize-fp-tiles-16-2.webm +b6415318c1c589a1f64b9d569ce3cabbec2e0d52 *vp90-2-14-resize-fp-tiles-16-2.webm.md5 +e3adc944a11c4c5517e63664c84ebb0847b64d81 *vp90-2-14-resize-fp-tiles-4-8.webm +03cba0532bc90a05b1990db830bf5701e24e7982 *vp90-2-14-resize-fp-tiles-4-8.webm.md5 +3b27a991eb6d78dce38efab35b7db682e8cbbee3 *vp90-2-14-resize-fp-tiles-4-16.webm +5d16b7f82bf59f802724ddfd97abb487150b1c9d *vp90-2-14-resize-fp-tiles-4-16.webm.md5 +d5fed8c28c1d4c7e232ebbd25cf758757313ed96 *vp90-2-14-resize-fp-tiles-8-4.webm +5a8ff8a52cbbde7bfab569beb6d971c5f8b904f7 *vp90-2-14-resize-fp-tiles-8-4.webm.md5 +17a5faa023d77ee9dad423a4e0d3145796bbc500 *vp90-2-14-resize-fp-tiles-16-4.webm +2ef8daa3c3e750fd745130d0a76a39fe86f0448f *vp90-2-14-resize-fp-tiles-16-4.webm.md5 +9361e031f5cc990d8740863e310abb5167ae351e *vp90-2-14-resize-fp-tiles-8-16.webm +57f13a2197486584f4e1a4f82ad969f3abc5a1a2 *vp90-2-14-resize-fp-tiles-8-16.webm.md5 +5803fc6fcbfb47b7661f3fcc6499158a32b56675 *vp90-2-14-resize-fp-tiles-16-8.webm +be0fe64a1a4933696ff92d93f9bdecdbd886dc13 *vp90-2-14-resize-fp-tiles-16-8.webm.md5 +0ac0f6d20a0afed77f742a3b9acb59fd7b9cb093 *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm +1765315acccfe6cd12230e731369fcb15325ebfa *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5 +4a2b7a683576fe8e330c7d1c4f098ff4e70a43a8 *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm +1ef480392112b3509cb190afbb96f9a38dd9fbac *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5 +e615575ded499ea1d992f3b38e3baa434509cdcd *vp90-2-15-segkey.webm +e3ab35d4316c5e81325c50f5236ceca4bc0d35df *vp90-2-15-segkey.webm.md5 +9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d *vp90-2-15-segkey_adpq.webm +8f46ba5f785d0c2170591a153e0d0d146a7c8090 *vp90-2-15-segkey_adpq.webm.md5 +698a6910a97486b833073ef0c0b18d75dce57ee8 *vp90-2-16-intra-only.webm +5661b0168752969f055eec37b05fa9fa947dc7eb *vp90-2-16-intra-only.webm.md5 +c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa *vp90-2-17-show-existing-frame.webm +cc75f351818b9a619818f5cc77b9bc013d0c1e11 *vp90-2-17-show-existing-frame.webm.md5 +013708bd043f0821a3e56fb8404d82e7a0c7af6c *vp91-2-04-yuv422.webm +1e58a7d23adad830a672f1733c9d2ae17890d59c *vp91-2-04-yuv422.webm.md5 +25d78f28948789d159a9453ebc13048b818251b1 *vp91-2-04-yuv440.webm +81b3870b27a7f695ef6a43e87ab04bbdb5aee2f5 *vp91-2-04-yuv440.webm.md5 +0321d507ce62dedc8a51b4e9011f7a19aed9c3dc *vp91-2-04-yuv444.webm +367e423dd41fdb49aa028574a2cfec5c2f325c5c *vp91-2-04-yuv444.webm.md5 +f77673b566f686853adefe0c578ad251b7241281 *vp92-2-20-10bit-yuv420.webm +abdedfaddacbbe1a15ac7a54e86360f03629fb7a *vp92-2-20-10bit-yuv420.webm.md5 +0c2c355a1b17b28537c5a3b19997c8783b69f1af *vp92-2-20-12bit-yuv420.webm +afb2c2798703e039189b0a15c8ac5685aa51d33f *vp92-2-20-12bit-yuv420.webm.md5 +0d661bc6e83da33238981481efd1b1802d323d88 *vp93-2-20-10bit-yuv422.webm +10318907063db22eb02fad332556edbbecd443cc *vp93-2-20-10bit-yuv422.webm.md5 +ebc6be2f7511a0bdeac0b18c67f84ba7168839c7 *vp93-2-20-12bit-yuv422.webm +235232267c6a1dc8a11e45d600f1c99d2f8b42d4 *vp93-2-20-12bit-yuv422.webm.md5 +f76b11b26d4beaceac7a7e7729dd5054d095164f *vp93-2-20-10bit-yuv440.webm +757b33b5ac969c5999999488a731a3d1e6d9fb88 *vp93-2-20-10bit-yuv440.webm.md5 +df8807dbd29bec795c2db9c3c18e511fbb988101 *vp93-2-20-12bit-yuv440.webm +ea4100930c3f59a1c23fbb33ab0ea01151cae159 *vp93-2-20-12bit-yuv440.webm.md5 +189c1b5f404ff41a50a7fc96341085ad541314a9 *vp93-2-20-10bit-yuv444.webm +2dd0177c2f9d970b6e698892634c653630f91f40 *vp93-2-20-10bit-yuv444.webm.md5 +bd44cf6e1c27343e3639df9ac21346aedd5d6973 *vp93-2-20-12bit-yuv444.webm +f36e5bdf5ec3213f32c0ddc82f95d82c5133bf27 *vp93-2-20-12bit-yuv444.webm.md5 +eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv +89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv +33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv +8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv +70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv +8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv +edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv +9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv +e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv +c77e4a26616add298a05dd5d12397be22c0e40c5 *vp90-2-18-resize.ivf +c12918cf0a716417fba2de35c3fc5ab90e52dfce *vp90-2-18-resize.ivf.md5 +717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m +b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf +0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res +359e138dfb66863828397b77000ea7a83c844d02 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf +bbd33de01c17b165b4ce00308e8a19a942023ab8 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res +fac89b5735be8a86b0dc05159f996a5c3208ae32 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf +0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res +4506dfdcdf8ee4250924b075a0dcf1f070f72e5a *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf +bcdedaf168ac225575468fda77502d2dc9fd5baa *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res +65e93f9653bcf65b022f7d225268d1a90a76e7bb *vp90-2-19-skip.webm +368dccdde5288c13c25695d2eacdc7402cadf613 *vp90-2-19-skip.webm.md5 +ffe460282df2b0e7d4603c2158653ad96f574b02 *vp90-2-19-skip-01.webm +bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 *vp90-2-19-skip-01.webm.md5 +178f5bd239e38cc1cc2657a7a5e1a9f52ad2d3fe *vp90-2-19-skip-02.webm +9020d5e260bd7df08e2b3d4b86f8623cee3daea2 *vp90-2-19-skip-02.webm.md5 +b03c408cf23158638da18dbc3323b99a1635c68a *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf +0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res +5e67e24e7f53fd189e565513cef8519b1bd6c712 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf +741158f67c0d9d23726624d06bdc482ad368afc9 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res +8b1f7bf7e86c0976d277f60e8fcd9539e75a079a *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf +9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109 *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res +552e372e9b78127389fb06b34545df2cec15ba6d *invalid-vp91-2-mixedrefcsp-444to420.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp91-2-mixedrefcsp-444to420.ivf.res +812d05a64a0d83c1b504d0519927ddc5a2cdb273 *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf +1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +f97088c7359fc8d3d5aa5eafe57bc7308b3ee124 *vp90-2-20-big_superframe-01.webm +47d7d409785afa33b123376de0c907336e6c7bd7 *vp90-2-20-big_superframe-01.webm.md5 +65ade6d2786209582c50d34cfe22b3cdb033abaf *vp90-2-20-big_superframe-02.webm +7c0ed8d04c4d06c5411dd2e5de2411d37f092db5 *vp90-2-20-big_superframe-02.webm.md5 +667ec8718c982aef6be07eb94f083c2efb9d2d16 *vp90-2-07-frame_parallel-1.webm +bfc82bf848e9c05020d61e3ffc1e62f25df81d19 *vp90-2-07-frame_parallel-1.webm.md5 +efd5a51d175cfdacd169ed23477729dc558030dc *invalid-vp90-2-07-frame_parallel-1.webm +9f912712ec418be69adb910e2ca886a63c4cec08 *invalid-vp90-2-07-frame_parallel-2.webm +445f5a53ca9555341852997ccdd480a51540bd14 *invalid-vp90-2-07-frame_parallel-3.webm +d18c90709a0d03c82beadf10898b27d88fff719c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf +d06285d109ecbaef63b0cbcc44d70a129186f51c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res +e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf +0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res +9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m +5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m +85771f6ab44e4a0226e206c0cde8351dd5918953 *vp90-2-02-size-130x132.webm +512dad5eabbed37b4bbbc64ce153f1a5484427b8 *vp90-2-02-size-130x132.webm.md5 +01f7127d40360289db63b27f61cb9afcda350e95 *vp90-2-02-size-132x130.webm +4a94275328ae076cf60f966c097a8721010fbf5a *vp90-2-02-size-132x130.webm.md5 +f41c0400b5716b4b70552c40dd03d44be131e1cc *vp90-2-02-size-132x132.webm +1a69e989f697e424bfe3e3e8a77bb0c0992c8e47 *vp90-2-02-size-132x132.webm.md5 +94a5cbfacacba100e0c5f7861c72a1b417feca0f *vp90-2-02-size-178x180.webm +dedfecf1d784bcf70629592fa5e6f01d5441ccc9 *vp90-2-02-size-178x180.webm.md5 +4828b62478c04014bba3095a83106911a71cf387 *vp90-2-02-size-180x178.webm +423da2b861050c969d78ed8e8f8f14045d1d8199 *vp90-2-02-size-180x178.webm.md5 +338f7c9282f43e29940f5391118aadd17e4f9234 *vp90-2-02-size-180x180.webm +6c2ef013392310778dca5dd5351160eca66b0a60 *vp90-2-02-size-180x180.webm.md5 +679fa7d6807e936ff937d7b282e7dbd8ac76447e *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm +fc7267ab8fc2bf5d6c234e34ee6c078a967b4888 *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5 +9d33a137c819792209c5ce4e4e1ee5da73d574fe *vp90-2-14-resize-10frames-fp-tiles-1-2.webm +0c78a154956a8605d050bdd75e0dcc4d39c040a6 *vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5 +d6a8d8c57f66a91d23e8e7df480f9ae841e56c37 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm +e9b4e8c7b33b5fda745d340c3f47e6623ae40cf2 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5 +aa6fe043a0c4a42b49c87ebbe812d4afd9945bec *vp90-2-14-resize-10frames-fp-tiles-1-8.webm +028520578994c2d013d4c0129033d4f2ff31bbe0 *vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5 +d1d5463c9ea7b5cc5f609ddedccddf656f348d1a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm +92d5872f5bdffbed721703b7e959b4f885e3d77a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5 +677cb29de1215d97346015af5807a9b1faad54cf *vp90-2-14-resize-10frames-fp-tiles-2-4.webm +a5db19f977094ec3fd60b4f7671b3e6740225e12 *vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5 +cdd3c52ba21067efdbb2de917fe2a965bf27332e *vp90-2-14-resize-10frames-fp-tiles-2-8.webm +db17ec5d894ea8b8d0b7f32206d0dd3d46dcfa6d *vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5 +0f6093c472125d05b764d7d1965c1d56771c0ea2 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm +bc7c79e1bee07926dd970462ce6f64fc30eec3e1 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5 +c5142e2bff4091338196c8ea8bc9266e64f548bc *vp90-2-14-resize-10frames-fp-tiles-4-2.webm +22aa3dd430b69fd3d92f6561bac86deeed90486d *vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5 +ede8b1466d2f26e1b1bd9602addb9cd1017e1d8c *vp90-2-14-resize-10frames-fp-tiles-4-8.webm +508d5ebb9c0eac2a4100281a3ee052ec2fc19217 *vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5 +2b292e3392854cd1d76ae597a6f53656cf741cfa *vp90-2-14-resize-10frames-fp-tiles-8-1.webm +1c24e54fa19e94e1722f24676404444e941c3d31 *vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5 +61beda21064e09634564caa6697ab90bd53c9af7 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm +9c0657b4d9e1d0e4c9d28a90e5a8630a65519124 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5 +1758c50a11a7c92522749b4a251664705f1f0d4b *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm +4f454a06750614314ae15a44087b79016fe2db97 *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5 +3920c95ba94f1f048a731d9d9b416043b44aa4bd *vp90-2-14-resize-10frames-fp-tiles-8-4.webm +4eb347a0456d2c49a1e1d8de5aa1c51acc39887e *vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5 +4b95a74c032a473b6683d7ad5754db1b0ec378e9 *vp90-2-21-resize_inter_1280x720_5_1-2.webm +a7826dd386bedfe69d02736969bfb47fb6a40a5e *vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5 +5cfff79e82c4d69964ccb8e75b4f0c53b9295167 *vp90-2-21-resize_inter_1280x720_5_3-4.webm +a18f57db4a25e1f543a99f2ceb182e00db0ee22f *vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5 +d26db0811bf30eb4131d928669713e2485f8e833 *vp90-2-21-resize_inter_1280x720_7_1-2.webm +fd6f9f332cd5bea4c0f0d57be4297bea493cc5a1 *vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5 +5c7d73d4d268e2ba9593b31cb091fd339505c7fd *vp90-2-21-resize_inter_1280x720_7_3-4.webm +7bbb949cabc1e70dadcc74582739f63b833034e0 *vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5 +f2d2a41a60eb894aff0c5854afca15931f1445a8 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm +66d7789992613ac9d678ff905ff1059daa1b89e4 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5 +764edb75fe7dd64e73a1b4f3b4b2b1bf237a4dea *vp90-2-21-resize_inter_1920x1080_5_3-4.webm +f78bea1075983fd990e7f25d4f31438f9b5efa34 *vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5 +96496f2ade764a5de9f0c27917c7df1f120fb2ef *vp90-2-21-resize_inter_1920x1080_7_1-2.webm +2632b635135ed5ecd67fd22dec7990d29c4f4cb5 *vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5 +74889ea42001bf41428cb742ca74e65129c886dc *vp90-2-21-resize_inter_1920x1080_7_3-4.webm +d2cf3b25956415bb579d368e7098097e482dd73a *vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 +4658986a8ce36ebfcc80a1903e446eaab3985336 *vp90-2-21-resize_inter_320x180_5_1-2.webm +8a3d8cf325109ffa913cc9426c32eea8c202a09a *vp90-2-21-resize_inter_320x180_5_1-2.webm.md5 +16303aa45176520ee42c2c425247aadc1506b881 *vp90-2-21-resize_inter_320x180_5_3-4.webm +41cab1ddf7715b680a4dbce42faa9bcd72af4e5c *vp90-2-21-resize_inter_320x180_5_3-4.webm.md5 +56648adcee66dd0e5cb6ac947f5ee1b9cc8ba129 *vp90-2-21-resize_inter_320x180_7_1-2.webm +70047377787003cc03dda7b2394e6d7eaa666d9e *vp90-2-21-resize_inter_320x180_7_1-2.webm.md5 +d2ff99165488499cc55f75929f1ce5ca9c9e359b *vp90-2-21-resize_inter_320x180_7_3-4.webm +e69019e378114a4643db283b66d1a7e304761a56 *vp90-2-21-resize_inter_320x180_7_3-4.webm.md5 +4834d129bed0f4289d3a88f2ae3a1736f77621b0 *vp90-2-21-resize_inter_320x240_5_1-2.webm +a75653c53d22b623c1927fc0088da21dafef21f4 *vp90-2-21-resize_inter_320x240_5_1-2.webm.md5 +19818e1b7fd1c1e63d8873c31b0babe29dd33ba6 *vp90-2-21-resize_inter_320x240_5_3-4.webm +8d89814ff469a186312111651b16601dfbce4336 *vp90-2-21-resize_inter_320x240_5_3-4.webm.md5 +ac8057bae52498f324ce92a074d5f8207cc4a4a7 *vp90-2-21-resize_inter_320x240_7_1-2.webm +2643440898c83c08cc47bc744245af696b877c24 *vp90-2-21-resize_inter_320x240_7_1-2.webm.md5 +cf4a4cd38ac8b18c42d8c25a3daafdb39132256b *vp90-2-21-resize_inter_320x240_7_3-4.webm +70ba8ec9120b26e9b0ffa2c79b432f16cbcb50ec *vp90-2-21-resize_inter_320x240_7_3-4.webm.md5 +669f10409fe1c4a054010162ca47773ea1fdbead *vp90-2-21-resize_inter_640x360_5_1-2.webm +6355a04249004a35fb386dd1024214234f044383 *vp90-2-21-resize_inter_640x360_5_1-2.webm.md5 +c23763b950b8247c1775d1f8158d93716197676c *vp90-2-21-resize_inter_640x360_5_3-4.webm +59e6fc381e3ec3b7bdaac586334e0bc944d18fb6 *vp90-2-21-resize_inter_640x360_5_3-4.webm.md5 +71b45cbfdd068baa1f679a69e5e6f421d256a85f *vp90-2-21-resize_inter_640x360_7_1-2.webm +1416fc761b690c54a955c4cf017fa078520e8c18 *vp90-2-21-resize_inter_640x360_7_1-2.webm.md5 +6c409903279448a697e4db63bab1061784bcd8d2 *vp90-2-21-resize_inter_640x360_7_3-4.webm +60de1299793433a630b71130cf76c9f5965758e2 *vp90-2-21-resize_inter_640x360_7_3-4.webm.md5 +852b597b8af096d90c80bf0ed6ed3b336b851f19 *vp90-2-21-resize_inter_640x480_5_1-2.webm +f6856f19236ee46ed462bd0a2e7e72b9c3b9cea6 *vp90-2-21-resize_inter_640x480_5_1-2.webm.md5 +792a16c6f60043bd8dceb515f0b95b8891647858 *vp90-2-21-resize_inter_640x480_5_3-4.webm +68ffe59877e9a7863805e1c0a3ce18ce037d7c9d *vp90-2-21-resize_inter_640x480_5_3-4.webm.md5 +61e044c4759972a35ea3db8c1478a988910a4ef4 *vp90-2-21-resize_inter_640x480_7_1-2.webm +7739bfca167b1b43fea72f807f01e097b7cb98d8 *vp90-2-21-resize_inter_640x480_7_1-2.webm.md5 +7291af354b4418917eee00e3a7e366086a0b7a10 *vp90-2-21-resize_inter_640x480_7_3-4.webm +4a18b09ccb36564193f0215f599d745d95bb558c *vp90-2-21-resize_inter_640x480_7_3-4.webm.md5 +a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf +1e75aad3433c5c21c194a7b53fc393970f0a8d7f *invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf.res +235182f9a1c5c8841552510dd4288487447bfc40 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf +787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res +70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res +7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf +02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5 +6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm +e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res +894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile +f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf +eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res +c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf +f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res +bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv +094be4b80fa30bd227149ea16ab6476d549ea092 *slides_code_term_web_plot.1920_1080.yuv +518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res +ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv +8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m diff --git a/media/libvpx/libvpx/test/test.mk b/media/libvpx/libvpx/test/test.mk new file mode 100644 index 0000000000..d4521f08bf --- /dev/null +++ b/media/libvpx/libvpx/test/test.mk @@ -0,0 +1,234 @@ +LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc +LIBVPX_TEST_SRCS-yes += buffer.h +LIBVPX_TEST_SRCS-yes += clear_system_state.h +LIBVPX_TEST_SRCS-yes += codec_factory.h +LIBVPX_TEST_SRCS-yes += md5_helper.h +LIBVPX_TEST_SRCS-yes += register_state_check.h +LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h +LIBVPX_TEST_SRCS-yes += test_libvpx.cc +LIBVPX_TEST_SRCS-yes += test_vectors.cc +LIBVPX_TEST_SRCS-yes += test_vectors.h +LIBVPX_TEST_SRCS-yes += util.h +LIBVPX_TEST_SRCS-yes += video_source.h + +## +## BLACK BOX TESTS +## +## Black box tests only use the public API. +## +LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += realtime_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += resize_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h + +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc + +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h + +LIBVPX_TEST_SRCS-yes += decode_test_driver.cc +LIBVPX_TEST_SRCS-yes += decode_test_driver.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc +LIBVPX_TEST_SRCS-yes += encode_test_driver.h + +## IVF writing. +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../ivfenc.c ../ivfenc.h + +## Y4m parsing. +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_test.cc ../y4menc.c ../y4menc.h + +## WebM Parsing +ifeq ($(CONFIG_WEBM_IO), yes) +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/common/webmids.h +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(LIBWEBM_PARSER_SRCS) +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../tools_common.h +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc +$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) +endif + +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += test_vector_test.cc + +# Currently we only support decoder perf tests for vp9. Also they read from WebM +# files, so WebM IO is required. +ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER)$(CONFIG_WEBM_IO), \ + yesyesyes) +LIBVPX_TEST_SRCS-yes += decode_perf_test.cc +endif + +# encode perf tests are vp9 only +ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_VP9_ENCODER), yesyes) +LIBVPX_TEST_SRCS-yes += encode_perf_test.cc +endif + +## Multi-codec blackbox tests. +ifeq ($(findstring yes,$(CONFIG_VP8_DECODER)$(CONFIG_VP9_DECODER)), yes) +LIBVPX_TEST_SRCS-yes += invalid_file_test.cc +endif + +## +## WHITE BOX TESTS +## +## Whitebox tests invoke functions not exposed via the public API. Certain +## shared library builds don't make these functions accessible. +## +ifeq ($(CONFIG_SHARED),) + +## VP8 +ifeq ($(CONFIG_VP8),yes) + +# These tests require both the encoder and decoder to be built. +ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes) +LIBVPX_TEST_SRCS-yes += vp8_boolcoder_test.cc +LIBVPX_TEST_SRCS-yes += vp8_fragments_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += add_noise_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_SSSE3) $(HAVE_SSE4_1) $(HAVE_NEON) \ + $(HAVE_MSA) $(HAVE_MMI))) +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc + +LIBVPX_TEST_SRCS-yes += idct_test.cc +LIBVPX_TEST_SRCS-yes += predict_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h + +ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) +LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc +endif + +endif # VP8 + +## VP9 +ifeq ($(CONFIG_VP9),yes) + +# These tests require both the encoder and decoder to be built. +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes) +# IDCT test currently depends on FDCT function +LIBVPX_TEST_SRCS-yes += idct8x8_test.cc +LIBVPX_TEST_SRCS-yes += partial_idct_test.cc +LIBVPX_TEST_SRCS-yes += superframe_test.cc +LIBVPX_TEST_SRCS-yes += tile_independence_test.cc +LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc +LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc +LIBVPX_TEST_SRCS-yes += vp9_roi_test.cc +endif + +LIBVPX_TEST_SRCS-yes += convolve_test.cc +LIBVPX_TEST_SRCS-yes += lpf_test.cc +LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc + +ifeq ($(CONFIG_VP9_ENCODER),yes) +LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc +endif + +ifeq ($(CONFIG_VP9_ENCODER),yes) +LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc +endif + +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) +LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc + +ifeq ($(CONFIG_VP9_ENCODER),yes) +SIMPLE_ENCODE_TEST_SRCS-$(CONFIG_RATE_CTRL) := simple_encode_test.cc +endif + +endif # VP9 + +## Multi-codec / unconditional whitebox tests. + +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc +ifneq (, $(filter yes, $(HAVE_NEON) $(HAVE_SSE2) $(HAVE_MSA))) +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc +endif + +TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h + +RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += codec_factory.h + +endif # CONFIG_SHARED + +include $(SRC_PATH_BARE)/test/test-data.mk diff --git a/media/libvpx/libvpx/test/test_intra_pred_speed.cc b/media/libvpx/libvpx/test/test_intra_pred_speed.cc new file mode 100644 index 0000000000..4c464a262f --- /dev/null +++ b/media/libvpx/libvpx/test/test_intra_pred_speed.cc @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Test and time VPX intra-predictor functions + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/init_vpx_test.h" +#include "test/md5_helper.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" + +// ----------------------------------------------------------------------------- + +namespace { + +typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left); + +const int kBPS = 32; +const int kTotalPixels = 32 * kBPS; +const int kNumVp9IntraPredFuncs = 13; +const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = { + "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", + "H_PRED", "D45_PRED", "D135_PRED", "D117_PRED", "D153_PRED", + "D207_PRED", "D63_PRED", "TM_PRED" +}; + +template +struct IntraPredTestMem { + void Init(int block_size, int bd) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel *const above = above_mem + 16; + const int mask = (1 << bd) - 1; + for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask; + for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; + for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; + + // d45/d63 require the top row to be extended. + ASSERT_LE(block_size, kBPS); + for (int i = block_size; i < 2 * block_size; ++i) { + above[i] = above[block_size - 1]; + } + } + + DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, left[kBPS]); + DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]); +}; + +typedef IntraPredTestMem Vp9IntraPredTestMem; + +void CheckMd5Signature(const char name[], const char *const signatures[], + const void *data, size_t data_size, int elapsed_time, + int idx) { + libvpx_test::MD5 md5; + md5.Add(reinterpret_cast(data), data_size); + printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, kVp9IntraPredNames[idx], + elapsed_time, md5.Get()); + EXPECT_STREQ(signatures[idx], md5.Get()); +} + +void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs, + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9IntraPredTestMem intra_pred_test_mem; + const uint8_t *const above = intra_pred_test_mem.above_mem + 16; + + intra_pred_test_mem.Init(block_size, 8); + + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { + if (pred_funcs[k] == nullptr) continue; + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); + } +} + +void TestIntraPred4(VpxPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9", + "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773", + "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e", + "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7", + "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6", + "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda", + "52ae4e8bdbe41494c1f43051d4dd7f0b" + }; + TestIntraPred("Intra4", pred_funcs, kSignatures, 4); +} + +void TestIntraPred8(VpxPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27", + "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce", + "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6", + "4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674", + "209ea00801584829e9a0f7be7d4a74ba" + }; + TestIntraPred("Intra8", pred_funcs, kSignatures, 8); +} + +void TestIntraPred16(VpxPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084", + "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183", + "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88", + "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93", + "ed308f18614a362917f411c218aee532" + }; + TestIntraPred("Intra16", pred_funcs, kSignatures, 16); +} + +void TestIntraPred32(VpxPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657", + "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f", + "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89", + "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7", + "a22d102bcb51ca798aac12ca4ae8f2e8" + }; + TestIntraPred("Intra32", pred_funcs, kSignatures, 32); +} + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4. +#define INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, v, h, \ + d45, d135, d117, d153, d207, d63, tm) \ + TEST(arch, test_func) { \ + static const VpxPredFunc vpx_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \ + }; \ + test_func(vpx_intra_pred); \ + } + +// ----------------------------------------------------------------------------- + +INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, + vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c, + vpx_dc_128_predictor_4x4_c, vpx_v_predictor_4x4_c, + vpx_h_predictor_4x4_c, vpx_d45_predictor_4x4_c, + vpx_d135_predictor_4x4_c, vpx_d117_predictor_4x4_c, + vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c, + vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c) + +INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, + vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c, + vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c, + vpx_h_predictor_8x8_c, vpx_d45_predictor_8x8_c, + vpx_d135_predictor_8x8_c, vpx_d117_predictor_8x8_c, + vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c, + vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c) + +INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c, + vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c, + vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c, + vpx_h_predictor_16x16_c, vpx_d45_predictor_16x16_c, + vpx_d135_predictor_16x16_c, vpx_d117_predictor_16x16_c, + vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c, + vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c) + +INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, + vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c, + vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c, + vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c, + vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c, + vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c, + vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, + vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, + vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, nullptr, + nullptr, nullptr, vpx_d207_predictor_4x4_sse2, nullptr, + vpx_tm_predictor_4x4_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, + vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, + vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, nullptr, + nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, + vpx_dc_left_predictor_16x16_sse2, + vpx_dc_top_predictor_16x16_sse2, + vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, + vpx_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, + vpx_dc_left_predictor_32x32_sse2, + vpx_dc_top_predictor_32x32_sse2, + vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2, + vpx_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_sse2) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_4x4_ssse3, nullptr, + vpx_d63_predictor_4x4_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred8, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3, + vpx_d63_predictor_8x8_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred16, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_16x16_ssse3, nullptr, + nullptr, vpx_d153_predictor_16x16_ssse3, + vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3, + nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred32, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_32x32_ssse3, nullptr, + nullptr, vpx_d153_predictor_32x32_ssse3, + vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3, + nullptr) +#endif // HAVE_SSSE3 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_4x4_dspr2) +INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_8x8_c) +INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, + vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, + vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, + vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) +INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, + vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, + vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, + vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) +INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, + vpx_dc_left_predictor_16x16_neon, + vpx_dc_top_predictor_16x16_neon, + vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, + vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) +INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, + vpx_dc_left_predictor_32x32_neon, + vpx_dc_top_predictor_32x32_neon, + vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, + vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon, + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, + vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, + vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, + vpx_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_4x4_msa) +INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, + vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, + vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, + vpx_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_8x8_msa) +INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, + vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, + vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, + vpx_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_msa) +INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, + vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa, + vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa, + vpx_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_msa) +#endif // HAVE_MSA + +#if HAVE_VSX +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +INTRA_PRED_TEST(VSX, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, vpx_h_predictor_4x4_vsx, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, nullptr, nullptr, + nullptr, nullptr, vpx_h_predictor_8x8_vsx, + vpx_d45_predictor_8x8_vsx, nullptr, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_vsx, vpx_tm_predictor_8x8_vsx) +#endif + +INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx, + vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx, + vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx, + vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_16x16_vsx, + vpx_tm_predictor_16x16_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, + vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx, + vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx, + vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_32x32_vsx, + vpx_tm_predictor_32x32_vsx) +#endif // HAVE_VSX + +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + +// ----------------------------------------------------------------------------- + +#if CONFIG_VP9_HIGHBITDEPTH +namespace { + +typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride, + const uint16_t *above, const uint16_t *left, + int bd); + +typedef IntraPredTestMem Vp9HighbdIntraPredTestMem; + +void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9HighbdIntraPredTestMem intra_pred_test_mem; + const uint16_t *const above = intra_pred_test_mem.above_mem + 16; + + intra_pred_test_mem.Init(block_size, 12); + + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { + if (pred_funcs[k] == nullptr) continue; + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left, 12); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); + } +} + +void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71", + "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f", + "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b", + "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f", + "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d", + "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71", + "23d61df1574d0fa308f9731811047c4b" + }; + TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4); +} + +void TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7", + "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2", + "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba", + "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba", + "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4", + "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311", + "7b5dbbbe7ae3a4ac2948731600bde5d6" + }; + TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8); +} + +void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d", + "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9", + "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8", + "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0", + "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1", + "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb", + "e2fb05b01eb8b88549e85641d8ce5b59" + }; + TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16); +} + +void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e", + "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661", + "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3", + "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8", + "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300", + "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4", + "cb8aab6d372278f3131e8d99efde02d9" + }; + TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32); +} + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4. +#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, \ + v, h, d45, d135, d117, d153, d207, d63, tm) \ + TEST(arch, test_func) { \ + static const VpxHighbdPredFunc vpx_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \ + }; \ + test_func(vpx_intra_pred); \ + } + +// ----------------------------------------------------------------------------- + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c, + vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c, + vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c, + vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c, + vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c, + vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c, + vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c, + vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c, + vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c, + vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c, + vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c, + vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c, + vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c, + vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c, + vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c, + vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c, + vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c, + vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c, + vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_c, + vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c, + vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c, + vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c, + vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c, + vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c, + vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, nullptr, + vpx_highbd_d135_predictor_4x4_sse2, vpx_highbd_d117_predictor_4x4_sse2, + vpx_highbd_d153_predictor_4x4_sse2, vpx_highbd_d207_predictor_4x4_sse2, + vpx_highbd_d63_predictor_4x4_sse2, vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_highbd_tm_predictor_8x8_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + vpx_highbd_h_predictor_16x16_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + vpx_highbd_tm_predictor_16x16_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + vpx_highbd_tm_predictor_32x32_sse2) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_4x4_ssse3, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, nullptr) +#endif // HAVE_SSSE3 + +#if HAVE_NEON +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, + vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, + vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, + vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, + vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, + vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, + vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/test_libvpx.cc b/media/libvpx/libvpx/test/test_libvpx.cc new file mode 100644 index 0000000000..c1798b8b8b --- /dev/null +++ b/media/libvpx/libvpx/test/test_libvpx.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/init_vpx_test.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/test_rc_interface.cc b/media/libvpx/libvpx/test/test_rc_interface.cc new file mode 100644 index 0000000000..ec75700f73 --- /dev/null +++ b/media/libvpx/libvpx/test/test_rc_interface.cc @@ -0,0 +1,6 @@ +#include "third_party/googletest/src/include/gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/test_vector_test.cc b/media/libvpx/libvpx/test/test_vector_test.cc new file mode 100644 index 0000000000..ee552113ce --- /dev/null +++ b/media/libvpx/libvpx/test/test_vector_test.cc @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "../tools_common.h" +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/test_vectors.h" +#include "test/util.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif +#include "vpx_mem/vpx_mem.h" + +namespace { + +const int kThreads = 0; +const int kMtMode = 1; +const int kFileName = 2; + +typedef std::tuple DecodeParam; + +class TestVectorTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) { +#if CONFIG_VP9_DECODER + resize_clips_.insert(::libvpx_test::kVP9TestVectorsResize, + ::libvpx_test::kVP9TestVectorsResize + + ::libvpx_test::kNumVP9TestVectorsResize); +#endif + } + + ~TestVectorTest() override { + if (md5_file_) fclose(md5_file_); + } + + void OpenMD5File(const std::string &md5_file_name_) { + md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); + ASSERT_NE(md5_file_, nullptr) + << "Md5 file open failed. Filename: " << md5_file_name_; + } + +#if CONFIG_VP9_DECODER + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + if (video.frame_number() == 0 && mt_mode_ >= 0) { + if (mt_mode_ == 1) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); + decoder->Control(VP9D_SET_ROW_MT, 0); + } else if (mt_mode_ == 2) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0); + decoder->Control(VP9D_SET_ROW_MT, 1); + } else { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0); + decoder->Control(VP9D_SET_ROW_MT, 0); + } + } + } +#endif + + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { + ASSERT_NE(md5_file_, nullptr); + char expected_md5[33]; + char junk[128]; + + // Read correct md5 checksums. + const int res = fscanf(md5_file_, "%s %s", expected_md5, junk); + ASSERT_NE(res, EOF) << "Read md5 data failed"; + expected_md5[32] = '\0'; + + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + const char *actual_md5 = md5_res.Get(); + + // Check md5 match. + ASSERT_STREQ(expected_md5, actual_md5) + << "Md5 checksums don't match: frame number = " << frame_number; + } + +#if CONFIG_VP9_DECODER + std::set resize_clips_; +#endif + int mt_mode_; + + private: + FILE *md5_file_; +}; + +// This test runs through the whole set of test vectors, and decodes them. +// The md5 checksums are computed for each frame in the video file. If md5 +// checksums match the correct md5 data, then the test is passed. Otherwise, +// the test failed. +TEST_P(TestVectorTest, MD5Match) { + const DecodeParam input = GET_PARAM(1); + const std::string filename = std::get(input); + vpx_codec_flags_t flags = 0; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + char str[256]; + + cfg.threads = std::get(input); + mt_mode_ = std::get(input); + snprintf(str, sizeof(str) / sizeof(str[0]) - 1, + "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads, + mt_mode_); + SCOPED_TRACE(str); + + // Open compressed video file. + std::unique_ptr video; + if (filename.substr(filename.length() - 3, 3) == "ivf") { + video.reset(new libvpx_test::IVFVideoSource(filename)); + } else if (filename.substr(filename.length() - 4, 4) == "webm") { +#if CONFIG_WEBM_IO + video.reset(new libvpx_test::WebMVideoSource(filename)); +#else + fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n", + filename.c_str()); + return; +#endif + } + ASSERT_NE(video.get(), nullptr); + video->Init(); + + // Construct md5 file name. + const std::string md5_filename = filename + ".md5"; + OpenMD5File(md5_filename); + + // Set decode config and flags. + set_cfg(cfg); + set_flags(flags); + + // Decode frame, and check the md5 matching. + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); +} + +#if CONFIG_VP8_DECODER +VP8_INSTANTIATE_TEST_SUITE( + TestVectorTest, + ::testing::Combine( + ::testing::Values(1), // Single thread. + ::testing::Values(-1), // LPF opt and Row MT is not applicable + ::testing::ValuesIn(libvpx_test::kVP8TestVectors, + libvpx_test::kVP8TestVectors + + libvpx_test::kNumVP8TestVectors))); + +// Test VP8 decode in with different numbers of threads. +INSTANTIATE_TEST_SUITE_P( + VP8MultiThreaded, TestVectorTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP8)), + ::testing::Combine( + ::testing::Range(2, 9), // With 2 ~ 8 threads. + ::testing::Values(-1), // LPF opt and Row MT is not applicable + ::testing::ValuesIn(libvpx_test::kVP8TestVectors, + libvpx_test::kVP8TestVectors + + libvpx_test::kNumVP8TestVectors)))); + +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER +VP9_INSTANTIATE_TEST_SUITE( + TestVectorTest, + ::testing::Combine( + ::testing::Values(1), // Single thread. + ::testing::Values(-1), // LPF opt and Row MT is not applicable + ::testing::ValuesIn(libvpx_test::kVP9TestVectors, + libvpx_test::kVP9TestVectors + + libvpx_test::kNumVP9TestVectors))); + +INSTANTIATE_TEST_SUITE_P( + VP9MultiThreaded, TestVectorTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Combine( + ::testing::Range(2, 9), // With 2 ~ 8 threads. + ::testing::Range(0, 3), // With multi threads modes 0 ~ 2 + // 0: LPF opt and Row MT disabled + // 1: LPF opt enabled + // 2: Row MT enabled + ::testing::ValuesIn(libvpx_test::kVP9TestVectors, + libvpx_test::kVP9TestVectors + + libvpx_test::kNumVP9TestVectors)))); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/test_vectors.cc b/media/libvpx/libvpx/test/test_vectors.cc new file mode 100644 index 0000000000..3ffc3efc41 --- /dev/null +++ b/media/libvpx/libvpx/test/test_vectors.cc @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/test_vectors.h" + +namespace libvpx_test { + +#define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) + +#if CONFIG_VP8_DECODER +const char *const kVP8TestVectors[] = { + "vp80-00-comprehensive-001.ivf", "vp80-00-comprehensive-002.ivf", + "vp80-00-comprehensive-003.ivf", "vp80-00-comprehensive-004.ivf", + "vp80-00-comprehensive-005.ivf", "vp80-00-comprehensive-006.ivf", + "vp80-00-comprehensive-007.ivf", "vp80-00-comprehensive-008.ivf", + "vp80-00-comprehensive-009.ivf", "vp80-00-comprehensive-010.ivf", + "vp80-00-comprehensive-011.ivf", "vp80-00-comprehensive-012.ivf", + "vp80-00-comprehensive-013.ivf", "vp80-00-comprehensive-014.ivf", + "vp80-00-comprehensive-015.ivf", "vp80-00-comprehensive-016.ivf", + "vp80-00-comprehensive-017.ivf", "vp80-00-comprehensive-018.ivf", + "vp80-01-intra-1400.ivf", "vp80-01-intra-1411.ivf", + "vp80-01-intra-1416.ivf", "vp80-01-intra-1417.ivf", + "vp80-02-inter-1402.ivf", "vp80-02-inter-1412.ivf", + "vp80-02-inter-1418.ivf", "vp80-02-inter-1424.ivf", + "vp80-03-segmentation-01.ivf", "vp80-03-segmentation-02.ivf", + "vp80-03-segmentation-03.ivf", "vp80-03-segmentation-04.ivf", + "vp80-03-segmentation-1401.ivf", "vp80-03-segmentation-1403.ivf", + "vp80-03-segmentation-1407.ivf", "vp80-03-segmentation-1408.ivf", + "vp80-03-segmentation-1409.ivf", "vp80-03-segmentation-1410.ivf", + "vp80-03-segmentation-1413.ivf", "vp80-03-segmentation-1414.ivf", + "vp80-03-segmentation-1415.ivf", "vp80-03-segmentation-1425.ivf", + "vp80-03-segmentation-1426.ivf", "vp80-03-segmentation-1427.ivf", + "vp80-03-segmentation-1432.ivf", "vp80-03-segmentation-1435.ivf", + "vp80-03-segmentation-1436.ivf", "vp80-03-segmentation-1437.ivf", + "vp80-03-segmentation-1441.ivf", "vp80-03-segmentation-1442.ivf", + "vp80-04-partitions-1404.ivf", "vp80-04-partitions-1405.ivf", + "vp80-04-partitions-1406.ivf", "vp80-05-sharpness-1428.ivf", + "vp80-05-sharpness-1429.ivf", "vp80-05-sharpness-1430.ivf", + "vp80-05-sharpness-1431.ivf", "vp80-05-sharpness-1433.ivf", + "vp80-05-sharpness-1434.ivf", "vp80-05-sharpness-1438.ivf", + "vp80-05-sharpness-1439.ivf", "vp80-05-sharpness-1440.ivf", + "vp80-05-sharpness-1443.ivf", "vp80-06-smallsize.ivf" +}; +const int kNumVP8TestVectors = NELEMENTS(kVP8TestVectors); +#endif // CONFIG_VP8_DECODER +#if CONFIG_VP9_DECODER +#define RESIZE_TEST_VECTORS \ + "vp90-2-21-resize_inter_320x180_5_1-2.webm", \ + "vp90-2-21-resize_inter_320x180_5_3-4.webm", \ + "vp90-2-21-resize_inter_320x180_7_1-2.webm", \ + "vp90-2-21-resize_inter_320x180_7_3-4.webm", \ + "vp90-2-21-resize_inter_320x240_5_1-2.webm", \ + "vp90-2-21-resize_inter_320x240_5_3-4.webm", \ + "vp90-2-21-resize_inter_320x240_7_1-2.webm", \ + "vp90-2-21-resize_inter_320x240_7_3-4.webm", \ + "vp90-2-21-resize_inter_640x360_5_1-2.webm", \ + "vp90-2-21-resize_inter_640x360_5_3-4.webm", \ + "vp90-2-21-resize_inter_640x360_7_1-2.webm", \ + "vp90-2-21-resize_inter_640x360_7_3-4.webm", \ + "vp90-2-21-resize_inter_640x480_5_1-2.webm", \ + "vp90-2-21-resize_inter_640x480_5_3-4.webm", \ + "vp90-2-21-resize_inter_640x480_7_1-2.webm", \ + "vp90-2-21-resize_inter_640x480_7_3-4.webm", \ + "vp90-2-21-resize_inter_1280x720_5_1-2.webm", \ + "vp90-2-21-resize_inter_1280x720_5_3-4.webm", \ + "vp90-2-21-resize_inter_1280x720_7_1-2.webm", \ + "vp90-2-21-resize_inter_1280x720_7_3-4.webm", \ + "vp90-2-21-resize_inter_1920x1080_5_1-2.webm", \ + "vp90-2-21-resize_inter_1920x1080_5_3-4.webm", \ + "vp90-2-21-resize_inter_1920x1080_7_1-2.webm", \ + "vp90-2-21-resize_inter_1920x1080_7_3-4.webm", + +const char *const kVP9TestVectors[] = { + "vp90-2-00-quantizer-00.webm", + "vp90-2-00-quantizer-01.webm", + "vp90-2-00-quantizer-02.webm", + "vp90-2-00-quantizer-03.webm", + "vp90-2-00-quantizer-04.webm", + "vp90-2-00-quantizer-05.webm", + "vp90-2-00-quantizer-06.webm", + "vp90-2-00-quantizer-07.webm", + "vp90-2-00-quantizer-08.webm", + "vp90-2-00-quantizer-09.webm", + "vp90-2-00-quantizer-10.webm", + "vp90-2-00-quantizer-11.webm", + "vp90-2-00-quantizer-12.webm", + "vp90-2-00-quantizer-13.webm", + "vp90-2-00-quantizer-14.webm", + "vp90-2-00-quantizer-15.webm", + "vp90-2-00-quantizer-16.webm", + "vp90-2-00-quantizer-17.webm", + "vp90-2-00-quantizer-18.webm", + "vp90-2-00-quantizer-19.webm", + "vp90-2-00-quantizer-20.webm", + "vp90-2-00-quantizer-21.webm", + "vp90-2-00-quantizer-22.webm", + "vp90-2-00-quantizer-23.webm", + "vp90-2-00-quantizer-24.webm", + "vp90-2-00-quantizer-25.webm", + "vp90-2-00-quantizer-26.webm", + "vp90-2-00-quantizer-27.webm", + "vp90-2-00-quantizer-28.webm", + "vp90-2-00-quantizer-29.webm", + "vp90-2-00-quantizer-30.webm", + "vp90-2-00-quantizer-31.webm", + "vp90-2-00-quantizer-32.webm", + "vp90-2-00-quantizer-33.webm", + "vp90-2-00-quantizer-34.webm", + "vp90-2-00-quantizer-35.webm", + "vp90-2-00-quantizer-36.webm", + "vp90-2-00-quantizer-37.webm", + "vp90-2-00-quantizer-38.webm", + "vp90-2-00-quantizer-39.webm", + "vp90-2-00-quantizer-40.webm", + "vp90-2-00-quantizer-41.webm", + "vp90-2-00-quantizer-42.webm", + "vp90-2-00-quantizer-43.webm", + "vp90-2-00-quantizer-44.webm", + "vp90-2-00-quantizer-45.webm", + "vp90-2-00-quantizer-46.webm", + "vp90-2-00-quantizer-47.webm", + "vp90-2-00-quantizer-48.webm", + "vp90-2-00-quantizer-49.webm", + "vp90-2-00-quantizer-50.webm", + "vp90-2-00-quantizer-51.webm", + "vp90-2-00-quantizer-52.webm", + "vp90-2-00-quantizer-53.webm", + "vp90-2-00-quantizer-54.webm", + "vp90-2-00-quantizer-55.webm", + "vp90-2-00-quantizer-56.webm", + "vp90-2-00-quantizer-57.webm", + "vp90-2-00-quantizer-58.webm", + "vp90-2-00-quantizer-59.webm", + "vp90-2-00-quantizer-60.webm", + "vp90-2-00-quantizer-61.webm", + "vp90-2-00-quantizer-62.webm", + "vp90-2-00-quantizer-63.webm", + "vp90-2-01-sharpness-1.webm", + "vp90-2-01-sharpness-2.webm", + "vp90-2-01-sharpness-3.webm", + "vp90-2-01-sharpness-4.webm", + "vp90-2-01-sharpness-5.webm", + "vp90-2-01-sharpness-6.webm", + "vp90-2-01-sharpness-7.webm", + "vp90-2-02-size-08x08.webm", + "vp90-2-02-size-08x10.webm", + "vp90-2-02-size-08x16.webm", + "vp90-2-02-size-08x18.webm", + "vp90-2-02-size-08x32.webm", + "vp90-2-02-size-08x34.webm", + "vp90-2-02-size-08x64.webm", + "vp90-2-02-size-08x66.webm", + "vp90-2-02-size-10x08.webm", + "vp90-2-02-size-10x10.webm", + "vp90-2-02-size-10x16.webm", + "vp90-2-02-size-10x18.webm", + "vp90-2-02-size-10x32.webm", + "vp90-2-02-size-10x34.webm", + "vp90-2-02-size-10x64.webm", + "vp90-2-02-size-10x66.webm", + "vp90-2-02-size-16x08.webm", + "vp90-2-02-size-16x10.webm", + "vp90-2-02-size-16x16.webm", + "vp90-2-02-size-16x18.webm", + "vp90-2-02-size-16x32.webm", + "vp90-2-02-size-16x34.webm", + "vp90-2-02-size-16x64.webm", + "vp90-2-02-size-16x66.webm", + "vp90-2-02-size-18x08.webm", + "vp90-2-02-size-18x10.webm", + "vp90-2-02-size-18x16.webm", + "vp90-2-02-size-18x18.webm", + "vp90-2-02-size-18x32.webm", + "vp90-2-02-size-18x34.webm", + "vp90-2-02-size-18x64.webm", + "vp90-2-02-size-18x66.webm", + "vp90-2-02-size-32x08.webm", + "vp90-2-02-size-32x10.webm", + "vp90-2-02-size-32x16.webm", + "vp90-2-02-size-32x18.webm", + "vp90-2-02-size-32x32.webm", + "vp90-2-02-size-32x34.webm", + "vp90-2-02-size-32x64.webm", + "vp90-2-02-size-32x66.webm", + "vp90-2-02-size-34x08.webm", + "vp90-2-02-size-34x10.webm", + "vp90-2-02-size-34x16.webm", + "vp90-2-02-size-34x18.webm", + "vp90-2-02-size-34x32.webm", + "vp90-2-02-size-34x34.webm", + "vp90-2-02-size-34x64.webm", + "vp90-2-02-size-34x66.webm", + "vp90-2-02-size-64x08.webm", + "vp90-2-02-size-64x10.webm", + "vp90-2-02-size-64x16.webm", + "vp90-2-02-size-64x18.webm", + "vp90-2-02-size-64x32.webm", + "vp90-2-02-size-64x34.webm", + "vp90-2-02-size-64x64.webm", + "vp90-2-02-size-64x66.webm", + "vp90-2-02-size-66x08.webm", + "vp90-2-02-size-66x10.webm", + "vp90-2-02-size-66x16.webm", + "vp90-2-02-size-66x18.webm", + "vp90-2-02-size-66x32.webm", + "vp90-2-02-size-66x34.webm", + "vp90-2-02-size-66x64.webm", + "vp90-2-02-size-66x66.webm", + "vp90-2-02-size-130x132.webm", + "vp90-2-02-size-132x130.webm", + "vp90-2-02-size-132x132.webm", + "vp90-2-02-size-178x180.webm", + "vp90-2-02-size-180x178.webm", + "vp90-2-02-size-180x180.webm", + "vp90-2-03-size-196x196.webm", + "vp90-2-03-size-196x198.webm", + "vp90-2-03-size-196x200.webm", + "vp90-2-03-size-196x202.webm", + "vp90-2-03-size-196x208.webm", + "vp90-2-03-size-196x210.webm", + "vp90-2-03-size-196x224.webm", + "vp90-2-03-size-196x226.webm", + "vp90-2-03-size-198x196.webm", + "vp90-2-03-size-198x198.webm", + "vp90-2-03-size-198x200.webm", + "vp90-2-03-size-198x202.webm", + "vp90-2-03-size-198x208.webm", + "vp90-2-03-size-198x210.webm", + "vp90-2-03-size-198x224.webm", + "vp90-2-03-size-198x226.webm", + "vp90-2-03-size-200x196.webm", + "vp90-2-03-size-200x198.webm", + "vp90-2-03-size-200x200.webm", + "vp90-2-03-size-200x202.webm", + "vp90-2-03-size-200x208.webm", + "vp90-2-03-size-200x210.webm", + "vp90-2-03-size-200x224.webm", + "vp90-2-03-size-200x226.webm", + "vp90-2-03-size-202x196.webm", + "vp90-2-03-size-202x198.webm", + "vp90-2-03-size-202x200.webm", + "vp90-2-03-size-202x202.webm", + "vp90-2-03-size-202x208.webm", + "vp90-2-03-size-202x210.webm", + "vp90-2-03-size-202x224.webm", + "vp90-2-03-size-202x226.webm", + "vp90-2-03-size-208x196.webm", + "vp90-2-03-size-208x198.webm", + "vp90-2-03-size-208x200.webm", + "vp90-2-03-size-208x202.webm", + "vp90-2-03-size-208x208.webm", + "vp90-2-03-size-208x210.webm", + "vp90-2-03-size-208x224.webm", + "vp90-2-03-size-208x226.webm", + "vp90-2-03-size-210x196.webm", + "vp90-2-03-size-210x198.webm", + "vp90-2-03-size-210x200.webm", + "vp90-2-03-size-210x202.webm", + "vp90-2-03-size-210x208.webm", + "vp90-2-03-size-210x210.webm", + "vp90-2-03-size-210x224.webm", + "vp90-2-03-size-210x226.webm", + "vp90-2-03-size-224x196.webm", + "vp90-2-03-size-224x198.webm", + "vp90-2-03-size-224x200.webm", + "vp90-2-03-size-224x202.webm", + "vp90-2-03-size-224x208.webm", + "vp90-2-03-size-224x210.webm", + "vp90-2-03-size-224x224.webm", + "vp90-2-03-size-224x226.webm", + "vp90-2-03-size-226x196.webm", + "vp90-2-03-size-226x198.webm", + "vp90-2-03-size-226x200.webm", + "vp90-2-03-size-226x202.webm", + "vp90-2-03-size-226x208.webm", + "vp90-2-03-size-226x210.webm", + "vp90-2-03-size-226x224.webm", + "vp90-2-03-size-226x226.webm", + "vp90-2-03-size-352x288.webm", + "vp90-2-03-deltaq.webm", + "vp90-2-05-resize.ivf", + "vp90-2-06-bilinear.webm", + "vp90-2-07-frame_parallel.webm", + "vp90-2-08-tile_1x2_frame_parallel.webm", + "vp90-2-08-tile_1x2.webm", + "vp90-2-08-tile_1x4_frame_parallel.webm", + "vp90-2-08-tile_1x4.webm", + "vp90-2-08-tile_1x8_frame_parallel.webm", + "vp90-2-08-tile_1x8.webm", + "vp90-2-08-tile-4x4.webm", + "vp90-2-08-tile-4x1.webm", + "vp90-2-09-subpixel-00.ivf", + "vp90-2-02-size-lf-1920x1080.webm", + "vp90-2-09-aq2.webm", + "vp90-2-09-lf_deltas.webm", + "vp90-2-10-show-existing-frame.webm", + "vp90-2-10-show-existing-frame2.webm", + "vp90-2-11-size-351x287.webm", + "vp90-2-11-size-351x288.webm", + "vp90-2-11-size-352x287.webm", + "vp90-2-12-droppable_1.ivf", + "vp90-2-12-droppable_2.ivf", + "vp90-2-12-droppable_3.ivf", +#if !CONFIG_SIZE_LIMIT || \ + (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120) + "vp90-2-13-largescaling.webm", +#endif + "vp90-2-14-resize-fp-tiles-1-16.webm", + "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", + "vp90-2-14-resize-fp-tiles-1-2.webm", + "vp90-2-14-resize-fp-tiles-1-4.webm", + "vp90-2-14-resize-fp-tiles-16-1.webm", + "vp90-2-14-resize-fp-tiles-16-2.webm", + "vp90-2-14-resize-fp-tiles-16-4.webm", + "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", + "vp90-2-14-resize-fp-tiles-16-8.webm", + "vp90-2-14-resize-fp-tiles-1-8.webm", + "vp90-2-14-resize-fp-tiles-2-16.webm", + "vp90-2-14-resize-fp-tiles-2-1.webm", + "vp90-2-14-resize-fp-tiles-2-4.webm", + "vp90-2-14-resize-fp-tiles-2-8.webm", + "vp90-2-14-resize-fp-tiles-4-16.webm", + "vp90-2-14-resize-fp-tiles-4-1.webm", + "vp90-2-14-resize-fp-tiles-4-2.webm", + "vp90-2-14-resize-fp-tiles-4-8.webm", + "vp90-2-14-resize-fp-tiles-8-16.webm", + "vp90-2-14-resize-fp-tiles-8-1.webm", + "vp90-2-14-resize-fp-tiles-8-2.webm", + "vp90-2-14-resize-fp-tiles-8-4.webm", + "vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm", + "vp90-2-14-resize-10frames-fp-tiles-1-2.webm", + "vp90-2-14-resize-10frames-fp-tiles-1-4.webm", + "vp90-2-14-resize-10frames-fp-tiles-1-8.webm", + "vp90-2-14-resize-10frames-fp-tiles-2-1.webm", + "vp90-2-14-resize-10frames-fp-tiles-2-4.webm", + "vp90-2-14-resize-10frames-fp-tiles-2-8.webm", + "vp90-2-14-resize-10frames-fp-tiles-4-1.webm", + "vp90-2-14-resize-10frames-fp-tiles-4-2.webm", + "vp90-2-14-resize-10frames-fp-tiles-4-8.webm", + "vp90-2-14-resize-10frames-fp-tiles-8-1.webm", + "vp90-2-14-resize-10frames-fp-tiles-8-2.webm", + "vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm", + "vp90-2-14-resize-10frames-fp-tiles-8-4.webm", + "vp90-2-15-segkey.webm", + "vp90-2-15-segkey_adpq.webm", + "vp90-2-16-intra-only.webm", + "vp90-2-17-show-existing-frame.webm", + "vp90-2-18-resize.ivf", + "vp90-2-19-skip.webm", + "vp90-2-19-skip-01.webm", + "vp90-2-19-skip-02.webm", + "vp91-2-04-yuv444.webm", + "vp91-2-04-yuv422.webm", + "vp91-2-04-yuv440.webm", +#if CONFIG_VP9_HIGHBITDEPTH + "vp92-2-20-10bit-yuv420.webm", + "vp92-2-20-12bit-yuv420.webm", + "vp93-2-20-10bit-yuv422.webm", + "vp93-2-20-12bit-yuv422.webm", + "vp93-2-20-10bit-yuv440.webm", + "vp93-2-20-12bit-yuv440.webm", + "vp93-2-20-10bit-yuv444.webm", + "vp93-2-20-12bit-yuv444.webm", +#endif // CONFIG_VP9_HIGHBITDEPTH + "vp90-2-20-big_superframe-01.webm", + "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", + RESIZE_TEST_VECTORS +}; +const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; +const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); +const int kNumVP9TestVectorsSvc = NELEMENTS(kVP9TestVectorsSvc); +const char *const kVP9TestVectorsResize[] = { RESIZE_TEST_VECTORS }; +const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize); +#undef RESIZE_TEST_VECTORS +#endif // CONFIG_VP9_DECODER + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/test_vectors.h b/media/libvpx/libvpx/test/test_vectors.h new file mode 100644 index 0000000000..0a4be0f1a2 --- /dev/null +++ b/media/libvpx/libvpx/test/test_vectors.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_TEST_VECTORS_H_ +#define VPX_TEST_TEST_VECTORS_H_ + +#include "./vpx_config.h" + +namespace libvpx_test { + +#if CONFIG_VP8_DECODER +extern const int kNumVP8TestVectors; +extern const char *const kVP8TestVectors[]; +#endif + +#if CONFIG_VP9_DECODER +extern const int kNumVP9TestVectors; +extern const char *const kVP9TestVectors[]; +extern const int kNumVP9TestVectorsSvc; +extern const char *const kVP9TestVectorsSvc[]; +extern const int kNumVP9TestVectorsResize; +extern const char *const kVP9TestVectorsResize[]; +#endif // CONFIG_VP9_DECODER + +} // namespace libvpx_test + +#endif // VPX_TEST_TEST_VECTORS_H_ diff --git a/media/libvpx/libvpx/test/tile_independence_test.cc b/media/libvpx/libvpx/test/tile_independence_test.cc new file mode 100644 index 0000000000..dab6e531b7 --- /dev/null +++ b/media/libvpx/libvpx/test/tile_independence_test.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/md5_helper.h" +#include "vpx_mem/vpx_mem.h" + +namespace { +class TileIndependenceTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + TileIndependenceTest() + : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(), + n_tiles_(GET_PARAM(1)) { + init_flags_ = VPX_CODEC_USE_PSNR; + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.w = 704; + cfg.h = 144; + cfg.threads = 1; + fw_dec_ = codec_->CreateDecoder(cfg, 0); + inv_dec_ = codec_->CreateDecoder(cfg, 0); + inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); + } + + ~TileIndependenceTest() override { + delete fw_dec_; + delete inv_dec_; + } + + void SetUp() override { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); + } + } + + void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt, + ::libvpx_test::MD5 *md5) { + const vpx_codec_err_t res = dec->DecodeFrame( + reinterpret_cast(pkt->data.frame.buf), pkt->data.frame.sz); + if (res != VPX_CODEC_OK) { + abort_ = true; + ASSERT_EQ(VPX_CODEC_OK, res); + } + const vpx_image_t *img = dec->GetDxData().Next(); + md5->Add(img); + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + UpdateMD5(fw_dec_, pkt, &md5_fw_order_); + UpdateMD5(inv_dec_, pkt, &md5_inv_order_); + } + + ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_; + ::libvpx_test::Decoder *fw_dec_, *inv_dec_; + + private: + int n_tiles_; +}; + +// run an encode with 2 or 4 tiles, and do the decode both in normal and +// inverted tile ordering. Ensure that the MD5 of the output in both cases +// is identical. If so, tiles are considered independent and the test passes. +TEST_P(TileIndependenceTest, MD5Match) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = VPX_VBR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const char *md5_fw_str = md5_fw_order_.Get(); + const char *md5_inv_str = md5_inv_order_.Get(); + + // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer + // output if it fails. Not sure if it's helpful since it's really just + // a MD5... + ASSERT_STREQ(md5_fw_str, md5_inv_str); +} + +VP9_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Range(0, 2, 1)); +} // namespace diff --git a/media/libvpx/libvpx/test/timestamp_test.cc b/media/libvpx/libvpx/test/timestamp_test.cc new file mode 100644 index 0000000000..00abf8f31c --- /dev/null +++ b/media/libvpx/libvpx/test/timestamp_test.cc @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 3; + +// A video source that exposes functions to set the timebase, framerate and +// starting pts. +class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { + public: + // Parameters num and den set the timebase for the video source. + DummyTimebaseVideoSource(int num, int den) + : timebase_({ num, den }), framerate_numerator_(30), + framerate_denominator_(1), starting_pts_(0) { + SetSize(kVideoSourceWidth, kVideoSourceHeight); + set_limit(kFramesToEncode); + } + + void SetFramerate(int numerator, int denominator) { + framerate_numerator_ = numerator; + framerate_denominator_ = denominator; + } + + // Returns one frames duration in timebase units as a double. + double FrameDuration() const { + return (static_cast(timebase_.den) / timebase_.num) / + (static_cast(framerate_numerator_) / framerate_denominator_); + } + + vpx_codec_pts_t pts() const override { + return static_cast(frame_ * FrameDuration() + + starting_pts_ + 0.5); + } + + unsigned long duration() const override { + return static_cast(FrameDuration() + 0.5); + } + + vpx_rational_t timebase() const override { return timebase_; } + + void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } + + private: + vpx_rational_t timebase_; + int framerate_numerator_; + int framerate_denominator_; + int64_t starting_pts_; +}; + +class TimestampTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + TimestampTest() : EncoderTest(GET_PARAM(0)) {} + ~TimestampTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } +}; + +// Tests encoding in millisecond timebase. +TEST_P(TimestampTest, EncodeFrames) { + DummyTimebaseVideoSource video(1, 1000); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestMicrosecondTimebase) { + // Set the timebase to microseconds. + DummyTimebaseVideoSource video(1, 1000000); + video.set_limit(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestVpxRollover) { + DummyTimebaseVideoSource video(1, 1000); + video.set_starting_pts(922337170351ll); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP8_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/media/libvpx/libvpx/test/tools_common.sh b/media/libvpx/libvpx/test/tools_common.sh new file mode 100755 index 0000000000..d0dd24df36 --- /dev/null +++ b/media/libvpx/libvpx/test/tools_common.sh @@ -0,0 +1,447 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file contains shell code shared by test scripts for libvpx tools. + +# Use $VPX_TEST_TOOLS_COMMON_SH as a pseudo include guard. +if [ -z "${VPX_TEST_TOOLS_COMMON_SH}" ]; then +VPX_TEST_TOOLS_COMMON_SH=included + +set -e +devnull='> /dev/null 2>&1' +VPX_TEST_PREFIX="" + +elog() { + echo "$@" 1>&2 +} + +vlog() { + if [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ]; then + echo "$@" + fi +} + +# Sets $VPX_TOOL_TEST to the name specified by positional parameter one. +test_begin() { + VPX_TOOL_TEST="${1}" +} + +# Clears the VPX_TOOL_TEST variable after confirming that $VPX_TOOL_TEST matches +# positional parameter one. +test_end() { + if [ "$1" != "${VPX_TOOL_TEST}" ]; then + echo "FAIL completed test mismatch!." + echo " completed test: ${1}" + echo " active test: ${VPX_TOOL_TEST}." + return 1 + fi + VPX_TOOL_TEST='' +} + +# Echoes the target configuration being tested. +test_configuration_target() { + vpx_config_mk="${LIBVPX_CONFIG_PATH}/config.mk" + # Find the TOOLCHAIN line, split it using ':=' as the field separator, and + # print the last field to get the value. Then pipe the value to tr to consume + # any leading/trailing spaces while allowing tr to echo the output to stdout. + awk -F ':=' '/TOOLCHAIN/ { print $NF }' "${vpx_config_mk}" | tr -d ' ' +} + +# Trap function used for failure reports and tool output directory removal. +# When the contents of $VPX_TOOL_TEST do not match the string '', reports +# failure of test stored in $VPX_TOOL_TEST. +cleanup() { + if [ -n "${VPX_TOOL_TEST}" ] && [ "${VPX_TOOL_TEST}" != '' ]; then + echo "FAIL: $VPX_TOOL_TEST" + fi + if [ -n "${VPX_TEST_OUTPUT_DIR}" ] && [ -d "${VPX_TEST_OUTPUT_DIR}" ]; then + rm -rf "${VPX_TEST_OUTPUT_DIR}" + fi +} + +# Echoes the git hash portion of the VERSION_STRING variable defined in +# $LIBVPX_CONFIG_PATH/config.mk to stdout, or the version number string when +# no git hash is contained in VERSION_STRING. +config_hash() { + vpx_config_mk="${LIBVPX_CONFIG_PATH}/config.mk" + # Find VERSION_STRING line, split it with "-g" and print the last field to + # output the git hash to stdout. + vpx_version=$(awk -F -g '/VERSION_STRING/ {print $NF}' "${vpx_config_mk}") + # Handle two situations here: + # 1. The default case: $vpx_version is a git hash, so echo it unchanged. + # 2. When being run a non-dev tree, the -g portion is not present in the + # version string: It's only the version number. + # In this case $vpx_version is something like 'VERSION_STRING=v1.3.0', so + # we echo only what is after the '='. + echo "${vpx_version##*=}" +} + +# Echoes the short form of the current git hash. +current_hash() { + if git --version > /dev/null 2>&1; then + (cd "$(dirname "${0}")" + git rev-parse --short HEAD) + else + # Return the config hash if git is unavailable: Fail silently, git hashes + # are used only for warnings. + config_hash + fi +} + +# Echoes warnings to stdout when git hash in vpx_config.h does not match the +# current git hash. +check_git_hashes() { + hash_at_configure_time=$(config_hash) + hash_now=$(current_hash) + + if [ "${hash_at_configure_time}" != "${hash_now}" ]; then + echo "Warning: git hash has changed since last configure." + fi +} + +# $1 is the name of an environment variable containing a directory name to +# test. +test_env_var_dir() { + local dir=$(eval echo "\${$1}") + if [ ! -d "${dir}" ]; then + elog "'${dir}': No such directory" + elog "The $1 environment variable must be set to a valid directory." + return 1 + fi +} + +# This script requires that the LIBVPX_BIN_PATH, LIBVPX_CONFIG_PATH, and +# LIBVPX_TEST_DATA_PATH variables are in the environment: Confirm that +# the variables are set and that they all evaluate to directory paths. +verify_vpx_test_environment() { + test_env_var_dir "LIBVPX_BIN_PATH" \ + && test_env_var_dir "LIBVPX_CONFIG_PATH" \ + && test_env_var_dir "LIBVPX_TEST_DATA_PATH" +} + +# Greps vpx_config.h in LIBVPX_CONFIG_PATH for positional parameter one, which +# should be a LIBVPX preprocessor flag. Echoes yes to stdout when the feature +# is available. +vpx_config_option_enabled() { + vpx_config_option="${1}" + vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h" + config_line=$(grep "${vpx_config_option}" "${vpx_config_file}") + if echo "${config_line}" | grep -E -q '1$'; then + echo yes + fi +} + +# Echoes yes when output of test_configuration_target() contains win32 or win64. +is_windows_target() { + if test_configuration_target \ + | grep -q -e win32 -e win64 > /dev/null 2>&1; then + echo yes + fi +} + +# Echoes path to $1 when it's executable and exists in ${LIBVPX_BIN_PATH}, or an +# empty string. Caller is responsible for testing the string once the function +# returns. +vpx_tool_path() { + local tool_name="$1" + local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}" + if [ ! -x "${tool_path}" ]; then + # Try one directory up: when running via examples.sh the tool could be in + # the parent directory of $LIBVPX_BIN_PATH. + tool_path="${LIBVPX_BIN_PATH}/../${tool_name}${VPX_TEST_EXE_SUFFIX}" + fi + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Echoes yes to stdout when the file named by positional parameter one exists +# in LIBVPX_BIN_PATH, and is executable. +vpx_tool_available() { + local tool_name="$1" + local tool="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}" + [ -x "${tool}" ] && echo yes +} + +# Echoes yes to stdout when vpx_config_option_enabled() reports yes for +# CONFIG_VP8_DECODER. +vp8_decode_available() { + [ "$(vpx_config_option_enabled CONFIG_VP8_DECODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when vpx_config_option_enabled() reports yes for +# CONFIG_VP8_ENCODER. +vp8_encode_available() { + [ "$(vpx_config_option_enabled CONFIG_VP8_ENCODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when vpx_config_option_enabled() reports yes for +# CONFIG_VP9_DECODER. +vp9_decode_available() { + [ "$(vpx_config_option_enabled CONFIG_VP9_DECODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when vpx_config_option_enabled() reports yes for +# CONFIG_VP9_ENCODER. +vp9_encode_available() { + [ "$(vpx_config_option_enabled CONFIG_VP9_ENCODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when vpx_config_option_enabled() reports yes for +# CONFIG_WEBM_IO. +webm_io_available() { + [ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes +} + +# Filters strings from $1 using the filter specified by $2. Filter behavior +# depends on the presence of $3. When $3 is present, strings that match the +# filter are excluded. When $3 is omitted, strings matching the filter are +# included. +# The filtered result is echoed to stdout. +filter_strings() { + strings=${1} + filter=${2} + exclude=${3} + + if [ -n "${exclude}" ]; then + # When positional parameter three exists the caller wants to remove strings. + # Tell grep to invert matches using the -v argument. + exclude='-v' + else + unset exclude + fi + + if [ -n "${filter}" ]; then + for s in ${strings}; do + if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then + filtered_strings="${filtered_strings} ${s}" + fi + done + else + filtered_strings="${strings}" + fi + echo "${filtered_strings}" +} + +# Runs user test functions passed via positional parameters one and two. +# Functions in positional parameter one are treated as environment verification +# functions and are run unconditionally. Functions in positional parameter two +# are run according to the rules specified in vpx_test_usage(). +run_tests() { + local env_tests="verify_vpx_test_environment $1" + local tests_to_filter="$2" + local test_name="${VPX_TEST_NAME}" + + if [ -z "${test_name}" ]; then + test_name="$(basename "${0%.*}")" + fi + + if [ "${VPX_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then + # Filter out DISABLED tests. + tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude) + fi + + if [ -n "${VPX_TEST_FILTER}" ]; then + # Remove tests not matching the user's filter. + tests_to_filter=$(filter_strings "${tests_to_filter}" ${VPX_TEST_FILTER}) + fi + + # User requested test listing: Dump test names and return. + if [ "${VPX_TEST_LIST_TESTS}" = "yes" ]; then + for test_name in $tests_to_filter; do + echo ${test_name} + done + return + fi + + # Don't bother with the environment tests if everything else was disabled. + [ -z "${tests_to_filter}" ] && return + + # Combine environment and actual tests. + local tests_to_run="${env_tests} ${tests_to_filter}" + + check_git_hashes + + # Run tests. + for test in ${tests_to_run}; do + test_begin "${test}" + vlog " RUN ${test}" + "${test}" + vlog " PASS ${test}" + test_end "${test}" + done + + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi + echo "${test_name}: Done, all tests pass for ${tested_config}." +} + +vpx_test_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --bin-path + --config-path + --filter : User test filter. Only tests matching filter are run. + --run-disabled-tests: Run disabled tests. + --help: Display this message and exit. + --test-data-path + --show-program-output: Shows output from all programs being tested. + --prefix: Allows for a user specified prefix to be inserted before all test + programs. Grants the ability, for example, to run test programs + within valgrind. + --list-tests: List all test names and exit without actually running tests. + --verbose: Verbose output. + + When the --bin-path option is not specified the script attempts to use + \$LIBVPX_BIN_PATH and then the current directory. + + When the --config-path option is not specified the script attempts to use + \$LIBVPX_CONFIG_PATH and then the current directory. + + When the -test-data-path option is not specified the script attempts to use + \$LIBVPX_TEST_DATA_PATH and then the current directory. +EOF +} + +# Returns non-zero (failure) when required environment variables are empty +# strings. +vpx_test_check_environment() { + if [ -z "${LIBVPX_BIN_PATH}" ] || \ + [ -z "${LIBVPX_CONFIG_PATH}" ] || \ + [ -z "${LIBVPX_TEST_DATA_PATH}" ]; then + return 1 + fi +} + +# Parse the command line. +while [ -n "$1" ]; do + case "$1" in + --bin-path) + LIBVPX_BIN_PATH="$2" + shift + ;; + --config-path) + LIBVPX_CONFIG_PATH="$2" + shift + ;; + --filter) + VPX_TEST_FILTER="$2" + shift + ;; + --run-disabled-tests) + VPX_TEST_RUN_DISABLED_TESTS=yes + ;; + --help) + vpx_test_usage + exit + ;; + --test-data-path) + LIBVPX_TEST_DATA_PATH="$2" + shift + ;; + --prefix) + VPX_TEST_PREFIX="$2" + shift + ;; + --verbose) + VPX_TEST_VERBOSE_OUTPUT=yes + ;; + --show-program-output) + devnull= + ;; + --list-tests) + VPX_TEST_LIST_TESTS=yes + ;; + *) + vpx_test_usage + exit 1 + ;; + esac + shift +done + +# Handle running the tests from a build directory without arguments when running +# the tests on *nix/macosx. +LIBVPX_BIN_PATH="${LIBVPX_BIN_PATH:-.}" +LIBVPX_CONFIG_PATH="${LIBVPX_CONFIG_PATH:-.}" +LIBVPX_TEST_DATA_PATH="${LIBVPX_TEST_DATA_PATH:-.}" + +# Create a temporary directory for output files, and a trap to clean it up. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +if [ "$(is_windows_target)" = "yes" ]; then + VPX_TEST_EXE_SUFFIX=".exe" +fi + +# Variables shared by tests. +VP8_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp80-00-comprehensive-001.ivf" +VP9_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf" + +VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm" +VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm" +VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm" + +VP9_RAW_FILE="${LIBVPX_TEST_DATA_PATH}/crbug-1539.rawfile" + +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_RAW_INPUT_WIDTH=352 +YUV_RAW_INPUT_HEIGHT=288 + +Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" +Y4M_720P_INPUT_WIDTH=1280 +Y4M_720P_INPUT_HEIGHT=720 + +# Setup a trap function to clean up after tests complete. +trap cleanup EXIT + +vlog "$(basename "${0%.*}") test configuration: + LIBVPX_BIN_PATH=${LIBVPX_BIN_PATH} + LIBVPX_CONFIG_PATH=${LIBVPX_CONFIG_PATH} + LIBVPX_TEST_DATA_PATH=${LIBVPX_TEST_DATA_PATH} + VP8_IVF_FILE=${VP8_IVF_FILE} + VP9_IVF_FILE=${VP9_IVF_FILE} + VP9_WEBM_FILE=${VP9_WEBM_FILE} + VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX} + VPX_TEST_FILTER=${VPX_TEST_FILTER} + VPX_TEST_LIST_TESTS=${VPX_TEST_LIST_TESTS} + VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR} + VPX_TEST_PREFIX=${VPX_TEST_PREFIX} + VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS} + VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT} + VPX_TEST_TEMP_ROOT=${VPX_TEST_TEMP_ROOT} + VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT} + YUV_RAW_INPUT=${YUV_RAW_INPUT} + YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH} + YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT} + Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}" + +fi # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard. diff --git a/media/libvpx/libvpx/test/twopass_encoder.sh b/media/libvpx/libvpx/test/twopass_encoder.sh new file mode 100755 index 0000000000..69ecbacd0c --- /dev/null +++ b/media/libvpx/libvpx/test/twopass_encoder.sh @@ -0,0 +1,63 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx twopass_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to twopass_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +twopass_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi +} + +# Runs twopass_encoder using the codec specified by $1 with a frame limit of +# 100. +twopass_encoder() { + local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}" + local codec="$1" + local output_file="${VPX_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf" + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \ + ${devnull} || return 1 + + [ -e "${output_file}" ] || return 1 +} + +twopass_encoder_vp8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + twopass_encoder vp8 || return 1 + fi +} + +twopass_encoder_vp9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + twopass_encoder vp9 || return 1 + fi +} + + +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/media/libvpx/libvpx/test/user_priv_test.cc b/media/libvpx/libvpx/test/user_priv_test.cc new file mode 100644 index 0000000000..20741f8268 --- /dev/null +++ b/media/libvpx/libvpx/test/user_priv_test.cc @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "test/acm_random.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif +#include "vpx_mem/vpx_mem.h" +#include "vpx/vp8.h" + +namespace { + +using libvpx_test::ACMRandom; +using std::string; + +#if CONFIG_WEBM_IO + +void CheckUserPrivateData(void *user_priv, int *target) { + // actual pointer value should be the same as expected. + EXPECT_EQ(reinterpret_cast(target), user_priv) + << "user_priv pointer value does not match."; +} + +// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and +// compares the user_priv from return img with the original user_priv to see if +// they match. Both the pointer values and the values inside the addresses +// should match. +string DecodeFile(const string &filename) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + libvpx_test::WebMVideoSource video(filename); + video.Init(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + libvpx_test::VP9Decoder decoder(cfg, 0); + + libvpx_test::MD5 md5; + int frame_num = 0; + for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata(); + video.Next()) { + void *user_priv = reinterpret_cast(&frame_num); + const vpx_codec_err_t res = + decoder.DecodeFrame(video.cxdata(), video.frame_size(), + (frame_num == 0) ? nullptr : user_priv); + if (res != VPX_CODEC_OK) { + EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + break; + } + libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); + const vpx_image_t *img = nullptr; + + // Get decompressed data. + while ((img = dec_iter.Next())) { + if (frame_num == 0) { + CheckUserPrivateData(img->user_priv, nullptr); + } else { + CheckUserPrivateData(img->user_priv, &frame_num); + + // Also test ctrl_get_reference api. + struct vp9_ref_frame ref = vp9_ref_frame(); + // Randomly fetch a reference frame. + ref.idx = rnd.Rand8() % 3; + decoder.Control(VP9_GET_REFERENCE, &ref); + + CheckUserPrivateData(ref.img.user_priv, nullptr); + } + md5.Add(img); + } + + frame_num++; + } + return string(md5.Get()); +} + +TEST(UserPrivTest, VideoDecode) { + // no tiles or frame parallel; this exercises the decoding to test the + // user_priv. + EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", + DecodeFile("vp90-2-03-size-226x226.webm").c_str()); +} + +#endif // CONFIG_WEBM_IO + +} // namespace diff --git a/media/libvpx/libvpx/test/util.h b/media/libvpx/libvpx/test/util.h new file mode 100644 index 0000000000..985f487094 --- /dev/null +++ b/media/libvpx/libvpx/test/util.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_UTIL_H_ +#define VPX_TEST_UTIL_H_ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_image.h" + +// Macros +#define GET_PARAM(k) std::get(GetParam()) + +inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) { + assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) && + (img1->d_h == img2->d_h)); + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i, j; + + int64_t sqrerr = 0; + for (i = 0; i < height_y; ++i) { + for (j = 0; j < width_y; ++j) { + int64_t d = img1->planes[VPX_PLANE_Y][i * img1->stride[VPX_PLANE_Y] + j] - + img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j]; + sqrerr += d * d; + } + } + double mse = static_cast(sqrerr) / (width_y * height_y); + double psnr = 100.0; + if (mse > 0.0) { + psnr = 10 * log10(255.0 * 255.0 / mse); + } + return psnr; +} + +#endif // VPX_TEST_UTIL_H_ diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc new file mode 100644 index 0000000000..b8320e9ceb --- /dev/null +++ b/media/libvpx/libvpx/test/variance_test.cc @@ -0,0 +1,1993 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/variance.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); + +using libvpx_test::ACMRandom; + +// Truncate high bit depth results by downshifting (with rounding) by: +// 2 * (bit_depth - 8) for sse +// (bit_depth - 8) for se +static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) { + switch (bit_depth) { + case VPX_BITS_12: + *sse = (*sse + 128) >> 8; + *se = (*se + 8) >> 4; + break; + case VPX_BITS_10: + *sse = (*sse + 8) >> 4; + *se = (*se + 2) >> 2; + break; + case VPX_BITS_8: + default: break; + } +} + +static unsigned int mb_ss_ref(const int16_t *src) { + unsigned int res = 0; + for (int i = 0; i < 256; ++i) { + res += src[i] * src[i]; + } + return res; +} + +/* Note: + * Our codebase calculates the "diff" value in the variance algorithm by + * (src - ref). + */ +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff; + if (!use_high_bit_depth_) { + diff = src[y * src_stride + x] - ref[y * ref_stride + x]; + se += diff; + sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - + CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; + se += diff; + sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast( + sse - ((static_cast(se) * se) >> (l2w + l2h))); +} + +/* The subpel reference functions differ from the codec version in one aspect: + * they calculate the bilinear factors directly instead of using a lookup table + * and therefore upshift xoff and yoff by 1. Only every other calculated value + * is used so the codec version shrinks the table to save space and maintain + * compatibility with vp8. + */ +static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + uint32_t *sse_ptr, bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. + if (!use_high_bit_depth_) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src16[w * y + x]; + se += diff; + sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast( + sse - ((static_cast(se) * se) >> (l2w + l2h))); +} + +static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, + const uint8_t *second_pred, int l2w, + int l2h, int xoff, int yoff, + uint32_t *sse_ptr, + bool use_high_bit_depth, + vpx_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = + ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + se += diff; + sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; + se += diff; + sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast( + sse - ((static_cast(se) * se) >> (l2w + l2h))); +} + +//////////////////////////////////////////////////////////////////////////////// + +class SumOfSquaresTest : public ::testing::TestWithParam { + public: + SumOfSquaresTest() : func_(GetParam()) {} + + ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); } + + protected: + void ConstTest(); + void RefTest(); + + SumOfSquaresFunction func_; + ACMRandom rnd_; +}; + +void SumOfSquaresTest::ConstTest() { + int16_t mem[256]; + unsigned int res; + for (int v = 0; v < 256; ++v) { + for (int i = 0; i < 256; ++i) { + mem[i] = v; + } + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(256u * (v * v), res); + } +} + +void SumOfSquaresTest::RefTest() { + int16_t mem[256]; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 256; ++j) { + mem[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const unsigned int expected = mb_ss_ref(mem); + unsigned int res; + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(expected, res); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Encapsulating struct to store the function to test along with +// some testing context. +// Can be used for MSE, SSE, Variance, etc. + +template +struct TestParams { + TestParams(int log2w = 0, int log2h = 0, Func function = nullptr, + int bit_depth_value = 0) + : log2width(log2w), log2height(log2h), func(function) { + use_high_bit_depth = (bit_depth_value > 0); + if (use_high_bit_depth) { + bit_depth = static_cast(bit_depth_value); + } else { + bit_depth = VPX_BITS_8; + } + width = 1 << log2width; + height = 1 << log2height; + block_size = width * height; + mask = (1u << bit_depth) - 1; + } + + int log2width, log2height; + int width, height; + int block_size; + Func func; + vpx_bit_depth_t bit_depth; + bool use_high_bit_depth; + uint32_t mask; +}; + +template +std::ostream &operator<<(std::ostream &os, const TestParams &p) { + return os << "log2width/height:" << p.log2width << "/" << p.log2height + << " function:" << reinterpret_cast(p.func) + << " bit-depth:" << p.bit_depth; +} + +// Main class for testing a function type +template +class MainTestClass + : public ::testing::TestWithParam > { + public: + void SetUp() override { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + const size_t unit = + use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t); + src_ = reinterpret_cast(vpx_memalign(16, block_size() * unit)); + ref_ = new uint8_t[block_size() * unit]; + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); +#if CONFIG_VP9_HIGHBITDEPTH + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = CONVERT_TO_BYTEPTR(src_); + ref_ = CONVERT_TO_BYTEPTR(ref_); + } +#endif + } + + void TearDown() override { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = reinterpret_cast(CONVERT_TO_SHORTPTR(src_)); + ref_ = reinterpret_cast(CONVERT_TO_SHORTPTR(ref_)); + } +#endif + + vpx_free(src_); + delete[] ref_; + src_ = nullptr; + ref_ = nullptr; + libvpx_test::ClearSystemState(); + } + + protected: + // We could sub-class MainTestClass into dedicated class for Variance + // and MSE/SSE, but it involves a lot of 'this->xxx' dereferencing + // to access top class fields xxx. That's cumbersome, so for now we'll just + // implement the testing methods here: + + // Variance tests + void ZeroTest(); + void RefTest(); + void RefStrideTest(); + void OneQuarterTest(); + void SpeedTest(); + + // MSE/SSE tests + void RefTestMse(); + void RefTestSse(); + void MaxTestMse(); + void MaxTestSse(); + + protected: + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to variance. + +template +void MainTestClass::ZeroTest() { + for (int i = 0; i <= 255; ++i) { + if (!use_high_bit_depth()) { + memset(src_, i, block_size()); + } else { + uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_); + for (int k = 0; k < block_size(); ++k) src16[k] = i << byte_shift(); + } + for (int j = 0; j <= 255; ++j) { + if (!use_high_bit_depth()) { + memset(ref_, j, block_size()); + } else { + uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_); + for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift(); + } + unsigned int sse, var; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j; + } + } +} + +template +void MainTestClass::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2, var1, var2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, stride, ref_, stride, &sse1)); + var2 = + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::RefStrideTest() { + for (int i = 0; i < 10; ++i) { + const int ref_stride = (i & 1) * width(); + const int src_stride = ((i >> 1) & 1) * width(); + for (int j = 0; j < block_size(); j++) { + const int ref_ind = (j / width()) * ref_stride + j % width(); + const int src_ind = (j / width()) * src_stride + j % width(); + if (!use_high_bit_depth()) { + src_[src_ind] = rnd_.Rand8(); + ref_[ref_ind] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2; + unsigned int var1, var2; + + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1)); + var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height, + src_stride, ref_stride, &sse2, use_high_bit_depth(), + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::OneQuarterTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse, var, expected; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + expected = block_size() * 255 * 255 / 4; + EXPECT_EQ(expected, var); +} + +template +void MainTestClass::SpeedTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < (1 << 30) / block_size(); ++i) { + const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); + // Ignore return value. + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(), + params_.bit_depth, elapsed_time / 1000); +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to MSE / SSE. + +template +void MainTestClass::RefTestMse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2); + } +} + +template +void MainTestClass::RefTestSse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } + unsigned int sse2; + unsigned int var1; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, false, VPX_BITS_8); + EXPECT_EQ(var1, sse2); + } +} + +template +void MainTestClass::MaxTestMse() { + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, sse); +} + +template +void MainTestClass::MaxTestSse() { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); + unsigned int var; + ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width())); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, var); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +class SubpelVarianceTest + : public ::testing::TestWithParam > { + public: + void SetUp() override { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast(vpx_memalign(16, block_size())); + ref_ = reinterpret_cast( + vpx_malloc(block_size() + width() + height() + 1)); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR(reinterpret_cast(vpx_malloc( + (block_size() + width() + height() + 1) * sizeof(uint16_t)))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + ASSERT_NE(src_, nullptr); + ASSERT_NE(sec_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + if (!use_high_bit_depth()) { + vpx_free(src_); + vpx_free(sec_); + vpx_free(ref_); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + vpx_free(CONVERT_TO_SHORTPTR(ref_)); + vpx_free(CONVERT_TO_SHORTPTR(sec_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + libvpx_test::ClearSystemState(); + } + + protected: + void RefTest(); + void ExtremeRefTest(); + void SpeedTest(); + + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +template +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { + src_[j] = rnd_.Rand8(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + ref_[j] = rnd_.Rand8(); + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse1, sse2; + unsigned int var1; + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + +template +void SubpelVarianceTest::ExtremeRefTest() { + // Compare against reference. + // Src: Set the first half of values to 0, the second half to the maximum. + // Ref: Set the first half of values to the maximum, the second half to 0. + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 0, half); + memset(src_ + half, 255, half); + memset(ref_, 255, half); + memset(ref_ + half, 0, half + width() + height() + 1); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse1, sse2; + unsigned int var1; + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; + EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; + } + } +} + +template +void SubpelVarianceTest::SpeedTest() { + // The only interesting points are 0, 4, and anything else. To make the loops + // simple we will use 0, 2 and 4. + for (int x = 0; x <= 4; x += 2) { + for (int y = 0; y <= 4; y += 2) { + if (!use_high_bit_depth()) { + memset(src_, 25, block_size()); + memset(ref_, 50, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 1000000000 / block_size(); ++i) { + const uint32_t variance = + params_.func(ref_, width() + 1, x, y, src_, width(), &sse); + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n", + width(), height(), x, y, elapsed_time / 1000); + } + } +} + +template <> +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { + src_[j] = rnd_.Rand8(); + sec_[j] = rnd_.Rand8(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + ref_[j] = rnd_.Rand8(); + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + uint32_t sse1, sse2; + uint32_t var1, var2; + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + +typedef MainTestClass VpxSseTest; +typedef MainTestClass VpxMseTest; +typedef MainTestClass VpxVarianceTest; +typedef SubpelVarianceTest VpxSubpelVarianceTest; +typedef SubpelVarianceTest VpxSubpelAvgVarianceTest; + +TEST_P(VpxSseTest, RefSse) { RefTestSse(); } +TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } +TEST_P(VpxMseTest, RefMse) { RefTestMse(); } +TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxVarianceTest, Ref) { RefTest(); } +TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(SumOfSquaresTest, Const) { ConstTest(); } +TEST_P(SumOfSquaresTest, Ref) { RefTest(); } +TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); } + +INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_c)); + +typedef TestParams SseParams; +INSTANTIATE_TEST_SUITE_P(C, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_c))); + +typedef TestParams MseParams; +INSTANTIATE_TEST_SUITE_P(C, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), + MseParams(4, 3, &vpx_mse16x8_c), + MseParams(3, 4, &vpx_mse8x16_c), + MseParams(3, 3, &vpx_mse8x8_c))); + +typedef TestParams VarianceParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), + VarianceParams(6, 5, &vpx_variance64x32_c), + VarianceParams(5, 6, &vpx_variance32x64_c), + VarianceParams(5, 5, &vpx_variance32x32_c), + VarianceParams(5, 4, &vpx_variance32x16_c), + VarianceParams(4, 5, &vpx_variance16x32_c), + VarianceParams(4, 4, &vpx_variance16x16_c), + VarianceParams(4, 3, &vpx_variance16x8_c), + VarianceParams(3, 4, &vpx_variance8x16_c), + VarianceParams(3, 3, &vpx_variance8x8_c), + VarianceParams(3, 2, &vpx_variance8x4_c), + VarianceParams(2, 3, &vpx_variance4x8_c), + VarianceParams(2, 2, &vpx_variance4x4_c))); + +typedef TestParams SubpelVarianceParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + +typedef TestParams SubpelAvgVarianceParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +typedef MainTestClass VpxHBDVarianceTest; +typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; +typedef SubpelVarianceTest + VpxHBDSubpelAvgVarianceTest; + +TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } +TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } + +typedef MainTestClass VpxHBDMseTest; +TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } +TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); } +INSTANTIATE_TEST_SUITE_P( + C, VpxHBDMseTest, + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8))); + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); + +INSTANTIATE_TEST_SUITE_P( + C, VpxHBDVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_highbd_12_variance64x64_c, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_c, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_c, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_c, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_c, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_c, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_c, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_c, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_c, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_c, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_c, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_c, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_c, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_c, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_c, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_c, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_c, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_c, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_c, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_c, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_c, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_c, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_c, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_c, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_c, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_c, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_c, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_c, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_c, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_c, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_c, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_c, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_c, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_c, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_c, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_c, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_c, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_c, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8))); + +INSTANTIATE_TEST_SUITE_P( + C, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); + +INSTANTIATE_TEST_SUITE_P( + C, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_sse2)); + +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2), + MseParams(4, 3, &vpx_mse16x8_sse2), + MseParams(3, 4, &vpx_mse8x16_sse2), + MseParams(3, 3, &vpx_mse8x8_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_sse2), + VarianceParams(6, 5, &vpx_variance64x32_sse2), + VarianceParams(5, 6, &vpx_variance32x64_sse2), + VarianceParams(5, 5, &vpx_variance32x32_sse2), + VarianceParams(5, 4, &vpx_variance32x16_sse2), + VarianceParams(4, 5, &vpx_variance16x32_sse2), + VarianceParams(4, 4, &vpx_variance16x16_sse2), + VarianceParams(4, 3, &vpx_variance16x8_sse2), + VarianceParams(3, 4, &vpx_variance8x16_sse2), + VarianceParams(3, 3, &vpx_variance8x8_sse2), + VarianceParams(3, 2, &vpx_variance8x4_sse2), + VarianceParams(2, 3, &vpx_variance4x8_sse2), + VarianceParams(2, 2, &vpx_variance4x4_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sse2, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sse2, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sse2, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sse2, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sse2, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sse2, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sse2, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sse2, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sse2, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sse2, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sse2, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sse2, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sse2, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sse2, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sse2, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sse2, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sse2, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sse2, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sse2, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sse2, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sse2, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sse2, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sse2, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sse2, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sse2, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sse2, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sse2, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sse2, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sse2, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2), + MseParams(4, 3, &vpx_mse16x8_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2), + VarianceParams(6, 5, &vpx_variance64x32_avx2), + VarianceParams(5, 6, &vpx_variance32x64_avx2), + VarianceParams(5, 5, &vpx_variance32x32_avx2), + VarianceParams(5, 4, &vpx_variance32x16_avx2), + VarianceParams(4, 5, &vpx_variance16x32_avx2), + VarianceParams(4, 4, &vpx_variance16x16_avx2), + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_neon))); + +INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon), + VarianceParams(6, 5, &vpx_variance64x32_neon), + VarianceParams(5, 6, &vpx_variance32x64_neon), + VarianceParams(5, 5, &vpx_variance32x32_neon), + VarianceParams(5, 4, &vpx_variance32x16_neon), + VarianceParams(4, 5, &vpx_variance16x32_neon), + VarianceParams(4, 4, &vpx_variance16x16_neon), + VarianceParams(4, 3, &vpx_variance16x8_neon), + VarianceParams(3, 4, &vpx_variance8x16_neon), + VarianceParams(3, 3, &vpx_variance8x8_neon), + VarianceParams(3, 2, &vpx_variance8x4_neon), + VarianceParams(2, 3, &vpx_variance4x8_neon), + VarianceParams(2, 2, &vpx_variance4x4_neon))); + +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), + VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod), + VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod), + VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod), + VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod), + VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod), + VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod), + VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod), + VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod), + VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod), + VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), + VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), + VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + +// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can +// be used again. +#if 0 +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD +#endif // 0 + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, + 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, + 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, + 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_neon, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_neon, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_neon, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_neon, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_neon, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_neon, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_neon, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_neon, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_neon, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_neon, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, + 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_neon, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_neon, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_neon, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_neon, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_neon, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_neon, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_neon, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_neon, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_neon, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, + 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_neon, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_neon, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_neon, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_neon, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_neon, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_neon, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_neon, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_neon, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_neon, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, + 8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_msa)); + +INSTANTIATE_TEST_SUITE_P(MSA, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_msa))); + +INSTANTIATE_TEST_SUITE_P(MSA, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa), + MseParams(4, 3, &vpx_mse16x8_msa), + MseParams(3, 4, &vpx_mse8x16_msa), + MseParams(3, 3, &vpx_mse8x8_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_msa), + VarianceParams(6, 5, &vpx_variance64x32_msa), + VarianceParams(5, 6, &vpx_variance32x64_msa), + VarianceParams(5, 5, &vpx_variance32x32_msa), + VarianceParams(5, 4, &vpx_variance32x16_msa), + VarianceParams(4, 5, &vpx_variance16x32_msa), + VarianceParams(4, 4, &vpx_variance16x16_msa), + VarianceParams(4, 3, &vpx_variance16x8_msa), + VarianceParams(3, 4, &vpx_variance8x16_msa), + VarianceParams(3, 3, &vpx_variance8x8_msa), + VarianceParams(3, 2, &vpx_variance8x4_msa), + VarianceParams(2, 3, &vpx_variance4x8_msa), + VarianceParams(2, 2, &vpx_variance4x4_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + +INSTANTIATE_TEST_SUITE_P( + MSA, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); +#endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_SUITE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_SUITE_P( + VSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); +#endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx))); + +INSTANTIATE_TEST_SUITE_P( + LSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), + VarianceParams(5, 5, &vpx_variance32x32_lsx), + VarianceParams(4, 4, &vpx_variance16x16_lsx), + VarianceParams(3, 3, &vpx_variance8x8_lsx))); + +INSTANTIATE_TEST_SUITE_P( + LSX, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); + +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, + ::testing::Values(SubpelAvgVarianceParams( + 6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0))); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h new file mode 100644 index 0000000000..2194126f1f --- /dev/null +++ b/media/libvpx/libvpx/test/video_source.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_VIDEO_SOURCE_H_ +#define VPX_TEST_VIDEO_SOURCE_H_ + +#if defined(_WIN32) +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#endif +#include +#include +#include +#include +#include + +#include "test/acm_random.h" +#if !defined(_WIN32) +#include "third_party/googletest/src/include/gtest/gtest.h" +#endif +#include "vpx/vpx_encoder.h" + +namespace libvpx_test { + +// Helper macros to ensure LIBVPX_TEST_DATA_PATH is a quoted string. +// These are undefined right below GetDataPath +// NOTE: LIBVPX_TEST_DATA_PATH MUST NOT be a quoted string before +// Stringification or the GetDataPath will fail at runtime +#define TO_STRING(S) #S +#define STRINGIFY(S) TO_STRING(S) + +// A simple function to encapsulate cross platform retrieval of test data path +static std::string GetDataPath() { + const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); + if (data_path == nullptr) { +#ifdef LIBVPX_TEST_DATA_PATH + // In some environments, we cannot set environment variables + // Instead, we set the data path by using a preprocessor symbol + // which can be set from make files + return STRINGIFY(LIBVPX_TEST_DATA_PATH); +#else + return "."; +#endif + } + return data_path; +} + +// Undefining stringification macros because they are not used elsewhere +#undef TO_STRING +#undef STRINGIFY + +inline FILE *OpenTestDataFile(const std::string &file_name) { + const std::string path_to_source = GetDataPath() + "/" + file_name; + return fopen(path_to_source.c_str(), "rb"); +} + +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { + file_name->clear(); +#if defined(_WIN32) + char fname[MAX_PATH]; + char tmppath[MAX_PATH]; + if (GetTempPathA(MAX_PATH, tmppath)) { + // Assume for now that the filename generated is unique per process + if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { + file_name->assign(fname); + return fopen(fname, io_mode); + } + } + return nullptr; +#else + std::string temp_dir = testing::TempDir(); + if (temp_dir.empty()) return nullptr; + // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may + // use the value of an environment variable without checking for a trailing + // path delimiter. + if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/'; + const char name_template[] = "libvpxtest.XXXXXX"; + std::unique_ptr temp_file_name( + new char[temp_dir.size() + sizeof(name_template)]); + if (temp_file_name == nullptr) return nullptr; + memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); + memcpy(temp_file_name.get() + temp_dir.size(), name_template, + sizeof(name_template)); + const int fd = mkstemp(temp_file_name.get()); + if (fd == -1) return nullptr; + *file_name = temp_file_name.get(); + return fdopen(fd, io_mode); +#endif +} + +class TempOutFile { + public: + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } + ~TempOutFile() { + CloseFile(); + if (!file_name_.empty()) { + EXPECT_EQ(0, remove(file_name_.c_str())); + } + } + FILE *file() { return file_; } + const std::string &file_name() { return file_name_; } + + protected: + void CloseFile() { + if (file_) { + fclose(file_); + file_ = nullptr; + } + } + FILE *file_; + std::string file_name_; +}; + +// Abstract base class for test video sources, which provide a stream of +// vpx_image_t images with associated timestamps and duration. +class VideoSource { + public: + virtual ~VideoSource() {} + + // Prepare the stream for reading, rewind/open as necessary. + virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + // Get the current video frame, or nullptr on End-Of-Stream. + virtual vpx_image_t *img() const = 0; + + // Get the presentation timestamp of the current frame. + virtual vpx_codec_pts_t pts() const = 0; + + // Get the current frame's duration + virtual unsigned long duration() const = 0; + + // Get the timebase for the stream + virtual vpx_rational_t timebase() const = 0; + + // Get the current frame counter, starting at 0. + virtual unsigned int frame() const = 0; + + // Get the current file limit. + virtual unsigned int limit() const = 0; +}; + +class DummyVideoSource : public VideoSource { + public: + DummyVideoSource() + : img_(nullptr), limit_(100), width_(80), height_(64), + format_(VPX_IMG_FMT_I420) { + ReallocImage(); + } + + ~DummyVideoSource() override { vpx_img_free(img_); } + + void Begin() override { + frame_ = 0; + FillFrame(); + } + + void Next() override { + ++frame_; + FillFrame(); + } + + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_ : nullptr; + } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + vpx_codec_pts_t pts() const override { return frame_; } + + unsigned long duration() const override { return 1; } + + vpx_rational_t timebase() const override { + const vpx_rational_t t = { 1, 30 }; + return t; + } + + unsigned int frame() const override { return frame_; } + + unsigned int limit() const override { return limit_; } + + void set_limit(unsigned int limit) { limit_ = limit; } + + void SetSize(unsigned int width, unsigned int height) { + if (width != width_ || height != height_) { + width_ = width; + height_ = height; + ReallocImage(); + } + } + + void SetImageFormat(vpx_img_fmt_t format) { + if (format_ != format) { + format_ = format; + ReallocImage(); + } + } + + protected: + virtual void FillFrame() { + if (img_) memset(img_->img_data, 0, raw_sz_); + } + + void ReallocImage() { + vpx_img_free(img_); + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); + ASSERT_NE(img_, nullptr); + raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; + } + + vpx_image_t *img_; + size_t raw_sz_; + unsigned int limit_; + unsigned int frame_; + unsigned int width_; + unsigned int height_; + vpx_img_fmt_t format_; +}; + +class RandomVideoSource : public DummyVideoSource { + public: + RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) + : rnd_(seed), seed_(seed) {} + + protected: + // Reset the RNG to get a matching stream for the second pass + void Begin() override { + frame_ = 0; + rnd_.Reset(seed_); + FillFrame(); + } + + // 15 frames of noise, followed by 15 static frames. Reset to 0 rather + // than holding previous frames to encourage keyframes to be thrown. + void FillFrame() override { + if (img_) { + if (frame_ % 30 < 15) { + for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); + } else { + memset(img_->img_data, 0, raw_sz_); + } + } + } + + ACMRandom rnd_; + int seed_; +}; + +// Abstract base class for test video sources, which provide a stream of +// decompressed images to the decoder. +class CompressedVideoSource { + public: + virtual ~CompressedVideoSource() {} + + virtual void Init() = 0; + + // Prepare the stream for reading, rewind/open as necessary. + virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + virtual const uint8_t *cxdata() const = 0; + + virtual size_t frame_size() const = 0; + + virtual unsigned int frame_number() const = 0; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/vp8_boolcoder_test.cc b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc new file mode 100644 index 0000000000..c78b0b3b6c --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "vp8/decoder/dboolhuff.h" +#include "vp8/encoder/boolhuff.h" +#include "vpx/vpx_integer.h" + +namespace { +const int num_tests = 10; + +// In a real use the 'decrypt_state' parameter will be a pointer to a struct +// with whatever internal state the decryptor uses. For testing we'll just +// xor with a constant key, and decrypt_state will point to the start of +// the original buffer. +const uint8_t secret_key[16] = { + 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, + 0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0 +}; + +void encrypt_buffer(uint8_t *buffer, size_t size) { + for (size_t i = 0; i < size; ++i) { + buffer[i] ^= secret_key[i & 15]; + } +} + +void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, + int count) { + const size_t offset = input - reinterpret_cast(decrypt_state); + for (int i = 0; i < count; i++) { + output[i] = input[i] ^ secret_key[(offset + i) & 15]; + } +} + +} // namespace + +using libvpx_test::ACMRandom; + +TEST(VP8, TestBitIO) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int n = 0; n < num_tests; ++n) { + for (int method = 0; method <= 7; ++method) { // we generate various proba + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; + + for (int i = 0; i < kBitsToTest; ++i) { + const int parity = i & 1; + /* clang-format off */ + probas[i] = + (method == 0) ? 0 : (method == 1) ? 255 : + (method == 2) ? 128 : + (method == 3) ? rnd.Rand8() : + (method == 4) ? (parity ? 0 : 255) : + // alternate between low and high proba: + (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) : + (method == 6) ? + (parity ? rnd(64) : 255 - rnd(64)) : + (parity ? rnd(32) : 255 - rnd(32)); + /* clang-format on */ + } + for (int bit_method = 0; bit_method <= 3; ++bit_method) { + const int random_seed = 6432; + const int kBufferSize = 10000; + ACMRandom bit_rnd(random_seed); + BOOL_CODER bw; + uint8_t bw_buffer[kBufferSize]; + vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize); + + int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0; + for (int i = 0; i < kBitsToTest; ++i) { + if (bit_method == 2) { + bit = (i & 1); + } else if (bit_method == 3) { + bit = bit_rnd(2); + } + vp8_encode_bool(&bw, bit, static_cast(probas[i])); + } + + vp8_stop_encode(&bw); + // vp8dx_bool_decoder_fill() may read into uninitialized data that + // isn't used meaningfully, but may trigger an MSan warning. + memset(bw_buffer + bw.pos, 0, sizeof(VP8_BD_VALUE) - 1); + + BOOL_DECODER br; + encrypt_buffer(bw_buffer, kBufferSize); + vp8dx_start_decode(&br, bw_buffer, kBufferSize, test_decrypt_cb, + reinterpret_cast(bw_buffer)); + bit_rnd.Reset(random_seed); + for (int i = 0; i < kBitsToTest; ++i) { + if (bit_method == 2) { + bit = (i & 1); + } else if (bit_method == 3) { + bit = bit_rnd(2); + } + GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit) + << "pos: " << i << " / " << kBitsToTest + << " bit_method: " << bit_method << " method: " << method; + } + } + } + } +} diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc new file mode 100644 index 0000000000..aee27af66e --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" + +namespace { + +class DatarateTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} + + ~DatarateTestLarge() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + ResetModel(); + } + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + first_drop_ = 0; + bits_total_ = 0; + duration_ = 0.0; + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + gf_boost_ = 0; + use_roi_ = false; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); + } + + if (use_roi_) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + // TODO(jimbankoski): Remove these lines when the issue: + // http://code.google.com/p/webm/issues/detail?id=496 is fixed. + // For now the codec assumes buffer starts at starting buffer rate + // plus one frame's time. + if (last_pts_ == 0) duration = 1; + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + /* Test the buffer model here before subtracting the frame. Do so because + * the way the leaky bucket model works in libvpx is to allow the buffer to + * empty - and then stop showing frames until we've got enough bits to + * show one. As noted in comment below (issue 495), this does not currently + * apply to key frames. For now exclude key frames in condition below. */ + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (!key_frame) { + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + + const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Subtract from the buffer the bits associated with a played back frame. + bits_in_buffer_model_ -= frame_size_in_bits; + + // Update the running total of bits for end of test datarate checks. + bits_total_ += frame_size_in_bits; + + // If first drop not set and we have a drop set it to this time. + if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; + + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + + // We update this so that we can calculate the datarate minus the last + // frame encoded in the file. + bits_in_last_frame_ = frame_size_in_bits; + + ++frame_number_; + } + + void EndPassHook() override { + if (bits_total_) { + const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit + + duration_ = (last_pts_ + 1) * timebase_; + + // Effective file datarate includes the time spent prebuffering. + effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / + (cfg_.rc_buf_initial_sz / 1000.0 + duration_); + + file_datarate_ = file_size_in_kb / duration_; + } + } + + virtual void DenoiserLevelsTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + denoiser_on_ = j; + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void DenoiserOffOnTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + virtual void BasicBufferModelTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene. Ignoring datarate + // constraints + // TODO(jimbankoski): ( Fix when issue + // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + + // There is an issue for low bitrates in real-time mode, where the + // effective_datarate slightly overshoots the target bitrate. + // This is same the issue as noted about (#495). + // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), + // when the issue is resolved. + for (int i = 100; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void ChangingDropFrameThreshTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_max_quantizer = 36; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.kf_mode = VPX_KF_DISABLED; + + const int frame_count = 40; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, frame_count); + + // Here we check that the first dropped frame gets earlier and earlier + // as the drop frame threshold is increased. + + const int kDropFrameThreshTestStep = 30; + vpx_codec_pts_t last_drop = frame_count; + for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + } + } + + virtual void DropFramesMultiThreadsTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + vpx_codec_pts_t last_pts_; + int64_t bits_in_buffer_model_; + double timebase_; + int frame_number_; + vpx_codec_pts_t first_drop_; + int64_t bits_total_; + double duration_; + double file_datarate_; + double effective_datarate_; + int64_t bits_in_last_frame_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int set_cpu_used_; + int gf_boost_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestLarge, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestLarge, DenoiserOffOn) { DenoiserOffOnTest(); } +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestLarge, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestLarge, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +class DatarateTestRealTime : public DatarateTestLarge { + public: + ~DatarateTestRealTime() override = default; +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestRealTime, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestRealTime, DenoiserOffOn) {} +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestRealTime, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +TEST_P(DatarateTestRealTime, RegionOfInterest) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +TEST_P(DatarateTestRealTime, GFBoost) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Apply a gf boost. + gf_boost_ = 50; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +TEST_P(DatarateTestRealTime, NV12) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + ::libvpx_test::YUVVideoSource video("hantro_collage_w352h288_nv12.yuv", + VPX_IMG_FMT_NV12, 352, 288, 30, 1, 0, + 100); + + cfg_.rc_target_bitrate = 200; + ResetModel(); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +VP8_INSTANTIATE_TEST_SUITE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_SUITE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp8_decrypt_test.cc b/media/libvpx/libvpx/test/vp8_decrypt_test.cc new file mode 100644 index 0000000000..bcac9d1a82 --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_decrypt_test.cc @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/ivf_video_source.h" + +namespace { +// In a real use the 'decrypt_state' parameter will be a pointer to a struct +// with whatever internal state the decryptor uses. For testing we'll just +// xor with a constant key, and decrypt_state will point to the start of +// the original buffer. +const uint8_t test_key[16] = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, + 0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0 }; + +void encrypt_buffer(const uint8_t *src, uint8_t *dst, size_t size, + ptrdiff_t offset) { + for (size_t i = 0; i < size; ++i) { + dst[i] = src[i] ^ test_key[(offset + i) & 15]; + } +} + +void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, + int count) { + encrypt_buffer(input, output, count, + input - reinterpret_cast(decrypt_state)); +} + +} // namespace + +namespace libvpx_test { + +TEST(TestDecrypt, DecryptWorksVp8) { + libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf"); + video.Init(); + + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); + VP8Decoder decoder(dec_cfg, 0); + + video.Begin(); + + // no decryption + vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + + // decrypt frame + video.Next(); + + std::vector encrypted(video.frame_size()); + encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0); + vpx_decrypt_init di = { test_decrypt_cb, &encrypted[0] }; + decoder.Control(VPXD_SET_DECRYPTOR, &di); + + res = decoder.DecodeFrame(&encrypted[0], encrypted.size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); +} + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc new file mode 100644 index 0000000000..7fa867d8bb --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "vp8/encoder/denoising.h" +#include "vp8/common/reconinter.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumPixels = 16 * 16; +class VP8DenoiserTest : public ::testing::TestWithParam { + public: + ~VP8DenoiserTest() override = default; + + void SetUp() override { increase_denoising_ = GetParam(); } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + int increase_denoising_; +}; + +// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10. +#if defined(__GNUC__) && __GNUC__ >= 8 +TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) { +#else +TEST_P(VP8DenoiserTest, BitexactCheck) { +#endif + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 4000; + const int stride = 16; + + // Allocate the space for input and output, + // where sig_block_c/_sse2 is the block to be denoised, + // mc_avg_block is the denoised reference block, + // avg_block_c is the denoised result from C code, + // avg_block_sse2 is the denoised result from SSE2 code. + DECLARE_ALIGNED(16, uint8_t, sig_block_c[kNumPixels]); + // Since in VP8 denoiser, the source signal will be changed, + // we need another copy of the source signal as the input of sse2 code. + DECLARE_ALIGNED(16, uint8_t, sig_block_sse2[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]); + + for (int i = 0; i < count_test_block; ++i) { + // Generate random motion magnitude, 20% of which exceed the threshold. + const int motion_magnitude_ran = + rnd.Rand8() % static_cast(MOTION_MAGNITUDE_THRESHOLD * 1.2); + + // Initialize a test block with random number in range [0, 255]. + for (int j = 0; j < kNumPixels; ++j) { + int temp = 0; + sig_block_sse2[j] = sig_block_c[j] = rnd.Rand8(); + // The pixels in mc_avg_block are generated by adding a random + // number in range [-19, 19] to corresponding pixels in sig_block. + temp = + sig_block_c[j] + (rnd.Rand8() % 2 == 0 ? -1 : 1) * (rnd.Rand8() % 20); + // Clip. + mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp); + } + + // Test denosiser on Y component. + ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_c( + mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride, + motion_magnitude_ran, increase_denoising_)); + + ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_sse2( + mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride, + motion_magnitude_ran, increase_denoising_)); + + // Check bitexactness. + for (int h = 0; h < 16; ++h) { + for (int w = 0; w < 16; ++w) { + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + } + } + + // Test denoiser on UV component. + ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_c( + mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride, + motion_magnitude_ran, increase_denoising_)); + + ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_sse2( + mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride, + motion_magnitude_ran, increase_denoising_)); + + // Check bitexactness. + for (int h = 0; h < 16; ++h) { + for (int w = 0; w < 16; ++w) { + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + } + } + } +} + +// Test for all block size. +INSTANTIATE_TEST_SUITE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc new file mode 100644 index 0000000000..66d5c151c5 --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +namespace { + +typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride); + +const int cospi8sqrt2minus1 = 20091; +const int sinpi8sqrt2 = 35468; + +void reference_idct4x4(const int16_t *input, int16_t *output) { + const int16_t *ip = input; + int16_t *op = output; + + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[8]; + const int b1 = ip[0] - ip[8]; + const int temp1 = (ip[4] * sinpi8sqrt2) >> 16; + const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[12] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = a1 + d1; + op[12] = a1 - d1; + op[4] = b1 + c1; + op[8] = b1 - c1; + ++ip; + ++op; + } + ip = output; + op = output; + for (int i = 0; i < 4; ++i) { + const int a1 = ip[0] + ip[2]; + const int b1 = ip[0] - ip[2]; + const int temp1 = (ip[1] * sinpi8sqrt2) >> 16; + const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + const int c1 = temp1 - temp2; + const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + const int temp4 = (ip[3] * sinpi8sqrt2) >> 16; + const int d1 = temp3 + temp4; + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + ip += 4; + op += 4; + } +} + +using libvpx_test::ACMRandom; + +class FdctTest : public ::testing::TestWithParam { + public: + void SetUp() override { + fdct_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + FdctFunc fdct_func_; + ACMRandom rnd_; +}; + +TEST_P(FdctTest, SignBiasCheck) { + int16_t test_input_block[16]; + DECLARE_ALIGNED(16, int16_t, test_output_block[16]); + const int pitch = 8; + int count_sign_block[16][2]; + const int count_test_block = 1000000; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + fdct_func_(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + bool bias_acceptable = true; + for (int j = 0; j < 16; ++j) { + bias_acceptable = + bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); + } + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]"; + + memset(count_sign_block, 0, sizeof(count_sign_block)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-15, 15]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4); + } + + fdct_func_(test_input_block, test_output_block, pitch); + + for (int j = 0; j < 16; ++j) { + if (test_output_block[j] < 0) { + ++count_sign_block[j][0]; + } else if (test_output_block[j] > 0) { + ++count_sign_block[j][1]; + } + } + } + + bias_acceptable = true; + for (int j = 0; j < 16; ++j) { + bias_acceptable = + bias_acceptable && + (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); + } + + EXPECT_EQ(true, bias_acceptable) + << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; +} + +TEST_P(FdctTest, RoundTripErrorCheck) { + int max_error = 0; + double total_error = 0; + const int count_test_block = 1000000; + for (int i = 0; i < count_test_block; ++i) { + int16_t test_input_block[16]; + int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_temp_block[16]); + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 16; ++j) { + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const int pitch = 8; + fdct_func_(test_input_block, test_temp_block, pitch); + reference_idct4x4(test_temp_block, test_output_block); + + for (int j = 0; j < 16; ++j) { + const int diff = test_input_block[j] - test_output_block[j]; + const int error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + EXPECT_GE(1, max_error) + << "Error: FDCT/IDCT has an individual roundtrip error > 1"; + + EXPECT_GE(count_test_block, total_error) + << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; +} + +INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FdctTest, + ::testing::Values(vp8_short_fdct4x4_neon)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, FdctTest, + ::testing::Values(vp8_short_fdct4x4_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, FdctTest, + ::testing::Values(vp8_short_fdct4x4_msa)); +#endif // HAVE_MSA +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, FdctTest, + ::testing::Values(vp8_short_fdct4x4_mmi)); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, FdctTest, + ::testing::Values(vp8_short_fdct4x4_lsx)); +#endif // HAVE_LSX +} // namespace diff --git a/media/libvpx/libvpx/test/vp8_fragments_test.cc b/media/libvpx/libvpx/test/vp8_fragments_test.cc new file mode 100644 index 0000000000..01b4c2120e --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_fragments_test.cc @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/video_source.h" + +namespace { + +class VP8FragmentsTest : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} + ~VP8FragmentsTest() override = default; + + void SetUp() override { + const unsigned long init_flags = // NOLINT(runtime/int) + VPX_CODEC_USE_OUTPUT_PARTITION; + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_init_flags(init_flags); + } +}; + +TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) { + ::libvpx_test::RandomVideoSource video; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh new file mode 100755 index 0000000000..1e96f94cc7 --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh @@ -0,0 +1,87 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx vp8_multi_resolution_encoder example. To add new +## tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to vp8_mre_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +vp8_multi_resolution_encoder_verify_environment() { + if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + local app="vp8_multi_resolution_encoder" + if [ -z "$(vpx_tool_path "${app}")" ]; then + elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi + fi +} + +# Runs vp8_multi_resolution_encoder. Simply forwards all arguments to +# vp8_multi_resolution_encoder after building path to the executable. +vp8_mre() { + local encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull} +} + +vp8_multi_resolution_encoder_three_formats() { + local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local layer_bitrates="150 80 50" + local keyframe_insert="200" + local temporal_layers="3 3 3" + local framerate="30" + + if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then + if [ "$(vp8_encode_available)" = "yes" ]; then + # Param order: + # Input width + # Input height + # Framerate + # Input file path + # Output file names + # Layer bitrates + # Temporal layers + # Keyframe insert + # Output PSNR + vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" \ + "${framerate}" \ + "${YUV_RAW_INPUT}" \ + ${output_files} \ + ${layer_bitrates} \ + ${temporal_layers} \ + "${keyframe_insert}" \ + 0 || return 1 + + for output_file in ${output_files}; do + if [ ! -e "${output_file}" ]; then + elog "Missing output file: ${output_file}" + return 1 + fi + done + fi + fi +} + +vp8_mre_tests="vp8_multi_resolution_encoder_three_formats" +run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}" diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..50478f7635 --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vp8/vp8_ratectrl_rtc.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +struct Vp8RCTestVideo { + Vp8RCTestVideo() = default; + Vp8RCTestVideo(const char *name_, int width_, int height_, + unsigned int frames_) + : name(name_), width(width_), height(height_), frames(frames_) {} + + friend std::ostream &operator<<(std::ostream &os, + const Vp8RCTestVideo &video) { + os << video.name << " " << video.width << " " << video.height << " " + << video.frames; + return os; + } + const char *name; + int width; + int height; + unsigned int frames; +}; + +const Vp8RCTestVideo kVp8RCTestVectors[] = { + Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), + Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), + Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100), +}; + +class Vp8RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + Vp8RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} + ~Vp8RcInterfaceTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + // From error_resilience_test.cc + int SetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L, update L. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, update G. + frame_flags = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARG. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (rc_cfg_.ts_number_layers > 1) { + const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); + const int frame_flags = + SetFrameFlags(video->frame(), cfg_.ts_number_layers); + frame_params_.temporal_layer_id = layer_id; + if (video->frame() > 0) { + encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id); + encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags); + } + } else { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, -6); + encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + if (rc_cfg_.is_screen) { + encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1); + } + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + encoder_exit_ = video->frame() == test_video_.frames; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int qp; + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + } else { + num_drops_++; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerScreen() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunPeriodicKey() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + key_interval_ = 100; + frame_drop_thresh_ = 30; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers2TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfigTemporalLayers(2); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers3TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfigTemporalLayers(3); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + private: + void SetConfig() { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + } + + void SetConfigTemporalLayers(int temporal_layers) { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + if (temporal_layers == 2) { + rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (temporal_layers == 3) { + rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[2] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } + + rc_cfg_.ts_number_layers = temporal_layers; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = temporal_layers; + if (temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + } else if (temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_periodicity = 4; + cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate; + } + } + + std::unique_ptr rc_api_; + libvpx::VP8RateControlRtcConfig rc_cfg_; + int key_interval_; + int target_bitrate_; + Vp8RCTestVideo test_video_; + libvpx::VP8FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; +}; + +TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + +VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, + ::testing::Values(200, 400, 1000), + ::testing::ValuesIn(kVp8RCTestVectors)); + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_arf_freq_test.cc b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc new file mode 100644 index 0000000000..3882326d2f --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" +#include "vp9/encoder/vp9_ratectrl.h" + +namespace { + +const unsigned int kFrames = 100; +const int kBitrate = 500; + +#define ARF_NOT_SEEN 1000001 +#define ARF_SEEN_ONCE 1000000 + +typedef struct { + const char *filename; + unsigned int width; + unsigned int height; + unsigned int framerate_num; + unsigned int framerate_den; + unsigned int input_bit_depth; + vpx_img_fmt fmt; + vpx_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +typedef struct { + libvpx_test::TestMode mode; + int cpu_used; +} TestEncodeParam; + +const TestVideoParam kTestVectors[] = { + // artificially increase framerate to trigger default check + { "hantro_collage_w352h288.yuv", 352, 288, 5000, 1, 8, VPX_IMG_FMT_I420, + VPX_BITS_8, 0 }, + { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, VPX_IMG_FMT_I420, + VPX_BITS_8, 0 }, + { "rush_hour_444.y4m", 352, 288, 30, 1, 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 }, +#if CONFIG_VP9_HIGHBITDEPTH +// Add list of profile 2/3 test videos here ... +#endif // CONFIG_VP9_HIGHBITDEPTH +}; + +const TestEncodeParam kEncodeVectors[] = { + { ::libvpx_test::kOnePassGood, 2 }, { ::libvpx_test::kOnePassGood, 5 }, + { ::libvpx_test::kTwoPassGood, 1 }, { ::libvpx_test::kTwoPassGood, 2 }, + { ::libvpx_test::kTwoPassGood, 5 }, { ::libvpx_test::kRealTime, 5 }, +}; + +const int kMinArfVectors[] = { + // NOTE: 0 refers to the default built-in logic in: + // vp9_rc_get_default_min_gf_interval(...) + 0, 4, 8, 12, 15 +}; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) { + return 0; + } else { + return !strcmp(dot, ".y4m"); + } +} + +class ArfFreqTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith3Params { + protected: + ArfFreqTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} + + ~ArfFreqTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(test_encode_param_.mode); + if (test_encode_param_.mode != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + dec_cfg_.threads = 4; + } + + void BeginPassHook(unsigned int) override { + min_run_ = ARF_NOT_SEEN; + run_of_visible_frames_ = 0; + } + + int GetNumFramesInPkt(const vpx_codec_cx_pkt_t *pkt) { + const uint8_t *buffer = reinterpret_cast(pkt->data.frame.buf); + const uint8_t marker = buffer[pkt->data.frame.sz - 1]; + const int mag = ((marker >> 3) & 3) + 1; + int frames = (marker & 0x7) + 1; + const unsigned int index_sz = 2 + mag * frames; + // Check for superframe or not. + // Assume superframe has only one visible frame, the rest being + // invisible. If superframe index is not found, then there is only + // one frame. + if (!((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz && + buffer[pkt->data.frame.sz - index_sz] == marker)) { + frames = 1; + } + return frames; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; + const int frames = GetNumFramesInPkt(pkt); + if (frames == 1) { + run_of_visible_frames_++; + } else if (frames == 2) { + if (min_run_ == ARF_NOT_SEEN) { + min_run_ = ARF_SEEN_ONCE; + } else if (min_run_ == ARF_SEEN_ONCE || + run_of_visible_frames_ < min_run_) { + min_run_ = run_of_visible_frames_; + } + run_of_visible_frames_ = 1; + } else { + min_run_ = 0; + run_of_visible_frames_ = 1; + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 4); + encoder->Control(VP8E_SET_CPUUSED, test_encode_param_.cpu_used); + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_arf_requested_); + if (test_encode_param_.mode != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + } + + int GetMinVisibleRun() const { return min_run_; } + + int GetMinArfDistanceRequested() const { + if (min_arf_requested_) { + return min_arf_requested_; + } else { + return vp9_rc_get_default_min_gf_interval( + test_video_param_.width, test_video_param_.height, + (double)test_video_param_.framerate_num / + test_video_param_.framerate_den); + } + } + + TestVideoParam test_video_param_; + TestEncodeParam test_encode_param_; + + private: + int min_arf_requested_; + int min_run_; + int run_of_visible_frames_; +}; + +TEST_P(ArfFreqTest, MinArfFreqTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libvpx_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, + test_video_param_.width, test_video_param_.height, + test_video_param_.framerate_num, test_video_param_.framerate_den, 0, + kFrames)); + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const int min_run = GetMinVisibleRun(); + const int min_arf_dist_requested = GetMinArfDistanceRequested(); + if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) { + const int min_arf_dist = min_run + 1; + EXPECT_GE(min_arf_dist, min_arf_dist_requested); + } +} + +VP9_INSTANTIATE_TEST_SUITE(ArfFreqTest, ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kEncodeVectors), + ::testing::ValuesIn(kMinArfVectors)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc new file mode 100644 index 0000000000..0645341ac1 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; + +namespace { +const int kNumIterations = 1000; + +typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps); + +typedef std::tuple + BlockErrorParam; + +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz); + +template +int64_t BlockError8BitWrapper(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bps) { + EXPECT_EQ(bps, 8); + return fn(coeff, dqcoeff, block_size, ssz); +} + +class BlockErrorTest : public ::testing::TestWithParam { + public: + ~BlockErrorTest() override = default; + void SetUp() override { + error_block_op_ = GET_PARAM(0); + ref_error_block_op_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + vpx_bit_depth_t bit_depth_; + HBDBlockErrorFunc error_block_op_; + HBDBlockErrorFunc ref_error_block_op_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlockErrorTest); + +TEST_P(BlockErrorTest, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + int err_count_total = 0; + int first_failure = -1; + intptr_t block_size; + int64_t ssz; + int64_t ret; + int64_t ref_ssz; + int64_t ref_ret; + const int msb = bit_depth_ + 8 - 1; + for (int i = 0; i < kNumIterations; ++i) { + int err_count = 0; + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (int j = 0; j < block_size; j++) { + // coeff and dqcoeff will always have at least the same sign, and this + // can be used for optimization, so generate test input precisely. + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << msb); + dqcoeff[j] = rnd(1 << msb); + } else { + // Negative number + coeff[j] = -rnd(1 << msb); + dqcoeff[j] = -rnd(1 << msb); + } + } + ref_ret = + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + ASM_REGISTER_STATE_CHECK( + ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_)); + err_count += (ref_ret != ret) | (ref_ssz != ssz); + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Error Block Test, C output doesn't match optimized output. " + << "First failed at test case " << first_failure; +} + +TEST_P(BlockErrorTest, ExtremeValues) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + int err_count_total = 0; + int first_failure = -1; + intptr_t block_size; + int64_t ssz; + int64_t ret; + int64_t ref_ssz; + int64_t ref_ret; + const int msb = bit_depth_ + 8 - 1; + int max_val = ((1 << msb) - 1); + for (int i = 0; i < kNumIterations; ++i) { + int err_count = 0; + int k = (i / 9) % 9; + + // Change the maximum coeff value, to test different bit boundaries + if (k == 8 && (i % 9) == 0) { + max_val >>= 1; + } + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (int j = 0; j < block_size; j++) { + if (k < 4) { + // Test at positive maximum values + coeff[j] = k % 2 ? max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? max_val : 0; + } else if (k < 8) { + // Test at negative maximum values + coeff[j] = k % 2 ? -max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0; + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + ref_ret = + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + ASM_REGISTER_STATE_CHECK( + ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_)); + err_count += (ref_ret != ret) | (ref_ssz != ssz); + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Error Block Test, C output doesn't match optimized output. " + << "First failed at test case " << first_failure; +} + +using std::make_tuple; + +#if HAVE_SSE2 +const BlockErrorParam sse2_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, BlockErrorTest, + ::testing::ValuesIn(sse2_block_error_tests)); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +const BlockErrorParam neon_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, + ::testing::ValuesIn(neon_block_error_tests)); +#endif // HAVE_NEON +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_boolcoder_test.cc b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc new file mode 100644 index 0000000000..6ba171a000 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/bitreader.h" +#include "vpx_dsp/bitwriter.h" + +using libvpx_test::ACMRandom; + +namespace { +const int num_tests = 10; +} // namespace + +TEST(VP9, TestBitIO) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int n = 0; n < num_tests; ++n) { + for (int method = 0; method <= 7; ++method) { // we generate various proba + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; + + for (int i = 0; i < kBitsToTest; ++i) { + const int parity = i & 1; + /* clang-format off */ + probas[i] = + (method == 0) ? 0 : (method == 1) ? 255 : + (method == 2) ? 128 : + (method == 3) ? rnd.Rand8() : + (method == 4) ? (parity ? 0 : 255) : + // alternate between low and high proba: + (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) : + (method == 6) ? + (parity ? rnd(64) : 255 - rnd(64)) : + (parity ? rnd(32) : 255 - rnd(32)); + /* clang-format on */ + } + for (int bit_method = 0; bit_method <= 3; ++bit_method) { + const int random_seed = 6432; + const int kBufferSize = 10000; + ACMRandom bit_rnd(random_seed); + vpx_writer bw; + uint8_t bw_buffer[kBufferSize]; + vpx_start_encode(&bw, bw_buffer); + + int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0; + for (int i = 0; i < kBitsToTest; ++i) { + if (bit_method == 2) { + bit = (i & 1); + } else if (bit_method == 3) { + bit = bit_rnd(2); + } + vpx_write(&bw, bit, static_cast(probas[i])); + } + + vpx_stop_encode(&bw); + // vpx_reader_fill() may read into uninitialized data that + // isn't used meaningfully, but may trigger an MSan warning. + memset(bw_buffer + bw.pos, 0, sizeof(BD_VALUE) - 1); + + // First bit should be zero + GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0); + + vpx_reader br; + vpx_reader_init(&br, bw_buffer, kBufferSize, nullptr, nullptr); + bit_rnd.Reset(random_seed); + for (int i = 0; i < kBitsToTest; ++i) { + if (bit_method == 2) { + bit = (i & 1); + } else if (bit_method == 3) { + bit = bit_rnd(2); + } + GTEST_ASSERT_EQ(vpx_read(&br, probas[i]), bit) + << "pos: " << i << " / " << kBitsToTest + << " bit_method: " << bit_method << " method: " << method; + } + } + } + } +} diff --git a/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh new file mode 100755 index 0000000000..03843610dc --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,420 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) + +# Clips used in test. +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. + return 1 + fi + # This instruction_set is supported. + return 0 +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. +vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. +vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" + cd "${save_dir}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + cd "${save_dir}" + return 1 + fi + + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch + cd "${save_dir}" + return 1 + fi + fi + done + done + done + done + cd "${save_dir}" +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimization enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + if ! uname -m | grep -q "x86"; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then + echo "${isa} is not supported in this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch + return 1 + fi +} + +vp9_c_vs_simd_enc_test() { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + if ! vp9_test_x86 "x86_64"; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + # Test ARM + echo "vp9_test_arm: Started." + if ! vp9_test_arm; then + echo "vp9 test for arm: Done, test failed." + return 1 + else + echo "vp9 test for arm: Done, all tests passed." + fi +} + +# Setup a trap function to clean up build, and output files after tests complete. +# trap cleanup EXIT + +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test diff --git a/media/libvpx/libvpx/test/vp9_datarate_test.cc b/media/libvpx/libvpx/test/vp9_datarate_test.cc new file mode 100644 index 0000000000..4bc9099206 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_datarate_test.cc @@ -0,0 +1,1096 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +class DatarateTestVP9 : public ::libvpx_test::EncoderTest { + public: + explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec) { + tune_content_ = 0; + } + + protected: + ~DatarateTestVP9() override = default; + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + tot_frame_number_ = 0; + first_drop_ = 0; + num_drops_ = 0; + aq_mode_ = 3; + // Denoiser is off by default. + denoiser_on_ = 0; + // For testing up to 3 layers. + for (int i = 0; i < 3; ++i) { + bits_total_[i] = 0; + } + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; + delta_q_uv_ = 0; + use_roi_ = false; + } + + // + // Frame flags and layer id for temporal layers. + // + + // For two layers, test pattern is: + // 1 3 + // 0 2 ..... + // For three layers, test pattern is: + // 1 3 5 7 + // 2 6 + // 0 4 .... + // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. + // For this 3 layer example, the 2nd enhancement layer (layer 2) updates + // the altref frame. + static int GetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L and ARF; update L. + frame_flags = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, ARF; update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARF. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + static int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); + + if (use_roi_) { + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + encoder->Control(VP9E_SET_AQ_MODE, 0); + } + + if (delta_q_uv_ != 0) { + encoder->Control(VP9E_SET_DELTA_Q_UV, delta_q_uv_); + } + + if (cfg_.ts_number_layers > 1) { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_SVC, 1); + } + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } + } + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + if (duration > 1) { + // If first drop not set and we have a drop set it to this time. + if (!first_drop_) first_drop_ = last_pts_ + 1; + // Update the number of frame drops. + num_drops_ += static_cast(duration - 1); + // Update counter for total number of frames (#frames input to encoder). + // Needed for setting the proper layer_id below. + tot_frame_number_ += static_cast(duration - 1); + } + + int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + // Buffer should not go negative. + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + + const size_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Update the total encoded bits. For temporal layers, update the cumulative + // encoded bits per layer. + for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { + bits_total_[i] += frame_size_in_bits; + } + + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + ++frame_number_; + ++tot_frame_number_; + } + + void EndPassHook() override { + for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); + ++layer) { + duration_ = (last_pts_ + 1) * timebase_; + if (bits_total_[layer]) { + // Effective file datarate: + effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; + } + } + } + + vpx_codec_pts_t last_pts_; + double timebase_; + int tune_content_; + int frame_number_; // Counter for number of non-dropped/encoded frames. + int tot_frame_number_; // Counter for total number of input frames. + int64_t bits_total_[3]; + double duration_; + double effective_datarate_[3]; + int set_cpu_used_; + int64_t bits_in_buffer_model_; + vpx_codec_pts_t first_drop_; + int num_drops_; + int aq_mode_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int frame_parallel_decoding_mode_; + int delta_q_uv_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9RealTimeMultiBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Params: speed setting and index for bitrate array. +class DatarateTestVP9LargeVBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for VBR mode with 0 lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off( and error_resilience off). +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting444) { + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + const int bitrates[4] = { 250, 450, 650, 850 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 0.80) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 1.15) + << " The datarate for the file missed the target!" + << cfg_.rc_target_bitrate << " " << effective_datarate_; +} + +// Check that (1) the first dropped frame gets earlier and earlier +// as the drop frame threshold is increased, and (2) that the total number of +// frame drops does not decrease as we increase frame drop threshold. +// Use a lower qp-max to force some frame drops. +TEST_P(DatarateTestVP9RealTimeMultiBR, ChangingDropFrameThresh) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + // TODO(marpan): Investigate datarate target failures with a smaller keyframe + // interval (128). + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + const int kDropFrameThreshTestStep = 30; + const int bitrates[2] = { 50, 150 }; + const int bitrate_index = GET_PARAM(2); + if (bitrate_index > 1) return; + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + vpx_codec_pts_t last_drop = 140; + int last_num_drops = 0; + for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + ASSERT_GE(num_drops_, last_num_drops * 0.85) + << " The number of dropped frames for drop_thresh " << i + << " < number of dropped frames for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + last_num_drops = num_drops_; + } +} // namespace + +// Check basic rate targeting for 2 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting2TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 60-40 bitrate allocation for 2 temporal layers. + cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Check basic rate targeting for 3 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than .75. + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than 1.25. + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Params: speed setting. +class DatarateTestVP9RealTime : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} + ~DatarateTestVP9RealTime() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. +TEST_P(DatarateTestVP9RealTime, BasicRateTargetingDropFramesMultiThreads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for 3 temporal layers, with frame dropping. +// Only for one (low) bitrate with lower max_quantizer, and somewhat higher +// frame drop threshold, to force frame dropping. +TEST_P(DatarateTestVP9RealTime, + BasicRateTargeting3TemporalLayersFrameDropping) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + // Set frame drop threshold and rc_max_quantizer to force some frame drops. + cfg_.rc_dropframe_thresh = 20; + cfg_.rc_max_quantizer = 45; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.20) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + // Expect some frame drops in this test: for this 200 frames test, + // expect at least 10% and not more than 60% drops. + ASSERT_GE(num_drops_, 20); + ASSERT_LE(num_drops_, 280); + } +} + +// Check VP9 region of interest feature. +TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { + if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 640; + cfg_.g_h = 480; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 7) / 8; + roi_.cols = (cfg_.g_w + 7) / 8; + + roi_.delta_q[1] = -20; + roi_.delta_lf[1] = -20; + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = reinterpret_cast( + calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); + ASSERT_NE(roi_.roi_map, nullptr); + + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +// Params: speed setting, delta q UV. +class DatarateTestVP9RealTimeDeltaQUV + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +TEST_P(DatarateTestVP9RealTimeDeltaQUV, DeltaQUV) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 640; + cfg_.g_h = 480; + + ResetModel(); + + delta_q_uv_ = GET_PARAM(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; +} + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9PostEncodeDrop + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. +TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + cfg_.g_error_resilient = 0; + tune_content_ = 1; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +using libvpx_test::ACMRandom; + +class DatarateTestVP9FrameQp + : public DatarateTestVP9, + public ::testing::TestWithParam { + public: + DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} + ~DatarateTestVP9FrameQp() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + ResetModel(); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + set_cpu_used_ = 7; + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + frame_qp_ = static_cast(rnd_.RandRange(64)); + encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); + frame_++; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + int qp = 0; + vpx_svc_layer_id_t layer_id; + if (frame_ >= total_frame_) return; + encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); + ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); + } + + protected: + int total_frame_; + + private: + ACMRandom rnd_; + int frame_qp_; + int frame_; + int temporal_layer_id_; +}; + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + cfg_.rc_target_bitrate = 200; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting. +class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { + public: + ~DatarateTestVP9RealTimeDenoiser() override = default; +}; + +// Check basic datarate targeting, for a single bitrate, when denoiser is on. +TEST_P(DatarateTestVP9RealTimeDenoiser, LowNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for clip with high noise level. Use 2 threads. +TEST_P(DatarateTestVP9RealTimeDenoiser, HighNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; + + ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. +TEST_P(DatarateTestVP9RealTimeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} +#endif // CONFIG_VP9_TEMPORAL_DENOISING + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 4)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), + ::testing::Range(0, 2)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); + +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DatarateTestVP9FrameQp, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, + ::testing::Range(5, 10), + ::testing::Values(-5, -10, -15)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9PostEncodeDrop, + ::testing::Range(5, 6)); + +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDenoiser, + ::testing::Range(5, 10)); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_decrypt_test.cc b/media/libvpx/libvpx/test/vp9_decrypt_test.cc new file mode 100644 index 0000000000..1874d23117 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_decrypt_test.cc @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/ivf_video_source.h" + +namespace { +// In a real use the 'decrypt_state' parameter will be a pointer to a struct +// with whatever internal state the decryptor uses. For testing we'll just +// xor with a constant key, and decrypt_state will point to the start of +// the original buffer. +const uint8_t test_key[16] = { 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, + 0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0 }; + +void encrypt_buffer(const uint8_t *src, uint8_t *dst, size_t size, + ptrdiff_t offset) { + for (size_t i = 0; i < size; ++i) { + dst[i] = src[i] ^ test_key[(offset + i) & 15]; + } +} + +void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, + int count) { + encrypt_buffer(input, output, count, + input - reinterpret_cast(decrypt_state)); +} + +} // namespace + +namespace libvpx_test { + +TEST(TestDecrypt, DecryptWorksVp9) { + libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf"); + video.Init(); + + vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); + VP9Decoder decoder(dec_cfg, 0); + + video.Begin(); + + // no decryption + vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + + // decrypt frame + video.Next(); + + std::vector encrypted(video.frame_size()); + encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0); + vpx_decrypt_init di = { test_decrypt_cb, &encrypted[0] }; + decoder.Control(VPXD_SET_DECRYPTOR, &di); + + res = decoder.DecodeFrame(&encrypted[0], encrypted.size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); +} + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vp9_denoiser_test.cc b/media/libvpx/libvpx/test/vp9_denoiser_test.cc new file mode 100644 index 0000000000..831f83305c --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_denoiser_test.cc @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" + +using libvpx_test::ACMRandom; + +namespace { + +const int kNumPixels = 64 * 64; + +typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude); +typedef std::tuple VP9DenoiserTestParam; + +class VP9DenoiserTest + : public ::testing::Test, + public ::testing::WithParamInterface { + public: + ~VP9DenoiserTest() override = default; + + void SetUp() override { bs_ = GET_PARAM(1); } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + BLOCK_SIZE bs_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9DenoiserTest); + +TEST_P(VP9DenoiserTest, BitexactCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 4000; + + // Allocate the space for input and output, + // where sig_block is the block to be denoised, + // mc_avg_block is the denoised reference block, + // avg_block_c is the denoised result from C code, + // avg_block_sse2 is the denoised result from SSE2 code. + DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]); + DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]); + + for (int i = 0; i < count_test_block; ++i) { + // Generate random motion magnitude, 20% of which exceed the threshold. + const int motion_magnitude_random = + rnd.Rand8() % static_cast(MOTION_MAGNITUDE_THRESHOLD * 1.2); + + // Initialize a test block with random number in range [0, 255]. + for (int j = 0; j < kNumPixels; ++j) { + int temp = 0; + sig_block[j] = rnd.Rand8(); + // The pixels in mc_avg_block are generated by adding a random + // number in range [-19, 19] to corresponding pixels in sig_block. + temp = + sig_block[j] + ((rnd.Rand8() % 2 == 0) ? -1 : 1) * (rnd.Rand8() % 20); + // Clip. + mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp); + } + + ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(sig_block, 64, mc_avg_block, + 64, avg_block_c, 64, 0, bs_, + motion_magnitude_random)); + + ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64, + avg_block_sse2, 64, 0, bs_, + motion_magnitude_random)); + + // Test bitexactness. + for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) { + for (int w = 0; w < (4 << b_width_log2_lookup[bs_]); ++w) { + EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]); + } + } + } +} + +using std::make_tuple; + +// Test for all block size. +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64))); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc new file mode 100644 index 0000000000..0e182c76db --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/vp9_dx_iface.h" + +namespace { + +const int kCpuUsed = 2; + +struct EncodePerfTestVideo { + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + { "niklas_1280_720_30.y4m", 1280, 720, 600, 10 }, +}; + +struct EncodeParameters { + int32_t tile_rows; + int32_t tile_cols; + int32_t lossless; + int32_t error_resilient; + int32_t frame_parallel; + vpx_color_range_t color_range; + vpx_color_space_t cs; + int render_size[2]; + // TODO(JBB): quantizers / bitrate +}; + +const EncodeParameters kVP9EncodeParameterSet[] = { + { 0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601, { 0, 0 } }, + { 0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709, { 0, 0 } }, + { 0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020, { 0, 0 } }, + { 0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 } }, + // TODO(JBB): Test profiles (requires more work). +}; + +class VpxEncoderParmsGetToDecoder + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + VpxEncoderParmsGetToDecoder() + : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} + + ~VpxEncoderParmsGetToDecoder() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + cfg_.g_lag_in_frames = 25; + cfg_.g_error_resilient = encode_parms.error_resilient; + dec_cfg_.threads = 4; + test_video_ = GET_PARAM(2); + cfg_.rc_target_bitrate = test_video_.bitrate; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); + encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); + encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + encode_parms.frame_parallel); + encoder->Control(VP9E_SET_TILE_ROWS, encode_parms.tile_rows); + encoder->Control(VP9E_SET_TILE_COLUMNS, encode_parms.tile_cols); + encoder->Control(VP8E_SET_CPUUSED, kCpuUsed); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + encoder->Control(VP9E_SET_RENDER_SIZE, encode_parms.render_size); + } + } + } + + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { + vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); + vpx_codec_alg_priv_t *const priv = + reinterpret_cast(vp9_decoder->priv); + VP9_COMMON *const common = &priv->pbi->common; + + if (encode_parms.lossless) { + EXPECT_EQ(0, common->base_qindex); + EXPECT_EQ(0, common->y_dc_delta_q); + EXPECT_EQ(0, common->uv_dc_delta_q); + EXPECT_EQ(0, common->uv_ac_delta_q); + EXPECT_EQ(ONLY_4X4, common->tx_mode); + } + EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode); + if (encode_parms.error_resilient) { + EXPECT_EQ(1, common->frame_parallel_decoding_mode); + EXPECT_EQ(0, common->use_prev_frame_mvs); + } else { + EXPECT_EQ(encode_parms.frame_parallel, + common->frame_parallel_decoding_mode); + } + EXPECT_EQ(encode_parms.color_range, common->color_range); + EXPECT_EQ(encode_parms.cs, common->color_space); + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + EXPECT_EQ(encode_parms.render_size[0], common->render_width); + EXPECT_EQ(encode_parms.render_size[1], common->render_height); + } + EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols); + EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows); + + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + return VPX_CODEC_OK == res_dec; + } + + EncodePerfTestVideo test_video_; + + private: + EncodeParameters encode_parms; +}; + +TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) { + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr video( + new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames)); + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +VP9_INSTANTIATE_TEST_SUITE(VpxEncoderParmsGetToDecoder, + ::testing::ValuesIn(kVP9EncodeParameterSet), + ::testing::ValuesIn(kVP9EncodePerfTestVectors)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_end_to_end_test.cc b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc new file mode 100644 index 0000000000..79be4ee146 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "memory" + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kWidth = 160; +const unsigned int kHeight = 90; +const unsigned int kFramerate = 50; +const unsigned int kFrames = 20; +const int kBitrate = 500; +// List of psnr thresholds for speed settings 0-7 and 5 encoding modes +const double kPsnrThreshold[][5] = { + { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 }, + { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 }, + { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 }, + { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, +}; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + vpx_img_fmt fmt; + vpx_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0 }, + { "park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, VPX_BITS_8, 1 }, + { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 }, + { "park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1 }, +#if CONFIG_VP9_HIGHBITDEPTH + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 }, + { "park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3 }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 }, + { "park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3 }, +#endif // CONFIG_VP9_HIGHBITDEPTH +}; + +const TestVideoParam kTestVectorsNv12[] = { + { "hantro_collage_w352h288_nv12.yuv", 8, VPX_IMG_FMT_NV12, VPX_BITS_8, 0 }, +}; + +// Encoding modes tested +const libvpx_test::TestMode kEncodingModeVectors[] = { + ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime +}; + +// Speed settings tested +const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6, 7 }; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) { + return 0; + } else { + return !strcmp(dot, ".y4m"); + } +} + +class EndToEndTestAdaptiveRDThresh + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + EndToEndTestAdaptiveRDThresh() + : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), + cpu_used_end_(GET_PARAM(2)) {} + + ~EndToEndTestAdaptiveRDThresh() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + dec_cfg_.threads = 4; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 2); + } + if (video->frame() == 100) + encoder->Control(VP8E_SET_CPUUSED, cpu_used_end_); + } + + private: + int cpu_used_start_; + int cpu_used_end_; +}; + +class EndToEndTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith3Params { + protected: + EndToEndTestLarge() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)), + cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0), + encoding_mode_(GET_PARAM(1)) { + cyclic_refresh_ = 0; + denoiser_on_ = 0; + } + + ~EndToEndTestLarge() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 5; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + dec_cfg_.threads = 4; + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 4); + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } else { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_AQ_MODE, cyclic_refresh_); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold[cpu_used_][encoding_mode_]; + } + + TestVideoParam test_video_param_; + int cpu_used_; + int cyclic_refresh_; + int denoiser_on_; + + private: + double psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; +}; + +#if CONFIG_VP9_DECODER +// The test parameters control VP9D_SET_LOOP_FILTER_OPT and the number of +// decoder threads. +class EndToEndTestLoopFilterThreading + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + EndToEndTestLoopFilterThreading() + : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} + + ~EndToEndTestLoopFilterThreading() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_threads = 2; + cfg_.g_lag_in_frames = 0; + cfg_.rc_target_bitrate = 500; + cfg_.rc_end_usage = VPX_CBR; + cfg_.kf_min_dist = 1; + cfg_.kf_max_dist = 1; + dec_cfg_.threads = GET_PARAM(2); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 8); + } + encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); + } + + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { + if (video->frame() == 0) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0); + } + } + + private: + const bool use_loop_filter_opt_; +}; +#endif // CONFIG_VP9_DECODER + +class EndToEndNV12 : public EndToEndTestLarge {}; + +TEST_P(EndToEndNV12, EndtoEndNV12Test) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + + video.reset(new libvpx_test::YUVVideoSource(test_video_param_.filename, + test_video_param_.fmt, 352, 288, + 30, 1, 0, 100)); + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libvpx_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, + kFramerate, 1, 0, kFrames)); + } + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()); +} + +TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + cyclic_refresh_ = 3; + denoiser_on_ = 1; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libvpx_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, + kFramerate, 1, 0, kFrames)); + } + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()); +} + +TEST_P(EndToEndTestAdaptiveRDThresh, EndtoEndAdaptiveRDThreshRowMT) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_threads = 2; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9_DECODER +TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(4096, 2160); + video.set_limit(10); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#endif // CONFIG_VP9_DECODER + +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); + +VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::ValuesIn(kTestVectorsNv12), + ::testing::Values(6, 7, 8)); + +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh, + ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); + +#if CONFIG_VP9_DECODER +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLoopFilterThreading, ::testing::Bool(), + ::testing::Range(2, 6)); +#endif // CONFIG_VP9_DECODER +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_ethread_test.cc b/media/libvpx/libvpx/test/vp9_ethread_test.cc new file mode 100644 index 0000000000..c8d3cba7fb --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_ethread_test.cc @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/encoder/vp9_firstpass.h" + +namespace { +// FIRSTPASS_STATS struct: +// { +// 26 double members; +// 1 int64_t member; +// } +// Whenever FIRSTPASS_STATS struct is modified, the following constants need to +// be revisited. +const int kDbl = 26; +const int kInt = 1; +const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); + +class VPxFirstPassEncoderThreadTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + VPxFirstPassEncoderThreadTest() + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0), + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + init_flags_ = VPX_CODEC_USE_PSNR; + + row_mt_mode_ = 1; + first_pass_only_ = true; + firstpass_stats_.buf = nullptr; + firstpass_stats_.sz = 0; + } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + void BeginPassHook(unsigned int /*pass*/) override { + encoder_initialized_ = false; + abort_ = false; + } + + void EndPassHook() override { + // For first pass stats test, only run first pass encoder. + if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) + abort_ |= first_pass_only_; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { + if (!encoder_initialized_) { + // Encode in 2-pass mode. + encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); + + if (encoding_mode_ == ::libvpx_test::kTwoPassGood) + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + + encoder_initialized_ = true; + } + } + + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { + const uint8_t *const pkt_buf = + reinterpret_cast(pkt->data.twopass_stats.buf); + const size_t pkt_size = pkt->data.twopass_stats.sz; + + // First pass stats size equals sizeof(FIRSTPASS_STATS) + EXPECT_EQ(pkt_size, kFirstPassStatsSz) + << "Error: First pass stats size doesn't equal kFirstPassStatsSz"; + + firstpass_stats_.buf = + realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + ASSERT_NE(firstpass_stats_.buf, nullptr); + memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, + pkt_size); + firstpass_stats_.sz += pkt_size; + } + + bool encoder_initialized_; + int tiles_; + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + int row_mt_mode_; + bool first_pass_only_; + vpx_fixed_buf_t firstpass_stats_; +}; + +static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match or at least are very close. + FIRSTPASS_STATS *stats1 = reinterpret_cast(fp_stats->buf); + int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS)); + FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2; + int i, j; + + // The total stats are also output and included in the first pass stats. Here + // ignore that in the comparison. + for (i = 0; i < (nframes_ / 2 - 1); ++i) { + const double *frame_stats1 = reinterpret_cast(stats1); + const double *frame_stats2 = reinterpret_cast(stats2); + + for (j = 0; j < kDbl; ++j) { + ASSERT_LE(fabs(*frame_stats1 - *frame_stats2), + fabs(*frame_stats1) / factor) + << "First failure @ frame #" << i << " stat #" << j << " (" + << *frame_stats1 << " vs. " << *frame_stats2 << ")"; + frame_stats1++; + frame_stats2++; + } + + stats1++; + stats2++; + } + + // Reset firstpass_stats_ to 0. + memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz); + fp_stats->sz = 0; +} + +static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match. + uint8_t *stats1 = reinterpret_cast(fp_stats->buf); + uint8_t *stats2 = stats1 + fp_stats->sz / 2; + ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1; + + md5_row_mt_0.Add(stats1, fp_stats->sz / 2); + const char *md5_row_mt_0_str = md5_row_mt_0.Get(); + + md5_row_mt_1.Add(stats2, fp_stats->sz / 2); + const char *md5_row_mt_1_str = md5_row_mt_1.Get(); + + // Check md5 match. + ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str) + << "MD5 checksums don't match"; + + // Reset firstpass_stats_ to 0. + memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz); + fp_stats->sz = 0; +} + +TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + first_pass_only_ = true; + cfg_.rc_target_bitrate = 1000; + + // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0) + tiles_ = 0; + cfg_.g_threads = 1; + + row_mt_mode_ = 0; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + row_mt_mode_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if using or not using row-mt generates close stats. + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); + + // Test single thread vs multiple threads + row_mt_mode_ = 1; + tiles_ = 0; + + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + cfg_.g_threads = 4; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if single-thread and multi-thread stats are close enough. + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); + + // Bit exact test in row_mt mode. + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + tiles_ = 2; + + cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + cfg_.g_threads = 8; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if stats match with row-mt=0/1. + compare_fp_stats_md5(&firstpass_stats_); +} + +class VPxEncoderThreadTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith4Params { + protected: + VPxEncoderThreadTest() + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), + tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)), + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + init_flags_ = VPX_CODEC_USE_PSNR; + md5_.clear(); + row_mt_mode_ = 1; + psnr_ = 0.0; + nframes_ = 0; + } + ~VPxEncoderThreadTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 1; + } + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + void BeginPassHook(unsigned int /*pass*/) override { + encoder_initialized_ = false; + psnr_ = 0.0; + nframes_ = 0; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { + if (!encoder_initialized_) { + // Encode 4 column tiles. + encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); + } else { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0); + encoder->Control(VP9E_SET_AQ_MODE, 3); + } + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + + encoder_initialized_ = true; + } + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + md5_.push_back(md5_res.Get()); + } + + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { + if (res != VPX_CODEC_OK) { + EXPECT_EQ(VPX_CODEC_OK, res); + return false; + } + + return true; + } + + double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; } + + bool encoder_initialized_; + int tiles_; + int threads_; + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + int row_mt_mode_; + double psnr_; + unsigned int nframes_; + std::vector md5_; +}; + +TEST_P(VPxEncoderThreadTest, EncoderResultTest) { + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20); + cfg_.rc_target_bitrate = 1000; + + // Part 1: Bit exact test for row_mt_mode_ = 0. + // This part keeps original unit tests done before row-mt code is checked in. + row_mt_mode_ = 0; + + // Encode using single thread. + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector single_thr_md5 = md5_; + md5_.clear(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector multi_thr_md5 = md5_; + md5_.clear(); + + // Compare to check if two vectors are equal. + ASSERT_EQ(single_thr_md5, multi_thr_md5); + + // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test. + row_mt_mode_ = 1; + + // Encode using single thread + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + std::vector row_mt_single_thr_md5 = md5_; + md5_.clear(); + + ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5); + + // Part 3: Bit exact test with row-mt on + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + row_mt_single_thr_md5.clear(); + + // Encode using 2 threads. + cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + row_mt_single_thr_md5 = md5_; + md5_.clear(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector row_mt_multi_thr_md5 = md5_; + md5_.clear(); + + // Compare to check if two vectors are equal. + ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5); + + // Part 4: PSNR test with bit_match_mode_ = 0 + row_mt_mode_ = 1; + + // Encode using single thread. + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double single_thr_psnr = GetAveragePsnr(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double multi_thr_psnr = GetAveragePsnr(); + + EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2); +} + +INSTANTIATE_TEST_SUITE_P( + VP9, VPxFirstPassEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 4))); // cpu_used + +// Split this into two instantiations so that we can distinguish +// between very slow runs ( ie cpu_speed 0 ) vs ones that can be +// run nightly by adding Large to the title. +INSTANTIATE_TEST_SUITE_P( + VP9, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(3, 10), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + +INSTANTIATE_TEST_SUITE_P( + VP9Large, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(0, 3), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc new file mode 100644 index 0000000000..33fa05c65c --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc @@ -0,0 +1,979 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vp9/simple_encode.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" +#include "vpx_dsp/vpx_dsp_common.h" + +namespace { + +constexpr int kModelMagicNumber = 51396; +constexpr uintptr_t PrivMagicNumber = 5566; +constexpr int kFrameNum = 5; +constexpr int kFrameNumGOP = 30; +constexpr int kFrameNumGOPShort = 4; +constexpr int kLosslessCodingIndex = 2; +constexpr int kFixedGOPSize = 9; +// The range check in vp9_cx_iface.c shows that the max +// lag in buffer is MAX_LAG_BUFFERS (25): +// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); +constexpr int kMaxLagInFrames = 25; +constexpr int kDefaultMinGfInterval = 4; +constexpr int kDefaultMaxGfInterval = 16; +// The active gf interval might change for each GOP +// See function "get_active_gf_inverval_range". +// The numbers below are from manual inspection. +constexpr int kReadMinGfInterval = 5; +constexpr int kReadMaxGfInterval = 13; +const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; +const double kPsnrThreshold = 30.4; + +struct ToyRateCtrl { + int magic_number; + int coding_index; + + int gop_global_index; + int frames_since_key; + int show_index; +}; + +vpx_rc_status_t rc_create_model(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->coding_index = -1; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_create_model_gop(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 640); + EXPECT_EQ(ratectrl_config->frame_height, 360); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_create_model_gop_short( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_send_firstpass_stats( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNum); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_send_firstpass_stats_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_send_firstpass_stats_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, + const VpxTplGopStats *tpl_gop_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + toy_rate_ctrl->coding_index += 1; + + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + EXPECT_LT(encode_frame_info->show_index, kFrameNum); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 4); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + } else if (encode_frame_info->coding_index >= 2 && + encode_frame_info->coding_index < 5) { + // In the first group of pictures, coding_index and gop_index are equal. + EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } else if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->show_index, 4); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 1); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 1); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 4); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2], + 1); // kRefFrameTypeFuture + } + if (encode_frame_info->coding_index == kLosslessCodingIndex) { + // We should get sse == 0 at rc_update_encodeframe_result() + frame_decision->q_index = 0; + } else { + frame_decision->q_index = 100; + } + frame_decision->max_frame_size = 0; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } else if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } else if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 4) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = + VPXMIN(kFixedGOPSize, gop_info->frames_to_key); + gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 3 coding frames, no alt ref. +// The second GOP has 1 coding frame, no alt ref. +vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 4 coding frames. Use alt ref. +// The second GOP only contains the overlay frame of the first GOP's alt ref +// frame. +vpx_rc_status_t rc_get_gop_decision_short_overlay( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 1 GOP. +// The GOP has 4 coding frames. Do not use alt ref. +vpx_rc_status_t rc_get_gop_decision_short_no_arf( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_update_encodeframe_result( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { + EXPECT_EQ(encode_frame_result->sse, 0); + } + if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { + EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0); + } else { + EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_update_encodeframe_result_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 640 * 360 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_update_encodeframe_result_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_default_frame_rdmult( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + *rdmult = VPX_DEFAULT_RDMULT; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { + ToyRateCtrl *toy_rate_ctrl = static_cast(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + delete toy_rate_ctrl; + return VPX_RC_OK; +} + +class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_QP; + rc_funcs.create_model = rc_create_model; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTest, EncodeTest) { + cfg_.rc_target_bitrate = 24000; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, + kFrameNum)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOP() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; + rc_funcs.get_gop_decision = rc_get_gop_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOP, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = kMaxLagInFrames; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, + kFrameNumGOP)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShort() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortOverlay + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortOverlay() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_overlay; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortNoARF + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortNoARF() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_no_arf; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestRdmult() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + private: + double psnr_; + unsigned int nframes_; +}; + +TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, kPsnrThreshold); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_intrapred_test.cc b/media/libvpx/libvpx/test/vp9_intrapred_test.cc new file mode 100644 index 0000000000..c69d43efbc --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_intrapred_test.cc @@ -0,0 +1,1207 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_pred_common.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +using libvpx_test::ACMRandom; + +const int count_test_block = 100000; + +typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +struct IntraPredParam { + IntraPredParam(IntraPredFunc pred = nullptr, IntraPredFunc ref = nullptr, + int block_size_value = 0, int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + IntraPredFunc pred_fn; + IntraPredFunc ref_fn; + int block_size; + int bit_depth; +}; + +template +class IntraPredTest : public ::testing::TestWithParam { + public: + void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int block_size = params_.block_size; + above_row_ = above_data + 16; + left_col_ = left_col; + dst_ = dst; + ref_dst_ = ref_dst; + int error_count = 0; + for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element, however relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values. + for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else + // Fill edges with random data, try first with saturated values. + for (int x = -1; x < block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } + for (int x = block_size; x < 2 * block_size; x++) { + above_row_[x] = above_row_[block_size - 1]; + } +#endif + for (int y = 0; y < block_size; y++) { + if (i == 0) { + left_col_[y] = mask_; + } else { + left_col_[y] = rnd.Rand16() & mask_; + } + } + Predict(); + CheckPrediction(i, &error_count); + } + ASSERT_EQ(0, error_count); + } + + protected: + void SetUp() override { + params_ = this->GetParam(); + stride_ = params_.block_size * 3; + mask_ = (1 << params_.bit_depth) - 1; + } + + void Predict(); + + void CheckPrediction(int test_case_number, int *error_count) const { + // For each pixel ensure that the calculated value is the same as reference. + const int block_size = params_.block_size; + for (int y = 0; y < block_size; y++) { + for (int x = 0; x < block_size; x++) { + *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_]; + if (*error_count == 1) { + ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_]) + << " Failed on Test Case Number " << test_case_number; + } + } + } + } + + Pixel *above_row_; + Pixel *left_col_; + Pixel *dst_; + Pixel *ref_dst_; + ptrdiff_t stride_; + int mask_; + + PredParam params_; +}; + +template <> +void IntraPredTest::Predict() { + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_)); +} + +typedef IntraPredTest VP9IntraPredTest; + +TEST_P(VP9IntraPredTest, IntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} + +// Instantiate a token test to avoid -Wuninitialized warnings when none of the +// other tests are enabled. +INSTANTIATE_TEST_SUITE_P( + C, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c, + &vpx_d45_predictor_4x4_c, 4, 8))); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_sse2, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_sse2, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_sse2, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_sse2, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_sse2, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_sse2, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_sse2, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_sse2, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_sse2, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_sse2, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_sse2, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_sse2, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_ssse3, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_ssse3, + &vpx_d63_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d63_predictor_8x8_ssse3, + &vpx_d63_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d63_predictor_16x16_ssse3, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_ssse3, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_ssse3, + &vpx_d153_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_ssse3, + &vpx_d153_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_ssse3, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_ssse3, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_8x8_ssse3, + &vpx_d207_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_ssse3, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_ssse3, + &vpx_d207_predictor_32x32_c, 32, 8))); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_neon, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_neon, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d135_predictor_16x16_neon, + &vpx_d135_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d135_predictor_32x32_neon, + &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_neon, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_neon, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_neon, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_neon, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_neon, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_neon, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_neon, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_neon, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_neon, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_neon, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_neon, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_neon, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_NEON + +#if HAVE_DSPR2 +INSTANTIATE_TEST_SUITE_P( + DSPR2, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2, + &vpx_dc_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_predictor_8x8_dspr2, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_dspr2, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_4x4_dspr2, + &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_dspr2, + &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_dspr2, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_4x4_dspr2, + &vpx_tm_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_tm_predictor_8x8_dspr2, + &vpx_tm_predictor_8x8_c, 8, 8))); +#endif // HAVE_DSPR2 + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_dc_128_predictor_4x4_msa, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_msa, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_msa, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_msa, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_msa, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_msa, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_msa, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_msa, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_msa, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_msa, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_msa, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_msa, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_msa, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_msa, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_msa, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_msa, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_msa, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_msa, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_msa, &vpx_h_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_h_predictor_32x32_msa, &vpx_h_predictor_32x32_c, 32, + 8), + IntraPredParam(&vpx_tm_predictor_4x4_msa, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_msa, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_msa, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_msa, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_msa, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_msa, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_msa, &vpx_v_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_v_predictor_32x32_msa, &vpx_v_predictor_32x32_c, 32, + 8))); +#endif // HAVE_MSA + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 + IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8, + 8), +#endif + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_vsx, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_16x16_vsx, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_vsx, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_16x16_vsx, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_vsx, + &vpx_dc_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_16x16_vsx, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_vsx, + &vpx_h_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_tm_predictor_16x16_vsx, + &vpx_tm_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_vsx, + &vpx_tm_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_v_predictor_16x16_vsx, + &vpx_v_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_vsx, + &vpx_v_predictor_32x32_c, 32, 8))); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); +struct HighbdIntraPredParam { + HighbdIntraPredParam(HighbdIntraPred pred = nullptr, + HighbdIntraPred ref = nullptr, int block_size_value = 0, + int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + HighbdIntraPred pred_fn; + HighbdIntraPred ref_fn; + int block_size; + int bit_depth; +}; + +#if HAVE_SSSE3 || HAVE_NEON || HAVE_SSE2 +template <> +void IntraPredTest::Predict() { + const int bit_depth = params_.bit_depth; + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); +} + +typedef IntraPredTest VP9HighbdIntraPredTest; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9HighbdIntraPredTest); + +TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_SUITE_P( + SSE2_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_SUITE_P( + NEON_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_lossless_test.cc b/media/libvpx/libvpx/test/vp9_lossless_test.cc new file mode 100644 index 0000000000..fe3cd1aba4 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_lossless_test.cc @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { + +const int kMaxPsnr = 100; + +class LosslessTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + LosslessTest() + : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), + encoding_mode_(GET_PARAM(1)) {} + + ~LosslessTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + // Only call Control if quantizer > 0 to verify that using quantizer + // alone will activate lossless + if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) { + encoder->Control(VP9E_SET_LOSSLESS, 1); + } + } + } + + void BeginPassHook(unsigned int /*pass*/) override { + psnr_ = kMaxPsnr; + nframes_ = 0; + } + + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; + } + + double GetMinPsnr() const { return psnr_; } + + private: + double psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; +}; + +TEST_P(LosslessTest, TestLossLessEncoding) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + // intentionally changed the dimension for better testing coverage + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 10); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} + +TEST_P(LosslessTest, TestLossLessEncoding444) { + libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} + +TEST_P(LosslessTest, TestLossLessEncodingCtrl) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + // Intentionally set Q > 0, to make sure control can be used to activate + // lossless + cfg_.rc_min_quantizer = 10; + cfg_.rc_max_quantizer = 20; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 10); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} + +VP9_INSTANTIATE_TEST_SUITE(LosslessTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_motion_vector_test.cc b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc new file mode 100644 index 0000000000..495ea11fce --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" + +namespace { +#define MAX_EXTREME_MV 1 +#define MIN_EXTREME_MV 2 + +// Encoding modes +const libvpx_test::TestMode kEncodingModeVectors[] = { + ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime +}; + +// Encoding speeds +const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 }; + +// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV. +const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV }; + +class MotionVectorTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith3Params { + protected: + MotionVectorTestLarge() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} + + ~MotionVectorTestLarge() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 3; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + } + + libvpx_test::TestMode encoding_mode_; + int cpu_used_; + int mv_test_mode_; +}; + +TEST_P(MotionVectorTestLarge, OverallTest) { + cfg_.rc_target_bitrate = 24000; + cfg_.g_profile = 0; + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr video; + video.reset(new libvpx_test::YUVVideoSource( + "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160, // 2048, 1080, + 30, 1, 0, 5)); + + ASSERT_NE(video.get(), nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +VP9_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kCpuUsedVectors), + ::testing::ValuesIn(kMVTestModes)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_quantize_test.cc b/media/libvpx/libvpx/test/vp9_quantize_test.cc new file mode 100644 index 0000000000..e00ab4022c --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_quantize_test.cc @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" +#include "vpx_ports/vpx_timer.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; + +namespace { +const int number_of_iterations = 100; + +typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); +typedef std::tuple + QuantizeParam; + +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + +// Wrapper for FP version which does not use zbin or quant_shift. +typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. + dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + +class VP9QuantizeBase : public AbstractBench { + public: + VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp), + coeff_(Buffer(max_size_, max_size_, 0, 16)), + qcoeff_(Buffer(max_size_, max_size_, 0, 32)), + dqcoeff_(Buffer(max_size_, max_size_, 0, 32)) { + // TODO(jianj): SSSE3 and AVX2 tests fail on extreme values. +#if HAVE_NEON + max_value_ = (1 << (7 + bit_depth_)) - 1; +#else + max_value_ = (1 << bit_depth_) - 1; +#endif + + mb_plane_ = reinterpret_cast( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); + round_ptr_ = mb_plane_->round = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); + quant_ptr_ = mb_plane_->quant = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); + dequant_ptr_ = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); + + r_ptr_ = (is_fp_) ? round_fp_ptr_ : round_ptr_; + q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + } + + ~VP9QuantizeBase() override { + vpx_free(mb_plane_); + vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); + vpx_free(quant_fp_ptr_); + vpx_free(round_ptr_); + vpx_free(quant_ptr_); + vpx_free(quant_shift_ptr_); + vpx_free(dequant_ptr_); + mb_plane_ = nullptr; + zbin_ptr_ = nullptr; + round_fp_ptr_ = nullptr; + quant_fp_ptr_ = nullptr; + round_ptr_ = nullptr; + quant_ptr_ = nullptr; + quant_shift_ptr_ = nullptr; + dequant_ptr_ = nullptr; + libvpx_test::ClearSystemState(); + } + + protected: + macroblock_plane *mb_plane_; + int16_t *zbin_ptr_; + int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; + int16_t *round_ptr_; + int16_t *quant_ptr_; + int16_t *quant_shift_ptr_; + int16_t *dequant_ptr_; + const vpx_bit_depth_t bit_depth_; + int max_value_; + const int max_size_; + const bool is_fp_; + Buffer coeff_; + Buffer qcoeff_; + Buffer dqcoeff_; + int16_t *r_ptr_; + int16_t *q_ptr_; + int count_; + const ScanOrder *scan_; + uint16_t eob_; +}; + +class VP9QuantizeTest : public VP9QuantizeBase, + public ::testing::TestWithParam { + public: + VP9QuantizeTest() + : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)), + quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} + + protected: + void Run() override; + void Speed(bool is_median); + const QuantizeFunc quantize_op_; + const QuantizeFunc ref_quantize_op_; +}; + +void VP9QuantizeTest::Run() { + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); +} + +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + + if (is_median) { + RunNTimes(10000000 / count_); + PrintMedian(title); + } else { + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast(vpx_usec_timer_elapsed(&simd_timer)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + +// This quantizer compares the AC coefficients to the quantization step size to +// determine if further multiplication operations are needed. +// Based on vp9_quantize_fp_sse2(). +inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order, + int is_32x32) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> (1 + is_32x32); + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. + if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } + } + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp; + int _round; + + if (is_32x32) { + _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + } else { + _round = round_ptr[rc != 0]; + } + tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32); + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = + static_cast(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + + if (is_32x32) { + dqcoeff_ptr[rc] = static_cast(qcoeff_ptr[rc] * + dequant_ptr[rc != 0] / 2); + } else { + dqcoeff_ptr[rc] = + static_cast(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + } + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); +} + +void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); +} + +TEST_P(VP9QuantizeTest, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + eob_ = 0; + + for (int i = 0; i < number_of_iterations; ++i) { + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); + coeff_.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); + + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob_, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); + return; + } + } +} + +TEST_P(VP9QuantizeTest, EOBCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + eob_ = 0; + const uint32_t max_index = max_size_ * max_size_ - 1; + + for (int i = 0; i < number_of_iterations; ++i) { + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); + // Two random entries + coeff_.Set(0); + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); + + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob_, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); + return; + } + } +} + +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } + +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } + +using std::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9QuantizeTest, + ::testing::Values( + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); + +#else +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_SSSE3 + +#if HAVE_AVX +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // HAVE_AVX + +#if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, + 32, true), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); +#else +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_vsx, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_SUITE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..f7be47542c --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -0,0 +1,672 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/ratectrl_rtc.h" + +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +const size_t kNumFrames = 300; + +const int kTemporalId3Layer[4] = { 0, 2, 1, 2 }; +const int kTemporalId2Layer[2] = { 0, 1 }; +const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; +const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; +const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; + +class RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} + + ~RcInterfaceTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); + } + + void RunOneLayer() { + SetConfig(GET_PARAM(2)); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunOneLayerVBRPeriodicKey() { + if (GET_PARAM(2) != VPX_VBR) return; + key_interval_ = 100; + SetConfig(VPX_VBR); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + void SetConfig(vpx_rc_mode rc_mode) { + rc_cfg_.width = 1280; + rc_cfg_.height = 720; + rc_cfg_.max_quantizer = 52; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = 1000; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = 1000; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.ss_number_layers = 1; + rc_cfg_.ts_number_layers = 1; + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + rc_cfg_.layer_target_bitrate[0] = 1000; + rc_cfg_.max_quantizers[0] = 52; + rc_cfg_.min_quantizers[0] = 2; + rc_cfg_.rc_mode = rc_mode; + rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + + // Encoder settings for ground truth. + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = rc_mode; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.rc_target_bitrate = 1000; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + } + + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + int aq_mode_; + int key_interval_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; +}; + +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} + ~RcInterfaceSvcTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + current_superframe_ = 0; + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + encoder_exit_ = video->frame() == kNumFrames; + if (dynamic_spatial_layers_ == 1) { + if (video->frame() == 100) { + // Go down to 2 spatial layers: set top SL to 0 bitrate. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8]; + rc_cfg_.layer_target_bitrate[6] = 0; + rc_cfg_.layer_target_bitrate[7] = 0; + rc_cfg_.layer_target_bitrate[8] = 0; + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (video->frame() == 200) { + // Go down to 1 spatial layer. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5]; + cfg_.layer_target_bitrate[3] = 0; + cfg_.layer_target_bitrate[4] = 0; + cfg_.layer_target_bitrate[5] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5]; + rc_cfg_.layer_target_bitrate[3] = 0; + rc_cfg_.layer_target_bitrate[4] = 0; + rc_cfg_.layer_target_bitrate[5] = 0; + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { + // TODO(marpan): Re-enable this going back up when issue is fixed. + // Go back up to 3 spatial layers. + // Update the encoder config: use the original bitrates. + SetEncoderConfigSvc(3, 3); + encoder->Config(&cfg_); + // Update the RC config. + SetRCConfigSvc(3, 3); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } + } + } + + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); + ParseSuperframeSizes(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + // For sl=0 ComputeQP() is already called above (line 310). + if (sizes_[sl] > 0 && sl > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } + } + if (!superframe_is_dropped) { + int loopfilter_level; + std::vector encoder_qp(VPX_SS_MAX_LAYERS, 0); + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; + } + } + // This method needs to be overridden because non-reference frames are + // expected to be mismatched frames as the encoder will avoid loopfilter on + // these frames. + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} + + void RunSvc() { + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunSvcPeriodicKey() { + SetRCConfigSvc(3, 3); + key_interval_ = 100; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDynamicSpatial() { + dynamic_spatial_layers_ = 1; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) { + uint8_t marker = *(data + data_sz - 1); + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (uint32_t i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes_[i] = this_sz; + } + } + return VPX_CODEC_OK; + } + + void SetEncoderConfigSvc(int number_spatial_layers, + int number_temporal_layers) { + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.ss_number_layers = number_spatial_layers; + cfg_.ts_number_layers = number_temporal_layers; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 30; + if (number_spatial_layers == 3) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 4; + svc_params_.scaling_factor_num[2] = 4; + svc_params_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + } + + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 56; + svc_params_.min_quantizers[i] = 2; + svc_params_.speed_per_layer[i] = 7; + svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL; + } + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + if (number_temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (number_temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (number_temporal_layers == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + + cfg_.rc_target_bitrate = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + cfg_.rc_target_bitrate += spatial_bitrate; + } + + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + } + + void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) { + rc_cfg_.width = 1280; + rc_cfg_.height = 720; + rc_cfg_.ss_number_layers = number_spatial_layers; + rc_cfg_.ts_number_layers = number_temporal_layers; + rc_cfg_.max_quantizer = 56; + rc_cfg_.min_quantizer = 2; + rc_cfg_.buf_initial_sz = 500; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = 1000; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 900; + rc_cfg_.framerate = 30.0; + rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; + + if (number_spatial_layers == 3) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 4; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 4; + rc_cfg_.scaling_factor_num[2] = 4; + rc_cfg_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 2; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + } + + if (number_temporal_layers == 3) { + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } else if (number_temporal_layers == 2) { + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (number_temporal_layers == 1) { + rc_cfg_.ts_rate_decimator[0] = 1; + } + + rc_cfg_.target_bandwidth = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + rc_cfg_.target_bandwidth += spatial_bitrate; + } + + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) { + for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) { + const int i = sl * rc_cfg_.ts_number_layers + tl; + rc_cfg_.max_quantizers[i] = 56; + rc_cfg_.min_quantizers[i] = 2; + } + } + } + + int aq_mode_; + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + vpx_svc_extra_cfg_t svc_params_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int current_superframe_; + uint32_t sizes_[8]; + int key_interval_; + int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. + bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; +}; + +TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + +TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } + +TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } + +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + +TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } + +TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } + +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), + ::testing::Values(VPX_CBR, VPX_VBR)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_roi_test.cc b/media/libvpx/libvpx/test/vp9_roi_test.cc new file mode 100644 index 0000000000..a9347fb365 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_roi_test.cc @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" + +#define MASK_WIDTH 40 +#define MASK_HEIGHT 30 +#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT + +namespace { + +const int mask[MASK_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + SetRoi(); + } + + void SetRoi() { + const int block_size = 8; + unsigned int i, j; + roi_.rows = (cfg_.g_h + block_size - 1) / block_size; + roi_.cols = (cfg_.g_w + block_size - 1) / block_size; + memset(&roi_.skip, 0, sizeof(roi_.skip)); + memset(&roi_.delta_q, 0, sizeof(roi_.delta_q)); + memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf)); + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + // Use segment 3 for skip. + roi_.skip[3] = 1; + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (i = 0; i < roi_.rows; ++i) { + for (j = 0; j < roi_.cols; ++j) { + const int idx = i * roi_.cols + j; + if (mask[idx] == 1) roi_.roi_map[idx] = 3; + } + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, 3); + } + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + + private: + vpx_roi_map_t roi_; +}; + +TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30, + 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc new file mode 100644 index 0000000000..049a10a617 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam { + public: + ~ScaleTest() override = default; + + protected: + void SetUp() override { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest(INTERP_FILTER filter_type) { + static const int kNumSizesToTest = 20; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 + }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). + if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_SUITE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc new file mode 100644 index 0000000000..c080a2caae --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/webm_video_source.h" + +namespace { + +const char kVp9TestFile[] = "vp90-2-08-tile_1x8_frame_parallel.webm"; +const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5"; + +// Class for testing shutting off the loop filter. +class SkipLoopFilterTest { + public: + SkipLoopFilterTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} + + ~SkipLoopFilterTest() { + if (md5_file_ != nullptr) fclose(md5_file_); + delete decoder_; + delete video_; + } + + // If |threads| > 0 then set the decoder with that number of threads. + bool Init(int num_threads) { + expected_md5_[0] = '\0'; + junk_[0] = '\0'; + video_ = new libvpx_test::WebMVideoSource(kVp9TestFile); + if (video_ == nullptr) { + EXPECT_NE(video_, nullptr); + return false; + } + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + if (num_threads > 0) cfg.threads = num_threads; + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + if (decoder_ == nullptr) { + EXPECT_NE(decoder_, nullptr); + return false; + } + + OpenMd5File(kVp9Md5File); + return !::testing::Test::HasFailure(); + } + + // Set the VP9 skipLoopFilter control value. + void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) { + ASSERT_NE(decoder_, nullptr); + decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value); + } + + vpx_codec_err_t DecodeOneFrame() { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + if (res == VPX_CODEC_OK) { + ReadMd5(); + video_->Next(); + } + return res; + } + + vpx_codec_err_t DecodeRemainingFrames() { + for (; video_->cxdata() != nullptr; video_->Next()) { + const vpx_codec_err_t res = + decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); + if (res != VPX_CODEC_OK) return res; + ReadMd5(); + } + return VPX_CODEC_OK; + } + + // Checks if MD5 matches or doesn't. + void CheckMd5(bool matches) { + libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); + const vpx_image_t *img = dec_iter.Next(); + CheckMd5Vpx(*img, matches); + } + + private: + // TODO(fgalligan): Move the MD5 testing code into another class. + void OpenMd5File(const std::string &md5_file_name) { + md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); + ASSERT_NE(md5_file_, nullptr) + << "MD5 file open failed. Filename: " << md5_file_name; + } + + // Reads the next line of the MD5 file. + void ReadMd5() { + ASSERT_NE(md5_file_, nullptr); + const int res = fscanf(md5_file_, "%s %s", expected_md5_, junk_); + ASSERT_NE(EOF, res) << "Read md5 data failed"; + expected_md5_[32] = '\0'; + } + + // Checks if the last read MD5 matches |img| or doesn't. + void CheckMd5Vpx(const vpx_image_t &img, bool matches) { + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + const char *const actual_md5 = md5_res.Get(); + + // Check MD5. + if (matches) + ASSERT_STREQ(expected_md5_, actual_md5) << "MD5 checksums don't match"; + else + ASSERT_STRNE(expected_md5_, actual_md5) << "MD5 checksums match"; + } + + libvpx_test::WebMVideoSource *video_; + libvpx_test::VP9Decoder *decoder_; + FILE *md5_file_; + char expected_md5_[33]; + char junk_[128]; +}; + +TEST(SkipLoopFilterTest, ShutOffLoopFilter) { + const int non_zero_value = 1; + const int num_threads = 0; + SkipLoopFilterTest skip_loop_filter; + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); + skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); + skip_loop_filter.CheckMd5(false); +} + +TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) { + const int non_zero_value = 1; + const int num_threads = 1; + SkipLoopFilterTest skip_loop_filter; + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); + skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); + skip_loop_filter.CheckMd5(false); +} + +TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) { + const int non_zero_value = 1; + const int num_threads = 8; + SkipLoopFilterTest skip_loop_filter; + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); + skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); + skip_loop_filter.CheckMd5(false); +} + +TEST(SkipLoopFilterTest, WithLoopFilter) { + const int non_zero_value = 1; + const int num_threads = 0; + SkipLoopFilterTest skip_loop_filter; + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); + skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); + skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); + skip_loop_filter.CheckMd5(true); +} + +TEST(SkipLoopFilterTest, ToggleLoopFilter) { + const int num_threads = 0; + SkipLoopFilterTest skip_loop_filter; + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); + + for (int i = 0; i < 10; ++i) { + skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK); + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeOneFrame()); + } + ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); + skip_loop_filter.CheckMd5(false); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_subtract_test.cc b/media/libvpx/libvpx/test/vp9_subtract_test.cc new file mode 100644 index 0000000000..78deb51909 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_subtract_test.cc @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/bench.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_ports/msvc.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" + +typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride); + +namespace vp9 { + +class VP9SubtractBlockTest : public AbstractBench, + public ::testing::TestWithParam { + public: + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override { + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); + } + + void SetupBlocks(BLOCK_SIZE bsize) { + block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize]; + block_height_ = 4 * num_4x4_blocks_high_lookup[bsize]; + diff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2)); + pred_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + src_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + } + + int block_width_; + int block_height_; + int16_t *diff_; + uint8_t *pred_; + uint8_t *src_; +}; + +using libvpx_test::ACMRandom; + +TEST_P(VP9SubtractBlockTest, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast(static_cast(bsize) + 1)) { + SetupBlocks(bsize); + + RunNTimes(100000000 / (block_height_ * block_width_)); + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", block_height_, + block_width_); + char title[100]; + snprintf(title, sizeof(title), "%8s ", block_size); + PrintMedian(title); + + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); + } +} + +TEST_P(VP9SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast(static_cast(bsize) + 1)) { + SetupBlocks(bsize); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_ * 2; ++c) { + src_[r * block_width_ * 2 + c] = rnd.Rand8(); + pred_[r * block_width_ * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (src_[r * block_width_ + c] - pred_[r * block_width_ + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); + } + } + + GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_, + block_width_ * 2, pred_, block_width_ * 2); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ * 2 + c], + (src_[r * block_width_ * 2 + c] - + pred_[r * block_width_ * 2 + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); + } + } + } + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); + } +} + +INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_sse2)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_avx2)); +#endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_neon)); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_msa)); +#endif + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_vsx)); +#endif + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_lsx)); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd); + +// +using Params = std::tuple; + +class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { + public: + void SetUp() override { + block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; + block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; + bit_depth_ = static_cast(GET_PARAM(1)); + func_ = GET_PARAM(2); + ref_func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + constexpr size_t kMaxWidth = 128; + constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(src_, nullptr); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(pred_, nullptr); + diff_ = reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(int16_t))); + ASSERT_NE(diff_, nullptr); + } + + void TearDown() override { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + vpx_free(CONVERT_TO_SHORTPTR(pred_)); + vpx_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + vpx_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + HBDSubtractFunc ref_func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void VPXHBDSubtractBlockTest::CheckResult() { + constexpr int kTestNum = 100; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + for (int i = 0; i < kTestNum; ++i) { + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void VPXHBDSubtractBlockTest::RunForSpeed() { + constexpr int kTestNum = 200000; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + + if (ref_func_ == func_) GTEST_SKIP(); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer ref_timer; + vpx_usec_timer_start(&ref_timer); + for (int i = 0; i < kTestNum; ++i) { + ref_func_(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&ref_timer); + const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kTestNum; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&timer); + const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer); + + printf( + "[%dx%d]: " + "ref_time=%6" PRId64 " \t simd_time=%6" PRId64 + " \t " + "gain=%f \n", + block_width_, block_height_, ref_elapsed_time, elapsed_time, + static_cast(ref_elapsed_time) / + static_cast(elapsed_time)); +} + +TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); } + +const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64 }; + +INSTANTIATE_TEST_SUITE_P( + C, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_c), + ::testing::Values(&vpx_highbd_subtract_block_c))); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_avx2), + ::testing::Values(&vpx_highbd_subtract_block_c))); +#endif // HAVE_AVX2 + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace vp9 diff --git a/media/libvpx/libvpx/test/vp9_thread_test.cc b/media/libvpx/libvpx/test/vp9_thread_test.cc new file mode 100644 index 0000000000..c0cea681d7 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_thread_test.cc @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/md5_helper.h" +#if CONFIG_WEBM_IO +#include "test/webm_video_source.h" +#endif +#include "vpx_util/vpx_thread.h" + +namespace { + +using std::string; + +class VPxWorkerThreadTest : public ::testing::TestWithParam { + protected: + ~VPxWorkerThreadTest() override = default; + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } + + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } + + void Run(VPxWorker *worker) { + const bool synchronous = GetParam(); + if (synchronous) { + vpx_get_worker_interface()->execute(worker); + } else { + vpx_get_worker_interface()->launch(worker); + } + } + + VPxWorker worker_; +}; + +int ThreadHook(void *data, void *return_value) { + int *const hook_data = reinterpret_cast(data); + *hook_data = 5; + return *reinterpret_cast(return_value); +} + +TEST_P(VPxWorkerThreadTest, HookSuccess) { + // should be a no-op. + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); + + for (int i = 0; i < 2; ++i) { + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); + + int hook_data = 0; + int return_value = 1; // return successfully from the hook + worker_.hook = ThreadHook; + worker_.data1 = &hook_data; + worker_.data2 = &return_value; + + Run(&worker_); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); + EXPECT_FALSE(worker_.had_error); + EXPECT_EQ(5, hook_data); + + // should be a no-op. + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); + } +} + +TEST_P(VPxWorkerThreadTest, HookFailure) { + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); + + int hook_data = 0; + int return_value = 0; // return failure from the hook + worker_.hook = ThreadHook; + worker_.data1 = &hook_data; + worker_.data2 = &return_value; + + Run(&worker_); + EXPECT_FALSE(vpx_get_worker_interface()->sync(&worker_)); + EXPECT_EQ(1, worker_.had_error); + + // Ensure _reset() clears the error and _launch() can be called again. + return_value = 1; + EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0); + EXPECT_FALSE(worker_.had_error); + vpx_get_worker_interface()->launch(&worker_); + EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0); + EXPECT_FALSE(worker_.had_error); +} + +TEST_P(VPxWorkerThreadTest, EndWithoutSync) { + // Create a large number of threads to increase the chances of detecting a + // race. Doing more work in the hook is no guarantee as any race would occur + // post hook execution in the main thread loop driver. + static const int kNumWorkers = 64; + VPxWorker workers[kNumWorkers]; + int hook_data[kNumWorkers]; + int return_value[kNumWorkers]; + + for (int n = 0; n < kNumWorkers; ++n) { + vpx_get_worker_interface()->init(&workers[n]); + return_value[n] = 1; // return successfully from the hook + workers[n].hook = ThreadHook; + workers[n].data1 = &hook_data[n]; + workers[n].data2 = &return_value[n]; + } + + for (int i = 0; i < 2; ++i) { + for (int n = 0; n < kNumWorkers; ++n) { + EXPECT_NE(vpx_get_worker_interface()->reset(&workers[n]), 0); + hook_data[n] = 0; + } + + for (int n = 0; n < kNumWorkers; ++n) { + Run(&workers[n]); + } + + for (int n = kNumWorkers - 1; n >= 0; --n) { + vpx_get_worker_interface()->end(&workers[n]); + } + } +} + +TEST(VPxWorkerThreadTest, TestInterfaceAPI) { + EXPECT_EQ(0, vpx_set_worker_interface(nullptr)); + EXPECT_NE(vpx_get_worker_interface(), nullptr); + for (int i = 0; i < 6; ++i) { + VPxWorkerInterface winterface = *vpx_get_worker_interface(); + switch (i) { + default: + case 0: winterface.init = nullptr; break; + case 1: winterface.reset = nullptr; break; + case 2: winterface.sync = nullptr; break; + case 3: winterface.launch = nullptr; break; + case 4: winterface.execute = nullptr; break; + case 5: winterface.end = nullptr; break; + } + EXPECT_EQ(0, vpx_set_worker_interface(&winterface)); + } +} + +// ----------------------------------------------------------------------------- +// Multi-threaded decode tests +#if CONFIG_WEBM_IO +// Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames. +string DecodeFile(const string &filename, int num_threads) { + libvpx_test::WebMVideoSource video(filename); + video.Init(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + cfg.threads = num_threads; + libvpx_test::VP9Decoder decoder(cfg, 0); + + libvpx_test::MD5 md5; + for (video.Begin(); video.cxdata(); video.Next()) { + const vpx_codec_err_t res = + decoder.DecodeFrame(video.cxdata(), video.frame_size()); + if (res != VPX_CODEC_OK) { + EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + break; + } + + libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); + const vpx_image_t *img = nullptr; + + // Get decompressed data + while ((img = dec_iter.Next())) { + md5.Add(img); + } + } + return string(md5.Get()); +} + +// Trivial serialized thread worker interface implementation. +// Note any worker that requires synchronization between other workers will +// hang. +namespace impl { +namespace { + +void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); } +int Reset(VPxWorker *const /*worker*/) { return 1; } +int Sync(VPxWorker *const worker) { return !worker->had_error; } + +void Execute(VPxWorker *const worker) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); +} + +void Launch(VPxWorker *const worker) { Execute(worker); } +void End(VPxWorker *const /*worker*/) {} + +} // namespace +} // namespace impl + +TEST(VPxWorkerThreadTest, TestSerialInterface) { + static const VPxWorkerInterface serial_interface = { + impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End + }; + static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc"; + static const char filename[] = "vp90-2-03-size-226x226.webm"; + VPxWorkerInterface default_interface = *vpx_get_worker_interface(); + + EXPECT_NE(vpx_set_worker_interface(&serial_interface), 0); + EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); + + // Reset the interface. + EXPECT_NE(vpx_set_worker_interface(&default_interface), 0); + EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); +} + +struct FileParam { + const char *name; + const char *expected_md5; + friend std::ostream &operator<<(std::ostream &os, const FileParam ¶m) { + return os << "file name: " << param.name + << " digest: " << param.expected_md5; + } +}; + +class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam { +}; + +TEST_P(VP9DecodeMultiThreadedTest, Decode) { + for (int t = 1; t <= 8; ++t) { + EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t)) + << "threads = " << t; + } +} + +const FileParam kNoTilesNonFrameParallelFiles[] = { + { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" } +}; + +const FileParam kFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2_frame_parallel.webm", + "68ede6abd66bae0a2edf2eb9232241b6" }, + { "vp90-2-08-tile_1x4_frame_parallel.webm", + "368ebc6ebf3a5e478d85b2c3149b2848" }, + { "vp90-2-08-tile_1x8_frame_parallel.webm", + "17e439da2388aff3a0f69cb22579c6c1" }, +}; + +const FileParam kFrameParallelResizeFiles[] = { + { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" }, + { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", + "5c78a96a42e7f4a4f6b2edcdb791e44c" }, + { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" }, + { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" }, + { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" }, + { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" }, + { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" }, + { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", + "eecf17290739bc708506fa4827665989" }, + { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" }, + { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" }, + { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" }, + { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" }, + { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" }, + { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" }, + { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" }, + { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" }, + { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" }, + { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" }, + { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" }, + { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" }, + { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" }, + { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" }, +}; + +const FileParam kNonFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, + { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, + { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, + { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, + { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, +}; + +INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNoTilesNonFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelResizeFiles)); +INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNonFrameParallelFiles)); +#endif // CONFIG_WEBM_IO + +INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); + +} // namespace diff --git a/media/libvpx/libvpx/test/vpx_scale_test.cc b/media/libvpx/libvpx/test/vpx_scale_test.cc new file mode 100644 index 0000000000..3897a6088d --- /dev/null +++ b/media/libvpx/libvpx/test/vpx_scale_test.cc @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { +namespace { + +#if VPX_ARCH_ARM || (VPX_ARCH_MIPS && !HAVE_MIPS64) || VPX_ARCH_X86 +// Avoid OOM failures on 32-bit platforms. +const int kNumSizesToTest = 7; +#else +const int kNumSizesToTest = 8; +#endif +const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 3840, 16383 }; + +typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); +typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, + YV12_BUFFER_CONFIG *dst_ybf); + +class ExtendBorderTest + : public VpxScaleBase, + public ::testing::TestWithParam { + public: + ~ExtendBorderTest() override = default; + + protected: + void SetUp() override { extend_fn_ = GetParam(); } + + void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } + + void RunTest() { + for (int h = 0; h < kNumSizesToTest; ++h) { + for (int w = 0; w < kNumSizesToTest; ++w) { + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); + ExtendBorder(); + CompareImages(img_); + DeallocImages(); + } + } + } + + ExtendFrameBorderFunc extend_fn_; +}; + +TEST_P(ExtendBorderTest, ExtendBorder) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, + ::testing::Values(vp8_yv12_extend_frame_borders_c)); + +class CopyFrameTest : public VpxScaleBase, + public ::testing::TestWithParam { + public: + ~CopyFrameTest() override = default; + + protected: + void SetUp() override { copy_frame_fn_ = GetParam(); } + + void CopyFrame() { + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); + } + + void RunTest() { + for (int h = 0; h < kNumSizesToTest; ++h) { + for (int w = 0; w < kNumSizesToTest; ++w) { + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); + CopyFrame(); + CompareImages(dst_img_); + DeallocImages(); + } + } + } + + CopyFrameFunc copy_frame_fn_; +}; + +TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +INSTANTIATE_TEST_SUITE_P(C, CopyFrameTest, + ::testing::Values(vp8_yv12_copy_frame_c)); + +} // namespace +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vpx_scale_test.h b/media/libvpx/libvpx/test/vpx_scale_test.h new file mode 100644 index 0000000000..11c259ae80 --- /dev/null +++ b/media/libvpx/libvpx/test/vpx_scale_test.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_VPX_SCALE_TEST_H_ +#define VPX_TEST_VPX_SCALE_TEST_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)) + << "for width: " << width << " height: " << height; + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. + uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_VPX_SCALE_TEST_H_ diff --git a/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh new file mode 100755 index 0000000000..69c734daf8 --- /dev/null +++ b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh @@ -0,0 +1,334 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests the libvpx vpx_temporal_svc_encoder example. To add new +## tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to vpx_tsvc_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +vpx_tsvc_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ "$(vpx_config_option_enabled CONFIG_TEMPORAL_DENOISING)" != "yes" ]; then + elog "Warning: Temporal denoising is disabled! Spatial denoising will be " \ + "used instead, which is probably not what you want for this test." + fi +} + +# Runs vpx_temporal_svc_encoder using the codec specified by $1 and output file +# name by $2. Additional positional parameters are passed directly to +# vpx_temporal_svc_encoder. +vpx_tsvc_encoder() { + local encoder="${LIBVPX_BIN_PATH}/vpx_temporal_svc_encoder" + encoder="${encoder}${VPX_TEST_EXE_SUFFIX}" + local codec="$1" + local output_file_base="$2" + local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}" + local timebase_num="1" + local timebase_den="1000" + local timebase_den_y4m="30" + local speed="6" + local frame_drop_thresh="30" + local max_threads="4" + local error_resilient="1" + + shift 2 + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + # TODO(tomfinegan): Verify file output for all thread runs. + for threads in $(seq $max_threads); do + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} || return 1 + # Test for y4m input. + eval "${VPX_TEST_PREFIX}" "${encoder}" "${Y4M_720P_INPUT}" \ + "${output_file}" "${codec}" "${Y4M_720P_INPUT_WIDTH}" \ + "${Y4M_720P_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den_y4m}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} || return 1 + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} || return 1 + fi + done +} + +# Confirms that all expected output files exist given the output file name +# passed to vpx_temporal_svc_encoder. +# The file name passed to vpx_temporal_svc_encoder is joined with the stream +# number and the extension .ivf to produce per stream output files. Here $1 is +# file name, and $2 is expected number of files. +files_exist() { + local file_name="${VPX_TEST_OUTPUT_DIR}/$1" + local num_files="$(($2 - 1))" + for stream_num in $(seq 0 ${num_files}); do + [ -e "${file_name}_${stream_num}.ivf" ] || return 1 + done +} + +# Run vpx_temporal_svc_encoder in all supported modes for vp8 and vp9. + +vpx_tsvc_encoder_vp8_mode_0() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_0" + vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 + # Mode 0 produces 1 stream + files_exist "${output_basename}" 1 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_1() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_1" + vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 + # Mode 1 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_2() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_2" + vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 + # Mode 2 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_3() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_3" + vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 + # Mode 3 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_4() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_4" + vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 + # Mode 4 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_5() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_5" + vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 + # Mode 5 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_6() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_6" + vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 + # Mode 6 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_7() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_7" + vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 + # Mode 7 produces 5 streams + files_exist "${output_basename}" 5 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_8" + vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 + # Mode 8 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_9() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_9" + vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 + # Mode 9 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_10() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_10" + vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 + # Mode 10 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp8_mode_11() { + if [ "$(vp8_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp8_mode_11" + vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 + # Mode 11 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_0() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_0" + vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 + # Mode 0 produces 1 stream + files_exist "${output_basename}" 1 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_1() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_1" + vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 + # Mode 1 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_2() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_2" + vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 + # Mode 2 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_3() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_3" + vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 + # Mode 3 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_4() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_4" + vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 + # Mode 4 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_5() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_5" + vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 + # Mode 5 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_6() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_6" + vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 + # Mode 6 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_7() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_7" + vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 + # Mode 7 produces 5 streams + files_exist "${output_basename}" 5 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_8() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_8" + vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 + # Mode 8 produces 2 streams + files_exist "${output_basename}" 2 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_9" + vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 + # Mode 9 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_10() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_10" + vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 + # Mode 10 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_vp9_mode_11() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local output_basename="vpx_tsvc_encoder_vp9_mode_11" + vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 + # Mode 11 produces 3 streams + files_exist "${output_basename}" 3 || return 1 + fi +} + +vpx_tsvc_encoder_tests="vpx_tsvc_encoder_vp8_mode_0 + vpx_tsvc_encoder_vp8_mode_1 + vpx_tsvc_encoder_vp8_mode_2 + vpx_tsvc_encoder_vp8_mode_3 + vpx_tsvc_encoder_vp8_mode_4 + vpx_tsvc_encoder_vp8_mode_5 + vpx_tsvc_encoder_vp8_mode_6 + vpx_tsvc_encoder_vp8_mode_7 + vpx_tsvc_encoder_vp8_mode_8 + vpx_tsvc_encoder_vp8_mode_9 + vpx_tsvc_encoder_vp8_mode_10 + vpx_tsvc_encoder_vp8_mode_11 + vpx_tsvc_encoder_vp9_mode_0 + vpx_tsvc_encoder_vp9_mode_1 + vpx_tsvc_encoder_vp9_mode_2 + vpx_tsvc_encoder_vp9_mode_3 + vpx_tsvc_encoder_vp9_mode_4 + vpx_tsvc_encoder_vp9_mode_5 + vpx_tsvc_encoder_vp9_mode_6 + vpx_tsvc_encoder_vp9_mode_7 + vpx_tsvc_encoder_vp9_mode_8 + vpx_tsvc_encoder_vp9_mode_9 + vpx_tsvc_encoder_vp9_mode_10 + vpx_tsvc_encoder_vp9_mode_11" + +run_tests vpx_tsvc_encoder_verify_environment "${vpx_tsvc_encoder_tests}" diff --git a/media/libvpx/libvpx/test/vpxdec.sh b/media/libvpx/libvpx/test/vpxdec.sh new file mode 100755 index 0000000000..199feae5f3 --- /dev/null +++ b/media/libvpx/libvpx/test/vpxdec.sh @@ -0,0 +1,135 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests vpxdec. To add new tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to vpxdec_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available. +vpxdec_verify_environment() { + if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \ + [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \ + [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] || \ + [ ! -e "${VP9_RAW_FILE}" ]; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(vpx_tool_path vpxdec)" ]; then + elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +# Wrapper function for running vpxdec with pipe input. Requires that +# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the +# input file path and shifted away. All remaining parameters are passed through +# to vpxdec. +vpxdec_pipe() { + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" + shift + cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull} +} + +# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to +# the directory containing vpxdec. $1 one is used as the input file path and +# shifted away. All remaining parameters are passed through to vpxdec. +vpxdec() { + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" + shift + eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} +} + +vpxdec_can_decode_vp8() { + if [ "$(vp8_decode_available)" = "yes" ]; then + echo yes + fi +} + +vpxdec_can_decode_vp9() { + if [ "$(vp9_decode_available)" = "yes" ]; then + echo yes + fi +} + +vpxdec_vp8_ivf() { + if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then + vpxdec "${VP8_IVF_FILE}" --summary --noblit + fi +} + +vpxdec_vp8_ivf_pipe_input() { + if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then + vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit + fi +} + +vpxdec_vp9_webm() { + if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + vpxdec "${VP9_WEBM_FILE}" --summary --noblit + fi +} + +vpxdec_vp9_webm_frame_parallel() { + if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + for threads in 2 3 4 5 6 7 8; do + vpxdec "${VP9_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \ + --frame-parallel || return 1 + done + fi +} + +vpxdec_vp9_webm_less_than_50_frames() { + # ensure that reaching eof in webm_guess_framerate doesn't result in invalid + # frames in actual webm_read_frame calls. + if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local decoder="$(vpx_tool_path vpxdec)" + local expected=10 + local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ + "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \ + | awk '/^[0-9]+ decoded frames/ { print $1 }') + if [ "$num_frames" -ne "$expected" ]; then + elog "Output frames ($num_frames) != expected ($expected)" + return 1 + fi + fi +} + +# Ensures VP9_RAW_FILE correctly produces 1 frame instead of causing a hang. +vpxdec_vp9_raw_file() { + # Ensure a raw file properly reports eof and doesn't cause a hang. + if [ "$(vpxdec_can_decode_vp9)" = "yes" ]; then + local decoder="$(vpx_tool_path vpxdec)" + local expected=1 + [ -x /usr/bin/timeout ] && local TIMEOUT="/usr/bin/timeout 30s" + local num_frames=$(${TIMEOUT} ${VPX_TEST_PREFIX} "${decoder}" \ + "${VP9_RAW_FILE}" --summary --noblit 2>&1 \ + | awk '/^[0-9]+ decoded frames/ { print $1 }') + if [ -z "$num_frames" ] || [ "$num_frames" -ne "$expected" ]; then + elog "Output frames ($num_frames) != expected ($expected)" + return 1 + fi + fi +} + +vpxdec_tests="vpxdec_vp8_ivf + vpxdec_vp8_ivf_pipe_input + vpxdec_vp9_webm + vpxdec_vp9_webm_frame_parallel + vpxdec_vp9_webm_less_than_50_frames + vpxdec_vp9_raw_file" + +run_tests vpxdec_verify_environment "${vpxdec_tests}" diff --git a/media/libvpx/libvpx/test/vpxenc.sh b/media/libvpx/libvpx/test/vpxenc.sh new file mode 100755 index 0000000000..172349a2b3 --- /dev/null +++ b/media/libvpx/libvpx/test/vpxenc.sh @@ -0,0 +1,489 @@ +#!/bin/sh +## +## Copyright (c) 2014 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file tests vpxenc using hantro_collage_w352h288.yuv as input. To add +## new tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to vpxenc_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +readonly TEST_FRAMES=10 + +# Environment check: Make sure input is available. +vpxenc_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then + elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in" + elog "LIBVPX_TEST_DATA_PATH." + return 1 + fi + fi + if [ -z "$(vpx_tool_path vpxenc)" ]; then + elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +vpxenc_can_encode_vp8() { + if [ "$(vp8_encode_available)" = "yes" ]; then + echo yes + fi +} + +vpxenc_can_encode_vp9() { + if [ "$(vp9_encode_available)" = "yes" ]; then + echo yes + fi +} + +# Echo vpxenc command line parameters allowing use of +# hantro_collage_w352h288.yuv as input. +yuv_input_hantro_collage() { + echo ""${YUV_RAW_INPUT}" + --width="${YUV_RAW_INPUT_WIDTH}" + --height="${YUV_RAW_INPUT_HEIGHT}"" +} + +y4m_input_non_square_par() { + echo ""${Y4M_NOSQ_PAR_INPUT}"" +} + +y4m_input_720p() { + echo ""${Y4M_720P_INPUT}"" +} + +# Echo default vpxenc real time encoding params. $1 is the codec, which defaults +# to vp8 if unspecified. +vpxenc_rt_params() { + local codec="${1:-vp8}" + echo "--codec=${codec} + --buf-initial-sz=500 + --buf-optimal-sz=600 + --buf-sz=1000 + --cpu-used=-6 + --end-usage=cbr + --error-resilient=1 + --kf-max-dist=90000 + --lag-in-frames=0 + --max-intra-rate=300 + --max-q=56 + --min-q=2 + --noise-sensitivity=0 + --overshoot-pct=50 + --passes=1 + --profile=0 + --resize-allowed=0 + --rt + --static-thresh=0 + --undershoot-pct=50" +} + +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. +vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + +# Wrapper function for running vpxenc with pipe input. Requires that +# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the +# input file path and shifted away. All remaining parameters are passed through +# to vpxenc. +vpxenc_pipe() { + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" + shift + cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \ + --test-decode=fatal \ + "$@" ${devnull} +} + +# Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to +# the directory containing vpxenc. $1 one is used as the input file path and +# shifted away. All remaining parameters are passed through to vpxenc. +vpxenc() { + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" + shift + eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \ + --test-decode=fatal \ + "$@" ${devnull} +} + +vpxenc_vp8_ivf() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp8_webm() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${TEST_FRAMES}" \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp8_webm_rt() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" + vpxenc $(yuv_input_hantro_collage) \ + $(vpxenc_rt_params vp8) \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp8_webm_2pass() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${TEST_FRAMES}" \ + --output="${output}" \ + --passes=2 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp8_webm_lag10_frames20() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${lag_total_frames}" \ + --lag-in-frames="${lag_frames}" \ + --output="${output}" \ + --auto-alt-ref=1 \ + --passes=2 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp8_ivf_piped_input() { + if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" + vpxenc_pipe $(yuv_input_hantro_collage) \ + --codec=vp8 \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_ivf() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local passes=$(vpxenc_passes_param) + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + "${passes}" \ + --ivf \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local passes=$(vpxenc_passes_param) + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + "${passes}" \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm_rt() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" + vpxenc $(yuv_input_hantro_collage) \ + $(vpxenc_rt_params vp9) \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm_rt_multithread_tiled() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + + for threads in ${num_threads}; do + for tile_cols in ${num_tile_cols}; do + vpxenc $(y4m_input_720p) \ + $(vpxenc_rt_params vp9) \ + --threads=${threads} \ + --tile-columns=${tile_cols} \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" + done + done + fi +} + +vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + + for threads in ${num_threads}; do + for tile_cols in ${num_tile_cols}; do + vpxenc $(y4m_input_720p) \ + $(vpxenc_rt_params vp9) \ + --threads=${threads} \ + --tile-columns=${tile_cols} \ + --frame-parallel=1 \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" + done + done + fi +} + +vpxenc_vp9_webm_2pass() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + --output="${output}" \ + --passes=2 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_ivf_lossless() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local passes=$(vpxenc_passes_param) + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + "${passes}" \ + --lossless=1 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_ivf_minq0_maxq0() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local passes=$(vpxenc_passes_param) + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + "${passes}" \ + --min-q=0 \ + --max-q=0 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm_lag10_frames20() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local passes=$(vpxenc_passes_param) + vpxenc $(yuv_input_hantro_collage) \ + --codec=vp9 \ + --limit="${lag_total_frames}" \ + --lag-in-frames="${lag_frames}" \ + --output="${output}" \ + "${passes}" \ + --auto-alt-ref=1 || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +# TODO(fgalligan): Test that DisplayWidth is different than video width. +vpxenc_vp9_webm_non_square_par() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local passes=$(vpxenc_passes_param) + vpxenc $(y4m_input_non_square_par) \ + --codec=vp9 \ + --limit="${TEST_FRAMES}" \ + "${passes}" \ + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_webm_sharpness() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local sharpnesses="0 1 2 3 4 5 6 7" + local output="${VPX_TEST_OUTPUT_DIR}/vpxenc_vp9_webm_sharpness.ivf" + local last_size=0 + local this_size=0 + + for sharpness in ${sharpnesses}; do + + vpxenc $(yuv_input_hantro_collage) \ + --sharpness="${sharpness}" \ + --codec=vp9 \ + --limit=1 \ + --cpu-used=2 \ + --end-usage=q \ + --cq-level=40 \ + --output="${output}" \ + "${passes}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + + this_size=$(stat -c '%s' "${output}") + if [ "${this_size}" -lt "${last_size}" ]; then + elog "Higher sharpness value yielded lower file size." + echo "${this_size}" " < " "${last_size}" + return 1 + fi + last_size="${this_size}" + + done + fi +} + +vpxenc_tests="vpxenc_vp8_ivf + vpxenc_vp8_webm + vpxenc_vp8_webm_rt + vpxenc_vp8_ivf_piped_input + vpxenc_vp9_ivf + vpxenc_vp9_webm + vpxenc_vp9_webm_rt + vpxenc_vp9_webm_rt_multithread_tiled + vpxenc_vp9_webm_rt_multithread_tiled_frameparallel + vpxenc_vp9_ivf_lossless + vpxenc_vp9_ivf_minq0_maxq0 + vpxenc_vp9_webm_lag10_frames20 + vpxenc_vp9_webm_non_square_par + vpxenc_vp9_webm_sharpness" + +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi + +run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/media/libvpx/libvpx/test/webm_video_source.h b/media/libvpx/libvpx/test/webm_video_source.h new file mode 100644 index 0000000000..6ab50c849f --- /dev/null +++ b/media/libvpx/libvpx/test/webm_video_source.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_ +#define VPX_TEST_WEBM_VIDEO_SOURCE_H_ +#include +#include +#include +#include +#include +#include "../tools_common.h" +#include "../webmdec.h" +#include "test/video_source.h" + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of WebM files, +// so that we can do actual file decodes. +class WebMVideoSource : public CompressedVideoSource { + public: + explicit WebMVideoSource(const std::string &file_name) + : file_name_(file_name), vpx_ctx_(new VpxInputContext()), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), + end_of_file_(false) {} + + ~WebMVideoSource() override { + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); + webm_free(webm_ctx_); + delete vpx_ctx_; + delete webm_ctx_; + } + + void Init() override {} + + void Begin() override { + vpx_ctx_->file = OpenTestDataFile(file_name_); + ASSERT_NE(vpx_ctx_->file, nullptr) + << "Input file open failed. Filename: " << file_name_; + + ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; + + FillFrame(); + } + + void Next() override { + ++frame_; + FillFrame(); + } + + void FillFrame() { + ASSERT_NE(vpx_ctx_->file, nullptr); + const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); + ASSERT_GE(status, 0) << "webm_read_frame failed"; + if (status == 1) { + end_of_file_ = true; + } + } + + void SeekToNextKeyFrame() { + ASSERT_NE(vpx_ctx_->file, nullptr); + do { + const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); + ASSERT_GE(status, 0) << "webm_read_frame failed"; + ++frame_; + if (status == 1) { + end_of_file_ = true; + } + } while (!webm_ctx_->is_key_frame && !end_of_file_); + } + + const uint8_t *cxdata() const override { + return end_of_file_ ? nullptr : buf_; + } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } + + protected: + std::string file_name_; + VpxInputContext *vpx_ctx_; + WebmInputContext *webm_ctx_; + uint8_t *buf_; + size_t buf_sz_; + unsigned int frame_; + bool end_of_file_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_WEBM_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/y4m_test.cc b/media/libvpx/libvpx/test/y4m_test.cc new file mode 100644 index 0000000000..78a944fd08 --- /dev/null +++ b/media/libvpx/libvpx/test/y4m_test.cc @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./y4menc.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { + +using std::string; + +static const unsigned int kWidth = 160; +static const unsigned int kHeight = 90; +static const unsigned int kFrames = 10; + +struct Y4mTestParam { + const char *filename; + unsigned int bit_depth; + vpx_img_fmt format; + const char *md5raw; +}; + +const Y4mTestParam kY4mTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, + "e5406275b9fc6bb3436c31d4a05c1cab" }, + { "park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, + "284a47a47133b12884ec3a14e959a0b6" }, + { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, + "90517ff33843d85de712fd4fe60dbed0" }, + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, + "2f56ab9809269f074df7e3daf1ce0be6" }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, + "1b5c73d2e8e8c4e02dc4889ecac41c83" }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, + "ec4ab5be53195c5b838d1d19e1bc2674" }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, + "3370856c8ddebbd1f9bb2e66f97677f4" }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, + "4eab364318dd8201acbb182e43bd4966" }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, + "f189dfbbd92119fc8e5f211a550166be" }, +}; + +static void write_image_file(const vpx_image_t *img, FILE *file) { + int plane, y; + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + const int h = + (plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift + : img->d_h); + const int w = + (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift + : img->d_w); + for (y = 0; y < h; ++y) { + fwrite(buf, bytes_per_sample, w, file); + buf += stride; + } + } +} + +class Y4mVideoSourceTest : public ::testing::TestWithParam, + public ::libvpx_test::Y4mVideoSource { + protected: + Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} + + ~Y4mVideoSourceTest() override { CloseSource(); } + + virtual void Init(const std::string &file_name, int limit) { + file_name_ = file_name; + start_ = 0; + limit_ = limit; + frame_ = 0; + Begin(); + } + + // Checks y4m header information + void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) { + ASSERT_NE(input_file_, nullptr); + ASSERT_EQ(y4m_.pic_w, (int)kWidth); + ASSERT_EQ(y4m_.pic_h, (int)kHeight); + ASSERT_EQ(img()->d_w, kWidth); + ASSERT_EQ(img()->d_h, kHeight); + ASSERT_EQ(y4m_.bit_depth, bit_depth); + ASSERT_EQ(y4m_.vpx_fmt, fmt); + if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) { + ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2); + ASSERT_EQ(img()->x_chroma_shift, 1U); + ASSERT_EQ(img()->y_chroma_shift, 1U); + } + if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) { + ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2); + ASSERT_EQ(img()->x_chroma_shift, 1U); + ASSERT_EQ(img()->y_chroma_shift, 0U); + } + if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) { + ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3); + ASSERT_EQ(img()->x_chroma_shift, 0U); + ASSERT_EQ(img()->y_chroma_shift, 0U); + } + } + + // Checks MD5 of the raw frame data + void Md5Check(const string &expected_md5) { + ASSERT_NE(input_file_, nullptr); + libvpx_test::MD5 md5; + for (unsigned int i = start_; i < limit_; i++) { + md5.Add(img()); + Next(); + } + ASSERT_EQ(string(md5.Get()), expected_md5); + } +}; + +TEST_P(Y4mVideoSourceTest, SourceTest) { + const Y4mTestParam t = GetParam(); + Init(t.filename, kFrames); + HeaderChecks(t.bit_depth, t.format); + Md5Check(t.md5raw); +} + +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest, + ::testing::ValuesIn(kY4mTestVectors)); + +class Y4mVideoWriteTest : public Y4mVideoSourceTest { + protected: + Y4mVideoWriteTest() : tmpfile_(nullptr) {} + + ~Y4mVideoWriteTest() override { + delete tmpfile_; + input_file_ = nullptr; + } + + void ReplaceInputFile(FILE *input_file) { + CloseSource(); + frame_ = 0; + input_file_ = input_file; + rewind(input_file_); + ReadSourceToStart(); + } + + // Writes out a y4m file and then reads it back + void WriteY4mAndReadBack() { + ASSERT_NE(input_file_, nullptr); + char buf[Y4M_BUFFER_SIZE] = { 0 }; + const struct VpxRational framerate = { y4m_.fps_n, y4m_.fps_d }; + tmpfile_ = new libvpx_test::TempOutFile; + ASSERT_NE(tmpfile_->file(), nullptr); + y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate, + y4m_.vpx_fmt, y4m_.bit_depth); + fputs(buf, tmpfile_->file()); + for (unsigned int i = start_; i < limit_; i++) { + y4m_write_frame_header(buf, sizeof(buf)); + fputs(buf, tmpfile_->file()); + write_image_file(img(), tmpfile_->file()); + Next(); + } + ReplaceInputFile(tmpfile_->file()); + } + + void Init(const std::string &file_name, int limit) override { + Y4mVideoSourceTest::Init(file_name, limit); + WriteY4mAndReadBack(); + } + libvpx_test::TempOutFile *tmpfile_; +}; + +TEST_P(Y4mVideoWriteTest, WriteTest) { + const Y4mTestParam t = GetParam(); + Init(t.filename, kFrames); + HeaderChecks(t.bit_depth, t.format); + Md5Check(t.md5raw); +} + +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest, + ::testing::ValuesIn(kY4mTestVectors)); + +static const char kY4MRegularHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, RegularHeader) { + libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); + fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(0, fseek(f.file(), 0, 0)); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + +// Testing that headers over 100 characters can be parsed. +static const char kY4MLongHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG " + "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, LongHeader) { + libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); + fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(fseek(f.file(), 0, 0), 0); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/y4m_video_source.h b/media/libvpx/libvpx/test/y4m_video_source.h new file mode 100644 index 0000000000..e43e37d9e4 --- /dev/null +++ b/media/libvpx/libvpx/test/y4m_video_source.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_Y4M_VIDEO_SOURCE_H_ +#define VPX_TEST_Y4M_VIDEO_SOURCE_H_ +#include +#include +#include + +#include "test/video_source.h" +#include "./y4minput.h" + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. +class Y4mVideoSource : public VideoSource { + public: + Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), + start_(start), limit_(limit), frame_(0), framerate_numerator_(0), + framerate_denominator_(0), y4m_() {} + + ~Y4mVideoSource() override { + vpx_img_free(img_.get()); + CloseSource(); + } + + virtual void OpenSource() { + CloseSource(); + input_file_ = OpenTestDataFile(file_name_); + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; + } + + virtual void ReadSourceToStart() { + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); + framerate_numerator_ = y4m_.fps_n; + framerate_denominator_ = y4m_.fps_d; + frame_ = 0; + for (unsigned int i = 0; i < start_; i++) { + Next(); + } + FillFrame(); + } + + void Begin() override { + OpenSource(); + ReadSourceToStart(); + } + + void Next() override { + ++frame_; + FillFrame(); + } + + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_.get() : nullptr; + } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + vpx_codec_pts_t pts() const override { return frame_; } + + unsigned long duration() const override { return 1; } + + vpx_rational_t timebase() const override { + const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; + return t; + } + + unsigned int frame() const override { return frame_; } + + unsigned int limit() const override { return limit_; } + + virtual void FillFrame() { + ASSERT_NE(input_file_, nullptr); + // Read a frame from input_file. + y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); + } + + // Swap buffers with another y4m source. This allows reading a new frame + // while keeping the old frame around. A whole Y4mSource is required and + // not just a vpx_image_t because of how the y4m reader manipulates + // vpx_image_t internals, + void SwapBuffers(Y4mVideoSource *other) { + std::swap(other->y4m_.dst_buf, y4m_.dst_buf); + vpx_image_t *tmp; + tmp = other->img_.release(); + other->img_.reset(img_.release()); + img_.reset(tmp); + } + + protected: + void CloseSource() { + y4m_input_close(&y4m_); + y4m_ = y4m_input(); + if (input_file_ != nullptr) { + fclose(input_file_); + input_file_ = nullptr; + } + } + + std::string file_name_; + FILE *input_file_; + std::unique_ptr img_; + unsigned int start_; + unsigned int limit_; + unsigned int frame_; + int framerate_numerator_; + int framerate_denominator_; + y4m_input y4m_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_Y4M_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc new file mode 100644 index 0000000000..0677d55688 --- /dev/null +++ b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +typedef void (*YUVTemporalFilterFunc)( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +struct TemporalFilterWithBd { + TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth) + : temporal_filter(func), bd(bitdepth) {} + + YUVTemporalFilterFunc temporal_filter; + int bd; +}; + +std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) { + return os << "Bitdepth: " << tf.bd; +} + +int GetFilterWeight(unsigned int row, unsigned int col, + unsigned int block_height, unsigned int block_width, + const int *const blk_fw, int use_32x32) { + if (use_32x32) { + return blk_fw[0]; + } + + return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)]; +} + +template +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod = sum_dist * 3 / index; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <> +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <> +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int64_t index_mult[14] = { 0U, 0U, 0U, 0U, + 3221225472U, 2576980378U, 2147483648U, 1840700270U, + 1610612736U, 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = static_cast((sum_dist * index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template +void ApplyReferenceFilter( + const Buffer &y_src, const Buffer &y_pre, + const Buffer &u_src, const Buffer &v_src, + const Buffer &u_pre, const Buffer &v_pre, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *const blk_fw, int use_32x32, + Buffer *y_accumulator, Buffer *y_counter, + Buffer *u_accumulator, Buffer *u_counter, + Buffer *v_accumulator, Buffer *v_counter) { + const PixelType *y_src_ptr = y_src.TopLeftPixel(); + const PixelType *y_pre_ptr = y_pre.TopLeftPixel(); + const PixelType *u_src_ptr = u_src.TopLeftPixel(); + const PixelType *u_pre_ptr = u_pre.TopLeftPixel(); + const PixelType *v_src_ptr = v_src.TopLeftPixel(); + const PixelType *v_pre_ptr = v_pre.TopLeftPixel(); + + const int uv_block_width = block_width >> ss_x, + uv_block_height = block_height >> ss_y; + const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride(); + const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride(); + const int y_diff_stride = block_width, uv_diff_stride = uv_block_width; + + Buffer y_dif = Buffer(block_width, block_height, 0); + Buffer u_dif = Buffer(uv_block_width, uv_block_height, 0); + Buffer v_dif = Buffer(uv_block_width, uv_block_height, 0); + + ASSERT_TRUE(y_dif.Init()); + ASSERT_TRUE(u_dif.Init()); + ASSERT_TRUE(v_dif.Init()); + y_dif.Set(0); + u_dif.Set(0); + v_dif.Set(0); + + int *y_diff_ptr = y_dif.TopLeftPixel(); + int *u_diff_ptr = u_dif.TopLeftPixel(); + int *v_diff_ptr = v_dif.TopLeftPixel(); + + uint32_t *y_accum = y_accumulator->TopLeftPixel(); + uint32_t *u_accum = u_accumulator->TopLeftPixel(); + uint32_t *v_accum = v_accumulator->TopLeftPixel(); + uint16_t *y_count = y_counter->TopLeftPixel(); + uint16_t *u_count = u_counter->TopLeftPixel(); + uint16_t *v_count = v_counter->TopLeftPixel(); + + const int y_accum_stride = y_accumulator->stride(); + const int u_accum_stride = u_accumulator->stride(); + const int v_accum_stride = v_accumulator->stride(); + const int y_count_stride = y_counter->stride(); + const int u_count_stride = u_counter->stride(); + const int v_count_stride = v_counter->stride(); + + const int rounding = (1 << strength) >> 1; + + // Get the square diffs + for (int row = 0; row < static_cast(block_height); row++) { + for (int col = 0; col < static_cast(block_width); col++) { + const int diff = y_src_ptr[row * y_src_stride + col] - + y_pre_ptr[row * y_pre_stride + col]; + y_diff_ptr[row * y_diff_stride + col] = diff * diff; + } + } + + for (int row = 0; row < uv_block_height; row++) { + for (int col = 0; col < uv_block_width; col++) { + const int u_diff = u_src_ptr[row * uv_src_stride + col] - + u_pre_ptr[row * uv_pre_stride + col]; + const int v_diff = v_src_ptr[row * uv_src_stride + col] - + v_pre_ptr[row * uv_pre_stride + col]; + u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (int row = 0; row < static_cast(block_height); row++) { + for (int col = 0; col < static_cast(block_width); col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = GetFilterWeight(row, col, block_height, + block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre_ptr[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < static_cast(block_height) && + sub_col >= 0 && sub_col < static_cast(block_width)) { + y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = GetModIndex(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; + } + } +} + +class YUVTemporalFilterTest + : public ::testing::TestWithParam { + public: + void SetUp() override { + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); + } + + protected: + template + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + YUVTemporalFilterFunc filter_func_; + ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; +}; + +template +void YUVTemporalFilterTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer y_src = Buffer(width, height, 0); + Buffer y_pre = Buffer(width, height, 0); + Buffer y_count_ref = Buffer(width, height, 0); + Buffer y_accum_ref = Buffer(width, height, 0); + Buffer y_count_tst = Buffer(width, height, 0); + Buffer y_accum_tst = Buffer(width, height, 0); + + Buffer u_src = Buffer(uv_width, uv_height, 0); + Buffer u_pre = Buffer(uv_width, uv_height, 0); + Buffer u_count_ref = Buffer(uv_width, uv_height, 0); + Buffer u_accum_ref = Buffer(uv_width, uv_height, 0); + Buffer u_count_tst = Buffer(uv_width, uv_height, 0); + Buffer u_accum_tst = Buffer(uv_width, uv_height, 0); + + Buffer v_src = Buffer(uv_width, uv_height, 0); + Buffer v_pre = Buffer(uv_width, uv_height, 0); + Buffer v_count_ref = Buffer(uv_width, uv_height, 0); + Buffer v_accum_ref = Buffer(uv_width, uv_height, 0); + Buffer v_count_tst = Buffer(uv_width, uv_height, 0); + Buffer v_accum_tst = Buffer(uv_width, uv_height, 0); + + ASSERT_TRUE(y_src.Init()); + ASSERT_TRUE(y_pre.Init()); + ASSERT_TRUE(y_count_ref.Init()); + ASSERT_TRUE(y_accum_ref.Init()); + ASSERT_TRUE(y_count_tst.Init()); + ASSERT_TRUE(y_accum_tst.Init()); + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count_ref.Init()); + ASSERT_TRUE(u_accum_ref.Init()); + ASSERT_TRUE(u_count_tst.Init()); + ASSERT_TRUE(u_accum_tst.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count_ref.Init()); + ASSERT_TRUE(v_accum_ref.Init()); + ASSERT_TRUE(v_count_tst.Init()); + ASSERT_TRUE(v_accum_tst.Init()); + + y_accum_ref.Set(0); + y_accum_tst.Set(0); + y_count_ref.Set(0); + y_count_tst.Set(0); + u_accum_ref.Set(0); + u_accum_tst.Set(0); + u_count_ref.Set(0); + u_count_tst.Set(0); + v_accum_ref.Set(0); + v_accum_tst.Set(0); + v_count_ref.Set(0); + v_count_tst.Set(0); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + y_src.Set(max_val); + y_pre.Set(0); + u_src.Set(max_val); + u_pre.Set(0); + v_src.Set(max_val); + v_pre.Set(0); + } else { + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + } + + ApplyReferenceFilter( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref, + &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref); + + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast(u_src.TopLeftPixel()), + reinterpret_cast(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast(u_pre.TopLeftPixel()), + reinterpret_cast(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(), + u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(), + v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel())); + + EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); + EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); + EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); + EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); + EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); + EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + y_accum_tst.PrintDifference(y_accum_ref); + y_count_tst.PrintDifference(y_count_ref); + u_accum_tst.PrintDifference(u_accum_ref); + u_count_tst.PrintDifference(u_count_ref); + v_accum_tst.PrintDifference(v_accum_ref); + v_count_tst.PrintDifference(v_count_ref); + + return; + } + } +} + +template +void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer y_src = Buffer(width, height, 0); + Buffer y_pre = Buffer(width, height, 0); + Buffer y_count = Buffer(width, height, 0); + Buffer y_accum = Buffer(width, height, 0); + + Buffer u_src = Buffer(uv_width, uv_height, 0); + Buffer u_pre = Buffer(uv_width, uv_height, 0); + Buffer u_count = Buffer(uv_width, uv_height, 0); + Buffer u_accum = Buffer(uv_width, uv_height, 0); + + Buffer v_src = Buffer(uv_width, uv_height, 0); + Buffer v_pre = Buffer(uv_width, uv_height, 0); + Buffer v_count = Buffer(uv_width, uv_height, 0); + Buffer v_accum = Buffer(uv_width, uv_height, 0); + + ASSERT_TRUE(y_src.Init()); + ASSERT_TRUE(y_pre.Init()); + ASSERT_TRUE(y_count.Init()); + ASSERT_TRUE(y_accum.Init()); + + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count.Init()); + ASSERT_TRUE(u_accum.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count.Init()); + ASSERT_TRUE(v_accum.Init()); + + y_accum.Set(0); + y_count.Set(0); + + u_accum.Set(0); + u_count.Set(0); + + v_accum.Set(0); + v_count.Set(0); + + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast(u_src.TopLeftPixel()), + reinterpret_cast(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast(u_pre.TopLeftPixel()), + reinterpret_cast(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(), + u_count.TopLeftPixel(), v_accum.TopLeftPixel(), + v_count.TopLeftPixel())); + } +} + +TEST_P(YUVTemporalFilterTest, Use32x32) { + const int width = 32, height = 32; + const int use_32x32 = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, Use16x16) { + const int width = 32, height = 32; + const int use_32x32 = 0; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, SaturationTest) { + const int width = 32, height = 32; + const int use_32x32 = 1; + const int filter_weight = 1; + saturate_test_ = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } +} + +TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { + const int width = 32, height = 32; + num_repeats_ = 1000; + + for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { + const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < num_filter_weights; + filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + + if (use_highbd_) { + RunTestFilterWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + + printf( + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, + elapsed_time); + } + } + } + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define WRAP_HIGHBD_FUNC(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, \ + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, \ + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, \ + int uv_pre_stride, unsigned int block_width, unsigned int block_height, \ + int ss_x, int ss_y, int strength, const int *const blk_fw, \ + int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, \ + uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, \ + uint16_t *v_count) { \ + func(reinterpret_cast(y_src), y_src_stride, \ + reinterpret_cast(y_pre), y_pre_stride, \ + reinterpret_cast(u_src), \ + reinterpret_cast(v_src), uv_src_stride, \ + reinterpret_cast(u_pre), \ + reinterpret_cast(v_pre), uv_pre_stride, \ + block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32, \ + y_accumulator, y_count, u_accumulator, u_count, v_accumulator, \ + v_count); \ + } + +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12) + +INSTANTIATE_TEST_SUITE_P( + C, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12))); +#if HAVE_SSE4_1 +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12) + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, + 12))); +#endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON +#else +INSTANTIATE_TEST_SUITE_P( + C, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_sse4_1, 8))); +#endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON +#endif // CONFIG_VP9_HIGHBITDEPTH + +} // namespace diff --git a/media/libvpx/libvpx/test/yuv_video_source.h b/media/libvpx/libvpx/test/yuv_video_source.h new file mode 100644 index 0000000000..bb5eec5bb8 --- /dev/null +++ b/media/libvpx/libvpx/test/yuv_video_source.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TEST_YUV_VIDEO_SOURCE_H_ +#define VPX_TEST_YUV_VIDEO_SOURCE_H_ + +#include +#include +#include + +#include "test/video_source.h" +#include "vpx/vpx_image.h" + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of raw YUV +// formats of various color sampling and bit-depths so that we can +// do actual file encodes. +class YUVVideoSource : public VideoSource { + public: + YUVVideoSource(const std::string &file_name, vpx_img_fmt format, + unsigned int width, unsigned int height, int rate_numerator, + int rate_denominator, unsigned int start, int limit) + : file_name_(file_name), input_file_(nullptr), img_(nullptr), + start_(start), limit_(limit), frame_(0), width_(0), height_(0), + format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator), + framerate_denominator_(rate_denominator) { + // This initializes format_, raw_size_, width_, height_ and allocates img. + SetSize(width, height, format); + } + + ~YUVVideoSource() override { + vpx_img_free(img_); + if (input_file_) fclose(input_file_); + } + + void Begin() override { + if (input_file_) fclose(input_file_); + input_file_ = OpenTestDataFile(file_name_); + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; + if (start_) { + fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); + } + + frame_ = start_; + FillFrame(); + } + + void Next() override { + ++frame_; + FillFrame(); + } + + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_ : nullptr; + } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + vpx_codec_pts_t pts() const override { return frame_; } + + unsigned long duration() const override { return 1; } + + vpx_rational_t timebase() const override { + const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; + return t; + } + + unsigned int frame() const override { return frame_; } + + unsigned int limit() const override { return limit_; } + + virtual void SetSize(unsigned int width, unsigned int height, + vpx_img_fmt format) { + if (width != width_ || height != height_ || format != format_) { + vpx_img_free(img_); + img_ = vpx_img_alloc(nullptr, format, width, height, 1); + ASSERT_NE(img_, nullptr); + width_ = width; + height_ = height; + format_ = format; + switch (format) { + case VPX_IMG_FMT_NV12: + case VPX_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break; + case VPX_IMG_FMT_I422: raw_size_ = width * height * 2; break; + case VPX_IMG_FMT_I440: raw_size_ = width * height * 2; break; + case VPX_IMG_FMT_I444: raw_size_ = width * height * 3; break; + case VPX_IMG_FMT_I42016: raw_size_ = width * height * 3; break; + case VPX_IMG_FMT_I42216: raw_size_ = width * height * 4; break; + case VPX_IMG_FMT_I44016: raw_size_ = width * height * 4; break; + case VPX_IMG_FMT_I44416: raw_size_ = width * height * 6; break; + default: ASSERT_TRUE(0); + } + } + } + + virtual void FillFrame() { + ASSERT_NE(input_file_, nullptr); + // Read a frame from input_file. + if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) { + limit_ = frame_; + } + } + + protected: + std::string file_name_; + FILE *input_file_; + vpx_image_t *img_; + size_t raw_size_; + unsigned int start_; + unsigned int limit_; + unsigned int frame_; + unsigned int width_; + unsigned int height_; + vpx_img_fmt format_; + int framerate_numerator_; + int framerate_denominator_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_YUV_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/README.libvpx b/media/libvpx/libvpx/third_party/googletest/README.libvpx new file mode 100644 index 0000000000..5f6b01b0ec --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/README.libvpx @@ -0,0 +1,29 @@ +URL: https://github.com/google/googletest.git +Version: release-1.12.1 +License: BSD +License File: LICENSE + +Description: +Google's framework for writing C++ tests on a variety of platforms +(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the +xUnit architecture. Supports automatic test discovery, a rich set of +assertions, user-defined assertions, death tests, fatal and non-fatal +failures, various options for running the tests, and XML test report +generation. + +Local Modifications: +- Remove everything but: + .clang-format + CONTRIBUTORS + googletest/ + include + README.md + src + LICENSE +- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/ +- In googletest/include/gtest/internal/custom/gtest-port.h, define + GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix + the mingw32 g++ compilation errors caused by the lack of std::mutex + and std::condition_variable in the and + headers if mingw32 is configured with the win32 threads option. See + https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32 diff --git a/media/libvpx/libvpx/third_party/googletest/gtest.mk b/media/libvpx/libvpx/third_party/googletest/gtest.mk new file mode 100644 index 0000000000..0de3113c7a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/gtest.mk @@ -0,0 +1 @@ +GTEST_SRCS-yes += src/gtest-all.cc diff --git a/media/libvpx/libvpx/third_party/googletest/src/.clang-format b/media/libvpx/libvpx/third_party/googletest/src/.clang-format new file mode 100644 index 0000000000..5b9bfe6d22 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +Language: Cpp +BasedOnStyle: Google diff --git a/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS new file mode 100644 index 0000000000..77397a5b53 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS @@ -0,0 +1,65 @@ +# This file contains a list of people who've made non-trivial +# contribution to the Google C++ Testing Framework project. People +# who commit code to the project are encouraged to add their names +# here. Please keep the list sorted by first names. + +Ajay Joshi +Balázs Dán +Benoit Sigoure +Bharat Mediratta +Bogdan Piloca +Chandler Carruth +Chris Prince +Chris Taylor +Dan Egnor +Dave MacLachlan +David Anderson +Dean Sturtevant +Eric Roman +Gene Volovich +Hady Zalek +Hal Burch +Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray +Jói Sigurðsson +Keir Mierle +Keith Ray +Kenton Varda +Kostya Serebryany +Krystian Kuzniarek +Lev Makhlis +Manuel Klimek +Mario Tanev +Mark Paskin +Markus Heule +Martijn Vels +Matthew Simmons +Mika Raento +Mike Bland +Miklós Fazekas +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba +Pasi Valminen +Patrick Hanna +Patrick Riley +Paul Menage +Peter Kaminski +Piotr Kaminski +Preston Jackson +Rainer Klaffenboeck +Russ Cox +Russ Rufer +Sean Mcafee +Sigurður Ásgeirsson +Sverre Sundsdal +Szymon Sobik +Takeshi Yoshino +Tracy Bialik +Vadim Berman +Vlad Losev +Wolfgang Klier +Zhanyong Wan diff --git a/media/libvpx/libvpx/third_party/googletest/src/LICENSE b/media/libvpx/libvpx/third_party/googletest/src/LICENSE new file mode 100644 index 0000000000..1941a11f8c --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/media/libvpx/libvpx/third_party/googletest/src/README.md b/media/libvpx/libvpx/third_party/googletest/src/README.md new file mode 100644 index 0000000000..d26b309ed0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/README.md @@ -0,0 +1,217 @@ +### Generic Build Instructions + +#### Setup + +To build GoogleTest and your tests that use it, you need to tell your build +system where to find its headers and source files. The exact way to do it +depends on which build system you use, and is usually straightforward. + +### Build with CMake + +GoogleTest comes with a CMake build script +([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +that can be used on a wide range of platforms ("C" stands for cross-platform.). +If you don't have CMake installed already, you can download it for free from +. + +CMake works by generating native makefiles or build projects that can be used in +the compiler environment of your choice. You can either build GoogleTest as a +standalone project or it can be incorporated into an existing CMake build for +another project. + +#### Standalone CMake Project + +When building GoogleTest as a standalone project, the typical workflow starts +with + +``` +git clone https://github.com/google/googletest.git -b release-1.11.0 +cd googletest # Main directory of the cloned repository. +mkdir build # Create a directory to hold the build output. +cd build +cmake .. # Generate native build scripts for GoogleTest. +``` + +The above command also includes GoogleMock by default. And so, if you want to +build only GoogleTest, you should replace the last command with + +``` +cmake .. -DBUILD_GMOCK=OFF +``` + +If you are on a \*nix system, you should now see a Makefile in the current +directory. Just type `make` to build GoogleTest. And then you can simply install +GoogleTest if you are a system administrator. + +``` +make +sudo make install # Install in /usr/local/ by default +``` + +If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +#### Incorporating Into An Existing CMake Project + +If you want to use GoogleTest in a project which already uses CMake, the easiest +way is to get installed libraries and headers. + +* Import GoogleTest by using `find_package` (or `pkg_check_modules`). For + example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the + libraries as `GTest::gtest`, `GTest::gmock`. + +And a more robust and flexible approach is to build GoogleTest as part of that +project directly. This is done by making the GoogleTest source code available to +the main build and adding it using CMake's `add_subdirectory()` command. This +has the significant advantage that the same compiler and linker settings are +used between GoogleTest and the rest of your project, so issues associated with +using incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + approach doesn't have the limitations of the other methods. + +The last of the above methods is implemented with a small piece of CMake code +that downloads and pulls the GoogleTest code into the main build. + +Just add to your `CMakeLists.txt`: + +```cmake +include(FetchContent) +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Now simply link against gtest or gtest_main as needed. Eg +add_executable(example example.cpp) +target_link_libraries(example gtest_main) +add_test(NAME example_test COMMAND example) +``` + +Note that this approach requires CMake 3.14 or later due to its use of the +`FetchContent_MakeAvailable()` command. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +GoogleTest links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +GoogleTest already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +#### C++ Standard Version + +An environment that supports C++11 is required in order to successfully build +GoogleTest. One way to ensure this is to specify the standard in the top-level +project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this +is not feasible, for example in a C project using GoogleTest for validation, +then it can be specified by adding it to the options for cmake via the +`DCMAKE_CXX_FLAGS` option. + +### Tweaking GoogleTest + +GoogleTest can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak GoogleTest by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h). + +### Multi-threaded Tests + +GoogleTest is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the +`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is +`#defined` to 1, no if it's undefined.). + +If GoogleTest doesn't correctly detect whether pthread is available in your +environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When GoogleTest uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script, this is taken care of for you. If you use your own build script, +you'll need to read your compiler and linker's manual to figure out what flags +to add. + +### As a Shared Library (DLL) + +GoogleTest is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use GoogleTest as a shared library (known +as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. + +To compile your *tests* that use the gtest shared library, add + + -DGTEST_LINKED_AS_SHARED_LIBRARY=1 + +to the compiler flags. + +Note: while the above steps aren't technically necessary today when using some +compilers (e.g. GCC), they may become necessary in the future, if we decide to +improve the speed of loading the library (see + for details). Therefore you are recommended +to always add the above flags when using GoogleTest as a shared library. +Otherwise a future release of GoogleTest may break your build script. + +### Avoiding Macro Name Clashes + +In C++, macros don't obey namespaces. Therefore two libraries that both define a +macro of the same name will clash if you `#include` both definitions. In case a +GoogleTest macro clashes with another library, you can force GoogleTest to +rename its macro to avoid the conflict. + +Specifically, if both GoogleTest and some other code define macro FOO, you can +add + + -DGTEST_DONT_DEFINE_FOO=1 + +to the compiler flags to tell GoogleTest to change the macro's name from `FOO` +to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`, +`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`, +`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For +example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write + + GTEST_TEST(SomeTest, DoesThis) { ... } + +instead of + + TEST(SomeTest, DoesThis) { ... } + +in order to define a test. diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h new file mode 100644 index 0000000000..addbb59c64 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h @@ -0,0 +1,237 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements the AssertionResult type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ + +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename std::enable_if< + !std::is_convertible::value>::type* + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h new file mode 100644 index 0000000000..84e5a5bbd3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -0,0 +1,345 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +namespace testing { + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. +// +// Examples: +// +// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); +// for (int i = 0; i < 5; i++) { +// EXPECT_DEATH(server.ProcessRequest(i), +// "Invalid request .* in ProcessRequest()") +// << "Failed to die on request " << i; +// } +// +// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); +// +// bool KilledBySIGHUP(int exit_code) { +// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; +// } +// +// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); +// +// The final parameter to each of these macros is a matcher applied to any data +// the sub-process wrote to stderr. For compatibility with existing tests, a +// bare string is interpreted as a regular expression matcher. +// +// On the regular expressions used in death tests: +// +// On POSIX-compliant systems (*nix), we use the library, +// which uses the POSIX extended regex syntax. +// +// On other platforms (e.g. Windows or Mac), we only support a simple regex +// syntax implemented as part of Google Test. This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. +// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// + +// Asserts that a given `statement` causes the program to exit, with an +// integer exit status that satisfies `predicate`, and emitting error output +// that matches `matcher`. +#define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) + +// Like `ASSERT_EXIT`, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given `statement` causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches `matcher`. +#define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) + +// Like `ASSERT_DEATH`, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + ExitedWithCode(const ExitedWithCode&) = default; + void operator=(const ExitedWithCode& other) = delete; + bool operator()(int exit_status) const; + + private: + const int exit_code_; +}; + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + + private: + const int signum_; +}; +#endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. +// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +#ifdef NDEBUG + +#define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#else + +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) + +#endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters +// on systems that support death tests. This allows one to write such a macro on +// a system that does not support death tests and be sure that it will compile +// on a death-test supporting system. It is exposed publicly so that systems +// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST +// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and +// ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter if and only if EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#endif + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h new file mode 100644 index 0000000000..bffa00c533 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -0,0 +1,956 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ + +#include +#include +#include +#include +#include + +#include "gtest/gtest-printers.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +// MSVC warning C5046 is new as of VS2017 version 15.8. +#if defined(_MSC_VER) && _MSC_VER >= 1915 +#define GTEST_MAYBE_5046_ 5046 +#else +#define GTEST_MAYBE_5046_ +#endif + +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by + clients of class B */ + /* Symbol involving type with internal linkage not defined */) + +namespace testing { + +// To implement a matcher Foo for type T, define: +// 1. a class FooMatcherMatcher that implements the matcher interface: +// using is_gtest_matcher = void; +// bool MatchAndExplain(const T&, std::ostream*); +// (MatchResultListener* can also be used instead of std::ostream*) +// void DescribeTo(std::ostream*); +// void DescribeNegationTo(std::ostream*); +// +// 2. a factory function that creates a Matcher object from a +// FooMatcherMatcher. + +class MatchResultListener { + public: + // Creates a listener object with the given underlying ostream. The + // listener does not own the ostream, and does not dereference it + // in the constructor or destructor. + explicit MatchResultListener(::std::ostream* os) : stream_(os) {} + virtual ~MatchResultListener() = 0; // Makes this class abstract. + + // Streams x to the underlying ostream; does nothing if the ostream + // is NULL. + template + MatchResultListener& operator<<(const T& x) { + if (stream_ != nullptr) *stream_ << x; + return *this; + } + + // Returns the underlying ostream. + ::std::ostream* stream() { return stream_; } + + // Returns true if and only if the listener is interested in an explanation + // of the match result. A matcher's MatchAndExplain() method can use + // this information to avoid generating the explanation when no one + // intends to hear it. + bool IsInterested() const { return stream_ != nullptr; } + + private: + ::std::ostream* const stream_; + + MatchResultListener(const MatchResultListener&) = delete; + MatchResultListener& operator=(const MatchResultListener&) = delete; +}; + +inline MatchResultListener::~MatchResultListener() {} + +// An instance of a subclass of this knows how to describe itself as a +// matcher. +class GTEST_API_ MatcherDescriberInterface { + public: + virtual ~MatcherDescriberInterface() {} + + // Describes this matcher to an ostream. The function should print + // a verb phrase that describes the property a value matching this + // matcher should have. The subject of the verb phrase is the value + // being matched. For example, the DescribeTo() method of the Gt(7) + // matcher prints "is greater than 7". + virtual void DescribeTo(::std::ostream* os) const = 0; + + // Describes the negation of this matcher to an ostream. For + // example, if the description of this matcher is "is greater than + // 7", the negated description could be "is not greater than 7". + // You are not required to override this when implementing + // MatcherInterface, but it is highly advised so that your matcher + // can produce good error messages. + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "not ("; + DescribeTo(os); + *os << ")"; + } +}; + +// The implementation of a matcher. +template +class MatcherInterface : public MatcherDescriberInterface { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener' if necessary (see the next paragraph), in + // the form of a non-restrictive relative clause ("which ...", + // "whose ...", etc) that describes x. For example, the + // MatchAndExplain() method of the Pointee(...) matcher should + // generate an explanation like "which points to ...". + // + // Implementations of MatchAndExplain() should add an explanation of + // the match result *if and only if* they can provide additional + // information that's not already present (or not obvious) in the + // print-out of x and the matcher's description. Whether the match + // succeeds is not a factor in deciding whether an explanation is + // needed, as sometimes the caller needs to print a failure message + // when the match succeeds (e.g. when the matcher is used inside + // Not()). + // + // For example, a "has at least 10 elements" matcher should explain + // what the actual element count is, regardless of the match result, + // as it is useful information to the reader; on the other hand, an + // "is empty" matcher probably only needs to explain what the actual + // size is when the match fails, as it's redundant to say that the + // size is 0 when the value is already known to be empty. + // + // You should override this method when defining a new matcher. + // + // It's the responsibility of the caller (Google Test) to guarantee + // that 'listener' is not NULL. This helps to simplify a matcher's + // implementation when it doesn't care about the performance, as it + // can talk to 'listener' without checking its validity first. + // However, in order to implement dummy listeners efficiently, + // listener->stream() may be NULL. + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0; + + // Inherits these methods from MatcherDescriberInterface: + // virtual void DescribeTo(::std::ostream* os) const = 0; + // virtual void DescribeNegationTo(::std::ostream* os) const; +}; + +namespace internal { + +struct AnyEq { + template + bool operator()(const A& a, const B& b) const { + return a == b; + } +}; +struct AnyNe { + template + bool operator()(const A& a, const B& b) const { + return a != b; + } +}; +struct AnyLt { + template + bool operator()(const A& a, const B& b) const { + return a < b; + } +}; +struct AnyGt { + template + bool operator()(const A& a, const B& b) const { + return a > b; + } +}; +struct AnyLe { + template + bool operator()(const A& a, const B& b) const { + return a <= b; + } +}; +struct AnyGe { + template + bool operator()(const A& a, const B& b) const { + return a >= b; + } +}; + +// A match result listener that ignores the explanation. +class DummyMatchResultListener : public MatchResultListener { + public: + DummyMatchResultListener() : MatchResultListener(nullptr) {} + + private: + DummyMatchResultListener(const DummyMatchResultListener&) = delete; + DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete; +}; + +// A match result listener that forwards the explanation to a given +// ostream. The difference between this and MatchResultListener is +// that the former is concrete. +class StreamMatchResultListener : public MatchResultListener { + public: + explicit StreamMatchResultListener(::std::ostream* os) + : MatchResultListener(os) {} + + private: + StreamMatchResultListener(const StreamMatchResultListener&) = delete; + StreamMatchResultListener& operator=(const StreamMatchResultListener&) = + delete; +}; + +struct SharedPayloadBase { + std::atomic ref{1}; + void Ref() { ref.fetch_add(1, std::memory_order_relaxed); } + bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; } +}; + +template +struct SharedPayload : SharedPayloadBase { + explicit SharedPayload(const T& v) : value(v) {} + explicit SharedPayload(T&& v) : value(std::move(v)) {} + + static void Destroy(SharedPayloadBase* shared) { + delete static_cast(shared); + } + + T value; +}; + +// An internal class for implementing Matcher, which will derive +// from it. We put functionalities common to all Matcher +// specializations here to avoid code duplication. +template +class MatcherBase : private MatcherDescriberInterface { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener'. + bool MatchAndExplain(const T& x, MatchResultListener* listener) const { + GTEST_CHECK_(vtable_ != nullptr); + return vtable_->match_and_explain(*this, x, listener); + } + + // Returns true if and only if this matcher matches x. + bool Matches(const T& x) const { + DummyMatchResultListener dummy; + return MatchAndExplain(x, &dummy); + } + + // Describes this matcher to an ostream. + void DescribeTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, false); + } + + // Describes the negation of this matcher to an ostream. + void DescribeNegationTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, true); + } + + // Explains why x matches, or doesn't match, the matcher. + void ExplainMatchResultTo(const T& x, ::std::ostream* os) const { + StreamMatchResultListener listener(os); + MatchAndExplain(x, &listener); + } + + // Returns the describer for this matcher object; retains ownership + // of the describer, which is only guaranteed to be alive when + // this matcher object is alive. + const MatcherDescriberInterface* GetDescriber() const { + if (vtable_ == nullptr) return nullptr; + return vtable_->get_describer(*this); + } + + protected: + MatcherBase() : vtable_(nullptr), buffer_() {} + + // Constructs a matcher from its implementation. + template + explicit MatcherBase(const MatcherInterface* impl) + : vtable_(nullptr), buffer_() { + Init(impl); + } + + template ::type::is_gtest_matcher> + MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT + Init(std::forward(m)); + } + + MatcherBase(const MatcherBase& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + if (IsShared()) buffer_.shared->Ref(); + } + + MatcherBase& operator=(const MatcherBase& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + if (IsShared()) buffer_.shared->Ref(); + return *this; + } + + MatcherBase(MatcherBase&& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + other.vtable_ = nullptr; + } + + MatcherBase& operator=(MatcherBase&& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + other.vtable_ = nullptr; + return *this; + } + + ~MatcherBase() override { Destroy(); } + + private: + struct VTable { + bool (*match_and_explain)(const MatcherBase&, const T&, + MatchResultListener*); + void (*describe)(const MatcherBase&, std::ostream*, bool negation); + // Returns the captured object if it implements the interface, otherwise + // returns the MatcherBase itself. + const MatcherDescriberInterface* (*get_describer)(const MatcherBase&); + // Called on shared instances when the reference count reaches 0. + void (*shared_destroy)(SharedPayloadBase*); + }; + + bool IsShared() const { + return vtable_ != nullptr && vtable_->shared_destroy != nullptr; + } + + // If the implementation uses a listener, call that. + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) { + return P::Get(m).MatchAndExplain(value, listener->stream()); + } + + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener)) { + return P::Get(m).MatchAndExplain(value, listener); + } + + template + static void DescribeImpl(const MatcherBase& m, std::ostream* os, + bool negation) { + if (negation) { + P::Get(m).DescribeNegationTo(os); + } else { + P::Get(m).DescribeTo(os); + } + } + + template + static const MatcherDescriberInterface* GetDescriberImpl( + const MatcherBase& m) { + // If the impl is a MatcherDescriberInterface, then return it. + // Otherwise use MatcherBase itself. + // This allows us to implement the GetDescriber() function without support + // from the impl, but some users really want to get their impl back when + // they call GetDescriber(). + // We use std::get on a tuple as a workaround of not having `if constexpr`. + return std::get<( + std::is_convertible::value + ? 1 + : 0)>(std::make_tuple(&m, &P::Get(m))); + } + + template + const VTable* GetVTable() { + static constexpr VTable kVTable = {&MatchAndExplainImpl

, + &DescribeImpl

, &GetDescriberImpl

, + P::shared_destroy}; + return &kVTable; + } + + union Buffer { + // Add some types to give Buffer some common alignment/size use cases. + void* ptr; + double d; + int64_t i; + // And add one for the out-of-line cases. + SharedPayloadBase* shared; + }; + + void Destroy() { + if (IsShared() && buffer_.shared->Unref()) { + vtable_->shared_destroy(buffer_.shared); + } + } + + template + static constexpr bool IsInlined() { + return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) && + std::is_trivially_copy_constructible::value && + std::is_trivially_destructible::value; + } + + template ()> + struct ValuePolicy { + static const M& Get(const MatcherBase& m) { + // When inlined along with Init, need to be explicit to avoid violating + // strict aliasing rules. + const M* ptr = + static_cast(static_cast(&m.buffer_)); + return *ptr; + } + static void Init(MatcherBase& m, M impl) { + ::new (static_cast(&m.buffer_)) M(impl); + } + static constexpr auto shared_destroy = nullptr; + }; + + template + struct ValuePolicy { + using Shared = SharedPayload; + static const M& Get(const MatcherBase& m) { + return static_cast(m.buffer_.shared)->value; + } + template + static void Init(MatcherBase& m, Arg&& arg) { + m.buffer_.shared = new Shared(std::forward(arg)); + } + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + struct ValuePolicy*, B> { + using M = const MatcherInterface; + using Shared = SharedPayload>; + static const M& Get(const MatcherBase& m) { + return *static_cast(m.buffer_.shared)->value; + } + static void Init(MatcherBase& m, M* impl) { + m.buffer_.shared = new Shared(std::unique_ptr(impl)); + } + + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + void Init(M&& m) { + using MM = typename std::decay::type; + using Policy = ValuePolicy; + vtable_ = GetVTable(); + Policy::Init(*this, std::forward(m)); + } + + const VTable* vtable_; + Buffer buffer_; +}; + +} // namespace internal + +// A Matcher is a copyable and IMMUTABLE (except by assignment) +// object that can check whether a value of type T matches. The +// implementation of Matcher is just a std::shared_ptr to const +// MatcherInterface. Don't inherit from Matcher! +template +class Matcher : public internal::MatcherBase { + public: + // Constructs a null matcher. Needed for storing Matcher objects in STL + // containers. A default-constructed matcher is not yet initialized. You + // cannot use it until a valid value has been assigned to it. + explicit Matcher() {} // NOLINT + + // Constructs a matcher from its implementation. + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template + explicit Matcher( + const MatcherInterface* impl, + typename std::enable_if::value>::type* = + nullptr) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) : internal::MatcherBase(std::forward(m)) {} // NOLINT + + // Implicit constructor here allows people to write + // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes + Matcher(T value); // NOLINT +}; + +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) { + } + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT +}; +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +// Prints a matcher in a human-readable format. +template +std::ostream& operator<<(std::ostream& os, const Matcher& matcher) { + matcher.DescribeTo(&os); + return os; +} + +// The PolymorphicMatcher class template makes it easy to implement a +// polymorphic matcher (i.e. a matcher that can match values of more +// than one type, e.g. Eq(n) and NotNull()). +// +// To define a polymorphic matcher, a user should provide an Impl +// class that has a DescribeTo() method and a DescribeNegationTo() +// method, and define a member function (or member function template) +// +// bool MatchAndExplain(const Value& value, +// MatchResultListener* listener) const; +// +// See the definition of NotNull() for a complete example. +template +class PolymorphicMatcher { + public: + explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {} + + // Returns a mutable reference to the underlying matcher + // implementation object. + Impl& mutable_impl() { return impl_; } + + // Returns an immutable reference to the underlying matcher + // implementation object. + const Impl& impl() const { return impl_; } + + template + operator Matcher() const { + return Matcher(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public MatcherInterface { + public: + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); } + + void DescribeNegationTo(::std::ostream* os) const override { + impl_.DescribeNegationTo(os); + } + + bool MatchAndExplain(T x, MatchResultListener* listener) const override { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + }; + + Impl impl_; +}; + +// Creates a matcher from its implementation. +// DEPRECATED: Especially in the generic code, prefer: +// Matcher(new MyMatcherImpl(...)); +// +// MakeMatcher may create a Matcher that accepts its argument by value, which +// leads to unnecessary copies & lack of support for non-copyable types. +template +inline Matcher MakeMatcher(const MatcherInterface* impl) { + return Matcher(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher(foo); +template +inline PolymorphicMatcher MakePolymorphicMatcher(const Impl& impl) { + return PolymorphicMatcher(impl); +} + +namespace internal { +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). +template +class ComparisonBase { + public: + explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + + using is_gtest_matcher = void; + + template + bool MatchAndExplain(const Lhs& lhs, std::ostream*) const { + return Op()(lhs, Unwrap(rhs_)); + } + void DescribeTo(std::ostream* os) const { + *os << D::Desc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + void DescribeNegationTo(std::ostream* os) const { + *os << D::NegatedDesc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + + private: + template + static const T& Unwrap(const T& v) { + return v; + } + template + static const T& Unwrap(std::reference_wrapper v) { + return v; + } + + Rhs rhs_; +}; + +template +class EqMatcher : public ComparisonBase, Rhs, AnyEq> { + public: + explicit EqMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyEq>(rhs) {} + static const char* Desc() { return "is equal to"; } + static const char* NegatedDesc() { return "isn't equal to"; } +}; +template +class NeMatcher : public ComparisonBase, Rhs, AnyNe> { + public: + explicit NeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyNe>(rhs) {} + static const char* Desc() { return "isn't equal to"; } + static const char* NegatedDesc() { return "is equal to"; } +}; +template +class LtMatcher : public ComparisonBase, Rhs, AnyLt> { + public: + explicit LtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLt>(rhs) {} + static const char* Desc() { return "is <"; } + static const char* NegatedDesc() { return "isn't <"; } +}; +template +class GtMatcher : public ComparisonBase, Rhs, AnyGt> { + public: + explicit GtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGt>(rhs) {} + static const char* Desc() { return "is >"; } + static const char* NegatedDesc() { return "isn't >"; } +}; +template +class LeMatcher : public ComparisonBase, Rhs, AnyLe> { + public: + explicit LeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLe>(rhs) {} + static const char* Desc() { return "is <="; } + static const char* NegatedDesc() { return "isn't <="; } +}; +template +class GeMatcher : public ComparisonBase, Rhs, AnyGe> { + public: + explicit GeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGe>(rhs) {} + static const char* Desc() { return "is >="; } + static const char* NegatedDesc() { return "isn't >="; } +}; + +template ::value>::type> +using StringLike = T; + +// Implements polymorphic matchers MatchesRegex(regex) and +// ContainsRegex(regex), which can be used as a Matcher as long as +// T can be converted to a string. +class MatchesRegexMatcher { + public: + MatchesRegexMatcher(const RE* regex, bool full_match) + : regex_(regex), full_match_(full_match) {} + +#if GTEST_INTERNAL_HAS_STRING_VIEW + bool MatchAndExplain(const internal::StringView& s, + MatchResultListener* listener) const { + return MatchAndExplain(std::string(s), listener); + } +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != nullptr && MatchAndExplain(std::string(s), listener); + } + + // Matches anything that can convert to std::string. + // + // This is a template, not just a plain function with const std::string&, + // because absl::string_view has some interfering non-explicit constructors. + template + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const std::string& s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) + : RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream* os) const { + *os << (full_match_ ? "matches" : "contains") << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + private: + const std::shared_ptr regex_; + const bool full_match_; +}; +} // namespace internal + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher MatchesRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +template +PolymorphicMatcher MatchesRegex( + const internal::StringLike& regex) { + return MatchesRegex(new internal::RE(std::string(regex))); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher ContainsRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +template +PolymorphicMatcher ContainsRegex( + const internal::StringLike& regex) { + return ContainsRegex(new internal::RE(std::string(regex))); +} + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. +template +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} + +// Constructs a Matcher from a 'value' of type T. The constructed +// matcher matches any value that's equal to 'value'. +template +Matcher::Matcher(T value) { + *this = Eq(value); +} + +// Creates a monomorphic matcher that matches anything with type Lhs +// and equal to rhs. A user may need to use this instead of Eq(...) +// in order to resolve an overloading ambiguity. +// +// TypedEq(x) is just a convenient short-hand for Matcher(Eq(x)) +// or Matcher(x), but more readable than the latter. +// +// We could define similar monomorphic matchers for other comparison +// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do +// it yet as those are used much less than Eq() in practice. A user +// can always write Matcher(Lt(5)) to be explicit about the type, +// for example. +template +inline Matcher TypedEq(const Rhs& rhs) { + return Eq(rhs); +} + +// Creates a polymorphic matcher that matches anything >= x. +template +inline internal::GeMatcher Ge(Rhs x) { + return internal::GeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything > x. +template +inline internal::GtMatcher Gt(Rhs x) { + return internal::GtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything <= x. +template +inline internal::LeMatcher Le(Rhs x) { + return internal::LeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything < x. +template +inline internal::LtMatcher Lt(Rhs x) { + return internal::LtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything != x. +template +inline internal::NeMatcher Ne(Rhs x) { + return internal::NeMatcher(x); +} +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h new file mode 100644 index 0000000000..6c8bf90009 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -0,0 +1,218 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + + // Streams a non-pointer value to this object. + template + inline Message& operator<<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator<<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator<<(T* const& pointer) { // NOLINT + if (pointer == nullptr) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator<<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator<<(bool b) { return *this << (b ? "true" : "false"); } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator<<(const wchar_t* wide_c_str); + Message& operator<<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator<<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + // We'll hold the text streamed to this object here. + const std::unique_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator<<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h new file mode 100644 index 0000000000..b55119ac62 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -0,0 +1,510 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Macros and functions for implementing parameterized tests +// in Google C++ Testing and Mocking Framework (Google Test) + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test suite +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_SUITE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more than once) the first argument to the +// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the +// actual test suite name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests +// in the given test suite, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_SUITE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test suite is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test suite FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test suite StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings)); +// +// This instantiates tests from test suite StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_SUITE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_SUITE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename std::iterator_traits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename std::iterator_traits::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test suite BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_SUITE_P(NumSequence, +// BarTest, +// Values("one", "two", "three")); +// +// This instantiates tests from test suite BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// +template +internal::ValueArray Values(T... v) { + return internal::ValueArray(std::move(v)...); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test suite FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { return Values(false, true); } + +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// std::tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Example: +// +// This will instantiate tests in test suite AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// std::tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder Combine(const Generator&... g) { + return internal::CartesianProductHolder(g...); +} + +#define TEST_P(test_suite_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public test_suite_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + void TestBody() override; \ + \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestPattern( \ + GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ + new ::testing::internal::TestMetaFactory(), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify +// generator and an optional function or functor that generates custom test name +// suffixes based on the test parameters. Such a function or functor should +// accept one argument of type testing::TestParamInfo, and +// return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +#define GTEST_EXPAND_(arg) arg +#define GTEST_GET_FIRST_(first, ...) first +#define GTEST_GET_SECOND_(first, second, ...) second + +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +// Allow Marking a Parameterized test class as not needing to be instantiated. +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ + namespace gtest_do_not_use_outside_namespace_scope {} \ + static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ + GTEST_STRINGIFY_(T)) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TEST_CASE_P \ + static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \ + ""); \ + INSTANTIATE_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h new file mode 100644 index 0000000000..a91e8b8b10 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -0,0 +1,1048 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. +// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Definitions in the internal* namespaces are subject to change without notice. +// DO NOT USE THEM IN USER CODE! +namespace internal { + +template +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +struct ContainerPrinter { + template (0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value>::type> + static void PrintValue(const T& container, std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (auto&& elem : container) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(elem, os) here as PrintTo() doesn't + // handle `elem` being a native array. + internal::UniversalPrint(elem, os); + ++count; + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; + } +}; + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +struct FunctionPointerPrinter { + template ::value>::type> + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } + } +}; + +struct PointerPrinter { + template + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } + } +}; + +namespace internal_stream_operator_without_lexical_name_lookup { + +// The presence of an operator<< here will terminate lexical scope lookup +// straight away (even though it cannot be a match because of its argument +// types). Thus, the two operator<< calls in StreamPrinter will find only ADL +// candidates. +struct LookupBlocker {}; +void operator<<(LookupBlocker, LookupBlocker); + +struct StreamPrinter { + template ::value>::type, + // Only accept types for which we can find a streaming operator via + // ADL (possibly involving implicit conversions). + typename = decltype(std::declval() + << std::declval())> + static void PrintValue(const T& value, ::std::ostream* os) { + // Call streaming operator found by ADL, possibly with implicit conversions + // of the arguments. + *os << value; + } +}; + +} // namespace internal_stream_operator_without_lexical_name_lookup + +struct ProtobufPrinter { + // We print a protobuf using its ShortDebugString() when the string + // doesn't exceed this many characters; otherwise we print it using + // DebugString() for better readability. + static const size_t kProtobufOneLinerMaxLength = 50; + + template ::value>::type> + static void PrintValue(const T& value, ::std::ostream* os) { + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } + *os << ("<" + pretty_str + ">"); + } +}; + +struct ConvertibleToIntegerPrinter { + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(internal::BiggestInt value, ::std::ostream* os) { + *os << value; + } +}; + +struct ConvertibleToStringViewPrinter { +#if GTEST_INTERNAL_HAS_STRING_VIEW + static void PrintValue(internal::StringView value, ::std::ostream* os) { + internal::UniversalPrint(value, os); + } +#endif +}; + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, ::std::ostream* os); +struct RawBytesPrinter { + // SFINAE on `sizeof` to make sure we have a complete type. + template + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo( + static_cast( + // Load bearing cast to void* to support iOS + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; + +struct FallbackPrinter { + template + static void PrintValue(const T&, ::std::ostream* os) { + *os << "(incomplete type)"; + } +}; + +// Try every printer in order and return the first one that works. +template +struct FindFirstPrinter : FindFirstPrinter {}; + +template +struct FindFirstPrinter< + T, decltype(Printer::PrintValue(std::declval(), nullptr)), + Printer, Printers...> { + using type = Printer; +}; + +// Select the best printer in the following order: +// - Print containers (they have begin/end/etc). +// - Print function pointers. +// - Print object pointers. +// - Use the stream operator, if available. +// - Print protocol buffers. +// - Print types convertible to BiggestInt. +// - Print types convertible to StringView, if available. +// - Fallback to printing the raw bytes of the object. +template +void PrintWithFallback(const T& value, ::std::ostream* os) { + using Printer = typename FindFirstPrinter< + T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter, + internal_stream_operator_without_lexical_name_lookup::StreamPrinter, + ProtobufPrinter, ConvertibleToIntegerPrinter, + ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type; + Printer::PrintValue(value, os); +} + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint& value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint* value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); +#ifdef __cpp_lib_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string); + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T& value, ::std::ostream* os) { + internal::PrintWithFallback(value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os); +inline void PrintTo(char16_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#ifdef __cpp_char8_t +inline void PrintTo(char8_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#endif + +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os); +GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os); +#endif // __SIZEOF_INT128__ + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#ifdef __cpp_char8_t +// Overloads for u8 strings. +GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os); +inline void PrintTo(char8_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif +// Overloads for u16 strings. +GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os); +inline void PrintTo(char16_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +// Overloads for u32 strings. +GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os); +inline void PrintTo(char32_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::std::string. +GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::std::u8string +#ifdef __cpp_char8_t +GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) { + PrintU8StringTo(s, os); +} +#endif + +// Overloads for ::std::u16string +GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) { + PrintU16StringTo(s, os); +} + +// Overloads for ::std::u32string +GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { + PrintU32StringTo(s, os); +} + +// Overloads for ::std::wstring. +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Overload for internal::StringView. +inline void PrintTo(internal::StringView sp, ::std::ostream* os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } + +#if GTEST_HAS_RTTI +inline void PrintTo(const std::type_info& info, std::ostream* os) { + *os << internal::GetTypeName(info); +} +#endif // GTEST_HAS_RTTI + +template +void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { + UniversalPrinter::Print(ref.get(), os); +} + +inline const void* VoidifyPointer(const void* p) { return p; } +inline const void* VoidifyPointer(volatile const void* p) { + return const_cast(p); +} + +template +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + // We can't print the value. Just print the pointer.. + *os << "(" << (VoidifyPointer)(ptr.get()) << ")"; + } +} +template ::value && + !std::is_array::value>::type> +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = "; + UniversalPrinter::Print(*ptr, os); + *os << ")"; + } +} + +template +void PrintTo(const std::unique_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +template +void PrintTo(const std::shared_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T&, std::integral_constant, + ::std::ostream*) {} + +template +void PrintTupleTo(const T& t, std::integral_constant, + ::std::ostream* os) { + PrintTupleTo(t, std::integral_constant(), os); + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (I > 1) { + GTEST_INTENTIONAL_CONST_COND_POP_() + *os << ", "; + } + UniversalPrinter::type>::Print( + std::get(t), os); +} + +template +void PrintTo(const ::std::tuple& t, ::std::ostream* os) { + *os << "("; + PrintTupleTo(t, std::integral_constant(), os); + *os << ")"; +} + +// Overload for std::pair. +template +void PrintTo(const ::std::pair& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter::Print(value.first, os); + *os << ", "; + UniversalPrinter::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Remove any const-qualifiers before passing a type to UniversalPrinter. +template +class UniversalPrinter : public UniversalPrinter {}; + +#if GTEST_INTERNAL_HAS_ANY + +// Printer for std::any / absl::any + +template <> +class UniversalPrinter { + public: + static void Print(const Any& value, ::std::ostream* os) { + if (value.has_value()) { + *os << "value of type " << GetTypeName(value); + } else { + *os << "no value"; + } + } + + private: + static std::string GetTypeName(const Any& value) { +#if GTEST_HAS_RTTI + return internal::GetTypeName(value.type()); +#else + static_cast(value); // possibly unused + return ""; +#endif // GTEST_HAS_RTTI + } +}; + +#endif // GTEST_INTERNAL_HAS_ANY + +#if GTEST_INTERNAL_HAS_OPTIONAL + +// Printer for std::optional / absl::optional + +template +class UniversalPrinter> { + public: + static void Print(const Optional& value, ::std::ostream* os) { + *os << '('; + if (!value) { + *os << "nullopt"; + } else { + UniversalPrint(*value, os); + } + *os << ')'; + } +}; + +template <> +class UniversalPrinter { + public: + static void Print(decltype(Nullopt()), ::std::ostream* os) { + *os << "(nullopt)"; + } +}; + +#endif // GTEST_INTERNAL_HAS_OPTIONAL + +#if GTEST_INTERNAL_HAS_VARIANT + +// Printer for std::variant / absl::variant + +template +class UniversalPrinter> { + public: + static void Print(const Variant& value, ::std::ostream* os) { + *os << '('; +#if GTEST_HAS_ABSL + absl::visit(Visitor{os, value.index()}, value); +#else + std::visit(Visitor{os, value.index()}, value); +#endif // GTEST_HAS_ABSL + *os << ')'; + } + + private: + struct Visitor { + template + void operator()(const U& u) const { + *os << "'" << GetTypeName() << "(index = " << index + << ")' with value "; + UniversalPrint(u, os); + } + ::std::ostream* os; + std::size_t index; + }; +}; + +#endif // GTEST_INTERNAL_HAS_VARIANT + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. +template +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray(const char* begin, size_t len, + ::std::ostream* os); + +#ifdef __cpp_char8_t +// This overload prints a (const) char8_t array compactly. +GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len, + ::std::ostream* os); +#endif + +// This overload prints a (const) char16_t array compactly. +GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) char32_t array compactly. +GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) wchar_t array compactly. +GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len, + ::std::ostream* os); + +// Implements printing an array type T[N]. +template +class UniversalPrinter { + public: + // Prints the given array, omitting some elements when there are too + // many. + static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast(&value) << " "; + + // Then prints the value itself. + UniversalPrint(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. + +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T (&value)[N], ::std::ostream* os) { + UniversalPrinter::Print(value, os); + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(const char* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(std::string(str), os); + } + } +}; +template <> +class UniversalTersePrinter : public UniversalTersePrinter { +}; + +#ifdef __cpp_char8_t +template <> +class UniversalTersePrinter { + public: + static void Print(const char8_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u8string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(const char16_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u16string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +template <> +class UniversalTersePrinter { + public: + static void Print(const char32_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u32string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +#if GTEST_HAS_STD_WSTRING +template <> +class UniversalTersePrinter { + public: + static void Print(const wchar_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::wstring(str), os); + } + } +}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(wchar_t* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +template +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalTersePrinter::Print(value, os); +} + +// Prints a value using the type inferred by the compiler. The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. +template +void UniversalPrint(const T& value, ::std::ostream* os) { + // A workarond for the bug in VC++ 7.1 that prevents us from instantiating + // UniversalPrinter with T directly. + typedef T T1; + UniversalPrinter::Print(value, os); +} + +typedef ::std::vector<::std::string> Strings; + +// Tersely prints the first N fields of a tuple to a string vector, +// one element for each field. +template +void TersePrintPrefixToStrings(const Tuple&, std::integral_constant, + Strings*) {} +template +void TersePrintPrefixToStrings(const Tuple& t, + std::integral_constant, + Strings* strings) { + TersePrintPrefixToStrings(t, std::integral_constant(), + strings); + ::std::stringstream ss; + UniversalTersePrint(std::get(t), &ss); + strings->push_back(ss.str()); +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". +template +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TersePrintPrefixToStrings( + value, std::integral_constant::value>(), + &result); + return result; +} + +} // namespace internal + +template +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrinter::Print(value, &ss); + return ss.str(); +} + +} // namespace testing + +// Include any custom printer added by the local installation. +// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gtest/internal/custom/gtest-printers.h" + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h new file mode 100644 index 0000000000..bec8c4810b --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -0,0 +1,248 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + ~ScopedFakeTestPartResultReporter() override; + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) = + delete; + ScopedFakeTestPartResultReporter& operator=( + const ScopedFakeTestPartResultReporter&) = delete; +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, const std::string& substr); + ~SingleFailureChecker(); + + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const std::string substr_; + + SingleFailureChecker(const SingleFailureChecker&) = delete; + SingleFailureChecker& operator=(const SingleFailureChecker&) = delete; +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but +// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. +#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ, +// but not from an ASSERT_EQ). It asserts that the given statement will cause +// exactly one non-fatal Google Test failure with 'substr' being part of the +// failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. +#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h new file mode 100644 index 0000000000..09cc8c34f0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -0,0 +1,190 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure, // Failed and the test should be terminated. + kSkip // Skipped. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, const char* a_file_name, int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == nullptr ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) {} + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? nullptr : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true if and only if the test part was skipped. + bool skipped() const { return type_ == kSkip; } + + // Returns true if and only if the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true if and only if the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true if and only if the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + // Returns true if and only if the test part failed. + bool failed() const { return fatally_failed() || nonfatally_failed(); } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + TestPartResultArray(const TestPartResultArray&) = delete; + TestPartResultArray& operator=(const TestPartResultArray&) = delete; +}; + +// This interface knows how to report a test part result. +class GTEST_API_ TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + ~HasNewFatalFailureHelper() override; + void ReportTestPartResult(const TestPartResult& result) override; + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete; + HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete; +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h new file mode 100644 index 0000000000..bd35a32660 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -0,0 +1,331 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... + typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test suite, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_SUITE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_SUITE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test suite as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to the special name TypeParam to get the type + // parameter. Since we are inside a derived class template, C++ requires + // us to visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +// TYPED_TEST_SUITE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test suite +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_SUITE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test suite as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test suite name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_SUITE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test suite name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_SUITE above, +// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test suite. +#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_ + +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestSuiteName) \ + gtest_type_params_##TestSuiteName##_NameGenerator + +#define TYPED_TEST_SUITE(CaseName, Types, ...) \ + typedef ::testing::internal::GenerateTypeList::type \ + GTEST_TYPE_PARAMS_(CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ + GTEST_NAME_GENERATOR_(CaseName) + +#define TYPED_TEST(CaseName, TestName) \ + static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ + "test-name must not be empty"); \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + void TestBody() override; \ + }; \ + static bool gtest_##CaseName##_##TestName##_registered_ \ + GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel, \ + GTEST_TYPE_PARAMS_( \ + CaseName)>::Register("", \ + ::testing::internal::CodeLocation( \ + __FILE__, __LINE__), \ + GTEST_STRINGIFY_(CaseName), \ + GTEST_STRINGIFY_(TestName), 0, \ + ::testing::internal::GenerateNames< \ + GTEST_NAME_GENERATOR_(CaseName), \ + GTEST_TYPE_PARAMS_(CaseName)>()); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, \ + TestName)::TestBody() + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE \ + static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \ + TYPED_TEST_SUITE +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Implements type-parameterized tests. + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test suite are defined in. The exact +// name of the namespace is subject to change without notice. +#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test suite. +#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \ + gtest_typed_test_suite_p_state_##TestSuiteName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test suite. +#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \ + gtest_registered_test_names_##TestSuiteName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. +#define TYPED_TEST_SUITE_P(SuiteName) \ + static ::testing::internal::TypedTestSuitePState \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE_P \ + static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \ + TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define TYPED_TEST_P(SuiteName, TestName) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + template \ + class TestName : public SuiteName { \ + private: \ + typedef SuiteName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + void TestBody() override; \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ + __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName), \ + GTEST_STRINGIFY_(TestName)); \ + } \ + template \ + void GTEST_SUITE_NAMESPACE_( \ + SuiteName)::TestName::TestBody() + +// Note: this won't work correctly if the trailing arguments are macros. +#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_( \ + SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \ + GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define REGISTER_TYPED_TEST_CASE_P \ + static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \ + ""); \ + REGISTER_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ + static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ + "test-suit-prefix must not be empty"); \ + static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestSuite< \ + SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ + ::testing::internal::GenerateTypeList::type>:: \ + Register(GTEST_STRINGIFY_(Prefix), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ + GTEST_STRINGIFY_(SuiteName), \ + GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ + ::testing::internal::GenerateTypeList::type>()) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TYPED_TEST_CASE_P \ + static_assert( \ + ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \ + INSTANTIATE_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h new file mode 100644 index 0000000000..d19a587a18 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -0,0 +1,2297 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-matchers.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" +#include "gtest/gtest_pred_impl.h" +#include "gtest/gtest_prod.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Declares the flags. + +// This flag temporary enables the disabled tests. +GTEST_DECLARE_bool_(also_run_disabled_tests); + +// This flag brings the debugger on an assertion failure. +GTEST_DECLARE_bool_(break_on_failure); + +// This flag controls whether Google Test catches all test-thrown exceptions +// and logs them as failures. +GTEST_DECLARE_bool_(catch_exceptions); + +// This flag enables using colors in terminal output. Available values are +// "yes" to enable colors, "no" (disable colors), or "auto" (the default) +// to let Google Test decide. +GTEST_DECLARE_string_(color); + +// This flag controls whether the test runner should continue execution past +// first failure. +GTEST_DECLARE_bool_(fail_fast); + +// This flag sets up the filter to select by name using a glob pattern +// the tests to run. If the filter is not given all tests are executed. +GTEST_DECLARE_string_(filter); + +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + +// This flag causes the Google Test to list tests. None of the tests listed +// are actually run if the flag is provided. +GTEST_DECLARE_bool_(list_tests); + +// This flag controls whether Google Test emits a detailed XML report to a file +// in addition to its normal textual output. +GTEST_DECLARE_string_(output); + +// This flags control whether Google Test prints only test failures. +GTEST_DECLARE_bool_(brief); + +// This flags control whether Google Test prints the elapsed time for each +// test. +GTEST_DECLARE_bool_(print_time); + +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + +// This flag specifies the random number seed. +GTEST_DECLARE_int32_(random_seed); + +// This flag sets how many times the tests are repeated. The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test Environments are recreated for each +// repeat of the tests. The default value is true. If set to false the global +// test Environment objects are only set up once, for the first iteration, and +// only torn down once, for the last. +GTEST_DECLARE_bool_(recreate_environments_when_repeating); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. For use with an external test framework. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + +// The upper limit for valid stack trace depths. +const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class StreamingListenerTest; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class UnitTestRecordPropertyTestHelper; +class WindowsDeathTest; +class FuchsiaDeathTest; +class UnitTestImpl* GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message); +std::set* GetIgnoredParameterizedTestSuites(); + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. +class Test; +class TestSuite; + +// Old API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TestCase = TestSuite; +#endif +class TestInfo; +class UnitTest; + +// The abstract class that all tests inherit from. +// +// In Google Test, a unit test program contains one or many TestSuites, and +// each TestSuite contains one or many Tests. +// +// When you define a test using the TEST macro, you don't need to +// explicitly derive from Test - the TEST macro automatically does +// this for you. +// +// The only time you derive from Test is when defining a test fixture +// to be used in a TEST_F. For example: +// +// class FooTest : public testing::Test { +// protected: +// void SetUp() override { ... } +// void TearDown() override { ... } +// ... +// }; +// +// TEST_F(FooTest, Bar) { ... } +// TEST_F(FooTest, Baz) { ... } +// +// Test is not copyable. +class GTEST_API_ Test { + public: + friend class TestInfo; + + // The d'tor is virtual as we intend to inherit from Test. + virtual ~Test(); + + // Sets up the stuff shared by all tests in this test suite. + // + // Google Test will call Foo::SetUpTestSuite() before running the first + // test in test suite Foo. Hence a sub-class can define its own + // SetUpTestSuite() method to shadow the one defined in the super + // class. + static void SetUpTestSuite() {} + + // Tears down the stuff shared by all tests in this test suite. + // + // Google Test will call Foo::TearDownTestSuite() after running the last + // test in test suite Foo. Hence a sub-class can define its own + // TearDownTestSuite() method to shadow the one defined in the super + // class. + static void TearDownTestSuite() {} + + // Legacy API is deprecated but still available. Use SetUpTestSuite and + // TearDownTestSuite instead. +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + static void TearDownTestCase() {} + static void SetUpTestCase() {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Returns true if and only if the current test has a fatal failure. + static bool HasFatalFailure(); + + // Returns true if and only if the current test has a non-fatal failure. + static bool HasNonfatalFailure(); + + // Returns true if and only if the current test was skipped. + static bool IsSkipped(); + + // Returns true if and only if the current test has a (either fatal or + // non-fatal) failure. + static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } + + // Logs a property for the current test, test suite, or for the entire + // invocation of the test program when used outside of the context of a + // test suite. Only the last value for a given key is remembered. These + // are public static so they can be called from utility functions that are + // not members of the test fixture. Calls to RecordProperty made during + // lifespan of the test (from the moment its constructor starts to the + // moment its destructor finishes) will be output in XML as attributes of + // the element. Properties recorded from fixture's + // SetUpTestSuite or TearDownTestSuite are logged as attributes of the + // corresponding element. Calls to RecordProperty made in the + // global context (before or after invocation of RUN_ALL_TESTS and from + // SetUp/TearDown method of Environment objects registered with Google + // Test) will be output as attributes of the element. + static void RecordProperty(const std::string& key, const std::string& value); + static void RecordProperty(const std::string& key, int value); + + protected: + // Creates a Test object. + Test(); + + // Sets up the test fixture. + virtual void SetUp(); + + // Tears down the test fixture. + virtual void TearDown(); + + private: + // Returns true if and only if the current test has the same fixture class + // as the first test in the current test suite. + static bool HasSameFixtureClass(); + + // Runs the test after the test fixture has been set up. + // + // A sub-class must implement this to define the test logic. + // + // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. + // Instead, use the TEST or TEST_F macro. + virtual void TestBody() = 0; + + // Sets up, executes, and tears down the test. + void Run(); + + // Deletes self. We deliberately pick an unusual name for this + // internal method to avoid clashing with names used in user TESTs. + void DeleteSelf_() { delete this; } + + const std::unique_ptr gtest_flag_saver_; + + // Often a user misspells SetUp() as Setup() and spends a long time + // wondering why it is never called by Google Test. The declaration of + // the following method is solely for catching such an error at + // compile time: + // + // - The return type is deliberately chosen to be not void, so it + // will be a conflict if void Setup() is declared in the user's + // test fixture. + // + // - This method is private, so it will be another compiler error + // if the method is called from the user's test fixture. + // + // DO NOT OVERRIDE THIS FUNCTION. + // + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } + + // We disallow copying Tests. + Test(const Test&) = delete; + Test& operator=(const Test&) = delete; +}; + +typedef internal::TimeInMillis TimeInMillis; + +// A copyable object representing a user specified test property which can be +// output as a key/value string pair. +// +// Don't inherit from TestProperty as its destructor is not virtual. +class TestProperty { + public: + // C'tor. TestProperty does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestProperty object. + TestProperty(const std::string& a_key, const std::string& a_value) + : key_(a_key), value_(a_value) {} + + // Gets the user supplied key. + const char* key() const { return key_.c_str(); } + + // Gets the user supplied value. + const char* value() const { return value_.c_str(); } + + // Sets a new value, overriding the one supplied in the constructor. + void SetValue(const std::string& new_value) { value_ = new_value; } + + private: + // The key supplied by the user. + std::string key_; + // The value supplied by the user. + std::string value_; +}; + +// The result of a single Test. This includes a list of +// TestPartResults, a list of TestProperties, a count of how many +// death tests there are in the Test, and how much time it took to run +// the Test. +// +// TestResult is not copyable. +class GTEST_API_ TestResult { + public: + // Creates an empty TestResult. + TestResult(); + + // D'tor. Do not inherit from TestResult. + ~TestResult(); + + // Gets the number of all test parts. This is the sum of the number + // of successful test parts and the number of failed test parts. + int total_part_count() const; + + // Returns the number of the test properties. + int test_property_count() const; + + // Returns true if and only if the test passed (i.e. no test part failed). + bool Passed() const { return !Skipped() && !Failed(); } + + // Returns true if and only if the test was skipped. + bool Skipped() const; + + // Returns true if and only if the test failed. + bool Failed() const; + + // Returns true if and only if the test fatally failed. + bool HasFatalFailure() const; + + // Returns true if and only if the test has a non-fatal failure. + bool HasNonfatalFailure() const; + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Gets the time of the test case start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Returns the i-th test part result among all the results. i can range from 0 + // to total_part_count() - 1. If i is not in that range, aborts the program. + const TestPartResult& GetTestPartResult(int i) const; + + // Returns the i-th test property. i can range from 0 to + // test_property_count() - 1. If i is not in that range, aborts the + // program. + const TestProperty& GetTestProperty(int i) const; + + private: + friend class TestInfo; + friend class TestSuite; + friend class UnitTest; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::ExecDeathTest; + friend class internal::TestResultAccessor; + friend class internal::UnitTestImpl; + friend class internal::WindowsDeathTest; + friend class internal::FuchsiaDeathTest; + + // Gets the vector of TestPartResults. + const std::vector& test_part_results() const { + return test_part_results_; + } + + // Gets the vector of TestProperties. + const std::vector& test_properties() const { + return test_properties_; + } + + // Sets the start time. + void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; } + + // Sets the elapsed time. + void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } + + // Adds a test property to the list. The property is validated and may add + // a non-fatal failure if invalid (e.g., if it conflicts with reserved + // key names). If a property is already recorded for the same key, the + // value will be updated, rather than storing multiple values for the same + // key. xml_element specifies the element for which the property is being + // recorded and is used for validation. + void RecordProperty(const std::string& xml_element, + const TestProperty& test_property); + + // Adds a failure if the key is a reserved attribute of Google Test + // testsuite tags. Returns true if the property is valid. + // FIXME: Validate attribute names are legal and human readable. + static bool ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property); + + // Adds a test part result to the list. + void AddTestPartResult(const TestPartResult& test_part_result); + + // Returns the death test count. + int death_test_count() const { return death_test_count_; } + + // Increments the death test count, returning the new count. + int increment_death_test_count() { return ++death_test_count_; } + + // Clears the test part results. + void ClearTestPartResults(); + + // Clears the object. + void Clear(); + + // Protects mutable state of the property vector and of owned + // properties, whose values may be updated. + internal::Mutex test_properties_mutex_; + + // The vector of TestPartResults + std::vector test_part_results_; + // The vector of TestProperties + std::vector test_properties_; + // Running count of death tests. + int death_test_count_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; + // The elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + + // We disallow copying TestResult. + TestResult(const TestResult&) = delete; + TestResult& operator=(const TestResult&) = delete; +}; // class TestResult + +// A TestInfo object stores the following information about a test: +// +// Test suite name +// Test name +// Whether the test should be run +// A function pointer that creates the test object when invoked +// Test result +// +// The constructor of TestInfo registers itself with the UnitTest +// singleton such that the RUN_ALL_TESTS() macro knows which tests to +// run. +class GTEST_API_ TestInfo { + public: + // Destructs a TestInfo object. This function is not virtual, so + // don't inherit from TestInfo. + ~TestInfo(); + + // Returns the test suite name. + const char* test_suite_name() const { return test_suite_name_.c_str(); } + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const char* test_case_name() const { return test_suite_name(); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Returns the test name. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a typed + // or a type-parameterized test. + const char* type_param() const { + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; + } + + // Returns the text representation of the value parameter, or NULL if this + // is not a value-parameterized test. + const char* value_param() const { + if (value_param_.get() != nullptr) return value_param_->c_str(); + return nullptr; + } + + // Returns the file name where this test is defined. + const char* file() const { return location_.file.c_str(); } + + // Returns the line where this test is defined. + int line() const { return location_.line; } + + // Return true if this test should not be run because it's in another shard. + bool is_in_another_shard() const { return is_in_another_shard_; } + + // Returns true if this test should run, that is if the test is not + // disabled (or it is disabled but the also_run_disabled_tests flag has + // been specified) and its full name matches the user-specified filter. + // + // Google Test allows the user to filter the tests by their full names. + // The full name of a test Bar in test suite Foo is defined as + // "Foo.Bar". Only the tests that match the filter will run. + // + // A filter is a colon-separated list of glob (not regex) patterns, + // optionally followed by a '-' and a colon-separated list of + // negative patterns (tests to exclude). A test is run if it + // matches one of the positive patterns and does not match any of + // the negative patterns. + // + // For example, *A*:Foo.* is a filter that matches any string that + // contains the character 'A' or starts with "Foo.". + bool should_run() const { return should_run_; } + + // Returns true if and only if this test will appear in the XML report. + bool is_reportable() const { + // The XML report includes tests matching the filter, excluding those + // run in other shards. + return matches_filter_ && !is_in_another_shard_; + } + + // Returns the result of the test. + const TestResult* result() const { return &result_; } + + private: +#if GTEST_HAS_DEATH_TEST + friend class internal::DefaultDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + friend class Test; + friend class TestSuite; + friend class internal::UnitTestImpl; + friend class internal::StreamingListenerTest; + friend TestInfo* internal::MakeAndRegisterTestInfo( + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, internal::CodeLocation code_location, + internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, + internal::TestFactoryBase* factory); + + // Constructs a TestInfo object. The newly constructed instance assumes + // ownership of the factory object. + TestInfo(const std::string& test_suite_name, const std::string& name, + const char* a_type_param, // NULL if not a type-parameterized test + const char* a_value_param, // NULL if not a value-parameterized test + internal::CodeLocation a_code_location, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory); + + // Increments the number of death tests encountered in this test so + // far. + int increment_death_test_count() { + return result_.increment_death_test_count(); + } + + // Creates the test object, runs it, records its result, and then + // deletes it. + void Run(); + + // Skip and records the test result for this object. + void Skip(); + + static void ClearTestResult(TestInfo* test_info) { + test_info->result_.Clear(); + } + + // These fields are immutable properties of the test. + const std::string test_suite_name_; // test suite name + const std::string name_; // Test name + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const std::unique_ptr type_param_; + // Text representation of the value parameter, or NULL if this is not a + // value-parameterized test. + const std::unique_ptr value_param_; + internal::CodeLocation location_; + const internal::TypeId fixture_class_id_; // ID of the test fixture class + bool should_run_; // True if and only if this test should run + bool is_disabled_; // True if and only if this test is disabled + bool matches_filter_; // True if this test matches the + // user-specified filter. + bool is_in_another_shard_; // Will be run in another shard. + internal::TestFactoryBase* const factory_; // The factory that creates + // the test object + + // This field is mutable and needs to be reset before running the + // test for the second time. + TestResult result_; + + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; +}; + +// A test suite, which consists of a vector of TestInfos. +// +// TestSuite is not copyable. +class GTEST_API_ TestSuite { + public: + // Creates a TestSuite with the given name. + // + // TestSuite does NOT have a default constructor. Always use this + // constructor to create a TestSuite object. + // + // Arguments: + // + // name: name of the test suite + // a_type_param: the name of the test's type parameter, or NULL if + // this is not a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite(const char* name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); + + // Destructor of TestSuite. + virtual ~TestSuite(); + + // Gets the name of the TestSuite. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a + // type-parameterized test suite. + const char* type_param() const { + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; + } + + // Returns true if any test in this test suite should run. + bool should_run() const { return should_run_; } + + // Gets the number of successful tests in this test suite. + int successful_test_count() const; + + // Gets the number of skipped tests in this test suite. + int skipped_test_count() const; + + // Gets the number of failed tests in this test suite. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests in this test suite. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Get the number of tests in this test suite that should run. + int test_to_run_count() const; + + // Gets the number of all tests in this test suite. + int total_test_count() const; + + // Returns true if and only if the test suite passed. + bool Passed() const { return !Failed(); } + + // Returns true if and only if the test suite failed. + bool Failed() const { + return failed_test_count() > 0 || ad_hoc_test_result().Failed(); + } + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Gets the time of the test suite start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + const TestInfo* GetTestInfo(int i) const; + + // Returns the TestResult that holds test properties recorded during + // execution of SetUpTestSuite and TearDownTestSuite. + const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; } + + private: + friend class Test; + friend class internal::UnitTestImpl; + + // Gets the (mutable) vector of TestInfos in this TestSuite. + std::vector& test_info_list() { return test_info_list_; } + + // Gets the (immutable) vector of TestInfos in this TestSuite. + const std::vector& test_info_list() const { + return test_info_list_; + } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + TestInfo* GetMutableTestInfo(int i); + + // Sets the should_run member. + void set_should_run(bool should) { should_run_ = should; } + + // Adds a TestInfo to this test suite. Will delete the TestInfo upon + // destruction of the TestSuite object. + void AddTestInfo(TestInfo* test_info); + + // Clears the results of all tests in this test suite. + void ClearResult(); + + // Clears the results of all tests in the given test suite. + static void ClearTestSuiteResult(TestSuite* test_suite) { + test_suite->ClearResult(); + } + + // Runs every test in this TestSuite. + void Run(); + + // Skips the execution of tests under this TestSuite + void Skip(); + + // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed + // for catching exceptions thrown from SetUpTestSuite(). + void RunSetUpTestSuite() { + if (set_up_tc_ != nullptr) { + (*set_up_tc_)(); + } + } + + // Runs TearDownTestSuite() for this TestSuite. This wrapper is + // needed for catching exceptions thrown from TearDownTestSuite(). + void RunTearDownTestSuite() { + if (tear_down_tc_ != nullptr) { + (*tear_down_tc_)(); + } + } + + // Returns true if and only if test passed. + static bool TestPassed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Passed(); + } + + // Returns true if and only if test skipped. + static bool TestSkipped(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Skipped(); + } + + // Returns true if and only if test failed. + static bool TestFailed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Failed(); + } + + // Returns true if and only if the test is disabled and will be reported in + // the XML report. + static bool TestReportableDisabled(const TestInfo* test_info) { + return test_info->is_reportable() && test_info->is_disabled_; + } + + // Returns true if and only if test is disabled. + static bool TestDisabled(const TestInfo* test_info) { + return test_info->is_disabled_; + } + + // Returns true if and only if this test will appear in the XML report. + static bool TestReportable(const TestInfo* test_info) { + return test_info->is_reportable(); + } + + // Returns true if the given test should run. + static bool ShouldRunTest(const TestInfo* test_info) { + return test_info->should_run(); + } + + // Shuffles the tests in this test suite. + void ShuffleTests(internal::Random* random); + + // Restores the test order to before the first shuffle. + void UnshuffleTests(); + + // Name of the test suite. + std::string name_; + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const std::unique_ptr type_param_; + // The vector of TestInfos in their original order. It owns the + // elements in the vector. + std::vector test_info_list_; + // Provides a level of indirection for the test list to allow easy + // shuffling and restoring the test order. The i-th element in this + // vector is the index of the i-th test in the shuffled test list. + std::vector test_indices_; + // Pointer to the function that sets up the test suite. + internal::SetUpTestSuiteFunc set_up_tc_; + // Pointer to the function that tears down the test suite. + internal::TearDownTestSuiteFunc tear_down_tc_; + // True if and only if any test in this test suite should run. + bool should_run_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; + // Elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + // Holds test properties recorded during execution of SetUpTestSuite and + // TearDownTestSuite. + TestResult ad_hoc_test_result_; + + // We disallow copying TestSuites. + TestSuite(const TestSuite&) = delete; + TestSuite& operator=(const TestSuite&) = delete; +}; + +// An Environment object is capable of setting up and tearing down an +// environment. You should subclass this to define your own +// environment(s). +// +// An Environment object does the set-up and tear-down in virtual +// methods SetUp() and TearDown() instead of the constructor and the +// destructor, as: +// +// 1. You cannot safely throw from a destructor. This is a problem +// as in some cases Google Test is used where exceptions are enabled, and +// we may want to implement ASSERT_* using exceptions where they are +// available. +// 2. You cannot use ASSERT_* directly in a constructor or +// destructor. +class Environment { + public: + // The d'tor is virtual as we need to subclass Environment. + virtual ~Environment() {} + + // Override this to define how to set up the environment. + virtual void SetUp() {} + + // Override this to define how to tear down the environment. + virtual void TearDown() {} + + private: + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } +}; + +#if GTEST_HAS_EXCEPTIONS + +// Exception which can be thrown from TestEventListener::OnTestPartResult. +class GTEST_API_ AssertionException + : public internal::GoogleTestFailureException { + public: + explicit AssertionException(const TestPartResult& result) + : GoogleTestFailureException(result) {} +}; + +#endif // GTEST_HAS_EXCEPTIONS + +// The interface for tracing execution of tests. The methods are organized in +// the order the corresponding events are fired. +class TestEventListener { + public: + virtual ~TestEventListener() {} + + // Fired before any test activity starts. + virtual void OnTestProgramStart(const UnitTest& unit_test) = 0; + + // Fired before each iteration of tests starts. There may be more than + // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration + // index, starting from 0. + virtual void OnTestIterationStart(const UnitTest& unit_test, + int iteration) = 0; + + // Fired before environment set-up for each iteration of tests starts. + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0; + + // Fired after environment set-up for each iteration of tests ends. + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; + + // Fired before the test suite starts. + virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Fired before the test starts. + virtual void OnTestStart(const TestInfo& test_info) = 0; + + // Fired when a test is disabled + virtual void OnTestDisabled(const TestInfo& /*test_info*/) {} + + // Fired after a failed assertion or a SUCCEED() invocation. + // If you want to throw an exception from this function to skip to the next + // TEST, it must be AssertionException defined above, or inherited from it. + virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; + + // Fired after the test ends. + virtual void OnTestEnd(const TestInfo& test_info) = 0; + + // Fired after the test suite ends. + virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Fired before environment tear-down for each iteration of tests starts. + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; + + // Fired after environment tear-down for each iteration of tests ends. + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; + + // Fired after each iteration of tests finishes. + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0; + + // Fired after all test activities have ended. + virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; +}; + +// The convenience class for users who need to override just one or two +// methods and are not concerned that a possible change to a signature of +// the methods they override will not be caught during the build. For +// comments about each method please see the definition of TestEventListener +// above. +class EmptyTestEventListener : public TestEventListener { + public: + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} + void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} + void OnTestEnd(const TestInfo& /*test_info*/) override {} + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} +}; + +// TestEventListeners lets users add listeners to track events in Google Test. +class GTEST_API_ TestEventListeners { + public: + TestEventListeners(); + ~TestEventListeners(); + + // Appends an event listener to the end of the list. Google Test assumes + // the ownership of the listener (i.e. it will delete the listener when + // the test program finishes). + void Append(TestEventListener* listener); + + // Removes the given event listener from the list and returns it. It then + // becomes the caller's responsibility to delete the listener. Returns + // NULL if the listener is not found in the list. + TestEventListener* Release(TestEventListener* listener); + + // Returns the standard listener responsible for the default console + // output. Can be removed from the listeners list to shut down default + // console output. Note that removing this object from the listener list + // with Release transfers its ownership to the caller and makes this + // function return NULL the next time. + TestEventListener* default_result_printer() const { + return default_result_printer_; + } + + // Returns the standard listener responsible for the default XML output + // controlled by the --gtest_output=xml flag. Can be removed from the + // listeners list by users who want to shut down the default XML output + // controlled by this flag and substitute it with custom one. Note that + // removing this object from the listener list with Release transfers its + // ownership to the caller and makes this function return NULL the next + // time. + TestEventListener* default_xml_generator() const { + return default_xml_generator_; + } + + private: + friend class TestSuite; + friend class TestInfo; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::NoExecDeathTest; + friend class internal::TestEventListenersAccessor; + friend class internal::UnitTestImpl; + + // Returns repeater that broadcasts the TestEventListener events to all + // subscribers. + TestEventListener* repeater(); + + // Sets the default_result_printer attribute to the provided listener. + // The listener is also added to the listener list and previous + // default_result_printer is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultResultPrinter(TestEventListener* listener); + + // Sets the default_xml_generator attribute to the provided listener. The + // listener is also added to the listener list and previous + // default_xml_generator is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultXmlGenerator(TestEventListener* listener); + + // Controls whether events will be forwarded by the repeater to the + // listeners in the list. + bool EventForwardingEnabled() const; + void SuppressEventForwarding(); + + // The actual list of listeners. + internal::TestEventRepeater* repeater_; + // Listener responsible for the standard result output. + TestEventListener* default_result_printer_; + // Listener responsible for the creation of the XML output file. + TestEventListener* default_xml_generator_; + + // We disallow copying TestEventListeners. + TestEventListeners(const TestEventListeners&) = delete; + TestEventListeners& operator=(const TestEventListeners&) = delete; +}; + +// A UnitTest consists of a vector of TestSuites. +// +// This is a singleton class. The only instance of UnitTest is +// created when UnitTest::GetInstance() is first called. This +// instance is never deleted. +// +// UnitTest is not copyable. +// +// This class is thread-safe as long as the methods are called +// according to their specification. +class GTEST_API_ UnitTest { + public: + // Gets the singleton UnitTest object. The first time this method + // is called, a UnitTest object is constructed and returned. + // Consecutive calls will return the same object. + static UnitTest* GetInstance(); + + // Runs all tests in this UnitTest object and prints the result. + // Returns 0 if successful, or 1 otherwise. + // + // This method can only be called from the main thread. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + int Run() GTEST_MUST_USE_RESULT_; + + // Returns the working directory when the first TEST() or TEST_F() + // was executed. The UnitTest object owns the string. + const char* original_working_dir() const; + + // Returns the TestSuite object for the test that's currently running, + // or NULL if no test is running. + const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_); + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_); +#endif + + // Returns the TestInfo object for the test that's currently running, + // or NULL if no test is running. + const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_); + + // Returns the random seed used at the start of the current test run. + int random_seed() const; + + // Returns the ParameterizedTestSuiteRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_); + + // Gets the number of successful test suites. + int successful_test_suite_count() const; + + // Gets the number of failed test suites. + int failed_test_suite_count() const; + + // Gets the number of all test suites. + int total_test_suite_count() const; + + // Gets the number of all test suites that contain at least one test + // that should run. + int test_suite_to_run_count() const; + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + int successful_test_case_count() const; + int failed_test_case_count() const; + int total_test_case_count() const; + int test_case_to_run_count() const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of skipped tests. + int skipped_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const; + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const; + + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). + bool Passed() const; + + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). + bool Failed() const; + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* GetTestCase(int i) const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Returns the TestResult containing information on test failures and + // properties logged outside of individual test suites. + const TestResult& ad_hoc_test_result() const; + + // Returns the list of event listeners that can be used to track events + // inside Google Test. + TestEventListeners& listeners(); + + private: + // Registers and returns a global test environment. When a test + // program is run, all global test environments will be set-up in + // the order they were registered. After all tests in the program + // have finished, all global test environments will be torn-down in + // the *reverse* order they were registered. + // + // The UnitTest object takes ownership of the given environment. + // + // This method can only be called from the main thread. + Environment* AddEnvironment(Environment* env); + + // Adds a TestPartResult to the current TestResult object. All + // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) + // eventually call this to report their results. The user code + // should use the assertion macros instead of calling this directly. + void AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Adds a TestProperty to the current TestResult object when invoked from + // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked + // from SetUpTestSuite or TearDownTestSuite, or to the global property set + // when invoked elsewhere. If the result already contains a property with + // the same key, the value will be updated. + void RecordProperty(const std::string& key, const std::string& value); + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableTestSuite(int i); + + // Accessors for the implementation object. + internal::UnitTestImpl* impl() { return impl_; } + const internal::UnitTestImpl* impl() const { return impl_; } + + // These classes and functions are friends as they need to access private + // members of UnitTest. + friend class ScopedTrace; + friend class Test; + friend class internal::AssertHelper; + friend class internal::StreamingListenerTest; + friend class internal::UnitTestRecordPropertyTestHelper; + friend Environment* AddGlobalTestEnvironment(Environment* env); + friend std::set* internal::GetIgnoredParameterizedTestSuites(); + friend internal::UnitTestImpl* internal::GetUnitTestImpl(); + friend void internal::ReportFailureInUnknownLocation( + TestPartResult::Type result_type, const std::string& message); + + // Creates an empty UnitTest. + UnitTest(); + + // D'tor + virtual ~UnitTest(); + + // Pushes a trace defined by SCOPED_TRACE() on to the per-thread + // Google Test trace stack. + void PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Pops a trace from the per-thread Google Test trace stack. + void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_); + + // Protects mutable state in *impl_. This is mutable as some const + // methods need to lock it too. + mutable internal::Mutex mutex_; + + // Opaque implementation object. This field is never changed once + // the object is constructed. We don't mark it as const here, as + // doing so will cause a warning in the constructor of UnitTest. + // Mutable state in *impl_ is protected by mutex_. + internal::UnitTestImpl* impl_; + + // We disallow copying UnitTest. + UnitTest(const UnitTest&) = delete; + UnitTest& operator=(const UnitTest&) = delete; +}; + +// A convenient wrapper for adding an environment for the test +// program. +// +// You should call this before RUN_ALL_TESTS() is called, probably in +// main(). If you use gtest_main, you need to call this before main() +// starts for it to take effect. For example, you can define a global +// variable like this: +// +// testing::Environment* const foo_env = +// testing::AddGlobalTestEnvironment(new FooEnvironment); +// +// However, we strongly recommend you to write your own main() and +// call AddGlobalTestEnvironment() there, as relying on initialization +// of global variables makes the code harder to read and may cause +// problems when you register multiple environments from different +// translation units and the environments have dependencies among them +// (remember that the compiler doesn't guarantee the order in which +// global variables from different translation units are initialized). +inline Environment* AddGlobalTestEnvironment(Environment* env) { + return UnitTest::GetInstance()->AddEnvironment(env); +} + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. +// +// Calling the function for the second time has no user-visible effect. +GTEST_API_ void InitGoogleTest(int* argc, char** argv); + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); + +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +GTEST_API_ void InitGoogleTest(); + +namespace internal { + +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers +// when calling EXPECT_* in a tight loop. +template +AssertionResult CmpHelperEQFailure(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return EqFailure(lhs_expression, rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), false); +} + +// This block of code defines operator==/!= +// to block lexical scope lookup. +// It prevents using invalid operator==/!= defined at namespace scope. +struct faketype {}; +inline bool operator==(faketype, faketype) { return true; } +inline bool operator!=(faketype, faketype) { return false; } + +// The helper function for {ASSERT|EXPECT}_EQ. +template +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + if (lhs == rhs) { + return AssertionSuccess(); + } + + return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); +} + +class EqHelper { + public: + // This templatized version is for the general case. + template < + typename T1, typename T2, + // Disable this overload for cases where one argument is a pointer + // and the other is the null pointer constant. + typename std::enable_if::value || + !std::is_pointer::value>::type* = nullptr> + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); + } + + // With this overloaded version, we allow anonymous enums to be used + // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous + // enums can be implicitly cast to BiggestInt. + // + // Even though its body looks the same as the above version, we + // cannot merge the two, as it will make anonymous enums unhappy. + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, BiggestInt lhs, + BiggestInt rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); + } + + template + static AssertionResult Compare( + const char* lhs_expression, const char* rhs_expression, + // Handle cases where '0' is used as a null pointer literal. + std::nullptr_t /* lhs */, T* rhs) { + // We already know that 'lhs' is a null pointer. + return CmpHelperEQ(lhs_expression, rhs_expression, static_cast(nullptr), + rhs); + } +}; + +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers +// when calling EXPECT_OP in a tight loop. +template +AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, + const T1& val1, const T2& val2, + const char* op) { + return AssertionFailure() + << "Expected: (" << expr1 << ") " << op << " (" << expr2 + << "), actual: " << FormatForComparisonFailureMessage(val1, val2) + << " vs " << FormatForComparisonFailureMessage(val2, val1); +} + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste +// of similar code. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +#define GTEST_IMPL_CMP_HELPER_(op_name, op) \ + template \ + AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) { \ + if (val1 op val2) { \ + return AssertionSuccess(); \ + } else { \ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \ + } \ + } + +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// Implements the helper function for {ASSERT|EXPECT}_NE +GTEST_IMPL_CMP_HELPER_(NE, !=) +// Implements the helper function for {ASSERT|EXPECT}_LE +GTEST_IMPL_CMP_HELPER_(LE, <=) +// Implements the helper function for {ASSERT|EXPECT}_LT +GTEST_IMPL_CMP_HELPER_(LT, <) +// Implements the helper function for {ASSERT|EXPECT}_GE +GTEST_IMPL_CMP_HELPER_(GE, >=) +// Implements the helper function for {ASSERT|EXPECT}_GT +GTEST_IMPL_CMP_HELPER_(GT, >) + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRNE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); + +// Helper function for *_STREQ on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, const wchar_t* s2); + +// Helper function for *_STRNE on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, const wchar_t* s2); + +} // namespace internal + +// IsSubstring() and IsNotSubstring() are intended to be used as the +// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by +// themselves. They check whether needle is a substring of haystack +// (NULL is considered a substring of itself only), and return an +// appropriate error message when they fail. +// +// The {needle,haystack}_expr arguments are the stringified +// expressions that generated the two real arguments. +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +// Helper template function for comparing floating-points. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + RawType lhs_value, RawType rhs_value) { + const FloatingPoint lhs(lhs_value), rhs(rhs_value); + + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + ::std::stringstream lhs_ss; + lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << lhs_value; + + ::std::stringstream rhs_ss; + rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << rhs_value; + + return EqFailure(lhs_expression, rhs_expression, + StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss), + false); +} + +// Helper function for implementing ASSERT_NEAR. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, double val2, + double abs_error); + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// A class that enables one to stream messages to assertion macros +class GTEST_API_ AssertHelper { + public: + // Constructor. + AssertHelper(TestPartResult::Type type, const char* file, int line, + const char* message); + ~AssertHelper(); + + // Message assignment is a semantic trick to enable assertion + // streaming; see the GTEST_MESSAGE_ macro below. + void operator=(const Message& message) const; + + private: + // We put our data in a struct so that the size of the AssertHelper class can + // be as small as possible. This is important because gcc is incapable of + // re-using stack space even for temporary variables, so every EXPECT_EQ + // reserves stack space for another AssertHelper. + struct AssertHelperData { + AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num, + const char* msg) + : type(t), file(srcfile), line(line_num), message(msg) {} + + TestPartResult::Type const type; + const char* const file; + int const line; + std::string const message; + + private: + AssertHelperData(const AssertHelperData&) = delete; + AssertHelperData& operator=(const AssertHelperData&) = delete; + }; + + AssertHelperData* const data_; + + AssertHelper(const AssertHelper&) = delete; + AssertHelper& operator=(const AssertHelper&) = delete; +}; + +} // namespace internal + +// The pure interface class that all value-parameterized tests inherit from. +// A value-parameterized class must inherit from both ::testing::Test and +// ::testing::WithParamInterface. In most cases that just means inheriting +// from ::testing::TestWithParam, but more complicated test hierarchies +// may need to inherit from Test and WithParamInterface at different levels. +// +// This interface has support for accessing the test parameter value via +// the GetParam() method. +// +// Use it with one of the parameter generator defining functions, like Range(), +// Values(), ValuesIn(), Bool(), and Combine(). +// +// class FooTest : public ::testing::TestWithParam { +// protected: +// FooTest() { +// // Can use GetParam() here. +// } +// ~FooTest() override { +// // Can use GetParam() here. +// } +// void SetUp() override { +// // Can use GetParam() here. +// } +// void TearDown override { +// // Can use GetParam() here. +// } +// }; +// TEST_P(FooTest, DoesBar) { +// // Can use GetParam() method here. +// Foo foo; +// ASSERT_TRUE(foo.DoesBar(GetParam())); +// } +// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); + +template +class WithParamInterface { + public: + typedef T ParamType; + virtual ~WithParamInterface() {} + + // The current parameter value. Is also available in the test fixture's + // constructor. + static const ParamType& GetParam() { + GTEST_CHECK_(parameter_ != nullptr) + << "GetParam() can only be called inside a value-parameterized test " + << "-- did you intend to write TEST_P instead of TEST_F?"; + return *parameter_; + } + + private: + // Sets parameter value. The caller is responsible for making sure the value + // remains alive and unchanged throughout the current test. + static void SetParam(const ParamType* parameter) { parameter_ = parameter; } + + // Static value used for accessing parameter during a test lifetime. + static const ParamType* parameter_; + + // TestClass must be a subclass of WithParamInterface and Test. + template + friend class internal::ParameterizedTestFactory; +}; + +template +const T* WithParamInterface::parameter_ = nullptr; + +// Most value-parameterized classes can ignore the existence of +// WithParamInterface, and can just inherit from ::testing::TestWithParam. + +template +class TestWithParam : public Test, public WithParamInterface {}; + +// Macros for indicating success/failure in test code. + +// Skips test in runtime. +// Skipping test aborts current function. +// Skipped tests are neither successful nor failed. +#define GTEST_SKIP() GTEST_SKIP_("") + +// ADD_FAILURE unconditionally adds a failure to the current test. +// SUCCEED generates a success - it doesn't automatically make the +// current test successful, as a test is only successful when it has +// no failure. +// +// EXPECT_* verifies that a certain condition is satisfied. If not, +// it behaves like ADD_FAILURE. In particular: +// +// EXPECT_TRUE verifies that a Boolean condition is true. +// EXPECT_FALSE verifies that a Boolean condition is false. +// +// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except +// that they will also abort the current function on failure. People +// usually want the fail-fast behavior of FAIL and ASSERT_*, but those +// writing data-driven tests often find themselves using ADD_FAILURE +// and EXPECT_* more. + +// Generates a nonfatal failure with a generic message. +#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed") + +// Generates a nonfatal failure at the given source file location with +// a generic message. +#define ADD_FAILURE_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kNonFatalFailure) + +// Generates a fatal failure with a generic message. +#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") + +// Like GTEST_FAIL(), but at the given source file location. +#define GTEST_FAIL_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kFatalFailure) + +// Define this macro to 1 to omit the definition of FAIL(), which is a +// generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_FAIL +#define FAIL() GTEST_FAIL() +#endif + +// Generates a success with a generic message. +#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded") + +// Define this macro to 1 to omit the definition of SUCCEED(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_SUCCEED +#define SUCCEED() GTEST_SUCCEED() +#endif + +// Macros for testing exceptions. +// +// * {ASSERT|EXPECT}_THROW(statement, expected_exception): +// Tests that the statement throws the expected exception. +// * {ASSERT|EXPECT}_NO_THROW(statement): +// Tests that the statement doesn't throw any exception. +// * {ASSERT|EXPECT}_ANY_THROW(statement): +// Tests that the statement throws an exception. + +#define EXPECT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_) +#define EXPECT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define EXPECT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define ASSERT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_) +#define ASSERT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_) +#define ASSERT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_) + +// Boolean assertions. Condition can be either a Boolean expression or an +// AssertionResult. For more information on how to use AssertionResult with +// these macros see comments on that class. +#define GTEST_EXPECT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_NONFATAL_FAILURE_) +#define GTEST_EXPECT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_NONFATAL_FAILURE_) +#define GTEST_ASSERT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_) +#define GTEST_ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_FATAL_FAILURE_) + +// Define these macros to 1 to omit the definition of the corresponding +// EXPECT or ASSERT, which clashes with some users' own code. + +#if !GTEST_DONT_DEFINE_EXPECT_TRUE +#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_EXPECT_FALSE +#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_TRUE +#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_FALSE +#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition) +#endif + +// Macros for testing equalities and inequalities. +// +// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// +// When they are not, Google Test prints both the tested expressions and +// their actual values. The values must be compatible built-in types, +// or you will get a compiler error. By "compatible" we mean that the +// values can be compared by the respective operator. +// +// Note: +// +// 1. It is possible to make a user-defined type work with +// {ASSERT|EXPECT}_??(), but that requires overloading the +// comparison operators and is thus discouraged by the Google C++ +// Usage Guide. Therefore, you are advised to use the +// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are +// equal. +// +// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on +// pointers (in particular, C strings). Therefore, if you use it +// with two C strings, you are testing how their locations in memory +// are related, not how their content is related. To compare two C +// strings by content, use {ASSERT|EXPECT}_STR*(). +// +// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to +// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you +// what the actual value is when it fails, and similarly for the +// other comparisons. +// +// 4. Do not depend on the order in which {ASSERT|EXPECT}_??() +// evaluate their arguments, which is undefined. +// +// 5. These macros evaluate their arguments exactly once. +// +// Examples: +// +// EXPECT_NE(Foo(), 5); +// EXPECT_EQ(a_pointer, NULL); +// ASSERT_LT(i, array_size); +// ASSERT_GT(records.size(), 0) << "There is no record left."; + +#define EXPECT_EQ(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) +#define EXPECT_NE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) +#define EXPECT_LE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define EXPECT_LT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define EXPECT_GE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define EXPECT_GT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +#define GTEST_ASSERT_EQ(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) +#define GTEST_ASSERT_NE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) +#define GTEST_ASSERT_LE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define GTEST_ASSERT_LT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define GTEST_ASSERT_GE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define GTEST_ASSERT_GT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of +// ASSERT_XY(), which clashes with some users' own code. + +#if !GTEST_DONT_DEFINE_ASSERT_EQ +#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_NE +#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LE +#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LT +#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GE +#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GT +#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#endif + +// C-string Comparisons. All tests treat NULL and any non-NULL string +// as different. Two NULLs are equal. +// +// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2 +// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2 +// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case +// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case +// +// For wide or narrow string objects, you can use the +// {ASSERT|EXPECT}_??() macros. +// +// Don't depend on the order in which the arguments are evaluated, +// which is undefined. +// +// These macros evaluate their arguments exactly once. + +#define EXPECT_STREQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) +#define EXPECT_STRNE(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define EXPECT_STRCASEEQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) +#define EXPECT_STRCASENE(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +#define ASSERT_STREQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) +#define ASSERT_STRNE(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define ASSERT_STRCASEEQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) +#define ASSERT_STRCASENE(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +// Macros for comparing floating-point numbers. +// +// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2): +// Tests that two float values are almost equal. +// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2): +// Tests that two double values are almost equal. +// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): +// Tests that v1 and v2 are within the given distance to each other. +// +// Google Test uses ULP-based comparison to automatically pick a default +// error bound that is appropriate for the operands. See the +// FloatingPoint template class in gtest-internal.h if you are +// interested in the implementation details. + +#define EXPECT_FLOAT_EQ(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + val1, val2) + +#define EXPECT_DOUBLE_EQ(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + val1, val2) + +#define ASSERT_FLOAT_EQ(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + val1, val2) + +#define ASSERT_DOUBLE_EQ(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + val1, val2) + +#define EXPECT_NEAR(val1, val2, abs_error) \ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) + +#define ASSERT_NEAR(val1, val2, abs_error) \ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) + +// These predicate format functions work on floating-point values, and +// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. +// +// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0); + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2); +GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2); + +#if GTEST_OS_WINDOWS + +// Macros that test for HRESULT failure and success, these are only useful +// on Windows, and rely on Windows SDK macros and APIs to compile. +// +// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr) +// +// When expr unexpectedly fails or succeeds, Google Test prints the +// expected result and the actual result with both a human-readable +// string representation of the error, if available, as well as the +// hex result code. +#define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +#define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +#define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +#define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +#endif // GTEST_OS_WINDOWS + +// Macros that execute statement and check that it doesn't generate new fatal +// failures in the current thread. +// +// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement); +// +// Examples: +// +// EXPECT_NO_FATAL_FAILURE(Process()); +// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; +// +#define ASSERT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) +#define EXPECT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + +// Causes a trace (including the given source file path and line number, +// and the given message) to be included in every test failure message generated +// by code in the scope of the lifetime of an instance of this class. The effect +// is undone with the destruction of the instance. +// +// The message argument can be anything streamable to std::ostream. +// +// Example: +// testing::ScopedTrace trace("file.cc", 123, "message"); +// +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + + // Template version. Uses Message() to convert the values into strings. + // Slow, but flexible. + template + ScopedTrace(const char* file, int line, const T& message) { + PushTrace(file, line, (Message() << message).GetString()); + } + + // Optimize for some known types. + ScopedTrace(const char* file, int line, const char* message) { + PushTrace(file, line, message ? message : "(null)"); + } + + ScopedTrace(const char* file, int line, const std::string& message) { + PushTrace(file, line, message); + } + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + void PushTrace(const char* file, int line, std::string message); + + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + +// Causes a trace (including the source file path, the current line +// number, and the given message) to be included in every test failure +// message generated by code in the current scope. The effect is +// undone when the control leaves the current scope. +// +// The message argument can be anything streamable to std::ostream. +// +// In the implementation, we include the current line number as part +// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s +// to appear in the same block - as long as they are on different +// lines. +// +// Assuming that each thread maintains its own stack of traces. +// Therefore, a SCOPED_TRACE() would (correctly) only affect the +// assertions in its own thread. +#define SCOPED_TRACE(message) \ + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \ + __FILE__, __LINE__, (message)) + +// Compile-time assertion for type equality. +// StaticAssertTypeEq() compiles if and only if type1 and type2 +// are the same type. The value it returns is not interesting. +// +// Instead of making StaticAssertTypeEq a class template, we make it a +// function template that invokes a helper class template. This +// prevents a user from misusing StaticAssertTypeEq by +// defining objects of that type. +// +// CAVEAT: +// +// When used inside a method of a class template, +// StaticAssertTypeEq() is effective ONLY IF the method is +// instantiated. For example, given: +// +// template class Foo { +// public: +// void Bar() { testing::StaticAssertTypeEq(); } +// }; +// +// the code: +// +// void Test1() { Foo foo; } +// +// will NOT generate a compiler error, as Foo::Bar() is never +// actually instantiated. Instead, you need: +// +// void Test2() { Foo foo; foo.Bar(); } +// +// to cause a compiler error. +template +constexpr bool StaticAssertTypeEq() noexcept { + static_assert(std::is_same::value, "T1 and T2 are not the same type"); + return true; +} + +// Defines a test. +// +// The first parameter is the name of the test suite, and the second +// parameter is the name of the test within the test suite. +// +// The convention is to end the test suite name with "Test". For +// example, a test suite for the Foo class can be named FooTest. +// +// Test code should appear between braces after an invocation of +// this macro. Example: +// +// TEST(FooTest, InitializesCorrectly) { +// Foo foo; +// EXPECT_TRUE(foo.StatusIsOK()); +// } + +// Note that we call GetTestTypeId() instead of GetTypeId< +// ::testing::Test>() here to get the type ID of testing::Test. This +// is to work around a suspected linker bug when using Google Test as +// a framework on Mac OS X. The bug causes GetTypeId< +// ::testing::Test>() to return different values depending on whether +// the call is from the Google Test framework itself or from user test +// code. GetTestTypeId() is guaranteed to always return the same +// value, as it always calls GetTypeId<>() from the Google Test +// framework. +#define GTEST_TEST(test_suite_name, test_name) \ + GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \ + ::testing::internal::GetTestTypeId()) + +// Define this macro to 1 to omit the definition of TEST(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_TEST +#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name) +#endif + +// Defines a test that uses a test fixture. +// +// The first parameter is the name of the test fixture class, which +// also doubles as the test suite name. The second parameter is the +// name of the test within the test suite. +// +// A test fixture class must be declared earlier. The user should put +// the test code between braces after using this macro. Example: +// +// class FooTest : public testing::Test { +// protected: +// void SetUp() override { b_.AddElement(3); } +// +// Foo a_; +// Foo b_; +// }; +// +// TEST_F(FooTest, InitializesCorrectly) { +// EXPECT_TRUE(a_.StatusIsOK()); +// } +// +// TEST_F(FooTest, ReturnsElementCountCorrectly) { +// EXPECT_EQ(a_.size(), 0); +// EXPECT_EQ(b_.size(), 1); +// } +#define GTEST_TEST_F(test_fixture, test_name) \ + GTEST_TEST_(test_fixture, test_name, test_fixture, \ + ::testing::internal::GetTypeId()) +#if !GTEST_DONT_DEFINE_TEST_F +#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name) +#endif + +// Returns a path to temporary directory. +// Tries to determine an appropriate directory for the platform. +GTEST_API_ std::string TempDir(); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +// Dynamically registers a test with the framework. +// +// This is an advanced API only to be used when the `TEST` macros are +// insufficient. The macros should be preferred when possible, as they avoid +// most of the complexity of calling this function. +// +// The `factory` argument is a factory callable (move-constructible) object or +// function pointer that creates a new instance of the Test object. It +// handles ownership to the caller. The signature of the callable is +// `Fixture*()`, where `Fixture` is the test fixture class for the test. All +// tests registered with the same `test_suite_name` must return the same +// fixture type. This is checked at runtime. +// +// The framework will infer the fixture class from the factory and will call +// the `SetUpTestSuite` and `TearDownTestSuite` for it. +// +// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is +// undefined. +// +// Use case example: +// +// class MyFixture : public ::testing::Test { +// public: +// // All of these optional, just like in regular macro usage. +// static void SetUpTestSuite() { ... } +// static void TearDownTestSuite() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } +// }; +// +// class MyTest : public MyFixture { +// public: +// explicit MyTest(int data) : data_(data) {} +// void TestBody() override { ... } +// +// private: +// int data_; +// }; +// +// void RegisterMyTests(const std::vector& values) { +// for (int v : values) { +// ::testing::RegisterTest( +// "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr, +// std::to_string(v).c_str(), +// __FILE__, __LINE__, +// // Important to use the fixture type as the return type here. +// [=]() -> MyFixture* { return new MyTest(v); }); +// } +// } +// ... +// int main(int argc, char** argv) { +// ::testing::InitGoogleTest(&argc, argv); +// std::vector values_to_test = LoadValuesFromConfig(); +// RegisterMyTests(values_to_test); +// ... +// return RUN_ALL_TESTS(); +// } +// +template +TestInfo* RegisterTest(const char* test_suite_name, const char* test_name, + const char* type_param, const char* value_param, + const char* file, int line, Factory factory) { + using TestT = typename std::remove_pointer::type; + + class FactoryImpl : public internal::TestFactoryBase { + public: + explicit FactoryImpl(Factory f) : factory_(std::move(f)) {} + Test* CreateTest() override { return factory_(); } + + private: + Factory factory_; + }; + + return internal::MakeAndRegisterTestInfo( + test_suite_name, test_name, type_param, value_param, + internal::CodeLocation(file, line), internal::GetTypeId(), + internal::SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + internal::SuiteApiResolver::GetTearDownCaseOrSuite(file, line), + new FactoryImpl{std::move(factory)}); +} + +} // namespace testing + +// Use this function in main() to run all tests. It returns 0 if all +// tests are successful, or 1 otherwise. +// +// RUN_ALL_TESTS() should be invoked after the command line has been +// parsed by InitGoogleTest(). +// +// This function was formerly a macro; thus, it is in the global +// namespace and has an all-caps name. +int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; + +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h new file mode 100644 index 0000000000..47a24aa687 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -0,0 +1,279 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Implements a family of generic predicate assertion macros. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +#include "gtest/gtest-assertion-result.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template +AssertionResult AssertPred1Helper(const char* pred_text, const char* e1, + Pred pred, const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, v1), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +#define GTEST_PRED1_(pred, v1, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template +AssertionResult AssertPred2Helper(const char* pred_text, const char* e1, + const char* e2, Pred pred, const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \ + on_failure) + +// Binary predicate assertion macros. +#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +template +AssertionResult AssertPred3Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, Pred pred, + const T1& v1, const T2& v2, const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \ + GTEST_ASSERT_( \ + ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \ + on_failure) + +// Ternary predicate assertion macros. +#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template +AssertionResult AssertPred4Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, Pred pred, const T1& v1, + const T2& v2, const T3& v3, const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \ + v1, v2, v3, v4), \ + on_failure) + +// 4-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +template +AssertionResult AssertPred5Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, const char* e5, Pred pred, + const T1& v1, const T2& v2, const T3& v3, + const T4& v4, const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ", " << e5 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n" + << e5 << " evaluates to " << ::testing::PrintToString(v5); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \ + pred, v1, v2, v3, v4, v5), \ + on_failure) + +// 5-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h new file mode 100644 index 0000000000..1f37dc31c3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -0,0 +1,60 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google C++ Testing and Mocking Framework definitions useful in production +// code. + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. +// } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. + +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md new file mode 100644 index 0000000000..cb49e2c754 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -0,0 +1,44 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. + +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 0000000000..9b7fb4261a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,68 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +// Use a stub Notification class. +// +// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and +// std::condition_variable. The and headers of +// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only +// when configured with the posix threads option but don't define them when +// configured with the win32 threads option. The Notification class is only +// used in GoogleTest's internal tests. Since we don't build GoogleTest's +// internal tests, we don't need a working Notification class. Although it's +// not hard to fix the mingw32 g++ compilation errors by implementing the +// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE, +// it's simpler to just use a stub Notification class on all platforms. +// +// The default constructor of the stub class is deleted and the declaration of +// the Notify() method is commented out, so that compilation will fail if any +// code actually uses the Notification class. + +#define GTEST_HAS_NOTIFICATION_ 1 +namespace testing { +namespace internal { +class Notification { + public: + Notification() = delete; + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + // void Notify(); + void WaitForNotification() {} +}; +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 0000000000..b9495d8378 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h new file mode 100644 index 0000000000..afaaf17ba2 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h new file mode 100644 index 0000000000..45580ae805 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -0,0 +1,306 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +#include + +#include + +#include "gtest/gtest-matchers.h" +#include "gtest/internal/gtest-internal.h" + +GTEST_DECLARE_string_(internal_run_death_test); + +namespace testing { +namespace internal { + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. + +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, Matcher matcher, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() {} + + // A helper class that aborts a death test when it's deleted. + class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) {} + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + + private: + DeathTest* const test_; + ReturnSentinel(const ReturnSentinel&) = delete; + ReturnSentinel& operator=(const ReturnSentinel&) = delete; + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. + virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + DeathTest(const DeathTest&) = delete; + DeathTest& operator=(const DeathTest&) = delete; +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() {} + virtual bool Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + bool Create(const char* statement, Matcher matcher, + const char* file, int line, DeathTest** test) override; +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads +// and interpreted as a regex (rather than an Eq matcher) for legacy +// compatibility. +inline Matcher MakeDeathTestMatcher( + ::testing::internal::RE regex) { + return ContainsRegex(regex.pattern()); +} +inline Matcher MakeDeathTestMatcher(const char* regex) { + return ContainsRegex(regex); +} +inline Matcher MakeDeathTestMatcher( + const ::std::string& regex) { + return ContainsRegex(regex); +} + +// If a Matcher is passed to EXPECT_DEATH (etc.), it's +// used directly. +inline Matcher MakeDeathTestMatcher( + Matcher matcher) { + return matcher; +} + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. +#if GTEST_HAS_EXCEPTIONS +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf( \ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) { \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +#else +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +#endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create( \ + #statement, \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher), \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != nullptr) { \ + std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel( \ + gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__) \ + : fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in +// NDEBUG mode. In this case we need the statements to be executed and the macro +// must accept a streamed message even though the message is never printed. +// The regex object is not evaluated, but it is used to prevent "unused" +// warnings and to avoid an expression that doesn't compile in debug mode. +#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) posix::Close(write_fd_); + } + + const std::string& file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete; + InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete; +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h new file mode 100644 index 0000000000..a2a60a962b --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,210 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in gtest/internal/gtest-internal.h. +// Do not include this header file separately! + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") {} + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {} + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true if and only if the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurrence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h new file mode 100644 index 0000000000..9b04e4c85f --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -0,0 +1,1570 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_LINUX +#include +#include +#include +#include +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/internal/gtest-type-util.h" + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar + +// Stringifies its argument. +// Work around a bug in visual studio which doesn't accept code like this: +// +// #define GTEST_STRINGIFY_(name) #name +// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ... +// MACRO(, x, y) +// +// Complaining about the argument to GTEST_STRINGIFY_ being empty. +// This is allowed by the spec. +#define GTEST_STRINGIFY_HELPER_(name, ...) #name +#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, ) + +namespace proto2 { +class MessageLite; +} + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test suites. + +template +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// An IgnoredValue object can be implicitly constructed from ANY value. +class IgnoredValue { + struct Sink {}; + + public: + // This constructor template allows any value to be implicitly + // converted to IgnoredValue. The object has no data member and + // doesn't try to remember anything about the argument. We + // deliberately omit the 'explicit' keyword in order to allow the + // conversion to be implicit. + // Disable the conversion if T already has a magical conversion operator. + // Otherwise we get ambiguity. + template ::value, + int>::type = 0> + IgnoredValue(const T& /* ignored */) {} // NOLINT(runtime/explicit) +}; + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg); + +#if GTEST_HAS_EXCEPTIONS + +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4275 /* an exported class was derived from a class that was not exported */) + +// This exception is thrown by (and only by) a failed Google Test +// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions +// are enabled). We derive it from std::runtime_error, which is for +// errors presumably detectable only at run time. Since +// std::runtime_error inherits from std::exception, many testing +// frameworks know how to extract and print the message inside it. +class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure); +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4275 + +#endif // GTEST_HAS_EXCEPTIONS + +namespace edit_distance { +// Returns the optimal edits to go from 'left' to 'right'. +// All edits cost the same, with replace having lower priority than +// add/remove. +// Simple implementation of the Wagner-Fischer algorithm. +// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm +enum EditType { kMatch, kAdd, kRemove, kReplace }; +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, const std::vector& right); + +// Same as above, but the input is represented as strings. +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right); + +// Create a diff of the input strings in Unified diff format. +GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context = 2); + +} // namespace edit_distance + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true if and only if the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8 * sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const uint32_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits& bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true if and only if this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true if and only if this number is at most kMaxUlps ULP's away + // from rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits& sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1, + const Bits& sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . +template <> +inline float FloatingPoint::Max() { + return FLT_MAX; +} +template <> +inline double FloatingPoint::Max() { + return DBL_MAX; +} + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test suite, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template +bool TypeIdHelper::dummy_ = false; + +// GetTypeId() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper::dummy_); +} + +// Returns the type ID of ::testing::Test. Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + TestFactoryBase(const TestFactoryBase&) = delete; + TestFactoryBase& operator=(const TestFactoryBase&) = delete; +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template +class TestFactoryImpl : public TestFactoryBase { + public: + Test* CreateTest() override { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestSuite() and TearDownTestSuite() functions. +using SetUpTestSuiteFunc = void (*)(); +using TearDownTestSuiteFunc = void (*)(); + +struct CodeLocation { + CodeLocation(const std::string& a_file, int a_line) + : file(a_file), line(a_line) {} + + std::string file; + int line; +}; + +// Helper to identify which setup function for TestCase / TestSuite to call. +// Only one function is allowed, either TestCase or TestSute but not both. + +// Utility functions to help SuiteApiResolver +using SetUpTearDownSuiteFuncType = void (*)(); + +inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( + SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) { + return a == def ? nullptr : a; +} + +template +// Note that SuiteApiResolver inherits from T because +// SetUpTestSuite()/TearDownTestSuite() could be protected. This way +// SuiteApiResolver can access them. +struct SuiteApiResolver : T { + // testing::Test is only forward declared at this point. So we make it a + // dependent class for the compiler to be OK with it. + using Test = + typename std::conditional::type; + + static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename, + int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both SetUpTestSuite and SetUpTestCase, please " + "make sure there is only one present at " + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::SetUpTestSuite; +#endif + } + + static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename, + int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both TearDownTestSuite and TearDownTestCase," + " please make sure there is only one present at" + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::TearDownTestSuite; +#endif + } +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// State of the definition of a type-parameterized test suite. +class GTEST_API_ TypedTestSuitePState { + public: + TypedTestSuitePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test suite hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, + "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string& test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation& GetCodeLocation(const std::string& test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char* VerifyRegisteredTestNames(const char* test_suite_name, + const char* file, int line, + const char* registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TypedTestCasePState = TypedTestSuitePState; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == nullptr) { + return nullptr; + } + while (IsSpace(*(++comma))) { + } + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == nullptr ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string& str, char delimiter, + ::std::vector<::std::string>* dest); + +// The default argument to the template below for the case when the user does +// not provide a name generator. +struct DefaultNameGenerator { + template + static std::string GetName(int i) { + return StreamableToString(i); + } +}; + +template +struct NameGeneratorSelector { + typedef Provided type; +}; + +template +void GenerateNamesRecursively(internal::None, std::vector*, int) {} + +template +void GenerateNamesRecursively(Types, std::vector* result, int i) { + result->push_back(NameGenerator::template GetName(i)); + GenerateNamesRecursively(typename Types::Tail(), result, + i + 1); +} + +template +std::vector GenerateNames() { + std::vector result; + GenerateNamesRecursively(Types(), &result, 0); + return result; +} + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, const CodeLocation& code_location, + const char* case_name, const char* test_names, int index, + const std::vector& type_names = + GenerateNames()) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + + "/" + type_names[static_cast(index)]) + .c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + nullptr, // No value parameter. + code_location, GetTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite( + code_location.file.c_str(), code_location.line), + SuiteApiResolver::GetTearDownCaseOrSuite( + code_location.file.c_str(), code_location.line), + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest::Register(prefix, + code_location, + case_name, + test_names, + index + 1, + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char* /*prefix*/, const CodeLocation&, + const char* /*case_name*/, const char* /*test_names*/, + int /*index*/, + const std::vector& = + std::vector() /*type_names*/) { + return true; + } +}; + +GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location); +GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation( + const char* case_name); + +// TypeParameterizedTestSuite::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char* prefix, CodeLocation code_location, + const TypedTestSuitePState* state, const char* case_name, + const char* test_names, + const std::vector& type_names = + GenerateNames()) { + RegisterTypeParameterizedTestSuiteInstantiation(case_name); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation& test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0, type_names); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestSuite::Register(prefix, code_location, + state, case_name, + SkipComma(test_names), + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char* /*prefix*/, const CodeLocation&, + const TypedTestSuitePState* /*state*/, + const char* /*case_name*/, const char* /*test_names*/, + const std::vector& = + std::vector() /*type_names*/) { + return true; + } +}; + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// Helper for declaring std::string within 'if' statement +// in pre C++17 build environment. +struct TrueWithString { + TrueWithString() = default; + explicit TrueWithString(const char* str) : value(str) {} + explicit TrueWithString(const std::string& str) : value(str) {} + explicit operator bool() const { return true; } + std::string value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const uint32_t kMaxRange = 1u << 31; + + explicit Random(uint32_t seed) : state_(seed) {} + + void Reseed(uint32_t seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + uint32_t Generate(uint32_t range); + + private: + uint32_t state_; + Random(const Random&) = delete; + Random& operator=(const Random&) = delete; +}; + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + typename std::remove_const::type>::type + +// HasDebugStringAndShortDebugString::value is a compile-time bool constant +// that's true if and only if T has methods DebugString() and ShortDebugString() +// that return std::string. +template +class HasDebugStringAndShortDebugString { + private: + template + static auto CheckDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().DebugString())>::type; + template + static std::false_type CheckDebugString(...); + + template + static auto CheckShortDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().ShortDebugString())>::type; + template + static std::false_type CheckShortDebugString(...); + + using HasDebugStringType = decltype(CheckDebugString(nullptr)); + using HasShortDebugStringType = decltype(CheckShortDebugString(nullptr)); + + public: + static constexpr bool value = + HasDebugStringType::value && HasShortDebugStringType::value; +}; + +template +constexpr bool HasDebugStringAndShortDebugString::value; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// In C++11 mode we check the existence of a const_iterator and that an +// iterator is properly implemented for the container. +// +// For pre-C++11 that we look for both C::iterator and C::const_iterator. +// The reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template ().begin()), + class = decltype(::std::declval().end()), + class = decltype(++::std::declval()), + class = decltype(*::std::declval()), + class = typename C::const_iterator> +IsContainer IsContainerTest(int /* dummy */) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} + +// Trait to detect whether a type T is a hash table. +// The heuristic used is that the type contains an inner type `hasher` and does +// not contain an inner type `reverse_iterator`. +// If the container is iterable in reverse, then order might actually matter. +template +struct IsHashTable { + private: + template + static char test(typename U::hasher*, typename U::reverse_iterator*); + template + static int test(typename U::hasher*, ...); + template + static char test(...); + + public: + static const bool value = sizeof(test(nullptr, nullptr)) == sizeof(int); +}; + +template +const bool IsHashTable::value; + +template (0)) == sizeof(IsContainer)> +struct IsRecursiveContainerImpl; + +template +struct IsRecursiveContainerImpl : public std::false_type {}; + +// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to +// obey the same inconsistencies as the IsContainerTest, namely check if +// something is a container is relying on only const_iterator in C++11 and +// is relying on both const_iterator and iterator otherwise +template +struct IsRecursiveContainerImpl { + using value_type = decltype(*std::declval()); + using type = + std::is_same::type>::type, + C>; +}; + +// IsRecursiveContainer is a unary compile-time predicate that +// evaluates whether C is a recursive container type. A recursive container +// type is a container type whose value_type is equal to the container type +// itself. An example for a recursive container type is +// boost::filesystem::path, whose iterator has a value_type that is equal to +// boost::filesystem::path. +template +struct IsRecursiveContainer : public IsRecursiveContainerImpl::type {}; + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T& lhs, const U& rhs) { + return lhs == rhs; +} + +// This overload is used when k >= 1. +template +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T& from, U* to) { + *to = from; +} + +// This overload is used when k >= 1. +template +inline void CopyArray(const T (&from)[N], U (*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element* array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element* array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); + } + + private: + static_assert(!std::is_const::value, "Type must not be const"); + static_assert(!std::is_reference::value, + "Type must not be a reference"); + + // Initializes this object with a copy of the input. + void InitCopy(const Element* array, size_t a_size) { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. + void InitRef(const Element* array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element* array_; + size_t size_; + void (NativeArray::*clone_)(const Element*, size_t); +}; + +// Backport of std::index_sequence. +template +struct IndexSequence { + using type = IndexSequence; +}; + +// Double the IndexSequence, and one if plus_one is true. +template +struct DoubleSequence; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; + +// Backport of std::make_index_sequence. +// It uses O(ln(N)) instantiation depth. +template +struct MakeIndexSequenceImpl + : DoubleSequence::type, + N / 2>::type {}; + +template <> +struct MakeIndexSequenceImpl<0> : IndexSequence<> {}; + +template +using MakeIndexSequence = typename MakeIndexSequenceImpl::type; + +template +using IndexSequenceFor = typename MakeIndexSequence::type; + +template +struct Ignore { + Ignore(...); // NOLINT +}; + +template +struct ElemFromListImpl; +template +struct ElemFromListImpl> { + // We make Ignore a template to solve a problem with MSVC. + // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but + // MSVC doesn't understand how to deal with that pack expansion. + // Use `0 * I` to have a single instantiation of Ignore. + template + static R Apply(Ignore<0 * I>..., R (*)(), ...); +}; + +template +struct ElemFromList { + using type = + decltype(ElemFromListImpl::type>::Apply( + static_cast(nullptr)...)); +}; + +struct FlatTupleConstructTag {}; + +template +class FlatTuple; + +template +struct FlatTupleElemBase; + +template +struct FlatTupleElemBase, I> { + using value_type = typename ElemFromList::type; + FlatTupleElemBase() = default; + template + explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t) + : value(std::forward(t)) {} + value_type value; +}; + +template +struct FlatTupleBase; + +template +struct FlatTupleBase, IndexSequence> + : FlatTupleElemBase, Idx>... { + using Indices = IndexSequence; + FlatTupleBase() = default; + template + explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args) + : FlatTupleElemBase, Idx>(FlatTupleConstructTag{}, + std::forward(args))... {} + + template + const typename ElemFromList::type& Get() const { + return FlatTupleElemBase, I>::value; + } + + template + typename ElemFromList::type& Get() { + return FlatTupleElemBase, I>::value; + } + + template + auto Apply(F&& f) -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } + + template + auto Apply(F&& f) const -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } +}; + +// Analog to std::tuple but with different tradeoffs. +// This class minimizes the template instantiation depth, thus allowing more +// elements than std::tuple would. std::tuple has been seen to require an +// instantiation depth of more than 10x the number of elements in some +// implementations. +// FlatTuple and ElemFromList are not recursive and have a fixed depth +// regardless of T... +// MakeIndexSequence, on the other hand, it is recursive but with an +// instantiation depth of O(ln(N)). +template +class FlatTuple + : private FlatTupleBase, + typename MakeIndexSequence::type> { + using Indices = typename FlatTupleBase< + FlatTuple, typename MakeIndexSequence::type>::Indices; + + public: + FlatTuple() = default; + template + explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args) + : FlatTuple::FlatTupleBase(tag, std::forward(args)...) {} + + using FlatTuple::FlatTupleBase::Apply; + using FlatTuple::FlatTupleBase::Get; +}; + +// Utility functions to be called with static_assert to induce deprecation +// warnings. +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TEST_SUITE_P") +constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE_P is deprecated, please use " + "TYPED_TEST_SUITE_P") +constexpr bool TypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE is deprecated, please use " + "TYPED_TEST_SUITE") +constexpr bool TypedTestCaseIsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "REGISTER_TYPED_TEST_CASE_P is deprecated, please use " + "REGISTER_TYPED_TEST_SUITE_P") +constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TYPED_TEST_SUITE_P") +constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } + +} // namespace internal +} // namespace testing + +namespace std { +// Some standard library implementations use `struct tuple_size` and some use +// `class tuple_size`. Clang warns about the mismatch. +// https://reviews.llvm.org/D55466 +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +struct tuple_size> + : std::integral_constant {}; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace std + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +#define GTEST_SKIP_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip) + +// Suppress MSVC warning 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +// NOTE: The "else" is important to keep this expansion to prevent a top-level +// "else" from attaching to our "if". +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } else /* NOLINT */ \ + static_assert(true, "") // User must have a semicolon after expansion. + +#if GTEST_HAS_EXCEPTIONS + +namespace testing { +namespace internal { + +class NeverThrown { + public: + const char* what() const noexcept { + return "this exception should never be thrown"; + } +}; + +} // namespace internal +} // namespace testing + +#if GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e)) + +#else // GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) \ + std::string { "an std::exception-derived error" } + +#endif // GTEST_HAS_RTTI + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (typename std::conditional< \ + std::is_same::type>::type, \ + std::exception>::value, \ + const ::testing::internal::NeverThrown&, const std::exception&>::type \ + e) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (...) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else /*NOLINT*/ \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value.c_str()) + +#if GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const& e) { \ + gtest_msg.value = "it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// representation of expression as it was passed into the EXPECT_TRUE. +#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + test_suite_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &&) noexcept = delete; /* NOLINT */ \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h new file mode 100644 index 0000000000..e7af2f904a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,956 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type and function utilities for implementing parameterized tests. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType& a_param, size_t an_index) + : param(a_param), index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo& info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// Utility Functions + +// Outputs a message explaining invalid registration of different +// fixture class for the same test suite. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location); + +template +class ParamGeneratorInterface; +template +class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + std::unique_ptr> impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + std::shared_ptr> impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. +template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), + end_(end), + step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} + ~RangeGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, begin_, 0, step_); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + ~Iterator() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + void Advance() override { + value_ = static_cast(value_ + step_); + index_++; + } + ParamIteratorInterface* Clone() const override { + return new Iterator(*this); + } + const T* Current() const override { return &value_; } + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), + value_(other.value_), + index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. +template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + ~ValuesInIteratorRangeGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, container_.begin()); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + ~Iterator() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + void Advance() override { + ++iterator_; + value_.reset(); + } + ParamIteratorInterface* Clone() const override { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + const T* Current() const override { + if (value_.get() == nullptr) value_.reset(new T(*iterator_)); + return value_.get(); + } + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of std::unique_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable std::unique_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Default parameterized test name generator, returns a string containing the +// integer test parameter index. +template +std::string DefaultParamName(const TestParamInfo& info) { + Message name_stream; + name_stream << info.index; + return name_stream.GetString(); +} + +template +void TestNotEmpty() { + static_assert(sizeof(T) == 0, "Empty arguments are not allowed."); +} +template +void TestNotEmpty(const T&) {} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) + : parameter_(parameter) {} + Test* CreateTest() override { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + ParameterizedTestFactory(const ParameterizedTestFactory&) = delete; + ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestSuiteInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + using ParamType = typename TestSuite::ParamType; + + TestMetaFactory() {} + + TestFactoryBase* CreateTestFactory(ParamType parameter) override { + return new ParameterizedTestFactory(parameter); + } + + private: + TestMetaFactory(const TestMetaFactory&) = delete; + TestMetaFactory& operator=(const TestMetaFactory&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteInfoBase is a generic interface +// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds +// a collection of pointers to the ParameterizedTestSuiteInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestSuiteInfoBase { + public: + virtual ~ParameterizedTestSuiteInfoBase() {} + + // Base part of test suite name for display purposes. + virtual const std::string& GetTestSuiteName() const = 0; + // Test suite id to verify identity. + virtual TypeId GetTestSuiteTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test suite right before running them in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestSuiteInfoBase() {} + + private: + ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) = + delete; + ParameterizedTestSuiteInfoBase& operator=( + const ParameterizedTestSuiteInfoBase&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Report a the name of a test_suit as safe to ignore +// as the side effect of construction of this type. +struct GTEST_API_ MarkAsIgnored { + explicit MarkAsIgnored(const char* test_suite); +}; + +GTEST_API_ void InsertSyntheticTestCase(const std::string& name, + CodeLocation location, bool has_test_p); + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test suite and generators +// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that +// test suite. It registers tests with all values generated by all +// generators when asked. +template +class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestSuiteInstantiation(). + using ParamType = typename TestSuite::ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator(GeneratorCreationFunc)(); + using ParamNameGeneratorFunc = std::string(const TestParamInfo&); + + explicit ParameterizedTestSuiteInfo(const char* name, + CodeLocation code_location) + : test_suite_name_(name), code_location_(code_location) {} + + // Test suite base name for display purposes. + const std::string& GetTestSuiteName() const override { + return test_suite_name_; + } + // Test suite id to verify identity. + TypeId GetTestSuiteTypeId() const override { return GetTypeId(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_suite_name is the base name of the test suite (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test suite base name and DoBar is test base name. + void AddTestPattern(const char* test_suite_name, const char* test_base_name, + TestMetaFactoryBase* meta_factory, + CodeLocation code_location) { + tests_.push_back(std::shared_ptr(new TestInfo( + test_suite_name, test_base_name, meta_factory, code_location))); + } + // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information + // about a generator. + int AddTestSuiteInstantiation(const std::string& instantiation_name, + GeneratorCreationFunc* func, + ParamNameGeneratorFunc* name_func, + const char* file, int line) { + instantiations_.push_back( + InstantiationInfo(instantiation_name, func, name_func, file, line)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test suite + // right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more than once. + void RegisterTests() override { + bool generated_instantiations = false; + + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + std::shared_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); + gen_it != instantiations_.end(); ++gen_it) { + const std::string& instantiation_name = gen_it->name; + ParamGenerator generator((*gen_it->generator)()); + ParamNameGeneratorFunc* name_func = gen_it->name_func; + const char* file = gen_it->file; + int line = gen_it->line; + + std::string test_suite_name; + if (!instantiation_name.empty()) + test_suite_name = instantiation_name + "/"; + test_suite_name += test_info->test_suite_base_name; + + size_t i = 0; + std::set test_param_names; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + generated_instantiations = true; + + Message test_name_stream; + + std::string param_name = + name_func(TestParamInfo(*param_it, i)); + + GTEST_CHECK_(IsValidParamName(param_name)) + << "Parameterized test name '" << param_name + << "' is invalid, in " << file << " line " << line << std::endl; + + GTEST_CHECK_(test_param_names.count(param_name) == 0) + << "Duplicate parameterized test name '" << param_name << "', in " + << file << " line " << line << std::endl; + + test_param_names.insert(param_name); + + if (!test_info->test_base_name.empty()) { + test_name_stream << test_info->test_base_name << "/"; + } + test_name_stream << param_name; + MakeAndRegisterTestInfo( + test_suite_name.c_str(), test_name_stream.GetString().c_str(), + nullptr, // No type parameter. + PrintToString(*param_it).c_str(), test_info->code_location, + GetTestSuiteTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + SuiteApiResolver::GetTearDownCaseOrSuite(file, line), + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + + if (!generated_instantiations) { + // There are no generaotrs, or they all generate nothing ... + InsertSyntheticTestCase(GetTestSuiteName(), code_location_, + !tests_.empty()); + } + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory, + CodeLocation a_code_location) + : test_suite_base_name(a_test_suite_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory), + code_location(a_code_location) {} + + const std::string test_suite_base_name; + const std::string test_base_name; + const std::unique_ptr> test_meta_factory; + const CodeLocation code_location; + }; + using TestInfoContainer = ::std::vector>; + // Records data received from INSTANTIATE_TEST_SUITE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string& name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string& name) { + // Check for empty string + if (name.empty()) return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!IsAlNum(name[index]) && name[index] != '_') return false; + } + + return true; + } + + const std::string test_suite_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete; + ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) = + delete; +}; // class ParameterizedTestSuiteInfo + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +template +using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteRegistry contains a map of +// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P +// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding +// ParameterizedTestSuiteInfo descriptors. +class ParameterizedTestSuiteRegistry { + public: + ParameterizedTestSuiteRegistry() {} + ~ParameterizedTestSuiteRegistry() { + for (auto& test_suite_info : test_suite_infos_) { + delete test_suite_info; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test suite. + template + ParameterizedTestSuiteInfo* GetTestSuitePatternHolder( + const char* test_suite_name, CodeLocation code_location) { + ParameterizedTestSuiteInfo* typed_test_info = nullptr; + for (auto& test_suite_info : test_suite_infos_) { + if (test_suite_info->GetTestSuiteName() == test_suite_name) { + if (test_suite_info->GetTestSuiteTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test suite setup and tear-down in this case. + ReportInvalidTestSuiteType(test_suite_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestSuiteInfo>(test_suite_info); + } + break; + } + } + if (typed_test_info == nullptr) { + typed_test_info = new ParameterizedTestSuiteInfo( + test_suite_name, code_location); + test_suite_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (auto& test_suite_info : test_suite_infos_) { + test_suite_info->RegisterTests(); + } + } +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, CodeLocation code_location) { + return GetTestSuitePatternHolder(test_case_name, code_location); + } + +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + private: + using TestSuiteInfoContainer = ::std::vector; + + TestSuiteInfoContainer test_suite_infos_; + + ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) = + delete; + ParameterizedTestSuiteRegistry& operator=( + const ParameterizedTestSuiteRegistry&) = delete; +}; + +// Keep track of what type-parameterized test suite are defined and +// where as well as which are intatiated. This allows susequently +// identifying suits that are defined but never used. +class TypeParameterizedTestSuiteRegistry { + public: + // Add a suite definition + void RegisterTestSuite(const char* test_suite_name, + CodeLocation code_location); + + // Add an instantiation of a suit. + void RegisterInstantiation(const char* test_suite_name); + + // For each suit repored as defined but not reported as instantiation, + // emit a test that reports that fact (configurably, as an error). + void CheckForInstantiations(); + + private: + struct TypeParameterizedTestSuiteInfo { + explicit TypeParameterizedTestSuiteInfo(CodeLocation c) + : code_location(c), instantiated(false) {} + + CodeLocation code_location; + bool instantiated; + }; + + std::map suites_; +}; + +} // namespace internal + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { +// Used in the Values() function to provide polymorphic capabilities. + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +template +class ValueArray { + public: + explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {} + + template + operator ParamGenerator() const { // NOLINT + return ValuesIn(MakeVector(MakeIndexSequence())); + } + + private: + template + std::vector MakeVector(IndexSequence) const { + return std::vector{static_cast(v_.template Get())...}; + } + + FlatTuple v_; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +template +class CartesianProductGenerator + : public ParamGeneratorInterface<::std::tuple> { + public: + typedef ::std::tuple ParamType; + + CartesianProductGenerator(const std::tuple...>& g) + : generators_(g) {} + ~CartesianProductGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, generators_, false); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, generators_, true); + } + + private: + template + class IteratorImpl; + template + class IteratorImpl> + : public ParamIteratorInterface { + public: + IteratorImpl(const ParamGeneratorInterface* base, + const std::tuple...>& generators, + bool is_end) + : base_(base), + begin_(std::get(generators).begin()...), + end_(std::get(generators).end()...), + current_(is_end ? end_ : begin_) { + ComputeCurrentValue(); + } + ~IteratorImpl() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + void Advance() override { + assert(!AtEnd()); + // Advance the last iterator. + ++std::get(current_); + // if that reaches end, propagate that up. + AdvanceIfEnd(); + ComputeCurrentValue(); + } + ParamIteratorInterface* Clone() const override { + return new IteratorImpl(*this); + } + + const ParamType* Current() const override { return current_value_.get(); } + + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const IteratorImpl* typed_other = + CheckedDowncastToActualType(&other); + + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + if (AtEnd() && typed_other->AtEnd()) return true; + + bool same = true; + bool dummy[] = { + (same = same && std::get(current_) == + std::get(typed_other->current_))...}; + (void)dummy; + return same; + } + + private: + template + void AdvanceIfEnd() { + if (std::get(current_) != std::get(end_)) return; + + bool last = ThisI == 0; + if (last) { + // We are done. Nothing else to propagate. + return; + } + + constexpr size_t NextI = ThisI - (ThisI != 0); + std::get(current_) = std::get(begin_); + ++std::get(current_); + AdvanceIfEnd(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = std::make_shared(*std::get(current_)...); + } + bool AtEnd() const { + bool at_end = false; + bool dummy[] = { + (at_end = at_end || std::get(current_) == std::get(end_))...}; + (void)dummy; + return at_end; + } + + const ParamGeneratorInterface* const base_; + std::tuple::iterator...> begin_; + std::tuple::iterator...> end_; + std::tuple::iterator...> current_; + std::shared_ptr current_value_; + }; + + using Iterator = IteratorImpl::type>; + + std::tuple...> generators_; +}; + +template +class CartesianProductHolder { + public: + CartesianProductHolder(const Gen&... g) : generators_(g...) {} + template + operator ParamGenerator<::std::tuple>() const { + return ParamGenerator<::std::tuple>( + new CartesianProductGenerator(generators_)); + } + + private: + std::tuple generators_; +}; + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 0000000000..f025db76ad --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,116 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 +#elif defined _WIN32 +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE +#elif defined __OS2__ +#define GTEST_OS_OS2 1 +#elif defined __APPLE__ +#define GTEST_OS_MAC 1 +#include +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif +#elif defined __DragonFly__ +#define GTEST_OS_DRAGONFLY 1 +#elif defined __FreeBSD__ +#define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GNU__) +#define GTEST_OS_GNU_HURD 1 +#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) +#define GTEST_OS_GNU_KFREEBSD 1 +#elif defined __linux__ +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif +#elif defined __MVS__ +#define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +#define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +#define GTEST_OS_AIX 1 +#elif defined(__hpux) +#define GTEST_OS_HPUX 1 +#elif defined __native_client__ +#define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +#define GTEST_OS_NETBSD 1 +#elif defined __OpenBSD__ +#define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +#define GTEST_OS_QNX 1 +#elif defined(__HAIKU__) +#define GTEST_OS_HAIKU 1 +#elif defined ESP8266 +#define GTEST_OS_ESP8266 1 +#elif defined ESP32 +#define GTEST_OS_ESP32 1 +#elif defined(__XTENSA__) +#define GTEST_OS_XTENSA 1 +#endif // __CYGWIN__ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h new file mode 100644 index 0000000000..0003d27658 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -0,0 +1,2413 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_DRAGONFLY - DragonFlyBSD +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_HURD - GNU/Hurd +// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD +// GTEST_OS_HAIKU - Haiku +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_OS2 - OS/2 +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. +// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GTEST_USES_RE2 - the RE2 regular expression library is used +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above RE\b(s) are mutually exclusive. + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. +// GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter or +// UniversalPrinter specializations. +// GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter +// or +// UniversalPrinter +// specializations. +// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or +// Matcher +// specializations. +// GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter or +// UniversalPrinter +// specializations. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Regular expressions: +// RE - a simple regular expression class using +// 1) the RE2 syntax on all platforms when built with RE2 +// and Abseil as dependencies +// 2) the POSIX Extended Regular Expression syntax on +// UNIX-like platforms, +// 3) A reduced regular exception syntax on other platforms, +// including Windows. +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// TimeInMillis - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an int32_t environment variable. +// StringFromGTestEnv() - parses a string environment variable. +// +// Deprecation warnings: +// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as +// deprecated; calling a marked function +// should generate a compiler warning + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include + +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include +#include +#include +#include +#include +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include +#include +#include + +#ifndef _WIN32_WCE +#include +#include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +#include +#include +#endif + +#include "gtest/internal/custom/gtest-port.h" +#include "gtest/internal/gtest-port-arch.h" + +#if GTEST_HAS_ABSL +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "absl/flags/reflection.h" +#endif + +#if !defined(GTEST_DEV_EMAIL_) +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. +// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if defined(_MSC_VER) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) +#else +// Not all compilers are MSVC +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") +#else +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif +// In order to avoid having to include , use forward declaration +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif +#elif GTEST_OS_XTENSA +#include +// Xtensa toolchains define strcasecmp in the string.h header instead of +// strings.h. string.h is already included. +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +#include // NOLINT +#endif + +// Defines this to true if and only if Google Test can use POSIX regular +// expressions. +#ifndef GTEST_HAS_POSIX_RE +#if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) +#endif +#endif + +// Select the regular expression implementation. +#if GTEST_HAS_ABSL +// When using Abseil, RE2 is required. +#include "absl/strings/string_view.h" +#include "re2/re2.h" +#define GTEST_USES_RE2 1 +#elif GTEST_HAS_POSIX_RE +#include // NOLINT +#define GTEST_USES_POSIX_RE 1 +#else +// Use our own simple regex implementation. +#define GTEST_USES_SIMPLE_RE 1 +#endif + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +#if defined(_MSC_VER) && defined(_CPPUNWIND) +// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__BORLANDC__) +// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. +#ifndef _HAS_EXCEPTIONS +#define _HAS_EXCEPTIONS 1 +#endif // _HAS_EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +#elif defined(__clang__) +// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang +// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, +// there can be cleanups for ObjC exceptions which also need cleanups, even if +// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which +// checks for C++ exceptions starting at clang r206352, but which checked for +// cleanups prior to that. To reliably check for C++ exception availability with +// clang, check for +// __EXCEPTIONS && __has_feature(cxx_exceptions). +#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +#elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +#define GTEST_HAS_EXCEPTIONS 1 +#else +// For other compilers, we assume exceptions are disabled to be +// conservative. +#define GTEST_HAS_EXCEPTIONS 0 +#endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +#define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA)) + +#endif // GTEST_HAS_STD_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +#ifdef _MSC_VER + +#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is +// enabled. +#elif defined(__GNUC__) + +#ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +#elif defined(__clang__) + +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +#else + +// For all other compilers, we assume RTTI is enabled. +#define GTEST_HAS_RTTI 1 + +#endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +#include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_HAIKU || GTEST_OS_GNU_HURD) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +#include // NOLINT + +// For timespec and nanosleep, used below. +#include // NOLINT +#endif + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ + GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \ + GTEST_OS_GNU_HURD) +#define GTEST_HAS_DEATH_TEST 1 +#endif + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_GNU_HURD +#define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#elif defined(__clang__) +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +#if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__(( \ + __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) +#else +#define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +#define GTEST_HAS_SEH 1 +#else +// Assume no SEH. +#define GTEST_HAS_SEH 0 +#endif + +#endif // GTEST_HAS_SEH + +#ifndef GTEST_IS_THREADSAFE + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \ + (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \ + GTEST_HAS_PTHREAD) + +#endif // GTEST_IS_THREADSAFE + +#if GTEST_IS_THREADSAFE +// Some platforms don't support including these threading related headers. +#include // NOLINT +#include // NOLINT +#endif // GTEST_IS_THREADSAFE + +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + +#ifdef _MSC_VER +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif +#elif __GNUC__ >= 4 || defined(__clang__) +#define GTEST_API_ __attribute__((visibility("default"))) +#endif // _MSC_VER + +#endif // GTEST_API_ + +#ifndef GTEST_API_ +#define GTEST_API_ +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +#define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +#define GTEST_NO_INLINE_ +#endif + +#if defined(__clang__) +// Nested ifs to avoid triggering MSVC warning. +#if __has_attribute(disable_tail_calls) +// Ask the compiler not to perform tail call optimization inside +// the marked function. +#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls)) +#endif +#elif __GNUC__ +#define GTEST_NO_TAIL_CALL_ \ + __attribute__((optimize("no-optimize-sibling-calls"))) +#else +#define GTEST_NO_TAIL_CALL_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if !defined(GTEST_HAS_CXXABI_H_) +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable HWAddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +namespace testing { + +class Message; + +// Legacy imports for backwards compatibility. +// New code should use std:: names directly. +using std::get; +using std::make_tuple; +using std::tuple; +using std::tuple_element; +using std::tuple_size; + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines RE. + +#if GTEST_USES_RE2 + +// This is almost `using RE = ::RE2`, except it is copy-constructible, and it +// needs to disambiguate the `std::string`, `absl::string_view`, and `const +// char*` constructors. +class GTEST_API_ RE { + public: + RE(absl::string_view regex) : regex_(regex) {} // NOLINT + RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const RE& other) : RE(other.pattern()) {} + + const std::string& pattern() const { return regex_.pattern(); } + + static bool FullMatch(absl::string_view str, const RE& re) { + return RE2::FullMatch(str, re.regex_); + } + static bool PartialMatch(absl::string_view str, const RE& re) { + return RE2::PartialMatch(str, re.regex_); + } + + private: + RE2 regex_; +}; + +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. + RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true if and only if regular expression re + // matches the entire str. + // PartialMatch(str, re) returns true if and only if regular expression re + // matches a substring of str (including str itself). + static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif +}; + +#endif // ::testing::internal::RE implementation + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. + +enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL }; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTestLog(const GTestLog&) = delete; + GTestLog& operator=(const GTestLog&) = delete; +}; + +#if !defined(GTEST_LOG_) + +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__) \ + .GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(nullptr); } + +#endif // !defined(GTEST_LOG_) + +#if !defined(GTEST_CHECK_) +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsis: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. It aborts the program irrespective of +// whether it is built in the debug mode or not. +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#endif // !defined(GTEST_CHECK_) + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. +#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error + +// Transforms "T" into "const T&" according to standard reference collapsing +// rules (this is only needed as a backport for C++98 compilers that do not +// support reference collapsing). Specifically, it transforms: +// +// char ==> const char& +// const char ==> const char& +// char& ==> char& +// const char& ==> const char& +// +// Note that the non-const reference will not have "const" added. This is +// standard, and necessary so that "T" can always bind to "const T&". +template +struct ConstRef { + typedef const T& type; +}; +template +struct ConstRef { + typedef T& type; +}; + +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + typename ::testing::internal::ConstRef::type + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertible to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template +inline To ImplicitCast_(To x) { + return x; +} + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (false) { + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = nullptr; + ::testing::internal::ImplicitCast_(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == nullptr || dynamic_cast(f) != nullptr); +#endif + return static_cast(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. +template +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); +#endif + +#if GTEST_HAS_DOWNCAST_ + return ::down_cast(base); +#elif GTEST_HAS_RTTI + return dynamic_cast(base); // NOLINT +#else + return static_cast(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. +// +GTEST_API_ void CaptureStdout(); +GTEST_API_ std::string GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ std::string GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION +// Returns the size (in bytes) of a file. +GTEST_API_ size_t GetFileSize(FILE* file); + +// Reads the entire content of a file as a string. +GTEST_API_ std::string ReadEntireFile(FILE* file); + +// All command line arguments. +GTEST_API_ std::vector GetArgvs(); + +#if GTEST_HAS_DEATH_TEST + +std::vector GetInjectableArgvs(); +// Deprecated: pass the args vector by value instead. +void SetInjectableArgvs(const std::vector* new_argvs); +void SetInjectableArgvs(const std::vector& new_argvs); +void ClearInjectableArgvs(); + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. +#if GTEST_IS_THREADSAFE + +#if GTEST_OS_WINDOWS +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void* Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true if and only if the handle is a valid handle object that can be + // closed. + bool IsCloseable() const; + + Handle handle_; + + AutoHandle(const AutoHandle&) = delete; + AutoHandle& operator=(const AutoHandle&) = delete; +}; +#endif + +#if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +#else +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +// TODO(b/203539622): Replace unconditionally with absl::Notification. +class GTEST_API_ Notification { + public: + Notification() : notified_(false) {} + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + std::lock_guard lock(mu_); + notified_ = true; + cv_.notify_all(); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + std::unique_lock lock(mu_); + cv_.wait(lock, [this]() { return notified_; }); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 +#endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast(thread)->Run(); + return nullptr; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() override { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr)); + finished_ = true; + } + } + + void Run() override { + if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc* const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true if and only if we know that the thread function has + // finished. + pthread_t thread_; // The native thread object. + + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; +}; +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + GTEST_CRITICAL_SECTION* critical_section_; + + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex* const mutex_; + + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + ThreadLocalBase(const ThreadLocalBase&) = delete; + ThreadLocalBase& operator=(const ThreadLocalBase&) = delete; +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {} + virtual ~RunnableImpl() {} + virtual void Run() { func_(param_); } + + private: + UserThreadFunc* const func_; + const T param_; + + RunnableImpl(const RunnableImpl&) = delete; + RunnableImpl& operator=(const RunnableImpl&) = delete; + }; + + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; + }; + + T* GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); + } + + ThreadLocalValueHolderBase* NewValueForCurrentThread() const override { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } + + private: + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + ValueHolder* MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; + }; + + std::unique_ptr default_factory_; + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; +}; + +#elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0} + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + has_owner_ = false; + } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } + + private: + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != nullptr) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } + + private: + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + ValueHolder* MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + std::unique_ptr default_factory_; + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; +}; + +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 +#else +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. + +inline bool IsAlpha(char ch) { + return isalpha(static_cast(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast(ch)) != 0; +} +#ifdef __cpp_char8_t +inline bool IsXDigit(char8_t ch) { + return isxdigit(static_cast(ch)) != 0; +} +#endif +inline bool IsXDigit(char16_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(char32_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(wchar_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} + +inline char ToLower(char ch) { + return static_cast(tolower(static_cast(ch))); +} +inline char ToUpper(char ch) { + return static_cast(toupper(static_cast(ch))); +} + +inline std::string StripTrailingSpaces(std::string str) { + std::string::iterator it = str.end(); + while (it != str.begin() && IsSpace(*--it)) it = str.erase(it); + return str; +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. + +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +#ifdef __BORLANDC__ +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +#else // !__BORLANDC__ +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) +inline int DoIsATTY(int /* fd */) { return 0; } +#else +inline int DoIsATTY(int fd) { return _isatty(fd); } +#endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +#endif // __BORLANDC__ + +#if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +#else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE + +#elif GTEST_OS_ESP8266 +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { + // stat function not implemented on ESP8266 + return 0; +} +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +inline int IsATTY(int fd) { + // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout + // to a file on Linux), which is unexpected, so save the previous value, and + // restore it after the call. + int savedErrno = errno; + int isAttyValue = DoIsATTY(fd); + errno = savedErrno; + + return isAttyValue; +} + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + struct wchar_codecvt : public std::codecvt {}; + std::wstring_convert converter; + std::wstring wide_path = converter.from_bytes(path); + std::wstring wide_mode = converter.from_bytes(mode); + return _wfopen(wide_path.c_str(), wide_mode.c_str()); +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + return fopen(path, mode); +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE* FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA + // We are on an embedded platform, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return nullptr; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != nullptr && env[0] != '\0') ? env : nullptr; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. +[[noreturn]] void Abort(); +#else +[[noreturn]] inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s +#define GTEST_SNPRINTF_ _snprintf +#else +#define GTEST_SNPRINTF_ snprintf +#endif + +// The biggest signed integer type the compiler supports. +// +// long long is guaranteed to be at least 64-bits in C++11. +using BiggestInt = long long; // NOLINT + +// The maximum number a BiggestInt can represent. +constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits::max)(); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + using UInt = void; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + using Int = std::int32_t; + using UInt = std::uint32_t; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: + using Int = std::int64_t; + using UInt = std::uint64_t; +}; + +// Integer types of known sizes. +using TimeInMillis = int64_t; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +#define GTEST_FLAG_NAME_(name) gtest_##name +#define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +// Pick a command line flags implementation. +#if GTEST_HAS_ABSL + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc) + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) \ + ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_int32_(name) \ + ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_string_(name) \ + ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name)) + +#define GTEST_FLAG_SAVER_ ::absl::FlagSaver + +#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name)) +#define GTEST_FLAG_SET(name, value) \ + (void)(::absl::SetFlag(>EST_FLAG(name), value)) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0 + +#else // GTEST_HAS_ABSL + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_string_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) \ + namespace testing { \ + GTEST_API_ extern bool GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_int32_(name) \ + namespace testing { \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_string_(name) \ + namespace testing { \ + GTEST_API_ extern ::std::string GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name) +#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 + +#endif // GTEST_HAS_ABSL + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +GTEST_API_ bool ParseInt32(const Message& src_text, const char* str, + int32_t* value); + +// Parses a bool/int32_t/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char* StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#if !defined(GTEST_INTERNAL_DEPRECATED) + +// Internal Macro to mark an API deprecated, for googletest usage only +// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or +// GTEST_INTERNAL_DEPRECATED(message) myFunction(); Every usage of +// a deprecated entity will trigger a warning when compiled with +// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler). +// For msvc /W3 option will need to be used +// Note that for 'other' compilers this macro evaluates to nothing to prevent +// compilations errors. +#if defined(_MSC_VER) +#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__GNUC__) +#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message))) +#else +#define GTEST_INTERNAL_DEPRECATED(message) +#endif + +#endif // !defined(GTEST_INTERNAL_DEPRECATED) + +#if GTEST_HAS_ABSL +// Always use absl::any for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_ANY 1 +#include "absl/types/any.h" +namespace testing { +namespace internal { +using Any = ::absl::any; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::any for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_ANY 1 +#include +namespace testing { +namespace internal { +using Any = ::std::any; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::any is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::optional for UniversalPrinter<> specializations if +// googletest is built with absl support. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include "absl/types/optional.h" +namespace testing { +namespace internal { +template +using Optional = ::absl::optional; +inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; } +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::optional for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include +namespace testing { +namespace internal { +template +using Optional = ::std::optional; +inline ::std::nullopt_t Nullopt() { return ::std::nullopt; } +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::optional is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::string_view for Matcher<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include "absl/strings/string_view.h" +namespace testing { +namespace internal { +using StringView = ::absl::string_view; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::string_view for Matcher<> +// specializations. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include +namespace testing { +namespace internal { +using StringView = ::std::string_view; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::string_view is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::variant for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include "absl/types/variant.h" +namespace testing { +namespace internal { +template +using Variant = ::absl::variant; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::variant for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include +namespace testing { +namespace internal { +template +using Variant = ::std::variant; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::variant is not supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h new file mode 100644 index 0000000000..cca2e1f2ad --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -0,0 +1,177 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by gtest-internal.h. +// It should not be #included by other files. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +#include +#endif + +#include + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true if and only if they have the same + // content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true if and only if they have the + // same content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true if and only if the given string ends with the given suffix, + // ignoring case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value to given width with leading zeros. + static std::string FormatIntWidthN(int value, int width); + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats an int value as "%X". + static std::string FormatHexUInt32(uint32_t value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h new file mode 100644 index 0000000000..6bc02a7de3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,186 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type utilities needed for implementing typed and type-parameterized +// tests. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +#if GTEST_HAS_CXXABI_H_ +#include +#elif defined(__HP_aCC) +#include +#endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + +#if GTEST_HAS_RTTI +// GetTypeName(const std::type_info&) returns a human-readable name of type T. +inline std::string GetTypeName(const std::type_info& type) { + const char* const name = type.name(); +#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +#if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +#endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return CanonicalizeForStdLibVersioning(name_str); +#else + return name; +#endif // GTEST_HAS_CXXABI_H_ || __HP_aCC +} +#endif // GTEST_HAS_RTTI + +// GetTypeName() returns a human-readable name of type T if and only if +// RTTI is enabled, otherwise it returns a dummy type name. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +std::string GetTypeName() { +#if GTEST_HAS_RTTI + return GetTypeName(typeid(T)); +#else + return ""; +#endif // GTEST_HAS_RTTI +} + +// A unique type indicating an empty node +struct None {}; + +#define GTEST_TEMPLATE_ \ + template \ + class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind::type + +template +struct Templates { + using Head = TemplateSel; + using Tail = Templates; +}; + +template +struct Templates { + using Head = TemplateSel; + using Tail = None; +}; + +// Tuple-like type lists +template +struct Types { + using Head = Head_; + using Tail = Types; +}; + +template +struct Types { + using Head = Head_; + using Tail = None; +}; + +// Helper metafunctions to tell apart a single type from types +// generated by ::testing::Types +template +struct ProxyTypeList { + using type = Types; +}; + +template +struct is_proxy_type_list : std::false_type {}; + +template +struct is_proxy_type_list> : std::true_type {}; + +// Generator which conditionally creates type lists. +// It recognizes if a requested type list should be created +// and prevents creating a new type list nested within another one. +template +struct GenerateTypeList { + private: + using proxy = typename std::conditional::value, T, + ProxyTypeList>::type; + + public: + using type = typename proxy::type; +}; + +} // namespace internal + +template +using Types = internal::ProxyTypeList; + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc new file mode 100644 index 0000000000..2a70ed88c7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -0,0 +1,49 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework (Google Test) +// +// Sometimes it's desirable to build Google Test by compiling a single file. +// This file serves this purpose. + +// This line ensures that gtest.h can be compiled on its own, even +// when it's fused. +#include "gtest/gtest.h" + +// The following lines pull in the real gtest *.cc files. +#include "src/gtest-assertion-result.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-matchers.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc new file mode 100644 index 0000000000..f1c0b10dc9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc @@ -0,0 +1,77 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file defines the AssertionResult type. + +#include "gtest/gtest-assertion-result.h" + +#include +#include + +#include "gtest/gtest-message.h" + +namespace testing { + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc new file mode 100644 index 0000000000..e6abc6278a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -0,0 +1,1620 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// This file implements death tests. + +#include "gtest/gtest-death-test.h" + +#include +#include + +#include "gtest/internal/custom/gtest.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_DEATH_TEST + +#if GTEST_OS_MAC +#include +#endif // GTEST_OS_MAC + +#include +#include +#include + +#if GTEST_OS_LINUX +#include +#endif // GTEST_OS_LINUX + +#include + +#if GTEST_OS_WINDOWS +#include +#else +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_QNX +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_FUCHSIA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // GTEST_OS_FUCHSIA + +#endif // GTEST_HAS_DEATH_TEST + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-string.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +// Constants. + +// The default death test style. +// +// This is defined in internal/gtest-port.h as "fast", but can be overridden by +// a definition in internal/custom/gtest-port.h. The recommended value, which is +// used internally at Google, is "threadsafe". +static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; + +} // namespace testing + +GTEST_DEFINE_string_( + death_test_style, + testing::internal::StringFromGTestEnv("death_test_style", + testing::kDefaultDeathTestStyle), + "Indicates how to run a death test in a forked child process: " + "\"threadsafe\" (child process re-executes the test binary " + "from the beginning, running only the specific death test) or " + "\"fast\" (child process runs the death test immediately " + "after forking)."); + +GTEST_DEFINE_bool_( + death_test_use_fork, + testing::internal::BoolFromGTestEnv("death_test_use_fork", false), + "Instructs to use fork()/_exit() instead of clone() in death tests. " + "Ignored and always uses fork() on POSIX systems where clone() is not " + "implemented. Useful when running under valgrind or similar tools if " + "those do not support clone(). Valgrind 3.3.1 will just fail if " + "it sees an unsupported combination of clone() flags. " + "It is not recommended to use this flag w/o valgrind though it will " + "work in 99% of the cases. Once valgrind is fixed, this flag will " + "most likely be removed."); + +GTEST_DEFINE_string_( + internal_run_death_test, "", + "Indicates the file, line number, temporal index of " + "the single death test to run, and a file descriptor to " + "which a success code may be sent, all separated by " + "the '|' characters. This flag is specified if and only if the " + "current process is a sub-process launched for running a thread-safe " + "death test. FOR INTERNAL USE ONLY."); + +namespace testing { + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Valid only for fast death tests. Indicates the code is running in the +// child process of a fast style death test. +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +static bool g_in_fast_death_test_child = false; +#endif + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +bool InDeathTestChild() { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + // On Windows and Fuchsia, death tests are thread-safe regardless of the value + // of the death_test_style flag. + return !GTEST_FLAG_GET(internal_run_death_test).empty(); + +#else + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") + return !GTEST_FLAG_GET(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return exit_status == exit_code_; + +#else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + m << "Exited with exit status " << exit_code; + +#else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) { + msg << "couldn't detect the number of threads."; + } else { + msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; + return msg.GetString(); +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. +static const int kFuchsiaReadPipeFd = 3; + +#endif + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +static void DeathTestAbort(const std::string& message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag* const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != nullptr) { + FILE* parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. +static void FailFromInternalError(int fd) { + Message error; + char buffer[256]; + int num_read; + + do { + while ((num_read = posix::Read(fd, buffer, 255)) > 0) { + buffer[num_read] = '\0'; + error << buffer; + } + } while (num_read == -1 && errno == EINTR); + + if (num_read == 0) { + GTEST_LOG_(FATAL) << error.GetString(); + } else { + const int last_error = errno; + GTEST_LOG_(FATAL) << "Error while reading death test internal: " + << GetLastErrnoDescription() << " [" << last_error << "]"; + } +} + +// Death test constructor. Increments the running death test count +// for the current test. +DeathTest::DeathTest() { + TestInfo* const info = GetUnitTestImpl()->current_test_info(); + if (info == nullptr) { + DeathTestAbort( + "Cannot run a death test outside of a TEST or " + "TEST_F construct"); + } +} + +// Creates and returns a death test by dispatching to the current +// death test factory. +bool DeathTest::Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) { + return GetUnitTestImpl()->death_test_factory()->Create( + statement, std::move(matcher), file, line, test); +} + +const char* DeathTest::LastMessage() { + return last_death_test_message_.c_str(); +} + +void DeathTest::set_last_death_test_message(const std::string& message) { + last_death_test_message_ = message; +} + +std::string DeathTest::last_death_test_message_; + +// Provides cross platform implementation for some death functionality. +class DeathTestImpl : public DeathTest { + protected: + DeathTestImpl(const char* a_statement, Matcher matcher) + : statement_(a_statement), + matcher_(std::move(matcher)), + spawned_(false), + status_(-1), + outcome_(IN_PROGRESS), + read_fd_(-1), + write_fd_(-1) {} + + // read_fd_ is expected to be closed and cleared by a derived class. + ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + + void Abort(AbortReason reason) override; + bool Passed(bool status_ok) override; + + const char* statement() const { return statement_; } + bool spawned() const { return spawned_; } + void set_spawned(bool is_spawned) { spawned_ = is_spawned; } + int status() const { return status_; } + void set_status(int a_status) { status_ = a_status; } + DeathTestOutcome outcome() const { return outcome_; } + void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } + int read_fd() const { return read_fd_; } + void set_read_fd(int fd) { read_fd_ = fd; } + int write_fd() const { return write_fd_; } + void set_write_fd(int fd) { write_fd_ = fd; } + + // Called in the parent process only. Reads the result code of the death + // test child process via a pipe, interprets it to set the outcome_ + // member, and closes read_fd_. Outputs diagnostics and terminates in + // case of unexpected codes. + void ReadAndInterpretStatusByte(); + + // Returns stderr output from the child process. + virtual std::string GetErrorLogs(); + + private: + // The textual content of the code this object is testing. This class + // doesn't own this string and should not attempt to delete it. + const char* const statement_; + // A matcher that's expected to match the stderr output by the child process. + Matcher matcher_; + // True if the death test child process has been successfully spawned. + bool spawned_; + // The exit status of the child process. + int status_; + // How the death test concluded. + DeathTestOutcome outcome_; + // Descriptor to the read end of the pipe to the child process. It is + // always -1 in the child process. The child keeps its write end of the + // pipe in write_fd_. + int read_fd_; + // Descriptor to the child's write end of the pipe to the parent process. + // It is always -1 in the parent process. The parent keeps its end of the + // pipe in read_fd_. + int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. +void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); } + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). +void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived + : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew + : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0;;) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. +// matcher_: A matcher that's expected to match the stderr output by the child +// process. +// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true if and only if all of the above conditions are met. Otherwise, +// the first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) return false; + + const std::string error_message = GetErrorLogs(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + if (matcher_.Matches(error_message)) { + success = true; + } else { + std::ostringstream stream; + matcher_.DescribeTo(&stream); + buffer << " Result: died but not with expected error.\n" + << " Expected: " << stream.str() << "\n" + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; + } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +#if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. +// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()}; + switch (::WaitForMultipleObjects(2, wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. + GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 == + ::WaitForSingleObject(child_handle_.Get(), INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), + nullptr, TRUE}; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle, + &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd( + ::_open_osfhandle(reinterpret_cast(read_handle), O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + nullptr)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) + + "|" + StreamableToString(death_test_index) + "|" + + StreamableToString(static_cast(::GetCurrentProcessId())) + + // size_t has the same width as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. + "|" + StreamableToString(reinterpret_cast(write_handle)) + "|" + + StreamableToString(reinterpret_cast(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, + executable_path, + _MAX_PATH)); + + std::string command_line = std::string(::GetCommandLineA()) + " " + + filter_flag + " \"" + internal_flag + "\""; + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_( + ::CreateProcessA( + executable_path, const_cast(command_line.c_str()), + nullptr, // Returned process handle is not inheritable. + nullptr, // Returned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + nullptr, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} + +#elif GTEST_OS_FUCHSIA + +class FuchsiaDeathTest : public DeathTestImpl { + public: + FuchsiaDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + int Wait() override; + TestRole AssumeRole() override; + std::string GetErrorLogs() override; + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // The stderr data captured by the child process. + std::string captured_stderr_; + + zx::process child_process_; + zx::channel exception_channel_; + zx::socket stderr_socket_; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { args_.push_back(nullptr); } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { return &args_[0]; } + + int size() { return static_cast(args_.size()) - 1; } + + private: + std::vector args_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int FuchsiaDeathTest::Wait() { + const int kProcessKey = 0; + const int kSocketKey = 1; + const int kExceptionKey = 2; + + if (!spawned()) return 0; + + // Create a port to wait for socket/task/exception events. + zx_status_t status_zx; + zx::port port; + status_zx = zx::port::create(0, &port); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for the child process to terminate. + status_zx = + child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for the socket to be readable or closed. + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for an exception. + status_zx = exception_channel_.wait_async(port, kExceptionKey, + ZX_CHANNEL_READABLE, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + bool process_terminated = false; + bool socket_closed = false; + do { + zx_port_packet_t packet = {}; + status_zx = port.wait(zx::time::infinite(), &packet); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + if (packet.key == kExceptionKey) { + // Process encountered an exception. Kill it directly rather than + // letting other handlers process the event. We will get a kProcessKey + // event when the process actually terminates. + status_zx = child_process_.kill(); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } else if (packet.key == kProcessKey) { + // Process terminated. + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + process_terminated = true; + } else if (packet.key == kSocketKey) { + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + if (packet.signal.observed & ZX_SOCKET_READABLE) { + // Read data from the socket. + constexpr size_t kBufferSize = 1024; + do { + size_t old_length = captured_stderr_.length(); + size_t bytes_read = 0; + captured_stderr_.resize(old_length + kBufferSize); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); + captured_stderr_.resize(old_length + bytes_read); + } while (status_zx == ZX_OK); + if (status_zx == ZX_ERR_PEER_CLOSED) { + socket_closed = true; + } else { + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } + } else { + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED); + socket_closed = true; + } + } + } while (!process_terminated && !socket_closed); + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer), + nullptr, nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED); + set_status(static_cast(buffer.return_code)); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line. + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + int child_pipe_fd; + status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + set_read_fd(child_pipe_fd); + + // Set the pipe handle for the child. + fdio_spawn_action_t spawn_actions[2] = {}; + fdio_spawn_action_t* add_handle_action = &spawn_actions[0]; + add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE; + add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd); + add_handle_action->h.handle = child_pipe_handle; + + // Create a socket pair will be used to receive the child process' stderr. + zx::socket stderr_producer_socket; + status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + GTEST_DEATH_TEST_CHECK_(status >= 0); + int stderr_producer_fd = -1; + status = + fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd); + GTEST_DEATH_TEST_CHECK_(status >= 0); + + // Make the stderr socket nonblocking. + GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0); + + fdio_spawn_action_t* add_stderr_action = &spawn_actions[1]; + add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD; + add_stderr_action->fd.local_fd = stderr_producer_fd; + add_stderr_action->fd.target_fd = STDERR_FILENO; + + // Create a child job. + zx_handle_t child_job = ZX_HANDLE_INVALID; + status = zx_job_create(zx_job_default(), 0, &child_job); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + zx_policy_basic_t policy; + policy.condition = ZX_POL_NEW_ANY; + policy.policy = ZX_POL_ACTION_ALLOW; + status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, + &policy, 1); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Create an exception channel attached to the |child_job|, to allow + // us to suppress the system default exception handler from firing. + status = zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Spawn the child process. + status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], + args.Argv(), nullptr, 2, spawn_actions, + child_process_.reset_and_get_address(), nullptr); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + set_spawned(true); + return OVERSEE_TEST; +} + +std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; } + +#else // We are neither on Windows, nor on Fuchsia. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, Matcher matcher); + + // All of these virtual functions are inherited from DeathTest. + int Wait() override; + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, + Matcher matcher) + : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int ForkingDeathTest::Wait() { + if (!spawned()) return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, Matcher matcher) + : ForkingDeathTest(a_statement, std::move(matcher)) {} + TestRole AssumeRole() override; +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. +DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. + // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + g_in_fast_death_test_child = true; + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. +class ExecDeathTest : public ForkingDeathTest { + public: + ExecDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : ForkingDeathTest(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + TestRole AssumeRole() override; + + private: + static ::std::vector GetArgvsForDeathTestChildProcess() { + ::std::vector args = GetInjectableArgvs(); +#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + ::std::vector extra_args = + GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); + args.insert(args.end(), extra_args.begin(), extra_args.end()); +#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + return args; + } + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { args_.push_back(nullptr); } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { return &args_[0]; } + + private: + std::vector args_; +}; + +// A struct that encompasses the arguments to the child process of a +// threadsafe-style death test process. +struct ExecDeathTestArgs { + char* const* argv; // Command-line arguments for the child's call to exec + int close_fd; // File descriptor to close; the read end of a pipe +}; + +#if GTEST_OS_QNX +extern "C" char** environ; +#else // GTEST_OS_QNX +// The main function for a threadsafe-style death test child process. +// This function is called in a clone()-ed process and thus must avoid +// any potentially unsafe operations like malloc or libc functions. +static int ExecDeathTestChildMain(void* child_arg) { + ExecDeathTestArgs* const args = static_cast(child_arg); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); + + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + // We can safely call execv() as it's almost a direct system call. We + // cannot use execvp() as it's a libc function and thus potentially + // unsafe. Since execv() doesn't search the PATH, the user must + // invoke the test program via a valid path that contains at least + // one path separator. + execv(args->argv[0], args->argv); + DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + + original_dir + " failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; +} +#endif // GTEST_OS_QNX + +#if GTEST_HAS_CLONE +// Two utility routines that together determine the direction the stack +// grows. +// This could be accomplished more elegantly by a single recursive +// function, but we want to guard against the unlikely possibility of +// a smart compiler optimizing the recursion away. +// +// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining +// StackLowerThanAddress into StackGrowsDown, which then doesn't give +// correct answer. +static void StackLowerThanAddress(const void* ptr, + bool* result) GTEST_NO_INLINE_; +// Make sure sanitizers do not tamper with the stack here. +// Ideally, we want to use `__builtin_frame_address` instead of a local variable +// address with sanitizer disabled, but it does not work when the +// compiler optimizes the stack frame out, which happens on PowerPC targets. +// HWAddressSanitizer add a random tag to the MSB of the local variable address, +// making comparison result unpredictable. +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +static void StackLowerThanAddress(const void* ptr, bool* result) { + int dummy = 0; + *result = std::less()(&dummy, ptr); +} + +// Make sure AddressSanitizer does not tamper with the stack here. +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +static bool StackGrowsDown() { + int dummy = 0; + bool result; + StackLowerThanAddress(&dummy, &result); + return result; +} +#endif // GTEST_HAS_CLONE + +// Spawns a child process with the same executable as the current process in +// a thread-safe manner and instructs it to run the death test. The +// implementation uses fork(2) + exec. On systems where clone(2) is +// available, it is used instead, being slightly more thread-safe. On QNX, +// fork supports only single-threaded environments, so this function uses +// spawn(2) there instead. The function dies with an error message if +// anything goes wrong. +static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { + ExecDeathTestArgs args = {argv, close_fd}; + pid_t child_pid = -1; + +#if GTEST_OS_QNX + // Obtains the current directory and sets it to be closed in the child + // process. + const int cwd_fd = open(".", O_RDONLY); + GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + int fd_flags; + // Set close_fd to be closed after spawn. + GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); + GTEST_DEATH_TEST_CHECK_SYSCALL_( + fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC)); + struct inheritance inherit = {0}; + // spawn is a system call. + child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); + // Restores the current working directory. + GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); + +#else // GTEST_OS_QNX +#if GTEST_OS_LINUX + // When a SIGPROF signal is received while fork() or clone() are executing, + // the process may hang. To avoid this, we ignore SIGPROF here and re-enable + // it after the call to fork()/clone() is complete. + struct sigaction saved_sigprof_action; + struct sigaction ignore_sigprof_action; + memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); + sigemptyset(&ignore_sigprof_action.sa_mask); + ignore_sigprof_action.sa_handler = SIG_IGN; + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG_GET(death_test_use_fork); + + if (!use_fork) { + static const bool stack_grows_down = StackGrowsDown(); + const auto stack_size = static_cast(getpagesize() * 2); + // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. + void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); + + // Maximum stack alignment in bytes: For a downward-growing stack, this + // amount is subtracted from size of the stack space to get an address + // that is within the stack space and is aligned on all systems we care + // about. As far as I know there is no ABI with stack alignment greater + // than 64. We assume stack and stack_size already have alignment of + // kMaxStackAlignment. + const size_t kMaxStackAlignment = 64; + void* const stack_top = + static_cast(stack) + + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + GTEST_DEATH_TEST_CHECK_( + static_cast(stack_size) > kMaxStackAlignment && + reinterpret_cast(stack_top) % kMaxStackAlignment == 0); + + child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); + + GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); + } +#else + const bool use_fork = true; +#endif // GTEST_HAS_CLONE + + if (use_fork && (child_pid = fork()) == 0) { + ExecDeathTestChildMain(&args); + _exit(0); + } +#endif // GTEST_OS_QNX +#if GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &saved_sigprof_action, nullptr)); +#endif // GTEST_OS_LINUX + + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + return child_pid; +} + +// The AssumeRole process for a fork-and-exec death test. It re-executes the +// main program from the beginning, setting the --gtest_filter +// and --gtest_internal_run_death_test flags to cause only the current +// death test to be re-run. +DeathTest::TestRole ExecDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + // Clear the close-on-exec flag on the write end of the pipe, lest + // it be closed when the child process does an exec: + GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); + + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); + Arguments args; + args.AddArguments(GetArgvsForDeathTestChildProcess()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // See the comment in NoExecDeathTest::AssumeRole for why the next line + // is necessary. + FlushInfoLog(); + + const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_child_pid(child_pid); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; +} + +#endif // !GTEST_OS_WINDOWS + +// Creates a concrete DeathTest-derived class that depends on the +// --gtest_death_test_style flag, and sets the pointer pointed to +// by the "test" argument to its address. If the test should be +// skipped, sets that pointer to NULL. Returns true, unless the +// flag is set to an invalid value. +bool DefaultDeathTestFactory::Create(const char* statement, + Matcher matcher, + const char* file, int line, + DeathTest** test) { + UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const int death_test_index = + impl->current_test_info()->increment_death_test_count(); + + if (flag != nullptr) { + if (death_test_index > flag->index()) { + DeathTest::set_last_death_test_message( + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); + return false; + } + + if (!(flag->file() == file && flag->line() == line && + flag->index() == death_test_index)) { + *test = nullptr; + return true; + } + } + +#if GTEST_OS_WINDOWS + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new WindowsDeathTest(statement, std::move(matcher), file, line); + } + +#elif GTEST_OS_FUCHSIA + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line); + } + +#else + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") { + *test = new ExecDeathTest(statement, std::move(matcher), file, line); + } else if (GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new NoExecDeathTest(statement, std::move(matcher)); + } + +#endif // GTEST_OS_WINDOWS + + else { // NOLINT - this is more readable than unbalanced brackets inside #if. + DeathTest::set_last_death_test_message("Unknown death test style \"" + + GTEST_FLAG_GET(death_test_style) + + "\" encountered"); + return false; + } + + return true; +} + +#if GTEST_OS_WINDOWS +// Recreates the pipe and event handles from the provided parameters, +// signals the event, and returns a file descriptor wrapped around the pipe +// handle. This function is called in the child process only. +static int GetStatusFileDescriptor(unsigned int parent_process_id, + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { + AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, + FALSE, // Non-inheritable. + parent_process_id)); + if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { + DeathTestAbort("Unable to open parent process " + + StreamableToString(parent_process_id)); + } + + GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); + + const HANDLE write_handle = reinterpret_cast(write_handle_as_size_t); + HANDLE dup_write_handle; + + // The newly initialized handle is accessible only in the parent + // process. To obtain one accessible within the child, we need to use + // DuplicateHandle. + if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, + ::GetCurrentProcess(), &dup_write_handle, + 0x0, // Requested privileges ignored since + // DUPLICATE_SAME_ACCESS is used. + FALSE, // Request non-inheritable handler. + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the pipe handle " + + StreamableToString(write_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); + HANDLE dup_event_handle; + + if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, + ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE, + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the event handle " + + StreamableToString(event_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const int write_fd = + ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); + if (write_fd == -1) { + DeathTestAbort("Unable to convert pipe handle " + + StreamableToString(write_handle_as_size_t) + + " to a file descriptor"); + } + + // Signals the parent that the write end of the pipe has been acquired + // so the parent can release its own write end. + ::SetEvent(dup_event_handle); + + return write_fd; +} +#endif // GTEST_OS_WINDOWS + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { + if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr; + + // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we + // can use it here. + int line = -1; + int index = -1; + ::std::vector< ::std::string> fields; + SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields); + int write_fd = -1; + +#if GTEST_OS_WINDOWS + + unsigned int parent_process_id = 0; + size_t write_handle_as_size_t = 0; + size_t event_handle_as_size_t = 0; + + if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &parent_process_id) || + !ParseNaturalNumber(fields[4], &write_handle_as_size_t) || + !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, + event_handle_as_size_t); + +#elif GTEST_OS_FUCHSIA + + if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + +#else + + if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + +#endif // GTEST_OS_WINDOWS + + return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); +} + +} // namespace internal + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc new file mode 100644 index 0000000000..f6ee90cdb7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -0,0 +1,367 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/internal/gtest-filepath.h" + +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_WINDOWS_MOBILE +#include +#elif GTEST_OS_WINDOWS +#include +#include +#else +#include + +#include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE + +#include "gtest/internal/gtest-string.h" + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_MAX_ _MAX_PATH +#elif defined(PATH_MAX) +#define GTEST_PATH_MAX_ PATH_MAX +#elif defined(_XOPEN_PATH_MAX) +#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#else +#define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#endif // GTEST_OS_WINDOWS + +namespace testing { +namespace internal { + +#if GTEST_OS_WINDOWS +// On Windows, '\\' is the standard path separator, but many tools and the +// Windows API also accept '/' as an alternate path separator. Unless otherwise +// noted, a file path can contain either kind of path separators, or a mixture +// of them. +const char kPathSeparator = '\\'; +const char kAlternatePathSeparator = '/'; +const char kAlternatePathSeparatorString[] = "/"; +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE doesn't have a current directory. You should not use +// the current directory in tests on Windows CE, but this at least +// provides a reasonable fallback. +const char kCurrentDirectoryString[] = "\\"; +// Windows CE doesn't define INVALID_FILE_ATTRIBUTES +const DWORD kInvalidFileAttributes = 0xffffffff; +#else +const char kCurrentDirectoryString[] = ".\\"; +#endif // GTEST_OS_WINDOWS_MOBILE +#else +const char kPathSeparator = '/'; +const char kCurrentDirectoryString[] = "./"; +#endif // GTEST_OS_WINDOWS + +// Returns whether the given character is a valid path separator. +static bool IsPathSeparator(char c) { +#if GTEST_HAS_ALT_PATH_SEP_ + return (c == kPathSeparator) || (c == kAlternatePathSeparator); +#else + return c == kPathSeparator; +#endif +} + +// Returns the current working directory, or "" if unsuccessful. +FilePath FilePath::GetCurrentDir() { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \ + GTEST_OS_XTENSA + // These platforms do not have a current directory, so we just return + // something reasonable. + return FilePath(kCurrentDirectoryString); +#elif GTEST_OS_WINDOWS + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; + return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); +#else + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; + char* result = getcwd(cwd, sizeof(cwd)); +#if GTEST_OS_NACL + // getcwd will likely fail in NaCl due to the sandbox, so return something + // reasonable. The user may have provided a shim implementation for getcwd, + // however, so fallback only when failure is detected. + return FilePath(result == nullptr ? kCurrentDirectoryString : cwd); +#endif // GTEST_OS_NACL + return FilePath(result == nullptr ? "" : cwd); +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns a copy of the FilePath with the case-insensitive extension removed. +// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns +// FilePath("dir/file"). If a case-insensitive extension is not +// found, returns a copy of the original FilePath. +FilePath FilePath::RemoveExtension(const char* extension) const { + const std::string dot_extension = std::string(".") + extension; + if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { + return FilePath( + pathname_.substr(0, pathname_.length() - dot_extension.length())); + } + return *this; +} + +// Returns a pointer to the last occurrence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != nullptr && + (last_sep == nullptr || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(last_sep + 1) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveFileName() const { + const char* const last_sep = FindLastPathSeparator(); + std::string dir; + if (last_sep) { + dir = std::string(c_str(), static_cast(last_sep + 1 - c_str())); + } else { + dir = kCurrentDirectoryString; + } + return FilePath(dir); +} + +// Helper functions for naming files in a directory for xml output. + +// Given directory = "dir", base_name = "test", number = 0, +// extension = "xml", returns "dir/test.xml". If number is greater +// than zero (e.g., 12), returns "dir/test_12.xml". +// On Windows platform, uses \ as the separator rather than /. +FilePath FilePath::MakeFileName(const FilePath& directory, + const FilePath& base_name, int number, + const char* extension) { + std::string file; + if (number == 0) { + file = base_name.string() + "." + extension; + } else { + file = + base_name.string() + "_" + StreamableToString(number) + "." + extension; + } + return ConcatPaths(directory, FilePath(file)); +} + +// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". +// On Windows, uses \ as the separator rather than /. +FilePath FilePath::ConcatPaths(const FilePath& directory, + const FilePath& relative_path) { + if (directory.IsEmpty()) return relative_path; + const FilePath dir(directory.RemoveTrailingPathSeparator()); + return FilePath(dir.string() + kPathSeparator + relative_path.string()); +} + +// Returns true if pathname describes something findable in the file-system, +// either a file, directory, or whatever. +bool FilePath::FileOrDirectoryExists() const { +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete[] unicode; + return attributes != kInvalidFileAttributes; +#else + posix::StatStruct file_stat{}; + return posix::Stat(pathname_.c_str(), &file_stat) == 0; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns true if pathname describes a directory in the file-system +// that exists. +bool FilePath::DirectoryExists() const { + bool result = false; +#if GTEST_OS_WINDOWS + // Don't strip off trailing separator if path is a root directory on + // Windows (like "C:\\"). + const FilePath& path(IsRootDirectory() ? *this + : RemoveTrailingPathSeparator()); +#else + const FilePath& path(*this); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete[] unicode; + if ((attributes != kInvalidFileAttributes) && + (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = true; + } +#else + posix::StatStruct file_stat{}; + result = + posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); +#endif // GTEST_OS_WINDOWS_MOBILE + + return result; +} + +// Returns true if pathname describes a root directory. (Windows has one +// root directory per disk drive.) +bool FilePath::IsRootDirectory() const { +#if GTEST_OS_WINDOWS + return pathname_.length() == 3 && IsAbsolutePath(); +#else + return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); +#endif +} + +// Returns true if pathname describes an absolute path. +bool FilePath::IsAbsolutePath() const { + const char* const name = pathname_.c_str(); +#if GTEST_OS_WINDOWS + return pathname_.length() >= 3 && + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && IsPathSeparator(name[2]); +#else + return IsPathSeparator(name[0]); +#endif +} + +// Returns a pathname for a file that does not currently exist. The pathname +// will be directory/base_name.extension or +// directory/base_name_.extension if directory/base_name.extension +// already exists. The number will be incremented until a pathname is found +// that does not already exist. +// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. +// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. +bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, nullptr) ? 0 : -1; + delete[] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA + // do nothing + int result = 0; +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +void FilePath::Normalize() { + auto out = pathname_.begin(); + + for (const char character : pathname_) { + if (!IsPathSeparator(character)) { + *(out++) = character; + } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) { + *(out++) = kPathSeparator; + } else { + continue; + } + } + + pathname_.erase(out, pathname_.end()); +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h new file mode 100644 index 0000000000..0b9e929c68 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -0,0 +1,1212 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework.// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ +#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ + +#ifndef _WIN32_WCE +#include +#endif // !_WIN32_WCE +#include +#include // For strtoll/_strtoul64/malloc/free. +#include // For memmove. + +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#endif + +#if GTEST_OS_WINDOWS +#include // NOLINT +#endif // GTEST_OS_WINDOWS + +#include "gtest/gtest-spi.h" +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. +GTEST_DECLARE_bool_(death_test_use_fork); + +namespace testing { +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true if and only if Google Test should use colors in the output. +GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Converts the given time in milliseconds to a date string in the ISO 8601 +// format, without the timezone information. N.B.: due to the use the +// non-reentrant localtime() function, this function is not thread safe. Do +// not use it in any code that can be called from multiple threads. +GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { + const unsigned int raw_seed = + (random_seed_flag == 0) ? static_cast(GetTimeInMillis()) + : static_cast(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast((raw_seed - 1U) % + static_cast(kMaxRandomSeed)) + + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. +inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. + GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG_GET(break_on_failure); + catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions); + color_ = GTEST_FLAG_GET(color); + death_test_style_ = GTEST_FLAG_GET(death_test_style); + death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork); + fail_fast_ = GTEST_FLAG_GET(fail_fast); + filter_ = GTEST_FLAG_GET(filter); + internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test); + list_tests_ = GTEST_FLAG_GET(list_tests); + output_ = GTEST_FLAG_GET(output); + brief_ = GTEST_FLAG_GET(brief); + print_time_ = GTEST_FLAG_GET(print_time); + print_utf8_ = GTEST_FLAG_GET(print_utf8); + random_seed_ = GTEST_FLAG_GET(random_seed); + repeat_ = GTEST_FLAG_GET(repeat); + recreate_environments_when_repeating_ = + GTEST_FLAG_GET(recreate_environments_when_repeating); + shuffle_ = GTEST_FLAG_GET(shuffle); + stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth); + stream_result_to_ = GTEST_FLAG_GET(stream_result_to); + throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_); + GTEST_FLAG_SET(break_on_failure, break_on_failure_); + GTEST_FLAG_SET(catch_exceptions, catch_exceptions_); + GTEST_FLAG_SET(color, color_); + GTEST_FLAG_SET(death_test_style, death_test_style_); + GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_); + GTEST_FLAG_SET(filter, filter_); + GTEST_FLAG_SET(fail_fast, fail_fast_); + GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_); + GTEST_FLAG_SET(list_tests, list_tests_); + GTEST_FLAG_SET(output, output_); + GTEST_FLAG_SET(brief, brief_); + GTEST_FLAG_SET(print_time, print_time_); + GTEST_FLAG_SET(print_utf8, print_utf8_); + GTEST_FLAG_SET(random_seed, random_seed_); + GTEST_FLAG_SET(repeat, repeat_); + GTEST_FLAG_SET(recreate_environments_when_repeating, + recreate_environments_when_repeating_); + GTEST_FLAG_SET(shuffle, shuffle_); + GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_); + GTEST_FLAG_SET(stream_result_to, stream_result_to_); + GTEST_FLAG_SET(throw_on_failure, throw_on_failure_); + } + + private: + // Fields for saving the original values of flags. + bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + std::string color_; + std::string death_test_style_; + bool death_test_use_fork_; + bool fail_fast_; + std::string filter_; + std::string internal_run_death_test_; + bool list_tests_; + std::string output_; + bool brief_; + bool print_time_; + bool print_utf8_; + int32_t random_seed_; + int32_t repeat_; + bool recreate_environments_when_repeating_; + bool shuffle_; + int32_t stack_trace_depth_; + std::string stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +GTEST_API_ std::string CodePointToUtf8(uint32_t code_point); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded(); + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (e.g., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. +GTEST_API_ bool ShouldShard(const char* total_shards_str, + const char* shard_index_str, + bool in_subprocess_for_death_test); + +// Parses the environment variable var as a 32-bit integer. If it is unset, +// returns default_val. If it is not a 32-bit integer, prints an error and +// and aborts. +GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); + +// Given the total number of shards, the shard index, and the test id, +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index, + int test_id); + +// STL container utilities. + +// Returns the number of elements in the given container that satisfy +// the given predicate. +template +inline int CountIf(const Container& c, Predicate predicate) { + // Implemented as an explicit loop since std::count_if() in libCstd on + // Solaris has a non-standard signature. + int count = 0; + for (auto it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) ++count; + } + return count; +} + +// Applies a function/functor to each element in the container. +template +void ForEach(const Container& c, Functor functor) { + std::for_each(c.begin(), c.end(), functor); +} + +// Returns the i-th element of the vector, or default_value if i is not +// in range [0, v.size()). +template +inline E GetElementOr(const std::vector& v, int i, E default_value) { + return (i < 0 || i >= static_cast(v.size())) ? default_value + : v[static_cast(i)]; +} + +// Performs an in-place shuffle of a range of the vector's elements. +// 'begin' and 'end' are element indices as an STL-style range; +// i.e. [begin, end) are shuffled, where 'end' == size() means to +// shuffle to the end of the vector. +template +void ShuffleRange(internal::Random* random, int begin, int end, + std::vector* v) { + const int size = static_cast(v->size()); + GTEST_CHECK_(0 <= begin && begin <= size) + << "Invalid shuffle range start " << begin << ": must be in range [0, " + << size << "]."; + GTEST_CHECK_(begin <= end && end <= size) + << "Invalid shuffle range finish " << end << ": must be in range [" + << begin << ", " << size << "]."; + + // Fisher-Yates shuffle, from + // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle + for (int range_width = end - begin; range_width >= 2; range_width--) { + const int last_in_range = begin + range_width - 1; + const int selected = + begin + + static_cast(random->Generate(static_cast(range_width))); + std::swap((*v)[static_cast(selected)], + (*v)[static_cast(last_in_range)]); + } +} + +// Performs an in-place shuffle of the vector's elements. +template +inline void Shuffle(internal::Random* random, std::vector* v) { + ShuffleRange(random, 0, static_cast(v->size()), v); +} + +// A function for deleting an object. Handy for being used as a +// functor. +template +static void Delete(T* x) { + delete x; +} + +// A predicate that checks the key of a TestProperty against a known key. +// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. + // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} + + // Returns true if and only if the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return test_property.key() == key_; + } + + private: + std::string key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static std::string GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static std::string GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true if and only if the user-specified filter matches the test + // suite name and the test name. + static bool FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. + + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". + static bool MatchesFilter(const std::string& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as an std::string. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. + static const char* const kElidedFramesMarker; + + private: + OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete; + OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) = + delete; +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() {} + + std::string CurrentStackTrace(int max_depth, int skip_count) override; + void UponLeavingGTest() override; + + private: +#if GTEST_HAS_ABSL + Mutex mutex_; // Protects all internal state. + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to the stack trace code from within the user code. + void* caller_frame_ = nullptr; +#endif // GTEST_HAS_ABSL + + OsStackTraceGetter(const OsStackTraceGetter&) = delete; + OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete; +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + std::string message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + UnitTestImpl* const unit_test_; + + DefaultGlobalTestPartResultReporter( + const DefaultGlobalTestPartResultReporter&) = delete; + DefaultGlobalTestPartResultReporter& operator=( + const DefaultGlobalTestPartResultReporter&) = delete; +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + UnitTestImpl* const unit_test_; + + DefaultPerThreadTestPartResultReporter( + const DefaultPerThreadTestPartResultReporter&) = delete; + DefaultPerThreadTestPartResultReporter& operator=( + const DefaultPerThreadTestPartResultReporter&) = delete; +}; + +// The private implementation of the UnitTest class. We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. + // You can register your own repoter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result repoter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test suites. + int successful_test_suite_count() const; + + // Gets the number of failed test suites. + int failed_test_suite_count() const; + + // Gets the number of all test suites. + int total_test_suite_count() const; + + // Gets the number of all test suites that contain at least one test + // that should run. + int test_suite_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of skipped tests. + int skipped_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). + bool Passed() const { return !Failed(); } + + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). + bool Failed() const { + return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(i)]; + } + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* GetTestCase(int i) const { return GetTestSuite(i); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableSuiteCase(int i) { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(index)]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. + const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. + // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. + void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as an std::string. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + std::string CurrentOsStackTraceExceptTop(int skip_count) + GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_; + + // Finds and returns a TestSuite with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_suite_name: name of the test suite + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + TestCase* GetTestCase(const char* test_case_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc); + } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Adds a TestInfo to the unit test. + // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + // test_info: the TestInfo object + void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, + TestInfo* test_info) { +#if GTEST_HAS_DEATH_TEST + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } +#endif // GTEST_HAS_DEATH_TEST + + GetTestSuite(test_info->test_suite_name(), test_info->type_param(), + set_up_tc, tear_down_tc) + ->AddTestInfo(test_info); + } + + // Returns ParameterizedTestSuiteRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } + + std::set* ignored_parameterized_test_suites() { + return &ignored_parameterized_test_suites_; + } + + // Returns TypeParameterizedTestSuiteRegistry object used to keep track of + // type-parameterized tests and instantiations of them. + internal::TypeParameterizedTestSuiteRegistry& + type_parameterized_test_registry() { + return type_parameterized_test_registry_; + } + + // Sets the TestSuite object for the test that's currently running. + void set_current_test_suite(TestSuite* a_current_test_suite) { + current_test_suite_ = a_current_test_suite; + } + + // Sets the TestInfo object for the test that's currently running. If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. + void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. If any exception is + // thrown during a test, this test is considered to be failed, but + // the rest of the tests will still be run. + bool RunAllTests(); + + // Clears the results of all tests, except the ad hoc tests. + void ClearNonAdHocTestResult() { + ForEach(test_suites_, TestSuite::ClearTestSuiteResult); + } + + // Clears the results of ad-hoc test assertions. + void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); } + + // Adds a TestProperty to the current TestResult object when invoked in a + // context of a test or a test suite, or to the global property set. If the + // result already contains a property with the same key, the value will be + // updated. + void RecordProperty(const TestProperty& test_property); + + enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL }; + + // Matches the full name of each test against the user-specified + // filter to decide whether the test should run, then records the + // result in each TestSuite and TestInfo object. + // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests + // based on sharding variables in the environment. + // Returns the number of tests that should run. + int FilterTests(ReactionToSharding shard_tests); + + // Prints the names of the tests matching the user-specified filter flag. + void ListTestsMatchingFilter(); + + const TestSuite* current_test_suite() const { return current_test_suite_; } + TestInfo* current_test_info() { return current_test_info_; } + const TestInfo* current_test_info() const { return current_test_info_; } + + // Returns the vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector& environments() { return environments_; } + + // Getters for the per-thread Google Test trace stack. + std::vector& gtest_trace_stack() { + return *(gtest_trace_stack_.pointer()); + } + const std::vector& gtest_trace_stack() const { + return gtest_trace_stack_.get(); + } + +#if GTEST_HAS_DEATH_TEST + void InitDeathTestSubprocessControlInfo() { + internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); + } + // Returns a pointer to the parsed --gtest_internal_run_death_test + // flag, or NULL if that flag was not specified. + // This information is useful only in a death test child process. + // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. + internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test suites, and the tests within each test suite, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test suites and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. + bool catch_exceptions() const { return catch_exceptions_; } + + private: + friend class ::testing::UnitTest; + + // Used by UnitTest::Run() to capture the state of + // GTEST_FLAG(catch_exceptions) at the moment it starts. + void set_catch_exceptions(bool value) { catch_exceptions_ = value; } + + // The UnitTest object that owns this implementation object. + UnitTest* const parent_; + + // The working directory when the first TEST() or TEST_F() was + // executed. + internal::FilePath original_working_dir_; + + // The default test part result reporters. + DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; + DefaultPerThreadTestPartResultReporter + default_per_thread_test_part_result_reporter_; + + // Points to (but doesn't own) the global test part result reporter. + TestPartResultReporterInterface* global_test_part_result_repoter_; + + // Protects read and write access to global_test_part_result_reporter_. + internal::Mutex global_test_part_result_reporter_mutex_; + + // Points to (but doesn't own) the per-thread test part result reporter. + internal::ThreadLocal + per_thread_test_part_result_reporter_; + + // The vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector environments_; + + // The vector of TestSuites in their original order. It owns the + // elements in the vector. + std::vector test_suites_; + + // Provides a level of indirection for the test suite list to allow + // easy shuffling and restoring the test suite order. The i-th + // element of this vector is the index of the i-th test suite in the + // shuffled order. + std::vector test_suite_indices_; + + // ParameterizedTestRegistry object used to register value-parameterized + // tests. + internal::ParameterizedTestSuiteRegistry parameterized_test_registry_; + internal::TypeParameterizedTestSuiteRegistry + type_parameterized_test_registry_; + + // The set holding the name of parameterized + // test suites that may go uninstantiated. + std::set ignored_parameterized_test_suites_; + + // Indicates whether RegisterParameterizedTests() has been called already. + bool parameterized_tests_registered_; + + // Index of the last death test suite registered. Initially -1. + int last_death_test_suite_; + + // This points to the TestSuite for the currently running test. It + // changes as Google Test goes through one test suite after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestSuite* current_test_suite_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. + // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True if and only if PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // The time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp_; + + // How long the test took to run, in milliseconds. + TimeInMillis elapsed_time_; + +#if GTEST_HAS_DEATH_TEST + // The decomposed components of the gtest_internal_run_death_test flag, + // parsed when RUN_ALL_TESTS is called. + std::unique_ptr internal_run_death_test_flag_; + std::unique_ptr death_test_factory_; +#endif // GTEST_HAS_DEATH_TEST + + // A per-thread stack of traces created by the SCOPED_TRACE() macro. + internal::ThreadLocal > gtest_trace_stack_; + + // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() + // starts. + bool catch_exceptions_; + + UnitTestImpl(const UnitTestImpl&) = delete; + UnitTestImpl& operator=(const UnitTestImpl&) = delete; +}; // class UnitTestImpl + +// Convenience function for accessing the global UnitTest +// implementation object. +inline UnitTestImpl* GetUnitTestImpl() { + return UnitTest::GetInstance()->impl(); +} + +#if GTEST_USES_SIMPLE_RE + +// Internal helper functions for implementing the simple regular +// expression matcher. +GTEST_API_ bool IsInSet(char ch, const char* str); +GTEST_API_ bool IsAsciiDigit(char ch); +GTEST_API_ bool IsAsciiPunct(char ch); +GTEST_API_ bool IsRepeat(char ch); +GTEST_API_ bool IsAsciiWhiteSpace(char ch); +GTEST_API_ bool IsAsciiWordChar(char ch); +GTEST_API_ bool IsValidEscape(char ch); +GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); +GTEST_API_ bool ValidateRegex(const char* regex); +GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch, + char repeat, const char* regex, + const char* str); +GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); + +#endif // GTEST_USES_SIMPLE_RE + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); + +#if GTEST_HAS_DEATH_TEST + +// Returns the message describing the last system error, regardless of the +// platform. +GTEST_API_ std::string GetLastErrnoDescription(); + +// Attempts to parse a string into a positive integer pointed to by the +// number parameter. Returns true if that is possible. +// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use +// it here. +template +bool ParseNaturalNumber(const ::std::string& str, Integer* number) { + // Fail fast if the given string does not begin with a digit; + // this bypasses strtoXXX's "optional leading whitespace and plus + // or minus sign" semantics, which are undesirable here. + if (str.empty() || !IsDigit(str[0])) { + return false; + } + errno = 0; + + char* end; + // BiggestConvertible is the largest integer type that system-provided + // string-to-number conversion routines can return. + using BiggestConvertible = unsigned long long; // NOLINT + + const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); // NOLINT + const bool parse_success = *end == '\0' && errno == 0; + + GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); + + const Integer result = static_cast(parsed); + if (parse_success && static_cast(result) == parsed) { + *number = result; + return true; + } + return false; +} +#endif // GTEST_HAS_DEATH_TEST + +// TestResult contains some private methods that should be hidden from +// Google Test user but are required for testing. This class allow our tests +// to access them. +// +// This class is supplied only for the purpose of testing Google Test's own +// constructs. Do not use it in user tests, either directly or indirectly. +class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const std::string& xml_element, + const TestProperty& property) { + test_result->RecordProperty(xml_element, property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class StreamingListener : public EmptyTestEventListener { + public: + // Abstract base class for writing strings to a socket. + class AbstractSocketWriter { + public: + virtual ~AbstractSocketWriter() {} + + // Sends a string to the socket. + virtual void Send(const std::string& message) = 0; + + // Closes the socket. + virtual void CloseConnection() {} + + // Sends a string and a newline to the socket. + void SendLn(const std::string& message) { Send(message + "\n"); } + }; + + // Concrete class for actually writing strings to a socket. + class SocketWriter : public AbstractSocketWriter { + public: + SocketWriter(const std::string& host, const std::string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + } + + ~SocketWriter() override { + if (sockfd_ != -1) CloseConnection(); + } + + // Sends a string to the socket. + void Send(const std::string& message) override { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const auto len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != static_cast(len)) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() override { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + int sockfd_; // socket file descriptor + const std::string host_name_; + const std::string port_num_; + + SocketWriter(const SocketWriter&) = delete; + SocketWriter& operator=(const SocketWriter&) = delete; + }; // class SocketWriter + + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static std::string UrlEncode(const char* str); + + StreamingListener(const std::string& host, const std::string& port) + : socket_writer_(new SocketWriter(host, port)) { + Start(); + } + + explicit StreamingListener(AbstractSocketWriter* socket_writer) + : socket_writer_(socket_writer) { + Start(); + } + + void OnTestProgramStart(const UnitTest& /* unit_test */) override { + SendLn("event=TestProgramStart"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) override { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); + + // Notify the streaming server to stop. + socket_writer_->CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, + int iteration) override { + SendLn("event=TestIterationStart&iteration=" + + StreamableToString(iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, + int /* iteration */) override { + SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + + "ms"); + } + + // Note that "event=TestCaseStart" is a wire format and has to remain + // "case" for compatibility + void OnTestSuiteStart(const TestSuite& test_suite) override { + SendLn(std::string("event=TestCaseStart&name=") + test_suite.name()); + } + + // Note that "event=TestCaseEnd" is a wire format and has to remain + // "case" for compatibility + void OnTestSuiteEnd(const TestSuite& test_suite) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) + + "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) + + "ms"); + } + + void OnTestStart(const TestInfo& test_info) override { + SendLn(std::string("event=TestStart&name=") + test_info.name()); + } + + void OnTestEnd(const TestInfo& test_info) override { + SendLn("event=TestEnd&passed=" + + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + + StreamableToString((test_info.result())->elapsed_time()) + "ms"); + } + + void OnTestPartResult(const TestPartResult& test_part_result) override { + const char* file_name = test_part_result.file_name(); + if (file_name == nullptr) file_name = ""; + SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + + "&line=" + StreamableToString(test_part_result.line_number()) + + "&message=" + UrlEncode(test_part_result.message())); + } + + private: + // Sends the given message and a newline to the socket. + void SendLn(const std::string& message) { socket_writer_->SendLn(message); } + + // Called at the start of streaming to notify the receiver what + // protocol we are using. + void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } + + std::string FormatBool(bool value) { return value ? "1" : "0"; } + + const std::unique_ptr socket_writer_; + + StreamingListener(const StreamingListener&) = delete; + StreamingListener& operator=(const StreamingListener&) = delete; +}; // class StreamingListener + +#endif // GTEST_CAN_STREAM_RESULTS_ + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc new file mode 100644 index 0000000000..7e3bcc0cff --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc @@ -0,0 +1,98 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +#include "gtest/gtest-matchers.h" + +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(internal::StringView s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(internal::StringView s) { + *this = Eq(std::string(s)); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc new file mode 100644 index 0000000000..d797fe4d58 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -0,0 +1,1394 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/internal/gtest-port.h" + +#include +#include +#include +#include + +#include +#include +#include + +#if GTEST_OS_WINDOWS +#include +#include +#include + +#include // Used in ThreadLocal. +#ifdef _MSC_VER +#include +#endif // _MSC_VER +#else +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#include +#include +#include +#endif // GTEST_OS_MAC + +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD || GTEST_OS_OPENBSD +#include +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#include +#endif +#endif + +#if GTEST_OS_QNX +#include +#include +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_AIX +#include +#include +#endif // GTEST_OS_AIX + +#if GTEST_OS_FUCHSIA +#include +#include +#endif // GTEST_OS_FUCHSIA + +#include "gtest/gtest-message.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "src/gtest-internal-inl.h" + +namespace testing { +namespace internal { + +#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD + +namespace { +template +T ReadProcFileField(const std::string& filename, int field) { + std::string dummy; + std::ifstream file(filename.c_str()); + while (field-- > 0) { + file >> dummy; + } + T output = 0; + file >> output; + return output; +} +} // namespace + +// Returns the number of active threads, or 0 when there is an error. +size_t GetThreadCount() { + const std::string filename = + (Message() << "/proc/" << getpid() << "/stat").GetString(); + return ReadProcFileField(filename, 19); +} + +#elif GTEST_OS_MAC + +size_t GetThreadCount() { + const task_t task = mach_task_self(); + mach_msg_type_number_t thread_count; + thread_act_array_t thread_list; + const kern_return_t status = task_threads(task, &thread_list, &thread_count); + if (status == KERN_SUCCESS) { + // task_threads allocates resources in thread_list and we need to free them + // to avoid leaks. + vm_deallocate(task, reinterpret_cast(thread_list), + sizeof(thread_t) * thread_count); + return static_cast(thread_count); + } else { + return 0; + } +} + +#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD + +#if GTEST_OS_NETBSD +#undef KERN_PROC +#define KERN_PROC KERN_PROC2 +#define kinfo_proc kinfo_proc2 +#endif + +#if GTEST_OS_DRAGONFLY +#define KP_NLWP(kp) (kp.kp_nthreads) +#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#define KP_NLWP(kp) (kp.ki_numthreads) +#elif GTEST_OS_NETBSD +#define KP_NLWP(kp) (kp.p_nlwps) +#endif + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID, + getpid(), +#if GTEST_OS_NETBSD + sizeof(struct kinfo_proc), + 1, +#endif + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + struct kinfo_proc info; + size_t size = sizeof(info); + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + return static_cast(KP_NLWP(info)); +} +#elif GTEST_OS_OPENBSD + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + + // get number of structs + size_t size; + if (sysctl(mib, miblen, NULL, &size, NULL, 0)) { + return 0; + } + + mib[5] = static_cast(size / static_cast(mib[4])); + + // populate array of structs + struct kinfo_proc info[mib[5]]; + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + + // exclude empty members + size_t nthreads = 0; + for (size_t i = 0; i < size / static_cast(mib[4]); i++) { + if (info[i].p_tid != -1) nthreads++; + } + return nthreads; +} + +#elif GTEST_OS_QNX + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + const int fd = open("/proc/self/as", O_RDONLY); + if (fd < 0) { + return 0; + } + procfs_info process_info; + const int status = + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr); + close(fd); + if (status == EOK) { + return static_cast(process_info.num_threads); + } else { + return 0; + } +} + +#elif GTEST_OS_AIX + +size_t GetThreadCount() { + struct procentry64 entry; + pid_t pid = getpid(); + int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1); + if (status == 1) { + return entry.pi_thcount; + } else { + return 0; + } +} + +#elif GTEST_OS_FUCHSIA + +size_t GetThreadCount() { + int dummy_buffer; + size_t avail; + zx_status_t status = + zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS, + &dummy_buffer, 0, nullptr, &avail); + if (status == ZX_OK) { + return avail; + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_LINUX + +#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} + +AutoHandle::AutoHandle(Handle handle) : handle_(handle) {} + +AutoHandle::~AutoHandle() { Reset(); } + +AutoHandle::Handle AutoHandle::Get() const { return handle_; } + +void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); } + +void AutoHandle::Reset(HANDLE handle) { + // Resetting with the same handle we already own is invalid. + if (handle_ != handle) { + if (IsCloseable()) { + ::CloseHandle(handle_); + } + handle_ = handle; + } else { + GTEST_CHECK_(!IsCloseable()) + << "Resetting a valid handle to itself is likely a programmer error " + "and thus not allowed."; + } +} + +bool AutoHandle::IsCloseable() const { + // Different Windows APIs may use either of these values to represent an + // invalid handle. + return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; +} + +Mutex::Mutex() + : owner_thread_id_(0), + type_(kDynamic), + critical_section_init_phase_(0), + critical_section_(new CRITICAL_SECTION) { + ::InitializeCriticalSection(critical_section_); +} + +Mutex::~Mutex() { + // Static mutexes are leaked intentionally. It is not thread-safe to try + // to clean them up. + if (type_ == kDynamic) { + ::DeleteCriticalSection(critical_section_); + delete critical_section_; + critical_section_ = nullptr; + } +} + +void Mutex::Lock() { + ThreadSafeLazyInit(); + ::EnterCriticalSection(critical_section_); + owner_thread_id_ = ::GetCurrentThreadId(); +} + +void Mutex::Unlock() { + ThreadSafeLazyInit(); + // We don't protect writing to owner_thread_id_ here, as it's the + // caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + owner_thread_id_ = 0; + ::LeaveCriticalSection(critical_section_); +} + +// Does nothing if the current thread holds the mutex. Otherwise, crashes +// with high probability. +void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +namespace { + +#ifdef _MSC_VER +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated { + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + } + + ~MemoryIsNotDeallocated() { + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + (void)_CrtSetDbgFlag(old_crtdbg_flag_); + } + + private: + int old_crtdbg_flag_; + + MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete; + MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete; +}; +#endif // _MSC_VER + +} // namespace + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + { + // Use RAII to flag that following mem alloc is never deallocated. +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + critical_section_ = new CRITICAL_SECTION; + } + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: + break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable* runnable, + Notification* thread_can_start) { + ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + HANDLE thread_handle = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. + GTEST_CHECK_(thread_handle != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; + if (thread_handle == nullptr) { + delete param; + } + return thread_handle; + } + + private: + struct ThreadMainParam { + ThreadMainParam(Runnable* runnable, Notification* thread_can_start) + : runnable_(runnable), thread_can_start_(thread_can_start) {} + std::unique_ptr runnable_; + // Does not own. + Notification* thread_can_start_; + }; + + static DWORD WINAPI ThreadMain(void* ptr) { + // Transfers ownership. + std::unique_ptr param(static_cast(ptr)); + if (param->thread_can_start_ != nullptr) + param->thread_can_start_->WaitForNotification(); + param->runnable_->Run(); + return 0; + } + + // Prohibit instantiation. + ThreadWithParamSupport(); + + ThreadWithParamSupport(const ThreadWithParamSupport&) = delete; + ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete; +}; + +} // namespace + +ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable, + Notification* thread_can_start) + : thread_( + ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {} + +ThreadWithParamBase::~ThreadWithParamBase() { Join(); } + +void ThreadWithParamBase::Join() { + GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) + << "Failed to join the thread with error " << ::GetLastError() << "."; +} + +// Maps a thread to a set of ThreadIdToThreadLocals that have values +// instantiated on that thread and notifies them when the thread exits. A +// ThreadLocal instance is expected to persist until all threads it has +// values on have terminated. +class ThreadLocalRegistryImpl { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + DWORD current_thread = ::GetCurrentThreadId(); + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(current_thread); + if (thread_local_pos == thread_to_thread_locals->end()) { + thread_local_pos = + thread_to_thread_locals + ->insert(std::make_pair(current_thread, ThreadLocalValues())) + .first; + StartWatcherThreadFor(current_thread); + } + ThreadLocalValues& thread_local_values = thread_local_pos->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos == thread_local_values.end()) { + value_pos = + thread_local_values + .insert(std::make_pair( + thread_local_instance, + std::shared_ptr( + thread_local_instance->NewValueForCurrentThread()))) + .first; + } + return value_pos->second.get(); + } + + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + std::vector > value_holders; + // Clean up the ThreadLocalValues data structure while holding the lock, but + // defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + for (ThreadIdToThreadLocals::iterator it = + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); ++it) { + ThreadLocalValues& thread_local_values = it->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos != thread_local_values.end()) { + value_holders.push_back(value_pos->second); + thread_local_values.erase(value_pos); + // This 'if' can only be successful at most once, so theoretically we + // could break out of the loop here, but we don't bother doing so. + } + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + static void OnThreadExit(DWORD thread_id) { + GTEST_CHECK_(thread_id != 0) << ::GetLastError(); + std::vector > value_holders; + // Clean up the ThreadIdToThreadLocals data structure while holding the + // lock, but defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(thread_id); + if (thread_local_pos != thread_to_thread_locals->end()) { + ThreadLocalValues& thread_local_values = thread_local_pos->second; + for (ThreadLocalValues::iterator value_pos = + thread_local_values.begin(); + value_pos != thread_local_values.end(); ++value_pos) { + value_holders.push_back(value_pos->second); + } + thread_to_thread_locals->erase(thread_local_pos); + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + private: + // In a particular thread, maps a ThreadLocal object to its value. + typedef std::map > + ThreadLocalValues; + // Stores all ThreadIdToThreadLocals having values in a thread, indexed by + // thread's ID. + typedef std::map ThreadIdToThreadLocals; + + // Holds the thread id and thread handle that we pass from + // StartWatcherThreadFor to WatcherThreadFunc. + typedef std::pair ThreadIdAndHandle; + + static void StartWatcherThreadFor(DWORD thread_id) { + // The returned handle will be kept in thread_map and closed by + // watcher_thread in WatcherThreadFunc. + HANDLE thread = + ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); + GTEST_CHECK_(thread != nullptr); + // We need to pass a valid thread ID pointer into CreateThread for it + // to work correctly under Win98. + DWORD watcher_thread_id; + HANDLE watcher_thread = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size + &ThreadLocalRegistryImpl::WatcherThreadFunc, + reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), + CREATE_SUSPENDED, &watcher_thread_id); + GTEST_CHECK_(watcher_thread != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; + // Give the watcher thread the same priority as ours to avoid being + // blocked by it. + ::SetThreadPriority(watcher_thread, + ::GetThreadPriority(::GetCurrentThread())); + ::ResumeThread(watcher_thread); + ::CloseHandle(watcher_thread); + } + + // Monitors exit from a given thread and notifies those + // ThreadIdToThreadLocals about thread termination. + static DWORD WINAPI WatcherThreadFunc(LPVOID param) { + const ThreadIdAndHandle* tah = + reinterpret_cast(param); + GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + OnThreadExit(tah->first); + ::CloseHandle(tah->second); + delete tah; + return 0; + } + + // Returns map of thread local instances. + static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { + mutex_.AssertHeld(); +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals(); + return map; + } + + // Protects access to GetThreadLocalsMapLocked() and its return value. + static Mutex mutex_; + // Protects access to GetThreadMapLocked() and its return value. + static Mutex thread_map_mutex_; +}; + +Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_( + Mutex::kStaticMutex); // NOLINT + +ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + return ThreadLocalRegistryImpl::GetValueOnCurrentThread( + thread_local_instance); +} + +void ThreadLocalRegistry::OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); +} + +#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. + +RE::~RE() { + if (is_valid_) { + // regfree'ing an invalid regex might crash because the content + // of the regex is undefined. Since the regex's are essentially + // the same, one cannot be valid (or invalid) without the other + // being so too. + regfree(&partial_regex_); + regfree(&full_regex_); + } + free(const_cast(pattern_)); +} + +// Returns true if and only if regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.full_regex_, str, 1, &match, 0) == 0; +} + +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = posix::StrDup(regex); + + // Reserves enough bytes to hold the regular expression used for a + // full match. + const size_t full_regex_len = strlen(regex) + 10; + char* const full_pattern = new char[full_regex_len]; + + snprintf(full_pattern, full_regex_len, "^(%s)$", regex); + is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; + // We want to call regcomp(&partial_regex_, ...) even if the + // previous expression returns false. Otherwise partial_regex_ may + // not be properly initialized can may cause trouble when it's + // freed. + // + // Some implementation of POSIX regex (e.g. on at least some + // versions of Cygwin) doesn't accept the empty string as a valid + // regex. We change it to an equivalent form "()" to be safe. + if (is_valid_) { + const char* const partial_regex = (*regex == '\0') ? "()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true if and only if ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != nullptr; +} + +// Returns true if and only if ch belongs to the given classification. +// Unlike similar functions in , these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true if and only if "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true if and only if the given atom (specified by escaped and +// pattern) matches ch. The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': + return IsAsciiDigit(ch); + case 'D': + return !IsAsciiDigit(ch); + case 'f': + return ch == '\f'; + case 'n': + return ch == '\n'; + case 'r': + return ch == '\r'; + case 's': + return IsAsciiWhiteSpace(ch); + case 'S': + return !IsAsciiWhiteSpace(ch); + case 't': + return ch == '\t'; + case 'v': + return ch == '\v'; + case 'w': + return IsAsciiWordChar(ch); + case 'W': + return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' && ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +static std::string FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ") + .GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. +bool ValidateRegex(const char* regex) { + if (regex == nullptr) { + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True if and only if ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. + const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat, + const char* regex, const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : static_cast(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. + return true; + } + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false; + } + return false; +} + +// Returns true if and only if regex matches a prefix of str. regex must +// be a valid simple regular expression and not start with "^", or the +// result is undefined. +bool MatchRegexAtHead(const char* regex, const char* str) { + if (*regex == '\0') // An empty regex matches a prefix of anything. + return true; + + // "$" only matches the end of a string. Note that regex being + // valid guarantees that there's nothing after "$" in it. + if (*regex == '$') return *str == '\0'; + + // Is the first thing in regex an escape sequence? + const bool escaped = *regex == '\\'; + if (escaped) ++regex; + if (IsRepeat(regex[1])) { + // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so + // here's an indirect recursion. It terminates as the regex gets + // shorter in each recursion. + return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2, + str); + } else { + // regex isn't empty, isn't "$", and doesn't start with a + // repetition. We match the first atom of regex with the first + // character of str and recurse. + return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && + MatchRegexAtHead(regex + 1, str + 1); + } +} + +// Returns true if and only if regex matches any substring of str. regex must +// be a valid simple regular expression, or the result is undefined. +// +// The algorithm is recursive, but the recursion depth doesn't exceed +// the regex length, so we won't need to worry about running out of +// stack space normally. In rare cases the time complexity can be +// exponential with respect to the regex length + the string length, +// but usually it's must faster (often close to linear). +bool MatchRegexAnywhere(const char* regex, const char* str) { + if (regex == nullptr || str == nullptr) return false; + + if (*regex == '^') return MatchRegexAtHead(regex + 1, str); + + // A successful match can be anywhere in str. + do { + if (MatchRegexAtHead(regex, str)) return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast(pattern_)); + free(const_cast(full_pattern_)); +} + +// Returns true if and only if regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = nullptr; + if (regex != nullptr) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const std::string file_name(file == nullptr ? kUnknownFile : file); + + if (line < 0) { + return file_name + ":"; + } +#ifdef _MSC_VER + return file_name + "(" + StreamableToString(line) + "):"; +#else + return file_name + ":" + StreamableToString(line) + ":"; +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line) { + const std::string file_name(file == nullptr ? kUnknownFile : file); + + if (line < 0) + return file_name; + else + return file_name + ":" + StreamableToString(line); +} + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = severity == GTEST_INFO ? "[ INFO ]" + : severity == GTEST_WARNING ? "[WARNING]" + : severity == GTEST_ERROR ? "[ ERROR ]" + : "[ FATAL ]"; + GetStream() << ::std::endl + << marker << " " << FormatFileLocation(file, line).c_str() + << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} + +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. + explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { +#if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT + char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) + << "Unable to open temporary file " << temp_file_path; + filename_ = temp_file_path; +#else + // There's no guarantee that a test has write access to the current + // directory, so we create the temporary file in a temporary directory. + std::string name_template; + +#if GTEST_OS_LINUX_ANDROID + // Note: Android applications are expected to call the framework's + // Context.getExternalStorageDirectory() method through JNI to get + // the location of the world-writable SD Card directory. However, + // this requires a Context handle, which cannot be retrieved + // globally from native code. Doing so also precludes running the + // code as part of a regular standalone executable, which doesn't + // run in a Dalvik process (e.g. when running it through 'adb shell'). + // + // The location /data/local/tmp is directly accessible from native code. + // '/sdcard' and other variants cannot be relied on, as they are not + // guaranteed to be mounted, or may have a delay in mounting. + name_template = "/data/local/tmp/"; +#elif GTEST_OS_IOS + char user_temp_dir[PATH_MAX + 1]; + + // Documented alternative to NSTemporaryDirectory() (for obtaining creating + // a temporary directory) at + // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10 + // + // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not + // documented in the confstr() man page at + // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr + // but are still available, according to the WebKit patches at + // https://trac.webkit.org/changeset/262004/webkit + // https://trac.webkit.org/changeset/263705/webkit + // + // The confstr() implementation falls back to getenv("TMPDIR"). See + // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html + ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir)); + + name_template = user_temp_dir; + if (name_template.back() != GTEST_PATH_SEP_[0]) + name_template.push_back(GTEST_PATH_SEP_[0]); +#else + name_template = "/tmp/"; +#endif + name_template.append("gtest_captured_stream.XXXXXX"); + + // mkstemp() modifies the string bytes in place, and does not go beyond the + // string's length. This results in well-defined behavior in C++17. + // + // The const_cast is needed below C++17. The constraints on std::string + // implementations in C++11 and above make assumption behind the const_cast + // fairly safe. + const int captured_fd = ::mkstemp(const_cast(name_template.data())); + if (captured_fd == -1) { + GTEST_LOG_(WARNING) + << "Failed to create tmp file " << name_template + << " for test; does the test have access to the /tmp directory?"; + } + filename_ = std::move(name_template); +#endif // GTEST_OS_WINDOWS + fflush(nullptr); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { remove(filename_.c_str()); } + + std::string GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(nullptr); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + if (file == nullptr) { + GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_ + << " for capturing stream."; + } + const std::string content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + CapturedStream(const CapturedStream&) = delete; + CapturedStream& operator=(const CapturedStream&) = delete; +}; + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +static CapturedStream* g_captured_stderr = nullptr; +static CapturedStream* g_captured_stdout = nullptr; + +// Starts capturing an output stream (stdout/stderr). +static void CaptureStream(int fd, const char* stream_name, + CapturedStream** stream) { + if (*stream != nullptr) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +static std::string GetCapturedStream(CapturedStream** captured_stream) { + const std::string content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = nullptr; + + return content; +} + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // defined(_MSC_VER) || defined(__BORLANDC__) + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. +std::string GetCapturedStdout() { + return GetCapturedStream(&g_captured_stdout); +} + +// Stops capturing stderr and returns the captured string. +std::string GetCapturedStderr() { + return GetCapturedStream(&g_captured_stderr); +} + +#endif // GTEST_HAS_STREAM_REDIRECTION + +size_t GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast(ftell(file)); +} + +std::string ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. + do { + bytes_last_read = + fread(buffer + bytes_read, 1, file_size - bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const std::string content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +#if GTEST_HAS_DEATH_TEST +static const std::vector* g_injected_test_argvs = + nullptr; // Owned. + +std::vector GetInjectableArgvs() { + if (g_injected_test_argvs != nullptr) { + return *g_injected_test_argvs; + } + return GetArgvs(); +} + +void SetInjectableArgvs(const std::vector* new_argvs) { + if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs; + g_injected_test_argvs = new_argvs; +} + +void SetInjectableArgvs(const std::vector& new_argvs) { + SetInjectableArgvs( + new std::vector(new_argvs.begin(), new_argvs.end())); +} + +void ClearInjectableArgvs() { + delete g_injected_test_argvs; + g_injected_test_argvs = nullptr; +} +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { + // Parses the environment variable as a decimal integer. + char* end = nullptr; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an int32_t? + const auto result = static_cast(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an int32_t. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true if and only if it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { +#if defined(GTEST_GET_BOOL_FROM_ENV_) + return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == nullptr ? default_value + : strcmp(string_value, "0") != 0; +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { +#if defined(GTEST_GET_INT32_FROM_ENV_) + return GTEST_GET_INT32_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == nullptr) { + // The environment variable is not set. + return default_value; + } + + int32_t result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, + &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +#endif // defined(GTEST_GET_INT32_FROM_ENV_) +} + +// As a special case for the 'output' flag, if GTEST_OUTPUT is not +// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build +// system. The value of XML_OUTPUT_FILE is a filename without the +// "xml:" prefix of GTEST_OUTPUT. +// Note that this is meant to be called at the call site so it does +// not check that the flag is 'output' +// In essence this checks an env variable called XML_OUTPUT_FILE +// and if it is set we prepend "xml:" to its value, if it not set we return "" +std::string OutputFlagAlsoCheckEnvVar() { + std::string default_value_for_output_flag = ""; + const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); + if (nullptr != xml_output_file_env) { + default_value_for_output_flag = std::string("xml:") + xml_output_file_env; + } + return default_value_for_output_flag; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +const char* StringFromGTestEnv(const char* flag, const char* default_value) { +#if defined(GTEST_GET_STRING_FROM_ENV_) + return GTEST_GET_STRING_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const value = posix::GetEnv(env_var.c_str()); + return value == nullptr ? default_value : value; +#endif // defined(GTEST_GET_STRING_FROM_ENV_) +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc new file mode 100644 index 0000000000..f3976d230d --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -0,0 +1,553 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include "gtest/gtest-printers.h" + +#include + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +namespace { + +using ::std::ostream; + +// Prints a segment of bytes in the given object. +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. + const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +// Helpers for widening a character to char32_t. Since the standard does not +// specify if char / wchar_t is signed or unsigned, it is important to first +// convert it to the unsigned type of the same width before widening it to +// char32_t. +template +char32_t ToChar32(CharType in) { + return static_cast( + static_cast::type>(in)); +} + +} // namespace + +namespace internal { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), +// - as a hexadecimal escape sequence (e.g. '\x7F'), or +// - as a special escape sequence (e.g. '\r', '\n'). +enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; + +// Returns true if c is a printable ASCII character. We test the +// value of c directly instead of calling isprint(), which is buggy on +// Windows Mobile. +inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; } + +// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a +// character literal without the quotes, escaping it when necessary; returns how +// c was formatted. +template +static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { + const char32_t u_c = ToChar32(c); + switch (u_c) { + case L'\0': + *os << "\\0"; + break; + case L'\'': + *os << "\\'"; + break; + case L'\\': + *os << "\\\\"; + break; + case L'\a': + *os << "\\a"; + break; + case L'\b': + *os << "\\b"; + break; + case L'\f': + *os << "\\f"; + break; + case L'\n': + *os << "\\n"; + break; + case L'\r': + *os << "\\r"; + break; + case L'\t': + *os << "\\t"; + break; + case L'\v': + *os << "\\v"; + break; + default: + if (IsPrintableAscii(u_c)) { + *os << static_cast(c); + return kAsIs; + } else { + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase << static_cast(u_c); + os->flags(flags); + return kHexEscape; + } + } + return kSpecialEscape; +} + +// Prints a char32_t c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { + switch (c) { + case L'\'': + *os << "'"; + return kAsIs; + case L'"': + *os << "\\\""; + return kSpecialEscape; + default: + return PrintAsCharLiteralTo(c, os); + } +} + +static const char* GetCharWidthPrefix(char) { return ""; } + +static const char* GetCharWidthPrefix(signed char) { return ""; } + +static const char* GetCharWidthPrefix(unsigned char) { return ""; } + +#ifdef __cpp_char8_t +static const char* GetCharWidthPrefix(char8_t) { return "u8"; } +#endif + +static const char* GetCharWidthPrefix(char16_t) { return "u"; } + +static const char* GetCharWidthPrefix(char32_t) { return "U"; } + +static const char* GetCharWidthPrefix(wchar_t) { return "L"; } + +// Prints a char c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +#ifdef __cpp_char8_t +static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} +#endif + +static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t) +// and its code. '\0' is printed as "'\\0'", other unprintable characters are +// also properly escaped using the standard C++ escape sequence. +template +void PrintCharAndCodeTo(Char c, ostream* os) { + // First, print c as a literal in the most readable form we can find. + *os << GetCharWidthPrefix(c) << "'"; + const CharFormat format = PrintAsCharLiteralTo(c, os); + *os << "'"; + + // To aid user debugging, we also print c's code in decimal, unless + // it's 0 (in which case c was printed as '\\0', making the code + // obvious). + if (c == 0) return; + *os << " (" << static_cast(c); + + // For more convenience, we print c's code again in hexadecimal, + // unless c was already printed in the form '\x##' or the code is in + // [1, 9]. + if (format == kHexEscape || (1 <= c && c <= 9)) { + // Do nothing. + } else { + *os << ", 0x" << String::FormatHexInt(static_cast(c)); + } + *os << ")"; +} + +void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } +void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } + +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its code. L'\0' is printed as "L'\\0'". +void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); } + +// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well. +void PrintTo(char32_t c, ::std::ostream* os) { + *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4) + << static_cast(c); +} + +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +void PrintTo(__uint128_t v, ::std::ostream* os) { + if (v == 0) { + *os << "0"; + return; + } + + // Buffer large enough for ceil(log10(2^128))==39 and the null terminator + char buf[40]; + char* p = buf + sizeof(buf); + + // Some configurations have a __uint128_t, but no support for built in + // division. Do manual long division instead. + + uint64_t high = static_cast(v >> 64); + uint64_t low = static_cast(v); + + *--p = 0; + while (high != 0 || low != 0) { + uint64_t high_mod = high % 10; + high = high / 10; + // This is the long division algorithm specialized for a divisor of 10 and + // only two elements. + // Notable values: + // 2^64 / 10 == 1844674407370955161 + // 2^64 % 10 == 6 + const uint64_t carry = 6 * high_mod + low % 10; + low = low / 10 + high_mod * 1844674407370955161 + carry / 10; + + char digit = static_cast(carry % 10); + *--p = '0' + digit; + } + *os << p; +} +void PrintTo(__int128_t v, ::std::ostream* os) { + __uint128_t uv = static_cast<__uint128_t>(v); + if (v < 0) { + *os << "-"; + uv = -uv; + } + PrintTo(uv, os); +} +#endif // __SIZEOF_INT128__ + +// Prints the given array of characters to the ostream. CharType must be either +// char, char8_t, char16_t, char32_t, or wchar_t. +// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) { + const char* const quote_prefix = GetCharWidthPrefix(*begin); + *os << quote_prefix << "\""; + bool is_previous_hex = false; + CharFormat print_format = kAsIs; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << quote_prefix << "\""; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } + } + *os << "\""; + return print_format; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType* begin, size_t len, + ostream* os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +#ifdef __cpp_char8_t +// Prints a (const) char8_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} +#endif + +// Prints a (const) char16_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) char32_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +namespace { + +// Prints a null-terminated C-style string to the ostream. +template +void PrintCStringTo(const Char* s, ostream* os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, std::char_traits::length(s), os); + } +} + +} // anonymous namespace + +void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); } + +#ifdef __cpp_char8_t +void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); } +#endif + +void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); } + +void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); } + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } +#endif // wchar_t is native + +namespace { + +bool ContainsUnprintableControlCodes(const char* str, size_t length) { + const unsigned char* s = reinterpret_cast(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': + break; + default: + return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } + +bool IsValidUTF8(const char* str, size_t length) { + const unsigned char* s = reinterpret_cast(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) { + *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + +void PrintStringTo(const ::std::string& s, ostream* os) { + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG_GET(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } +} + +#ifdef __cpp_char8_t +void PrintU8StringTo(const ::std::u8string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif + +void PrintU16StringTo(const ::std::u16string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +void PrintU32StringTo(const ::std::u32string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc new file mode 100644 index 0000000000..eb7c8d1cf9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -0,0 +1,105 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == nullptr ? message : std::string(message, stack_trace); +} + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os << internal::FormatFileLocation(result.file_name(), + result.line_number()) + << " " + << (result.type() == TestPartResult::kSuccess ? "Success" + : result.type() == TestPartResult::kSkip ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") + << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[static_cast(index)]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc new file mode 100644 index 0000000000..a2828b83c6 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -0,0 +1,104 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/gtest-typed-test.h" + +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) str++; + return str; +} + +static std::vector SplitIntoTestNames(const char* src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != nullptr; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestSuitePState::VerifyRegisteredTestNames( + const char* test_suite_name, const char* file, int line, + const char* registered_tests) { + RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line)); + + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string& name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + if (registered_tests_.count(name) != 0) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test suite.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc new file mode 100644 index 0000000000..6f31dd2260 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc @@ -0,0 +1,6795 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/custom/gtest.h" + +#if GTEST_OS_LINUX + +#include // NOLINT +#include // NOLINT +#include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#include + +#elif GTEST_OS_ZOS +#include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +#include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +#include // NOLINT +#undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +#include // NOLINT +#undef min + +#ifdef _MSC_VER +#include // NOLINT +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#if GTEST_OS_WINDOWS_MINGW +#include // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW + +#else + +// cpplint thinks that the header is already included, so we want to +// silence it. +#include // NOLINT +#include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#endif + +#include "src/gtest-internal-inl.h" + +#if GTEST_OS_WINDOWS +#define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/flags/parse.h" +#include "absl/flags/usage.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test suite name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test suite whose name matches this filter is considered a death +// test suite and will be run before test suites whose name doesn't +// match this filter. +static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +bool g_help_flag = false; + +// Utility function to Open File for Writing +static FILE* OpenFileForWriting(const std::string& output_file) { + FILE* fileout = nullptr; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == nullptr) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + +} // namespace internal + +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. +static const char* GetDefaultFilter() { + const char* const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); + if (testbridge_test_only != nullptr) { + return testbridge_test_only; + } + return kUniversalFilter; +} + +// Bazel passes in the argument to '--test_runner_fail_fast' via the +// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable. +static bool GetDefaultFailFast() { + const char* const testbridge_test_runner_fail_fast = + internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST"); + if (testbridge_test_runner_fail_fast != nullptr) { + return strcmp(testbridge_test_runner_fail_fast, "1") == 0; + } + return false; +} + +} // namespace testing + +GTEST_DEFINE_bool_( + fail_fast, + testing::internal::BoolFromGTestEnv("fail_fast", + testing::GetDefaultFailFast()), + "True if and only if a test failure should stop further test execution."); + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + testing::internal::BoolFromGTestEnv("break_on_failure", false), + "True if and only if a failed assertion should be a debugger " + "break-point."); + +GTEST_DEFINE_bool_(catch_exceptions, + testing::internal::BoolFromGTestEnv("catch_exceptions", + true), + "True if and only if " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, testing::internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, + testing::internal::StringFromGTestEnv("filter", + testing::GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_( + install_failure_signal_handler, + testing::internal::BoolFromGTestEnv("install_failure_signal_handler", + false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); + +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' +GTEST_DEFINE_string_( + output, + testing::internal::StringFromGTestEnv( + "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + brief, testing::internal::BoolFromGTestEnv("brief", false), + "True if only test failures should be displayed in text output."); + +GTEST_DEFINE_bool_(print_time, + testing::internal::BoolFromGTestEnv("print_time", true), + "True if and only if " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_bool_(print_utf8, + testing::internal::BoolFromGTestEnv("print_utf8", true), + "True if and only if " GTEST_NAME_ + " prints UTF8 characters as text."); + +GTEST_DEFINE_int32_( + random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, testing::internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + recreate_environments_when_repeating, + testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating", + false), + "Controls whether global test environments are recreated for each repeat " + "of the tests. If set to false the global test environments are only set " + "up once, for the first iteration, and only torn down once, for the last. " + "Useful for shaking out flaky tests with stable, expensive test " + "environments. If --gtest_repeat is set to a negative number, meaning " + "there is no last run, the environments will always be recreated to avoid " + "leaks."); + +GTEST_DEFINE_bool_(show_internal_stack_frames, false, + "True if and only if " GTEST_NAME_ + " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_(shuffle, + testing::internal::BoolFromGTestEnv("shuffle", false), + "True if and only if " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + testing::internal::Int32FromGTestEnv("stack_trace_depth", + testing::kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + testing::internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + testing::internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise. For use with an external test framework."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, testing::internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace testing { +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +uint32_t Random::Generate(uint32_t range) { + // These constants are the same as are used in glibc's rand(3). + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true if and only if the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +static bool GTestIsInitialized() { return GetArgvs().size() > 0; } + +// Iterates over a vector of TestSuites, keeping a running sum of the +// results of calling a given int-returning method on each. +// Returns the sum. +static int SumOverTestSuiteList(const std::vector& case_list, + int (TestSuite::*method)() const) { + int sum = 0; + for (size_t i = 0; i < case_list.size(); i++) { + sum += (case_list[i]->*method)(); + } + return sum; +} + +// Returns true if and only if the test suite passed. +static bool TestSuitePassed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Passed(); +} + +// Returns true if and only if the test suite failed. +static bool TestSuiteFailed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Failed(); +} + +// Returns true if and only if test_suite contains at least one test that +// should run. +static bool ShouldRunTestSuite(const TestSuite* test_suite) { + return test_suite->should_run(); +} + +// AssertHelper constructor. +AssertHelper::AssertHelper(TestPartResult::Type type, const char* file, + int line, const char* message) + : data_(new AssertHelperData(type, file, line, message)) {} + +AssertHelper::~AssertHelper() { delete data_; } + +// Message assignment, for assertion streaming support. +void AssertHelper::operator=(const Message& message) const { + UnitTest::GetInstance()->AddTestPartResult( + data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT +} + +namespace { + +// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P +// to creates test cases for it, a synthetic test case is +// inserted to report ether an error or a log message. +// +// This configuration bit will likely be removed at some point. +constexpr bool kErrorOnUninstantiatedParameterizedTest = true; +constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true; + +// A test that fails at a given file/line location with a given message. +class FailureTest : public Test { + public: + explicit FailureTest(const CodeLocation& loc, std::string error_message, + bool as_error) + : loc_(loc), + error_message_(std::move(error_message)), + as_error_(as_error) {} + + void TestBody() override { + if (as_error_) { + AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(), + loc_.line, "") = Message() << error_message_; + } else { + std::cout << error_message_ << std::endl; + } + } + + private: + const CodeLocation loc_; + const std::string error_message_; + const bool as_error_; +}; + +} // namespace + +std::set* GetIgnoredParameterizedTestSuites() { + return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites(); +} + +// Add a given test_suit to the list of them allow to go un-instantiated. +MarkAsIgnored::MarkAsIgnored(const char* test_suite) { + GetIgnoredParameterizedTestSuites()->insert(test_suite); +} + +// If this parameterized test suite has no instantiations (and that +// has not been marked as okay), emit a test case reporting that. +void InsertSyntheticTestCase(const std::string& name, CodeLocation location, + bool has_test_p) { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + if (ignored.find(name) != ignored.end()) return; + + const char kMissingInstantiation[] = // + " is defined via TEST_P, but never instantiated. None of the test cases " + "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only " + "ones provided expand to nothing." + "\n\n" + "Ideally, TEST_P definitions should only ever be included as part of " + "binaries that intend to use them. (As opposed to, for example, being " + "placed in a library that may be linked in to get other utilities.)"; + + const char kMissingTestCase[] = // + " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are " + "defined via TEST_P . No test cases will run." + "\n\n" + "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from " + "code that always depend on code that provides TEST_P. Failing to do " + "so is often an indication of dead code, e.g. the last TEST_P was " + "removed but the rest got left behind."; + + std::string message = + "Parameterized test suite " + name + + (has_test_p ? kMissingInstantiation : kMissingTestCase) + + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is defined in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + name + ");"; + + std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + location.file.c_str(), location.line, [message, location] { + return new FailureTest(location, message, + kErrorOnUninstantiatedParameterizedTest); + }); +} + +void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location) { + GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite( + test_suite_name, code_location); +} + +void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { + GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation( + case_name); +} + +void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( + const char* test_suite_name, CodeLocation code_location) { + suites_.emplace(std::string(test_suite_name), + TypeParameterizedTestSuiteInfo(code_location)); +} + +void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( + const char* test_suite_name) { + auto it = suites_.find(std::string(test_suite_name)); + if (it != suites_.end()) { + it->second.instantiated = true; + } else { + GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '" + << test_suite_name << "'"; + } +} + +void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + for (const auto& testcase : suites_) { + if (testcase.second.instantiated) continue; + if (ignored.find(testcase.first) != ignored.end()) continue; + + std::string message = + "Type parameterized test suite " + testcase.first + + " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated " + "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run." + "\n\n" + "Ideally, TYPED_TEST_P definitions should only ever be included as " + "part of binaries that intend to use them. (As opposed to, for " + "example, being placed in a library that may be linked in to get other " + "utilities.)" + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is defined in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + testcase.first + ");"; + + std::string full_name = + "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + testcase.second.code_location.file.c_str(), + testcase.second.code_location.line, [message, testcase] { + return new FailureTest(testcase.second.code_location, message, + kErrorOnUninstantiatedTypeParameterizedTest); + }); + } +} + +// A copy of all command line arguments. Set by InitGoogleTest(). +static ::std::vector g_argvs; + +::std::vector GetArgvs() { +#if defined(GTEST_CUSTOM_GET_ARGVS_) + // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or + // ::string. This code converts it to the appropriate type. + const auto& custom = GTEST_CUSTOM_GET_ARGVS_(); + return ::std::vector(custom.begin(), custom.end()); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) + return g_argvs; +#endif // defined(GTEST_CUSTOM_GET_ARGVS_) +} + +// Returns the current application's name, removing directory path if that +// is present. +FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS || GTEST_OS_OS2 + result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe")); +#else + result.Set(FilePath(GetArgvs()[0])); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. + +// Returns the output format, or "" for normal printed output. +std::string UnitTestOptions::GetOutputFormat() { + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == nullptr) + ? std::string(gtest_output_flag) + : std::string(gtest_output_flag, + static_cast(colon - gtest_output_flag)); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +std::string UnitTestOptions::GetAbsolutePathToOutputFile() { + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); + + std::string format = GetOutputFormat(); + if (format.empty()) format = std::string(kDefaultOutputFormat); + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == nullptr) + return internal::FilePath::MakeFileName( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile), 0, format.c_str()) + .string(); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + output_name = internal::FilePath::ConcatPaths( + internal::FilePath(UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(colon + 1)); + + if (!output_name.IsDirectory()) return output_name.string(); + + internal::FilePath result(internal::FilePath::GenerateUniqueFileName( + output_name, internal::GetCurrentExecutableName(), + GetOutputFormat().c_str())); + return result.string(); +} + +// Returns true if and only if the wildcard pattern matches the string. Each +// pattern consists of regular characters, single-character wildcards (?), and +// multi-character wildcards (*). +// +// This function implements a linear-time string globbing algorithm based on +// https://research.swtch.com/glob. +static bool PatternMatchesString(const std::string& name_str, + const char* pattern, const char* pattern_end) { + const char* name = name_str.c_str(); + const char* const name_begin = name; + const char* const name_end = name + name_str.size(); + + const char* pattern_next = pattern; + const char* name_next = name; + + while (pattern < pattern_end || name < name_end) { + if (pattern < pattern_end) { + switch (*pattern) { + default: // Match an ordinary character. + if (name < name_end && *name == *pattern) { + ++pattern; + ++name; + continue; + } + break; + case '?': // Match any single character. + if (name < name_end) { + ++pattern; + ++name; + continue; + } + break; + case '*': + // Match zero or more characters. Start by skipping over the wildcard + // and matching zero characters from name. If that fails, restart and + // match one more character than the last attempt. + pattern_next = pattern; + name_next = name + 1; + ++pattern; + continue; + } + } + // Failed to match a character. Restart if possible. + if (name_begin < name_next && name_next <= name_end) { + pattern = pattern_next; + name = name_next; + continue; + } + return false; + } + return true; +} + +namespace { + +bool IsGlobPattern(const std::string& pattern) { + return std::any_of(pattern.begin(), pattern.end(), + [](const char c) { return c == '?' || c == '*'; }); +} + +class UnitTestFilter { + public: + UnitTestFilter() = default; + + // Constructs a filter from a string of patterns separated by `:`. + explicit UnitTestFilter(const std::string& filter) { + // By design "" filter matches "" string. + std::vector all_patterns; + SplitString(filter, ':', &all_patterns); + const auto exact_match_patterns_begin = std::partition( + all_patterns.begin(), all_patterns.end(), &IsGlobPattern); + + glob_patterns_.reserve(static_cast( + std::distance(all_patterns.begin(), exact_match_patterns_begin))); + std::move(all_patterns.begin(), exact_match_patterns_begin, + std::inserter(glob_patterns_, glob_patterns_.begin())); + std::move( + exact_match_patterns_begin, all_patterns.end(), + std::inserter(exact_match_patterns_, exact_match_patterns_.begin())); + } + + // Returns true if and only if name matches at least one of the patterns in + // the filter. + bool MatchesName(const std::string& name) const { + return exact_match_patterns_.count(name) > 0 || + std::any_of(glob_patterns_.begin(), glob_patterns_.end(), + [&name](const std::string& pattern) { + return PatternMatchesString( + name, pattern.c_str(), + pattern.c_str() + pattern.size()); + }); + } + + private: + std::vector glob_patterns_; + std::unordered_set exact_match_patterns_; +}; + +class PositiveAndNegativeUnitTestFilter { + public: + // Constructs a positive and a negative filter from a string. The string + // contains a positive filter optionally followed by a '-' character and a + // negative filter. In case only a negative filter is provided the positive + // filter will be assumed "*". + // A filter is a list of patterns separated by ':'. + explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) { + std::vector positive_and_negative_filters; + + // NOTE: `SplitString` always returns a non-empty container. + SplitString(filter, '-', &positive_and_negative_filters); + const auto& positive_filter = positive_and_negative_filters.front(); + + if (positive_and_negative_filters.size() > 1) { + positive_filter_ = UnitTestFilter( + positive_filter.empty() ? kUniversalFilter : positive_filter); + + // TODO(b/214626361): Fail on multiple '-' characters + // For the moment to preserve old behavior we concatenate the rest of the + // string parts with `-` as separator to generate the negative filter. + auto negative_filter_string = positive_and_negative_filters[1]; + for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++) + negative_filter_string = + negative_filter_string + '-' + positive_and_negative_filters[i]; + negative_filter_ = UnitTestFilter(negative_filter_string); + } else { + // In case we don't have a negative filter and positive filter is "" + // we do not use kUniversalFilter by design as opposed to when we have a + // negative filter. + positive_filter_ = UnitTestFilter(positive_filter); + } + } + + // Returns true if and only if test name (this is generated by appending test + // suit name and test name via a '.' character) matches the positive filter + // and does not match the negative filter. + bool MatchesTest(const std::string& test_suite_name, + const std::string& test_name) const { + return MatchesName(test_suite_name + "." + test_name); + } + + // Returns true if and only if name matches the positive filter and does not + // match the negative filter. + bool MatchesName(const std::string& name) const { + return positive_filter_.MatchesName(name) && + !negative_filter_.MatchesName(name); + } + + private: + UnitTestFilter positive_filter_; + UnitTestFilter negative_filter_; +}; +} // namespace + +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + return UnitTestFilter(filter).MatchesName(name_str); +} + +// Returns true if and only if the user-specified filter matches the test +// suite name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name) { + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter)) + .MatchesTest(test_suite_name, test_name); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). + const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG_GET(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { return GetTypeId(); } + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. +static AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const std::string& substr) { + const std::string expected(type == TestPartResult::kFatalFailure + ? "1 fatal failure" + : "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == nullptr) { + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const std::string& substr) + : results_(results), type_(type), substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test suites. +int UnitTestImpl::successful_test_suite_count() const { + return CountIf(test_suites_, TestSuitePassed); +} + +// Gets the number of failed test suites. +int UnitTestImpl::failed_test_suite_count() const { + return CountIf(test_suites_, TestSuiteFailed); +} + +// Gets the number of all test suites. +int UnitTestImpl::total_test_suite_count() const { + return static_cast(test_suites_.size()); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTestImpl::test_suite_to_run_count() const { + return CountIf(test_suites_, ShouldRunTestSuite); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count); +} + +// Gets the number of skipped tests. +int UnitTestImpl::skipped_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, + &TestSuite::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// A helper class for measuring elapsed times. +class Timer { + public: + Timer() : start_(std::chrono::steady_clock::now()) {} + + // Return time elapsed in milliseconds since the timer was created. + TimeInMillis Elapsed() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_) + .count(); + } + + private: + std::chrono::steady_clock::time_point start_; +}; + +// Returns a timestamp as milliseconds since the epoch. Note this time may jump +// around subject to adjustments by the system, to measure elapsed time use +// Timer instead. +TimeInMillis GetTimeInMillis() { + return std::chrono::duration_cast( + std::chrono::system_clock::now() - + std::chrono::system_clock::from_time_t(0)) + .count(); +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return nullptr; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return nullptr; + const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, + 0, nullptr, nullptr); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr, + nullptr); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true if and only if they have the same +// content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char* lhs, const char* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + for (size_t i = 0; i != length;) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING + +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. +Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message& Message::operator<<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message& Message::operator<<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator<<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector& left, + const std::vector& right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. + costs[l_i + 1][r_i + 1] = costs[l_i][r_i]; + best_move[l_i + 1][r_i + 1] = kMatch; + continue; + } + + const double add = costs[l_i + 1][r_i]; + const double remove = costs[l_i][r_i + 1]; + const double replace = costs[l_i][r_i]; + if (add < remove && add < replace) { + costs[l_i + 1][r_i + 1] = add + 1; + best_move[l_i + 1][r_i + 1] = kAdd; + } else if (remove < add && remove < replace) { + costs[l_i + 1][r_i + 1] = remove + 1; + best_move[l_i + 1][r_i + 1] = kRemove; + } else { + // We make replace a little more expensive than add/remove to lower + // their priority. + costs[l_i + 1][r_i + 1] = replace + 1.00001; + best_move[l_i + 1][r_i + 1] = kReplace; + } + } + } + + // Reconstruct the best path. We do it in reverse order. + std::vector best_path; + for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) { + EditType move = best_move[l_i][r_i]; + best_path.push_back(move); + l_i -= move != kAdd; + r_i -= move != kRemove; + } + std::reverse(best_path.begin(), best_path.end()); + return best_path; +} + +namespace { + +// Helper class to convert string into ids with deduplication. +class InternalStrings { + public: + size_t GetId(const std::string& str) { + IdMap::iterator it = ids_.find(str); + if (it != ids_.end()) return it->second; + size_t id = ids_.size(); + return ids_[str] = id; + } + + private: + typedef std::map IdMap; + IdMap ids_; +}; + +} // namespace + +std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right) { + std::vector left_ids, right_ids; + { + InternalStrings intern_table; + for (size_t i = 0; i < left.size(); ++i) { + left_ids.push_back(intern_table.GetId(left[i])); + } + for (size_t i = 0; i < right.size(); ++i) { + right_ids.push_back(intern_table.GetId(right[i])); + } + } + return CalculateOptimalEdits(left_ids, right_ids); +} + +namespace { + +// Helper class that holds the state for one hunk and prints it out to the +// stream. +// It reorders adds/removes when possible to group all removes before all +// adds. It also adds the hunk header before printint into the stream. +class Hunk { + public: + Hunk(size_t left_start, size_t right_start) + : left_start_(left_start), + right_start_(right_start), + adds_(), + removes_(), + common_() {} + + void PushLine(char edit, const char* line) { + switch (edit) { + case ' ': + ++common_; + FlushEdits(); + hunk_.push_back(std::make_pair(' ', line)); + break; + case '-': + ++removes_; + hunk_removes_.push_back(std::make_pair('-', line)); + break; + case '+': + ++adds_; + hunk_adds_.push_back(std::make_pair('+', line)); + break; + } + } + + void PrintTo(std::ostream* os) { + PrintHeader(os); + FlushEdits(); + for (std::list >::const_iterator it = + hunk_.begin(); + it != hunk_.end(); ++it) { + *os << it->first << it->second << "\n"; + } + } + + bool has_edits() const { return adds_ || removes_; } + + private: + void FlushEdits() { + hunk_.splice(hunk_.end(), hunk_removes_); + hunk_.splice(hunk_.end(), hunk_adds_); + } + + // Print a unified diff header for one hunk. + // The format is + // "@@ -, +, @@" + // where the left/right parts are omitted if unnecessary. + void PrintHeader(std::ostream* ss) const { + *ss << "@@ "; + if (removes_) { + *ss << "-" << left_start_ << "," << (removes_ + common_); + } + if (removes_ && adds_) { + *ss << " "; + } + if (adds_) { + *ss << "+" << right_start_ << "," << (adds_ + common_); + } + *ss << " @@\n"; + } + + size_t left_start_, right_start_; + size_t adds_, removes_, common_; + std::list > hunk_, hunk_adds_, hunk_removes_; +}; + +} // namespace + +// Create a list of diff hunks in Unified diff format. +// Each hunk has a header generated by PrintHeader above plus a body with +// lines prefixed with ' ' for no change, '-' for deletion and '+' for +// addition. +// 'context' represents the desired unchanged prefix/suffix around the diff. +// If two hunks are close enough that their contexts overlap, then they are +// joined into one hunk. +std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context) { + const std::vector edits = CalculateOptimalEdits(left, right); + + size_t l_i = 0, r_i = 0, edit_i = 0; + std::stringstream ss; + while (edit_i < edits.size()) { + // Find first edit. + while (edit_i < edits.size() && edits[edit_i] == kMatch) { + ++l_i; + ++r_i; + ++edit_i; + } + + // Find the first line to include in the hunk. + const size_t prefix_context = std::min(l_i, context); + Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); + for (size_t i = prefix_context; i > 0; --i) { + hunk.PushLine(' ', left[l_i - i].c_str()); + } + + // Iterate the edits until we found enough suffix for the hunk or the input + // is over. + size_t n_suffix = 0; + for (; edit_i < edits.size(); ++edit_i) { + if (n_suffix >= context) { + // Continue only if the next hunk is very close. + auto it = edits.begin() + static_cast(edit_i); + while (it != edits.end() && *it == kMatch) ++it; + if (it == edits.end() || + static_cast(it - edits.begin()) - edit_i >= context) { + // There is no next edit or it is too far away. + break; + } + } + + EditType edit = edits[edit_i]; + // Reset count when a non match is found. + n_suffix = edit == kMatch ? n_suffix + 1 : 0; + + if (edit == kMatch || edit == kRemove || edit == kReplace) { + hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str()); + } + if (edit == kAdd || edit == kReplace) { + hunk.PushLine('+', right[r_i].c_str()); + } + + // Advance indices, depending on edit type. + l_i += edit != kAdd; + r_i += edit != kRemove; + } + + if (!hunk.has_edits()) { + // We are done. We don't want this hunk. + break; + } + + hunk.PrintTo(&ss); + } + return ss.str(); +} + +} // namespace edit_distance + +namespace { + +// The string representation of the values received in EqFailure() are already +// escaped. Split them on escaped '\n' boundaries. Leave all other escaped +// characters the same. +std::vector SplitEscapedString(const std::string& str) { + std::vector lines; + size_t start = 0, end = str.size(); + if (end > 2 && str[0] == '"' && str[end - 1] == '"') { + ++start; + --end; + } + bool escaped = false; + for (size_t i = start; i + 1 < end; ++i) { + if (escaped) { + escaped = false; + if (str[i] == 'n') { + lines.push_back(str.substr(start, i - start - 1)); + start = i + 1; + } + } else { + escaped = str[i] == '\\'; + } + } + lines.push_back(str.substr(start, end - start)); + return lines; +} + +} // namespace + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// lhs_expression: "foo" +// rhs_expression: "bar" +// lhs_value: "5" +// rhs_value: "6" +// +// The ignoring_case parameter is true if and only if the assertion is a +// *_STRCASEEQ*. When it's true, the string "Ignoring case" will +// be inserted into the message. +AssertionResult EqFailure(const char* lhs_expression, + const char* rhs_expression, + const std::string& lhs_value, + const std::string& rhs_value, bool ignoring_case) { + Message msg; + msg << "Expected equality of these values:"; + msg << "\n " << lhs_expression; + if (lhs_value != lhs_expression) { + msg << "\n Which is: " << lhs_value; + } + msg << "\n " << rhs_expression; + if (rhs_value != rhs_expression) { + msg << "\n Which is: " << rhs_value; + } + + if (ignoring_case) { + msg << "\nIgnoring case"; + } + + if (!lhs_value.empty() && !rhs_value.empty()) { + const std::vector lhs_lines = SplitEscapedString(lhs_value); + const std::vector rhs_lines = SplitEscapedString(rhs_value); + if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { + msg << "\nWith diff:\n" + << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); + } + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. +AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, + const char* abs_error_expr, double val1, + double val2, double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // Find the value which is closest to zero. + const double min_abs = std::min(fabs(val1), fabs(val2)); + // Find the distance to the next double from that value. + const double epsilon = + nextafter(min_abs, std::numeric_limits::infinity()) - min_abs; + // Detect the case where abs_error is so small that EXPECT_NEAR is + // effectively the same as EXPECT_EQUAL, and give an informative error + // message so that the situation can be more easily understood without + // requiring exotic floating-point knowledge. + // Don't do an epsilon check if abs_error is zero because that implies + // that an equality check was actually intended. + if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 && + abs_error < epsilon) { + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter " + << abs_error_expr << " evaluates to " << abs_error + << " which is smaller than the minimum distance between doubles for " + "numbers of this magnitude which is " + << epsilon + << ", thus making this EXPECT_NEAR check equivalent to " + "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead."; + } + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; +} + +// Helper template for implementing FloatLE() and DoubleLE(). +template +AssertionResult FloatingPointLE(const char* expr1, const char* expr2, + RawType val1, RawType val2) { + // Returns success if val1 is less than val2, + if (val1 < val2) { + return AssertionSuccess(); + } + + // or if val1 is almost equal to val2. + const FloatingPoint lhs(val1), rhs(val2); + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + // Note that the above two checks will both fail if either val1 or + // val2 is NaN, as the IEEE floating-point standard requires that + // any predicate involving a NaN must return false. + + ::std::stringstream val1_ss; + val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val1; + + ::std::stringstream val2_ss; + val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val2; + + return AssertionFailure() + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); +} + +} // namespace internal + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult FloatLE(const char* expr1, const char* expr2, float val1, + float val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, + double val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +namespace internal { + +// The helper function for {ASSERT|EXPECT}_STREQ. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, const char* lhs, + const char* rhs) { + if (String::CStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, + const char* rhs_expression, const char* lhs, + const char* rhs) { + if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: \"" << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true if and only if needle +// is a substring of haystack. NULL is considered a substring of +// itself only. + +bool IsSubstringPred(const char* needle, const char* haystack) { + if (needle == nullptr || haystack == nullptr) return needle == haystack; + + return strstr(haystack, needle) != nullptr; +} + +bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { + if (needle == nullptr || haystack == nullptr) return needle == haystack; + + return wcsstr(haystack, needle) != nullptr; +} + +// StringType here can be either ::std::string or ::std::wstring. +template +bool IsSubstringPred(const StringType& needle, const StringType& haystack) { + return haystack.find(needle) != StringType::npos; +} + +// This function implements either IsSubstring() or IsNotSubstring(), +// depending on the value of the expected_to_be_substring parameter. +// StringType here can be const char*, const wchar_t*, ::std::string, +// or ::std::wstring. +template +AssertionResult IsSubstringImpl(bool expected_to_be_substring, + const char* needle_expr, + const char* haystack_expr, + const StringType& needle, + const StringType& haystack) { + if (IsSubstringPred(needle, haystack) == expected_to_be_substring) + return AssertionSuccess(); + + const bool is_wide_string = sizeof(needle[0]) > 1; + const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; + return AssertionFailure() + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? "" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; +} + +} // namespace + +// IsSubstring() and IsNotSubstring() check whether needle is a +// substring of haystack (NULL is considered a substring of itself +// only), and return an appropriate error message when they fail. + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const char* needle, + const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const wchar_t* needle, + const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates +AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, + long hr) { // NOLINT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +#else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; + // Gets the system's human readable message string for this HRESULT. + char error_text[kBufSize] = {'\0'}; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + static_cast(hr), // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + nullptr); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing CR-LF) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +#endif // GTEST_OS_WINDOWS_MOBILE + + const std::string error_hex("0x" + String::FormatHexInt(hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. + +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 +// like this: +// +// Code-point length Encoding +// 0 - 7 bits 0xxxxxxx +// 8 - 11 bits 110xxxxx 10xxxxxx +// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx +// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +// The maximum code-point a one-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint1 = (static_cast(1) << 7) - 1; + +// The maximum code-point a two-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; + +// The maximum code-point a three-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint3 = + (static_cast(1) << (4 + 2 * 6)) - 1; + +// The maximum code-point a four-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint4 = + (static_cast(1) << (3 + 3 * 6)) - 1; + +// Chops off the n lowest bits from a bit pattern. Returns the n +// lowest bits. As a side effect, the original bit pattern will be +// shifted to the right by n bits. +inline uint32_t ChopLowBits(uint32_t* bits, int n) { + const uint32_t low_bits = *bits & ((static_cast(1) << n) - 1); + *bits >>= n; + return low_bits; +} + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type uint32_t because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +std::string CodePointToUtf8(uint32_t code_point) { + if (code_point > kMaxCodePoint4) { + return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")"; + } + + char str[5]; // Big enough for the largest valid code point. + if (code_point <= kMaxCodePoint1) { + str[1] = '\0'; + str[0] = static_cast(code_point); // 0xxxxxxx + } else if (code_point <= kMaxCodePoint2) { + str[2] = '\0'; + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xC0 | code_point); // 110xxxxx + } else if (code_point <= kMaxCodePoint3) { + str[3] = '\0'; + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xE0 | code_point); // 1110xxxx + } else { // code_point <= kMaxCodePoint4 + str[4] = '\0'; + str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xF0 | code_point); // 11110xxx + } + return str; +} + +// The following two functions only make sense if the system +// uses UTF-16 for wide string encoding. All supported systems +// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16. + +// Determines if the arguments constitute UTF-16 surrogate pair +// and thus should be combined into a single Unicode code point +// using CreateCodePointFromUtf16SurrogatePair. +inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { + return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 && + (second & 0xFC00) == 0xDC00; +} + +// Creates a Unicode code point from UTF16 surrogate pair. +inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const auto first_u = static_cast(first); + const auto second_u = static_cast(second); + const uint32_t mask = (1 << 10) - 1; + return (sizeof(wchar_t) == 2) + ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000 + : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. + first_u; +} + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +std::string WideStringToUtf8(const wchar_t* str, int num_chars) { + if (num_chars == -1) num_chars = static_cast(wcslen(str)); + + ::std::stringstream stream; + for (int i = 0; i < num_chars; ++i) { + uint32_t unicode_code_point; + + if (str[i] == L'\0') { + break; + } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { + unicode_code_point = + CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]); + i++; + } else { + unicode_code_point = static_cast(str[i]); + } + + stream << CodePointToUtf8(unicode_code_point); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to an std::string using the UTF-8 encoding. +// NULL will be converted to "(null)". +std::string String::ShowWideCString(const wchar_t* wide_c_str) { + if (wide_c_str == nullptr) return "(null)"; + + return internal::WideStringToUtf8(wide_c_str, -1); +} + +// Compares two wide C strings. Returns true if and only if they have the +// same content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, const wchar_t* lhs, + const wchar_t* rhs) { + if (String::WideCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); +} + +// Helper function for *_STRNE on wide strings. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2); +} + +// Compares two C strings, ignoring case. Returns true if and only if they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) { + if (lhs == nullptr) return rhs == nullptr; + if (rhs == nullptr) return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + +// Compares two wide C strings, ignoring case. Returns true if and only if they +// have the same content. +// +// Unlike wcscasecmp(), this function can handle NULL argument(s). +// A NULL C string is considered different to any non-NULL wide C string, +// including the empty string. +// NB: The implementations on different platforms slightly differ. +// On windows, this method uses _wcsicmp which compares according to LC_CTYPE +// environment variable. On GNU platform this method uses wcscasecmp +// which compares according to LC_CTYPE category of the current locale. +// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the +// current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. + wint_t left, right; + do { + left = towlower(static_cast(*lhs++)); + right = towlower(static_cast(*rhs++)); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Returns true if and only if str ends with the given suffix, ignoring case. +// Any string is considered to end with an empty suffix. +bool String::EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix) { + const size_t str_len = str.length(); + const size_t suffix_len = suffix.length(); + return (str_len >= suffix_len) && + CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, + suffix.c_str()); +} + +// Formats an int value as "%02d". +std::string String::FormatIntWidth2(int value) { + return FormatIntWidthN(value, 2); +} + +// Formats an int value to given width with leading zeros. +std::string String::FormatIntWidthN(int value, int width) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(width) << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexUInt32(uint32_t value) { + std::stringstream ss; + ss << std::hex << std::uppercase << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + return FormatHexUInt32(static_cast(value)); +} + +// Formats a byte as "%02X". +std::string String::FormatByte(unsigned char value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase + << static_cast(value); + return ss.str(); +} + +// Converts the buffer in a stringstream to an std::string, converting NUL +// bytes to "\\0" along the way. +std::string StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + std::string result; + result.reserve(static_cast(2 * (end - start))); + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + result += "\\0"; // Replaces NUL with "\\0"; + } else { + result += *ch; + } + } + + return result; +} + +// Appends the user-supplied message to the Google-Test-generated message. +std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const std::string user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + if (gtest_msg.empty()) { + return user_msg_string; + } + return gtest_msg + "\n" + user_msg_string; +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {} + +// D'tor. +TestResult::~TestResult() {} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) internal::posix::Abort(); + return test_part_results_.at(static_cast(i)); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. +const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) internal::posix::Abort(); + return test_properties_.at(static_cast(i)); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { test_part_results_.clear(); } + +// Adds a test part result to the list. +void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. +void TestResult::RecordProperty(const std::string& xml_element, + const TestProperty& test_property) { + if (!ValidateTestProperty(xml_element, test_property)) { + return; + } + internal::MutexLock lock(&test_properties_mutex_); + const std::vector::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuitesAttributes[] = { + "disabled", "errors", "failures", "name", + "random_seed", "tests", "time", "timestamp"}; + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuiteAttributes[] = { + "disabled", "errors", "failures", "name", + "tests", "time", "timestamp", "skipped"}; + +// The list of reserved attributes used in the element of XML output. +static const char* const kReservedTestCaseAttributes[] = { + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; + +// Use a slightly different set for allowed output to ensure existing tests can +// still RecordProperty("result") or "RecordProperty(timestamp") +static const char* const kReservedOutputTestCaseAttributes[] = { + "classname", "name", "status", "time", "type_param", + "value_param", "file", "line", "result", "timestamp"}; + +template +std::vector ArrayAsVector(const char* const (&array)[kSize]) { + return std::vector(array, array + kSize); +} + +static std::vector GetReservedAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +// TODO(jdesprez): Merge the two getReserved attributes once skip is improved +static std::vector GetReservedOutputAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedOutputTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +static std::string FormatWordList(const std::vector& words) { + Message word_list; + for (size_t i = 0; i < words.size(); ++i) { + if (i > 0 && words.size() > 2) { + word_list << ", "; + } + if (i == words.size() - 1) { + word_list << "and "; + } + word_list << "'" << words[i] << "'"; + } + return word_list.GetString(); +} + +static bool ValidateTestPropertyName( + const std::string& property_name, + const std::vector& reserved_names) { + if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != + reserved_names.end()) { + ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name + << " (" << FormatWordList(reserved_names) + << " are reserved by " << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Adds a failure if the key is a reserved attribute of the element named +// xml_element. Returns true if the property is valid. +bool TestResult::ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property) { + return ValidateTestPropertyName(test_property.key(), + GetReservedAttributesForElement(xml_element)); +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true off the test part was skipped. +static bool TestPartSkipped(const TestPartResult& result) { + return result.skipped(); +} + +// Returns true if and only if the test was skipped. +bool TestResult::Skipped() const { + return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0; +} + +// Returns true if and only if the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) return true; + } + return false; +} + +// Returns true if and only if the test part fatally failed. +static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true if and only if the test fatally failed. +bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true if and only if the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true if and only if the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the states of all flags. +Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {} + +// The d'tor restores the states of all flags. The actual work is +// done by the d'tor of the gtest_flag_saver_ field, and thus not +// visible here. +Test::~Test() {} + +// Sets up the test fixture. +// +// A sub-class may override this. +void Test::SetUp() {} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() {} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, const std::string& value) { + UnitTest::GetInstance()->RecordProperty(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. + UnitTest::GetInstance()->AddTestPartResult( + result_type, + nullptr, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + ""); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test suite to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test suite. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestSuite* const test_suite = impl->current_test_suite(); + + // Info about the first test in the current test suite. + const TestInfo* const first_test_info = test_suite->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. + const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // Both TEST and TEST_F appear in same test suite, which is incorrect. + // Tell the user how to fix this. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test suite must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test suite is\n" + << "illegal. In test suite " << this_test_info->test_suite_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // Two fixture classes with the same name appear in two different + // namespaces, which is not allowed. Tell the user how to fix this. + ADD_FAILURE() + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " + << this_test_info->test_suite_name() << ",\n" + << "you defined test " << first_test_name << " and test " + << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. You should probably rename one\n" + << "of the classes to put the tests into different test suites."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). +static std::string* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << exception_code + << std::setbase(10) << " thrown in " << location << "."; + + return new std::string(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +namespace internal { + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. +static std::string FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != nullptr) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result); + +GoogleTestFailureException::GoogleTestFailureException( + const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} + +#endif // GTEST_HAS_EXCEPTIONS + +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) +template +Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + std::string* exception_message = + FormatSehExceptionMessage(GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template +Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG_GET(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. + if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const AssertionException&) { // NOLINT + // This failure was reported already. + } catch (const internal::GoogleTestFailureException&) { // NOLINT + // This exception type can only be thrown by a failed Google + // Test assertion with the intention of letting another testing + // framework catch it. Therefore we just re-throw it. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(nullptr, location)); + } + return static_cast(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful and didn't call + // GTEST_SKIP(). + if (!HasFatalFailure() && !IsSkipped()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody, + "the test body"); + } + + // However, we want to clean up as much as possible. Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown, + "TearDown()"); +} + +// Returns true if and only if the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true if and only if the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl() + ->current_test_result() + ->HasNonfatalFailure(); +} + +// Returns true if and only if the current test was skipped. +bool Test::IsSkipped() { + return internal::GetUnitTestImpl()->current_test_result()->Skipped(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +TestInfo::TestInfo(const std::string& a_test_suite_name, + const std::string& a_name, const char* a_type_param, + const char* a_value_param, + internal::CodeLocation a_code_location, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_suite_name_(a_test_suite_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), + value_param_(a_value_param ? new std::string(a_value_param) : nullptr), + location_(a_code_location), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + is_in_another_shard_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_suite_name, name, type_param, value_param, + code_location, fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location) { + Message errors; + errors + << "Attempted redefinition of test suite " << test_suite_name << ".\n" + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " << test_suite_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test suites."; + + GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(), + code_location.line) + << " " << errors.GetString(); +} +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestSuite class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. +class TestNameIs { + public: + // Constructor. + // + // TestNameIs has NO default constructor. + explicit TestNameIs(const char* name) : name_(name) {} + + // Returns true if and only if the test name of test_info matches name_. + bool operator()(const TestInfo* test_info) const { + return test_info && test_info->name() == name_; + } + + private: + std::string name_; +}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + type_parameterized_test_registry_.CheckForInstantiations(); + parameterized_tests_registered_ = true; + } +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + if (!should_run_) { + if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this); + return; + } + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + result_.set_start_timestamp(internal::GetTimeInMillis()); + internal::Timer timer; + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. + Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test if the constructor didn't generate a fatal failure or invoke + // GTEST_SKIP(). + // Note that the object will not be null + if (!Test::HasFatalFailure() && !Test::IsSkipped()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + if (test != nullptr) { + // Deletes the test object. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + } + + result_.set_elapsed_time(timer.Elapsed()); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(nullptr); +} + +// Skip and records a skipped test result for this object. +void TestInfo::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + + const TestPartResult test_part_result = + TestPartResult(TestPartResult::kSkip, this->file(), this->line(), ""); + impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + test_part_result); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + impl->set_current_test_info(nullptr); +} + +// class TestSuite + +// Gets the number of successful tests in this test suite. +int TestSuite::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of successful tests in this test suite. +int TestSuite::skipped_test_count() const { + return CountIf(test_info_list_, TestSkipped); +} + +// Gets the number of failed tests in this test suite. +int TestSuite::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int TestSuite::reportable_disabled_test_count() const { + return CountIf(test_info_list_, TestReportableDisabled); +} + +// Gets the number of disabled tests in this test suite. +int TestSuite::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Gets the number of tests to be printed in the XML report. +int TestSuite::reportable_test_count() const { + return CountIf(test_info_list_, TestReportable); +} + +// Get the number of tests in this test suite that should run. +int TestSuite::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. +int TestSuite::total_test_count() const { + return static_cast(test_info_list_.size()); +} + +// Creates a TestSuite with the given name. +// +// Arguments: +// +// a_name: name of the test suite +// a_type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite::TestSuite(const char* a_name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + start_timestamp_(0), + elapsed_time_(0) {} + +// Destructor of TestSuite. +TestSuite::~TestSuite() { + // Deletes every Test in the collection. + ForEach(test_info_list_, internal::Delete); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestSuite::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +TestInfo* TestSuite::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; +} + +// Adds a test to this test suite. Will delete the test upon +// destruction of the TestSuite object. +void TestSuite::AddTestInfo(TestInfo* test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast(test_indices_.size())); +} + +// Runs every test in this TestSuite. +void TestSuite::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); + + const bool skip_all = ad_hoc_test_result().Failed(); + + start_timestamp_ = internal::GetTimeInMillis(); + internal::Timer timer; + for (int i = 0; i < total_test_count(); i++) { + if (skip_all) { + GetMutableTestInfo(i)->Skip(); + } else { + GetMutableTestInfo(i)->Run(); + } + if (GTEST_FLAG_GET(fail_fast) && + GetMutableTestInfo(i)->result()->Failed()) { + for (int j = i + 1; j < total_test_count(); j++) { + GetMutableTestInfo(j)->Skip(); + } + break; + } + } + elapsed_time_ = timer.Elapsed(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()"); + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Skips all tests under this TestSuite. +void TestSuite::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Skip(); + } + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Clears the results of all tests in this test suite. +void TestSuite::ClearResult() { + ad_hoc_test_result_.Clear(); + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test suite. +void TestSuite::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestSuite::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static std::string FormatCountableNoun(int count, const char* singular_form, + const char* plural_form) { + return internal::StreamableToString(count) + " " + + (count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. +static std::string FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test suites. +static std::string FormatTestSuiteCount(int test_suite_count) { + return FormatCountableNoun(test_suite_count, "test suite", "test suites"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char* TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSkip: + return "Skipped\n"; + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +namespace internal { +namespace { +enum class GTestColor { kDefault, kRed, kGreen, kYellow }; +} // namespace + +// Prints a TestPartResult to an std::string. +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() << internal::FormatFileLocation( + test_part_result.file_name(), + test_part_result.line_number()) + << " " + << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()) + .GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const std::string& result = PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW + +// Returns the character attribute for the given color. +static WORD GetColorAttribute(GTestColor color) { + switch (color) { + case GTestColor::kRed: + return FOREGROUND_RED; + case GTestColor::kGreen: + return FOREGROUND_GREEN; + case GTestColor::kYellow: + return FOREGROUND_RED | FOREGROUND_GREEN; + default: + return 0; + } +} + +static int GetBitOffset(WORD color_mask) { + if (color_mask == 0) return 0; + + int bitOffset = 0; + while ((color_mask & 1) == 0) { + color_mask >>= 1; + ++bitOffset; + } + return bitOffset; +} + +static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { + // Let's reuse the BG + static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN | + BACKGROUND_RED | BACKGROUND_INTENSITY; + static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN | + FOREGROUND_RED | FOREGROUND_INTENSITY; + const WORD existing_bg = old_color_attrs & background_mask; + + WORD new_color = + GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY; + static const int bg_bitOffset = GetBitOffset(background_mask); + static const int fg_bitOffset = GetBitOffset(foreground_mask); + + if (((new_color & background_mask) >> bg_bitOffset) == + ((new_color & foreground_mask) >> fg_bitOffset)) { + new_color ^= FOREGROUND_INTENSITY; // invert intensity + } + return new_color; +} + +#else + +// Returns the ANSI color code for the given color. GTestColor::kDefault is +// an invalid input. +static const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case GTestColor::kRed: + return "1"; + case GTestColor::kGreen: + return "2"; + case GTestColor::kYellow: + return "3"; + default: + return nullptr; + } +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true if and only if Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + std::string c = GTEST_FLAG_GET(color); + const char* const gtest_color = c.c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. + const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "screen-256color") || + String::CStringEquals(term, "tmux") || + String::CStringEquals(term, "tmux-256color") || + String::CStringEquals(term, "rxvt-unicode") || + String::CStringEquals(term, "rxvt-unicode-256color") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. + +GTEST_ATTRIBUTE_PRINTF_(2, 3) +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != GTestColor::kDefault); + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + const WORD new_color = GetNewColor(color, old_color_attrs); + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, new_color); + + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +// Text printed in Google Test's text output and --gtest_list_tests +// output to label the type parameter and value parameter for a test. +static const char kTypeParamLabel[] = "TypeParam"; +static const char kValueParamLabel[] = "GetParam()"; + +static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != nullptr || value_param != nullptr) { + printf(", where "); + if (type_param != nullptr) { + printf("%s = %s", kTypeParamLabel, type_param); + if (value_param != nullptr) printf(" and "); + } + if (value_param != nullptr) { + printf("%s = %s", kValueParamLabel, value_param); + } + } +} + +// This class implements the TestEventListener interface. +// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& test_case) override; +#else + void OnTestSuiteStart(const TestSuite& test_suite) override; +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& test_case) override; +#else + void OnTestSuiteEnd(const TestSuite& test_suite) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); + static void PrintFailedTestSuites(const UnitTest& unit_test); + static void PrintSkippedTests(const UnitTest& unit_test); +}; + +// Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG_GET(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + std::string f = GTEST_FLAG_GET(filter); + const char* const filter = f.c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. + if (!String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_, + filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n", + static_cast(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG_GET(shuffle)) { + ColoredPrintf(GTestColor::kYellow, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s", counts.c_str(), test_case.name()); + if (test_case.type_param() == nullptr) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); + } + fflush(stdout); +} +#else +void PrettyUnitTestResultPrinter::OnTestSuiteStart( + const TestSuite& test_suite) { + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s", counts.c_str(), test_suite.name()); + if (test_suite.type_param() == nullptr) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param()); + } + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kGreen, "[ RUN ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(GTestColor::kGreen, "[ OK ] "); + } else if (test_info.result()->Skipped()) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + } else { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + } + PrintTestName(test_info.test_suite_name(), test_info.name()); + if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG_GET(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} +#else +void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { + if (!GTEST_FLAG_GET(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(), + internal::StreamableToString(test_suite.elapsed_time()).c_str()); + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. +void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Failed()) { + continue; + } + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s.%s", test_suite.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } + printf("\n%2d FAILED %s\n", failed_test_count, + failed_test_count == 1 ? "TEST" : "TESTS"); +} + +// Internal helper for printing the list of test suite failures not covered by +// PrintFailedTests. +void PrettyUnitTestResultPrinter::PrintFailedTestSuites( + const UnitTest& unit_test) { + int suite_failure_count = 0; + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run()) { + continue; + } + if (test_suite.ad_hoc_test_result().Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name()); + ++suite_failure_count; + } + } + if (suite_failure_count > 0) { + printf("\n%2d FAILED TEST %s\n", suite_failure_count, + suite_failure_count == 1 ? "SUITE" : "SUITES"); + } +} + +// Internal helper for printing the list of skipped tests. +void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Skipped()) { + continue; + } + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.%s", test_suite.name(), test_info.name()); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str()); + PrintSkippedTests(unit_test); + } + + if (!unit_test.Passed()) { + PrintFailedTests(unit_test); + PrintFailedTestSuites(unit_test); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End PrettyUnitTestResultPrinter + +// This class implements the TestEventListener interface. +// +// Class BriefUnitTestResultPrinter is copyable. +class BriefUnitTestResultPrinter : public TestEventListener { + public: + BriefUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} +}; + +// Called after an assertion failure. +void BriefUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.\n", FormatTestCount(skipped_test_count).c_str()); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End BriefUnitTestResultPrinter + +// class TestEventRepeater +// +// This class forwards events to other event listeners. +class TestEventRepeater : public TestEventListener { + public: + TestEventRepeater() : forwarding_enabled_(true) {} + ~TestEventRepeater() override; + void Append(TestEventListener* listener); + TestEventListener* Release(TestEventListener* listener); + + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled() const { return forwarding_enabled_; } + void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } + + void OnTestProgramStart(const UnitTest& unit_test) override; + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestSuite& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteStart(const TestSuite& parameter) override; + void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteEnd(const TestSuite& parameter) override; + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override; + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& unit_test) override; + + private: + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled_; + // The list of listeners that receive events. + std::vector listeners_; + + TestEventRepeater(const TestEventRepeater&) = delete; + TestEventRepeater& operator=(const TestEventRepeater&) = delete; +}; + +TestEventRepeater::~TestEventRepeater() { + ForEach(listeners_, Delete); +} + +void TestEventRepeater::Append(TestEventListener* listener) { + listeners_.push_back(listener); +} + +TestEventListener* TestEventRepeater::Release(TestEventListener* listener) { + for (size_t i = 0; i < listeners_.size(); ++i) { + if (listeners_[i] == listener) { + listeners_.erase(listeners_.begin() + static_cast(i)); + return listener; + } + } + + return nullptr; +} + +// Since most methods are very similar, use macros to reduce boilerplate. +// This defines a member that forwards the call to all listeners. +#define GTEST_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ + } +// This defines a member that forwards the call to all listeners in reverse +// order. +#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = listeners_.size(); i != 0; i--) { \ + listeners_[i - 1]->Name(parameter); \ + } \ + } \ + } + +GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) +GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite) +GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo) +GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) +GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite) +GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) + +#undef GTEST_REPEATER_METHOD_ +#undef GTEST_REVERSE_REPEATER_METHOD_ + +void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = 0; i < listeners_.size(); i++) { + listeners_[i]->OnTestIterationStart(unit_test, iteration); + } + } +} + +void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = listeners_.size(); i > 0; i--) { + listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration); + } + } +} + +// End TestEventRepeater + +// This class generates an XML output file. +class XmlUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit XmlUnitTestResultPrinter(const char* output_file); + + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void ListTestsMatchingFilter(const std::vector& test_suites); + + // Prints an XML summary of all unit tests. + static void PrintXmlTestsList(std::ostream* stream, + const std::vector& test_suites); + + private: + // Is c a whitespace character that is normalized to a space character + // when it appears in an XML attribute value? + static bool IsNormalizableWhitespace(unsigned char c) { + return c == '\t' || c == '\n' || c == '\r'; + } + + // May c appear in a well-formed XML document? + // https://www.w3.org/TR/REC-xml/#charsets + static bool IsValidXmlCharacter(unsigned char c) { + return IsNormalizableWhitespace(c) || c >= 0x20; + } + + // Returns an XML-escaped copy of the input string str. If + // is_attribute is true, the text is meant to appear as an attribute + // value, and normalizable whitespace is preserved by replacing it + // with character references. + static std::string EscapeXml(const std::string& str, bool is_attribute); + + // Returns the given string with all characters invalid in XML removed. + static std::string RemoveInvalidXmlCharacters(const std::string& str); + + // Convenience wrapper around EscapeXml when str is an attribute value. + static std::string EscapeXmlAttribute(const std::string& str) { + return EscapeXml(str, true); + } + + // Convenience wrapper around EscapeXml when str is not an attribute value. + static std::string EscapeXmlText(const char* str) { + return EscapeXml(str, false); + } + + // Verifies that the given attribute belongs to the given element and + // streams the attribute as XML. + static void OutputXmlAttribute(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value); + + // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. + static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + + // Streams a test suite XML stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputXmlTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestResult object. + static void OutputXmlTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestInfo object. + static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestSuite object + static void PrintXmlTestSuite(::std::ostream* stream, + const TestSuite& test_suite); + + // Prints an XML summary of unit_test to output stream out. + static void PrintXmlUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the std::string is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + + // Streams an XML representation of the test properties of a TestResult + // object. + static void OutputXmlTestProperties(std::ostream* stream, + const TestResult& result); + + // The output file. + const std::string output_file_; + + XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete; + XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete; +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "XML output file may not be null"; + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlUnitTest(&stream, unit_test); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +void XmlUnitTestResultPrinter::ListTestsMatchingFilter( + const std::vector& test_suites) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlTestsList(&stream, test_suites); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. +std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str, + bool is_attribute) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '<': + m << "<"; + break; + case '>': + m << ">"; + break; + case '&': + m << "&"; + break; + case '\'': + if (is_attribute) + m << "'"; + else + m << '\''; + break; + case '"': + if (is_attribute) + m << """; + else + m << '"'; + break; + default: + if (IsValidXmlCharacter(static_cast(ch))) { + if (is_attribute && + IsNormalizableWhitespace(static_cast(ch))) + m << "&#x" << String::FormatByte(static_cast(ch)) + << ";"; + else + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// Returns the given string with all characters invalid in XML removed. +// Currently invalid characters are dropped from the string. An +// alternative is to replace them with certain characters such as . or ?. +std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( + const std::string& str) { + std::string output; + output.reserve(str.size()); + for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) + if (IsValidXmlCharacter(static_cast(*it))) + output.push_back(*it); + + return output; +} + +// The following routines generate an XML representation of a UnitTest +// object. +// +// This is how Google Test concepts map to the DTD: +// +// <-- corresponds to a UnitTest object +// <-- corresponds to a TestSuite object +// <-- corresponds to a TestInfo object +// ... +// ... +// ... +// <-- individual assertion failures +// +// +// + +// Formats the given time in milliseconds as seconds. +std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast(ms) * 1e-3); + return ss.str(); +} + +static bool PortableLocaltime(time_t seconds, struct tm* out) { +#if defined(_MSC_VER) + return localtime_s(out, &seconds) == 0; +#elif defined(__MINGW32__) || defined(__MINGW64__) + // MINGW provides neither localtime_r nor localtime_s, but uses + // Windows' localtime(), which has a thread-local tm buffer. + struct tm* tm_ptr = localtime(&seconds); // NOLINT + if (tm_ptr == nullptr) return false; + *out = *tm_ptr; + return true; +#elif defined(__STDC_LIB_EXT1__) + // Uses localtime_s when available as localtime_r is only available from + // C23 standard. + return localtime_s(&seconds, out) != nullptr; +#else + return localtime_r(&seconds, out) != nullptr; +#endif +} + +// Converts the given epoch time in milliseconds to a date string in the ISO +// 8601 format, without the timezone information. +std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss.sss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast(ms % 1000), 3); +} + +// Streams an XML CDATA section, escaping invalid CDATA sequences as needed. +void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, + const char* data) { + const char* segment = data; + *stream << ""); + if (next_segment != nullptr) { + stream->write(segment, + static_cast(next_segment - segment)); + *stream << "]]>]]>"); + } else { + *stream << segment; + break; + } + } + *stream << "]]>"; +} + +void XmlUnitTestResultPrinter::OutputXmlAttribute( + std::ostream* stream, const std::string& element_name, + const std::string& name, const std::string& value) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Attribute " << name << " is not allowed for element <" << element_name + << ">."; + + *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; +} + +// Streams a test suite XML stanza containing the given test result. +void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a minimal test suite with one test. + *stream << " "; + + // Output the boilerplate for a minimal test case with a single test. + *stream << " \n"; +} + +// Prints an XML representation of a TestInfo object. +void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestsuite = "testcase"; + + if (test_info.is_in_another_shard()) { + return; + } + + *stream << " \n"; + return; + } + + OutputXmlAttribute(stream, kTestsuite, "status", + test_info.should_run() ? "run" : "notrun"); + OutputXmlAttribute(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "skipped" : "completed") + : "suppressed"); + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(result.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(result.start_timestamp())); + OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name); + + OutputXmlTestResult(stream, result); +} + +void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, + const TestResult& result) { + int failures = 0; + int skips = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + if (++failures == 1 && skips == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; + } else if (part.skipped()) { + if (++skips == 1 && failures == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; + } + } + + if (failures == 0 && skips == 0 && result.test_property_count() == 0) { + *stream << " />\n"; + } else { + if (failures == 0 && skips == 0) { + *stream << ">\n"; + } + OutputXmlTestProperties(stream, result); + *stream << " \n"; + } +} + +// Prints an XML representation of a TestSuite object +void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, + const TestSuite& test_suite) { + const std::string kTestsuite = "testsuite"; + *stream << " <" << kTestsuite; + OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name()); + OutputXmlAttribute(stream, kTestsuite, "tests", + StreamableToString(test_suite.reportable_test_count())); + if (!GTEST_FLAG_GET(list_tests)) { + OutputXmlAttribute(stream, kTestsuite, "failures", + StreamableToString(test_suite.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuite, "disabled", + StreamableToString(test_suite.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "skipped", + StreamableToString(test_suite.skipped_test_count())); + + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(test_suite.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp())); + *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result()); + } + *stream << ">\n"; + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) + OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); + } + *stream << " \n"; +} + +// Prints an XML summary of unit_test to output stream out. +void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(unit_test.reportable_test_count())); + OutputXmlAttribute(stream, kTestsuites, "failures", + StreamableToString(unit_test.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuites, "disabled", + StreamableToString(unit_test.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuites, "errors", "0"); + OutputXmlAttribute(stream, kTestsuites, "time", + FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); + + if (GTEST_FLAG_GET(shuffle)) { + OutputXmlAttribute(stream, kTestsuites, "random_seed", + StreamableToString(unit_test.random_seed())); + } + *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); + + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) + PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i)); + } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + + *stream << "\n"; +} + +void XmlUnitTestResultPrinter::PrintXmlTestsList( + std::ostream* stream, const std::vector& test_suites) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + int total_tests = 0; + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); + } + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(total_tests)); + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (auto test_suite : test_suites) { + PrintXmlTestSuite(stream, *test_suite); + } + *stream << "\n"; +} + +// Produces a string representing the test properties in a result as space +// delimited XML attributes based on the property key="value" pairs. +std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( + const TestResult& result) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << " " << property.key() << "=" + << "\"" << EscapeXmlAttribute(property.value()) << "\""; + } + return attributes.GetString(); +} + +void XmlUnitTestResultPrinter::OutputXmlTestProperties( + std::ostream* stream, const TestResult& result) { + const std::string kProperties = "properties"; + const std::string kProperty = "property"; + + if (result.test_property_count() <= 0) { + return; + } + + *stream << " <" << kProperties << ">\n"; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + *stream << " <" << kProperty; + *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; + *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; + *stream << "/>\n"; + } + *stream << " \n"; +} + +// End XmlUnitTestResultPrinter + +// This class generates an JSON output file. +class JsonUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit JsonUnitTestResultPrinter(const char* output_file); + + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + + // Prints an JSON summary of all unit tests. + static void PrintJsonTestList(::std::ostream* stream, + const std::vector& test_suites); + + private: + // Returns an JSON-escaped copy of the input string str. + static std::string EscapeJson(const std::string& str); + + //// Verifies that the given attribute belongs to the given element and + //// streams the attribute as JSON. + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, const std::string& value, + const std::string& indent, bool comma = true); + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, int value, + const std::string& indent, bool comma = true); + + // Streams a test suite JSON stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputJsonTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestResult object. + static void OutputJsonTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestInfo object. + static void OutputJsonTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info); + + // Prints a JSON representation of a TestSuite object + static void PrintJsonTestSuite(::std::ostream* stream, + const TestSuite& test_suite); + + // Prints a JSON summary of unit_test to output stream out. + static void PrintJsonUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as + // a JSON dictionary. + static std::string TestPropertiesAsJson(const TestResult& result, + const std::string& indent); + + // The output file. + const std::string output_file_; + + JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete; + JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) = + delete; +}; + +// Creates a new JsonUnitTestResultPrinter. +JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "JSON output file may not be null"; + } +} + +void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* jsonout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintJsonUnitTest(&stream, unit_test); + fprintf(jsonout, "%s", StringStreamToString(&stream).c_str()); + fclose(jsonout); +} + +// Returns an JSON-escaped copy of the input string str. +std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '\\': + case '"': + case '/': + m << '\\' << ch; + break; + case '\b': + m << "\\b"; + break; + case '\t': + m << "\\t"; + break; + case '\n': + m << "\\n"; + break; + case '\f': + m << "\\f"; + break; + case '\r': + m << "\\r"; + break; + default: + if (ch < ' ') { + m << "\\u00" << String::FormatByte(static_cast(ch)); + } else { + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// The following routines generate an JSON representation of a UnitTest +// object. + +// Formats the given time in milliseconds as seconds. +static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast(ms) * 1e-3) << "s"; + return ss.str(); +} + +// Converts the given epoch time in milliseconds to a date string in the +// RFC3339 format, without the timezone information. +static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; +} + +static inline std::string Indent(size_t width) { + return std::string(width, ' '); +} + +void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; + if (comma) *stream << ",\n"; +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, const std::string& element_name, + const std::string& name, int value, const std::string& indent, bool comma) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": " << StreamableToString(value); + if (comma) *stream << ",\n"; +} + +// Streams a test suite JSON stanza containing the given test result. +void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a new test suite. + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); + OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); + if (!GTEST_FLAG_GET(list_tests)) { + OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); + OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(6)); + OutputJsonKey(stream, "testsuite", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(6)); + } + *stream << Indent(6) << "\"testsuite\": [\n"; + + // Output the boilerplate for a new test case. + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, "testcase", "name", "", Indent(10)); + OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10)); + OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10)); + OutputJsonKey(stream, "testcase", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(10)); + OutputJsonKey(stream, "testcase", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(10)); + OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false); + *stream << TestPropertiesAsJson(result, Indent(10)); + + // Output the actual test result. + OutputJsonTestResult(stream, result); + + // Finish the test suite. + *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON representation of a TestInfo object. +void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestsuite = "testcase"; + const std::string kIndent = Indent(10); + + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent); + + if (test_info.value_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(), + kIndent); + } + if (test_info.type_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), + kIndent); + } + + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + if (GTEST_FLAG_GET(list_tests)) { + *stream << "\n" << Indent(8) << "}"; + return; + } else { + *stream << ",\n"; + } + + OutputJsonKey(stream, kTestsuite, "status", + test_info.should_run() ? "RUN" : "NOTRUN", kIndent); + OutputJsonKey(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "SKIPPED" : "COMPLETED") + : "SUPPRESSED", + kIndent); + OutputJsonKey(stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent); + OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent, + false); + *stream << TestPropertiesAsJson(result, kIndent); + + OutputJsonTestResult(stream, result); +} + +void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, + const TestResult& result) { + const std::string kIndent = Indent(10); + + int failures = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + *stream << ",\n"; + if (++failures == 1) { + *stream << kIndent << "\"" + << "failures" + << "\": [\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string message = EscapeJson(location + "\n" + part.message()); + *stream << kIndent << " {\n" + << kIndent << " \"failure\": \"" << message << "\",\n" + << kIndent << " \"type\": \"\"\n" + << kIndent << " }"; + } + } + + if (failures > 0) *stream << "\n" << kIndent << "]"; + *stream << "\n" << Indent(8) << "}"; +} + +// Prints an JSON representation of a TestSuite object +void JsonUnitTestResultPrinter::PrintJsonTestSuite( + std::ostream* stream, const TestSuite& test_suite) { + const std::string kTestsuite = "testsuite"; + const std::string kIndent = Indent(6); + + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), + kIndent); + if (!GTEST_FLAG_GET(list_tests)) { + OutputJsonKey(stream, kTestsuite, "failures", + test_suite.failed_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "disabled", + test_suite.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent); + OutputJsonKey( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(test_suite.elapsed_time()), + kIndent, false); + *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent) + << ",\n"; + } + + *stream << kIndent << "\"" << kTestsuite << "\": [\n"; + + bool comma = false; + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); + } + } + *stream << "\n" << kIndent << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON summary of unit_test to output stream out. +void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + + OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "disabled", + unit_test.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); + if (GTEST_FLAG_GET(shuffle)) { + OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), + kIndent); + } + OutputJsonKey(stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuites, "time", + FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent, + false); + + *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent) + << ",\n"; + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + bool comma = false; + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i)); + } + } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + + *stream << "\n" + << kIndent << "]\n" + << "}\n"; +} + +void JsonUnitTestResultPrinter::PrintJsonTestList( + std::ostream* stream, const std::vector& test_suites) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + int total_tests = 0; + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); + } + OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent); + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + for (size_t i = 0; i < test_suites.size(); ++i) { + if (i != 0) { + *stream << ",\n"; + } + PrintJsonTestSuite(stream, *test_suites[i]); + } + + *stream << "\n" + << kIndent << "]\n" + << "}\n"; +} +// Produces a string representing the test properties in a result as +// a JSON dictionary. +std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( + const TestResult& result, const std::string& indent) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << ",\n" + << indent << "\"" << property.key() << "\": " + << "\"" << EscapeJson(property.value()) << "\""; + } + return attributes.GetString(); +} + +// End JsonUnitTestResultPrinter + +#if GTEST_CAN_STREAM_RESULTS_ + +// Checks if str contains '=', '&', '%' or '\n' characters. If yes, +// replaces them by "%xx" where xx is their hexadecimal value. For +// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) +// in both time and space -- important as the input str may contain an +// arbitrarily long test failure message and stack trace. +std::string StreamingListener::UrlEncode(const char* str) { + std::string result; + result.reserve(strlen(str) + 1); + for (char ch = *str; ch != '\0'; ch = *++str) { + switch (ch) { + case '%': + case '=': + case '&': + case '\n': + result.append("%" + String::FormatByte(static_cast(ch))); + break; + default: + result.push_back(ch); + break; + } + } + return result; +} + +void StreamingListener::SocketWriter::MakeConnection() { + GTEST_CHECK_(sockfd_ == -1) + << "MakeConnection() can't be called when there is already a connection."; + + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_socktype = SOCK_STREAM; + addrinfo* servinfo = nullptr; + + // Use the getaddrinfo() to get a linked list of IP addresses for + // the given host name. + const int error_num = + getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + if (error_num != 0) { + GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " + << gai_strerror(error_num); + } + + // Loop through all the results and connect to the first we can. + for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype, + cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// class OsStackTraceGetter + +const char* const OsStackTraceGetterInterface::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; + +std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) + GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + std::string result; + + if (max_depth <= 0) { + return result; + } + + max_depth = std::min(max_depth, kMaxStackTraceDepth); + + std::vector raw_stack(max_depth); + // Skips the frames requested by the caller, plus this function. + const int raw_stack_size = + absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1); + + void* caller_frame = nullptr; + { + MutexLock lock(&mutex_); + caller_frame = caller_frame_; + } + + for (int i = 0; i < raw_stack_size; ++i) { + if (raw_stack[i] == caller_frame && + !GTEST_FLAG_GET(show_internal_stack_frames)) { + // Add a marker to the trace and stop adding frames. + absl::StrAppend(&result, kElidedFramesMarker, "\n"); + break; + } + + char tmp[1024]; + const char* symbol = "(unknown)"; + if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) { + symbol = tmp; + } + + char line[1024]; + snprintf(line, sizeof(line), " %p: %s\n", raw_stack[i], symbol); + result += line; + } + + return result; + +#else // !GTEST_HAS_ABSL + static_cast(max_depth); + static_cast(skip_count); + return ""; +#endif // GTEST_HAS_ABSL +} + +void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + void* caller_frame = nullptr; + if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) { + caller_frame = nullptr; + } + + MutexLock lock(&mutex_); + caller_frame_ = caller_frame; +#endif // GTEST_HAS_ABSL +} + +// A helper class that creates the premature-exit file in its +// constructor and deletes the file in its destructor. +class ScopedPrematureExitFile { + public: + explicit ScopedPrematureExitFile(const char* premature_exit_filepath) + : premature_exit_filepath_( + premature_exit_filepath ? premature_exit_filepath : "") { + // If a path to the premature-exit file is specified... + if (!premature_exit_filepath_.empty()) { + // create the file with a single "0" character in it. I/O + // errors are ignored as there's nothing better we can do and we + // don't want to fail the test because of this. + FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w"); + fwrite("0", 1, 1, pfile); + fclose(pfile); + } + } + + ~ScopedPrematureExitFile() { +#if !defined GTEST_OS_ESP8266 + if (!premature_exit_filepath_.empty()) { + int retval = remove(premature_exit_filepath_.c_str()); + if (retval) { + GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \"" + << premature_exit_filepath_ << "\" with error " + << retval; + } + } +#endif + } + + private: + const std::string premature_exit_filepath_; + + ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete; + ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete; +}; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(nullptr), + default_xml_generator_(nullptr) {} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. +void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. +TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = nullptr; + else if (listener == default_xml_generator_) + default_xml_generator_ = nullptr; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != nullptr) Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != nullptr) Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest* UnitTest::GetInstance() { + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // defined(__BORLANDC__) +} + +// Gets the number of successful test suites. +int UnitTest::successful_test_suite_count() const { + return impl()->successful_test_suite_count(); +} + +// Gets the number of failed test suites. +int UnitTest::failed_test_suite_count() const { + return impl()->failed_test_suite_count(); +} + +// Gets the number of all test suites. +int UnitTest::total_test_suite_count() const { + return impl()->total_test_suite_count(); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTest::test_suite_to_run_count() const { + return impl()->test_suite_to_run_count(); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +int UnitTest::successful_test_case_count() const { + return impl()->successful_test_suite_count(); +} +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_suite_count(); +} +int UnitTest::total_test_case_count() const { + return impl()->total_test_suite_count(); +} +int UnitTest::test_case_to_run_count() const { + return impl()->test_suite_to_run_count(); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of skipped tests. +int UnitTest::skipped_test_count() const { + return impl()->skipped_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTest::reportable_disabled_test_count() const { + return impl()->reportable_disabled_test_count(); +} + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTest::reportable_test_count() const { + return impl()->reportable_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the time of the test program start, in ms from the start of the +// UNIX epoch. +internal::TimeInMillis UnitTest::start_timestamp() const { + return impl()->start_timestamp(); +} + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true if and only if the unit test passed (i.e. all test suites +// passed). +bool UnitTest::Passed() const { return impl()->Passed(); } + +// Returns true if and only if the unit test failed (i.e. some test suite +// failed or something outside of all tests failed). +bool UnitTest::Failed() const { return impl()->Failed(); } + +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +const TestSuite* UnitTest::GetTestSuite(int i) const { + return impl()->GetTestSuite(i); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +const TestCase* UnitTest::GetTestCase(int i) const { + return impl()->GetTestCase(i); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Returns the TestResult containing information on test failures and +// properties logged outside of individual test suites. +const TestResult& UnitTest::ad_hoc_test_result() const { + return *impl()->ad_hoc_test_result(); +} + +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +TestSuite* UnitTest::GetMutableTestSuite(int i) { + return impl()->GetMutableSuiteCase(i); +} + +// Returns the list of event listeners that can be used to track events +// inside Google Test. +TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); } + +// Registers and returns a global test environment. When a test +// program is run, all global test environments will be set-up in the +// order they were registered. After all tests in the program have +// finished, all global test environments will be torn-down in the +// *reverse* order they were registered. +// +// The UnitTest object takes ownership of the given environment. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +Environment* UnitTest::AddEnvironment(Environment* env) { + if (env == nullptr) { + return nullptr; + } + + impl_->environments().push_back(env); + return env; +} + +// Adds a TestPartResult to the current TestResult object. All Google Test +// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call +// this to report their results. The user code should use the +// assertion macros instead of calling this directly. +void UnitTest::AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + Message msg; + msg << message; + + internal::MutexLock lock(&mutex_); + if (impl_->gtest_trace_stack().size() > 0) { + msg << "\n" << GTEST_NAME_ << " trace:"; + + for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { + const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; + msg << "\n" + << internal::FormatFileLocation(trace.file, trace.line) << " " + << trace.message; + } + } + + if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) { + msg << internal::kStackTraceMarker << os_stack_trace; + } + + const TestPartResult result = TestPartResult( + result_type, file_name, line_number, msg.GetString().c_str()); + impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + result); + + if (result_type != TestPartResult::kSuccess && + result_type != TestPartResult::kSkip) { + // gtest_break_on_failure takes precedence over + // gtest_throw_on_failure. This allows a user to set the latter + // in the code (perhaps in order to use Google Test assertions + // with another testing framework) and specify the former on the + // command line for debugging. + if (GTEST_FLAG_GET(break_on_failure)) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // Using DebugBreak on Windows allows gtest to still break into a debugger + // when a failure happens and both the --gtest_break_on_failure and + // the --gtest_catch_exceptions flags are specified. + DebugBreak(); +#elif (!defined(__native_client__)) && \ + ((defined(__clang__) || defined(__GNUC__)) && \ + (defined(__x86_64__) || defined(__i386__))) + // with clang/gcc we can achieve the same effect on x86 by invoking int3 + asm("int3"); +#else + // Dereference nullptr through a volatile pointer to prevent the compiler + // from removing. We use this rather than abort() or __builtin_trap() for + // portability: some debuggers don't correctly trap abort(). + *static_cast(nullptr) = 1; +#endif // GTEST_OS_WINDOWS + } else if (GTEST_FLAG_GET(throw_on_failure)) { +#if GTEST_HAS_EXCEPTIONS + throw internal::GoogleTestFailureException(result); +#else + // We cannot call abort() as it generates a pop-up in debug mode + // that cannot be suppressed in VC 7.1 or below. + exit(1); +#endif + } + } +} + +// Adds a TestProperty to the current TestResult object when invoked from +// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked +// from SetUpTestSuite or TearDownTestSuite, or to the global property set +// when invoked elsewhere. If the result already contains a property with +// the same key, the value will be updated. +void UnitTest::RecordProperty(const std::string& key, + const std::string& value) { + impl_->RecordProperty(TestProperty(key, value)); +} + +// Runs all tests in this UnitTest object and prints the result. +// Returns 0 if successful, or 1 otherwise. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +int UnitTest::Run() { + const bool in_death_test_child_process = + GTEST_FLAG_GET(internal_run_death_test).length() > 0; + + // Google Test implements this protocol for catching that a test + // program exits before returning control to Google Test: + // + // 1. Upon start, Google Test creates a file whose absolute path + // is specified by the environment variable + // TEST_PREMATURE_EXIT_FILE. + // 2. When Google Test has finished its work, it deletes the file. + // + // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before + // running a Google-Test-based test program and check the existence + // of the file at the end of the test execution to see if it has + // exited prematurely. + + // If we are in the child process of a death test, don't + // create/delete the premature exit file, as doing so is unnecessary + // and will confuse the parent process. Otherwise, create/delete + // the file upon entering/leaving this function. If the program + // somehow exits before this function has a chance to return, the + // premature-exit file will be left undeleted, causing a test runner + // that understands the premature-exit-file protocol to report the + // test as having failed. + const internal::ScopedPrematureExitFile premature_exit_file( + in_death_test_child_process + ? nullptr + : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + + // Captures the value of GTEST_FLAG(catch_exceptions). This value will be + // used for the duration of the program. + impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions)); + +#if GTEST_OS_WINDOWS + // Either the user wants Google Test to catch exceptions thrown by the + // tests or this is executing in the context of death test child + // process. In either case the user does not want to see pop-up dialogs + // about crashes - they are expected. + if (impl()->catch_exceptions() || in_death_test_child_process) { +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // SetErrorMode doesn't exist on CE. + SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | + SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); +#endif // !GTEST_OS_WINDOWS_MOBILE + +#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE + // Death test children can be terminated with _abort(). On Windows, + // _abort() can show a dialog with a warning message. This forces the + // abort message to go to stderr instead. + _set_error_mode(_OUT_TO_STDERR); +#endif + +#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE + // In the debug version, Visual Studio pops up a separate dialog + // offering a choice to debug the aborted program. We need to suppress + // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement + // executed. Google Test will notify the user of any unexpected + // failure via stderr. + if (!GTEST_FLAG_GET(break_on_failure)) + _set_abort_behavior( + 0x0, // Clear the following flags: + _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. + + // In debug mode, the Windows CRT can crash with an assertion over invalid + // input (e.g. passing an invalid file descriptor). The default handling + // for these assertions is to pop up a dialog and wait for user input. + // Instead ask the CRT to dump such assertions to stderr non-interactively. + if (!IsDebuggerPresent()) { + (void)_CrtSetReportMode(_CRT_ASSERT, + _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); + (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); + } +#endif + } +#endif // GTEST_OS_WINDOWS + + return internal::HandleExceptionsInMethodIfSupported( + impl(), &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") + ? 0 + : 1; +} + +// Returns the working directory when the first TEST() or TEST_F() was +// executed. +const char* UnitTest::original_working_dir() const { + return impl_->original_working_dir_.c_str(); +} + +// Returns the TestSuite object for the test that's currently running, +// or NULL if no test is running. +const TestSuite* UnitTest::current_test_suite() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_suite(); +} + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +const TestCase* UnitTest::current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_suite(); +} +#endif + +// Returns the TestInfo object for the test that's currently running, +// or NULL if no test is running. +const TestInfo* UnitTest::current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_info(); +} + +// Returns the random seed used at the start of the current test run. +int UnitTest::random_seed() const { return impl_->random_seed(); } + +// Returns ParameterizedTestSuiteRegistry object used to keep track of +// value-parameterized tests and instantiate and register them. +internal::ParameterizedTestSuiteRegistry& +UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { + return impl_->parameterized_test_registry(); +} + +// Creates an empty UnitTest. +UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); } + +// Destructor of UnitTest. +UnitTest::~UnitTest() { delete impl_; } + +// Pushes a trace defined by SCOPED_TRACE() on to the per-thread +// Google Test trace stack. +void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().push_back(trace); +} + +// Pops a trace from the per-thread Google Test trace stack. +void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().pop_back(); +} + +namespace internal { + +UnitTestImpl::UnitTestImpl(UnitTest* parent) + : parent_(parent), + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */) + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), + GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_( + &default_global_test_part_result_reporter_), + per_thread_test_part_result_reporter_( + &default_per_thread_test_part_result_reporter_), + parameterized_test_registry_(), + parameterized_tests_registered_(false), + last_death_test_suite_(-1), + current_test_suite_(nullptr), + current_test_info_(nullptr), + ad_hoc_test_result_(), + os_stack_trace_getter_(nullptr), + post_flag_parse_init_performed_(false), + random_seed_(0), // Will be overridden by the flag before first use. + random_(0), // Will be reseeded before first use. + start_timestamp_(0), + elapsed_time_(0), +#if GTEST_HAS_DEATH_TEST + death_test_factory_(new DefaultDeathTestFactory), +#endif + // Will be overridden by the flag before first use. + catch_exceptions_(false) { + listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); +} + +UnitTestImpl::~UnitTestImpl() { + // Deletes every TestSuite. + ForEach(test_suites_, internal::Delete); + + // Deletes every Environment. + ForEach(environments_, internal::Delete); + + delete os_stack_trace_getter_; +} + +// Adds a TestProperty to the current TestResult object when invoked in a +// context of a test, to current test suite's ad_hoc_test_result when invoke +// from SetUpTestSuite/TearDownTestSuite, or to the global property set +// otherwise. If the result already contains a property with the same key, +// the value will be updated. +void UnitTestImpl::RecordProperty(const TestProperty& test_property) { + std::string xml_element; + TestResult* test_result; // TestResult appropriate for property recording. + + if (current_test_info_ != nullptr) { + xml_element = "testcase"; + test_result = &(current_test_info_->result_); + } else if (current_test_suite_ != nullptr) { + xml_element = "testsuite"; + test_result = &(current_test_suite_->ad_hoc_test_result_); + } else { + xml_element = "testsuites"; + test_result = &ad_hoc_test_result_; + } + test_result->RecordProperty(xml_element, test_property); +} + +#if GTEST_HAS_DEATH_TEST +// Disables event forwarding if the control is currently in a death test +// subprocess. Must not be called before InitGoogleTest. +void UnitTestImpl::SuppressTestEventsIfInSubprocess() { + if (internal_run_death_test_flag_.get() != nullptr) + listeners()->SuppressEventForwarding(); +} +#endif // GTEST_HAS_DEATH_TEST + +// Initializes event listeners performing XML output as specified by +// UnitTestOptions. Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureXmlOutput() { + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format == "json") { + listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \"" + << output_format << "\" ignored."; + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in string form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const std::string& target = GTEST_FLAG_GET(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != std::string::npos) { + listeners()->Append( + new StreamingListener(target.substr(0, pos), target.substr(pos + 1))); + } else { + GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target + << "\" ignored."; + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + // Register to send notifications about key process state changes. + listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_()); +#endif // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. This makes parameterized tests + // available to the UnitTest reflection API without running + // RUN_ALL_TESTS. + RegisterParameterizedTests(); + + // Configures listeners for XML output. This makes it possible for users + // to shut down the default XML output before invoking RUN_ALL_TESTS. + ConfigureXmlOutput(); + + if (GTEST_FLAG_GET(brief)) { + listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); + } + +#if GTEST_CAN_STREAM_RESULTS_ + // Configures listeners for streaming test results to the specified server. + ConfigureStreamingOutput(); +#endif // GTEST_CAN_STREAM_RESULTS_ + +#if GTEST_HAS_ABSL + if (GTEST_FLAG_GET(install_failure_signal_handler)) { + absl::FailureSignalHandlerOptions options; + absl::InstallFailureSignalHandler(options); + } +#endif // GTEST_HAS_ABSL + } +} + +// A predicate that checks the name of a TestSuite against a known +// value. +// +// This is used for implementation of the UnitTest class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestSuiteNameIs is copyable. +class TestSuiteNameIs { + public: + // Constructor. + explicit TestSuiteNameIs(const std::string& name) : name_(name) {} + + // Returns true if and only if the name of test_suite matches name_. + bool operator()(const TestSuite* test_suite) const { + return test_suite != nullptr && + strcmp(test_suite->name(), name_.c_str()) == 0; + } + + private: + std::string name_; +}; + +// Finds and returns a TestSuite with the given name. If one doesn't +// exist, creates one and returns it. It's the CALLER'S +// RESPONSIBILITY to ensure that this function is only called WHEN THE +// TESTS ARE NOT SHUFFLED. +// +// Arguments: +// +// test_suite_name: name of the test suite +// type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite* UnitTestImpl::GetTestSuite( + const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + // Can we find a TestSuite with the given name? + const auto test_suite = + std::find_if(test_suites_.rbegin(), test_suites_.rend(), + TestSuiteNameIs(test_suite_name)); + + if (test_suite != test_suites_.rend()) return *test_suite; + + // No. Let's create one. + auto* const new_test_suite = + new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + + const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter); + // Is this a death test suite? + if (death_test_suite_filter.MatchesName(test_suite_name)) { + // Yes. Inserts the test suite after the last death test suite + // defined so far. This only works when the test suites haven't + // been shuffled. Otherwise we may end up running a death test + // after a non-death test. + ++last_death_test_suite_; + test_suites_.insert(test_suites_.begin() + last_death_test_suite_, + new_test_suite); + } else { + // No. Appends to the end of the list. + test_suites_.push_back(new_test_suite); + } + + test_suite_indices_.push_back(static_cast(test_suite_indices_.size())); + return new_test_suite; +} + +// Helpers for setting up / tearing down the given environment. They +// are for use in the ForEach() function. +static void SetUpEnvironment(Environment* env) { env->SetUp(); } +static void TearDownEnvironment(Environment* env) { env->TearDown(); } + +// Runs all tests in this UnitTest object, prints the result, and +// returns true if all tests are successful. If any exception is +// thrown during a test, the test is considered to be failed, but the +// rest of the tests will still be run. +// +// When parameterized tests are enabled, it expands and registers +// parameterized tests first in RegisterParameterizedTests(). +// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // True if and only if Google Test is initialized before RUN_ALL_TESTS() is + // called. + const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); + + // Do not run any test if the --help flag was specified. + if (g_help_flag) return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True if and only if we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = + (internal_run_death_test_flag_.get() != nullptr); +#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) + if (in_subprocess_for_death_test) { + GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); + } +#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = + FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG_GET(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed)); + + // True if and only if at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + start_timestamp_ = GetTimeInMillis(); + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat); + + // Repeats forever if the repeat count is negative. + const bool gtest_repeat_forever = repeat < 0; + + // Should test environments be set up and torn down for each repeat, or only + // set up on the first and torn down on the last iteration? If there is no + // "last" iteration because the tests will repeat forever, always recreate the + // environments to avoid leaks in case one of the environments is using + // resources that are external to this process. Without this check there would + // be no way to clean up those external resources automatically. + const bool recreate_environments_when_repeating = + GTEST_FLAG_GET(recreate_environments_when_repeating) || + gtest_repeat_forever; + + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). + ClearNonAdHocTestResult(); + + Timer timer; + + // Shuffles test suites and tests if requested. + if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) { + random()->Reseed(static_cast(random_seed_)); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test suite if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. If test environments aren't + // recreated for each iteration, only do so on the first iteration. + if (i == 0 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + } + + // Runs the tests only if there was no fatal failure or skip triggered + // during global set-up. + if (Test::IsSkipped()) { + // Emit diagnostics when global set-up calls skip, as it will not be + // emitted by default. + TestResult& test_result = + *internal::GetUnitTestImpl()->current_test_result(); + for (int j = 0; j < test_result.total_part_count(); ++j) { + const TestPartResult& test_part_result = + test_result.GetTestPartResult(j); + if (test_part_result.type() == TestPartResult::kSkip) { + const std::string& result = test_part_result.message(); + printf("%s\n", result.c_str()); + } + } + fflush(stdout); + } else if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Run(); + if (GTEST_FLAG_GET(fail_fast) && + GetMutableSuiteCase(test_index)->Failed()) { + for (int j = test_index + 1; j < total_test_suite_count(); j++) { + GetMutableSuiteCase(j)->Skip(); + } + break; + } + } + } else if (Test::HasFatalFailure()) { + // If there was a fatal failure during the global setup then we know we + // aren't going to run any tests. Explicitly mark all of the tests as + // skipped to make this obvious in the output. + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Skip(); + } + } + + // Tears down all environments in reverse order afterwards. If test + // environments aren't recreated for each iteration, only do so on the + // last iteration. + if (i == repeat - 1 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + } + + elapsed_time_ = timer.Elapsed(); + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). + UnshuffleTests(); + + if (GTEST_FLAG_GET(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + if (!gtest_is_initialized_before_run_all_tests) { + ColoredPrintf( + GTestColor::kRed, + "\nIMPORTANT NOTICE - DO NOT IGNORE:\n" + "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_ + "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_ + " will start to enforce the valid usage. " + "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT +#if GTEST_FOR_GOOGLE_ + ColoredPrintf(GTestColor::kRed, + "For more details, see http://wiki/Main/ValidGUnitMain.\n"); +#endif // GTEST_FOR_GOOGLE_ + } + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != nullptr) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == nullptr) { + ColoredPrintf(GTestColor::kRed, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. +bool ShouldShard(const char* total_shards_env, const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards + << " unset.\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = + Message() << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. +int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == nullptr) { + return default_val; + } + + int32_t result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestSuite and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md +// . Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestTotalShards, -1) + : -1; + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestShardIndex, -1) + : -1; + + const PositiveAndNegativeUnitTestFilter gtest_flag_filter( + GTEST_FLAG_GET(filter)); + const UnitTestFilter disable_test_filter(kDisableTestFilter); + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. + int num_runnable_tests = 0; + int num_selected_tests = 0; + for (auto* test_suite : test_suites_) { + const std::string& test_suite_name = test_suite->name(); + test_suite->set_should_run(false); + + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + TestInfo* const test_info = test_suite->test_info_list()[j]; + const std::string test_name(test_info->name()); + // A test is disabled if test suite name or test name matches + // kDisableTestFilter. + const bool is_disabled = + disable_test_filter.MatchesName(test_suite_name) || + disable_test_filter.MatchesName(test_name); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + gtest_flag_filter.MatchesTest(test_suite_name, test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_in_another_shard = + shard_tests != IGNORE_SHARDING_PROTOCOL && + !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests); + test_info->is_in_another_shard_ = is_in_another_shard; + const bool is_selected = is_runnable && !is_in_another_shard; + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_suite->set_should_run(test_suite->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the given C-string on a single line by replacing all '\n' +// characters with string "\\n". If the output takes more than +// max_length characters, only prints the first max_length characters +// and "...". +static void PrintOnOneLine(const char* str, int max_length) { + if (str != nullptr) { + for (int i = 0; *str != '\0'; ++str) { + if (i >= max_length) { + printf("..."); + break; + } + if (*str == '\n') { + printf("\\n"); + i += 2; + } else { + printf("%c", *str); + ++i; + } + } + } +} + +// Prints the names of the tests matching the user-specified filter flag. +void UnitTestImpl::ListTestsMatchingFilter() { + // Print at most this many characters for each type/value parameter. + const int kMaxParamLength = 250; + + for (auto* test_suite : test_suites_) { + bool printed_test_suite_name = false; + + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + const TestInfo* const test_info = test_suite->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_suite_name) { + printed_test_suite_name = true; + printf("%s.", test_suite->name()); + if (test_suite->type_param() != nullptr) { + printf(" # %s = ", kTypeParamLabel); + // We print the type parameter on a single line to make + // the output easy to parse by a program. + PrintOnOneLine(test_suite->type_param(), kMaxParamLength); + } + printf("\n"); + } + printf(" %s", test_info->name()); + if (test_info->value_param() != nullptr) { + printf(" # %s = ", kValueParamLabel); + // We print the value parameter on a single line to make the + // output easy to parse by a program. + PrintOnOneLine(test_info->value_param(), kMaxParamLength); + } + printf("\n"); + } + } + } + fflush(stdout); + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml" || output_format == "json") { + FILE* fileout = OpenFileForWriting( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()); + std::stringstream stream; + if (output_format == "xml") { + XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintXmlTestsList(&stream, test_suites_); + } else if (output_format == "json") { + JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintJsonTestList(&stream, test_suites_); + } + fprintf(fileout, "%s", StringStreamToString(&stream).c_str()); + fclose(fileout); + } +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. +void UnitTestImpl::set_os_stack_trace_getter( + OsStackTraceGetterInterface* getter) { + if (os_stack_trace_getter_ != getter) { + delete os_stack_trace_getter_; + os_stack_trace_getter_ = getter; + } +} + +// Returns the current OS stack trace getter if it is not NULL; +// otherwise, creates an OsStackTraceGetter, makes it the current +// getter, and returns it. +OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { + if (os_stack_trace_getter_ == nullptr) { +#ifdef GTEST_OS_STACK_TRACE_GETTER_ + os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_; +#else + os_stack_trace_getter_ = new OsStackTraceGetter; +#endif // GTEST_OS_STACK_TRACE_GETTER_ + } + + return os_stack_trace_getter_; +} + +// Returns the most specific TestResult currently running. +TestResult* UnitTestImpl::current_test_result() { + if (current_test_info_ != nullptr) { + return ¤t_test_info_->result_; + } + if (current_test_suite_ != nullptr) { + return ¤t_test_suite_->ad_hoc_test_result_; + } + return &ad_hoc_test_result_; +} + +// Shuffles all test suites, and the tests within each test suite, +// making sure that death tests are still run first. +void UnitTestImpl::ShuffleTests() { + // Shuffles the death test suites. + ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_); + + // Shuffles the non-death test suites. + ShuffleRange(random(), last_death_test_suite_ + 1, + static_cast(test_suites_.size()), &test_suite_indices_); + + // Shuffles the tests inside each test suite. + for (auto& test_suite : test_suites_) { + test_suite->ShuffleTests(random()); + } +} + +// Restores the test suites and tests to their order before the first shuffle. +void UnitTestImpl::UnshuffleTests() { + for (size_t i = 0; i < test_suites_.size(); i++) { + // Unshuffles the tests in each test suite. + test_suites_[i]->UnshuffleTests(); + // Resets the index of each test suite. + test_suite_indices_[i] = static_cast(i); + } +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string +GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) { + // We pass skip_count + 1 to skip this wrapper function in addition + // to what the user really wants to skip. + return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); +} + +// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to +// suppress unreachable code warnings. +namespace { +class ClassUniqueToAlwaysTrue {}; +} // namespace + +bool IsTrue(bool condition) { return condition; } + +bool AlwaysTrue() { +#if GTEST_HAS_EXCEPTIONS + // This condition is always false so AlwaysTrue() never actually throws, + // but it makes the compiler think that it may throw. + if (IsTrue(false)) throw ClassUniqueToAlwaysTrue(); +#endif // GTEST_HAS_EXCEPTIONS + return true; +} + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +static const char* ParseFlagValue(const char* str, const char* flag_name, + bool def_optional) { + // str and flag must not be NULL. + if (str == nullptr || flag_name == nullptr) return nullptr; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const std::string flag_str = + std::string("--") + GTEST_FLAG_PREFIX_ + flag_name; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return nullptr; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +static bool ParseFlag(const char* str, const char* flag_name, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, true); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an int32_t flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseFlag(const char* str, const char* flag_name, int32_t* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, false); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, + value); +} + +// Parses a string for a string flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +template +static bool ParseFlag(const char* str, const char* flag_name, String* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, false); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. +// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +static void PrintColorEncoded(const char* str) { + GTestColor color = GTestColor::kDefault; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. + for (;;) { + const char* p = strchr(str, '@'); + if (p == nullptr) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", std::string(str, p).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = GTestColor::kDefault; + } else if (ch == 'R') { + color = GTestColor::kRed; + } else if (ch == 'G') { + color = GTestColor::kGreen; + } else if (ch == 'Y') { + color = GTestColor::kYellow; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = + "This program contains tests written using " GTEST_NAME_ + ". You can use the\n" + "following command line flags to control its behavior:\n" + "\n" + "Test Selection:\n" + " @G--" GTEST_FLAG_PREFIX_ + "list_tests@D\n" + " List the names of all tests instead of running them. The name of\n" + " TEST(Foo, Bar) is \"Foo.Bar\".\n" + " @G--" GTEST_FLAG_PREFIX_ + "filter=@YPOSITIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" + " Run only the tests whose name matches one of the positive patterns " + "but\n" + " none of the negative patterns. '?' matches any single character; " + "'*'\n" + " matches any substring; ':' separates two patterns.\n" + " @G--" GTEST_FLAG_PREFIX_ + "also_run_disabled_tests@D\n" + " Run all disabled tests too.\n" + "\n" + "Test Execution:\n" + " @G--" GTEST_FLAG_PREFIX_ + "repeat=@Y[COUNT]@D\n" + " Run the tests repeatedly; use a negative count to repeat forever.\n" + " @G--" GTEST_FLAG_PREFIX_ + "shuffle@D\n" + " Randomize tests' orders on every iteration.\n" + " @G--" GTEST_FLAG_PREFIX_ + "random_seed=@Y[NUMBER]@D\n" + " Random number seed to use for shuffling test orders (between 1 and\n" + " 99999, or 0 to use a seed based on the current time).\n" + " @G--" GTEST_FLAG_PREFIX_ + "recreate_environments_when_repeating@D\n" + " Sets up and tears down the global test environment on each repeat\n" + " of the test.\n" + "\n" + "Test Output:\n" + " @G--" GTEST_FLAG_PREFIX_ + "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" + " Enable/disable colored output. The default is @Gauto@D.\n" + " @G--" GTEST_FLAG_PREFIX_ + "brief=1@D\n" + " Only print test failures.\n" + " @G--" GTEST_FLAG_PREFIX_ + "print_time=0@D\n" + " Don't print the elapsed time of each test.\n" + " @G--" GTEST_FLAG_PREFIX_ + "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ + "@Y|@G:@YFILE_PATH]@D\n" + " Generate a JSON or XML report in the given directory or with the " + "given\n" + " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" +#if GTEST_CAN_STREAM_RESULTS_ + " @G--" GTEST_FLAG_PREFIX_ + "stream_result_to=@YHOST@G:@YPORT@D\n" + " Stream test results to the given server.\n" +#endif // GTEST_CAN_STREAM_RESULTS_ + "\n" + "Assertion Behavior:\n" +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS + " @G--" GTEST_FLAG_PREFIX_ + "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" + " Set the default death test style.\n" +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS + " @G--" GTEST_FLAG_PREFIX_ + "break_on_failure@D\n" + " Turn assertion failures into debugger break-points.\n" + " @G--" GTEST_FLAG_PREFIX_ + "throw_on_failure@D\n" + " Turn assertion failures into C++ exceptions for use by an external\n" + " test framework.\n" + " @G--" GTEST_FLAG_PREFIX_ + "catch_exceptions=0@D\n" + " Do not report exceptions as test failures. Instead, allow them\n" + " to crash the program or throw a pop-up (on Windows).\n" + "\n" + "Except for @G--" GTEST_FLAG_PREFIX_ + "list_tests@D, you can alternatively set " + "the corresponding\n" + "environment variable of a flag (all letters in upper-case). For example, " + "to\n" + "disable colored text output, you can either specify " + "@G--" GTEST_FLAG_PREFIX_ + "color=no@D or set\n" + "the @G" GTEST_FLAG_PREFIX_UPPER_ + "COLOR@D environment variable to @Gno@D.\n" + "\n" + "For more information, please read the " GTEST_NAME_ + " documentation at\n" + "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ + "\n" + "(not one in your own code or tests), please report it to\n" + "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + +static bool ParseGoogleTestFlag(const char* const arg) { +#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \ + do { \ + auto value = GTEST_FLAG_GET(flag_name); \ + if (ParseFlag(arg, #flag_name, &value)) { \ + GTEST_FLAG_SET(flag_name, value); \ + return true; \ + } \ + } while (false) + + GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests); + GTEST_INTERNAL_PARSE_FLAG(break_on_failure); + GTEST_INTERNAL_PARSE_FLAG(catch_exceptions); + GTEST_INTERNAL_PARSE_FLAG(color); + GTEST_INTERNAL_PARSE_FLAG(death_test_style); + GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork); + GTEST_INTERNAL_PARSE_FLAG(fail_fast); + GTEST_INTERNAL_PARSE_FLAG(filter); + GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test); + GTEST_INTERNAL_PARSE_FLAG(list_tests); + GTEST_INTERNAL_PARSE_FLAG(output); + GTEST_INTERNAL_PARSE_FLAG(brief); + GTEST_INTERNAL_PARSE_FLAG(print_time); + GTEST_INTERNAL_PARSE_FLAG(print_utf8); + GTEST_INTERNAL_PARSE_FLAG(random_seed); + GTEST_INTERNAL_PARSE_FLAG(repeat); + GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating); + GTEST_INTERNAL_PARSE_FLAG(shuffle); + GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth); + GTEST_INTERNAL_PARSE_FLAG(stream_result_to); + GTEST_INTERNAL_PARSE_FLAG(throw_on_failure); + return false; +} + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +static void LoadFlagsFromFile(const std::string& path) { + FILE* flagfile = posix::FOpen(path.c_str(), "r"); + if (!flagfile) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile) + << "\""; + } + std::string contents(ReadEntireFile(flagfile)); + posix::FClose(flagfile); + std::vector lines; + SplitString(contents, '\n', &lines); + for (size_t i = 0; i < lines.size(); ++i) { + if (lines[i].empty()) continue; + if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true; + } +} +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. The type parameter CharType can be +// instantiated to either char or wchar_t. +template +void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + std::string flagfile_value; + for (int i = 1; i < *argc; i++) { + const std::string arg_string = StreamableToString(argv[i]); + const char* const arg = arg_string.c_str(); + + using internal::ParseFlag; + + bool remove_flag = false; + if (ParseGoogleTestFlag(arg)) { + remove_flag = true; +#if GTEST_USE_OWN_FLAGFILE_FLAG_ + } else if (ParseFlag(arg, "flagfile", &flagfile_value)) { + GTEST_FLAG_SET(flagfile, flagfile_value); + LoadFlagsFromFile(flagfile_value); + remove_flag = true; +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) { + // Both help flag and unrecognized Google Test flags (excluding + // internal ones) trigger help display. + g_help_flag = true; + } + + if (remove_flag) { + // Shift the remainder of the argv list left by one. Note + // that argv has (*argc + 1) elements, the last one always being + // NULL. The following loop moves the trailing NULL element as + // well. + for (int j = i; j != *argc; j++) { + argv[j] = argv[j + 1]; + } + + // Decrements the argument count. + (*argc)--; + + // We also need to decrement the iterator as we just removed + // an element. + i--; + } + } + + if (g_help_flag) { + // We print the help here instead of in RUN_ALL_TESTS(), as the + // latter may not be called at all if the user is using Google + // Test with another testing framework. + PrintColorEncoded(kColorEncodedHelpMessage); + } +} + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +void ParseGoogleTestFlagsOnly(int* argc, char** argv) { +#if GTEST_HAS_ABSL + if (*argc > 0) { + // absl::ParseCommandLine() requires *argc > 0. + auto positional_args = absl::flags_internal::ParseCommandLineImpl( + *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs, + absl::flags_internal::UsageFlagsAction::kHandleUsage, + absl::flags_internal::OnUndefinedFlag::kReportUndefined); + // Any command-line positional arguments not part of any command-line flag + // (or arguments to a flag) are copied back out to argv, with the program + // invocation name at position 0, and argc is resized. This includes + // positional arguments after the flag-terminating delimiter '--'. + // See https://abseil.io/docs/cpp/guides/flags. + std::copy(positional_args.begin(), positional_args.end(), argv); + if (static_cast(positional_args.size()) < *argc) { + argv[positional_args.size()] = nullptr; + *argc = static_cast(positional_args.size()); + } + } +#else + ParseGoogleTestFlagsOnlyImpl(argc, argv); +#endif + + // Fix the value of *_NSGetArgc() on macOS, but if and only if + // *_NSGetArgv() == argv + // Only applicable to char** version of argv +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS + if (*_NSGetArgv() == argv) { + *_NSGetArgc() = *argc; + } +#endif +#endif +} +void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} + +// The internal implementation of InitGoogleTest(). +// +// The type parameter CharType can be instantiated to either char or +// wchar_t. +template +void InitGoogleTestImpl(int* argc, CharType** argv) { + // We don't want to run the initialization code twice. + if (GTestIsInitialized()) return; + + if (*argc <= 0) return; + + g_argvs.clear(); + for (int i = 0; i != *argc; i++) { + g_argvs.push_back(StreamableToString(argv[i])); + } + +#if GTEST_HAS_ABSL + absl::InitializeSymbolizer(g_argvs[0].c_str()); + + // When using the Abseil Flags library, set the program usage message to the + // help message, but remove the color-encoding from the message first. + absl::SetProgramUsageMessage(absl::StrReplaceAll( + kColorEncodedHelpMessage, + {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}})); +#endif // GTEST_HAS_ABSL + + ParseGoogleTestFlagsOnly(argc, argv); + GetUnitTestImpl()->PostFlagParsingInit(); +} + +} // namespace internal + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. +// +// Calling the function for the second time has no user-visible effect. +void InitGoogleTest(int* argc, char** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +void InitGoogleTest(int* argc, wchar_t** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +void InitGoogleTest() { + // Since Arduino doesn't have a command line, fake out the argc/argv arguments + int argc = 1; + const auto arg0 = "dummy"; + char* argv0 = const_cast(arg0); + char** argv = &argv0; + +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(&argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) +// Return value of first environment variable that is set and contains +// a non-empty string. If there are none, return the "fallback" string. +// Since we like the temporary directory to have a directory separator suffix, +// add it if not provided in the environment variable value. +static std::string GetTempDirFromEnv( + std::initializer_list environment_variables, + const char* fallback, char separator) { + for (const char* variable_name : environment_variables) { + const char* value = internal::posix::GetEnv(variable_name); + if (value != nullptr && value[0] != '\0') { + if (value[strlen(value) - 1] != separator) { + return std::string(value).append(1, separator); + } + return value; + } + } + return fallback; +} +#endif + +std::string TempDir() { +#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) + return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); +#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\'); +#elif GTEST_OS_LINUX_ANDROID + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/'); +#else + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/'); +#endif +} + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +void ScopedTrace::PushTrace(const char* file, int line, std::string message) { + internal::TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message.swap(message); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc new file mode 100644 index 0000000000..44976375c9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc @@ -0,0 +1,53 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include "gtest/gtest.h" + +#if GTEST_OS_ESP8266 || GTEST_OS_ESP32 +#if GTEST_OS_ESP8266 +extern "C" { +#endif +void setup() { testing::InitGoogleTest(); } + +void loop() { RUN_ALL_TESTS(); } + +#if GTEST_OS_ESP8266 +} +#endif + +#else + +GTEST_API_ int main(int argc, char **argv) { + printf("Running main() from %s\n", __FILE__); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/LICENSE b/media/libvpx/libvpx/third_party/libyuv/LICENSE new file mode 100644 index 0000000000..c911747a6b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/media/libvpx/libvpx/third_party/libyuv/README.libvpx b/media/libvpx/libvpx/third_party/libyuv/README.libvpx new file mode 100644 index 0000000000..9519dc4bee --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/README.libvpx @@ -0,0 +1,23 @@ +Name: libyuv +URL: https://chromium.googlesource.com/libyuv/libyuv +Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in the multiple resolution encoder +example which down-samples the original input video (f.g. 1280x720) a number of +times in order to encode multiple resolution bit streams. + +Local Modifications: +Disable ARGBToRGB24Row_AVX512VBMI due to build failure on Mac. +rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h +mv libyuv/include tmp/ +mv libyuv/source tmp/ +mv libyuv/LICENSE tmp/ +rm -rf libyuv + +mv tmp/* third_party/libyuv/ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 0000000000..01d9dfc773 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +#include // for uintptr_t on x86 +typedef unsigned __int64 uint64_t; +typedef __int64 int64_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef unsigned short uint16_t; +typedef short int16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +#else +#include // for uintptr_t and C99 types +#endif // defined(_MSC_VER) && (_MSC_VER < 1600) +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint8_t uint8; +typedef int8_t int8; +#endif // INT_TYPES_DEFINED + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__((visibility("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API + +// TODO(fbarchard): Remove bool macros. +#define LIBYUV_BOOL int +#define LIBYUV_FALSE 0 +#define LIBYUV_TRUE 1 + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h new file mode 100644 index 0000000000..3353ad71c6 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h @@ -0,0 +1,111 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); + +// Hamming Distance +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count); + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height); + +// Sum Square Error - used to compute Mean Square Error or PSNR. +LIBYUV_API +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count); + +LIBYUV_API +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); + +LIBYUV_API +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); + +LIBYUV_API +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); + +LIBYUV_API +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); + +LIBYUV_API +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h new file mode 100644 index 0000000000..d12ef24f79 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h @@ -0,0 +1,406 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" + +#include "libyuv/rotate.h" // For enum RotationMode. + +// TODO(fbarchard): fix WebRTC source to include following libyuv headers: +#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 +#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 +#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I420 to I420. +#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I010 to I010 +#define I010ToI010 I010Copy +#define H010ToH010 I010Copy +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert 10 bit YUV to 8 bit +#define H010ToH420 I010ToI420 +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define J400ToJ420 I400ToI420 + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert Android420 to I420. +LIBYUV_API +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGB big endian (rgb in memory) to I420. +LIBYUV_API +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height); +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "fourcc" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 0000000000..ab772b6c32 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,687 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" + +#include "libyuv/rotate.h" // For enum RotationMode. + +// TODO(fbarchard): This set of functions should exactly match convert.h +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Duplicate prototype for function in convert_from.h for remoting. +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I420 with Alpha to preattenuated ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate); + +// Convert I420 with Alpha to preattenuated ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate); + +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert J400 (jpeg grey) to ARGB. +LIBYUV_API +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Alias. +#define YToARGB I400ToARGB + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to ABGR. +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert J420 to ABGR. +LIBYUV_API +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H422 to ABGR. +LIBYUV_API +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Aliases +#define AB30ToARGB AR30ToABGR +#define AB30ToABGR AR30ToARGB +#define AB30ToAR30 AR30ToAB30 + +// Convert AR30 To ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR30 To ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert AR30 To AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height); +#endif + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "sample_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "fourcc" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h new file mode 100644 index 0000000000..5cd8a4bfc0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,342 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// Convert 8 bit YUV to 10 bit. +#define H420ToH010 I420ToI010 +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. +LIBYUV_API +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); + +LIBYUV_API +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +LIBYUV_API +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); + +LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. + +LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); + +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); + +LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I420 to specified format. +// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. +LIBYUV_API +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 0000000000..05c815a093 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,287 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Aliases +#define ARGBToAB30 ABGRToAR30 +#define ABGRToAB30 ARGBToAR30 + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. +// TODO(fbarchard): Consider pointer to 2d array for dither4x4. +// const uint8_t(*dither)[4][4]; +LIBYUV_API +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert ARGB to J422. +LIBYUV_API +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) +LIBYUV_API +int ARGBToG(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_g, + int dst_stride_g, + int width, + int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 0000000000..0229cb5e73 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Internal flag to indicate cpuid requires initialization. +static const int kCpuInitialized = 0x1; + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; // unused at this time. +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; + +// These flags are only valid on MIPS processors. +static const int kCpuHasMIPS = 0x200000; +static const int kCpuHasMSA = 0x400000; + +// Optional init function. TestCpuFlag does an auto-init. +// Returns cpu_info flags. +LIBYUV_API +int InitCpuFlags(void); + +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// Returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); +#else + int cpu_info = cpu_info_; +#endif + return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; +} + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(1) to disable all cpu specific optimizations. +// MaskCpuFlags(0) to reset state so next call will auto init. +// Returns cpu_info flags. +LIBYUV_API +int MaskCpuFlags(int enable_flags); + +// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| +// should be a valid combination of the kCpuHas constants above and include +// kCpuInitialized. Use this method when running in a sandboxed process where +// the detection code might fail (as it might access /proc/cpuinfo). In such +// cases the cpu_info can be obtained from a non sandboxed process by calling +// InitCpuFlags() and passed to the sandboxed process (via command line +// parameters, IPC...) which can then call this method to initialize the CPU +// flags. +// Notes: +// - when specifying 0 for |cpu_flags|, the auto initialization is enabled +// again. +// - enabling CPU features that are not supported by the CPU will result in +// undefined behavior. +// TODO(fbarchard): consider writing a helper function that translates from +// other library CPU info to libyuv CPU info and add a .md doc that explains +// CPU detection. +static __inline void SetCpuFlags(int cpu_flags) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_flags; +#endif +} + +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(int info_eax, int info_ecx, int* cpu_info); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h new file mode 100644 index 0000000000..bba0e8aeda --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h @@ -0,0 +1,233 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ +#define INCLUDE_LIBYUV_MACROS_MSA_H_ + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include +#include + +#if (__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#if (__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint64_t val_m = (val); \ + asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ + }) +#else // !(__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // !(__mips == 64) +#else // !(__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // (__mips_isa_rev >= 6) + +// TODO(fbarchard): Consider removing __VAR_ARGS versions. +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) + +/* Description : Load two vectors with 16 'byte' sized elements + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) + +/* Description : Store two vectors with stride each having 16 'byte' sized + elements + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) + +// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) + +#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ + +#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 0000000000..275f8d4c18 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,195 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32_t kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8_t* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. + // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, + int* subsample_y, + int number_of_components); + + private: + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8_t** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8_t*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. + uint8_t** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h new file mode 100644 index 0000000000..91137baba2 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h @@ -0,0 +1,847 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Move cpu macros to row.h +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif + +// Copy a plane of data. +LIBYUV_API +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height); + +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 1024 for 10 bits + int width, + int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value); + +// Split interleaved UV plane into separate U and V planes. +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Merge separate U and V planes into one interleaved UV plane. +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Split interleaved RGB plane into separate R, G and B planes. +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// Merge separate R, G and B planes into one interleaved RGB plane. +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +#define J400ToJ400 I400ToI400 + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define J420ToJ400 I420ToI400 +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Alias +#define RGB24ToRAW RAWToRGB24 + +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height); + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height); + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Copy Alpha channel of ARGB to alpha of ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Extract the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + +// Copy Y channel to Alpha of ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Source is pre-multiplied by alpha using ARGBAttenuate. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Alpha Blend plane and store to destination. +// Source is not pre-multiplied by alpha. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alpha Blend YUV images and store to destination. +// Source is not pre-multiplied by alpha. +// Alpha is full width x height and subsampled to half size to apply to UV. +LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert I422 to UYVY. +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); + +// Convert unattentuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value); + +// Interpolate between two images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 +// and 255 means 1% src0 and 99% src1. +LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation); + +// Interpolate between two ARGB images using specified amount of interpolation +// Internally calls InterpolatePlane with width * 4 (bpp). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation); + +// Interpolate between two YUV images using specified amount of interpolation +// Internally calls InterpolatePlane on each plane where the U and V planes +// are half width and half height. +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation); + +// Row function for copying pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); +// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. +LIBYUV_API +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height); + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. +LIBYUV_API +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h new file mode 100644 index 0000000000..76b692be8b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h @@ -0,0 +1,164 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. Deprecated. +LIBYUV_API +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +LIBYUV_API +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. +LIBYUV_API +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 0000000000..20432949ab --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,37 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h new file mode 100644 index 0000000000..5edc0fcf13 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h @@ -0,0 +1,194 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// The following are available for Visual C and clangcl 32 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_TRANSPOSEWX8_NEON +#define HAS_TRANSPOSEUVWX8_NEON +#endif + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_TRANSPOSEWX16_MSA +#define HAS_TRANSPOSEUVWX16_MSA +#endif + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void TransposeWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_Any_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h new file mode 100644 index 0000000000..65ef448b8c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h @@ -0,0 +1,3471 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ +#define INCLUDE_LIBYUV_ROW_H_ + +#include // For malloc. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// clang >= 3.5.0 required for Arm64. +#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) +#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) +#define LIBYUV_DISABLE_NEON +#endif // clang >= 3.5 +#endif // __clang__ + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// clang >= 6.0.0 required for AVX512. +// TODO(fbarchard): fix xcode 9 ios b/789. +#if 0 // Build fails in libvpx on Mac +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__) +#define CLANG_HAS_AVX512 1 +#endif // clang >= 7 +#endif // __clang__ +#endif // 0 + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Conversions: +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 +#define HAS_ARGBSETROW_X86 +#define HAS_ARGBSHUFFLEROW_SSSE3 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565DITHERROW_SSE2 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_H422TOARGBROW_SSSE3 +#define HAS_HALFFLOATROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_J400TOARGBROW_SSE2 +#define HAS_J422TOARGBROW_SSSE3 +#define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB24ROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB24ROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTORGB24ROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#define HAS_SETROW_ERMS +#define HAS_SETROW_X86 +#define HAS_SPLITUVROW_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 + +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_BLENDPLANEROW_SSSE3 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 + +// The following functions fail on gcc/clang 32 bit with fpic and framepointer. +// caveat: clangcl uses row_win.cc which works. +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_SSSE3 +#endif +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBEXTRACTALPHAROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTOUVJROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_COPYROW_AVX +#define HAS_H422TOARGBROW_AVX2 +#define HAS_HALFFLOATROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast +#define HAS_I400TOARGBROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_J422TOARGBROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB24ROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB24ROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOARGBROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOARGBROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 + +// Effects: +#define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 +#define HAS_BLENDPLANEROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif +#endif + +// The following are available for AVX2 Visual C and clangcl 32 bit: +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 +#define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#endif + +// The following are also available on x64 Visual C. +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \ + (!defined(__clang__) || defined(__SSSE3__)) +#define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#endif + +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_CONVERT16TO8ROW_SSSE3 +#define HAS_CONVERT8TO16ROW_SSE2 +// I210 is for H010. 2 = 422. I for 601 vs H for 709. +#define HAS_I210TOAR30ROW_SSSE3 +#define HAS_I210TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 +#define HAS_MERGERGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 +#endif + +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTORAWROW_AVX2 +#define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_CONVERT16TO8ROW_AVX2 +#define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_I210TOAR30ROW_AVX2 +#define HAS_I210TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#endif + +// The following are available for AVX512 clang x86 platforms: +// TODO(fbarchard): Port to GCC and Visual C +// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX512)) +#define HAS_ARGBTORGB24ROW_AVX512VBMI +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON +#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON +#define HAS_ARGBSETROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_BYTETOFLOATROW_NEON +#define HAS_COPYROW_NEON +#define HAS_HALFFLOATROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_I422ALPHATOARGBROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_J400TOARGBROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB24ROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB24ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTORGB24ROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON +#define HAS_SPLITRGBROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON + +// Effects: +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_INTERPOLATEROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELYROW_NEON +#endif + +// The following are available on AArch64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALESUMSAMPLES_NEON +#endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_ARGB1555TOYROW_MSA +#define HAS_ARGB4444TOARGBROW_MSA +#define HAS_ARGBADDROW_MSA +#define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBBLENDROW_MSA +#define HAS_ARGBCOLORMATRIXROW_MSA +#define HAS_ARGBEXTRACTALPHAROW_MSA +#define HAS_ARGBGRAYROW_MSA +#define HAS_ARGBMIRRORROW_MSA +#define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBQUANTIZEROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGBSETROW_MSA +#define HAS_ARGBSHADEROW_MSA +#define HAS_ARGBSHUFFLEROW_MSA +#define HAS_ARGBSUBTRACTROW_MSA +#define HAS_ARGBTOARGB1555ROW_MSA +#define HAS_ARGBTOARGB4444ROW_MSA +#define HAS_ARGBTORAWROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBTORGB565ROW_MSA +#define HAS_ARGBTOUV444ROW_MSA +#define HAS_ARGBTOUVJROW_MSA +#define HAS_ARGBTOUVROW_MSA +#define HAS_ARGBTOYJROW_MSA +#define HAS_ARGBTOYROW_MSA +#define HAS_BGRATOUVROW_MSA +#define HAS_BGRATOYROW_MSA +#define HAS_HALFFLOATROW_MSA +#define HAS_I400TOARGBROW_MSA +#define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGBROW_MSA +#define HAS_I422TORGB24ROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_I444TOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA +#define HAS_J400TOARGBROW_MSA +#define HAS_MERGEUVROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA +#define HAS_RAWTOARGBROW_MSA +#define HAS_RAWTORGB24ROW_MSA +#define HAS_RAWTOUVROW_MSA +#define HAS_RAWTOYROW_MSA +#define HAS_RGB24TOARGBROW_MSA +#define HAS_RGB24TOUVROW_MSA +#define HAS_RGB24TOYROW_MSA +#define HAS_RGB565TOARGBROW_MSA +#define HAS_RGB565TOUVROW_MSA +#define HAS_RGB565TOYROW_MSA +#define HAS_RGBATOUVROW_MSA +#define HAS_RGBATOYROW_MSA +#define HAS_SETROW_MSA +#define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXROW_MSA +#define HAS_SOBELXYROW_MSA +#define HAS_SOBELYROW_MSA +#define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) +#if defined(VISUALC_HAS_AVX2) +#define SIMD_ALIGNED(var) __declspec(align(32)) var +#else +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#endif +typedef __declspec(align(16)) int16_t vec16[8]; +typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) int8_t vec8[16]; +typedef __declspec(align(16)) uint16_t uvec16[8]; +typedef __declspec(align(16)) uint32_t uvec32[4]; +typedef __declspec(align(16)) uint8_t uvec8[16]; +typedef __declspec(align(32)) int16_t lvec16[16]; +typedef __declspec(align(32)) int32_t lvec32[8]; +typedef __declspec(align(32)) int8_t lvec8[32]; +typedef __declspec(align(32)) uint16_t ulvec16[16]; +typedef __declspec(align(32)) uint32_t ulvec32[8]; +typedef __declspec(align(32)) uint8_t ulvec8[32]; +#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) +#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif +typedef int16_t __attribute__((vector_size(16))) vec16; +typedef int32_t __attribute__((vector_size(16))) vec32; +typedef int8_t __attribute__((vector_size(16))) vec8; +typedef uint16_t __attribute__((vector_size(16))) uvec16; +typedef uint32_t __attribute__((vector_size(16))) uvec32; +typedef uint8_t __attribute__((vector_size(16))) uvec8; +typedef int16_t __attribute__((vector_size(32))) lvec16; +typedef int32_t __attribute__((vector_size(32))) lvec32; +typedef int8_t __attribute__((vector_size(32))) lvec8; +typedef uint16_t __attribute__((vector_size(32))) ulvec16; +typedef uint32_t __attribute__((vector_size(32))) ulvec32; +typedef uint8_t __attribute__((vector_size(32))) ulvec8; +#else +#define SIMD_ALIGNED(var) var +typedef int16_t vec16[8]; +typedef int32_t vec32[4]; +typedef int8_t vec8[16]; +typedef uint16_t uvec16[8]; +typedef uint32_t uvec32[4]; +typedef uint8_t uvec8[16]; +typedef int16_t lvec16[16]; +typedef int32_t lvec32[8]; +typedef int8_t lvec8[32]; +typedef uint16_t ulvec16[16]; +typedef uint32_t ulvec32[8]; +typedef uint8_t ulvec8[32]; +#endif + +#if defined(__aarch64__) +// This struct is for Arm64 color conversion. +struct YuvConstants { + uvec16 kUVToRB; + uvec16 kUVToRB2; + uvec16 kUVToG; + uvec16 kUVToG2; + vec16 kUVBiasBGR; + vec32 kYToRgb; +}; +#elif defined(__arm__) +// This struct is for ArmV7 color conversion. +struct YuvConstants { + uvec8 kUVToRB; + uvec8 kUVToG; + vec16 kUVBiasBGR; + vec32 kYToRgb; +}; +#else +// This struct is for Intel color conversion. +struct YuvConstants { + int8_t kUVToB[32]; + int8_t kUVToG[32]; + int8_t kUVToR[32]; + int16_t kUVBiasB[16]; + int16_t kUVBiasG[16]; + int16_t kUVBiasR[16]; + int16_t kYToRgb[16]; +}; + +// Offsets into YuvConstants structure +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 +#define KUVBIASB 96 +#define KUVBIASG 128 +#define KUVBIASR 160 +#define KYTORGB 192 +#endif + +// Conversion matrix for YUV to RGB +extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601 +extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg +extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709 + +// Conversion matrix for YVU to BGR +extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 +extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg +extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) + +#define align_buffer_64(var, size) \ + uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. +#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN +#endif + +// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be +// measured and then run with iaca -64 libyuv_unittest. +// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within +// inline assembly blocks. +// example of iaca: +// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest + +#if defined(__x86_64__) || defined(__i386__) + +#define IACA_ASM_START \ + ".byte 0x0F, 0x0B\n" \ + " movl $111, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" + +#define IACA_ASM_END \ + " movl $222, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" \ + ".byte 0x0F, 0x0B\n" + +#define IACA_SSC_MARK(MARK_ID) \ + __asm__ __volatile__("\n\t movl $" #MARK_ID \ + ", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : \ + : \ + : "memory"); + +#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); + +#else /* Visual C */ +#define IACA_UD_BYTES \ + { __asm _emit 0x0F __asm _emit 0x0B } + +#define IACA_SSC_MARK(x) \ + { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); +#endif + +#define IACA_START \ + { \ + IACA_UD_BYTES \ + IACA_SSC_MARK(111) \ + } +#define IACA_END \ + { \ + IACA_SSC_MARK(222) \ + IACA_UD_BYTES \ + } + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); + +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void MergeRGBRow_Any_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); + +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, /* 64 for 10 bit */ + int width); +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width); + +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); + +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); +void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); + +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); +void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); + +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void SetRow_C(uint8_t* dst, uint8_t v8, int width); +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); +void SetRow_X86(uint8_t* dst, uint8_t v8, int width); +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); +void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); +void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); + +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); +void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); + +// ARGBShufflers for BGRAToARGB etc. +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); + +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); + +void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); + +void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width); +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width); +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width); +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + +void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); + +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); + +void I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); + +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width); +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width); +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width); +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +// ARGB preattenuated alpha blend. +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); + +// Unattenuated planar alpha blend. +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); +void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); +void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); + +// ARGB multiply images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +// ARGB add images. +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + +void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + +void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); + +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); + +// Effects related row functions. +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +// Inverse table for unattenuate, shared by C and SSE2. +extern const uint32_t fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); + +void ARGBSepiaRow_C(uint8_t* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); + +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); + +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); + +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); + +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); + +// Used for blur. +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); + +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); + +LIBYUV_API +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); +void InterpolateRow_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); + +// Sobel images. +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); + +// Scale and convert to half float. +void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width); +void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, + float* dst_ptr, + float param, + int width); + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff); + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width); +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width); +void ScaleSamples_C(const float* src, float* dst, float scale, int width); +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 0000000000..b937d348ca --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,131 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. +} FilterModeEnum; + +// Scale a YUV plane. +LIBYUV_API +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); + +LIBYUV_API +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +LIBYUV_API +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + +LIBYUV_API +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +// Legacy API. Deprecated. +LIBYUV_API +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, + LIBYUV_BOOL interpolate); + +// For testing, allow disabling of specialized scalers. +LIBYUV_API +void SetUseReferenceImpl(LIBYUV_BOOL use); +#endif // __cplusplus + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h new file mode 100644 index 0000000000..7641f18e34 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h @@ -0,0 +1,76 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering); + +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 0000000000..7194ba09f8 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,944 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEROWDOWN2_SSSE3 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEROWDOWN4_SSSE3 +#define HAS_SCALEADDROW_SSE2 +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SCALEARGBCOLS_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEFILTERCOLS_NEON +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEARGBFILTERCOLS_NEON +#endif + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_SCALEADDROW_MSA +#define HAS_SCALEARGBCOLS_MSA +#define HAS_SCALEARGBFILTERCOLS_MSA +#define HAS_SCALEARGBROWDOWN2_MSA +#define HAS_SCALEARGBROWDOWNEVEN_MSA +#define HAS_SCALEFILTERCOLS_MSA +#define HAS_SCALEROWDOWN2_MSA +#define HAS_SCALEROWDOWN34_MSA +#define HAS_SCALEROWDOWN38_MSA +#define HAS_SCALEROWDOWN4_MSA +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); + +void ScalePlaneVertical_16(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int* x, + int* y, + int* dx, + int* dy); + +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int, + int); +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int, + int); +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int, + int); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); + +// Specialized scalers for x86. +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); + +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +// ARGB Column functions +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +// ARGB Row functions +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +// 32 -> 12 +void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); + +void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h new file mode 100644 index 0000000000..7022785d8c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 1711 + +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h new file mode 100644 index 0000000000..bcef378b5a --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h @@ -0,0 +1,188 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. + +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#ifdef __cplusplus +#define FOURCC(a, b, c, d) \ + ((static_cast(a)) | (static_cast(b) << 8) | \ + (static_cast(c) << 16) | (static_cast(d) << 24)) +#else +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ +#endif + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +// FourCC codes grouped according to implementation efficiency. +// Primary formats should convert in 1 efficient step. +// Secondary formats are converted in 2 steps. +// Auxilliary formats call primary converters. +enum FourCC { + // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb + + // 1 Secondary YUV format: row biplanar. + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + + // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 7 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = FOURCC('J', '4', '2', '0'), + FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc + FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_H420 = 12, + FOURCC_BPP_H010 = 24, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + FOURCC_BPP_CM32 = 32, + FOURCC_BPP_CM24 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. +LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc new file mode 100644 index 0000000000..50e3abd055 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc @@ -0,0 +1,429 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include +#include +#ifdef _OPENMP +#include +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +LIBYUV_API +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64_t)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)count & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)count & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. + +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64_t sse = 0; + int i; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked. + SumSquareError = SumSquareError_AVX2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; + int h; + // Coalesce rows. + if (stride_a == width && stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { + double psnr; + if (sse > 0) { + double mse = (double)count / (double)sse; + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) { + psnr = kMaxPsnr; + } + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64_t count = 64; + // scale the constants by number of pixels + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; + + const int64_t sum_a_x_sum_b = sum_a * sum_b; + + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; + + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) { + return DBL_MAX; + } + return ssim_n * 1.0 / ssim_d; + } +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +LIBYUV_API +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; + + // sample point start with each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc new file mode 100644 index 0000000000..d4b170ad98 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc @@ -0,0 +1,104 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. +uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32_t)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc new file mode 100644 index 0000000000..676527c1b1 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc @@ -0,0 +1,360 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + return sse; +} + +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc new file mode 100644 index 0000000000..0b807d37be --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc new file mode 100644 index 0000000000..2a2181e0cb --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc @@ -0,0 +1,96 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator + + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc new file mode 100644 index 0000000000..6e8f672ab7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc @@ -0,0 +1,90 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" + + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" + + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc new file mode 100644 index 0000000000..d57d3d9d1c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc @@ -0,0 +1,241 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +#if defined(_MSC_VER) +#include // For __popcnt +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} + +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + wloop: + movdqu xmm1, [eax] + lea eax, [eax + 16] + movdqu xmm2, [edx] + lea edx, [edx + 16] + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + sub ecx, 16 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. + vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, xmmword ptr kHash16x33 + + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + movdqa xmm5, xmmword ptr kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld xmm3, xmm5 + movdqa xmm5, xmmword ptr kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld xmm4, xmm5 + movdqa xmm5, xmmword ptr kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld xmm2, xmm5 + movdqa xmm5, xmmword ptr kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + sub ecx, 16 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + vmovd xmm0, [esp + 12] // seed + + wloop: + vpmovzxbd xmm3, [eax] // src[0-3] + vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 + vpmovzxbd xmm4, [eax + 4] // src[4-7] + vpmulld xmm3, xmm3, xmmword ptr kHashMul0 + vpmovzxbd xmm2, [eax + 8] // src[8-11] + vpmulld xmm4, xmm4, xmmword ptr kHashMul1 + vpmovzxbd xmm1, [eax + 12] // src[12-15] + vpmulld xmm2, xmm2, xmmword ptr kHashMul2 + lea eax, [eax + 16] + vpmulld xmm1, xmm1, xmmword ptr kHashMul3 + vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm1, xmm1, xmm3 + vpshufd xmm2, xmm1, 0x0e // upper 2 dwords + vpaddd xmm1, xmm1,xmm2 + vpshufd xmm2, xmm1, 0x01 + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm0, xmm0, xmm1 + sub ecx, 16 + jg wloop + + vmovd eax, xmm0 // return hash + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc new file mode 100644 index 0000000000..375cc732c1 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc @@ -0,0 +1,1740 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping. +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. +LIBYUV_API +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Copy I010 with optional flipping. +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8_t* src, + int src_stride_0, + int src_stride_1, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int y; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8_t* src_y, + int src_stride_y0, + int src_stride_y1, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + } + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + // Split UV plane - NV12 / NV21 + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height); +} + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUVRow = UYVYToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = + BGRAToYRow_C; + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. +LIBYUV_API +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = + RGBAToYRow_C; + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + RGBAToYRow = RGBAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RAW to ARGB. +#else +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +// Neon version does direct RGB565 to YUV. +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from RGB565 to ARGB. +#else +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +// Neon version does direct ARGB1555 to YUV. +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } +// Other platforms do intermediate conversion from ARGB1555 to ARGB. +#else +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#endif + { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB4444 to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +// Neon version does direct ARGB4444 to YUV. +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB4444 to ARGB. +#else +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#endif + + { +#if !defined(HAS_ARGB4444TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst_u = *src_u; + ++dst_u; + src_u += src_pixel_stride_uv; + } +} + +// Convert Android420 to I420. +LIBYUV_API +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // Copy UV planes as is - I420 + if (src_pixel_stride_uv == 1) { + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; + // Split UV planes - NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, + halfwidth, halfheight); + return 0; + // Split UV planes - NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + } + + for (y = 0; y < halfheight; ++y) { + SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); + SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc new file mode 100644 index 0000000000..f2fe474f70 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc @@ -0,0 +1,2231 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); + return 0; +} + +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to ABGR. +LIBYUV_API +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to ARGB. +LIBYUV_API +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H420 to ABGR. +LIBYUV_API +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I422 to ARGB with matrix +static int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J422 to ABGR. +LIBYUV_API +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H422 to ARGB. +LIBYUV_API +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H422 to ABGR. +LIBYUV_API +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to ARGB with matrix +static int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert I420 with Alpha to preattenuated ARGB. +static int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 with Alpha to ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I420 with Alpha to ABGR. +LIBYUV_API +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I400ToARGBRow = I400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert J400 to ARGB. +LIBYUV_API +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = + J400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_J400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + J400ToARGBRow = J400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + J400ToARGBRow = J400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_J400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J400ToARGBRow = J400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J400ToARGBRow = J400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J400ToARGBRow = J400ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + J400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle table for converting BGRA to ARGB. +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; + +// Shuffle table for converting ABGR to ARGB. +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; + +// Shuffle table for converting RGBA to ARGB. +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); +} + +// Convert ARGB to BGRA (same as BGRAToARGB). +LIBYUV_API +int ARGBToBGRA(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); +} + +// Convert ABGR to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); +} + +// Convert ARGB to ABGR to (same as ABGRToARGB). +LIBYUV_API +int ARGBToABGR(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); +} + +// Convert RGBA to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); +} + +// Convert RGB24 to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; + } +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. +LIBYUV_API +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + if (!src_raw || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. + if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert NV12 to ARGB with matrix +static int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB with matrix +static int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV12 to ABGR. +// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix. +// To swap the UV use NV12 instead of NV21.LIBYUV_API +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix +static int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB24 with matrix +static int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix. +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, &kYuvI601Constants, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, + &kYuvI601Constants, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc new file mode 100644 index 0000000000..6fa253237e --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc @@ -0,0 +1,1429 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { + return -1; + } + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. +LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, + dst_uyvy + dst_stride_uyvy, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy * 2; + } + if (height & 1) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + } + return 0; +} + +// TODO(fbarchard): test negative height for invert. +LIBYUV_API +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { + return -1; + } + int halfwidth = (width + 1) / 2; + int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2; + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, + halfwidth, halfheight); + return 0; +} + +LIBYUV_API +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + +// Convert I422 to RGBA with matrix +static int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to RGB24 with matrix +static int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert I420 to RGB565 with dithering. +LIBYUV_API +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + case FOURCC_NV12: { + uint8_t* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + case FOURCC_NV21: { + uint8_t* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); + break; + } + // TODO(fbarchard): Add M420. + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + int halfheight = (height + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; + } + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; + } + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; + } else { + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; + } + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); + break; + } + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc new file mode 100644 index 0000000000..c8d91252e9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1617 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); + src_argb += src_stride_argb; + dst_yuy2 += dst_stride_yuy2; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; + + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); + src_argb += src_stride_argb; + dst_uyvy += dst_stride_uyvy; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; + +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + int y; + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, +}; + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). +LIBYUV_API +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. +LIBYUV_API +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J422. (JPeg full range I422). +LIBYUV_API +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc new file mode 100644 index 0000000000..ae3cc18cd2 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc @@ -0,0 +1,332 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8_t* y; + int y_stride; + uint8_t* u; + int u_stride; + uint8_t* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8_t* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. +LIBYUV_API +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc new file mode 100644 index 0000000000..67484522c0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc @@ -0,0 +1,291 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// H420ToARGB +// H422ToARGB +// I010ToARGB +// J400ToARGB +// J422ToARGB +// J444ToARGB + +LIBYUV_API +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8_t* src; + const uint8_t* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * 4 * abs_crop_height; + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. + } + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_J420: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_I422: + case FOURCC_YV16: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. + } + + if (need_buf) { + if (!r) { + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, + crop_width, abs_crop_height, rotation); + } + free(rotate_buffer); + } else if (rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height, rotation); + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc new file mode 100644 index 0000000000..df08309f9b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc @@ -0,0 +1,277 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/convert.h" + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8_t* src; + const uint8_t* src_uv; + const int abs_src_height = (src_height < 0) ? -src_height : src_height; + // TODO(nisse): Why allow crop_height < 0? + const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; + const int inv_crop_height = + (src_height < 0) ? -abs_crop_height : abs_crop_height; + + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { + return -1; + } + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination dst_y is same as source sample, + // also enable temporary buffer. + if (need_buf) { + int y_size = crop_width * abs_crop_height; + int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. + } + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + // TODO(fbarchard): Add AR30 and AB30 + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. + } + + if (need_buf) { + if (!r) { + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); + } + free(rotate_buffer); + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc new file mode 100644 index 0000000000..31e24b6739 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc @@ -0,0 +1,276 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/cpu_id.h" + +#if defined(_MSC_VER) +#include // For __cpuidex() +#endif +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ + defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) +#include // For _xgetbv() +#endif + +// For ArmCpuCaps() but unittested on all platforms +#include +#include + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// For functions that use the stack and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ + !defined(__clang__) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. +// Low level cpuid for X86. +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) +LIBYUV_API +void CpuId(int info_eax, int info_ecx, int* cpu_info) { +#if defined(_MSC_VER) +// Visual C version uses intrinsic or inline x86 assembly. +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) + __cpuidex(cpu_info, info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } +#else // Visual C but not x86 + if (info_ecx == 0) { + __cpuid(cpu_info, info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; + } +#endif +// GCC version uses inline x86 assembly. +#else // defined(_MSC_VER) + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), +#else + "cpuid \n" + : "=b"(info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) +} +#else // (defined(_M_IX86) || defined(_M_X64) ... +LIBYUV_API +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// For VS2010 and earlier emit can be used: +// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. +// __asm { +// xor ecx, ecx // xcr 0 +// xgetbv +// mov xcr0, eax +// } +// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. +// https://code.google.com/p/libyuv/issues/detail?id=529 +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", off) +#endif +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int GetXCR0() { + int xcr0 = 0; +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); +#endif // defined(__i386__) || defined(__x86_64__) + return xcr0; +} +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 +#endif // defined(_M_IX86) || defined(_M_X64) .. +// Return optimization to previous setting. +#if defined(_M_IX86) && (_MSC_VER < 1900) +#pragma optimize("g", on) +#endif + +// based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + // aarch64 uses asimd for Neon. + p = strstr(cpuinfo_line, " asimd"); + if (p) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + } + } + fclose(f); + return 0; +} + +static SAFEBUFFERS int GetCpuFlags(void) { + int cpu_info = 0; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); + + // AVX requires OS saves YMM registers. + if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave + ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); + + // Detect AVX512bw + if ((GetXCR0() & 0xe0) == 0xe0) { + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; + } + } +#endif +#if defined(__mips__) && defined(__linux__) +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); +#endif + cpu_info |= kCpuHasMIPS; +#endif +#if defined(__arm__) || defined(__aarch64__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#endif +#if defined(__aarch64__) + cpu_info = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. + cpu_info = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info |= kCpuHasARM; +#endif // __arm__ + cpu_info |= kCpuInitialized; + return cpu_info; +} + +// Note that use of this function is not thread safe. +LIBYUV_API +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc new file mode 100644 index 0000000000..eaf2530130 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc @@ -0,0 +1,573 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/mjpeg_decoder.h" + +#ifdef HAVE_JPEG +#include + +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +// Must be included before jpeglib. +#include +#define HAVE_SETJMP + +#if defined(_MSC_VER) +// disable warning 4324: structure was padded due to __declspec(align()) +#pragma warning(disable : 4324) +#endif + +#endif +struct FILE; // For jpeglib.h. + +// C++ build requires extern C for jpeg internals. +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#ifdef __cplusplus +} // extern "C" +#endif + +#include "libyuv/planar_functions.h" // For CopyPlane(). + +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +// Methods that are passed to jpeglib. +boolean fill_input_buffer(jpeg_decompress_struct* cinfo); +void init_source(jpeg_decompress_struct* cinfo); +void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT +void term_source(jpeg_decompress_struct* cinfo); +void ErrorHandler(jpeg_common_struct* cinfo); +void OutputHandler(jpeg_common_struct* cinfo); + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(LIBYUV_FALSE), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. + error_mgr_->base.error_exit = &ErrorHandler; + error_mgr_->base.output_message = &OutputHandler; +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return LIBYUV_FALSE; + } + + buf_.data = src; + buf_.len = static_cast(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return LIBYUV_FALSE; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8_t*[scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. + int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8_t[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = LIBYUV_TRUE; + } + } + return LIBYUV_TRUE; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. +int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. + for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. + for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = reinterpret_cast(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void term_source(j_decompress_ptr cinfo) { + (void)cinfo; // Nothing to do. +} + +#ifdef HAVE_SETJMP +void ErrorHandler(j_common_ptr cinfo) { +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. +#ifdef DEBUG + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); +// ERROR: Error in jpeglib: buf +#endif + + SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} + +// Suppress fprintf warnings. +void OutputHandler(j_common_ptr cinfo) { + (void)cinfo; +} + +#endif // HAVE_SETJMP + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. + DestroyOutputBuffers(); + + scanlines_ = new uint8_t**[num_outbufs]; + scanlines_sizes_ = new int[num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; + databuf_strides_ = new int[num_outbufs]; + + for (int i = 0; i < num_outbufs; ++i) { + scanlines_[i] = NULL; + scanlines_sizes_[i] = 0; + databuf_[i] = NULL; + databuf_strides_[i] = 0; + } + + num_outbufs_ = num_outbufs; + } +} + +void MJpegDecoder::DestroyOutputBuffers() { + for (int i = 0; i < num_outbufs_; ++i) { + delete[] scanlines_[i]; + delete[] databuf_[i]; + } + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; + scanlines_ = NULL; + databuf_ = NULL; + scanlines_sizes_ = NULL; + databuf_strides_ = NULL; + num_outbufs_ = 0; +} + +// JDCT_IFAST and do_block_smoothing improve performance substantially. +LIBYUV_BOOL MJpegDecoder::StartDecode() { + decompress_struct_->raw_data_out = TRUE; + decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default + decompress_struct_->dither_mode = JDITHER_NONE; + // Not applicable to 'raw': + decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); + // Only for buffered mode: + decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); + // Blocky but fast: + decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); + + if (!jpeg_start_decompress(decompress_struct_)) { + // ERROR: Couldn't start JPEG decompressor"; + return LIBYUV_FALSE; + } + return LIBYUV_TRUE; +} + +LIBYUV_BOOL MJpegDecoder::FinishDecode() { + // jpeglib considers it an error if we finish without decoding the whole + // image, so we call "abort" rather than "finish". + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { + for (int i = 0; i < num_outbufs_; ++i) { + uint8_t* data_i = data[i]; + for (int j = 0; j < scanlines_sizes_[i]; ++j) { + scanlines_[i][j] = data_i; + data_i += GetComponentStride(i); + } + } +} + +inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { + return (unsigned int)(GetImageScanlinesPerImcuRow()) == + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); +} + +// The helper function which recognizes the jpeg sub-sampling type. +JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( + int* subsample_x, + int* subsample_y, + int number_of_components) { + if (number_of_components == 3) { // Color images. + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { + return kJpegYuv420; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { + return kJpegYuv422; + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { + return kJpegYuv444; + } + } else if (number_of_components == 1) { // Grey-scale images. + if (subsample_x[0] == 1 && subsample_y[0] == 1) { + return kJpegYuv400; + } + } + return kJpegUnknown; +} + +} // namespace libyuv +#endif // HAVE_JPEG diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc new file mode 100644 index 0000000000..80c2cc0cb9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc @@ -0,0 +1,70 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/mjpeg_decoder.h" + +#include // For memchr. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Helper function to scan for EOI marker (0xff 0xd9). +static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) { + if (sample_size >= 2) { + const uint8_t* end = sample + sample_size - 1; + const uint8_t* it = sample; + while (it < end) { + // TODO(fbarchard): scan for 0xd9 instead. + it = (const uint8_t*)(memchr(it, 0xff, end - it)); + if (it == NULL) { + break; + } + if (it[1] == 0xd9) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + ++it; // Skip over current 0xff. + } + } + // ERROR: Invalid jpeg end code not found. Size sample_size + return LIBYUV_FALSE; +} + +// Helper function to validate the jpeg appears intact. +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) { + // Maximum size that ValidateJpeg will consider valid. + const size_t kMaxJpegSize = 0x7fffffffull; + const size_t kBackSearchSize = 1024; + if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) { + // ERROR: Invalid jpeg size: sample_size + return LIBYUV_FALSE; + } + if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker + // ERROR: Invalid jpeg initial start code + return LIBYUV_FALSE; + } + + // Look for the End Of Image (EOI) marker near the end of the buffer. + if (sample_size > kBackSearchSize) { + if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + // Reduce search size for forward search. + sample_size = sample_size - kBackSearchSize + 1; + } + // Step over SOI marker and scan for EOI. + return ScanEOI(sample + 2, sample_size - 2); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc new file mode 100644 index 0000000000..5eae3f763a --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc @@ -0,0 +1,3587 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/planar_functions.h" + +#include // for memset() + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/row.h" +#include "libyuv/scale_row.h" // for ScaleRowDown2 + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data +LIBYUV_API +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + // Nothing to do. + if (src_y == dst_y && src_stride_y == dst_stride_y) { + return; + } + +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. +LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Support function for NV12 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. + MergeUVRow(src_u, src_v, dst_uv, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. + if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + +// Mirror a plane of data. +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + ARGBMirrorRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + ARGBMirrorRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Get a blender that optimized for the CPU and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif + return ARGBBlendRow; +} + +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. +LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + // Half width/height for UV. + int halfwidth = (width + 1) >> 1; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || + !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Blend Y plane. + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + if (!IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_C; + } +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + ScaleRowDown2 = ScaleRowDown2Box_AVX2; + } + } + } +#endif + + // Row buffer for intermediate alpha pixels. + align_buffer_64(halfalpha, halfwidth); + for (y = 0; y < height; y += 2) { + // last row of odd height image use 1 row of alpha instead of 2. + if (y == (height - 1)) { + alpha_stride = 0; + } + // Subsample 2 rows of UV to half width and half height. + ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); + alpha += alpha_stride * 2; + BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); + BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); + src_u0 += src_stride_u0; + src_u1 += src_stride_u1; + dst_u += dst_stride_u; + src_v0 += src_stride_v0; + src_v1 += src_stride_v1; + dst_v += dst_stride_v; + } + free_aligned_buffer_64(halfalpha); + return 0; +} + +// Multiply 2 ARGB images and store to destination. +LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} +// Convert I422 to RGBA with matrix +static int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert RAW to RGB24. +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = + RAWToRGB24Row_C; + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgb24 = 0; + } +#if defined(HAS_RAWTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToRGB24Row = RAWToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToRGB24Row = RAWToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToRGB24Row(src_raw, dst_rgb24, width); + src_raw += src_stride_raw; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { + int y; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; + } +#endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, value, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value) { + int y; + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } + +#if defined(HAS_ARGBSETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSetRow = ARGBSetRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBSetRow = ARGBSetRow_X86; + } +#endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + ARGBSetRow(dst_argb, value, width); + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert unattentuated ARGB to preattenuated ARGB. +// An unattenutated ARGB alpha blend uses the formula +// p = a * f + (1 - a) * b +// where +// p is output pixel +// f is foreground pixel +// b is background pixel +// a is alpha value from foreground pixel +// An preattenutated ARGB alpha blend uses the formula +// p = f + (1 - a) * b +// where +// f is foreground pixel premultiplied by alpha + +LIBYUV_API +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBAttenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBUnattenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif + // TODO(fbarchard): Neon version. + + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#endif +#if defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } +#endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { + int y; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); +} + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + int width) = ARGBColorTableRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + int width) = RGBColorTableRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// ARGBQuantize is used to posterize art. +// e.g. rgb / qvalue * qvalue + qvalue / 2 +// But the low levels implement efficiently with 3 parameters, and could be +// used for other high level operations. +// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; +// where scale is 1 / interval_size as a fixed point value. +// The divide is replaces with a multiply by reciprocal fixed point multiply. +// Caveat - although SSE2 saturates, the C function does not and should be used +// with care if doing anything but quantization. +LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { + int y; + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_NEON; + } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { + int y; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif +#if defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 planes by specified amount (0 to 255). +LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { + int y; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; + } + // Coalesce rows. + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { + width *= height; + height = 1; + src_stride0 = src_stride1 = dst_stride = 0; + } +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, + width * 4, height, interpolation); +} + +// Interpolate 2 YUV images by specified amount (0 to 255). +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { + int y; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = + ARGBToYJRow_C; + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif + +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 31) & ~31; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; + ARGBToYJRow(src_argb, row_y0, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToYJRow(src_argb, row_y1, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to G. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToYJRow(src_argb, row_y2, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8_t* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } + } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } + } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height) { + int y; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { + int y; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Extract just the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + if (!src_argb || !dst_a || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_a = 0; + } + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; +#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBExtractAlphaRow(src_argb, dst_a, width); + src_argb += src_stride_argb; + dst_a += dst_stride_a; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// TODO(fbarchard): Consider if width is even Y channel can be split +// directly. A SplitUVRow_Odd function could copy the remaining chroma. + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + { + int awidth = halfwidth * 2; + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_yuy2, rows, rows + awidth, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_yuy2, rows, dst_uv, awidth); + memcpy(dst_y, rows, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, + int width) = SplitUVRow_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + { + int awidth = halfwidth * 2; + // row of y and 2 rows of uv + align_buffer_64(rows, awidth * 3); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_uyvy, rows + awidth, rows, awidth); + memcpy(dst_y, rows, width); + SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); + memcpy(dst_y + dst_stride_y, rows, width); + InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_uyvy, dst_uv, rows, awidth); + memcpy(dst_y, rows, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc new file mode 100644 index 0000000000..f2bed85b75 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc @@ -0,0 +1,514 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i = height; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif +#if defined(HAS_TRANSPOSEWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; + } + } +#endif + +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } +#endif + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + // Swap first and last row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i = height; +#if defined(HAS_TRANSPOSEUVWX16_MSA) + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; +#else + void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#endif +#if defined(HAS_TRANSPOSEUVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEUVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + TransposeUVWx8 = TransposeUVWx8_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } + } +#endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } + } +#endif + +#if defined(HAS_TRANSPOSEUVWX16_MSA) + // Work through the source in 8x8 tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride; // Go down 16 rows. + dst_a += 16; // Move over 8 columns. + dst_b += 16; // Move over 8 columns. + i -= 16; + } +#else + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. + i -= 8; + } +#endif + + if (i > 0) { + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, i); + } +} + +LIBYUV_API +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); +} + +LIBYUV_API +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_NEON; + } +#endif +#if defined(HAS_MIRRORUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorUVRow(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc new file mode 100644 index 0000000000..c2752e6222 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc @@ -0,0 +1,73 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) +#endif +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) +#endif +#undef TANY + +#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } + +#ifdef HAS_TRANSPOSEUVWX8_NEON +TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX8_SSE2 +TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) +#endif +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) +#endif +#undef TUVANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc new file mode 100644 index 0000000000..5a6e05376f --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc @@ -0,0 +1,224 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int i; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } + } +#endif + + for (i = 0; i < width; ++i) { // column of source to row of dest. + ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); + dst_argb += dst_stride_argb; + src_argb += 4; + } +} + +void ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + // Rotate by 90 is a ARGBTranspose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src_argb += src_stride_argb * (height - 1); + src_stride_argb = -src_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); +} + +void ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + // Rotate by 270 is a ARGBTranspose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst_argb += dst_stride_argb * (width - 1); + dst_stride_argb = -dst_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); +} + +void ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + // Swap first and last row and mirror the content. Uses a temporary row. + align_buffer_64(row, width * 4); + const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); + uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc new file mode 100644 index 0000000000..ff212adebc --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc @@ -0,0 +1,106 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc new file mode 100644 index 0000000000..04e19e29ee --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc @@ -0,0 +1,374 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. +#if defined(HAS_TRANSPOSEWX8_SSSE3) +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSEWX8_SSSE3) + +// Transpose 16x8. 64 bit +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); +} +#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + +// Transpose UV 8x8. 64 bit. +#if defined(HAS_TRANSPOSEUVWX8_SSE2) +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); +} +#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc new file mode 100644 index 0000000000..99bdca65b3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc new file mode 100644 index 0000000000..fdc0dd476c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc @@ -0,0 +1,416 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" + + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc new file mode 100644 index 0000000000..f469baacf6 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc @@ -0,0 +1,426 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + "st1 {v17.8b}, [%0], %6 \n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %w3, %w3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" + + "cmp %w3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" + + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; + +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" + + "cmp %w4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" + + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride_a)), // %6 + "r"(static_cast(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); +} +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc new file mode 100644 index 0000000000..e887dd525c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc @@ -0,0 +1,252 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. + punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + // Read in the data from the source pointer. + // First round of bit swap. + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. + punpckldq xmm1, xmm5 + movlpd qword ptr [edx], xmm1 + movhpd qword ptr [ebx], xmm1 + punpckhdq xmm0, xmm5 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm3 // use xmm0 as the temp register. + punpckldq xmm3, xmm7 + movlpd qword ptr [edx], xmm3 + movhpd qword ptr [ebx], xmm3 + punpckhdq xmm0, xmm7 + sub ecx, 8 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + jg convertloop + + mov esp, [esp + 16] + pop ebp + pop edi + pop esi + pop ebx + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc new file mode 100644 index 0000000000..e91560c44c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc @@ -0,0 +1,1211 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include // For memset. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// memset for temp is meant to clear the source buffer (not dest) so that +// SIMD that reads full multiple of 16 bytes will not trigger msan errors. +// memset is not needed for production, as the garbage values are processed but +// not used, although there may be edge cases for subsampling. +// The size of the buffer is based on the largest read, which can be inferred +// by the source type (e.g. ARGB) and the mask (last parameter), or by examining +// the source code for how much the source pointers are advanced. + +// Subsampled source needs to be increase by 1 of not even. +#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) + +// Any 4 planes to 1 with yuvconstants +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) +#endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif +#undef ANY41C + +// Any 3 planes to 1. +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_SSE2 +ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) +ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif +#ifdef HAS_BLENDPLANEROW_AVX2 +ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) +#endif +#ifdef HAS_BLENDPLANEROW_SSSE3 +ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) +#endif +#undef ANY31 + +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. +// Any 3 planes to 1 with yuvconstants +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422TOARGBROW_SSSE3 +ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_SSSE3 +ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) +#endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_I422TORGB24ROW_AVX2 +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) +#endif +#ifdef HAS_I422TOARGBROW_AVX2 +ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) +#endif +#ifdef HAS_I422TOARGBROW_NEON +ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif +#undef ANY31C + +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif + +// Math functions. +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif +#undef ANY21 + +// Any 2 planes to 1 with yuvconstants +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + } + +// Biplanar to RGB. +#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_SSSE3 +ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_AVX2 +ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV21TOARGBROW_NEON +ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif +#undef ANY21C + +// Any 1 to 1. +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_COPYROW_AVX +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) +ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) +ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) +ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_RGB24TOARGBROW_SSSE3) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_SSSE3) +ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) +#endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif +#if defined(HAS_RAWTORGB24ROW_NEON) +ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) +#endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_NEON +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYROW_NEON +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_NEON +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB565TOYROW_NEON +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_NEON +ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif +#undef ANY11 + +// Any 1 to 1 blended. Destination is read, modify, write. +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) +#endif +#undef ANY11B + +// Any 1 to 1 with parameter. +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32_t, + 4, + 2, + 3) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32_t, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32_t, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32_t, + 4, + 2, + 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) +#endif +#undef ANY11P + +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. +#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#undef ANY11P16 + +// Any 1 to 1 with yuvconstants +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) +ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif +#undef ANY11C + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +#endif +#undef ANY11T + +// Any 1 to 1 mirror. +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +#endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif +#undef ANY11M + +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } + +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) +#endif +#undef ANY1 + +// Any 1 to 2. Outputs UV planes. +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) +#endif +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) +ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif +#undef ANY12 + +// Any 1 to 3. Outputs RGB planes. +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_AVX2 +ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_MSA +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif +#undef ANY12S + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc new file mode 100644 index 0000000000..2bbc5adbf1 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc @@ -0,0 +1,3237 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include +#include // For memcpy and memset. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return ((-(v) >> 31) & (v)); +} + +static __inline int32_t clamp255(int32_t v) { + return (((255 - (v)) >> 31) | (v)) & 255; +} + +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; +} + +static __inline uint32_t Abs(int32_t v) { + int m = v >> 31; + return (v + m) ^ m; +} +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return (v < 0) ? 0 : v; +} + +static __inline int32_t clamp255(int32_t v) { + return (v > 255) ? 255 : v; +} + +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; +} + +static __inline uint32_t Abs(int32_t v) { + return (v < 0) ? -v : v; +} +#endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v +#else +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); +} +#endif + +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + dst_rgb24 += 3; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 2) | (g >> 4); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 3) | (g >> 2); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = -a; + dst_argb += 4; + src_argb1555 += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; + dst_argb[0] = (b << 4) | b; + dst_argb[1] = (g << 4) | g; + dst_argb[2] = (r << 4) | r; + dst_argb[3] = (a << 4) | a; + dst_argb += 4; + src_argb4444 += 2; + } +} + +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will containing the first pixel in the lower byte for little endian +// or the upper byte for big endian. +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = ((const unsigned char*)(&dither4))[x & 3]; + int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; + } +} + +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not over flow. a is preserved from original. + dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 24 + +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 16 + +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimics SSSE3. +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8_t)(s); + } +} + +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); + dst_argb += 4; + } +} + +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8_t y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// TODO(fbarchard): Unify these structures to be platform independent. +// TODO(fbarchard): Generate SIMD structures from float matrix. + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) // 64 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) // 32 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// JPEG YUV to RGB reference +// * R = Y - V * -1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y - U * -1.77200 + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGB 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -113 /* round(-1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ + +// Bias values to round, and subtract 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// BT.709 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ + +// Bias values to round, and subtract 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#else +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#endif + +#undef BB +#undef BG +#undef BR +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef YG + +// C reference code that mimics the YUV assembly. +// Reads 8 bit YUV and leaves result as 16 bit. + +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); +} + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); +} + +#undef YG +#undef YGB + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} +#else +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif + +// Also used for 420 +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = src_a[1]; + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + } +} + +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_uv += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); + rgb_buf[0] = 255; + } +} + +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { + int x; + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { + memset(dst, v8, width); +} + +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND + +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + dst[1] = UBLEND(src0[1], src1[1], alpha[1]); + src0 += 2; + src1 += 2; + alpha += 2; + dst += 2; + } + if (width & 1) { + dst[0] = UBLEND(src0[0], src1[0], alpha[0]); + } +} +#undef UBLEND + +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; +#undef T + +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int i; + for (i = 0; i < width; ++i) { + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. + dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + int x; + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (y1_fraction == 128) { + HalfRow_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[1] = + (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + } +} + +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < width; ++x) { + // To support in-place conversion. + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = 0; + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = 0; + } +} + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst_a[0] = src_argb[3]; + dst_a[1] = src_argb[7]; + dst_a += 2; + src_argb += 8; + } + if (width & 1) { + dst_a[0] = src_argb[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_SSSE3) +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_SSSE3) +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); +#else + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); +#else + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB565ROW_AVX2) + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); +#else + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc new file mode 100644 index 0000000000..8d3cb81cec --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc @@ -0,0 +1,6677 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; + +// JPeg full range. +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +// Constants for BGRA +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_J400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); +} + +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + +static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32_t kMaskRB10 = 0x3ff003ff; +static const uint32_t kMaskAG10 = 0xc000ff00; +static const uint32_t kMulAG10 = 64 * 65536 + 1028; + +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOYJROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ARGBTOUVJROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + +// Read 8 UV from 444 +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 VU from NV21, upsample to 8 UV +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" +#define YUVTORGB_REGS \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + +#else +#define YUVTORGB_SETUP(yuvconstants) +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" +#define YUVTORGB_REGS +#endif + +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Store 8 ARGB values. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" + +// Store 8 RGBA values. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + +// Store 8 AR30 values. +#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_SSSE3 + +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READNV21 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUY2 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READUYVY + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(yuvconstants) + STORERGBA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 VU from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" + +#if defined(__x86_64__) +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + +#define YUVTORGB_REGS_AVX2 \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + +#else // Convert 16 pixels: 16 UV and 16 Y. + +#define YUVTORGB_SETUP_AVX2(yuvconstants) +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" +#define YUVTORGB_REGS_AVX2 +#endif + +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. +#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I444TOARGBROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_NV12TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READNV12_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#if defined(HAS_NV21TOARGBROW_AVX2) +// 16 pixels. +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READNV21_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [vu_buf]"+r"(vu_buf), // %[vu_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21]"m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#if defined(HAS_YUY2TOARGBROW_AVX2) +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUY2_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_YUY2TOARGBROW_AVX2 + +#if defined(HAS_UYVYTOARGBROW_AVX2) +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READUYVY_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_UYVYTOARGBROW_AVX2 + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_SSE2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + "m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS +// Multiple of 1. +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} + +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +// Attenuate 4 pixels at a time. +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uintptr_t alpha; + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// Unattenuate 8 pixels at a time. +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uintptr_t alpha; + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; + +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; + +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__AVX2__) + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop. + LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop. + LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp; + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + uintptr_t pixel_temp; + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + uintptr_t pixel_temp; + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uintptr_t pixel_temp; + uintptr_t table_temp; + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc new file mode 100644 index 0000000000..4fb2631f0b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc @@ -0,0 +1,3512 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. +#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. +#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. +#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb0 += 128; + src_argb0_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = __builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((v16i8*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = __msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); + src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); + src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); + src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc new file mode 100644 index 0000000000..ff87e74c62 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc @@ -0,0 +1,2693 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" + +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); +} + +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); +} + +// clang-format off +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc new file mode 100644 index 0000000000..24b4520bab --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc @@ -0,0 +1,2884 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 YUY2 +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 UYVY +#define READUYVY \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ + "1: \n" + READYUV444 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "ld1 {v23.8b}, [%3], #8 \n" + "subs %w5, %w5, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v20.8b, #255 \n" /* A */ + "1: \n" + READYUV422 + YUVTORGB(v23, v22, v21) + "subs %w4, %w4, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" + ARGBTOARGB4444 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUV400 + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), + [kUVToG]"r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READNV12 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV12 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUY2 + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READUYVY + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); +} + +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); +} + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); +} + +// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" + + ); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 + +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Caveat - rounds float to half float whereas scaling version truncates. +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fsum; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc new file mode 100644 index 0000000000..5500d7f5a6 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc @@ -0,0 +1,6234 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +// This module is for Visual C 32/64 bit and clangcl 32 bit +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + +#if defined(_M_X64) +#include +#include // For _mm_maddubs_epi16 +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// 64 bit +#if defined(_M_X64) + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); + +// Store 8 ARGB values. +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; + +#if defined(HAS_I422TOARGBROW_SSSE3) +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm4; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUV422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +// 32 bit +#else // defined(_M_X64) +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; + +// JPeg full range. +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; + +// Constants for BGRA. +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR. +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; + +// Shuffle table for converting RAW to RGB24. First 8. +static const uvec8 kShuffleMaskRAWToRGB24_0 = { + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Middle 8. +static const uvec8 kShuffleMaskRAWToRGB24_1 = { + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleMaskRAWToRGB24_2 = { + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; + +// NV21 shuf 8 VU to 16 UV. +static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; + +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 + mov ecx, [esp + 12] // width + movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 + movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 + movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 4] + movdqu xmm2, [eax + 8] + lea eax, [eax + 24] + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_RGB565TOARGBROW_AVX2 +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpsllw ymm4, ymm4, 10 + vpsrlw ymm4, ymm4, 5 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_RGB565TOARGBROW_AVX2 + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + +// 24 instructions +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +// TODO(fbarchard): Improve sign extension/packing. +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrld xmm0, 4 + psrld xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToY + vbroadcastf128 ymm5, xmmword ptr kAddY16 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 // add 16 for Y + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + vbroadcastf128 ymm4, xmmword ptr kARGBToYJ + vbroadcastf128 ymm5, xmmword ptr kAddYJ64 + vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kBGRAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kABGRToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kRGBAToY + movdqa xmm5, xmmword ptr kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm6, xmmword ptr kARGBToVJ + movdqa xmm7, xmmword ptr kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_AVX2 +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToV + vbroadcastf128 ymm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned + vpaddw ymm0, ymm0, ymm5 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVJROW_AVX2 + +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kARGBToV + movdqa xmm7, xmmword ptr kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kBGRAToV + movdqa xmm7, xmmword ptr kBGRAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kABGRToV + movdqa xmm7, xmmword ptr kABGRToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm6, xmmword ptr kRGBAToV + movdqa xmm7, xmmword ptr kRGBAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16]} + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 UV from NV21, upsample to 16 UV. +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16]} + +// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 32]} + +// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ + __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ + __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 32]} + +// Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ + __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ + __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ + __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ + __asm vpsubw ymm2, ymm3, ymm2 \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ + __asm vpsubw ymm1, ymm3, ymm1 \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpsraw ymm0, ymm0, 6 \ + __asm vpsraw ymm1, ymm1, 6 \ + __asm vpsraw ymm2, ymm2, 6 \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + } + +// Store 16 ARGB values. +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ + __asm lea edx, [edx + 64]} + +// Store 16 RGBA values. +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vmovdqu [edx], ymm0 \ + __asm vmovdqu [edx + 32], ymm1 \ + __asm lea edx, [edx + 64]} + +#ifdef HAS_I422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_NV12TOARGBROW_AVX2 +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#ifdef HAS_NV21TOARGBROW_AVX2 +// 16 pixels. +// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV21_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#ifdef HAS_YUY2TOARGBROW_AVX2 +// 16 pixels. +// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUY2_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + vzeroupper + ret + } +} +#endif // HAS_YUY2TOARGBROW_AVX2 + +#ifdef HAS_UYVYTOARGBROW_AVX2 +// 16 pixels. +// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READUYVY_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + vzeroupper + ret + } +} +#endif // HAS_UYVYTOARGBROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(ebx) + STORERGBA_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. +// Allows a conversion with half size scaling. + +// Read 8 UV from 444. +#define READYUV444 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 VU from NV21, upsample to 8 UV. +#define READNV21 \ + __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm movq xmm4, qword ptr [eax] \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8]} + +// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 16]} + +// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 16]} + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(YuvConstants) \ + __asm { \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm movdqa xmm3, xmm0 \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ + __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ + __asm psubw xmm0, xmm1 \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ + __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ + __asm psubw xmm1, xmm2 \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ + __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ + __asm psubw xmm2, xmm3 \ + __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Store 8 ARGB values. +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32]} + +// Store 8 BGRA values. +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGBA values. +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32]} + +// Store 8 RGB24 values. +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} + +// Store 8 RGB565 values. +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGB565 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push ebx + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU + mov edx, [esp + 8 + 12] // argb + mov ebx, [esp + 8 + 16] // yuvconstants + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV21 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + pop esi + ret + } +} + +// 8 pixels. +// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUY2 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + ret + } +} + +// 8 pixels. +// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebx, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READUYVY + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebx + ret + } +} + +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(ebx) + STORERGBA + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_I400TOARGBROW_SSE2 +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). +__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + movd xmm3, eax + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 // Y.Y + pmulhuw xmm0, xmm2 + psubusw xmm0, xmm3 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + vmovd xmm2, eax + vbroadcastss ymm2, xmm2 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + vmovd xmm3, eax + vbroadcastss ymm3, xmm3 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpslld ymm4, ymm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpmulhuw ymm0, ymm0, ymm2 + vpsubusw ymm0, ymm0, ymm3 + vpsrlw ymm0, ymm0, 6 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + vpermq ymm1, ymm1, 0xd8 + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm4 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_I400TOARGBROW_AVX2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +// TODO(fbarchard): Replace lea with -16 offset. +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, xmmword ptr kShuffleMirror + + convertloop: + movdqu xmm0, [eax - 16 + ecx] + pshufb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vbroadcastf128 ymm5, xmmword ptr kShuffleMirror + + convertloop: + vmovdqu ymm0, [eax - 32 + ecx] + vpshufb ymm0, ymm0, ymm5 + vpermq ymm0, ymm0, 0x4e // swap high and low halfs + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, xmmword ptr kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 + + convertloop: + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's + lea eax, [eax + 32] + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 + vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 + vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 + vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 + lea edi, [edi + 64] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_COPYROW_SSE2 +// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + test eax, 15 + jne convertloopu + test edx, 15 + jne convertloopu + + convertloopa: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloopa + ret + + convertloopu: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloopu + ret + } +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 64 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_COPYROW_AVX + +// Multiple of 1. +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // width + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + + extractloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 24 + psrld xmm1, 24 + packssdw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg extractloop + + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 + +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { + __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. + mul edx // overwrites edx with upper part of result. + mov edx, edi + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width + rep stosb + mov edi, edx + ret + } +} + +// Write 'width' 32 bit values. +__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width + rep stosd + mov edi, edx + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_BLENDPLANEROW_SSSE3 +// Blend 8 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm6, eax + pshufd xmm6, xmm6, 0x00 + + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + movd xmm7, eax + pshufd xmm7, xmm7, 0x00 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 8 pixel loop. + convertloop8: + movq xmm0, qword ptr [esi] // alpha + punpcklbw xmm0, xmm0 + pxor xmm0, xmm5 // a, 255-a + movq xmm1, qword ptr [eax + esi] // src0 + movq xmm2, qword ptr [edx + esi] // src1 + punpcklbw xmm1, xmm2 + psubb xmm1, xmm6 // bias src0/1 - 128 + pmaddubsw xmm0, xmm1 + paddw xmm0, xmm7 // unbias result - 32768 and round. + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edi + esi], xmm0 + lea esi, [esi + 8] + sub ecx, 8 + jg convertloop8 + + pop edi + pop esi + ret + } +} +#endif // HAS_BLENDPLANEROW_SSSE3 + +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 32 pixels at a time. +// unsigned version of math +// =((A2*C2)+(B2*(255-C2))+255)/256 +// signed version of math +// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. + vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 32 pixel loop. + convertloop32: + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm1, [eax + esi] // src0 + vmovdqu ymm2, [edx + esi] // src1 + vpunpckhbw ymm4, ymm1, ymm2 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm3, ymm3, ymm4 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm3, ymm3, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm3 + vmovdqu [edi + esi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg convertloop32 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; + +// Blend 8 pixels at a time. +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, xmmword ptr kShuffleAlpha0 + movdqa xmm5, xmmword ptr kShuffleAlpha1 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + lea ebx, fixed_invtbl8 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqu xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr [ebx + esi * 4] + movd xmm3, dword ptr [ebx + edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + vzeroupper + ret + } +} +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb + mov ecx, [esp + 12 + 12] // width + sub edx, eax + lea ebx, fixed_invtbl8 + vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 + + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] + vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] + vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + vzeroupper + ret + } +} +#endif // USE_GATHER +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, xmmword ptr kARGBToYJ + movdqa xmm5, xmmword ptr kAddYJ64 + + convertloop: + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; + +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; + +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, xmmword ptr kARGBToSepiaB + movdqa xmm3, xmmword ptr kARGBToSepiaG + movdqa xmm4, xmmword ptr kARGBToSepiaR + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 + lea eax, [eax + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqu xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + movdqu [eax], xmm0 + lea eax, [eax + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time. +// This function requires alignment on accumulation buffer pointers. +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + s4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + l4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movdqu xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqu xmm2, [esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqu xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqu xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqu xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + sub edi, esi + cmp eax, 128 + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + + vmovd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + vmovd xmm5, eax // low fraction 256..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vbroadcastss ymm5, xmm5 + + mov eax, 0x80808080 // 128b for bias and rounding. + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 + vpsubb ymm1, ymm1, ymm4 // bias to signed image + vpsubb ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm5, ymm1 + vpmaddubsw ymm0, ymm5, ymm0 + vpaddw ymm1, ymm1, ymm4 // unbias and round + vpaddw ymm0, ymm0, ymm4 + vpsrlw ymm1, ymm1, 8 + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutates + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 16x2 -> 16x1 +// TODO(fbarchard): Consider allowing 256 using memcpy. +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 /256. Blend 100 / 0. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + + movd xmm0, eax // high fraction 0..255 + neg eax + add eax, 256 + movd xmm5, eax // low fraction 255..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + mov eax, 0x80808080 // 128 for biasing image to signed. + movd xmm4, eax + pshufd xmm4, xmm4, 0x00 + + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + psubb xmm0, xmm4 // bias image by -128 + psubb xmm1, xmm4 + movdqa xmm2, xmm5 + movdqa xmm3, xmm5 + pmaddubsw xmm2, xmm0 + pmaddubsw xmm3, xmm1 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + psrlw xmm2, 8 + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [esi + edi], xmm2 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqu xmm5, [ecx] + mov ecx, [esp + 16] // width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. + convertloop: + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 2 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. + convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. + vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + convertloop: + movdqu xmm0, xmmword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. + mov byte ptr [edi + 15], dl + + lea eax, [eax + 16] + lea edi, [edi + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc new file mode 100644 index 0000000000..2cfa1c6cb1 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc @@ -0,0 +1,1741 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyPlane +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 1 : (x)) + +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * + scaletbl[boxwidth - minboxwidth] >> + 16; + } +} + +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + (void)dx; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = src_ptr[i] * scaleval >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16_t. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } + } +#endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8_t* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16_t*)(row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint32_t. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; + +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16_t* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32_t*)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8_t* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16_t* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + + uint16_t* rowptr = (uint16_t*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + int i; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + int i; + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + +// Deprecated api +LIBYUV_API +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc new file mode 100644 index 0000000000..53ad136404 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc @@ -0,0 +1,464 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#undef CANY + +// Fixed scale down. +// Mask may be non-power of 2, so use MOD +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +// Fixed scale down for odd source width. Used by I420Blend subsampling. +// Since dst_width is (width + 1) / 2, this function scales one less pixel +// and copies the last pixel. +#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) +#endif +#ifdef HAS_SCALEROWDOWN4_SSSE3 +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif +#undef SAANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc new file mode 100644 index 0000000000..53a22e8b41 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc @@ -0,0 +1,1010 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleARGB ARGB, 1/2 +// This is an optimized version for scaling down a ARGB to 1/2 of +// its original size. +static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + } else { + src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; + } + +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// ScaleARGB ARGB, 1/4 +// This is an optimized version for scaling down a ARGB to 1/4 of +// its original size. +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; + // Advance to odd row, even column. + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); + ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } + free_aligned_buffer_64(row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = + filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(row_argb); +} +#endif + +// Scale ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleARGBSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { + int j; + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; + (void)src_height; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, + dx); + dst_argb += dst_stride; + y += dy; + } +} + +// ScaleARGB a ARGB. +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, + dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); + return; + } + if (filtering && dy < 65536) { + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + if (filtering) { + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); +} + +LIBYUV_API +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || + clip_width > 32768 || clip_height > 32768 || + (clip_x + clip_width) > dst_width || + (clip_y + clip_height) > dst_height) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); + return 0; +} + +// Scale an ARGB image. +LIBYUV_API +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); + return 0; +} + +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); + int r; + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. + (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); + free(argb_buffer); + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc new file mode 100644 index 0000000000..b28d7da41f --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc @@ -0,0 +1,1323 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst += 1; + s += 2; + t += 2; + } + dst[0] = (s[0] + t[0] + 1) >> 1; +} + +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif + +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + int x; + (void)src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} +void ScalePlaneVertical_16(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return (int)(((int64_t)(num) << 16) / div); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int* x, + int* y, + int* dx, + int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc new file mode 100644 index 0000000000..312236d2df --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc @@ -0,0 +1,1374 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + intptr_t stridex3; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + intptr_t x0, x1, temp_pixel; + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + intptr_t x0, x1; + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc new file mode 100644 index 0000000000..482a521f0d --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc new file mode 100644 index 0000000000..459a2995df --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc @@ -0,0 +1,970 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); +} + +#define HAS_SCALEROWDOWN38_NEON +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); +} + +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + "vst1.16 {q2, q3}, [%2]! \n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int tmp; + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + asm volatile ( + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + "vst1.32 {d0, d1}, [%0]! \n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc new file mode 100644 index 0000000000..494a9cfbfb --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc @@ -0,0 +1,1064 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); +} + +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); +} + +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); +} + +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); +} + +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO(Yang Zhang): Might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + +// Read 16x2 average down and write 8x1. +void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc new file mode 100644 index 0000000000..c5fc86f3e9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1391 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for 32 bit Visual C x86 and clangcl +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// Reads 32 pixels, throws half away and writes 16 pixels. +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add + paddw xmm1, xmm3 + psrlw xmm0, 1 + psrlw xmm1, 1 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// For rounding, average = (sum + 2) / 4 +// becomes average((sum >> 1), 0) +// Blends 64x2 rectangle to 32x1. +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm1, ymm1, ymm3 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm1, ymm1, 1 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +// Point samples 32 pixels to 8 pixels. +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + movdqa xmm5, xmm4 + packuswb xmm4, xmm4 + psllw xmm5, 3 // constant 0x0008 + + wloop: + movdqu xmm0, [eax] // average rows + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm1, xmm3 + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 2 + paddw xmm1, xmm3 + movdqu xmm2, [eax + edi] + movdqu xmm3, [eax + edi + 16] + lea eax, [eax + 32] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 3 + paddw xmm1, xmm3 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. +__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpsrlw ymm4, ymm4, 15 + vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpackuswb ymm4, ymm4, ymm4 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + edi] + vmovdqu ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm1, ymm1, ymm3 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, xmmword ptr kShuf0 + movdqa xmm4, xmmword ptr kShuf1 + movdqa xmm5, xmmword ptr kShuf2 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShuf01 + movdqa xmm3, xmmword ptr kShuf11 + movdqa xmm4, xmmword ptr kShuf21 + movdqa xmm5, xmmword ptr kMadd01 + movdqa xmm6, xmmword ptr kMadd11 + movdqa xmm7, xmmword ptr kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, xmmword ptr kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, xmmword ptr kShuf38a + movdqa xmm5, xmmword ptr kShuf38b + + xloop: + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + sub ecx, 12 + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAc + movdqa xmm3, xmmword ptr kShufAc3 + movdqa xmm4, xmmword ptr kScaleAc33 + pxor xmm5, xmm5 + + xloop: + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqu xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, xmmword ptr kShufAb0 + movdqa xmm3, xmmword ptr kShufAb1 + movdqa xmm4, xmmword ptr kShufAb2 + movdqa xmm5, xmmword ptr kScaleAb2 + + xloop: + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] + lea eax, [eax + 16] + pavgb xmm0, xmm1 + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + + // sum rows + xloop: + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 16 + jg xloop + ret + } +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. +__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper + ret + } +} +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pcmpeqb xmm7, xmm7 // generate 0x0001 + psrlw xmm7, 15 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. + movd ebx, xmm1 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits + movd ebx, xmm2 + mov [edi], bl + + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 // 4 pixels + jge xloop4 + + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, xmmword ptr kShuffleColARGB + movdqa xmm5, xmmword ptr kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc new file mode 100644 index 0000000000..92384c050c --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +struct FourCCAliasEntry { + uint32_t alias; + uint32_t canonical; +}; + +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32_t CanonicalFourCC(uint32_t fourcc) { + int i; + for (i = 0; i < NUM_ALIASES; ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/media/libvpx/libvpx/third_party/x86inc/LICENSE b/media/libvpx/libvpx/third_party/x86inc/LICENSE new file mode 100644 index 0000000000..7d07645a17 --- /dev/null +++ b/media/libvpx/libvpx/third_party/x86inc/LICENSE @@ -0,0 +1,18 @@ +Copyright (C) 2005-2012 x264 project + +Authors: Loren Merritt + Anton Mitrofanov + Jason Garrett-Glaser + Henrik Gramner + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/media/libvpx/libvpx/third_party/x86inc/README.libvpx b/media/libvpx/libvpx/third_party/x86inc/README.libvpx new file mode 100644 index 0000000000..195654f7bb --- /dev/null +++ b/media/libvpx/libvpx/third_party/x86inc/README.libvpx @@ -0,0 +1,19 @@ +URL: https://git.videolan.org/git/x264.git +Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720 +License: ISC +License File: LICENSE + +Description: +x264/libav's framework for x86 assembly. Contains a variety of macros and +defines that help automatically allow assembly to work cross-platform. + +Local Modifications: +Get configuration from vpx_config.asm. +Prefix functions with vpx by default. +Manage name mangling (prefixing with '_') manually because 'PREFIX' does not + exist in libvpx. +Copy PIC 'GLOBAL' macros from x86_abi_support.asm +Use .text instead of .rodata on macho to avoid broken tables in PIC mode. +Use .text with no alignment for aout. +Only use 'hidden' visibility with Chromium. +Prefix ARCH_* with VPX_. diff --git a/media/libvpx/libvpx/third_party/x86inc/x86inc.asm b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm new file mode 100644 index 0000000000..3d55e921c7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm @@ -0,0 +1,1923 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2019 x264 project +;* +;* Authors: Loren Merritt +;* Henrik Gramner +;* Anton Mitrofanov +;* Fiona Glaser +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%include "vpx_config.asm" + +%ifndef private_prefix + %define private_prefix vpx +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if VPX_ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if VPX_ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%define FORMAT_MACHO 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,macho + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define FORMAT_MACHO 1 +%endif + +; Set PREFIX for libvpx builds. +%if FORMAT_ELF + %undef PREFIX +%elif WIN64 + %undef PREFIX +%else + %define PREFIX +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; In some instances macho32 tables get misaligned when using .rodata. +; When looking at the disassembly it appears that the offset is either +; correct or consistently off by 90. Placing them in the .text section +; works around the issue. It appears to be specific to the way libvpx +; handles the tables. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 + %elifidn __OUTPUT_FORMAT__,macho32 + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; PIC macros from vpx_ports/x86_abi_support.asm. +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define GET_GOT_DEFINED 1 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %define GET_GOT_DEFINED 1 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %else + %define GET_GOT_DEFINED 0 + %endif + %endif + + %if VPX_ARCH_X86_64 == 0 + %undef PIC + %endif + +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %define WRT_PLT wrt ..plt + + %if WIN64 + %define PIC + %elifidn __OUTPUT_FORMAT__,macho64 + %define PIC + %elif CONFIG_PIC + %define PIC + %endif +%endif + +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT + %define RESTORE_GOT +%endif +%ifndef WRT_PLT + %define WRT_PLT +%endif + +%ifdef PIC + default rel +%endif + +%ifndef GET_GOT_DEFINED + %define GET_GOT_DEFINED 0 +%endif +; End PIC macros from vpx_ports/x86_abi_support.asm. + +; libvpx explicitly sets visibilty in shared object builds. Avoid setting +; visibility to hidden as it may break builds that split sources on e.g., +; directory boundaries. +%ifdef CHROMIUM + %define VISIBILITY hidden + %define HAVE_PRIVATE_EXTERN 1 +%else + %define VISIBILITY + %define HAVE_PRIVATE_EXTERN 0 +%endif + +%ifdef __NASM_VER__ + %use smartalign + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + %define HAVE_PRIVATE_EXTERN 0 + %endif +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif VPX_ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if VPX_ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if VPX_ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro LEA 2 +%if VPX_ARCH_X86_64 + lea %1, [%2] +%elif PIC + call $+5 ; special-cased to not affect the RSB on most CPU:s + pop %1 + add %1, (%2)-$+1 +%else + mov %1, %2 +%endif +%endmacro + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%if VPX_ARCH_X86_64 == 0 + %define movsxd movifnidn +%endif + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (VPX_ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. + %assign regs_used (regs_used + 1) + %if VPX_ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif VPX_ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif + %endif + %if VPX_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i 8 + %rep %%xmm_regs_on_stack + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 0 + %assign %%pad_size 0 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack + %assign %%i %%i-1 + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL + %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif VPX_ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + %assign xmm_regs_used %3 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if vzeroupper_required + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 0 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue || cpuflag(ssse3) + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %ifndef cglobaled_%2 + %if %1 + %xdefine %2 mangle(private_prefix %+ _ %+ %2) + %else + %xdefine %2 mangle(public_prefix %+ _ %+ %2) + %endif + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + %if %1 + global %2:function VISIBILITY + %else + global %2:function + %endif + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 + global %2:private_extern + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function VISIBILITY + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global current_function %+ %1:private_extern + %else + global current_function %+ %1 + %endif + %1: +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data VISIBILITY + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global %1:private_extern + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. +%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_gfni (1<<13)| cpuflags_sse42 +%assign cpuflags_avx (1<<14)| cpuflags_sse42 +%assign cpuflags_xop (1<<15)| cpuflags_avx +%assign cpuflags_fma4 (1<<16)| cpuflags_avx +%assign cpuflags_fma3 (1<<17)| cpuflags_avx +%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<22) +%assign cpuflags_cache64 (1<<23) +%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<25) + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if VPX_ARCH_X86_64 || cpuflag(sse2) + %ifdef __NASM_VER__ + ALIGNMODE p6 + %else + CPU amdnop + %endif + %else + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif + %endif +%endmacro + +; Merge mmx, sse*, and avx* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if VPX_ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if VPX_ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + INIT_CPUFLAGS %1 + DEFINE_MMREGS mm +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 +%endmacro + +%assign i 0 +%rep 32 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + %xdefine %%tmp m %+ %%i + CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 0-1 ; name to load from + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %xdefine %%tmp %%f %+ 0 + %ifnum %%tmp + RESET_MM_PERMUTATION + %assign %%i 0 + %rep num_mmregs + %xdefine %%tmp %%f %+ %%i + CAT_XDEFINE %%m, %%i, m %+ %%tmp + %assign %%i %%i+1 + %endrep + %rep num_mmregs + %assign %%i %%i-1 + CAT_XDEFINE m, %%i, %%m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 32 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) + %error use of ``%1'' avx2 instruction in cpuname function: current_function + %elif __sizeofreg == 16 && notcpuflag(sse) + %error use of ``%1'' sse instruction in cpuname function: current_function + %elif __sizeofreg == 32 && notcpuflag(avx) + %error use of ``%1'' avx instruction in cpuname function: current_function + %elif __sizeofreg == 64 && notcpuflag(avx512) + %error use of ``%1'' avx512 instruction in cpuname function: current_function + %elifidn %1, pextrw ; special case because the base instruction is mmx2, + %ifnid %6 ; but sse4 is required for memory operands + %if notcpuflag(sse4) + %error use of ``%1'' sse4 instruction in cpuname function: current_function + %endif + %endif + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + %if avx_enabled && %5 + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnum regnumof%7 + %ifnum regnumof%8 + %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 + ; Most VEX-encoded instructions require an additional byte to encode when + ; src2 is a high register (e.g. m8..15). If the instruction is commutative + ; we can swap src1 and src2 when doing so reduces the instruction length. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7, %8 + %endif + %elif %0 == 7 + %if avx_enabled && %5 + %xdefine __src1 %6 + %xdefine __src2 %7 + %ifnum regnumof%6 + %ifnum regnumof%7 + %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 + %xdefine __src1 %7 + %xdefine __src2 %6 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7 + %endif + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX/EVEX and legacy encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmpeqpd, sse2, 1, 0, 1 +AVX_INSTR cmpeqps, sse, 1, 0, 1 +AVX_INSTR cmpeqsd, sse2, 1, 0, 0 +AVX_INSTR cmpeqss, sse, 1, 0, 0 +AVX_INSTR cmplepd, sse2, 1, 0, 0 +AVX_INSTR cmpleps, sse, 1, 0, 0 +AVX_INSTR cmplesd, sse2, 1, 0, 0 +AVX_INSTR cmpless, sse, 1, 0, 0 +AVX_INSTR cmpltpd, sse2, 1, 0, 0 +AVX_INSTR cmpltps, sse, 1, 0, 0 +AVX_INSTR cmpltsd, sse2, 1, 0, 0 +AVX_INSTR cmpltss, sse, 1, 0, 0 +AVX_INSTR cmpneqpd, sse2, 1, 0, 1 +AVX_INSTR cmpneqps, sse, 1, 0, 1 +AVX_INSTR cmpneqsd, sse2, 1, 0, 0 +AVX_INSTR cmpneqss, sse, 1, 0, 0 +AVX_INSTR cmpnlepd, sse2, 1, 0, 0 +AVX_INSTR cmpnleps, sse, 1, 0, 0 +AVX_INSTR cmpnlesd, sse2, 1, 0, 0 +AVX_INSTR cmpnless, sse, 1, 0, 0 +AVX_INSTR cmpnltpd, sse2, 1, 0, 0 +AVX_INSTR cmpnltps, sse, 1, 0, 0 +AVX_INSTR cmpnltsd, sse2, 1, 0, 0 +AVX_INSTR cmpnltss, sse, 1, 0, 0 +AVX_INSTR cmpordpd, sse2 1, 0, 1 +AVX_INSTR cmpordps, sse 1, 0, 1 +AVX_INSTR cmpordsd, sse2 1, 0, 0 +AVX_INSTR cmpordss, sse 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR cmpunordpd, sse2, 1, 0, 1 +AVX_INSTR cmpunordps, sse, 1, 0, 1 +AVX_INSTR cmpunordsd, sse2, 1, 0, 0 +AVX_INSTR cmpunordss, sse, 1, 0, 0 +AVX_INSTR comisd, sse2, 1 +AVX_INSTR comiss, sse, 1 +AVX_INSTR cvtdq2pd, sse2, 1 +AVX_INSTR cvtdq2ps, sse2, 1 +AVX_INSTR cvtpd2dq, sse2, 1 +AVX_INSTR cvtpd2ps, sse2, 1 +AVX_INSTR cvtps2dq, sse2, 1 +AVX_INSTR cvtps2pd, sse2, 1 +AVX_INSTR cvtsd2si, sse2, 1 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse, 1 +AVX_INSTR cvttpd2dq, sse2, 1 +AVX_INSTR cvttps2dq, sse2, 1 +AVX_INSTR cvttsd2si, sse2, 1 +AVX_INSTR cvttss2si, sse, 1 +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4, 1 +AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse, 1 +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2, 1 +AVX_INSTR movaps, sse, 1 +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3, 1 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2, 1 +AVX_INSTR movmskps, sse, 1 +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2, 1 +AVX_INSTR movntps, sse, 1 +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3, 1 +AVX_INSTR movsldup, sse3, 1 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2, 1 +AVX_INSTR movups, sse, 1 +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4, 1 +AVX_INSTR roundps, sse4, 1 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse, 1 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1 +AVX_INSTR sqrtps, sse, 1 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse, 1 +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2, 1 +AVX_INSTR ucomiss, sse, 1 +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +;%1 == instruction +;%2 == minimal instruction set +%macro GPR_INSTR 2 + %macro %1 2-5 fnord, %1, %2 + %ifdef cpuname + %if notcpuflag(%5) + %error use of ``%4'' %5 instruction in cpuname function: current_function + %endif + %endif + %ifidn %3, fnord + %4 %1, %2 + %else + %4 %1, %2, %3 + %endif + %endmacro +%endmacro + +GPR_INSTR andn, bmi1 +GPR_INSTR bextr, bmi1 +GPR_INSTR blsi, bmi1 +GPR_INSTR blsr, bmi1 +GPR_INSTR blsmsk, bmi1 +GPR_INSTR bzhi, bmi2 +GPR_INSTR mulx, bmi2 +GPR_INSTR pdep, bmi2 +GPR_INSTR pext, bmi2 +GPR_INSTR popcnt, sse42 +GPR_INSTR rorx, bmi2 +GPR_INSTR sarx, bmi2 +GPR_INSTR shlx, bmi2 +GPR_INSTR shrx, bmi2 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. + %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%3 + %if regnumof%3 >= 16 || sizeof%3 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 diff --git a/media/libvpx/libvpx/tools.mk b/media/libvpx/libvpx/tools.mk new file mode 100644 index 0000000000..dd2ebeb3d5 --- /dev/null +++ b/media/libvpx/libvpx/tools.mk @@ -0,0 +1,116 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ + vpx/vpx_codec.h vpx/src/vpx_image.c +tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h +tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) +CFLAGS += -I../include + +ifneq ($(CONFIG_CODEC_SRCS), yes) + CFLAGS += -I../include/vpx +endif + +# Expand all tools sources into a variable containing all sources +# for that tools (not just them main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):)\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) -lm))) + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files trigger a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this. +define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c new file mode 100644 index 0000000000..5c13781513 --- /dev/null +++ b/media/libvpx/libvpx/tools_common.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./tools_common.h" + +#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER +#include "vpx/vp8cx.h" +#endif + +#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif + +#include "vpx/vpx_codec.h" + +#if defined(_WIN32) || defined(__OS2__) +#include +#include + +#ifdef __OS2__ +#define _setmode setmode +#define _fileno fileno +#define _O_BINARY O_BINARY +#endif +#endif + +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) + +#if CONFIG_ENCODERS +/* Swallow warnings about unused results of fread/fwrite */ +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { + return fread(ptr, size, nmemb, stream); +} +#define fread wrap_fread +#endif + +FILE *set_binary_mode(FILE *stream) { + (void)stream; +#if defined(_WIN32) || defined(__OS2__) + _setmode(_fileno(stream), _O_BINARY); +#endif + return stream; +} + +void die(const char *fmt, ...) { + LOG_ERROR(NULL); + usage_exit(); +} + +void fatal(const char *fmt, ...) { + LOG_ERROR("Fatal"); + exit(EXIT_FAILURE); +} + +void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } + +void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); + exit(EXIT_FAILURE); +} + +int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { + FILE *f = input_ctx->file; + struct FileTypeDetectionBuffer *detect = &input_ctx->detect; + int plane = 0; + int shortread = 0; + const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + + for (plane = 0; plane < 3; ++plane) { + uint8_t *ptr; + int w = vpx_img_plane_width(yuv_frame, plane); + const int h = vpx_img_plane_height(yuv_frame, plane); + int r; + // Assuming that for nv12 we read all chroma data at one time + if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width it is odd + if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) { + case 1: + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V + : VPX_PLANE_U]; + break; + case 2: + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U + : VPX_PLANE_V]; + break; + default: ptr = yuv_frame->planes[plane]; + } + + for (r = 0; r < h; ++r) { + size_t needed = w * bytespp; + size_t buf_position = 0; + const size_t left = detect->buf_read - detect->position; + if (left > 0) { + const size_t more = (left < needed) ? left : needed; + memcpy(ptr, detect->buf + detect->position, more); + buf_position = more; + needed -= more; + detect->position += more; + } + if (needed > 0) { + shortread |= (fread(ptr + buf_position, 1, needed, f) < needed); + } + + ptr += yuv_frame->stride[plane]; + } + } + + return shortread; +} + +#if CONFIG_ENCODERS + +static const VpxInterface vpx_encoders[] = { +#if CONFIG_VP8_ENCODER + { "vp8", VP8_FOURCC, &vpx_codec_vp8_cx }, +#endif + +#if CONFIG_VP9_ENCODER + { "vp9", VP9_FOURCC, &vpx_codec_vp9_cx }, +#endif +}; + +int get_vpx_encoder_count(void) { + return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]); +} + +const VpxInterface *get_vpx_encoder_by_index(int i) { return &vpx_encoders[i]; } + +const VpxInterface *get_vpx_encoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_vpx_encoder_count(); ++i) { + const VpxInterface *encoder = get_vpx_encoder_by_index(i); + if (strcmp(encoder->name, name) == 0) return encoder; + } + + return NULL; +} + +#endif // CONFIG_ENCODERS + +#if CONFIG_DECODERS + +static const VpxInterface vpx_decoders[] = { +#if CONFIG_VP8_DECODER + { "vp8", VP8_FOURCC, &vpx_codec_vp8_dx }, +#endif + +#if CONFIG_VP9_DECODER + { "vp9", VP9_FOURCC, &vpx_codec_vp9_dx }, +#endif +}; + +int get_vpx_decoder_count(void) { + return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]); +} + +const VpxInterface *get_vpx_decoder_by_index(int i) { return &vpx_decoders[i]; } + +const VpxInterface *get_vpx_decoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_vpx_decoder_count(); ++i) { + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + if (strcmp(decoder->name, name) == 0) return decoder; + } + + return NULL; +} + +const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) { + int i; + + for (i = 0; i < get_vpx_decoder_count(); ++i) { + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + if (decoder->fourcc == fourcc) return decoder; + } + + return NULL; +} + +#endif // CONFIG_DECODERS + +int vpx_img_plane_width(const vpx_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +int vpx_img_plane_height(const vpx_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +void vpx_img_write(const vpx_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = vpx_img_plane_width(img, plane) * + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = vpx_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } +} + +int vpx_img_read(vpx_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = vpx_img_plane_width(img, plane) * + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = vpx_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + if (fread(buf, 1, w, file) != (size_t)w) return 0; + buf += stride; + } + } + + return 1; +} + +// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t +double sse_to_psnr(double samples, double peak, double sse) { + static const double kMaxPSNR = 100.0; + + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > kMaxPSNR ? kMaxPSNR : psnr; + } else { + return kMaxPSNR; + } +} + +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +void open_input_file(struct VpxInputContext *input) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.vpx_fmt; + input->bit_depth = input->y4m.bit_depth; + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +void close_input_file(struct VpxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} +#endif + +// TODO(debargha): Consolidate the functions below into a separate file. +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = + (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift) { + if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + highbd_img_upshift(dst, src, input_shift); + } else { + lowbd_img_upshift(dst, src, input_shift); + } +} + +void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { + int plane; + if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = + (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = (uint8_t)(*p_src++); + } + } + } +} + +static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = + (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; + } + } +} + +static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + uint16_t *p_src = + (uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = *p_src++ >> down_shift; + } + } + } +} + +void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) { + if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + highbd_img_downshift(dst, src, down_shift); + } else { + lowbd_img_downshift(dst, src, down_shift); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + uint32_t i; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); +#if CONFIG_VP9_HIGHBITDEPTH + if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } +#endif + + for (i = 0; i < img1->d_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + l_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + c_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + c_w) == 0); + + return match; +} + +#define mmin(a, b) ((a) < (b) ? (a) : (b)) + +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + uint16_t *plane1, *plane2; + uint32_t stride1, stride2; + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y] / 2; + stride2 = img2->stride[VPX_PLANE_Y] / 2; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(plane1 + (i + k) * stride1 + j + l); + yloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U] / 2; + stride2 = img2->stride[VPX_PLANE_U] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(plane1 + (i + k) * stride1 + j + l); + uloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V] / 2; + stride2 = img2->stride[VPX_PLANE_V] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(plane1 + (i + k) * stride1 + j + l); + vloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]) { + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != + *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l); + yloc[3] = *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l) != + *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l); + uloc[3] = *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l); + match = 0; + break; + } + } + } + } + } + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l) != + *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l); + vloc[3] = *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l); + match = 0; + break; + } + } + } + } + } +} diff --git a/media/libvpx/libvpx/tools_common.h b/media/libvpx/libvpx/tools_common.h new file mode 100644 index 0000000000..e2942d04b8 --- /dev/null +++ b/media/libvpx/libvpx/tools_common.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_TOOLS_COMMON_H_ +#define VPX_TOOLS_COMMON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_image.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" + +#if CONFIG_ENCODERS +#include "./y4minput.h" +#endif + +#if defined(_MSC_VER) +/* MSVS uses _f{seek,tell}i64. */ +#define fseeko _fseeki64 +#define ftello _ftelli64 +typedef int64_t FileOffset; +#elif defined(_WIN32) +/* MinGW uses f{seek,tell}o64 for large files. */ +#define fseeko fseeko64 +#define ftello ftello64 +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ +#include /* NOLINT */ +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ + +#if CONFIG_OS_SUPPORT +#if defined(_MSC_VER) +#include /* NOLINT */ +#define isatty _isatty +#define fileno _fileno +#else +#include /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ + +#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) + +#ifndef PATH_MAX +#define PATH_MAX 512 +#endif + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define RAW_FRAME_HDR_SZ sizeof(uint32_t) + +#define VP8_FOURCC 0x30385056 +#define VP9_FOURCC 0x30395056 + +enum VideoFileType { + FILE_TYPE_RAW, + FILE_TYPE_IVF, + FILE_TYPE_Y4M, + FILE_TYPE_WEBM +}; + +struct FileTypeDetectionBuffer { + char buf[4]; + size_t buf_read; + size_t position; +}; + +struct VpxRational { + int numerator; + int denominator; +}; + +struct VpxInputContext { + const char *filename; + FILE *file; + int64_t length; + struct FileTypeDetectionBuffer detect; + enum VideoFileType file_type; + uint32_t width; + uint32_t height; + struct VpxRational pixel_aspect_ratio; + vpx_img_fmt_t fmt; + vpx_bit_depth_t bit_depth; + int only_i420; + uint32_t fourcc; + struct VpxRational framerate; +#if CONFIG_ENCODERS + y4m_input y4m; +#endif +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) +#define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) +#else +#define VPX_NO_RETURN +#endif + +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef VPX_TOOLS_FORMAT_PRINTF +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + +/* Sets a stdio stream into binary mode */ +FILE *set_binary_mode(FILE *stream); + +VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); + +VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); + +/* The tool including this file must define usage_exit() */ +VPX_NO_RETURN void usage_exit(void); + +#undef VPX_NO_RETURN + +int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); + +typedef struct VpxInterface { + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); +} VpxInterface; + +int get_vpx_encoder_count(void); +const VpxInterface *get_vpx_encoder_by_index(int i); +const VpxInterface *get_vpx_encoder_by_name(const char *name); + +int get_vpx_decoder_count(void); +const VpxInterface *get_vpx_decoder_by_index(int i); +const VpxInterface *get_vpx_decoder_by_name(const char *name); +const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc); + +int vpx_img_plane_width(const vpx_image_t *img, int plane); +int vpx_img_plane_height(const vpx_image_t *img, int plane); +void vpx_img_write(const vpx_image_t *img, FILE *file); +int vpx_img_read(vpx_image_t *img, FILE *file); + +double sse_to_psnr(double samples, double peak, double mse); + +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img); +int file_is_y4m(const char detect[4]); +int fourcc_is_ivf(const char detect[4]); +void open_input_file(struct VpxInputContext *input); +void close_input_file(struct VpxInputContext *input); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift); +void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift); +void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); +#endif + +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2); +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); +#endif +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // VPX_TOOLS_COMMON_H_ diff --git a/media/libvpx/libvpx/usage.dox b/media/libvpx/libvpx/usage.dox new file mode 100644 index 0000000000..88235202d1 --- /dev/null +++ b/media/libvpx/libvpx/usage.dox @@ -0,0 +1,136 @@ +/*!\page usage Usage + + The vpx multi-format codec SDK provides a unified interface amongst its + supported codecs. This abstraction allows applications using this SDK to + easily support multiple video formats with minimal code duplication or + "special casing." This section describes the interface common to all codecs. + For codec-specific details, see the \ref codecs page. + + The following sections are common to all codecs: + - \ref usage_types + - \ref usage_features + - \ref usage_init + - \ref usage_errors + + For more information on decoder and encoder specific usage, see the + following pages: + \if decoder + \li \subpage usage_decode + \endif + \if encoder + \li \subpage usage_encode + \endif + + \section usage_types Important Data Types + There are two important data structures to consider in this interface. + + \subsection usage_ctxs Contexts + A context is a storage area allocated by the calling application that the + codec may write into to store details about a single instance of that codec. + Most of the context is implementation specific, and thus opaque to the + application. The context structure as seen by the application is of fixed + size, and thus can be allocated with automatic storage or dynamically + on the heap. + + Most operations require an initialized codec context. Codec context + instances are codec specific. That is, the codec to be used for the encoded + video must be known at initialization time. See #vpx_codec_ctx_t for further + information. + + \subsection usage_ifaces Interfaces + A codec interface is an opaque structure that controls how function calls + into the generic interface are dispatched to their codec-specific + implementations. Applications \ref MUSTNOT attempt to examine or override + this storage, as it contains internal implementation details likely to + change from release to release. + + Each supported codec will expose an interface structure to the application + as an extern reference to a structure of the incomplete type + #vpx_codec_iface_t. + + \section usage_features Features + Several "features" are defined that are optionally implemented by codec + algorithms. Indeed, the same algorithm may support different features on + different platforms. The purpose of defining these features is that when + they are implemented, they conform to a common interface. The features, or + capabilities, of an algorithm can be queried from it's interface by using + the vpx_codec_get_caps() method. Attempts to invoke features not supported + by an algorithm will generally result in #VPX_CODEC_INCAPABLE. + + \if decoder + Currently defined decoder features include: + - \ref usage_cb + - \ref usage_postproc + \endif + + \section usage_init Initialization + To initialize a codec instance, the address of the codec context + and interface structures are passed to an initialization function. Depending + on the \ref usage_features that the codec supports, the codec could be + initialized in different modes. + + To prevent cases of confusion where the ABI of the library changes, + the ABI is versioned. The ABI version number must be passed at + initialization time to ensure the application is using a header file that + matches the library. The current ABI version number is stored in the + preprocessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and + #VPX_DECODER_ABI_VERSION. For convenience, each initialization function has + a wrapper macro that inserts the correct version number. These macros are + named like the initialization methods, but without the _ver suffix. + + + The available initialization methods are: + \if encoder + \li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) + \li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) + \endif + \if decoder + \li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) + \endif + + + \section usage_errors Error Handling + Almost all codec functions return an error status of type #vpx_codec_err_t. + The semantics of how each error condition should be processed is clearly + defined in the definitions of each enumerated value. Error values can be + converted into ASCII strings with the vpx_codec_error() and + vpx_codec_err_to_string() methods. The difference between these two methods is + that vpx_codec_error() returns the error state from an initialized context, + whereas vpx_codec_err_to_string() can be used in cases where an error occurs + outside any context. The enumerated value returned from the last call can be + retrieved from the err member of the decoder context as well. + Finally, more detailed error information may be able to be obtained by using + the vpx_codec_error_detail() method. Not all errors produce detailed error + information. + + In addition to error information, the codec library's build configuration + is available at runtime on some platforms. This information can be returned + by calling vpx_codec_build_config(), and is formatted as a base64 coded string + (comprised of characters in the set [a-z_a-Z0-9+/]). This information is not + useful to an application at runtime, but may be of use to vpx for support. + + + \section usage_deadline Deadline + Both the encoding and decoding functions have a deadline + parameter. This parameter indicates the amount of time, in microseconds + (us), that the application wants the codec to spend processing before + returning. This is a soft deadline -- that is, the semantics of the + requested operation take precedence over meeting the deadline. If, for + example, an application sets a deadline of 1000us, and the + frame takes 2000us to decode, the call to vpx_codec_decode() will return + after 2000us. In this case the deadline is not met, but the semantics of the + function are preserved. If, for the same frame, an application instead sets + a deadline of 5000us, the decoder will see that it has 3000us + remaining in its time slice when decoding completes. It could then choose to + run a set of \ref usage_postproc filters, and perhaps would return after + 4000us (instead of the allocated 5000us). In this case the deadline is met, + and the semantics of the call are preserved, as before. + + The special value 0 is reserved to represent an infinite + deadline. In this case, the codec will perform as much processing as + possible to yield the highest quality frame. + + By convention, the value 1 is used to mean "return as fast as + possible." + +*/ diff --git a/media/libvpx/libvpx/usage_cx.dox b/media/libvpx/libvpx/usage_cx.dox new file mode 100644 index 0000000000..b2220cfdde --- /dev/null +++ b/media/libvpx/libvpx/usage_cx.dox @@ -0,0 +1,15 @@ +/*! \page usage_encode Encoding + + The vpx_codec_encode() function is at the core of the encode loop. It + processes raw images passed by the application, producing packets of + compressed data. The deadline parameter controls the amount + of time in microseconds the encoder should spend working on the frame. For + more information on the deadline parameter, see + \ref usage_deadline. + + + \if samples + \ref samples + \endif + +*/ diff --git a/media/libvpx/libvpx/usage_dx.dox b/media/libvpx/libvpx/usage_dx.dox new file mode 100644 index 0000000000..85063f705b --- /dev/null +++ b/media/libvpx/libvpx/usage_dx.dox @@ -0,0 +1,64 @@ +/*! \page usage_decode Decoding + + The vpx_codec_decode() function is at the core of the decode loop. It + processes packets of compressed data passed by the application, producing + decoded images. The decoder expects packets to comprise exactly one image + frame of data. Packets \ref MUST be passed in decode order. If the + application wishes to associate some data with the frame, the + user_priv member may be set. The deadline + parameter controls the amount of time in microseconds the decoder should + spend working on the frame. This is typically used to support adaptive + \ref usage_postproc based on the amount of free CPU time. For more + information on the deadline parameter, see \ref usage_deadline. + + \if samples + \ref samples + \endif + + + \section usage_cb Callback Based Decoding + There are two methods for the application to access decoded frame data. Some + codecs support asynchronous (callback-based) decoding \ref usage_features + that allow the application to register a callback to be invoked by the + decoder when decoded data becomes available. Decoders are not required to + support this feature, however. Like all \ref usage_features, support can be + determined by calling vpx_codec_get_caps(). Callbacks are available in both + frame-based and slice-based variants. Frame based callbacks conform to the + signature of #vpx_codec_put_frame_cb_fn_t and are invoked once the entire + frame has been decoded. Slice based callbacks conform to the signature of + #vpx_codec_put_slice_cb_fn_t and are invoked after a subsection of the frame + is decoded. For example, a slice callback could be issued for each + macroblock row. However, the number and size of slices to return is + implementation specific. Also, the image data passed in a slice callback is + not necessarily in the same memory segment as the data will be when it is + assembled into a full frame. For this reason, the application \ref MUST + examine the rectangles that describe what data is valid to access and what + data has been updated in this call. For all their additional complexity, + slice based decoding callbacks provide substantial speed gains to the + overall application in some cases, due to improved cache behavior. + + + \section usage_frame_iter Frame Iterator Based Decoding + If the codec does not support callback based decoding, or the application + chooses not to make use of that feature, decoded frames are made available + through the vpx_codec_get_frame() iterator. The application initializes the + iterator storage (of type #vpx_codec_iter_t) to NULL, then calls + vpx_codec_get_frame repeatedly until it returns NULL, indicating that all + images have been returned. This process may result in zero, one, or many + frames that are ready for display, depending on the codec. + + + \section usage_postproc Postprocessing + Postprocessing is a process that is applied after a frame is decoded to + enhance the image's appearance by removing artifacts introduced in the + compression process. It is not required to properly decode the frame, and + is generally done only when there is enough spare CPU time to execute + the required filters. Codecs may support a number of different + postprocessing filters, and the available filters may differ from platform + to platform. Embedded devices often do not have enough CPU to implement + postprocessing in software. The filter selection is generally handled + automatically by the codec, depending on the amount of time remaining before + hitting the user-specified \ref usage_deadline after decoding the frame. + + +*/ diff --git a/media/libvpx/libvpx/video_common.h b/media/libvpx/libvpx/video_common.h new file mode 100644 index 0000000000..77eb9fac0c --- /dev/null +++ b/media/libvpx/libvpx/video_common.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VIDEO_COMMON_H_ +#define VPX_VIDEO_COMMON_H_ + +#include "./tools_common.h" + +typedef struct { + uint32_t codec_fourcc; + int frame_width; + int frame_height; + struct VpxRational time_base; +} VpxVideoInfo; + +#endif // VPX_VIDEO_COMMON_H_ diff --git a/media/libvpx/libvpx/video_reader.c b/media/libvpx/libvpx/video_reader.c new file mode 100644 index 0000000000..16822eff3c --- /dev/null +++ b/media/libvpx/libvpx/video_reader.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./ivfdec.h" +#include "./video_reader.h" + +#include "vpx_ports/mem_ops.h" + +static const char *const kIVFSignature = "DKIF"; + +struct VpxVideoReaderStruct { + VpxVideoInfo info; + FILE *file; + uint8_t *buffer; + size_t buffer_size; + size_t frame_size; +}; + +VpxVideoReader *vpx_video_reader_open(const char *filename) { + char header[32]; + VpxVideoReader *reader = NULL; + FILE *const file = fopen(filename, "rb"); + if (!file) { + fprintf(stderr, "%s can't be opened.\n", filename); // Can't open file + return NULL; + } + + if (fread(header, 1, 32, file) != 32) { + fprintf(stderr, "File header on %s can't be read.\n", + filename); // Can't read file header + return NULL; + } + if (memcmp(kIVFSignature, header, 4) != 0) { + fprintf(stderr, "The IVF signature on %s is wrong.\n", + filename); // Wrong IVF signature + + return NULL; + } + if (mem_get_le16(header + 4) != 0) { + fprintf(stderr, "%s uses the wrong IVF version.\n", + filename); // Wrong IVF version + + return NULL; + } + + reader = calloc(1, sizeof(*reader)); + if (!reader) { + fprintf( + stderr, + "Can't allocate VpxVideoReader\n"); // Can't allocate VpxVideoReader + + return NULL; + } + + reader->file = file; + reader->info.codec_fourcc = mem_get_le32(header + 8); + reader->info.frame_width = mem_get_le16(header + 12); + reader->info.frame_height = mem_get_le16(header + 14); + reader->info.time_base.numerator = mem_get_le32(header + 16); + reader->info.time_base.denominator = mem_get_le32(header + 20); + + return reader; +} + +void vpx_video_reader_close(VpxVideoReader *reader) { + if (reader) { + fclose(reader->file); + free(reader->buffer); + free(reader); + } +} + +int vpx_video_reader_read_frame(VpxVideoReader *reader) { + return !ivf_read_frame(reader->file, &reader->buffer, &reader->frame_size, + &reader->buffer_size); +} + +const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, + size_t *size) { + if (size) *size = reader->frame_size; + + return reader->buffer; +} + +const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader) { + return &reader->info; +} diff --git a/media/libvpx/libvpx/video_reader.h b/media/libvpx/libvpx/video_reader.h new file mode 100644 index 0000000000..1f5c8088bb --- /dev/null +++ b/media/libvpx/libvpx/video_reader.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VIDEO_READER_H_ +#define VPX_VIDEO_READER_H_ + +#include "./video_common.h" + +// The following code is work in progress. It is going to support transparent +// reading of input files. Right now only IVF format is supported for +// simplicity. The main goal the API is to be simple and easy to use in example +// code and in vpxenc/vpxdec later. All low-level details like memory +// buffer management are hidden from API users. +struct VpxVideoReaderStruct; +typedef struct VpxVideoReaderStruct VpxVideoReader; + +#ifdef __cplusplus +extern "C" { +#endif + +// Opens the input file for reading and inspects it to determine file type. +// Returns an opaque VpxVideoReader* upon success, or NULL upon failure. +// Right now only IVF format is supported. +VpxVideoReader *vpx_video_reader_open(const char *filename); + +// Frees all resources associated with VpxVideoReader* returned from +// vpx_video_reader_open() call. +void vpx_video_reader_close(VpxVideoReader *reader); + +// Reads frame from the file and stores it in internal buffer. +int vpx_video_reader_read_frame(VpxVideoReader *reader); + +// Returns the pointer to memory buffer with frame data read by last call to +// vpx_video_reader_read_frame(). +const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, size_t *size); + +// Fills VpxVideoInfo with information from opened video file. +const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VIDEO_READER_H_ diff --git a/media/libvpx/libvpx/video_writer.c b/media/libvpx/libvpx/video_writer.c new file mode 100644 index 0000000000..6e9a848bc3 --- /dev/null +++ b/media/libvpx/libvpx/video_writer.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./ivfenc.h" +#include "./video_writer.h" +#include "vpx/vpx_encoder.h" + +struct VpxVideoWriterStruct { + VpxVideoInfo info; + FILE *file; + int frame_count; +}; + +static void write_header(FILE *file, const VpxVideoInfo *info, + int frame_count) { + struct vpx_codec_enc_cfg cfg; + cfg.g_w = info->frame_width; + cfg.g_h = info->frame_height; + cfg.g_timebase.num = info->time_base.numerator; + cfg.g_timebase.den = info->time_base.denominator; + + ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count); +} + +VpxVideoWriter *vpx_video_writer_open(const char *filename, + VpxContainer container, + const VpxVideoInfo *info) { + if (container == kContainerIVF) { + VpxVideoWriter *writer = NULL; + FILE *const file = fopen(filename, "wb"); + if (!file) { + fprintf(stderr, "%s can't be written to.\n", filename); + return NULL; + } + writer = malloc(sizeof(*writer)); + if (!writer) { + fprintf(stderr, "Can't allocate VpxVideoWriter.\n"); + return NULL; + } + writer->frame_count = 0; + writer->info = *info; + writer->file = file; + + write_header(writer->file, info, 0); + + return writer; + } + fprintf(stderr, "VpxVideoWriter supports only IVF.\n"); + return NULL; +} + +void vpx_video_writer_close(VpxVideoWriter *writer) { + if (writer) { + // Rewriting frame header with real frame count + rewind(writer->file); + write_header(writer->file, &writer->info, writer->frame_count); + + fclose(writer->file); + free(writer); + } +} + +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { + ivf_write_frame_header(writer->file, pts, size); + if (fwrite(buffer, 1, size, writer->file) != size) return 0; + + ++writer->frame_count; + + return 1; +} diff --git a/media/libvpx/libvpx/video_writer.h b/media/libvpx/libvpx/video_writer.h new file mode 100644 index 0000000000..b4d242b920 --- /dev/null +++ b/media/libvpx/libvpx/video_writer.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VIDEO_WRITER_H_ +#define VPX_VIDEO_WRITER_H_ + +#include "./video_common.h" + +typedef enum { kContainerIVF } VpxContainer; + +struct VpxVideoWriterStruct; +typedef struct VpxVideoWriterStruct VpxVideoWriter; + +#ifdef __cplusplus +extern "C" { +#endif + +// Finds and opens writer for specified container format. +// Returns an opaque VpxVideoWriter* upon success, or NULL upon failure. +// Right now only IVF format is supported. +VpxVideoWriter *vpx_video_writer_open(const char *filename, + VpxContainer container, + const VpxVideoInfo *info); + +// Frees all resources associated with VpxVideoWriter* returned from +// vpx_video_writer_open() call. +void vpx_video_writer_close(VpxVideoWriter *writer); + +// Writes frame bytes to the file. +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VIDEO_WRITER_H_ diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.c b/media/libvpx/libvpx/vp8/common/alloccommon.c new file mode 100644 index 0000000000..722b158c3a --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/alloccommon.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "alloccommon.h" +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "onyxc_int.h" +#include "findnearmv.h" +#include "entropymode.h" +#include "systemdependent.h" + +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) { + int i; + for (i = 0; i < NUM_YV12_BUFFERS; ++i) { + vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); + } + + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); +#if CONFIG_POSTPROC + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + if (oci->post_proc_buffer_int_used) { + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int); + } + + vpx_free(oci->pp_limits_buffer); + oci->pp_limits_buffer = NULL; + + vpx_free(oci->postproc_state.generated_noise); + oci->postproc_state.generated_noise = NULL; +#endif + + vpx_free(oci->above_context); + vpx_free(oci->mip); +#if CONFIG_ERROR_CONCEALMENT + vpx_free(oci->prev_mip); + oci->prev_mip = NULL; +#endif + + oci->above_context = NULL; + oci->mip = NULL; +} + +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) { + int i; + + vp8_de_alloc_frame_buffers(oci); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) height += 16 - (height & 0xf); + + for (i = 0; i < NUM_YV12_BUFFERS; ++i) { + oci->fb_idx_ref_cnt[i] = 0; + oci->yv12_fb[i].flags = 0; + if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, + VP8BORDERINPIXELS) < 0) { + goto allocation_fail; + } + } + + oci->new_fb_idx = 0; + oci->lst_fb_idx = 1; + oci->gld_fb_idx = 2; + oci->alt_fb_idx = 3; + + oci->fb_idx_ref_cnt[0] = 1; + oci->fb_idx_ref_cnt[1] = 1; + oci->fb_idx_ref_cnt[2] = 1; + oci->fb_idx_ref_cnt[3] = 1; + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, + VP8BORDERINPIXELS) < 0) { + goto allocation_fail; + } + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = + vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) goto allocation_fail; + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + /* Allocation of previous mode info will be done in vp8_decode_frame() + * as it is a decoder only data */ + + oci->above_context = + vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + + if (!oci->above_context) goto allocation_fail; + +#if CONFIG_POSTPROC + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, + VP8BORDERINPIXELS) < 0) { + goto allocation_fail; + } + + oci->post_proc_buffer_int_used = 0; + memset(&oci->postproc_state, 0, sizeof(oci->postproc_state)); + memset(oci->post_proc_buffer.buffer_alloc, 128, + oci->post_proc_buffer.frame_size); + + /* Allocate buffer to store post-processing filter coefficients. + * + * Note: Round up mb_cols to support SIMD reads + */ + oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1)); + if (!oci->pp_limits_buffer) goto allocation_fail; +#endif + + return 0; + +allocation_fail: + vp8_de_alloc_frame_buffers(oci); + return 1; +} + +void vp8_setup_version(VP8_COMMON *cm) { + switch (cm->version) { + case 0: + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + cm->no_lpf = 1; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 3: + cm->no_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 1; + break; + default: + /*4,5,6,7 are reserved for future use*/ + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + } +} +void vp8_create_common(VP8_COMMON *oci) { + vp8_machine_specific_config(oci); + + vp8_init_mbmode_probs(oci); + vp8_default_bmode_probs(oci->fc.bmode_prob); + + oci->mb_no_coeff_skip = 1; + oci->no_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->multi_token_partition = ONE_PARTITION; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + /* Initialize reference frame sign bias structure to defaults */ + memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + /* Default disable buffer to buffer copying */ + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; +} + +void vp8_remove_common(VP8_COMMON *oci) { vp8_de_alloc_frame_buffers(oci); } diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.h b/media/libvpx/libvpx/vp8/common/alloccommon.h new file mode 100644 index 0000000000..2d376bbac3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/alloccommon.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_ +#define VPX_VP8_COMMON_ALLOCCOMMON_H_ + +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_create_common(VP8_COMMON *oci); +void vp8_remove_common(VP8_COMMON *oci); +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); +void vp8_setup_version(VP8_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_ALLOCCOMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 0000000000..48a1972048 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vp8/common/arm/loopfilter_arm.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" + +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, + hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, + hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, + hev_thr, v_ptr); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, + lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, + lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, + lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, + blim, lim, hev_thr, + v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, + hev_thr); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, + hev_thr, v_ptr + 4); +} diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 0000000000..6cf660d228 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ +#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ + +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh, unsigned char *v); + +loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; + +#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 0000000000..590956dde1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,764 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, + { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, + { 32, 96 }, { 16, 112 } }; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t e0, e1, e2; + + if (xoffset == 0) { // skip_1stpass_filter + uint8x8_t a0, a1, a2, a3, a4; + + a0 = load_and_shift(src_ptr); + src_ptr += src_pixels_per_line; + a1 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a2 = load_and_shift(src_ptr); + src_ptr += src_pixels_per_line; + a3 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a4 = vld1_u8(src_ptr); + + e0 = vext_u8(a0, a1, 4); + e1 = vext_u8(a2, a3, 4); + e2 = a4; + } else { + uint8x8_t a0, a1, a2, a3, a4, b4; + uint8x16_t a01, a23; + uint8x16_t b01, b23; + uint32x2x2_t c0, c1, c2, c3; + uint16x8_t d0, d1, d2; + const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + a0 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a1 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a2 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a3 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a4 = vld1_u8(src_ptr); + + a01 = vcombine_u8(a0, a1); + a23 = vcombine_u8(a2, a3); + + b01 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a01), 8)); + b23 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a23), 8)); + b4 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(a4), 8)); + + c0 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a01)), + vreinterpret_u32_u8(vget_high_u8(a01))); + c1 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a23)), + vreinterpret_u32_u8(vget_high_u8(a23))); + c2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b01)), + vreinterpret_u32_u8(vget_high_u8(b01))); + c3 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b23)), + vreinterpret_u32_u8(vget_high_u8(b23))); + + d0 = vmull_u8(vreinterpret_u8_u32(c0.val[0]), filter0); + d1 = vmull_u8(vreinterpret_u8_u32(c1.val[0]), filter0); + d2 = vmull_u8(a4, filter0); + + d0 = vmlal_u8(d0, vreinterpret_u8_u32(c2.val[0]), filter1); + d1 = vmlal_u8(d1, vreinterpret_u8_u32(c3.val[0]), filter1); + d2 = vmlal_u8(d2, b4, filter1); + + e0 = vqrshrn_n_u16(d0, 7); + e1 = vqrshrn_n_u16(d1, 7); + e2 = vqrshrn_n_u16(d2, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); + } else { + uint8x8_t f0, f1; + const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + uint16x8_t b0 = vmull_u8(e0, filter0); + uint16x8_t b1 = vmull_u8(e1, filter0); + + const uint8x8_t a0 = vext_u8(e0, e1, 4); + const uint8x8_t a1 = vext_u8(e1, e2, 4); + + b0 = vmlal_u8(b0, a0, filter1); + b1 = vmlal_u8(b1, a1, filter1); + + f0 = vqrshrn_n_u16(b0, 7); + f1 = vqrshrn_n_u16(b1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1)); + } +} + +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); + dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); + dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); + dst_ptr += dst_pitch; + } + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 0000000000..c89b47d628 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; ++r) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; ++r) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; ++r) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 0000000000..d12c3a8392 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +void vp8_dc_only_idct_add_neon(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; ++i) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 0000000000..5445f2965a --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +static const int16_t cospi8sqrt2minus1 = 20091; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; + +void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16( + vmulq_u16(vreinterpretq_u16_s16(q3), vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16( + vmulq_u16(vreinterpretq_u16_s16(q4), vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q4 = vshrq_n_s16(q4, 1); + + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q4 = vshrq_n_s16(q4, 1); + + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 0000000000..791aaea2ae --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" + +void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(d->qcoeff); + qDQC = vld2q_s16(DQC); + + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(d->dqcoeff, qDQ); +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c new file mode 100644 index 0000000000..5c26ce67a4 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } +} + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, + unsigned char *dst, int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); +} + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, + int stride, char *eobs) { + int i; + + for (i = 0; i < 4; ++i) { + if (((short *)(eobs))[0]) { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride); + else + idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride); + } + q += 64; + dst += 4 * stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dst_u, + unsigned char *dst_v, int stride, + char *eobs) { + if (((short *)(eobs))[0]) { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_u, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); + } + + q += 32; + dst_u += 4 * stride; + + if (((short *)(eobs))[1]) { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_u, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_v, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); + } + + q += 32; + dst_v += 4 * stride; + + if (((short *)(eobs))[3]) { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_v, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 0000000000..91600bfc00 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 0000000000..df983b23a3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" + +static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( + unsigned char *s, int p, const unsigned char *blimit) { + uint8_t *sp; + uint8x16_t qblimit, q0u8; + uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; + int16x8_t q2s16, q3s16, q13s16; + int8x8_t d8s8, d9s8; + int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; + + qblimit = vdupq_n_u8(*blimit); + + sp = s - (p << 1); + q5u8 = vld1q_u8(sp); + sp += p; + q6u8 = vld1q_u8(sp); + sp += p; + q7u8 = vld1q_u8(sp); + sp += p; + q8u8 = vld1q_u8(sp); + + q15u8 = vabdq_u8(q6u8, q7u8); + q14u8 = vabdq_u8(q5u8, q8u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q13s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + q7u8 = veorq_u8(q7u8, q0u8); + q8u8 = veorq_u8(q8u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), + vget_low_s8(vreinterpretq_s8_u8(q6u8))); + q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), + vget_high_s8(vreinterpretq_s8_u8(q6u8))); + + q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), vreinterpretq_s8_u8(q8u8)); + + q2s16 = vmulq_s16(q2s16, q13s16); + q3s16 = vmulq_s16(q3s16, q13s16); + + q10u8 = vdupq_n_u8(3); + q9u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); + q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); + + d8s8 = vqmovn_s16(q2s16); + d9s8 = vqmovn_s16(q3s16); + q4s8 = vcombine_s8(d8s8, d9s8); + + q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q3s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + vst1q_u8(s, q7u8); + s -= p; + vst1q_u8(s, q6u8); + return; +} + +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c new file mode 100644 index 0000000000..fbc83ae290 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE void write_2x4(unsigned char *dst, int pitch, + const uint8x8x2_t result) { + /* + * uint8x8x2_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + --- + * after vtrn_u8 + 00 10 02 12 | 04 14 06 16 + 01 11 03 13 | 05 15 07 17 + */ + const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], result.val[1]); + const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]); + const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]); + vst1_lane_u16((uint16_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 3); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 3); +} + +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + write_2x4(dst, pitch, result); + dst += pitch * 8; + write_2x4(dst, pitch, result2); +} +#else +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + vst2_lane_u8(dst, result, 0); + dst += pitch; + vst2_lane_u8(dst, result, 1); + dst += pitch; + vst2_lane_u8(dst, result, 2); + dst += pitch; + vst2_lane_u8(dst, result, 3); + dst += pitch; + vst2_lane_u8(dst, result, 4); + dst += pitch; + vst2_lane_u8(dst, result, 5); + dst += pitch; + vst2_lane_u8(dst, result, 6); + dst += pitch; + vst2_lane_u8(dst, result, 7); + dst += pitch; + + vst2_lane_u8(dst, result2, 0); + dst += pitch; + vst2_lane_u8(dst, result2, 1); + dst += pitch; + vst2_lane_u8(dst, result2, 2); + dst += pitch; + vst2_lane_u8(dst, result2, 3); + dst += pitch; + vst2_lane_u8(dst, result2, 4); + dst += pitch; + vst2_lane_u8(dst, result2, 5); + dst += pitch; + vst2_lane_u8(dst, result2, 6); + dst += pitch; + vst2_lane_u8(dst, result2, 7); +} +#endif // VPX_INCOMPATIBLE_GCC + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { + uint8x8x4_t x; + const uint8x8_t a = vld1_u8(src); + const uint8x8_t b = vld1_u8(src + pitch * 1); + const uint8x8_t c = vld1_u8(src + pitch * 2); + const uint8x8_t d = vld1_u8(src + pitch * 3); + const uint8x8_t e = vld1_u8(src + pitch * 4); + const uint8x8_t f = vld1_u8(src + pitch * 5); + const uint8x8_t g = vld1_u8(src + pitch * 6); + const uint8x8_t h = vld1_u8(src + pitch * 7); + const uint32x2x2_t r04_u32 = + vtrn_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(e)); + const uint32x2x2_t r15_u32 = + vtrn_u32(vreinterpret_u32_u8(b), vreinterpret_u32_u8(f)); + const uint32x2x2_t r26_u32 = + vtrn_u32(vreinterpret_u32_u8(c), vreinterpret_u32_u8(g)); + const uint32x2x2_t r37_u32 = + vtrn_u32(vreinterpret_u32_u8(d), vreinterpret_u32_u8(h)); + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), + vreinterpret_u16_u32(r26_u32.val[0])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), + vreinterpret_u16_u32(r37_u32.val[0])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + /* + * after vtrn_u32 + 00 01 02 03 | 40 41 42 43 + 10 11 12 13 | 50 51 52 53 + 20 21 22 23 | 60 61 62 63 + 30 31 32 33 | 70 71 72 73 + --- + * after vtrn_u16 + 00 01 20 21 | 40 41 60 61 + 02 03 22 23 | 42 43 62 63 + 10 11 30 31 | 50 51 70 71 + 12 13 32 33 | 52 52 72 73 + + 00 01 20 21 | 40 41 60 61 + 10 11 30 31 | 50 51 70 71 + 02 03 22 23 | 42 43 62 63 + 12 13 32 33 | 52 52 72 73 + --- + * after vtrn_u8 + 00 10 20 30 | 40 50 60 70 + 01 11 21 31 | 41 51 61 71 + 02 12 22 32 | 42 52 62 72 + 03 13 23 33 | 43 53 63 73 + */ + x.val[0] = r01_u8.val[0]; + x.val[1] = r01_u8.val[1]; + x.val[2] = r23_u8.val[0]; + x.val[3] = r23_u8.val[1]; + + return x; +} +#else +static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { + uint8x8x4_t x; + x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0); + x = vld4_lane_u8(src, x, 0); + src += pitch; + x = vld4_lane_u8(src, x, 1); + src += pitch; + x = vld4_lane_u8(src, x, 2); + src += pitch; + x = vld4_lane_u8(src, x, 3); + src += pitch; + x = vld4_lane_u8(src, x, 4); + src += pitch; + x = vld4_lane_u8(src, x, 5); + src += pitch; + x = vld4_lane_u8(src, x, 6); + src += pitch; + x = vld4_lane_u8(src, x, 7); + return x; +} +#endif // VPX_INCOMPATIBLE_GCC + +static INLINE void vp8_loop_filter_simple_vertical_edge_neon( + unsigned char *s, int p, const unsigned char *blimit) { + unsigned char *src1; + uint8x16_t qblimit, q0u8; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; + int16x8_t q2s16, q13s16, q11s16; + int8x8_t d28s8, d29s8; + int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; + uint8x8x4_t d0u8x4; // d6, d7, d8, d9 + uint8x8x4_t d1u8x4; // d10, d11, d12, d13 + uint8x8x2_t d2u8x2; // d12, d13 + uint8x8x2_t d3u8x2; // d14, d15 + + qblimit = vdupq_n_u8(*blimit); + + src1 = s - 2; + d0u8x4 = read_4x8(src1, p); + src1 += p * 8; + d1u8x4 = read_4x8(src1, p); + + q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 + q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 + q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 + q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), + vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 0000000000..fafaf2d451 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" + +static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p2 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p1 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q1 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16, 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 0000000000..2724ca236b --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +static const int16_t cospi8sqrt2minus1 = 20091; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; + +void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int i; + uint32x2_t d6u32 = vdup_n_u32(0); + uint8x8_t d1u8; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + uint16x8_t q1u16; + int16x8_t q1s16, q2s16, q3s16, q4s16; + int32x2x2_t v2tmp0, v2tmp1; + int16x4x2_t v2tmp2, v2tmp3; + + d2 = vld1_s16(input); + d3 = vld1_s16(input + 4); + d4 = vld1_s16(input + 8); + d5 = vld1_s16(input + 12); + + // 1st for loop + q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here + q2s16 = vcombine_s16(d3, d5); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q4s16 = vshrq_n_s16(q4s16, 1); + + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + // 2nd for loop + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); + q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q4s16 = vshrq_n_s16(q4s16, 1); + + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); + q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); + + // dc_only_idct_add + for (i = 0; i < 2; i++, q1s16 = q2s16) { + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); + pred_ptr += pred_stride; + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); + dst_ptr += dst_stride; + } + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 0000000000..ee3c281f0f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1729 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_ports/mem.h" + +static const int8_t vp8_sub_pel_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ + { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters. +// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive. +// Elements 1 and 4 are either 0 or negative. The code accounts for this with +// multiply/accumulates which either add or subtract as needed. The other +// functions will be updated to use this table later. +// It is also expanded to 8 elements to allow loading into 64 bit neon +// registers. +static const uint8_t abs_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 }, + { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 }, + { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 }, + { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 }, +}; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void yonly4x4(const unsigned char *src, int src_stride, + int filter_offset, unsigned char *dst, + int dst_stride) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + + const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]); + const uint8x8_t filter0 = vdup_lane_u8(filter, 0); + const uint8x8_t filter1 = vdup_lane_u8(filter, 1); + const uint8x8_t filter2 = vdup_lane_u8(filter, 2); + const uint8x8_t filter3 = vdup_lane_u8(filter, 3); + const uint8x8_t filter4 = vdup_lane_u8(filter, 4); + const uint8x8_t filter5 = vdup_lane_u8(filter, 5); + + src -= src_stride * 2; + // Shift the even rows to allow using 'vext' to combine the vectors. armv8 + // has vcopy_lane which would be interesting. This started as just a + // horrible workaround for clang adding alignment hints to 32bit loads: + // https://llvm.org/bugs/show_bug.cgi?id=24421 + // But it turns out it almost identical to casting the loads. + a0 = load_and_shift(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = load_and_shift(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + a4 = load_and_shift(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = load_and_shift(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + a8 = vld1_u8(src); + + // Combine the rows so we can operate on 8 at a time. + b0 = vext_u8(a0, a1, 4); + b2 = vext_u8(a2, a3, 4); + b4 = vext_u8(a4, a5, 4); + b6 = vext_u8(a6, a7, 4); + b8 = a8; + + // To keep with the 8-at-a-time theme, combine *alternate* rows. This + // allows combining the odd rows with the even. + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + // Multiply and expand to 16 bits. + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + // Multiply, subtract and accumulate for filters 1 and 4 (the negative + // ones). + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + // Add more positive ones. vmlal should really return a signed type. + // It's doing signed math internally, as evidenced by the fact we can do + // subtractions followed by more additions. Ideally we could use + // vqmlal/sl but that instruction doesn't exist. Might be able to + // shoehorn vqdmlal/vqdmlsl in here but it would take some effort. + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + // Use signed saturation math because vmlsl may have left some negative + // numbers in there. + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + // Use signed again because numbers like -200 need to be saturated to 0. + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + uint8x16_t s0, s1, s2, s3, s4; + uint64x2_t s01, s23; + // Variables to hold src[] elements for the given filter[] + uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5; + uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4; + uint8x16_t s01_f0, s23_f0; + uint64x2_t s01_f3, s23_f3; + uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q; + // Accumulator variables. + uint16x8_t d0123, d4567, d89; + uint16x8_t d0123_a, d4567_a, d89_a; + int16x8_t e0123, e4567, e89; + // Second pass intermediates. + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5; + + if (xoffset == 0) { // Second pass only. + yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch); + return; + } + + if (yoffset == 0) { // First pass only. + src_ptr -= 2; + } else { // Add context for the second pass. 2 extra lines on top. + src_ptr -= 2 + (src_pixels_per_line * 2); + } + + filter = vld1_u8(abs_filters[xoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of + // garbage. So much effort for that last single bit. + // The low values of each pair are for filter0. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + // Shift to extract values for filter[5] + // If src[] is 0, this puts: + // 3 4 5 6 7 8 9 10 in s0_f5 + // Can't use vshr.u64 because it crosses the double word boundary. + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + + // Keep original src data as 64 bits to simplify shifting and extracting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + // 3 4 5 6 * filter0 + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + + // Shift over one to use -1, 0, 1, 2 for filter1 + // -1 0 1 2 * filter1 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + + // 2 3 4 5 * filter4 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + + // 0 1 2 3 * filter2 + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + + // 1 2 3 4 * filter3 + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + // Accumulate into different registers so it can use saturated addition. + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + + // Shift and narrow. + b0 = vqrshrun_n_s16(e0123, 7); + b2 = vqrshrun_n_s16(e4567, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2)); + return; + } + + // Load additional context when doing both filters. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s4 = vld1q_u8(src_ptr); + + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5); + + // 3 4 5 6 * filter0 + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + // But this time instead of 16 pixels to filter, there are 20. So an extra + // run with a doubleword register. + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + d89 = vmull_u8(s4_f5, filter5); + + // Save a copy as u64 for shifting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + d89 = vmlal_u8(d89, vget_low_u8(s4), filter0); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1); + d89 = vmlsl_u8(d89, s4_f1, filter1); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4); + d89 = vmlsl_u8(d89, s4_f4, filter4); + + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2); + d89 = vmlal_u8(d89, s4_f2, filter2); + + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3); + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + d89_a = vmull_u8(s4_f3, filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a)); + + b4 = vqrshrun_n_s16(e0123, 7); + b6 = vqrshrun_n_s16(e4567, 7); + b8 = vqrshrun_n_s16(e89, 7); + + // Second pass: 4x4 + filter = vld1_u8(abs_filters[yoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); +} + +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } +} + +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; ++i) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; ++j) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; ++i) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; ++i) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; ++i) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; ++j) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c new file mode 100644 index 0000000000..ebc004a048 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" +#include "vpx_ports/arm.h" + +static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#ifdef VPX_INCOMPATIBLE_GCC + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#else + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#endif // VPX_INCOMPATIBLE_GCC +} + +void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/media/libvpx/libvpx/vp8/common/blockd.c b/media/libvpx/libvpx/vp8/common/blockd.c new file mode 100644 index 0000000000..22905c10a6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/blockd.c @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" + +const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 3, 3, 3, 3, 4, 4, + 5, 5, 6, 6, 7, 7, 8 }; +const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, + 1, 2, 3, 0, 1, 2, 3, 4, 5, + 4, 5, 6, 7, 6, 7, 8 }; diff --git a/media/libvpx/libvpx/vp8/common/blockd.h b/media/libvpx/libvpx/vp8/common/blockd.h new file mode 100644 index 0000000000..8300aad941 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/blockd.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_BLOCKD_H_ +#define VPX_VP8_COMMON_BLOCKD_H_ + +void vpx_log(const char *format, ...); + +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "mv.h" +#include "treecoder.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*#define DCPRED 1*/ +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +typedef struct { + int r, c; +} POS; + +#define PLANE_TYPE_Y_NO_DC 0 +#define PLANE_TYPE_Y2 1 +#define PLANE_TYPE_UV 2 +#define PLANE_TYPE_Y_WITH_DC 3 + +typedef char ENTROPY_CONTEXT; +typedef struct { + ENTROPY_CONTEXT y1[4]; + ENTROPY_CONTEXT u[2]; + ENTROPY_CONTEXT v[2]; + ENTROPY_CONTEXT y2; +} ENTROPY_CONTEXT_PLANES; + +extern const unsigned char vp8_block2left[25]; +extern const unsigned char vp8_block2above[25]; + +#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B) + +typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE; + +typedef enum { + DC_PRED, /* average of above and left pixels */ + V_PRED, /* vertical prediction */ + H_PRED, /* horizontal prediction */ + TM_PRED, /* Truemotion prediction */ + B_PRED, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +/* Macroblock level features */ +typedef enum { + MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */ + MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */ + MB_LVL_MAX = 2 /* Number of MB level features supported */ + +} MB_LVL_FEATURES; + +/* Segment Feature Masks */ +#define SEGMENT_ALTQ 0x01 +#define SEGMENT_ALT_LF 0x02 + +#define VP8_YMODES (B_PRED + 1) +#define VP8_UV_MODES (TM_PRED + 1) + +#define VP8_MVREFS (1 + SPLITMV - NEARESTMV) + +typedef enum { + B_DC_PRED, /* average of above and left pixels */ + B_TM_PRED, + + B_VE_PRED, /* vertical prediction */ + B_HE_PRED, /* horizontal prediction */ + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ +#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +union b_mode_info { + B_PREDICTION_MODE as_mode; + int_mv mv; +}; + +typedef enum { + INTRA_FRAME = 0, + LAST_FRAME = 1, + GOLDEN_FRAME = 2, + ALTREF_FRAME = 3, + MAX_REF_FRAMES = 4 +} MV_REFERENCE_FRAME; + +typedef struct { + uint8_t mode, uv_mode; + uint8_t ref_frame; + uint8_t is_4x4; + int_mv mv; + + uint8_t partitioning; + /* does this mb has coefficients at all, 1=no coefficients, 0=need decode + tokens */ + uint8_t mb_skip_coeff; + uint8_t need_to_clamp_mvs; + /* Which set of segmentation parameters should be used for this MB */ + uint8_t segment_id; +} MB_MODE_INFO; + +typedef struct modeinfo { + MB_MODE_INFO mbmi; + union b_mode_info bmi[16]; +} MODE_INFO; + +#if CONFIG_MULTI_RES_ENCODING +/* The mb-level information needed to be stored for higher-resolution encoder */ +typedef struct { + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + int dissim; /* dissimilarity level of the macroblock */ +} LOWER_RES_MB_INFO; + +/* The frame-level information needed to be stored for higher-resolution + * encoder */ +typedef struct { + FRAME_TYPE frame_type; + int is_frame_dropped; + // If frame is dropped due to overshoot after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; + // The frame rate for the lowest resolution. + double low_res_framerate; + /* The frame number of each reference frames */ + unsigned int low_res_ref_frames[MAX_REF_FRAMES]; + // The video frame counter value for the key frame, for lowest resolution. + unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. + unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; + LOWER_RES_MB_INFO *mb_info; +} LOWER_RES_FRAME_INFO; +#endif + +typedef struct blockd { + short *qcoeff; + short *dqcoeff; + unsigned char *predictor; + short *dequant; + + int offset; + char *eob; + + union b_mode_info bmi; +} BLOCKD; + +typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch); + +typedef struct macroblockd { + DECLARE_ALIGNED(16, unsigned char, predictor[384]); + DECLARE_ALIGNED(16, short, qcoeff[400]); + DECLARE_ALIGNED(16, short, dqcoeff[400]); + DECLARE_ALIGNED(16, char, eobs[25]); + + DECLARE_ALIGNED(16, short, dequant_y1[16]); + DECLARE_ALIGNED(16, short, dequant_y1_dc[16]); + DECLARE_ALIGNED(16, short, dequant_y2[16]); + DECLARE_ALIGNED(16, short, dequant_uv[16]); + + /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */ + BLOCKD block[25]; + int fullpixel_mask; + + YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ + YV12_BUFFER_CONFIG dst; + + MODE_INFO *mode_info_context; + int mode_info_stride; + + FRAME_TYPE frame_type; + + int up_available; + int left_available; + + unsigned char *recon_above[3]; + unsigned char *recon_left[3]; + int recon_left_stride[2]; + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; + ENTROPY_CONTEXT_PLANES *left_context; + + /* 0 indicates segmentation at MB level is not enabled. Otherwise the + * individual bits indicate which features are active. */ + unsigned char segmentation_enabled; + + /* 0 (do not update) 1 (update) the macroblock segmentation map. */ + unsigned char update_mb_segmentation_map; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char update_mb_segmentation_data; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char mb_segment_abs_delta; + + /* Per frame flags that define which MB level features (such as quantizer or + * loop filter level) */ + /* are enabled and when enabled the proabilities used to decode the per MB + * flags in MB_MODE_INFO */ + /* Probability Tree used to code Segment number */ + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; + /* Segment parameters */ + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + + /* mode_based Loop filter adjustment */ + unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update; + + /* Delta values have the range +/- MAX_LOOP_FILTER */ + signed char + last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + signed char + mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + vp8_subpix_fn_t subpixel_predict; + vp8_subpix_fn_t subpixel_predict8x4; + vp8_subpix_fn_t subpixel_predict8x8; + vp8_subpix_fn_t subpixel_predict16x16; + + void *current_bc; + + int corrupted; + + struct vpx_internal_error_info error_info; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + /* This is an intermediate buffer currently used in sub-pixel motion search + * to keep a copy of the reference area. This buffer can be used for other + * purpose. + */ + DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]); +#endif +} MACROBLOCKD; + +extern void vp8_build_block_doffsets(MACROBLOCKD *x); +extern void vp8_setup_block_dptrs(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_BLOCKD_H_ diff --git a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h new file mode 100644 index 0000000000..b342096b55 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_ +#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Update probabilities for the nodes in the token entropy tree. + Generated file included by entropy.c */ + +const vp8_prob vp8_coef_update_probs + [BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES] = { + { + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 }, + { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }, + }, + { + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 }, + { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/media/libvpx/libvpx/vp8/common/common.h b/media/libvpx/libvpx/vp8/common/common.h new file mode 100644 index 0000000000..562569f9ab --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/common.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_COMMON_H_ +#define VPX_VP8_COMMON_COMMON_H_ + +#include + +/* Interface header for common constant data structures and lookup tables */ + +#include "vpx_mem/vpx_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Only need this for fixed-size arrays, for structs just assign. */ + +#define vp8_copy(Dest, Src) \ + do { \ + assert(sizeof(Dest) == sizeof(Src)); \ + memcpy(Dest, Src, sizeof(Src)); \ + } while (0) + +/* Use this for variably-sized arrays. */ + +#define vp8_copy_array(Dest, Src, N) \ + do { \ + assert(sizeof(*(Dest)) == sizeof(*(Src))); \ + memcpy(Dest, Src, (N) * sizeof(*(Src))); \ + } while (0) + +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)) + +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_COMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/context.c b/media/libvpx/libvpx/vp8/common/context.c new file mode 100644 index 0000000000..3c624ae628 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/context.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "entropy.h" + +/* *** GENERATED FILE: DO NOT EDIT *** */ + +#if 0 +int Contexts[vp8_coef_counter_dimen]; + +const int default_contexts[vp8_coef_counter_dimen] = +{ + { + // Block Type ( 0 ) + { + // Coeff Band ( 0 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + { + // Coeff Band ( 1 ) + {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,}, + {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,}, + {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,}, + }, + { + // Coeff Band ( 2 ) + {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,}, + {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,}, + {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,}, + }, + { + // Coeff Band ( 3 ) + {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,}, + {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,}, + { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,}, + }, + { + // Coeff Band ( 4 ) + {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,}, + { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,}, + }, + { + // Coeff Band ( 5 ) + {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,}, + {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,}, + { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,}, + }, + { + // Coeff Band ( 6 ) + {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,}, + {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,}, + { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,}, + }, + { + // Coeff Band ( 7 ) + { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,}, + }, + }, + { + // Block Type ( 1 ) + { + // Coeff Band ( 0 ) + {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,}, + {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,}, + {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,}, + }, + { + // Coeff Band ( 1 ) + {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,}, + {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,}, + {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,}, + }, + { + // Coeff Band ( 2 ) + {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,}, + {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,}, + {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,}, + }, + { + // Coeff Band ( 3 ) + {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,}, + {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,}, + {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,}, + }, + { + // Coeff Band ( 4 ) + {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,}, + {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,}, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,}, + }, + { + // Coeff Band ( 5 ) + {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,}, + {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,}, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,}, + }, + { + // Coeff Band ( 6 ) + {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,}, + {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,}, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,}, + }, + { + // Coeff Band ( 7 ) + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,}, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + }, + { + // Block Type ( 2 ) + { + // Coeff Band ( 0 ) + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,}, + {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,}, + {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,}, + }, + { + // Coeff Band ( 1 ) + {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,}, + {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,}, + {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,}, + }, + { + // Coeff Band ( 2 ) + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,}, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,}, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,}, + }, + { + // Coeff Band ( 3 ) + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,}, + { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,}, + }, + { + // Coeff Band ( 4 ) + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,}, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,}, + }, + { + // Coeff Band ( 5 ) + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,}, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,}, + }, + { + // Coeff Band ( 6 ) + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,}, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,}, + }, + { + // Coeff Band ( 7 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + }, + }, + { + // Block Type ( 3 ) + { + // Coeff Band ( 0 ) + {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,}, + {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,}, + {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,}, + }, + { + // Coeff Band ( 1 ) + {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,}, + {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,}, + {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,}, + }, + { + // Coeff Band ( 2 ) + {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,}, + {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,}, + {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,}, + }, + { + // Coeff Band ( 3 ) + {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,}, + {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,}, + {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,}, + }, + { + // Coeff Band ( 4 ) + {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,}, + {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,}, + {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,}, + }, + { + // Coeff Band ( 5 ) + {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,}, + {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,}, + {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,}, + }, + { + // Coeff Band ( 6 ) + {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,}, + {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,}, + {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,}, + }, + { + // Coeff Band ( 7 ) + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,}, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,}, + }, + }, +}; + +//Update probabilities for the nodes in the token entropy tree. +const vp8_prob tree_update_probs[vp8_coef_tree_dimen] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; +#endif diff --git a/media/libvpx/libvpx/vp8/common/debugmodes.c b/media/libvpx/libvpx/vp8/common/debugmodes.c new file mode 100644 index 0000000000..27a97b260c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/debugmodes.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "blockd.h" + +void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, + int frame) { + int mb_row; + int mb_col; + int mb_index = 0; + FILE *mvs = fopen("mvs.stt", "a"); + + /* print out the macroblock Y modes */ + mb_index = 0; + fprintf(mvs, "Mb Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; ++mb_row) { + for (mb_col = 0; mb_col < cols; ++mb_col) { + fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + mb_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; ++mb_row) { + for (mb_col = 0; mb_col < cols; ++mb_col) { + fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + /* print out the macroblock UV modes */ + mb_index = 0; + fprintf(mvs, "UV Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; ++mb_row) { + for (mb_col = 0; mb_col < cols; ++mb_col) { + fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the block modes */ + fprintf(mvs, "Mbs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; ++b_row) { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; ++b_col) { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + + if (mi[mb_index].mbmi.mode == B_PRED) + fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode); + else + fprintf(mvs, "xx "); + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + /* print out the macroblock mvs */ + mb_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; ++mb_row) { + for (mb_col = 0; mb_col < cols; ++mb_col) { + fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, + mi[mb_index].mbmi.mv.as_mv.col / 2); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the block modes */ + fprintf(mvs, "MVs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; ++b_row) { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; ++b_col) { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, + mi[mb_index].bmi[bindex].mv.as_mv.col); + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + fclose(mvs); +} diff --git a/media/libvpx/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/libvpx/vp8/common/default_coef_probs.h new file mode 100644 index 0000000000..b25e4a45a3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/default_coef_probs.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropy.c*/ + +static const vp8_prob default_coef_probs + [BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES] = { + { /* Block Type ( 0 ) */ + { /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } }, + { /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } }, + { /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } }, + { /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } }, + { /* Block Type ( 1 ) */ + { /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } }, + { /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } }, + { /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } }, + { /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } }, + { /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } }, + { /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } }, + { /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } } }, + { /* Block Type ( 2 ) */ + { /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } }, + { /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 } }, + { /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } }, + { /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } }, + { /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } }, + { /* Block Type ( 3 ) */ + { /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } }, + { /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } }, + { /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } }, + { /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } }, + { /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } }, + { /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } }, + { /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } } } + }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/media/libvpx/libvpx/vp8/common/dequantize.c b/media/libvpx/libvpx/vp8/common/dequantize.c new file mode 100644 index 0000000000..8a56ae6868 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/dequantize.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequantize_b_c(BLOCKD *d, short *DQC) { + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + + for (i = 0; i < 16; ++i) { + DQ[i] = Q[i] * DQC[i]; + } +} + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, + int stride) { + int i; + + for (i = 0; i < 16; ++i) { + input[i] = dq[i] * input[i]; + } + + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); + + memset(input, 0, 32); +} diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c new file mode 100644 index 0000000000..fc4a3539fd --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropy.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "entropy.h" +#include "blockd.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +#include "coefupdateprobs.h" + +DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +DECLARE_ALIGNED(16, const unsigned char, + vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, + 6, 6, 6, 6, 6, 6, 6, 7 }; + +DECLARE_ALIGNED(16, const unsigned char, + vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0 +}; + +DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = { + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, const short, + vp8_default_inv_zig_zag[16]) = { 1, 2, 6, 7, 3, 5, 8, 13, + 4, 9, 12, 14, 10, 11, 15, 16 }; + +/* vp8_default_zig_zag_mask generated with: + + void vp8_init_scan_order_mask() + { + int i; + + for (i = 0; i < 16; ++i) + { + vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i; + } + + } +*/ +DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) = { + 1, 2, 32, 64, 4, 16, 128, 4096, 8, 256, 2048, 8192, 512, 1024, 16384, -32768 +}; + +const int vp8_mb_feature_data_bits[MB_LVL_MAX] = { 7, 6 }; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ +/* corresponding _CONTEXT_NODEs */ +/* clang-format off */ +const vp8_tree_index vp8_coef_tree[22] = { + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; +/* clang-format on */ + +/* vp8_coef_encodings generated with: + vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree); +*/ +vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] = { + { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 }, + { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 } +}; + +/* Trees for extra bits. Probabilities are constant and + do not depend on previously encoded bits */ + +static const vp8_prob Pcat1[] = { 159 }; +static const vp8_prob Pcat2[] = { 165, 145 }; +static const vp8_prob Pcat3[] = { 173, 148, 140 }; +static const vp8_prob Pcat4[] = { 176, 155, 140, 135 }; +static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130 }; +static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177, + 153, 140, 133, 130, 129 }; + +/* tree index tables generated with: + + void init_bit_tree(vp8_tree_index *p, int n) { + int i = 0; + + while (++i < n) { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; + } + + void init_bit_trees() { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 11); + } +*/ + +static const vp8_tree_index cat1[2] = { 0, 0 }; +static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; +static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; +static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; +static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 18, 18, 20, 20, 0, 0 }; + +const vp8_extra_bit_struct vp8_extra_bits[12] = { + { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { 0, 0, 0, 2 }, + { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { cat1, Pcat1, 1, 5 }, + { cat2, Pcat2, 2, 7 }, { cat3, Pcat3, 3, 11 }, { cat4, Pcat4, 4, 19 }, + { cat5, Pcat5, 5, 35 }, { cat6, Pcat6, 11, 67 }, { 0, 0, 0, 0 } +}; + +#include "default_coef_probs.h" + +void vp8_default_coef_probs(VP8_COMMON *pc) { + memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs)); +} diff --git a/media/libvpx/libvpx/vp8/common/entropy.h b/media/libvpx/libvpx/vp8/common/entropy.h new file mode 100644 index 0000000000..fbdb7bcfca --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropy.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ENTROPY_H_ +#define VPX_VP8_COMMON_ENTROPY_H_ + +#include "treecoder.h" +#include "blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ + +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 + +extern const vp8_tree_index vp8_coef_tree[]; + +extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct { + vp8_tree_p tree; + const vp8_prob *prob; + int Len; + int base_val; +} vp8_extra_bit_struct; + +extern const vp8_extra_bit_struct + vp8_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 2048 + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ + +#define BLOCK_TYPES 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +#define PREV_COEF_CONTEXTS 3 + +extern DECLARE_ALIGNED(16, const unsigned char, + vp8_prev_token_class[MAX_ENTROPY_TOKENS]); + +extern const vp8_prob vp8_coef_update_probs[BLOCK_TYPES][COEF_BANDS] + [PREV_COEF_CONTEXTS][ENTROPY_NODES]; + +struct VP8Common; +void vp8_default_coef_probs(struct VP8Common *); + +extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); +extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; + +void vp8_coef_tree_initialize(void); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_ENTROPY_H_ diff --git a/media/libvpx/libvpx/vp8/common/entropymode.c b/media/libvpx/libvpx/vp8/common/entropymode.c new file mode 100644 index 0000000000..f61e0c2e2b --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropymode.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#define USE_PREBUILT_TABLES + +#include "entropymode.h" +#include "entropy.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp8_entropymodedata.h" + +int vp8_mv_cont(const int_mv *l, const int_mv *a) { + int lez = (l->as_int == 0); + int aez = (a->as_int == 0); + int lea = (l->as_int == a->as_int); + + if (lea && lez) return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) return SUBMVREF_ABOVE_ZED; + + if (lez) return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 }; + +const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; + +const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } +}; + +const int vp8_mbsplit_count[VP8_NUMMBSPLITS] = { 2, 2, 4, 16 }; + +const vp8_prob vp8_mbsplit_probs[VP8_NUMMBSPLITS - 1] = { 110, 111, 150 }; + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ + +const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */ + { + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ + }; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. */ + +const vp8_tree_index vp8_ymode_tree[8] = { + -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED +}; + +const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2, 4, + 6, -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED }; + +const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2, -V_PRED, + 4, -H_PRED, -TM_PRED }; + +const vp8_tree_index vp8_mbsplit_tree[6] = { -3, 2, -2, 4, -0, -1 }; + +const vp8_tree_index vp8_mv_ref_tree[8] = { -ZEROMV, 2, -NEARESTMV, 4, + -NEARMV, 6, -NEWMV, -SPLITMV }; + +const vp8_tree_index vp8_sub_mv_ref_tree[6] = { -LEFT4X4, 2, -ABOVE4X4, + 4, -ZERO4X4, -NEW4X4 }; + +const vp8_tree_index vp8_small_mvtree[14] = { 2, 8, 4, 6, -0, -1, -2, + -3, 10, 12, -4, -5, -6, -7 }; + +void vp8_init_mbmode_probs(VP8_COMMON *x) { + memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob)); + memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob)); + memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); +} + +void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) { + memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +} diff --git a/media/libvpx/libvpx/vp8/common/entropymode.h b/media/libvpx/libvpx/vp8/common/entropymode.h new file mode 100644 index 0000000000..c772cece57 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropymode.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_ +#define VPX_VP8_COMMON_ENTROPYMODE_H_ + +#include "onyxc_int.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +typedef int vp8_mbsplit[16]; + +#define VP8_NUMMBSPLITS 4 + +extern const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS]; + +extern const int vp8_mbsplit_count[VP8_NUMMBSPLITS]; /* # of subsets */ + +extern const vp8_prob vp8_mbsplit_probs[VP8_NUMMBSPLITS - 1]; + +extern int vp8_mv_cont(const int_mv *l, const int_mv *a); +#define SUBMVREF_COUNT 5 +extern const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1]; + +extern const unsigned int vp8_kf_default_bmode_counts[VP8_BINTRAMODES] + [VP8_BINTRAMODES] + [VP8_BINTRAMODES]; + +extern const vp8_tree_index vp8_bmode_tree[]; + +extern const vp8_tree_index vp8_ymode_tree[]; +extern const vp8_tree_index vp8_kf_ymode_tree[]; +extern const vp8_tree_index vp8_uv_mode_tree[]; + +extern const vp8_tree_index vp8_mbsplit_tree[]; +extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sub_mv_ref_tree[]; + +extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES]; +extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES]; +extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS]; +extern const struct vp8_token_struct + vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS]; + +extern const vp8_tree_index vp8_small_mvtree[]; + +extern const struct vp8_token_struct vp8_small_mvencodings[8]; + +/* Key frame default mode probs */ +extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES] + [VP8_BINTRAMODES - 1]; +extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES - 1]; +extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1]; + +void vp8_init_mbmode_probs(VP8_COMMON *x); +void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]); +void vp8_kf_default_bmode_probs( + vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_ENTROPYMODE_H_ diff --git a/media/libvpx/libvpx/vp8/common/entropymv.c b/media/libvpx/libvpx/vp8/common/entropymv.c new file mode 100644 index 0000000000..fb4f0c889f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropymv.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "entropymv.h" + +/* clang-format off */ +const MV_CONTEXT vp8_mv_update_probs[2] = { + { { + 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 + } }, + { { + 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 + } } +}; +/* clang-format on */ + +const MV_CONTEXT vp8_default_mv_context[2] = { + { { + /* row */ + 162, /* is short */ + 128, /* sign */ + 225, 146, 172, 147, 214, 39, 156, /* short tree */ + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */ + } }, + + { { + /* same for column */ + 164, /* is short */ + 128, /**/ + 204, 170, 119, 235, 140, 230, 228, /**/ + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */ + + } } +}; diff --git a/media/libvpx/libvpx/vp8/common/entropymv.h b/media/libvpx/libvpx/vp8/common/entropymv.h new file mode 100644 index 0000000000..40039f5b2c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/entropymv.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ENTROPYMV_H_ +#define VPX_VP8_COMMON_ENTROPYMV_H_ + +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + mv_max = 1023, /* max absolute value of a MV component */ + MVvals = (2 * mv_max) + 1, /* # possible values "" */ + mvfp_max = 255, /* max absolute value of a full pixel MV component */ + MVfpvals = (2 * mvfp_max) + 1, /* # possible full pixel MV values */ + + mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ + mvnum_short = 8, /* magnitudes 0 through 7 */ + + /* probability offsets for coding each MV component */ + + mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ + MVPsign, /* sign for non-zero */ + MVPshort, /* 8 short values = 7-position tree */ + + MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ + MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ +}; + +typedef struct mv_context { + vp8_prob prob[MVPcount]; /* often come in row, col pairs */ +} MV_CONTEXT; + +extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_ENTROPYMV_H_ diff --git a/media/libvpx/libvpx/vp8/common/extend.c b/media/libvpx/libvpx/vp8/common/extend.c new file mode 100644 index 0000000000..b52e9fe93c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/extend.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "extend.h" +#include "vpx_mem/vpx_mem.h" + +static void copy_and_extend_plane( + unsigned char *s, /* source */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ + int h, /* height */ + int w, /* width */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er, /* extend right border */ + int interleave_step) { /* step between pixels of the current plane */ + int i, j; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int linesize; + + if (interleave_step < 1) interleave_step = 1; + + /* copy the left and right most columns out */ + src_ptr1 = s; + src_ptr2 = s + (w - 1) * interleave_step; + dest_ptr1 = d - el; + dest_ptr2 = d + w; + + for (i = 0; i < h; ++i) { + memset(dest_ptr1, src_ptr1[0], el); + if (interleave_step == 1) { + memcpy(dest_ptr1 + el, src_ptr1, w); + } else { + for (j = 0; j < w; j++) { + dest_ptr1[el + j] = src_ptr1[interleave_step * j]; + } + } + memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += dp; + dest_ptr2 += dp; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = d - el; + src_ptr2 = d + dp * (h - 1) - el; + dest_ptr1 = d + dp * (-et) - el; + dest_ptr2 = d + dp * (h)-el; + linesize = el + er + w; + + for (i = 0; i < et; ++i) { + memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += dp; + } + + for (i = 0; i < eb; ++i) { + memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += dp; + } +} + +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + + // detect nv12 colorspace + int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_height, src->y_width, et, el, eb, + er, 1); + + et = dst->border >> 1; + el = dst->border >> 1; + eb = (dst->border >> 1) + dst->uv_height - src->uv_height; + er = (dst->border >> 1) + dst->uv_width - src->uv_width; + + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_height, src->uv_width, et, el, + eb, er, chroma_step); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_height, src->uv_width, et, el, + eb, er, chroma_step); +} + +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw) { + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + int src_y_offset = srcy * src->y_stride + srcx; + int dst_y_offset = srcy * dst->y_stride + srcx; + int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + // detect nv12 colorspace + int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; + + /* If the side is not touching the bounder then don't extend. */ + if (srcy) et = 0; + if (srcx) el = 0; + if (srcy + srch != src->y_height) eb = 0; + if (srcx + srcw != src->y_width) er = 0; + + copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, + dst->y_buffer + dst_y_offset, dst->y_stride, srch, srcw, + et, el, eb, er, 1); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + srch = (srch + 1) >> 1; + srcw = (srcw + 1) >> 1; + + copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, + dst->u_buffer + dst_uv_offset, dst->uv_stride, srch, + srcw, et, el, eb, er, chroma_step); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, + dst->v_buffer + dst_uv_offset, dst->uv_stride, srch, + srcw, et, el, eb, er, chroma_step); +} + +/* note the extension is only for the last row, for intra prediction purpose */ +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, + unsigned char *UPtr, unsigned char *VPtr) { + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; ++i) { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; ++i) { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/media/libvpx/libvpx/vp8/common/extend.h b/media/libvpx/libvpx/vp8/common/extend.h new file mode 100644 index 0000000000..586a38a4f3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/extend.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_EXTEND_H_ +#define VPX_VP8_COMMON_EXTEND_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, + unsigned char *UPtr, unsigned char *VPtr); +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_EXTEND_H_ diff --git a/media/libvpx/libvpx/vp8/common/filter.c b/media/libvpx/libvpx/vp8/common/filter.c new file mode 100644 index 0000000000..267498335c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/filter.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vp8/common/filter.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = { + + { 0, 0, 128, 0, 0, + 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0 }, +}; + +static void filter_block2d_first_pass(unsigned char *src_ptr, int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2 * pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3 * pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) { + Temp = 0; + } else if (Temp > 255) { + Temp = 255; + } + + output_ptr[j] = Temp; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void filter_block2d_second_pass(int *src_ptr, unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2 * pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3 * pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) { + Temp = 0; + } else if (Temp > 255) { + Temp = 255; + } + + output_ptr[j] = (unsigned char)Temp; + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + +static void filter_block2d(unsigned char *src_ptr, unsigned char *output_ptr, + unsigned int src_pixels_per_line, int output_pitch, + const short *HFilter, const short *VFilter) { + int FData[9 * 4]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 1, 9, 4, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, + VFilter); +} + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, + VFilter); +} +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + int FData[13 * 16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 1, 13, 8, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, + VFilter); +} + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + int FData[13 * 16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 1, 9, 8, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, + VFilter); +} + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + int FData[21 * 24]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 1, 21, 16, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, + VFilter); +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_stride : Stride of source block. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : INT32 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the horizontal direction to produce the filtered output + * block. Used to implement first-pass of 2-D separable filter. + * + * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_first_pass( + unsigned char *src_ptr, unsigned short *dst_ptr, unsigned int src_stride, + unsigned int height, unsigned int width, const short *vp8_filter) { + unsigned int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + /* Apply bilinear filter */ + dst_ptr[j] = + (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[1] * vp8_filter[1]) + (VP8_FILTER_WEIGHT / 2)) >> + VP8_FILTER_SHIFT; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_stride - width; + dst_ptr += width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : INT32 *src_ptr : Pointer to source block. + * UINT32 dst_pitch : Destination block pitch. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT32 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the vertical direction to produce the filtered output + * block. Used to implement second-pass of 2-D separable + * filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by + * filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_second_pass(unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, unsigned int height, + unsigned int width, + const short *vp8_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[width] * vp8_filter[1]) + (VP8_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pitch : Stride of source block. + * UINT32 dst_pitch : Stride of destination block. + * INT32 *HFilter : Array of 2 horizontal filter + * taps. + * INT32 *VFilter : Array of 2 vertical filter taps. + * INT32 Width : Block width + * INT32 Height : Block height + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The largest block size can be handled here is 16x16 + * + ****************************************************************************/ +static void filter_block2d_bil(unsigned char *src_ptr, unsigned char *dst_ptr, + unsigned int src_pitch, unsigned int dst_pitch, + const short *HFilter, const short *VFilter, + int Width, int Height) { + unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, + HFilter); + + /* then 1-D vertically... */ + filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, + VFilter); +} + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + // This represents a copy and is not required to be handled by optimizations. + assert((xoffset | yoffset) != 0); + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, + VFilter, 4, 4); +} + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + assert((xoffset | yoffset) != 0); + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, + VFilter, 8, 8); +} + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + assert((xoffset | yoffset) != 0); + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, + VFilter, 8, 4); +} + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + assert((xoffset | yoffset) != 0); + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, + VFilter, 16, 16); +} diff --git a/media/libvpx/libvpx/vp8/common/filter.h b/media/libvpx/libvpx/vp8/common/filter.h new file mode 100644 index 0000000000..6acee22b21 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/filter.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_FILTER_H_ +#define VPX_VP8_COMMON_FILTER_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); +extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_FILTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.c b/media/libvpx/libvpx/vp8/common/findnearmv.c new file mode 100644 index 0000000000..3b31923621 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/findnearmv.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "findnearmv.h" + +const unsigned char vp8_mbsplit_offset[4][16] = { + { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } +}; + +/* Predict motion vectors using those from already-decoded nearby blocks. + Note that we only consider one 4x4 subblock from each candidate 16x16 + macroblock. */ +void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], + int refframe, int *ref_frame_sign_bias) { + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[4]; + int_mv *mv = near_mvs; + int *cntx = near_mv_ref_cnts; + enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; + + /* Zero accumulators */ + mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; + near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] = + near_mv_ref_cnts[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) { + if (above->mbmi.mv.as_int) { + (++mv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, + ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) { + if (left->mbmi.mv.as_int) { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, + ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } else { + near_mv_ref_cnts[CNT_INTRA] += 2; + } + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) { + if (aboveleft->mbmi.mv.as_int) { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, + &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } else { + near_mv_ref_cnts[CNT_INTRA] += 1; + } + } + + /* If we have three distinct MV's ... */ + if (near_mv_ref_cnts[CNT_SPLITMV]) { + /* See if above-left MV can be merged with NEAREST */ + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + near_mv_ref_cnts[CNT_NEAREST] += 1; + } + + near_mv_ref_cnts[CNT_SPLITMV] = + ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + /* Swap near and nearest if necessary */ + if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) { + int tmp; + tmp = near_mv_ref_cnts[CNT_NEAREST]; + near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; + near_mv_ref_cnts[CNT_NEAR] = tmp; + tmp = (int)near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; + } + + /* Use near_mvs[0] to store the "best" MV */ + if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) { + near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; + } + + /* Set up return values */ + best_mv->as_int = near_mvs[0].as_int; + nearest->as_int = near_mvs[CNT_NEAREST].as_int; + nearby->as_int = near_mvs[CNT_NEAR].as_int; +} + +static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd) { + inv->as_mv.row = src->as_mv.row * -1; + inv->as_mv.col = src->as_mv.col * -1; + vp8_clamp_mv2(inv, xd); + vp8_clamp_mv2(src, xd); +} + +int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], int cnt[4], int refframe, + int *ref_frame_sign_bias) { + int sign_bias = ref_frame_sign_bias[refframe]; + + vp8_find_near_mvs(xd, here, &mode_mv_sb[sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARMV], &best_mv_sb[sign_bias], cnt, + refframe, ref_frame_sign_bias); + + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARESTMV], xd); + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV], + &mode_mv_sb[sign_bias][NEARMV], xd); + invert_and_clamp_mvs(&best_mv_sb[!sign_bias], &best_mv_sb[sign_bias], xd); + + return sign_bias; +} + +vp8_prob *vp8_mv_ref_probs(vp8_prob p[VP8_MVREFS - 1], + const int near_mv_ref_ct[4]) { + p[0] = vp8_mode_contexts[near_mv_ref_ct[0]][0]; + p[1] = vp8_mode_contexts[near_mv_ref_ct[1]][1]; + p[2] = vp8_mode_contexts[near_mv_ref_ct[2]][2]; + p[3] = vp8_mode_contexts[near_mv_ref_ct[3]][3]; + /* p[3] = vp8_mode_contexts[near_mv_ref_ct[1] + near_mv_ref_ct[2] + + near_mv_ref_ct[3]][3]; */ + return p; +} diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.h b/media/libvpx/libvpx/vp8/common/findnearmv.h new file mode 100644 index 0000000000..d7db9544aa --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/findnearmv.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_FINDNEARMV_H_ +#define VPX_VP8_COMMON_FINDNEARMV_H_ + +#include "./vpx_config.h" +#include "mv.h" +#include "blockd.h" +#include "modecont.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe, + int_mv *mvp, const int *ref_frame_sign_bias) { + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { + mvp->as_mv.row *= -1; + mvp->as_mv.col *= -1; + } +} + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) +static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { + if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) { + mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + } else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) { + mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + } + + if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) { + mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + } else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) { + mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; + } +} + +static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) { + mv->as_mv.col = + (mv->as_mv.col < mb_to_left_edge) ? mb_to_left_edge : mv->as_mv.col; + mv->as_mv.col = + (mv->as_mv.col > mb_to_right_edge) ? mb_to_right_edge : mv->as_mv.col; + mv->as_mv.row = + (mv->as_mv.row < mb_to_top_edge) ? mb_to_top_edge : mv->as_mv.row; + mv->as_mv.row = + (mv->as_mv.row > mb_to_bottom_edge) ? mb_to_bottom_edge : mv->as_mv.row; +} +static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) { + unsigned int need_to_clamp; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); + return need_to_clamp; +} + +void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], + int refframe, int *ref_frame_sign_bias); + +int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], int cnt[4], int refframe, + int *ref_frame_sign_bias); + +vp8_prob *vp8_mv_ref_probs(vp8_prob p[VP8_MVREFS - 1], + const int near_mv_ref_ct[4]); + +extern const unsigned char vp8_mbsplit_offset[4][16]; + +static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b) { + if (!(b & 3)) { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if (cur_mb->mbmi.mode != SPLITMV) return cur_mb->mbmi.mv.as_int; + b += 4; + } + + return (cur_mb->bmi + b - 1)->mv.as_int; +} + +static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b, + int mi_stride) { + if (!(b >> 2)) { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if (cur_mb->mbmi.mode != SPLITMV) return cur_mb->mbmi.mv.as_int; + b += 16; + } + + return (cur_mb->bmi + (b - 4))->mv.as_int; +} +static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, + int b) { + if (!(b & 3)) { + /* On L edge, get from MB to left of us */ + --cur_mb; + switch (cur_mb->mbmi.mode) { + case B_PRED: return (cur_mb->bmi + b + 3)->as_mode; + case DC_PRED: return B_DC_PRED; + case V_PRED: return B_VE_PRED; + case H_PRED: return B_HE_PRED; + case TM_PRED: return B_TM_PRED; + default: return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 1)->as_mode; +} + +static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, + int mi_stride) { + if (!(b >> 2)) { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + switch (cur_mb->mbmi.mode) { + case B_PRED: return (cur_mb->bmi + b + 12)->as_mode; + case DC_PRED: return B_DC_PRED; + case V_PRED: return B_VE_PRED; + case H_PRED: return B_HE_PRED; + case TM_PRED: return B_TM_PRED; + default: return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 4)->as_mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_FINDNEARMV_H_ diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c new file mode 100644 index 0000000000..71529bdfd8 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#elif VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#elif VPX_ARCH_PPC +#include "vpx_ports/ppc.h" +#elif VPX_ARCH_MIPS +#include "vpx_ports/mips.h" +#elif VPX_ARCH_LOONGARCH +#include "vpx_ports/loongarch.h" +#endif +#include "vp8/common/onyxc_int.h" +#include "vp8/common/systemdependent.h" + +#if CONFIG_MULTITHREAD +#if HAVE_UNISTD_H && !defined(__OS2__) +#include +#elif defined(_WIN32) +#include +typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO); +#elif defined(__OS2__) +#define INCL_DOS +#define INCL_DOSSPINLOCK +#include +#endif +#endif + +#if CONFIG_MULTITHREAD +static int get_cpu_count() { + int core_count = 16; + +#if HAVE_UNISTD_H && !defined(__OS2__) +#if defined(_SC_NPROCESSORS_ONLN) + core_count = (int)sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_SC_NPROC_ONLN) + core_count = (int)sysconf(_SC_NPROC_ONLN); +#endif +#elif defined(_WIN32) + { +#if _WIN32_WINNT >= 0x0501 + SYSTEM_INFO sysinfo; + GetNativeSystemInfo(&sysinfo); +#else + PGNSI pGNSI; + SYSTEM_INFO sysinfo; + + /* Call GetNativeSystemInfo if supported or + * GetSystemInfo otherwise. */ + + pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetNativeSystemInfo"); + if (pGNSI != NULL) + pGNSI(&sysinfo); + else + GetSystemInfo(&sysinfo); +#endif + + core_count = (int)sysinfo.dwNumberOfProcessors; + } +#elif defined(__OS2__) + { + ULONG proc_id; + ULONG status; + + core_count = 0; + for (proc_id = 1;; ++proc_id) { + if (DosGetProcessorStatus(proc_id, &status)) break; + + if (status == PROC_ONLINE) core_count++; + } + } +#else +/* other platforms */ +#endif + + return core_count > 0 ? core_count : 1; +} +#endif + +void vp8_machine_specific_config(VP8_COMMON *ctx) { +#if CONFIG_MULTITHREAD + ctx->processor_core_count = get_cpu_count(); +#endif /* CONFIG_MULTITHREAD */ + +#if VPX_ARCH_ARM + ctx->cpu_caps = arm_cpu_caps(); +#elif VPX_ARCH_X86 || VPX_ARCH_X86_64 + ctx->cpu_caps = x86_simd_caps(); +#elif VPX_ARCH_PPC + ctx->cpu_caps = ppc_simd_caps(); +#elif VPX_ARCH_MIPS + ctx->cpu_caps = mips_cpu_caps(); +#elif VPX_ARCH_LOONGARCH + ctx->cpu_caps = loongarch_cpu_caps(); +#else + // generic-gnu targets. + ctx->cpu_caps = 0; +#endif +} diff --git a/media/libvpx/libvpx/vp8/common/header.h b/media/libvpx/libvpx/vp8/common/header.h new file mode 100644 index 0000000000..e64e241908 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/header.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_HEADER_H_ +#define VPX_VP8_COMMON_HEADER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* 24 bits total */ +typedef struct { + unsigned int type : 1; + unsigned int version : 3; + unsigned int show_frame : 1; + + /* Allow 2^20 bytes = 8 megabits for first partition */ + + unsigned int first_partition_length_in_bytes : 19; + +#ifdef PACKET_TESTING + unsigned int frame_number; + unsigned int update_gold : 1; + unsigned int uses_gold : 1; + unsigned int update_last : 1; + unsigned int uses_last : 1; +#endif + +} VP8_HEADER; + +#ifdef PACKET_TESTING +#define VP8_HEADER_SIZE 8 +#else +#define VP8_HEADER_SIZE 3 +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_HEADER_H_ diff --git a/media/libvpx/libvpx/vp8/common/idct_blk.c b/media/libvpx/libvpx/vp8/common/idct_blk.c new file mode 100644 index 0000000000..ebe1774f56 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/idct_blk.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, + int stride, char *eobs) { + int i, j; + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_c(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_c(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, + unsigned char *dst_v, int stride, + char *eobs) { + int i, j; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_c(q, dq, dst_u, stride); + } else { + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_u += 4; + } + + dst_u += 4 * stride - 8; + } + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_c(q, dq, dst_v, stride); + } else { + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_v += 4; + } + + dst_v += 4 * stride - 8; + } +} diff --git a/media/libvpx/libvpx/vp8/common/idctllm.c b/media/libvpx/libvpx/vp8/common/idctllm.c new file mode 100644 index 0000000000..2f5adc0b40 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/idctllm.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" + +/**************************************************************************** + * Notes: + * + * This implementation makes use of 16 bit fixed point verio of two multiply + * constants: + * 1. sqrt(2) * cos (pi/8) + * 2. sqrt(2) * sin (pi/8) + * Becuase the first constant is bigger than 1, to maintain the same 16 bit + * fixed point precision as the second one, we use a trick of + * x * a = x + x*(a-1) + * so + * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). + **************************************************************************/ +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int i; + int r, c; + int a1, b1, c1, d1; + short output[16]; + short *ip = input; + short *op = output; + int temp1, temp2; + int shortpitch = 4; + + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[8]; + b1 = ip[0] - ip[8]; + + temp1 = (ip[4] * sinpi8sqrt2) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[12] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + op[shortpitch * 0] = a1 + d1; + op[shortpitch * 3] = a1 - d1; + + op[shortpitch * 1] = b1 + c1; + op[shortpitch * 2] = b1 - c1; + + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[2]; + b1 = ip[0] - ip[2]; + + temp1 = (ip[1] * sinpi8sqrt2) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[3] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + + ip += shortpitch; + op += shortpitch; + } + + ip = output; + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + int a = ip[c] + pred_ptr[c]; + + if (a < 0) a = 0; + + if (a > 255) a = 255; + + dst_ptr[c] = (unsigned char)a; + } + ip += 4; + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } +} + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1 = ((input_dc + 4) >> 3); + int r, c; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + int a = a1 + pred_ptr[c]; + + if (a < 0) a = 0; + + if (a > 255) a = 255; + + dst_ptr[c] = (unsigned char)a; + } + + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } +} + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) { + short output[16]; + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = a1 + b1; + op[4] = c1 + d1; + op[8] = a1 - b1; + op[12] = d1 - c1; + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[3]; + b1 = ip[1] + ip[2]; + c1 = ip[1] - ip[2]; + d1 = ip[0] - ip[3]; + + a2 = a1 + b1; + b2 = c1 + d1; + c2 = a1 - b1; + d2 = d1 - c1; + + op[0] = (a2 + 3) >> 3; + op[1] = (b2 + 3) >> 3; + op[2] = (c2 + 3) >> 3; + op[3] = (d2 + 3) >> 3; + + ip += 4; + op += 4; + } + + for (i = 0; i < 16; ++i) { + mb_dqcoeff[i * 16] = output[i]; + } +} + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) { + int i; + int a1; + + a1 = ((input[0] + 3) >> 3); + for (i = 0; i < 16; ++i) { + mb_dqcoeff[i * 16] = a1; + } +} diff --git a/media/libvpx/libvpx/vp8/common/invtrans.h b/media/libvpx/libvpx/vp8/common/invtrans.h new file mode 100644 index 0000000000..aed7bb0600 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/invtrans.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_INVTRANS_H_ +#define VPX_VP8_COMMON_INVTRANS_H_ + +#include "./vpx_config.h" +#include "vp8_rtcd.h" +#include "blockd.h" +#include "onyxc_int.h" + +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static void eob_adjust(char *eobs, short *diff) { + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for (js = 0; js < 16; ++js) { + if ((eobs[js] == 0) && (diff[0] != 0)) eobs[js]++; + diff += 16; + } +} + +static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) { + short *DQC = xd->dequant_y1; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) { + vp8_short_inv_walsh4x4(&xd->block[24].dqcoeff[0], xd->qcoeff); + } else { + vp8_short_inv_walsh4x4_1(&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + + DQC = xd->dequant_y1_dc; + } + vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_INVTRANS_H_ diff --git a/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c new file mode 100644 index 0000000000..eee871eec4 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_util/loongson_intrinsics.h" + +static const int32_t cospi8sqrt2minus1 = 20091; +static const int32_t sinpi8sqrt2 = 35468; + +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ + DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } while (0) + +#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i s4_m, s5_m, s6_m, s7_m; \ + \ + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ + DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ + out1 = __lsx_vilvh_d(s6_m, s4_m); \ + out3 = __lsx_vilvh_d(s7_m, s5_m); \ + } while (0) + +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \ + do { \ + __m128i zero_m = __lsx_vldi(0); \ + __m128i tmp1_m, tmp2_m; \ + __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + \ + tmp1_m = __lsx_vilvl_h(in0, zero_m); \ + tmp2_m = __lsx_vilvh_h(in0, zero_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + } while (0) + +#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m; \ + __m128i d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ + a1_m = __lsx_vadd_h(in0, in2); \ + b1_m = __lsx_vsub_h(in0, in2); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \ + \ + c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m); \ + \ + d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \ + d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } while (0) + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + a1_m = __lsx_vadd_w(in0, in2); \ + b1_m = __lsx_vsub_w(in0, in2); \ + c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m); \ + c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16); \ + c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16); \ + c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m); \ + d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16); \ + d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m); \ + d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m); \ + d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ + d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } while (0) + +#define UNPCK_SH_SW(in, out0, out1) \ + do { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } while (0) + +static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + __m128i vec, res0, res1, res2, res3, dst0, dst1; + __m128i pred0, pred1, pred2, pred3; + __m128i zero = __lsx_vldi(0); + + int32_t pred_stride2 = pred_stride << 1; + int32_t pred_stride3 = pred_stride2 + pred_stride; + + vec = __lsx_vreplgr2vr_h(in_dc); + vec = __lsx_vsrari_h(vec, 3); + pred0 = __lsx_vld(pred, 0); + DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2); + pred3 = __lsx_vldx(pred, pred_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1); + dst0 = __lsx_vpickev_w(dst1, dst0); + __lsx_vstelm_w(dst0, dest, 0, 0); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 1); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 2); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 3); +} + +void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); +} + +static void dequant_idct4x4_addblk_2x_lsx(int16_t *input, + int16_t *dequant_input, uint8_t *dest, + int32_t dest_stride) { + __m128i dest0, dest1, dest2, dest3; + __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; + __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; + __m128i zero = __lsx_vldi(0); + + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0, + dequant_in1); + + DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, + in3, dequant_in1, mul0, mul1, mul2, mul3); + DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2); + DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3); + + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + UNPCK_SH_SW(hz0, hz0r, hz0l); + UNPCK_SH_SW(hz1, hz1r, hz1l); + UNPCK_SH_SW(hz2, hz2r, hz2l); + UNPCK_SH_SW(hz3, hz3r, hz3l); + VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); + DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l, + vt2l, vt3l); + VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r); + DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r, + vt2r, vt3r); + DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, + vt0, vt1, vt2, vt3); + TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, + res1, res2, res3); + + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l); + + __lsx_vstelm_d(vt0l, dest, 0, 0); + __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1); + __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1); + + __lsx_vst(zero, input, 0); + __lsx_vst(zero, input, 16); + __lsx_vst(zero, input, 32); + __lsx_vst(zero, input, 48); +} + +static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3; + __m128i dest0, dest1, dest2, dest3; + __m128i zero = __lsx_vldi(0); + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]); + input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]); + DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1); + vec = __lsx_vpickev_d(input_dc1, input_dc0); + input[0] = 0; + input[16] = 0; + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1); + __lsx_vstelm_d(res0, dest, 0, 0); + __lsx_vstelm_d(res0, dest + dest_stride, 0, 1); + __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1); +} + +void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst, + int32_t stride, char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + uint8_t i; + + for (i = 4; i--;) { + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst, stride); + } + } + + q += 32; + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride); + } + } + + q += 32; + dst += (4 * stride); + eobs_h += 2; + } +} + +void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, + char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + dst_u += (stride * 4); + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + + if (eobs_h[2]) { + if (eobs_h[2] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } + q += 32; + dst_v += (stride * 4); + + if (eobs_h[3]) { + if (eobs_h[3] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c new file mode 100644 index 0000000000..79c3ea6dbb --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vpx_util/loongson_intrinsics.h" + +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + do { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt = __lsx_vand_v(filt, hev); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + t1 = __lsx_vsadd_b(filt, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(filt, hev); \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + } while (0) + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + do { \ + __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ + __m128i filt_r, filt_l; \ + __m128i temp0, temp1, temp2, temp3; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + const __m128i cnst9h = __lsx_vldi(1033); \ + const __m128i cnst63h = __lsx_vldi(1087); \ + \ + p2_m = __lsx_vxori_b(p2, 0x80); \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + q2_m = __lsx_vxori_b(q2, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + \ + t2 = __lsx_vand_v(filt, hev); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(hev, filt); \ + t1 = __lsx_vsadd_b(t2, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(t2, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + filt_sign = __lsx_vslti_b(filt, 0); \ + filt_r = __lsx_vilvl_b(filt_sign, filt); \ + filt_l = __lsx_vilvh_b(filt_sign, filt); \ + temp0 = __lsx_vmul_h(filt_r, cnst9h); \ + temp1 = __lsx_vadd_h(temp0, cnst63h); \ + temp2 = __lsx_vmul_h(filt_l, cnst9h); \ + temp3 = __lsx_vadd_h(temp2, cnst63h); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q2_m = __lsx_vssub_b(q2_m, u); \ + p2_m = __lsx_vsadd_b(p2_m, u); \ + q2 = __lsx_vxori_b(q2_m, 0x80); \ + p2 = __lsx_vxori_b(p2_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q1_m = __lsx_vssub_b(q1_m, u); \ + p1_m = __lsx_vsadd_b(p1_m, u); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q0_m = __lsx_vssub_b(q0_m, u); \ + p0_m = __lsx_vsadd_b(p0_m, u); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + } while (0) + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + do { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } while (0) + +#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ + do { \ + __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ + __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ + } while (0) + +static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + + DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src, + -pitch, p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); + q3 = __lsx_vldx(src, pitch_x3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstx(p1, src, -pitch_x2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *src_tmp0 = src - 4; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + row0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + + row8 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row12 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_tmp0, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp3, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp4, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp5, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 3); +} + +static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2, + src_u, -pitch, p3_u, p2_u, p1_u, p0_u); + q0_u = __lsx_vld(src_u, 0); + DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u); + q3_u = __lsx_vldx(src_u, pitch_x3); + + DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2, + src_v, -pitch, p3_v, p2_v, p1_v, p0_v); + q0_v = __lsx_vld(src_v, 0); + DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v); + q3_v = __lsx_vldx(src_v, pitch_x3); + + /* right 8 element of p3 are u pixel and + left 8 element of p3 are v pixel */ + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstelm_d(q1, src_u + pitch, 0, 0); + __lsx_vstelm_d(q0, src_u, 0, 0); + __lsx_vstelm_d(p0, src_u - pitch, 0, 0); + __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0); + + __lsx_vstelm_d(q1, src_v + pitch, 0, 1); + __lsx_vstelm_d(q0, src_v, 0, 1); + __lsx_vstelm_d(p0, src_v - pitch, 0, 1); + __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1); +} + +static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *src_u_tmp, *src_v_tmp; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + src_u_tmp = src_u - 4; + row0 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_u_tmp, pitch_x3); + src_u_tmp += pitch_x4; + row4 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_u_tmp, pitch_x3); + + src_v_tmp = src_v - 4; + row8 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_v_tmp, pitch_x3); + src_v_tmp += pitch_x4; + row12 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_v_tmp, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + + tmp0 = __lsx_vilvh_b(p0, p1); + tmp1 = __lsx_vilvh_b(q1, q0); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src_u_tmp += 2; + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3); + + src_v_tmp += 2; + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3); +} + +static inline void mbloop_filter_horizontal_edge_y_lsx( + uint8_t *src, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + temp_src = src - pitch_x3; + __lsx_vstx(p2, temp_src, 0); + __lsx_vstx(p1, temp_src, pitch); + __lsx_vstx(p0, temp_src, pitch_x2); + __lsx_vstx(q0, temp_src, pitch_x3); + temp_src += pitch_x4; + __lsx_vstx(q1, temp_src, 0); + __lsx_vstx(q2, temp_src, pitch); +} + +static inline void mbloop_filter_horizontal_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src_u - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u); + temp_src = src_v - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v); + + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + src_u -= pitch_x3; + __lsx_vstelm_d(p2, src_u, 0, 0); + __lsx_vstelm_d(p1, src_u + pitch, 0, 0); + __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0); + __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0); + src_u += pitch_x4; + __lsx_vstelm_d(q1, src_u, 0, 0); + src_u += pitch; + __lsx_vstelm_d(q2, src_u, 0, 0); + + src_v -= pitch_x3; + __lsx_vstelm_d(p2, src_v, 0, 1); + __lsx_vstelm_d(p1, src_v + pitch, 0, 1); + __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1); + __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1); + src_v += pitch_x4; + __lsx_vstelm_d(q1, src_v, 0, 1); + src_v += pitch; + __lsx_vstelm_d(q2, src_v, 0, 1); +} + +static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + temp_src = src - 4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row0, row1, row2, row3); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row4, row5, row6, row7); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row8, row9, row10, row11); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row12, row13, row14, row15); + temp_src -= pitch_x4; + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + temp_src = src - 3; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4); +} + +static inline void mbloop_filter_vertical_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + src_u -= 4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row0, row1, row2, row3); + src_u += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row4, row5, row6, row7); + src_v -= 4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row8, row9, row10, row11); + src_v += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row12, row13, row14, row15); + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + src_u += 1 - pitch_x4; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4); + + src_v += 1 - pitch_x4; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4); +} + +void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_horizontal_edge_uv_lsx( + src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v, + *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_horizontal_edge_uv_lsx( + src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c new file mode 100644 index 0000000000..9867633415 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c @@ -0,0 +1,1904 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = { + { 0, -6, 123, 12, -1, 0, 0, 0 }, + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2, + __m128i coeff0, __m128i coeff1, __m128i coeff2) { + __m128i out0_m; + + out0_m = __lsx_vdp2_h_b(in0, coeff0); + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); + + return out0_m; +} + +static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i mask2, + __m128i filt_h0, __m128i filt_h1, + __m128i filt_h2) { + __m128i vec0_m, vec1_m, vec2_m; + __m128i hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + vec2_m = __lsx_vshuf_b(src1, src0, mask2); + hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1, + __m128i filt0, __m128i filt1) { + __m128i tmp_m; + + tmp_m = __lsx_vdp2_h_b(vec0, filt0); + tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1); + + return tmp_m; +} + +static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i filt_h0, + __m128i filt_h1) { + __m128i vec0_m, vec1_m, hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out0, out1); \ + } while (0) + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \ + vec7_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ + out3); \ + } while (0) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + } while (0) + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + } while (0) + +static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride_x2 << 1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out2, out3); + + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + src += src_stride_x4 - 8; + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT, + out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2, + out3); + DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT, + out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6, + out7); + DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1, + out2, out3); + DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5, + out6, out7); + out = __lsx_vpickev_b(out1, out0); + out = __lsx_vxori_b(out, 128); + __lsx_vst(out, dst, 0); + out = __lsx_vpickev_b(out3, out2); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride); + out = __lsx_vpickev_b(out5, out4); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x2); + out = __lsx_vpickev_b(out7, out6); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, + src8776); + DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); + out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + +static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10; + __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + __m128i src109_r, filt0, filt1, filt2; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3, + src10_r, src32_r, src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, filt0, filt1, filt2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_r, src32_r, src43_r, src21_r); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_l, src32_l, src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + src6 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + + src7 = __lsx_vld(src, 0); + src8 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); + hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_hz2; + __m128i mask0, mask1, mask2, vec0, vec1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i tmp0, tmp1, tmp2, tmp3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride_x2); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2, + hz_out1, hz_out4, hz_out3, out0, out1, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, + VP8_FILTER_SHIFT, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + +static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i filt0, filt1, mask0, mask1; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + src += src_stride_x4 - 8; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6, + VP8_FILTER_SHIFT, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0, + out1, out2, out3); + __lsx_vstx(out0, dst, 0); + __lsx_vstx(out1, dst, dst_stride); + __lsx_vstx(out2, dst, dst_stride_x2); + __lsx_vstx(out3, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, filt0, filt1, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src1 = __lsx_vld(src, 0); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + src2110 = __lsx_vilvl_d(src21_r, src10_r); + src2110 = __lsx_vxori_b(src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src += src_stride_x3; + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); + src4332 = __lsx_vilvl_d(src43_r, src32_r); + src4332 = __lsx_vxori_b(src4332, 128); + out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1); + + src2 = __lsx_vld(src, 0); + src += src_stride; + DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); + src2110 = __lsx_vilvl_d(src65_r, src54_r); + src2110 = __lsx_vxori_b(src2110, 128); + out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + } +} + +static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src7, src8, src9, src10; + __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1); + out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1); + out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1); + out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1); + out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static inline void common_hv_4ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, out0, out1; + __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3; + __m128i vec0, vec1, vec2, vec3, vec4; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); + tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + vec0 = vec4; + vec2 = vec1; + } +} + +static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static inline void common_hv_6ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + __m128i out0, out1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); + tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride, + src, src_stride_x2, src0, src1, src3, src4); + src2 = __lsx_vld(src, 0); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + src += src_stride_x4; + + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static inline void common_hv_4ht_6vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, mask0, mask1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec0, vec1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride_x2; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +typedef void (*PVp8SixtapPredictFunc1)( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height); + +typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter, + int32_t height); + +void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = { + common_hv_6ht_6vt_4w_lsx, + common_hv_6ht_4vt_4w_lsx, + common_hv_4ht_6vt_4w_lsx, + common_hv_4ht_4vt_4w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx, + common_vt_4t_4w_lsx, + common_hz_6t_4w_lsx, + common_hz_4t_4w_lsx }; + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + break; + case 1: + Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 4); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + Predict4x4Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: { + __m128i tp0; + + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + + break; + } + case 2: + case 4: + case 6: + Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4); + break; + } + switch (xoffset & 1) { + case 1: + Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 4); + break; + } + } + } +} + +void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = { + common_hv_6ht_6vt_8w_lsx, + common_hv_6ht_4vt_8w_lsx, + common_hv_4ht_6vt_8w_lsx, + common_hv_4ht_4vt_8w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx, + common_vt_4t_8w_lsx, + common_hz_6t_8w_lsx, + common_hz_4t_8w_lsx }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 8); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 8); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8); + break; + + case 1: + Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 8); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 8); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8); + break; + } + } + } +} + +void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = { + common_hv_6ht_6vt_16w_lsx, + common_hv_6ht_4vt_16w_lsx, + common_hv_4ht_6vt_16w_lsx, + common_hv_4ht_4vt_16w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = { + common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx, + common_hz_4t_16w_lsx + }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[0](src, src_stride, dst, dst_stride, + h_filter, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[1](src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 16); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 16); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter, + 16); + break; + + case 1: + Predict16x16Funcs2[1](src, src_stride, dst, dst_stride, + v_filter + 1, 16); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 16); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16); + break; + } + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/loopfilter.h b/media/libvpx/libvpx/vp8/common/loopfilter.h new file mode 100644 index 0000000000..909e8df512 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loopfilter.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_LOOPFILTER_H_ +#define VPX_VP8_COMMON_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "vpx_config.h" +#include "vp8_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +/* fraction of total macroblock rows to be used in fast filter level picking */ +/* has to be > 2 */ +#define PARTIAL_FRAME_FRACTION 8 + +typedef enum { NORMAL_LOOPFILTER = 0, SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; + +#if VPX_ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and + * passed it can be loaded into vector registers. + */ +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct loop_filter_info { + const unsigned char *mblim; + const unsigned char *blim; + const unsigned char *lim; + const unsigned char *hev_thr; +} loop_filter_info; + +typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */ + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + unsigned char *v); + +/* assorted loopfilter functions which get used elsewhere */ +struct VP8Common; +struct macroblockd; +struct modeinfo; + +void vp8_loop_filter_init(struct VP8Common *cm); + +void vp8_loop_filter_frame_init(struct VP8Common *cm, struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd, + int frame_type); + +void vp8_loop_filter_partial_frame(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame_yonly(struct VP8Common *cm, struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl); + +void vp8_loop_filter_row_normal(struct VP8Common *cm, + struct modeinfo *mode_info_context, int mb_row, + int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr); + +void vp8_loop_filter_row_simple(struct VP8Common *cm, + struct modeinfo *mode_info_context, int mb_row, + int post_ystride, unsigned char *y_ptr); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_LOOPFILTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c new file mode 100644 index 0000000000..61a55d3c92 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef unsigned char uc; + +static signed char vp8_signed_char_clamp(int t) { + t = (t < -128 ? -128 : t); + t = (t > 127 ? 127 : t); + return (signed char)t; +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_filter_mask(uc limit, uc blimit, uc p3, uc p2, uc p1, + uc p0, uc q0, uc q1, uc q2, uc q3) { + signed char mask = 0; + mask |= (abs(p3 - p2) > limit); + mask |= (abs(p2 - p1) > limit); + mask |= (abs(p1 - p0) > limit); + mask |= (abs(q1 - q0) > limit); + mask |= (abs(q2 - q1) > limit); + mask |= (abs(q3 - q2) > limit); + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit); + return mask - 1; +} + +/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ +static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, + uc *oq1) { + signed char ps0, qs0; + signed char ps1, qs1; + signed char filter_value, Filter1, Filter2; + signed char u; + + ps1 = (signed char)*op1 ^ 0x80; + ps0 = (signed char)*op0 ^ 0x80; + qs0 = (signed char)*oq0 ^ 0x80; + qs1 = (signed char)*oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value &= hev; + + /* inner taps */ + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 + * if it equals 4 we'll set it to adjust by -1 to account for the fact + * we'd round it by 3 the other way + */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = vp8_signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + filter_value = Filter1; + + /* outer tap adjustments */ + filter_value += 1; + filter_value >>= 1; + filter_value &= ~hev; + + u = vp8_signed_char_clamp(qs1 - filter_value); + *oq1 = u ^ 0x80; + u = vp8_signed_char_clamp(ps1 + filter_value); + *op1 = u ^ 0x80; +} + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do { + mask = vp8_filter_mask(limit[0], blimit[0], s[-4 * p], s[-3 * p], s[-2 * p], + s[-1 * p], s[0 * p], s[1 * p], s[2 * p], s[3 * p]); + + hev = vp8_hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); + + vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + + ++s; + } while (++i < count * 8); +} + +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do { + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], + s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } while (++i < count * 8); +} + +static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, + uc *oq0, uc *oq1, uc *oq2) { + signed char s, u; + signed char filter_value, Filter1, Filter2; + signed char ps2 = (signed char)*op2 ^ 0x80; + signed char ps1 = (signed char)*op1 ^ 0x80; + signed char ps0 = (signed char)*op0 ^ 0x80; + signed char qs0 = (signed char)*oq0 ^ 0x80; + signed char qs1 = (signed char)*oq1 ^ 0x80; + signed char qs2 = (signed char)*oq2 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + Filter2 = filter_value; + Filter2 &= hev; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + /* only apply wider filter if not high edge variance */ + filter_value &= ~hev; + Filter2 = filter_value; + + /* roughly 3/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + /* roughly 2/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + /* roughly 1/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do { + mask = vp8_filter_mask(limit[0], blimit[0], s[-4 * p], s[-3 * p], s[-2 * p], + s[-1 * p], s[0 * p], s[1 * p], s[2 * p], s[3 * p]); + + hev = vp8_hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + + ++s; + } while (++i < count * 8); +} + +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + do { + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], + s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + s += p; + } while (++i < count * 8); +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, + uc q1) { + /* Why does this cause problems for win32? + * error C2143: syntax error : missing ';' before 'type' + * (void) limit; + */ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; + return mask; +} + +static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, + uc *oq1) { + signed char filter_value, Filter1, Filter2; + signed char p1 = (signed char)*op1 ^ 0x80; + signed char p0 = (signed char)*op0 ^ 0x80; + signed char q0 = (signed char)*oq0 ^ 0x80; + signed char q1 = (signed char)*oq1 ^ 0x80; + signed char u; + + filter_value = vp8_signed_char_clamp(p1 - q1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + signed char mask = 0; + int i = 0; + + do { + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2 * y_stride], + y_ptr[-1 * y_stride], y_ptr[0 * y_stride], + y_ptr[1 * y_stride]); + vp8_simple_filter(mask, y_ptr - 2 * y_stride, y_ptr - 1 * y_stride, y_ptr, + y_ptr + 1 * y_stride); + ++y_ptr; + } while (++i < 16); +} + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + signed char mask = 0; + int i = 0; + + do { + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0], + y_ptr[1]); + vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1); + y_ptr += y_stride; + } while (++i < 16); +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) { + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + + if (v_ptr) { + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) { + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + + if (v_ptr) { + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) { + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); + } + + if (v_ptr) { + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); + } +} + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) { + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + } + + if (v_ptr) { + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + } +} + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/libvpx/vp8/common/mbpitch.c b/media/libvpx/libvpx/vp8/common/mbpitch.c new file mode 100644 index 0000000000..188b57f389 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mbpitch.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "blockd.h" + +void vp8_setup_block_dptrs(MACROBLOCKD *x) { + int r, c; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + x->block[r * 4 + c].predictor = x->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; ++r) { + for (c = 0; c < 2; ++c) { + x->block[16 + r * 2 + c].predictor = + x->predictor + 256 + r * 4 * 8 + c * 4; + } + } + + for (r = 0; r < 2; ++r) { + for (c = 0; c < 2; ++c) { + x->block[20 + r * 2 + c].predictor = + x->predictor + 320 + r * 4 * 8 + c * 4; + } + } + + for (r = 0; r < 25; ++r) { + x->block[r].qcoeff = x->qcoeff + r * 16; + x->block[r].dqcoeff = x->dqcoeff + r * 16; + x->block[r].eob = x->eobs + r; + } +} + +void vp8_build_block_doffsets(MACROBLOCKD *x) { + int block; + + for (block = 0; block < 16; ++block) /* y blocks */ + { + x->block[block].offset = + (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4; + } + + for (block = 16; block < 20; ++block) /* U and V blocks */ + { + x->block[block + 4].offset = x->block[block].offset = + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mfqe.c b/media/libvpx/libvpx/vp8/common/mfqe.c new file mode 100644 index 0000000000..1fe7363f17 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mfqe.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* MFQE: Multiframe Quality Enhancement + * In rate limited situations keyframes may cause significant visual artifacts + * commonly referred to as "popping." This file implements a postproccesing + * algorithm which blends data from the preceeding frame when there is no + * motion and the q from the previous frame is lower which indicates that it is + * higher quality. + */ + +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/common.h" +#include "vp8/common/postproc.h" +#include "vpx_dsp/variance.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +#include +#include + +static void filter_by_weight(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, int block_size, + int src_weight) { + int dst_weight = (1 << MFQE_PRECISION) - src_weight; + int rounding_bit = 1 << (MFQE_PRECISION - 1); + int r, c; + + for (r = 0; r < block_size; ++r) { + for (c = 0; c < block_size; ++c) { + dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit) >> + MFQE_PRECISION; + } + src += src_stride; + dst += dst_stride; + } +} + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight); +} + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight); +} + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight); +} + +static void apply_ifactor(unsigned char *y_src, int y_src_stride, + unsigned char *y_dst, int y_dst_stride, + unsigned char *u_src, unsigned char *v_src, + int uv_src_stride, unsigned char *u_dst, + unsigned char *v_dst, int uv_dst_stride, + int block_size, int src_weight) { + if (block_size == 16) { + vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, + src_weight); + vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, + src_weight); + vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, + src_weight); + } else { + vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, + src_weight); + vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, + src_weight); + vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, + src_weight); + } +} + +static unsigned int int_sqrt(unsigned int x) { + unsigned int y = x; + unsigned int guess; + int p = 1; + while (y >>= 1) p++; + p >>= 1; + + guess = 0; + while (p >= 0) { + guess |= (1 << p); + if (x < guess * guess) guess -= (1 << p); + p--; + } + /* choose between guess or guess+1 */ + return guess + (guess * guess + guess + 1 <= x); +} + +#define USE_SSD +static void multiframe_quality_enhance_block( + int blksize, /* Currently only values supported are 16, 8 */ + int qcurr, int qprev, unsigned char *y, unsigned char *u, unsigned char *v, + int y_stride, int uv_stride, unsigned char *yd, unsigned char *ud, + unsigned char *vd, int yd_stride, int uvd_stride) { + static const unsigned char VP8_ZEROS[16] = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; + int uvblksize = blksize >> 1; + int qdiff = qcurr - qprev; + + int i; + unsigned char *up; + unsigned char *udp; + unsigned char *vp; + unsigned char *vdp; + + unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk; + + if (blksize == 16) { + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse) + 128) >> 8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse) + 128) >> 8; +#ifdef USE_SSD + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); + sad = (sse + 128) >> 8; + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + usad = (sse + 32) >> 6; + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vsad = (sse + 32) >> 6; +#else + sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; + usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; + vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; +#endif + } else { + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; +#ifdef USE_SSD + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); + sad = (sse + 32) >> 6; + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + usad = (sse + 8) >> 4; + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vsad = (sse + 8) >> 4; +#else + sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; + usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4; + vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4; +#endif + } + + actrisk = (actd > act * 5); + + /* thr = qdiff/16 + log2(act) + log4(qprev) */ + thr = (qdiff >> 4); + while (actd >>= 1) thr++; + while (qprev >>= 2) thr++; + +#ifdef USE_SSD + thrsq = thr * thr; + if (sad < thrsq && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk) +#else + if (sad < thr && + /* additional checks for color mismatch and excessive addition of + * high-frequencies */ + 2 * usad < thr && 2 * vsad < thr && !actrisk) +#endif + { + int ifactor; +#ifdef USE_SSD + /* TODO: optimize this later to not need sqr root */ + sad = int_sqrt(sad); +#endif + ifactor = (sad << MFQE_PRECISION) / thr; + ifactor >>= (qdiff >> 5); + + if (ifactor) { + apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, + uvd_stride, blksize, ifactor); + } + } else { /* else implicitly copy from previous frame */ + if (blksize == 16) { + vp8_copy_mem16x16(y, y_stride, yd, yd_stride); + vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); + vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); + } else { + vp8_copy_mem8x8(y, y_stride, yd, yd_stride); + for (up = u, udp = ud, i = 0; i < uvblksize; + ++i, up += uv_stride, udp += uvd_stride) { + memcpy(udp, up, uvblksize); + } + for (vp = v, vdp = vd, i = 0; i < uvblksize; + ++i, vp += uv_stride, vdp += uvd_stride) { + memcpy(vdp, vp, uvblksize); + } + } + } +} + +static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) { + if (mode_info_context->mbmi.mb_skip_coeff) { + map[0] = map[1] = map[2] = map[3] = 1; + } else if (mode_info_context->mbmi.mode == SPLITMV) { + static int ndx[4][4] = { + { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 } + }; + int i, j; + vp8_zero(*map); + for (i = 0; i < 4; ++i) { + map[i] = 1; + for (j = 0; j < 4 && map[j]; ++j) { + map[i] &= (mode_info_context->bmi[ndx[i][j]].mv.as_mv.row <= 2 && + mode_info_context->bmi[ndx[i][j]].mv.as_mv.col <= 2); + } + } + } else { + map[0] = map[1] = map[2] = map[3] = + (mode_info_context->mbmi.mode > B_PRED && + abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 && + abs(mode_info_context->mbmi.mv.as_mv.col) <= 2); + } + return (map[0] + map[1] + map[2] + map[3]); +} + +void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { + YV12_BUFFER_CONFIG *show = cm->frame_to_show; + YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer; + + FRAME_TYPE frame_type = cm->frame_type; + /* Point at base of Mb MODE_INFO list has motion vectors etc */ + const MODE_INFO *mode_info_context = cm->mi; + int mb_row; + int mb_col; + int totmap, map[4]; + int qcurr = cm->base_qindex; + int qprev = cm->postproc_state.last_base_qindex; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + unsigned char *yd_ptr, *ud_ptr, *vd_ptr; + + /* Set up the buffer pointers */ + y_ptr = show->y_buffer; + u_ptr = show->u_buffer; + v_ptr = show->v_buffer; + yd_ptr = dest->y_buffer; + ud_ptr = dest->u_buffer; + vd_ptr = dest->v_buffer; + + /* postprocess each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + /* if motion is high there will likely be no benefit */ + if (frame_type == INTER_FRAME) { + totmap = qualify_inter_mb(mode_info_context, map); + } else { + totmap = (frame_type == KEY_FRAME ? 4 : 0); + } + if (totmap) { + if (totmap < 4) { + int i, j; + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + if (map[i * 2 + j]) { + multiframe_quality_enhance_block( + 8, qcurr, qprev, y_ptr + 8 * (i * show->y_stride + j), + u_ptr + 4 * (i * show->uv_stride + j), + v_ptr + 4 * (i * show->uv_stride + j), show->y_stride, + show->uv_stride, yd_ptr + 8 * (i * dest->y_stride + j), + ud_ptr + 4 * (i * dest->uv_stride + j), + vd_ptr + 4 * (i * dest->uv_stride + j), dest->y_stride, + dest->uv_stride); + } else { + /* copy a 8x8 block */ + int k; + unsigned char *up = u_ptr + 4 * (i * show->uv_stride + j); + unsigned char *udp = ud_ptr + 4 * (i * dest->uv_stride + j); + unsigned char *vp = v_ptr + 4 * (i * show->uv_stride + j); + unsigned char *vdp = vd_ptr + 4 * (i * dest->uv_stride + j); + vp8_copy_mem8x8( + y_ptr + 8 * (i * show->y_stride + j), show->y_stride, + yd_ptr + 8 * (i * dest->y_stride + j), dest->y_stride); + for (k = 0; k < 4; ++k, up += show->uv_stride, + udp += dest->uv_stride, vp += show->uv_stride, + vdp += dest->uv_stride) { + memcpy(udp, up, 4); + memcpy(vdp, vp, 4); + } + } + } + } + } else { /* totmap = 4 */ + multiframe_quality_enhance_block( + 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, + show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, + dest->uv_stride); + } + } else { + vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride); + vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride); + vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride); + } + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + yd_ptr += 16; + ud_ptr += 8; + vd_ptr += 8; + mode_info_context++; /* step to next MB */ + } + + y_ptr += show->y_stride * 16 - 16 * cm->mb_cols; + u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols; + v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols; + yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols; + ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols; + vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols; + + mode_info_context++; /* Skip border mb */ + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c new file mode 100644 index 0000000000..1cfd146189 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_DSPR2 +void vp8_dequant_idct_add_dspr2(short *input, short *dq, unsigned char *dest, + int stride) { + int i; + + for (i = 0; i < 16; ++i) { + input[i] = dq[i] * input[i]; + } + + vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride); + + memset(input, 0, 32); +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c new file mode 100644 index 0000000000..b9da52084d --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -0,0 +1,2767 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "vp8_rtcd.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +#define CROP_WIDTH 256 +unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH]; + +static const unsigned short sub_pel_filterss[8][3] = { + { 0, 0, 0 }, + { 0, 0x0601, 0x7b0c }, + { 0x0201, 0x0b08, 0x6c24 }, + { 0, 0x0906, 0x5d32 }, + { 0x0303, 0x1010, 0x4d4d }, + { 0, 0x0609, 0x325d }, + { 0x0102, 0x080b, 0x246c }, + { 0, 0x0106, 0x0c7b }, +}; + +static const int sub_pel_filters_int[8][3] = { + { 0, 0, 0 }, + { 0x0000fffa, 0x007b000c, 0xffff0000 }, + { 0x0002fff5, 0x006c0024, 0xfff80001 }, + { 0x0000fff7, 0x005d0032, 0xfffa0000 }, + { 0x0003fff0, 0x004d004d, 0xfff00003 }, + { 0x0000fffa, 0x0032005d, 0xfff70000 }, + { 0x0001fff8, 0x0024006c, 0xfff50002 }, + { 0x0000ffff, 0x000c007b, 0xfffa0000 }, +}; + +static const int sub_pel_filters_inv[8][3] = { + { 0, 0, 0 }, + { 0xfffa0000, 0x000c007b, 0x0000ffff }, + { 0xfff50002, 0x0024006c, 0x0001fff8 }, + { 0xfff70000, 0x0032005d, 0x0000fffa }, + { 0xfff00003, 0x004d004d, 0x0003fff0 }, + { 0xfffa0000, 0x005d0032, 0x0000fff7 }, + { 0xfff80001, 0x006c0024, 0x0002fff5 }, + { 0xffff0000, 0x007b000c, 0x0000fffa }, +}; + +/* clang-format off */ +static const int sub_pel_filters_int_tap_4[8][2] = { + { 0, 0}, + { 0xfffa007b, 0x000cffff}, + { 0, 0}, + { 0xfff7005d, 0x0032fffa}, + { 0, 0}, + { 0xfffa0032, 0x005dfff7}, + { 0, 0}, + { 0xffff000c, 0x007bfffa}, +}; + + +static const int sub_pel_filters_inv_tap_4[8][2] = { + { 0, 0}, + { 0x007bfffa, 0xffff000c}, + { 0, 0}, + { 0x005dfff7, 0xfffa0032}, + { 0, 0}, + { 0x0032fffa, 0xfff7005d}, + { 0, 0}, + { 0x000cffff, 0xfffa007b}, +}; +/* clang-format on */ + +inline void prefetch_load(unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +inline void prefetch_store(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} + +void dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; ++i) ff_cropTbl[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; ++i) { + ff_cropTbl[i] = 0; + ff_cropTbl[i + CROP_WIDTH + 256] = 255; + } +} + +void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT dst_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, int xoffset, + int pitch) { + unsigned int i; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a = 64; + int vector1b, vector2b, vector3b; + unsigned int tp1, tp2, tn1, tn2; + unsigned int p1, p2, p3; + unsigned int n1, n2, n3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector3b = sub_pel_filters_inv[xoffset][2]; + + /* if (xoffset == 0) we don't need any filtering */ + if (vector3b == 0) { + for (i = 0; i < output_height; ++i) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + + /* next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += 4; + } + } else { + if (vector3b > 65536) { + /* 6 tap filter */ + + vector1b = sub_pel_filters_inv[xoffset][0]; + vector2b = sub_pel_filters_inv[xoffset][1]; + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + for (i = output_height; i--;) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -2(%[src_ptr]) \n\t" + "ulw %[tp2], 2(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p1], %[tp2] \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + + /* odd 1. pixel */ + "ulw %[tn2], 3(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + /* clamp */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "sb %[tn1], 1(%[dst_ptr]) \n\t" + "sb %[tp2], 2(%[dst_ptr]) \n\t" + "sb %[n2], 3(%[dst_ptr]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr), + [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr)); + + /* Next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } else { + /* 4 tap filter */ + + vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; + vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; + + for (i = output_height; i--;) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -1(%[src_ptr]) \n\t" + "ulw %[tp2], 3(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 1. pixel */ + "srl %[tn1], %[tp2], 8 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn1] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "sb %[tn1], 1(%[dst_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[tp2], 2(%[dst_ptr]) \n\t" + "sb %[n2], 3(%[dst_ptr]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), + [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr), + [src_ptr] "r"(src_ptr)); + /* Next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } + } +} + +void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT dst_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + int xoffset, int pitch) { + unsigned int i; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a = 64; + unsigned int vector1b, vector2b, vector3b; + unsigned int tp1, tp2, tn1, tn2; + unsigned int p1, p2, p3, p4; + unsigned int n1, n2, n3, n4; + + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + /* if (xoffset == 0) we don't need any filtering */ + if (xoffset == 0) { + for (i = 0; i < output_height; ++i) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr[4] = src_ptr[4]; + dst_ptr[5] = src_ptr[5]; + dst_ptr[6] = src_ptr[6]; + dst_ptr[7] = src_ptr[7]; + + /* next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += 8; + } + } else { + vector3b = sub_pel_filters_inv[xoffset][2]; + + if (vector3b > 65536) { + /* 6 tap filter */ + + vector1b = sub_pel_filters_inv[xoffset][0]; + vector2b = sub_pel_filters_inv[xoffset][1]; + + for (i = output_height; i--;) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -2(%[src_ptr]) \n\t" + "ulw %[tp2], 2(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p1], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + "ulw %[tn2], 3(%[src_ptr]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "ulw %[tp1], 6(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), + [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + + "ulw %[tn1], 7(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n4], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4) + : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2), + [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1), + [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), + [n3] "r"(n3), [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } else { + /* 4 tap filter */ + + vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; + vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; + + for (i = output_height; i--;) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -1(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + + "ulw %[tp2], 3(%[src_ptr]) \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + "ulw %[tn2], 4(%[src_ptr]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "ulw %[tp1], 7(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t" + "ulw %[tn1], 8(%[src_ptr]) \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4) + : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4), + [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3), + [n4] "r"(n4)); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + /* next row... */ + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } + } + } +} + +void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT dst_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + int xoffset, int pitch) { + unsigned int i; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a; + unsigned int vector1b, vector2b, vector3b; + unsigned int tp1, tp2, tn1, tn2; + unsigned int p1, p2, p3, p4; + unsigned int n1, n2, n3, n4; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector1b = sub_pel_filters_inv[xoffset][0]; + vector2b = sub_pel_filters_inv[xoffset][1]; + vector3b = sub_pel_filters_inv[xoffset][2]; + vector4a = 64; + + for (i = output_height; i--;) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + src_pixels_per_line); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -2(%[src_ptr]) \n\t" + "ulw %[tp2], 2(%[src_ptr]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p1], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + + "balign %[tp2], %[tp1], 3 \n\t" + "ulw %[tn2], 3(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "ulw %[tp1], 6(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p2], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2), + [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + dst_ptr[0] = cm[Temp1]; + dst_ptr[1] = cm[Temp2]; + dst_ptr[2] = cm[Temp3]; + dst_ptr[3] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__( + /* even 3. pixel */ + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "ulw %[tn1], 7(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 3. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n2], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n4], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "ulw %[tp2], 10(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + dst_ptr[4] = cm[Temp1]; + dst_ptr[5] = cm[Temp2]; + dst_ptr[6] = cm[Temp3]; + dst_ptr[7] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__( + /* even 5. pixel */ + "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + + /* even 6. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t" + + "ulw %[tn1], 11(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 5. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 6. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n3], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t" + "ulw %[tp1], 14(%[src_ptr]) \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[p4], %[tp1] \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), + [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), + [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), + [vector3b] "r"(vector3b)); + + /* clamp and store results */ + dst_ptr[8] = cm[Temp1]; + dst_ptr[9] = cm[Temp2]; + dst_ptr[10] = cm[Temp3]; + dst_ptr[11] = cm[Temp4]; + + /* next 4 pixels */ + __asm__ __volatile__( + /* even 7. pixel */ + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t" + + /* even 8. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t" + "ulw %[tn1], 15(%[src_ptr]) \n\t" + "extp %[Temp1], $ac3, 9 \n\t" + + /* odd 7. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "preceu.ph.qbr %[n4], %[tn1] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t" + "extp %[Temp3], $ac2, 9 \n\t" + + /* odd 8. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "preceu.ph.qbl %[n2], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + "extp %[Temp4], $ac2, 9 \n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp3](%[cm]) \n\t" + "sb %[tp1], 12(%[dst_ptr]) \n\t" + "sb %[tn1], 13(%[dst_ptr]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + "sb %[p2], 14(%[dst_ptr]) \n\t" + "sb %[n2], 15(%[dst_ptr]) \n\t" + + : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), + [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + + src_ptr += src_pixels_per_line; + dst_ptr += pitch; + } +} + +void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + unsigned int src_pixels_per_line) { + int Temp1, Temp2, Temp3, Temp4; + int i; + + /* prefetch src_ptr data to cache memory */ + prefetch_store(output_ptr + 32); + + /* copy memory from src buffer to dst buffer */ + for (i = 0; i < 7; ++i) { + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[output_ptr]) \n\t" + "sw %[Temp2], 4(%[output_ptr]) \n\t" + "sw %[Temp3], 8(%[output_ptr]) \n\t" + "sw %[Temp4], 12(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); + + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[output_ptr]) \n\t" + "sw %[Temp2], 20(%[output_ptr]) \n\t" + "sw %[Temp3], 24(%[output_ptr]) \n\t" + "sw %[Temp4], 28(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); + + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "ulw %[Temp3], 8(%[src_ptr]) \n\t" + "ulw %[Temp4], 12(%[src_ptr]) \n\t" + "sw %[Temp1], 32(%[output_ptr]) \n\t" + "sw %[Temp2], 36(%[output_ptr]) \n\t" + "sw %[Temp3], 40(%[output_ptr]) \n\t" + "sw %[Temp4], 44(%[output_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); + + output_ptr += 48; + } +} + +void vp8_filter_block2d_first_pass16_4tap( + unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr, + unsigned int src_pixels_per_line, unsigned int output_width, + unsigned int output_height, int xoffset, int yoffset, + unsigned char *RESTRICT dst_ptr, int pitch) { + unsigned int i, j; + int Temp1, Temp2, Temp3, Temp4; + + unsigned int vector4a; + int vector1b, vector2b; + unsigned int tp1, tp2, tp3, tn1; + unsigned int p1, p2, p3; + unsigned int n1, n2, n3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; + vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; + + /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */ + if (yoffset == 0) { + output_height -= 5; + src_ptr += (src_pixels_per_line + src_pixels_per_line); + + for (i = output_height; i--;) { + __asm__ __volatile__("ulw %[tp3], -1(%[src_ptr]) \n\t" + : [tp3] "=&r"(tp3) + : [src_ptr] "r"(src_ptr)); + + /* processing 4 adjacent pixels */ + for (j = 0; j < 16; j += 4) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp2], 3(%[src_ptr]) " + "\n\t" + "move %[tp1], %[tp3] " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 " + "\n\t" + "mthi $0, $ac3 " + "\n\t" + "move %[tp3], %[tp2] " + "\n\t" + "preceu.ph.qbr %[p1], %[tp1] " + "\n\t" + "preceu.ph.qbl %[p2], %[tp1] " + "\n\t" + "preceu.ph.qbr %[p3], %[tp2] " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] " + "\n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 " + "\n\t" + "mthi $0, $ac2 " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] " + "\n\t" + "extr.w %[Temp1], $ac3, 7 " + "\n\t" + + /* odd 1. pixel */ + "ulw %[tn1], 4(%[src_ptr]) " + "\n\t" + "balign %[tp2], %[tp1], 3 " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "mthi $0, $ac3 " + "\n\t" + "preceu.ph.qbr %[n1], %[tp2] " + "\n\t" + "preceu.ph.qbl %[n2], %[tp2] " + "\n\t" + "preceu.ph.qbr %[n3], %[tn1] " + "\n\t" + "extr.w %[Temp3], $ac2, 7 " + "\n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] " + "\n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] " + "\n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 " + "\n\t" + "mthi $0, $ac2 " + "\n\t" + "extr.w %[Temp2], $ac3, 7 " + "\n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] " + "\n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] " + "\n\t" + "extr.w %[Temp4], $ac2, 7 " + "\n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) " + "\n\t" + "lbux %[tn1], %[Temp2](%[cm]) " + "\n\t" + "lbux %[tp2], %[Temp3](%[cm]) " + "\n\t" + "sb %[tp1], 0(%[dst_ptr]) " + "\n\t" + "sb %[tn1], 1(%[dst_ptr]) " + "\n\t" + "lbux %[n2], %[Temp4](%[cm]) " + "\n\t" + "sb %[tp2], 2(%[dst_ptr]) " + "\n\t" + "sb %[n2], 3(%[dst_ptr]) " + "\n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1), + [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr), + [src_ptr] "r"(src_ptr)); + + src_ptr += 4; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - 16; + dst_ptr += pitch; + } + } else { + for (i = output_height; i--;) { + /* processing 4 adjacent pixels */ + for (j = 0; j < 16; j += 4) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "ulw %[tp1], -1(%[src_ptr]) " + "\n\t" + "ulw %[tp2], 3(%[src_ptr]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 " + "\n\t" + "mthi $0, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[tp1] " + "\n\t" + "preceu.ph.qbl %[p2], %[tp1] " + "\n\t" + "preceu.ph.qbr %[p3], %[tp2] " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] " + "\n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 " + "\n\t" + "mthi $0, $ac2 " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] " + "\n\t" + "extr.w %[Temp1], $ac3, 7 " + "\n\t" + + /* odd 1. pixel */ + "ulw %[tn1], 4(%[src_ptr]) " + "\n\t" + "balign %[tp2], %[tp1], 3 " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "mthi $0, $ac3 " + "\n\t" + "preceu.ph.qbr %[n1], %[tp2] " + "\n\t" + "preceu.ph.qbl %[n2], %[tp2] " + "\n\t" + "preceu.ph.qbr %[n3], %[tn1] " + "\n\t" + "extr.w %[Temp3], $ac2, 7 " + "\n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] " + "\n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] " + "\n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac2 " + "\n\t" + "mthi $0, $ac2 " + "\n\t" + "extr.w %[Temp2], $ac3, 7 " + "\n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] " + "\n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] " + "\n\t" + "extr.w %[Temp4], $ac2, 7 " + "\n\t" + + /* clamp and store results */ + "lbux %[tp1], %[Temp1](%[cm]) " + "\n\t" + "lbux %[tn1], %[Temp2](%[cm]) " + "\n\t" + "lbux %[tp2], %[Temp3](%[cm]) " + "\n\t" + "sb %[tp1], 0(%[output_ptr]) " + "\n\t" + "sb %[tn1], 1(%[output_ptr]) " + "\n\t" + "lbux %[n2], %[Temp4](%[cm]) " + "\n\t" + "sb %[tp2], 2(%[output_ptr]) " + "\n\t" + "sb %[n2], 3(%[output_ptr]) " + "\n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), + [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector4a] "r"(vector4a), [cm] "r"(cm), + [output_ptr] "r"(output_ptr), [src_ptr] "r"(src_ptr)); + + src_ptr += 4; + } + + /* next row... */ + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, int yoffset) { + unsigned int i; + + int Temp1, Temp2, Temp3, Temp4; + unsigned int vector1b, vector2b, vector3b, vector4a; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + /* load filter coefficients */ + vector1b = sub_pel_filterss[yoffset][0]; + vector2b = sub_pel_filterss[yoffset][2]; + vector3b = sub_pel_filterss[yoffset][1]; + + if (vector1b) { + /* 6 tap filter */ + + for (i = 2; i--;) { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + /* do not allow compiler to reorder instructions */ + __asm__ __volatile__( + ".set noreorder \n\t" + : + :); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1), + [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1), + [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2), + [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + output_ptr += output_pitch; + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1), + [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1), + [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2), + [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } else { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + for (i = 2; i--;) { + /* do not allow compiler to reorder instructions */ + __asm__ __volatile__( + ".set noreorder \n\t" + : + :); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1), + [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1), + [src_ptr_r2] "=&r"(src_ptr_r2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + output_ptr += output_pitch; + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1), + [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1), + [src_ptr_r2] "=&r"(src_ptr_r2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } +} + +void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + unsigned int yoffset) { + unsigned int i; + + int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8; + unsigned int vector1b, vector2b, vector3b, vector4a; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + (void)output_width; + + vector4a = 64; + + vector1b = sub_pel_filterss[yoffset][0]; + vector2b = sub_pel_filterss[yoffset][2]; + vector3b = sub_pel_filterss[yoffset][1]; + + if (vector1b) { + /* 6 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + for (i = output_height; i--;) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2), + [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp6], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp7], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac1, 9 \n\t" + + : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), + [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), + [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2), + [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } else { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr); + + for (i = output_height; i--;) { + __asm__ __volatile__( + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + __asm__ __volatile__( + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1), + [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1), + [src_ptr_r2] "=&r"(src_ptr_r2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + src_ptr_l1 = src_ptr[-6]; + src_ptr_0 = src_ptr[2]; + src_ptr_r1 = src_ptr[10]; + src_ptr_r2 = src_ptr[18]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac0 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp2], $ac3, 9 \n\t" + + : [Temp2] "=r"(Temp2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + src_ptr_l1 = src_ptr[-5]; + src_ptr_0 = src_ptr[3]; + src_ptr_r1 = src_ptr[11]; + src_ptr_r2 = src_ptr[19]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac1 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp3], $ac0, 9 \n\t" + + : [Temp3] "=r"(Temp3) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + src_ptr_l1 = src_ptr[-4]; + src_ptr_0 = src_ptr[4]; + src_ptr_r1 = src_ptr[12]; + src_ptr_r2 = src_ptr[20]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac2 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp4], $ac1, 9 \n\t" + + : [Temp4] "=r"(Temp4) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + src_ptr_l1 = src_ptr[-3]; + src_ptr_0 = src_ptr[5]; + src_ptr_r1 = src_ptr[13]; + src_ptr_r2 = src_ptr[21]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac3 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + : [Temp5] "=&r"(Temp5) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + src_ptr_l1 = src_ptr[-2]; + src_ptr_0 = src_ptr[6]; + src_ptr_r1 = src_ptr[14]; + src_ptr_r2 = src_ptr[22]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac0 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp6], $ac3, 9 \n\t" + + : [Temp6] "=r"(Temp6) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + src_ptr_l1 = src_ptr[-1]; + src_ptr_0 = src_ptr[7]; + src_ptr_r1 = src_ptr[15]; + src_ptr_r2 = src_ptr[23]; + + __asm__ __volatile__( + "mtlo %[vector4a], $ac1 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp7], $ac0, 9 \n\t" + "extp %[Temp8], $ac1, 9 \n\t" + + : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0), + [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2), + [vector4a] "r"(vector4a)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + src_ptr += 8; + output_ptr += output_pitch; + } + } +} + +void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr, + unsigned char *RESTRICT output_ptr, + int output_pitch, + const unsigned short *vp8_filter) { + unsigned int i, j; + + int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8; + unsigned int vector4a; + unsigned int vector1b, vector2b, vector3b; + + unsigned char src_ptr_l2; + unsigned char src_ptr_l1; + unsigned char src_ptr_0; + unsigned char src_ptr_r1; + unsigned char src_ptr_r2; + unsigned char src_ptr_r3; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + vector4a = 64; + + vector1b = vp8_filter[0]; + vector2b = vp8_filter[2]; + vector3b = vp8_filter[1]; + + if (vector1b == 0) { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 16); + + for (i = 16; i--;) { + /* unrolling for loop */ + for (j = 0; j < 16; j += 8) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l1], -16(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 16(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 32(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac2 " + "\n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -15(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 17(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 33(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "extp %[Temp1], $ac2, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -14(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 18(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 34(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac1 " + "\n\t" + "extp %[Temp2], $ac3, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -13(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 19(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 35(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "extp %[Temp3], $ac1, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -12(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 20(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 36(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac2 " + "\n\t" + "extp %[Temp4], $ac3, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -11(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 21(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 37(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "extp %[Temp5], $ac2, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -10(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 22(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 38(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac1 " + "\n\t" + "extp %[Temp6], $ac3, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] " + "\n\t" + + "lbu %[src_ptr_l1], -9(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r1], 23(%[src_ptr]) " + "\n\t" + "lbu %[src_ptr_r2], 39(%[src_ptr]) " + "\n\t" + "mtlo %[vector4a], $ac3 " + "\n\t" + "extp %[Temp7], $ac1, 9 " + "\n\t" + + "append %[src_ptr_0], %[src_ptr_r1], 8 " + "\n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 " + "\n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " + "\n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " + "\n\t" + "extp %[Temp8], $ac3, 9 " + "\n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), + [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), + [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2) + : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), + [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[j] = cm[Temp1]; + output_ptr[j + 1] = cm[Temp2]; + output_ptr[j + 2] = cm[Temp3]; + output_ptr[j + 3] = cm[Temp4]; + output_ptr[j + 4] = cm[Temp5]; + output_ptr[j + 5] = cm[Temp6]; + output_ptr[j + 6] = cm[Temp7]; + output_ptr[j + 7] = cm[Temp8]; + + src_ptr += 8; + } + + output_ptr += output_pitch; + } + } else { + /* 4 tap filter */ + + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 16); + + /* unroll for loop */ + for (i = 16; i--;) { + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp2], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp3], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp4], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp6], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp7], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac3, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), + [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), + [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2), + [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + /* clamp and store results */ + output_ptr[0] = cm[Temp1]; + output_ptr[1] = cm[Temp2]; + output_ptr[2] = cm[Temp3]; + output_ptr[3] = cm[Temp4]; + output_ptr[4] = cm[Temp5]; + output_ptr[5] = cm[Temp6]; + output_ptr[6] = cm[Temp7]; + output_ptr[7] = cm[Temp8]; + + /* apply filter with vectors pairs */ + __asm__ __volatile__( + "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp1], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp2], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp3], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "extp %[Temp4], $ac3, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac0 \n\t" + "extp %[Temp5], $ac2, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "extp %[Temp6], $ac0, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" + + "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t" + "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" + "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t" + "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t" + "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t" + "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "extp %[Temp7], $ac1, 9 \n\t" + + "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" + "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" + "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" + "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" + "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" + "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" + "extp %[Temp8], $ac3, 9 \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), + [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), + [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), + [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2), + [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), + [src_ptr] "r"(src_ptr)); + + src_ptr += 16; + output_ptr[8] = cm[Temp1]; + output_ptr[9] = cm[Temp2]; + output_ptr[10] = cm[Temp3]; + output_ptr[11] = cm[Temp4]; + output_ptr[12] = cm[Temp5]; + output_ptr[13] = cm[Temp6]; + output_ptr[14] = cm[Temp7]; + output_ptr[15] = cm[Temp8]; + + output_ptr += output_pitch; + } + } +} + +void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *RESTRICT dst_ptr, + int dst_pitch) { + unsigned char FData[9 * 4]; /* Temp data bufffer used in filtering */ + unsigned int pos = 16; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (yoffset) { + /* First filter 1-D horizontally... */ + vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 9, xoffset, 4); + /* then filter verticaly... */ + vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset); + } else + /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */ + vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4, + xoffset, dst_pitch); +} + +void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *RESTRICT dst_ptr, + int dst_pitch) { + unsigned char FData[13 * 8]; /* Temp data bufffer used in filtering */ + unsigned int pos, Temp1, Temp2; + + pos = 16; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (yoffset) { + src_ptr = src_ptr - (2 * src_pixels_per_line); + + if (xoffset) /* filter 1-D horizontally... */ + vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line, + 13, xoffset, 8); + + else { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 2 * src_pixels_per_line); + + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[FData]) \n\t" + "sw %[Temp2], 4(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 8(%[FData]) \n\t" + "sw %[Temp2], 12(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[FData]) \n\t" + "sw %[Temp2], 20(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 24(%[FData]) \n\t" + "sw %[Temp2], 28(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 32(%[FData]) \n\t" + "sw %[Temp2], 36(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 40(%[FData]) \n\t" + "sw %[Temp2], 44(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 48(%[FData]) \n\t" + "sw %[Temp2], 52(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 56(%[FData]) \n\t" + "sw %[Temp2], 60(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 64(%[FData]) \n\t" + "sw %[Temp2], 68(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 72(%[FData]) \n\t" + "sw %[Temp2], 76(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 80(%[FData]) \n\t" + "sw %[Temp2], 84(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 88(%[FData]) \n\t" + "sw %[Temp2], 92(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 96(%[FData]) \n\t" + "sw %[Temp2], 100(%[FData]) \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2) + : [FData] "r"(FData), [src_ptr] "r"(src_ptr), + [src_pixels_per_line] "r"(src_pixels_per_line)); + } + + /* filter verticaly... */ + vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, + yoffset); + } + + /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */ + else { + if (xoffset) + vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line, + 8, xoffset, dst_pitch); + + else { + /* copy from src buffer to dst buffer */ + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[dst_ptr]) \n\t" + "sw %[Temp2], 4(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 8(%[dst_ptr]) \n\t" + "sw %[Temp2], 12(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[dst_ptr]) \n\t" + "sw %[Temp2], 20(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 24(%[dst_ptr]) \n\t" + "sw %[Temp2], 28(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 32(%[dst_ptr]) \n\t" + "sw %[Temp2], 36(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 40(%[dst_ptr]) \n\t" + "sw %[Temp2], 44(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 48(%[dst_ptr]) \n\t" + "sw %[Temp2], 52(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 56(%[dst_ptr]) \n\t" + "sw %[Temp2], 60(%[dst_ptr]) \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2) + : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [src_pixels_per_line] "r"(src_pixels_per_line)); + } + } +} + +void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *RESTRICT dst_ptr, + int dst_pitch) { + unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */ + unsigned int pos, Temp1, Temp2; + + pos = 16; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (yoffset) { + src_ptr = src_ptr - (2 * src_pixels_per_line); + + if (xoffset) /* filter 1-D horizontally... */ + vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line, + 9, xoffset, 8); + + else { + /* prefetch src_ptr data to cache memory */ + prefetch_load(src_ptr + 2 * src_pixels_per_line); + + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[FData]) \n\t" + "sw %[Temp2], 4(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 8(%[FData]) \n\t" + "sw %[Temp2], 12(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[FData]) \n\t" + "sw %[Temp2], 20(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 24(%[FData]) \n\t" + "sw %[Temp2], 28(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 32(%[FData]) \n\t" + "sw %[Temp2], 36(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 40(%[FData]) \n\t" + "sw %[Temp2], 44(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 48(%[FData]) \n\t" + "sw %[Temp2], 52(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 56(%[FData]) \n\t" + "sw %[Temp2], 60(%[FData]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 64(%[FData]) \n\t" + "sw %[Temp2], 68(%[FData]) \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2) + : [FData] "r"(FData), [src_ptr] "r"(src_ptr), + [src_pixels_per_line] "r"(src_pixels_per_line)); + } + + /* filter verticaly... */ + vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, + yoffset); + } + + /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */ + else { + if (xoffset) + vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line, + 4, xoffset, dst_pitch); + + else { + /* copy from src buffer to dst buffer */ + __asm__ __volatile__( + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 0(%[dst_ptr]) \n\t" + "sw %[Temp2], 4(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 8(%[dst_ptr]) \n\t" + "sw %[Temp2], 12(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 16(%[dst_ptr]) \n\t" + "sw %[Temp2], 20(%[dst_ptr]) \n\t" + "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" + + "ulw %[Temp1], 0(%[src_ptr]) \n\t" + "ulw %[Temp2], 4(%[src_ptr]) \n\t" + "sw %[Temp1], 24(%[dst_ptr]) \n\t" + "sw %[Temp2], 28(%[dst_ptr]) \n\t" + + : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2) + : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [src_pixels_per_line] "r"(src_pixels_per_line)); + } + } +} + +void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *RESTRICT dst_ptr, + int dst_pitch) { + const unsigned short *VFilter; + unsigned char FData[21 * 16]; /* Temp data bufffer used in filtering */ + unsigned int pos; + + VFilter = sub_pel_filterss[yoffset]; + + pos = 16; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (yoffset) { + src_ptr = src_ptr - (2 * src_pixels_per_line); + + switch (xoffset) { + /* filter 1-D horizontally... */ + case 2: + case 4: + case 6: + /* 6 tap filter */ + vp8_filter_block2d_first_pass16_6tap( + src_ptr, FData, src_pixels_per_line, 21, xoffset, 16); + break; + + case 0: + /* only copy buffer */ + vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line); + break; + + case 1: + case 3: + case 5: + case 7: + /* 4 tap filter */ + vp8_filter_block2d_first_pass16_4tap( + src_ptr, FData, src_pixels_per_line, 16, 21, xoffset, yoffset, + dst_ptr, dst_pitch); + break; + } + + /* filter verticaly... */ + vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter); + } else { + /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */ + switch (xoffset) { + case 2: + case 4: + case 6: + /* 6 tap filter */ + vp8_filter_block2d_first_pass16_6tap( + src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch); + break; + + case 1: + case 3: + case 5: + case 7: + /* 4 tap filter */ + vp8_filter_block2d_first_pass16_4tap( + src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset, + dst_ptr, dst_pitch); + break; + } + } +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c new file mode 100644 index 0000000000..eae852d592 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +#if HAVE_DSPR2 + +void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, + int stride, char *eobs) { + int i, j; + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + if (*eobs++ > 1) + vp8_dequant_idct_add_dspr2(q, dq, dst, stride); + else { + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst, stride, dst, stride); + ((int *)q)[0] = 0; + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, + unsigned char *dst_u, + unsigned char *dst_v, int stride, + char *eobs) { + int i, j; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + if (*eobs++ > 1) + vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride); + else { + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride); + ((int *)q)[0] = 0; + } + + q += 16; + dst_u += 4; + } + + dst_u += 4 * stride - 8; + } + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + if (*eobs++ > 1) + vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride); + else { + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride); + ((int *)q)[0] = 0; + } + + q += 16; + dst_v += 4; + } + + dst_v += 4 * stride - 8; + } +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c new file mode 100644 index 0000000000..9163ffad1e --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8_rtcd.h" + +#if HAVE_DSPR2 +#define CROP_WIDTH 256 + +/****************************************************************************** + * Notes: + * + * This implementation makes use of 16 bit fixed point version of two multiply + * constants: + * 1. sqrt(2) * cos (pi/8) + * 2. sqrt(2) * sin (pi/8) + * Since the first constant is bigger than 1, to maintain the same 16 bit + * fixed point precision as the second one, we use a trick of + * x * a = x + x*(a-1) + * so + * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). + ****************************************************************************/ +extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH]; +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; + +inline void prefetch_load_short(short *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int r, c; + int a1, b1, c1, d1; + short output[16]; + short *ip = input; + short *op = output; + int temp1, temp2; + int shortpitch = 4; + + int c2, d2; + int temp3, temp4; + unsigned char *cm = ff_cropTbl + CROP_WIDTH; + + /* prepare data for load */ + prefetch_load_short(ip + 8); + + /* first loop is unrolled */ + a1 = ip[0] + ip[8]; + b1 = ip[0] - ip[8]; + + temp1 = (ip[4] * sinpi8sqrt2) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[12] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + temp3 = (ip[5] * sinpi8sqrt2) >> 16; + temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); + c2 = temp3 - temp4; + + temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); + temp4 = (ip[13] * sinpi8sqrt2) >> 16; + d2 = temp3 + temp4; + + op[0] = a1 + d1; + op[12] = a1 - d1; + op[4] = b1 + c1; + op[8] = b1 - c1; + + a1 = ip[1] + ip[9]; + b1 = ip[1] - ip[9]; + + op[1] = a1 + d2; + op[13] = a1 - d2; + op[5] = b1 + c2; + op[9] = b1 - c2; + + a1 = ip[2] + ip[10]; + b1 = ip[2] - ip[10]; + + temp1 = (ip[6] * sinpi8sqrt2) >> 16; + temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[14] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + temp3 = (ip[7] * sinpi8sqrt2) >> 16; + temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); + c2 = temp3 - temp4; + + temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); + temp4 = (ip[15] * sinpi8sqrt2) >> 16; + d2 = temp3 + temp4; + + op[2] = a1 + d1; + op[14] = a1 - d1; + op[6] = b1 + c1; + op[10] = b1 - c1; + + a1 = ip[3] + ip[11]; + b1 = ip[3] - ip[11]; + + op[3] = a1 + d2; + op[15] = a1 - d2; + op[7] = b1 + c2; + op[11] = b1 - c2; + + ip = output; + + /* prepare data for load */ + prefetch_load_short(ip + shortpitch); + + /* second loop is unrolled */ + a1 = ip[0] + ip[2]; + b1 = ip[0] - ip[2]; + + temp1 = (ip[1] * sinpi8sqrt2) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[3] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + temp3 = (ip[5] * sinpi8sqrt2) >> 16; + temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); + c2 = temp3 - temp4; + + temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); + temp4 = (ip[7] * sinpi8sqrt2) >> 16; + d2 = temp3 + temp4; + + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + + a1 = ip[4] + ip[6]; + b1 = ip[4] - ip[6]; + + op[4] = (a1 + d2 + 4) >> 3; + op[7] = (a1 - d2 + 4) >> 3; + op[5] = (b1 + c2 + 4) >> 3; + op[6] = (b1 - c2 + 4) >> 3; + + a1 = ip[8] + ip[10]; + b1 = ip[8] - ip[10]; + + temp1 = (ip[9] * sinpi8sqrt2) >> 16; + temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[11] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + temp3 = (ip[13] * sinpi8sqrt2) >> 16; + temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); + c2 = temp3 - temp4; + + temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); + temp4 = (ip[15] * sinpi8sqrt2) >> 16; + d2 = temp3 + temp4; + + op[8] = (a1 + d1 + 4) >> 3; + op[11] = (a1 - d1 + 4) >> 3; + op[9] = (b1 + c1 + 4) >> 3; + op[10] = (b1 - c1 + 4) >> 3; + + a1 = ip[12] + ip[14]; + b1 = ip[12] - ip[14]; + + op[12] = (a1 + d2 + 4) >> 3; + op[15] = (a1 - d2 + 4) >> 3; + op[13] = (b1 + c2 + 4) >> 3; + op[14] = (b1 - c2 + 4) >> 3; + + ip = output; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + short a = ip[c] + pred_ptr[c]; + dst_ptr[c] = cm[a]; + } + + ip += 4; + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } +} + +void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1; + int i, absa1; + int t2, vector_a1, vector_a; + + /* a1 = ((input_dc + 4) >> 3); */ + __asm__ __volatile__( + "addi %[a1], %[input_dc], 4 \n\t" + "sra %[a1], %[a1], 3 \n\t" + : [a1] "=r"(a1) + : [input_dc] "r"(input_dc)); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned + */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + /* use (a1 - predptr[c]) instead a1 + predptr[c] */ + for (i = 4; i--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[pred_ptr]) \n\t" + "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dst_ptr]) \n\t" + "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), + [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr) + : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride), + [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned + */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (i = 4; i--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[pred_ptr]) \n\t" + "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t" + "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t" + "sw %[vector_a], 0(%[dst_ptr]) \n\t" + "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), + [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr) + : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride), + [vector_a1] "r"(vector_a1)); + } + } +} + +void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff) { + short output[16]; + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + prefetch_load_short(ip); + + for (i = 4; i--;) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = a1 + b1; + op[4] = c1 + d1; + op[8] = a1 - b1; + op[12] = d1 - c1; + + ip++; + op++; + } + + ip = output; + op = output; + + prefetch_load_short(ip); + + for (i = 4; i--;) { + a1 = ip[0] + ip[3] + 3; + b1 = ip[1] + ip[2]; + c1 = ip[1] - ip[2]; + d1 = ip[0] - ip[3] + 3; + + a2 = a1 + b1; + b2 = d1 + c1; + c2 = a1 - b1; + d2 = d1 - c1; + + op[0] = a2 >> 3; + op[1] = b2 >> 3; + op[2] = c2 >> 3; + op[3] = d2 >> 3; + + ip += 4; + op += 4; + } + + for (i = 0; i < 16; ++i) { + mb_dqcoeff[i * 16] = output[i]; + } +} + +void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff) { + int a1; + + a1 = ((input[0] + 3) >> 3); + + __asm__ __volatile__( + "sh %[a1], 0(%[mb_dqcoeff]) \n\t" + "sh %[a1], 32(%[mb_dqcoeff]) \n\t" + "sh %[a1], 64(%[mb_dqcoeff]) \n\t" + "sh %[a1], 96(%[mb_dqcoeff]) \n\t" + "sh %[a1], 128(%[mb_dqcoeff]) \n\t" + "sh %[a1], 160(%[mb_dqcoeff]) \n\t" + "sh %[a1], 192(%[mb_dqcoeff]) \n\t" + "sh %[a1], 224(%[mb_dqcoeff]) \n\t" + "sh %[a1], 256(%[mb_dqcoeff]) \n\t" + "sh %[a1], 288(%[mb_dqcoeff]) \n\t" + "sh %[a1], 320(%[mb_dqcoeff]) \n\t" + "sh %[a1], 352(%[mb_dqcoeff]) \n\t" + "sh %[a1], 384(%[mb_dqcoeff]) \n\t" + "sh %[a1], 416(%[mb_dqcoeff]) \n\t" + "sh %[a1], 448(%[mb_dqcoeff]) \n\t" + "sh %[a1], 480(%[mb_dqcoeff]) \n\t" + + : + : [a1] "r"(a1), [mb_dqcoeff] "r"(mb_dqcoeff)); +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c new file mode 100644 index 0000000000..e44ae29278 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/vpx_integer.h" + +#if HAVE_DSPR2 +inline void prefetch_load_int(unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +__inline void vp8_copy_mem16x16_dspr2(unsigned char *RESTRICT src, + int src_stride, + unsigned char *RESTRICT dst, + int dst_stride) { + int r; + unsigned int a0, a1, a2, a3; + + for (r = 16; r--;) { + /* load src data in cache memory */ + prefetch_load_int(src + src_stride); + + /* use unaligned memory load and store */ + __asm__ __volatile__( + "ulw %[a0], 0(%[src]) \n\t" + "ulw %[a1], 4(%[src]) \n\t" + "ulw %[a2], 8(%[src]) \n\t" + "ulw %[a3], 12(%[src]) \n\t" + "sw %[a0], 0(%[dst]) \n\t" + "sw %[a1], 4(%[dst]) \n\t" + "sw %[a2], 8(%[dst]) \n\t" + "sw %[a3], 12(%[dst]) \n\t" + : [a0] "=&r"(a0), [a1] "=&r"(a1), [a2] "=&r"(a2), [a3] "=&r"(a3) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } +} + +__inline void vp8_copy_mem8x8_dspr2(unsigned char *RESTRICT src, int src_stride, + unsigned char *RESTRICT dst, + int dst_stride) { + int r; + unsigned int a0, a1; + + /* load src data in cache memory */ + prefetch_load_int(src + src_stride); + + for (r = 8; r--;) { + /* use unaligned memory load and store */ + __asm__ __volatile__( + "ulw %[a0], 0(%[src]) \n\t" + "ulw %[a1], 4(%[src]) \n\t" + "sw %[a0], 0(%[dst]) \n\t" + "sw %[a1], 4(%[dst]) \n\t" + : [a0] "=&r"(a0), [a1] "=&r"(a1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } +} + +__inline void vp8_copy_mem8x4_dspr2(unsigned char *RESTRICT src, int src_stride, + unsigned char *RESTRICT dst, + int dst_stride) { + int r; + unsigned int a0, a1; + + /* load src data in cache memory */ + prefetch_load_int(src + src_stride); + + for (r = 4; r--;) { + /* use unaligned memory load and store */ + __asm__ __volatile__( + "ulw %[a0], 0(%[src]) \n\t" + "ulw %[a1], 4(%[src]) \n\t" + "sw %[a0], 0(%[dst]) \n\t" + "sw %[a1], 4(%[dst]) \n\t" + : [a0] "=&r"(a0), [a1] "=&r"(a1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c new file mode 100644 index 0000000000..21446fb413 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -0,0 +1,2401 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "vp8_rtcd.h" +#include "vp8/common/onyxc_int.h" + +#if HAVE_DSPR2 +typedef unsigned char uc; + +/* prefetch data for load */ +inline void prefetch_load_lf(unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +inline void prefetch_store_lf(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} + +/* processing 4 pixels at the same time + * compute hev and mask in the same function + */ +static __inline void vp8_filter_mask_vec_mips( + uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +/* inputs & outputs are quad-byte vectors */ +static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, + uint32_t *qs1) { + int32_t vp8_filter_l, vp8_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp8_filter &= hev; */ + "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t" + "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t" + + /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + + /* vp8_filter &= mask; */ + "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" + "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" + + : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + + : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l), + [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM)); + + __asm__ __volatile__( + /* (vp8_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vp8_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; + + mask = 0; + hev = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + + /* prefetch data for store */ + prefetch_store_lf(s); + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + sm1 = s - (p << 2); + s0 = s - p - p - p; + s1 = s - p - p; + s2 = s - p; + s3 = s; + s4 = s + p; + s5 = s + p + p; + s6 = s + p + p + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } +} + +void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; + + mask = 0; + hev = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + sm1 = s - (p << 2); + s0 = s - p - p - p; + s1 = s - p - p; + s2 = s - p; + s3 = s; + s4 = s + p; + s5 = s + p + p; + s6 = s + p + p + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood */ + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + } + } +} + +void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p, + const unsigned int flimit, + const unsigned int limit, + const unsigned int thresh, int count) { + int i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + + hev = 0; + mask = 0; + i = 0; + pm1 = 0; + p0 = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + p5 = 0; + p6 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + /* apply filter on 4 pixesl at the same time */ + do { + /* prefetch data for store */ + prefetch_store_lf(s + p); + + s1 = s; + s2 = s + p; + s3 = s2 + p; + s4 = s3 + p; + s = s4 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), + [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), + [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), + [p1] "r"(p1)); + } + } + + s1 = s; + s2 = s + p; + s3 = s2 + p; + s4 = s3 + p; + s = s4 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), + [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), + [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + : + : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), + [p1] "r"(p1)); + } + } + + i += 8; + } + + while (i < count); +} + +void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + /* apply filter on 4 pixesl at the same time */ + + s1 = s; + s2 = s + p; + s3 = s2 + p; + s4 = s3 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1)); + } + } + + s1 = s4 + p; + s2 = s1 + p; + s3 = s2 + p; + s4 = s3 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + : + : + [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1)); + } + } +} + +/* inputs & outputs are quad-byte vectors */ +static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev, + uint32_t *ps2, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, + uint32_t *qs1, uint32_t *qs2) { + int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2; + int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l; + int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r; + uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, + subr_r, subr_l; + uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, + invhev_r; + uint32_t N128, R63; + uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r; + + R63 = 0x003F003F; + HWM = 0xFF00FF00; + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vps2 = (*ps2) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + vqs2 = (*qs2) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + vqs2_l = vqs2 & HWM; + vqs2_r = vqs2 << 8; + vqs2_r = vqs2_r & HWM; + + __asm__ __volatile__( + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r)); + + vps2_l = vps2 & HWM; + vps2_r = vps2 << 8; + vps2_r = vps2_r & HWM; + + /* add outer taps if we have high edge variance */ + __asm__ __volatile__( + /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + "and %[mask_l], %[HWM], %[mask] \n\t" + "sll %[mask_r], %[mask], 8 \n\t" + "and %[mask_r], %[HWM], %[mask_r] \n\t" + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + "and %[hev_l], %[HWM], %[hev] \n\t" + "sll %[hev_r], %[hev], 8 \n\t" + "and %[hev_r], %[HWM], %[hev_r] \n\t" + "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" + + /* vp8_filter &= mask; */ + "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" + "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" + + /* Filter2 = vp8_filter & hev; */ + "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t" + "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t" + + : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r), + [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l), + [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l), + [Filter2_r] "=&r"(Filter2_r) + : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM), + [hev] "r"(hev), [mask] "r"(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */ + "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t" + + /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */ + "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t" + + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + + /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r), + [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l), + [hev_r] "r"(hev_r)); + + /* only apply wider filter if not high edge variance */ + __asm__ __volatile__( + /* vp8_filter &= ~hev; */ + "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t" + "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t" + + : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r) + : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r), + [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* roughly 3/7th difference across boundary */ + __asm__ __volatile__( + "shll.ph %[u3_l], %[Filter2_l], 3 \n\t" + "shll.ph %[u3_r], %[Filter2_r], 3 \n\t" + + "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t" + "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t" + + "shll.ph %[u2_l], %[u3_l], 1 \n\t" + "shll.ph %[u2_r], %[u3_r], 1 \n\t" + + "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t" + "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t" + + "addq.ph %[u2_l], %[u2_l], %[R63] \n\t" + "addq.ph %[u2_r], %[u2_r], %[R63] \n\t" + + "addq.ph %[u3_l], %[u3_l], %[R63] \n\t" + "addq.ph %[u3_r], %[u3_r], %[R63] \n\t" + + /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7) + * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7) + */ + "addq.ph %[u1_l], %[u1_l], %[R63] \n\t" + "addq.ph %[u1_r], %[u1_r], %[R63] \n\t" + "shra.ph %[u1_l], %[u1_l], 7 \n\t" + "shra.ph %[u1_r], %[u1_r], 7 \n\t" + "shra.ph %[u2_l], %[u2_l], 7 \n\t" + "shra.ph %[u2_r], %[u2_r], 7 \n\t" + "shll.ph %[u1_l], %[u1_l], 8 \n\t" + "shll.ph %[u1_r], %[u1_r], 8 \n\t" + "shll.ph %[u2_l], %[u2_l], 8 \n\t" + "shll.ph %[u2_r], %[u2_r], 8 \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - u); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + u); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t" + + : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l), + [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r)); + + __asm__ __volatile__( + /* vqs1 = vp8_signed_char_clamp(qs1 - u); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t" + "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + u); */ + "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t" + + : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r)); + + /* roughly 1/7th difference across boundary */ + __asm__ __volatile__( + /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */ + "shra.ph %[u3_l], %[u3_l], 7 \n\t" + "shra.ph %[u3_r], %[u3_r], 7 \n\t" + "shll.ph %[u3_l], %[u3_l], 8 \n\t" + "shll.ph %[u3_r], %[u3_r], 8 \n\t" + + /* vqs2 = vp8_signed_char_clamp(qs2 - u); */ + "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t" + "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t" + + /* vps2 = vp8_signed_char_clamp(ps2 + u); */ + "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t" + "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t" + + : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l), + [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r) + :); + + /* Create quad-bytes from halfword pairs */ + __asm__ __volatile__( + "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t" + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + + "and %[vps0_l], %[vps0_l], %[HWM] \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + + "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + + "and %[vps1_l], %[vps1_l], %[HWM] \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t" + "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t" + + "and %[vps2_l], %[vps2_l], %[HWM] \n\t" + "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t" + + "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t" + "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t" + "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t" + "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t" + "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t" + "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t" + + : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), + [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l), + [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l) + : [HWM] "r"(HWM)); + + *ps0 = vps0_r ^ N128; + *ps1 = vps1_r ^ N128; + *ps2 = vps2_r ^ N128; + *qs0 = vqs0_r ^ N128; + *qs1 = vqs1_r ^ N128; + *qs2 = vqs2_r ^ N128; +} + +void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + int i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + + mask = 0; + hev = 0; + i = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + sm1 = s - (p << 2); + s0 = s - p - p - p; + s1 = s - p - p; + s2 = s - p; + s3 = s; + s4 = s + p; + s5 = s + p + p; + s6 = s + p + p + p; + + /* prefetch data for load */ + prefetch_load_lf(s + p); + + /* apply filter on 4 pixesl at the same time */ + do { + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* unpack processed 4x4 neighborhood + * memory is 4 byte aligned + */ + *((uint32_t *)s0) = p0; + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + *((uint32_t *)s5) = p5; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* unpack processed 4x4 neighborhood + * memory is 4 byte aligned + */ + *((uint32_t *)s0) = p0; + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + *((uint32_t *)s5) = p5; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + i += 8; + } + + while (i < count); +} + +void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; + + mask = 0; + hev = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + sm1 = s - (p << 2); + s0 = s - p - p - p; + s1 = s - p - p; + s2 = s - p; + s3 = s; + s4 = s + p; + s5 = s + p + p; + s6 = s + p + p + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + /* if mask == 0 do filtering is not needed */ + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* unpack processed 4x4 neighborhood + * memory is 4 byte aligned + */ + *((uint32_t *)s0) = p0; + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + *((uint32_t *)s5) = p5; + } + } + + sm1 += 4; + s0 += 4; + s1 += 4; + s2 += 4; + s3 += 4; + s4 += 4; + s5 += 4; + s6 += 4; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p1 = *((uint32_t *)(s1)); + p2 = *((uint32_t *)(s2)); + p3 = *((uint32_t *)(s3)); + p4 = *((uint32_t *)(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + pm1 = *((uint32_t *)(sm1)); + p0 = *((uint32_t *)(s0)); + p5 = *((uint32_t *)(s5)); + p6 = *((uint32_t *)(s6)); + + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* unpack processed 4x4 neighborhood + * memory is 4 byte aligned + */ + *((uint32_t *)s0) = p0; + *((uint32_t *)s1) = p1; + *((uint32_t *)s2) = p2; + *((uint32_t *)s3) = p3; + *((uint32_t *)s4) = p4; + *((uint32_t *)s5) = p5; + } + } +} + +void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + int i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + + mask = 0; + hev = 0; + i = 0; + pm1 = 0; + p0 = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + p5 = 0; + p6 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + /* apply filter on 4 pixesl at the same time */ + do { + s1 = s; + s2 = s + p; + s3 = s2 + p; + s4 = s3 + p; + s = s4 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p5], 2(%[s4]) \n\t" + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + "sb %[p0], -3(%[s4]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s3]) \n\t" + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + "sb %[p0], -3(%[s3]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s2]) \n\t" + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + "sb %[p0], -3(%[s2]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s1]) \n\t" + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + "sb %[p0], -3(%[s1]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + } + } + + i += 4; + } + + while (i < count); +} + +void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, + unsigned int flimit, + unsigned int limit, + unsigned int thresh, int count) { + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + unsigned char *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; + + mask = 0; + hev = 0; + pm1 = 0; + p0 = 0; + p1 = 0; + p2 = 0; + p3 = 0; + p4 = 0; + p5 = 0; + p6 = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + + /* apply filter on 4 pixesl at the same time */ + + s1 = s; + s2 = s + p; + s3 = s2 + p; + s4 = s3 + p; + + /* prefetch data for load */ + prefetch_load_lf(s + 2 * p); + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p5], 2(%[s4]) \n\t" + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + "sb %[p0], -3(%[s4]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s3]) \n\t" + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + "sb %[p0], -3(%[s3]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s2]) \n\t" + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + "sb %[p0], -3(%[s2]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s1]) \n\t" + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + "sb %[p0], -3(%[s1]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + } + } + + s1 = s4 + p; + s2 = s1 + p; + s3 = s2 + p; + s4 = s3 + p; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, + thresh, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); + + /* don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p5], 2(%[s4]) \n\t" + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + "sb %[p0], -3(%[s4]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s3]) \n\t" + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + "sb %[p0], -3(%[s3]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s2]) \n\t" + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + "sb %[p0], -3(%[s2]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + + __asm__ __volatile__( + "srl %[p5], %[p5], 8 \n\t" + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + "srl %[p0], %[p0], 8 \n\t" + : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), + [p1] "+r"(p1), [p0] "+r"(p0) + :); + + __asm__ __volatile__( + "sb %[p5], 2(%[s1]) \n\t" + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + "sb %[p0], -3(%[s1]) \n\t" + : + : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); + } + } +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, + int uv_stride, loop_filter_info *lfi) { + unsigned int thresh_vec, flimit_vec, limit_vec; + unsigned char thresh, flimit, limit, flimit_temp; + + /* use direct value instead pointers */ + limit = *(lfi->lim); + flimit_temp = *(lfi->mblim); + thresh = *(lfi->hev_thr); + flimit = flimit_temp; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[thresh] \n\t" + "replv.qb %[flimit_vec], %[flimit] \n\t" + "replv.qb %[limit_vec], %[limit] \n\t" + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit)); + + vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, + thresh_vec, 16); + + if (u_ptr) { + vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); + } + + if (v_ptr) { + vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); + } +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, + int uv_stride, loop_filter_info *lfi) { + unsigned int thresh_vec, flimit_vec, limit_vec; + unsigned char thresh, flimit, limit, flimit_temp; + + /* use direct value instead pointers */ + limit = *(lfi->lim); + flimit_temp = *(lfi->mblim); + thresh = *(lfi->hev_thr); + flimit = flimit_temp; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[thresh] \n\t" + "replv.qb %[flimit_vec], %[flimit] \n\t" + "replv.qb %[limit_vec], %[limit] \n\t" + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit)); + + vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, + thresh_vec, 16); + + if (u_ptr) + vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); + + if (v_ptr) + vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned int thresh_vec, flimit_vec, limit_vec; + unsigned char thresh, flimit, limit, flimit_temp; + + /* use direct value instead pointers */ + limit = *(lfi->lim); + flimit_temp = *(lfi->blim); + thresh = *(lfi->hev_thr); + flimit = flimit_temp; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[thresh] \n\t" + "replv.qb %[flimit_vec], %[flimit] \n\t" + "replv.qb %[limit_vec], %[limit] \n\t" + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit)); + + vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, + flimit_vec, limit_vec, thresh_vec, 16); + vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, + flimit_vec, limit_vec, thresh_vec, 16); + vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, + flimit_vec, limit_vec, thresh_vec, 16); + + if (u_ptr) + vp8_loop_filter_uvhorizontal_edge_mips( + u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0); + + if (v_ptr) + vp8_loop_filter_uvhorizontal_edge_mips( + v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned int thresh_vec, flimit_vec, limit_vec; + unsigned char thresh, flimit, limit, flimit_temp; + + /* use direct value instead pointers */ + limit = *(lfi->lim); + flimit_temp = *(lfi->blim); + thresh = *(lfi->hev_thr); + flimit = flimit_temp; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[thresh] \n\t" + "replv.qb %[flimit_vec], %[flimit] \n\t" + "replv.qb %[limit_vec], %[limit] \n\t" + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit)); + + vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, + thresh_vec, 16); + vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, + thresh_vec, 16); + vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, + limit_vec, thresh_vec, 16); + + if (u_ptr) + vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); + + if (v_ptr) + vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, + limit_vec, thresh_vec, 0); +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 0000000000..86a32aa9ef --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 0000000000..b9330a6663 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 0000000000..4fd6854c52 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, char *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int stride, char *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst_u, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_u += 4; + } + + dst_u += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst_v, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_v += 4; + } + + dst_v += 4 * stride - 8; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 0000000000..a35689dd30 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; + + __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" + MMI_LI(%[tmp0], 0x02) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; + int low32; + + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" + MMI_LI(%[tmp0], 0x03) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..a07a7e3b41 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1415 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_03]"=&f"(ff_pb_03) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint64_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_VPSRAB_ADDH \ + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..b85f73fdff --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint64_t tmp[1]; + double ff_ph_40; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + double ff_ph_40; + uint64_t tmp[1]; + mips_reg addr[1]; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. + */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format on */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c new file mode 100644 index 0000000000..c7fb1ed33f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vp8/common/filter.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = { + { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, + { 48, 80 }, { 32, 96 }, { 16, 112 } +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[16]); + + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[16]); + + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&vp8_mc_filt_mask_arr[16]); + + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[16]); + + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&vp8_mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + } else { + common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4); + } + } else { + if (xoffset) { + common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4); + } else { + uint32_t tp0, tp1, tp2, tp3; + + LW4(src, src_stride, tp0, tp1, tp2, tp3); + SW4(tp0, tp1, tp2, tp3, dst, dst_stride); + } + } +} + +void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + } else { + common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4); + } + } else { + if (xoffset) { + common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4); + } else { + vp8_copy_mem8x4(src, src_stride, dst, dst_stride); + } + } +} + +void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter, + v_filter, 8); + } else { + common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8); + } + } else { + if (xoffset) { + common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8); + } else { + vp8_copy_mem8x8(src, src_stride, dst, dst_stride); + } + } +} + +void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter, + v_filter, 16); + } else { + common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16); + } + } else { + if (xoffset) { + common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16); + } else { + vp8_copy_mem16x16(src, src_stride, dst, dst_stride); + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c new file mode 100644 index 0000000000..357c99b8b6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void copy_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + uint64_t src0, src1, src2, src3; + + LD4(src, src_stride, src0, src1, src2, src3); + SD4(src0, src1, src2, src3, dst, dst_stride); +} + +static void copy_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + uint64_t src0, src1, src2, src3; + + LD4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + SD4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + + LD4(src, src_stride, src0, src1, src2, src3); + SD4(src0, src1, src2, src3, dst, dst_stride); +} + +static void copy_16x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(src, src_stride, src8, src9, src10, src11, src12, src13, src14, src15); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, dst_stride); +} + +void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + copy_16x16_msa(src, src_stride, dst, dst_stride); +} + +void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + copy_8x8_msa(src, src_stride, dst, dst_stride); +} + +void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride) { + copy_8x4_msa(src, src_stride, dst, dst_stride); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c new file mode 100644 index 0000000000..efad0c29f8 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static const int32_t cospi8sqrt2minus1 = 20091; +static const int32_t sinpi8sqrt2 = 35468; + +#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s4_m, s5_m, s6_m, s7_m; \ + \ + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ + ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m); \ + } + +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \ + ({ \ + v8i16 out_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp1_m, tmp2_m; \ + v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2); \ + \ + ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m); \ + tmp1_m >>= 16; \ + tmp2_m >>= 16; \ + tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16; \ + tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16; \ + out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m); \ + \ + out_m; \ + }) + +#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 a1_m, b1_m, c1_m, d1_m; \ + v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + v8i16 const_cospi8sqrt2minus1_m; \ + \ + const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1); \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \ + c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = c_tmp2_m >> 1; \ + c_tmp2_m = in3 + c_tmp2_m; \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = d_tmp1_m >> 1; \ + d_tmp1_m = in1 + d_tmp1_m; \ + d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2); \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16; \ + c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16); \ + d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } + +static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + v16i8 zero = { 0 }; + v16i8 pred0, pred1, pred2, pred3; + + LD_SH2(input, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3); + ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1, + res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, + res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + res0 = CLIP_SW_0_255(res0); + res1 = CLIP_SW_0_255(res1); + res2 = CLIP_SW_0_255(res2); + res3 = CLIP_SW_0_255(res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); +} + +static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + v8i16 vec, res0, res1, res2, res3, dst0, dst1; + v16i8 zero = { 0 }; + v16i8 pred0, pred1, pred2, pred3; + + vec = __msa_fill_h(in_dc); + vec = __msa_srari_h(vec, 3); + LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3); + ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1, + res2, res3); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + CLIP_SH4_0_255(res0, res1, res2, res3); + PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1); + dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); +} + +void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) { + v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + + LD_SH2(input, 8, input0, input1); + input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADD2(tmp0, 3, tmp1, 3, out0, out1); + out0 >>= 3; + out1 >>= 3; + mb_dqcoeff[0] = __msa_copy_s_h(out0, 0); + mb_dqcoeff[16] = __msa_copy_s_h(out0, 4); + mb_dqcoeff[32] = __msa_copy_s_h(out1, 0); + mb_dqcoeff[48] = __msa_copy_s_h(out1, 4); + mb_dqcoeff[64] = __msa_copy_s_h(out0, 1); + mb_dqcoeff[80] = __msa_copy_s_h(out0, 5); + mb_dqcoeff[96] = __msa_copy_s_h(out1, 1); + mb_dqcoeff[112] = __msa_copy_s_h(out1, 5); + mb_dqcoeff[128] = __msa_copy_s_h(out0, 2); + mb_dqcoeff[144] = __msa_copy_s_h(out0, 6); + mb_dqcoeff[160] = __msa_copy_s_h(out1, 2); + mb_dqcoeff[176] = __msa_copy_s_h(out1, 6); + mb_dqcoeff[192] = __msa_copy_s_h(out0, 3); + mb_dqcoeff[208] = __msa_copy_s_h(out0, 7); + mb_dqcoeff[224] = __msa_copy_s_h(out1, 3); + mb_dqcoeff[240] = __msa_copy_s_h(out1, 7); +} + +static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1; + v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h; + v16u8 dest0, dest1, dest2, dest3; + v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + v2i64 zero = { 0 }; + + LD_SH2(input, 8, input0, input1); + LD_SH2(dequant_input, 8, dequant_in0, dequant_in1); + MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1); + PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2); + PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3); + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h); + PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1); + UNPCK_SH_SW(mul0, hz0_w, hz1_w); + UNPCK_SH_SW(mul1, hz2_w, hz3_w); + TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w); + VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, + res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, + res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + res0 = CLIP_SW_0_255(res0); + res1 = CLIP_SW_0_255(res1); + res2 = CLIP_SW_0_255(res2); + res3 = CLIP_SW_0_255(res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); +} + +static void dequant_idct4x4_addblk_2x_msa(int16_t *input, + int16_t *dequant_input, uint8_t *dest, + int32_t dest_stride) { + v16u8 dest0, dest1, dest2, dest3; + v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; + v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; + v16i8 zero = { 0 }; + + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH2(dequant_input, 8, dequant_in0, dequant_in1); + MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1, + mul0, mul1, mul2, mul3); + PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2); + PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3); + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + UNPCK_SH_SW(hz0, hz0r, hz0l); + UNPCK_SH_SW(hz1, hz1r, hz1l); + UNPCK_SH_SW(hz2, hz2r, hz2l); + UNPCK_SH_SW(hz3, hz3r, hz3l); + VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); + SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3); + VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r); + SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3); + PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2, + vt3); + TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); + ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, + res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SH4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l); + ST8x4_UB(vt0l, vt1l, dest, dest_stride); + + __asm__ __volatile__( + "sw $zero, 0(%[input]) \n\t" + "sw $zero, 4(%[input]) \n\t" + "sw $zero, 8(%[input]) \n\t" + "sw $zero, 12(%[input]) \n\t" + "sw $zero, 16(%[input]) \n\t" + "sw $zero, 20(%[input]) \n\t" + "sw $zero, 24(%[input]) \n\t" + "sw $zero, 28(%[input]) \n\t" + "sw $zero, 32(%[input]) \n\t" + "sw $zero, 36(%[input]) \n\t" + "sw $zero, 40(%[input]) \n\t" + "sw $zero, 44(%[input]) \n\t" + "sw $zero, 48(%[input]) \n\t" + "sw $zero, 52(%[input]) \n\t" + "sw $zero, 56(%[input]) \n\t" + "sw $zero, 60(%[input]) \n\t" :: + + [input] "r"(input)); +} + +static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3; + v16u8 dest0, dest1, dest2, dest3; + v16i8 zero = { 0 }; + + input_dc0 = __msa_fill_h(input[0] * dequant_input[0]); + input_dc1 = __msa_fill_h(input[16] * dequant_input[0]); + SRARI_H2_SH(input_dc0, input_dc1, 3); + vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0); + input[0] = 0; + input[16] = 0; + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); + ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, + res2, res3); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + CLIP_SH4_0_255(res0, res1, res2, res3); + PCKEV_B2_SH(res1, res0, res3, res2, res0, res1); + ST8x4_UB(res0, res1, dest, dest_stride); +} + +void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride); +} + +void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); +} + +void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC) { + v8i16 dqc0, dqc1, q0, q1, dq0, dq1; + + LD_SH2(DQC, 8, dqc0, dqc1); + LD_SH2(d->qcoeff, 8, q0, q1); + MUL2(dqc0, q0, dqc1, q1, dq0, dq1); + ST_SH2(dq0, dq1, d->dqcoeff, 8); +} + +void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq, uint8_t *dest, + int32_t stride) { + dequant_idct4x4_addblk_msa(input, dq, dest, stride); + + __asm__ __volatile__( + "sw $zero, 0(%[input]) \n\t" + "sw $zero, 4(%[input]) \n\t" + "sw $zero, 8(%[input]) \n\t" + "sw $zero, 12(%[input]) \n\t" + "sw $zero, 16(%[input]) \n\t" + "sw $zero, 20(%[input]) \n\t" + "sw $zero, 24(%[input]) \n\t" + "sw $zero, 28(%[input]) \n\t" + + : + : [input] "r"(input)); +} + +void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst, + int32_t stride, char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + uint8_t i; + + for (i = 4; i--;) { + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst, stride); + } + } + + q += 32; + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst + 8, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst + 8, stride); + } + } + + q += 32; + dst += (4 * stride); + eobs_h += 2; + } +} + +void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, + char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); + } + } + + q += 32; + dst_u += (stride * 4); + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); + } + } + + q += 32; + + if (eobs_h[2]) { + if (eobs_h[2] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); + } + } + + q += 32; + dst_v += (stride * 4); + + if (eobs_h[3]) { + if (eobs_h[3] & 0xfefe) { + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c new file mode 100644 index 0000000000..98a4fc09a3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \ + { \ + v16u8 p1_a_sub_q1, p0_a_sub_q0; \ + \ + p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \ + p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \ + p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \ + p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \ + mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \ + mask = ((v16u8)mask <= b_limit); \ + } + +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \ + v16i8 q0_sub_p0; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= cnst3b; \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ + } + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \ + v8i16 filt_r, filt_l, u_r, u_l; \ + v8i16 temp0, temp1, temp2, temp3; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + const v8i16 cnst9h = __msa_ldi_h(9); \ + const v8i16 cnst63h = __msa_ldi_h(63); \ + \ + p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + \ + t2 = filt & hev; \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + t1 = __msa_adds_s_b(t2, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(t2, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + filt_sign = __msa_clti_s_b(filt, 0); \ + ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ + temp0 = filt_r * cnst9h; \ + temp1 = temp0 + cnst63h; \ + temp2 = filt_l * cnst9h; \ + temp3 = temp2 + cnst63h; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q2_m = __msa_subs_s_b(q2_m, u); \ + p2_m = __msa_adds_s_b(p2_m, u); \ + q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ + p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q1_m = __msa_subs_s_b(q1_m, u); \ + p1_m = __msa_adds_s_b(p1_m, u); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q0_m = __msa_subs_s_b(q0_m, u); \ + p0_m = __msa_adds_s_b(p0_m, u); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \ + p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \ + p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \ + q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \ + q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \ + q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \ + p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \ + p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = (thresh_in) < (v16u8)flat_out; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = (b_limit_in) < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + mask_out = (limit_in) < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } + +#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \ + { \ + uint16_t tmp0_h; \ + uint32_t tmp0_w; \ + \ + tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx); \ + tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx); \ + SW(tmp0_w, pdst); \ + SH(tmp0_h, pdst + stride); \ + } + +static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} + +static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + + b_limit = (v16u8)__msa_fill_b(b_limit_in); + limit = (v16u8)__msa_fill_b(limit_in); + thresh = (v16u8)__msa_fill_b(thresh_in); + temp_src = src - (pitch << 2); + LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + temp_src = src - 3 * pitch; + ST_UB4(p2, p1, p0, q0, temp_src, pitch); + temp_src += (4 * pitch); + ST_UB2(q1, q2, temp_src, pitch); +} + +static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + b_limit = (v16u8)__msa_fill_b(b_limit_in); + limit = (v16u8)__msa_fill_b(limit_in); + thresh = (v16u8)__msa_fill_b(thresh_in); + + temp_src = src_u - (pitch << 2); + LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u); + temp_src = src_v - (pitch << 2); + LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v); + + ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0); + ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + p2_d = __msa_copy_u_d((v2i64)p2, 0); + p1_d = __msa_copy_u_d((v2i64)p1, 0); + p0_d = __msa_copy_u_d((v2i64)p0, 0); + q0_d = __msa_copy_u_d((v2i64)q0, 0); + q1_d = __msa_copy_u_d((v2i64)q1, 0); + q2_d = __msa_copy_u_d((v2i64)q2, 0); + src_u -= (pitch * 3); + SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch); + src_u += 4 * pitch; + SD(q1_d, src_u); + src_u += pitch; + SD(q2_d, src_u); + + p2_d = __msa_copy_u_d((v2i64)p2, 1); + p1_d = __msa_copy_u_d((v2i64)p1, 1); + p0_d = __msa_copy_u_d((v2i64)p0, 1); + q0_d = __msa_copy_u_d((v2i64)q0, 1); + q1_d = __msa_copy_u_d((v2i64)q1, 1); + q2_d = __msa_copy_u_d((v2i64)q2, 1); + src_v -= (pitch * 3); + SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch); + src_v += 4 * pitch; + SD(q1_d, src_v); + src_v += pitch; + SD(q2_d, src_v); +} + +static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + b_limit = (v16u8)__msa_fill_b(b_limit_in); + limit = (v16u8)__msa_fill_b(limit_in); + thresh = (v16u8)__msa_fill_b(thresh_in); + temp_src = src - 4; + LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4); + ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7); + ILVRL_B2_SH(q2, q1, tmp2, tmp5); + + temp_src = src - 3; + VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4); +} + +static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + b_limit = (v16u8)__msa_fill_b(b_limit_in); + limit = (v16u8)__msa_fill_b(limit_in); + thresh = (v16u8)__msa_fill_b(thresh_in); + + LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14, + row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4); + ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7); + ILVRL_B2_SH(q2, q1, tmp2, tmp5); + + src_u -= 3; + VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4); + + src_v -= 3; + VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4); +} + +void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr) { + v16u8 p1, p0, q1, q0; + v16u8 mask, b_limit; + + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1); + VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); + ST_UB2(p0, q0, (src - pitch), pitch); +} + +void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr) { + uint8_t *temp_src; + v16u8 p1, p0, q1, q0; + v16u8 mask, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1; + + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + temp_src = src - 2; + LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p1, p0, + q0, q1); + VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + VP8_SIMPLE_FILT(p1, p0, q0, q1, mask); + ILVRL_B2_SH(q0, p0, tmp1, tmp0); + + src -= 1; + ST2x4_UB(tmp1, 0, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp1, 4, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp0, 0, src, pitch); + src += 4 * pitch; + ST2x4_UB(tmp0, 4, src, pitch); + src += 4 * pitch; +} + +static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = (v16u8)__msa_fill_b(thresh_in); + limit = (v16u8)__msa_fill_b(limit_in); + b_limit = (v16u8)__msa_fill_b(b_limit_in); + + src_u = src_u - (pitch << 2); + LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u); + src_u += (5 * pitch); + src_v = src_v - (pitch << 2); + LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v); + src_v += (5 * pitch); + + /* right 8 element of p3 are u pixel and + left 8 element of p3 are v pixel */ + ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0); + ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + p1_d = __msa_copy_u_d((v2i64)p1, 0); + p0_d = __msa_copy_u_d((v2i64)p0, 0); + q0_d = __msa_copy_u_d((v2i64)q0, 0); + q1_d = __msa_copy_u_d((v2i64)q1, 0); + SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch)); + + p1_d = __msa_copy_u_d((v2i64)p1, 1); + p0_d = __msa_copy_u_d((v2i64)p0, 1); + q0_d = __msa_copy_u_d((v2i64)q0, 1); + q1_d = __msa_copy_u_d((v2i64)q1, 1); + SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch)); +} + +static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src_u, *temp_src_v; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 mask, hev, flat, thresh, limit, b_limit; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; + v16u8 row9, row10, row11, row12, row13, row14, row15; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = (v16u8)__msa_fill_b(thresh_in); + limit = (v16u8)__msa_fill_b(limit_in); + b_limit = (v16u8)__msa_fill_b(b_limit_in); + + LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14, + row15); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3); + tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1); + tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0); + ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5); + + temp_src_u = src_u - 2; + ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch); + temp_src_u += 4 * pitch; + ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch); + + temp_src_v = src_v - 2; + ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch); + temp_src_v += 4 * pitch; + ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch); +} + +void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_horizontal_edge_uv_msa( + src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_vertical_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v, + *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_horizontal_4_dual_msa(src_y + 4 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_msa(src_y + 8 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_msa(src_y + 12 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_horizontal_edge_uv_msa( + src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_vertical_4_dual_msa(src_y + 4, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_msa(src_y + 8, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_msa(src_y + 12, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y, + const uint8_t *b_limit_ptr) { + vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * pitch_y), pitch_y, + b_limit_ptr); + vp8_loop_filter_simple_horizontal_edge_msa(src_y + (8 * pitch_y), pitch_y, + b_limit_ptr); + vp8_loop_filter_simple_horizontal_edge_msa(src_y + (12 * pitch_y), pitch_y, + b_limit_ptr); +} + +void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y, + const uint8_t *b_limit_ptr) { + vp8_loop_filter_simple_vertical_edge_msa(src_y + 4, pitch_y, b_limit_ptr); + vp8_loop_filter_simple_vertical_edge_msa(src_y + 8, pitch_y, b_limit_ptr); + vp8_loop_filter_simple_vertical_edge_msa(src_y + 12, pitch_y, b_limit_ptr); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c new file mode 100644 index 0000000000..9aac95b2fa --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/postproc.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) { + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src0); + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst0, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + } +} + +static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + v16i8 src0, src1, src2, src3; + v16i8 dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt; + v8i16 res_h_r, res_h_l; + v8i16 src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) { + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + } +} + +void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride, + src_weight); +} + +void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, src_weight); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c new file mode 100644 index 0000000000..3a1bb7cd57 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c @@ -0,0 +1,1738 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vp8/common/filter.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = { + { 0, -6, 123, 12, -1, 0, 0, 0 }, + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ + filt_h2) \ + ({ \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ + \ + VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ + \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ + \ + _6tap_out_m; \ + }) + +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1) \ + { \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ + } + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ + } + +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ + ({ \ + v8i16 _4tap_dpadd_tmp0; \ + \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ + \ + _4tap_dpadd_tmp0; \ + }) + +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + } + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + } + +static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); + src -= 2; + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); + src -= 2; + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); + src -= 2; + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + v16u8 mask0, mask1, mask2, out; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); + src -= 2; + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + src += (4 * src_stride); + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + XORI_B2_128_SB(src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776); + XORI_B2_128_SB(src6554, src8776); + out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); + out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); + SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + +static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + v16i8 src109_r, filt0, filt1, filt2; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r, + src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l, filt0, filt1, filt2; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (2 * src_stride); + + filt = LD_SH(filter); + SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, src32_r, + src43_r, src21_r); + ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, src32_l, + src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, + src76_l, src87_l); + out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 mask0, mask1, mask2, out; + v8i16 tmp0, tmp1; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); + src -= (2 + 2 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB2(src, src_stride, src5, src6); + src += (2 * src_stride); + + XORI_B2_128_SB(src5, src6); + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); + + LD_SB2(src, src_stride, src7, src8); + src += (2 * src_stride); + + XORI_B2_128_SB(src7, src8); + hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 mask0, mask1, mask2, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 tmp0, tmp1, tmp2, tmp3; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); + src -= (2 + 2 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5); + tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + +static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1; + v16u8 out; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); + src -= 1; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); + src -= 1; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; + v16u8 tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); + src -= 1; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 filt0, filt1, mask0, mask1; + v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 out; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); + src -= 1; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + mask1 = mask0 + 2; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); + SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0, out1, out2, out3, 7); + SAT_SH4_SH(out4, out5, out6, out7, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out4, out5); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out6, out7); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + v16i8 src2110, src4332, filt0, filt1; + v8i16 filt, out10, out32; + v16u8 out; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r); + src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB3(src, src_stride, src3, src4, src5); + src += (3 * src_stride); + ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); + src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r); + src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128); + out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); + + src2 = LD_SB(src); + src += (src_stride); + ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r); + src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r); + src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); + out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); + SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src7, src8, src9, src10; + v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, src72_r, + src87_r, src98_r, src109_r); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + + filt = LD_SH(filter); + SPLATI_H2_SB(filt, 0, 1, filt0, filt1); + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); + ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, + src54_r, src65_r); + ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l, + src54_l, src65_l); + out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); + out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1); + out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1); + out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1); + out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); + out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); + out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); + out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + v16u8 mask0, mask1, out; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); + src -= (1 + 1 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B2_128_SB(src3, src4); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); + vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + XORI_B2_128_SB(src5, src6); + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + v16u8 mask0, mask1, out0, out1; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 vec0, vec1, vec2, vec3, vec4; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); + src -= (1 + 1 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3); + tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); + tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + vec0 = vec4; + vec2 = vec1; + } +} + +static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 filt_hz0, filt_hz1, filt_hz2; + v16u8 res0, res1, mask0, mask1, mask2; + v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + + mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); + src -= (2 + 1 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); + vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + XORI_B2_128_UB(res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6; + v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + v16u8 out0, out1; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); + src -= (2 + src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + + LD_SB3(src, src_stride, src0, src1, src2); + src += (3 * src_stride); + + XORI_B3_128_SB(src0, src1, src2); + hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); + + filt = LD_SH(filter_vert); + SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src3, src4, src5, src6); + src += (4 * src_stride); + + XORI_B4_128_SB(src3, src4, src5, src6); + + hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); + tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3); + tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); + tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + out0 = PCKEV_XORI128_UB(tmp0, tmp1); + out1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, mask0, mask1; + v16u8 out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); + + src -= (1 + 2 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + XORI_B4_128_SB(src5, src6, src7, src8); + src += (4 * src_stride); + + hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H2_SH(tmp0, tmp1, 7); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 filt_hz0, filt_hz1, mask0, mask1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 vec0, vec1; + + mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); + src -= (1 + 2 * src_stride); + + filt = LD_SH(filter_horiz); + SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); + + mask1 = mask0 + 2; + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + XORI_B5_128_SB(src0, src1, src2, src3, src4); + hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); + + filt = LD_SH(filter_vert); + SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src5, src6, src7, src8); + src += (4 * src_stride); + + XORI_B4_128_SB(src5, src6, src7, src8); + + hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5); + tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + switch (xoffset) { + case 2: + case 4: + case 6: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 4); + break; + } + break; + + case 1: + case 3: + case 5: + case 7: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset) { + case 2: + case 4: + case 6: + common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: { + uint32_t tp0, tp1, tp2, tp3; + + LW4(src, src_stride, tp0, tp1, tp2, tp3); + SW4(tp0, tp1, tp2, tp3, dst, dst_stride); + break; + } + case 2: + case 4: + case 6: + common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4); + break; + } + } +} + +void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + switch (xoffset) { + case 2: + case 4: + case 6: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 4); + break; + } + break; + + case 1: + case 3: + case 5: + case 7: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset) { + case 2: + case 4: + case 6: + common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4); + break; + + case 1: + case 3: + case 5: + case 7: + common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4); + break; + } + } +} + +void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + switch (xoffset) { + case 2: + case 4: + case 6: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter, 8); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 8); + break; + } + break; + + case 1: + case 3: + case 5: + case 7: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 8); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 8); + break; + } + break; + } + } else { + switch (yoffset) { + case 2: + case 4: + case 6: + common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8); + break; + + case 1: + case 3: + case 5: + case 7: + common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1, + 8); + break; + } + } + } else { + switch (xoffset) { + case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8); + break; + + case 1: + case 3: + case 5: + case 7: + common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8); + break; + } + } +} + +void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1]; + + if (yoffset) { + if (xoffset) { + switch (xoffset) { + case 2: + case 4: + case 6: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter, 16); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 16); + break; + } + break; + + case 1: + case 3: + case 5: + case 7: + switch (yoffset) { + case 2: + case 4: + case 6: + common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 16); + break; + + case 1: + case 3: + case 5: + case 7: + common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 16); + break; + } + break; + } + } else { + switch (yoffset) { + case 2: + case 4: + case 6: + common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16); + break; + + case 1: + case 3: + case 5: + case 7: + common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1, + 16); + break; + } + } + } else { + switch (xoffset) { + case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16); + break; + + case 1: + case 3: + case 5: + case 7: + common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1, + 16); + break; + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h new file mode 100644 index 0000000000..cc85b9a1f7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -0,0 +1,1762 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ + }) +#endif // (__mips == 64) +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)); \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)); \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) +#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \ + out0, out1, out2) \ + { \ + VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \ + out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4); \ + } +#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : halfword elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ + } +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of it + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Clips all signed word elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed word +*/ +#define CLIP_SW_0_255(in) \ + ({ \ + v4i32 max_m = __msa_ldi_w(255); \ + v4i32 out_m; \ + \ + out_m = __msa_maxi_s_w((v4i32)in, 0); \ + out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \ + out_m; \ + }) + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) +#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) +#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) +#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Maximum values between signed elements of vector and + 5-bit signed immediate value are copied to the output vector + Arguments : Inputs - in0, in1, in2, in3, max_val + Outputs - in place operation + Return Type - unsigned halfword + Details : Maximum of signed halfword element values from 'in0' and + 'max_val' are written in place +*/ +#define MAXI_SH2(RTYPE, in0, in1, max_val) \ + { \ + in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \ + in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \ + } +#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } +#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__) +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2); \ + } +#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__) +#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__) + +/* Description : Indexed word element values are replicated to all + elements in output vector + Arguments : Inputs - in, stidx + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'stidx' element value from 'in' vector is replicated to all + elements in 'out0' vector + 'stidx + 1' element value from 'in' vector is replicated to all + elements in 'out1' vector + Valid index range for word operation is 0-3 +*/ +#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx); \ + out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx + 1)); \ + } +#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) + +/* Description : Pack odd double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Odd double word elements of 'in0' are copied to the left half + of 'out0' & odd double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__) +#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \ + { \ + XORI_B3_128(RTYPE, in0, in1, in2); \ + XORI_B2_128(RTYPE, in3, in4); \ + } +#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__) + +#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B4_128(RTYPE, in4, in5, in6, in7); \ + } +#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift); \ + SRAR_W2(RTYPE, in2, in3, shift); \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3) \ + { \ + v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \ + out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ + \ + ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ + out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \ + \ + ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \ + \ + tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ + ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ + \ + tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ + ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ + out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + \ + tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \ + tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m); \ + out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + } + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } + +/* Description : Dot product and addition of 3 signed halfword input vectors + Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 + Output - out0_m + Return Type - signed halfword + Details : Dot product of 'in0' with 'coeff0' + Dot product of 'in1' with 'coeff1' + Dot product of 'in2' with 'coeff2' + Addition of all the 3 vector results + out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) +*/ +#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ + ({ \ + v8i16 tmp1_m; \ + v8i16 out0_m; \ + \ + out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0); \ + out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1); \ + tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2); \ + out0_m = __msa_adds_s_h(out0_m, tmp1_m); \ + \ + out0_m; \ + }) + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vp8/common/modecont.c b/media/libvpx/libvpx/vp8/common/modecont.c new file mode 100644 index 0000000000..bab410374f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/modecont.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "entropy.h" + +const int vp8_mode_contexts[6][4] = { + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, +}; diff --git a/media/libvpx/libvpx/vp8/common/modecont.h b/media/libvpx/libvpx/vp8/common/modecont.h new file mode 100644 index 0000000000..031f74f2ff --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/modecont.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_MODECONT_H_ +#define VPX_VP8_COMMON_MODECONT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern const int vp8_mode_contexts[6][4]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_MODECONT_H_ diff --git a/media/libvpx/libvpx/vp8/common/mv.h b/media/libvpx/libvpx/vp8/common/mv.h new file mode 100644 index 0000000000..4cde12f201 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mv.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_MV_H_ +#define VPX_VP8_COMMON_MV_H_ +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + short row; + short col; +} MV; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_MV_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h new file mode 100644 index 0000000000..1b70ea5dba --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/onyx.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ONYX_H_ +#define VPX_VP8_COMMON_ONYX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vpx_scale/yv12config.h" +#include "ppflags.h" + +struct VP8_COMP; + +/* Create/destroy static data structures. */ + +typedef enum { + USAGE_LOCAL_FILE_PLAYBACK = 0x0, + USAGE_STREAM_FROM_SERVER = 0x1, + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3 +} END_USAGE; + +typedef enum { + MODE_REALTIME = 0x0, + MODE_GOODQUALITY = 0x1, + MODE_BESTQUALITY = 0x2, + MODE_FIRSTPASS = 0x3, + MODE_SECONDPASS = 0x4, + MODE_SECONDPASS_BEST = 0x5 +} MODE; + +typedef enum { + FRAMEFLAGS_KEY = 1, + FRAMEFLAGS_GOLDEN = 2, + FRAMEFLAGS_ALTREF = 4 +} FRAMETYPE_FLAGS; + +#include +static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { + switch (mode) { + case VP8E_NORMAL: + *hr = 1; + *hs = 1; + break; + case VP8E_FOURFIVE: + *hr = 4; + *hs = 5; + break; + case VP8E_THREEFIVE: + *hr = 3; + *hs = 5; + break; + case VP8E_ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +typedef struct { + /* 4 versions of bitstream defined: + * 0 best quality/slowest decode, 3 lowest quality/fastest decode + */ + int Version; + int Width; + int Height; + struct vpx_rational timebase; + /* In either kilobits per second or bits per second, depending on which + * copy of oxcf this is in. + * - ctx->oxcf.target_bandwidth is in kilobits per second. See + * set_vp8e_config(). + * - ctx->cpi->oxcf.target_bandwidth in is bits per second. See + * vp8_change_config(). + */ + unsigned int target_bandwidth; + + /* Parameter used for applying denoiser. + * For temporal denoiser: noise_sensitivity = 0 means off, + * noise_sensitivity = 1 means temporal denoiser on for Y channel only, + * noise_sensitivity = 2 means temporal denoiser on for all channels. + * noise_sensitivity = 3 means aggressive denoising mode. + * noise_sensitivity >= 4 means adaptive denoising mode. + * Temporal denoiser is enabled via the configuration option: + * CONFIG_TEMPORAL_DENOISING. + * For spatial denoiser: noise_sensitivity controls the amount of + * pre-processing blur: noise_sensitivity = 0 means off. + * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING. + */ + int noise_sensitivity; + + /* parameter used for sharpening output: recommendation 0: */ + int Sharpness; + int cpu_used; + unsigned int rc_max_intra_bitrate_pct; + /* percent of rate boost for golden frame in CBR mode. */ + unsigned int gf_cbr_boost_pct; + unsigned int screen_content_mode; + + /* mode -> + *(0)=Realtime/Live Encoding. This mode is optimized for realtim + * encoding (for example, capturing a television signal or feed + * from a live camera). ( speed setting controls how fast ) + *(1)=Good Quality Fast Encoding. The encoder balances quality with + * the amount of time it takes to encode the output. ( speed + * setting controls how fast ) + *(2)=One Pass - Best Quality. The encoder places priority on the + * quality of the output over encoding speed. The output is + * compressed at the highest possible quality. This option takes + * the longest amount of time to encode. ( speed setting ignored + * ) + *(3)=Two Pass - First Pass. The encoder generates a file of + * statistics for use in the second encoding pass. ( speed + * setting controls how fast ) + *(4)=Two Pass - Second Pass. The encoder uses the statistics that + * were generated in the first encoding pass to create the + * compressed output. ( speed setting controls how fast ) + *(5)=Two Pass - Second Pass Best. The encoder uses the statistics + * that were generated in the first encoding pass to create the + * compressed output using the highest possible quality, and + * taking a longer amount of time to encode.. ( speed setting + * ignored ) + */ + int Mode; + + /* Key Framing Operations */ + int auto_key; /* automatically detect cut scenes */ + int key_freq; /* maximum distance to key frame. */ + + /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */ + int allow_lag; + int lag_in_frames; /* how many frames lag before we start encoding */ + + /* + * DATARATE CONTROL OPTIONS + */ + + int end_usage; /* vbr or cbr */ + + /* buffer targeting aggressiveness */ + int under_shoot_pct; + int over_shoot_pct; + + /* buffering parameters */ + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + + int64_t starting_buffer_level_in_ms; + int64_t optimal_buffer_level_in_ms; + int64_t maximum_buffer_size_in_ms; + + /* controlling quality */ + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + + /* allow internal resizing */ + int allow_spatial_resampling; + int resample_down_water_mark; + int resample_up_water_mark; + + /* allow internal frame rate alterations */ + int allow_df; + int drop_frames_water_mark; + + /* two pass datarate control */ + int two_pass_vbrbias; + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + + /* + * END DATARATE CONTROL OPTIONS + */ + + /* these parameters aren't to be used in final build don't use!!! */ + int play_alternate; + int alt_freq; + int alt_q; + int key_q; + int gold_q; + + int multi_threaded; /* how many threads to run the encoder on */ + int token_partitions; /* how many token partitions to create */ + + /* early breakout threshold: for video conf recommend 800 */ + int encode_breakout; + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + int arnr_max_frames; + int arnr_strength; + int arnr_type; + + vpx_fixed_buf_t two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; + + vp8e_tuning tuning; + + /* Temporal scaling parameters */ + unsigned int number_of_layers; + /* kilobits per second */ + unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; + unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; + unsigned int periodicity; + unsigned int layer_id[VPX_TS_MAX_PERIODICITY]; + +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void *mr_low_res_mode_info; +#endif +} VP8_CONFIG; + +void vp8_initialize(); + +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); +void vp8_remove_compressor(struct VP8_COMP **comp); + +void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); + +int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time); +int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags, + size_t *size, unsigned char *dest, + unsigned char *dest_end, int64_t *time_stamp, + int64_t *time_end, int flush); +int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp8_ppflags_t *flags); + +int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_get_reference(struct VP8_COMP *cpi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd); +int vp8_set_reference(struct VP8_COMP *cpi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd); +int vp8_update_entropy(struct VP8_COMP *cpi, int update); +int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[4], int delta_lf[4], + unsigned int threshold[4]); +int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); +int vp8_get_quantizer(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_COMMON_ONYX_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyxc_int.h b/media/libvpx/libvpx/vp8/common/onyxc_int.h new file mode 100644 index 0000000000..ef8d007620 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/onyxc_int.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ONYXC_INT_H_ +#define VPX_VP8_COMMON_ONYXC_INT_H_ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "loopfilter.h" +#include "entropymv.h" +#include "entropy.h" +#if CONFIG_POSTPROC +#include "postproc.h" +#endif + +/*#ifdef PACKET_TESTING*/ +#include "header.h" +/*#endif*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MINQ 0 +#define MAXQ 127 +#define QINDEX_RANGE (MAXQ + 1) + +#define NUM_YV12_BUFFERS 4 + +#define MAX_PARTITIONS 9 + +typedef struct frame_contexts { + vp8_prob bmode_prob[VP8_BINTRAMODES - 1]; + vp8_prob ymode_prob[VP8_YMODES - 1]; /* interframe intra mode probs */ + vp8_prob uv_mode_prob[VP8_UV_MODES - 1]; + vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1]; + vp8_prob coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]; + MV_CONTEXT mvc[2]; +} FRAME_CONTEXT; + +typedef enum { + ONE_PARTITION = 0, + TWO_PARTITION = 1, + FOUR_PARTITION = 2, + EIGHT_PARTITION = 3 +} TOKEN_PARTITION; + +typedef enum { + RECON_CLAMP_REQUIRED = 0, + RECON_CLAMP_NOTREQUIRED = 1 +} CLAMP_TYPE; + +typedef struct VP8Common { + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]); + + int Width; + int Height; + int horiz_scale; + int vert_scale; + + CLAMP_TYPE clamp_type; + + YV12_BUFFER_CONFIG *frame_to_show; + + YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; + int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; + + YV12_BUFFER_CONFIG temp_scale_frame; + +#if CONFIG_POSTPROC + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG post_proc_buffer_int; + int post_proc_buffer_int_used; + unsigned char *pp_limits_buffer; /* post-processing filter coefficients */ +#endif + + FRAME_TYPE + last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE frame_type; + + int show_frame; + + int frame_flags; + int MBs; + int mb_rows; + int mb_cols; + int mode_info_stride; + + /* profile settings */ + int mb_no_coeff_skip; + int no_lpf; + int use_bilinear_mc_filter; + int full_pixel; + + int base_qindex; + + int y1dc_delta_q; + int y2dc_delta_q; + int y2ac_delta_q; + int uvdc_delta_q; + int uvac_delta_q; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ +#if CONFIG_ERROR_CONCEALMENT + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ +#endif + /* MODE_INFO for the last decoded frame to show */ + MODE_INFO *show_frame_mi; + LOOPFILTERTYPE filter_type; + + loop_filter_info_n lf_info; + + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ + + int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ + int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ + + int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ + ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */ + + FRAME_CONTEXT lfc; /* last frame entropy */ + FRAME_CONTEXT fc; /* this frame entropy */ + + unsigned int current_video_frame; + + int version; + + TOKEN_PARTITION multi_token_partition; + +#ifdef PACKET_TESTING + VP8_HEADER oh; +#endif + +#if CONFIG_MULTITHREAD + int processor_core_count; +#endif +#if CONFIG_POSTPROC + struct postproc_state postproc_state; +#endif + int cpu_caps; +} VP8_COMMON; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_ONYXC_INT_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyxd.h b/media/libvpx/libvpx/vp8/common/onyxd.h new file mode 100644 index 0000000000..217a598de7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/onyxd.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ONYXD_H_ +#define VPX_VP8_COMMON_ONYXD_H_ + +/* Create/destroy static data structures. */ +#ifdef __cplusplus +extern "C" { +#endif +#include "vpx_scale/yv12config.h" +#include "ppflags.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_codec.h" +#include "vpx/vp8.h" + +struct VP8D_COMP; +struct VP8Common; + +typedef struct { + int Width; + int Height; + int Version; + int postprocess; + int max_threads; + int error_concealment; +} VP8D_CONFIG; + +typedef enum { VP8D_OK = 0 } VP8D_SETTING; + +void vp8dx_initialize(void); + +void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); + +int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); + +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi); +int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, + vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); + +vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd); +vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd); +int vp8dx_get_quantizer(const struct VP8D_COMP *pbi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_COMMON_ONYXD_H_ diff --git a/media/libvpx/libvpx/vp8/common/postproc.c b/media/libvpx/libvpx/vp8/common/postproc.c new file mode 100644 index 0000000000..c03b16b2f5 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/postproc.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vpx_dsp_rtcd.h" +#include "vp8_rtcd.h" +#include "vpx_dsp/postproc.h" +#include "vpx_ports/system_state.h" +#include "vpx_scale_rtcd.h" +#include "vpx_scale/yv12config.h" +#include "postproc.h" +#include "common.h" +#include "vpx_scale/vpx_scale.h" +#include "systemdependent.h" + +#include +#include +#include +#include + +/* clang-format off */ +#define RGB_TO_YUV(t) \ + (unsigned char)((0.257 * (float)(t >> 16)) + \ + (0.504 * (float)(t >> 8 & 0xff)) + \ + (0.098 * (float)(t & 0xff)) + 16), \ + (unsigned char)(-(0.148 * (float)(t >> 16)) - \ + (0.291 * (float)(t >> 8 & 0xff)) + \ + (0.439 * (float)(t & 0xff)) + 128), \ + (unsigned char)((0.439 * (float)(t >> 16)) - \ + (0.368 * (float)(t >> 8 & 0xff)) - \ + (0.071 * (float)(t & 0xff)) + 128) +/* clang-format on */ + +extern void vp8_blit_text(const char *msg, unsigned char *address, + const int pitch); +extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, + const int pitch); +/*********************************************************************************************************** + */ +#if CONFIG_POSTPROC +static int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) { + vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); + vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); +} + +void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, int q) { + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + + const MODE_INFO *mode_info_context = cm->mi; + int mbr, mbc; + + /* The pixel thresholds are adjusted according to if or not the macroblock + * is a skipped block. */ + unsigned char *ylimits = cm->pp_limits_buffer; + unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols; + + if (ppl > 0) { + for (mbr = 0; mbr < cm->mb_rows; ++mbr) { + unsigned char *ylptr = ylimits; + unsigned char *uvlptr = uvlimits; + for (mbc = 0; mbc < cm->mb_cols; ++mbc) { + unsigned char mb_ppl; + + if (mode_info_context->mbmi.mb_skip_coeff) { + mb_ppl = (unsigned char)ppl >> 1; + } else { + mb_ppl = (unsigned char)ppl; + } + + memset(ylptr, mb_ppl, 16); + memset(uvlptr, mb_ppl, 8); + + ylptr += 16; + uvlptr += 8; + mode_info_context++; + } + mode_info_context++; + + vpx_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + post->y_buffer + 16 * mbr * post->y_stride, source->y_stride, + post->y_stride, source->y_width, ylimits, 16); + + vpx_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + vpx_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride, + post->uv_stride, source->uv_width, uvlimits, 8); + } + } else { + vp8_yv12_copy_frame(source, post); + } +} + +void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, int q, + int uvfilter) { + int mbr; + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + unsigned char *limits = cm->pp_limits_buffer; + + memset(limits, (unsigned char)ppl, 16 * mb_cols); + + /* TODO: The original code don't filter the 2 outer rows and columns. */ + for (mbr = 0; mbr < mb_rows; ++mbr) { + vpx_post_proc_down_and_across_mb_row( + source->y_buffer + 16 * mbr * source->y_stride, + source->y_buffer + 16 * mbr * source->y_stride, source->y_stride, + source->y_stride, source->y_width, limits, 16); + if (uvfilter == 1) { + vpx_post_proc_down_and_across_mb_row( + source->u_buffer + 8 * mbr * source->uv_stride, + source->u_buffer + 8 * mbr * source->uv_stride, source->uv_stride, + source->uv_stride, source->uv_width, limits, 8); + vpx_post_proc_down_and_across_mb_row( + source->v_buffer + 8 * mbr * source->uv_stride, + source->v_buffer + 8 * mbr * source->uv_stride, source->uv_stride, + source->uv_stride, source->uv_width, limits, 8); + } + } +} +#endif // CONFIG_POSTPROC + +#if CONFIG_POSTPROC +int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, + vp8_ppflags_t *ppflags) { + int q = oci->filter_level * 10 / 6; + int flags = ppflags->post_proc_flag; + int deblock_level = ppflags->deblocking_level; + int noise_level = ppflags->noise_level; + + if (!oci->frame_to_show) return -1; + + if (q > 63) q = 63; + + if (!flags) { + *dest = *oci->frame_to_show; + + /* handle problem with extending borders */ + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + oci->postproc_state.last_base_qindex = oci->base_qindex; + oci->postproc_state.last_frame_valid = 1; + return 0; + } + if (flags & VP8D_ADDNOISE) { + if (!oci->postproc_state.generated_noise) { + oci->postproc_state.generated_noise = vpx_calloc( + oci->Width + 256, sizeof(*oci->postproc_state.generated_noise)); + if (!oci->postproc_state.generated_noise) return 1; + } + } + + /* Allocate post_proc_buffer_int if needed */ + if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used) { + if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) { + int width = (oci->Width + 15) & ~15; + int height = (oci->Height + 15) & ~15; + + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, width, height, + VP8BORDERINPIXELS)) { + vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate MFQE framebuffer"); + } + + oci->post_proc_buffer_int_used = 1; + + /* insure that postproc is set to all 0's so that post proc + * doesn't pull random data in from edge + */ + memset((&oci->post_proc_buffer_int)->buffer_alloc, 128, + (&oci->post_proc_buffer)->frame_size); + } + } + + vpx_clear_system_state(); + + if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && + oci->current_video_frame > 10 && + oci->postproc_state.last_base_qindex < 60 && + oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) { + vp8_multiframe_quality_enhance(oci); + if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) && + oci->post_proc_buffer_int_used) { + vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int); + if (flags & VP8D_DEMACROBLOCK) { + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10); + vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); + } else if (flags & VP8D_DEBLOCK) { + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q); + } + } + /* Move partially towards the base q of the previous frame */ + oci->postproc_state.last_base_qindex = + (3 * oci->postproc_state.last_base_qindex + oci->base_qindex) >> 2; + } else if (flags & VP8D_DEMACROBLOCK) { + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10); + vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); + + oci->postproc_state.last_base_qindex = oci->base_qindex; + } else if (flags & VP8D_DEBLOCK) { + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q); + oci->postproc_state.last_base_qindex = oci->base_qindex; + } else { + vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer); + oci->postproc_state.last_base_qindex = oci->base_qindex; + } + oci->postproc_state.last_frame_valid = 1; + + if (flags & VP8D_ADDNOISE) { + if (oci->postproc_state.last_q != q || + oci->postproc_state.last_noise != noise_level) { + double sigma; + struct postproc_state *ppstate = &oci->postproc_state; + vpx_clear_system_state(); + sigma = noise_level + .5 + .6 * q / 63.0; + ppstate->clamp = + vpx_setup_noise(sigma, ppstate->generated_noise, oci->Width + 256); + ppstate->last_q = q; + ppstate->last_noise = noise_level; + } + + vpx_plane_add_noise( + oci->post_proc_buffer.y_buffer, oci->postproc_state.generated_noise, + oci->postproc_state.clamp, oci->postproc_state.clamp, + oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height, + oci->post_proc_buffer.y_stride); + } + + *dest = oci->post_proc_buffer; + + /* handle problem with extending borders */ + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + return 0; +} +#endif diff --git a/media/libvpx/libvpx/vp8/common/postproc.h b/media/libvpx/libvpx/vp8/common/postproc.h new file mode 100644 index 0000000000..492c52aef6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/postproc.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_POSTPROC_H_ +#define VPX_VP8_COMMON_POSTPROC_H_ + +#include "vpx_ports/mem.h" +struct postproc_state { + int last_q; + int last_noise; + int last_base_qindex; + int last_frame_valid; + int clamp; + int8_t *generated_noise; +}; +#include "onyxc_int.h" +#include "ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif +int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, + vp8_ppflags_t *ppflags); + +void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, int q, + int uvfilter); + +void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, int q); + +#define MFQE_PRECISION 4 + +void vp8_multiframe_quality_enhance(struct VP8Common *cm); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vp8/common/ppflags.h b/media/libvpx/libvpx/vp8/common/ppflags.h new file mode 100644 index 0000000000..bdf08734b9 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/ppflags.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_PPFLAGS_H_ +#define VPX_VP8_COMMON_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif +enum { + VP8D_NOFILTERING = 0, + VP8D_DEBLOCK = 1 << 0, + VP8D_DEMACROBLOCK = 1 << 1, + VP8D_ADDNOISE = 1 << 2, + VP8D_MFQE = 1 << 3 +}; + +typedef struct { + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp8_ppflags_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_PPFLAGS_H_ diff --git a/media/libvpx/libvpx/vp8/common/quant_common.c b/media/libvpx/libvpx/vp8/common/quant_common.c new file mode 100644 index 0000000000..e290eec92b --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/quant_common.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "quant_common.h" + +static const int dc_qlookup[QINDEX_RANGE] = { + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, + 17, 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 91, 93, 95, 96, 98, 100, 101, 102, 104, + 106, 108, 110, 112, 114, 116, 118, 122, 124, 126, 128, 130, 132, 134, 136, + 138, 140, 143, 145, 148, 151, 154, 157, +}; + +static const int ac_qlookup[QINDEX_RANGE] = { + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, + 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, + 100, 102, 104, 106, 108, 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, + 137, 140, 143, 146, 149, 152, 155, 158, 161, 164, 167, 170, 173, 177, 181, + 185, 189, 193, 197, 201, 205, 209, 213, 217, 221, 225, 229, 234, 239, 245, + 249, 254, 259, 264, 269, 274, 279, 284, +}; + +int vp8_dc_quant(int QIndex, int Delta) { + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + retval = dc_qlookup[QIndex]; + return retval; +} + +int vp8_dc2quant(int QIndex, int Delta) { + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + retval = dc_qlookup[QIndex] * 2; + return retval; +} +int vp8_dc_uv_quant(int QIndex, int Delta) { + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + retval = dc_qlookup[QIndex]; + + if (retval > 132) retval = 132; + + return retval; +} + +int vp8_ac_yquant(int QIndex) { + int retval; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + retval = ac_qlookup[QIndex]; + return retval; +} + +int vp8_ac2quant(int QIndex, int Delta) { + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16. + * The smallest precision for that is '(x*6349) >> 12' but 16 is a good + * word size. */ + retval = (ac_qlookup[QIndex] * 101581) >> 16; + + if (retval < 8) retval = 8; + + return retval; +} +int vp8_ac_uv_quant(int QIndex, int Delta) { + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) { + QIndex = 127; + } else if (QIndex < 0) { + QIndex = 0; + } + + retval = ac_qlookup[QIndex]; + return retval; +} diff --git a/media/libvpx/libvpx/vp8/common/quant_common.h b/media/libvpx/libvpx/vp8/common/quant_common.h new file mode 100644 index 0000000000..049840a272 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/quant_common.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_ +#define VPX_VP8_COMMON_QUANT_COMMON_H_ + +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern int vp8_ac_yquant(int QIndex); +extern int vp8_dc_quant(int QIndex, int Delta); +extern int vp8_dc2quant(int QIndex, int Delta); +extern int vp8_ac2quant(int QIndex, int Delta); +extern int vp8_dc_uv_quant(int QIndex, int Delta); +extern int vp8_ac_uv_quant(int QIndex, int Delta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_QUANT_COMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconinter.c b/media/libvpx/libvpx/vp8/common/reconinter.c new file mode 100644 index 0000000000..2cb0709318 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconinter.c @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/vpx_integer.h" +#include "blockd.h" +#include "reconinter.h" +#if CONFIG_RUNTIME_CPU_DETECT +#include "onyxc_int.h" +#endif + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + int r; + + for (r = 0; r < 16; ++r) { + memcpy(dst, src, 16); + + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + int r; + + for (r = 0; r < 8; ++r) { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + int r; + + for (r = 0; r < 4; ++r) { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + } +} + +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf) { + int r; + unsigned char *pred_ptr = d->predictor; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, + pred_ptr, pitch); + } else { + for (r = 0; r < 4; ++r) { + pred_ptr[0] = ptr[0]; + pred_ptr[1] = ptr[1]; + pred_ptr[2] = ptr[2]; + pred_ptr[3] = ptr[3]; + pred_ptr += pitch; + ptr += pre_stride; + } + } +} + +static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, + unsigned char *dst, int dst_stride, + unsigned char *base_pre, int pre_stride) { + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { + x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, + d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } else { + vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, + unsigned char *dst, int dst_stride, + unsigned char *base_pre, int pre_stride) { + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { + x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, + d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } else { + vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, + int dst_stride, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf) { + int r; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, + dst_stride); + } else { + for (r = 0; r < 4; ++r) { + dst[0] = ptr[0]; + dst[1] = ptr[1]; + dst[2] = ptr[2]; + dst[3] = ptr[3]; + dst += dst_stride; + ptr += pre_stride; + } + } +} + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x) { + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = &x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int offset; + int pre_stride = x->pre.uv_stride; + + /* calc uv motion vectors */ + mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1)); + mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1)); + mv_row /= 2; + mv_col /= 2; + mv_row &= x->fullpixel_mask; + mv_col &= x->fullpixel_mask; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, + 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, + 8); + } else { + vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8); + vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8); + } +} + +/*encoder only*/ +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) { + int i, j; + int pre_stride = x->pre.uv_stride; + unsigned char *base_pre; + + /* build uv mvs */ + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->block[yoffset].bmi.mv.as_mv.row + + x->block[yoffset + 1].bmi.mv.as_mv.row + + x->block[yoffset + 4].bmi.mv.as_mv.row + + x->block[yoffset + 5].bmi.mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->block[yoffset].bmi.mv.as_mv.col + + x->block[yoffset + 1].bmi.mv.as_mv.col + + x->block[yoffset + 4].bmi.mv.as_mv.col + + x->block[yoffset + 5].bmi.mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } + + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i + 1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) { + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + } else { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, + x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, + x->subpixel_predict); + } + } + + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i + 1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) { + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + } else { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, + x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, + x->subpixel_predict); + } + } +} + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y, + int dst_ystride) { + unsigned char *ptr_base; + unsigned char *ptr; + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int pre_stride = x->pre.y_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, + dst_ystride); + } else { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } +} + +static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { + /* If the MV points so far into the UMV border that no visible pixels + * are used for reconstruction, the subpel part of the MV can be + * discarded and the MV limited to 16 pixels with equivalent results. + * + * This limit kicks in at 19 pixels for the top and left edges, for + * the 16 pixels plus 3 taps right of the central pixel when subpel + * filtering. The bottom and right edges use 16 pixels plus 2 pixels + * left of the central pixel when filtering. + */ + if (mv->col < (xd->mb_to_left_edge - (19 << 3))) { + mv->col = xd->mb_to_left_edge - (16 << 3); + } else if (mv->col > xd->mb_to_right_edge + (18 << 3)) { + mv->col = xd->mb_to_right_edge + (16 << 3); + } + + if (mv->row < (xd->mb_to_top_edge - (19 << 3))) { + mv->row = xd->mb_to_top_edge - (16 << 3); + } else if (mv->row > xd->mb_to_bottom_edge + (18 << 3)) { + mv->row = xd->mb_to_bottom_edge + (16 << 3); + } +} + +/* A version of the above function for chroma block MVs.*/ +static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { + mv->col = (2 * mv->col < (xd->mb_to_left_edge - (19 << 3))) + ? (xd->mb_to_left_edge - (16 << 3)) >> 1 + : mv->col; + mv->col = (2 * mv->col > xd->mb_to_right_edge + (18 << 3)) + ? (xd->mb_to_right_edge + (16 << 3)) >> 1 + : mv->col; + + mv->row = (2 * mv->row < (xd->mb_to_top_edge - (19 << 3))) + ? (xd->mb_to_top_edge - (16 << 3)) >> 1 + : mv->row; + mv->row = (2 * mv->row > xd->mb_to_bottom_edge + (18 << 3)) + ? (xd->mb_to_bottom_edge + (16 << 3)) >> 1 + : mv->row; +} + +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, int dst_ystride, + int dst_uvstride) { + int offset; + unsigned char *ptr; + unsigned char *uptr, *vptr; + + int_mv _16x16mv; + + unsigned char *ptr_base = x->pre.y_buffer; + int pre_stride = x->pre.y_stride; + + _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) { + clamp_mv_to_umv_border(&_16x16mv.as_mv, x); + } + + ptr = ptr_base + (_16x16mv.as_mv.row >> 3) * pre_stride + + (_16x16mv.as_mv.col >> 3); + + if (_16x16mv.as_int & 0x00070007) { + x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, + _16x16mv.as_mv.row & 7, dst_y, dst_ystride); + } else { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } + + /* calc uv motion vectors */ + _16x16mv.as_mv.row += + 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.col += + 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + _16x16mv.as_mv.row &= x->fullpixel_mask; + _16x16mv.as_mv.col &= x->fullpixel_mask; + + if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) || + 2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) { + return; + } + + pre_stride >>= 1; + offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if (_16x16mv.as_int & 0x00070007) { + x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, + _16x16mv.as_mv.row & 7, dst_u, dst_uvstride); + x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, + _16x16mv.as_mv.row & 7, dst_v, dst_uvstride); + } else { + vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); + vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); + } +} + +static void build_inter4x4_predictors_mb(MACROBLOCKD *x) { + int i; + unsigned char *base_dst = x->dst.y_buffer; + unsigned char *base_pre = x->pre.y_buffer; + + if (x->mode_info_context->mbmi.partitioning < 3) { + BLOCKD *b; + int dst_stride = x->dst.y_stride; + + x->block[0].bmi = x->mode_info_context->bmi[0]; + x->block[2].bmi = x->mode_info_context->bmi[2]; + x->block[8].bmi = x->mode_info_context->bmi[8]; + x->block[10].bmi = x->mode_info_context->bmi[10]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) { + clamp_mv_to_umv_border(&x->block[0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[2].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[8].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x); + } + + b = &x->block[0]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, + dst_stride); + b = &x->block[2]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, + dst_stride); + b = &x->block[8]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, + dst_stride); + b = &x->block[10]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, + dst_stride); + } else { + for (i = 0; i < 16; i += 2) { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i + 1]; + int dst_stride = x->dst.y_stride; + + x->block[i + 0].bmi = x->mode_info_context->bmi[i + 0]; + x->block[i + 1].bmi = x->mode_info_context->bmi[i + 1]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) { + clamp_mv_to_umv_border(&x->block[i + 0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[i + 1].bmi.mv.as_mv, x); + } + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) { + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, + base_pre, dst_stride); + } else { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, + base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, + base_pre, dst_stride, x->subpixel_predict); + } + } + } + base_dst = x->dst.u_buffer; + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i + 1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) { + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, + base_pre, dst_stride); + } else { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, + dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, + dst_stride, x->subpixel_predict); + } + } + + base_dst = x->dst.v_buffer; + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i + 1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) { + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, + base_pre, dst_stride); + } else { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, + dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, + dst_stride, x->subpixel_predict); + } + } +} + +static void build_4x4uvmvs(MACROBLOCKD *x) { + int i, j; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 2; ++j) { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) { + clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x); + } + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } +} + +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) { + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.y_stride, + xd->dst.uv_stride); + } else { + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb(xd); + } +} diff --git a/media/libvpx/libvpx/vp8/common/reconinter.h b/media/libvpx/libvpx/vp8/common/reconinter.h new file mode 100644 index 0000000000..974e7ce754 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconinter.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_RECONINTER_H_ +#define VPX_VP8_COMMON_RECONINTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, int dst_ystride, + int dst_uvstride); + +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y, + int dst_ystride); +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf); + +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_RECONINTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconintra.c b/media/libvpx/libvpx/vp8/common/reconintra.c new file mode 100644 index 0000000000..8e2094da87 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconintra.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_once.h" +#include "blockd.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/reconintra4x4.h" + +enum { + SIZE_16, + SIZE_8, + NUM_SIZES, +}; + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[4][NUM_SIZES]; +static intra_pred_fn dc_pred[2][2][NUM_SIZES]; + +static void vp8_init_intra_predictors_internal(void) { +#define INIT_SIZE(sz) \ + pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \ + pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \ + pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \ + \ + dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \ + dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \ + dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \ + dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz + + INIT_SIZE(16); + INIT_SIZE(8); + vp8_init_intra4x4_predictors_internal(); +} + +void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, unsigned char *yabove_row, + unsigned char *yleft, int left_stride, + unsigned char *ypred_ptr, int y_stride) { + MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode; + DECLARE_ALIGNED(16, uint8_t, yleft_col[16]); + int i; + intra_pred_fn fn; + + for (i = 0; i < 16; ++i) { + yleft_col[i] = yleft[i * left_stride]; + } + + if (mode == DC_PRED) { + fn = dc_pred[x->left_available][x->up_available][SIZE_16]; + } else { + fn = pred[mode][SIZE_16]; + } + + fn(ypred_ptr, y_stride, yabove_row, yleft_col); +} + +void vp8_build_intra_predictors_mbuv_s( + MACROBLOCKD *x, unsigned char *uabove_row, unsigned char *vabove_row, + unsigned char *uleft, unsigned char *vleft, int left_stride, + unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { + MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else + unsigned char uleft_col[8]; + unsigned char vleft_col[8]; +#endif + int i; + intra_pred_fn fn; + + for (i = 0; i < 8; ++i) { + uleft_col[i] = uleft[i * left_stride]; + vleft_col[i] = vleft[i * left_stride]; + } + + if (uvmode == DC_PRED) { + fn = dc_pred[x->left_available][x->up_available][SIZE_8]; + } else { + fn = pred[uvmode][SIZE_8]; + } + + fn(upred_ptr, pred_stride, uabove_row, uleft_col); + fn(vpred_ptr, pred_stride, vabove_row, vleft_col); +} + +void vp8_init_intra_predictors(void) { + once(vp8_init_intra_predictors_internal); +} diff --git a/media/libvpx/libvpx/vp8/common/reconintra.h b/media/libvpx/libvpx/vp8/common/reconintra.h new file mode 100644 index 0000000000..029ac00a24 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconintra.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_RECONINTRA_H_ +#define VPX_VP8_COMMON_RECONINTRA_H_ + +#include "vp8/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, unsigned char *yabove_row, + unsigned char *yleft, int left_stride, + unsigned char *ypred_ptr, int y_stride); + +void vp8_build_intra_predictors_mbuv_s( + MACROBLOCKD *x, unsigned char *uabove_row, unsigned char *vabove_row, + unsigned char *uleft, unsigned char *vleft, int left_stride, + unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride); + +void vp8_init_intra_predictors(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_RECONINTRA_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.c b/media/libvpx/libvpx/vp8/common/reconintra4x4.c new file mode 100644 index 0000000000..be936df5e0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8_rtcd.h" +#include "blockd.h" +#include "reconintra4x4.h" +#include "vp8/common/common.h" +#include "vpx_ports/compiler_attributes.h" + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[10]; + +void vp8_init_intra4x4_predictors_internal(void) { + pred[B_DC_PRED] = vpx_dc_predictor_4x4; + pred[B_TM_PRED] = vpx_tm_predictor_4x4; + pred[B_VE_PRED] = vpx_ve_predictor_4x4; + pred[B_HE_PRED] = vpx_he_predictor_4x4; + pred[B_LD_PRED] = vpx_d45e_predictor_4x4; + pred[B_RD_PRED] = vpx_d135_predictor_4x4; + pred[B_VR_PRED] = vpx_d117_predictor_4x4; + pred[B_VL_PRED] = vpx_d63e_predictor_4x4; + pred[B_HD_PRED] = vpx_d153_predictor_4x4; + pred[B_HU_PRED] = vpx_d207_predictor_4x4; +} + +void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, + int left_stride, B_PREDICTION_MODE b_mode, + unsigned char *dst, int dst_stride, + unsigned char top_left) { +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similary for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; +#if HAVE_NEON + // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it + // over reads but does not use the extra 4 values. + unsigned char Left[8]; +#if VPX_WITH_ASAN + // Silence an 'uninitialized read' warning. Although uninitialized values are + // indeed read, they are not used. + vp8_zero_array(Left, 8); +#endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; +#else + unsigned char Left[4]; +#endif // HAVE_NEON + + Left[0] = yleft[0]; + Left[1] = yleft[left_stride]; + Left[2] = yleft[2 * left_stride]; + Left[3] = yleft[3 * left_stride]; + memcpy(Above, above, 8); + Above[-1] = top_left; + + pred[b_mode](dst, dst_stride, Above, Left); +} diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/libvpx/vp8/common/reconintra4x4.h new file mode 100644 index 0000000000..3618ec5cbe --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_ +#define VPX_VP8_COMMON_RECONINTRA4X4_H_ +#include "vp8/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd, + unsigned char *above_right_src) { + int dst_stride = xd->dst.y_stride; + unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16; + + unsigned int *src_ptr = (unsigned int *)above_right_src; + unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride); + unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride); + unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride); + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; +} + +void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, + int left_stride, B_PREDICTION_MODE b_mode, + unsigned char *dst, int dst_stride, + unsigned char top_left); + +void vp8_init_intra4x4_predictors_internal(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_RECONINTRA4X4_H_ diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c new file mode 100644 index 0000000000..09a0e2b4b3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/rtcd.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vp8_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vp8_rtcd() { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl new file mode 100644 index 0000000000..12b474d939 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl @@ -0,0 +1,244 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +sub vp8_common_forward_decls() { +print <y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + for (i = 0; i < ybf->y_height; ++i) { + ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char)129; + } + + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; ++i) { + ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char)129; + } + + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; ++i) { + ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char)129; + } +} + +void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf) { + memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); +} diff --git a/media/libvpx/libvpx/vp8/common/setupintrarecon.h b/media/libvpx/libvpx/vp8/common/setupintrarecon.h new file mode 100644 index 0000000000..903a536aed --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/setupintrarecon.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_SETUPINTRARECON_H_ +#define VPX_VP8_COMMON_SETUPINTRARECON_H_ + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); +extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf); + +static INLINE void setup_intra_recon_left(unsigned char *y_buffer, + unsigned char *u_buffer, + unsigned char *v_buffer, int y_stride, + int uv_stride) { + int i; + + for (i = 0; i < 16; ++i) y_buffer[y_stride * i] = (unsigned char)129; + + for (i = 0; i < 8; ++i) u_buffer[uv_stride * i] = (unsigned char)129; + + for (i = 0; i < 8; ++i) v_buffer[uv_stride * i] = (unsigned char)129; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_SETUPINTRARECON_H_ diff --git a/media/libvpx/libvpx/vp8/common/swapyv12buffer.c b/media/libvpx/libvpx/vp8/common/swapyv12buffer.c new file mode 100644 index 0000000000..5ff21e94a8 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/swapyv12buffer.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "swapyv12buffer.h" + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *last_frame) { + unsigned char *temp; + + temp = last_frame->buffer_alloc; + last_frame->buffer_alloc = new_frame->buffer_alloc; + new_frame->buffer_alloc = temp; + + temp = last_frame->y_buffer; + last_frame->y_buffer = new_frame->y_buffer; + new_frame->y_buffer = temp; + + temp = last_frame->u_buffer; + last_frame->u_buffer = new_frame->u_buffer; + new_frame->u_buffer = temp; + + temp = last_frame->v_buffer; + last_frame->v_buffer = new_frame->v_buffer; + new_frame->v_buffer = temp; +} diff --git a/media/libvpx/libvpx/vp8/common/swapyv12buffer.h b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h new file mode 100644 index 0000000000..e37c471f63 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_SWAPYV12BUFFER_H_ +#define VPX_VP8_COMMON_SWAPYV12BUFFER_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *last_frame); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/media/libvpx/libvpx/vp8/common/systemdependent.h b/media/libvpx/libvpx/vp8/common/systemdependent.h new file mode 100644 index 0000000000..83a5513aae --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/systemdependent.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ + +#include "vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8Common; +void vp8_machine_specific_config(struct VP8Common *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h new file mode 100644 index 0000000000..1cfb9fec51 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/threading.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_THREADING_H_ +#define VPX_VP8_COMMON_THREADING_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +/* Thread management macros */ +#if defined(_WIN32) && !HAVE_PTHREAD_H +/* Win32 */ +#include +#include +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREAD_FUNCTION \ + __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREAD_FUNCTION unsigned int __stdcall +#endif +#define THREAD_FUNCTION_RETURN DWORD +#define THREAD_SPECIFIC_INDEX DWORD +#define pthread_t HANDLE +#define pthread_attr_t DWORD +#define pthread_detach(thread) \ + if (thread != NULL) CloseHandle(thread) +#define thread_sleep(nms) Sleep(nms) +#define pthread_cancel(thread) terminate_thread(thread, 0) +#define ts_key_create(ts_key, destructor) \ + { ts_key = TlsAlloc(); }; +#define pthread_getspecific(ts_key) TlsGetValue(ts_key) +#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) +#define pthread_self() GetCurrentThreadId() + +#elif defined(__OS2__) +/* OS/2 */ +#define INCL_DOS +#include + +#include +#define THREAD_FUNCTION void * +#define THREAD_FUNCTION_RETURN void * +#define THREAD_SPECIFIC_INDEX PULONG +#define pthread_t TID +#define pthread_attr_t ULONG +#define pthread_detach(thread) 0 +#define thread_sleep(nms) DosSleep(nms) +#define pthread_cancel(thread) DosKillThread(thread) +#define ts_key_create(ts_key, destructor) \ + DosAllocThreadLocalMemory(1, &(ts_key)); +#define pthread_getspecific(ts_key) ((void *)(*(ts_key))) +#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value)) +#define pthread_self() _gettid() +#else +#ifdef __APPLE__ +#include +#include +#include +#include +#include + +#else +#include +#endif + +#include +/* pthreads */ +/* Nearly everything is already defined */ +#define THREAD_FUNCTION void * +#define THREAD_FUNCTION_RETURN void * +#define THREAD_SPECIFIC_INDEX pthread_key_t +#define ts_key_create(ts_key, destructor) \ + pthread_key_create(&(ts_key), destructor); +#endif + +/* Synchronization macros: Win32 and Pthreads */ +#if defined(_WIN32) && !HAVE_PTHREAD_H +#define sem_t HANDLE +#define pause(voidpara) __asm PAUSE +#define sem_init(sem, sem_attr1, sem_init_value) \ + (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL) +#define sem_wait(sem) \ + (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE)) +#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) +#define sem_destroy(sem) \ + if (*sem) ((int)(CloseHandle(*sem)) == TRUE) +#define thread_sleep(nms) Sleep(nms) + +#elif defined(__OS2__) +typedef struct { + HEV event; + HMTX wait_mutex; + HMTX count_mutex; + int count; +} sem_t; + +static inline int sem_init(sem_t *sem, int pshared, unsigned int value) { + DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0, + value > 0 ? TRUE : FALSE); + DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE); + DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE); + + sem->count = value; + + return 0; +} + +static inline int sem_wait(sem_t *sem) { + DosRequestMutexSem(sem->wait_mutex, -1); + + DosWaitEventSem(sem->event, -1); + + DosRequestMutexSem(sem->count_mutex, -1); + + sem->count--; + if (sem->count == 0) { + ULONG post_count; + + DosResetEventSem(sem->event, &post_count); + } + + DosReleaseMutexSem(sem->count_mutex); + + DosReleaseMutexSem(sem->wait_mutex); + + return 0; +} + +static inline int sem_post(sem_t *sem) { + DosRequestMutexSem(sem->count_mutex, -1); + + if (sem->count < 32768) { + sem->count++; + DosPostEventSem(sem->event); + } + + DosReleaseMutexSem(sem->count_mutex); + + return 0; +} + +static inline int sem_destroy(sem_t *sem) { + DosCloseEventSem(sem->event); + DosCloseMutexSem(sem->wait_mutex); + DosCloseMutexSem(sem->count_mutex); + + return 0; +} + +#define thread_sleep(nms) DosSleep(nms) + +#else + +#ifdef __APPLE__ +#define sem_t semaphore_t +#define sem_init(X, Y, Z) \ + semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) +#define sem_wait(sem) (semaphore_wait(*sem)) +#define sem_post(sem) semaphore_signal(*sem) +#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) +#else +#include +#include +#endif /* __APPLE__ */ +/* Not Windows. Assume pthreads */ + +/* thread_sleep implementation: yield unless Linux/Unix. */ +#if defined(__unix__) || defined(__APPLE__) +#define thread_sleep(nms) +/* {struct timespec ts;ts.tv_sec=0; + ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ +#else +#define thread_sleep(nms) sched_yield(); +#endif /* __unix__ || __APPLE__ */ + +#endif + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#else +#define x86_pause_hint() +#endif + +#include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_atomics.h" + +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { + x86_pause_hint(); + thread_sleep(0); + } +} + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_THREADING_H_ diff --git a/media/libvpx/libvpx/vp8/common/treecoder.c b/media/libvpx/libvpx/vp8/common/treecoder.c new file mode 100644 index 0000000000..f1e78f4321 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/treecoder.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vp8/common/treecoder.h" +#include "vpx/vpx_integer.h" + +static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v, + int L) { + v += v; + ++L; + + do { + const vp8_tree_index j = t[i++]; + + if (j <= 0) { + p[-j].value = v; + p[-j].Len = L; + } else { + tree2tok(p, t, j, v, L); + } + } while (++v & 1); +} + +void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t) { + tree2tok(p, t, 0, 0, 0); +} + +void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t, + int offset) { + tree2tok(p - offset, t, 0, 0, 0); +} + +static void branch_counts(int n, /* n = size of alphabet */ + vp8_token tok[/* n */], vp8_tree tree, + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { + const int tree_len = n - 1; + int t = 0; + + assert(tree_len); + + do { + branch_ct[t][0] = branch_ct[t][1] = 0; + } while (++t < tree_len); + + t = 0; + + do { + int L = tok[t].Len; + const int enc = tok[t].value; + const unsigned int ct = num_events[t]; + + vp8_tree_index i = 0; + + do { + const int b = (enc >> --L) & 1; + const int j = i >> 1; + assert(j < tree_len && 0 <= L); + + branch_ct[j][b] += ct; + i = tree[i + b]; + } while (i > 0); + + assert(!L); + } while (++t < n); +} + +void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ + vp8_token tok[/* n */], vp8_tree tree, + vp8_prob probs[/* n-1 */], + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */], + unsigned int Pfactor, int Round) { + const int tree_len = n - 1; + int t = 0; + + branch_counts(n, tok, tree, branch_ct, num_events); + + do { + const unsigned int *const c = branch_ct[t]; + const unsigned int tot = c[0] + c[1]; + + if (tot) { + const unsigned int p = + (unsigned int)(((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0)) / + tot; + probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ + } else { + probs[t] = vp8_prob_half; + } + } while (++t < tree_len); +} diff --git a/media/libvpx/libvpx/vp8/common/treecoder.h b/media/libvpx/libvpx/vp8/common/treecoder.h new file mode 100644 index 0000000000..d7d8d0ead0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/treecoder.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_TREECODER_H_ +#define VPX_VP8_COMMON_TREECODER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned char vp8bc_index_t; /* probability index */ + +typedef unsigned char vp8_prob; + +#define vp8_prob_half ((vp8_prob)128) + +typedef signed char vp8_tree_index; +struct bool_coder_spec; + +typedef struct bool_coder_spec bool_coder_spec; +typedef struct bool_writer bool_writer; +typedef struct bool_reader bool_reader; + +typedef const bool_coder_spec c_bool_coder_spec; +typedef const bool_writer c_bool_writer; +typedef const bool_reader c_bool_reader; + +#define vp8_complement(x) (255 - (x)) + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp8_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp8_tree_index vp8_tree[], *vp8_tree_p; + +typedef const struct vp8_token_struct { + int value; + int Len; +} vp8_token; + +/* Construct encoding array from tree. */ + +void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree); +void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree, + int offset); + +/* Convert array of token occurrence counts into a table of probabilities + for the associated binary encoding tree. Also writes count of branches + taken for each node on the tree; this facilitiates decisions as to + probability updates. */ + +void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ + vp8_token tok[/* n */], vp8_tree tree, + vp8_prob probs[/* n-1 */], + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */], + unsigned int Pfactor, int Round); + +/* Variant of above using coder spec rather than hardwired 8-bit probs. */ + +void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */ + vp8_token tok[/* n */], vp8_tree tree, + vp8_prob probs[/* n-1 */], + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */], + c_bool_coder_spec *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_TREECODER_H_ diff --git a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h new file mode 100644 index 0000000000..3fc942e050 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropymode.c*/ + +const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] = { + { 0, 1 }, { 2, 2 }, { 6, 3 }, { 28, 5 }, { 30, 5 }, + { 58, 6 }, { 59, 6 }, { 62, 6 }, { 126, 7 }, { 127, 7 } +}; + +const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] = { + { 0, 1 }, { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 } +}; + +const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] = { + { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 }, { 0, 1 } +}; + +const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] = { + { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } +}; + +const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] = { + { 6, 3 }, { 7, 3 }, { 2, 2 }, { 0, 1 } +}; + +const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] = { + { 2, 2 }, { 6, 3 }, { 0, 1 }, { 14, 4 }, { 15, 4 } +}; + +const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] = { + { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } +}; + +const struct vp8_token_struct vp8_small_mvencodings[8] = { + { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }, { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 } +}; + +const vp8_prob vp8_ymode_prob[VP8_YMODES - 1] = { 112, 86, 140, 37 }; + +const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1] = { 145, 156, 163, 128 }; + +const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES - 1] = { 162, 101, 204 }; + +const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES - 1] = { 142, 114, 183 }; + +const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES - 1] = { 120, 90, 79, 133, 87, + 85, 80, 111, 151 }; + +const vp8_prob + vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1] = { + { { 231, 120, 48, 89, 115, 113, 120, 152, 112 }, + { 152, 179, 64, 126, 170, 118, 46, 70, 95 }, + { 175, 69, 143, 80, 85, 82, 72, 155, 103 }, + { 56, 58, 10, 171, 218, 189, 17, 13, 152 }, + { 144, 71, 10, 38, 171, 213, 144, 34, 26 }, + { 114, 26, 17, 163, 44, 195, 21, 10, 173 }, + { 121, 24, 80, 195, 26, 62, 44, 64, 85 }, + { 170, 46, 55, 19, 136, 160, 33, 206, 71 }, + { 63, 20, 8, 114, 114, 208, 12, 9, 226 }, + { 81, 40, 11, 96, 182, 84, 29, 16, 36 } }, + { { 134, 183, 89, 137, 98, 101, 106, 165, 148 }, + { 72, 187, 100, 130, 157, 111, 32, 75, 80 }, + { 66, 102, 167, 99, 74, 62, 40, 234, 128 }, + { 41, 53, 9, 178, 241, 141, 26, 8, 107 }, + { 104, 79, 12, 27, 217, 255, 87, 17, 7 }, + { 74, 43, 26, 146, 73, 166, 49, 23, 157 }, + { 65, 38, 105, 160, 51, 52, 31, 115, 128 }, + { 87, 68, 71, 44, 114, 51, 15, 186, 23 }, + { 47, 41, 14, 110, 182, 183, 21, 17, 194 }, + { 66, 45, 25, 102, 197, 189, 23, 18, 22 } }, + { { 88, 88, 147, 150, 42, 46, 45, 196, 205 }, + { 43, 97, 183, 117, 85, 38, 35, 179, 61 }, + { 39, 53, 200, 87, 26, 21, 43, 232, 171 }, + { 56, 34, 51, 104, 114, 102, 29, 93, 77 }, + { 107, 54, 32, 26, 51, 1, 81, 43, 31 }, + { 39, 28, 85, 171, 58, 165, 90, 98, 64 }, + { 34, 22, 116, 206, 23, 34, 43, 166, 73 }, + { 68, 25, 106, 22, 64, 171, 36, 225, 114 }, + { 34, 19, 21, 102, 132, 188, 16, 76, 124 }, + { 62, 18, 78, 95, 85, 57, 50, 48, 51 } }, + { { 193, 101, 35, 159, 215, 111, 89, 46, 111 }, + { 60, 148, 31, 172, 219, 228, 21, 18, 111 }, + { 112, 113, 77, 85, 179, 255, 38, 120, 114 }, + { 40, 42, 1, 196, 245, 209, 10, 25, 109 }, + { 100, 80, 8, 43, 154, 1, 51, 26, 71 }, + { 88, 43, 29, 140, 166, 213, 37, 43, 154 }, + { 61, 63, 30, 155, 67, 45, 68, 1, 209 }, + { 142, 78, 78, 16, 255, 128, 34, 197, 171 }, + { 41, 40, 5, 102, 211, 183, 4, 1, 221 }, + { 51, 50, 17, 168, 209, 192, 23, 25, 82 } }, + { { 125, 98, 42, 88, 104, 85, 117, 175, 82 }, + { 95, 84, 53, 89, 128, 100, 113, 101, 45 }, + { 75, 79, 123, 47, 51, 128, 81, 171, 1 }, + { 57, 17, 5, 71, 102, 57, 53, 41, 49 }, + { 115, 21, 2, 10, 102, 255, 166, 23, 6 }, + { 38, 33, 13, 121, 57, 73, 26, 1, 85 }, + { 41, 10, 67, 138, 77, 110, 90, 47, 114 }, + { 101, 29, 16, 10, 85, 128, 101, 196, 26 }, + { 57, 18, 10, 102, 102, 213, 34, 20, 43 }, + { 117, 20, 15, 36, 163, 128, 68, 1, 26 } }, + { { 138, 31, 36, 171, 27, 166, 38, 44, 229 }, + { 67, 87, 58, 169, 82, 115, 26, 59, 179 }, + { 63, 59, 90, 180, 59, 166, 93, 73, 154 }, + { 40, 40, 21, 116, 143, 209, 34, 39, 175 }, + { 57, 46, 22, 24, 128, 1, 54, 17, 37 }, + { 47, 15, 16, 183, 34, 223, 49, 45, 183 }, + { 46, 17, 33, 183, 6, 98, 15, 32, 183 }, + { 65, 32, 73, 115, 28, 128, 23, 128, 205 }, + { 40, 3, 9, 115, 51, 192, 18, 6, 223 }, + { 87, 37, 9, 115, 59, 77, 64, 21, 47 } }, + { { 104, 55, 44, 218, 9, 54, 53, 130, 226 }, + { 64, 90, 70, 205, 40, 41, 23, 26, 57 }, + { 54, 57, 112, 184, 5, 41, 38, 166, 213 }, + { 30, 34, 26, 133, 152, 116, 10, 32, 134 }, + { 75, 32, 12, 51, 192, 255, 160, 43, 51 }, + { 39, 19, 53, 221, 26, 114, 32, 73, 255 }, + { 31, 9, 65, 234, 2, 15, 1, 118, 73 }, + { 88, 31, 35, 67, 102, 85, 55, 186, 85 }, + { 56, 21, 23, 111, 59, 205, 45, 37, 192 }, + { 55, 38, 70, 124, 73, 102, 1, 34, 98 } }, + { { 102, 61, 71, 37, 34, 53, 31, 243, 192 }, + { 69, 60, 71, 38, 73, 119, 28, 222, 37 }, + { 68, 45, 128, 34, 1, 47, 11, 245, 171 }, + { 62, 17, 19, 70, 146, 85, 55, 62, 70 }, + { 75, 15, 9, 9, 64, 255, 184, 119, 16 }, + { 37, 43, 37, 154, 100, 163, 85, 160, 1 }, + { 63, 9, 92, 136, 28, 64, 32, 201, 85 }, + { 86, 6, 28, 5, 64, 255, 25, 248, 1 }, + { 56, 8, 17, 132, 137, 255, 55, 116, 128 }, + { 58, 15, 20, 82, 135, 57, 26, 121, 40 } }, + { { 164, 50, 31, 137, 154, 133, 25, 35, 218 }, + { 51, 103, 44, 131, 131, 123, 31, 6, 158 }, + { 86, 40, 64, 135, 148, 224, 45, 183, 128 }, + { 22, 26, 17, 131, 240, 154, 14, 1, 209 }, + { 83, 12, 13, 54, 192, 255, 68, 47, 28 }, + { 45, 16, 21, 91, 64, 222, 7, 1, 197 }, + { 56, 21, 39, 155, 60, 138, 23, 102, 213 }, + { 85, 26, 85, 85, 128, 128, 32, 146, 171 }, + { 18, 11, 7, 63, 144, 171, 4, 4, 246 }, + { 35, 27, 10, 146, 174, 171, 12, 26, 128 } }, + { { 190, 80, 35, 99, 180, 80, 126, 54, 45 }, + { 85, 126, 47, 87, 176, 51, 41, 20, 32 }, + { 101, 75, 128, 139, 118, 146, 116, 128, 85 }, + { 56, 41, 15, 176, 236, 85, 37, 9, 62 }, + { 146, 36, 19, 30, 171, 255, 97, 27, 20 }, + { 71, 30, 17, 119, 118, 255, 17, 18, 138 }, + { 101, 38, 60, 138, 55, 70, 43, 26, 142 }, + { 138, 45, 61, 62, 219, 1, 81, 188, 64 }, + { 32, 41, 20, 117, 151, 142, 20, 21, 163 }, + { 112, 19, 12, 61, 195, 128, 48, 4, 24 } } + }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c new file mode 100644 index 0000000000..4576c18537 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "loopfilter.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +static void lf_init_lut(loop_filter_info_n *lfi) { + int filt_lvl; + + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; ++filt_lvl) { + if (filt_lvl >= 40) { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } else if (filt_lvl >= 20) { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } else if (filt_lvl >= 15) { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; + } else { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; + } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) { + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; ++i) { + int filt_lvl = i; + int block_inside_limit = 0; + + /* Set loop filter paramaeters that control sharpness. */ + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) { + block_inside_limit = (9 - sharpness_lvl); + } + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH); + memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +void vp8_loop_filter_init(VP8_COMMON *cm) { + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for (i = 0; i < 4; ++i) { + memset(lfi->hev_thr[i], i, SIMD_WIDTH); + } +} + +void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl) { + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ + + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if (cm->last_sharpness_level != cm->sharpness_level) { + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + } + + for (seg = 0; seg < MAX_MB_SEGMENTS; ++seg) { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } else { /* Delta Value */ + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; + } + + if (!mbd->mode_ref_lf_delta_enabled) { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); + continue; + } + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + /* clamp */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for (ref = 1; ref < MAX_REF_FRAMES; ++ref) { + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; ++mode) { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } + } +} + +void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr) { + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; + + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, + &lfi); + + if (!skip_lf) + vp8_loop_filter_bv(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, + &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, + &lfi); + + if (!skip_lf) + vp8_loop_filter_bh(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, + &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } +} + +void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, + unsigned char *y_ptr) { + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + if (mb_col > 0) + vp8_loop_filter_simple_mbv(y_ptr, post_ystride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv(y_ptr, post_ystride, + lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh(y_ptr, post_ystride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh(y_ptr, post_ystride, + lfi_n->blim[filter_level]); + } + + y_ptr += 16; + + mode_info_context++; /* step to next MB */ + } +} +void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int mb_row; + int mb_col; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + + int filter_level; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + int post_y_stride = post->y_stride; + int post_uv_stride = post->uv_stride; + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, cm->filter_level); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + /* vp8_filter each macro block */ + if (cm->filter_type == NORMAL_LOOPFILTER) { + for (mb_row = 0; mb_row < mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post_y_stride, + post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv(y_ptr, u_ptr, v_ptr, post_y_stride, + post_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post_y_stride, + post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh(y_ptr, u_ptr, v_ptr, post_y_stride, + post_uv_stride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + } + } else { /* SIMPLE_LOOPFILTER */ + for (mb_row = 0; mb_row < mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if (filter_level) { + const unsigned char *mblim = lfi_n->mblim[filter_level]; + const unsigned char *blim = lfi_n->blim[filter_level]; + + if (mb_col > 0) + vp8_loop_filter_simple_mbv(y_ptr, post_y_stride, mblim); + + if (!skip_lf) vp8_loop_filter_simple_bv(y_ptr, post_y_stride, blim); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh(y_ptr, post_y_stride, mblim); + + if (!skip_lf) vp8_loop_filter_simple_bh(y_ptr, post_y_stride, blim); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + } + } +} + +void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, default_filt_lvl); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + if (cm->filter_type == NORMAL_LOOPFILTER) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + } else { + if (mb_col > 0) + vp8_loop_filter_simple_mbv(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context++; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context++; /* Skip border mb */ + } +} + +void vp8_loop_filter_partial_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + int mb_cols = post->y_width >> 4; + int mb_rows = post->y_height >> 4; + + int linestocopy; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + const MODE_INFO *mode_info_context; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, default_filt_lvl); + + /* number of MB rows to use in partial filtering */ + linestocopy = mb_rows / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Set up the buffer pointers; partial image starts at ~middle of frame */ + y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride; + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < (linestocopy >> 4); ++mb_row) { + for (mb_col = 0; mb_col < mb_cols; ++mb_col) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + if (cm->filter_type == NORMAL_LOOPFILTER) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + vp8_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + } else { + if (mb_col > 0) + vp8_loop_filter_simple_mbv(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + + vp8_loop_filter_simple_mbh(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context += 1; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context += 1; /* Skip border mb */ + } +} diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 0000000000..6739efa5fe --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 0000000000..ef0e4ae4fe --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ +#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. + SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c new file mode 100644 index 0000000000..ff6cbbd68c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "vp8/common/filter.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void horizontal_16x16(uint8_t *src, const int stride, + uint16_t *dst, const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_lo); + _mm_store_si128((__m128i *)(dst + 8), a_hi); + src += stride; + dst += 16; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0); + const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0); + + const __m128i b = _mm_loadu_si128((__m128i *)(src + 1)); + const __m128i b_lo = _mm_unpacklo_epi8(b, zero); + const __m128i b_hi = _mm_unpackhi_epi8(b, zero); + const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1); + const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1); + + const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); + const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + _mm_store_si128((__m128i *)dst, shifted_lo); + _mm_store_si128((__m128i *)(dst + 8), shifted_hi); + src += stride; + dst += 16; + } + } +} + +static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 16; ++h) { + const __m128i row_lo = _mm_load_si128((__m128i *)src); + const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i packed = _mm_packus_epi16(row_lo, row_hi); + _mm_store_si128((__m128i *)dst, packed); + src += 16; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0_lo = _mm_load_si128((__m128i *)src); + __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8)); + src += 16; + for (h = 0; h < 16; ++h) { + const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0); + const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0); + + const __m128i row_1_lo = _mm_load_si128((__m128i *)src); + const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1); + const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1); + + const __m128i sum_lo = + _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered); + const __m128i sum_hi = + _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi); + _mm_store_si128((__m128i *)dst, packed); + row_0_lo = row_1_lo; + row_0_hi = row_1_hi; + src += 16; + dst += stride; + } + } +} + +void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]); + + assert((xoffset | yoffset) != 0); + + horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_16x16(FData, dst_ptr, dst_pitch, yoffset); +} + +static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset, const int height) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadl_epi64((__m128i *)src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_u16); + src += stride; + dst += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + // Filter horizontally. Rather than load the whole array and transpose, load + // 16 values (overreading) and shift to set up the second value. Do an + // "extra" 9th line so the vertical pass has the necessary context. + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i b = _mm_srli_si128(a, 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_store_si128((__m128i *)dst, shifted); + src += stride; + dst += 8; + } + } +} + +static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset, const int height) { + int h; + + if (yoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i row = _mm_load_si128((__m128i *)src); + const __m128i packed = _mm_packus_epi16(row, row); + _mm_storel_epi64((__m128i *)dst, packed); + src += 8; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0 = _mm_load_si128((__m128i *)src); + src += 8; + for (h = 0; h < height; ++h) { + const __m128i row_1 = _mm_load_si128((__m128i *)src); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + const __m128i packed = _mm_packus_epi16(shifted, shifted); + _mm_storel_epi64((__m128i *)dst, packed); + row_0 = row_1; + src += 8; + dst += stride; + } + } +} + +void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8); +} + +void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4); +} + +static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_storel_epi64((__m128i *)dst, a_u16); + src += stride; + dst += 4; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i b = load_unaligned_u32(src + 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_storel_epi64((__m128i *)dst, shifted); + src += stride; + dst += 4; + } + } +} + +static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 4; h += 2) { + const __m128i row = _mm_load_si128((__m128i *)src); + __m128i packed = _mm_packus_epi16(row, row); + store_unaligned_u32(dst, packed); + dst += stride; + packed = _mm_srli_si128(packed, 4); + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + for (h = 0; h < 4; h += 2) { + const __m128i row_0 = _mm_load_si128((__m128i *)src); + const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + __m128i packed = _mm_packus_epi16(shifted, shifted); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); + packed = _mm_srli_si128(packed, 4); + dst += stride; + storeu_int32(dst, _mm_cvtsi128_si32(packed)); + dst += stride; + src += 8; + } + } +} + +void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_4x4(FData, dst_ptr, dst_pitch, yoffset); +} diff --git a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm new file mode 100644 index 0000000000..0a269e15f7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) +globalsym(vp8_dequantize_b_impl_mmx) +sym(vp8_dequantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;sq + mov rdi, arg(1) ;dq + mov rax, arg(2) ;q + + movq mm1, [rsi] + pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers. + movq [rdi], mm1 + + movq mm1, [rsi+8] + pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers. + movq [rdi+8], mm1 + + movq mm1, [rsi+16] + pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers. + movq [rdi+16], mm1 + + movq mm1, [rsi+24] + pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers. + movq [rdi+24], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void dequant_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride) 3 +globalsym(vp8_dequant_idct_add_mmx) +sym(vp8_dequant_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rdx, arg(1) ;dq + + + movq mm0, [rax ] + pmullw mm0, [rdx] + + movq mm1, [rax +8] + pmullw mm1, [rdx +8] + + movq mm2, [rax+16] + pmullw mm2, [rdx+16] + + movq mm3, [rax+24] + pmullw mm3, [rdx+24] + + mov rdx, arg(2) ;dest + + pxor mm7, mm7 + + + movq [rax], mm7 + movq [rax+8], mm7 + + movq [rax+16],mm7 + movq [rax+24],mm7 + + + movsxd rdi, dword ptr arg(3) ;stride + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rdx] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rdx+rdi] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c new file mode 100644 index 0000000000..fd804b1ca4 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); + +void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) { + short *sq = (short *)d->qcoeff; + short *dq = (short *)d->dqcoeff; + + vp8_dequantize_b_impl_mmx(sq, dq, DQC); +} diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c new file mode 100644 index 0000000000..897ed5b652 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +void vp8_idct_dequant_0_2x_sse2(short *q, short *dq, unsigned char *dst, + int dst_stride); +void vp8_idct_dequant_full_2x_sse2(short *q, short *dq, unsigned char *dst, + int dst_stride); + +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, + int stride, char *eobs) { + int i; + + for (i = 0; i < 4; ++i) { + if (((short *)(eobs))[0]) { + if (((short *)(eobs))[0] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q, dq, dst, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q, dq, dst, stride); + } + } + if (((short *)(eobs))[1]) { + if (((short *)(eobs))[1] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q + 32, dq, dst + 8, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q + 32, dq, dst + 8, stride); + } + } + q += 64; + dst += stride * 4; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, + unsigned char *dst_u, + unsigned char *dst_v, int stride, + char *eobs) { + if (((short *)(eobs))[0]) { + if (((short *)(eobs))[0] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); + } + } + q += 32; + dst_u += stride * 4; + + if (((short *)(eobs))[1]) { + if (((short *)(eobs))[1] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); + } + } + q += 32; + + if (((short *)(eobs))[2]) { + if (((short *)(eobs))[2] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); + } + } + q += 32; + dst_v += stride * 4; + + if (((short *)(eobs))[3]) { + if (((short *)(eobs))[3] & 0xfefe) { + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); + } else { + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm new file mode 100644 index 0000000000..6cea86fe03 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point version of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative +; * number. +; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + +SECTION .text + +;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, +;int pitch, unsigned char *dest,int stride) +globalsym(vp8_short_idct4x4llm_mmx) +sym(vp8_short_idct4x4llm_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rsi, arg(1) ;pred + + movq mm0, [rax ] + movq mm1, [rax+ 8] + movq mm2, [rax+16] + movq mm3, [rax+24] + +%if 0 + pxor mm7, mm7 + movq [rax], mm7 + movq [rax+8], mm7 + movq [rax+16],mm7 + movq [rax+24],mm7 +%endif + movsxd rax, dword ptr arg(2) ;pitch + mov rdx, arg(3) ;dest + movsxd rdi, dword ptr arg(4) ;stride + + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rsi] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rsi+rax] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + add rsi, rax + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_dc_only_idct_add_mmx( +;short input_dc, +;unsigned char *pred_ptr, +;int pred_stride, +;unsigned char *dst_ptr, +;int stride) +globalsym(vp8_dc_only_idct_add_mmx) +sym(vp8_dc_only_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + movd mm5, arg(0) ;input_dc + mov rax, arg(1) ;pred_ptr + movsxd rdx, dword ptr arg(2) ;pred_stride + + pxor mm0, mm0 + + paddw mm5, [GLOBAL(fours)] + lea rcx, [rdx + rdx*2] + + psraw mm5, 3 + + punpcklwd mm5, mm5 + + punpckldq mm5, mm5 + + movd mm1, [rax] + movd mm2, [rax+rdx] + movd mm3, [rax+2*rdx] + movd mm4, [rax+rcx] + + mov rax, arg(3) ;d -- destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + punpcklbw mm1, mm0 + paddsw mm1, mm5 + packuswb mm1, mm0 ; pack and unpack to saturate + lea rcx, [rdx + rdx*2] + + punpcklbw mm2, mm0 + paddsw mm2, mm5 + packuswb mm2, mm0 ; pack and unpack to saturate + + punpcklbw mm3, mm0 + paddsw mm3, mm5 + packuswb mm3, mm0 ; pack and unpack to saturate + + punpcklbw mm4, mm0 + paddsw mm4, mm5 + packuswb mm4, mm0 ; pack and unpack to saturate + + movd [rax], mm1 + movd [rax+rdx], mm2 + movd [rax+2*rdx], mm3 + movd [rax+rcx], mm4 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm new file mode 100644 index 0000000000..bb79d2da3b --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -0,0 +1,710 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) + +SECTION .text + +globalsym(vp8_idct_dequant_0_2x_sse2) +sym(vp8_idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; clear coeffs + movd [rax], xmm5 + movd [rax+32], xmm5 +;pshufb + mov rax, arg(2) ; dst + movsxd rdx, dword ptr arg(3) ; dst_stride + + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + lea rcx, [rdx + rdx*2] + paddw xmm4, [GLOBAL(fours)] + + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rdx] + movq xmm2, [rax+2*rdx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) +globalsym(vp8_idct_dequant_full_2x_sse2) +sym(vp8_idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + mov rdi, arg(2) ; dst + + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + lea rcx, [rdx + rdx*2] ;dst_stride * 3 + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+2*rdx] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +globalsym(vp8_idct_dequant_dc_0_2x_sse2) +sym(vp8_idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + + mov rdi, arg(2) ; dst + mov rdx, arg(4) ; dc + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; load up 2 dc words here == 2*16 = doubleword + movd xmm4, [rdx] + + movsxd rdx, dword ptr arg(3) ; dst_stride + lea rcx, [rdx + rdx*2] + ; Load up predict blocks + movq xmm0, [rdi] + movq xmm1, [rdi+rdx*1] + movq xmm2, [rdi+rdx*2] + movq xmm3, [rdi+rcx] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [GLOBAL(fours)] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +;void vp8_idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +globalsym(vp8_idct_dequant_dc_full_2x_sse2) +sym(vp8_idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + + mov rdi, arg(2) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; DC component + mov rdx, arg(4) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movsxd rdx, dword ptr arg(3) ; dst_stride + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + lea rcx, [rdx + rdx*2] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+rdx*2] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm new file mode 100644 index 0000000000..56f37c3e0f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -0,0 +1,123 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff) +globalsym(vp8_short_inv_walsh4x4_sse2) +sym(vp8_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h + + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] + + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm new file mode 100644 index 0000000000..8d12f5385d --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -0,0 +1,817 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro LF_ABS 2 + ; %1 value not preserved + ; %2 value preserved + ; output in %1 + movdqa scratch1, %2 ; v2 + + psubusb scratch1, %1 ; v2 - v1 + psubusb %1, %2 ; v1 - v2 + por %1, scratch1 ; abs(v2 - v1) +%endmacro + +%macro LF_FILTER_HEV_MASK 8-9 + + LF_ABS %1, %2 ; abs(p3 - p2) + LF_ABS %2, %3 ; abs(p2 - p1) + pmaxub %1, %2 ; accumulate mask +%if %0 == 8 + movdqa scratch2, %3 ; save p1 + LF_ABS scratch2, %4 ; abs(p1 - p0) +%endif + LF_ABS %4, %5 ; abs(p0 - q0) + LF_ABS %5, %6 ; abs(q0 - q1) +%if %0 == 8 + pmaxub %5, scratch2 ; accumulate hev +%else + pmaxub %5, %9 +%endif + pmaxub %1, %5 ; accumulate mask + + LF_ABS %3, %6 ; abs(p1 - q1) + LF_ABS %6, %7 ; abs(q1 - q2) + pmaxub %1, %6 ; accumulate mask + LF_ABS %7, %8 ; abs(q2 - q3) + pmaxub %1, %7 ; accumulate mask + + paddusb %4, %4 ; 2 * abs(p0 - q0) + pand %3, [GLOBAL(tfe)] + psrlw %3, 1 ; abs(p1 - q1) / 2 + paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + psubusb %1, [limit] + psubusb %4, [blimit] + por %1, %4 + pcmpeqb %1, zero ; mask + + psubusb %5, [thresh] + pcmpeqb %5, zero ; ~hev +%endmacro + +%macro LF_FILTER 6 + ; %1-%4: p1-q1 + ; %5: mask + ; %6: hev + + movdqa scratch2, %6 ; save hev + + pxor %1, [GLOBAL(t80)] ; ps1 + pxor %4, [GLOBAL(t80)] ; qs1 + movdqa scratch1, %1 + psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) + pandn scratch2, scratch1 ; vp8_filter &= hev + + pxor %2, [GLOBAL(t80)] ; ps0 + pxor %3, [GLOBAL(t80)] ; qs0 + movdqa scratch1, %3 + psubsb scratch1, %2 ; qs0 - ps0 + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + pand %5, scratch2 ; &= mask + + movdqa scratch2, %5 + paddsb %5, [GLOBAL(t4)] ; Filter1 + paddsb scratch2, [GLOBAL(t3)] ; Filter2 + + ; Filter1 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 3 + pand scratch1, [GLOBAL(te0)] + pand %5, [GLOBAL(t1f)] + por %5, scratch1 + + psubsb %3, %5 ; qs0 - Filter1 + pxor %3, [GLOBAL(t80)] + + ; Filter2 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, scratch2 + psrlw scratch2, 3 + pand scratch1, [GLOBAL(te0)] + pand scratch2, [GLOBAL(t1f)] + por scratch2, scratch1 + + paddsb %2, scratch2 ; ps0 + Filter2 + pxor %2, [GLOBAL(t80)] + + ; outer tap adjustments + paddsb %5, [GLOBAL(t1)] + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 1 + pand scratch1, [GLOBAL(t80)] + pand %5, [GLOBAL(t7f)] + por %5, scratch1 + pand %5, %6 ; vp8_filter &= ~hev + + psubsb %4, %5 ; qs1 - vp8_filter + pxor %4, [GLOBAL(t80)] + + paddsb %1, %5 ; ps1 + vp8_filter + pxor %1, [GLOBAL(t80)] +%endmacro + +SECTION .text + +;void vp8_loop_filter_bh_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) +globalsym(vp8_loop_filter_bh_y_sse2) +sym(vp8_loop_filter_bh_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 11 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi ; src_ptr + %define stride rsi ; src_pixel_step + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define i0 [src] + %define i1 [spp] + %define i2 [src + 2 * stride] + %define i3 [spp + 2 * stride] + %define i4 [src + 4 * stride] + %define i5 [spp + 4 * stride] + %define i6 [src + 2 * stride3] + %define i7 [spp + 2 * stride3] + %define i8 [src + 8 * stride] + %define i9 [spp + 8 * stride] + %define i10 [src + 2 * stride5] + %define i11 [spp + 2 * stride5] + %define i12 [src + 4 * stride3] + %define i13 [spp + 4 * stride3] + %define i14 [src + 2 * stride7] + %define i15 [spp + 2 * stride7] + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + pxor zero, zero + + ; load the first set into registers + movdqa xmm0, i0 + movdqa xmm1, i1 + movdqa xmm2, i2 + movdqa xmm3, i3 + movdqa xmm4, i4 + movdqa xmm8, i5 + movdqa xmm9, i6 ; q2, will contain abs(p1-p0) + movdqa xmm10, i7 +LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm3, i4 + movdqa xmm8, i5 +LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm3 + movdqa i5, xmm8 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm4, i8 + movdqa xmm8, i9 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm4 + movdqa i9, xmm8 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm3, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm3, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm3 + movdqa i13, xmm8 + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + + +;void vp8_loop_filter_bv_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) + +globalsym(vp8_loop_filter_bv_y_sse2) +sym(vp8_loop_filter_bv_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 15 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi + %define stride rsi + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define s0 [src] + %define s1 [spp] + %define s2 [src + 2 * stride] + %define s3 [spp + 2 * stride] + %define s4 [src + 4 * stride] + %define s5 [spp + 4 * stride] + %define s6 [src + 2 * stride3] + %define s7 [spp + 2 * stride3] + %define s8 [src + 8 * stride] + %define s9 [spp + 8 * stride] + %define s10 [src + 2 * stride5] + %define s11 [spp + 2 * stride5] + %define s12 [src + 4 * stride3] + %define s13 [spp + 4 * stride3] + %define s14 [src + 2 * stride7] + %define s15 [spp + 2 * stride7] + + %define i0 [rsp] + %define i1 [rsp + 16] + %define i2 [rsp + 32] + %define i3 [rsp + 48] + %define i4 [rsp + 64] + %define i5 [rsp + 80] + %define i6 [rsp + 96] + %define i7 [rsp + 112] + %define i8 [rsp + 128] + %define i9 [rsp + 144] + %define i10 [rsp + 160] + %define i11 [rsp + 176] + %define i12 [rsp + 192] + %define i13 [rsp + 208] + %define i14 [rsp + 224] + %define i15 [rsp + 240] + + ALIGN_STACK 16, rax + + ; reserve stack space + %define temp_storage 0 ; size is 256 (16*16) + %define stack_size 256 + sub rsp, stack_size + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + + ; 8-f + movdqa xmm0, s8 + movdqa xmm1, xmm0 + punpcklbw xmm0, s9 ; 80 90 + punpckhbw xmm1, s9 ; 88 98 + + movdqa xmm2, s10 + movdqa xmm3, xmm2 + punpcklbw xmm2, s11 ; a0 b0 + punpckhbw xmm3, s11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s12 + movdqa xmm5, xmm3 + punpcklbw xmm3, s13 ; c0 d0 + punpckhbw xmm5, s13 ; c8 d8 + + movdqa xmm6, s14 + movdqa xmm7, xmm6 + punpcklbw xmm6, s15 ; e0 f0 + punpckhbw xmm7, s15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i0, xmm0 + movdqa i1, xmm7 + movdqa i2, xmm4 + movdqa i3, xmm3 + movdqa i4, xmm1 + movdqa i5, xmm8 + movdqa i6, xmm2 + movdqa i7, xmm5 + + ; 0-7 + movdqa xmm0, s0 + movdqa xmm1, xmm0 + punpcklbw xmm0, s1 ; 00 10 + punpckhbw xmm1, s1 ; 08 18 + + movdqa xmm2, s2 + movdqa xmm3, xmm2 + punpcklbw xmm2, s3 ; 20 30 + punpckhbw xmm3, s3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s4 + movdqa xmm5, xmm3 + punpcklbw xmm3, s5 ; 40 50 + punpckhbw xmm5, s5 ; 48 58 + + movdqa xmm6, s6 + movdqa xmm7, xmm6 + punpcklbw xmm6, s7 ; 60 70 + punpckhbw xmm7, s7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i0 + punpckhqdq xmm6, i0 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i1 + punpckhqdq xmm9, i1 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i2 + punpckhqdq xmm10, i2 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i3 + punpckhqdq xmm11, i3 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i4 + punpckhqdq xmm12, i4 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i5 + punpckhqdq xmm13, i5 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i6 + punpckhqdq xmm14, i6 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i7 + punpckhqdq xmm15, i7 + + movdqa i0, xmm0 + movdqa i1, xmm6 + movdqa i2, xmm7 + movdqa i3, xmm9 + movdqa i4, xmm4 + movdqa i5, xmm10 + movdqa i6, xmm3 + movdqa i7, xmm11 + movdqa i8, xmm1 + movdqa i9, xmm12 + movdqa i10, xmm8 + movdqa i11, xmm13 + movdqa i12, xmm2 + movdqa i13, xmm14 + movdqa i14, xmm5 + movdqa i15, xmm15 + +; TRANSPOSED DATA AVAILABLE ON THE STACK + + movdqa xmm12, xmm6 + movdqa xmm13, xmm7 + + pxor zero, zero + +LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm8, i4 + movdqa xmm9, i5 +LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm8 + movdqa i5, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm3, i8 + movdqa xmm4, i9 +LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm3 + movdqa i9, xmm4 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm8, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm4, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm4 + movdqa i13, xmm8 + + +; RESHUFFLE AND WRITE OUT + ; 8-f + movdqa xmm0, i8 + movdqa xmm1, xmm0 + punpcklbw xmm0, i9 ; 80 90 + punpckhbw xmm1, i9 ; 88 98 + + movdqa xmm2, i10 + movdqa xmm3, xmm2 + punpcklbw xmm2, i11 ; a0 b0 + punpckhbw xmm3, i11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i12 + movdqa xmm5, xmm3 + punpcklbw xmm3, i13 ; c0 d0 + punpckhbw xmm5, i13 ; c8 d8 + + movdqa xmm6, i14 + movdqa xmm7, xmm6 + punpcklbw xmm6, i15 ; e0 f0 + punpckhbw xmm7, i15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i8, xmm0 + movdqa i9, xmm7 + movdqa i10, xmm4 + movdqa i11, xmm3 + movdqa i12, xmm1 + movdqa i13, xmm8 + movdqa i14, xmm2 + movdqa i15, xmm5 + + ; 0-7 + movdqa xmm0, i0 + movdqa xmm1, xmm0 + punpcklbw xmm0, i1 ; 00 10 + punpckhbw xmm1, i1 ; 08 18 + + movdqa xmm2, i2 + movdqa xmm3, xmm2 + punpcklbw xmm2, i3 ; 20 30 + punpckhbw xmm3, i3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i4 + movdqa xmm5, xmm3 + punpcklbw xmm3, i5 ; 40 50 + punpckhbw xmm5, i5 ; 48 58 + + movdqa xmm6, i6 + movdqa xmm7, xmm6 + punpcklbw xmm6, i7 ; 60 70 + punpckhbw xmm7, i7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i8 + punpckhqdq xmm6, i8 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i9 + punpckhqdq xmm9, i9 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i10 + punpckhqdq xmm10, i10 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i11 + punpckhqdq xmm11, i11 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i12 + punpckhqdq xmm12, i12 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i13 + punpckhqdq xmm13, i13 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i14 + punpckhqdq xmm14, i14 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i15 + punpckhqdq xmm15, i15 + + movdqa s0, xmm0 + movdqa s1, xmm6 + movdqa s2, xmm7 + movdqa s3, xmm9 + movdqa s4, xmm4 + movdqa s5, xmm10 + movdqa s6, xmm3 + movdqa s7, xmm11 + movdqa s8, xmm1 + movdqa s9, xmm12 + movdqa s10, xmm8 + movdqa s11, xmm13 + movdqa s12, xmm2 + movdqa s13, xmm14 + movdqa s14, xmm5 + movdqa s15, xmm15 + + ; free stack space + add rsp, stack_size + + ; un-ALIGN_STACK + pop rsp + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + +SECTION_RODATA +align 16 +te0: + times 16 db 0xe0 +align 16 +t7f: + times 16 db 0x7f +align 16 +tfe: + times 16 db 0xfe +align 16 +t1f: + times 16 db 0x1f +align 16 +t80: + times 16 db 0x80 +align 16 +t1: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm new file mode 100644 index 0000000000..ce5c313138 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -0,0 +1,1642 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +%define _t0 0 +%define _t1 _t0 + 16 +%define _p3 _t1 + 16 +%define _p2 _p3 + 16 +%define _p1 _p2 + 16 +%define _p0 _p1 + 16 +%define _q0 _p0 + 16 +%define _q1 _q0 + 16 +%define _q2 _q1 + 16 +%define _q3 _q2 + 16 +%define lf_var_size 160 + +; Use of pmaxub instead of psubusb to compute filter mask was seen +; in ffvp8 + +%macro LFH_FILTER_AND_HEV_MASK 1 +%if %1 + movdqa xmm2, [rdi+2*rax] ; q3 + movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border +%else + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + + movdqa [rsp+_q2], xmm1 ; store q2 + movdqa [rsp+_q1], xmm4 ; store q1 +%endif + movdqa xmm7, [rdx] ;limit + + movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + + psubusb xmm1, xmm2 ; q2-=q3 + psubusb xmm2, xmm6 ; q3-=q2 + + psubusb xmm4, xmm6 ; q1-=q2 + psubusb xmm6, xmm3 ; q2-=q1 + + por xmm4, xmm6 ; abs(q2-q1) + por xmm1, xmm2 ; abs(q3-q2) + + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 + + psubusb xmm5, xmm3 ; q0-=q1 + psubusb xmm3, xmm0 ; q1-=q0 + + por xmm5, xmm3 ; abs(q0-q1) + movdqa [rsp+_t0], xmm5 ; save to t0 + + pmaxub xmm1, xmm5 + +%if %1 + movdqa xmm2, [rsi+4*rax] ; p3 + movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 +%else + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] + + movdqa [rsp+_p2], xmm4 ; store p2 + movdqa [rsp+_p1], xmm6 ; store p1 +%endif + + movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + + psubusb xmm4, xmm2 ; p2-=p3 + psubusb xmm2, xmm5 ; p3-=p2 + + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) + + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) + + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + + pmaxub xmm1, xmm3 ; abs(p2 - p1) +%if %1 + movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 +%else + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, [rsp+_q1] ; q1 +%endif + + movdqa xmm5, xmm4 ; p0 + psubusb xmm4, xmm6 ; p0-=p1 + + psubusb xmm6, xmm5 ; p1-=p0 + + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get blimit + + movdqa [rsp+_t1], xmm6 ; save to t1 + + movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + + psubusb xmm3, xmm2 ; q1-=p1 + psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 + por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm7, [rdx] ; blimit + mov rdx, arg(4) ; hev get thresh + + movdqa xmm3, xmm0 ; q0 + pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + + movdqa xmm6, xmm5 ; p0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; p0-=q0 + psubusb xmm3, xmm6 ; q0-=p0 + por xmm5, xmm3 ; abs(p0 - q0) + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + + movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) + movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) + + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm2, [rdx] ; hev + + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev + por xmm1, xmm5 + + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev + + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev +%endmacro + +%macro B_FILTER 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm6, xmm3 ; offset to convert to signed values + + pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) + pxor xmm0, xmm3 ; offset to convert to signed values + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 + paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + punpckhbw xmm5, xmm2 ; axbxcxdx + punpcklbw xmm2, xmm2 ; exfxgxhx + + punpcklbw xmm0, xmm1 ; exfxgxhx + psraw xmm5, 11 ; sign extended shift right by 3 + + punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + psraw xmm0, 11 ; sign extended shift right by 3 + + psraw xmm1, 11 ; sign extended shift right by 3 + movdqa xmm5, xmm0 ; save results + + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + + paddsb xmm6, xmm2 ; p0+= p0 add + + movdqa xmm2, [GLOBAL(ones)] + paddsw xmm5, xmm2 + paddsw xmm1, xmm2 + psraw xmm5, 1 ; partial shifted one more time for 2nd tap + psraw xmm1, 1 ; partial shifted one more time for 2nd tap + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm2, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_p1] ; p1 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rsp+_p1] ; p1 +%endif + + pandn xmm4, xmm5 ; high edge variance additive + pxor xmm6, xmm2 ; unoffset + + pxor xmm1, xmm2 ; reoffset + psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add + pxor xmm3, xmm2 ; unoffset + + pxor xmm1, xmm2 ; unoffset + psubsb xmm7, xmm4 ; q1-= q1 add + + pxor xmm7, xmm2 ; unoffset +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rax], xmm1 ; p1 + movhps [rdi + rax], xmm1 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + movq [rsi + rcx*2], xmm7 ; q1 + movhps [rdi + rcx*2], xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back +%endif + +%endmacro + +SECTION .text + +%if ABI_IS_32BIT + +;void vp8_loop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +globalsym(vp8_loop_filter_horizontal_edge_sse2) +sym(vp8_loop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the result + B_FILTER 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +globalsym(vp8_loop_filter_horizontal_edge_uv_sse2) +sym(vp8_loop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro MB_FILTER_AND_WRITEBACK 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 + + mov rcx, rax + neg rcx +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 ; vp8_filter + + pand xmm2, xmm4 ; Filter2 = vp8_filter & hev + pxor xmm0, xmm0 + + pandn xmm4, xmm1 ; vp8_filter&=~hev + pxor xmm1, xmm1 + + punpcklbw xmm0, xmm4 ; Filter 2 (hi) + punpckhbw xmm1, xmm4 ; Filter 2 (lo) + + movdqa xmm5, xmm2 + + movdqa xmm4, [GLOBAL(s9)] + paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) + paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + + pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 + pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 + + punpckhbw xmm7, xmm5 ; axbxcxdx + punpcklbw xmm5, xmm5 ; exfxgxhx + + psraw xmm7, 11 ; sign extended shift right by 3 + + psraw xmm5, 11 ; sign extended shift right by 3 + punpckhbw xmm4, xmm2 ; axbxcxdx + + punpcklbw xmm2, xmm2 ; exfxgxhx + psraw xmm4, 11 ; sign extended shift right by 3 + + packsswb xmm5, xmm7 ; Filter2 >>=3; + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm4 ; Filter1 >>=3; + + paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 + + psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 + movdqa xmm7, xmm1 + + movdqa xmm4, [GLOBAL(s63)] + movdqa xmm5, xmm0 + movdqa xmm2, xmm5 + paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 + paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 + movdqa xmm4, xmm7 + + paddw xmm5, xmm5 ; Filter 2 (hi) * 18 + + paddw xmm7, xmm7 ; Filter 2 (lo) * 18 + paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 + + paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 + paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 + psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 + + paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 + psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 + psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 + + packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) + + psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 + psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 + psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 + + packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) + packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) + movdqa xmm7, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_q1] ; q1 + movdqa xmm4, [rsp+_p1] ; p1 + lea rsi, [rsi+rcx*2] + lea rdi, [rdi+rcx*2] + +%elif %1 == 1 + movdqa xmm1, [rdi] ; q1 + movdqa xmm4, [rsi+rax*2] ; p1 +%elif %1 == 2 + movdqa xmm4, [rsp+_p1] ; p1 + movdqa xmm1, [rsp+_q1] ; q1 +%endif + + pxor xmm1, xmm7 + pxor xmm4, xmm7 + + psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) + paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) + psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) + paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) + +%if %1 == 1 + movdqa xmm2, [rdi+rax*4] ; p2 + movdqa xmm5, [rdi+rcx] ; q2 +%else + movdqa xmm2, [rsp+_p2] ; p2 + movdqa xmm5, [rsp+_q2] ; q2 +%endif + + pxor xmm1, xmm7 ; *oq1 = sq^0x80; + pxor xmm4, xmm7 ; *op1 = sp^0x80; + pxor xmm2, xmm7 + pxor xmm5, xmm7 + paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) + psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) + pxor xmm2, xmm7 ; *op2 = sp^0x80; + pxor xmm5, xmm7 ; *oq2 = sq^0x80; + pxor xmm3, xmm7 ; *oq0 = sq^0x80 + pxor xmm6, xmm7 ; *oq0 = sp^0x80 +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + lea rdx, [rcx + rcx*2] + movq [rsi+rcx*2], xmm1 ; q1 + movhps [rdi+rcx*2], xmm1 + + movq [rsi + rax], xmm4 ; p1 + movhps [rdi + rax], xmm4 + + movq [rsi+rax*2], xmm2 ; p2 + movhps [rdi+rax*2], xmm2 + + movq [rsi+rdx], xmm5 ; q2 + movhps [rdi+rdx], xmm5 +%elif %1 == 1 + movdqa [rdi+rcx], xmm5 ; q2 + movdqa [rdi], xmm1 ; q1 + movdqa [rsi], xmm3 ; q0 + movdqa [rsi+rax ], xmm6 ; p0 + movdqa [rsi+rax*2], xmm4 ; p1 + movdqa [rdi+rax*4], xmm2 ; p2 +%elif %1 == 2 + movdqa [rsp+_p1], xmm4 ; p1 + movdqa [rsp+_p0], xmm6 ; p0 + movdqa [rsp+_q0], xmm3 ; q0 + movdqa [rsp+_q1], xmm1 ; q1 +%endif + +%endmacro + + +;void vp8_mbloop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +globalsym(vp8_mbloop_filter_horizontal_edge_sse2) +sym(vp8_mbloop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +globalsym(vp8_mbloop_filter_horizontal_edge_uv_sse2) +sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro TRANSPOSE_16X8 2 + movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 +%if %1 + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + +%if %1 == 0 + lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing + lea rsi, [rsi - 4] +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa [rsp+_t0], xmm2 ; save to free XMM2 + + movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 + +%if %2 == 0 + movdqa [rsp+_q3], xmm7 ; save 7 + movdqa [rsp+_q2], xmm6 ; save 6 +%endif + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa [rsp+_p1], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + movdqa [rsp+_p0], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rsp+_q0], xmm4 ; save 4 + movdqa [rsp+_q1], xmm5 ; save 5 + movdqa xmm1, [rsp+_t0] + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + +%if %2 == 0 + movdqa [rsp+_p2], xmm1 + movdqa [rsp+_p3], xmm2 +%endif + +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 0 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) + + movdqa xmm5, [rsp+_p1] ; p1 + pmaxub xmm0, xmm7 + + movdqa xmm2, xmm5 ; p1 + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + + por xmm2, xmm7 ; abs(p1-p0) + + pmaxub xmm0, xmm2 + + movdqa xmm5, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 + + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm4, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + pmaxub xmm0, xmm7 + + psubusb xmm0, [rdx] ; limit + + mov rdx, arg(2) ; blimit + movdqa xmm5, xmm4 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + movdqa xmm4, [rdx] ; blimit + mov rdx, arg(4) ; get thresh + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + por xmm1, xmm6 ; abs(q0-p0) + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + movdqa xmm3, [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh + + psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm2, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm2 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + movd [rsi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2], %1 + movd [rdi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rsi+2*rax+2], %1 + movd [rsi+2*rcx+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2*rax+2], %1 + movd [rdi+2*rcx+2], %2 +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +globalsym(vp8_loop_filter_vertical_edge_sse2) +sym(vp8_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +globalsym(vp8_loop_filter_vertical_edge_uv_sse2) +sym(vp8_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%macro MBV_TRANSPOSE 0 + movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + + movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + + punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 +%endmacro + +%macro MBV_WRITEBACK_1 0 + movq [rsi], xmm0 + movhps [rdi], xmm0 + + movq [rsi+2*rax], xmm6 + movhps [rdi+2*rax], xmm6 + + movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + + movq [rsi+4*rax], xmm0 + movhps [rdi+4*rax], xmm0 + + movq [rsi+2*rcx], xmm3 + movhps [rdi+2*rcx], xmm3 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + + movdqa xmm0, xmm7 + punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + + movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 + punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 +%endmacro + +%macro MBV_WRITEBACK_2 0 + movq [rsi], xmm1 + movhps [rdi], xmm1 + + movq [rsi+2*rax], xmm5 + movhps [rdi+2*rax], xmm5 + + movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + + movq [rsi+4*rax], xmm1 + movhps [rdi+4*rax], xmm1 + + movq [rsi+2*rcx], xmm4 + movhps [rdi+2*rcx], xmm4 +%endmacro + + +;void vp8_mbloop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +globalsym(vp8_mbloop_filter_vertical_edge_sse2) +sym(vp8_mbloop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ; Transpose + TRANSPOSE_16X8 1, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + neg rax + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + ; transpose and write back + MBV_TRANSPOSE + + neg rax + + MBV_WRITEBACK_1 + + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +globalsym(vp8_mbloop_filter_vertical_edge_uv_sse2) +sym(vp8_mbloop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ; Transpose + TRANSPOSE_16X8 0, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + ; transpose and write back + MBV_TRANSPOSE + + mov rsi, arg(0) ;u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_1 + mov rsi, arg(5) ;v_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +globalsym(vp8_loop_filter_simple_horizontal_edge_sse2) +sym(vp8_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + ; end prolog + + mov rcx, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + movdqa xmm6, [GLOBAL(tfe)] + lea rdx, [rcx + rax] + neg rax + + ; calculate mask + movdqa xmm0, [rdx] ; q1 + mov rdx, arg(2) ;blimit + movdqa xmm1, [rcx+2*rax] ; p1 + + movdqa xmm2, xmm1 + movdqa xmm3, xmm0 + + psubusb xmm0, xmm1 ; q1-=p1 + psubusb xmm1, xmm3 ; p1-=q1 + por xmm1, xmm0 ; abs(p1-q1) + pand xmm1, xmm6 ; set lsb of each byte to zero + psrlw xmm1, 1 ; abs(p1-q1)/2 + + movdqa xmm7, XMMWORD PTR [rdx] + + movdqa xmm5, [rcx+rax] ; p0 + movdqa xmm4, [rcx] ; q0 + movdqa xmm0, xmm4 ; q0 + movdqa xmm6, xmm5 ; p0 + psubusb xmm5, xmm4 ; p0-=q0 + psubusb xmm4, xmm6 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + + movdqa xmm4, [GLOBAL(t80)] + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 + + + ; start work on filters + pxor xmm2, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm2, xmm3 ; p1 - q1 + + pxor xmm6, xmm4 ; offset to convert to signed values + pxor xmm0, xmm4 ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm2 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm1, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm1 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm6, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset + movdqa [rcx], xmm3 ; write back + + pxor xmm6, xmm4 ; unoffset + movdqa [rcx+rax], xmm6 ; write back + + ; begin epilog + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +globalsym(vp8_loop_filter_simple_vertical_edge_sse2) +sym(vp8_loop_filter_simple_vertical_edge_sse2): + push rbp ; save old base pointer value. + mov rbp, rsp ; set new base pointer value. + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx ; save callee-saved reg + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + mov rdx, arg(2) ;blimit + + ; calculate mask + movdqa xmm6, xmm0 ; p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm7, [rdx] + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm4, [GLOBAL(t80)] + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm0, xmm3 ; p1 - q1 + + pxor xmm1, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to convert to signed values + + movdqa xmm3, xmm2 ; offseted ; q0 + psubsb xmm2, xmm1 ; q0 - p0 + paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm0 ; mask filter values we don't care about + + movdqa xmm0, xmm5 + paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 + paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 + + movdqa xmm6, [GLOBAL(te0)] + movdqa xmm2, [GLOBAL(t1f)] + +; pxor xmm7, xmm7 + pcmpgtb xmm7, xmm0 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm0, 3 + pand xmm0, xmm2 ;clear out upper 3 bits + por xmm0, xmm7 ;add sign + psubsb xmm3, xmm0 ; q0-= q0sz add + + pxor xmm7, xmm7 + pcmpgtb xmm7, xmm5 ;save sign + pand xmm7, xmm6 ;preserve the upper 3 bits + psrlw xmm5, 3 + pand xmm5, xmm2 ;clear out upper 3 bits + por xmm5, xmm7 ;add sign + paddsb xmm1, xmm5 ; p0+= p0 add + + pxor xmm3, xmm4 ; unoffset q0 + pxor xmm1, xmm4 ; unoffset p0 + + movdqa xmm0, t0 ; p1 + movdqa xmm4, t1 ; q1 + + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + + ; transpose back to write out + ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa xmm6, xmm0 + punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm5, xmm3 + punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + + movdqa xmm3, xmm6 + punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movd [rsi], xmm6 ; write the second 8-line result + movd [rdx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi], xmm6 + movd [rcx], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rsi + rax*2], xmm6 + movd [rdx + rax*2], xmm3 + psrldq xmm6, 4 + psrldq xmm3, 4 + movd [rdi + rax*2], xmm6 + movd [rcx + rax*2], xmm3 + + neg rax + lea rsi, [rsi + rax*8] + neg rax + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd [rsi], xmm0 ; write the first 8-line result + movd [rdx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi], xmm0 + movd [rcx], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rsi + rax*2], xmm0 + movd [rdx + rax*2], xmm2 + psrldq xmm0, 4 + psrldq xmm2, 4 + movd [rdi + rax*2], xmm0 + movd [rcx + rax*2], xmm2 + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 16 db 0xfe +align 16 +t80: + times 16 db 0x80 +align 16 +t1s: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 +align 16 +ones: + times 8 dw 0x0001 +align 16 +s9: + times 8 dw 0x0900 +align 16 +s63: + times 8 dw 0x003f +align 16 +te0: + times 16 db 0xe0 +align 16 +t1f: + times 16 db 0x1f diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c new file mode 100644 index 0000000000..cfa13a2ddb --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8/common/loopfilter.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit, \ + const unsigned char *limit, const unsigned char *thresh, int count) + +#define prototype_loopfilter_nc(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit, \ + const unsigned char *limit, const unsigned char *thresh) + +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + +#if HAVE_SSE2 && VPX_ARCH_X86_64 +prototype_loopfilter(vp8_loop_filter_bv_y_sse2); +prototype_loopfilter(vp8_loop_filter_bh_y_sse2); +#else +prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); +#endif +prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; + +/* Horizontal MB filtering */ +#if HAVE_SSE2 +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr); + + if (u_ptr) { + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, v_ptr); + } +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr); + + if (u_ptr) { + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, v_ptr); + } +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { +#if VPX_ARCH_X86_64 + vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, + 2); +#else + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) { + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, + v_ptr + 4 * uv_stride); + } +} + +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { +#if VPX_ARCH_X86_64 + vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, + 2); +#else + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr); +#endif + + if (u_ptr) { + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, v_ptr + 4); + } +} + +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); +} + +#endif diff --git a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm new file mode 100644 index 0000000000..3ec2a99ec2 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -0,0 +1,289 @@ +; +; Copyright (c) 2012 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +globalsym(vp8_filter_by_weight16x16_sse2) +sym(vp8_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine: + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +globalsym(vp8_filter_by_weight8x8_sse2) +sym(vp8_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine: + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp8_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +globalsym(vp8_variance_and_sad_16x16_sse2) +sym(vp8_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate: + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. + pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 + diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm new file mode 100644 index 0000000000..01cf066837 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm @@ -0,0 +1,120 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +globalsym(vp8_copy_mem8x8_mmx) +sym(vp8_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +globalsym(vp8_copy_mem8x4_mmx) +sym(vp8_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm new file mode 100644 index 0000000000..17baf094ef --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm @@ -0,0 +1,118 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +globalsym(vp8_copy_mem16x16_sse2) +sym(vp8_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm new file mode 100644 index 0000000000..8f0f6fcc89 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -0,0 +1,270 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define vp8_filter_weight 128 +%define VP8_FILTER_SHIFT 7 + +SECTION .text + +;void vp8_filter_block1d_h6_mmx +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +globalsym(vp8_filter_block1d_h6_mmx) +sym(vp8_filter_block1d_h6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + + movq mm1, [rdx + 16] ; do both the negative taps first!!! + movq mm2, [rdx + 32] ; + movq mm6, [rdx + 48] ; + movq mm7, [rdx + 64] ; + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + movq mm3, [rsi-2] ; mm3 = p-2..p5 + movq mm4, mm3 ; mm4 = p-2..p5 + psrlq mm3, 8 ; mm3 = p-1..p5 + punpcklbw mm3, mm0 ; mm3 = p-1..p2 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + movq mm5, mm4 ; mm5 = p-2..p5 + punpckhbw mm4, mm0 ; mm5 = p2..p5 + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + movq mm4, mm5 ; mm4 = p-2..p5; + psrlq mm5, 16 ; mm5 = p0..p5; + punpcklbw mm5, mm0 ; mm5 = p0..p3 + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + movq mm5, mm4 ; mm5 = p-2..p5 + psrlq mm4, 24 ; mm4 = p1..p5 + punpcklbw mm4, mm0 ; mm4 = p1..p4 + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + ; do outer positive taps + movd mm4, [rsi+3] + punpcklbw mm4, mm0 ; mm5 = p3..p6 + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + paddsw mm3, [GLOBAL(rd)] ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and unpack to saturate + punpcklbw mm3, mm0 ; + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line + add rdi, rax; +%else + movsxd r8, dword ptr arg(2) ;src_pixels_per_line + add rdi, rax; + + add rsi, r8 ; next line +%endif + + dec rcx ; decrement count + jnz .nextrow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1dc_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int output_pitch, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +globalsym(vp8_filter_block1dc_v6_mmx) +sym(vp8_filter_block1dc_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [GLOBAL(rd)] + push rbx + mov rbx, arg(7) ;vp8_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! + movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(5) ;output_height + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +.nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. + lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz .nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global HIDDEN_DATA(sym(vp8_six_tap_x86)) +sym(vp8_six_tap_x86): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + times 8 dw 0 + + diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm new file mode 100644 index 0000000000..94e14aed6c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -0,0 +1,963 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + +SECTION .text + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +globalsym(vp8_filter_block1d8_h6_sse2) +sym(vp8_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +globalsym(vp8_filter_block1d16_h6_sse2) +sym(vp8_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + ; Load from 11 to avoid reading out of bounds. + movq xmm2, MMWORD PTR [rsi +11] + ; The lower bits are not cleared before 'or'ing with xmm1, + ; but that is OK because the values in the overlapping positions + ; are already equal to the ones in xmm1. + pslldq xmm2, 5 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +globalsym(vp8_filter_block1d8_v6_sse2) +sym(vp8_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_v6_sse2 +;( +; unsigned short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; const short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +globalsym(vp8_filter_block1d16_v6_sse2) +sym(vp8_filter_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d16_v6_sse2_loop: +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. + movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] + pmullw xmm1, [rax + 16] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm3, [rax + 64] + pmullw xmm4, [rax + 64] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm5, [rax + 32] + pmullw xmm6, [rax + 32] + + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 + movdqa xmm0, XMMWORD PTR [rsi + 16] + pmullw xmm7, [rax] + pmullw xmm0, [rax] + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + paddsw xmm1, xmm7 + paddsw xmm2, xmm0 + + add rsi, rdx + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm3, [rax + 48] + pmullw xmm4, [rax + 48] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm5, [rax + 80] + pmullw xmm6, [rax + 80] + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] + pxor xmm0, xmm0 ; clear xmm0 + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + + paddsw xmm1, xmm7 + paddsw xmm2, xmm7 + + psraw xmm1, 7 + psraw xmm2, 7 + + packuswb xmm1, xmm2 ; pack and saturate + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +globalsym(vp8_filter_block1d8_h6_only_sse2) +sym(vp8_filter_block1d8_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_only_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + + movq QWORD PTR [rdi], xmm4 ; store the results in the destination + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_only_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +globalsym(vp8_filter_block1d16_h6_only_sse2) +sym(vp8_filter_block1d16_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_only_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; lower 8 bytes + + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; higher 8 bytes + + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; Second-pass filter only when xoffset==0 +globalsym(vp8_filter_block1d8_v6_only_sse2) +sym(vp8_filter_block1d8_v6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + mov rax, arg(5) ;vp8_filter + + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_only_sse2_loop: + movq xmm1, MMWORD PTR [rsi] + movq xmm2, MMWORD PTR [rsi + rdx] + movq xmm3, MMWORD PTR [rsi + rdx * 2] + movq xmm5, MMWORD PTR [rsi + rdx * 4] + add rsi, rdx + movq xmm4, MMWORD PTR [rsi + rdx * 2] + movq xmm6, MMWORD PTR [rsi + rdx * 4] + + punpcklbw xmm1, xmm0 + pmullw xmm1, [rax] + + punpcklbw xmm2, xmm0 + pmullw xmm2, [rax + 16] + + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax + 32] + + punpcklbw xmm5, xmm0 + pmullw xmm5, [rax + 64] + + punpcklbw xmm4, xmm0 + pmullw xmm4, [rax + 48] + + punpcklbw xmm6, xmm0 + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +globalsym(vp8_unpack_block1d16_h6_sse2) +sym(vp8_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +.unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz .unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm new file mode 100644 index 0000000000..17247227db --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -0,0 +1,1515 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + +SECTION .text + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d8_h6_ssse3) +sym(vp8_filter_block1d8_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 + + movdqa xmm7, [GLOBAL(rd)] + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + mov rdi, arg(2) ;output_ptr + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d8_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx +;xmm3 free +.filter_block1d8_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + pmaddubsw xmm1, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm1 + paddsw xmm2, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + jnz .filter_block1d8_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] + movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] + + mov rsi, arg(0) ;src_ptr + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx + +.filter_block1d8_h4_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm2, xmm0 + pshufb xmm0, xmm3 + + pshufb xmm2, xmm4 + pmaddubsw xmm0, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz .filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d16_h6_ssse3) +sym(vp8_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d16_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + movq xmm3, MMWORD PTR [rsi + 6] + + pmaddubsw xmm1, xmm5 + movq xmm7, MMWORD PTR [rsi + 11] + + pmaddubsw xmm2, xmm6 + punpcklbw xmm3, xmm7 + + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + + pmaddubsw xmm3, xmm4 + paddsw xmm0, xmm2 + + movdqa xmm2, xmm1 + paddsw xmm0, [GLOBAL(rd)] + + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + + psraw xmm0, 7 + pmaddubsw xmm1, xmm5 + + pmaddubsw xmm2, xmm6 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + + paddsw xmm3, xmm2 + + paddsw xmm3, [GLOBAL(rd)] + + psraw xmm3, 7 + + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d4_h6_ssse3) +sym(vp8_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + movdqa xmm7, [GLOBAL(rd)] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +.filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf1b)] + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2b)] + pmaddubsw xmm0, xmm4 + pshufb xmm2, [GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movd DWORD PTR [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d4_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] + pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + psraw xmm1, 7 + packuswb xmm1, xmm1 + + movd DWORD PTR [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp8_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d16_v6_ssse3) +sym(vp8_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + +.vp8_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + +.vp8_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [GLOBAL(rd)] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [GLOBAL(rd)] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d8_v6_ssse3) +sym(vp8_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [GLOBAL(rd)] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d4_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +globalsym(vp8_filter_block1d4_v6_ssse3) +sym(vp8_filter_block1d4_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_v4_ssse3 + + movq mm5, MMWORD PTR [rax] ;k0_k5 + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v6_ssse3_loop: + movd mm1, DWORD PTR [rsi] ;A + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + movd mm0, DWORD PTR [rax + rdx * 4] ;F + + movq mm4, [GLOBAL(rd)] + + pmaddubsw mm3, mm6 + punpcklbw mm1, mm0 ;A F + pmaddubsw mm2, mm7 + pmaddubsw mm1, mm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm1 + paddsw mm2, mm4 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_v4_ssse3: + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + movq mm5, MMWORD PTR [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v4_ssse3_loop: + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + pmaddubsw mm3, mm6 + pmaddubsw mm2, mm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm5 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict16x16_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +globalsym(vp8_bilinear_predict16x16_ssse3) +sym(vp8_bilinear_predict16x16_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(2) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 4 + lea rax, [rax + rcx] ; HFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ; src_pixels_per_line + + movdqa xmm2, [rax] + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ; dst_pitch +%endif + movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 + + punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 + pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm6, xmm5 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm6, xmm1 + + punpcklbw xmm4, xmm5 + pmaddubsw xmm4, xmm1 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + packuswb xmm6, xmm4 + movdqa xmm5, xmm7 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm2 + + punpckhbw xmm7, xmm6 + pmaddubsw xmm7, xmm2 + + paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value + psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm5, xmm7 + movdqa xmm7, xmm6 + + movdqa [rdi], xmm5 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ; dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + + ; get the first horizontal line done + movq xmm4, [rsi] ; load row 0 + movq xmm2, [rsi + 8] ; load row 0 + + lea rsi, [rsi + rax] ; next line +.next_row_sp: + movq xmm3, [rsi] ; load row + 1 + movq xmm5, [rsi + 8] ; load row + 1 + + punpcklbw xmm4, xmm3 + punpcklbw xmm2, xmm5 + + pmaddubsw xmm4, xmm1 + movq xmm7, [rsi + rax] ; load row + 2 + + pmaddubsw xmm2, xmm1 + movq xmm6, [rsi + rax + 8] ; load row + 2 + + punpcklbw xmm3, xmm7 + punpcklbw xmm5, xmm6 + + pmaddubsw xmm3, xmm1 + paddw xmm4, [GLOBAL(rd)] + + pmaddubsw xmm5, xmm1 + paddw xmm2, [GLOBAL(rd)] + + psraw xmm4, VP8_FILTER_SHIFT + psraw xmm2, VP8_FILTER_SHIFT + + packuswb xmm4, xmm2 + paddw xmm3, [GLOBAL(rd)] + + movdqa [rdi], xmm4 ; store row 0 + paddw xmm5, [GLOBAL(rd)] + + psraw xmm3, VP8_FILTER_SHIFT + psraw xmm5, VP8_FILTER_SHIFT + + packuswb xmm3, xmm5 + movdqa xmm4, xmm7 + + movdqa [rdi + rdx],xmm3 ; store row 1 + lea rsi, [rsi + 2*rax] + + movdqa xmm2, xmm6 + lea rdi, [rdi + 2*rdx] + + cmp rdi, rcx + jne .next_row_sp + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + +.next_row_fp: + movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm2, xmm4 + movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 + + pmaddubsw xmm2, xmm1 + movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rax] ; next line + punpcklbw xmm3, xmm4 + + pmaddubsw xmm3, xmm1 + movq xmm5, [rsi] + + paddw xmm2, [GLOBAL(rd)] + movq xmm7, [rsi+1] + + movq xmm6, [rsi+8] + psraw xmm2, VP8_FILTER_SHIFT + + punpcklbw xmm5, xmm7 + movq xmm7, [rsi+9] + + paddw xmm3, [GLOBAL(rd)] + pmaddubsw xmm5, xmm1 + + psraw xmm3, VP8_FILTER_SHIFT + punpcklbw xmm6, xmm7 + + packuswb xmm2, xmm3 + pmaddubsw xmm6, xmm1 + + movdqa [rdi], xmm2 ; store the results in the destination + paddw xmm5, [GLOBAL(rd)] + + lea rdi, [rdi + rdx] ; dst_pitch + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm6, VP8_FILTER_SHIFT + + packuswb xmm5, xmm6 + lea rsi, [rsi + rax] ; next line + + movdqa [rdi], xmm5 ; store the results in the destination + lea rdi, [rdi + rdx] ; dst_pitch + + cmp rdi, rcx + + jne .next_row_fp + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict8x8_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +globalsym(vp8_bilinear_predict8x8_ssse3) +sym(vp8_bilinear_predict8x8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ; xoffset + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b8x8_sp_only + + shl rax, 4 + add rax, rcx ; HFilter + + mov rdi, arg(4) ; dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b8x8_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm1, [rax] + + ; get the first horizontal line done + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx + + psrldq xmm5, 1 + lea rsp, [rsp + 16] ; next line + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + lea rsp, [rsp + 16] ; next line + + movdqa xmm5, xmm6 + + psrldq xmm5, 1 + + punpcklbw xmm6, xmm5 + pmaddubsw xmm6, xmm0 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + packuswb xmm6, xmm6 + + punpcklbw xmm7, xmm6 + pmaddubsw xmm7, xmm1 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm7, xmm7 + + movq [rdi], xmm7 ; store the results in the destination + lea rdi, [rdi + rdx] + + movdqa xmm7, xmm6 + + cmp rdi, rcx + jne .next_row + + jmp .done8x8 + +.b8x8_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] ; VFilter + + movq xmm1, XMMWORD PTR [rsp] + movq xmm2, XMMWORD PTR [rsp+16] + + movq xmm3, XMMWORD PTR [rsp+32] + punpcklbw xmm1, xmm2 + + movq xmm4, XMMWORD PTR [rsp+48] + punpcklbw xmm2, xmm3 + + movq xmm5, XMMWORD PTR [rsp+64] + punpcklbw xmm3, xmm4 + + movq xmm6, XMMWORD PTR [rsp+80] + punpcklbw xmm4, xmm5 + + movq xmm7, XMMWORD PTR [rsp+96] + punpcklbw xmm5, xmm6 + + ; Because the source register (xmm0) is always treated as signed by + ; pmaddubsw, the constant '128' is treated as '-128'. + pmaddubsw xmm1, xmm0 + pmaddubsw xmm2, xmm0 + + pmaddubsw xmm3, xmm0 + pmaddubsw xmm4, xmm0 + + pmaddubsw xmm5, xmm0 + punpcklbw xmm6, xmm7 + + pmaddubsw xmm6, xmm0 + paddw xmm1, [GLOBAL(rd)] + + paddw xmm2, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm2, VP8_FILTER_SHIFT + + paddw xmm4, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + psraw xmm6, VP8_FILTER_SHIFT + + ; Having multiplied everything by '-128' and obtained negative + ; numbers, the unsigned saturation truncates those values to 0, + ; resulting in incorrect handling of xoffset == 0 && yoffset == 0 + packuswb xmm1, xmm1 + + packuswb xmm2, xmm2 + movq [rdi], xmm1 + + packuswb xmm3, xmm3 + movq [rdi+rdx], xmm2 + + packuswb xmm4, xmm4 + movq xmm1, XMMWORD PTR [rsp+112] + + lea rdi, [rdi + 2*rdx] + movq xmm2, XMMWORD PTR [rsp+128] + + packuswb xmm5, xmm5 + movq [rdi], xmm3 + + packuswb xmm6, xmm6 + movq [rdi+rdx], xmm4 + + lea rdi, [rdi + 2*rdx] + punpcklbw xmm7, xmm1 + + movq [rdi], xmm5 + pmaddubsw xmm7, xmm0 + + movq [rdi+rdx], xmm6 + punpcklbw xmm1, xmm2 + + pmaddubsw xmm1, xmm0 + paddw xmm7, [GLOBAL(rd)] + + psraw xmm7, VP8_FILTER_SHIFT + paddw xmm1, [GLOBAL(rd)] + + psraw xmm1, VP8_FILTER_SHIFT + packuswb xmm7, xmm7 + + packuswb xmm1, xmm1 + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm7 + + movq [rdi+rdx], xmm1 + lea rsp, [rsp + 144] + + jmp .done8x8 + +.b8x8_fp_only: + lea rcx, [rdi+rdx*8] + +.next_row_fp: + movdqa xmm1, XMMWORD PTR [rsp] + movdqa xmm3, XMMWORD PTR [rsp+16] + + movdqa xmm2, xmm1 + movdqa xmm5, XMMWORD PTR [rsp+32] + + psrldq xmm2, 1 + movdqa xmm7, XMMWORD PTR [rsp+48] + + movdqa xmm4, xmm3 + psrldq xmm4, 1 + + movdqa xmm6, xmm5 + psrldq xmm6, 1 + + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xmm0 + + punpcklbw xmm3, xmm4 + pmaddubsw xmm3, xmm0 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm0 + + movdqa xmm2, xmm7 + psrldq xmm2, 1 + + punpcklbw xmm7, xmm2 + pmaddubsw xmm7, xmm0 + + paddw xmm1, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm7, [GLOBAL(rd)] + psraw xmm7, VP8_FILTER_SHIFT + + packuswb xmm1, xmm1 + packuswb xmm3, xmm3 + + packuswb xmm5, xmm5 + movq [rdi], xmm1 + + packuswb xmm7, xmm7 + movq [rdi+rdx], xmm3 + + lea rdi, [rdi + 2*rdx] + movq [rdi], xmm5 + + lea rsp, [rsp + 4*16] + movq [rdi+rdx], xmm7 + + lea rdi, [rdi + 2*rdx] + cmp rdi, rcx + + jne .next_row_fp + + lea rsp, [rsp + 16] + +.done8x8: + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +shuf2bfrom1: + db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 +align 16 +shuf3bfrom1: + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 + +align 16 +rd: + times 8 dw 0x40 + +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 + diff --git a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c new file mode 100644 index 0000000000..7fb83c2d5e --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/mem.h" + +extern const short vp8_six_tap_x86[8][6 * 8]; + +extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter); +extern void vp8_filter_block1dc_v6_mmx( + unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch, + unsigned int pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const short *vp8_filter); +extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter); +extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter); +extern void vp8_filter_block1d8_v6_sse2( + unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, + unsigned int pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const short *vp8_filter); +extern void vp8_filter_block1d16_v6_sse2( + unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, + unsigned int pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const short *vp8_filter); +extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width); +extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter); +extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter); +extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter); + +#if HAVE_MMX +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned short, + FData2[16 * 16]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, + src_pixels_per_line, 1, 9, 8, HFilter); + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, + VFilter); +} +#endif + +#if HAVE_SSE2 +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned short, + FData2[24 * 24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + if (xoffset) { + if (yoffset) { + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, + src_pixels_per_line, 1, 21, 32, HFilter); + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, + dst_pitch, VFilter); + } else { + /* First-pass only */ + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 16, HFilter); + } + } else { + /* Second-pass only */ + VFilter = vp8_six_tap_x86[yoffset]; + vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, + src_pixels_per_line, 21, 32); + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, + dst_pitch, VFilter); + } +} + +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + DECLARE_ALIGNED(16, unsigned short, + FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) { + if (yoffset) { + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, + src_pixels_per_line, 1, 13, 16, HFilter); + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, + dst_pitch, VFilter); + } else { + /* First-pass only */ + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 8, HFilter); + } + } else { + /* Second-pass only */ + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 8, + VFilter); + } +} + +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + DECLARE_ALIGNED(16, unsigned short, + FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) { + if (yoffset) { + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, + src_pixels_per_line, 1, 9, 16, HFilter); + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, + dst_pitch, VFilter); + } else { + /* First-pass only */ + HFilter = vp8_six_tap_x86[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 4, HFilter); + } + } else { + /* Second-pass only */ + VFilter = vp8_six_tap_x86[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 4, + VFilter); + } +} + +#endif + +#if HAVE_SSSE3 + +extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index); + +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); + + if (xoffset) { + if (yoffset) { + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, 16, 21, + xoffset); + vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, + yoffset); + } else { + /* First-pass only */ + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 16, xoffset); + } + } else { + if (yoffset) { + /* Second-pass only */ + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 16, + yoffset); + } else { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) { + if (yoffset) { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, 8, 13, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); + } else { + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 8, xoffset); + } + } else { + if (yoffset) { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 8, + yoffset); + } else { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) { + if (yoffset) { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, 8, 9, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); + } else { + /* First-pass only */ + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 4, xoffset); + } + } else { + if (yoffset) { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 4, + yoffset); + } else { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]); + + if (xoffset) { + if (yoffset) { + vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, 4, 9, xoffset); + vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset); + } else { + vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, + dst_pitch, 4, xoffset); + } + } else { + if (yoffset) { + vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, dst_ptr, dst_pitch, 4, + yoffset); + } else { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + int r; + + for (r = 0; r < 4; ++r) { + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; + } + } + } +} + +#endif diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.c b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c new file mode 100644 index 0000000000..11099c453c --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "dboolhuff.h" +#include "vp8/common/common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, + unsigned int source_sz, vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + if (source_sz && !source) return 1; + + // To simplify calling code this fuction can be called with |source| == null + // and |source_sz| == 0. This and vp8dx_bool_decoder_fill() are essentially + // no-ops in this case. + // Work around a ubsan warning with a ternary to avoid adding 0 to null. + br->user_buffer_end = source ? source + source_sz : source; + br->user_buffer = source; + br->value = 0; + br->count = -8; + br->range = 255; + br->decrypt_cb = decrypt_cb; + br->decrypt_state = decrypt_state; + + /* Populate the buffer */ + vp8dx_bool_decoder_fill(br); + + return 0; +} + +void vp8dx_bool_decoder_fill(BOOL_DECODER *br) { + const unsigned char *bufptr = br->user_buffer; + VP8_BD_VALUE value = br->value; + int count = br->count; + int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); + size_t bytes_left = br->user_buffer_end - bufptr; + size_t bits_left = bytes_left * CHAR_BIT; + int x = shift + CHAR_BIT - (int)bits_left; + int loop_end = 0; + unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1]; + + if (br->decrypt_cb) { + size_t n = VPXMIN(sizeof(decrypted), bytes_left); + br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n); + bufptr = decrypted; + } + + if (x >= 0) { + count += VP8_LOTS_OF_BITS; + loop_end = x; + } + + if (x < 0 || bits_left) { + while (shift >= loop_end) { + count += CHAR_BIT; + value |= (VP8_BD_VALUE)*bufptr << shift; + ++bufptr; + ++br->user_buffer; + shift -= CHAR_BIT; + } + } + + br->value = value; + br->count = count; +} diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.h b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h new file mode 100644 index 0000000000..673b2fbd5d --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_DBOOLHUFF_H_ +#define VPX_VP8_DECODER_DBOOLHUFF_H_ + +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/compiler_attributes.h" +#include "vpx_ports/mem.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef size_t VP8_BD_VALUE; + +#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE) * CHAR_BIT) + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define VP8_LOTS_OF_BITS (0x40000000) + +typedef struct { + const unsigned char *user_buffer_end; + const unsigned char *user_buffer; + VP8_BD_VALUE value; + int count; + unsigned int range; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; +} BOOL_DECODER; + +DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); + +int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, + unsigned int source_sz, vpx_decrypt_cb decrypt_cb, + void *decrypt_state); + +void vp8dx_bool_decoder_fill(BOOL_DECODER *br); + +static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br, + int probability) { + unsigned int bit = 0; + VP8_BD_VALUE value; + unsigned int split; + VP8_BD_VALUE bigsplit; + int count; + unsigned int range; + + split = 1 + (((br->range - 1) * probability) >> 8); + + if (br->count < 0) vp8dx_bool_decoder_fill(br); + + value = br->value; + count = br->count; + + bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); + + range = split; + + if (value >= bigsplit) { + range = br->range - split; + value = value - bigsplit; + bit = 1; + } + + { + const unsigned char shift = vp8_norm[(unsigned char)range]; + range <<= shift; + value <<= shift; + count -= shift; + } + br->value = value; + br->count = count; + br->range = range; + + return bit; +} + +static INLINE int vp8_decode_value(BOOL_DECODER *br, int bits) { + int z = 0; + int bit; + + for (bit = bits - 1; bit >= 0; bit--) { + z |= (vp8dx_decode_bool(br, 0x80) << bit); + } + + return z; +} + +static INLINE int vp8dx_bool_error(BOOL_DECODER *br) { + /* Check if we have reached the end of the buffer. + * + * Variable 'count' stores the number of bits in the 'value' buffer, minus + * 8. The top byte is part of the algorithm, and the remainder is buffered + * to be shifted into it. So if count == 8, the top 16 bits of 'value' are + * occupied, 8 for the algorithm and 8 in the buffer. + * + * When reading a byte from the user's buffer, count is filled with 8 and + * one byte is filled into the value buffer. When we reach the end of the + * data, count is additionally filled with VP8_LOTS_OF_BITS. So when + * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted. + */ + if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS)) { + /* We have tried to decode bits after the end of + * stream was encountered. + */ + return 1; + } + + /* No error. */ + return 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_DBOOLHUFF_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/decodeframe.c b/media/libvpx/libvpx/vp8/decoder/decodeframe.c new file mode 100644 index 0000000000..af9a98c1de --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/decodeframe.c @@ -0,0 +1,1263 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "onyxd_int.h" +#include "vp8/common/header.h" +#include "vp8/common/reconintra4x4.h" +#include "vp8/common/reconinter.h" +#include "detokenize.h" +#include "vp8/common/common.h" +#include "vp8/common/invtrans.h" +#include "vp8/common/alloccommon.h" +#include "vp8/common/entropymode.h" +#include "vp8/common/quant_common.h" +#include "vpx_scale/vpx_scale.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/setupintrarecon.h" + +#include "decodemv.h" +#include "vp8/common/extend.h" +#if CONFIG_ERROR_CONCEALMENT +#include "error_concealment.h" +#endif +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/threading.h" +#include "decoderthreading.h" +#include "dboolhuff.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#include +#include + +void vp8cx_init_de_quantizer(VP8D_COMP *pbi) { + int Q; + VP8_COMMON *const pc = &pbi->common; + + for (Q = 0; Q < QINDEX_RANGE; ++Q) { + pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q); + pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q); + pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q); + + pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q); + pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q); + pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q); + } +} + +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { + int i; + int QIndex; + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + VP8_COMMON *const pc = &pbi->common; + + /* Decide whether to use the default or alternate baseline Q value. */ + if (xd->segmentation_enabled) { + /* Abs Value */ + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; + + /* Delta Value */ + } else { + QIndex = pc->base_qindex + + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; + } + + QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) + : 0; /* Clamp to valid range */ + } else { + QIndex = pc->base_qindex; + } + + /* Set up the macroblock dequant constants */ + xd->dequant_y1_dc[0] = 1; + xd->dequant_y1[0] = pc->Y1dequant[QIndex][0]; + xd->dequant_y2[0] = pc->Y2dequant[QIndex][0]; + xd->dequant_uv[0] = pc->UVdequant[QIndex][0]; + + for (i = 1; i < 16; ++i) { + xd->dequant_y1_dc[i] = xd->dequant_y1[i] = pc->Y1dequant[QIndex][1]; + xd->dequant_y2[i] = pc->Y2dequant[QIndex][1]; + xd->dequant_uv[i] = pc->UVdequant[QIndex][1]; + } +} + +static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, + unsigned int mb_idx) { + MB_PREDICTION_MODE mode; + int i; +#if CONFIG_ERROR_CONCEALMENT + int corruption_detected = 0; +#else + (void)mb_idx; +#endif + + if (xd->mode_info_context->mbmi.mb_skip_coeff) { + vp8_reset_mb_tokens_context(xd); + } else if (!vp8dx_bool_error(xd->current_bc)) { + int eobtotal; + eobtotal = vp8_decode_mb_tokens(pbi, xd); + + /* Special case: Force the loopfilter to skip when eobtotal is zero */ + xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal == 0); + } + + mode = xd->mode_info_context->mbmi.mode; + + if (xd->segmentation_enabled) vp8_mb_init_dequantizer(pbi, xd); + +#if CONFIG_ERROR_CONCEALMENT + + if (pbi->ec_active) { + int throw_residual; + /* When we have independent partitions we can apply residual even + * though other partitions within the frame are corrupt. + */ + throw_residual = + (!pbi->independent_partitions && pbi->frame_corrupt_residual); + throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); + + if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) { + /* MB with corrupt residuals or corrupt mode/motion vectors. + * Better to use the predictor as reconstruction. + */ + pbi->frame_corrupt_residual = 1; + memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + + corruption_detected = 1; + + /* force idct to be skipped for B_PRED and use the + * prediction only for reconstruction + * */ + memset(xd->eobs, 0, 25); + } + } +#endif + + /* do prediction */ + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + vp8_build_intra_predictors_mbuv_s( + xd, xd->recon_above[1], xd->recon_above[2], xd->recon_left[1], + xd->recon_left[2], xd->recon_left_stride[1], xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride); + + if (mode != B_PRED) { + vp8_build_intra_predictors_mby_s( + xd, xd->recon_above[0], xd->recon_left[0], xd->recon_left_stride[0], + xd->dst.y_buffer, xd->dst.y_stride); + } else { + short *DQC = xd->dequant_y1; + int dst_stride = xd->dst.y_stride; + + /* clear out residual eob info */ + if (xd->mode_info_context->mbmi.mb_skip_coeff) memset(xd->eobs, 0, 25); + + intra_prediction_down_copy(xd, xd->recon_above[0] + 16); + + for (i = 0; i < 16; ++i) { + BLOCKD *b = &xd->block[i]; + unsigned char *dst = xd->dst.y_buffer + b->offset; + B_PREDICTION_MODE b_mode = xd->mode_info_context->bmi[i].as_mode; + unsigned char *Above = dst - dst_stride; + unsigned char *yleft = dst - 1; + int left_stride = dst_stride; + unsigned char top_left = Above[-1]; + + vp8_intra4x4_predict(Above, yleft, left_stride, b_mode, dst, dst_stride, + top_left); + + if (xd->eobs[i]) { + if (xd->eobs[i] > 1) { + vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride); + } else { + vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0], dst, dst_stride, dst, + dst_stride); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + } + } + } + } else { + vp8_build_inter_predictors_mb(xd); + } + +#if CONFIG_ERROR_CONCEALMENT + if (corruption_detected) { + return; + } +#endif + + if (!xd->mode_info_context->mbmi.mb_skip_coeff) { + /* dequantization and idct */ + if (mode != B_PRED) { + short *DQC = xd->dequant_y1; + + if (mode != SPLITMV) { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) { + vp8_dequantize_b(b, xd->dequant_y2); + + vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff); + memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0])); + } else { + b->dqcoeff[0] = (short)(b->qcoeff[0] * xd->dequant_y2[0]); + vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + + /* override the dc dequant constant in order to preserve the + * dc components + */ + DQC = xd->dequant_y1_dc; + } + + vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); + } + + vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs + 16); + } +} + +static int get_delta_q(vp8_reader *bc, int prev, int *q_update) { + int ret_val = 0; + + if (vp8_read_bit(bc)) { + ret_val = vp8_read_literal(bc, 4); + + if (vp8_read_bit(bc)) ret_val = -ret_val; + } + + /* Trigger a quantizer update if the delta-q value has changed */ + if (ret_val != prev) *q_update = 1; + + return ret_val; +} + +#ifdef PACKET_TESTING +#include +FILE *vpxlog = 0; +#endif + +static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf) { + int i; + unsigned char *src_ptr1; + unsigned char *dest_ptr1; + + unsigned int Border; + int plane_stride; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + src_ptr1 = ybf->y_buffer - Border; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + + for (i = 0; i < (int)Border; ++i) { + memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } + + /***********/ + /* U Plane */ + /***********/ + plane_stride = ybf->uv_stride; + Border /= 2; + src_ptr1 = ybf->u_buffer - Border; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + + for (i = 0; i < (int)(Border); ++i) { + memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + src_ptr1 = ybf->v_buffer - Border; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + + for (i = 0; i < (int)(Border); ++i) { + memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } +} + +static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf) { + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)Border; ++i) { + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } + + /***********/ + /* U Plane */ + /***********/ + plane_stride = ybf->uv_stride; + plane_height = ybf->uv_height; + Border /= 2; + + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); ++i) { + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); ++i) { + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } +} + +static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf, + unsigned char *y_src, + unsigned char *u_src, + unsigned char *v_src) { + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = 16; + plane_width = ybf->y_width; + + /* copy the left and right most columns out */ + src_ptr1 = y_src; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; ++i) { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* U Plane */ + /***********/ + plane_stride = ybf->uv_stride; + plane_height = 8; + plane_width = ybf->uv_width; + Border /= 2; + + /* copy the left and right most columns out */ + src_ptr1 = u_src; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; ++i) { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + /* copy the left and right most columns out */ + src_ptr1 = v_src; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; ++i) { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} + +static void decode_mb_rows(VP8D_COMP *pbi) { + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + MODE_INFO *lf_mic = xd->mode_info_context; + + int ibc = 0; + int num_part = 1 << pc->multi_token_partition; + + int recon_yoffset, recon_uvoffset; + int mb_row, mb_col; + int mb_idx = 0; + + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + + int recon_y_stride = yv12_fb_new->y_stride; + int recon_uv_stride = yv12_fb_new->uv_stride; + + unsigned char *ref_buffer[MAX_REF_FRAMES][3]; + unsigned char *dst_buffer[3]; + unsigned char *lf_dst[3]; + unsigned char *eb_dst[3]; + int i; + int ref_fb_corrupted[MAX_REF_FRAMES]; + + ref_fb_corrupted[INTRA_FRAME] = 0; + + for (i = 1; i < MAX_REF_FRAMES; ++i) { + YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i]; + + ref_buffer[i][0] = this_fb->y_buffer; + ref_buffer[i][1] = this_fb->u_buffer; + ref_buffer[i][2] = this_fb->v_buffer; + + ref_fb_corrupted[i] = this_fb->corrupted; + } + + /* Set up the buffer pointers */ + eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer; + eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer; + eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer; + + xd->up_available = 0; + + /* Initialize the loop filter for this frame. */ + if (pc->filter_level) vp8_loop_filter_frame_init(pc, xd, pc->filter_level); + + vp8_setup_intra_recon_top_line(yv12_fb_new); + + /* Decode the individual macro block */ + for (mb_row = 0; mb_row < pc->mb_rows; ++mb_row) { + if (num_part > 1) { + xd->current_bc = &pbi->mbc[ibc]; + ibc++; + + if (ibc == num_part) ibc = 0; + } + + recon_yoffset = mb_row * recon_y_stride * 16; + recon_uvoffset = mb_row * recon_uv_stride * 8; + + /* reset contexts */ + xd->above_context = pc->above_context; + memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + + xd->left_available = 0; + + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + + xd->recon_above[0] = dst_buffer[0] + recon_yoffset; + xd->recon_above[1] = dst_buffer[1] + recon_uvoffset; + xd->recon_above[2] = dst_buffer[2] + recon_uvoffset; + + xd->recon_left[0] = xd->recon_above[0] - 1; + xd->recon_left[1] = xd->recon_above[1] - 1; + xd->recon_left[2] = xd->recon_above[2] - 1; + + xd->recon_above[0] -= xd->dst.y_stride; + xd->recon_above[1] -= xd->dst.uv_stride; + xd->recon_above[2] -= xd->dst.uv_stride; + + /* TODO: move to outside row loop */ + xd->recon_left_stride[0] = xd->dst.y_stride; + xd->recon_left_stride[1] = xd->dst.uv_stride; + + setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1], + xd->recon_left[2], xd->dst.y_stride, + xd->dst.uv_stride); + + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + /* Distance of Mb to the various image edges. + * These are specified to 8th pel as they are always compared to values + * that are in 1/8th pel units + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3; + +#if CONFIG_ERROR_CONCEALMENT + { + int corrupt_residual = + (!pbi->independent_partitions && pbi->frame_corrupt_residual) || + vp8dx_bool_error(xd->current_bc); + if (pbi->ec_active && + xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME && + corrupt_residual) { + /* We have an intra block with corrupt coefficients, better to + * conceal with an inter block. Interpolate MVs from neighboring + * MBs. + * + * Note that for the first mb with corrupt residual in a frame, + * we might not discover that before decoding the residual. That + * happens after this check, and therefore no inter concealment + * will be done. + */ + vp8_interpolate_motion(xd, mb_row, mb_col, pc->mb_rows, pc->mb_cols); + } + } +#endif + + xd->dst.y_buffer = dst_buffer[0] + recon_yoffset; + xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; + xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; + + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used. + xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } + + /* propagate errors from reference frames */ + xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + + decode_macroblock(pbi, xd, mb_idx); + + mb_idx++; + xd->left_available = 1; + + /* check if the boolean decoder has suffered an error */ + xd->corrupted |= vp8dx_bool_error(xd->current_bc); + + xd->recon_above[0] += 16; + xd->recon_above[1] += 8; + xd->recon_above[2] += 8; + xd->recon_left[0] += 16; + xd->recon_left[1] += 8; + xd->recon_left[2] += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + ++xd->mode_info_context; /* next mb */ + + xd->above_context++; + } + + /* adjust to the next row of mbs */ + vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, + xd->dst.v_buffer + 8); + + ++xd->mode_info_context; /* skip prediction column */ + xd->up_available = 1; + + if (pc->filter_level) { + if (mb_row > 0) { + if (pc->filter_type == NORMAL_LOOPFILTER) { + vp8_loop_filter_row_normal(pc, lf_mic, mb_row - 1, recon_y_stride, + recon_uv_stride, lf_dst[0], lf_dst[1], + lf_dst[2]); + } else { + vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride, + lf_dst[0]); + } + if (mb_row > 1) { + yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], + eb_dst[2]); + + eb_dst[0] += recon_y_stride * 16; + eb_dst[1] += recon_uv_stride * 8; + eb_dst[2] += recon_uv_stride * 8; + } + + lf_dst[0] += recon_y_stride * 16; + lf_dst[1] += recon_uv_stride * 8; + lf_dst[2] += recon_uv_stride * 8; + lf_mic += pc->mb_cols; + lf_mic++; /* Skip border mb */ + } + } else { + if (mb_row > 0) { + /**/ + yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], + eb_dst[2]); + eb_dst[0] += recon_y_stride * 16; + eb_dst[1] += recon_uv_stride * 8; + eb_dst[2] += recon_uv_stride * 8; + } + } + } + + if (pc->filter_level) { + if (pc->filter_type == NORMAL_LOOPFILTER) { + vp8_loop_filter_row_normal(pc, lf_mic, mb_row - 1, recon_y_stride, + recon_uv_stride, lf_dst[0], lf_dst[1], + lf_dst[2]); + } else { + vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride, + lf_dst[0]); + } + + yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], + eb_dst[2]); + eb_dst[0] += recon_y_stride * 16; + eb_dst[1] += recon_uv_stride * 8; + eb_dst[2] += recon_uv_stride * 8; + } + yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], eb_dst[2]); + yv12_extend_frame_top_c(yv12_fb_new); + yv12_extend_frame_bottom_c(yv12_fb_new); +} + +static unsigned int read_partition_size(VP8D_COMP *pbi, + const unsigned char *cx_size) { + unsigned char temp[3]; + if (pbi->decrypt_cb) { + pbi->decrypt_cb(pbi->decrypt_state, cx_size, temp, 3); + cx_size = temp; + } + return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16); +} + +static int read_is_valid(const unsigned char *start, size_t len, + const unsigned char *end) { + return len != 0 && end > start && len <= (size_t)(end - start); +} + +static unsigned int read_available_partition_size( + VP8D_COMP *pbi, const unsigned char *token_part_sizes, + const unsigned char *fragment_start, + const unsigned char *first_fragment_end, const unsigned char *fragment_end, + int i, int num_part) { + VP8_COMMON *pc = &pbi->common; + const unsigned char *partition_size_ptr = token_part_sizes + i * 3; + unsigned int partition_size = 0; + ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } + /* Calculate the length of this partition. The last partition + * size is implicit. If the partition size can't be read, then + * either use the remaining data in the buffer (for EC mode) + * or throw an error. + */ + if (i < num_part - 1) { + if (read_is_valid(partition_size_ptr, 3, first_fragment_end)) { + partition_size = read_partition_size(pbi, partition_size_ptr); + } else if (pbi->ec_active) { + partition_size = (unsigned int)bytes_left; + } else { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated partition size data"); + } + } else { + partition_size = (unsigned int)bytes_left; + } + + /* Validate the calculated partition length. If the buffer + * described by the partition can't be fully read, then restrict + * it to the portion that can be (for EC mode) or throw an error. + */ + if (!read_is_valid(fragment_start, partition_size, fragment_end)) { + if (pbi->ec_active) { + partition_size = (unsigned int)bytes_left; + } else { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition " + "%d length", + i + 1); + } + } + return partition_size; +} + +static void setup_token_decoder(VP8D_COMP *pbi, + const unsigned char *token_part_sizes) { + vp8_reader *bool_decoder = &pbi->mbc[0]; + unsigned int partition_idx; + unsigned int fragment_idx; + unsigned int num_token_partitions; + const unsigned char *first_fragment_end = + pbi->fragments.ptrs[0] + pbi->fragments.sizes[0]; + + TOKEN_PARTITION multi_token_partition = + (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2); + if (!vp8dx_bool_error(&pbi->mbc[8])) { + pbi->common.multi_token_partition = multi_token_partition; + } + num_token_partitions = 1 << pbi->common.multi_token_partition; + + /* Check for partitions within the fragments and unpack the fragments + * so that each fragment pointer points to its corresponding partition. */ + for (fragment_idx = 0; fragment_idx < pbi->fragments.count; ++fragment_idx) { + unsigned int fragment_size = pbi->fragments.sizes[fragment_idx]; + const unsigned char *fragment_end = + pbi->fragments.ptrs[fragment_idx] + fragment_size; + /* Special case for handling the first partition since we have already + * read its size. */ + if (fragment_idx == 0) { + /* Size of first partition + token partition sizes element */ + ptrdiff_t ext_first_part_size = token_part_sizes - + pbi->fragments.ptrs[0] + + 3 * (num_token_partitions - 1); + if (fragment_size < (unsigned int)ext_first_part_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); + fragment_size -= (unsigned int)ext_first_part_size; + if (fragment_size > 0) { + pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size; + /* The fragment contains an additional partition. Move to + * next. */ + fragment_idx++; + pbi->fragments.ptrs[fragment_idx] = + pbi->fragments.ptrs[0] + pbi->fragments.sizes[0]; + } + } + /* Split the chunk into partitions read from the bitstream */ + while (fragment_size > 0) { + ptrdiff_t partition_size = read_available_partition_size( + pbi, token_part_sizes, pbi->fragments.ptrs[fragment_idx], + first_fragment_end, fragment_end, fragment_idx - 1, + num_token_partitions); + pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size; + if (fragment_size < (unsigned int)partition_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); + fragment_size -= (unsigned int)partition_size; + assert(fragment_idx <= num_token_partitions); + if (fragment_size > 0) { + /* The fragment contains an additional partition. + * Move to next. */ + fragment_idx++; + pbi->fragments.ptrs[fragment_idx] = + pbi->fragments.ptrs[fragment_idx - 1] + partition_size; + } + } + } + + pbi->fragments.count = num_token_partitions + 1; + + for (partition_idx = 1; partition_idx < pbi->fragments.count; + ++partition_idx) { + if (vp8dx_start_decode(bool_decoder, pbi->fragments.ptrs[partition_idx], + pbi->fragments.sizes[partition_idx], pbi->decrypt_cb, + pbi->decrypt_state)) { + vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", partition_idx); + } + + bool_decoder++; + } + +#if CONFIG_MULTITHREAD + /* Clamp number of decoder threads */ + if (pbi->decoding_thread_count > num_token_partitions - 1) { + pbi->decoding_thread_count = num_token_partitions - 1; + } + if ((int)pbi->decoding_thread_count > pbi->common.mb_rows - 1) { + assert(pbi->common.mb_rows > 0); + pbi->decoding_thread_count = pbi->common.mb_rows - 1; + } +#endif +} + +static void init_frame(VP8D_COMP *pbi) { + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + if (pc->frame_type == KEY_FRAME) { + /* Various keyframe initializations */ + memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context)); + + vp8_init_mbmode_probs(pc); + + vp8_default_coef_probs(pc); + + /* reset the segment feature data to 0 with delta coding (Default state). */ + memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + + /* reset the mode ref deltasa for loop filter */ + memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); + memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); + + /* All buffers are implicitly updated on key frames. */ + pc->refresh_golden_frame = 1; + pc->refresh_alt_ref_frame = 1; + pc->copy_buffer_to_gf = 0; + pc->copy_buffer_to_arf = 0; + + /* Note that Golden and Altref modes cannot be used on a key frame so + * ref_frame_sign_bias[] is undefined and meaningless + */ + pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0; + pc->ref_frame_sign_bias[ALTREF_FRAME] = 0; + } else { + /* To enable choice of different interploation filters */ + if (!pc->use_bilinear_mc_filter) { + xd->subpixel_predict = vp8_sixtap_predict4x4; + xd->subpixel_predict8x4 = vp8_sixtap_predict8x4; + xd->subpixel_predict8x8 = vp8_sixtap_predict8x8; + xd->subpixel_predict16x16 = vp8_sixtap_predict16x16; + } else { + xd->subpixel_predict = vp8_bilinear_predict4x4; + xd->subpixel_predict8x4 = vp8_bilinear_predict8x4; + xd->subpixel_predict8x8 = vp8_bilinear_predict8x8; + xd->subpixel_predict16x16 = vp8_bilinear_predict16x16; + } + + if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active) { + pbi->ec_active = 1; + } + } + + xd->left_context = &pc->left_context; + xd->mode_info_context = pc->mi; + xd->frame_type = pc->frame_type; + xd->mode_info_context->mbmi.mode = DC_PRED; + xd->mode_info_stride = pc->mode_info_stride; + xd->corrupted = 0; /* init without corruption */ + + xd->fullpixel_mask = ~0; + if (pc->full_pixel) xd->fullpixel_mask = ~7; +} + +int vp8_decode_frame(VP8D_COMP *pbi) { + vp8_reader *const bc = &pbi->mbc[8]; + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const unsigned char *data = pbi->fragments.ptrs[0]; + const unsigned int data_sz = pbi->fragments.sizes[0]; + const unsigned char *data_end = data + data_sz; + ptrdiff_t first_partition_length_in_bytes; + + int i, j, k, l; + const int *const mb_feature_data_bits = vp8_mb_feature_data_bits; + int corrupt_tokens = 0; + int prev_independent_partitions = pbi->independent_partitions; + + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + + /* start with no corruption of current frame */ + xd->corrupted = 0; + yv12_fb_new->corrupted = 0; + + if (data_end - data < 3) { + if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet"); + } + + /* Declare the missing frame as an inter frame since it will + be handled as an inter frame when we have estimated its + motion vectors. */ + pc->frame_type = INTER_FRAME; + pc->version = 0; + pc->show_frame = 1; + first_partition_length_in_bytes = 0; + } else { + unsigned char clear_buffer[10]; + const unsigned char *clear = data; + if (pbi->decrypt_cb) { + int n = (int)VPXMIN(sizeof(clear_buffer), data_sz); + pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n); + clear = clear_buffer; + } + + pc->frame_type = (FRAME_TYPE)(clear[0] & 1); + pc->version = (clear[0] >> 1) & 7; + pc->show_frame = (clear[0] >> 4) & 1; + first_partition_length_in_bytes = + (clear[0] | (clear[1] << 8) | (clear[2] << 16)) >> 5; + + if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end || + data + first_partition_length_in_bytes < data)) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition 0 length"); + } + + data += 3; + clear += 3; + + vp8_setup_version(pc); + + if (pc->frame_type == KEY_FRAME) { + /* vet via sync code */ + /* When error concealment is enabled we should only check the sync + * code if we have enough bits available + */ + if (data + 3 < data_end) { + if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { + vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + } + } + + /* If error concealment is enabled we should only parse the new size + * if we have enough data. Otherwise we will end up with the wrong + * size. + */ + if (data + 6 < data_end) { + pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; + pc->horiz_scale = clear[4] >> 6; + pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; + pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. */ + data = data_end; + } + } else { + memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); + memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); + } + } + if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME)) { + return -1; + } + + init_frame(pbi); + + if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data), + pbi->decrypt_cb, pbi->decrypt_state)) { + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + } + if (pc->frame_type == KEY_FRAME) { + (void)vp8_read_bit(bc); // colorspace + pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc); + } + + /* Is segmentation enabled */ + xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc); + + if (xd->segmentation_enabled) { + /* Signal whether or not the segmentation map is being explicitly updated + * this frame. */ + xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc); + xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); + + if (xd->update_mb_segmentation_data) { + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); + + memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); + + /* For each segmentation feature (Quant and loop filter level) */ + for (i = 0; i < MB_LVL_MAX; ++i) { + for (j = 0; j < MAX_MB_SEGMENTS; ++j) { + /* Frame level data */ + if (vp8_read_bit(bc)) { + xd->segment_feature_data[i][j] = + (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]); + + if (vp8_read_bit(bc)) { + xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j]; + } + } else { + xd->segment_feature_data[i][j] = 0; + } + } + } + } + + if (xd->update_mb_segmentation_map) { + /* Which macro block level features are enabled */ + memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs)); + + /* Read the probs used to decode the segment id for each macro block. */ + for (i = 0; i < MB_FEATURE_TREE_PROBS; ++i) { + /* If not explicitly set value is defaulted to 255 by memset above */ + if (vp8_read_bit(bc)) { + xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8); + } + } + } + } else { + /* No segmentation updates on this frame */ + xd->update_mb_segmentation_map = 0; + xd->update_mb_segmentation_data = 0; + } + + /* Read the loop filter level and type */ + pc->filter_type = (LOOPFILTERTYPE)vp8_read_bit(bc); + pc->filter_level = vp8_read_literal(bc, 6); + pc->sharpness_level = vp8_read_literal(bc, 3); + + /* Read in loop filter deltas applied at the MB level based on mode or ref + * frame. */ + xd->mode_ref_lf_delta_update = 0; + xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc); + + if (xd->mode_ref_lf_delta_enabled) { + /* Do the deltas need to be updated */ + xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc); + + if (xd->mode_ref_lf_delta_update) { + /* Send update */ + for (i = 0; i < MAX_REF_LF_DELTAS; ++i) { + if (vp8_read_bit(bc)) { + /*sign = vp8_read_bit( bc );*/ + xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6); + + if (vp8_read_bit(bc)) { /* Apply sign */ + xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1; + } + } + } + + /* Send update */ + for (i = 0; i < MAX_MODE_LF_DELTAS; ++i) { + if (vp8_read_bit(bc)) { + /*sign = vp8_read_bit( bc );*/ + xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6); + + if (vp8_read_bit(bc)) { /* Apply sign */ + xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1; + } + } + } + } + } + + setup_token_decoder(pbi, data + first_partition_length_in_bytes); + + xd->current_bc = &pbi->mbc[0]; + + /* Read the default quantizers. */ + { + int Q, q_update; + + Q = vp8_read_literal(bc, 7); /* AC 1st order Q = default */ + pc->base_qindex = Q; + q_update = 0; + pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update); + pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update); + pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update); + pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update); + pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update); + + if (q_update) vp8cx_init_de_quantizer(pbi); + + /* MB level dequantizer setup */ + vp8_mb_init_dequantizer(pbi, &pbi->mb); + } + + /* Determine if the golden frame or ARF buffer should be updated and how. + * For all non key frames the GF and ARF refresh flags and sign bias + * flags must be set explicitly. + */ + if (pc->frame_type != KEY_FRAME) { + /* Should the GF or ARF be updated from the current frame */ + pc->refresh_golden_frame = vp8_read_bit(bc); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't refresh golden if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->refresh_golden_frame = 0; +#endif + + pc->refresh_alt_ref_frame = vp8_read_bit(bc); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't refresh altref if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->refresh_alt_ref_frame = 0; +#endif + + /* Buffer to buffer copy flags. */ + pc->copy_buffer_to_gf = 0; + + if (!pc->refresh_golden_frame) { + pc->copy_buffer_to_gf = vp8_read_literal(bc, 2); + } + +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the golden if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->copy_buffer_to_gf = 0; +#endif + + pc->copy_buffer_to_arf = 0; + + if (!pc->refresh_alt_ref_frame) { + pc->copy_buffer_to_arf = vp8_read_literal(bc, 2); + } + +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't copy to the alt-ref if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->copy_buffer_to_arf = 0; +#endif + + pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc); + pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc); + } + + pc->refresh_entropy_probs = vp8_read_bit(bc); +#if CONFIG_ERROR_CONCEALMENT + /* Assume we shouldn't refresh the probabilities if the bit is + * missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->refresh_entropy_probs = 0; +#endif + if (pc->refresh_entropy_probs == 0) { + memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); + } + + pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc); + +#if CONFIG_ERROR_CONCEALMENT + /* Assume we should refresh the last frame if the bit is missing */ + xd->corrupted |= vp8dx_bool_error(bc); + if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; +#endif + + { + pbi->independent_partitions = 1; + + /* read coef probability tree */ + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < COEF_BANDS; ++j) { + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + for (l = 0; l < ENTROPY_NODES; ++l) { + vp8_prob *const p = pc->fc.coef_probs[i][j][k] + l; + + if (vp8_read(bc, vp8_coef_update_probs[i][j][k][l])) { + *p = (vp8_prob)vp8_read_literal(bc, 8); + } + if (k > 0 && *p != pc->fc.coef_probs[i][j][k - 1][l]) { + pbi->independent_partitions = 0; + } + } + } + } + } + } + + /* clear out the coeff buffer */ + memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + + vp8_decode_mode_mvs(pbi); + +#if CONFIG_ERROR_CONCEALMENT + if (pbi->ec_active && + pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows) { + /* Motion vectors are missing in this frame. We will try to estimate + * them and then continue decoding the frame as usual */ + vp8_estimate_missing_mvs(pbi); + } +#endif + + memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols); + pbi->frame_corrupt_residual = 0; + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { + unsigned int thread; + if (vp8mt_decode_mb_rows(pbi, xd)) { + vp8_decoder_remove_threads(pbi); + pbi->restart_threads = 1; + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, NULL); + } + vp8_yv12_extend_frame_borders(yv12_fb_new); + for (thread = 0; thread < pbi->decoding_thread_count; ++thread) { + corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted; + } + } else +#endif + { + decode_mb_rows(pbi); + corrupt_tokens |= xd->corrupted; + } + + /* Collect information about decoder corruption. */ + /* 1. Check first boolean decoder for errors. */ + yv12_fb_new->corrupted = vp8dx_bool_error(bc); + /* 2. Check the macroblock information */ + yv12_fb_new->corrupted |= corrupt_tokens; + + if (!pbi->decoded_key_frame) { + if (pc->frame_type == KEY_FRAME && !yv12_fb_new->corrupted) { + pbi->decoded_key_frame = 1; + } else { + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "A stream must start with a complete key frame"); + } + } + + /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes + * \n",bc->pos+pbi->bc2.pos); */ + + if (pc->refresh_entropy_probs == 0) { + memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc)); + pbi->independent_partitions = prev_independent_partitions; + } + +#ifdef PACKET_TESTING + { + FILE *f = fopen("decompressor.VP8", "ab"); + unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8; + fwrite((void *)&size, 4, 1, f); + fwrite((void *)pbi->Source, size, 1, f); + fclose(f); + } +#endif + + return 0; +} diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.c b/media/libvpx/libvpx/vp8/decoder/decodemv.c new file mode 100644 index 0000000000..3f459d623f --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/decodemv.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "decodemv.h" +#include "treereader.h" +#include "vp8/common/entropymv.h" +#include "vp8/common/entropymode.h" +#include "onyxd_int.h" +#include "vp8/common/findnearmv.h" + +static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p) { + const int i = vp8_treed_read(bc, vp8_bmode_tree, p); + + return (B_PREDICTION_MODE)i; +} + +static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p) { + const int i = vp8_treed_read(bc, vp8_ymode_tree, p); + + return (MB_PREDICTION_MODE)i; +} + +static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p) { + const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p); + + return (MB_PREDICTION_MODE)i; +} + +static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p) { + const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p); + + return (MB_PREDICTION_MODE)i; +} + +static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi) { + vp8_reader *const bc = &pbi->mbc[8]; + const int mis = pbi->common.mode_info_stride; + + mi->mbmi.ref_frame = INTRA_FRAME; + mi->mbmi.mode = read_kf_ymode(bc, vp8_kf_ymode_prob); + + if (mi->mbmi.mode == B_PRED) { + int i = 0; + mi->mbmi.is_4x4 = 1; + + do { + const B_PREDICTION_MODE A = above_block_mode(mi, i, mis); + const B_PREDICTION_MODE L = left_block_mode(mi, i); + + mi->bmi[i].as_mode = read_bmode(bc, vp8_kf_bmode_prob[A][L]); + } while (++i < 16); + } + + mi->mbmi.uv_mode = read_uv_mode(bc, vp8_kf_uv_mode_prob); +} + +static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { + const vp8_prob *const p = (const vp8_prob *)mvc; + int x = 0; + + if (vp8_read(r, p[mvpis_short])) { /* Large */ + int i = 0; + + do { + x += vp8_read(r, p[MVPbits + i]) << i; + } while (++i < 3); + + i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ + + do { + x += vp8_read(r, p[MVPbits + i]) << i; + } while (--i > 3); + + if (!(x & 0xFFF0) || vp8_read(r, p[MVPbits + 3])) x += 8; + } else { /* small */ + x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort); + } + + if (x && vp8_read(r, p[MVPsign])) x = -x; + + return x; +} + +static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc) { + mv->row = (short)(read_mvcomponent(r, mvc) * 2); + mv->col = (short)(read_mvcomponent(r, ++mvc) * 2); +} + +static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc) { + int i = 0; + + do { + const vp8_prob *up = vp8_mv_update_probs[i].prob; + vp8_prob *p = (vp8_prob *)(mvc + i); + vp8_prob *const pstop = p + MVPcount; + + do { + if (vp8_read(bc, *up++)) { + const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7); + + *p = x ? x << 1 : 1; + } + } while (++p < pstop); + } while (++i < 2); +} + +static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 }; +static const unsigned char mbsplit_fill_offset[4][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } +}; + +static void mb_mode_mv_init(VP8D_COMP *pbi) { + vp8_reader *const bc = &pbi->mbc[8]; + MV_CONTEXT *const mvc = pbi->common.fc.mvc; + +#if CONFIG_ERROR_CONCEALMENT + /* Default is that no macroblock is corrupt, therefore we initialize + * mvs_corrupt_from_mb to something very big, which we can be sure is + * outside the frame. */ + pbi->mvs_corrupt_from_mb = UINT_MAX; +#endif + /* Read the mb_no_coeff_skip flag */ + pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(bc); + + pbi->prob_skip_false = 0; + if (pbi->common.mb_no_coeff_skip) { + pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8); + } + + if (pbi->common.frame_type != KEY_FRAME) { + pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8); + + if (vp8_read_bit(bc)) { + int i = 0; + + do { + pbi->common.fc.ymode_prob[i] = (vp8_prob)vp8_read_literal(bc, 8); + } while (++i < 4); + } + + if (vp8_read_bit(bc)) { + int i = 0; + + do { + pbi->common.fc.uv_mode_prob[i] = (vp8_prob)vp8_read_literal(bc, 8); + } while (++i < 3); + } + + read_mvcontexts(bc, mvc); + } +} + +const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = { + { 147, 136, 18 }, /* SUBMVREF_NORMAL */ + { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */ + { 106, 145, 1 }, /* SUBMVREF_LEFT_ZED */ + { 208, 1, 1 }, /* SUBMVREF_LEFT_ABOVE_ZED */ + { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */ + { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */ + { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */ + { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */ +}; + +static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left, + const uint32_t above) { + int lez = (left == 0); + int aez = (above == 0); + int lea = (left == above); + const vp8_prob *prob; + + prob = vp8_sub_mv_ref_prob3[(aez << 2) | (lez << 1) | (lea)]; + + return prob; +} + +static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi, + const MODE_INFO *left_mb, const MODE_INFO *above_mb, + MB_MODE_INFO *mbmi, int_mv best_mv, + MV_CONTEXT *const mvc, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) { + int s; /* split configuration (16x8, 8x16, 8x8, 4x4) */ + /* number of partitions in the split configuration (see vp8_mbsplit_count) */ + int num_p; + int j = 0; + + s = 3; + num_p = 16; + if (vp8_read(bc, 110)) { + s = 2; + num_p = 4; + if (vp8_read(bc, 111)) { + s = vp8_read(bc, 150); + num_p = 2; + } + } + + do /* for each subset j */ + { + int_mv leftmv, abovemv; + int_mv blockmv; + int k; /* first block in subset j */ + + const vp8_prob *prob; + k = vp8_mbsplit_offset[s][j]; + + if (!(k & 3)) { + /* On L edge, get from MB to left of us */ + if (left_mb->mbmi.mode != SPLITMV) { + leftmv.as_int = left_mb->mbmi.mv.as_int; + } else { + leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int; + } + } else { + leftmv.as_int = (mi->bmi + k - 1)->mv.as_int; + } + + if (!(k >> 2)) { + /* On top edge, get from MB above us */ + if (above_mb->mbmi.mode != SPLITMV) { + abovemv.as_int = above_mb->mbmi.mv.as_int; + } else { + abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int; + } + } else { + abovemv.as_int = (mi->bmi + k - 4)->mv.as_int; + } + + prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int); + + if (vp8_read(bc, prob[0])) { + if (vp8_read(bc, prob[1])) { + blockmv.as_int = 0; + if (vp8_read(bc, prob[2])) { + blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) * 2; + blockmv.as_mv.row += best_mv.as_mv.row; + blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) * 2; + blockmv.as_mv.col += best_mv.as_mv.col; + } + } else { + blockmv.as_int = abovemv.as_int; + } + } else { + blockmv.as_int = leftmv.as_int; + } + + mbmi->need_to_clamp_mvs |= + vp8_check_mv_bounds(&blockmv, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + + { + /* Fill (uniform) modes, mvs of jth subset. + Must do it here because ensuing subsets can + refer back to us via "left" or "above". */ + const unsigned char *fill_offset; + unsigned int fill_count = mbsplit_fill_count[s]; + + fill_offset = + &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]]; + + do { + mi->bmi[*fill_offset].mv.as_int = blockmv.as_int; + fill_offset++; + } while (--fill_count); + } + + } while (++j < num_p); + + mbmi->partitioning = s; +} + +static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, + MB_MODE_INFO *mbmi) { + vp8_reader *const bc = &pbi->mbc[8]; + mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); + if (mbmi->ref_frame) { /* inter MB */ + enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; + int cnt[4]; + int *cntx = cnt; + int_mv near_mvs[4]; + int_mv *nmv = near_mvs; + const int mis = pbi->mb.mode_info_stride; + const MODE_INFO *above = mi - mis; + const MODE_INFO *left = mi - 1; + const MODE_INFO *aboveleft = above - 1; + int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias; + + mbmi->need_to_clamp_mvs = 0; + + if (vp8_read(bc, pbi->prob_last)) { + mbmi->ref_frame = + (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf))); + } + + /* Zero accumulators */ + nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) { + if (above->mbmi.mv.as_int) { + (++nmv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], mbmi->ref_frame, + nmv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) { + if (left->mbmi.mv.as_int) { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], mbmi->ref_frame, + &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != nmv->as_int) { + (++nmv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } else { + cnt[CNT_INTRA] += 2; + } + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) { + if (aboveleft->mbmi.mv.as_int) { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], mbmi->ref_frame, + &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != nmv->as_int) { + (++nmv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } else { + cnt[CNT_INTRA] += 1; + } + } + + if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_INTRA]][0])) { + /* If we have three distinct MV's ... */ + /* See if above-left MV can be merged with NEAREST */ + cnt[CNT_NEAREST] += ((cnt[CNT_SPLITMV] > 0) & + (nmv->as_int == near_mvs[CNT_NEAREST].as_int)); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = (int)near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; + } + + if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { + if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAR]][2])) { + int mb_to_top_edge; + int mb_to_bottom_edge; + int mb_to_left_edge; + int mb_to_right_edge; + MV_CONTEXT *const mvc = pbi->common.fc.mvc; + int near_index; + + mb_to_top_edge = pbi->mb.mb_to_top_edge; + mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge; + mb_to_top_edge -= LEFT_TOP_MARGIN; + mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN; + mb_to_right_edge = pbi->mb.mb_to_right_edge; + mb_to_right_edge += RIGHT_BOTTOM_MARGIN; + mb_to_left_edge = pbi->mb.mb_to_left_edge; + mb_to_left_edge -= LEFT_TOP_MARGIN; + + /* Use near_mvs[0] to store the "best" MV */ + near_index = CNT_INTRA + (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]); + + vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb); + + cnt[CNT_SPLITMV] = + ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * + 2 + + (aboveleft->mbmi.mode == SPLITMV); + + if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) { + decode_split_mv(bc, mi, left, above, mbmi, near_mvs[near_index], + mvc, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + mbmi->mv.as_int = mi->bmi[15].mv.as_int; + mbmi->mode = SPLITMV; + mbmi->is_4x4 = 1; + } else { + int_mv *const mbmi_mv = &mbmi->mv; + read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *)mvc); + mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row; + mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col; + + /* Don't need to check this on NEARMV and NEARESTMV + * modes since those modes clamp the MV. The NEWMV mode + * does not, so signal to the prediction stage whether + * special handling may be required. + */ + mbmi->need_to_clamp_mvs = + vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + mbmi->mode = NEWMV; + } + } else { + mbmi->mode = NEARMV; + mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int; + vp8_clamp_mv2(&mbmi->mv, &pbi->mb); + } + } else { + mbmi->mode = NEARESTMV; + mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int; + vp8_clamp_mv2(&mbmi->mv, &pbi->mb); + } + } else { + mbmi->mode = ZEROMV; + mbmi->mv.as_int = 0; + } + +#if CONFIG_ERROR_CONCEALMENT + if (pbi->ec_enabled && (mbmi->mode != SPLITMV)) { + mi->bmi[0].mv.as_int = mi->bmi[1].mv.as_int = mi->bmi[2].mv.as_int = + mi->bmi[3].mv.as_int = mi->bmi[4].mv.as_int = mi->bmi[5].mv.as_int = + mi->bmi[6].mv.as_int = mi->bmi[7].mv.as_int = + mi->bmi[8].mv.as_int = mi->bmi[9].mv.as_int = + mi->bmi[10].mv.as_int = mi->bmi[11].mv.as_int = + mi->bmi[12].mv.as_int = mi->bmi[13].mv.as_int = + mi->bmi[14].mv.as_int = mi->bmi[15].mv.as_int = + mbmi->mv.as_int; + } +#endif + } else { + /* required for left and above block mv */ + mbmi->mv.as_int = 0; + + /* MB is intra coded */ + if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED) { + int j = 0; + mbmi->is_4x4 = 1; + do { + mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob); + } while (++j < 16); + } + + mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob); + } +} + +static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x) { + /* Is segmentation enabled */ + if (x->segmentation_enabled && x->update_mb_segmentation_map) { + /* If so then read the segment id. */ + if (vp8_read(r, x->mb_segment_tree_probs[0])) { + mi->segment_id = + (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2])); + } else { + mi->segment_id = + (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1])); + } + } +} + +static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi) { + /* Read the Macroblock segmentation map if it is being updated explicitly + * this frame (reset to 0 above by default) + * By default on a key frame reset all MBs to segment 0 + */ + if (pbi->mb.update_mb_segmentation_map) { + read_mb_features(&pbi->mbc[8], &mi->mbmi, &pbi->mb); + } else if (pbi->common.frame_type == KEY_FRAME) { + mi->mbmi.segment_id = 0; + } + + /* Read the macroblock coeff skip flag if this feature is in use, + * else default to 0 */ + if (pbi->common.mb_no_coeff_skip) { + mi->mbmi.mb_skip_coeff = vp8_read(&pbi->mbc[8], pbi->prob_skip_false); + } else { + mi->mbmi.mb_skip_coeff = 0; + } + + mi->mbmi.is_4x4 = 0; + if (pbi->common.frame_type == KEY_FRAME) { + read_kf_modes(pbi, mi); + } else { + read_mb_modes_mv(pbi, mi, &mi->mbmi); + } +} + +void vp8_decode_mode_mvs(VP8D_COMP *pbi) { + MODE_INFO *mi = pbi->common.mi; + int mb_row = -1; + int mb_to_right_edge_start; + + mb_mode_mv_init(pbi); + + pbi->mb.mb_to_top_edge = 0; + pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3; + mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3; + + while (++mb_row < pbi->common.mb_rows) { + int mb_col = -1; + + pbi->mb.mb_to_left_edge = 0; + pbi->mb.mb_to_right_edge = mb_to_right_edge_start; + + while (++mb_col < pbi->common.mb_cols) { +#if CONFIG_ERROR_CONCEALMENT + int mb_num = mb_row * pbi->common.mb_cols + mb_col; +#endif + + decode_mb_mode_mvs(pbi, mi); + +#if CONFIG_ERROR_CONCEALMENT + /* look for corruption. set mvs_corrupt_from_mb to the current + * mb_num if the frame is corrupt from this macroblock. */ + if (vp8dx_bool_error(&pbi->mbc[8]) && + mb_num < (int)pbi->mvs_corrupt_from_mb) { + pbi->mvs_corrupt_from_mb = mb_num; + /* no need to continue since the partition is corrupt from + * here on. + */ + return; + } +#endif + + pbi->mb.mb_to_left_edge -= (16 << 3); + pbi->mb.mb_to_right_edge -= (16 << 3); + mi++; /* next macroblock */ + } + pbi->mb.mb_to_top_edge -= (16 << 3); + pbi->mb.mb_to_bottom_edge -= (16 << 3); + + mi++; /* skip left predictor each row */ + } +} diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.h b/media/libvpx/libvpx/vp8/decoder/decodemv.h new file mode 100644 index 0000000000..504e943d85 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/decodemv.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_DECODEMV_H_ +#define VPX_VP8_DECODER_DECODEMV_H_ + +#include "onyxd_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_decode_mode_mvs(VP8D_COMP *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_DECODEMV_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/decoderthreading.h b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h new file mode 100644 index 0000000000..3d49bc8317 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_DECODERTHREADING_H_ +#define VPX_VP8_DECODER_DECODERTHREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_MULTITHREAD +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +void vp8_decoder_remove_threads(VP8D_COMP *pbi); +void vp8_decoder_create_threads(VP8D_COMP *pbi); +void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); +void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_DECODERTHREADING_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.c b/media/libvpx/libvpx/vp8/decoder/detokenize.c new file mode 100644 index 0000000000..1c77873f0b --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/detokenize.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/blockd.h" +#include "onyxd_int.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" +#include "vpx_ports/mem.h" +#include "detokenize.h" + +void vp8_reset_mb_tokens_context(MACROBLOCKD *x) { + ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context); + ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context); + + memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1); + memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1); + + /* Clear entropy contexts for Y2 blocks */ + if (!x->mode_info_context->mbmi.is_4x4) { + a_ctx[8] = l_ctx[8] = 0; + } +} + +/* + ------------------------------------------------------------------------------ + Residual decoding (Paragraph 13.2 / 13.3) +*/ +static const uint8_t kBands[16 + 1] = { + 0, 1, 2, 3, 6, 4, 5, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 0 /* extra entry as sentinel */ +}; + +static const uint8_t kCat3[] = { 173, 148, 140, 0 }; +static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 }; +static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 }; +static const uint8_t kCat6[] = { 254, 254, 243, 230, 196, 177, + 153, 140, 133, 130, 129, 0 }; +static const uint8_t *const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; +static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + +#define VP8GetBit vp8dx_decode_bool +#define NUM_PROBAS 11 +#define NUM_CTX 3 + +/* for const-casting */ +typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS]; + +// With corrupt / fuzzed streams the calculation of br->value may overflow. See +// b/148271109. +static VPX_NO_UNSIGNED_OVERFLOW_CHECK int GetSigned(BOOL_DECODER *br, + int value_to_sign) { + int split = (br->range + 1) >> 1; + VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); + int v; + + if (br->count < 0) vp8dx_bool_decoder_fill(br); + + if (br->value < bigsplit) { + br->range = split; + v = value_to_sign; + } else { + br->range = br->range - split; + br->value = br->value - bigsplit; + v = -value_to_sign; + } + br->range += br->range; + br->value += br->value; + br->count--; + + return v; +} +/* + Returns the position of the last non-zero coeff plus one + (and 0 if there's no coeff at all) +*/ +static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob, int ctx, int n, + int16_t *out) { + const uint8_t *p = prob[n][ctx]; + if (!VP8GetBit(br, p[0])) { /* first EOB is more a 'CBP' bit. */ + return 0; + } + while (1) { + ++n; + if (!VP8GetBit(br, p[1])) { + p = prob[kBands[n]][0]; + } else { /* non zero coeff */ + int v, j; + if (!VP8GetBit(br, p[2])) { + p = prob[kBands[n]][1]; + v = 1; + } else { + if (!VP8GetBit(br, p[3])) { + if (!VP8GetBit(br, p[4])) { + v = 2; + } else { + v = 3 + VP8GetBit(br, p[5]); + } + } else { + if (!VP8GetBit(br, p[6])) { + if (!VP8GetBit(br, p[7])) { + v = 5 + VP8GetBit(br, 159); + } else { + v = 7 + 2 * VP8GetBit(br, 165); + v += VP8GetBit(br, 145); + } + } else { + const uint8_t *tab; + const int bit1 = VP8GetBit(br, p[8]); + const int bit0 = VP8GetBit(br, p[9 + bit1]); + const int cat = 2 * bit1 + bit0; + v = 0; + for (tab = kCat3456[cat]; *tab; ++tab) { + v += v + VP8GetBit(br, *tab); + } + v += 3 + (8 << cat); + } + } + p = prob[kBands[n]][2]; + } + j = kZigzag[n - 1]; + + out[j] = GetSigned(br, v); + + if (n == 16 || !VP8GetBit(br, p[0])) { /* EOB */ + return n; + } + } + if (n == 16) { + return 16; + } + } +} + +int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) { + BOOL_DECODER *bc = x->current_bc; + const FRAME_CONTEXT *const fc = &dx->common.fc; + char *eobs = x->eobs; + + int i; + int nonzeros; + int eobtotal = 0; + + short *qcoeff_ptr; + ProbaArray coef_probs; + ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context); + ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context); + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int skip_dc = 0; + + qcoeff_ptr = &x->qcoeff[0]; + + if (!x->mode_info_context->mbmi.is_4x4) { + a = a_ctx + 8; + l = l_ctx + 8; + + coef_probs = fc->coef_probs[1]; + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16); + *a = *l = (nonzeros > 0); + + eobs[24] = nonzeros; + eobtotal += nonzeros - 16; + + coef_probs = fc->coef_probs[0]; + skip_dc = 1; + } else { + coef_probs = fc->coef_probs[3]; + skip_dc = 0; + } + + for (i = 0; i < 16; ++i) { + a = a_ctx + (i & 3); + l = l_ctx + ((i & 0xc) >> 2); + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr); + *a = *l = (nonzeros > 0); + + nonzeros += skip_dc; + eobs[i] = nonzeros; + eobtotal += nonzeros; + qcoeff_ptr += 16; + } + + coef_probs = fc->coef_probs[2]; + + a_ctx += 4; + l_ctx += 4; + for (i = 16; i < 24; ++i) { + a = a_ctx + ((i > 19) << 1) + (i & 1); + l = l_ctx + ((i > 19) << 1) + ((i & 3) > 1); + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr); + *a = *l = (nonzeros > 0); + + eobs[i] = nonzeros; + eobtotal += nonzeros; + qcoeff_ptr += 16; + } + + return eobtotal; +} diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.h b/media/libvpx/libvpx/vp8/decoder/detokenize.h new file mode 100644 index 0000000000..410a431ba0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/detokenize.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_DETOKENIZE_H_ +#define VPX_VP8_DECODER_DETOKENIZE_H_ + +#include "onyxd_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_reset_mb_tokens_context(MACROBLOCKD *x); +int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_DETOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/ec_types.h b/media/libvpx/libvpx/vp8/decoder/ec_types.h new file mode 100644 index 0000000000..84feb269df --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/ec_types.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_EC_TYPES_H_ +#define VPX_VP8_DECODER_EC_TYPES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_OVERLAPS 16 + +/* The area (pixel area in Q6) the block pointed to by bmi overlaps + * another block with. + */ +typedef struct { + int overlap; + union b_mode_info *bmi; +} OVERLAP_NODE; + +/* Structure to keep track of overlapping blocks on a block level. */ +typedef struct { + /* TODO(holmer): This array should be exchanged for a linked list */ + OVERLAP_NODE overlaps[MAX_OVERLAPS]; +} B_OVERLAP; + +/* Structure used to hold all the overlaps of a macroblock. The overlaps of a + * macroblock is further divided into block overlaps. + */ +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; + +/* Structure for keeping track of motion vectors and which reference frame they + * refer to. Used for motion vector interpolation. + */ +typedef struct { + MV mv; + MV_REFERENCE_FRAME ref_frame; +} EC_BLOCK; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_EC_TYPES_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.c b/media/libvpx/libvpx/vp8/decoder/error_concealment.c new file mode 100644 index 0000000000..85982e4de3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "error_concealment.h" +#include "onyxd_int.h" +#include "decodemv.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/findnearmv.h" +#include "vp8/common/common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#define FLOOR(x, q) ((x) & -(1 << (q))) + +#define NUM_NEIGHBORS 20 + +typedef struct ec_position { + int row; + int col; +} EC_POS; + +/* + * Regenerate the table in Matlab with: + * x = meshgrid((1:4), (1:4)); + * y = meshgrid((1:4), (1:4))'; + * W = round((1./(sqrt(x.^2 + y.^2))*2^7)); + * W(1,1) = 0; + */ +static const int weights_q7[5][5] = { { 0, 128, 64, 43, 32 }, + { 128, 91, 57, 40, 31 }, + { 64, 57, 45, 36, 29 }, + { 43, 40, 36, 30, 26 }, + { 32, 31, 29, 26, 23 } }; + +int vp8_alloc_overlap_lists(VP8D_COMP *pbi) { + if (pbi->overlaps != NULL) { + vpx_free(pbi->overlaps); + pbi->overlaps = NULL; + } + + pbi->overlaps = + vpx_calloc(pbi->common.mb_rows * pbi->common.mb_cols, sizeof(MB_OVERLAP)); + + if (pbi->overlaps == NULL) return -1; + + return 0; +} + +void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi) { + vpx_free(pbi->overlaps); + pbi->overlaps = NULL; +} + +/* Inserts a new overlap area value to the list of overlaps of a block */ +static void assign_overlap(OVERLAP_NODE *overlaps, union b_mode_info *bmi, + int overlap) { + int i; + if (overlap <= 0) return; + /* Find and assign to the next empty overlap node in the list of overlaps. + * Empty is defined as bmi == NULL */ + for (i = 0; i < MAX_OVERLAPS; ++i) { + if (overlaps[i].bmi == NULL) { + overlaps[i].bmi = bmi; + overlaps[i].overlap = overlap; + break; + } + } +} + +/* Calculates the overlap area between two 4x4 squares, where the first + * square has its upper-left corner at (b1_row, b1_col) and the second + * square has its upper-left corner at (b2_row, b2_col). Doesn't + * properly handle squares which do not overlap. + */ +static int block_overlap(int b1_row, int b1_col, int b2_row, int b2_col) { + const int int_top = VPXMAX(b1_row, b2_row); // top + const int int_left = VPXMAX(b1_col, b2_col); // left + /* Since each block is 4x4 pixels, adding 4 (Q3) to the left/top edge + * gives us the right/bottom edge. + */ + const int int_right = VPXMIN(b1_col + (4 << 3), b2_col + (4 << 3)); // right + const int int_bottom = + VPXMIN(b1_row + (4 << 3), b2_row + (4 << 3)); // bottom + return (int_bottom - int_top) * (int_right - int_left); +} + +/* Calculates the overlap area for all blocks in a macroblock at position + * (mb_row, mb_col) in macroblocks, which are being overlapped by a given + * overlapping block at position (new_row, new_col) (in pixels, Q3). The + * first block being overlapped in the macroblock has position (first_blk_row, + * first_blk_col) in blocks relative the upper-left corner of the image. + */ +static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi, + int new_row, int new_col, int mb_row, + int mb_col, int first_blk_row, + int first_blk_col) { + /* Find the blocks within this MB (defined by mb_row, mb_col) which are + * overlapped by bmi and calculate and assign overlap for each of those + * blocks. */ + + /* Block coordinates relative the upper-left block */ + const int rel_ol_blk_row = first_blk_row - mb_row * 4; + const int rel_ol_blk_col = first_blk_col - mb_col * 4; + /* If the block partly overlaps any previous MB, these coordinates + * can be < 0. We don't want to access blocks in previous MBs. + */ + const int blk_idx = VPXMAX(rel_ol_blk_row, 0) * 4 + VPXMAX(rel_ol_blk_col, 0); + /* Upper left overlapping block */ + B_OVERLAP *b_ol_ul = &(b_overlaps[blk_idx]); + + /* Calculate and assign overlaps for all blocks in this MB + * which the motion compensated block overlaps + */ + /* Avoid calculating overlaps for blocks in later MBs */ + int end_row = VPXMIN(4 + mb_row * 4 - first_blk_row, 2); + int end_col = VPXMIN(4 + mb_col * 4 - first_blk_col, 2); + int row, col; + + /* Check if new_row and new_col are evenly divisible by 4 (Q3), + * and if so we shouldn't check neighboring blocks + */ + if (new_row >= 0 && (new_row & 0x1F) == 0) end_row = 1; + if (new_col >= 0 && (new_col & 0x1F) == 0) end_col = 1; + + /* Check if the overlapping block partly overlaps a previous MB + * and if so, we're overlapping fewer blocks in this MB. + */ + if (new_row < (mb_row * 16) << 3) end_row = 1; + if (new_col < (mb_col * 16) << 3) end_col = 1; + + for (row = 0; row < end_row; ++row) { + for (col = 0; col < end_col; ++col) { + /* input in Q3, result in Q6 */ + const int overlap = + block_overlap(new_row, new_col, (((first_blk_row + row) * 4) << 3), + (((first_blk_col + col) * 4) << 3)); + assign_overlap(b_ol_ul[row * 4 + col].overlaps, bmi, overlap); + } + } +} + +static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, + union b_mode_info *bmi, int b_row, int b_col) { + MB_OVERLAP *mb_overlap; + int row, col, rel_row, rel_col; + int new_row, new_col; + int end_row, end_col; + int overlap_b_row, overlap_b_col; + int overlap_mb_row, overlap_mb_col; + + /* mb subpixel position */ + row = (4 * b_row) << 3; /* Q3 */ + col = (4 * b_col) << 3; /* Q3 */ + + /* reverse compensate for motion */ + new_row = row - bmi->mv.as_mv.row; + new_col = col - bmi->mv.as_mv.col; + + if (new_row >= ((16 * mb_rows) << 3) || new_col >= ((16 * mb_cols) << 3)) { + /* the new block ended up outside the frame */ + return; + } + + if (new_row <= -32 || new_col <= -32) { + /* outside the frame */ + return; + } + /* overlapping block's position in blocks */ + overlap_b_row = FLOOR(new_row / 4, 3) >> 3; + overlap_b_col = FLOOR(new_col / 4, 3) >> 3; + + /* overlapping block's MB position in MBs + * operations are done in Q3 + */ + overlap_mb_row = FLOOR((overlap_b_row << 3) / 4, 3) >> 3; + overlap_mb_col = FLOOR((overlap_b_col << 3) / 4, 3) >> 3; + + end_row = VPXMIN(mb_rows - overlap_mb_row, 2); + end_col = VPXMIN(mb_cols - overlap_mb_col, 2); + + /* Don't calculate overlap for MBs we don't overlap */ + /* Check if the new block row starts at the last block row of the MB */ + if (abs(new_row - ((16 * overlap_mb_row) << 3)) < ((3 * 4) << 3)) end_row = 1; + /* Check if the new block col starts at the last block col of the MB */ + if (abs(new_col - ((16 * overlap_mb_col) << 3)) < ((3 * 4) << 3)) end_col = 1; + + /* find the MB(s) this block is overlapping */ + for (rel_row = 0; rel_row < end_row; ++rel_row) { + for (rel_col = 0; rel_col < end_col; ++rel_col) { + if (overlap_mb_row + rel_row < 0 || overlap_mb_col + rel_col < 0) + continue; + mb_overlap = overlap_ul + (overlap_mb_row + rel_row) * mb_cols + + overlap_mb_col + rel_col; + + calculate_overlaps_mb(mb_overlap->overlaps, bmi, new_row, new_col, + overlap_mb_row + rel_row, overlap_mb_col + rel_col, + overlap_b_row + rel_row, overlap_b_col + rel_col); + } + } +} + +/* Estimates a motion vector given the overlapping blocks' motion vectors. + * Filters out all overlapping blocks which do not refer to the correct + * reference frame type. + */ +static void estimate_mv(const OVERLAP_NODE *overlaps, union b_mode_info *bmi) { + int i; + int overlap_sum = 0; + int row_acc = 0; + int col_acc = 0; + + bmi->mv.as_int = 0; + for (i = 0; i < MAX_OVERLAPS; ++i) { + if (overlaps[i].bmi == NULL) break; + col_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.col; + row_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.row; + overlap_sum += overlaps[i].overlap; + } + if (overlap_sum > 0) { + /* Q9 / Q6 = Q3 */ + bmi->mv.as_mv.col = col_acc / overlap_sum; + bmi->mv.as_mv.row = row_acc / overlap_sum; + } else { + bmi->mv.as_mv.col = 0; + bmi->mv.as_mv.row = 0; + } +} + +/* Estimates all motion vectors for a macroblock given the lists of + * overlaps for each block. Decides whether or not the MVs must be clamped. + */ +static void estimate_mb_mvs(const B_OVERLAP *block_overlaps, MODE_INFO *mi, + int mb_to_left_edge, int mb_to_right_edge, + int mb_to_top_edge, int mb_to_bottom_edge) { + int row, col; + int non_zero_count = 0; + MV *const filtered_mv = &(mi->mbmi.mv.as_mv); + union b_mode_info *const bmi = mi->bmi; + filtered_mv->col = 0; + filtered_mv->row = 0; + mi->mbmi.need_to_clamp_mvs = 0; + for (row = 0; row < 4; ++row) { + int this_b_to_top_edge = mb_to_top_edge + ((row * 4) << 3); + int this_b_to_bottom_edge = mb_to_bottom_edge - ((row * 4) << 3); + for (col = 0; col < 4; ++col) { + int i = row * 4 + col; + int this_b_to_left_edge = mb_to_left_edge + ((col * 4) << 3); + int this_b_to_right_edge = mb_to_right_edge - ((col * 4) << 3); + /* Estimate vectors for all blocks which are overlapped by this */ + /* type. Interpolate/extrapolate the rest of the block's MVs */ + estimate_mv(block_overlaps[i].overlaps, &(bmi[i])); + mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds( + &bmi[i].mv, this_b_to_left_edge, this_b_to_right_edge, + this_b_to_top_edge, this_b_to_bottom_edge); + if (bmi[i].mv.as_int != 0) { + ++non_zero_count; + filtered_mv->col += bmi[i].mv.as_mv.col; + filtered_mv->row += bmi[i].mv.as_mv.row; + } + } + } + if (non_zero_count > 0) { + filtered_mv->col /= non_zero_count; + filtered_mv->row /= non_zero_count; + } +} + +static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi, + int mb_row, int mb_col, int mb_rows, + int mb_cols) { + int sub_row; + int sub_col; + for (sub_row = 0; sub_row < 4; ++sub_row) { + for (sub_col = 0; sub_col < 4; ++sub_col) { + calculate_overlaps(overlaps, mb_rows, mb_cols, + &(prev_mi->bmi[sub_row * 4 + sub_col]), + 4 * mb_row + sub_row, 4 * mb_col + sub_col); + } + } +} + +/* Estimate all missing motion vectors. This function does the same as the one + * above, but has different input arguments. */ +static void estimate_missing_mvs(MB_OVERLAP *overlaps, MODE_INFO *mi, + MODE_INFO *prev_mi, int mb_rows, int mb_cols, + unsigned int first_corrupt) { + int mb_row, mb_col; + memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols); + /* First calculate the overlaps for all blocks */ + for (mb_row = 0; mb_row < mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < mb_cols; ++mb_col) { + /* We're only able to use blocks referring to the last frame + * when extrapolating new vectors. + */ + if (prev_mi->mbmi.ref_frame == LAST_FRAME) { + calc_prev_mb_overlaps(overlaps, prev_mi, mb_row, mb_col, mb_rows, + mb_cols); + } + ++prev_mi; + } + ++prev_mi; + } + + mb_row = first_corrupt / mb_cols; + mb_col = first_corrupt - mb_row * mb_cols; + mi += mb_row * (mb_cols + 1) + mb_col; + /* Go through all macroblocks in the current image with missing MVs + * and calculate new MVs using the overlaps. + */ + for (; mb_row < mb_rows; ++mb_row) { + int mb_to_top_edge = -((mb_row * 16)) << 3; + int mb_to_bottom_edge = ((mb_rows - 1 - mb_row) * 16) << 3; + for (; mb_col < mb_cols; ++mb_col) { + int mb_to_left_edge = -((mb_col * 16) << 3); + int mb_to_right_edge = ((mb_cols - 1 - mb_col) * 16) << 3; + const B_OVERLAP *block_overlaps = + overlaps[mb_row * mb_cols + mb_col].overlaps; + mi->mbmi.ref_frame = LAST_FRAME; + mi->mbmi.mode = SPLITMV; + mi->mbmi.uv_mode = DC_PRED; + mi->mbmi.partitioning = 3; + mi->mbmi.segment_id = 0; + estimate_mb_mvs(block_overlaps, mi, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + ++mi; + } + mb_col = 0; + ++mi; + } +} + +void vp8_estimate_missing_mvs(VP8D_COMP *pbi) { + VP8_COMMON *const pc = &pbi->common; + estimate_missing_mvs(pbi->overlaps, pc->mi, pc->prev_mi, pc->mb_rows, + pc->mb_cols, pbi->mvs_corrupt_from_mb); +} + +static void assign_neighbor(EC_BLOCK *neighbor, MODE_INFO *mi, int block_idx) { + assert(mi->mbmi.ref_frame < MAX_REF_FRAMES); + neighbor->ref_frame = mi->mbmi.ref_frame; + neighbor->mv = mi->bmi[block_idx].mv.as_mv; +} + +/* Finds the neighboring blocks of a macroblocks. In the general case + * 20 blocks are found. If a fewer number of blocks are found due to + * image boundaries, those positions in the EC_BLOCK array are left "empty". + * The neighbors are enumerated with the upper-left neighbor as the first + * element, the second element refers to the neighbor to right of the previous + * neighbor, and so on. The last element refers to the neighbor below the first + * neighbor. + */ +static void find_neighboring_blocks(MODE_INFO *mi, EC_BLOCK *neighbors, + int mb_row, int mb_col, int mb_rows, + int mb_cols, int mi_stride) { + int i = 0; + int j; + if (mb_row > 0) { + /* upper left */ + if (mb_col > 0) assign_neighbor(&neighbors[i], mi - mi_stride - 1, 15); + ++i; + /* above */ + for (j = 12; j < 16; ++j, ++i) + assign_neighbor(&neighbors[i], mi - mi_stride, j); + } else + i += 5; + if (mb_col < mb_cols - 1) { + /* upper right */ + if (mb_row > 0) assign_neighbor(&neighbors[i], mi - mi_stride + 1, 12); + ++i; + /* right */ + for (j = 0; j <= 12; j += 4, ++i) assign_neighbor(&neighbors[i], mi + 1, j); + } else + i += 5; + if (mb_row < mb_rows - 1) { + /* lower right */ + if (mb_col < mb_cols - 1) + assign_neighbor(&neighbors[i], mi + mi_stride + 1, 0); + ++i; + /* below */ + for (j = 0; j < 4; ++j, ++i) + assign_neighbor(&neighbors[i], mi + mi_stride, j); + } else + i += 5; + if (mb_col > 0) { + /* lower left */ + if (mb_row < mb_rows - 1) + assign_neighbor(&neighbors[i], mi + mi_stride - 1, 4); + ++i; + /* left */ + for (j = 3; j < 16; j += 4, ++i) { + assign_neighbor(&neighbors[i], mi - 1, j); + } + } else + i += 5; + assert(i == 20); +} + +/* Interpolates all motion vectors for a macroblock from the neighboring blocks' + * motion vectors. + */ +static void interpolate_mvs(MACROBLOCKD *mb, EC_BLOCK *neighbors, + MV_REFERENCE_FRAME dom_ref_frame) { + int row, col, i; + MODE_INFO *const mi = mb->mode_info_context; + /* Table with the position of the neighboring blocks relative the position + * of the upper left block of the current MB. Starting with the upper left + * neighbor and going to the right. + */ + const EC_POS neigh_pos[NUM_NEIGHBORS] = { + { -1, -1 }, { -1, 0 }, { -1, 1 }, { -1, 2 }, { -1, 3 }, { -1, 4 }, { 0, 4 }, + { 1, 4 }, { 2, 4 }, { 3, 4 }, { 4, 4 }, { 4, 3 }, { 4, 2 }, { 4, 1 }, + { 4, 0 }, { 4, -1 }, { 3, -1 }, { 2, -1 }, { 1, -1 }, { 0, -1 } + }; + mi->mbmi.need_to_clamp_mvs = 0; + for (row = 0; row < 4; ++row) { + int mb_to_top_edge = mb->mb_to_top_edge + ((row * 4) << 3); + int mb_to_bottom_edge = mb->mb_to_bottom_edge - ((row * 4) << 3); + for (col = 0; col < 4; ++col) { + int mb_to_left_edge = mb->mb_to_left_edge + ((col * 4) << 3); + int mb_to_right_edge = mb->mb_to_right_edge - ((col * 4) << 3); + int w_sum = 0; + int mv_row_sum = 0; + int mv_col_sum = 0; + int_mv *const mv = &(mi->bmi[row * 4 + col].mv); + mv->as_int = 0; + for (i = 0; i < NUM_NEIGHBORS; ++i) { + /* Calculate the weighted sum of neighboring MVs referring + * to the dominant frame type. + */ + const int w = weights_q7[abs(row - neigh_pos[i].row)] + [abs(col - neigh_pos[i].col)]; + if (neighbors[i].ref_frame != dom_ref_frame) continue; + w_sum += w; + /* Q7 * Q3 = Q10 */ + mv_row_sum += w * neighbors[i].mv.row; + mv_col_sum += w * neighbors[i].mv.col; + } + if (w_sum > 0) { + /* Avoid division by zero. + * Normalize with the sum of the coefficients + * Q3 = Q10 / Q7 + */ + mv->as_mv.row = mv_row_sum / w_sum; + mv->as_mv.col = mv_col_sum / w_sum; + mi->mbmi.need_to_clamp_mvs |= + vp8_check_mv_bounds(mv, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + } + } + } +} + +void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col, + int mb_rows, int mb_cols) { + /* Find relevant neighboring blocks */ + EC_BLOCK neighbors[NUM_NEIGHBORS]; + int i; + /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */ + for (i = 0; i < NUM_NEIGHBORS; ++i) { + neighbors[i].ref_frame = MAX_REF_FRAMES; + neighbors[i].mv.row = neighbors[i].mv.col = 0; + } + find_neighboring_blocks(mb->mode_info_context, neighbors, mb_row, mb_col, + mb_rows, mb_cols, mb->mode_info_stride); + /* Interpolate MVs for the missing blocks from the surrounding + * blocks which refer to the last frame. */ + interpolate_mvs(mb, neighbors, LAST_FRAME); + + mb->mode_info_context->mbmi.ref_frame = LAST_FRAME; + mb->mode_info_context->mbmi.mode = SPLITMV; + mb->mode_info_context->mbmi.uv_mode = DC_PRED; + mb->mode_info_context->mbmi.partitioning = 3; + mb->mode_info_context->mbmi.segment_id = 0; +} diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.h b/media/libvpx/libvpx/vp8/decoder/error_concealment.h new file mode 100644 index 0000000000..608a79f189 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ + +#include "onyxd_int.h" +#include "ec_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Allocate memory for the overlap lists */ +int vp8_alloc_overlap_lists(VP8D_COMP *pbi); + +/* Deallocate the overlap lists */ +void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi); + +/* Estimate all missing motion vectors. */ +void vp8_estimate_missing_mvs(VP8D_COMP *pbi); + +/* Functions for spatial MV interpolation */ + +/* Interpolates all motion vectors for a macroblock mb at position + * (mb_row, mb_col). */ +void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col, + int mb_rows, int mb_cols); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c new file mode 100644 index 0000000000..2248345ba2 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/onyxc_int.h" +#if CONFIG_POSTPROC +#include "vp8/common/postproc.h" +#endif +#include "vp8/common/onyxd.h" +#include "onyxd_int.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/alloccommon.h" +#include "vp8/common/common.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/swapyv12buffer.h" +#include "vp8/common/threading.h" +#include "decoderthreading.h" +#include +#include + +#include "vp8/common/quant_common.h" +#include "vp8/common/reconintra.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx_scale/vpx_scale.h" +#include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" +#include "vpx_ports/vpx_timer.h" +#include "detokenize.h" +#if CONFIG_ERROR_CONCEALMENT +#include "error_concealment.h" +#endif +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif + +extern void vp8_init_loop_filter(VP8_COMMON *cm); +static int get_free_fb(VP8_COMMON *cm); +static void ref_cnt_fb(int *buf, int *idx, int new_idx); + +static void initialize_dec(void) { + static volatile int init_done = 0; + + if (!init_done) { + vpx_dsp_rtcd(); + vp8_init_intra_predictors(); + init_done = 1; + } +} + +static void remove_decompressor(VP8D_COMP *pbi) { +#if CONFIG_ERROR_CONCEALMENT + vp8_de_alloc_overlap_lists(pbi); +#endif + vp8_remove_common(&pbi->common); + vpx_free(pbi); +} + +static struct VP8D_COMP *create_decompressor(VP8D_CONFIG *oxcf) { + VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP)); + + if (!pbi) return NULL; + + memset(pbi, 0, sizeof(VP8D_COMP)); + + if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; + remove_decompressor(pbi); + return 0; + } + + pbi->common.error.setjmp = 1; + + vp8_create_common(&pbi->common); + + pbi->common.current_video_frame = 0; + pbi->ready_for_new_data = 1; + + /* vp8cx_init_de_quantizer() is first called here. Add check in + * frame_init_dequantizer() to avoid + * unnecessary calling of vp8cx_init_de_quantizer() for every frame. + */ + vp8cx_init_de_quantizer(pbi); + + vp8_loop_filter_init(&pbi->common); + + pbi->common.error.setjmp = 0; + +#if CONFIG_ERROR_CONCEALMENT + pbi->ec_enabled = oxcf->error_concealment; + pbi->overlaps = NULL; +#else + (void)oxcf; + pbi->ec_enabled = 0; +#endif + /* Error concealment is activated after a key frame has been + * decoded without errors when error concealment is enabled. + */ + pbi->ec_active = 0; + + pbi->decoded_key_frame = 0; + + /* Independent partitions is activated when a frame updates the + * token probability table to have equal probabilities over the + * PREV_COEF context. + */ + pbi->independent_partitions = 0; + + vp8_setup_block_dptrs(&pbi->mb); + + once(initialize_dec); + + return pbi; +} + +vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP8_COMMON *cm = &pbi->common; + int ref_fb_idx; + + if (ref_frame_flag == VP8_LAST_FRAME) { + ref_fb_idx = cm->lst_fb_idx; + } else if (ref_frame_flag == VP8_GOLD_FRAME) { + ref_fb_idx = cm->gld_fb_idx; + } else if (ref_frame_flag == VP8_ALTR_FRAME) { + ref_fb_idx = cm->alt_fb_idx; + } else { + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return pbi->common.error.error_code; + } + + if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height || + cm->yv12_fb[ref_fb_idx].y_width != sd->y_width || + cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height || + cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) { + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else + vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); + + return pbi->common.error.error_code; +} + +vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, + enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP8_COMMON *cm = &pbi->common; + int *ref_fb_ptr = NULL; + int free_fb; + + if (ref_frame_flag == VP8_LAST_FRAME) { + ref_fb_ptr = &cm->lst_fb_idx; + } else if (ref_frame_flag == VP8_GOLD_FRAME) { + ref_fb_ptr = &cm->gld_fb_idx; + } else if (ref_frame_flag == VP8_ALTR_FRAME) { + ref_fb_ptr = &cm->alt_fb_idx; + } else { + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return pbi->common.error.error_code; + } + + if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height || + cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width || + cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height || + cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) { + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + /* Find an empty frame buffer. */ + free_fb = get_free_fb(cm); + /* Decrease fb_idx_ref_cnt since it will be increased again in + * ref_cnt_fb() below. */ + cm->fb_idx_ref_cnt[free_fb]--; + + /* Manage the reference counters and copy image. */ + ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb); + vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]); + } + + return pbi->common.error.error_code; +} + +static int get_free_fb(VP8_COMMON *cm) { + int i; + for (i = 0; i < NUM_YV12_BUFFERS; ++i) { + if (cm->fb_idx_ref_cnt[i] == 0) break; + } + + assert(i < NUM_YV12_BUFFERS); + cm->fb_idx_ref_cnt[i] = 1; + return i; +} + +static void ref_cnt_fb(int *buf, int *idx, int new_idx) { + if (buf[*idx] > 0) buf[*idx]--; + + *idx = new_idx; + + buf[new_idx]++; +} + +/* If any buffer copy / swapping is signalled it should be done here. */ +static int swap_frame_buffers(VP8_COMMON *cm) { + int err = 0; + + /* The alternate reference frame or golden frame can be updated + * using the new, last, or golden/alt ref frame. If it + * is updated using the newly decoded frame it is a refresh. + * An update using the last or golden/alt ref frame is a copy. + */ + if (cm->copy_buffer_to_arf) { + int new_fb = 0; + + if (cm->copy_buffer_to_arf == 1) { + new_fb = cm->lst_fb_idx; + } else if (cm->copy_buffer_to_arf == 2) { + new_fb = cm->gld_fb_idx; + } else { + err = -1; + } + + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb); + } + + if (cm->copy_buffer_to_gf) { + int new_fb = 0; + + if (cm->copy_buffer_to_gf == 1) { + new_fb = cm->lst_fb_idx; + } else if (cm->copy_buffer_to_gf == 2) { + new_fb = cm->alt_fb_idx; + } else { + err = -1; + } + + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb); + } + + if (cm->refresh_golden_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx); + } + + if (cm->refresh_alt_ref_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx); + } + + if (cm->refresh_last_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx); + + cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx]; + } else { + cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + } + + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + + return err; +} + +static int check_fragments_for_errors(VP8D_COMP *pbi) { + if (!pbi->ec_active && pbi->fragments.count <= 1 && + pbi->fragments.sizes[0] == 0) { + VP8_COMMON *cm = &pbi->common; + + /* If error concealment is disabled we won't signal missing frames + * to the decoder. + */ + if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1) { + /* The last reference shares buffer with another reference + * buffer. Move it to its own buffer before setting it as + * corrupt, otherwise we will make multiple buffers corrupt. + */ + const int prev_idx = cm->lst_fb_idx; + cm->fb_idx_ref_cnt[prev_idx]--; + cm->lst_fb_idx = get_free_fb(cm); + vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx], &cm->yv12_fb[cm->lst_fb_idx]); + } + /* This is used to signal that we are missing frames. + * We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; + + /* Signal that we have no frame to show. */ + cm->show_frame = 0; + + /* Nothing more to do. */ + return 0; + } + + return 1; +} + +int vp8dx_receive_compressed_data(VP8D_COMP *pbi) { + VP8_COMMON *cm = &pbi->common; + int retcode = -1; + + pbi->common.error.error_code = VPX_CODEC_OK; + + retcode = check_fragments_for_errors(pbi); + if (retcode <= 0) return retcode; + + cm->new_fb_idx = get_free_fb(cm); + + /* setup reference frames for vp8_decode_frame */ + pbi->dec_fb_ref[INTRA_FRAME] = &cm->yv12_fb[cm->new_fb_idx]; + pbi->dec_fb_ref[LAST_FRAME] = &cm->yv12_fb[cm->lst_fb_idx]; + pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx]; + pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx]; + + retcode = vp8_decode_frame(pbi); + + if (retcode < 0) { + if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) { + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + } + + pbi->common.error.error_code = VPX_CODEC_ERROR; + // Propagate the error info. + if (pbi->mb.error_info.error_code != 0) { + pbi->common.error.error_code = pbi->mb.error_info.error_code; + memcpy(pbi->common.error.detail, pbi->mb.error_info.detail, + sizeof(pbi->mb.error_info.detail)); + } + goto decode_exit; + } + + if (swap_frame_buffers(cm)) { + pbi->common.error.error_code = VPX_CODEC_ERROR; + goto decode_exit; + } + + vpx_clear_system_state(); + + if (cm->show_frame) { + cm->current_video_frame++; + cm->show_frame_mi = cm->mi; + } + +#if CONFIG_ERROR_CONCEALMENT + /* swap the mode infos to storage for future error concealment */ + if (pbi->ec_enabled && pbi->common.prev_mi) { + MODE_INFO *tmp = pbi->common.prev_mi; + int row, col; + pbi->common.prev_mi = pbi->common.mi; + pbi->common.mi = tmp; + + /* Propagate the segment_ids to the next frame */ + for (row = 0; row < pbi->common.mb_rows; ++row) { + for (col = 0; col < pbi->common.mb_cols; ++col) { + const int i = row * pbi->common.mode_info_stride + col; + pbi->common.mi[i].mbmi.segment_id = + pbi->common.prev_mi[i].mbmi.segment_id; + } + } + } +#endif + + pbi->ready_for_new_data = 0; + +decode_exit: + vpx_clear_system_state(); + return retcode; +} +int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, + vp8_ppflags_t *flags) { + int ret = -1; + + if (pbi->ready_for_new_data == 1) return ret; + + /* ie no raw frame to show!!! */ + if (pbi->common.show_frame == 0) return ret; + + pbi->ready_for_new_data = 1; + +#if CONFIG_POSTPROC + ret = vp8_post_proc_frame(&pbi->common, sd, flags); +#else + (void)flags; + + if (pbi->common.frame_to_show) { + *sd = *pbi->common.frame_to_show; + sd->y_width = pbi->common.Width; + sd->y_height = pbi->common.Height; + sd->uv_height = pbi->common.Height / 2; + ret = 0; + } else { + ret = -1; + } + +#endif /*!CONFIG_POSTPROC*/ + vpx_clear_system_state(); + return ret; +} + +/* This function as written isn't decoder specific, but the encoder has + * much faster ways of computing this, so it's ok for it to live in a + * decode specific file. + */ +int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame) { + const MODE_INFO *mi = oci->mi; + int mb_row, mb_col; + + for (mb_row = 0; mb_row < oci->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < oci->mb_cols; mb_col++, mi++) { + if (mi->mbmi.ref_frame == ref_frame) return 1; + } + mi++; + } + return 0; +} + +int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { + /* decoder instance for single thread mode */ + fb->pbi[0] = create_decompressor(oxcf); + if (!fb->pbi[0]) return VPX_CODEC_ERROR; + +#if CONFIG_MULTITHREAD + if (setjmp(fb->pbi[0]->common.error.jmp)) { + vp8_remove_decoder_instances(fb); + vp8_zero(fb->pbi); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; + } + + fb->pbi[0]->common.error.setjmp = 1; + fb->pbi[0]->max_threads = oxcf->max_threads; + vp8_decoder_create_threads(fb->pbi[0]); + fb->pbi[0]->common.error.setjmp = 0; +#endif + return VPX_CODEC_OK; +} + +int vp8_remove_decoder_instances(struct frame_buffers *fb) { + VP8D_COMP *pbi = fb->pbi[0]; + + if (!pbi) return VPX_CODEC_ERROR; +#if CONFIG_MULTITHREAD + vp8_decoder_remove_threads(pbi); +#endif + + /* decoder instance for single thread mode */ + remove_decompressor(pbi); + return VPX_CODEC_OK; +} + +int vp8dx_get_quantizer(const VP8D_COMP *pbi) { + return pbi->common.base_qindex; +} diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h new file mode 100644 index 0000000000..1070849620 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_ONYXD_INT_H_ +#define VPX_VP8_DECODER_ONYXD_INT_H_ + +#include + +#include "vpx_config.h" +#include "vp8/common/onyxd.h" +#include "treereader.h" +#include "vp8/common/onyxc_int.h" +#include "vp8/common/threading.h" + +#if CONFIG_ERROR_CONCEALMENT +#include "ec_types.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int ithread; + void *ptr1; + void *ptr2; +} DECODETHREAD_DATA; + +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; + +typedef struct { + int enabled; + unsigned int count; + const unsigned char *ptrs[MAX_PARTITIONS]; + unsigned int sizes[MAX_PARTITIONS]; +} FRAGMENT_DATA; + +#define MAX_FB_MT_DEC 32 + +struct frame_buffers { + /* + * this struct will be populated with frame buffer management + * info in future commits. */ + + /* decoder instances */ + struct VP8D_COMP *pbi[MAX_FB_MT_DEC]; +}; + +typedef struct VP8D_COMP { + DECLARE_ALIGNED(16, MACROBLOCKD, mb); + + YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS]; + + DECLARE_ALIGNED(16, VP8_COMMON, common); + + /* the last partition will be used for the modes/mvs */ + vp8_reader mbc[MAX_PARTITIONS]; + + VP8D_CONFIG oxcf; + + FRAGMENT_DATA fragments; + +#if CONFIG_MULTITHREAD + /* variable for threading */ + + vpx_atomic_int b_multithreaded_rd; + int max_threads; + int current_mb_col_main; + unsigned int decoding_thread_count; + int allocated_decoding_thread_count; + + int mt_baseline_filter_level[MAX_MB_SEGMENTS]; + int sync_range; + /* Each row remembers its already decoded column. */ + vpx_atomic_int *mt_current_mb_col; + + unsigned char **mt_yabove_row; /* mb_rows x width */ + unsigned char **mt_uabove_row; + unsigned char **mt_vabove_row; + unsigned char **mt_yleft_col; /* mb_rows x 16 */ + unsigned char **mt_uleft_col; /* mb_rows x 8 */ + unsigned char **mt_vleft_col; /* mb_rows x 8 */ + + MB_ROW_DEC *mb_row_di; + DECODETHREAD_DATA *de_thread_data; + + pthread_t *h_decoding_thread; + sem_t *h_event_start_decoding; + sem_t h_event_end_decoding; +/* end of threading data */ +#endif + + int ready_for_new_data; + + vp8_prob prob_intra; + vp8_prob prob_last; + vp8_prob prob_gf; + vp8_prob prob_skip_false; + +#if CONFIG_ERROR_CONCEALMENT + MB_OVERLAP *overlaps; + /* the mb num from which modes and mvs (first partition) are corrupt */ + unsigned int mvs_corrupt_from_mb; +#endif + int ec_enabled; + int ec_active; + int decoded_key_frame; + int independent_partitions; + int frame_corrupt_residual; + + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif +} VP8D_COMP; + +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); +int vp8_decode_frame(VP8D_COMP *pbi); + +int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); +int vp8_remove_decoder_instances(struct frame_buffers *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_ONYXD_INT_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c new file mode 100644 index 0000000000..6ccb080cf9 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/threading.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1 +#include +#endif +#include "onyxd_int.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/common.h" +#include "vp8/common/threading.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/extend.h" +#include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" +#include "detokenize.h" +#include "vp8/common/reconintra4x4.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/setupintrarecon.h" +#if CONFIG_ERROR_CONCEALMENT +#include "error_concealment.h" +#endif + +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ + } while (0) + +static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, + MB_ROW_DEC *mbrd, int count) { + VP8_COMMON *const pc = &pbi->common; + int i; + + for (i = 0; i < count; ++i) { + MACROBLOCKD *mbd = &mbrd[i].mbd; + mbd->subpixel_predict = xd->subpixel_predict; + mbd->subpixel_predict8x4 = xd->subpixel_predict8x4; + mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; + mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; + + mbd->frame_type = pc->frame_type; + mbd->pre = xd->pre; + mbd->dst = xd->dst; + + mbd->segmentation_enabled = xd->segmentation_enabled; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; + memcpy(mbd->segment_feature_data, xd->segment_feature_data, + sizeof(xd->segment_feature_data)); + + /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/ + memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas)); + /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/ + memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas)); + /*unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update;*/ + mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled; + mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update; + + mbd->current_bc = &pbi->mbc[0]; + + memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc)); + memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1)); + memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2)); + memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv)); + + mbd->fullpixel_mask = ~0; + + if (pc->full_pixel) mbd->fullpixel_mask = ~7; + } + + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); +} + +static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, + unsigned int mb_idx) { + MB_PREDICTION_MODE mode; + int i; +#if CONFIG_ERROR_CONCEALMENT + int corruption_detected = 0; +#else + (void)mb_idx; +#endif + + if (xd->mode_info_context->mbmi.mb_skip_coeff) { + vp8_reset_mb_tokens_context(xd); + } else if (!vp8dx_bool_error(xd->current_bc)) { + int eobtotal; + eobtotal = vp8_decode_mb_tokens(pbi, xd); + + /* Special case: Force the loopfilter to skip when eobtotal is zero */ + xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal == 0); + } + + mode = xd->mode_info_context->mbmi.mode; + + if (xd->segmentation_enabled) vp8_mb_init_dequantizer(pbi, xd); + +#if CONFIG_ERROR_CONCEALMENT + + if (pbi->ec_active) { + int throw_residual; + /* When we have independent partitions we can apply residual even + * though other partitions within the frame are corrupt. + */ + throw_residual = + (!pbi->independent_partitions && pbi->frame_corrupt_residual); + throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); + + if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) { + /* MB with corrupt residuals or corrupt mode/motion vectors. + * Better to use the predictor as reconstruction. + */ + pbi->frame_corrupt_residual = 1; + memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + + corruption_detected = 1; + + /* force idct to be skipped for B_PRED and use the + * prediction only for reconstruction + * */ + memset(xd->eobs, 0, 25); + } + } +#endif + + /* do prediction */ + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + vp8_build_intra_predictors_mbuv_s( + xd, xd->recon_above[1], xd->recon_above[2], xd->recon_left[1], + xd->recon_left[2], xd->recon_left_stride[1], xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride); + + if (mode != B_PRED) { + vp8_build_intra_predictors_mby_s( + xd, xd->recon_above[0], xd->recon_left[0], xd->recon_left_stride[0], + xd->dst.y_buffer, xd->dst.y_stride); + } else { + short *DQC = xd->dequant_y1; + int dst_stride = xd->dst.y_stride; + + /* clear out residual eob info */ + if (xd->mode_info_context->mbmi.mb_skip_coeff) memset(xd->eobs, 0, 25); + + intra_prediction_down_copy(xd, xd->recon_above[0] + 16); + + for (i = 0; i < 16; ++i) { + BLOCKD *b = &xd->block[i]; + unsigned char *dst = xd->dst.y_buffer + b->offset; + B_PREDICTION_MODE b_mode = xd->mode_info_context->bmi[i].as_mode; + unsigned char *Above; + unsigned char *yleft; + int left_stride; + unsigned char top_left; + + /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 + * above-right).*/ + if (i < 4 && pbi->common.filter_level) { + Above = xd->recon_above[0] + b->offset; + } else { + Above = dst - dst_stride; + } + + if (i % 4 == 0 && pbi->common.filter_level) { + yleft = xd->recon_left[0] + i; + left_stride = 1; + } else { + yleft = dst - 1; + left_stride = dst_stride; + } + + if ((i == 4 || i == 8 || i == 12) && pbi->common.filter_level) { + top_left = *(xd->recon_left[0] + i - 1); + } else { + top_left = Above[-1]; + } + + vp8_intra4x4_predict(Above, yleft, left_stride, b_mode, dst, dst_stride, + top_left); + + if (xd->eobs[i]) { + if (xd->eobs[i] > 1) { + vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride); + } else { + vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0], dst, dst_stride, dst, + dst_stride); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + } + } + } + } else { + vp8_build_inter_predictors_mb(xd); + } + +#if CONFIG_ERROR_CONCEALMENT + if (corruption_detected) { + return; + } +#endif + + if (!xd->mode_info_context->mbmi.mb_skip_coeff) { + /* dequantization and idct */ + if (mode != B_PRED) { + short *DQC = xd->dequant_y1; + + if (mode != SPLITMV) { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) { + vp8_dequantize_b(b, xd->dequant_y2); + + vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff); + memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0])); + } else { + b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0]; + vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + + /* override the dc dequant constant in order to preserve the + * dc components + */ + DQC = xd->dequant_y1_dc; + } + + vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); + } + + vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs + 16); + } +} + +static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, + int start_mb_row) { + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; + int mb_row; + VP8_COMMON *pc = &pbi->common; + const int nsync = pbi->sync_range; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); + int num_part = 1 << pbi->common.multi_token_partition; + int last_mb_row = start_mb_row; + + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME]; + + int recon_y_stride = yv12_fb_new->y_stride; + int recon_uv_stride = yv12_fb_new->uv_stride; + + unsigned char *ref_buffer[MAX_REF_FRAMES][3]; + unsigned char *dst_buffer[3]; + int i; + int ref_fb_corrupted[MAX_REF_FRAMES]; + + ref_fb_corrupted[INTRA_FRAME] = 0; + + for (i = 1; i < MAX_REF_FRAMES; ++i) { + YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i]; + + ref_buffer[i][0] = this_fb->y_buffer; + ref_buffer[i][1] = this_fb->u_buffer; + ref_buffer[i][2] = this_fb->v_buffer; + + ref_fb_corrupted[i] = this_fb->corrupted; + } + + dst_buffer[0] = yv12_fb_new->y_buffer; + dst_buffer[1] = yv12_fb_new->u_buffer; + dst_buffer[2] = yv12_fb_new->v_buffer; + + xd->up_available = (start_mb_row != 0); + + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + + for (mb_row = start_mb_row; mb_row < pc->mb_rows; + mb_row += (pbi->decoding_thread_count + 1)) { + int recon_yoffset, recon_uvoffset; + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &pc->lf_info; + + /* save last row processed by this thread */ + last_mb_row = mb_row; + /* select bool coder for current partition */ + xd->current_bc = &pbi->mbc[mb_row % num_part]; + + if (mb_row > 0) { + last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row - 1]; + } else { + last_row_current_mb_col = &first_row_no_sync_above; + } + + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + + recon_yoffset = mb_row * recon_y_stride * 16; + recon_uvoffset = mb_row * recon_uv_stride * 8; + + /* reset contexts */ + xd->above_context = pc->above_context; + memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + + xd->left_available = 0; + + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + + if (pbi->common.filter_level) { + xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0 * 16 + 32; + xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0 * 8 + 16; + xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0 * 8 + 16; + + xd->recon_left[0] = pbi->mt_yleft_col[mb_row]; + xd->recon_left[1] = pbi->mt_uleft_col[mb_row]; + xd->recon_left[2] = pbi->mt_vleft_col[mb_row]; + + /* TODO: move to outside row loop */ + xd->recon_left_stride[0] = 1; + xd->recon_left_stride[1] = 1; + } else { + xd->recon_above[0] = dst_buffer[0] + recon_yoffset; + xd->recon_above[1] = dst_buffer[1] + recon_uvoffset; + xd->recon_above[2] = dst_buffer[2] + recon_uvoffset; + + xd->recon_left[0] = xd->recon_above[0] - 1; + xd->recon_left[1] = xd->recon_above[1] - 1; + xd->recon_left[2] = xd->recon_above[2] - 1; + + xd->recon_above[0] -= xd->dst.y_stride; + xd->recon_above[1] -= xd->dst.uv_stride; + xd->recon_above[2] -= xd->dst.uv_stride; + + /* TODO: move to outside row loop */ + xd->recon_left_stride[0] = xd->dst.y_stride; + xd->recon_left_stride[1] = xd->dst.uv_stride; + + setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1], + xd->recon_left[2], xd->dst.y_stride, + xd->dst.uv_stride); + } + + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } + + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); + } + + /* Distance of MB to the various image edges. + * These are specified to 8th pel as they are always + * compared to values that are in 1/8th pel units. + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3; + +#if CONFIG_ERROR_CONCEALMENT + { + int corrupt_residual = + (!pbi->independent_partitions && pbi->frame_corrupt_residual) || + vp8dx_bool_error(xd->current_bc); + if (pbi->ec_active && + (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && + corrupt_residual) { + /* We have an intra block with corrupt + * coefficients, better to conceal with an inter + * block. + * Interpolate MVs from neighboring MBs + * + * Note that for the first mb with corrupt + * residual in a frame, we might not discover + * that before decoding the residual. That + * happens after this check, and therefore no + * inter concealment will be done. + */ + vp8_interpolate_motion(xd, mb_row, mb_col, pc->mb_rows, pc->mb_cols); + } + } +#endif + + xd->dst.y_buffer = dst_buffer[0] + recon_yoffset; + xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; + xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; + + /* propagate errors from reference frames */ + xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + + if (xd->corrupted) { + // Move current decoding marcoblock to the end of row for all rows + // assigned to this thread, such that other threads won't be waiting. + for (; mb_row < pc->mb_rows; + mb_row += (pbi->decoding_thread_count + 1)) { + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync); + } + vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Corrupted reference frame"); + } + + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used. + xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } + mt_decode_macroblock(pbi, xd, 0); + + xd->left_available = 1; + + /* check if the boolean decoder has suffered an error */ + xd->corrupted |= vp8dx_bool_error(xd->current_bc); + + xd->recon_above[0] += 16; + xd->recon_above[1] += 8; + xd->recon_above[2] += 8; + + if (!pbi->common.filter_level) { + xd->recon_left[0] += 16; + xd->recon_left[1] += 8; + xd->recon_left[2] += 8; + } + + if (pbi->common.filter_level) { + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = + lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (mb_row != pc->mb_rows - 1) { + /* Save decoded MB last row data for next-row decoding */ + memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col * 16), + (xd->dst.y_buffer + 15 * recon_y_stride), 16); + memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col * 8), + (xd->dst.u_buffer + 7 * recon_uv_stride), 8); + memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col * 8), + (xd->dst.v_buffer + 7 * recon_uv_stride), 8); + } + + /* save left_col for next MB decoding */ + if (mb_col != pc->mb_cols - 1) { + MODE_INFO *next = xd->mode_info_context + 1; + + if (next->mbmi.ref_frame == INTRA_FRAME) { + for (i = 0; i < 16; ++i) { + pbi->mt_yleft_col[mb_row][i] = + xd->dst.y_buffer[i * recon_y_stride + 15]; + } + for (i = 0; i < 8; ++i) { + pbi->mt_uleft_col[mb_row][i] = + xd->dst.u_buffer[i * recon_uv_stride + 7]; + pbi->mt_vleft_col[mb_row][i] = + xd->dst.v_buffer[i * recon_uv_stride + 7]; + } + } + } + + /* loopfilter on this macroblock. */ + if (filter_level) { + if (pc->filter_type == NORMAL_LOOPFILTER) { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv(xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, recon_y_stride, + recon_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv(xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, recon_y_stride, + recon_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh(xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, recon_y_stride, + recon_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh(xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, recon_y_stride, + recon_uv_stride, &lfi); + } else { + if (mb_col > 0) + vp8_loop_filter_simple_mbv(xd->dst.y_buffer, recon_y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv(xd->dst.y_buffer, recon_y_stride, + lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh(xd->dst.y_buffer, recon_y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh(xd->dst.y_buffer, recon_y_stride, + lfi_n->blim[filter_level]); + } + } + } + + recon_yoffset += 16; + recon_uvoffset += 8; + + ++xd->mode_info_context; /* next mb */ + + xd->above_context++; + } + + /* adjust to the next row of mbs */ + if (pbi->common.filter_level) { + if (mb_row != pc->mb_rows - 1) { + int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS; + int lastuv = (yv12_fb_lst->y_width >> 1) + (VP8BORDERINPIXELS >> 1); + + for (i = 0; i < 4; ++i) { + pbi->mt_yabove_row[mb_row + 1][lasty + i] = + pbi->mt_yabove_row[mb_row + 1][lasty - 1]; + pbi->mt_uabove_row[mb_row + 1][lastuv + i] = + pbi->mt_uabove_row[mb_row + 1][lastuv - 1]; + pbi->mt_vabove_row[mb_row + 1][lastuv + i] = + pbi->mt_vabove_row[mb_row + 1][lastuv - 1]; + } + } + } else { + vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); + } + + /* last MB of row is ready just after extension is done */ + vpx_atomic_store_release(current_mb_col, mb_col + nsync); + + ++xd->mode_info_context; /* skip prediction column */ + xd->up_available = 1; + + /* since we have multithread */ + xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count; + } + + /* signal end of decoding of current thread for current frame */ + if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) + sem_post(&pbi->h_event_end_decoding); +} + +static THREAD_FUNCTION thread_decoding_proc(void *p_data) { + int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; + VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1); + MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2); + ENTROPY_CONTEXT_PLANES mb_row_left_context; + + while (1) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; + + if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { + break; + } else { + MACROBLOCKD *xd = &mbrd->mbd; + xd->left_context = &mb_row_left_context; + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + // Signal the end of decoding for current thread. + sem_post(&pbi->h_event_end_decoding); + continue; + } + xd->error_info.setjmp = 1; + mt_decode_mb_rows(pbi, xd, ithread + 1); + } + } + } + + return 0; +} + +void vp8_decoder_create_threads(VP8D_COMP *pbi) { + int core_count = 0; + unsigned int ithread; + + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); + pbi->allocated_decoding_thread_count = 0; + + /* limit decoding threads to the max number of token partitions */ + core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; + + /* limit decoding threads to the available cores */ + if (core_count > pbi->common.processor_core_count) { + core_count = pbi->common.processor_core_count; + } + + if (core_count > 1) { + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); + pbi->decoding_thread_count = core_count - 1; + + CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); + CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count); + CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32); + CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count); + + if (sem_init(&pbi->h_event_end_decoding, 0, 0)) { + vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to initialize semaphore"); + } + + for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) { + if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; + + vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd); + + pbi->de_thread_data[ithread].ithread = ithread; + pbi->de_thread_data[ithread].ptr1 = (void *)pbi; + pbi->de_thread_data[ithread].ptr2 = (void *)&pbi->mb_row_di[ithread]; + + if (pthread_create(&pbi->h_decoding_thread[ithread], 0, + thread_decoding_proc, &pbi->de_thread_data[ithread])) { + sem_destroy(&pbi->h_event_start_decoding[ithread]); + break; + } + } + + pbi->allocated_decoding_thread_count = ithread; + if (pbi->allocated_decoding_thread_count != + (int)pbi->decoding_thread_count) { + /* the remainder of cleanup cases will be handled in + * vp8_decoder_remove_threads(). */ + if (pbi->allocated_decoding_thread_count == 0) { + sem_destroy(&pbi->h_event_end_decoding); + } + vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to create threads"); + } + } +} + +void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { + int i; + + vpx_free(pbi->mt_current_mb_col); + pbi->mt_current_mb_col = NULL; + + /* Free above_row buffers. */ + if (pbi->mt_yabove_row) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_yabove_row[i]); + pbi->mt_yabove_row[i] = NULL; + } + vpx_free(pbi->mt_yabove_row); + pbi->mt_yabove_row = NULL; + } + + if (pbi->mt_uabove_row) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_uabove_row[i]); + pbi->mt_uabove_row[i] = NULL; + } + vpx_free(pbi->mt_uabove_row); + pbi->mt_uabove_row = NULL; + } + + if (pbi->mt_vabove_row) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_vabove_row[i]); + pbi->mt_vabove_row[i] = NULL; + } + vpx_free(pbi->mt_vabove_row); + pbi->mt_vabove_row = NULL; + } + + /* Free left_col buffers. */ + if (pbi->mt_yleft_col) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_yleft_col[i]); + pbi->mt_yleft_col[i] = NULL; + } + vpx_free(pbi->mt_yleft_col); + pbi->mt_yleft_col = NULL; + } + + if (pbi->mt_uleft_col) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_uleft_col[i]); + pbi->mt_uleft_col[i] = NULL; + } + vpx_free(pbi->mt_uleft_col); + pbi->mt_uleft_col = NULL; + } + + if (pbi->mt_vleft_col) { + for (i = 0; i < mb_rows; ++i) { + vpx_free(pbi->mt_vleft_col[i]); + pbi->mt_vleft_col[i] = NULL; + } + vpx_free(pbi->mt_vleft_col); + pbi->mt_vleft_col = NULL; + } +} + +void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { + VP8_COMMON *const pc = &pbi->common; + int i; + int uv_width; + + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) width += 16 - (width & 0xf); + + if (width < 640) { + pbi->sync_range = 1; + } else if (width <= 1280) { + pbi->sync_range = 8; + } else if (width <= 2560) { + pbi->sync_range = 16; + } else { + pbi->sync_range = 32; + } + + uv_width = width >> 1; + + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); + + /* Allocate memory for above_row buffers. */ + CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (width + (VP8BORDERINPIXELS << 1)))); + vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); + } + + CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); + } + + CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); + } + + /* Allocate memory for left_col buffers. */ + CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], + vpx_calloc(sizeof(unsigned char) * 16, 1)); + + CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], + vpx_calloc(sizeof(unsigned char) * 8, 1)); + + CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; ++i) + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], + vpx_calloc(sizeof(unsigned char) * 8, 1)); + } +} + +void vp8_decoder_remove_threads(VP8D_COMP *pbi) { + /* shutdown MB Decoding thread; */ + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + int i; + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); + + /* allow all threads to exit */ + for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { + sem_post(&pbi->h_event_start_decoding[i]); + pthread_join(pbi->h_decoding_thread[i], NULL); + } + + for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { + sem_destroy(&pbi->h_event_start_decoding[i]); + } + + if (pbi->allocated_decoding_thread_count) { + sem_destroy(&pbi->h_event_end_decoding); + } + + vpx_free(pbi->h_decoding_thread); + pbi->h_decoding_thread = NULL; + + vpx_free(pbi->h_event_start_decoding); + pbi->h_event_start_decoding = NULL; + + vpx_free(pbi->mb_row_di); + pbi->mb_row_di = NULL; + + vpx_free(pbi->de_thread_data); + pbi->de_thread_data = NULL; + + vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); + } +} + +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { + VP8_COMMON *pc = &pbi->common; + unsigned int i; + int j; + + int filter_level = pc->filter_level; + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + + if (filter_level) { + /* Set above_row buffer to 127 for decoding first MB row */ + memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS - 1, 127, + yv12_fb_new->y_width + 5); + memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127, + (yv12_fb_new->y_width >> 1) + 5); + memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS >> 1) - 1, 127, + (yv12_fb_new->y_width >> 1) + 5); + + for (j = 1; j < pc->mb_rows; ++j) { + memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS - 1, (unsigned char)129, + 1); + memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1, + (unsigned char)129, 1); + memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS >> 1) - 1, + (unsigned char)129, 1); + } + + /* Set left_col to 129 initially */ + for (j = 0; j < pc->mb_rows; ++j) { + memset(pbi->mt_yleft_col[j], (unsigned char)129, 16); + memset(pbi->mt_uleft_col[j], (unsigned char)129, 8); + memset(pbi->mt_vleft_col[j], (unsigned char)129, 8); + } + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level); + } else { + vp8_setup_intra_recon_top_line(yv12_fb_new); + } + + setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, + pbi->decoding_thread_count); + + for (i = 0; i < pbi->decoding_thread_count; ++i) { + sem_post(&pbi->h_event_start_decoding[i]); + } + + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + xd->corrupted = 1; + // Wait for other threads to finish. This prevents other threads decoding + // the current frame while the main thread starts decoding the next frame, + // which causes a data race. + for (i = 0; i < pbi->decoding_thread_count; ++i) + sem_wait(&pbi->h_event_end_decoding); + return -1; + } + + xd->error_info.setjmp = 1; + mt_decode_mb_rows(pbi, xd, 0); + + for (i = 0; i < pbi->decoding_thread_count + 1; ++i) + sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + + return 0; +} diff --git a/media/libvpx/libvpx/vp8/decoder/treereader.h b/media/libvpx/libvpx/vp8/decoder/treereader.h new file mode 100644 index 0000000000..4bf938a741 --- /dev/null +++ b/media/libvpx/libvpx/vp8/decoder/treereader.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_DECODER_TREEREADER_H_ +#define VPX_VP8_DECODER_TREEREADER_H_ + +#include "./vpx_config.h" +#include "vp8/common/treecoder.h" +#include "dboolhuff.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef BOOL_DECODER vp8_reader; + +#define vp8_read vp8dx_decode_bool +#define vp8_read_literal vp8_decode_value +#define vp8_read_bit(R) vp8_read(R, vp8_prob_half) + +/* Intent of tree data structure is to make decoding trivial. */ + +static INLINE int vp8_treed_read( + vp8_reader *const r, /* !!! must return a 0 or 1 !!! */ + vp8_tree t, const vp8_prob *const p) { + vp8_tree_index i = 0; + + while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0) { + } + + return -i; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_DECODER_TREEREADER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/denoising_neon.c new file mode 100644 index 0000000000..67267b8f3a --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/denoising_neon.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp8/encoder/denoising.h" +#include "vpx_mem/vpx_mem.h" +#include "./vp8_rtcd.h" + +/* + * The filter function was modified to reduce the computational complexity. + * + * Step 1: + * Instead of applying tap coefficients for each pixel, we calculated the + * pixel adjustments vs. pixel diff value ahead of time. + * adjustment = filtered_value - current_raw + * = (filter_coefficient * diff + 128) >> 8 + * where + * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3)); + * filter_coefficient += filter_coefficient / + * (3 + motion_magnitude_adjustment); + * filter_coefficient is clamped to 0 ~ 255. + * + * Step 2: + * The adjustment vs. diff curve becomes flat very quick when diff increases. + * This allowed us to use only several levels to approximate the curve without + * changing the filtering algorithm too much. + * The adjustments were further corrected by checking the motion magnitude. + * The levels used are: + * diff level adjustment w/o adjustment w/ + * motion correction motion correction + * [-255, -16] 3 -6 -7 + * [-15, -8] 2 -4 -5 + * [-7, -4] 1 -3 -4 + * [-3, 3] 0 diff diff + * [4, 7] 1 3 4 + * [8, 15] 2 4 5 + * [16, 255] 3 6 7 + */ + +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, + int mc_running_avg_y_stride, + unsigned char *running_avg_y, + int running_avg_y_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level, level1 adjustment is + * increased, the deltas stay the same. + */ + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + + /* Go over lines. */ + int r; + for (r = 0; r < 16; ++r) { + /* Load inputs. */ + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + + /* Too much adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD; + + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_running_avg_y_stride * 16; + running_avg_y -= running_avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = + vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = + vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg_y -= running_avg_y_stride * 16; + sig -= sig_stride * 16; + + vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride); + + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, + int mc_running_avg_stride, + unsigned char *running_avg, + int running_avg_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level, level1 adjustment is + * increased, the deltas stay the same. + */ + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3); + + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + int r; + + { + uint16x4_t v_sum_block = vdup_n_u16(0); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const uint8x8_t v_sig = vld1_u8(sig); + const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig); + v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block); + const uint64x1_t _76543210 = vpaddl_u32(_7654_3210); + const int sum_block = vget_lane_s32(vreinterpret_s32_u64(_76543210), 0); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + /* Go over lines. */ + for (r = 0; r < 4; ++r) { + /* Load inputs. */ + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg)); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + + /* Too much adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // checK if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_running_avg_stride * 8; + running_avg -= running_avg_stride * 8; + for (r = 0; r < 4; ++r) { + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + const uint8x8_t v_running_avg_lo = vld1_u8(running_avg); + const uint8x8_t v_running_avg_hi = + vld1_u8(&running_avg[running_avg_stride]); + uint8x16_t v_running_avg = + vcombine_u8(v_running_avg_lo, v_running_avg_hi); + + v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment); + v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], + vget_high_u8(v_running_avg)); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg -= running_avg_stride * 8; + sig -= sig_stride * 8; + + vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride); + + return FILTER_BLOCK; +} diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c new file mode 100644 index 0000000000..950c943343 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" + +static const uint16_t inv_zig_zag[16] = { 1, 2, 6, 7, 3, 5, 8, 13, + 4, 9, 12, 14, 10, 11, 15, 16 }; + +void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { + const int16x8_t one_q = vdupq_n_s16(-1), z0 = vld1q_s16(b->coeff), + z1 = vld1q_s16(b->coeff + 8), round0 = vld1q_s16(b->round), + round1 = vld1q_s16(b->round + 8), + quant0 = vld1q_s16(b->quant_fast), + quant1 = vld1q_s16(b->quant_fast + 8), + dequant0 = vld1q_s16(d->dequant), + dequant1 = vld1q_s16(d->dequant + 8); + const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), + zig_zag1 = vld1q_u16(inv_zig_zag + 8); + int16x8_t x0, x1, sz0, sz1, y0, y1; + uint16x8_t eob0, eob1; +#if !VPX_ARCH_AARCH64 + uint16x4_t eob_d16; + uint32x2_t eob_d32; + uint32x4_t eob_q32; +#endif // !VPX_ARCH_AARCH64 + + /* sign of z: z >> 15 */ + sz0 = vshrq_n_s16(z0, 15); + sz1 = vshrq_n_s16(z1, 15); + + /* x = abs(z) */ + x0 = vabsq_s16(z0); + x1 = vabsq_s16(z1); + + /* x += round */ + x0 = vaddq_s16(x0, round0); + x1 = vaddq_s16(x1, round1); + + /* y = 2 * (x * quant) >> 16 */ + y0 = vqdmulhq_s16(x0, quant0); + y1 = vqdmulhq_s16(x1, quant1); + + /* Compensate for doubling in vqdmulhq */ + y0 = vshrq_n_s16(y0, 1); + y1 = vshrq_n_s16(y1, 1); + + /* Restore sign bit */ + y0 = veorq_s16(y0, sz0); + y1 = veorq_s16(y1, sz1); + x0 = vsubq_s16(y0, sz0); + x1 = vsubq_s16(y1, sz1); + + /* find non-zero elements */ + eob0 = vtstq_s16(x0, one_q); + eob1 = vtstq_s16(x1, one_q); + + /* mask zig zag */ + eob0 = vandq_u16(eob0, zig_zag0); + eob1 = vandq_u16(eob1, zig_zag1); + + /* select the largest value */ + eob0 = vmaxq_u16(eob0, eob1); +#if VPX_ARCH_AARCH64 + *d->eob = (int8_t)vmaxvq_u16(eob0); +#else + eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); + eob_q32 = vmovl_u16(eob_d16); + eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); + eob_d32 = vpmax_u32(eob_d32, eob_d32); + + vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); +#endif // VPX_ARCH_AARCH64 + + /* qcoeff = x */ + vst1q_s16(d->qcoeff, x0); + vst1q_s16(d->qcoeff + 8, x1); + + /* dqcoeff = x * dequant */ + vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); + vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); +} diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c new file mode 100644 index 0000000000..99dff6b520 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d16s16, d17s16, d26s16, dEmptys16; + uint16x4_t d4u16; + int16x8_t q0s16, q1s16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + + d16s16 = vdup_n_s16(5352); + d17s16 = vdup_n_s16(2217); + q9s32 = vdupq_n_s32(14500); + q10s32 = vdupq_n_s32(7500); + q11s32 = vdupq_n_s32(12000); + q12s32 = vdupq_n_s32(51000); + + // Part one + pitch >>= 1; + d0s16 = vld1_s16(input); + input += pitch; + d1s16 = vld1_s16(input); + input += pitch; + d2s16 = vld1_s16(input); + input += pitch; + d3s16 = vld1_s16(input); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + + d4s16 = vshl_n_s16(d4s16, 3); + d5s16 = vshl_n_s16(d5s16, 3); + d6s16 = vshl_n_s16(d6s16, 3); + d7s16 = vshl_n_s16(d7s16, 3); + + d0s16 = vadd_s16(d4s16, d5s16); + d2s16 = vsub_s16(d4s16, d5s16); + + q9s32 = vmlal_s16(q9s32, d7s16, d16s16); + q10s32 = vmlal_s16(q10s32, d7s16, d17s16); + q9s32 = vmlal_s16(q9s32, d6s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d6s16, d16s16); + + d1s16 = vshrn_n_s32(q9s32, 12); + d3s16 = vshrn_n_s32(q10s32, 12); + + // Part two + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + + d26s16 = vdup_n_s16(7); + d4s16 = vadd_s16(d4s16, d26s16); + + d0s16 = vadd_s16(d4s16, d5s16); + d2s16 = vsub_s16(d4s16, d5s16); + + q11s32 = vmlal_s16(q11s32, d7s16, d16s16); + q12s32 = vmlal_s16(q12s32, d7s16, d17s16); + + dEmptys16 = vdup_n_s16(0); + d4u16 = vceq_s16(d7s16, dEmptys16); + + d0s16 = vshr_n_s16(d0s16, 4); + d2s16 = vshr_n_s16(d2s16, 4); + + q11s32 = vmlal_s16(q11s32, d6s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d6s16, d16s16); + + d4u16 = vmvn_u16(d4u16); + d1s16 = vshrn_n_s32(q11s32, 16); + d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16)); + d3s16 = vshrn_n_s32(q12s32, 16); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + return; +} + +void vp8_short_fdct8x4_neon(int16_t *input, int16_t *output, int pitch) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; + uint16x4_t d28u16, d29u16; + uint16x8_t q14u16; + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + int16x8x2_t v2tmp0, v2tmp1; + int32x4x2_t v2tmp2, v2tmp3; + + d16s16 = vdup_n_s16(5352); + d17s16 = vdup_n_s16(2217); + q9s32 = vdupq_n_s32(14500); + q10s32 = vdupq_n_s32(7500); + + // Part one + pitch >>= 1; + q0s16 = vld1q_s16(input); + input += pitch; + q1s16 = vld1q_s16(input); + input += pitch; + q2s16 = vld1q_s16(input); + input += pitch; + q3s16 = vld1q_s16(input); + + v2tmp2 = + vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); + v2tmp3 = + vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q11s16 = vshlq_n_s16(q11s16, 3); + q12s16 = vshlq_n_s16(q12s16, 3); + q13s16 = vshlq_n_s16(q13s16, 3); + q14s16 = vshlq_n_s16(q14s16, 3); + + q0s16 = vaddq_s16(q11s16, q12s16); + q2s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d2s16 = vshrn_n_s32(q9s32, 12); + d6s16 = vshrn_n_s32(q10s32, 12); + d3s16 = vshrn_n_s32(q11s32, 12); + d7s16 = vshrn_n_s32(q12s32, 12); + q1s16 = vcombine_s16(d2s16, d3s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + // Part two + q9s32 = vdupq_n_s32(12000); + q10s32 = vdupq_n_s32(51000); + + v2tmp2 = + vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); + v2tmp3 = + vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q15s16 = vdupq_n_s16(7); + q11s16 = vaddq_s16(q11s16, q15s16); + q0s16 = vaddq_s16(q11s16, q12s16); + q1s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d2s16 = vget_low_s16(q1s16); + d3s16 = vget_high_s16(q1s16); + + d0s16 = vshr_n_s16(d0s16, 4); + d4s16 = vshr_n_s16(d1s16, 4); + d2s16 = vshr_n_s16(d2s16, 4); + d6s16 = vshr_n_s16(d3s16, 4); + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d1s16 = vshrn_n_s32(q9s32, 16); + d3s16 = vshrn_n_s32(q10s32, 16); + d5s16 = vshrn_n_s32(q11s32, 16); + d7s16 = vshrn_n_s32(q12s32, 16); + + qEmptys16 = vdupq_n_s16(0); + q14u16 = vceqq_s16(q14s16, qEmptys16); + q14u16 = vmvnq_u16(q14u16); + + d28u16 = vget_low_u16(q14u16); + d29u16 = vget_high_u16(q14u16); + d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); + d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + vst1q_s16(output + 16, q2s16); + vst1q_s16(output + 24, q3s16); + return; +} diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c new file mode 100644 index 0000000000..02056f2f90 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +#include "./vp8_rtcd.h" +void vp8_short_walsh4x4_neon(int16_t *input, int16_t *output, int pitch) { + vp8_short_walsh4x4_c(input, output, pitch); +} +#else +void vp8_short_walsh4x4_neon(int16_t *input, int16_t *output, int pitch) { + uint16x4_t d16u16; + int16x8_t q0s16, q1s16; + int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32; + int32x4_t q9s32, q10s32, q11s32, q15s32; + uint32x4_t q8u32, q9u32, q10u32, q11u32; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + + dEmptys16 = vdup_n_s16(0); + qEmptys32 = vdupq_n_s32(0); + q15s32 = vdupq_n_s32(3); + + d0s16 = vld1_s16(input); + input += pitch / 2; + d1s16 = vld1_s16(input); + input += pitch / 2; + d2s16 = vld1_s16(input); + input += pitch / 2; + d3s16 = vld1_s16(input); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0 + vreinterpret_s16_s32(v2tmp3.val[0])); // d1 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2 + vreinterpret_s16_s32(v2tmp3.val[1])); // d3 + + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]); + d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]); + d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]); + + d4s16 = vshl_n_s16(d4s16, 2); + d5s16 = vshl_n_s16(d5s16, 2); + d6s16 = vshl_n_s16(d6s16, 2); + d7s16 = vshl_n_s16(d7s16, 2); + + d16u16 = vceq_s16(d4s16, dEmptys16); + d16u16 = vmvn_u16(d16u16); + + d0s16 = vadd_s16(d4s16, d5s16); + d3s16 = vsub_s16(d4s16, d5s16); + d1s16 = vadd_s16(d7s16, d6s16); + d2s16 = vsub_s16(d7s16, d6s16); + + d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16)); + + // Second for-loop + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16)); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16)); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2 + vreinterpret_s16_s32(v2tmp2.val[1])); // d3 + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0 + vreinterpret_s16_s32(v2tmp2.val[0])); // d1 + + q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]); + q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]); + q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]); + q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]); + + q0s32 = vaddq_s32(q8s32, q9s32); + q1s32 = vaddq_s32(q11s32, q10s32); + q2s32 = vsubq_s32(q11s32, q10s32); + q3s32 = vsubq_s32(q8s32, q9s32); + + q8u32 = vcltq_s32(q0s32, qEmptys32); + q9u32 = vcltq_s32(q1s32, qEmptys32); + q10u32 = vcltq_s32(q2s32, qEmptys32); + q11u32 = vcltq_s32(q3s32, qEmptys32); + + q8s32 = vreinterpretq_s32_u32(q8u32); + q9s32 = vreinterpretq_s32_u32(q9u32); + q10s32 = vreinterpretq_s32_u32(q10u32); + q11s32 = vreinterpretq_s32_u32(q11u32); + + q0s32 = vsubq_s32(q0s32, q8s32); + q1s32 = vsubq_s32(q1s32, q9s32); + q2s32 = vsubq_s32(q2s32, q10s32); + q3s32 = vsubq_s32(q3s32, q11s32); + + q8s32 = vaddq_s32(q0s32, q15s32); + q9s32 = vaddq_s32(q1s32, q15s32); + q10s32 = vaddq_s32(q2s32, q15s32); + q11s32 = vaddq_s32(q3s32, q15s32); + + d0s16 = vshrn_n_s32(q8s32, 3); + d1s16 = vshrn_n_s32(q9s32, 3); + d2s16 = vshrn_n_s32(q10s32, 3); + d3s16 = vshrn_n_s32(q11s32, 3); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + return; +} +#endif // VPX_INCOMPATIBLE_GCC diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.c b/media/libvpx/libvpx/vp8/encoder/bitstream.c new file mode 100644 index 0000000000..03691fc9d1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/bitstream.c @@ -0,0 +1,1381 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/header.h" +#include "encodemv.h" +#include "vp8/common/entropymode.h" +#include "vp8/common/findnearmv.h" +#include "mcomp.h" +#include "vp8/common/systemdependent.h" +#include +#include +#include +#include "vpx/vpx_encoder.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" +#include "vpx_ports/system_state.h" +#include "bitstream.h" + +#include "defaultcoefcounts.h" +#include "vp8/common/common.h" + +const int vp8cx_base_skip_false_prob[128] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 251, 248, 244, 240, + 236, 232, 229, 225, 221, 217, 213, 208, 204, 199, 194, 190, 187, 183, 179, + 175, 172, 168, 164, 160, 157, 153, 149, 145, 142, 138, 134, 130, 127, 124, + 120, 117, 114, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, 77, + 74, 71, 68, 65, 62, 59, 56, 53, 50, 47, 44, 41, 38, 35, 32, + 30, 28, 26, 24, 22, 20, 18, 16, +}; + +#if defined(SECTIONBITS_OUTPUT) +unsigned __int64 Sectionbits[500]; +#endif + +#ifdef MODE_STATS +int count_mb_seg[4] = { 0, 0, 0, 0 }; +#endif + +static void update_mode(vp8_writer *const w, int n, vp8_token tok[/* n */], + vp8_tree tree, vp8_prob Pnew[/* n-1 */], + vp8_prob Pcur[/* n-1 */], + unsigned int bct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { + unsigned int new_b = 0, old_b = 0; + int i = 0; + + vp8_tree_probs_from_distribution(n--, tok, tree, Pnew, bct, num_events, 256, + 1); + + do { + new_b += vp8_cost_branch(bct[i], Pnew[i]); + old_b += vp8_cost_branch(bct[i], Pcur[i]); + } while (++i < n); + + if (new_b + (n << 8) < old_b) { + int j = 0; + + vp8_write_bit(w, 1); + + do { + const vp8_prob p = Pnew[j]; + + vp8_write_literal(w, Pcur[j] = p ? p : 1, 8); + } while (++j < n); + } else + vp8_write_bit(w, 0); +} + +static void update_mbintra_mode_probs(VP8_COMP *cpi) { + VP8_COMMON *const x = &cpi->common; + + vp8_writer *const w = cpi->bc; + + { + vp8_prob Pnew[VP8_YMODES - 1]; + unsigned int bct[VP8_YMODES - 1][2]; + + update_mode(w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree, Pnew, + x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count); + } + { + vp8_prob Pnew[VP8_UV_MODES - 1]; + unsigned int bct[VP8_UV_MODES - 1][2]; + + update_mode(w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree, Pnew, + x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count); + } +} + +static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p) { + vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m); +} + +static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) { + vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m); +} + +static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p) { + vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m); +} + +static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p) { + vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m); +} + +static void write_split(vp8_writer *bc, int x) { + vp8_write_token(bc, vp8_mbsplit_tree, vp8_mbsplit_probs, + vp8_mbsplit_encodings + x); +} + +void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w, + const TOKENEXTRA *p, + int xcount) { + const TOKENEXTRA *stop = p + xcount; + unsigned int split; + int shift; + int count = w->count; + unsigned int range = w->range; + unsigned int lowvalue = w->lowvalue; + + while (p < stop) { + const int t = p->Token; + vp8_token *a = vp8_coef_encodings + t; + const vp8_extra_bit_struct *b = vp8_extra_bits + t; + int i = 0; + const unsigned char *pp = p->context_tree; + int v = a->value; + int n = a->Len; + + if (p->skip_eob_node) { + n--; + i = 2; + } + + do { + const int bb = (v >> --n) & 1; + split = 1 + (((range - 1) * pp[i >> 1]) >> 8); + i = vp8_coef_tree[i + bb]; + + if (bb) { + lowvalue += split; + range = range - split; + } else { + range = split; + } + + shift = vp8_norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; + shift = count; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); + count -= 8; + } + + lowvalue <<= shift; + } while (n); + + if (b->base_val) { + const int e = p->Extra, L = b->Len; + + if (L) { + const unsigned char *proba = b->prob; + const int v2 = e >> 1; + int n2 = L; /* number of bits in v2, assumed nonzero */ + i = 0; + + do { + const int bb = (v2 >> --n2) & 1; + split = 1 + (((range - 1) * proba[i >> 1]) >> 8); + i = b->tree[i + bb]; + + if (bb) { + lowvalue += split; + range = range - split; + } else { + range = split; + } + + shift = vp8_norm[range]; + range <<= shift; + count += shift; + + if (count >= 0) { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); + + w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; + shift = count; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); + count -= 8; + } + + lowvalue <<= shift; + } while (n2); + } + + { + split = (range + 1) >> 1; + + if (e & 1) { + lowvalue += split; + range = range - split; + } else { + range = split; + } + + range <<= 1; + + if ((lowvalue & 0x80000000)) { + int x = w->pos - 1; + + while (x >= 0 && w->buffer[x] == 0xff) { + w->buffer[x] = (unsigned char)0; + x--; + } + + w->buffer[x] += 1; + } + + lowvalue <<= 1; + + if (!++count) { + count = -8; + + validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); + + w->buffer[w->pos++] = (lowvalue >> 24); + lowvalue &= 0xffffff; + } + } + } + + ++p; + } + + w->count = count; + w->lowvalue = lowvalue; + w->range = range; +} + +static void write_partition_size(unsigned char *cx_data, int size) { + signed char csize; + + csize = size & 0xff; + *cx_data = csize; + csize = (size >> 8) & 0xff; + *(cx_data + 1) = csize; + csize = (size >> 16) & 0xff; + *(cx_data + 2) = csize; +} + +static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data, + unsigned char *cx_data_end, + int num_part) { + int i; + unsigned char *ptr = cx_data; + unsigned char *ptr_end = cx_data_end; + vp8_writer *w; + + for (i = 0; i < num_part; ++i) { + int mb_row; + + w = cpi->bc + i + 1; + + vp8_start_encode(w, ptr, ptr_end); + + for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part) { + const TOKENEXTRA *p = cpi->tplist[mb_row].start; + const TOKENEXTRA *stop = cpi->tplist[mb_row].stop; + int tokens = (int)(stop - p); + + vp8_pack_tokens(w, p, tokens); + } + + vp8_stop_encode(w); + ptr += w->pos; + } +} + +#if CONFIG_MULTITHREAD +static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w) { + int mb_row; + + for (mb_row = 0; mb_row < cpi->common.mb_rows; ++mb_row) { + const TOKENEXTRA *p = cpi->tplist[mb_row].start; + const TOKENEXTRA *stop = cpi->tplist[mb_row].stop; + int tokens = (int)(stop - p); + + vp8_pack_tokens(w, p, tokens); + } +} +#endif // CONFIG_MULTITHREAD + +static void write_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, + const vp8_prob *p) { + assert(NEARESTMV <= m && m <= SPLITMV); + vp8_write_token(w, vp8_mv_ref_tree, p, + vp8_mv_ref_encoding_array + (m - NEARESTMV)); +} + +static void write_sub_mv_ref(vp8_writer *w, B_PREDICTION_MODE m, + const vp8_prob *p) { + assert(LEFT4X4 <= m && m <= NEW4X4); + vp8_write_token(w, vp8_sub_mv_ref_tree, p, + vp8_sub_mv_ref_encoding_array + (m - LEFT4X4)); +} + +static void write_mv(vp8_writer *w, const MV *mv, const int_mv *ref, + const MV_CONTEXT *mvc) { + MV e; + e.row = mv->row - ref->as_mv.row; + e.col = mv->col - ref->as_mv.col; + + vp8_encode_motion_vector(w, &e, mvc); +} + +static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, + const MACROBLOCKD *x) { + /* Encode the MB segment id. */ + if (x->segmentation_enabled && x->update_mb_segmentation_map) { + switch (mi->segment_id) { + case 0: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[1]); + break; + case 1: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 1, x->mb_segment_tree_probs[1]); + break; + case 2: + vp8_write(w, 1, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[2]); + break; + case 3: + vp8_write(w, 1, x->mb_segment_tree_probs[0]); + vp8_write(w, 1, x->mb_segment_tree_probs[2]); + break; + + /* TRAP.. This should not happen */ + default: + vp8_write(w, 0, x->mb_segment_tree_probs[0]); + vp8_write(w, 0, x->mb_segment_tree_probs[1]); + break; + } + } +} +void vp8_convert_rfct_to_prob(VP8_COMP *const cpi) { + const int *const rfct = cpi->mb.count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = + rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + /* Calculate the probabilities used to code the ref frame based on usage */ + if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter))) { + cpi->prob_intra_coded = 1; + } + + cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + if (!cpi->prob_last_coded) cpi->prob_last_coded = 1; + + cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? (rfct[GOLDEN_FRAME] * 255) / + (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + : 128; + + if (!cpi->prob_gf_coded) cpi->prob_gf_coded = 1; +} + +static void pack_inter_mode_mvs(VP8_COMP *const cpi) { + VP8_COMMON *const pc = &cpi->common; + vp8_writer *const w = cpi->bc; + const MV_CONTEXT *mvc = pc->fc.mvc; + + MODE_INFO *m = pc->mi; + const int mis = pc->mode_info_stride; + int mb_row = -1; + + int prob_skip_false = 0; + + cpi->mb.partition_info = cpi->mb.pi; + + vp8_convert_rfct_to_prob(cpi); + + if (pc->mb_no_coeff_skip) { + int total_mbs = pc->mb_rows * pc->mb_cols; + + prob_skip_false = (total_mbs - cpi->mb.skip_true_count) * 256 / total_mbs; + + if (prob_skip_false <= 1) prob_skip_false = 1; + + if (prob_skip_false > 255) prob_skip_false = 255; + + cpi->prob_skip_false = prob_skip_false; + vp8_write_literal(w, prob_skip_false, 8); + } + + vp8_write_literal(w, cpi->prob_intra_coded, 8); + vp8_write_literal(w, cpi->prob_last_coded, 8); + vp8_write_literal(w, cpi->prob_gf_coded, 8); + + update_mbintra_mode_probs(cpi); + + vp8_write_mvprobs(cpi); + + while (++mb_row < pc->mb_rows) { + int mb_col = -1; + + while (++mb_col < pc->mb_cols) { + const MB_MODE_INFO *const mi = &m->mbmi; + const MV_REFERENCE_FRAME rf = mi->ref_frame; + const MB_PREDICTION_MODE mode = mi->mode; + + MACROBLOCKD *xd = &cpi->mb.e_mbd; + + /* Distance of Mb to the various image edges. + * These specified to 8th pel as they are always compared to MV + * values that are in 1/8th pel units + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + + if (cpi->mb.e_mbd.update_mb_segmentation_map) { + write_mb_features(w, mi, &cpi->mb.e_mbd); + } + + if (pc->mb_no_coeff_skip) { + vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false); + } + + if (rf == INTRA_FRAME) { + vp8_write(w, 0, cpi->prob_intra_coded); + write_ymode(w, mode, pc->fc.ymode_prob); + + if (mode == B_PRED) { + int j = 0; + + do { + write_bmode(w, m->bmi[j].as_mode, pc->fc.bmode_prob); + } while (++j < 16); + } + + write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); + } else { /* inter coded */ + int_mv best_mv; + vp8_prob mv_ref_p[VP8_MVREFS - 1]; + + vp8_write(w, 1, cpi->prob_intra_coded); + + if (rf == LAST_FRAME) + vp8_write(w, 0, cpi->prob_last_coded); + else { + vp8_write(w, 1, cpi->prob_last_coded); + vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded); + } + + { + int_mv n1, n2; + int ct[4]; + + vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, + cpi->common.ref_frame_sign_bias); + vp8_clamp_mv2(&best_mv, xd); + + vp8_mv_ref_probs(mv_ref_p, ct); + } + + write_mv_ref(w, mode, mv_ref_p); + + switch (mode) /* new, split require MVs */ + { + case NEWMV: write_mv(w, &mi->mv.as_mv, &best_mv, mvc); break; + + case SPLITMV: { + int j = 0; + +#ifdef MODE_STATS + ++count_mb_seg[mi->partitioning]; +#endif + + write_split(w, mi->partitioning); + + do { + B_PREDICTION_MODE blockmode; + int_mv blockmv; + const int *const L = vp8_mbsplits[mi->partitioning]; + int k = -1; /* first block in subset j */ + int mv_contz; + int_mv leftmv, abovemv; + + blockmode = cpi->mb.partition_info->bmi[j].mode; + blockmv = cpi->mb.partition_info->bmi[j].mv; + while (j != L[++k]) { + assert(k < 16); + } + leftmv.as_int = left_block_mv(m, k); + abovemv.as_int = above_block_mv(m, k, mis); + mv_contz = vp8_mv_cont(&leftmv, &abovemv); + + write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2[mv_contz]); + + if (blockmode == NEW4X4) { + write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *)mvc); + } + } while (++j < cpi->mb.partition_info->count); + break; + } + default: break; + } + } + + ++m; + cpi->mb.partition_info++; + } + + ++m; /* skip L prediction border */ + cpi->mb.partition_info++; + } +} + +static void write_kfmodes(VP8_COMP *cpi) { + vp8_writer *const bc = cpi->bc; + const VP8_COMMON *const c = &cpi->common; + /* const */ + MODE_INFO *m = c->mi; + + int mb_row = -1; + int prob_skip_false = 0; + + if (c->mb_no_coeff_skip) { + int total_mbs = c->mb_rows * c->mb_cols; + + prob_skip_false = (total_mbs - cpi->mb.skip_true_count) * 256 / total_mbs; + + if (prob_skip_false <= 1) prob_skip_false = 1; + + if (prob_skip_false >= 255) prob_skip_false = 255; + + cpi->prob_skip_false = prob_skip_false; + vp8_write_literal(bc, prob_skip_false, 8); + } + + while (++mb_row < c->mb_rows) { + int mb_col = -1; + + while (++mb_col < c->mb_cols) { + const int ym = m->mbmi.mode; + + if (cpi->mb.e_mbd.update_mb_segmentation_map) { + write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd); + } + + if (c->mb_no_coeff_skip) { + vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false); + } + + kfwrite_ymode(bc, ym, vp8_kf_ymode_prob); + + if (ym == B_PRED) { + const int mis = c->mode_info_stride; + int i = 0; + + do { + const B_PREDICTION_MODE A = above_block_mode(m, i, mis); + const B_PREDICTION_MODE L = left_block_mode(m, i); + const int bm = m->bmi[i].as_mode; + + write_bmode(bc, bm, vp8_kf_bmode_prob[A][L]); + } while (++i < 16); + } + + write_uv_mode(bc, (m++)->mbmi.uv_mode, vp8_kf_uv_mode_prob); + } + + m++; /* skip L prediction border */ + } +} + +#if 0 +/* This function is used for debugging probability trees. */ +static void print_prob_tree(vp8_prob + coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) +{ + /* print coef probability tree */ + int i,j,k,l; + FILE* f = fopen("enc_tree_probs.txt", "a"); + fprintf(f, "{\n"); + for (i = 0; i < BLOCK_TYPES; ++i) + { + fprintf(f, " {\n"); + for (j = 0; j < COEF_BANDS; ++j) + { + fprintf(f, " {\n"); + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) + { + fprintf(f, " {"); + for (l = 0; l < ENTROPY_NODES; ++l) + { + fprintf(f, "%3u, ", + (unsigned int)(coef_probs [i][j][k][l])); + } + fprintf(f, " }\n"); + } + fprintf(f, " }\n"); + } + fprintf(f, " }\n"); + } + fprintf(f, "}\n"); + fclose(f); +} +#endif + +static void sum_probs_over_prev_coef_context( + const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS], + unsigned int *out) { + int i, j; + for (i = 0; i < MAX_ENTROPY_TOKENS; ++i) { + for (j = 0; j < PREV_COEF_CONTEXTS; ++j) { + const unsigned int tmp = out[i]; + out[i] += probs[j][i]; + /* check for wrap */ + if (out[i] < tmp) out[i] = UINT_MAX; + } + } +} + +static int prob_update_savings(const unsigned int *ct, const vp8_prob oldp, + const vp8_prob newp, const vp8_prob upd) { + const int old_b = vp8_cost_branch(ct, oldp); + const int new_b = vp8_cost_branch(ct, newp); + const int update_b = 8 + ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8); + + return old_b - new_b - update_b; +} + +static int independent_coef_context_savings(VP8_COMP *cpi) { + MACROBLOCK *const x = &cpi->mb; + int savings = 0; + int i = 0; + do { + int j = 0; + do { + int k = 0; + unsigned int prev_coef_count_sum[MAX_ENTROPY_TOKENS] = { 0 }; + int prev_coef_savings[MAX_ENTROPY_TOKENS] = { 0 }; + const unsigned int(*probs)[MAX_ENTROPY_TOKENS]; + /* Calculate new probabilities given the constraint that + * they must be equal over the prev coef contexts + */ + + probs = (const unsigned int(*)[MAX_ENTROPY_TOKENS])x->coef_counts[i][j]; + + /* Reset to default probabilities at key frames */ + if (cpi->common.frame_type == KEY_FRAME) { + probs = default_coef_counts[i][j]; + } + + sum_probs_over_prev_coef_context(probs, prev_coef_count_sum); + + do { + /* at every context */ + + /* calc probs and branch cts for this frame only */ + int t = 0; /* token/prob index */ + + vp8_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree, + cpi->frame_coef_probs[i][j][k], cpi->frame_branch_ct[i][j][k], + prev_coef_count_sum, 256, 1); + + do { + const unsigned int *ct = cpi->frame_branch_ct[i][j][k][t]; + const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t]; + const vp8_prob oldp = cpi->common.fc.coef_probs[i][j][k][t]; + const vp8_prob upd = vp8_coef_update_probs[i][j][k][t]; + const int s = prob_update_savings(ct, oldp, newp, upd); + + if (cpi->common.frame_type != KEY_FRAME || + (cpi->common.frame_type == KEY_FRAME && newp != oldp)) { + prev_coef_savings[t] += s; + } + } while (++t < ENTROPY_NODES); + } while (++k < PREV_COEF_CONTEXTS); + k = 0; + do { + /* We only update probabilities if we can save bits, except + * for key frames where we have to update all probabilities + * to get the equal probabilities across the prev coef + * contexts. + */ + if (prev_coef_savings[k] > 0 || cpi->common.frame_type == KEY_FRAME) { + savings += prev_coef_savings[k]; + } + } while (++k < ENTROPY_NODES); + } while (++j < COEF_BANDS); + } while (++i < BLOCK_TYPES); + return savings; +} + +static int default_coef_context_savings(VP8_COMP *cpi) { + MACROBLOCK *const x = &cpi->mb; + int savings = 0; + int i = 0; + do { + int j = 0; + do { + int k = 0; + do { + /* at every context */ + + /* calc probs and branch cts for this frame only */ + int t = 0; /* token/prob index */ + + vp8_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree, + cpi->frame_coef_probs[i][j][k], cpi->frame_branch_ct[i][j][k], + x->coef_counts[i][j][k], 256, 1); + + do { + const unsigned int *ct = cpi->frame_branch_ct[i][j][k][t]; + const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t]; + const vp8_prob oldp = cpi->common.fc.coef_probs[i][j][k][t]; + const vp8_prob upd = vp8_coef_update_probs[i][j][k][t]; + const int s = prob_update_savings(ct, oldp, newp, upd); + + if (s > 0) { + savings += s; + } + } while (++t < ENTROPY_NODES); + } while (++k < PREV_COEF_CONTEXTS); + } while (++j < COEF_BANDS); + } while (++i < BLOCK_TYPES); + return savings; +} + +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf) { + assert(prob_intra >= 0); + assert(prob_intra <= 255); + assert(prob_last >= 0); + assert(prob_last <= 255); + assert(prob_garf >= 0); + assert(prob_garf <= 255); + ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra); + ref_frame_cost[LAST_FRAME] = + vp8_cost_one(prob_intra) + vp8_cost_zero(prob_last); + ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_zero(prob_garf); + ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra) + + vp8_cost_one(prob_last) + + vp8_cost_one(prob_garf); +} + +int vp8_estimate_entropy_savings(VP8_COMP *cpi) { + int savings = 0; + + const int *const rfct = cpi->mb.count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = + rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + int new_intra, new_last, new_garf, oldtotal, newtotal; + int ref_frame_cost[MAX_REF_FRAMES]; + + vpx_clear_system_state(); + + if (cpi->common.frame_type != KEY_FRAME) { + if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter))) new_intra = 1; + + new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; + + new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + ? (rfct[GOLDEN_FRAME] * 255) / + (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) + : 128; + + vp8_calc_ref_frame_costs(ref_frame_cost, new_intra, new_last, new_garf); + + newtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + + rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] + + rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] + + rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME]; + + /* old costs */ + vp8_calc_ref_frame_costs(ref_frame_cost, cpi->prob_intra_coded, + cpi->prob_last_coded, cpi->prob_gf_coded); + + oldtotal = rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] + + rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] + + rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] + + rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME]; + + savings += (oldtotal - newtotal) / 256; + } + + if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS) { + savings += independent_coef_context_savings(cpi); + } else { + savings += default_coef_context_savings(cpi); + } + + return savings; +} + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING +int vp8_update_coef_context(VP8_COMP *cpi) { + int savings = 0; + + if (cpi->common.frame_type == KEY_FRAME) { + /* Reset to default counts/probabilities at key frames */ + vp8_copy(cpi->mb.coef_counts, default_coef_counts); + } + + if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS) + savings += independent_coef_context_savings(cpi); + else + savings += default_coef_context_savings(cpi); + + return savings; +} +#endif + +void vp8_update_coef_probs(VP8_COMP *cpi) { + int i = 0; +#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + vp8_writer *const w = cpi->bc; +#endif + + vpx_clear_system_state(); + + do { + int j = 0; + + do { + int k = 0; + int prev_coef_savings[ENTROPY_NODES] = { 0 }; + if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS) { + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + int t; /* token/prob index */ + for (t = 0; t < ENTROPY_NODES; ++t) { + const unsigned int *ct = cpi->frame_branch_ct[i][j][k][t]; + const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t]; + const vp8_prob oldp = cpi->common.fc.coef_probs[i][j][k][t]; + const vp8_prob upd = vp8_coef_update_probs[i][j][k][t]; + + prev_coef_savings[t] += prob_update_savings(ct, oldp, newp, upd); + } + } + k = 0; + } + do { + /* note: use result from vp8_estimate_entropy_savings, so no + * need to call vp8_tree_probs_from_distribution here. + */ + + /* at every context */ + + /* calc probs and branch cts for this frame only */ + int t = 0; /* token/prob index */ + + do { + const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t]; + + vp8_prob *Pold = cpi->common.fc.coef_probs[i][j][k] + t; + const vp8_prob upd = vp8_coef_update_probs[i][j][k][t]; + + int s = prev_coef_savings[t]; + int u = 0; + + if (!(cpi->oxcf.error_resilient_mode & + VPX_ERROR_RESILIENT_PARTITIONS)) { + s = prob_update_savings(cpi->frame_branch_ct[i][j][k][t], *Pold, + newp, upd); + } + + if (s > 0) u = 1; + + /* Force updates on key frames if the new is different, + * so that we can be sure we end up with equal probabilities + * over the prev coef contexts. + */ + if ((cpi->oxcf.error_resilient_mode & + VPX_ERROR_RESILIENT_PARTITIONS) && + cpi->common.frame_type == KEY_FRAME && newp != *Pold) { + u = 1; + } + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + cpi->update_probs[i][j][k][t] = u; +#else + vp8_write(w, u, upd); +#endif + + if (u) { + /* send/use new probability */ + + *Pold = newp; +#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + vp8_write_literal(w, newp, 8); +#endif + } + + } while (++t < ENTROPY_NODES); + + } while (++k < PREV_COEF_CONTEXTS); + } while (++j < COEF_BANDS); + } while (++i < BLOCK_TYPES); +} + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING +static void pack_coef_probs(VP8_COMP *cpi) { + int i = 0; + vp8_writer *const w = cpi->bc; + + do { + int j = 0; + + do { + int k = 0; + + do { + int t = 0; /* token/prob index */ + + do { + const vp8_prob newp = cpi->common.fc.coef_probs[i][j][k][t]; + const vp8_prob upd = vp8_coef_update_probs[i][j][k][t]; + + const char u = cpi->update_probs[i][j][k][t]; + + vp8_write(w, u, upd); + + if (u) { + /* send/use new probability */ + vp8_write_literal(w, newp, 8); + } + } while (++t < ENTROPY_NODES); + } while (++k < PREV_COEF_CONTEXTS); + } while (++j < COEF_BANDS); + } while (++i < BLOCK_TYPES); +} +#endif + +#ifdef PACKET_TESTING +FILE *vpxlogc = 0; +#endif + +static void put_delta_q(vp8_writer *bc, int delta_q) { + if (delta_q != 0) { + vp8_write_bit(bc, 1); + vp8_write_literal(bc, abs(delta_q), 4); + + if (delta_q < 0) + vp8_write_bit(bc, 1); + else + vp8_write_bit(bc, 0); + } else + vp8_write_bit(bc, 0); +} + +void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, + unsigned char *dest_end, size_t *size) { + int i, j; + VP8_HEADER oh; + VP8_COMMON *const pc = &cpi->common; + vp8_writer *const bc = cpi->bc; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + int extra_bytes_packed = 0; + + unsigned char *cx_data = dest; + unsigned char *cx_data_end = dest_end; + const int *mb_feature_data_bits; + + oh.show_frame = (int)pc->show_frame; + oh.type = (int)pc->frame_type; + oh.version = pc->version; + oh.first_partition_length_in_bytes = 0; + + mb_feature_data_bits = vp8_mb_feature_data_bits; + + bc[0].error = &pc->error; + + validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error); + cx_data += 3; + +#if defined(SECTIONBITS_OUTPUT) + Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256; +#endif + + /* every keyframe send startcode, width, height, scale factor, clamp + * and color type + */ + if (oh.type == KEY_FRAME) { + int v; + + validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error); + + /* Start / synch code */ + cx_data[0] = 0x9D; + cx_data[1] = 0x01; + cx_data[2] = 0x2a; + + /* Pack scale and frame size into 16 bits. Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * 16 bits : (2 bits Horizontal Scale << 14) | Width (14 bits) + * 16 bits : (2 bits Vertical Scale << 14) | Height (14 bits) + */ + v = (pc->horiz_scale << 14) | pc->Width; + cx_data[3] = v & 0xff; + cx_data[4] = v >> 8; + + v = (pc->vert_scale << 14) | pc->Height; + cx_data[5] = v & 0xff; + cx_data[6] = v >> 8; + + extra_bytes_packed = 7; + cx_data += extra_bytes_packed; + + vp8_start_encode(bc, cx_data, cx_data_end); + + /* signal clr type */ + vp8_write_bit(bc, 0); + vp8_write_bit(bc, pc->clamp_type); + + } else { + vp8_start_encode(bc, cx_data, cx_data_end); + } + + /* Signal whether or not Segmentation is enabled */ + vp8_write_bit(bc, xd->segmentation_enabled); + + /* Indicate which features are enabled */ + if (xd->segmentation_enabled) { + /* Signal whether or not the segmentation map is being updated. */ + vp8_write_bit(bc, xd->update_mb_segmentation_map); + vp8_write_bit(bc, xd->update_mb_segmentation_data); + + if (xd->update_mb_segmentation_data) { + signed char Data; + + vp8_write_bit(bc, xd->mb_segment_abs_delta); + + /* For each segmentation feature (Quant and loop filter level) */ + for (i = 0; i < MB_LVL_MAX; ++i) { + /* For each of the segments */ + for (j = 0; j < MAX_MB_SEGMENTS; ++j) { + Data = xd->segment_feature_data[i][j]; + + /* Frame level data */ + if (Data) { + vp8_write_bit(bc, 1); + + if (Data < 0) { + Data = -Data; + vp8_write_literal(bc, Data, mb_feature_data_bits[i]); + vp8_write_bit(bc, 1); + } else { + vp8_write_literal(bc, Data, mb_feature_data_bits[i]); + vp8_write_bit(bc, 0); + } + } else + vp8_write_bit(bc, 0); + } + } + } + + if (xd->update_mb_segmentation_map) { + /* Write the probs used to decode the segment id for each mb */ + for (i = 0; i < MB_FEATURE_TREE_PROBS; ++i) { + int Data = xd->mb_segment_tree_probs[i]; + + if (Data != 255) { + vp8_write_bit(bc, 1); + vp8_write_literal(bc, Data, 8); + } else + vp8_write_bit(bc, 0); + } + } + } + + vp8_write_bit(bc, pc->filter_type); + vp8_write_literal(bc, pc->filter_level, 6); + vp8_write_literal(bc, pc->sharpness_level, 3); + + /* Write out loop filter deltas applied at the MB level based on mode + * or ref frame (if they are enabled). + */ + vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled); + + if (xd->mode_ref_lf_delta_enabled) { + /* Do the deltas need to be updated */ + int send_update = + xd->mode_ref_lf_delta_update || cpi->oxcf.error_resilient_mode; + + vp8_write_bit(bc, send_update); + if (send_update) { + int Data; + + /* Send update */ + for (i = 0; i < MAX_REF_LF_DELTAS; ++i) { + Data = xd->ref_lf_deltas[i]; + + /* Frame level data */ + if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i] || + cpi->oxcf.error_resilient_mode) { + xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i]; + vp8_write_bit(bc, 1); + + if (Data > 0) { + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 0); /* sign */ + } else { + Data = -Data; + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 1); /* sign */ + } + } else + vp8_write_bit(bc, 0); + } + + /* Send update */ + for (i = 0; i < MAX_MODE_LF_DELTAS; ++i) { + Data = xd->mode_lf_deltas[i]; + + if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i] || + cpi->oxcf.error_resilient_mode) { + xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i]; + vp8_write_bit(bc, 1); + + if (Data > 0) { + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 0); /* sign */ + } else { + Data = -Data; + vp8_write_literal(bc, (Data & 0x3F), 6); + vp8_write_bit(bc, 1); /* sign */ + } + } else + vp8_write_bit(bc, 0); + } + } + } + + /* signal here is multi token partition is enabled */ + vp8_write_literal(bc, pc->multi_token_partition, 2); + + /* Frame Qbaseline quantizer index */ + vp8_write_literal(bc, pc->base_qindex, 7); + + /* Transmit Dc, Second order and Uv quantizer delta information */ + put_delta_q(bc, pc->y1dc_delta_q); + put_delta_q(bc, pc->y2dc_delta_q); + put_delta_q(bc, pc->y2ac_delta_q); + put_delta_q(bc, pc->uvdc_delta_q); + put_delta_q(bc, pc->uvac_delta_q); + + /* When there is a key frame all reference buffers are updated using + * the new key frame + */ + if (pc->frame_type != KEY_FRAME) { + /* Should the GF or ARF be updated using the transmitted frame + * or buffer + */ + vp8_write_bit(bc, pc->refresh_golden_frame); + vp8_write_bit(bc, pc->refresh_alt_ref_frame); + + /* If not being updated from current frame should either GF or ARF + * be updated from another buffer + */ + if (!pc->refresh_golden_frame) + vp8_write_literal(bc, pc->copy_buffer_to_gf, 2); + + if (!pc->refresh_alt_ref_frame) + vp8_write_literal(bc, pc->copy_buffer_to_arf, 2); + + /* Indicate reference frame sign bias for Golden and ARF frames + * (always 0 for last frame buffer) + */ + vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]); + vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]); + } + +#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS) { + if (pc->frame_type == KEY_FRAME) { + pc->refresh_entropy_probs = 1; + } else { + pc->refresh_entropy_probs = 0; + } + } +#endif + + vp8_write_bit(bc, pc->refresh_entropy_probs); + + if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame); + + vpx_clear_system_state(); + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + pack_coef_probs(cpi); +#else + if (pc->refresh_entropy_probs == 0) { + /* save a copy for later refresh */ + memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc)); + } + + vp8_update_coef_probs(cpi); +#endif + + /* Write out the mb_no_coeff_skip flag */ + vp8_write_bit(bc, pc->mb_no_coeff_skip); + + if (pc->frame_type == KEY_FRAME) { + write_kfmodes(cpi); + } else { + pack_inter_mode_mvs(cpi); + } + + vp8_stop_encode(bc); + + cx_data += bc->pos; + + oh.first_partition_length_in_bytes = cpi->bc->pos; + + /* update frame tag */ + { + /* Pack partition size, show frame, version and frame type into to 24 bits. + * Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * The uncompressed data chunk comprises a common (for key frames and + * interframes) 3-byte frame tag that contains four fields, as follows: + * + * 1. A 1-bit frame type (0 for key frames, 1 for interframes). + * + * 2. A 3-bit version number (0 - 3 are defined as four different + * profiles with different decoding complexity; other values may be + * defined for future variants of the VP8 data format). + * + * 3. A 1-bit show_frame flag (0 when current frame is not for display, + * 1 when current frame is for display). + * + * 4. A 19-bit field containing the size of the first data partition in + * bytes + */ + int v = (oh.first_partition_length_in_bytes << 5) | (oh.show_frame << 4) | + (oh.version << 1) | oh.type; + + dest[0] = v & 0xff; + dest[1] = (v >> 8) & 0xff; + dest[2] = v >> 16; + } + + *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos; + + cpi->partition_sz[0] = (unsigned int)*size; + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + { + const int num_part = (1 << pc->multi_token_partition); + unsigned char *dp = cpi->partition_d[0] + cpi->partition_sz[0]; + + if (num_part > 1) { + /* write token part sizes (all but last) if more than 1 */ + validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0], + &pc->error); + + cpi->partition_sz[0] += 3 * (num_part - 1); + + for (i = 1; i < num_part; ++i) { + write_partition_size(dp, cpi->partition_sz[i]); + dp += 3; + } + } + + if (!cpi->output_partition) { + /* concatenate partition buffers */ + for (i = 0; i < num_part; ++i) { + memmove(dp, cpi->partition_d[i + 1], cpi->partition_sz[i + 1]); + cpi->partition_d[i + 1] = dp; + dp += cpi->partition_sz[i + 1]; + } + } + + /* update total size */ + *size = 0; + for (i = 0; i < num_part + 1; ++i) { + *size += cpi->partition_sz[i]; + } + } +#else + if (pc->multi_token_partition != ONE_PARTITION) { + int num_part = 1 << pc->multi_token_partition; + + /* partition size table at the end of first partition */ + cpi->partition_sz[0] += 3 * (num_part - 1); + *size += 3 * (num_part - 1); + + validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end, &pc->error); + + for (i = 1; i < num_part + 1; ++i) { + cpi->bc[i].error = &pc->error; + } + + pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1), cx_data_end, + num_part); + + for (i = 1; i < num_part; ++i) { + cpi->partition_sz[i] = cpi->bc[i].pos; + write_partition_size(cx_data, cpi->partition_sz[i]); + cx_data += 3; + *size += cpi->partition_sz[i]; /* add to total */ + } + + /* add last partition to total size */ + cpi->partition_sz[i] = cpi->bc[i].pos; + *size += cpi->partition_sz[i]; + } else { + bc[1].error = &pc->error; + + vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + pack_mb_row_tokens(cpi, &cpi->bc[1]); + } else { + vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); + } +#else + vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); +#endif // CONFIG_MULTITHREAD + + vp8_stop_encode(&cpi->bc[1]); + + *size += cpi->bc[1].pos; + cpi->partition_sz[1] = cpi->bc[1].pos; + } +#endif +} diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.h b/media/libvpx/libvpx/vp8/encoder/bitstream.h new file mode 100644 index 0000000000..ee3f3e4aab --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/bitstream.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_BITSTREAM_H_ +#define VPX_VP8_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + +void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_BITSTREAM_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/block.h b/media/libvpx/libvpx/vp8/encoder/block.h new file mode 100644 index 0000000000..1bc5ef75bc --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/block.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_BLOCK_H_ +#define VPX_VP8_ENCODER_BLOCK_H_ + +#include "vp8/common/onyx.h" +#include "vp8/common/blockd.h" +#include "vp8/common/entropymv.h" +#include "vp8/common/entropy.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_MODES 20 +#define MAX_ERROR_BINS 1024 + +/* motion search site */ +typedef struct { + MV mv; + int offset; +} search_site; + +typedef struct block { + /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */ + short *src_diff; + short *coeff; + + /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */ + short *quant; + short *quant_fast; + short *quant_shift; + short *zbin; + short *zrun_zbin_boost; + short *round; + + /* Zbin Over Quant value */ + short zbin_extra; + + unsigned char **base_src; + int src; + int src_stride; +} BLOCK; + +typedef struct { + int count; + struct { + B_PREDICTION_MODE mode; + int_mv mv; + } bmi[16]; +} PARTITION_INFO; + +typedef struct macroblock { + DECLARE_ALIGNED(16, short, src_diff[400]); /* 25 blocks Y,U,V,Y2 */ + DECLARE_ALIGNED(16, short, coeff[400]); /* 25 blocks Y,U,V,Y2 */ + DECLARE_ALIGNED(16, unsigned char, thismb[256]); + + unsigned char *thismb_ptr; + /* 16 Y, 4 U, 4 V, 1 DC 2nd order block */ + BLOCK block[25]; + + YV12_BUFFER_CONFIG src; + + MACROBLOCKD e_mbd; + PARTITION_INFO *partition_info; /* work pointer */ + PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */ + PARTITION_INFO *pip; /* Base of allocated array */ + + int ref_frame_cost[MAX_REF_FRAMES]; + + search_site *ss; + int ss_count; + int searches_per_step; + + int errorperbit; + int sadperbit16; + int sadperbit4; + int rddiv; + int rdmult; + unsigned int *mb_activity_ptr; + int *mb_norm_activity_ptr; + signed int act_zbin_adj; + signed int last_act_zbin_adj; + + int *mvcost[2]; + int *mvsadcost[2]; + int (*mbmode_cost)[MB_MODE_COUNT]; + int (*intra_uv_mode_cost)[MB_MODE_COUNT]; + int (*bmode_costs)[10][10]; + int *inter_bmode_costs; + int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS]; + + /* These define limits to motion vector components to prevent + * them from extending outside the UMV borders. + */ + int mv_col_min; + int mv_col_max; + int mv_row_min; + int mv_row_max; + + int skip; + + unsigned int encode_breakout; + + signed char *gf_active_ptr; + + unsigned char *active_ptr; + MV_CONTEXT *mvc; + + int optimize; + int q_index; + int is_skin; + int denoise_zeromv; + +#if CONFIG_TEMPORAL_DENOISING + int increase_denoising; + MB_PREDICTION_MODE best_sse_inter_mode; + int_mv best_sse_mv; + MV_REFERENCE_FRAME best_reference_frame; + MV_REFERENCE_FRAME best_zeromv_reference_frame; + unsigned char need_to_clamp_best_mvs; +#endif + + int skip_true_count; + unsigned int coef_counts[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS]; + unsigned int MVcount[2][MVvals]; /* (row,col) MV cts this frame */ + int ymode_count[VP8_YMODES]; /* intra MB type cts this frame */ + int uv_mode_count[VP8_UV_MODES]; /* intra MB type cts this frame */ + int64_t prediction_error; + int64_t intra_error; + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + + int rd_thresh_mult[MAX_MODES]; + int rd_threshes[MAX_MODES]; + unsigned int mbs_tested_so_far; + unsigned int mode_test_hit_counts[MAX_MODES]; + int zbin_mode_boost_enabled; + int zbin_mode_boost; + int last_zbin_mode_boost; + + int last_zbin_over_quant; + int zbin_over_quant; + int error_bins[MAX_ERROR_BINS]; + + void (*short_fdct4x4)(short *input, short *output, int pitch); + void (*short_fdct8x4)(short *input, short *output, int pitch); + void (*short_walsh4x4)(short *input, short *output, int pitch); + void (*quantize_b)(BLOCK *b, BLOCKD *d); + + unsigned int mbs_zero_last_dot_suppress; + int zero_last_dot_suppress; +} MACROBLOCK; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_BLOCK_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.c b/media/libvpx/libvpx/vp8/encoder/boolhuff.c new file mode 100644 index 0000000000..819c2f22a0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "boolhuff.h" + +#if defined(SECTIONBITS_OUTPUT) +unsigned __int64 Sectionbits[500]; + +#endif + +const unsigned int vp8_prob_cost[256] = { + 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, + 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, + 843, 829, 816, 803, 790, 778, 767, 755, 744, 733, 723, 713, 703, + 693, 684, 675, 666, 657, 649, 641, 633, 625, 617, 609, 602, 594, + 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, 511, + 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, + 437, 433, 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, + 381, 377, 373, 369, 365, 361, 357, 353, 349, 346, 342, 338, 335, + 331, 328, 324, 321, 317, 314, 311, 307, 304, 301, 297, 294, 291, + 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, 255, 252, + 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, + 214, 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, + 181, 179, 177, 174, 172, 170, 168, 165, 163, 161, 159, 156, 154, + 152, 150, 148, 145, 143, 141, 139, 137, 135, 133, 131, 129, 127, + 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, + 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, + 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, + 53, 51, 50, 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, + 32, 30, 29, 27, 25, 24, 22, 21, 19, 18, 16, 15, 13, + 12, 10, 9, 7, 6, 4, 3, 1, 1 +}; + +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, + unsigned char *source_end) { + bc->lowvalue = 0; + bc->range = 255; + bc->count = -24; + bc->buffer = source; + bc->buffer_end = source_end; + bc->pos = 0; +} + +void vp8_stop_encode(BOOL_CODER *bc) { + int i; + + for (i = 0; i < 32; ++i) vp8_encode_bool(bc, 0, 128); +} + +void vp8_encode_value(BOOL_CODER *bc, int data, int bits) { + int bit; + + for (bit = bits - 1; bit >= 0; bit--) { + vp8_encode_bool(bc, (1 & (data >> bit)), 0x80); + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.h b/media/libvpx/libvpx/vp8/encoder/boolhuff.h new file mode 100644 index 0000000000..a8c536b99c --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/**************************************************************************** + * + * Module Title : boolhuff.h + * + * Description : Bool Coder header file. + * + ****************************************************************************/ +#ifndef VPX_VP8_ENCODER_BOOLHUFF_H_ +#define VPX_VP8_ENCODER_BOOLHUFF_H_ + +#include "vpx_ports/mem.h" +#include "vpx/internal/vpx_codec_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + unsigned int lowvalue; + unsigned int range; + int count; + unsigned int pos; + unsigned char *buffer; + unsigned char *buffer_end; + struct vpx_internal_error_info *error; +} BOOL_CODER; + +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, + unsigned char *source_end); + +void vp8_encode_value(BOOL_CODER *bc, int data, int bits); +void vp8_stop_encode(BOOL_CODER *bc); +extern const unsigned int vp8_prob_cost[256]; + +DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); + +static int validate_buffer(const unsigned char *start, size_t len, + const unsigned char *end, + struct vpx_internal_error_info *error) { + if (start + len > start && start + len < end) { + return 1; + } else { + vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition "); + } + + return 0; +} +static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { + unsigned int split; + int count = bc->count; + unsigned int range = bc->range; + unsigned int lowvalue = bc->lowvalue; + int shift; + + split = 1 + (((range - 1) * probability) >> 8); + + range = split; + + if (bit) { + lowvalue += split; + range = bc->range - split; + } + + shift = vp8_norm[range]; + + range <<= shift; + count += shift; + + if (count >= 0) { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = bc->pos - 1; + + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; + x--; + } + + bc->buffer[x] += 1; + } + + validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff); + + shift = count; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); + count -= 8; + } + + lowvalue <<= shift; + bc->count = count; + bc->lowvalue = lowvalue; + bc->range = range; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_BOOLHUFF_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/copy_c.c b/media/libvpx/libvpx/vp8/encoder/copy_c.c new file mode 100644 index 0000000000..4746125245 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/copy_c.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" +#include "vpx/vpx_integer.h" + +/* Copy 2 macroblocks to a buffer */ +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, + unsigned char *dst_ptr, int dst_stride, int height) { + int r; + + for (r = 0; r < height; ++r) { + memcpy(dst_ptr, src_ptr, 32); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/dct.c b/media/libvpx/libvpx/vp8/encoder/dct.c new file mode 100644 index 0000000000..7d214eafb0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/dct.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp8_rtcd.h" + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; ++i) { + a1 = ((ip[0] + ip[3]) * 8); + b1 = ((ip[1] + ip[2]) * 8); + c1 = ((ip[1] - ip[2]) * 8); + d1 = ((ip[0] - ip[3]) * 8); + + op[0] = a1 + b1; + op[2] = a1 - b1; + + op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12; + op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12; + + ip += pitch / 2; + op += 4; + } + ip = output; + op = output; + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = (a1 + b1 + 7) >> 4; + op[8] = (a1 - b1 + 7) >> 4; + + op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); + op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16; + + ip++; + op++; + } +} + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch) { + vp8_short_fdct4x4_c(input, output, pitch); + vp8_short_fdct4x4_c(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch) { + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; ++i) { + a1 = ((ip[0] + ip[2]) * 4); + d1 = ((ip[1] + ip[3]) * 4); + c1 = ((ip[1] - ip[3]) * 4); + b1 = ((ip[0] - ip[2]) * 4); + + op[0] = a1 + d1 + (a1 != 0); + op[1] = b1 + c1; + op[2] = b1 - c1; + op[3] = a1 - d1; + ip += pitch / 2; + op += 4; + } + + ip = output; + op = output; + + for (i = 0; i < 4; ++i) { + a1 = ip[0] + ip[8]; + d1 = ip[4] + ip[12]; + c1 = ip[4] - ip[12]; + b1 = ip[0] - ip[8]; + + a2 = a1 + d1; + b2 = b1 + c1; + c2 = b1 - c1; + d2 = a1 - d1; + + a2 += a2 < 0; + b2 += b2 < 0; + c2 += c2 < 0; + d2 += d2 < 0; + + op[0] = (a2 + 3) >> 3; + op[4] = (b2 + 3) >> 3; + op[8] = (c2 + 3) >> 3; + op[12] = (d2 + 3) >> 3; + + ip++; + op++; + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h new file mode 100644 index 0000000000..0cd6cb4e65 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_DCT_VALUE_COST_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_COST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Generated file, included by tokenize.c */ +/* Values generated by fill_value_tokens() */ + +static const short dct_value_cost[2048 * 2] = { + 8285, 8277, 8267, 8259, 8253, 8245, 8226, 8218, 8212, 8204, 8194, 8186, 8180, + 8172, 8150, 8142, 8136, 8128, 8118, 8110, 8104, 8096, 8077, 8069, 8063, 8055, + 8045, 8037, 8031, 8023, 7997, 7989, 7983, 7975, 7965, 7957, 7951, 7943, 7924, + 7916, 7910, 7902, 7892, 7884, 7878, 7870, 7848, 7840, 7834, 7826, 7816, 7808, + 7802, 7794, 7775, 7767, 7761, 7753, 7743, 7735, 7729, 7721, 7923, 7915, 7909, + 7901, 7891, 7883, 7877, 7869, 7850, 7842, 7836, 7828, 7818, 7810, 7804, 7796, + 7774, 7766, 7760, 7752, 7742, 7734, 7728, 7720, 7701, 7693, 7687, 7679, 7669, + 7661, 7655, 7647, 7621, 7613, 7607, 7599, 7589, 7581, 7575, 7567, 7548, 7540, + 7534, 7526, 7516, 7508, 7502, 7494, 7472, 7464, 7458, 7450, 7440, 7432, 7426, + 7418, 7399, 7391, 7385, 7377, 7367, 7359, 7353, 7345, 7479, 7471, 7465, 7457, + 7447, 7439, 7433, 7425, 7406, 7398, 7392, 7384, 7374, 7366, 7360, 7352, 7330, + 7322, 7316, 7308, 7298, 7290, 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217, + 7211, 7203, 7177, 7169, 7163, 7155, 7145, 7137, 7131, 7123, 7104, 7096, 7090, + 7082, 7072, 7064, 7058, 7050, 7028, 7020, 7014, 7006, 6996, 6988, 6982, 6974, + 6955, 6947, 6941, 6933, 6923, 6915, 6909, 6901, 7632, 7624, 7618, 7610, 7600, + 7592, 7586, 7578, 7559, 7551, 7545, 7537, 7527, 7519, 7513, 7505, 7483, 7475, + 7469, 7461, 7451, 7443, 7437, 7429, 7410, 7402, 7396, 7388, 7378, 7370, 7364, + 7356, 7330, 7322, 7316, 7308, 7298, 7290, 7284, 7276, 7257, 7249, 7243, 7235, + 7225, 7217, 7211, 7203, 7181, 7173, 7167, 7159, 7149, 7141, 7135, 7127, 7108, + 7100, 7094, 7086, 7076, 7068, 7062, 7054, 7188, 7180, 7174, 7166, 7156, 7148, + 7142, 7134, 7115, 7107, 7101, 7093, 7083, 7075, 7069, 7061, 7039, 7031, 7025, + 7017, 7007, 6999, 6993, 6985, 6966, 6958, 6952, 6944, 6934, 6926, 6920, 6912, + 6886, 6878, 6872, 6864, 6854, 6846, 6840, 6832, 6813, 6805, 6799, 6791, 6781, + 6773, 6767, 6759, 6737, 6729, 6723, 6715, 6705, 6697, 6691, 6683, 6664, 6656, + 6650, 6642, 6632, 6624, 6618, 6610, 6812, 6804, 6798, 6790, 6780, 6772, 6766, + 6758, 6739, 6731, 6725, 6717, 6707, 6699, 6693, 6685, 6663, 6655, 6649, 6641, + 6631, 6623, 6617, 6609, 6590, 6582, 6576, 6568, 6558, 6550, 6544, 6536, 6510, + 6502, 6496, 6488, 6478, 6470, 6464, 6456, 6437, 6429, 6423, 6415, 6405, 6397, + 6391, 6383, 6361, 6353, 6347, 6339, 6329, 6321, 6315, 6307, 6288, 6280, 6274, + 6266, 6256, 6248, 6242, 6234, 6368, 6360, 6354, 6346, 6336, 6328, 6322, 6314, + 6295, 6287, 6281, 6273, 6263, 6255, 6249, 6241, 6219, 6211, 6205, 6197, 6187, + 6179, 6173, 6165, 6146, 6138, 6132, 6124, 6114, 6106, 6100, 6092, 6066, 6058, + 6052, 6044, 6034, 6026, 6020, 6012, 5993, 5985, 5979, 5971, 5961, 5953, 5947, + 5939, 5917, 5909, 5903, 5895, 5885, 5877, 5871, 5863, 5844, 5836, 5830, 5822, + 5812, 5804, 5798, 5790, 6697, 6689, 6683, 6675, 6665, 6657, 6651, 6643, 6624, + 6616, 6610, 6602, 6592, 6584, 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508, + 6502, 6494, 6475, 6467, 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387, 6381, + 6373, 6363, 6355, 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282, 6276, 6268, + 6246, 6238, 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165, 6159, 6151, 6141, + 6133, 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213, 6207, 6199, 6180, 6172, + 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096, 6090, 6082, 6072, 6064, 6058, + 6050, 6031, 6023, 6017, 6009, 5999, 5991, 5985, 5977, 5951, 5943, 5937, 5929, + 5919, 5911, 5905, 5897, 5878, 5870, 5864, 5856, 5846, 5838, 5832, 5824, 5802, + 5794, 5788, 5780, 5770, 5762, 5756, 5748, 5729, 5721, 5715, 5707, 5697, 5689, + 5683, 5675, 5877, 5869, 5863, 5855, 5845, 5837, 5831, 5823, 5804, 5796, 5790, + 5782, 5772, 5764, 5758, 5750, 5728, 5720, 5714, 5706, 5696, 5688, 5682, 5674, + 5655, 5647, 5641, 5633, 5623, 5615, 5609, 5601, 5575, 5567, 5561, 5553, 5543, + 5535, 5529, 5521, 5502, 5494, 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418, + 5412, 5404, 5394, 5386, 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313, 5307, + 5299, 5433, 5425, 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352, 5346, 5338, + 5328, 5320, 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, + 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123, 5117, 5109, 5099, 5091, + 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018, 5012, 5004, 4982, 4974, 4968, + 4960, 4950, 4942, 4936, 4928, 4909, 4901, 4895, 4887, 4877, 4869, 4863, 4855, + 5586, 5578, 5572, 5564, 5554, 5546, 5540, 5532, 5513, 5505, 5499, 5491, 5481, + 5473, 5467, 5459, 5437, 5429, 5423, 5415, 5405, 5397, 5391, 5383, 5364, 5356, + 5350, 5342, 5332, 5324, 5318, 5310, 5284, 5276, 5270, 5262, 5252, 5244, 5238, + 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5135, 5127, 5121, 5113, + 5103, 5095, 5089, 5081, 5062, 5054, 5048, 5040, 5030, 5022, 5016, 5008, 5142, + 5134, 5128, 5120, 5110, 5102, 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029, + 5023, 5015, 4993, 4985, 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912, 4906, + 4898, 4888, 4880, 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800, 4794, 4786, + 4767, 4759, 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683, 4677, 4669, 4659, + 4651, 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578, 4572, 4564, 4766, 4758, + 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685, 4679, 4671, 4661, 4653, 4647, + 4639, 4617, 4609, 4603, 4595, 4585, 4577, 4571, 4563, 4544, 4536, 4530, 4522, + 4512, 4504, 4498, 4490, 4464, 4456, 4450, 4442, 4432, 4424, 4418, 4410, 4391, + 4383, 4377, 4369, 4359, 4351, 4345, 4337, 4315, 4307, 4301, 4293, 4283, 4275, + 4269, 4261, 4242, 4234, 4228, 4220, 4210, 4202, 4196, 4188, 4322, 4314, 4308, + 4300, 4290, 4282, 4276, 4268, 4249, 4241, 4235, 4227, 4217, 4209, 4203, 4195, + 4173, 4165, 4159, 4151, 4141, 4133, 4127, 4119, 4100, 4092, 4086, 4078, 4068, + 4060, 4054, 4046, 4020, 4012, 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939, + 3933, 3925, 3915, 3907, 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831, 3825, + 3817, 3798, 3790, 3784, 3776, 3766, 3758, 3752, 3744, 6697, 6689, 6683, 6675, + 6665, 6657, 6651, 6643, 6624, 6616, 6610, 6602, 6592, 6584, 6578, 6570, 6548, + 6540, 6534, 6526, 6516, 6508, 6502, 6494, 6475, 6467, 6461, 6453, 6443, 6435, + 6429, 6421, 6395, 6387, 6381, 6373, 6363, 6355, 6349, 6341, 6322, 6314, 6308, + 6300, 6290, 6282, 6276, 6268, 6246, 6238, 6232, 6224, 6214, 6206, 6200, 6192, + 6173, 6165, 6159, 6151, 6141, 6133, 6127, 6119, 6253, 6245, 6239, 6231, 6221, + 6213, 6207, 6199, 6180, 6172, 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096, + 6090, 6082, 6072, 6064, 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991, 5985, + 5977, 5951, 5943, 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870, 5864, 5856, + 5846, 5838, 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762, 5756, 5748, 5729, + 5721, 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869, 5863, 5855, 5845, 5837, + 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764, 5758, 5750, 5728, 5720, 5714, + 5706, 5696, 5688, 5682, 5674, 5655, 5647, 5641, 5633, 5623, 5615, 5609, 5601, + 5575, 5567, 5561, 5553, 5543, 5535, 5529, 5521, 5502, 5494, 5488, 5480, 5470, + 5462, 5456, 5448, 5426, 5418, 5412, 5404, 5394, 5386, 5380, 5372, 5353, 5345, + 5339, 5331, 5321, 5313, 5307, 5299, 5433, 5425, 5419, 5411, 5401, 5393, 5387, + 5379, 5360, 5352, 5346, 5338, 5328, 5320, 5314, 5306, 5284, 5276, 5270, 5262, + 5252, 5244, 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5131, + 5123, 5117, 5109, 5099, 5091, 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018, + 5012, 5004, 4982, 4974, 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901, 4895, + 4887, 4877, 4869, 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546, 5540, 5532, + 5513, 5505, 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429, 5423, 5415, 5405, + 5397, 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324, 5318, 5310, 5284, 5276, + 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, + 5157, 5135, 5127, 5121, 5113, 5103, 5095, 5089, 5081, 5062, 5054, 5048, 5040, + 5030, 5022, 5016, 5008, 5142, 5134, 5128, 5120, 5110, 5102, 5096, 5088, 5069, + 5061, 5055, 5047, 5037, 5029, 5023, 5015, 4993, 4985, 4979, 4971, 4961, 4953, + 4947, 4939, 4920, 4912, 4906, 4898, 4888, 4880, 4874, 4866, 4840, 4832, 4826, + 4818, 4808, 4800, 4794, 4786, 4767, 4759, 4753, 4745, 4735, 4727, 4721, 4713, + 4691, 4683, 4677, 4669, 4659, 4651, 4645, 4637, 4618, 4610, 4604, 4596, 4586, + 4578, 4572, 4564, 4766, 4758, 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685, + 4679, 4671, 4661, 4653, 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577, 4571, + 4563, 4544, 4536, 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456, 4450, 4442, + 4432, 4424, 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351, 4345, 4337, 4315, + 4307, 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234, 4228, 4220, 4210, 4202, + 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282, 4276, 4268, 4249, 4241, 4235, + 4227, 4217, 4209, 4203, 4195, 4173, 4165, 4159, 4151, 4141, 4133, 4127, 4119, + 4100, 4092, 4086, 4078, 4068, 4060, 4054, 4046, 4020, 4012, 4006, 3998, 3988, + 3980, 3974, 3966, 3947, 3939, 3933, 3925, 3915, 3907, 3901, 3893, 3871, 3863, + 3857, 3849, 3839, 3831, 3825, 3817, 3798, 3790, 3784, 3776, 3766, 3758, 3752, + 3744, 4651, 4643, 4637, 4629, 4619, 4611, 4605, 4597, 4578, 4570, 4564, 4556, + 4546, 4538, 4532, 4524, 4502, 4494, 4488, 4480, 4470, 4462, 4456, 4448, 4429, + 4421, 4415, 4407, 4397, 4389, 4383, 4375, 4349, 4341, 4335, 4327, 4317, 4309, + 4303, 4295, 4276, 4268, 4262, 4254, 4244, 4236, 4230, 4222, 4200, 4192, 4186, + 4178, 4168, 4160, 4154, 4146, 4127, 4119, 4113, 4105, 4095, 4087, 4081, 4073, + 4207, 4199, 4193, 4185, 4175, 4167, 4161, 4153, 4134, 4126, 4120, 4112, 4102, + 4094, 4088, 4080, 4058, 4050, 4044, 4036, 4026, 4018, 4012, 4004, 3985, 3977, + 3971, 3963, 3953, 3945, 3939, 3931, 3905, 3897, 3891, 3883, 3873, 3865, 3859, + 3851, 3832, 3824, 3818, 3810, 3800, 3792, 3786, 3778, 3756, 3748, 3742, 3734, + 3724, 3716, 3710, 3702, 3683, 3675, 3669, 3661, 3651, 3643, 3637, 3629, 3831, + 3823, 3817, 3809, 3799, 3791, 3785, 3777, 3758, 3750, 3744, 3736, 3726, 3718, + 3712, 3704, 3682, 3674, 3668, 3660, 3650, 3642, 3636, 3628, 3609, 3601, 3595, + 3587, 3577, 3569, 3563, 3555, 3529, 3521, 3515, 3507, 3497, 3489, 3483, 3475, + 3456, 3448, 3442, 3434, 3424, 3416, 3410, 3402, 3380, 3372, 3366, 3358, 3348, + 3340, 3334, 3326, 3307, 3299, 3293, 3285, 3275, 3267, 3261, 3253, 3387, 3379, + 3373, 3365, 3355, 3347, 3341, 3333, 3314, 3306, 3300, 3292, 3282, 3274, 3268, + 3260, 3238, 3230, 3224, 3216, 3206, 3198, 3192, 3184, 3165, 3157, 3151, 3143, + 3133, 3125, 3119, 3111, 3085, 3077, 3071, 3063, 3053, 3045, 3039, 3031, 3012, + 3004, 2998, 2990, 2980, 2972, 2966, 2958, 2936, 2928, 2922, 2914, 2904, 2896, + 2890, 2882, 2863, 2855, 2849, 2841, 2831, 2823, 2817, 2809, 3540, 3532, 3526, + 3518, 3508, 3500, 3494, 3486, 3467, 3459, 3453, 3445, 3435, 3427, 3421, 3413, + 3391, 3383, 3377, 3369, 3359, 3351, 3345, 3337, 3318, 3310, 3304, 3296, 3286, + 3278, 3272, 3264, 3238, 3230, 3224, 3216, 3206, 3198, 3192, 3184, 3165, 3157, + 3151, 3143, 3133, 3125, 3119, 3111, 3089, 3081, 3075, 3067, 3057, 3049, 3043, + 3035, 3016, 3008, 3002, 2994, 2984, 2976, 2970, 2962, 3096, 3088, 3082, 3074, + 3064, 3056, 3050, 3042, 3023, 3015, 3009, 3001, 2991, 2983, 2977, 2969, 2947, + 2939, 2933, 2925, 2915, 2907, 2901, 2893, 2874, 2866, 2860, 2852, 2842, 2834, + 2828, 2820, 2794, 2786, 2780, 2772, 2762, 2754, 2748, 2740, 2721, 2713, 2707, + 2699, 2689, 2681, 2675, 2667, 2645, 2637, 2631, 2623, 2613, 2605, 2599, 2591, + 2572, 2564, 2558, 2550, 2540, 2532, 2526, 2518, 2720, 2712, 2706, 2698, 2688, + 2680, 2674, 2666, 2647, 2639, 2633, 2625, 2615, 2607, 2601, 2593, 2571, 2563, + 2557, 2549, 2539, 2531, 2525, 2517, 2498, 2490, 2484, 2476, 2466, 2458, 2452, + 2444, 2418, 2410, 2404, 2396, 2386, 2378, 2372, 2364, 2345, 2337, 2331, 2323, + 2313, 2305, 2299, 2291, 2269, 2261, 2255, 2247, 2237, 2229, 2223, 2215, 2196, + 2188, 2182, 2174, 2164, 2156, 2150, 2142, 2276, 2268, 2262, 2254, 2244, 2236, + 2230, 2222, 2203, 2195, 2189, 2181, 2171, 2163, 2157, 2149, 2127, 2119, 2113, + 2105, 2095, 2087, 2081, 2073, 2054, 2046, 2040, 2032, 2022, 2014, 2008, 2000, + 1974, 1966, 1960, 1952, 1942, 1934, 1928, 1920, 1901, 1893, 1887, 1879, 1869, + 1861, 1855, 1847, 1825, 1817, 1811, 1803, 1793, 1785, 1779, 1771, 1752, 1744, + 1738, 1730, 1720, 1712, 1706, 1698, 1897, 1883, 1860, 1846, 1819, 1805, 1782, + 1768, 1723, 1709, 1686, 1672, 1645, 1631, 1608, 1594, 1574, 1560, 1537, 1523, + 1496, 1482, 1459, 1445, 1400, 1386, 1363, 1349, 1322, 1308, 1285, 1271, 1608, + 1565, 1535, 1492, 1446, 1403, 1373, 1330, 1312, 1269, 1239, 1196, 1150, 1107, + 1077, 1034, 1291, 1218, 1171, 1098, 1015, 942, 895, 822, 953, 850, 729, + 626, 618, 431, 257, 257, 257, 257, 0, 255, 255, 255, 255, 429, + 616, 624, 727, 848, 951, 820, 893, 940, 1013, 1096, 1169, 1216, 1289, + 1032, 1075, 1105, 1148, 1194, 1237, 1267, 1310, 1328, 1371, 1401, 1444, 1490, + 1533, 1563, 1606, 1269, 1283, 1306, 1320, 1347, 1361, 1384, 1398, 1443, 1457, + 1480, 1494, 1521, 1535, 1558, 1572, 1592, 1606, 1629, 1643, 1670, 1684, 1707, + 1721, 1766, 1780, 1803, 1817, 1844, 1858, 1881, 1895, 1696, 1704, 1710, 1718, + 1728, 1736, 1742, 1750, 1769, 1777, 1783, 1791, 1801, 1809, 1815, 1823, 1845, + 1853, 1859, 1867, 1877, 1885, 1891, 1899, 1918, 1926, 1932, 1940, 1950, 1958, + 1964, 1972, 1998, 2006, 2012, 2020, 2030, 2038, 2044, 2052, 2071, 2079, 2085, + 2093, 2103, 2111, 2117, 2125, 2147, 2155, 2161, 2169, 2179, 2187, 2193, 2201, + 2220, 2228, 2234, 2242, 2252, 2260, 2266, 2274, 2140, 2148, 2154, 2162, 2172, + 2180, 2186, 2194, 2213, 2221, 2227, 2235, 2245, 2253, 2259, 2267, 2289, 2297, + 2303, 2311, 2321, 2329, 2335, 2343, 2362, 2370, 2376, 2384, 2394, 2402, 2408, + 2416, 2442, 2450, 2456, 2464, 2474, 2482, 2488, 2496, 2515, 2523, 2529, 2537, + 2547, 2555, 2561, 2569, 2591, 2599, 2605, 2613, 2623, 2631, 2637, 2645, 2664, + 2672, 2678, 2686, 2696, 2704, 2710, 2718, 2516, 2524, 2530, 2538, 2548, 2556, + 2562, 2570, 2589, 2597, 2603, 2611, 2621, 2629, 2635, 2643, 2665, 2673, 2679, + 2687, 2697, 2705, 2711, 2719, 2738, 2746, 2752, 2760, 2770, 2778, 2784, 2792, + 2818, 2826, 2832, 2840, 2850, 2858, 2864, 2872, 2891, 2899, 2905, 2913, 2923, + 2931, 2937, 2945, 2967, 2975, 2981, 2989, 2999, 3007, 3013, 3021, 3040, 3048, + 3054, 3062, 3072, 3080, 3086, 3094, 2960, 2968, 2974, 2982, 2992, 3000, 3006, + 3014, 3033, 3041, 3047, 3055, 3065, 3073, 3079, 3087, 3109, 3117, 3123, 3131, + 3141, 3149, 3155, 3163, 3182, 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3262, + 3270, 3276, 3284, 3294, 3302, 3308, 3316, 3335, 3343, 3349, 3357, 3367, 3375, + 3381, 3389, 3411, 3419, 3425, 3433, 3443, 3451, 3457, 3465, 3484, 3492, 3498, + 3506, 3516, 3524, 3530, 3538, 2807, 2815, 2821, 2829, 2839, 2847, 2853, 2861, + 2880, 2888, 2894, 2902, 2912, 2920, 2926, 2934, 2956, 2964, 2970, 2978, 2988, + 2996, 3002, 3010, 3029, 3037, 3043, 3051, 3061, 3069, 3075, 3083, 3109, 3117, + 3123, 3131, 3141, 3149, 3155, 3163, 3182, 3190, 3196, 3204, 3214, 3222, 3228, + 3236, 3258, 3266, 3272, 3280, 3290, 3298, 3304, 3312, 3331, 3339, 3345, 3353, + 3363, 3371, 3377, 3385, 3251, 3259, 3265, 3273, 3283, 3291, 3297, 3305, 3324, + 3332, 3338, 3346, 3356, 3364, 3370, 3378, 3400, 3408, 3414, 3422, 3432, 3440, + 3446, 3454, 3473, 3481, 3487, 3495, 3505, 3513, 3519, 3527, 3553, 3561, 3567, + 3575, 3585, 3593, 3599, 3607, 3626, 3634, 3640, 3648, 3658, 3666, 3672, 3680, + 3702, 3710, 3716, 3724, 3734, 3742, 3748, 3756, 3775, 3783, 3789, 3797, 3807, + 3815, 3821, 3829, 3627, 3635, 3641, 3649, 3659, 3667, 3673, 3681, 3700, 3708, + 3714, 3722, 3732, 3740, 3746, 3754, 3776, 3784, 3790, 3798, 3808, 3816, 3822, + 3830, 3849, 3857, 3863, 3871, 3881, 3889, 3895, 3903, 3929, 3937, 3943, 3951, + 3961, 3969, 3975, 3983, 4002, 4010, 4016, 4024, 4034, 4042, 4048, 4056, 4078, + 4086, 4092, 4100, 4110, 4118, 4124, 4132, 4151, 4159, 4165, 4173, 4183, 4191, + 4197, 4205, 4071, 4079, 4085, 4093, 4103, 4111, 4117, 4125, 4144, 4152, 4158, + 4166, 4176, 4184, 4190, 4198, 4220, 4228, 4234, 4242, 4252, 4260, 4266, 4274, + 4293, 4301, 4307, 4315, 4325, 4333, 4339, 4347, 4373, 4381, 4387, 4395, 4405, + 4413, 4419, 4427, 4446, 4454, 4460, 4468, 4478, 4486, 4492, 4500, 4522, 4530, + 4536, 4544, 4554, 4562, 4568, 4576, 4595, 4603, 4609, 4617, 4627, 4635, 4641, + 4649, 3742, 3750, 3756, 3764, 3774, 3782, 3788, 3796, 3815, 3823, 3829, 3837, + 3847, 3855, 3861, 3869, 3891, 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964, + 3972, 3978, 3986, 3996, 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076, 4084, + 4090, 4098, 4117, 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193, 4201, 4207, + 4215, 4225, 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298, 4306, 4312, 4320, + 4186, 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259, 4267, 4273, 4281, 4291, + 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367, 4375, 4381, 4389, 4408, 4416, + 4422, 4430, 4440, 4448, 4454, 4462, 4488, 4496, 4502, 4510, 4520, 4528, 4534, + 4542, 4561, 4569, 4575, 4583, 4593, 4601, 4607, 4615, 4637, 4645, 4651, 4659, + 4669, 4677, 4683, 4691, 4710, 4718, 4724, 4732, 4742, 4750, 4756, 4764, 4562, + 4570, 4576, 4584, 4594, 4602, 4608, 4616, 4635, 4643, 4649, 4657, 4667, 4675, + 4681, 4689, 4711, 4719, 4725, 4733, 4743, 4751, 4757, 4765, 4784, 4792, 4798, + 4806, 4816, 4824, 4830, 4838, 4864, 4872, 4878, 4886, 4896, 4904, 4910, 4918, + 4937, 4945, 4951, 4959, 4969, 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045, + 5053, 5059, 5067, 5086, 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006, 5014, + 5020, 5028, 5038, 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111, 5119, 5125, + 5133, 5155, 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, + 5260, 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340, 5348, 5354, 5362, 5381, + 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457, 5465, 5471, 5479, 5489, 5497, + 5503, 5511, 5530, 5538, 5544, 5552, 5562, 5570, 5576, 5584, 4853, 4861, 4867, + 4875, 4885, 4893, 4899, 4907, 4926, 4934, 4940, 4948, 4958, 4966, 4972, 4980, + 5002, 5010, 5016, 5024, 5034, 5042, 5048, 5056, 5075, 5083, 5089, 5097, 5107, + 5115, 5121, 5129, 5155, 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, + 5242, 5250, 5260, 5268, 5274, 5282, 5304, 5312, 5318, 5326, 5336, 5344, 5350, + 5358, 5377, 5385, 5391, 5399, 5409, 5417, 5423, 5431, 5297, 5305, 5311, 5319, + 5329, 5337, 5343, 5351, 5370, 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446, + 5454, 5460, 5468, 5478, 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551, 5559, + 5565, 5573, 5599, 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672, 5680, 5686, + 5694, 5704, 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780, 5788, 5794, 5802, + 5821, 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673, 5681, 5687, 5695, 5705, + 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778, 5786, 5792, 5800, 5822, 5830, + 5836, 5844, 5854, 5862, 5868, 5876, 5895, 5903, 5909, 5917, 5927, 5935, 5941, + 5949, 5975, 5983, 5989, 5997, 6007, 6015, 6021, 6029, 6048, 6056, 6062, 6070, + 6080, 6088, 6094, 6102, 6124, 6132, 6138, 6146, 6156, 6164, 6170, 6178, 6197, + 6205, 6211, 6219, 6229, 6237, 6243, 6251, 6117, 6125, 6131, 6139, 6149, 6157, + 6163, 6171, 6190, 6198, 6204, 6212, 6222, 6230, 6236, 6244, 6266, 6274, 6280, + 6288, 6298, 6306, 6312, 6320, 6339, 6347, 6353, 6361, 6371, 6379, 6385, 6393, + 6419, 6427, 6433, 6441, 6451, 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524, + 6532, 6538, 6546, 6568, 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641, 6649, + 6655, 6663, 6673, 6681, 6687, 6695, 3742, 3750, 3756, 3764, 3774, 3782, 3788, + 3796, 3815, 3823, 3829, 3837, 3847, 3855, 3861, 3869, 3891, 3899, 3905, 3913, + 3923, 3931, 3937, 3945, 3964, 3972, 3978, 3986, 3996, 4004, 4010, 4018, 4044, + 4052, 4058, 4066, 4076, 4084, 4090, 4098, 4117, 4125, 4131, 4139, 4149, 4157, + 4163, 4171, 4193, 4201, 4207, 4215, 4225, 4233, 4239, 4247, 4266, 4274, 4280, + 4288, 4298, 4306, 4312, 4320, 4186, 4194, 4200, 4208, 4218, 4226, 4232, 4240, + 4259, 4267, 4273, 4281, 4291, 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367, + 4375, 4381, 4389, 4408, 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488, 4496, + 4502, 4510, 4520, 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593, 4601, 4607, + 4615, 4637, 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710, 4718, 4724, 4732, + 4742, 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594, 4602, 4608, 4616, 4635, + 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711, 4719, 4725, 4733, 4743, 4751, + 4757, 4765, 4784, 4792, 4798, 4806, 4816, 4824, 4830, 4838, 4864, 4872, 4878, + 4886, 4896, 4904, 4910, 4918, 4937, 4945, 4951, 4959, 4969, 4977, 4983, 4991, + 5013, 5021, 5027, 5035, 5045, 5053, 5059, 5067, 5086, 5094, 5100, 5108, 5118, + 5126, 5132, 5140, 5006, 5014, 5020, 5028, 5038, 5046, 5052, 5060, 5079, 5087, + 5093, 5101, 5111, 5119, 5125, 5133, 5155, 5163, 5169, 5177, 5187, 5195, 5201, + 5209, 5228, 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5308, 5316, 5322, 5330, + 5340, 5348, 5354, 5362, 5381, 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457, + 5465, 5471, 5479, 5489, 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562, 5570, + 5576, 5584, 4853, 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926, 4934, 4940, + 4948, 4958, 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034, 5042, 5048, 5056, + 5075, 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155, 5163, 5169, 5177, 5187, + 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5304, 5312, + 5318, 5326, 5336, 5344, 5350, 5358, 5377, 5385, 5391, 5399, 5409, 5417, 5423, + 5431, 5297, 5305, 5311, 5319, 5329, 5337, 5343, 5351, 5370, 5378, 5384, 5392, + 5402, 5410, 5416, 5424, 5446, 5454, 5460, 5468, 5478, 5486, 5492, 5500, 5519, + 5527, 5533, 5541, 5551, 5559, 5565, 5573, 5599, 5607, 5613, 5621, 5631, 5639, + 5645, 5653, 5672, 5680, 5686, 5694, 5704, 5712, 5718, 5726, 5748, 5756, 5762, + 5770, 5780, 5788, 5794, 5802, 5821, 5829, 5835, 5843, 5853, 5861, 5867, 5875, + 5673, 5681, 5687, 5695, 5705, 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778, + 5786, 5792, 5800, 5822, 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895, 5903, + 5909, 5917, 5927, 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007, 6015, 6021, + 6029, 6048, 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124, 6132, 6138, 6146, + 6156, 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229, 6237, 6243, 6251, 6117, + 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190, 6198, 6204, 6212, 6222, 6230, + 6236, 6244, 6266, 6274, 6280, 6288, 6298, 6306, 6312, 6320, 6339, 6347, 6353, + 6361, 6371, 6379, 6385, 6393, 6419, 6427, 6433, 6441, 6451, 6459, 6465, 6473, + 6492, 6500, 6506, 6514, 6524, 6532, 6538, 6546, 6568, 6576, 6582, 6590, 6600, + 6608, 6614, 6622, 6641, 6649, 6655, 6663, 6673, 6681, 6687, 6695, 5788, 5796, + 5802, 5810, 5820, 5828, 5834, 5842, 5861, 5869, 5875, 5883, 5893, 5901, 5907, + 5915, 5937, 5945, 5951, 5959, 5969, 5977, 5983, 5991, 6010, 6018, 6024, 6032, + 6042, 6050, 6056, 6064, 6090, 6098, 6104, 6112, 6122, 6130, 6136, 6144, 6163, + 6171, 6177, 6185, 6195, 6203, 6209, 6217, 6239, 6247, 6253, 6261, 6271, 6279, + 6285, 6293, 6312, 6320, 6326, 6334, 6344, 6352, 6358, 6366, 6232, 6240, 6246, + 6254, 6264, 6272, 6278, 6286, 6305, 6313, 6319, 6327, 6337, 6345, 6351, 6359, + 6381, 6389, 6395, 6403, 6413, 6421, 6427, 6435, 6454, 6462, 6468, 6476, 6486, + 6494, 6500, 6508, 6534, 6542, 6548, 6556, 6566, 6574, 6580, 6588, 6607, 6615, + 6621, 6629, 6639, 6647, 6653, 6661, 6683, 6691, 6697, 6705, 6715, 6723, 6729, + 6737, 6756, 6764, 6770, 6778, 6788, 6796, 6802, 6810, 6608, 6616, 6622, 6630, + 6640, 6648, 6654, 6662, 6681, 6689, 6695, 6703, 6713, 6721, 6727, 6735, 6757, + 6765, 6771, 6779, 6789, 6797, 6803, 6811, 6830, 6838, 6844, 6852, 6862, 6870, + 6876, 6884, 6910, 6918, 6924, 6932, 6942, 6950, 6956, 6964, 6983, 6991, 6997, + 7005, 7015, 7023, 7029, 7037, 7059, 7067, 7073, 7081, 7091, 7099, 7105, 7113, + 7132, 7140, 7146, 7154, 7164, 7172, 7178, 7186, 7052, 7060, 7066, 7074, 7084, + 7092, 7098, 7106, 7125, 7133, 7139, 7147, 7157, 7165, 7171, 7179, 7201, 7209, + 7215, 7223, 7233, 7241, 7247, 7255, 7274, 7282, 7288, 7296, 7306, 7314, 7320, + 7328, 7354, 7362, 7368, 7376, 7386, 7394, 7400, 7408, 7427, 7435, 7441, 7449, + 7459, 7467, 7473, 7481, 7503, 7511, 7517, 7525, 7535, 7543, 7549, 7557, 7576, + 7584, 7590, 7598, 7608, 7616, 7622, 7630, 6899, 6907, 6913, 6921, 6931, 6939, + 6945, 6953, 6972, 6980, 6986, 6994, 7004, 7012, 7018, 7026, 7048, 7056, 7062, + 7070, 7080, 7088, 7094, 7102, 7121, 7129, 7135, 7143, 7153, 7161, 7167, 7175, + 7201, 7209, 7215, 7223, 7233, 7241, 7247, 7255, 7274, 7282, 7288, 7296, 7306, + 7314, 7320, 7328, 7350, 7358, 7364, 7372, 7382, 7390, 7396, 7404, 7423, 7431, + 7437, 7445, 7455, 7463, 7469, 7477, 7343, 7351, 7357, 7365, 7375, 7383, 7389, + 7397, 7416, 7424, 7430, 7438, 7448, 7456, 7462, 7470, 7492, 7500, 7506, 7514, + 7524, 7532, 7538, 7546, 7565, 7573, 7579, 7587, 7597, 7605, 7611, 7619, 7645, + 7653, 7659, 7667, 7677, 7685, 7691, 7699, 7718, 7726, 7732, 7740, 7750, 7758, + 7764, 7772, 7794, 7802, 7808, 7816, 7826, 7834, 7840, 7848, 7867, 7875, 7881, + 7889, 7899, 7907, 7913, 7921, 7719, 7727, 7733, 7741, 7751, 7759, 7765, 7773, + 7792, 7800, 7806, 7814, 7824, 7832, 7838, 7846, 7868, 7876, 7882, 7890, 7900, + 7908, 7914, 7922, 7941, 7949, 7955, 7963, 7973, 7981, 7987, 7995, 8021, 8029, + 8035, 8043, 8053, 8061, 8067, 8075, 8094, 8102, 8108, 8116, 8126, 8134, 8140, + 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243, 8251, 8257, 8265, + 8275 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h new file mode 100644 index 0000000000..5cc4505f09 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h @@ -0,0 +1,848 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Generated file, included by tokenize.c */ +/* Values generated by fill_value_tokens() */ + +static const TOKENVALUE dct_value_tokens[2048 * 2] = { + { 10, 3963 }, { 10, 3961 }, { 10, 3959 }, { 10, 3957 }, { 10, 3955 }, + { 10, 3953 }, { 10, 3951 }, { 10, 3949 }, { 10, 3947 }, { 10, 3945 }, + { 10, 3943 }, { 10, 3941 }, { 10, 3939 }, { 10, 3937 }, { 10, 3935 }, + { 10, 3933 }, { 10, 3931 }, { 10, 3929 }, { 10, 3927 }, { 10, 3925 }, + { 10, 3923 }, { 10, 3921 }, { 10, 3919 }, { 10, 3917 }, { 10, 3915 }, + { 10, 3913 }, { 10, 3911 }, { 10, 3909 }, { 10, 3907 }, { 10, 3905 }, + { 10, 3903 }, { 10, 3901 }, { 10, 3899 }, { 10, 3897 }, { 10, 3895 }, + { 10, 3893 }, { 10, 3891 }, { 10, 3889 }, { 10, 3887 }, { 10, 3885 }, + { 10, 3883 }, { 10, 3881 }, { 10, 3879 }, { 10, 3877 }, { 10, 3875 }, + { 10, 3873 }, { 10, 3871 }, { 10, 3869 }, { 10, 3867 }, { 10, 3865 }, + { 10, 3863 }, { 10, 3861 }, { 10, 3859 }, { 10, 3857 }, { 10, 3855 }, + { 10, 3853 }, { 10, 3851 }, { 10, 3849 }, { 10, 3847 }, { 10, 3845 }, + { 10, 3843 }, { 10, 3841 }, { 10, 3839 }, { 10, 3837 }, { 10, 3835 }, + { 10, 3833 }, { 10, 3831 }, { 10, 3829 }, { 10, 3827 }, { 10, 3825 }, + { 10, 3823 }, { 10, 3821 }, { 10, 3819 }, { 10, 3817 }, { 10, 3815 }, + { 10, 3813 }, { 10, 3811 }, { 10, 3809 }, { 10, 3807 }, { 10, 3805 }, + { 10, 3803 }, { 10, 3801 }, { 10, 3799 }, { 10, 3797 }, { 10, 3795 }, + { 10, 3793 }, { 10, 3791 }, { 10, 3789 }, { 10, 3787 }, { 10, 3785 }, + { 10, 3783 }, { 10, 3781 }, { 10, 3779 }, { 10, 3777 }, { 10, 3775 }, + { 10, 3773 }, { 10, 3771 }, { 10, 3769 }, { 10, 3767 }, { 10, 3765 }, + { 10, 3763 }, { 10, 3761 }, { 10, 3759 }, { 10, 3757 }, { 10, 3755 }, + { 10, 3753 }, { 10, 3751 }, { 10, 3749 }, { 10, 3747 }, { 10, 3745 }, + { 10, 3743 }, { 10, 3741 }, { 10, 3739 }, { 10, 3737 }, { 10, 3735 }, + { 10, 3733 }, { 10, 3731 }, { 10, 3729 }, { 10, 3727 }, { 10, 3725 }, + { 10, 3723 }, { 10, 3721 }, { 10, 3719 }, { 10, 3717 }, { 10, 3715 }, + { 10, 3713 }, { 10, 3711 }, { 10, 3709 }, { 10, 3707 }, { 10, 3705 }, + { 10, 3703 }, { 10, 3701 }, { 10, 3699 }, { 10, 3697 }, { 10, 3695 }, + { 10, 3693 }, { 10, 3691 }, { 10, 3689 }, { 10, 3687 }, { 10, 3685 }, + { 10, 3683 }, { 10, 3681 }, { 10, 3679 }, { 10, 3677 }, { 10, 3675 }, + { 10, 3673 }, { 10, 3671 }, { 10, 3669 }, { 10, 3667 }, { 10, 3665 }, + { 10, 3663 }, { 10, 3661 }, { 10, 3659 }, { 10, 3657 }, { 10, 3655 }, + { 10, 3653 }, { 10, 3651 }, { 10, 3649 }, { 10, 3647 }, { 10, 3645 }, + { 10, 3643 }, { 10, 3641 }, { 10, 3639 }, { 10, 3637 }, { 10, 3635 }, + { 10, 3633 }, { 10, 3631 }, { 10, 3629 }, { 10, 3627 }, { 10, 3625 }, + { 10, 3623 }, { 10, 3621 }, { 10, 3619 }, { 10, 3617 }, { 10, 3615 }, + { 10, 3613 }, { 10, 3611 }, { 10, 3609 }, { 10, 3607 }, { 10, 3605 }, + { 10, 3603 }, { 10, 3601 }, { 10, 3599 }, { 10, 3597 }, { 10, 3595 }, + { 10, 3593 }, { 10, 3591 }, { 10, 3589 }, { 10, 3587 }, { 10, 3585 }, + { 10, 3583 }, { 10, 3581 }, { 10, 3579 }, { 10, 3577 }, { 10, 3575 }, + { 10, 3573 }, { 10, 3571 }, { 10, 3569 }, { 10, 3567 }, { 10, 3565 }, + { 10, 3563 }, { 10, 3561 }, { 10, 3559 }, { 10, 3557 }, { 10, 3555 }, + { 10, 3553 }, { 10, 3551 }, { 10, 3549 }, { 10, 3547 }, { 10, 3545 }, + { 10, 3543 }, { 10, 3541 }, { 10, 3539 }, { 10, 3537 }, { 10, 3535 }, + { 10, 3533 }, { 10, 3531 }, { 10, 3529 }, { 10, 3527 }, { 10, 3525 }, + { 10, 3523 }, { 10, 3521 }, { 10, 3519 }, { 10, 3517 }, { 10, 3515 }, + { 10, 3513 }, { 10, 3511 }, { 10, 3509 }, { 10, 3507 }, { 10, 3505 }, + { 10, 3503 }, { 10, 3501 }, { 10, 3499 }, { 10, 3497 }, { 10, 3495 }, + { 10, 3493 }, { 10, 3491 }, { 10, 3489 }, { 10, 3487 }, { 10, 3485 }, + { 10, 3483 }, { 10, 3481 }, { 10, 3479 }, { 10, 3477 }, { 10, 3475 }, + { 10, 3473 }, { 10, 3471 }, { 10, 3469 }, { 10, 3467 }, { 10, 3465 }, + { 10, 3463 }, { 10, 3461 }, { 10, 3459 }, { 10, 3457 }, { 10, 3455 }, + { 10, 3453 }, { 10, 3451 }, { 10, 3449 }, { 10, 3447 }, { 10, 3445 }, + { 10, 3443 }, { 10, 3441 }, { 10, 3439 }, { 10, 3437 }, { 10, 3435 }, + { 10, 3433 }, { 10, 3431 }, { 10, 3429 }, { 10, 3427 }, { 10, 3425 }, + { 10, 3423 }, { 10, 3421 }, { 10, 3419 }, { 10, 3417 }, { 10, 3415 }, + { 10, 3413 }, { 10, 3411 }, { 10, 3409 }, { 10, 3407 }, { 10, 3405 }, + { 10, 3403 }, { 10, 3401 }, { 10, 3399 }, { 10, 3397 }, { 10, 3395 }, + { 10, 3393 }, { 10, 3391 }, { 10, 3389 }, { 10, 3387 }, { 10, 3385 }, + { 10, 3383 }, { 10, 3381 }, { 10, 3379 }, { 10, 3377 }, { 10, 3375 }, + { 10, 3373 }, { 10, 3371 }, { 10, 3369 }, { 10, 3367 }, { 10, 3365 }, + { 10, 3363 }, { 10, 3361 }, { 10, 3359 }, { 10, 3357 }, { 10, 3355 }, + { 10, 3353 }, { 10, 3351 }, { 10, 3349 }, { 10, 3347 }, { 10, 3345 }, + { 10, 3343 }, { 10, 3341 }, { 10, 3339 }, { 10, 3337 }, { 10, 3335 }, + { 10, 3333 }, { 10, 3331 }, { 10, 3329 }, { 10, 3327 }, { 10, 3325 }, + { 10, 3323 }, { 10, 3321 }, { 10, 3319 }, { 10, 3317 }, { 10, 3315 }, + { 10, 3313 }, { 10, 3311 }, { 10, 3309 }, { 10, 3307 }, { 10, 3305 }, + { 10, 3303 }, { 10, 3301 }, { 10, 3299 }, { 10, 3297 }, { 10, 3295 }, + { 10, 3293 }, { 10, 3291 }, { 10, 3289 }, { 10, 3287 }, { 10, 3285 }, + { 10, 3283 }, { 10, 3281 }, { 10, 3279 }, { 10, 3277 }, { 10, 3275 }, + { 10, 3273 }, { 10, 3271 }, { 10, 3269 }, { 10, 3267 }, { 10, 3265 }, + { 10, 3263 }, { 10, 3261 }, { 10, 3259 }, { 10, 3257 }, { 10, 3255 }, + { 10, 3253 }, { 10, 3251 }, { 10, 3249 }, { 10, 3247 }, { 10, 3245 }, + { 10, 3243 }, { 10, 3241 }, { 10, 3239 }, { 10, 3237 }, { 10, 3235 }, + { 10, 3233 }, { 10, 3231 }, { 10, 3229 }, { 10, 3227 }, { 10, 3225 }, + { 10, 3223 }, { 10, 3221 }, { 10, 3219 }, { 10, 3217 }, { 10, 3215 }, + { 10, 3213 }, { 10, 3211 }, { 10, 3209 }, { 10, 3207 }, { 10, 3205 }, + { 10, 3203 }, { 10, 3201 }, { 10, 3199 }, { 10, 3197 }, { 10, 3195 }, + { 10, 3193 }, { 10, 3191 }, { 10, 3189 }, { 10, 3187 }, { 10, 3185 }, + { 10, 3183 }, { 10, 3181 }, { 10, 3179 }, { 10, 3177 }, { 10, 3175 }, + { 10, 3173 }, { 10, 3171 }, { 10, 3169 }, { 10, 3167 }, { 10, 3165 }, + { 10, 3163 }, { 10, 3161 }, { 10, 3159 }, { 10, 3157 }, { 10, 3155 }, + { 10, 3153 }, { 10, 3151 }, { 10, 3149 }, { 10, 3147 }, { 10, 3145 }, + { 10, 3143 }, { 10, 3141 }, { 10, 3139 }, { 10, 3137 }, { 10, 3135 }, + { 10, 3133 }, { 10, 3131 }, { 10, 3129 }, { 10, 3127 }, { 10, 3125 }, + { 10, 3123 }, { 10, 3121 }, { 10, 3119 }, { 10, 3117 }, { 10, 3115 }, + { 10, 3113 }, { 10, 3111 }, { 10, 3109 }, { 10, 3107 }, { 10, 3105 }, + { 10, 3103 }, { 10, 3101 }, { 10, 3099 }, { 10, 3097 }, { 10, 3095 }, + { 10, 3093 }, { 10, 3091 }, { 10, 3089 }, { 10, 3087 }, { 10, 3085 }, + { 10, 3083 }, { 10, 3081 }, { 10, 3079 }, { 10, 3077 }, { 10, 3075 }, + { 10, 3073 }, { 10, 3071 }, { 10, 3069 }, { 10, 3067 }, { 10, 3065 }, + { 10, 3063 }, { 10, 3061 }, { 10, 3059 }, { 10, 3057 }, { 10, 3055 }, + { 10, 3053 }, { 10, 3051 }, { 10, 3049 }, { 10, 3047 }, { 10, 3045 }, + { 10, 3043 }, { 10, 3041 }, { 10, 3039 }, { 10, 3037 }, { 10, 3035 }, + { 10, 3033 }, { 10, 3031 }, { 10, 3029 }, { 10, 3027 }, { 10, 3025 }, + { 10, 3023 }, { 10, 3021 }, { 10, 3019 }, { 10, 3017 }, { 10, 3015 }, + { 10, 3013 }, { 10, 3011 }, { 10, 3009 }, { 10, 3007 }, { 10, 3005 }, + { 10, 3003 }, { 10, 3001 }, { 10, 2999 }, { 10, 2997 }, { 10, 2995 }, + { 10, 2993 }, { 10, 2991 }, { 10, 2989 }, { 10, 2987 }, { 10, 2985 }, + { 10, 2983 }, { 10, 2981 }, { 10, 2979 }, { 10, 2977 }, { 10, 2975 }, + { 10, 2973 }, { 10, 2971 }, { 10, 2969 }, { 10, 2967 }, { 10, 2965 }, + { 10, 2963 }, { 10, 2961 }, { 10, 2959 }, { 10, 2957 }, { 10, 2955 }, + { 10, 2953 }, { 10, 2951 }, { 10, 2949 }, { 10, 2947 }, { 10, 2945 }, + { 10, 2943 }, { 10, 2941 }, { 10, 2939 }, { 10, 2937 }, { 10, 2935 }, + { 10, 2933 }, { 10, 2931 }, { 10, 2929 }, { 10, 2927 }, { 10, 2925 }, + { 10, 2923 }, { 10, 2921 }, { 10, 2919 }, { 10, 2917 }, { 10, 2915 }, + { 10, 2913 }, { 10, 2911 }, { 10, 2909 }, { 10, 2907 }, { 10, 2905 }, + { 10, 2903 }, { 10, 2901 }, { 10, 2899 }, { 10, 2897 }, { 10, 2895 }, + { 10, 2893 }, { 10, 2891 }, { 10, 2889 }, { 10, 2887 }, { 10, 2885 }, + { 10, 2883 }, { 10, 2881 }, { 10, 2879 }, { 10, 2877 }, { 10, 2875 }, + { 10, 2873 }, { 10, 2871 }, { 10, 2869 }, { 10, 2867 }, { 10, 2865 }, + { 10, 2863 }, { 10, 2861 }, { 10, 2859 }, { 10, 2857 }, { 10, 2855 }, + { 10, 2853 }, { 10, 2851 }, { 10, 2849 }, { 10, 2847 }, { 10, 2845 }, + { 10, 2843 }, { 10, 2841 }, { 10, 2839 }, { 10, 2837 }, { 10, 2835 }, + { 10, 2833 }, { 10, 2831 }, { 10, 2829 }, { 10, 2827 }, { 10, 2825 }, + { 10, 2823 }, { 10, 2821 }, { 10, 2819 }, { 10, 2817 }, { 10, 2815 }, + { 10, 2813 }, { 10, 2811 }, { 10, 2809 }, { 10, 2807 }, { 10, 2805 }, + { 10, 2803 }, { 10, 2801 }, { 10, 2799 }, { 10, 2797 }, { 10, 2795 }, + { 10, 2793 }, { 10, 2791 }, { 10, 2789 }, { 10, 2787 }, { 10, 2785 }, + { 10, 2783 }, { 10, 2781 }, { 10, 2779 }, { 10, 2777 }, { 10, 2775 }, + { 10, 2773 }, { 10, 2771 }, { 10, 2769 }, { 10, 2767 }, { 10, 2765 }, + { 10, 2763 }, { 10, 2761 }, { 10, 2759 }, { 10, 2757 }, { 10, 2755 }, + { 10, 2753 }, { 10, 2751 }, { 10, 2749 }, { 10, 2747 }, { 10, 2745 }, + { 10, 2743 }, { 10, 2741 }, { 10, 2739 }, { 10, 2737 }, { 10, 2735 }, + { 10, 2733 }, { 10, 2731 }, { 10, 2729 }, { 10, 2727 }, { 10, 2725 }, + { 10, 2723 }, { 10, 2721 }, { 10, 2719 }, { 10, 2717 }, { 10, 2715 }, + { 10, 2713 }, { 10, 2711 }, { 10, 2709 }, { 10, 2707 }, { 10, 2705 }, + { 10, 2703 }, { 10, 2701 }, { 10, 2699 }, { 10, 2697 }, { 10, 2695 }, + { 10, 2693 }, { 10, 2691 }, { 10, 2689 }, { 10, 2687 }, { 10, 2685 }, + { 10, 2683 }, { 10, 2681 }, { 10, 2679 }, { 10, 2677 }, { 10, 2675 }, + { 10, 2673 }, { 10, 2671 }, { 10, 2669 }, { 10, 2667 }, { 10, 2665 }, + { 10, 2663 }, { 10, 2661 }, { 10, 2659 }, { 10, 2657 }, { 10, 2655 }, + { 10, 2653 }, { 10, 2651 }, { 10, 2649 }, { 10, 2647 }, { 10, 2645 }, + { 10, 2643 }, { 10, 2641 }, { 10, 2639 }, { 10, 2637 }, { 10, 2635 }, + { 10, 2633 }, { 10, 2631 }, { 10, 2629 }, { 10, 2627 }, { 10, 2625 }, + { 10, 2623 }, { 10, 2621 }, { 10, 2619 }, { 10, 2617 }, { 10, 2615 }, + { 10, 2613 }, { 10, 2611 }, { 10, 2609 }, { 10, 2607 }, { 10, 2605 }, + { 10, 2603 }, { 10, 2601 }, { 10, 2599 }, { 10, 2597 }, { 10, 2595 }, + { 10, 2593 }, { 10, 2591 }, { 10, 2589 }, { 10, 2587 }, { 10, 2585 }, + { 10, 2583 }, { 10, 2581 }, { 10, 2579 }, { 10, 2577 }, { 10, 2575 }, + { 10, 2573 }, { 10, 2571 }, { 10, 2569 }, { 10, 2567 }, { 10, 2565 }, + { 10, 2563 }, { 10, 2561 }, { 10, 2559 }, { 10, 2557 }, { 10, 2555 }, + { 10, 2553 }, { 10, 2551 }, { 10, 2549 }, { 10, 2547 }, { 10, 2545 }, + { 10, 2543 }, { 10, 2541 }, { 10, 2539 }, { 10, 2537 }, { 10, 2535 }, + { 10, 2533 }, { 10, 2531 }, { 10, 2529 }, { 10, 2527 }, { 10, 2525 }, + { 10, 2523 }, { 10, 2521 }, { 10, 2519 }, { 10, 2517 }, { 10, 2515 }, + { 10, 2513 }, { 10, 2511 }, { 10, 2509 }, { 10, 2507 }, { 10, 2505 }, + { 10, 2503 }, { 10, 2501 }, { 10, 2499 }, { 10, 2497 }, { 10, 2495 }, + { 10, 2493 }, { 10, 2491 }, { 10, 2489 }, { 10, 2487 }, { 10, 2485 }, + { 10, 2483 }, { 10, 2481 }, { 10, 2479 }, { 10, 2477 }, { 10, 2475 }, + { 10, 2473 }, { 10, 2471 }, { 10, 2469 }, { 10, 2467 }, { 10, 2465 }, + { 10, 2463 }, { 10, 2461 }, { 10, 2459 }, { 10, 2457 }, { 10, 2455 }, + { 10, 2453 }, { 10, 2451 }, { 10, 2449 }, { 10, 2447 }, { 10, 2445 }, + { 10, 2443 }, { 10, 2441 }, { 10, 2439 }, { 10, 2437 }, { 10, 2435 }, + { 10, 2433 }, { 10, 2431 }, { 10, 2429 }, { 10, 2427 }, { 10, 2425 }, + { 10, 2423 }, { 10, 2421 }, { 10, 2419 }, { 10, 2417 }, { 10, 2415 }, + { 10, 2413 }, { 10, 2411 }, { 10, 2409 }, { 10, 2407 }, { 10, 2405 }, + { 10, 2403 }, { 10, 2401 }, { 10, 2399 }, { 10, 2397 }, { 10, 2395 }, + { 10, 2393 }, { 10, 2391 }, { 10, 2389 }, { 10, 2387 }, { 10, 2385 }, + { 10, 2383 }, { 10, 2381 }, { 10, 2379 }, { 10, 2377 }, { 10, 2375 }, + { 10, 2373 }, { 10, 2371 }, { 10, 2369 }, { 10, 2367 }, { 10, 2365 }, + { 10, 2363 }, { 10, 2361 }, { 10, 2359 }, { 10, 2357 }, { 10, 2355 }, + { 10, 2353 }, { 10, 2351 }, { 10, 2349 }, { 10, 2347 }, { 10, 2345 }, + { 10, 2343 }, { 10, 2341 }, { 10, 2339 }, { 10, 2337 }, { 10, 2335 }, + { 10, 2333 }, { 10, 2331 }, { 10, 2329 }, { 10, 2327 }, { 10, 2325 }, + { 10, 2323 }, { 10, 2321 }, { 10, 2319 }, { 10, 2317 }, { 10, 2315 }, + { 10, 2313 }, { 10, 2311 }, { 10, 2309 }, { 10, 2307 }, { 10, 2305 }, + { 10, 2303 }, { 10, 2301 }, { 10, 2299 }, { 10, 2297 }, { 10, 2295 }, + { 10, 2293 }, { 10, 2291 }, { 10, 2289 }, { 10, 2287 }, { 10, 2285 }, + { 10, 2283 }, { 10, 2281 }, { 10, 2279 }, { 10, 2277 }, { 10, 2275 }, + { 10, 2273 }, { 10, 2271 }, { 10, 2269 }, { 10, 2267 }, { 10, 2265 }, + { 10, 2263 }, { 10, 2261 }, { 10, 2259 }, { 10, 2257 }, { 10, 2255 }, + { 10, 2253 }, { 10, 2251 }, { 10, 2249 }, { 10, 2247 }, { 10, 2245 }, + { 10, 2243 }, { 10, 2241 }, { 10, 2239 }, { 10, 2237 }, { 10, 2235 }, + { 10, 2233 }, { 10, 2231 }, { 10, 2229 }, { 10, 2227 }, { 10, 2225 }, + { 10, 2223 }, { 10, 2221 }, { 10, 2219 }, { 10, 2217 }, { 10, 2215 }, + { 10, 2213 }, { 10, 2211 }, { 10, 2209 }, { 10, 2207 }, { 10, 2205 }, + { 10, 2203 }, { 10, 2201 }, { 10, 2199 }, { 10, 2197 }, { 10, 2195 }, + { 10, 2193 }, { 10, 2191 }, { 10, 2189 }, { 10, 2187 }, { 10, 2185 }, + { 10, 2183 }, { 10, 2181 }, { 10, 2179 }, { 10, 2177 }, { 10, 2175 }, + { 10, 2173 }, { 10, 2171 }, { 10, 2169 }, { 10, 2167 }, { 10, 2165 }, + { 10, 2163 }, { 10, 2161 }, { 10, 2159 }, { 10, 2157 }, { 10, 2155 }, + { 10, 2153 }, { 10, 2151 }, { 10, 2149 }, { 10, 2147 }, { 10, 2145 }, + { 10, 2143 }, { 10, 2141 }, { 10, 2139 }, { 10, 2137 }, { 10, 2135 }, + { 10, 2133 }, { 10, 2131 }, { 10, 2129 }, { 10, 2127 }, { 10, 2125 }, + { 10, 2123 }, { 10, 2121 }, { 10, 2119 }, { 10, 2117 }, { 10, 2115 }, + { 10, 2113 }, { 10, 2111 }, { 10, 2109 }, { 10, 2107 }, { 10, 2105 }, + { 10, 2103 }, { 10, 2101 }, { 10, 2099 }, { 10, 2097 }, { 10, 2095 }, + { 10, 2093 }, { 10, 2091 }, { 10, 2089 }, { 10, 2087 }, { 10, 2085 }, + { 10, 2083 }, { 10, 2081 }, { 10, 2079 }, { 10, 2077 }, { 10, 2075 }, + { 10, 2073 }, { 10, 2071 }, { 10, 2069 }, { 10, 2067 }, { 10, 2065 }, + { 10, 2063 }, { 10, 2061 }, { 10, 2059 }, { 10, 2057 }, { 10, 2055 }, + { 10, 2053 }, { 10, 2051 }, { 10, 2049 }, { 10, 2047 }, { 10, 2045 }, + { 10, 2043 }, { 10, 2041 }, { 10, 2039 }, { 10, 2037 }, { 10, 2035 }, + { 10, 2033 }, { 10, 2031 }, { 10, 2029 }, { 10, 2027 }, { 10, 2025 }, + { 10, 2023 }, { 10, 2021 }, { 10, 2019 }, { 10, 2017 }, { 10, 2015 }, + { 10, 2013 }, { 10, 2011 }, { 10, 2009 }, { 10, 2007 }, { 10, 2005 }, + { 10, 2003 }, { 10, 2001 }, { 10, 1999 }, { 10, 1997 }, { 10, 1995 }, + { 10, 1993 }, { 10, 1991 }, { 10, 1989 }, { 10, 1987 }, { 10, 1985 }, + { 10, 1983 }, { 10, 1981 }, { 10, 1979 }, { 10, 1977 }, { 10, 1975 }, + { 10, 1973 }, { 10, 1971 }, { 10, 1969 }, { 10, 1967 }, { 10, 1965 }, + { 10, 1963 }, { 10, 1961 }, { 10, 1959 }, { 10, 1957 }, { 10, 1955 }, + { 10, 1953 }, { 10, 1951 }, { 10, 1949 }, { 10, 1947 }, { 10, 1945 }, + { 10, 1943 }, { 10, 1941 }, { 10, 1939 }, { 10, 1937 }, { 10, 1935 }, + { 10, 1933 }, { 10, 1931 }, { 10, 1929 }, { 10, 1927 }, { 10, 1925 }, + { 10, 1923 }, { 10, 1921 }, { 10, 1919 }, { 10, 1917 }, { 10, 1915 }, + { 10, 1913 }, { 10, 1911 }, { 10, 1909 }, { 10, 1907 }, { 10, 1905 }, + { 10, 1903 }, { 10, 1901 }, { 10, 1899 }, { 10, 1897 }, { 10, 1895 }, + { 10, 1893 }, { 10, 1891 }, { 10, 1889 }, { 10, 1887 }, { 10, 1885 }, + { 10, 1883 }, { 10, 1881 }, { 10, 1879 }, { 10, 1877 }, { 10, 1875 }, + { 10, 1873 }, { 10, 1871 }, { 10, 1869 }, { 10, 1867 }, { 10, 1865 }, + { 10, 1863 }, { 10, 1861 }, { 10, 1859 }, { 10, 1857 }, { 10, 1855 }, + { 10, 1853 }, { 10, 1851 }, { 10, 1849 }, { 10, 1847 }, { 10, 1845 }, + { 10, 1843 }, { 10, 1841 }, { 10, 1839 }, { 10, 1837 }, { 10, 1835 }, + { 10, 1833 }, { 10, 1831 }, { 10, 1829 }, { 10, 1827 }, { 10, 1825 }, + { 10, 1823 }, { 10, 1821 }, { 10, 1819 }, { 10, 1817 }, { 10, 1815 }, + { 10, 1813 }, { 10, 1811 }, { 10, 1809 }, { 10, 1807 }, { 10, 1805 }, + { 10, 1803 }, { 10, 1801 }, { 10, 1799 }, { 10, 1797 }, { 10, 1795 }, + { 10, 1793 }, { 10, 1791 }, { 10, 1789 }, { 10, 1787 }, { 10, 1785 }, + { 10, 1783 }, { 10, 1781 }, { 10, 1779 }, { 10, 1777 }, { 10, 1775 }, + { 10, 1773 }, { 10, 1771 }, { 10, 1769 }, { 10, 1767 }, { 10, 1765 }, + { 10, 1763 }, { 10, 1761 }, { 10, 1759 }, { 10, 1757 }, { 10, 1755 }, + { 10, 1753 }, { 10, 1751 }, { 10, 1749 }, { 10, 1747 }, { 10, 1745 }, + { 10, 1743 }, { 10, 1741 }, { 10, 1739 }, { 10, 1737 }, { 10, 1735 }, + { 10, 1733 }, { 10, 1731 }, { 10, 1729 }, { 10, 1727 }, { 10, 1725 }, + { 10, 1723 }, { 10, 1721 }, { 10, 1719 }, { 10, 1717 }, { 10, 1715 }, + { 10, 1713 }, { 10, 1711 }, { 10, 1709 }, { 10, 1707 }, { 10, 1705 }, + { 10, 1703 }, { 10, 1701 }, { 10, 1699 }, { 10, 1697 }, { 10, 1695 }, + { 10, 1693 }, { 10, 1691 }, { 10, 1689 }, { 10, 1687 }, { 10, 1685 }, + { 10, 1683 }, { 10, 1681 }, { 10, 1679 }, { 10, 1677 }, { 10, 1675 }, + { 10, 1673 }, { 10, 1671 }, { 10, 1669 }, { 10, 1667 }, { 10, 1665 }, + { 10, 1663 }, { 10, 1661 }, { 10, 1659 }, { 10, 1657 }, { 10, 1655 }, + { 10, 1653 }, { 10, 1651 }, { 10, 1649 }, { 10, 1647 }, { 10, 1645 }, + { 10, 1643 }, { 10, 1641 }, { 10, 1639 }, { 10, 1637 }, { 10, 1635 }, + { 10, 1633 }, { 10, 1631 }, { 10, 1629 }, { 10, 1627 }, { 10, 1625 }, + { 10, 1623 }, { 10, 1621 }, { 10, 1619 }, { 10, 1617 }, { 10, 1615 }, + { 10, 1613 }, { 10, 1611 }, { 10, 1609 }, { 10, 1607 }, { 10, 1605 }, + { 10, 1603 }, { 10, 1601 }, { 10, 1599 }, { 10, 1597 }, { 10, 1595 }, + { 10, 1593 }, { 10, 1591 }, { 10, 1589 }, { 10, 1587 }, { 10, 1585 }, + { 10, 1583 }, { 10, 1581 }, { 10, 1579 }, { 10, 1577 }, { 10, 1575 }, + { 10, 1573 }, { 10, 1571 }, { 10, 1569 }, { 10, 1567 }, { 10, 1565 }, + { 10, 1563 }, { 10, 1561 }, { 10, 1559 }, { 10, 1557 }, { 10, 1555 }, + { 10, 1553 }, { 10, 1551 }, { 10, 1549 }, { 10, 1547 }, { 10, 1545 }, + { 10, 1543 }, { 10, 1541 }, { 10, 1539 }, { 10, 1537 }, { 10, 1535 }, + { 10, 1533 }, { 10, 1531 }, { 10, 1529 }, { 10, 1527 }, { 10, 1525 }, + { 10, 1523 }, { 10, 1521 }, { 10, 1519 }, { 10, 1517 }, { 10, 1515 }, + { 10, 1513 }, { 10, 1511 }, { 10, 1509 }, { 10, 1507 }, { 10, 1505 }, + { 10, 1503 }, { 10, 1501 }, { 10, 1499 }, { 10, 1497 }, { 10, 1495 }, + { 10, 1493 }, { 10, 1491 }, { 10, 1489 }, { 10, 1487 }, { 10, 1485 }, + { 10, 1483 }, { 10, 1481 }, { 10, 1479 }, { 10, 1477 }, { 10, 1475 }, + { 10, 1473 }, { 10, 1471 }, { 10, 1469 }, { 10, 1467 }, { 10, 1465 }, + { 10, 1463 }, { 10, 1461 }, { 10, 1459 }, { 10, 1457 }, { 10, 1455 }, + { 10, 1453 }, { 10, 1451 }, { 10, 1449 }, { 10, 1447 }, { 10, 1445 }, + { 10, 1443 }, { 10, 1441 }, { 10, 1439 }, { 10, 1437 }, { 10, 1435 }, + { 10, 1433 }, { 10, 1431 }, { 10, 1429 }, { 10, 1427 }, { 10, 1425 }, + { 10, 1423 }, { 10, 1421 }, { 10, 1419 }, { 10, 1417 }, { 10, 1415 }, + { 10, 1413 }, { 10, 1411 }, { 10, 1409 }, { 10, 1407 }, { 10, 1405 }, + { 10, 1403 }, { 10, 1401 }, { 10, 1399 }, { 10, 1397 }, { 10, 1395 }, + { 10, 1393 }, { 10, 1391 }, { 10, 1389 }, { 10, 1387 }, { 10, 1385 }, + { 10, 1383 }, { 10, 1381 }, { 10, 1379 }, { 10, 1377 }, { 10, 1375 }, + { 10, 1373 }, { 10, 1371 }, { 10, 1369 }, { 10, 1367 }, { 10, 1365 }, + { 10, 1363 }, { 10, 1361 }, { 10, 1359 }, { 10, 1357 }, { 10, 1355 }, + { 10, 1353 }, { 10, 1351 }, { 10, 1349 }, { 10, 1347 }, { 10, 1345 }, + { 10, 1343 }, { 10, 1341 }, { 10, 1339 }, { 10, 1337 }, { 10, 1335 }, + { 10, 1333 }, { 10, 1331 }, { 10, 1329 }, { 10, 1327 }, { 10, 1325 }, + { 10, 1323 }, { 10, 1321 }, { 10, 1319 }, { 10, 1317 }, { 10, 1315 }, + { 10, 1313 }, { 10, 1311 }, { 10, 1309 }, { 10, 1307 }, { 10, 1305 }, + { 10, 1303 }, { 10, 1301 }, { 10, 1299 }, { 10, 1297 }, { 10, 1295 }, + { 10, 1293 }, { 10, 1291 }, { 10, 1289 }, { 10, 1287 }, { 10, 1285 }, + { 10, 1283 }, { 10, 1281 }, { 10, 1279 }, { 10, 1277 }, { 10, 1275 }, + { 10, 1273 }, { 10, 1271 }, { 10, 1269 }, { 10, 1267 }, { 10, 1265 }, + { 10, 1263 }, { 10, 1261 }, { 10, 1259 }, { 10, 1257 }, { 10, 1255 }, + { 10, 1253 }, { 10, 1251 }, { 10, 1249 }, { 10, 1247 }, { 10, 1245 }, + { 10, 1243 }, { 10, 1241 }, { 10, 1239 }, { 10, 1237 }, { 10, 1235 }, + { 10, 1233 }, { 10, 1231 }, { 10, 1229 }, { 10, 1227 }, { 10, 1225 }, + { 10, 1223 }, { 10, 1221 }, { 10, 1219 }, { 10, 1217 }, { 10, 1215 }, + { 10, 1213 }, { 10, 1211 }, { 10, 1209 }, { 10, 1207 }, { 10, 1205 }, + { 10, 1203 }, { 10, 1201 }, { 10, 1199 }, { 10, 1197 }, { 10, 1195 }, + { 10, 1193 }, { 10, 1191 }, { 10, 1189 }, { 10, 1187 }, { 10, 1185 }, + { 10, 1183 }, { 10, 1181 }, { 10, 1179 }, { 10, 1177 }, { 10, 1175 }, + { 10, 1173 }, { 10, 1171 }, { 10, 1169 }, { 10, 1167 }, { 10, 1165 }, + { 10, 1163 }, { 10, 1161 }, { 10, 1159 }, { 10, 1157 }, { 10, 1155 }, + { 10, 1153 }, { 10, 1151 }, { 10, 1149 }, { 10, 1147 }, { 10, 1145 }, + { 10, 1143 }, { 10, 1141 }, { 10, 1139 }, { 10, 1137 }, { 10, 1135 }, + { 10, 1133 }, { 10, 1131 }, { 10, 1129 }, { 10, 1127 }, { 10, 1125 }, + { 10, 1123 }, { 10, 1121 }, { 10, 1119 }, { 10, 1117 }, { 10, 1115 }, + { 10, 1113 }, { 10, 1111 }, { 10, 1109 }, { 10, 1107 }, { 10, 1105 }, + { 10, 1103 }, { 10, 1101 }, { 10, 1099 }, { 10, 1097 }, { 10, 1095 }, + { 10, 1093 }, { 10, 1091 }, { 10, 1089 }, { 10, 1087 }, { 10, 1085 }, + { 10, 1083 }, { 10, 1081 }, { 10, 1079 }, { 10, 1077 }, { 10, 1075 }, + { 10, 1073 }, { 10, 1071 }, { 10, 1069 }, { 10, 1067 }, { 10, 1065 }, + { 10, 1063 }, { 10, 1061 }, { 10, 1059 }, { 10, 1057 }, { 10, 1055 }, + { 10, 1053 }, { 10, 1051 }, { 10, 1049 }, { 10, 1047 }, { 10, 1045 }, + { 10, 1043 }, { 10, 1041 }, { 10, 1039 }, { 10, 1037 }, { 10, 1035 }, + { 10, 1033 }, { 10, 1031 }, { 10, 1029 }, { 10, 1027 }, { 10, 1025 }, + { 10, 1023 }, { 10, 1021 }, { 10, 1019 }, { 10, 1017 }, { 10, 1015 }, + { 10, 1013 }, { 10, 1011 }, { 10, 1009 }, { 10, 1007 }, { 10, 1005 }, + { 10, 1003 }, { 10, 1001 }, { 10, 999 }, { 10, 997 }, { 10, 995 }, + { 10, 993 }, { 10, 991 }, { 10, 989 }, { 10, 987 }, { 10, 985 }, + { 10, 983 }, { 10, 981 }, { 10, 979 }, { 10, 977 }, { 10, 975 }, + { 10, 973 }, { 10, 971 }, { 10, 969 }, { 10, 967 }, { 10, 965 }, + { 10, 963 }, { 10, 961 }, { 10, 959 }, { 10, 957 }, { 10, 955 }, + { 10, 953 }, { 10, 951 }, { 10, 949 }, { 10, 947 }, { 10, 945 }, + { 10, 943 }, { 10, 941 }, { 10, 939 }, { 10, 937 }, { 10, 935 }, + { 10, 933 }, { 10, 931 }, { 10, 929 }, { 10, 927 }, { 10, 925 }, + { 10, 923 }, { 10, 921 }, { 10, 919 }, { 10, 917 }, { 10, 915 }, + { 10, 913 }, { 10, 911 }, { 10, 909 }, { 10, 907 }, { 10, 905 }, + { 10, 903 }, { 10, 901 }, { 10, 899 }, { 10, 897 }, { 10, 895 }, + { 10, 893 }, { 10, 891 }, { 10, 889 }, { 10, 887 }, { 10, 885 }, + { 10, 883 }, { 10, 881 }, { 10, 879 }, { 10, 877 }, { 10, 875 }, + { 10, 873 }, { 10, 871 }, { 10, 869 }, { 10, 867 }, { 10, 865 }, + { 10, 863 }, { 10, 861 }, { 10, 859 }, { 10, 857 }, { 10, 855 }, + { 10, 853 }, { 10, 851 }, { 10, 849 }, { 10, 847 }, { 10, 845 }, + { 10, 843 }, { 10, 841 }, { 10, 839 }, { 10, 837 }, { 10, 835 }, + { 10, 833 }, { 10, 831 }, { 10, 829 }, { 10, 827 }, { 10, 825 }, + { 10, 823 }, { 10, 821 }, { 10, 819 }, { 10, 817 }, { 10, 815 }, + { 10, 813 }, { 10, 811 }, { 10, 809 }, { 10, 807 }, { 10, 805 }, + { 10, 803 }, { 10, 801 }, { 10, 799 }, { 10, 797 }, { 10, 795 }, + { 10, 793 }, { 10, 791 }, { 10, 789 }, { 10, 787 }, { 10, 785 }, + { 10, 783 }, { 10, 781 }, { 10, 779 }, { 10, 777 }, { 10, 775 }, + { 10, 773 }, { 10, 771 }, { 10, 769 }, { 10, 767 }, { 10, 765 }, + { 10, 763 }, { 10, 761 }, { 10, 759 }, { 10, 757 }, { 10, 755 }, + { 10, 753 }, { 10, 751 }, { 10, 749 }, { 10, 747 }, { 10, 745 }, + { 10, 743 }, { 10, 741 }, { 10, 739 }, { 10, 737 }, { 10, 735 }, + { 10, 733 }, { 10, 731 }, { 10, 729 }, { 10, 727 }, { 10, 725 }, + { 10, 723 }, { 10, 721 }, { 10, 719 }, { 10, 717 }, { 10, 715 }, + { 10, 713 }, { 10, 711 }, { 10, 709 }, { 10, 707 }, { 10, 705 }, + { 10, 703 }, { 10, 701 }, { 10, 699 }, { 10, 697 }, { 10, 695 }, + { 10, 693 }, { 10, 691 }, { 10, 689 }, { 10, 687 }, { 10, 685 }, + { 10, 683 }, { 10, 681 }, { 10, 679 }, { 10, 677 }, { 10, 675 }, + { 10, 673 }, { 10, 671 }, { 10, 669 }, { 10, 667 }, { 10, 665 }, + { 10, 663 }, { 10, 661 }, { 10, 659 }, { 10, 657 }, { 10, 655 }, + { 10, 653 }, { 10, 651 }, { 10, 649 }, { 10, 647 }, { 10, 645 }, + { 10, 643 }, { 10, 641 }, { 10, 639 }, { 10, 637 }, { 10, 635 }, + { 10, 633 }, { 10, 631 }, { 10, 629 }, { 10, 627 }, { 10, 625 }, + { 10, 623 }, { 10, 621 }, { 10, 619 }, { 10, 617 }, { 10, 615 }, + { 10, 613 }, { 10, 611 }, { 10, 609 }, { 10, 607 }, { 10, 605 }, + { 10, 603 }, { 10, 601 }, { 10, 599 }, { 10, 597 }, { 10, 595 }, + { 10, 593 }, { 10, 591 }, { 10, 589 }, { 10, 587 }, { 10, 585 }, + { 10, 583 }, { 10, 581 }, { 10, 579 }, { 10, 577 }, { 10, 575 }, + { 10, 573 }, { 10, 571 }, { 10, 569 }, { 10, 567 }, { 10, 565 }, + { 10, 563 }, { 10, 561 }, { 10, 559 }, { 10, 557 }, { 10, 555 }, + { 10, 553 }, { 10, 551 }, { 10, 549 }, { 10, 547 }, { 10, 545 }, + { 10, 543 }, { 10, 541 }, { 10, 539 }, { 10, 537 }, { 10, 535 }, + { 10, 533 }, { 10, 531 }, { 10, 529 }, { 10, 527 }, { 10, 525 }, + { 10, 523 }, { 10, 521 }, { 10, 519 }, { 10, 517 }, { 10, 515 }, + { 10, 513 }, { 10, 511 }, { 10, 509 }, { 10, 507 }, { 10, 505 }, + { 10, 503 }, { 10, 501 }, { 10, 499 }, { 10, 497 }, { 10, 495 }, + { 10, 493 }, { 10, 491 }, { 10, 489 }, { 10, 487 }, { 10, 485 }, + { 10, 483 }, { 10, 481 }, { 10, 479 }, { 10, 477 }, { 10, 475 }, + { 10, 473 }, { 10, 471 }, { 10, 469 }, { 10, 467 }, { 10, 465 }, + { 10, 463 }, { 10, 461 }, { 10, 459 }, { 10, 457 }, { 10, 455 }, + { 10, 453 }, { 10, 451 }, { 10, 449 }, { 10, 447 }, { 10, 445 }, + { 10, 443 }, { 10, 441 }, { 10, 439 }, { 10, 437 }, { 10, 435 }, + { 10, 433 }, { 10, 431 }, { 10, 429 }, { 10, 427 }, { 10, 425 }, + { 10, 423 }, { 10, 421 }, { 10, 419 }, { 10, 417 }, { 10, 415 }, + { 10, 413 }, { 10, 411 }, { 10, 409 }, { 10, 407 }, { 10, 405 }, + { 10, 403 }, { 10, 401 }, { 10, 399 }, { 10, 397 }, { 10, 395 }, + { 10, 393 }, { 10, 391 }, { 10, 389 }, { 10, 387 }, { 10, 385 }, + { 10, 383 }, { 10, 381 }, { 10, 379 }, { 10, 377 }, { 10, 375 }, + { 10, 373 }, { 10, 371 }, { 10, 369 }, { 10, 367 }, { 10, 365 }, + { 10, 363 }, { 10, 361 }, { 10, 359 }, { 10, 357 }, { 10, 355 }, + { 10, 353 }, { 10, 351 }, { 10, 349 }, { 10, 347 }, { 10, 345 }, + { 10, 343 }, { 10, 341 }, { 10, 339 }, { 10, 337 }, { 10, 335 }, + { 10, 333 }, { 10, 331 }, { 10, 329 }, { 10, 327 }, { 10, 325 }, + { 10, 323 }, { 10, 321 }, { 10, 319 }, { 10, 317 }, { 10, 315 }, + { 10, 313 }, { 10, 311 }, { 10, 309 }, { 10, 307 }, { 10, 305 }, + { 10, 303 }, { 10, 301 }, { 10, 299 }, { 10, 297 }, { 10, 295 }, + { 10, 293 }, { 10, 291 }, { 10, 289 }, { 10, 287 }, { 10, 285 }, + { 10, 283 }, { 10, 281 }, { 10, 279 }, { 10, 277 }, { 10, 275 }, + { 10, 273 }, { 10, 271 }, { 10, 269 }, { 10, 267 }, { 10, 265 }, + { 10, 263 }, { 10, 261 }, { 10, 259 }, { 10, 257 }, { 10, 255 }, + { 10, 253 }, { 10, 251 }, { 10, 249 }, { 10, 247 }, { 10, 245 }, + { 10, 243 }, { 10, 241 }, { 10, 239 }, { 10, 237 }, { 10, 235 }, + { 10, 233 }, { 10, 231 }, { 10, 229 }, { 10, 227 }, { 10, 225 }, + { 10, 223 }, { 10, 221 }, { 10, 219 }, { 10, 217 }, { 10, 215 }, + { 10, 213 }, { 10, 211 }, { 10, 209 }, { 10, 207 }, { 10, 205 }, + { 10, 203 }, { 10, 201 }, { 10, 199 }, { 10, 197 }, { 10, 195 }, + { 10, 193 }, { 10, 191 }, { 10, 189 }, { 10, 187 }, { 10, 185 }, + { 10, 183 }, { 10, 181 }, { 10, 179 }, { 10, 177 }, { 10, 175 }, + { 10, 173 }, { 10, 171 }, { 10, 169 }, { 10, 167 }, { 10, 165 }, + { 10, 163 }, { 10, 161 }, { 10, 159 }, { 10, 157 }, { 10, 155 }, + { 10, 153 }, { 10, 151 }, { 10, 149 }, { 10, 147 }, { 10, 145 }, + { 10, 143 }, { 10, 141 }, { 10, 139 }, { 10, 137 }, { 10, 135 }, + { 10, 133 }, { 10, 131 }, { 10, 129 }, { 10, 127 }, { 10, 125 }, + { 10, 123 }, { 10, 121 }, { 10, 119 }, { 10, 117 }, { 10, 115 }, + { 10, 113 }, { 10, 111 }, { 10, 109 }, { 10, 107 }, { 10, 105 }, + { 10, 103 }, { 10, 101 }, { 10, 99 }, { 10, 97 }, { 10, 95 }, + { 10, 93 }, { 10, 91 }, { 10, 89 }, { 10, 87 }, { 10, 85 }, + { 10, 83 }, { 10, 81 }, { 10, 79 }, { 10, 77 }, { 10, 75 }, + { 10, 73 }, { 10, 71 }, { 10, 69 }, { 10, 67 }, { 10, 65 }, + { 10, 63 }, { 10, 61 }, { 10, 59 }, { 10, 57 }, { 10, 55 }, + { 10, 53 }, { 10, 51 }, { 10, 49 }, { 10, 47 }, { 10, 45 }, + { 10, 43 }, { 10, 41 }, { 10, 39 }, { 10, 37 }, { 10, 35 }, + { 10, 33 }, { 10, 31 }, { 10, 29 }, { 10, 27 }, { 10, 25 }, + { 10, 23 }, { 10, 21 }, { 10, 19 }, { 10, 17 }, { 10, 15 }, + { 10, 13 }, { 10, 11 }, { 10, 9 }, { 10, 7 }, { 10, 5 }, + { 10, 3 }, { 10, 1 }, { 9, 63 }, { 9, 61 }, { 9, 59 }, + { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 }, { 9, 49 }, + { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, + { 9, 37 }, { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, + { 9, 27 }, { 9, 25 }, { 9, 23 }, { 9, 21 }, { 9, 19 }, + { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 }, + { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, + { 8, 29 }, { 8, 27 }, { 8, 25 }, { 8, 23 }, { 8, 21 }, + { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 }, { 8, 11 }, + { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, + { 7, 15 }, { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, + { 7, 5 }, { 7, 3 }, { 7, 1 }, { 6, 7 }, { 6, 5 }, + { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 }, + { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, + { 2, 0 }, { 3, 0 }, { 4, 0 }, { 5, 0 }, { 5, 2 }, + { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 }, { 7, 0 }, + { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, + { 7, 12 }, { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, + { 8, 6 }, { 8, 8 }, { 8, 10 }, { 8, 12 }, { 8, 14 }, + { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 }, + { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, + { 9, 4 }, { 9, 6 }, { 9, 8 }, { 9, 10 }, { 9, 12 }, + { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 }, { 9, 22 }, + { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, + { 9, 34 }, { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, + { 9, 44 }, { 9, 46 }, { 9, 48 }, { 9, 50 }, { 9, 52 }, + { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 }, + { 10, 0 }, { 10, 2 }, { 10, 4 }, { 10, 6 }, { 10, 8 }, + { 10, 10 }, { 10, 12 }, { 10, 14 }, { 10, 16 }, { 10, 18 }, + { 10, 20 }, { 10, 22 }, { 10, 24 }, { 10, 26 }, { 10, 28 }, + { 10, 30 }, { 10, 32 }, { 10, 34 }, { 10, 36 }, { 10, 38 }, + { 10, 40 }, { 10, 42 }, { 10, 44 }, { 10, 46 }, { 10, 48 }, + { 10, 50 }, { 10, 52 }, { 10, 54 }, { 10, 56 }, { 10, 58 }, + { 10, 60 }, { 10, 62 }, { 10, 64 }, { 10, 66 }, { 10, 68 }, + { 10, 70 }, { 10, 72 }, { 10, 74 }, { 10, 76 }, { 10, 78 }, + { 10, 80 }, { 10, 82 }, { 10, 84 }, { 10, 86 }, { 10, 88 }, + { 10, 90 }, { 10, 92 }, { 10, 94 }, { 10, 96 }, { 10, 98 }, + { 10, 100 }, { 10, 102 }, { 10, 104 }, { 10, 106 }, { 10, 108 }, + { 10, 110 }, { 10, 112 }, { 10, 114 }, { 10, 116 }, { 10, 118 }, + { 10, 120 }, { 10, 122 }, { 10, 124 }, { 10, 126 }, { 10, 128 }, + { 10, 130 }, { 10, 132 }, { 10, 134 }, { 10, 136 }, { 10, 138 }, + { 10, 140 }, { 10, 142 }, { 10, 144 }, { 10, 146 }, { 10, 148 }, + { 10, 150 }, { 10, 152 }, { 10, 154 }, { 10, 156 }, { 10, 158 }, + { 10, 160 }, { 10, 162 }, { 10, 164 }, { 10, 166 }, { 10, 168 }, + { 10, 170 }, { 10, 172 }, { 10, 174 }, { 10, 176 }, { 10, 178 }, + { 10, 180 }, { 10, 182 }, { 10, 184 }, { 10, 186 }, { 10, 188 }, + { 10, 190 }, { 10, 192 }, { 10, 194 }, { 10, 196 }, { 10, 198 }, + { 10, 200 }, { 10, 202 }, { 10, 204 }, { 10, 206 }, { 10, 208 }, + { 10, 210 }, { 10, 212 }, { 10, 214 }, { 10, 216 }, { 10, 218 }, + { 10, 220 }, { 10, 222 }, { 10, 224 }, { 10, 226 }, { 10, 228 }, + { 10, 230 }, { 10, 232 }, { 10, 234 }, { 10, 236 }, { 10, 238 }, + { 10, 240 }, { 10, 242 }, { 10, 244 }, { 10, 246 }, { 10, 248 }, + { 10, 250 }, { 10, 252 }, { 10, 254 }, { 10, 256 }, { 10, 258 }, + { 10, 260 }, { 10, 262 }, { 10, 264 }, { 10, 266 }, { 10, 268 }, + { 10, 270 }, { 10, 272 }, { 10, 274 }, { 10, 276 }, { 10, 278 }, + { 10, 280 }, { 10, 282 }, { 10, 284 }, { 10, 286 }, { 10, 288 }, + { 10, 290 }, { 10, 292 }, { 10, 294 }, { 10, 296 }, { 10, 298 }, + { 10, 300 }, { 10, 302 }, { 10, 304 }, { 10, 306 }, { 10, 308 }, + { 10, 310 }, { 10, 312 }, { 10, 314 }, { 10, 316 }, { 10, 318 }, + { 10, 320 }, { 10, 322 }, { 10, 324 }, { 10, 326 }, { 10, 328 }, + { 10, 330 }, { 10, 332 }, { 10, 334 }, { 10, 336 }, { 10, 338 }, + { 10, 340 }, { 10, 342 }, { 10, 344 }, { 10, 346 }, { 10, 348 }, + { 10, 350 }, { 10, 352 }, { 10, 354 }, { 10, 356 }, { 10, 358 }, + { 10, 360 }, { 10, 362 }, { 10, 364 }, { 10, 366 }, { 10, 368 }, + { 10, 370 }, { 10, 372 }, { 10, 374 }, { 10, 376 }, { 10, 378 }, + { 10, 380 }, { 10, 382 }, { 10, 384 }, { 10, 386 }, { 10, 388 }, + { 10, 390 }, { 10, 392 }, { 10, 394 }, { 10, 396 }, { 10, 398 }, + { 10, 400 }, { 10, 402 }, { 10, 404 }, { 10, 406 }, { 10, 408 }, + { 10, 410 }, { 10, 412 }, { 10, 414 }, { 10, 416 }, { 10, 418 }, + { 10, 420 }, { 10, 422 }, { 10, 424 }, { 10, 426 }, { 10, 428 }, + { 10, 430 }, { 10, 432 }, { 10, 434 }, { 10, 436 }, { 10, 438 }, + { 10, 440 }, { 10, 442 }, { 10, 444 }, { 10, 446 }, { 10, 448 }, + { 10, 450 }, { 10, 452 }, { 10, 454 }, { 10, 456 }, { 10, 458 }, + { 10, 460 }, { 10, 462 }, { 10, 464 }, { 10, 466 }, { 10, 468 }, + { 10, 470 }, { 10, 472 }, { 10, 474 }, { 10, 476 }, { 10, 478 }, + { 10, 480 }, { 10, 482 }, { 10, 484 }, { 10, 486 }, { 10, 488 }, + { 10, 490 }, { 10, 492 }, { 10, 494 }, { 10, 496 }, { 10, 498 }, + { 10, 500 }, { 10, 502 }, { 10, 504 }, { 10, 506 }, { 10, 508 }, + { 10, 510 }, { 10, 512 }, { 10, 514 }, { 10, 516 }, { 10, 518 }, + { 10, 520 }, { 10, 522 }, { 10, 524 }, { 10, 526 }, { 10, 528 }, + { 10, 530 }, { 10, 532 }, { 10, 534 }, { 10, 536 }, { 10, 538 }, + { 10, 540 }, { 10, 542 }, { 10, 544 }, { 10, 546 }, { 10, 548 }, + { 10, 550 }, { 10, 552 }, { 10, 554 }, { 10, 556 }, { 10, 558 }, + { 10, 560 }, { 10, 562 }, { 10, 564 }, { 10, 566 }, { 10, 568 }, + { 10, 570 }, { 10, 572 }, { 10, 574 }, { 10, 576 }, { 10, 578 }, + { 10, 580 }, { 10, 582 }, { 10, 584 }, { 10, 586 }, { 10, 588 }, + { 10, 590 }, { 10, 592 }, { 10, 594 }, { 10, 596 }, { 10, 598 }, + { 10, 600 }, { 10, 602 }, { 10, 604 }, { 10, 606 }, { 10, 608 }, + { 10, 610 }, { 10, 612 }, { 10, 614 }, { 10, 616 }, { 10, 618 }, + { 10, 620 }, { 10, 622 }, { 10, 624 }, { 10, 626 }, { 10, 628 }, + { 10, 630 }, { 10, 632 }, { 10, 634 }, { 10, 636 }, { 10, 638 }, + { 10, 640 }, { 10, 642 }, { 10, 644 }, { 10, 646 }, { 10, 648 }, + { 10, 650 }, { 10, 652 }, { 10, 654 }, { 10, 656 }, { 10, 658 }, + { 10, 660 }, { 10, 662 }, { 10, 664 }, { 10, 666 }, { 10, 668 }, + { 10, 670 }, { 10, 672 }, { 10, 674 }, { 10, 676 }, { 10, 678 }, + { 10, 680 }, { 10, 682 }, { 10, 684 }, { 10, 686 }, { 10, 688 }, + { 10, 690 }, { 10, 692 }, { 10, 694 }, { 10, 696 }, { 10, 698 }, + { 10, 700 }, { 10, 702 }, { 10, 704 }, { 10, 706 }, { 10, 708 }, + { 10, 710 }, { 10, 712 }, { 10, 714 }, { 10, 716 }, { 10, 718 }, + { 10, 720 }, { 10, 722 }, { 10, 724 }, { 10, 726 }, { 10, 728 }, + { 10, 730 }, { 10, 732 }, { 10, 734 }, { 10, 736 }, { 10, 738 }, + { 10, 740 }, { 10, 742 }, { 10, 744 }, { 10, 746 }, { 10, 748 }, + { 10, 750 }, { 10, 752 }, { 10, 754 }, { 10, 756 }, { 10, 758 }, + { 10, 760 }, { 10, 762 }, { 10, 764 }, { 10, 766 }, { 10, 768 }, + { 10, 770 }, { 10, 772 }, { 10, 774 }, { 10, 776 }, { 10, 778 }, + { 10, 780 }, { 10, 782 }, { 10, 784 }, { 10, 786 }, { 10, 788 }, + { 10, 790 }, { 10, 792 }, { 10, 794 }, { 10, 796 }, { 10, 798 }, + { 10, 800 }, { 10, 802 }, { 10, 804 }, { 10, 806 }, { 10, 808 }, + { 10, 810 }, { 10, 812 }, { 10, 814 }, { 10, 816 }, { 10, 818 }, + { 10, 820 }, { 10, 822 }, { 10, 824 }, { 10, 826 }, { 10, 828 }, + { 10, 830 }, { 10, 832 }, { 10, 834 }, { 10, 836 }, { 10, 838 }, + { 10, 840 }, { 10, 842 }, { 10, 844 }, { 10, 846 }, { 10, 848 }, + { 10, 850 }, { 10, 852 }, { 10, 854 }, { 10, 856 }, { 10, 858 }, + { 10, 860 }, { 10, 862 }, { 10, 864 }, { 10, 866 }, { 10, 868 }, + { 10, 870 }, { 10, 872 }, { 10, 874 }, { 10, 876 }, { 10, 878 }, + { 10, 880 }, { 10, 882 }, { 10, 884 }, { 10, 886 }, { 10, 888 }, + { 10, 890 }, { 10, 892 }, { 10, 894 }, { 10, 896 }, { 10, 898 }, + { 10, 900 }, { 10, 902 }, { 10, 904 }, { 10, 906 }, { 10, 908 }, + { 10, 910 }, { 10, 912 }, { 10, 914 }, { 10, 916 }, { 10, 918 }, + { 10, 920 }, { 10, 922 }, { 10, 924 }, { 10, 926 }, { 10, 928 }, + { 10, 930 }, { 10, 932 }, { 10, 934 }, { 10, 936 }, { 10, 938 }, + { 10, 940 }, { 10, 942 }, { 10, 944 }, { 10, 946 }, { 10, 948 }, + { 10, 950 }, { 10, 952 }, { 10, 954 }, { 10, 956 }, { 10, 958 }, + { 10, 960 }, { 10, 962 }, { 10, 964 }, { 10, 966 }, { 10, 968 }, + { 10, 970 }, { 10, 972 }, { 10, 974 }, { 10, 976 }, { 10, 978 }, + { 10, 980 }, { 10, 982 }, { 10, 984 }, { 10, 986 }, { 10, 988 }, + { 10, 990 }, { 10, 992 }, { 10, 994 }, { 10, 996 }, { 10, 998 }, + { 10, 1000 }, { 10, 1002 }, { 10, 1004 }, { 10, 1006 }, { 10, 1008 }, + { 10, 1010 }, { 10, 1012 }, { 10, 1014 }, { 10, 1016 }, { 10, 1018 }, + { 10, 1020 }, { 10, 1022 }, { 10, 1024 }, { 10, 1026 }, { 10, 1028 }, + { 10, 1030 }, { 10, 1032 }, { 10, 1034 }, { 10, 1036 }, { 10, 1038 }, + { 10, 1040 }, { 10, 1042 }, { 10, 1044 }, { 10, 1046 }, { 10, 1048 }, + { 10, 1050 }, { 10, 1052 }, { 10, 1054 }, { 10, 1056 }, { 10, 1058 }, + { 10, 1060 }, { 10, 1062 }, { 10, 1064 }, { 10, 1066 }, { 10, 1068 }, + { 10, 1070 }, { 10, 1072 }, { 10, 1074 }, { 10, 1076 }, { 10, 1078 }, + { 10, 1080 }, { 10, 1082 }, { 10, 1084 }, { 10, 1086 }, { 10, 1088 }, + { 10, 1090 }, { 10, 1092 }, { 10, 1094 }, { 10, 1096 }, { 10, 1098 }, + { 10, 1100 }, { 10, 1102 }, { 10, 1104 }, { 10, 1106 }, { 10, 1108 }, + { 10, 1110 }, { 10, 1112 }, { 10, 1114 }, { 10, 1116 }, { 10, 1118 }, + { 10, 1120 }, { 10, 1122 }, { 10, 1124 }, { 10, 1126 }, { 10, 1128 }, + { 10, 1130 }, { 10, 1132 }, { 10, 1134 }, { 10, 1136 }, { 10, 1138 }, + { 10, 1140 }, { 10, 1142 }, { 10, 1144 }, { 10, 1146 }, { 10, 1148 }, + { 10, 1150 }, { 10, 1152 }, { 10, 1154 }, { 10, 1156 }, { 10, 1158 }, + { 10, 1160 }, { 10, 1162 }, { 10, 1164 }, { 10, 1166 }, { 10, 1168 }, + { 10, 1170 }, { 10, 1172 }, { 10, 1174 }, { 10, 1176 }, { 10, 1178 }, + { 10, 1180 }, { 10, 1182 }, { 10, 1184 }, { 10, 1186 }, { 10, 1188 }, + { 10, 1190 }, { 10, 1192 }, { 10, 1194 }, { 10, 1196 }, { 10, 1198 }, + { 10, 1200 }, { 10, 1202 }, { 10, 1204 }, { 10, 1206 }, { 10, 1208 }, + { 10, 1210 }, { 10, 1212 }, { 10, 1214 }, { 10, 1216 }, { 10, 1218 }, + { 10, 1220 }, { 10, 1222 }, { 10, 1224 }, { 10, 1226 }, { 10, 1228 }, + { 10, 1230 }, { 10, 1232 }, { 10, 1234 }, { 10, 1236 }, { 10, 1238 }, + { 10, 1240 }, { 10, 1242 }, { 10, 1244 }, { 10, 1246 }, { 10, 1248 }, + { 10, 1250 }, { 10, 1252 }, { 10, 1254 }, { 10, 1256 }, { 10, 1258 }, + { 10, 1260 }, { 10, 1262 }, { 10, 1264 }, { 10, 1266 }, { 10, 1268 }, + { 10, 1270 }, { 10, 1272 }, { 10, 1274 }, { 10, 1276 }, { 10, 1278 }, + { 10, 1280 }, { 10, 1282 }, { 10, 1284 }, { 10, 1286 }, { 10, 1288 }, + { 10, 1290 }, { 10, 1292 }, { 10, 1294 }, { 10, 1296 }, { 10, 1298 }, + { 10, 1300 }, { 10, 1302 }, { 10, 1304 }, { 10, 1306 }, { 10, 1308 }, + { 10, 1310 }, { 10, 1312 }, { 10, 1314 }, { 10, 1316 }, { 10, 1318 }, + { 10, 1320 }, { 10, 1322 }, { 10, 1324 }, { 10, 1326 }, { 10, 1328 }, + { 10, 1330 }, { 10, 1332 }, { 10, 1334 }, { 10, 1336 }, { 10, 1338 }, + { 10, 1340 }, { 10, 1342 }, { 10, 1344 }, { 10, 1346 }, { 10, 1348 }, + { 10, 1350 }, { 10, 1352 }, { 10, 1354 }, { 10, 1356 }, { 10, 1358 }, + { 10, 1360 }, { 10, 1362 }, { 10, 1364 }, { 10, 1366 }, { 10, 1368 }, + { 10, 1370 }, { 10, 1372 }, { 10, 1374 }, { 10, 1376 }, { 10, 1378 }, + { 10, 1380 }, { 10, 1382 }, { 10, 1384 }, { 10, 1386 }, { 10, 1388 }, + { 10, 1390 }, { 10, 1392 }, { 10, 1394 }, { 10, 1396 }, { 10, 1398 }, + { 10, 1400 }, { 10, 1402 }, { 10, 1404 }, { 10, 1406 }, { 10, 1408 }, + { 10, 1410 }, { 10, 1412 }, { 10, 1414 }, { 10, 1416 }, { 10, 1418 }, + { 10, 1420 }, { 10, 1422 }, { 10, 1424 }, { 10, 1426 }, { 10, 1428 }, + { 10, 1430 }, { 10, 1432 }, { 10, 1434 }, { 10, 1436 }, { 10, 1438 }, + { 10, 1440 }, { 10, 1442 }, { 10, 1444 }, { 10, 1446 }, { 10, 1448 }, + { 10, 1450 }, { 10, 1452 }, { 10, 1454 }, { 10, 1456 }, { 10, 1458 }, + { 10, 1460 }, { 10, 1462 }, { 10, 1464 }, { 10, 1466 }, { 10, 1468 }, + { 10, 1470 }, { 10, 1472 }, { 10, 1474 }, { 10, 1476 }, { 10, 1478 }, + { 10, 1480 }, { 10, 1482 }, { 10, 1484 }, { 10, 1486 }, { 10, 1488 }, + { 10, 1490 }, { 10, 1492 }, { 10, 1494 }, { 10, 1496 }, { 10, 1498 }, + { 10, 1500 }, { 10, 1502 }, { 10, 1504 }, { 10, 1506 }, { 10, 1508 }, + { 10, 1510 }, { 10, 1512 }, { 10, 1514 }, { 10, 1516 }, { 10, 1518 }, + { 10, 1520 }, { 10, 1522 }, { 10, 1524 }, { 10, 1526 }, { 10, 1528 }, + { 10, 1530 }, { 10, 1532 }, { 10, 1534 }, { 10, 1536 }, { 10, 1538 }, + { 10, 1540 }, { 10, 1542 }, { 10, 1544 }, { 10, 1546 }, { 10, 1548 }, + { 10, 1550 }, { 10, 1552 }, { 10, 1554 }, { 10, 1556 }, { 10, 1558 }, + { 10, 1560 }, { 10, 1562 }, { 10, 1564 }, { 10, 1566 }, { 10, 1568 }, + { 10, 1570 }, { 10, 1572 }, { 10, 1574 }, { 10, 1576 }, { 10, 1578 }, + { 10, 1580 }, { 10, 1582 }, { 10, 1584 }, { 10, 1586 }, { 10, 1588 }, + { 10, 1590 }, { 10, 1592 }, { 10, 1594 }, { 10, 1596 }, { 10, 1598 }, + { 10, 1600 }, { 10, 1602 }, { 10, 1604 }, { 10, 1606 }, { 10, 1608 }, + { 10, 1610 }, { 10, 1612 }, { 10, 1614 }, { 10, 1616 }, { 10, 1618 }, + { 10, 1620 }, { 10, 1622 }, { 10, 1624 }, { 10, 1626 }, { 10, 1628 }, + { 10, 1630 }, { 10, 1632 }, { 10, 1634 }, { 10, 1636 }, { 10, 1638 }, + { 10, 1640 }, { 10, 1642 }, { 10, 1644 }, { 10, 1646 }, { 10, 1648 }, + { 10, 1650 }, { 10, 1652 }, { 10, 1654 }, { 10, 1656 }, { 10, 1658 }, + { 10, 1660 }, { 10, 1662 }, { 10, 1664 }, { 10, 1666 }, { 10, 1668 }, + { 10, 1670 }, { 10, 1672 }, { 10, 1674 }, { 10, 1676 }, { 10, 1678 }, + { 10, 1680 }, { 10, 1682 }, { 10, 1684 }, { 10, 1686 }, { 10, 1688 }, + { 10, 1690 }, { 10, 1692 }, { 10, 1694 }, { 10, 1696 }, { 10, 1698 }, + { 10, 1700 }, { 10, 1702 }, { 10, 1704 }, { 10, 1706 }, { 10, 1708 }, + { 10, 1710 }, { 10, 1712 }, { 10, 1714 }, { 10, 1716 }, { 10, 1718 }, + { 10, 1720 }, { 10, 1722 }, { 10, 1724 }, { 10, 1726 }, { 10, 1728 }, + { 10, 1730 }, { 10, 1732 }, { 10, 1734 }, { 10, 1736 }, { 10, 1738 }, + { 10, 1740 }, { 10, 1742 }, { 10, 1744 }, { 10, 1746 }, { 10, 1748 }, + { 10, 1750 }, { 10, 1752 }, { 10, 1754 }, { 10, 1756 }, { 10, 1758 }, + { 10, 1760 }, { 10, 1762 }, { 10, 1764 }, { 10, 1766 }, { 10, 1768 }, + { 10, 1770 }, { 10, 1772 }, { 10, 1774 }, { 10, 1776 }, { 10, 1778 }, + { 10, 1780 }, { 10, 1782 }, { 10, 1784 }, { 10, 1786 }, { 10, 1788 }, + { 10, 1790 }, { 10, 1792 }, { 10, 1794 }, { 10, 1796 }, { 10, 1798 }, + { 10, 1800 }, { 10, 1802 }, { 10, 1804 }, { 10, 1806 }, { 10, 1808 }, + { 10, 1810 }, { 10, 1812 }, { 10, 1814 }, { 10, 1816 }, { 10, 1818 }, + { 10, 1820 }, { 10, 1822 }, { 10, 1824 }, { 10, 1826 }, { 10, 1828 }, + { 10, 1830 }, { 10, 1832 }, { 10, 1834 }, { 10, 1836 }, { 10, 1838 }, + { 10, 1840 }, { 10, 1842 }, { 10, 1844 }, { 10, 1846 }, { 10, 1848 }, + { 10, 1850 }, { 10, 1852 }, { 10, 1854 }, { 10, 1856 }, { 10, 1858 }, + { 10, 1860 }, { 10, 1862 }, { 10, 1864 }, { 10, 1866 }, { 10, 1868 }, + { 10, 1870 }, { 10, 1872 }, { 10, 1874 }, { 10, 1876 }, { 10, 1878 }, + { 10, 1880 }, { 10, 1882 }, { 10, 1884 }, { 10, 1886 }, { 10, 1888 }, + { 10, 1890 }, { 10, 1892 }, { 10, 1894 }, { 10, 1896 }, { 10, 1898 }, + { 10, 1900 }, { 10, 1902 }, { 10, 1904 }, { 10, 1906 }, { 10, 1908 }, + { 10, 1910 }, { 10, 1912 }, { 10, 1914 }, { 10, 1916 }, { 10, 1918 }, + { 10, 1920 }, { 10, 1922 }, { 10, 1924 }, { 10, 1926 }, { 10, 1928 }, + { 10, 1930 }, { 10, 1932 }, { 10, 1934 }, { 10, 1936 }, { 10, 1938 }, + { 10, 1940 }, { 10, 1942 }, { 10, 1944 }, { 10, 1946 }, { 10, 1948 }, + { 10, 1950 }, { 10, 1952 }, { 10, 1954 }, { 10, 1956 }, { 10, 1958 }, + { 10, 1960 }, { 10, 1962 }, { 10, 1964 }, { 10, 1966 }, { 10, 1968 }, + { 10, 1970 }, { 10, 1972 }, { 10, 1974 }, { 10, 1976 }, { 10, 1978 }, + { 10, 1980 }, { 10, 1982 }, { 10, 1984 }, { 10, 1986 }, { 10, 1988 }, + { 10, 1990 }, { 10, 1992 }, { 10, 1994 }, { 10, 1996 }, { 10, 1998 }, + { 10, 2000 }, { 10, 2002 }, { 10, 2004 }, { 10, 2006 }, { 10, 2008 }, + { 10, 2010 }, { 10, 2012 }, { 10, 2014 }, { 10, 2016 }, { 10, 2018 }, + { 10, 2020 }, { 10, 2022 }, { 10, 2024 }, { 10, 2026 }, { 10, 2028 }, + { 10, 2030 }, { 10, 2032 }, { 10, 2034 }, { 10, 2036 }, { 10, 2038 }, + { 10, 2040 }, { 10, 2042 }, { 10, 2044 }, { 10, 2046 }, { 10, 2048 }, + { 10, 2050 }, { 10, 2052 }, { 10, 2054 }, { 10, 2056 }, { 10, 2058 }, + { 10, 2060 }, { 10, 2062 }, { 10, 2064 }, { 10, 2066 }, { 10, 2068 }, + { 10, 2070 }, { 10, 2072 }, { 10, 2074 }, { 10, 2076 }, { 10, 2078 }, + { 10, 2080 }, { 10, 2082 }, { 10, 2084 }, { 10, 2086 }, { 10, 2088 }, + { 10, 2090 }, { 10, 2092 }, { 10, 2094 }, { 10, 2096 }, { 10, 2098 }, + { 10, 2100 }, { 10, 2102 }, { 10, 2104 }, { 10, 2106 }, { 10, 2108 }, + { 10, 2110 }, { 10, 2112 }, { 10, 2114 }, { 10, 2116 }, { 10, 2118 }, + { 10, 2120 }, { 10, 2122 }, { 10, 2124 }, { 10, 2126 }, { 10, 2128 }, + { 10, 2130 }, { 10, 2132 }, { 10, 2134 }, { 10, 2136 }, { 10, 2138 }, + { 10, 2140 }, { 10, 2142 }, { 10, 2144 }, { 10, 2146 }, { 10, 2148 }, + { 10, 2150 }, { 10, 2152 }, { 10, 2154 }, { 10, 2156 }, { 10, 2158 }, + { 10, 2160 }, { 10, 2162 }, { 10, 2164 }, { 10, 2166 }, { 10, 2168 }, + { 10, 2170 }, { 10, 2172 }, { 10, 2174 }, { 10, 2176 }, { 10, 2178 }, + { 10, 2180 }, { 10, 2182 }, { 10, 2184 }, { 10, 2186 }, { 10, 2188 }, + { 10, 2190 }, { 10, 2192 }, { 10, 2194 }, { 10, 2196 }, { 10, 2198 }, + { 10, 2200 }, { 10, 2202 }, { 10, 2204 }, { 10, 2206 }, { 10, 2208 }, + { 10, 2210 }, { 10, 2212 }, { 10, 2214 }, { 10, 2216 }, { 10, 2218 }, + { 10, 2220 }, { 10, 2222 }, { 10, 2224 }, { 10, 2226 }, { 10, 2228 }, + { 10, 2230 }, { 10, 2232 }, { 10, 2234 }, { 10, 2236 }, { 10, 2238 }, + { 10, 2240 }, { 10, 2242 }, { 10, 2244 }, { 10, 2246 }, { 10, 2248 }, + { 10, 2250 }, { 10, 2252 }, { 10, 2254 }, { 10, 2256 }, { 10, 2258 }, + { 10, 2260 }, { 10, 2262 }, { 10, 2264 }, { 10, 2266 }, { 10, 2268 }, + { 10, 2270 }, { 10, 2272 }, { 10, 2274 }, { 10, 2276 }, { 10, 2278 }, + { 10, 2280 }, { 10, 2282 }, { 10, 2284 }, { 10, 2286 }, { 10, 2288 }, + { 10, 2290 }, { 10, 2292 }, { 10, 2294 }, { 10, 2296 }, { 10, 2298 }, + { 10, 2300 }, { 10, 2302 }, { 10, 2304 }, { 10, 2306 }, { 10, 2308 }, + { 10, 2310 }, { 10, 2312 }, { 10, 2314 }, { 10, 2316 }, { 10, 2318 }, + { 10, 2320 }, { 10, 2322 }, { 10, 2324 }, { 10, 2326 }, { 10, 2328 }, + { 10, 2330 }, { 10, 2332 }, { 10, 2334 }, { 10, 2336 }, { 10, 2338 }, + { 10, 2340 }, { 10, 2342 }, { 10, 2344 }, { 10, 2346 }, { 10, 2348 }, + { 10, 2350 }, { 10, 2352 }, { 10, 2354 }, { 10, 2356 }, { 10, 2358 }, + { 10, 2360 }, { 10, 2362 }, { 10, 2364 }, { 10, 2366 }, { 10, 2368 }, + { 10, 2370 }, { 10, 2372 }, { 10, 2374 }, { 10, 2376 }, { 10, 2378 }, + { 10, 2380 }, { 10, 2382 }, { 10, 2384 }, { 10, 2386 }, { 10, 2388 }, + { 10, 2390 }, { 10, 2392 }, { 10, 2394 }, { 10, 2396 }, { 10, 2398 }, + { 10, 2400 }, { 10, 2402 }, { 10, 2404 }, { 10, 2406 }, { 10, 2408 }, + { 10, 2410 }, { 10, 2412 }, { 10, 2414 }, { 10, 2416 }, { 10, 2418 }, + { 10, 2420 }, { 10, 2422 }, { 10, 2424 }, { 10, 2426 }, { 10, 2428 }, + { 10, 2430 }, { 10, 2432 }, { 10, 2434 }, { 10, 2436 }, { 10, 2438 }, + { 10, 2440 }, { 10, 2442 }, { 10, 2444 }, { 10, 2446 }, { 10, 2448 }, + { 10, 2450 }, { 10, 2452 }, { 10, 2454 }, { 10, 2456 }, { 10, 2458 }, + { 10, 2460 }, { 10, 2462 }, { 10, 2464 }, { 10, 2466 }, { 10, 2468 }, + { 10, 2470 }, { 10, 2472 }, { 10, 2474 }, { 10, 2476 }, { 10, 2478 }, + { 10, 2480 }, { 10, 2482 }, { 10, 2484 }, { 10, 2486 }, { 10, 2488 }, + { 10, 2490 }, { 10, 2492 }, { 10, 2494 }, { 10, 2496 }, { 10, 2498 }, + { 10, 2500 }, { 10, 2502 }, { 10, 2504 }, { 10, 2506 }, { 10, 2508 }, + { 10, 2510 }, { 10, 2512 }, { 10, 2514 }, { 10, 2516 }, { 10, 2518 }, + { 10, 2520 }, { 10, 2522 }, { 10, 2524 }, { 10, 2526 }, { 10, 2528 }, + { 10, 2530 }, { 10, 2532 }, { 10, 2534 }, { 10, 2536 }, { 10, 2538 }, + { 10, 2540 }, { 10, 2542 }, { 10, 2544 }, { 10, 2546 }, { 10, 2548 }, + { 10, 2550 }, { 10, 2552 }, { 10, 2554 }, { 10, 2556 }, { 10, 2558 }, + { 10, 2560 }, { 10, 2562 }, { 10, 2564 }, { 10, 2566 }, { 10, 2568 }, + { 10, 2570 }, { 10, 2572 }, { 10, 2574 }, { 10, 2576 }, { 10, 2578 }, + { 10, 2580 }, { 10, 2582 }, { 10, 2584 }, { 10, 2586 }, { 10, 2588 }, + { 10, 2590 }, { 10, 2592 }, { 10, 2594 }, { 10, 2596 }, { 10, 2598 }, + { 10, 2600 }, { 10, 2602 }, { 10, 2604 }, { 10, 2606 }, { 10, 2608 }, + { 10, 2610 }, { 10, 2612 }, { 10, 2614 }, { 10, 2616 }, { 10, 2618 }, + { 10, 2620 }, { 10, 2622 }, { 10, 2624 }, { 10, 2626 }, { 10, 2628 }, + { 10, 2630 }, { 10, 2632 }, { 10, 2634 }, { 10, 2636 }, { 10, 2638 }, + { 10, 2640 }, { 10, 2642 }, { 10, 2644 }, { 10, 2646 }, { 10, 2648 }, + { 10, 2650 }, { 10, 2652 }, { 10, 2654 }, { 10, 2656 }, { 10, 2658 }, + { 10, 2660 }, { 10, 2662 }, { 10, 2664 }, { 10, 2666 }, { 10, 2668 }, + { 10, 2670 }, { 10, 2672 }, { 10, 2674 }, { 10, 2676 }, { 10, 2678 }, + { 10, 2680 }, { 10, 2682 }, { 10, 2684 }, { 10, 2686 }, { 10, 2688 }, + { 10, 2690 }, { 10, 2692 }, { 10, 2694 }, { 10, 2696 }, { 10, 2698 }, + { 10, 2700 }, { 10, 2702 }, { 10, 2704 }, { 10, 2706 }, { 10, 2708 }, + { 10, 2710 }, { 10, 2712 }, { 10, 2714 }, { 10, 2716 }, { 10, 2718 }, + { 10, 2720 }, { 10, 2722 }, { 10, 2724 }, { 10, 2726 }, { 10, 2728 }, + { 10, 2730 }, { 10, 2732 }, { 10, 2734 }, { 10, 2736 }, { 10, 2738 }, + { 10, 2740 }, { 10, 2742 }, { 10, 2744 }, { 10, 2746 }, { 10, 2748 }, + { 10, 2750 }, { 10, 2752 }, { 10, 2754 }, { 10, 2756 }, { 10, 2758 }, + { 10, 2760 }, { 10, 2762 }, { 10, 2764 }, { 10, 2766 }, { 10, 2768 }, + { 10, 2770 }, { 10, 2772 }, { 10, 2774 }, { 10, 2776 }, { 10, 2778 }, + { 10, 2780 }, { 10, 2782 }, { 10, 2784 }, { 10, 2786 }, { 10, 2788 }, + { 10, 2790 }, { 10, 2792 }, { 10, 2794 }, { 10, 2796 }, { 10, 2798 }, + { 10, 2800 }, { 10, 2802 }, { 10, 2804 }, { 10, 2806 }, { 10, 2808 }, + { 10, 2810 }, { 10, 2812 }, { 10, 2814 }, { 10, 2816 }, { 10, 2818 }, + { 10, 2820 }, { 10, 2822 }, { 10, 2824 }, { 10, 2826 }, { 10, 2828 }, + { 10, 2830 }, { 10, 2832 }, { 10, 2834 }, { 10, 2836 }, { 10, 2838 }, + { 10, 2840 }, { 10, 2842 }, { 10, 2844 }, { 10, 2846 }, { 10, 2848 }, + { 10, 2850 }, { 10, 2852 }, { 10, 2854 }, { 10, 2856 }, { 10, 2858 }, + { 10, 2860 }, { 10, 2862 }, { 10, 2864 }, { 10, 2866 }, { 10, 2868 }, + { 10, 2870 }, { 10, 2872 }, { 10, 2874 }, { 10, 2876 }, { 10, 2878 }, + { 10, 2880 }, { 10, 2882 }, { 10, 2884 }, { 10, 2886 }, { 10, 2888 }, + { 10, 2890 }, { 10, 2892 }, { 10, 2894 }, { 10, 2896 }, { 10, 2898 }, + { 10, 2900 }, { 10, 2902 }, { 10, 2904 }, { 10, 2906 }, { 10, 2908 }, + { 10, 2910 }, { 10, 2912 }, { 10, 2914 }, { 10, 2916 }, { 10, 2918 }, + { 10, 2920 }, { 10, 2922 }, { 10, 2924 }, { 10, 2926 }, { 10, 2928 }, + { 10, 2930 }, { 10, 2932 }, { 10, 2934 }, { 10, 2936 }, { 10, 2938 }, + { 10, 2940 }, { 10, 2942 }, { 10, 2944 }, { 10, 2946 }, { 10, 2948 }, + { 10, 2950 }, { 10, 2952 }, { 10, 2954 }, { 10, 2956 }, { 10, 2958 }, + { 10, 2960 }, { 10, 2962 }, { 10, 2964 }, { 10, 2966 }, { 10, 2968 }, + { 10, 2970 }, { 10, 2972 }, { 10, 2974 }, { 10, 2976 }, { 10, 2978 }, + { 10, 2980 }, { 10, 2982 }, { 10, 2984 }, { 10, 2986 }, { 10, 2988 }, + { 10, 2990 }, { 10, 2992 }, { 10, 2994 }, { 10, 2996 }, { 10, 2998 }, + { 10, 3000 }, { 10, 3002 }, { 10, 3004 }, { 10, 3006 }, { 10, 3008 }, + { 10, 3010 }, { 10, 3012 }, { 10, 3014 }, { 10, 3016 }, { 10, 3018 }, + { 10, 3020 }, { 10, 3022 }, { 10, 3024 }, { 10, 3026 }, { 10, 3028 }, + { 10, 3030 }, { 10, 3032 }, { 10, 3034 }, { 10, 3036 }, { 10, 3038 }, + { 10, 3040 }, { 10, 3042 }, { 10, 3044 }, { 10, 3046 }, { 10, 3048 }, + { 10, 3050 }, { 10, 3052 }, { 10, 3054 }, { 10, 3056 }, { 10, 3058 }, + { 10, 3060 }, { 10, 3062 }, { 10, 3064 }, { 10, 3066 }, { 10, 3068 }, + { 10, 3070 }, { 10, 3072 }, { 10, 3074 }, { 10, 3076 }, { 10, 3078 }, + { 10, 3080 }, { 10, 3082 }, { 10, 3084 }, { 10, 3086 }, { 10, 3088 }, + { 10, 3090 }, { 10, 3092 }, { 10, 3094 }, { 10, 3096 }, { 10, 3098 }, + { 10, 3100 }, { 10, 3102 }, { 10, 3104 }, { 10, 3106 }, { 10, 3108 }, + { 10, 3110 }, { 10, 3112 }, { 10, 3114 }, { 10, 3116 }, { 10, 3118 }, + { 10, 3120 }, { 10, 3122 }, { 10, 3124 }, { 10, 3126 }, { 10, 3128 }, + { 10, 3130 }, { 10, 3132 }, { 10, 3134 }, { 10, 3136 }, { 10, 3138 }, + { 10, 3140 }, { 10, 3142 }, { 10, 3144 }, { 10, 3146 }, { 10, 3148 }, + { 10, 3150 }, { 10, 3152 }, { 10, 3154 }, { 10, 3156 }, { 10, 3158 }, + { 10, 3160 }, { 10, 3162 }, { 10, 3164 }, { 10, 3166 }, { 10, 3168 }, + { 10, 3170 }, { 10, 3172 }, { 10, 3174 }, { 10, 3176 }, { 10, 3178 }, + { 10, 3180 }, { 10, 3182 }, { 10, 3184 }, { 10, 3186 }, { 10, 3188 }, + { 10, 3190 }, { 10, 3192 }, { 10, 3194 }, { 10, 3196 }, { 10, 3198 }, + { 10, 3200 }, { 10, 3202 }, { 10, 3204 }, { 10, 3206 }, { 10, 3208 }, + { 10, 3210 }, { 10, 3212 }, { 10, 3214 }, { 10, 3216 }, { 10, 3218 }, + { 10, 3220 }, { 10, 3222 }, { 10, 3224 }, { 10, 3226 }, { 10, 3228 }, + { 10, 3230 }, { 10, 3232 }, { 10, 3234 }, { 10, 3236 }, { 10, 3238 }, + { 10, 3240 }, { 10, 3242 }, { 10, 3244 }, { 10, 3246 }, { 10, 3248 }, + { 10, 3250 }, { 10, 3252 }, { 10, 3254 }, { 10, 3256 }, { 10, 3258 }, + { 10, 3260 }, { 10, 3262 }, { 10, 3264 }, { 10, 3266 }, { 10, 3268 }, + { 10, 3270 }, { 10, 3272 }, { 10, 3274 }, { 10, 3276 }, { 10, 3278 }, + { 10, 3280 }, { 10, 3282 }, { 10, 3284 }, { 10, 3286 }, { 10, 3288 }, + { 10, 3290 }, { 10, 3292 }, { 10, 3294 }, { 10, 3296 }, { 10, 3298 }, + { 10, 3300 }, { 10, 3302 }, { 10, 3304 }, { 10, 3306 }, { 10, 3308 }, + { 10, 3310 }, { 10, 3312 }, { 10, 3314 }, { 10, 3316 }, { 10, 3318 }, + { 10, 3320 }, { 10, 3322 }, { 10, 3324 }, { 10, 3326 }, { 10, 3328 }, + { 10, 3330 }, { 10, 3332 }, { 10, 3334 }, { 10, 3336 }, { 10, 3338 }, + { 10, 3340 }, { 10, 3342 }, { 10, 3344 }, { 10, 3346 }, { 10, 3348 }, + { 10, 3350 }, { 10, 3352 }, { 10, 3354 }, { 10, 3356 }, { 10, 3358 }, + { 10, 3360 }, { 10, 3362 }, { 10, 3364 }, { 10, 3366 }, { 10, 3368 }, + { 10, 3370 }, { 10, 3372 }, { 10, 3374 }, { 10, 3376 }, { 10, 3378 }, + { 10, 3380 }, { 10, 3382 }, { 10, 3384 }, { 10, 3386 }, { 10, 3388 }, + { 10, 3390 }, { 10, 3392 }, { 10, 3394 }, { 10, 3396 }, { 10, 3398 }, + { 10, 3400 }, { 10, 3402 }, { 10, 3404 }, { 10, 3406 }, { 10, 3408 }, + { 10, 3410 }, { 10, 3412 }, { 10, 3414 }, { 10, 3416 }, { 10, 3418 }, + { 10, 3420 }, { 10, 3422 }, { 10, 3424 }, { 10, 3426 }, { 10, 3428 }, + { 10, 3430 }, { 10, 3432 }, { 10, 3434 }, { 10, 3436 }, { 10, 3438 }, + { 10, 3440 }, { 10, 3442 }, { 10, 3444 }, { 10, 3446 }, { 10, 3448 }, + { 10, 3450 }, { 10, 3452 }, { 10, 3454 }, { 10, 3456 }, { 10, 3458 }, + { 10, 3460 }, { 10, 3462 }, { 10, 3464 }, { 10, 3466 }, { 10, 3468 }, + { 10, 3470 }, { 10, 3472 }, { 10, 3474 }, { 10, 3476 }, { 10, 3478 }, + { 10, 3480 }, { 10, 3482 }, { 10, 3484 }, { 10, 3486 }, { 10, 3488 }, + { 10, 3490 }, { 10, 3492 }, { 10, 3494 }, { 10, 3496 }, { 10, 3498 }, + { 10, 3500 }, { 10, 3502 }, { 10, 3504 }, { 10, 3506 }, { 10, 3508 }, + { 10, 3510 }, { 10, 3512 }, { 10, 3514 }, { 10, 3516 }, { 10, 3518 }, + { 10, 3520 }, { 10, 3522 }, { 10, 3524 }, { 10, 3526 }, { 10, 3528 }, + { 10, 3530 }, { 10, 3532 }, { 10, 3534 }, { 10, 3536 }, { 10, 3538 }, + { 10, 3540 }, { 10, 3542 }, { 10, 3544 }, { 10, 3546 }, { 10, 3548 }, + { 10, 3550 }, { 10, 3552 }, { 10, 3554 }, { 10, 3556 }, { 10, 3558 }, + { 10, 3560 }, { 10, 3562 }, { 10, 3564 }, { 10, 3566 }, { 10, 3568 }, + { 10, 3570 }, { 10, 3572 }, { 10, 3574 }, { 10, 3576 }, { 10, 3578 }, + { 10, 3580 }, { 10, 3582 }, { 10, 3584 }, { 10, 3586 }, { 10, 3588 }, + { 10, 3590 }, { 10, 3592 }, { 10, 3594 }, { 10, 3596 }, { 10, 3598 }, + { 10, 3600 }, { 10, 3602 }, { 10, 3604 }, { 10, 3606 }, { 10, 3608 }, + { 10, 3610 }, { 10, 3612 }, { 10, 3614 }, { 10, 3616 }, { 10, 3618 }, + { 10, 3620 }, { 10, 3622 }, { 10, 3624 }, { 10, 3626 }, { 10, 3628 }, + { 10, 3630 }, { 10, 3632 }, { 10, 3634 }, { 10, 3636 }, { 10, 3638 }, + { 10, 3640 }, { 10, 3642 }, { 10, 3644 }, { 10, 3646 }, { 10, 3648 }, + { 10, 3650 }, { 10, 3652 }, { 10, 3654 }, { 10, 3656 }, { 10, 3658 }, + { 10, 3660 }, { 10, 3662 }, { 10, 3664 }, { 10, 3666 }, { 10, 3668 }, + { 10, 3670 }, { 10, 3672 }, { 10, 3674 }, { 10, 3676 }, { 10, 3678 }, + { 10, 3680 }, { 10, 3682 }, { 10, 3684 }, { 10, 3686 }, { 10, 3688 }, + { 10, 3690 }, { 10, 3692 }, { 10, 3694 }, { 10, 3696 }, { 10, 3698 }, + { 10, 3700 }, { 10, 3702 }, { 10, 3704 }, { 10, 3706 }, { 10, 3708 }, + { 10, 3710 }, { 10, 3712 }, { 10, 3714 }, { 10, 3716 }, { 10, 3718 }, + { 10, 3720 }, { 10, 3722 }, { 10, 3724 }, { 10, 3726 }, { 10, 3728 }, + { 10, 3730 }, { 10, 3732 }, { 10, 3734 }, { 10, 3736 }, { 10, 3738 }, + { 10, 3740 }, { 10, 3742 }, { 10, 3744 }, { 10, 3746 }, { 10, 3748 }, + { 10, 3750 }, { 10, 3752 }, { 10, 3754 }, { 10, 3756 }, { 10, 3758 }, + { 10, 3760 }, { 10, 3762 }, { 10, 3764 }, { 10, 3766 }, { 10, 3768 }, + { 10, 3770 }, { 10, 3772 }, { 10, 3774 }, { 10, 3776 }, { 10, 3778 }, + { 10, 3780 }, { 10, 3782 }, { 10, 3784 }, { 10, 3786 }, { 10, 3788 }, + { 10, 3790 }, { 10, 3792 }, { 10, 3794 }, { 10, 3796 }, { 10, 3798 }, + { 10, 3800 }, { 10, 3802 }, { 10, 3804 }, { 10, 3806 }, { 10, 3808 }, + { 10, 3810 }, { 10, 3812 }, { 10, 3814 }, { 10, 3816 }, { 10, 3818 }, + { 10, 3820 }, { 10, 3822 }, { 10, 3824 }, { 10, 3826 }, { 10, 3828 }, + { 10, 3830 }, { 10, 3832 }, { 10, 3834 }, { 10, 3836 }, { 10, 3838 }, + { 10, 3840 }, { 10, 3842 }, { 10, 3844 }, { 10, 3846 }, { 10, 3848 }, + { 10, 3850 }, { 10, 3852 }, { 10, 3854 }, { 10, 3856 }, { 10, 3858 }, + { 10, 3860 }, { 10, 3862 }, { 10, 3864 }, { 10, 3866 }, { 10, 3868 }, + { 10, 3870 }, { 10, 3872 }, { 10, 3874 }, { 10, 3876 }, { 10, 3878 }, + { 10, 3880 }, { 10, 3882 }, { 10, 3884 }, { 10, 3886 }, { 10, 3888 }, + { 10, 3890 }, { 10, 3892 }, { 10, 3894 }, { 10, 3896 }, { 10, 3898 }, + { 10, 3900 }, { 10, 3902 }, { 10, 3904 }, { 10, 3906 }, { 10, 3908 }, + { 10, 3910 }, { 10, 3912 }, { 10, 3914 }, { 10, 3916 }, { 10, 3918 }, + { 10, 3920 }, { 10, 3922 }, { 10, 3924 }, { 10, 3926 }, { 10, 3928 }, + { 10, 3930 }, { 10, 3932 }, { 10, 3934 }, { 10, 3936 }, { 10, 3938 }, + { 10, 3940 }, { 10, 3942 }, { 10, 3944 }, { 10, 3946 }, { 10, 3948 }, + { 10, 3950 }, { 10, 3952 }, { 10, 3954 }, { 10, 3956 }, { 10, 3958 }, + { 10, 3960 } +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h new file mode 100644 index 0000000000..a3ab34c8a0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#define VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Generated file, included by entropy.c */ + +static const unsigned int default_coef_counts + [BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = { + + { + /* Block Type ( 0 ) */ + { + /* Coeff Band ( 0 ) */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + }, + { + /* Coeff Band ( 1 ) */ + { 30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593 }, + { 26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987 }, + { 10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104 }, + }, + { + /* Coeff Band ( 2 ) */ + { 25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0 }, + { 9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294 }, + { 1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879 }, + }, + { + /* Coeff Band ( 3 ) */ + { 26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302 }, + { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611 }, + }, + { + /* Coeff Band ( 4 ) */ + { 10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073 }, + { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50 }, + }, + { + /* Coeff Band ( 5 ) */ + { 10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362 }, + { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190 }, + }, + { + /* Coeff Band ( 6 ) */ + { 40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164 }, + { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345 }, + }, + { + /* Coeff Band ( 7 ) */ + { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }, + }, + }, + { + /* Block Type ( 1 ) */ + { + /* Coeff Band ( 0 ) */ + { 3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289 }, + { 8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914 }, + { 9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, + 18620 }, + }, + { + /* Coeff Band ( 1 ) */ + { 12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0 }, + { 11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988 }, + { 7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136 }, + }, + { + /* Coeff Band ( 2 ) */ + { 15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0 }, + { 7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980 }, + { 1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429 }, + }, + { + /* Coeff Band ( 3 ) */ + { 19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820 }, + { 1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679 }, + }, + { + /* Coeff Band ( 4 ) */ + { 12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0 }, + { 4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127 }, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101 }, + }, + { + /* Coeff Band ( 5 ) */ + { 12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0 }, + { 4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157 }, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198 }, + }, + { + /* Coeff Band ( 6 ) */ + { 61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0 }, + { 15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195 }, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507 }, + }, + { + /* Coeff Band ( 7 ) */ + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641 }, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30 }, + }, + }, + { + /* Block Type ( 2 ) */ + { + /* Coeff Band ( 0 ) */ + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798 }, + { 1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837 }, + { 1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122 }, + }, + { + /* Coeff Band ( 1 ) */ + { 1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0 }, + { 1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063 }, + { 1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047 }, + }, + { + /* Coeff Band ( 2 ) */ + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0 }, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404 }, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236 }, + }, + { + /* Coeff Band ( 3 ) */ + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157 }, + { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300 }, + }, + { + /* Coeff Band ( 4 ) */ + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427 }, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 }, + }, + { + /* Coeff Band ( 5 ) */ + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652 }, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30 }, + }, + { + /* Coeff Band ( 6 ) */ + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517 }, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, + }, + { + /* Coeff Band ( 7 ) */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + }, + }, + { + /* Block Type ( 3 ) */ + { + /* Coeff Band ( 0 ) */ + { 2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694 }, + { 8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572 }, + { 11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, + 19284 }, + }, + { + /* Coeff Band ( 1 ) */ + { 9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0 }, + { 12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280 }, + { 10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460 }, + }, + { + /* Coeff Band ( 2 ) */ + { 6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0 }, + { 6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539 }, + { 3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138 }, + }, + { + /* Coeff Band ( 3 ) */ + { 11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0 }, + { 9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181 }, + { 4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267 }, + }, + { + /* Coeff Band ( 4 ) */ + { 4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0 }, + { 3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401 }, + { 1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268 }, + }, + { + /* Coeff Band ( 5 ) */ + { 8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0 }, + { 3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811 }, + { 1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527 }, + }, + { + /* Coeff Band ( 6 ) */ + { 27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0 }, + { 5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954 }, + { 1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979 }, + }, + { + /* Coeff Band ( 7 ) */ + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459 }, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 }, + }, + }, + }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.c b/media/libvpx/libvpx/vp8/encoder/denoising.c new file mode 100644 index 0000000000..a666bca4d2 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/denoising.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "denoising.h" + +#include "vp8/common/reconinter.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8_rtcd.h" + +static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25; +/* SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming + * var(noise) ~= 100. + */ +static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20; +static const unsigned int SSE_THRESHOLD = 16 * 16 * 40; +static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 80; + +/* + * The filter function was modified to reduce the computational complexity. + * Step 1: + * Instead of applying tap coefficients for each pixel, we calculated the + * pixel adjustments vs. pixel diff value ahead of time. + * adjustment = filtered_value - current_raw + * = (filter_coefficient * diff + 128) >> 8 + * where + * filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3)); + * filter_coefficient += filter_coefficient / + * (3 + motion_magnitude_adjustment); + * filter_coefficient is clamped to 0 ~ 255. + * + * Step 2: + * The adjustment vs. diff curve becomes flat very quick when diff increases. + * This allowed us to use only several levels to approximate the curve without + * changing the filtering algorithm too much. + * The adjustments were further corrected by checking the motion magnitude. + * The levels used are: + * diff adjustment w/o motion correction adjustment w/ motion correction + * [-255, -16] -6 -7 + * [-15, -8] -4 -5 + * [-7, -4] -3 -4 + * [-3, 3] diff diff + * [4, 7] 3 4 + * [8, 15] 4 5 + * [16, 255] 6 7 + */ + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, + unsigned char *running_avg_y, int avg_y_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_y_start = running_avg_y; + unsigned char *sig_start = sig; + int sum_diff_thresh; + int r, c; + int sum_diff = 0; + int adj_val[3] = { 3, 4, 6 }; + int shift_inc1 = 0; + int shift_inc2 = 1; + int col_sum[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level. Add another increment for + * blocks that are labeled for increase denoising. */ + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc1 = 1; + shift_inc2 = 2; + } + adj_val[0] += shift_inc2; + adj_val[1] += shift_inc2; + adj_val[2] += shift_inc2; + } + + for (r = 0; r < 16; ++r) { + for (c = 0; c < 16; ++c) { + int diff = 0; + int adjustment = 0; + int absdiff = 0; + + diff = mc_running_avg_y[c] - sig[c]; + absdiff = abs(diff); + + // When |diff| <= |3 + shift_inc1|, use pixel value from + // last denoised raw. + if (absdiff <= 3 + shift_inc1) { + running_avg_y[c] = mc_running_avg_y[c]; + col_sum[c] += diff; + } else { + if (absdiff >= 4 + shift_inc1 && absdiff <= 7) { + adjustment = adj_val[0]; + } else if (absdiff >= 8 && absdiff <= 15) { + adjustment = adj_val[1]; + } else { + adjustment = adj_val[2]; + } + + if (diff > 0) { + if ((sig[c] + adjustment) > 255) { + running_avg_y[c] = 255; + } else { + running_avg_y[c] = sig[c] + adjustment; + } + + col_sum[c] += adjustment; + } else { + if ((sig[c] - adjustment) < 0) { + running_avg_y[c] = 0; + } else { + running_avg_y[c] = sig[c] - adjustment; + } + + col_sum[c] -= adjustment; + } + } + } + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + + for (c = 0; c < 16; ++c) { + // Below we clip the value in the same way which SSE code use. + // When adopting aggressive denoiser, the adj_val for each pixel + // could be at most 8 (this is current max adjustment of the map). + // In SSE code, we calculate the sum of adj_val for + // the columns, so the sum could be up to 128(16 rows). However, + // the range of the value is -128 ~ 127 in SSE code, that's why + // we do this change in C code. + // We don't do this for UV denoiser, since there are only 8 rows, + // and max adjustments <= 8, so the sum of the columns will not + // exceed 64. + if (col_sum[c] >= 128) { + col_sum[c] = 127; + } + sum_diff += col_sum[c]; + } + + sum_diff_thresh = SUM_DIFF_THRESHOLD; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), check + // if we can still apply some (weaker) temporal filtering to this block, + // that would otherwise not be denoised at all. Simplest is to apply + // an additional adjustment to running_avg_y to bring it closer to sig. + // The adjustment is capped by a maximum delta, and chosen such that + // in most cases the resulting sum_diff will be within the + // accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over threshold. + int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + sig -= sig_stride * 16; + mc_running_avg_y -= mc_avg_y_stride * 16; + running_avg_y -= avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + for (c = 0; c < 16; ++c) { + int diff = mc_running_avg_y[c] - sig[c]; + int adjustment = abs(diff); + if (adjustment > delta) adjustment = delta; + if (diff > 0) { + // Bring denoised signal down. + if (running_avg_y[c] - adjustment < 0) { + running_avg_y[c] = 0; + } else { + running_avg_y[c] = running_avg_y[c] - adjustment; + } + col_sum[c] -= adjustment; + } else if (diff < 0) { + // Bring denoised signal up. + if (running_avg_y[c] + adjustment > 255) { + running_avg_y[c] = 255; + } else { + running_avg_y[c] = running_avg_y[c] + adjustment; + } + col_sum[c] += adjustment; + } + } + // TODO(marpan): Check here if abs(sum_diff) has gone below the + // threshold sum_diff_thresh, and if so, we can exit the row loop. + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + + sum_diff = 0; + for (c = 0; c < 16; ++c) { + if (col_sum[c] >= 128) { + col_sum[c] = 127; + } + sum_diff += col_sum[c]; + } + + if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; + } else { + return COPY_BLOCK; + } + } + + vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, + unsigned char *running_avg, int avg_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_start = running_avg; + unsigned char *sig_start = sig; + int sum_diff_thresh; + int r, c; + int sum_diff = 0; + int sum_block = 0; + int adj_val[3] = { 3, 4, 6 }; + int shift_inc1 = 0; + int shift_inc2 = 1; + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level. Add another increment for + * blocks that are labeled for increase denoising. */ + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) { + if (increase_denoising) { + shift_inc1 = 1; + shift_inc2 = 2; + } + adj_val[0] += shift_inc2; + adj_val[1] += shift_inc2; + adj_val[2] += shift_inc2; + } + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + sum_block += sig[c]; + } + sig += sig_stride; + } + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + + sig -= sig_stride * 8; + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + int diff = 0; + int adjustment = 0; + int absdiff = 0; + + diff = mc_running_avg[c] - sig[c]; + absdiff = abs(diff); + + // When |diff| <= |3 + shift_inc1|, use pixel value from + // last denoised raw. + if (absdiff <= 3 + shift_inc1) { + running_avg[c] = mc_running_avg[c]; + sum_diff += diff; + } else { + if (absdiff >= 4 && absdiff <= 7) { + adjustment = adj_val[0]; + } else if (absdiff >= 8 && absdiff <= 15) { + adjustment = adj_val[1]; + } else { + adjustment = adj_val[2]; + } + if (diff > 0) { + if ((sig[c] + adjustment) > 255) { + running_avg[c] = 255; + } else { + running_avg[c] = sig[c] + adjustment; + } + sum_diff += adjustment; + } else { + if ((sig[c] - adjustment) < 0) { + running_avg[c] = 0; + } else { + running_avg[c] = sig[c] - adjustment; + } + sum_diff -= adjustment; + } + } + } + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; + } + + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), check + // if we can still apply some (weaker) temporal filtering to this block, + // that would otherwise not be denoised at all. Simplest is to apply + // an additional adjustment to running_avg_y to bring it closer to sig. + // The adjustment is capped by a maximum delta, and chosen such that + // in most cases the resulting sum_diff will be within the + // accceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over threshold. + int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + sig -= sig_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + int diff = mc_running_avg[c] - sig[c]; + int adjustment = abs(diff); + if (adjustment > delta) adjustment = delta; + if (diff > 0) { + // Bring denoised signal down. + if (running_avg[c] - adjustment < 0) { + running_avg[c] = 0; + } else { + running_avg[c] = running_avg[c] - adjustment; + } + sum_diff -= adjustment; + } else if (diff < 0) { + // Bring denoised signal up. + if (running_avg[c] + adjustment > 255) { + running_avg[c] = 255; + } else { + running_avg[c] = running_avg[c] + adjustment; + } + sum_diff += adjustment; + } + } + // TODO(marpan): Check here if abs(sum_diff) has gone below the + // threshold sum_diff_thresh, and if so, we can exit the row loop. + sig += sig_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; + } + if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; + } else { + return COPY_BLOCK; + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} + +void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) { + assert(mode > 0); // Denoiser is allocated only if mode > 0. + if (mode == 1) { + denoiser->denoiser_mode = kDenoiserOnYOnly; + } else if (mode == 2) { + denoiser->denoiser_mode = kDenoiserOnYUV; + } else if (mode == 3) { + denoiser->denoiser_mode = kDenoiserOnYUVAggressive; + } else { + denoiser->denoiser_mode = kDenoiserOnYUV; + } + if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) { + denoiser->denoise_pars.scale_sse_thresh = 1; + denoiser->denoise_pars.scale_motion_thresh = 8; + denoiser->denoise_pars.scale_increase_filter = 0; + denoiser->denoise_pars.denoise_mv_bias = 95; + denoiser->denoise_pars.pickmode_mv_bias = 100; + denoiser->denoise_pars.qp_thresh = 0; + denoiser->denoise_pars.consec_zerolast = UINT_MAX; + denoiser->denoise_pars.spatial_blur = 0; + } else { + denoiser->denoise_pars.scale_sse_thresh = 2; + denoiser->denoise_pars.scale_motion_thresh = 16; + denoiser->denoise_pars.scale_increase_filter = 1; + denoiser->denoise_pars.denoise_mv_bias = 60; + denoiser->denoise_pars.pickmode_mv_bias = 75; + denoiser->denoise_pars.qp_thresh = 80; + denoiser->denoise_pars.consec_zerolast = 15; + denoiser->denoise_pars.spatial_blur = 0; + } +} + +int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, + int num_mb_rows, int num_mb_cols, int mode) { + int i; + assert(denoiser); + denoiser->num_mb_cols = num_mb_cols; + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + denoiser->yv12_running_avg[i].flags = 0; + + if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg[i]), width, + height, VP8BORDERINPIXELS) < 0) { + vp8_denoiser_free(denoiser); + return 1; + } + memset(denoiser->yv12_running_avg[i].buffer_alloc, 0, + denoiser->yv12_running_avg[i].frame_size); + } + denoiser->yv12_mc_running_avg.flags = 0; + + if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width, + height, VP8BORDERINPIXELS) < 0) { + vp8_denoiser_free(denoiser); + return 1; + } + + memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0, + denoiser->yv12_mc_running_avg.frame_size); + + if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width, height, + VP8BORDERINPIXELS) < 0) { + vp8_denoiser_free(denoiser); + return 1; + } + memset(denoiser->yv12_last_source.buffer_alloc, 0, + denoiser->yv12_last_source.frame_size); + + denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); + if (!denoiser->denoise_state) { + vp8_denoiser_free(denoiser); + return 1; + } + memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); + vp8_denoiser_set_parameters(denoiser, mode); + denoiser->nmse_source_diff = 0; + denoiser->nmse_source_diff_count = 0; + denoiser->qp_avg = 0; + // QP threshold below which we can go up to aggressive mode. + denoiser->qp_threshold_up = 80; + // QP threshold above which we can go back down to normal mode. + // For now keep this second threshold high, so not used currently. + denoiser->qp_threshold_down = 128; + // Bitrate thresholds and noise metric (nmse) thresholds for switching to + // aggressive mode. + // TODO(marpan): Adjust thresholds, including effect on resolution. + denoiser->bitrate_threshold = 400000; // (bits/sec). + denoiser->threshold_aggressive_mode = 80; + if (width * height > 1280 * 720) { + denoiser->bitrate_threshold = 3000000; + denoiser->threshold_aggressive_mode = 200; + } else if (width * height > 960 * 540) { + denoiser->bitrate_threshold = 1200000; + denoiser->threshold_aggressive_mode = 120; + } else if (width * height > 640 * 480) { + denoiser->bitrate_threshold = 600000; + denoiser->threshold_aggressive_mode = 100; + } + return 0; +} + +void vp8_denoiser_free(VP8_DENOISER *denoiser) { + int i; + assert(denoiser); + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]); + } + vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg); + vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source); + vpx_free(denoiser->denoise_state); +} + +void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, + unsigned int best_sse, unsigned int zero_mv_sse, + int recon_yoffset, int recon_uvoffset, + loop_filter_info_n *lfi_n, int mb_row, int mb_col, + int block_index, int consec_zero_last) + +{ + int mv_row; + int mv_col; + unsigned int motion_threshold; + unsigned int motion_magnitude2; + unsigned int sse_thresh; + int sse_diff_thresh = 0; + // Spatial loop filter: only applied selectively based on + // temporal filter state of block relative to top/left neighbors. + int apply_spatial_loop_filter = 1; + MV_REFERENCE_FRAME frame = x->best_reference_frame; + MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame; + + enum vp8_denoiser_decision decision = FILTER_BLOCK; + enum vp8_denoiser_decision decision_u = COPY_BLOCK; + enum vp8_denoiser_decision decision_v = COPY_BLOCK; + + if (zero_frame) { + YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame]; + YV12_BUFFER_CONFIG *dst = &denoiser->yv12_mc_running_avg; + YV12_BUFFER_CONFIG saved_pre, saved_dst; + MB_MODE_INFO saved_mbmi; + MACROBLOCKD *filter_xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi; + int sse_diff = 0; + // Bias on zero motion vector sse. + const int zero_bias = denoiser->denoise_pars.denoise_mv_bias; + zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100); + sse_diff = (int)zero_mv_sse - (int)best_sse; + + saved_mbmi = *mbmi; + + /* Use the best MV for the compensation. */ + mbmi->ref_frame = x->best_reference_frame; + mbmi->mode = x->best_sse_inter_mode; + mbmi->mv = x->best_sse_mv; + mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs; + mv_col = x->best_sse_mv.as_mv.col; + mv_row = x->best_sse_mv.as_mv.row; + // Bias to zero_mv if small amount of motion. + // Note sse_diff_thresh is intialized to zero, so this ensures + // we will always choose zero_mv for denoising if + // zero_mv_see <= best_sse (i.e., sse_diff <= 0). + if ((unsigned int)(mv_row * mv_row + mv_col * mv_col) <= + NOISE_MOTION_THRESHOLD) { + sse_diff_thresh = (int)SSE_DIFF_THRESHOLD; + } + + if (frame == INTRA_FRAME || sse_diff <= sse_diff_thresh) { + /* + * Handle intra blocks as referring to last frame with zero motion + * and let the absolute pixel difference affect the filter factor. + * Also consider small amount of motion as being random walk due + * to noise, if it doesn't mean that we get a much bigger error. + * Note that any changes to the mode info only affects the + * denoising. + */ + x->denoise_zeromv = 1; + mbmi->ref_frame = x->best_zeromv_reference_frame; + + src = &denoiser->yv12_running_avg[zero_frame]; + + mbmi->mode = ZEROMV; + mbmi->mv.as_int = 0; + x->best_sse_inter_mode = ZEROMV; + x->best_sse_mv.as_int = 0; + best_sse = zero_mv_sse; + } + + mv_row = x->best_sse_mv.as_mv.row; + mv_col = x->best_sse_mv.as_mv.col; + motion_magnitude2 = mv_row * mv_row + mv_col * mv_col; + motion_threshold = + denoiser->denoise_pars.scale_motion_thresh * NOISE_MOTION_THRESHOLD; + + if (motion_magnitude2 < + denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD) { + x->increase_denoising = 1; + } + + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD; + if (x->increase_denoising) { + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH; + } + + if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold) { + decision = COPY_BLOCK; + } + + // If block is considered skin, don't denoise if the block + // (1) is selected as non-zero motion for current frame, or + // (2) has not been selected as ZERO_LAST mode at least x past frames + // in a row. + // TODO(marpan): Parameter "x" should be varied with framerate. + // In particualar, should be reduced for layers (base layer/LAST). + if (x->is_skin && (consec_zero_last < 2 || motion_magnitude2 > 0)) { + decision = COPY_BLOCK; + } + + if (decision == FILTER_BLOCK) { + saved_pre = filter_xd->pre; + saved_dst = filter_xd->dst; + + /* Compensate the running average. */ + filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset; + filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset; + filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset; + /* Write the compensated running average to the destination buffer. */ + filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset; + filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset; + filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset; + + if (!x->skip) { + vp8_build_inter_predictors_mb(filter_xd); + } else { + vp8_build_inter16x16_predictors_mb( + filter_xd, filter_xd->dst.y_buffer, filter_xd->dst.u_buffer, + filter_xd->dst.v_buffer, filter_xd->dst.y_stride, + filter_xd->dst.uv_stride); + } + filter_xd->pre = saved_pre; + filter_xd->dst = saved_dst; + *mbmi = saved_mbmi; + } + } else { + // zero_frame should always be 1 for real-time mode, as the + // ZEROMV mode is always checked, so we should never go into this branch. + // If case ZEROMV is not checked, then we will force no denoise (COPY). + decision = COPY_BLOCK; + } + + if (decision == FILTER_BLOCK) { + unsigned char *mc_running_avg_y = + denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset; + int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride; + unsigned char *running_avg_y = + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset; + int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride; + + /* Filter. */ + decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride, + running_avg_y, avg_y_stride, x->thismb, 16, + motion_magnitude2, x->increase_denoising); + denoiser->denoise_state[block_index] = + motion_magnitude2 > 0 ? kFilterNonZeroMV : kFilterZeroMV; + // Only denoise UV for zero motion, and if y channel was denoised. + if (denoiser->denoiser_mode != kDenoiserOnYOnly && motion_magnitude2 == 0 && + decision == FILTER_BLOCK) { + unsigned char *mc_running_avg_u = + denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset; + unsigned char *running_avg_u = + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset; + unsigned char *mc_running_avg_v = + denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset; + unsigned char *running_avg_v = + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset; + int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride; + int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride; + int signal_stride = x->block[16].src_stride; + decision_u = vp8_denoiser_filter_uv( + mc_running_avg_u, mc_avg_uv_stride, running_avg_u, avg_uv_stride, + x->block[16].src + *x->block[16].base_src, signal_stride, + motion_magnitude2, 0); + decision_v = vp8_denoiser_filter_uv( + mc_running_avg_v, mc_avg_uv_stride, running_avg_v, avg_uv_stride, + x->block[20].src + *x->block[20].base_src, signal_stride, + motion_magnitude2, 0); + } + } + if (decision == COPY_BLOCK) { + /* No filtering of this block; it differs too much from the predictor, + * or the motion vector magnitude is considered too big. + */ + x->denoise_zeromv = 0; + vp8_copy_mem16x16( + x->thismb, 16, + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + denoiser->yv12_running_avg[INTRA_FRAME].y_stride); + denoiser->denoise_state[block_index] = kNoFilter; + } + if (denoiser->denoiser_mode != kDenoiserOnYOnly) { + if (decision_u == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[16].src + *x->block[16].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + if (decision_v == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[20].src + *x->block[20].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + } + // Option to selectively deblock the denoised signal, for y channel only. + if (apply_spatial_loop_filter) { + loop_filter_info lfi; + int apply_filter_col = 0; + int apply_filter_row = 0; + int apply_filter = 0; + int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride; + int uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride; + + // Fix filter level to some nominal value for now. + int filter_level = 48; + + int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + // Apply filter if there is a difference in the denoiser filter state + // between the current and left/top block, or if non-zero motion vector + // is used for the motion-compensated filtering. + if (mb_col > 0) { + apply_filter_col = + !((denoiser->denoise_state[block_index] == + denoiser->denoise_state[block_index - 1]) && + denoiser->denoise_state[block_index] != kFilterNonZeroMV); + if (apply_filter_col) { + // Filter left vertical edge. + apply_filter = 1; + vp8_loop_filter_mbv( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + NULL, NULL, y_stride, uv_stride, &lfi); + } + } + if (mb_row > 0) { + apply_filter_row = + !((denoiser->denoise_state[block_index] == + denoiser->denoise_state[block_index - denoiser->num_mb_cols]) && + denoiser->denoise_state[block_index] != kFilterNonZeroMV); + if (apply_filter_row) { + // Filter top horizontal edge. + apply_filter = 1; + vp8_loop_filter_mbh( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + NULL, NULL, y_stride, uv_stride, &lfi); + } + } + if (apply_filter) { + // Update the signal block |x|. Pixel changes are only to top and/or + // left boundary pixels: can we avoid full block copy here. + vp8_copy_mem16x16( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + y_stride, x->thismb, 16); + } + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.h b/media/libvpx/libvpx/vp8/encoder/denoising.h new file mode 100644 index 0000000000..51ae3b0ab3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/denoising.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_DENOISING_H_ +#define VPX_VP8_ENCODER_DENOISING_H_ + +#include "block.h" +#include "vp8/common/loopfilter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SUM_DIFF_THRESHOLD 512 +#define SUM_DIFF_THRESHOLD_HIGH 600 +#define MOTION_MAGNITUDE_THRESHOLD (8 * 3) + +#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5) +#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2) +#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8) +#define MOTION_MAGNITUDE_THRESHOLD_UV (8 * 3) + +#define MAX_GF_ARF_DENOISE_RANGE (8) + +enum vp8_denoiser_decision { COPY_BLOCK, FILTER_BLOCK }; + +enum vp8_denoiser_filter_state { kNoFilter, kFilterZeroMV, kFilterNonZeroMV }; + +enum vp8_denoiser_mode { + kDenoiserOff, + kDenoiserOnYOnly, + kDenoiserOnYUV, + kDenoiserOnYUVAggressive, + kDenoiserOnAdaptive +}; + +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. + unsigned int consec_zerolast; + // Threshold for amount of spatial blur on Y channel. 0 means no spatial blur. + unsigned int spatial_blur; +} denoise_params; + +typedef struct vp8_denoiser { + YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES]; + YV12_BUFFER_CONFIG yv12_mc_running_avg; + // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peak. + YV12_BUFFER_CONFIG yv12_last_source; + unsigned char *denoise_state; + int num_mb_cols; + int denoiser_mode; + int threshold_aggressive_mode; + int nmse_source_diff; + int nmse_source_diff_count; + int qp_avg; + int qp_threshold_up; + int qp_threshold_down; + int bitrate_threshold; + denoise_params denoise_pars; +} VP8_DENOISER; + +int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, + int num_mb_rows, int num_mb_cols, int mode); + +void vp8_denoiser_free(VP8_DENOISER *denoiser); + +void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode); + +void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, + unsigned int best_sse, unsigned int zero_mv_sse, + int recon_yoffset, int recon_uvoffset, + loop_filter_info_n *lfi_n, int mb_row, int mb_col, + int block_index, int consec_zero_last); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_DENOISING_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c new file mode 100644 index 0000000000..82c48b13a7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c @@ -0,0 +1,1306 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "bitstream.h" +#include "encodemb.h" +#include "encodemv.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "vp8/common/common.h" +#include "onyx_int.h" +#include "vp8/common/extend.h" +#include "vp8/common/entropymode.h" +#include "vp8/common/quant_common.h" +#include "segmentation.h" +#include "vp8/common/setupintrarecon.h" +#include "encodeintra.h" +#include "vp8/common/reconinter.h" +#include "rdopt.h" +#include "pickinter.h" +#include "vp8/common/findnearmv.h" +#include "vp8/common/invtrans.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING +#include "bitstream.h" +#endif +#include "encodeframe.h" + +extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); + +#ifdef MODE_STATS +unsigned int inter_y_modes[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +unsigned int inter_uv_modes[4] = { 0, 0, 0, 0 }; +unsigned int inter_b_modes[15] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +unsigned int y_modes[5] = { 0, 0, 0, 0, 0 }; +unsigned int uv_modes[4] = { 0, 0, 0, 0 }; +unsigned int b_modes[14] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif + +/* activity_avg must be positive, or flat regions could get a zero weight + * (infinite lambda), which confounds analysis. + * This also avoids the need for divide by zero checks in + * vp8_activity_masking(). + */ +#define VP8_ACTIVITY_AVG_MIN (64) + +/* This is used as a reference when computing the source variance for the + * purposes of activity masking. + * Eventually this should be replaced by custom no-reference routines, + * which will be faster. + */ +static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128 }; + +/* Original activity measure from Tim T's code. */ +static unsigned int tt_activity_measure(MACROBLOCK *x) { + unsigned int act; + unsigned int sse; + /* TODO: This could also be done over smaller areas (8x8), but that would + * require extensive changes elsewhere, as lambda is assumed to be fixed + * over an entire MB in most of the code. + * Another option is to compute four 8x8 variances, and pick a single + * lambda using a non-linear combination (e.g., the smallest, or second + * smallest, etc.). + */ + act = vpx_variance16x16(x->src.y_buffer, x->src.y_stride, VP8_VAR_OFFS, 0, + &sse); + act = act << 4; + + /* If the region is flat, lower the activity some more. */ + if (act < 8 << 12) act = act < 5 << 12 ? act : 5 << 12; + + return act; +} + +/* Measure the activity of the current macroblock + * What we measure here is TBD so abstracted to this function + */ +#define ALT_ACT_MEASURE 1 +static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { + unsigned int mb_activity; + + if (ALT_ACT_MEASURE) { + int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + + /* Or use an alternative. */ + mb_activity = vp8_encode_intra(x, use_dc_pred); + } else { + /* Original activity measure from Tim T's code. */ + mb_activity = tt_activity_measure(x); + } + + if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN; + + return mb_activity; +} + +/* Calculate an "average" mb activity value for the frame */ +#define ACT_MEDIAN 0 +static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { +#if ACT_MEDIAN + /* Find median: Simple n^2 algorithm for experimentation */ + { + unsigned int median; + unsigned int i, j; + unsigned int *sortlist; + unsigned int tmp; + + /* Create a list to sort to */ + CHECK_MEM_ERROR(&cpi->common.error, sortlist, + vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); + + /* Copy map to sort list */ + memcpy(sortlist, cpi->mb_activity_map, + sizeof(unsigned int) * cpi->common.MBs); + + /* Ripple each value down to its correct position */ + for (i = 1; i < cpi->common.MBs; ++i) { + for (j = i; j > 0; j--) { + if (sortlist[j] < sortlist[j - 1]) { + /* Swap values */ + tmp = sortlist[j - 1]; + sortlist[j - 1] = sortlist[j]; + sortlist[j] = tmp; + } else + break; + } + } + + /* Even number MBs so estimate median as mean of two either side. */ + median = (1 + sortlist[cpi->common.MBs >> 1] + + sortlist[(cpi->common.MBs >> 1) + 1]) >> + 1; + + cpi->activity_avg = median; + + vpx_free(sortlist); + } +#else + /* Simple mean for now */ + cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs); +#endif + + if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN) { + cpi->activity_avg = VP8_ACTIVITY_AVG_MIN; + } + + /* Experimental code: return fixed value normalized for several clips */ + if (ALT_ACT_MEASURE) cpi->activity_avg = 100000; +} + +#define USE_ACT_INDEX 0 +#define OUTPUT_NORM_ACT_STATS 0 + +#if USE_ACT_INDEX +/* Calculate and activity index for each mb */ +static void calc_activity_index(VP8_COMP *cpi, MACROBLOCK *x) { + VP8_COMMON *const cm = &cpi->common; + int mb_row, mb_col; + + int64_t act; + int64_t a; + int64_t b; + +#if OUTPUT_NORM_ACT_STATS + FILE *f = fopen("norm_act.stt", "a"); + fprintf(f, "\n%12d\n", cpi->activity_avg); +#endif + + /* Reset pointers to start of activity map */ + x->mb_activity_ptr = cpi->mb_activity_map; + + /* Calculate normalized mb activity number. */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + /* Read activity from the map */ + act = *(x->mb_activity_ptr); + + /* Calculate a normalized activity number */ + a = act + 4 * cpi->activity_avg; + b = 4 * act + cpi->activity_avg; + + if (b >= a) + *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; + else + *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); + +#if OUTPUT_NORM_ACT_STATS + fprintf(f, " %6d", *(x->mb_activity_ptr)); +#endif + /* Increment activity map pointers */ + x->mb_activity_ptr++; + } + +#if OUTPUT_NORM_ACT_STATS + fprintf(f, "\n"); +#endif + } + +#if OUTPUT_NORM_ACT_STATS + fclose(f); +#endif +} +#endif + +/* Loop through all MBs. Note activity of each, average activity and + * calculate a normalized activity for each + */ +static void build_activity_map(VP8_COMP *cpi) { + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *xd = &x->e_mbd; + VP8_COMMON *const cm = &cpi->common; + +#if ALT_ACT_MEASURE + YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + int recon_yoffset; + int recon_y_stride = new_yv12->y_stride; +#endif + + int mb_row, mb_col; + unsigned int mb_activity; + int64_t activity_sum = 0; + + /* for each macroblock row in image */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { +#if ALT_ACT_MEASURE + /* reset above block coeffs */ + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); +#endif + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { +#if ALT_ACT_MEASURE + xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset; + xd->left_available = (mb_col != 0); + recon_yoffset += 16; +#endif + /* Copy current mb to a buffer */ + vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); + + /* measure activity */ + mb_activity = mb_activity_measure(x, mb_row, mb_col); + + /* Keep frame sum */ + activity_sum += mb_activity; + + /* Store MB level activity details. */ + *x->mb_activity_ptr = mb_activity; + + /* Increment activity map pointer */ + x->mb_activity_ptr++; + + /* adjust to the next column of source macroblocks */ + x->src.y_buffer += 16; + } + + /* adjust to the next row of mbs */ + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + +#if ALT_ACT_MEASURE + /* extend the recon for intra prediction */ + vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, + xd->dst.v_buffer + 8); +#endif + } + + /* Calculate an "average" MB activity */ + calc_av_activity(cpi, activity_sum); + +#if USE_ACT_INDEX + /* Calculate an activity index number of each mb */ + calc_activity_index(cpi, x); +#endif +} + +/* Macroblock activity masking */ +void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x) { +#if USE_ACT_INDEX + x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2); + x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); + x->errorperbit += (x->errorperbit == 0); +#else + int64_t a; + int64_t b; + int64_t act = *(x->mb_activity_ptr); + + /* Apply the masking to the RD multiplier. */ + a = act + (2 * cpi->activity_avg); + b = (2 * act) + cpi->activity_avg; + + x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a); + x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); + x->errorperbit += (x->errorperbit == 0); +#endif + + /* Activity based Zbin adjustment */ + adjust_act_zbin(cpi, x); +} + +static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, + MACROBLOCK *x, MACROBLOCKD *xd, TOKENEXTRA **tp, + int *segment_counts, int *totalrate) { + int recon_yoffset, recon_uvoffset; + int mb_col; + int ref_fb_idx = cm->lst_fb_idx; + int dst_fb_idx = cm->new_fb_idx; + int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; + int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; + int map_index = (mb_row * cpi->common.mb_cols); + +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + const int num_part = (1 << cm->multi_token_partition); + TOKENEXTRA *tp_start = cpi->tok; + vp8_writer *w; +#endif + +#if CONFIG_MULTITHREAD + const int nsync = cpi->mt_sync_range; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = NULL; + + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + current_mb_col = &cpi->mt_current_mb_col[mb_row]; + } + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { + last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; + } else { + last_row_current_mb_col = &rightmost_col; + } +#endif + +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + if (num_part > 1) + w = &cpi->bc[1 + (mb_row % num_part)]; + else + w = &cpi->bc[1]; +#endif + + /* reset above block coeffs */ + xd->above_context = cm->above_context; + + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + cpi->tplist[mb_row].start = *tp; + /* printf("Main mb_row = %d\n", mb_row); */ + + /* Distance of Mb to the top & bottom edges, specified in 1/8th pel + * units as they are always compared to values that are in 1/8th pel + */ + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; + + /* Set up limit values for vertical motion vector components + * to prevent them extending beyond the UMV borders + */ + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + /* Set the mb activity pointer to the start of the row. */ + x->mb_activity_ptr = &cpi->mb_activity_map[map_index]; + + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + *tp = cpi->tok; +#endif + /* Distance of Mb to the left & right edges, specified in + * 1/8th pel units as they are always compared to values + * that are in 1/8th pel units + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; + + /* Set up limit values for horizontal motion vector components + * to prevent them extending beyond the UMV borders + */ + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + + xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + x->rddiv = cpi->RDDIV; + x->rdmult = cpi->RDMULT; + + /* Copy current mb to a buffer */ + vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } + + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); + } + } +#endif + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp8_activity_masking(cpi, x); + + /* Is segmentation enabled */ + /* MB level adjustment to quantizer */ + if (xd->segmentation_enabled) { + /* Code to set segment id in xd->mbmi.segment_id for current MB + * (with range checking) + */ + if (cpi->segmentation_map[map_index + mb_col] <= 3) { + xd->mode_info_context->mbmi.segment_id = + cpi->segmentation_map[map_index + mb_col]; + } else { + xd->mode_info_context->mbmi.segment_id = 0; + } + + vp8cx_mb_init_quantizer(cpi, x, 1); + } else { + /* Set to Segment 0 by default */ + xd->mode_info_context->mbmi.segment_id = 0; + } + + x->active_ptr = cpi->active_map + map_index + mb_col; + + if (cm->frame_type == KEY_FRAME) { + const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp); + if (INT_MAX - *totalrate > intra_rate_cost) + *totalrate += intra_rate_cost; + else + *totalrate = INT_MAX; +#ifdef MODE_STATS + y_modes[xd->mbmi.mode]++; +#endif + } else { + const int inter_rate_cost = vp8cx_encode_inter_macroblock( + cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); + if (INT_MAX - *totalrate > inter_rate_cost) + *totalrate += inter_rate_cost; + else + *totalrate = INT_MAX; + +#ifdef MODE_STATS + inter_y_modes[xd->mbmi.mode]++; + + if (xd->mbmi.mode == SPLITMV) { + int b; + + for (b = 0; b < xd->mbmi.partition_count; ++b) { + inter_b_modes[x->partition->bmi[b].mode]++; + } + } + +#endif + + // Keep track of how many (consecutive) times a block is coded + // as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index + mb_col] < 255) { + cpi->consec_zero_last[map_index + mb_col] += 1; + } + if (cpi->consec_zero_last_mvbias[map_index + mb_col] < 255) { + cpi->consec_zero_last_mvbias[map_index + mb_col] += 1; + } + } else { + cpi->consec_zero_last[map_index + mb_col] = 0; + cpi->consec_zero_last_mvbias[map_index + mb_col] = 0; + } + if (x->zero_last_dot_suppress) { + cpi->consec_zero_last_mvbias[map_index + mb_col] = 0; + } + } + + /* Special case code for cyclic refresh + * If cyclic update enabled then copy xd->mbmi.segment_id; (which + * may have been updated based on mode during + * vp8cx_encode_inter_macroblock()) back into the global + * segmentation map + */ + if ((cpi->current_layer == 0) && + (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)) { + cpi->segmentation_map[map_index + mb_col] = + xd->mode_info_context->mbmi.segment_id; + + /* If the block has been refreshed mark it as clean (the + * magnitude of the -ve influences how long it will be before + * we consider another refresh): + * Else if it was coded (last frame 0,0) and has not already + * been refreshed then mark it as a candidate for cleanup + * next time (marked 0) else mark it as dirty (1). + */ + if (xd->mode_info_context->mbmi.segment_id) { + cpi->cyclic_refresh_map[map_index + mb_col] = -1; + } else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && + (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)) { + if (cpi->cyclic_refresh_map[map_index + mb_col] == 1) { + cpi->cyclic_refresh_map[map_index + mb_col] = 0; + } + } else { + cpi->cyclic_refresh_map[map_index + mb_col] = 1; + } + } + } + + cpi->tplist[mb_row].stop = *tp; + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + /* pack tokens for this MB */ + { + int tok_count = *tp - tp_start; + vp8_pack_tokens(w, tp_start, tok_count); + } +#endif + /* Increment pointer into gf usage flags structure. */ + x->gf_active_ptr++; + + /* Increment the activity mask pointers. */ + x->mb_activity_ptr++; + + /* adjust to the next column of macroblocks */ + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + /* Keep track of segment usage */ + segment_counts[xd->mode_info_context->mbmi.segment_id]++; + + /* skip to next mb */ + xd->mode_info_context++; + x->partition_info++; + xd->above_context++; + } + + /* extend the recon for intra prediction */ + vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); + } +#endif + + /* this is to account for the border */ + xd->mode_info_context++; + x->partition_info++; +} + +static void init_encode_frame_mb_context(VP8_COMP *cpi) { + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + + /* GF active flags data structure */ + x->gf_active_ptr = (signed char *)cpi->gf_active_flags; + + /* Activity map pointer */ + x->mb_activity_ptr = cpi->mb_activity_map; + + x->act_zbin_adj = 0; + + x->partition_info = x->pi; + + xd->mode_info_context = cm->mi; + xd->mode_info_stride = cm->mode_info_stride; + + xd->frame_type = cm->frame_type; + + /* reset intra mode contexts */ + if (cm->frame_type == KEY_FRAME) vp8_init_mbmode_probs(cm); + + /* Copy data over into macro block data structures. */ + x->src = *cpi->Source; + xd->pre = cm->yv12_fb[cm->lst_fb_idx]; + xd->dst = cm->yv12_fb[cm->new_fb_idx]; + + /* set up frame for intra coded blocks */ + vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]); + + vp8_build_block_offsets(x); + + xd->mode_info_context->mbmi.mode = DC_PRED; + xd->mode_info_context->mbmi.uv_mode = DC_PRED; + + xd->left_context = &cm->left_context; + + x->mvc = cm->fc.mvc; + + memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols); + + /* Special case treatment when GF and ARF are not sensible options + * for reference + */ + if (cpi->ref_frame_flags == VP8_LAST_FRAME) { + vp8_calc_ref_frame_costs(x->ref_frame_cost, cpi->prob_intra_coded, 255, + 128); + } else if ((cpi->oxcf.number_of_layers > 1) && + (cpi->ref_frame_flags == VP8_GOLD_FRAME)) { + vp8_calc_ref_frame_costs(x->ref_frame_cost, cpi->prob_intra_coded, 1, 255); + } else if ((cpi->oxcf.number_of_layers > 1) && + (cpi->ref_frame_flags == VP8_ALTR_FRAME)) { + vp8_calc_ref_frame_costs(x->ref_frame_cost, cpi->prob_intra_coded, 1, 1); + } else { + vp8_calc_ref_frame_costs(x->ref_frame_cost, cpi->prob_intra_coded, + cpi->prob_last_coded, cpi->prob_gf_coded); + } + + xd->fullpixel_mask = ~0; + if (cm->full_pixel) xd->fullpixel_mask = ~7; + + vp8_zero(x->coef_counts); + vp8_zero(x->ymode_count); + vp8_zero(x->uv_mode_count); + x->prediction_error = 0; + x->intra_error = 0; + vp8_zero(x->count_mb_ref_frame_usage); +} + +#if CONFIG_MULTITHREAD +static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread) { + int i = 0; + do { + int j = 0; + do { + int k = 0; + do { + /* at every context */ + + /* calc probs and branch cts for this frame only */ + int t = 0; /* token/prob index */ + + do { + x->coef_counts[i][j][k][t] += x_thread->coef_counts[i][j][k][t]; + } while (++t < ENTROPY_NODES); + } while (++k < PREV_COEF_CONTEXTS); + } while (++j < COEF_BANDS); + } while (++i < BLOCK_TYPES); +} +#endif // CONFIG_MULTITHREAD + +void vp8_encode_frame(VP8_COMP *cpi) { + int mb_row; + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TOKENEXTRA *tp = cpi->tok; + int segment_counts[MAX_MB_SEGMENTS]; + int totalrate; +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + BOOL_CODER *bc = &cpi->bc[1]; /* bc[0] is for control partition */ + const int num_part = (1 << cm->multi_token_partition); +#endif + + memset(segment_counts, 0, sizeof(segment_counts)); + totalrate = 0; + + if (cpi->compressor_speed == 2) { + if (cpi->oxcf.cpu_used < 0) { + cpi->Speed = -(cpi->oxcf.cpu_used); + } else { + vp8_auto_select_speed(cpi); + } + } + + /* Functions setup for all frame types so we can use MC in AltRef */ + if (!cm->use_bilinear_mc_filter) { + xd->subpixel_predict = vp8_sixtap_predict4x4; + xd->subpixel_predict8x4 = vp8_sixtap_predict8x4; + xd->subpixel_predict8x8 = vp8_sixtap_predict8x8; + xd->subpixel_predict16x16 = vp8_sixtap_predict16x16; + } else { + xd->subpixel_predict = vp8_bilinear_predict4x4; + xd->subpixel_predict8x4 = vp8_bilinear_predict8x4; + xd->subpixel_predict8x8 = vp8_bilinear_predict8x8; + xd->subpixel_predict16x16 = vp8_bilinear_predict16x16; + } + + cpi->mb.skip_true_count = 0; + cpi->tok_count = 0; + +#if 0 + /* Experimental code */ + cpi->frame_distortion = 0; + cpi->last_mb_distortion = 0; +#endif + + xd->mode_info_context = cm->mi; + + vp8_zero(cpi->mb.MVcount); + + vp8cx_frame_init_quantizer(cpi); + + vp8_initialize_rd_consts(cpi, x, + vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q)); + + vp8cx_initialize_me_consts(cpi, cm->base_qindex); + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { + /* Initialize encode frame context. */ + init_encode_frame_mb_context(cpi); + + /* Build a frame level activity map */ + build_activity_map(cpi); + } + + /* re-init encode frame context. */ + init_encode_frame_mb_context(cpi); + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + { + int i; + for (i = 0; i < num_part; ++i) { + vp8_start_encode(&bc[i], cpi->partition_d[i + 1], + cpi->partition_d_end[i + 1]); + bc[i].error = &cm->error; + } + } + +#endif + + { + struct vpx_usec_timer emr_timer; + vpx_usec_timer_start(&emr_timer); + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + int i; + + vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, + cpi->encoding_thread_count); + + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); + + for (i = 0; i < cpi->encoding_thread_count; ++i) { + sem_post(&cpi->h_event_start_encoding[i]); + } + + for (mb_row = 0; mb_row < cm->mb_rows; + mb_row += (cpi->encoding_thread_count + 1)) { + vp8_zero(cm->left_context); + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + tp = cpi->tok; +#else + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); +#endif + + encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); + + /* adjust to the next row of mbs */ + x->src.y_buffer += + 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - + 16 * cm->mb_cols; + x->src.u_buffer += + 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - + 8 * cm->mb_cols; + x->src.v_buffer += + 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - + 8 * cm->mb_cols; + + xd->mode_info_context += + xd->mode_info_stride * cpi->encoding_thread_count; + x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count; + x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count; + } + /* Wait for all the threads to finish. */ + for (i = 0; i < cpi->encoding_thread_count; ++i) { + sem_wait(&cpi->h_event_end_encoding[i]); + } + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + cpi->tok_count += (unsigned int)(cpi->tplist[mb_row].stop - + cpi->tplist[mb_row].start); + } + + if (xd->segmentation_enabled) { + int j; + + if (xd->segmentation_enabled) { + for (i = 0; i < cpi->encoding_thread_count; ++i) { + for (j = 0; j < 4; ++j) { + segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j]; + } + } + } + } + + for (i = 0; i < cpi->encoding_thread_count; ++i) { + int mode_count; + int c_idx; + totalrate += cpi->mb_row_ei[i].totalrate; + + cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count; + + for (mode_count = 0; mode_count < VP8_YMODES; ++mode_count) { + cpi->mb.ymode_count[mode_count] += + cpi->mb_row_ei[i].mb.ymode_count[mode_count]; + } + + for (mode_count = 0; mode_count < VP8_UV_MODES; ++mode_count) { + cpi->mb.uv_mode_count[mode_count] += + cpi->mb_row_ei[i].mb.uv_mode_count[mode_count]; + } + + for (c_idx = 0; c_idx < MVvals; ++c_idx) { + cpi->mb.MVcount[0][c_idx] += cpi->mb_row_ei[i].mb.MVcount[0][c_idx]; + cpi->mb.MVcount[1][c_idx] += cpi->mb_row_ei[i].mb.MVcount[1][c_idx]; + } + + cpi->mb.prediction_error += cpi->mb_row_ei[i].mb.prediction_error; + cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error; + + for (c_idx = 0; c_idx < MAX_REF_FRAMES; ++c_idx) { + cpi->mb.count_mb_ref_frame_usage[c_idx] += + cpi->mb_row_ei[i].mb.count_mb_ref_frame_usage[c_idx]; + } + + for (c_idx = 0; c_idx < MAX_ERROR_BINS; ++c_idx) { + cpi->mb.error_bins[c_idx] += cpi->mb_row_ei[i].mb.error_bins[c_idx]; + } + + /* add up counts for each thread */ + sum_coef_counts(x, &cpi->mb_row_ei[i].mb); + } + + } else +#endif // CONFIG_MULTITHREAD + { + + /* for each macroblock row in image */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + vp8_zero(cm->left_context); + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + tp = cpi->tok; +#endif + + encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); + + /* adjust to the next row of mbs */ + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + } + + cpi->tok_count = (unsigned int)(tp - cpi->tok); + } + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + { + int i; + for (i = 0; i < num_part; ++i) { + vp8_stop_encode(&bc[i]); + cpi->partition_sz[i + 1] = bc[i].pos; + } + } +#endif + + vpx_usec_timer_mark(&emr_timer); + cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); + } + + // Work out the segment probabilities if segmentation is enabled + // and needs to be updated + if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { + int tot_count; + int i; + + /* Set to defaults */ + memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs)); + + tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + + segment_counts[3]; + + if (tot_count) { + xd->mb_segment_tree_probs[0] = + ((segment_counts[0] + segment_counts[1]) * 255) / tot_count; + + tot_count = segment_counts[0] + segment_counts[1]; + + if (tot_count > 0) { + xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count; + } + + tot_count = segment_counts[2] + segment_counts[3]; + + if (tot_count > 0) { + xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count; + } + + /* Zero probabilities not allowed */ + for (i = 0; i < MB_FEATURE_TREE_PROBS; ++i) { + if (xd->mb_segment_tree_probs[i] == 0) xd->mb_segment_tree_probs[i] = 1; + } + } + } + + /* projected_frame_size in units of BYTES */ + cpi->projected_frame_size = totalrate >> 8; + + /* Make a note of the percentage MBs coded Intra. */ + if (cm->frame_type == KEY_FRAME) { + cpi->this_frame_percent_intra = 100; + } else { + int tot_modes; + + tot_modes = cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME] + + cpi->mb.count_mb_ref_frame_usage[LAST_FRAME] + + cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME] + + cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME]; + + if (tot_modes) { + cpi->this_frame_percent_intra = + cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes; + } + } + +#if !CONFIG_REALTIME_ONLY + /* Adjust the projected reference frame usage probability numbers to + * reflect what we have just seen. This may be useful when we make + * multiple iterations of the recode loop rather than continuing to use + * values from the previous frame. + */ + if ((cm->frame_type != KEY_FRAME) && + ((cpi->oxcf.number_of_layers > 1) || + (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame))) { + vp8_convert_rfct_to_prob(cpi); + } +#endif +} +void vp8_setup_block_ptrs(MACROBLOCK *x) { + int r, c; + int i; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; ++r) { + for (c = 0; c < 2; ++c) { + x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4; + } + } + + for (r = 0; r < 2; ++r) { + for (c = 0; c < 2; ++c) { + x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4; + } + } + + x->block[24].src_diff = x->src_diff + 384; + + for (i = 0; i < 25; ++i) { + x->block[i].coeff = x->coeff + i * 16; + } +} + +void vp8_build_block_offsets(MACROBLOCK *x) { + int block = 0; + int br, bc; + + vp8_build_block_doffsets(&x->e_mbd); + + /* y blocks */ + x->thismb_ptr = &x->thismb[0]; + for (br = 0; br < 4; ++br) { + for (bc = 0; bc < 4; ++bc) { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->thismb_ptr; + this_block->src_stride = 16; + this_block->src = 4 * br * 16 + 4 * bc; + ++block; + } + } + + /* u blocks */ + for (br = 0; br < 2; ++br) { + for (bc = 0; bc < 2; ++bc) { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->src.u_buffer; + this_block->src_stride = x->src.uv_stride; + this_block->src = 4 * br * this_block->src_stride + 4 * bc; + ++block; + } + } + + /* v blocks */ + for (br = 0; br < 2; ++br) { + for (bc = 0; bc < 2; ++bc) { + BLOCK *this_block = &x->block[block]; + this_block->base_src = &x->src.v_buffer; + this_block->src_stride = x->src.uv_stride; + this_block->src = 4 * br * this_block->src_stride + 4 * bc; + ++block; + } + } +} + +static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; + const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode; + +#ifdef MODE_STATS + const int is_key = cpi->common.frame_type == KEY_FRAME; + + ++(is_key ? uv_modes : inter_uv_modes)[uvm]; + + if (m == B_PRED) { + unsigned int *const bct = is_key ? b_modes : inter_b_modes; + + int b = 0; + + do { + ++bct[xd->block[b].bmi.mode]; + } while (++b < 16); + } + +#else + (void)cpi; +#endif + + ++x->ymode_count[m]; + ++x->uv_mode_count[uvm]; +} + +/* Experimental stub function to create a per MB zbin adjustment based on + * some previously calculated measure of MB activity. + */ +static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x) { +#if USE_ACT_INDEX + x->act_zbin_adj = *(x->mb_activity_ptr); +#else + int64_t a; + int64_t b; + int64_t act = *(x->mb_activity_ptr); + + /* Apply the masking to the RD multiplier. */ + a = act + 4 * cpi->activity_avg; + b = 4 * act + cpi->activity_avg; + + if (act > cpi->activity_avg) { + x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1; + } else { + x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b); + } +#endif +} + +int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, + TOKENEXTRA **t) { + MACROBLOCKD *xd = &x->e_mbd; + int rate; + + if (cpi->sf.RD && cpi->compressor_speed != 2) { + vp8_rd_pick_intra_mode(x, &rate); + } else { + vp8_pick_intra_mode(x, &rate); + } + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { + adjust_act_zbin(cpi, x); + vp8_update_zbin_extra(cpi, x); + } + + if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED) { + vp8_encode_intra4x4mby(x); + } else { + vp8_encode_intra16x16mby(x); + } + + vp8_encode_intra16x16mbuv(x); + + sum_intra_stats(cpi, x); + + vp8_tokenize_mb(cpi, x, t); + + if (xd->mode_info_context->mbmi.mode != B_PRED) vp8_inverse_transform_mby(xd); + + vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs + 16); + return rate; +} +#ifdef SPEEDSTATS +extern int cnt_pm; +#endif + +extern void vp8_fix_contexts(MACROBLOCKD *x); + +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col) { + MACROBLOCKD *const xd = &x->e_mbd; + int intra_error = 0; + int rate; + int distortion; + + x->skip = 0; + + if (xd->segmentation_enabled) { + x->encode_breakout = + cpi->segment_encode_breakout[xd->mode_info_context->mbmi.segment_id]; + } else { + x->encode_breakout = cpi->oxcf.encode_breakout; + } + +#if CONFIG_TEMPORAL_DENOISING + /* Reset the best sse mode/mv for each macroblock. */ + x->best_reference_frame = INTRA_FRAME; + x->best_zeromv_reference_frame = INTRA_FRAME; + x->best_sse_inter_mode = 0; + x->best_sse_mv.as_int = 0; + x->need_to_clamp_best_mvs = 0; +#endif + + if (cpi->sf.RD) { + int zbin_mode_boost_enabled = x->zbin_mode_boost_enabled; + + /* Are we using the fast quantizer for the mode selection? */ + if (cpi->sf.use_fastquant_for_pick) { + x->quantize_b = vp8_fast_quantize_b; + + /* the fast quantizer does not use zbin_extra, so + * do not recalculate */ + x->zbin_mode_boost_enabled = 0; + } + vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error, mb_row, mb_col); + + /* switch back to the regular quantizer for the encode */ + if (cpi->sf.improved_quant) { + x->quantize_b = vp8_regular_quantize_b; + } + + /* restore cpi->zbin_mode_boost_enabled */ + x->zbin_mode_boost_enabled = zbin_mode_boost_enabled; + + } else { + vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error, mb_row, mb_col); + } + + x->prediction_error += distortion; + x->intra_error += intra_error; + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { + /* Adjust the zbin based on this MB rate. */ + adjust_act_zbin(cpi, x); + } + +#if 0 + /* Experimental RD code */ + cpi->frame_distortion += distortion; + cpi->last_mb_distortion = distortion; +#endif + + /* MB level adjutment to quantizer setup */ + if (xd->segmentation_enabled) { + /* If cyclic update enabled */ + if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled) { + /* Clear segment_id back to 0 if not coded (last frame 0,0) */ + if ((xd->mode_info_context->mbmi.segment_id == 1) && + ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || + (xd->mode_info_context->mbmi.mode != ZEROMV))) { + xd->mode_info_context->mbmi.segment_id = 0; + + /* segment_id changed, so update */ + vp8cx_mb_init_quantizer(cpi, x, 1); + } + } + } + + { + /* Experimental code. + * Special case for gf and arf zeromv modes, for 1 temporal layer. + * Increase zbin size to supress noise. + */ + x->zbin_mode_boost = 0; + if (x->zbin_mode_boost_enabled) { + if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) { + if (xd->mode_info_context->mbmi.mode == ZEROMV) { + if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME && + cpi->oxcf.number_of_layers == 1) { + x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; + } else { + x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; + } + } else if (xd->mode_info_context->mbmi.mode == SPLITMV) { + x->zbin_mode_boost = 0; + } else { + x->zbin_mode_boost = MV_ZBIN_BOOST; + } + } + } + + /* The fast quantizer doesn't use zbin_extra, only do so with + * the regular quantizer. */ + if (cpi->sf.improved_quant) vp8_update_zbin_extra(cpi, x); + } + + x->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame]++; + + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + vp8_encode_intra16x16mbuv(x); + + if (xd->mode_info_context->mbmi.mode == B_PRED) { + vp8_encode_intra4x4mby(x); + } else { + vp8_encode_intra16x16mby(x); + } + + sum_intra_stats(cpi, x); + } else { + int ref_fb_idx; + + if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + ref_fb_idx = cpi->common.lst_fb_idx; + } else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) { + ref_fb_idx = cpi->common.gld_fb_idx; + } else { + ref_fb_idx = cpi->common.alt_fb_idx; + } + + xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; + xd->pre.u_buffer = + cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; + xd->pre.v_buffer = + cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + + if (!x->skip) { + vp8_encode_inter16x16(x); + } else { + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.y_stride, + xd->dst.uv_stride); + } + } + + if (!x->skip) { + vp8_tokenize_mb(cpi, x, t); + + if (xd->mode_info_context->mbmi.mode != B_PRED) { + vp8_inverse_transform_mby(xd); + } + + vp8_dequant_idct_add_uv_block(xd->qcoeff + 16 * 16, xd->dequant_uv, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs + 16); + } else { + /* always set mb_skip_coeff as it is needed by the loopfilter */ + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + + if (cpi->common.mb_no_coeff_skip) { + x->skip_true_count++; + vp8_fix_contexts(xd); + } else { + vp8_stuff_mb(cpi, x, t); + } + } + + return rate; +} diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.h b/media/libvpx/libvpx/vp8/encoder/encodeframe.h new file mode 100644 index 0000000000..cc8cf4d713 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VP8_ENCODER_ENCODEFRAME_H_ +#define VPX_VP8_ENCODER_ENCODEFRAME_H_ + +#include "vp8/encoder/tokenize.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); + +void vp8_build_block_offsets(struct macroblock *x); + +void vp8_setup_block_ptrs(struct macroblock *x); + +void vp8_encode_frame(struct VP8_COMP *cpi); + +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); + +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_ENCODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.c b/media/libvpx/libvpx/vp8/encoder/encodeintra.c new file mode 100644 index 0000000000..7d448c0ea0 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/reconintra4x4.h" +#include "encodemb.h" +#include "vp8/common/invtrans.h" +#include "encodeintra.h" + +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) { + int i; + int intra_pred_var = 0; + + if (use_dc_pred) { + x->e_mbd.mode_info_context->mbmi.mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + vp8_encode_intra16x16mby(x); + + vp8_inverse_transform_mby(&x->e_mbd); + } else { + for (i = 0; i < 16; ++i) { + x->e_mbd.block[i].bmi.as_mode = B_DC_PRED; + vp8_encode_intra4x4block(x, i); + } + } + + intra_pred_var = vpx_get_mb_ss(x->src_diff); + + return intra_pred_var; +} + +void vp8_encode_intra4x4block(MACROBLOCK *x, int ib) { + BLOCKD *b = &x->e_mbd.block[ib]; + BLOCK *be = &x->block[ib]; + int dst_stride = x->e_mbd.dst.y_stride; + unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset; + unsigned char *Above = dst - dst_stride; + unsigned char *yleft = dst - 1; + unsigned char top_left = Above[-1]; + + vp8_intra4x4_predict(Above, yleft, dst_stride, b->bmi.as_mode, b->predictor, + 16, top_left); + + vp8_subtract_b(be, b, 16); + + x->short_fdct4x4(be->src_diff, be->coeff, 32); + + x->quantize_b(be, b); + + if (*b->eob > 1) { + vp8_short_idct4x4llm(b->dqcoeff, b->predictor, 16, dst, dst_stride); + } else { + vp8_dc_only_idct_add(b->dqcoeff[0], b->predictor, 16, dst, dst_stride); + } +} + +void vp8_encode_intra4x4mby(MACROBLOCK *mb) { + int i; + + MACROBLOCKD *xd = &mb->e_mbd; + intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16); + + for (i = 0; i < 16; ++i) vp8_encode_intra4x4block(mb, i); + return; +} + +void vp8_encode_intra16x16mby(MACROBLOCK *x) { + BLOCK *b = &x->block[0]; + MACROBLOCKD *xd = &x->e_mbd; + + vp8_build_intra_predictors_mby_s(xd, xd->dst.y_buffer - xd->dst.y_stride, + xd->dst.y_buffer - 1, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride); + + vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride, xd->dst.y_buffer, + xd->dst.y_stride); + + vp8_transform_intra_mby(x); + + vp8_quantize_mby(x); + + if (x->optimize) vp8_optimize_mby(x); +} + +void vp8_encode_intra16x16mbuv(MACROBLOCK *x) { + MACROBLOCKD *xd = &x->e_mbd; + + vp8_build_intra_predictors_mbuv_s(xd, xd->dst.u_buffer - xd->dst.uv_stride, + xd->dst.v_buffer - xd->dst.uv_stride, + xd->dst.u_buffer - 1, xd->dst.v_buffer - 1, + xd->dst.uv_stride, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride); + + vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->src.uv_stride, xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride); + + vp8_transform_mbuv(x); + + vp8_quantize_mbuv(x); + + if (x->optimize) vp8_optimize_mbuv(x); +} diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.h b/media/libvpx/libvpx/vp8/encoder/encodeintra.h new file mode 100644 index 0000000000..9a378abf49 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ENCODEINTRA_H_ +#define VPX_VP8_ENCODER_ENCODEINTRA_H_ +#include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred); +void vp8_encode_intra16x16mby(MACROBLOCK *x); +void vp8_encode_intra16x16mbuv(MACROBLOCK *x); +void vp8_encode_intra4x4mby(MACROBLOCK *mb); +void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_ENCODEINTRA_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.c b/media/libvpx/libvpx/vp8/encoder/encodemb.c new file mode 100644 index 0000000000..3fd8d5fabe --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodemb.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "encodemb.h" +#include "vp8/common/reconinter.h" +#include "vp8/encoder/quantize.h" +#include "tokenize.h" +#include "vp8/common/invtrans.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" + +void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) { + unsigned char *src_ptr = (*(be->base_src) + be->src); + short *diff_ptr = be->src_diff; + unsigned char *pred_ptr = bd->predictor; + int src_stride = be->src_stride; + + vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride, pred_ptr, + pitch); +} + +void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride) { + short *udiff = diff + 256; + short *vdiff = diff + 320; + + vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride); + vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride); +} + +void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride) { + vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride); +} + +static void vp8_subtract_mb(MACROBLOCK *x) { + BLOCK *b = &x->block[0]; + + vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride, + x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); + vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->src.uv_stride, x->e_mbd.dst.u_buffer, + x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride); +} + +static void build_dcblock(MACROBLOCK *x) { + short *src_diff_ptr = &x->src_diff[384]; + int i; + + for (i = 0; i < 16; ++i) { + src_diff_ptr[i] = x->coeff[i * 16]; + } +} + +void vp8_transform_mbuv(MACROBLOCK *x) { + int i; + + for (i = 16; i < 24; i += 2) { + x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } +} + +void vp8_transform_intra_mby(MACROBLOCK *x) { + int i; + + for (i = 0; i < 16; i += 2) { + x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + /* build dc block from 16 y dc values */ + build_dcblock(x); + + /* do 2nd order transform on the dc block */ + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); +} + +static void transform_mb(MACROBLOCK *x) { + int i; + + for (i = 0; i < 16; i += 2) { + x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + /* build dc block from 16 y dc values */ + if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) build_dcblock(x); + + for (i = 16; i < 24; i += 2) { + x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + } + + /* do 2nd order transform on the dc block */ + if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) { + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + } +} + +static void transform_mby(MACROBLOCK *x) { + int i; + + for (i = 0; i < 16; i += 2) { + x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + } + + /* build dc block from 16 y dc values */ + if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) { + build_dcblock(x); + x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + } +} + +#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) + +typedef struct vp8_token_state vp8_token_state; + +struct vp8_token_state { + int rate; + int error; + signed char next; + signed char token; + short qc; +}; + +/* TODO: experiments to find optimal multiple numbers */ +#define Y1_RD_MULT 4 +#define UV_RD_MULT 2 +#define Y2_RD_MULT 16 + +static const int plane_rd_mult[4] = { Y1_RD_MULT, Y2_RD_MULT, UV_RD_MULT, + Y1_RD_MULT }; + +static void optimize_b(MACROBLOCK *mb, int ib, int type, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + BLOCK *b; + BLOCKD *d; + vp8_token_state tokens[17][2]; + unsigned best_mask[2]; + const short *dequant_ptr; + const short *coeff_ptr; + short *qcoeff_ptr; + short *dqcoeff_ptr; + int eob; + int i0; + int rc; + int x; + int sz = 0; + int next; + int rdmult; + int rddiv; + int final_eob; + int rd_cost0; + int rd_cost1; + int rate0; + int rate1; + int error0; + int error1; + int t0; + int t1; + int best; + int band; + int pt; + int i; + int err_mult = plane_rd_mult[type]; + + b = &mb->block[ib]; + d = &mb->e_mbd.block[ib]; + + dequant_ptr = d->dequant; + coeff_ptr = b->coeff; + qcoeff_ptr = d->qcoeff; + dqcoeff_ptr = d->dqcoeff; + i0 = !type; + eob = *d->eob; + + /* Now set up a Viterbi trellis to evaluate alternative roundings. */ + rdmult = mb->rdmult * err_mult; + if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + rdmult = (rdmult * 9) >> 4; + } + + rddiv = mb->rddiv; + best_mask[0] = best_mask[1] = 0; + /* Initialize the sentinel node of the trellis. */ + tokens[eob][0].rate = 0; + tokens[eob][0].error = 0; + tokens[eob][0].next = 16; + tokens[eob][0].token = DCT_EOB_TOKEN; + tokens[eob][0].qc = 0; + *(tokens[eob] + 1) = *(tokens[eob] + 0); + next = eob; + for (i = eob; i-- > i0;) { + int base_bits; + int d2; + int dx; + + rc = vp8_default_zig_zag1d[i]; + x = qcoeff_ptr[rc]; + /* Only add a trellis state for non-zero coefficients. */ + if (x) { + int shortcut = 0; + error0 = tokens[next][0].error; + error1 = tokens[next][1].error; + /* Evaluate the first possibility for this state. */ + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + t0 = (vp8_dct_value_tokens_ptr + x)->Token; + /* Consider both possible successor states. */ + if (next < 16) { + band = vp8_coef_bands[i + 1]; + pt = vp8_prev_token_class[t0]; + rate0 += mb->token_costs[type][band][pt][tokens[next][0].token]; + rate1 += mb->token_costs[type][band][pt][tokens[next][1].token]; + } + rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); + rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); + if (rd_cost0 == rd_cost1) { + rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); + rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); + } + /* And pick the best. */ + best = rd_cost1 < rd_cost0; + base_bits = *(vp8_dct_value_cost_ptr + x); + dx = dqcoeff_ptr[rc] - coeff_ptr[rc]; + d2 = dx * dx; + tokens[i][0].rate = base_bits + (best ? rate1 : rate0); + tokens[i][0].error = d2 + (best ? error1 : error0); + tokens[i][0].next = next; + tokens[i][0].token = t0; + tokens[i][0].qc = x; + best_mask[0] |= best << i; + /* Evaluate the second possibility for this state. */ + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + + if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) && + (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc])) { + shortcut = 1; + } else { + shortcut = 0; + } + + if (shortcut) { + sz = -(x < 0); + x -= 2 * sz + 1; + } + + /* Consider both possible successor states. */ + if (!x) { + /* If we reduced this coefficient to zero, check to see if + * we need to move the EOB back here. + */ + t0 = + tokens[next][0].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN; + t1 = + tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN; + } else { + t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token; + } + if (next < 16) { + band = vp8_coef_bands[i + 1]; + if (t0 != DCT_EOB_TOKEN) { + pt = vp8_prev_token_class[t0]; + rate0 += mb->token_costs[type][band][pt][tokens[next][0].token]; + } + if (t1 != DCT_EOB_TOKEN) { + pt = vp8_prev_token_class[t1]; + rate1 += mb->token_costs[type][band][pt][tokens[next][1].token]; + } + } + + rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); + rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); + if (rd_cost0 == rd_cost1) { + rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); + rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); + } + /* And pick the best. */ + best = rd_cost1 < rd_cost0; + base_bits = *(vp8_dct_value_cost_ptr + x); + + if (shortcut) { + dx -= (dequant_ptr[rc] + sz) ^ sz; + d2 = dx * dx; + } + tokens[i][1].rate = base_bits + (best ? rate1 : rate0); + tokens[i][1].error = d2 + (best ? error1 : error0); + tokens[i][1].next = next; + tokens[i][1].token = best ? t1 : t0; + tokens[i][1].qc = x; + best_mask[1] |= best << i; + /* Finally, make this the new head of the trellis. */ + next = i; + } + /* There's no choice to make for a zero coefficient, so we don't + * add a new trellis node, but we do need to update the costs. + */ + else { + band = vp8_coef_bands[i + 1]; + t0 = tokens[next][0].token; + t1 = tokens[next][1].token; + /* Update the cost of each path if we're past the EOB token. */ + if (t0 != DCT_EOB_TOKEN) { + tokens[next][0].rate += mb->token_costs[type][band][0][t0]; + tokens[next][0].token = ZERO_TOKEN; + } + if (t1 != DCT_EOB_TOKEN) { + tokens[next][1].rate += mb->token_costs[type][band][0][t1]; + tokens[next][1].token = ZERO_TOKEN; + } + /* Don't update next, because we didn't add a new node. */ + } + } + + /* Now pick the best path through the whole trellis. */ + band = vp8_coef_bands[i + 1]; + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + error0 = tokens[next][0].error; + error1 = tokens[next][1].error; + t0 = tokens[next][0].token; + t1 = tokens[next][1].token; + rate0 += mb->token_costs[type][band][pt][t0]; + rate1 += mb->token_costs[type][band][pt][t1]; + rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); + rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); + if (rd_cost0 == rd_cost1) { + rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); + rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); + } + best = rd_cost1 < rd_cost0; + final_eob = i0 - 1; + for (i = next; i < eob; i = next) { + x = tokens[i][best].qc; + if (x) final_eob = i; + rc = vp8_default_zig_zag1d[i]; + qcoeff_ptr[rc] = x; + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; + next = tokens[i][best].next; + best = (best_mask[best] >> i) & 1; + } + final_eob++; + + *a = *l = (final_eob != !type); + *d->eob = (char)final_eob; +} +static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + int sum = 0; + int i; + BLOCKD *bd = &x->block[24]; + + if (bd->dequant[0] >= 35 && bd->dequant[1] >= 35) return; + + for (i = 0; i < (*bd->eob); ++i) { + int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]]; + sum += (coef >= 0) ? coef : -coef; + if (sum >= 35) return; + } + /************************************************************************** + our inverse hadamard transform effectively is weighted sum of all 16 inputs + with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And + dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the + output after inverse wht and idct will be all zero. A sum of absolute value + smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht + fall between -35 and +35. + **************************************************************************/ + if (sum < 35) { + for (i = 0; i < (*bd->eob); ++i) { + int rc = vp8_default_zig_zag1d[i]; + bd->qcoeff[rc] = 0; + bd->dqcoeff[rc] = 0; + } + *bd->eob = 0; + *a = *l = (*bd->eob != !type); + } +} + +static void optimize_mb(MACROBLOCK *x) { + int b; + int type; + int has_2nd_order; + + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + + memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; + + for (b = 0; b < 16; ++b) { + optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]); + } + + for (b = 16; b < 24; ++b) { + optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + } + + if (has_2nd_order) { + b = 24; + optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + } +} + +void vp8_optimize_mby(MACROBLOCK *x) { + int b; + int type; + int has_2nd_order; + + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + + if (!x->e_mbd.above_context) return; + + if (!x->e_mbd.left_context) return; + + memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; + + for (b = 0; b < 16; ++b) { + optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]); + } + + if (has_2nd_order) { + b = 24; + optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + } +} + +void vp8_optimize_mbuv(MACROBLOCK *x) { + int b; + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + + if (!x->e_mbd.above_context) return; + + if (!x->e_mbd.left_context) return; + + memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + for (b = 16; b < 24; ++b) { + optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b], + tl + vp8_block2left[b]); + } +} + +void vp8_encode_inter16x16(MACROBLOCK *x) { + vp8_build_inter_predictors_mb(&x->e_mbd); + + vp8_subtract_mb(x); + + transform_mb(x); + + vp8_quantize_mb(x); + + if (x->optimize) optimize_mb(x); +} + +/* this funciton is used by first pass only */ +void vp8_encode_inter16x16y(MACROBLOCK *x) { + BLOCK *b = &x->block[0]; + + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer, + x->e_mbd.dst.y_stride); + + vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride, + x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); + + transform_mby(x); + + vp8_quantize_mby(x); + + vp8_inverse_transform_mby(&x->e_mbd); +} diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.h b/media/libvpx/libvpx/vp8/encoder/encodemb.h new file mode 100644 index 0000000000..db577ddc10 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodemb.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ENCODEMB_H_ +#define VPX_VP8_ENCODER_ENCODEMB_H_ + +#include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif +void vp8_encode_inter16x16(MACROBLOCK *x); + +void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch); +void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride); +void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride); + +void vp8_build_dcblock(MACROBLOCK *b); +void vp8_transform_mb(MACROBLOCK *mb); +void vp8_transform_mbuv(MACROBLOCK *x); +void vp8_transform_intra_mby(MACROBLOCK *x); + +void vp8_optimize_mby(MACROBLOCK *x); +void vp8_optimize_mbuv(MACROBLOCK *x); +void vp8_encode_inter16x16y(MACROBLOCK *x); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_ENCODEMB_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.c b/media/libvpx/libvpx/vp8/encoder/encodemv.c new file mode 100644 index 0000000000..384bb29389 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodemv.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/common.h" +#include "encodemv.h" +#include "vp8/common/entropymode.h" +#include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" + +#include + +static void encode_mvcomponent(vp8_writer *const w, const int v, + const struct mv_context *mvc) { + const vp8_prob *p = mvc->prob; + const int x = v < 0 ? -v : v; + + if (x < mvnum_short) { /* Small */ + vp8_write(w, 0, p[mvpis_short]); + vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); + + if (!x) return; /* no sign bit */ + } else { /* Large */ + int i = 0; + + vp8_write(w, 1, p[mvpis_short]); + + do { + vp8_write(w, (x >> i) & 1, p[MVPbits + i]); + } while (++i < 3); + + i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ + + do { + vp8_write(w, (x >> i) & 1, p[MVPbits + i]); + } while (--i > 3); + + if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]); + } + + vp8_write(w, v < 0, p[MVPsign]); +} +#if 0 +static int max_mv_r = 0; +static int max_mv_c = 0; +#endif +void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, + const MV_CONTEXT *mvc) { +#if 0 + { + if (abs(mv->row >> 1) > max_mv_r) + { + FILE *f = fopen("maxmv.stt", "a"); + max_mv_r = abs(mv->row >> 1); + fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1)); + + if ((abs(mv->row) / 2) != max_mv_r) + fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2); + + fclose(f); + } + + if (abs(mv->col >> 1) > max_mv_c) + { + FILE *f = fopen("maxmv.stt", "a"); + fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1)); + max_mv_c = abs(mv->col >> 1); + fclose(f); + } + } +#endif + + encode_mvcomponent(w, mv->row >> 1, &mvc[0]); + encode_mvcomponent(w, mv->col >> 1, &mvc[1]); +} + +static unsigned int cost_mvcomponent(const int v, + const struct mv_context *mvc) { + const vp8_prob *p = mvc->prob; + const int x = v; + unsigned int cost; + + if (x < mvnum_short) { + cost = vp8_cost_zero(p[mvpis_short]) + + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3); + + if (!x) return cost; + } else { + int i = 0; + cost = vp8_cost_one(p[mvpis_short]); + + do { + cost += vp8_cost_bit(p[MVPbits + i], (x >> i) & 1); + + } while (++i < 3); + + i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ + + do { + cost += vp8_cost_bit(p[MVPbits + i], (x >> i) & 1); + + } while (--i > 3); + + if (x & 0xFFF0) cost += vp8_cost_bit(p[MVPbits + 3], (x >> 3) & 1); + } + + return cost; /* + vp8_cost_bit( p [MVPsign], v < 0); */ +} + +void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, + int mvc_flag[2]) { + int i = 1; + unsigned int cost0 = 0; + unsigned int cost1 = 0; + + vpx_clear_system_state(); + + i = 1; + + if (mvc_flag[0]) { + mvcost[0][0] = cost_mvcomponent(0, &mvc[0]); + + do { + cost0 = cost_mvcomponent(i, &mvc[0]); + + mvcost[0][i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]); + mvcost[0][-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]); + } while (++i <= mv_max); + } + + i = 1; + + if (mvc_flag[1]) { + mvcost[1][0] = cost_mvcomponent(0, &mvc[1]); + + do { + cost1 = cost_mvcomponent(i, &mvc[1]); + + mvcost[1][i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]); + mvcost[1][-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]); + } while (++i <= mv_max); + } +} + +/* Motion vector probability table update depends on benefit. + * Small correction allows for the fact that an update to an MV probability + * may have benefit in subsequent frames as well as the current one. + */ +#define MV_PROB_UPDATE_CORRECTION -1 + +static void calc_prob(vp8_prob *p, const unsigned int ct[2]) { + const unsigned int tot = ct[0] + ct[1]; + + if (tot) { + const vp8_prob x = ((ct[0] * 255) / tot) & ~1u; + *p = x ? x : 1; + } +} + +static void update(vp8_writer *const w, const unsigned int ct[2], + vp8_prob *const cur_p, const vp8_prob new_p, + const vp8_prob update_p, int *updated) { + const int cur_b = vp8_cost_branch(ct, *cur_p); + const int new_b = vp8_cost_branch(ct, new_p); + const int cost = + 7 + MV_PROB_UPDATE_CORRECTION + + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8); + + if (cur_b - new_b > cost) { + *cur_p = new_p; + vp8_write(w, 1, update_p); + vp8_write_literal(w, new_p >> 1, 7); + *updated = 1; + + } else + vp8_write(w, 0, update_p); +} + +static void write_component_probs(vp8_writer *const w, + struct mv_context *cur_mvc, + const struct mv_context *default_mvc_, + const struct mv_context *update_mvc, + const unsigned int events[MVvals], + unsigned int rc, int *updated) { + vp8_prob *Pcur = cur_mvc->prob; + const vp8_prob *default_mvc = default_mvc_->prob; + const vp8_prob *Pupdate = update_mvc->prob; + unsigned int is_short_ct[2], sign_ct[2]; + + unsigned int bit_ct[mvlong_width][2]; + + unsigned int short_ct[mvnum_short]; + unsigned int short_bct[mvnum_short - 1][2]; + + vp8_prob Pnew[MVPcount]; + + (void)rc; + vp8_copy_array(Pnew, default_mvc, MVPcount); + + vp8_zero(is_short_ct); + vp8_zero(sign_ct); + vp8_zero(bit_ct); + vp8_zero(short_ct); + vp8_zero(short_bct); + + /* j=0 */ + { + const int c = events[mv_max]; + + is_short_ct[0] += c; /* Short vector */ + short_ct[0] += c; /* Magnitude distribution */ + } + + /* j: 1 ~ mv_max (1023) */ + { + int j = 1; + + do { + const int c1 = events[mv_max + j]; /* positive */ + const int c2 = events[mv_max - j]; /* negative */ + const int c = c1 + c2; + int a = j; + + sign_ct[0] += c1; + sign_ct[1] += c2; + + if (a < mvnum_short) { + is_short_ct[0] += c; /* Short vector */ + short_ct[a] += c; /* Magnitude distribution */ + } else { + int k = mvlong_width - 1; + is_short_ct[1] += c; /* Long vector */ + + /* bit 3 not always encoded. */ + do { + bit_ct[k][(a >> k) & 1] += c; + + } while (--k >= 0); + } + } while (++j <= mv_max); + } + + calc_prob(Pnew + mvpis_short, is_short_ct); + + calc_prob(Pnew + MVPsign, sign_ct); + + { + vp8_prob p[mvnum_short - 1]; /* actually only need branch ct */ + int j = 0; + + vp8_tree_probs_from_distribution(8, vp8_small_mvencodings, vp8_small_mvtree, + p, short_bct, short_ct, 256, 1); + + do { + calc_prob(Pnew + MVPshort + j, short_bct[j]); + + } while (++j < mvnum_short - 1); + } + + { + int j = 0; + + do { + calc_prob(Pnew + MVPbits + j, bit_ct[j]); + + } while (++j < mvlong_width); + } + + update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, + updated); + + update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated); + + { + const vp8_prob *const new_p = Pnew + MVPshort; + vp8_prob *const cur_p = Pcur + MVPshort; + + int j = 0; + + do { + update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); + + } while (++j < mvnum_short - 1); + } + + { + const vp8_prob *const new_p = Pnew + MVPbits; + vp8_prob *const cur_p = Pcur + MVPbits; + + int j = 0; + + do { + update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); + + } while (++j < mvlong_width); + } +} + +void vp8_write_mvprobs(VP8_COMP *cpi) { + vp8_writer *const w = cpi->bc; + MV_CONTEXT *mvc = cpi->common.fc.mvc; + int flags[2] = { 0, 0 }; + write_component_probs(w, &mvc[0], &vp8_default_mv_context[0], + &vp8_mv_update_probs[0], cpi->mb.MVcount[0], 0, + &flags[0]); + write_component_probs(w, &mvc[1], &vp8_default_mv_context[1], + &vp8_mv_update_probs[1], cpi->mb.MVcount[1], 1, + &flags[1]); + + if (flags[0] || flags[1]) { + vp8_build_component_cost_table( + cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flags); + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.h b/media/libvpx/libvpx/vp8/encoder/encodemv.h new file mode 100644 index 0000000000..347b9feffe --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/encodemv.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ENCODEMV_H_ +#define VPX_VP8_ENCODER_ENCODEMV_H_ + +#include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_write_mvprobs(VP8_COMP *); +void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *); +void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, + int mvc_flag[2]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_ENCODEMV_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c new file mode 100644 index 0000000000..e2f8b89d46 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "onyx_int.h" +#include "vp8/common/threading.h" +#include "vp8/common/common.h" +#include "vp8/common/extend.h" +#include "bitstream.h" +#include "encodeframe.h" +#include "ethreading.h" + +#if CONFIG_MULTITHREAD + +extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, + int ok_to_skip); + +static THREAD_FUNCTION thread_loopfilter(void *p_data) { + VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); + VP8_COMMON *cm = &cpi->common; + + while (1) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + if (sem_wait(&cpi->h_event_start_lpf) == 0) { + /* we're shutting down */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + vp8_loopfilter_frame(cpi, cm); + + sem_post(&cpi->h_event_end_lpf); + } + } + + return 0; +} + +static THREAD_FUNCTION thread_encoding_proc(void *p_data) { + int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread; + VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1); + MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2); + ENTROPY_CONTEXT_PLANES mb_row_left_context; + + while (1) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { + const int nsync = cpi->mt_sync_range; + VP8_COMMON *cm = &cpi->common; + int mb_row; + MACROBLOCK *x = &mbri->mb; + MACROBLOCKD *xd = &x->e_mbd; + TOKENEXTRA *tp; +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24); + const int num_part = (1 << cm->multi_token_partition); +#endif + + int *segment_counts = mbri->segment_counts; + int *totalrate = &mbri->totalrate; + + /* we're shutting down */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); + xd->mode_info_stride = cm->mode_info_stride; + + for (mb_row = ithread + 1; mb_row < cm->mb_rows; + mb_row += (cpi->encoding_thread_count + 1)) { + int recon_yoffset, recon_uvoffset; + int mb_col; + int ref_fb_idx = cm->lst_fb_idx; + int dst_fb_idx = cm->new_fb_idx; + int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; + int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; + int map_index = (mb_row * cm->mb_cols); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; +#else + tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24)); + cpi->tplist[mb_row].start = tp; +#endif + + last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; + + /* reset above block coeffs */ + xd->above_context = cm->above_context; + xd->left_context = &mb_row_left_context; + + vp8_zero(mb_row_left_context); + + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + /* Set the mb activity pointer to the start of the row. */ + x->mb_activity_ptr = &cpi->mb_activity_map[map_index]; + + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } + + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); + } + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + tp = tp_start; +#endif + + /* Distance of Mb to the various image edges. + * These specified to 8th pel as they are always compared + * to values that are in 1/8th pel units + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; + + /* Set up limit values for motion vectors used to prevent + * them extending outside the UMV borders + */ + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + x->rddiv = cpi->RDDIV; + x->rdmult = cpi->RDMULT; + + /* Copy current mb to a buffer */ + vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp8_activity_masking(cpi, x); + + /* Is segmentation enabled */ + /* MB level adjustment to quantizer */ + if (xd->segmentation_enabled) { + /* Code to set segment id in xd->mbmi.segment_id for + * current MB (with range checking) + */ + if (cpi->segmentation_map[map_index + mb_col] <= 3) { + xd->mode_info_context->mbmi.segment_id = + cpi->segmentation_map[map_index + mb_col]; + } else { + xd->mode_info_context->mbmi.segment_id = 0; + } + + vp8cx_mb_init_quantizer(cpi, x, 1); + } else { + /* Set to Segment 0 by default */ + xd->mode_info_context->mbmi.segment_id = 0; + } + + x->active_ptr = cpi->active_map + map_index + mb_col; + + if (cm->frame_type == KEY_FRAME) { + *totalrate += vp8cx_encode_intra_macroblock(cpi, x, &tp); +#ifdef MODE_STATS + y_modes[xd->mbmi.mode]++; +#endif + } else { + *totalrate += vp8cx_encode_inter_macroblock( + cpi, x, &tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); + +#ifdef MODE_STATS + inter_y_modes[xd->mbmi.mode]++; + + if (xd->mbmi.mode == SPLITMV) { + int b; + + for (b = 0; b < xd->mbmi.partition_count; ++b) { + inter_b_modes[x->partition->bmi[b].mode]++; + } + } + +#endif + // Keep track of how many (consecutive) times a block + // is coded as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index + mb_col] < 255) { + cpi->consec_zero_last[map_index + mb_col] += 1; + } + if (cpi->consec_zero_last_mvbias[map_index + mb_col] < 255) { + cpi->consec_zero_last_mvbias[map_index + mb_col] += 1; + } + } else { + cpi->consec_zero_last[map_index + mb_col] = 0; + cpi->consec_zero_last_mvbias[map_index + mb_col] = 0; + } + if (x->zero_last_dot_suppress) { + cpi->consec_zero_last_mvbias[map_index + mb_col] = 0; + } + } + + /* Special case code for cyclic refresh + * If cyclic update enabled then copy + * xd->mbmi.segment_id; (which may have been updated + * based on mode during + * vp8cx_encode_inter_macroblock()) back into the + * global segmentation map + */ + if ((cpi->current_layer == 0) && + (cpi->cyclic_refresh_mode_enabled && + xd->segmentation_enabled)) { + const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id; + + /* If the block has been refreshed mark it as clean + * (the magnitude of the -ve influences how long it + * will be before we consider another refresh): + * Else if it was coded (last frame 0,0) and has + * not already been refreshed then mark it as a + * candidate for cleanup next time (marked 0) else + * mark it as dirty (1). + */ + if (mbmi->segment_id) { + cpi->cyclic_refresh_map[map_index + mb_col] = -1; + } else if ((mbmi->mode == ZEROMV) && + (mbmi->ref_frame == LAST_FRAME)) { + if (cpi->cyclic_refresh_map[map_index + mb_col] == 1) { + cpi->cyclic_refresh_map[map_index + mb_col] = 0; + } + } else { + cpi->cyclic_refresh_map[map_index + mb_col] = 1; + } + } + } + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + /* pack tokens for this MB */ + { + int tok_count = tp - tp_start; + vp8_pack_tokens(w, tp_start, tok_count); + } +#else + cpi->tplist[mb_row].stop = tp; +#endif + /* Increment pointer into gf usage flags structure. */ + x->gf_active_ptr++; + + /* Increment the activity mask pointers. */ + x->mb_activity_ptr++; + + /* adjust to the next column of macroblocks */ + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + + /* Keep track of segment usage */ + segment_counts[xd->mode_info_context->mbmi.segment_id]++; + + /* skip to next mb */ + xd->mode_info_context++; + x->partition_info++; + xd->above_context++; + } + + vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); + + vpx_atomic_store_release(current_mb_col, mb_col + nsync); + + /* this is to account for the border */ + xd->mode_info_context++; + x->partition_info++; + + x->src.y_buffer += + 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - + 16 * cm->mb_cols; + x->src.u_buffer += + 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - + 8 * cm->mb_cols; + x->src.v_buffer += + 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - + 8 * cm->mb_cols; + + xd->mode_info_context += + xd->mode_info_stride * cpi->encoding_thread_count; + x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count; + x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count; + } + /* Signal that this thread has completed processing its rows. */ + sem_post(&cpi->h_event_end_encoding[ithread]); + } + } + + /* printf("exit thread %d\n", ithread); */ + return 0; +} + +static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { + MACROBLOCK *x = mbsrc; + MACROBLOCK *z = mbdst; + int i; + + z->ss = x->ss; + z->ss_count = x->ss_count; + z->searches_per_step = x->searches_per_step; + z->errorperbit = x->errorperbit; + + z->sadperbit16 = x->sadperbit16; + z->sadperbit4 = x->sadperbit4; + + /* + z->mv_col_min = x->mv_col_min; + z->mv_col_max = x->mv_col_max; + z->mv_row_min = x->mv_row_min; + z->mv_row_max = x->mv_row_max; + */ + + z->short_fdct4x4 = x->short_fdct4x4; + z->short_fdct8x4 = x->short_fdct8x4; + z->short_walsh4x4 = x->short_walsh4x4; + z->quantize_b = x->quantize_b; + z->optimize = x->optimize; + + /* + z->mvc = x->mvc; + z->src.y_buffer = x->src.y_buffer; + z->src.u_buffer = x->src.u_buffer; + z->src.v_buffer = x->src.v_buffer; + */ + + z->mvcost[0] = x->mvcost[0]; + z->mvcost[1] = x->mvcost[1]; + z->mvsadcost[0] = x->mvsadcost[0]; + z->mvsadcost[1] = x->mvsadcost[1]; + + z->token_costs = x->token_costs; + z->inter_bmode_costs = x->inter_bmode_costs; + z->mbmode_cost = x->mbmode_cost; + z->intra_uv_mode_cost = x->intra_uv_mode_cost; + z->bmode_costs = x->bmode_costs; + + for (i = 0; i < 25; ++i) { + z->block[i].quant = x->block[i].quant; + z->block[i].quant_fast = x->block[i].quant_fast; + z->block[i].quant_shift = x->block[i].quant_shift; + z->block[i].zbin = x->block[i].zbin; + z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost; + z->block[i].round = x->block[i].round; + z->block[i].src_stride = x->block[i].src_stride; + } + + z->q_index = x->q_index; + z->act_zbin_adj = x->act_zbin_adj; + z->last_act_zbin_adj = x->last_act_zbin_adj; + + { + MACROBLOCKD *xd = &x->e_mbd; + MACROBLOCKD *zd = &z->e_mbd; + + /* + zd->mode_info_context = xd->mode_info_context; + zd->mode_info = xd->mode_info; + + zd->mode_info_stride = xd->mode_info_stride; + zd->frame_type = xd->frame_type; + zd->up_available = xd->up_available ; + zd->left_available = xd->left_available; + zd->left_context = xd->left_context; + zd->last_frame_dc = xd->last_frame_dc; + zd->last_frame_dccons = xd->last_frame_dccons; + zd->gold_frame_dc = xd->gold_frame_dc; + zd->gold_frame_dccons = xd->gold_frame_dccons; + zd->mb_to_left_edge = xd->mb_to_left_edge; + zd->mb_to_right_edge = xd->mb_to_right_edge; + zd->mb_to_top_edge = xd->mb_to_top_edge ; + zd->mb_to_bottom_edge = xd->mb_to_bottom_edge; + zd->gf_active_ptr = xd->gf_active_ptr; + zd->frames_since_golden = xd->frames_since_golden; + zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame; + */ + zd->subpixel_predict = xd->subpixel_predict; + zd->subpixel_predict8x4 = xd->subpixel_predict8x4; + zd->subpixel_predict8x8 = xd->subpixel_predict8x8; + zd->subpixel_predict16x16 = xd->subpixel_predict16x16; + zd->segmentation_enabled = xd->segmentation_enabled; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; + memcpy(zd->segment_feature_data, xd->segment_feature_data, + sizeof(xd->segment_feature_data)); + + memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc)); + memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1)); + memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2)); + memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv)); + +#if 1 + /*TODO: Remove dequant from BLOCKD. This is a temporary solution until + * the quantizer code uses a passed in pointer to the dequant constants. + * This will also require modifications to the x86 and neon assembly. + * */ + for (i = 0; i < 16; ++i) zd->block[i].dequant = zd->dequant_y1; + for (i = 16; i < 24; ++i) zd->block[i].dequant = zd->dequant_uv; + zd->block[24].dequant = zd->dequant_y2; +#endif + + memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes)); + memcpy(z->rd_thresh_mult, x->rd_thresh_mult, sizeof(x->rd_thresh_mult)); + + z->zbin_over_quant = x->zbin_over_quant; + z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled; + z->zbin_mode_boost = x->zbin_mode_boost; + + memset(z->error_bins, 0, sizeof(z->error_bins)); + } +} + +void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, + MB_ROW_COMP *mbr_ei, int count) { + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int i; + + for (i = 0; i < count; ++i) { + MACROBLOCK *mb = &mbr_ei[i].mb; + MACROBLOCKD *mbd = &mb->e_mbd; + + mbd->subpixel_predict = xd->subpixel_predict; + mbd->subpixel_predict8x4 = xd->subpixel_predict8x4; + mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; + mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; + mb->gf_active_ptr = x->gf_active_ptr; + + memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts)); + mbr_ei[i].totalrate = 0; + + mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1); + + mbd->frame_type = cm->frame_type; + + mb->src = *cpi->Source; + mbd->pre = cm->yv12_fb[cm->lst_fb_idx]; + mbd->dst = cm->yv12_fb[cm->new_fb_idx]; + + mb->src.y_buffer += 16 * x->src.y_stride * (i + 1); + mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1); + mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1); + + vp8_build_block_offsets(mb); + + mbd->left_context = &cm->left_context; + mb->mvc = cm->fc.mvc; + + setup_mbby_copy(&mbr_ei[i].mb, x); + + mbd->fullpixel_mask = ~0; + if (cm->full_pixel) mbd->fullpixel_mask = ~7; + + vp8_zero(mb->coef_counts); + vp8_zero(x->ymode_count); + mb->skip_true_count = 0; + vp8_zero(mb->MVcount); + mb->prediction_error = 0; + mb->intra_error = 0; + vp8_zero(mb->count_mb_ref_frame_usage); + mb->mbs_tested_so_far = 0; + mb->mbs_zero_last_dot_suppress = 0; + } +} + +int vp8cx_create_encoder_threads(VP8_COMP *cpi) { + const VP8_COMMON *cm = &cpi->common; + int th_count = 0; + + if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { + th_count = cpi->oxcf.multi_threaded - 1; + + /* don't allocate more threads than cores available */ + if (cpi->oxcf.multi_threaded > cm->processor_core_count) { + th_count = cm->processor_core_count - 1; + } + + /* we have th_count + 1 (main) threads processing one row each */ + /* no point to have more threads than the sync range allows */ + if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { + th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; + } + } + if (th_count == cpi->encoding_thread_count) return 0; + + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; + + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, + vpx_malloc(sizeof(pthread_t) * th_count)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, + vpx_malloc(sizeof(sem_t) * th_count)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, + vpx_malloc(sizeof(sem_t) * th_count)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, + vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); + memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, + vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); + + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); + cpi->encoding_thread_count = th_count; + + /* + printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", + (cpi->encoding_thread_count +1)); + */ + + for (ithread = 0; ithread < th_count; ++ithread) { + ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread]; + + /* Setup block ptrs and offsets */ + vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb); + vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd); + + sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); + sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); + + ethd->ithread = ithread; + ethd->ptr1 = (void *)cpi; + ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread]; + + rc = pthread_create(&cpi->h_encoding_thread[ithread], 0, + thread_encoding_proc, ethd); + if (rc) break; + } + + if (rc) { + /* shutdown other threads */ + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); + for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); + pthread_join(cpi->h_encoding_thread[ithread], 0); + sem_destroy(&cpi->h_event_start_encoding[ithread]); + sem_destroy(&cpi->h_event_end_encoding[ithread]); + } + + /* free thread related resources */ + vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; + vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; + vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; + vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; + vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; + + return -1; + } + + { + LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data; + + sem_init(&cpi->h_event_start_lpf, 0, 0); + sem_init(&cpi->h_event_end_lpf, 0, 0); + + lpfthd->ptr1 = (void *)cpi; + rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd); + + if (rc) { + /* shutdown other threads */ + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); + for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); + pthread_join(cpi->h_encoding_thread[ithread], 0); + sem_destroy(&cpi->h_event_start_encoding[ithread]); + sem_destroy(&cpi->h_event_end_encoding[ithread]); + } + sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_start_lpf); + + /* free thread related resources */ + vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; + vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; + vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; + vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; + vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; + + return -2; + } + } + } + return 0; +} + +void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + /* shutdown other threads */ + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); + { + int i; + + for (i = 0; i < cpi->encoding_thread_count; ++i) { + sem_post(&cpi->h_event_start_encoding[i]); + sem_post(&cpi->h_event_end_encoding[i]); + + pthread_join(cpi->h_encoding_thread[i], 0); + + sem_destroy(&cpi->h_event_start_encoding[i]); + sem_destroy(&cpi->h_event_end_encoding[i]); + } + + sem_post(&cpi->h_event_start_lpf); + pthread_join(cpi->h_filter_thread, 0); + } + + sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; + + /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; + vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; + vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; + vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; + vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; + } +} +#endif diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.h b/media/libvpx/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 0000000000..598fe60559 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ETHREADING_H_ +#define VPX_VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_ETHREADING_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.c b/media/libvpx/libvpx/vp8/encoder/firstpass.c new file mode 100644 index 0000000000..4443f5e7cd --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/firstpass.c @@ -0,0 +1,3090 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "block.h" +#include "onyx_int.h" +#include "vpx_dsp/variance.h" +#include "encodeintra.h" +#include "vp8/common/common.h" +#include "vp8/common/setupintrarecon.h" +#include "vp8/common/systemdependent.h" +#include "mcomp.h" +#include "firstpass.h" +#include "vpx_scale/vpx_scale.h" +#include "encodemb.h" +#include "vp8/common/extend.h" +#include "vpx_ports/system_state.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/swapyv12buffer.h" +#include "rdopt.h" +#include "vp8/common/quant_common.h" +#include "encodemv.h" +#include "encodeframe.h" + +#define OUTPUT_FPF 0 + +extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi); + +#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q] +extern int vp8_kf_boost_qadjustment[QINDEX_RANGE]; + +extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE]; + +#define IIFACTOR 1.5 +#define IIKFACTOR1 1.40 +#define IIKFACTOR2 1.5 +#define RMAX 14.0 +#define GF_RMAX 48.0 + +#define KF_MB_INTRA_MIN 300 +#define GF_MB_INTRA_MIN 200 + +#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X)-.000001 : (X) + .000001) + +#define POW1 (double)cpi->oxcf.two_pass_vbrbias / 100.0 +#define POW2 (double)cpi->oxcf.two_pass_vbrbias / 100.0 + +#define NEW_BOOST 1 + +static int vscale_lookup[7] = { 0, 1, 1, 2, 2, 3, 3 }; +static int hscale_lookup[7] = { 0, 0, 1, 1, 2, 2, 3 }; + +static const int cq_level[QINDEX_RANGE] = { + 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, + 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24, + 24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53, + 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 +}; + +static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame); + +/* Resets the first pass file to the given position using a relative seek + * from the current position + */ +static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position) { + cpi->twopass.stats_in = Position; +} + +static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) { + if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) return EOF; + + *next_frame = *cpi->twopass.stats_in; + return 1; +} + +/* Read frame stats at an offset from the current position */ +static int read_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *frame_stats, + int offset) { + FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in; + + /* Check legality of offset */ + if (offset >= 0) { + if (&fps_ptr[offset] >= cpi->twopass.stats_in_end) return EOF; + } else if (offset < 0) { + if (&fps_ptr[offset] < cpi->twopass.stats_in_start) return EOF; + } + + *frame_stats = fps_ptr[offset]; + return 1; +} + +static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) { + if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) return EOF; + + *fps = *cpi->twopass.stats_in; + cpi->twopass.stats_in = + (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS)); + return 1; +} + +static void output_stats(struct vpx_codec_pkt_list *pktlist, + FIRSTPASS_STATS *stats) { + struct vpx_codec_cx_pkt pkt; + pkt.kind = VPX_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); + vpx_codec_pkt_list_add(pktlist, &pkt); + +/* TEMP debug code */ +#if OUTPUT_FPF + + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f" + " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" + " %12.0f %12.0f %12.4f\n", + stats->frame, stats->intra_error, stats->coded_error, + stats->ssim_weighted_pred_err, stats->pcnt_inter, + stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, + stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +static void zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->intra_error = 0.0; + section->coded_error = 0.0; + section->ssim_weighted_pred_err = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->new_mv_count = 0.0; + section->count = 0.0; + section->duration = 1.0; +} + +static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->intra_error += frame->intra_error; + section->coded_error += frame->coded_error; + section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->intra_error -= frame->intra_error; + section->coded_error -= frame->coded_error; + section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +static void avg_stats(FIRSTPASS_STATS *section) { + if (section->count < 1.0) return; + + section->intra_error /= section->count; + section->coded_error /= section->count; + section->ssim_weighted_pred_err /= section->count; + section->pcnt_inter /= section->count; + section->pcnt_second_ref /= section->count; + section->pcnt_neutral /= section->count; + section->pcnt_motion /= section->count; + section->MVr /= section->count; + section->mvr_abs /= section->count; + section->MVc /= section->count; + section->mvc_abs /= section->count; + section->MVrv /= section->count; + section->MVcv /= section->count; + section->mv_in_out_count /= section->count; + section->duration /= section->count; +} + +/* Calculate a modified Error used in distributing bits between easier + * and harder frames + */ +static double calculate_modified_err(VP8_COMP *cpi, + FIRSTPASS_STATS *this_frame) { + double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err / + cpi->twopass.total_stats.count); + double this_err = this_frame->ssim_weighted_pred_err; + double modified_err; + + if (this_err > av_err) { + modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1); + } else { + modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2); + } + + return modified_err; +} + +static const double weight_table[256] = { + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500, + 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250, + 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000, + 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, + 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, + 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000 +}; + +static double simple_weight(YV12_BUFFER_CONFIG *source) { + int i, j; + + unsigned char *src = source->y_buffer; + double sum_weights = 0.0; + + /* Loop throught the Y plane raw examining levels and creating a weight + * for the image + */ + i = source->y_height; + do { + j = source->y_width; + do { + sum_weights += weight_table[*src]; + src++; + } while (--j); + src -= source->y_width; + src += source->y_stride; + } while (--i); + + sum_weights /= (source->y_height * source->y_width); + + return sum_weights; +} + +/* This function returns the current per frame maximum bitrate target */ +static int frame_max_bits(VP8_COMP *cpi) { + /* Max allocation for a single frame based on the max section guidelines + * passed in and how many bits are left + */ + int max_bits; + + /* For CBR we need to also consider buffer fullness. + * If we are running below the optimal level then we need to gradually + * tighten up on max_bits. + */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + double buffer_fullness_ratio = + (double)cpi->buffer_level / + DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level); + + /* For CBR base this on the target average bits per frame plus the + * maximum sedction rate passed in by the user + */ + max_bits = (int)(cpi->av_per_frame_bandwidth * + ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + + /* If our buffer is below the optimum level */ + if (buffer_fullness_ratio < 1.0) { + /* The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4. */ + int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) + ? cpi->av_per_frame_bandwidth >> 2 + : max_bits >> 2; + + max_bits = (int)(max_bits * buffer_fullness_ratio); + + /* Lowest value we will set ... which should allow the buffer to + * refill. + */ + if (max_bits < min_max_bits) max_bits = min_max_bits; + } + } + /* VBR */ + else { + /* For VBR base this on the bits and frames left plus the + * two_pass_vbrmax_section rate passed in by the user + */ + max_bits = (int)(((double)cpi->twopass.bits_left / + (cpi->twopass.total_stats.count - + (double)cpi->common.current_video_frame)) * + ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + } + + /* Trap case where we are out of bits */ + if (max_bits < 0) max_bits = 0; + + return max_bits; +} + +void vp8_init_first_pass(VP8_COMP *cpi) { + zero_stats(&cpi->twopass.total_stats); +} + +void vp8_end_first_pass(VP8_COMP *cpi) { + output_stats(cpi->output_pkt_list, &cpi->twopass.total_stats); +} + +static void zz_motion_search(MACROBLOCK *x, YV12_BUFFER_CONFIG *raw_buffer, + int *raw_motion_err, + YV12_BUFFER_CONFIG *recon_buffer, + int *best_motion_err, int recon_yoffset) { + MACROBLOCKD *const xd = &x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + + unsigned char *src_ptr = (*(b->base_src) + b->src); + int src_stride = b->src_stride; + unsigned char *raw_ptr; + int raw_stride = raw_buffer->y_stride; + unsigned char *ref_ptr; + int ref_stride = x->e_mbd.pre.y_stride; + + /* Set up pointers for this macro block raw buffer */ + raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset); + vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride, + (unsigned int *)(raw_motion_err)); + + /* Set up pointers for this macro block recon buffer */ + xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; + ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset); + vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride, + (unsigned int *)(best_motion_err)); +} + +static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, + int_mv *ref_mv, MV *best_mv, + YV12_BUFFER_CONFIG *recon_buffer, + int *best_motion_err, int recon_yoffset) { + MACROBLOCKD *const xd = &x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + int num00; + + int_mv tmp_mv; + int_mv ref_mv_full; + + int tmp_err; + int step_param = 3; /* Don't search over full range for first pass */ + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + int n; + vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; + int new_mv_mode_penalty = 256; + + /* override the default variance function to use MSE */ + v_fn_ptr.vf = vpx_mse16x16; + + /* Set up pointers for this macro block recon buffer */ + xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; + + /* Initial step/diamond search centred on best mv */ + tmp_mv.as_int = 0; + ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3; + ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3; + tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param, + x->sadperbit16, &num00, &v_fn_ptr, + x->mvcost, ref_mv); + if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + best_mv->row = tmp_mv.as_mv.row; + best_mv->col = tmp_mv.as_mv.col; + } + + /* Further step/diamond searches as necessary */ + n = num00; + num00 = 0; + + while (n < further_steps) { + n++; + + if (num00) { + num00--; + } else { + tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, + step_param + n, x->sadperbit16, &num00, + &v_fn_ptr, x->mvcost, ref_mv); + if (tmp_err < INT_MAX - new_mv_mode_penalty) { + tmp_err += new_mv_mode_penalty; + } + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + best_mv->row = tmp_mv.as_mv.row; + best_mv->col = tmp_mv.as_mv.col; + } + } + } +} + +void vp8_first_pass(VP8_COMP *cpi) { + int mb_row, mb_col; + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + + int recon_yoffset, recon_uvoffset; + YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + int recon_y_stride = lst_yv12->y_stride; + int recon_uv_stride = lst_yv12->uv_stride; + int64_t intra_error = 0; + int64_t coded_error = 0; + + int sum_mvr = 0, sum_mvc = 0; + int sum_mvr_abs = 0, sum_mvc_abs = 0; + int sum_mvrs = 0, sum_mvcs = 0; + int mvcount = 0; + int intercount = 0; + int second_ref_count = 0; + int intrapenalty = 256; + int neutral_count = 0; + int new_mv_count = 0; + int sum_in_vectors = 0; + uint32_t lastmv_as_int = 0; + + int_mv zero_ref_mv; + + zero_ref_mv.as_int = 0; + + vpx_clear_system_state(); + + x->src = *cpi->Source; + xd->pre = *lst_yv12; + xd->dst = *new_yv12; + + x->partition_info = x->pi; + + xd->mode_info_context = cm->mi; + + if (!cm->use_bilinear_mc_filter) { + xd->subpixel_predict = vp8_sixtap_predict4x4; + xd->subpixel_predict8x4 = vp8_sixtap_predict8x4; + xd->subpixel_predict8x8 = vp8_sixtap_predict8x8; + xd->subpixel_predict16x16 = vp8_sixtap_predict16x16; + } else { + xd->subpixel_predict = vp8_bilinear_predict4x4; + xd->subpixel_predict8x4 = vp8_bilinear_predict8x4; + xd->subpixel_predict8x8 = vp8_bilinear_predict8x8; + xd->subpixel_predict16x16 = vp8_bilinear_predict16x16; + } + + vp8_build_block_offsets(x); + + /* set up frame new frame for intra coded blocks */ + vp8_setup_intra_recon(new_yv12); + vp8cx_frame_init_quantizer(cpi); + + /* Initialise the MV cost table to the defaults */ + { + int flag[2] = { 1, 1 }; + vp8_initialize_rd_consts(cpi, x, + vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q)); + memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context)); + vp8_build_component_cost_table(cpi->mb.mvcost, + (const MV_CONTEXT *)cm->fc.mvc, flag); + } + + /* for each macroblock row in image */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + int_mv best_ref_mv; + + best_ref_mv.as_int = 0; + + /* reset above block coeffs */ + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8); + + /* Set up limit values for motion vectors to prevent them extending + * outside the UMV borders + */ + x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16); + + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int this_error; + int gf_motion_error = INT_MAX; + int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + + xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset; + xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset; + xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + + /* Copy current mb to a buffer */ + vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); + + /* do intra 16x16 prediction */ + this_error = vp8_encode_intra(x, use_dc_pred); + + /* "intrapenalty" below deals with situations where the intra + * and inter error scores are very low (eg a plain black frame) + * We do not have special cases in first pass for 0,0 and + * nearest etc so all inter modes carry an overhead cost + * estimate fot the mv. When the error score is very low this + * causes us to pick all or lots of INTRA modes and throw lots + * of key frames. This penalty adds a cost matching that of a + * 0,0 mv to the intra case. + */ + this_error += intrapenalty; + + /* Cumulative intra error total */ + intra_error += (int64_t)this_error; + + /* Set up limit values for motion vectors to prevent them + * extending outside the UMV borders + */ + x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16)); + x->mv_col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16); + + /* Other than for the first frame do a motion search */ + if (cm->current_video_frame > 0) { + BLOCKD *d = &x->e_mbd.block[0]; + MV tmp_mv = { 0, 0 }; + int tmp_err; + int motion_error = INT_MAX; + int raw_motion_error = INT_MAX; + + /* Simple 0,0 motion with no mv overhead */ + zz_motion_search(x, cpi->last_frame_unscaled_source, &raw_motion_error, + lst_yv12, &motion_error, recon_yoffset); + d->bmi.mv.as_mv.row = 0; + d->bmi.mv.as_mv.col = 0; + + if (raw_motion_error < cpi->oxcf.encode_breakout) { + goto skip_motion_search; + } + + /* Test last reference frame using the previous best mv as the + * starting point (best reference) for the search + */ + first_pass_motion_search(cpi, x, &best_ref_mv, &d->bmi.mv.as_mv, + lst_yv12, &motion_error, recon_yoffset); + + /* If the current best reference mv is not centred on 0,0 + * then do a 0,0 based search as well + */ + if (best_ref_mv.as_int) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, lst_yv12, + &tmp_err, recon_yoffset); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + d->bmi.mv.as_mv.row = tmp_mv.row; + d->bmi.mv.as_mv.col = tmp_mv.col; + } + } + + /* Experimental search in a second reference frame ((0,0) + * based only) + */ + if (cm->current_video_frame > 1) { + first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, + &gf_motion_error, recon_yoffset); + + if ((gf_motion_error < motion_error) && + (gf_motion_error < this_error)) { + second_ref_count++; + } + + /* Reset to last frame as reference buffer */ + xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset; + xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset; + xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset; + } + + skip_motion_search: + /* Intra assumed best */ + best_ref_mv.as_int = 0; + + if (motion_error <= this_error) { + /* Keep a count of cases where the inter and intra were + * very close and very low. This helps with scene cut + * detection for example in cropped clips with black bars + * at the sides or top and bottom. + */ + if ((((this_error - intrapenalty) * 9) <= (motion_error * 10)) && + (this_error < (2 * intrapenalty))) { + neutral_count++; + } + + d->bmi.mv.as_mv.row *= 8; + d->bmi.mv.as_mv.col *= 8; + this_error = motion_error; + vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv); + vp8_encode_inter16x16y(x); + sum_mvr += d->bmi.mv.as_mv.row; + sum_mvr_abs += abs(d->bmi.mv.as_mv.row); + sum_mvc += d->bmi.mv.as_mv.col; + sum_mvc_abs += abs(d->bmi.mv.as_mv.col); + sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row; + sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col; + intercount++; + + best_ref_mv.as_int = d->bmi.mv.as_int; + + /* Was the vector non-zero */ + if (d->bmi.mv.as_int) { + mvcount++; + + /* Was it different from the last non zero vector */ + if (d->bmi.mv.as_int != lastmv_as_int) new_mv_count++; + lastmv_as_int = d->bmi.mv.as_int; + + /* Does the Row vector point inwards or outwards */ + if (mb_row < cm->mb_rows / 2) { + if (d->bmi.mv.as_mv.row > 0) { + sum_in_vectors--; + } else if (d->bmi.mv.as_mv.row < 0) { + sum_in_vectors++; + } + } else if (mb_row > cm->mb_rows / 2) { + if (d->bmi.mv.as_mv.row > 0) { + sum_in_vectors++; + } else if (d->bmi.mv.as_mv.row < 0) { + sum_in_vectors--; + } + } + + /* Does the Row vector point inwards or outwards */ + if (mb_col < cm->mb_cols / 2) { + if (d->bmi.mv.as_mv.col > 0) { + sum_in_vectors--; + } else if (d->bmi.mv.as_mv.col < 0) { + sum_in_vectors++; + } + } else if (mb_col > cm->mb_cols / 2) { + if (d->bmi.mv.as_mv.col > 0) { + sum_in_vectors++; + } else if (d->bmi.mv.as_mv.col < 0) { + sum_in_vectors--; + } + } + } + } + } + + coded_error += (int64_t)this_error; + + /* adjust to the next column of macroblocks */ + x->src.y_buffer += 16; + x->src.u_buffer += 8; + x->src.v_buffer += 8; + + recon_yoffset += 16; + recon_uvoffset += 8; + } + + /* adjust to the next row of mbs */ + x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols; + x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols; + + /* extend the recon for intra prediction */ + vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, + xd->dst.v_buffer + 8); + vpx_clear_system_state(); + } + + vpx_clear_system_state(); + { + double weight = 0.0; + + FIRSTPASS_STATS fps; + + fps.frame = cm->current_video_frame; + fps.intra_error = (double)(intra_error >> 8); + fps.coded_error = (double)(coded_error >> 8); + weight = simple_weight(cpi->Source); + + if (weight < 0.1) weight = 0.1; + + fps.ssim_weighted_pred_err = fps.coded_error * weight; + + fps.pcnt_inter = 0.0; + fps.pcnt_motion = 0.0; + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.count = 1.0; + + fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs; + fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs; + fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs; + + if (mvcount > 0) { + fps.MVr = (double)sum_mvr / (double)mvcount; + fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount; + fps.MVc = (double)sum_mvc / (double)mvcount; + fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount; + fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / + (double)mvcount; + fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / + (double)mvcount; + fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2); + fps.new_mv_count = new_mv_count; + + fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs; + } + + /* TODO: handle the case when duration is set to 0, or something less + * than the full time between subsequent cpi->source_time_stamps + */ + fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); + + /* don't want to do output stats with a stack variable! */ + memcpy(&cpi->twopass.this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); + output_stats(cpi->output_pkt_list, &cpi->twopass.this_frame_stats); + accumulate_stats(&cpi->twopass.total_stats, &fps); + } + + /* Copy the previous Last Frame into the GF buffer if specific + * conditions for doing so are met + */ + if ((cm->current_video_frame > 0) && + (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && + ((cpi->twopass.this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) > + 2.0)) { + vp8_yv12_copy_frame(lst_yv12, gld_yv12); + } + + /* swap frame pointers so last frame refers to the frame we just + * compressed + */ + vp8_swap_yv12_buffer(lst_yv12, new_yv12); + vp8_yv12_extend_frame_borders(lst_yv12); + + /* Special case for the first frame. Copy into the GF buffer as a + * second reference. + */ + if (cm->current_video_frame == 0) { + vp8_yv12_copy_frame(lst_yv12, gld_yv12); + } + + cm->current_video_frame++; +} +extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; + +/* Estimate a cost per mb attributable to overheads such as the coding of + * modes and motion vectors. + * Currently simplistic in its assumptions for testing. + */ + +static double bitcost(double prob) { + if (prob > 0.000122) { + return -log(prob) / log(2.0); + } else { + return 13.0; + } +} +static int64_t estimate_modemvcost(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats) { + int mv_cost; + int64_t mode_cost; + + double av_pct_inter = fpstats->pcnt_inter / fpstats->count; + double av_pct_motion = fpstats->pcnt_motion / fpstats->count; + double av_intra = (1.0 - av_pct_inter); + + double zz_cost; + double motion_cost; + double intra_cost; + + zz_cost = bitcost(av_pct_inter - av_pct_motion); + motion_cost = bitcost(av_pct_motion); + intra_cost = bitcost(av_intra); + + /* Estimate of extra bits per mv overhead for mbs + * << 9 is the normalization to the (bits * 512) used in vp8_bits_per_mb + */ + mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9; + + /* Crude estimate of overhead cost from modes + * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb + */ + mode_cost = + (int64_t)((((av_pct_inter - av_pct_motion) * zz_cost) + + (av_pct_motion * motion_cost) + (av_intra * intra_cost)) * + cpi->common.MBs) * + 512; + + return mv_cost + mode_cost; +} + +static double calc_correction_factor(double err_per_mb, double err_devisor, + double pt_low, double pt_high, int Q) { + double power_term; + double error_term = err_per_mb / err_devisor; + double correction_factor; + + /* Adjustment based on Q to power term. */ + power_term = pt_low + (Q * 0.01); + power_term = (power_term > pt_high) ? pt_high : power_term; + + /* Adjustments to error term */ + /* TBD */ + + /* Calculate correction factor */ + correction_factor = pow(error_term, power_term); + + /* Clip range */ + correction_factor = (correction_factor < 0.05) ? 0.05 + : (correction_factor > 5.0) ? 5.0 + : correction_factor; + + return correction_factor; +} + +static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh, int overhead_bits) { + int Q; + int num_mbs = cpi->common.MBs; + int target_norm_bits_per_mb; + + double section_err = (fpstats->coded_error / fpstats->count); + double err_per_mb = section_err / num_mbs; + double err_correction_factor; + double speed_correction = 1.0; + int overhead_bits_per_mb; + + if (section_target_bandwitdh <= 0) { + return cpi->twopass.maxq_max_limit; /* Highest value allowed */ + } + + target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) + ? (512 * section_target_bandwitdh) / num_mbs + : 512 * (section_target_bandwitdh / num_mbs); + + /* Calculate a corrective factor based on a rolling ratio of bits spent + * vs target bits + */ + if ((cpi->rolling_target_bits > 0) && + (cpi->active_worst_quality < cpi->worst_quality)) { + double rolling_ratio; + + rolling_ratio = + (double)cpi->rolling_actual_bits / (double)cpi->rolling_target_bits; + + if (rolling_ratio < 0.95) { + cpi->twopass.est_max_qcorrection_factor -= 0.005; + } else if (rolling_ratio > 1.05) { + cpi->twopass.est_max_qcorrection_factor += 0.005; + } + + cpi->twopass.est_max_qcorrection_factor = + (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1 + : (cpi->twopass.est_max_qcorrection_factor > 10.0) + ? 10.0 + : cpi->twopass.est_max_qcorrection_factor; + } + + /* Corrections for higher compression speed settings + * (reduced compression expected) + */ + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) { + if (cpi->oxcf.cpu_used <= 5) { + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + } else { + speed_correction = 1.25; + } + } + + /* Estimate of overhead bits per mb */ + /* Correction to overhead bits for min allowed Q. */ + overhead_bits_per_mb = overhead_bits / num_mbs; + overhead_bits_per_mb = (int)(overhead_bits_per_mb * + pow(0.98, (double)cpi->twopass.maxq_min_limit)); + + /* Try and pick a max Q that will be high enough to encode the + * content at the given rate. + */ + for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; ++Q) { + int bits_per_mb_at_this_q; + + /* Error per MB based correction factor */ + err_correction_factor = + calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q); + + bits_per_mb_at_this_q = + vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; + + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); + + /* Mode and motion overhead */ + /* As Q rises in real encode loop rd code will force overhead down + * We make a crude adjustment for this here as *.98 per Q step. + */ + overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; + } + + /* Restriction on active max q for constrained quality mode. */ + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (Q < cpi->cq_target_quality)) { + Q = cpi->cq_target_quality; + } + + /* Adjust maxq_min_limit and maxq_max_limit limits based on + * average q observed in clip for non kf/gf.arf frames + * Give average a chance to settle though. + */ + if ((cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8)) && + (cpi->ni_frames > 150)) { + cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality) + ? (cpi->ni_av_qi + 32) + : cpi->worst_quality; + cpi->twopass.maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality) + ? (cpi->ni_av_qi - 32) + : cpi->best_quality; + } + + return Q; +} + +/* For cq mode estimate a cq level that matches the observed + * complexity and data rate. + */ +static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh, int overhead_bits) { + int Q; + int num_mbs = cpi->common.MBs; + int target_norm_bits_per_mb; + + double section_err = (fpstats->coded_error / fpstats->count); + double err_per_mb = section_err / num_mbs; + double err_correction_factor; + double speed_correction = 1.0; + double clip_iiratio; + double clip_iifactor; + int overhead_bits_per_mb; + + target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) + ? (512 * section_target_bandwitdh) / num_mbs + : 512 * (section_target_bandwitdh / num_mbs); + + /* Estimate of overhead bits per mb */ + overhead_bits_per_mb = overhead_bits / num_mbs; + + /* Corrections for higher compression speed settings + * (reduced compression expected) + */ + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) { + if (cpi->oxcf.cpu_used <= 5) { + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + } else { + speed_correction = 1.25; + } + } + + /* II ratio correction factor for clip as a whole */ + clip_iiratio = cpi->twopass.total_stats.intra_error / + DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); + clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); + if (clip_iifactor < 0.80) clip_iifactor = 0.80; + + /* Try and pick a Q that can encode the content at the given rate. */ + for (Q = 0; Q < MAXQ; ++Q) { + int bits_per_mb_at_this_q; + + /* Error per MB based correction factor */ + err_correction_factor = + calc_correction_factor(err_per_mb, 100.0, 0.40, 0.90, Q); + + bits_per_mb_at_this_q = + vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; + + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); + + /* Mode and motion overhead */ + /* As Q rises in real encode loop rd code will force overhead down + * We make a crude adjustment for this here as *.98 per Q step. + */ + overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; + } + + /* Clip value to range "best allowed to (worst allowed - 1)" */ + Q = cq_level[Q]; + if (Q >= cpi->worst_quality) Q = cpi->worst_quality - 1; + if (Q < cpi->best_quality) Q = cpi->best_quality; + + return Q; +} + +static int estimate_q(VP8_COMP *cpi, double section_err, + int section_target_bandwitdh) { + int Q; + int num_mbs = cpi->common.MBs; + int target_norm_bits_per_mb; + + double err_per_mb = section_err / num_mbs; + double err_correction_factor; + double speed_correction = 1.0; + + target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) + ? (512 * section_target_bandwitdh) / num_mbs + : 512 * (section_target_bandwitdh / num_mbs); + + /* Corrections for higher compression speed settings + * (reduced compression expected) + */ + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) { + if (cpi->oxcf.cpu_used <= 5) { + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + } else { + speed_correction = 1.25; + } + } + + /* Try and pick a Q that can encode the content at the given rate. */ + for (Q = 0; Q < MAXQ; ++Q) { + int bits_per_mb_at_this_q; + + /* Error per MB based correction factor */ + err_correction_factor = + calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q); + + bits_per_mb_at_this_q = + (int)(.5 + (err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0)); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; + } + + return Q; +} + +/* Estimate a worst case Q for a KF group */ +static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, + int section_target_bandwitdh, + double group_iiratio) { + int Q; + int num_mbs = cpi->common.MBs; + int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs; + int bits_per_mb_at_this_q; + + double err_per_mb = section_err / num_mbs; + double err_correction_factor; + double speed_correction = 1.0; + double current_spend_ratio = 1.0; + + double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90; + double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80; + + double iiratio_correction_factor = 1.0; + + double combined_correction_factor; + + /* Trap special case where the target is <= 0 */ + if (target_norm_bits_per_mb <= 0) return MAXQ * 2; + + /* Calculate a corrective factor based on a rolling ratio of bits spent + * vs target bits + * This is clamped to the range 0.1 to 10.0 + */ + if (cpi->long_rolling_target_bits <= 0) { + current_spend_ratio = 10.0; + } else { + current_spend_ratio = (double)cpi->long_rolling_actual_bits / + (double)cpi->long_rolling_target_bits; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 + : current_spend_ratio; + } + + /* Calculate a correction factor based on the quality of prediction in + * the sequence as indicated by intra_inter error score ratio (IIRatio) + * The idea here is to favour subsampling in the hardest sections vs + * the easyest. + */ + iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1); + + if (iiratio_correction_factor < 0.5) iiratio_correction_factor = 0.5; + + /* Corrections for higher compression speed settings + * (reduced compression expected) + */ + if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1)) { + if (cpi->oxcf.cpu_used <= 5) { + speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + } else { + speed_correction = 1.25; + } + } + + /* Combine the various factors calculated above */ + combined_correction_factor = + speed_correction * iiratio_correction_factor * current_spend_ratio; + + /* Try and pick a Q that should be high enough to encode the content at + * the given rate. + */ + for (Q = 0; Q < MAXQ; ++Q) { + /* Error per MB based correction factor */ + err_correction_factor = + calc_correction_factor(err_per_mb, 150.0, pow_lowq, pow_highq, Q); + + bits_per_mb_at_this_q = + (int)(.5 + (err_correction_factor * combined_correction_factor * + (double)vp8_bits_per_mb[INTER_FRAME][Q])); + + if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; + } + + /* If we could not hit the target even at Max Q then estimate what Q + * would have been required + */ + while ((bits_per_mb_at_this_q > target_norm_bits_per_mb) && + (Q < (MAXQ * 2))) { + bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q); + Q++; + } + + return Q; +} + +void vp8_init_second_pass(VP8_COMP *cpi) { + FIRSTPASS_STATS this_frame; + FIRSTPASS_STATS *start_pos; + + double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); + + zero_stats(&cpi->twopass.total_stats); + zero_stats(&cpi->twopass.total_left_stats); + + if (!cpi->twopass.stats_in_end) return; + + cpi->twopass.total_stats = *cpi->twopass.stats_in_end; + cpi->twopass.total_left_stats = cpi->twopass.total_stats; + + /* each frame can have a different duration, as the frame rate in the + * source isn't guaranteed to be constant. The frame rate prior to + * the first frame encoded in the second pass is a guess. However the + * sum duration is not. Its calculated based on the actual durations of + * all frames from the first pass. + */ + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); + + cpi->output_framerate = cpi->framerate; + cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * + cpi->oxcf.target_bandwidth / 10000000.0); + cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * + two_pass_min_rate / 10000000.0); + + /* Calculate a minimum intra value to be used in determining the IIratio + * scores used in the second pass. We have this minimum to make sure + * that clips that are static but "low complexity" in the intra domain + * are still boosted appropriately for KF/GF/ARF + */ + cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; + cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; + + /* Scan the first pass file and calculate an average Intra / Inter error + * score ratio for the sequence + */ + { + double sum_iiratio = 0.0; + double IIRatio; + + start_pos = cpi->twopass.stats_in; /* Note starting "file" position */ + + while (input_stats(cpi, &this_frame) != EOF) { + IIRatio = + this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); + IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio; + sum_iiratio += IIRatio; + } + + cpi->twopass.avg_iiratio = + sum_iiratio / + DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count); + + /* Reset file position */ + reset_fpf_position(cpi, start_pos); + } + + /* Scan the first pass file and calculate a modified total error based + * upon the bias/power function used to allocate bits + */ + { + start_pos = cpi->twopass.stats_in; /* Note starting "file" position */ + + cpi->twopass.modified_error_total = 0.0; + cpi->twopass.modified_error_used = 0.0; + + while (input_stats(cpi, &this_frame) != EOF) { + cpi->twopass.modified_error_total += + calculate_modified_err(cpi, &this_frame); + } + cpi->twopass.modified_error_left = cpi->twopass.modified_error_total; + + reset_fpf_position(cpi, start_pos); /* Reset file position */ + } +} + +void vp8_end_second_pass(VP8_COMP *cpi) { (void)cpi; } + +/* This function gives and estimate of how badly we believe the prediction + * quality is decaying from frame to frame. + */ +static double get_prediction_decay_rate(FIRSTPASS_STATS *next_frame) { + double prediction_decay_rate; + double motion_decay; + double motion_pct = next_frame->pcnt_motion; + + /* Initial basis is the % mbs inter coded */ + prediction_decay_rate = next_frame->pcnt_inter; + + /* High % motion -> somewhat higher decay rate */ + motion_decay = (1.0 - (motion_pct / 20.0)); + if (motion_decay < prediction_decay_rate) { + prediction_decay_rate = motion_decay; + } + + /* Adjustment to decay rate based on speed of motion */ + { + double this_mv_rabs; + double this_mv_cabs; + double distance_factor; + + this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct); + this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct); + + distance_factor = + sqrt((this_mv_rabs * this_mv_rabs) + (this_mv_cabs * this_mv_cabs)) / + 250.0; + distance_factor = ((distance_factor > 1.0) ? 0.0 : (1.0 - distance_factor)); + if (distance_factor < prediction_decay_rate) { + prediction_decay_rate = distance_factor; + } + } + + return prediction_decay_rate; +} + +/* Function to test for a condition where a complex transition is followed + * by a static section. For example in slide shows where there is a fade + * between slides. This is to help with more optimal kf and gf positioning. + */ +static int detect_transition_to_still(VP8_COMP *cpi, int frame_interval, + int still_interval, + double loop_decay_rate, + double decay_accumulator) { + int trans_to_still = 0; + + /* Break clause to detect very still sections after motion + * For example a static image after a fade or other transition + * instead of a clean scene cut. + */ + if ((frame_interval > MIN_GF_INTERVAL) && (loop_decay_rate >= 0.999) && + (decay_accumulator < 0.9)) { + int j; + FIRSTPASS_STATS *position = cpi->twopass.stats_in; + FIRSTPASS_STATS tmp_next_frame; + double decay_rate; + + /* Look ahead a few frames to see if static condition persists... */ + for (j = 0; j < still_interval; ++j) { + if (EOF == input_stats(cpi, &tmp_next_frame)) break; + + decay_rate = get_prediction_decay_rate(&tmp_next_frame); + if (decay_rate < 0.999) break; + } + /* Reset file position */ + reset_fpf_position(cpi, position); + + /* Only if it does do we signal a transition to still */ + if (j == still_interval) trans_to_still = 1; + } + + return trans_to_still; +} + +/* This function detects a flash through the high relative pcnt_second_ref + * score in the frame following a flash frame. The offset passed in should + * reflect this + */ +static int detect_flash(VP8_COMP *cpi, int offset) { + FIRSTPASS_STATS next_frame; + + int flash_detected = 0; + + /* Read the frame data. */ + /* The return is 0 (no flash detected) if not a valid frame */ + if (read_frame_stats(cpi, &next_frame, offset) != EOF) { + /* What we are looking for here is a situation where there is a + * brief break in prediction (such as a flash) but subsequent frames + * are reasonably well predicted by an earlier (pre flash) frame. + * The recovery after a flash is indicated by a high pcnt_second_ref + * comapred to pcnt_inter. + */ + if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) && + (next_frame.pcnt_second_ref >= 0.5)) { + flash_detected = 1; + + /*if (1) + { + FILE *f = fopen("flash.stt", "a"); + fprintf(f, "%8.0f %6.2f %6.2f\n", + next_frame.frame, + next_frame.pcnt_inter, + next_frame.pcnt_second_ref); + fclose(f); + }*/ + } + } + + return flash_detected; +} + +/* Update the motion related elements to the GF arf boost calculation */ +static void accumulate_frame_motion_stats(FIRSTPASS_STATS *this_frame, + double *this_frame_mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + double this_frame_mvr_ratio; + double this_frame_mvc_ratio; + double motion_pct; + + /* Accumulate motion stats. */ + motion_pct = this_frame->pcnt_motion; + + /* Accumulate Motion In/Out of frame stats */ + *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct; + *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct; + *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct); + + /* Accumulate a measure of how uniform (or conversely how random) + * the motion field is. (A ratio of absmv / mv) + */ + if (motion_pct > 0.05) { + this_frame_mvr_ratio = + fabs(this_frame->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr)); + + this_frame_mvc_ratio = + fabs(this_frame->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc)); + + *mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs) + ? (this_frame_mvr_ratio * motion_pct) + : this_frame->mvr_abs * motion_pct; + + *mv_ratio_accumulator += (this_frame_mvc_ratio < this_frame->mvc_abs) + ? (this_frame_mvc_ratio * motion_pct) + : this_frame->mvc_abs * motion_pct; + } +} + +/* Calculate a baseline boost number for the current frame. */ +static double calc_frame_boost(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out) { + double frame_boost; + + /* Underlying boost factor is based on inter intra error ratio */ + if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) { + frame_boost = (IIFACTOR * this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); + } else { + frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); + } + + /* Increase boost for frames where new data coming into frame + * (eg zoom out). Slightly reduce boost if there is a net balance + * of motion out of the frame (zoom in). + * The range for this_frame_mv_in_out is -1.0 to +1.0 + */ + if (this_frame_mv_in_out > 0.0) { + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + /* In extreme case boost is halved */ + } else { + frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + } + + /* Clip to maximum */ + if (frame_boost > GF_RMAX) frame_boost = GF_RMAX; + + return frame_boost; +} + +#if NEW_BOOST +static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, + int *f_boost, int *b_boost) { + FIRSTPASS_STATS this_frame; + + int i; + double boost_score = 0.0; + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + double r; + int flash_detected = 0; + + /* Search forward from the proposed arf/next gf position */ + for (i = 0; i < f_frames; ++i) { + if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF) break; + + /* Update the motion related elements to the boost calculation */ + accumulate_frame_motion_stats( + &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + /* Calculate the baseline boost number for this frame */ + r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); + + /* We want to discount the flash frame itself and the recovery + * frame that follows as both will have poor scores. + */ + flash_detected = + detect_flash(cpi, (i + offset)) || detect_flash(cpi, (i + offset + 1)); + + /* Cumulative effect of prediction quality decay */ + if (!flash_detected) { + decay_accumulator = + decay_accumulator * get_prediction_decay_rate(&this_frame); + decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; + } + boost_score += (decay_accumulator * r); + + /* Break out conditions. */ + if ((!flash_detected) && + ((mv_ratio_accumulator > 100.0) || (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0))) { + break; + } + } + + *f_boost = (int)(boost_score * 100.0) >> 4; + + /* Reset for backward looking loop */ + boost_score = 0.0; + mv_ratio_accumulator = 0.0; + decay_accumulator = 1.0; + this_frame_mv_in_out = 0.0; + mv_in_out_accumulator = 0.0; + abs_mv_in_out_accumulator = 0.0; + + /* Search forward from the proposed arf/next gf position */ + for (i = -1; i >= -b_frames; i--) { + if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF) break; + + /* Update the motion related elements to the boost calculation */ + accumulate_frame_motion_stats( + &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + /* Calculate the baseline boost number for this frame */ + r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); + + /* We want to discount the flash frame itself and the recovery + * frame that follows as both will have poor scores. + */ + flash_detected = + detect_flash(cpi, (i + offset)) || detect_flash(cpi, (i + offset + 1)); + + /* Cumulative effect of prediction quality decay */ + if (!flash_detected) { + decay_accumulator = + decay_accumulator * get_prediction_decay_rate(&this_frame); + decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; + } + + boost_score += (decay_accumulator * r); + + /* Break out conditions. */ + if ((!flash_detected) && + ((mv_ratio_accumulator > 100.0) || (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0))) { + break; + } + } + *b_boost = (int)(boost_score * 100.0) >> 4; + + return (*f_boost + *b_boost); +} +#endif + +/* Analyse and define a gf/arf group . */ +static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS *start_pos; + int i; + double r; + double boost_score = 0.0; + double old_boost_score = 0.0; + double gf_group_err = 0.0; + double gf_first_frame_err = 0.0; + double mod_frame_err = 0.0; + + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + + double loop_decay_rate = 1.00; /* Starting decay rate */ + + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + + int max_bits = frame_max_bits(cpi); /* Max for a single frame */ + + unsigned int allow_alt_ref = + cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; + + int alt_boost = 0; + int f_boost = 0; + int b_boost = 0; + int flash_detected; + + cpi->twopass.gf_group_bits = 0; + cpi->twopass.gf_decay_rate = 0; + + vpx_clear_system_state(); + + start_pos = cpi->twopass.stats_in; + + memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */ + + /* Load stats for the current frame. */ + mod_frame_err = calculate_modified_err(cpi, this_frame); + + /* Note the error of the frame at the start of the group (this will be + * the GF frame error if we code a normal gf + */ + gf_first_frame_err = mod_frame_err; + + /* Special treatment if the current frame is a key frame (which is also + * a gf). If it is then its error score (and hence bit allocation) need + * to be subtracted out from the calculation for the GF group + */ + if (cpi->common.frame_type == KEY_FRAME) gf_group_err -= gf_first_frame_err; + + /* Scan forward to try and work out how many frames the next gf group + * should contain and what level of boost is appropriate for the GF + * or ARF that will be coded with the group + */ + i = 0; + + while (((i < cpi->twopass.static_scene_max_gf_interval) || + ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) && + (i < cpi->twopass.frames_to_key)) { + i++; + + /* Accumulate error score of frames in this gf group */ + mod_frame_err = calculate_modified_err(cpi, this_frame); + + gf_group_err += mod_frame_err; + + if (EOF == input_stats(cpi, &next_frame)) break; + + /* Test for the case where there is a brief flash but the prediction + * quality back to an earlier frame is then restored. + */ + flash_detected = detect_flash(cpi, 0); + + /* Update the motion related elements to the boost calculation */ + accumulate_frame_motion_stats( + &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + /* Calculate a baseline boost number for this frame */ + r = calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out); + + /* Cumulative effect of prediction quality decay */ + if (!flash_detected) { + loop_decay_rate = get_prediction_decay_rate(&next_frame); + decay_accumulator = decay_accumulator * loop_decay_rate; + decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; + } + boost_score += (decay_accumulator * r); + + /* Break clause to detect very still sections after motion + * For example a staic image after a fade or other transition. + */ + if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, + decay_accumulator)) { + allow_alt_ref = 0; + boost_score = old_boost_score; + break; + } + + /* Break out conditions. */ + if ( + /* Break at cpi->max_gf_interval unless almost totally static */ + (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || + ( + /* Don't break out with a very short interval */ + (i > MIN_GF_INTERVAL) && + /* Don't break out very close to a key frame */ + ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && + ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { + boost_score = old_boost_score; + break; + } + + memcpy(this_frame, &next_frame, sizeof(*this_frame)); + + old_boost_score = boost_score; + } + + cpi->twopass.gf_decay_rate = + (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0; + + /* When using CBR apply additional buffer related upper limits */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + double max_boost; + + /* For cbr apply buffer related limits */ + if (cpi->drop_frames_allowed) { + int64_t df_buffer_level = cpi->oxcf.drop_frames_water_mark * + (cpi->oxcf.optimal_buffer_level / 100); + + if (cpi->buffer_level > df_buffer_level) { + max_boost = + ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / + DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } else { + max_boost = 0.0; + } + } else if (cpi->buffer_level > 0) { + max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / + DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } else { + max_boost = 0.0; + } + + if (boost_score > max_boost) boost_score = max_boost; + } + + /* Don't allow conventional gf too near the next kf */ + if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { + while (i < cpi->twopass.frames_to_key) { + i++; + + if (EOF == input_stats(cpi, this_frame)) break; + + if (i < cpi->twopass.frames_to_key) { + mod_frame_err = calculate_modified_err(cpi, this_frame); + gf_group_err += mod_frame_err; + } + } + } + + cpi->gfu_boost = (int)(boost_score * 100.0) >> 4; + +#if NEW_BOOST + /* Alterrnative boost calculation for alt ref */ + alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); +#endif + + /* Should we use the alternate reference frame */ + if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && + /* don't use ARF very near next kf */ + (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && +#if NEW_BOOST + ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && + ((mv_in_out_accumulator / (double)i > -0.2) || + (mv_in_out_accumulator > -2.0)) && + (b_boost > 100) && (f_boost > 100)) +#else + (next_frame.pcnt_inter > 0.75) && + ((mv_in_out_accumulator / (double)i > -0.2) || + (mv_in_out_accumulator > -2.0)) && + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) +#endif + { + int Boost; + int allocation_chunks; + int Q = + (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int tmp_q; + int arf_frame_bits = 0; + int group_bits; + +#if NEW_BOOST + cpi->gfu_boost = alt_boost; +#endif + + /* Estimate the bits to be allocated to the group as a whole */ + if ((cpi->twopass.kf_group_bits > 0) && + (cpi->twopass.kf_group_error_left > 0)) { + group_bits = + (int)((double)cpi->twopass.kf_group_bits * + (gf_group_err / (double)cpi->twopass.kf_group_error_left)); + } else { + group_bits = 0; + } + +/* Boost for arf frame */ +#if NEW_BOOST + Boost = (alt_boost * GFQ_ADJUSTMENT) / 100; +#else + Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); +#endif + Boost += (i * 50); + + /* Set max and minimum boost and hence minimum allocation */ + if (Boost > ((cpi->baseline_gf_interval + 1) * 200)) { + Boost = ((cpi->baseline_gf_interval + 1) * 200); + } else if (Boost < 125) { + Boost = 125; + } + + allocation_chunks = (i * 100) + Boost; + + /* Normalize Altboost and allocations chunck down to prevent overflow */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Calculate the number of bits to be spent on the arf based on the + * boost number + */ + arf_frame_bits = + (int)((double)Boost * (group_bits / (double)allocation_chunks)); + + /* Estimate if there are enough bits available to make worthwhile use + * of an arf. + */ + tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits); + + /* Only use an arf if it is likely we will be able to code + * it at a lower Q than the surrounding frames. + */ + if (tmp_q < cpi->worst_quality) { + int half_gf_int; + int frames_after_arf; + int frames_bwd = cpi->oxcf.arnr_max_frames - 1; + int frames_fwd = cpi->oxcf.arnr_max_frames - 1; + + cpi->source_alt_ref_pending = 1; + + /* + * For alt ref frames the error score for the end frame of the + * group (the alt ref frame) should not contribute to the group + * total and hence the number of bit allocated to the group. + * Rather it forms part of the next group (it is the GF at the + * start of the next group) + * gf_group_err -= mod_frame_err; + * + * For alt ref frames alt ref frame is technically part of the + * GF frame for the next group but we always base the error + * calculation and bit allocation on the current group of frames. + * + * Set the interval till the next gf or arf. + * For ARFs this is the number of frames to be coded before the + * future frame that is coded as an ARF. + * The future frame itself is part of the next group + */ + cpi->baseline_gf_interval = i; + + /* + * Define the arnr filter width for this group of frames: + * We only filter frames that lie within a distance of half + * the GF interval from the ARF frame. We also have to trap + * cases where the filter extends beyond the end of clip. + * Note: this_frame->frame has been updated in the loop + * so it now points at the ARF frame. + */ + half_gf_int = cpi->baseline_gf_interval >> 1; + frames_after_arf = + (int)(cpi->twopass.total_stats.count - this_frame->frame - 1); + + switch (cpi->oxcf.arnr_type) { + case 1: /* Backward filter */ + frames_fwd = 0; + if (frames_bwd > half_gf_int) frames_bwd = half_gf_int; + break; + + case 2: /* Forward filter */ + if (frames_fwd > half_gf_int) frames_fwd = half_gf_int; + if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; + frames_bwd = 0; + break; + + case 3: /* Centered filter */ + default: + frames_fwd >>= 1; + if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; + if (frames_fwd > half_gf_int) frames_fwd = half_gf_int; + + frames_bwd = frames_fwd; + + /* For even length filter there is one more frame backward + * than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + */ + if (frames_bwd < half_gf_int) { + frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; + } + break; + } + + cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; + } else { + cpi->source_alt_ref_pending = 0; + cpi->baseline_gf_interval = i; + } + } else { + cpi->source_alt_ref_pending = 0; + cpi->baseline_gf_interval = i; + } + + /* + * Now decide how many bits should be allocated to the GF group as a + * proportion of those remaining in the kf group. + * The final key frame group in the clip is treated as a special case + * where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left. + * This is also important for short clips where there may only be one + * key frame. + */ + if (cpi->twopass.frames_to_key >= + (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame)) { + cpi->twopass.kf_group_bits = + (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0; + } + + /* Calculate the bits to be allocated to the group as a whole */ + if ((cpi->twopass.kf_group_bits > 0) && + (cpi->twopass.kf_group_error_left > 0)) { + cpi->twopass.gf_group_bits = + (int64_t)(cpi->twopass.kf_group_bits * + (gf_group_err / cpi->twopass.kf_group_error_left)); + } else { + cpi->twopass.gf_group_bits = 0; + } + + cpi->twopass.gf_group_bits = + (cpi->twopass.gf_group_bits < 0) ? 0 + : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) + ? cpi->twopass.kf_group_bits + : cpi->twopass.gf_group_bits; + + /* Clip cpi->twopass.gf_group_bits based on user supplied data rate + * variability limit (cpi->oxcf.two_pass_vbrmax_section) + */ + if (cpi->twopass.gf_group_bits > + (int64_t)max_bits * cpi->baseline_gf_interval) { + cpi->twopass.gf_group_bits = (int64_t)max_bits * cpi->baseline_gf_interval; + } + + /* Reset the file position */ + reset_fpf_position(cpi, start_pos); + + /* Update the record of error used so far (only done once per gf group) */ + cpi->twopass.modified_error_used += gf_group_err; + + /* Assign bits to the arf or gf. */ + for (i = 0; i <= (cpi->source_alt_ref_pending && + cpi->common.frame_type != KEY_FRAME); + i++) { + int Boost; + int allocation_chunks; + int Q = + (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int gf_bits; + + /* For ARF frames */ + if (cpi->source_alt_ref_pending && i == 0) { +#if NEW_BOOST + Boost = (alt_boost * GFQ_ADJUSTMENT) / 100; +#else + Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100); +#endif + Boost += (cpi->baseline_gf_interval * 50); + + /* Set max and minimum boost and hence minimum allocation */ + if (Boost > ((cpi->baseline_gf_interval + 1) * 200)) { + Boost = ((cpi->baseline_gf_interval + 1) * 200); + } else if (Boost < 125) { + Boost = 125; + } + + allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + Boost; + } + /* Else for standard golden frames */ + else { + /* boost based on inter / intra ratio of subsequent frames */ + Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100; + + /* Set max and minimum boost and hence minimum allocation */ + if (Boost > (cpi->baseline_gf_interval * 150)) { + Boost = (cpi->baseline_gf_interval * 150); + } else if (Boost < 125) { + Boost = 125; + } + + allocation_chunks = (cpi->baseline_gf_interval * 100) + (Boost - 100); + } + + /* Normalize Altboost and allocations chunck down to prevent overflow */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Calculate the number of bits to be spent on the gf or arf based on + * the boost number + */ + gf_bits = (int)((double)Boost * + (cpi->twopass.gf_group_bits / (double)allocation_chunks)); + + /* If the frame that is to be boosted is simpler than the average for + * the gf/arf group then use an alternative calculation + * based on the error score of the frame itself + */ + if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) { + double alt_gf_grp_bits; + int alt_gf_bits; + + alt_gf_grp_bits = + (double)cpi->twopass.kf_group_bits * + (mod_frame_err * (double)cpi->baseline_gf_interval) / + DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left); + + alt_gf_bits = + (int)((double)Boost * (alt_gf_grp_bits / (double)allocation_chunks)); + + if (gf_bits > alt_gf_bits) { + gf_bits = alt_gf_bits; + } + } + /* Else if it is harder than other frames in the group make sure it at + * least receives an allocation in keeping with its relative error + * score, otherwise it may be worse off than an "un-boosted" frame + */ + else { + // Avoid division by 0 by clamping cpi->twopass.kf_group_error_left to 1 + int alt_gf_bits = + (int)((double)cpi->twopass.kf_group_bits * mod_frame_err / + (double)VPXMAX(cpi->twopass.kf_group_error_left, 1)); + + if (alt_gf_bits > gf_bits) { + gf_bits = alt_gf_bits; + } + } + + /* Apply an additional limit for CBR */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->twopass.gf_bits > (int)(cpi->buffer_level >> 1)) { + cpi->twopass.gf_bits = (int)(cpi->buffer_level >> 1); + } + } + + /* Don't allow a negative value for gf_bits */ + if (gf_bits < 0) gf_bits = 0; + + /* Add in minimum for a frame */ + gf_bits += cpi->min_frame_bandwidth; + + if (i == 0) { + cpi->twopass.gf_bits = gf_bits; + } + if (i == 1 || (!cpi->source_alt_ref_pending && + (cpi->common.frame_type != KEY_FRAME))) { + /* Per frame bit target for this frame */ + cpi->per_frame_bandwidth = gf_bits; + } + } + + { + /* Adjust KF group bits and error remainin */ + cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err; + cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits; + + if (cpi->twopass.kf_group_bits < 0) cpi->twopass.kf_group_bits = 0; + + /* Note the error score left in the remaining frames of the group. + * For normal GFs we want to remove the error score for the first + * frame of the group (except in Key frame case where this has + * already happened) + */ + if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME) { + cpi->twopass.gf_group_error_left = + (int)(gf_group_err - gf_first_frame_err); + } else { + cpi->twopass.gf_group_error_left = (int)gf_group_err; + } + + cpi->twopass.gf_group_bits -= + cpi->twopass.gf_bits - cpi->min_frame_bandwidth; + + if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; + + /* This condition could fail if there are two kfs very close together + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the + * calculation of cpi->twopass.alt_extra_bits. + */ + if (cpi->baseline_gf_interval >= 3) { +#if NEW_BOOST + int boost = (cpi->source_alt_ref_pending) ? b_boost : cpi->gfu_boost; +#else + int boost = cpi->gfu_boost; +#endif + if (boost >= 150) { + int pct_extra; + + pct_extra = (boost - 100) / 50; + pct_extra = (pct_extra > 20) ? 20 : pct_extra; + + cpi->twopass.alt_extra_bits = + (int)(cpi->twopass.gf_group_bits * pct_extra) / 100; + cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits; + cpi->twopass.alt_extra_bits /= ((cpi->baseline_gf_interval - 1) >> 1); + } else { + cpi->twopass.alt_extra_bits = 0; + } + } else { + cpi->twopass.alt_extra_bits = 0; + } + } + + /* Adjustments based on a measure of complexity of the section */ + if (cpi->common.frame_type != KEY_FRAME) { + FIRSTPASS_STATS sectionstats; + double Ratio; + + zero_stats(§ionstats); + reset_fpf_position(cpi, start_pos); + + for (i = 0; i < cpi->baseline_gf_interval; ++i) { + input_stats(cpi, &next_frame); + accumulate_stats(§ionstats, &next_frame); + } + + avg_stats(§ionstats); + + cpi->twopass.section_intra_rating = + (unsigned int)(sectionstats.intra_error / + DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); + + Ratio = sectionstats.intra_error / + DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025); + + if (cpi->twopass.section_max_qfactor < 0.80) { + cpi->twopass.section_max_qfactor = 0.80; + } + + reset_fpf_position(cpi, start_pos); + } +} + +/* Allocate bits to a normal frame that is neither a gf an arf or a key frame. + */ +static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { + int target_frame_size; + + double modified_err; + double err_fraction; + + int max_bits = frame_max_bits(cpi); /* Max for a single frame */ + + /* Calculate modified prediction error used in bit allocation */ + modified_err = calculate_modified_err(cpi, this_frame); + + /* What portion of the remaining GF group error is used by this frame */ + if (cpi->twopass.gf_group_error_left > 0) { + err_fraction = modified_err / cpi->twopass.gf_group_error_left; + } else { + err_fraction = 0.0; + } + + /* How many of those bits available for allocation should we give it? */ + target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction); + + /* Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits) + * at the top end. + */ + if (target_frame_size < 0) { + target_frame_size = 0; + } else { + if (target_frame_size > max_bits) target_frame_size = max_bits; + + if (target_frame_size > cpi->twopass.gf_group_bits) { + target_frame_size = (int)cpi->twopass.gf_group_bits; + } + } + + /* Adjust error and bits remaining */ + cpi->twopass.gf_group_error_left -= (int)modified_err; + cpi->twopass.gf_group_bits -= target_frame_size; + + if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; + + /* Add in the minimum number of bits that is set aside for every frame. */ + target_frame_size += cpi->min_frame_bandwidth; + + /* Every other frame gets a few extra bits */ + if ((cpi->frames_since_golden & 0x01) && + (cpi->frames_till_gf_update_due > 0)) { + target_frame_size += cpi->twopass.alt_extra_bits; + } + + /* Per frame bit target for this frame */ + cpi->per_frame_bandwidth = target_frame_size; +} + +void vp8_second_pass(VP8_COMP *cpi) { + int tmp_q; + int frames_left = + (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame); + + FIRSTPASS_STATS this_frame; + FIRSTPASS_STATS this_frame_copy; + + double this_frame_intra_error; + double this_frame_coded_error; + + int overhead_bits; + + vp8_zero(this_frame); + + if (!cpi->twopass.stats_in) { + return; + } + + vpx_clear_system_state(); + + if (EOF == input_stats(cpi, &this_frame)) return; + + this_frame_intra_error = this_frame.intra_error; + this_frame_coded_error = this_frame.coded_error; + + /* keyframe and section processing ! */ + if (cpi->twopass.frames_to_key == 0) { + /* Define next KF group and assign bits to it */ + memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + find_next_key_frame(cpi, &this_frame_copy); + + /* Special case: Error error_resilient_mode mode does not make much + * sense for two pass but with its current meaning this code is + * designed to stop outlandish behaviour if someone does set it when + * using two pass. It effectively disables GF groups. This is + * temporary code until we decide what should really happen in this + * case. + */ + if (cpi->oxcf.error_resilient_mode) { + cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits; + cpi->twopass.gf_group_error_left = (int)cpi->twopass.kf_group_error_left; + cpi->baseline_gf_interval = cpi->twopass.frames_to_key; + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + cpi->source_alt_ref_pending = 0; + } + } + + /* Is this a GF / ARF (Note that a KF is always also a GF) */ + if (cpi->frames_till_gf_update_due == 0) { + /* Define next gf group and assign bits to it */ + memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + define_gf_group(cpi, &this_frame_copy); + + /* If we are going to code an altref frame at the end of the group + * and the current frame is not a key frame.... If the previous + * group used an arf this frame has already benefited from that arf + * boost and it should not be given extra bits If the previous + * group was NOT coded using arf we may want to apply some boost to + * this GF as well + */ + if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) { + /* Assign a standard frames worth of bits from those allocated + * to the GF group + */ + int bak = cpi->per_frame_bandwidth; + memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + cpi->per_frame_bandwidth = bak; + } + } + + /* Otherwise this is an ordinary frame */ + else { + /* Special case: Error error_resilient_mode mode does not make much + * sense for two pass but with its current meaning but this code is + * designed to stop outlandish behaviour if someone does set it + * when using two pass. It effectively disables GF groups. This is + * temporary code till we decide what should really happen in this + * case. + */ + if (cpi->oxcf.error_resilient_mode) { + cpi->frames_till_gf_update_due = cpi->twopass.frames_to_key; + + if (cpi->common.frame_type != KEY_FRAME) { + /* Assign bits from those allocated to the GF group */ + memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + } + } else { + /* Assign bits from those allocated to the GF group */ + memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + } + } + + /* Keep a globally available copy of this and the next frame's iiratio. */ + cpi->twopass.this_iiratio = + (unsigned int)(this_frame_intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_coded_error)); + { + FIRSTPASS_STATS next_frame; + if (lookup_next_frame_stats(cpi, &next_frame) != EOF) { + cpi->twopass.next_iiratio = + (unsigned int)(next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); + } + } + + /* Set nominal per second bandwidth for this frame */ + cpi->target_bandwidth = + (int)(cpi->per_frame_bandwidth * cpi->output_framerate); + if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; + + /* Account for mv, mode and other overheads. */ + overhead_bits = (int)estimate_modemvcost(cpi, &cpi->twopass.total_left_stats); + + /* Special case code for first frame. */ + if (cpi->common.current_video_frame == 0) { + cpi->twopass.est_max_qcorrection_factor = 1.0; + + /* Set a cq_level in constrained quality mode. */ + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + int est_cq; + + est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, + (int)(cpi->twopass.bits_left / frames_left), + overhead_bits); + + cpi->cq_target_quality = cpi->oxcf.cq_level; + if (est_cq > cpi->cq_target_quality) cpi->cq_target_quality = est_cq; + } + + /* guess at maxq needed in 2nd pass */ + cpi->twopass.maxq_max_limit = cpi->worst_quality; + cpi->twopass.maxq_min_limit = cpi->best_quality; + + tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, + (int)(cpi->twopass.bits_left / frames_left), + overhead_bits); + + /* Limit the maxq value returned subsequently. + * This increases the risk of overspend or underspend if the initial + * estimate for the clip is bad, but helps prevent excessive + * variation in Q, especially near the end of a clip + * where for example a small overspend may cause Q to crash + */ + cpi->twopass.maxq_max_limit = + ((tmp_q + 32) < cpi->worst_quality) ? (tmp_q + 32) : cpi->worst_quality; + cpi->twopass.maxq_min_limit = + ((tmp_q - 32) > cpi->best_quality) ? (tmp_q - 32) : cpi->best_quality; + + cpi->active_worst_quality = tmp_q; + cpi->ni_av_qi = tmp_q; + } + + /* The last few frames of a clip almost always have to few or too many + * bits and for the sake of over exact rate control we don't want to make + * radical adjustments to the allowed quantizer range just to use up a + * few surplus bits or get beneath the target rate. + */ + else if ((cpi->common.current_video_frame < + (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && + ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < + (unsigned int)cpi->twopass.total_stats.count)) { + if (frames_left < 1) frames_left = 1; + + tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, + (int)(cpi->twopass.bits_left / frames_left), + overhead_bits); + + /* Move active_worst_quality but in a damped way */ + if (tmp_q > cpi->active_worst_quality) { + cpi->active_worst_quality++; + } else if (tmp_q < cpi->active_worst_quality) { + cpi->active_worst_quality--; + } + + cpi->active_worst_quality = + ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4; + } + + cpi->twopass.frames_to_key--; + + /* Update the total stats remaining sturcture */ + subtract_stats(&cpi->twopass.total_left_stats, &this_frame); +} + +static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, + FIRSTPASS_STATS *this_frame, + FIRSTPASS_STATS *next_frame) { + int is_viable_kf = 0; + + /* Does the frame satisfy the primary criteria of a key frame + * If so, then examine how well it predicts subsequent frames + */ + if ((this_frame->pcnt_second_ref < 0.10) && + (next_frame->pcnt_second_ref < 0.10) && + ((this_frame->pcnt_inter < 0.05) || + (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) && + ((this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && + ((fabs(last_frame->coded_error - this_frame->coded_error) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > + .40) || + (fabs(last_frame->intra_error - this_frame->intra_error) / + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > + .40) || + ((next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) { + int i; + FIRSTPASS_STATS *start_pos; + + FIRSTPASS_STATS local_next_frame; + + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + double next_iiratio; + + memcpy(&local_next_frame, next_frame, sizeof(*next_frame)); + + /* Note the starting file position so we can reset to it */ + start_pos = cpi->twopass.stats_in; + + /* Examine how well the key frame predicts subsequent frames */ + for (i = 0; i < 16; ++i) { + next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + + if (next_iiratio > RMAX) next_iiratio = RMAX; + + /* Cumulative effect of decay in prediction quality */ + if (local_next_frame.pcnt_inter > 0.85) { + decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; + } else { + decay_accumulator = + decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0); + } + + /* Keep a running total */ + boost_score += (decay_accumulator * next_iiratio); + + /* Test various breakout clauses */ + if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < + 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 0.5) || + (local_next_frame.intra_error < 200)) { + break; + } + + old_boost_score = boost_score; + + /* Get the next frame details */ + if (EOF == input_stats(cpi, &local_next_frame)) break; + } + + /* If there is tolerable prediction for at least the next 3 frames + * then break out else discard this pottential key frame and move on + */ + if (boost_score > 5.0 && (i > 3)) { + is_viable_kf = 1; + } else { + /* Reset the file position */ + reset_fpf_position(cpi, start_pos); + + is_viable_kf = 0; + } + } + + return is_viable_kf; +} +static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { + int i, j; + FIRSTPASS_STATS last_frame; + FIRSTPASS_STATS first_frame; + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS *start_position; + + double decay_accumulator = 1.0; + double boost_score = 0; + double old_boost_score = 0.0; + double loop_decay_rate; + + double kf_mod_err = 0.0; + double kf_group_err = 0.0; + double kf_group_intra_err = 0.0; + double kf_group_coded_err = 0.0; + double recent_loop_decay[8] = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; + + memset(&next_frame, 0, sizeof(next_frame)); + + vpx_clear_system_state(); + start_position = cpi->twopass.stats_in; + + cpi->common.frame_type = KEY_FRAME; + + /* is this a forced key frame by interval */ + cpi->this_key_frame_forced = cpi->next_key_frame_forced; + + /* Clear the alt ref active flag as this can never be active on a key + * frame + */ + cpi->source_alt_ref_active = 0; + + /* Kf is always a gf so clear frames till next gf counter */ + cpi->frames_till_gf_update_due = 0; + + cpi->twopass.frames_to_key = 1; + + /* Take a copy of the initial frame details */ + memcpy(&first_frame, this_frame, sizeof(*this_frame)); + + cpi->twopass.kf_group_bits = 0; + cpi->twopass.kf_group_error_left = 0; + + kf_mod_err = calculate_modified_err(cpi, this_frame); + + /* find the next keyframe */ + i = 0; + while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) { + /* Accumulate kf group error */ + kf_group_err += calculate_modified_err(cpi, this_frame); + + /* These figures keep intra and coded error counts for all frames + * including key frames in the group. The effect of the key frame + * itself can be subtracted out using the first_frame data + * collected above + */ + kf_group_intra_err += this_frame->intra_error; + kf_group_coded_err += this_frame->coded_error; + + /* Load the next frame's stats. */ + memcpy(&last_frame, this_frame, sizeof(*this_frame)); + input_stats(cpi, this_frame); + + /* Provided that we are not at the end of the file... */ + if (cpi->oxcf.auto_key && + lookup_next_frame_stats(cpi, &next_frame) != EOF) { + /* Normal scene cut check */ + if ((i >= MIN_GF_INTERVAL) && + test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) { + break; + } + + /* How fast is prediction quality decaying */ + loop_decay_rate = get_prediction_decay_rate(&next_frame); + + /* We want to know something about the recent past... rather than + * as used elsewhere where we are concened with decay in prediction + * quality since the last GF or KF. + */ + recent_loop_decay[i % 8] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < 8; ++j) { + decay_accumulator = decay_accumulator * recent_loop_decay[j]; + } + + /* Special check for transition or high motion followed by a + * static scene. + */ + if (detect_transition_to_still(cpi, i, + ((int)(cpi->key_frame_frequency) - (int)i), + loop_decay_rate, decay_accumulator)) { + break; + } + + /* Step on to the next frame */ + cpi->twopass.frames_to_key++; + + /* If we don't have a real key frame within the next two + * forcekeyframeevery intervals then break out of the loop. + */ + if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency) { + break; + } + } else { + cpi->twopass.frames_to_key++; + } + + i++; + } + + /* If there is a max kf interval set by the user we must obey it. + * We already breakout of the loop above at 2x max. + * This code centers the extra kf if the actual natural + * interval is between 1x and 2x + */ + if (cpi->oxcf.auto_key && + cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) { + FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in; + FIRSTPASS_STATS tmp_frame; + + cpi->twopass.frames_to_key /= 2; + + /* Copy first frame details */ + memcpy(&tmp_frame, &first_frame, sizeof(first_frame)); + + /* Reset to the start of the group */ + reset_fpf_position(cpi, start_position); + + kf_group_err = 0; + kf_group_intra_err = 0; + kf_group_coded_err = 0; + + /* Rescan to get the correct error data for the forced kf group */ + for (i = 0; i < cpi->twopass.frames_to_key; ++i) { + /* Accumulate kf group errors */ + kf_group_err += calculate_modified_err(cpi, &tmp_frame); + kf_group_intra_err += tmp_frame.intra_error; + kf_group_coded_err += tmp_frame.coded_error; + + /* Load a the next frame's stats */ + input_stats(cpi, &tmp_frame); + } + + /* Reset to the start of the group */ + reset_fpf_position(cpi, current_pos); + + cpi->next_key_frame_forced = 1; + } else { + cpi->next_key_frame_forced = 0; + } + + /* Special case for the last frame of the file */ + if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) { + /* Accumulate kf group error */ + kf_group_err += calculate_modified_err(cpi, this_frame); + + /* These figures keep intra and coded error counts for all frames + * including key frames in the group. The effect of the key frame + * itself can be subtracted out using the first_frame data + * collected above + */ + kf_group_intra_err += this_frame->intra_error; + kf_group_coded_err += this_frame->coded_error; + } + + /* Calculate the number of bits that should be assigned to the kf group. */ + if ((cpi->twopass.bits_left > 0) && + (cpi->twopass.modified_error_left > 0.0)) { + /* Max for a single normal frame (not key frame) */ + int max_bits = frame_max_bits(cpi); + + /* Maximum bits for the kf group */ + int64_t max_grp_bits; + + /* Default allocation based on bits left and relative + * complexity of the section + */ + cpi->twopass.kf_group_bits = + (int64_t)(cpi->twopass.bits_left * + (kf_group_err / cpi->twopass.modified_error_left)); + + /* Clip based on maximum per frame rate defined by the user. */ + max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key; + if (cpi->twopass.kf_group_bits > max_grp_bits) { + cpi->twopass.kf_group_bits = max_grp_bits; + } + + /* Additional special case for CBR if buffer is getting full. */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + int64_t opt_buffer_lvl = cpi->oxcf.optimal_buffer_level; + int64_t buffer_lvl = cpi->buffer_level; + + /* If the buffer is near or above the optimal and this kf group is + * not being allocated much then increase the allocation a bit. + */ + if (buffer_lvl >= opt_buffer_lvl) { + int64_t high_water_mark = + (opt_buffer_lvl + cpi->oxcf.maximum_buffer_size) >> 1; + + int64_t av_group_bits; + + /* Av bits per frame * number of frames */ + av_group_bits = (int64_t)cpi->av_per_frame_bandwidth * + (int64_t)cpi->twopass.frames_to_key; + + /* We are at or above the maximum. */ + if (cpi->buffer_level >= high_water_mark) { + int64_t min_group_bits; + + min_group_bits = + av_group_bits + (int64_t)(buffer_lvl - high_water_mark); + + if (cpi->twopass.kf_group_bits < min_group_bits) { + cpi->twopass.kf_group_bits = min_group_bits; + } + } + /* We are above optimal but below the maximum */ + else if (cpi->twopass.kf_group_bits < av_group_bits) { + int64_t bits_below_av = av_group_bits - cpi->twopass.kf_group_bits; + + cpi->twopass.kf_group_bits += (int64_t)( + (double)bits_below_av * (double)(buffer_lvl - opt_buffer_lvl) / + (double)(high_water_mark - opt_buffer_lvl)); + } + } + } + } else { + cpi->twopass.kf_group_bits = 0; + } + + /* Reset the first pass file position */ + reset_fpf_position(cpi, start_position); + + /* determine how big to make this keyframe based on how well the + * subsequent frames use inter blocks + */ + decay_accumulator = 1.0; + boost_score = 0.0; + + for (i = 0; i < cpi->twopass.frames_to_key; ++i) { + double r; + + if (EOF == input_stats(cpi, &next_frame)) break; + + if (next_frame.intra_error > cpi->twopass.kf_intra_err_min) { + r = (IIKFACTOR2 * next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); + } else { + r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min / + DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); + } + + if (r > RMAX) r = RMAX; + + /* How fast is prediction quality decaying */ + loop_decay_rate = get_prediction_decay_rate(&next_frame); + + decay_accumulator = decay_accumulator * loop_decay_rate; + decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; + + boost_score += (decay_accumulator * r); + + if ((i > MIN_GF_INTERVAL) && ((boost_score - old_boost_score) < 1.0)) { + break; + } + + old_boost_score = boost_score; + } + + if (1) { + FIRSTPASS_STATS sectionstats; + double Ratio; + + zero_stats(§ionstats); + reset_fpf_position(cpi, start_position); + + for (i = 0; i < cpi->twopass.frames_to_key; ++i) { + input_stats(cpi, &next_frame); + accumulate_stats(§ionstats, &next_frame); + } + + avg_stats(§ionstats); + + cpi->twopass.section_intra_rating = + (unsigned int)(sectionstats.intra_error / + DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); + + Ratio = sectionstats.intra_error / + DOUBLE_DIVIDE_CHECK(sectionstats.coded_error); + cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025); + + if (cpi->twopass.section_max_qfactor < 0.80) { + cpi->twopass.section_max_qfactor = 0.80; + } + } + + /* When using CBR apply additional buffer fullness related upper limits */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + double max_boost; + + if (cpi->drop_frames_allowed) { + int df_buffer_level = (int)(cpi->oxcf.drop_frames_water_mark * + (cpi->oxcf.optimal_buffer_level / 100)); + + if (cpi->buffer_level > df_buffer_level) { + max_boost = + ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / + DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } else { + max_boost = 0.0; + } + } else if (cpi->buffer_level > 0) { + max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / + DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth); + } else { + max_boost = 0.0; + } + + if (boost_score > max_boost) boost_score = max_boost; + } + + /* Reset the first pass file position */ + reset_fpf_position(cpi, start_position); + + /* Work out how many bits to allocate for the key frame itself */ + if (1) { + int kf_boost = (int)boost_score; + int allocation_chunks; + int Counter = cpi->twopass.frames_to_key; + int alt_kf_bits; + YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; +/* Min boost based on kf interval */ +#if 0 + + while ((kf_boost < 48) && (Counter > 0)) + { + Counter -= 2; + kf_boost ++; + } + +#endif + + if (kf_boost < 48) { + kf_boost += ((Counter + 1) >> 1); + + if (kf_boost > 48) kf_boost = 48; + } + + /* bigger frame sizes need larger kf boosts, smaller frames smaller + * boosts... + */ + if ((lst_yv12->y_width * lst_yv12->y_height) > (320 * 240)) { + kf_boost += 2 * (lst_yv12->y_width * lst_yv12->y_height) / (320 * 240); + } else if ((lst_yv12->y_width * lst_yv12->y_height) < (320 * 240)) { + kf_boost -= 4 * (320 * 240) / (lst_yv12->y_width * lst_yv12->y_height); + } + + /* Min KF boost */ + kf_boost = (int)((double)kf_boost * 100.0) >> 4; /* Scale 16 to 100 */ + if (kf_boost < 250) kf_boost = 250; + + /* + * We do three calculations for kf size. + * The first is based on the error score for the whole kf group. + * The second (optionaly) on the key frames own error if this is + * smaller than the average for the group. + * The final one insures that the frame receives at least the + * allocation it would have received based on its own error score vs + * the error score remaining + * Special case if the sequence appears almost totaly static + * as measured by the decay accumulator. In this case we want to + * spend almost all of the bits on the key frame. + * cpi->twopass.frames_to_key-1 because key frame itself is taken + * care of by kf_boost. + */ + if (decay_accumulator >= 0.99) { + allocation_chunks = ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost; + } else { + allocation_chunks = ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost; + } + + /* Normalize Altboost and allocations chunck down to prevent overflow */ + while (kf_boost > 1000) { + kf_boost /= 2; + allocation_chunks /= 2; + } + + cpi->twopass.kf_group_bits = + (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits; + + /* Calculate the number of bits to be spent on the key frame */ + cpi->twopass.kf_bits = + (int)((double)kf_boost * + ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks)); + + /* Apply an additional limit for CBR */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->twopass.kf_bits > (int)((3 * cpi->buffer_level) >> 2)) { + cpi->twopass.kf_bits = (int)((3 * cpi->buffer_level) >> 2); + } + } + + /* If the key frame is actually easier than the average for the + * kf group (which does sometimes happen... eg a blank intro frame) + * Then use an alternate calculation based on the kf error score + * which should give a smaller key frame. + */ + if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) { + double alt_kf_grp_bits = + ((double)cpi->twopass.bits_left * + (kf_mod_err * (double)cpi->twopass.frames_to_key) / + DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)); + + alt_kf_bits = (int)((double)kf_boost * + (alt_kf_grp_bits / (double)allocation_chunks)); + + if (cpi->twopass.kf_bits > alt_kf_bits) { + cpi->twopass.kf_bits = alt_kf_bits; + } + } + /* Else if it is much harder than other frames in the group make sure + * it at least receives an allocation in keeping with its relative + * error score + */ + else { + alt_kf_bits = (int)((double)cpi->twopass.bits_left * + (kf_mod_err / DOUBLE_DIVIDE_CHECK( + cpi->twopass.modified_error_left))); + + if (alt_kf_bits > cpi->twopass.kf_bits) { + cpi->twopass.kf_bits = alt_kf_bits; + } + } + + cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits; + /* Add in the minimum frame allowance */ + cpi->twopass.kf_bits += cpi->min_frame_bandwidth; + + /* Peer frame bit target for this frame */ + cpi->per_frame_bandwidth = cpi->twopass.kf_bits; + + /* Convert to a per second bitrate */ + cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * cpi->output_framerate); + } + + /* Note the total error score of the kf group minus the key frame itself */ + cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err); + + /* Adjust the count of total modified error left. The count of bits left + * is adjusted elsewhere based on real coded frame sizes + */ + cpi->twopass.modified_error_left -= kf_group_err; + + if (cpi->oxcf.allow_spatial_resampling) { + int resample_trigger = 0; + int last_kf_resampled = 0; + int kf_q; + int scale_val = 0; + int hr, hs, vr, vs; + int new_width = cpi->oxcf.Width; + int new_height = cpi->oxcf.Height; + + int projected_buffer_level; + int tmp_q; + + double projected_bits_perframe; + double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / + (kf_group_coded_err - first_frame.coded_error); + double err_per_frame = kf_group_err / cpi->twopass.frames_to_key; + double bits_per_frame; + double av_bits_per_frame; + double effective_size_ratio; + + if ((cpi->common.Width != cpi->oxcf.Width) || + (cpi->common.Height != cpi->oxcf.Height)) { + last_kf_resampled = 1; + } + + /* Set back to unscaled by defaults */ + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; + + /* Calculate Average bits per frame. */ + av_bits_per_frame = cpi->oxcf.target_bandwidth / + DOUBLE_DIVIDE_CHECK((double)cpi->framerate); + + /* CBR... Use the clip average as the target for deciding resample */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + bits_per_frame = av_bits_per_frame; + } + + /* In VBR we want to avoid downsampling in easy section unless we + * are under extreme pressure So use the larger of target bitrate + * for this section or average bitrate for sequence + */ + else { + /* This accounts for how hard the section is... */ + bits_per_frame = + (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); + + /* Don't turn to resampling in easy sections just because they + * have been assigned a small number of bits + */ + if (bits_per_frame < av_bits_per_frame) { + bits_per_frame = av_bits_per_frame; + } + } + + /* bits_per_frame should comply with our minimum */ + if (bits_per_frame < (cpi->oxcf.target_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100)) { + bits_per_frame = (cpi->oxcf.target_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); + } + + /* Work out if spatial resampling is necessary */ + kf_q = estimate_kf_group_q(cpi, err_per_frame, (int)bits_per_frame, + group_iiratio); + + /* If we project a required Q higher than the maximum allowed Q then + * make a guess at the actual size of frames in this section + */ + projected_bits_perframe = bits_per_frame; + tmp_q = kf_q; + + while (tmp_q > cpi->worst_quality) { + projected_bits_perframe *= 1.04; + tmp_q--; + } + + /* Guess at buffer level at the end of the section */ + projected_buffer_level = + (int)(cpi->buffer_level - + (int)((projected_bits_perframe - av_bits_per_frame) * + cpi->twopass.frames_to_key)); + + /* The trigger for spatial resampling depends on the various + * parameters such as whether we are streaming (CBR) or VBR. + */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + /* Trigger resample if we are projected to fall below down + * sample level or resampled last time and are projected to + * remain below the up sample level + */ + if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * + cpi->oxcf.optimal_buffer_level / 100)) || + (last_kf_resampled && + (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * + cpi->oxcf.optimal_buffer_level / 100)))) { + resample_trigger = 1; + } else { + resample_trigger = 0; + } + } else { + int64_t clip_bits = (int64_t)( + cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / + DOUBLE_DIVIDE_CHECK((double)cpi->framerate)); + int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; + + /* If triggered last time the threshold for triggering again is + * reduced: + * + * Projected Q higher than allowed and Overspend > 5% of total + * bits + */ + if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || + ((kf_q > cpi->worst_quality) && (over_spend > clip_bits / 20))) { + resample_trigger = 1; + } else { + resample_trigger = 0; + } + } + + if (resample_trigger) { + while ((kf_q >= cpi->worst_quality) && (scale_val < 6)) { + scale_val++; + + cpi->common.vert_scale = vscale_lookup[scale_val]; + cpi->common.horiz_scale = hscale_lookup[scale_val]; + + Scale2Ratio(cpi->common.horiz_scale, &hr, &hs); + Scale2Ratio(cpi->common.vert_scale, &vr, &vs); + + new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; + new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs; + + /* Reducing the area to 1/4 does not reduce the complexity + * (err_per_frame) to 1/4... effective_sizeratio attempts + * to provide a crude correction for this + */ + effective_size_ratio = (double)(new_width * new_height) / + (double)(cpi->oxcf.Width * cpi->oxcf.Height); + effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0; + + /* Now try again and see what Q we get with the smaller + * image size + */ + kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, + (int)bits_per_frame, group_iiratio); + } + } + + if ((cpi->common.Width != new_width) || + (cpi->common.Height != new_height)) { + cpi->common.Width = new_width; + cpi->common.Height = new_height; + vp8_alloc_compressor_data(cpi); + } + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.h b/media/libvpx/libvpx/vp8/encoder/firstpass.h new file mode 100644 index 0000000000..f5490f1eff --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/firstpass.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_FIRSTPASS_H_ +#define VPX_VP8_ENCODER_FIRSTPASS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_init_first_pass(VP8_COMP *cpi); +extern void vp8_first_pass(VP8_COMP *cpi); +extern void vp8_end_first_pass(VP8_COMP *cpi); + +extern void vp8_init_second_pass(VP8_COMP *cpi); +extern void vp8_second_pass(VP8_COMP *cpi); +extern void vp8_end_second_pass(VP8_COMP *cpi); + +extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_FIRSTPASS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.c b/media/libvpx/libvpx/vp8/encoder/lookahead.c new file mode 100644 index 0000000000..49f851d019 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/lookahead.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "vpx_config.h" +#include "lookahead.h" +#include "vp8/common/extend.h" + +#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25) + +struct lookahead_ctx { + unsigned int max_sz; /* Absolute size of the queue */ + unsigned int sz; /* Number of buffers currently in the queue */ + unsigned int read_idx; /* Read index */ + unsigned int write_idx; /* Write index */ + struct lookahead_entry *buf; /* Buffer list */ +}; + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, + unsigned int *idx) { + unsigned int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if (++index >= ctx->max_sz) index -= ctx->max_sz; + *idx = index; + return buf; +} + +void vp8_lookahead_destroy(struct lookahead_ctx *ctx) { + if (ctx) { + if (ctx->buf) { + unsigned int i; + + for (i = 0; i < ctx->max_sz; ++i) { + vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img); + } + free(ctx->buf); + } + free(ctx); + } +} + +struct lookahead_ctx *vp8_lookahead_init(unsigned int width, + unsigned int height, + unsigned int depth) { + struct lookahead_ctx *ctx = NULL; + unsigned int i; + + /* Clamp the lookahead queue depth */ + if (depth < 1) { + depth = 1; + } else if (depth > MAX_LAG_BUFFERS) { + depth = MAX_LAG_BUFFERS; + } + + /* Keep last frame in lookahead buffer by increasing depth by 1.*/ + depth += 1; + + /* Align the buffer dimensions */ + width = (width + 15) & ~15u; + height = (height + 15) & ~15u; + + /* Allocate the lookahead structures */ + ctx = calloc(1, sizeof(*ctx)); + if (ctx) { + ctx->max_sz = depth; + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + if (!ctx->buf) goto bail; + for (i = 0; i < depth; ++i) { + if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, + VP8BORDERINPIXELS)) { + goto bail; + } + } + } + return ctx; +bail: + vp8_lookahead_destroy(ctx); + return NULL; +} + +int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, unsigned int flags, + unsigned char *active_map) { + struct lookahead_entry *buf; + int row, col, active_end; + int mb_rows = (src->y_height + 15) >> 4; + int mb_cols = (src->y_width + 15) >> 4; + + if (ctx->sz + 2 > ctx->max_sz) return 1; + ctx->sz++; + buf = pop(ctx, &ctx->write_idx); + + /* Only do this partial copy if the following conditions are all met: + * 1. Lookahead queue has has size of 1. + * 2. Active map is provided. + * 3. This is not a key frame, golden nor altref frame. + */ + if (ctx->max_sz == 1 && active_map && !flags) { + for (row = 0; row < mb_rows; ++row) { + col = 0; + + while (1) { + /* Find the first active macroblock in this row. */ + for (; col < mb_cols; ++col) { + if (active_map[col]) break; + } + + /* No more active macroblock in this row. */ + if (col == mb_cols) break; + + /* Find the end of active region in this row. */ + active_end = col; + + for (; active_end < mb_cols; ++active_end) { + if (!active_map[active_end]) break; + } + + /* Only copy this active region. */ + vp8_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, + 16, (active_end - col) << 4); + + /* Start again from the end of this active region. */ + col = active_end; + } + + active_map += mb_cols; + } + } else { + vp8_copy_and_extend_frame(src, &buf->img); + } + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->flags = flags; + return 0; +} + +struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = NULL; + + assert(ctx != NULL); + if (ctx->sz && (drain || ctx->sz == ctx->max_sz - 1)) { + buf = pop(ctx, &ctx->read_idx); + ctx->sz--; + } + return buf; +} + +struct lookahead_entry *vp8_lookahead_peek(struct lookahead_ctx *ctx, + unsigned int index, int direction) { + struct lookahead_entry *buf = NULL; + + if (direction == PEEK_FORWARD) { + assert(index < ctx->max_sz - 1); + if (index < ctx->sz) { + index += ctx->read_idx; + if (index >= ctx->max_sz) index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (direction == PEEK_BACKWARD) { + assert(index == 1); + + if (ctx->read_idx == 0) { + index = ctx->max_sz - 1; + } else { + index = ctx->read_idx - index; + } + buf = ctx->buf + index; + } + + return buf; +} + +unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; } diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.h b/media/libvpx/libvpx/vp8/encoder/lookahead.h new file mode 100644 index 0000000000..bf0401190b --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/lookahead.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VP8_ENCODER_LOOKAHEAD_H_ +#define VPX_VP8_ENCODER_LOOKAHEAD_H_ +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct lookahead_entry { + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + unsigned int flags; +}; + +struct lookahead_ctx; + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + * + * + */ +struct lookahead_ctx *vp8_lookahead_init(unsigned int width, + unsigned int height, + unsigned int depth); + +/**\brief Destroys the lookahead stage + * + */ +void vp8_lookahead_destroy(struct lookahead_ctx *ctx); + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. + * + * If active_map is non-NULL and there is only one frame in the queue, then copy + * only active macroblocks. + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] flags Flags set on this frame + * \param[in] active_map Map that specifies which macroblock is active + */ +int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, unsigned int flags, + unsigned char *active_map); + +/**\brief Get the next source buffer to encode + * + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * + * \retval NULL, if drain set and queue is empty + * \retval NULL, if drain not set and queue not of the configured depth + * + */ +struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, int drain); + +#define PEEK_FORWARD 1 +#define PEEK_BACKWARD (-1) +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * + * \retval NULL, if no buffer exists at the specified index + * + */ +struct lookahead_entry *vp8_lookahead_peek(struct lookahead_ctx *ctx, + unsigned int index, int direction); + +/**\brief Get the number of frames currently in the lookahead queue + * + * \param[in] ctx Pointer to the lookahead context + */ +unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_LOOKAHEAD_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c new file mode 100644 index 0000000000..a08d4d3f63 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m; \ + \ + tmp0_m = __lsx_vreplvei_h(coeff, val0); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \ + DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1, \ + const2); \ + } + +#define RET_1_IF_NZERO_H(_in) \ + ({ \ + __m128i tmp_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + __m128i max_m = __lsx_vldi(0xFF); \ + \ + tmp_m = __lsx_vseqi_h(_in, 0); \ + tmp_m = __lsx_vxor_v(tmp_m, max_m); \ + tmp_m = __lsx_vand_v(tmp_m, one_m); \ + \ + tmp_m; \ + }) + +void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3; + __m128i tmp0, tmp1, tmp2, tmp3, const0, const1; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i out0, out1, out2, out3; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1, + in3); + in0 = __lsx_vadd_h(tmp0, tmp1); + in2 = __lsx_vsub_h(tmp0, tmp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + tmp0 = __lsx_vilvl_h(in3, in1); + in1 = __lsx_vreplvei_h(coeff, 3); + out0 = __lsx_vpackev_h(zero, in1); + coeff = __lsx_vilvl_h(zero, coeff); + out1 = __lsx_vreplvei_w(coeff, 0); + DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0, + out1); + DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + tmp2 = __lsx_vadd_h(tmp0, tmp1); + tmp3 = __lsx_vsub_h(tmp0, tmp1); + DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2); + DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2); + DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2); + tmp1 = RET_1_IF_NZERO_H(in3); + DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1); + out3 = __lsx_vadd_w(out3, out1); + out1 = __lsx_vreplvei_w(coeff, 1); + DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1, + out3); + DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3); + out1 = __lsx_vadd_w(out1, tmp1); + DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16); + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c new file mode 100644 index 0000000000..4ad4caba60 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) { + int32_t err = 0; + __m128i dq_coeff0, dq_coeff1, coeff0, coeff1; + __m128i reg0, reg1, reg2, reg3, error; + + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0, + dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1); + DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0, + reg2); + DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1, + reg3); + error = __lsx_vmul_w(reg0, reg0); + DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error); + error = __lsx_vmadd_w(error, reg3, reg3); + error = __lsx_vhaddw_d_w(error, error); + err = __lsx_vpickve2gr_w(error, 0); + err += __lsx_vpickve2gr_w(error, 2); + return err; +} + +int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff, *dq_coeff; + int32_t err = 0; + uint32_t loop_cnt; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error; + __m128i mask0 = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + if (dc == 1) { + mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0); + } + + for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) { + int32_t loop_tmp = loop_cnt << 1; + be = &mb->block[loop_tmp]; + bd = &mb->e_mbd.block[loop_tmp]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0, + src1, tmp0, tmp1); + be = &mb->block[loop_tmp + 1]; + bd = &mb->e_mbd.block[loop_tmp + 1]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2, + src3, tmp2, tmp3); + DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg1, reg3, reg5, reg7); + DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0, + reg4); + error = __lsx_vmul_w(reg0, reg0); + DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3, + reg3, error, reg4, reg4, error, error, error, error); + DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error, + error); + error = __lsx_vmadd_w(error, reg7, reg7); + error = __lsx_vhaddw_d_w(error, error); + error = __lsx_vhaddw_q_d(error, error); + err += __lsx_vpickve2gr_w(error, 0); + } + return err; +} diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c new file mode 100644 index 0000000000..75889192a7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +#define BOOST_QUANT1(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +#define BOOST_QUANT2(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui + 8; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +static int8_t exact_regular_quantize_b_lsx( + int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t eob; + int16_t *boost_temp = zbin_boost; + __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 }; + __m128i sign_z0, sign_z1, q_coeff0, q_coeff1; + __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0, + de_quant1; + __m128i z0, z1, round0, round1, quant0, quant2; + __m128i inv_zig_zag0, inv_zig_zag1; + __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 }; + __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 }; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i zero = __lsx_vldi(0); + + zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in); + inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag); + inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag); + eob = -1; + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0, + round1); + DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2, + tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2, + z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1); + DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1); + DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1); + + DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3); + DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1, + quant0, quant2); + DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1); + DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1); + DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + + BOOST_QUANT1(z_bin0, x0, sign_x0, 0); + BOOST_QUANT1(z_bin0, x0, sign_x0, 1); + BOOST_QUANT1(z_bin0, x0, sign_x0, 2); + BOOST_QUANT1(z_bin0, x0, sign_x0, 3); + BOOST_QUANT1(z_bin0, x0, sign_x0, 4); + BOOST_QUANT1(z_bin0, x0, sign_x0, 5); + BOOST_QUANT1(z_bin0, x0, sign_x0, 6); + BOOST_QUANT1(z_bin0, x0, sign_x0, 7); + + BOOST_QUANT2(z_bin1, x1, sign_x1, 0); + BOOST_QUANT2(z_bin1, x1, sign_x1, 1); + BOOST_QUANT2(z_bin1, x1, sign_x1, 2); + BOOST_QUANT2(z_bin1, x1, sign_x1, 3); + BOOST_QUANT2(z_bin1, x1, sign_x1, 4); + BOOST_QUANT2(z_bin1, x1, sign_x1, 5); + BOOST_QUANT2(z_bin1, x1, sign_x1, 6); + BOOST_QUANT2(z_bin1, x1, sign_x1, 7); + + DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1); + DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1, + sign_x1, sign_x0, q_coeff0, q_coeff1); + DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, + de_quant1); + __lsx_vst(q_coeff0, q_coeff, 0); + __lsx_vst(q_coeff1, q_coeff, 16); + __lsx_vst(de_quant0, dq_coeff, 0); + __lsx_vst(de_quant1, dq_coeff, 16); + + return (int8_t)(eob + 1); +} + +void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) { + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_lsx( + zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr); +} diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.c b/media/libvpx/libvpx/vp8/encoder/mcomp.c new file mode 100644 index 0000000000..bc150e482b --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mcomp.c @@ -0,0 +1,1561 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "onyx_int.h" +#include "mcomp.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_config.h" +#include +#include +#include +#include "vp8/common/findnearmv.h" +#include "vp8/common/common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { + /* MV costing is based on the distribution of vectors in the previous + * frame and as such will tend to over state the cost of vectors. In + * addition coding a new vector can have a knock on effect on the cost + * of subsequent vectors and the quality of prediction from NEAR and + * NEAREST for subsequent blocks. The "Weight" parameter allows, to a + * limited extent, for some account to be taken of these factors. + */ + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; +} + +static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], + int error_per_bit) { + /* Ignore mv costing if mvcost is NULL */ + if (mvcost) { + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + + 128) >> + 8; + } + return 0; +} + +static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvsadcost[2], + int error_per_bit) { + /* Calculate sad error cost on full pixel basis. */ + /* Ignore mv costing if mvsadcost is NULL */ + if (mvsadcost) { + return ((mvsadcost[0][(mv->as_mv.row - ref->as_mv.row)] + + mvsadcost[1][(mv->as_mv.col - ref->as_mv.col)]) * + error_per_bit + + 128) >> + 8; + } + return 0; +} + +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride) { + int Len; + int search_site_count = 0; + + /* Generate offsets for 4 search sites per step. */ + Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) { + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + /* Contract. */ + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 4; +} + +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { + int Len; + int search_site_count = 0; + + /* Generate offsets for 8 search sites per step. */ + Len = MAX_FIRST_STEP; + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = 0; + search_site_count++; + + while (Len > 0) { + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = 0; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = -Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = 0; + x->ss[search_site_count].offset = Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride - Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = -Len; + x->ss[search_site_count].offset = -Len * stride + Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride - Len; + search_site_count++; + + /* Compute offsets for search sites. */ + x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.row = Len; + x->ss[search_site_count].offset = Len * stride + Len; + search_site_count++; + + /* Contract. */ + Len /= 2; + } + + x->ss_count = search_site_count; + x->searches_per_step = 8; +} + +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. + * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ + +/* estimated cost of a motion vector (r,c) */ +#define MVC(r, c) \ + (mvcost \ + ? ((mvcost[0][(r)-rr] + mvcost[1][(c)-rc]) * error_per_bit + 128) >> 8 \ + : 0) +/* pointer to predictor base of a motionvector */ +#define PRE(r, c) (y + (((r) >> 2) * y_stride + ((c) >> 2) - (offset))) +/* convert motion vector component to offset for svf calc */ +#define SP(x) (((x)&3) << 1) +/* returns subpixel variance error function. */ +#define DIST(r, c) \ + vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse) +#define IFMVCV(r, c, s, e) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; +/* returns distortion + motion vector cost */ +#define ERR(r, c) (MVC(r, c) + DIST(r, c)) +/* checks if (r,c) has better score than previous best */ +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV( \ + r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ + } while (0) + +int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp8_variance_fn_ptr_t *vfp, + int *mvcost[2], int *distortion, + unsigned int *sse1) { + unsigned char *z = (*(b->base_src) + b->src); + + int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1; + int br = bestmv->as_mv.row * 4, bc = bestmv->as_mv.col * 4; + int tr = br, tc = bc; + unsigned int besterr; + unsigned int left, right, up, down, diag; + unsigned int sse; + unsigned int whichdir; + unsigned int halfiters = 4; + unsigned int quarteriters = 4; + int thismse; + + int minc = VPXMAX(x->mv_col_min * 4, + (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1)); + int maxc = VPXMIN(x->mv_col_max * 4, + (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1)); + int minr = VPXMAX(x->mv_row_min * 4, + (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1)); + int maxr = VPXMIN(x->mv_row_max * 4, + (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1)); + + int y_stride; + int offset; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + unsigned char *y; + int buf_r1, buf_r2, buf_c1; + + /* Clamping to avoid out-of-range data access */ + buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min) + ? (bestmv->as_mv.row - x->mv_row_min) + : 3; + buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max) + ? (x->mv_row_max - bestmv->as_mv.row) + : 3; + buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min) + ? (bestmv->as_mv.col - x->mv_col_min) + : 3; + y_stride = 32; + + /* Copy to intermediate buffer before searching. */ + vfp->copymem(y_0 - buf_c1 - pre_stride * buf_r1, pre_stride, xd->y_buf, + y_stride, 16 + buf_r1 + buf_r2); + y = xd->y_buf + y_stride * buf_r1 + buf_c1; +#else + unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + y_stride = pre_stride; +#endif + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + + /* central mv */ + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; + + /* calculate central point error */ + besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + /* TODO: Each subsequent iteration checks at least one point in common + * with the last iteration could be 2 ( if diag selected) + */ + while (--halfiters) { + /* 1/2 pel */ + CHECK_BETTER(left, tr, tc - 2); + CHECK_BETTER(right, tr, tc + 2); + CHECK_BETTER(up, tr - 2, tc); + CHECK_BETTER(down, tr + 2, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: CHECK_BETTER(diag, tr - 2, tc - 2); break; + case 1: CHECK_BETTER(diag, tr - 2, tc + 2); break; + case 2: CHECK_BETTER(diag, tr + 2, tc - 2); break; + case 3: CHECK_BETTER(diag, tr + 2, tc + 2); break; + } + + /* no reason to check the same one again. */ + if (tr == br && tc == bc) break; + + tr = br; + tc = bc; + } + + /* TODO: Each subsequent iteration checks at least one point in common + * with the last iteration could be 2 ( if diag selected) + */ + + /* 1/4 pel */ + while (--quarteriters) { + CHECK_BETTER(left, tr, tc - 1); + CHECK_BETTER(right, tr, tc + 1); + CHECK_BETTER(up, tr - 1, tc); + CHECK_BETTER(down, tr + 1, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: CHECK_BETTER(diag, tr - 1, tc - 1); break; + case 1: CHECK_BETTER(diag, tr - 1, tc + 1); break; + case 2: CHECK_BETTER(diag, tr + 1, tc - 1); break; + case 3: CHECK_BETTER(diag, tr + 1, tc + 1); break; + } + + /* no reason to check the same one again. */ + if (tr == br && tc == bc) break; + + tr = br; + tc = bc; + } + + bestmv->as_mv.row = br * 2; + bestmv->as_mv.col = bc * 2; + + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) { + return INT_MAX; + } + + return besterr; +} +#undef MVC +#undef PRE +#undef SP +#undef DIST +#undef IFMVCV +#undef ERR +#undef CHECK_BETTER + +int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp8_variance_fn_ptr_t *vfp, + int *mvcost[2], int *distortion, + unsigned int *sse1) { + int bestmse = INT_MAX; + int_mv startmv; + int_mv this_mv; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + int whichdir; + int thismse; + int y_stride; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + unsigned char *y; + + y_stride = 32; + /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */ + vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18); + y = xd->y_buf + y_stride + 1; +#else + unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + y_stride = pre_stride; +#endif + + /* central mv */ + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; + startmv = *bestmv; + + /* calculate central point error */ + bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1); + *distortion = bestmse; + bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + /* go left then right and check error */ + this_mv.as_mv.row = startmv.as_mv.row; + this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); + /* "halfpix" horizontal variance */ + thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) { + *bestmv = this_mv; + bestmse = left; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.col += 8; + /* "halfpix" horizontal variance */ + thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) { + *bestmv = this_mv; + bestmse = right; + *distortion = thismse; + *sse1 = sse; + } + + /* go up then down and check error */ + this_mv.as_mv.col = startmv.as_mv.col; + this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); + /* "halfpix" vertical variance */ + thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) { + *bestmv = this_mv; + bestmse = up; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.row += 8; + /* "halfpix" vertical variance */ + thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) { + *bestmv = this_mv; + bestmse = down; + *distortion = thismse; + *sse1 = sse; + } + + /* now check 1 more diagonal */ + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + this_mv = startmv; + + switch (whichdir) { + case 0: + this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; + this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; + /* "halfpix" horizontal/vertical variance */ + thismse = + vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 1: + this_mv.as_mv.col += 4; + this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 2: + this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; + this_mv.as_mv.row += 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 3: + default: + this_mv.as_mv.col += 4; + this_mv.as_mv.row += 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse); + break; + } + + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) { + *bestmv = this_mv; + bestmse = diag; + *distortion = thismse; + *sse1 = sse; + } + + /* time to check quarter pels. */ + if (bestmv->as_mv.row < startmv.as_mv.row) y -= y_stride; + + if (bestmv->as_mv.col < startmv.as_mv.col) y--; + + startmv = *bestmv; + + /* go left then right and check error */ + this_mv.as_mv.row = startmv.as_mv.row; + + if (startmv.as_mv.col & 7) { + this_mv.as_mv.col = startmv.as_mv.col - 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + } else { + this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, + b->src_stride, &sse); + } + + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) { + *bestmv = this_mv; + bestmse = left; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.col += 4; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, + z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) { + *bestmv = this_mv; + bestmse = right; + *distortion = thismse; + *sse1 = sse; + } + + /* go up then down and check error */ + this_mv.as_mv.col = startmv.as_mv.col; + + if (startmv.as_mv.row & 7) { + this_mv.as_mv.row = startmv.as_mv.row - 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + } else { + this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, + b->src_stride, &sse); + } + + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) { + *bestmv = this_mv; + bestmse = up; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.row += 4; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, + z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) { + *bestmv = this_mv; + bestmse = down; + *distortion = thismse; + *sse1 = sse; + } + + /* now check 1 more diagonal */ + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + this_mv = startmv; + + switch (whichdir) { + case 0: + + if (startmv.as_mv.row & 7) { + this_mv.as_mv.row -= 2; + + if (startmv.as_mv.col & 7) { + this_mv.as_mv.col -= 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + } else { + this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, + b->src_stride, &sse); + } + } else { + this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; + + if (startmv.as_mv.col & 7) { + this_mv.as_mv.col -= 2; + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, + z, b->src_stride, &sse); + } else { + this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; + thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, + &sse); + } + } + + break; + case 1: + this_mv.as_mv.col += 2; + + if (startmv.as_mv.row & 7) { + this_mv.as_mv.row -= 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + } else { + this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, + b->src_stride, &sse); + } + + break; + case 2: + this_mv.as_mv.row += 2; + + if (startmv.as_mv.col & 7) { + this_mv.as_mv.col -= 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + } else { + this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, + b->src_stride, &sse); + } + + break; + case 3: + this_mv.as_mv.col += 2; + this_mv.as_mv.row += 2; + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, + this_mv.as_mv.row & 7, z, b->src_stride, &sse); + break; + } + + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) { + *bestmv = this_mv; + bestmse = diag; + *distortion = thismse; + *sse1 = sse; + } + + return bestmse; +} + +int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp8_variance_fn_ptr_t *vfp, + int *mvcost[2], int *distortion, + unsigned int *sse1) { + int bestmse = INT_MAX; + int_mv startmv; + int_mv this_mv; + unsigned char *z = (*(b->base_src) + b->src); + int left, right, up, down, diag; + unsigned int sse; + int whichdir; + int thismse; + int y_stride; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + unsigned char *y; + + y_stride = 32; + /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */ + vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18); + y = xd->y_buf + y_stride + 1; +#else + unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + + bestmv->as_mv.col; + y_stride = pre_stride; +#endif + + /* central mv */ + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; + startmv = *bestmv; + + /* calculate central point error */ + bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1); + *distortion = bestmse; + bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); + + /* go left then right and check error */ + this_mv.as_mv.row = startmv.as_mv.row; + this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); + /* "halfpix" horizontal variance */ + thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (left < bestmse) { + *bestmv = this_mv; + bestmse = left; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.col += 8; + /* "halfpix" horizontal variance */ + thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (right < bestmse) { + *bestmv = this_mv; + bestmse = right; + *distortion = thismse; + *sse1 = sse; + } + + /* go up then down and check error */ + this_mv.as_mv.col = startmv.as_mv.col; + this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); + /* "halfpix" vertical variance */ + thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (up < bestmse) { + *bestmv = this_mv; + bestmse = up; + *distortion = thismse; + *sse1 = sse; + } + + this_mv.as_mv.row += 8; + /* "halfpix" vertical variance */ + thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (down < bestmse) { + *bestmv = this_mv; + bestmse = down; + *distortion = thismse; + *sse1 = sse; + } + + /* now check 1 more diagonal - */ + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + this_mv = startmv; + + switch (whichdir) { + case 0: + this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; + this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; + /* "halfpix" horizontal/vertical variance */ + thismse = + vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 1: + this_mv.as_mv.col += 4; + this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 2: + this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; + this_mv.as_mv.row += 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse); + break; + case 3: + default: + this_mv.as_mv.col += 4; + this_mv.as_mv.row += 4; + /* "halfpix" horizontal/vertical variance */ + thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse); + break; + } + + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + + if (diag < bestmse) { + *bestmv = this_mv; + bestmse = diag; + *distortion = thismse; + *sse1 = sse; + } + + return bestmse; +} + +#define CHECK_BOUNDS(range) \ + do { \ + all_in = 1; \ + all_in &= ((br - range) >= x->mv_row_min); \ + all_in &= ((br + range) <= x->mv_row_max); \ + all_in &= ((bc - range) >= x->mv_col_min); \ + all_in &= ((bc + range) <= x->mv_col_max); \ + } while (0) + +#define CHECK_POINT \ + { \ + if (this_mv.as_mv.col < x->mv_col_min) continue; \ + if (this_mv.as_mv.col > x->mv_col_max) continue; \ + if (this_mv.as_mv.row < x->mv_row_min) continue; \ + if (this_mv.as_mv.row > x->mv_row_max) continue; \ + } + +#define CHECK_BETTER \ + do { \ + if (thissad < bestsad) { \ + thissad += \ + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \ + if (thissad < bestsad) { \ + bestsad = thissad; \ + best_site = i; \ + } \ + } \ + } while (0) + +static const MV next_chkpts[6][3] = { + { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } }, + { { 1, -2 }, { 2, 0 }, { 1, 2 } }, { { 2, 0 }, { 1, 2 }, { -1, 2 } }, + { { 1, 2 }, { -1, 2 }, { -2, 0 } }, { { -1, 2 }, { -2, 0 }, { -1, -2 } } +}; + +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int_mv *center_mv) { + MV hex[6] = { + { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } + }; + MV neighbors[4] = { { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 } }; + int i, j; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + + int in_what_stride = pre_stride; + int br, bc; + int_mv this_mv; + unsigned int bestsad; + unsigned int thissad; + unsigned char *base_offset; + unsigned char *this_offset; + int k = -1; + int all_in; + int best_site = -1; + int hex_range = 127; + int dia_range = 8; + + int_mv fcenter_mv; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + /* adjust ref_mv to make sure it is within MV range */ + vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, + x->mv_row_max); + br = ref_mv->as_mv.row; + bc = ref_mv->as_mv.col; + + /* Work out the start point for the search */ + base_offset = (unsigned char *)(base_pre + d->offset); + this_offset = base_offset + (br * (pre_stride)) + bc; + this_mv.as_mv.row = br; + this_mv.as_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride) + + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); + +#if CONFIG_MULTI_RES_ENCODING + /* Lower search range based on prediction info */ + if (search_param >= 6) + goto cal_neighbors; + else if (search_param >= 5) + hex_range = 4; + else if (search_param >= 4) + hex_range = 6; + else if (search_param >= 3) + hex_range = 15; + else if (search_param >= 2) + hex_range = 31; + else if (search_param >= 1) + hex_range = 63; + + dia_range = 8; +#else + (void)search_param; +#endif + + /* hex search */ + CHECK_BOUNDS(2); + + if (all_in) { + for (i = 0; i < 6; ++i) { + this_mv.as_mv.row = br + hex[i].row; + this_mv.as_mv.col = bc + hex[i].col; + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } else { + for (i = 0; i < 6; ++i) { + this_mv.as_mv.row = br + hex[i].row; + this_mv.as_mv.col = bc + hex[i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } + + if (best_site == -1) { + goto cal_neighbors; + } else { + br += hex[best_site].row; + bc += hex[best_site].col; + k = best_site; + } + + for (j = 1; j < hex_range; ++j) { + best_site = -1; + CHECK_BOUNDS(2); + + if (all_in) { + for (i = 0; i < 3; ++i) { + this_mv.as_mv.row = br + next_chkpts[k][i].row; + this_mv.as_mv.col = bc + next_chkpts[k][i].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } else { + for (i = 0; i < 3; ++i) { + this_mv.as_mv.row = br + next_chkpts[k][i].row; + this_mv.as_mv.col = bc + next_chkpts[k][i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } + + if (best_site == -1) { + break; + } else { + br += next_chkpts[k][best_site].row; + bc += next_chkpts[k][best_site].col; + k += 5 + best_site; + if (k >= 12) { + k -= 12; + } else if (k >= 6) { + k -= 6; + } + } + } + +/* check 4 1-away neighbors */ +cal_neighbors: + for (j = 0; j < dia_range; ++j) { + best_site = -1; + CHECK_BOUNDS(1); + + if (all_in) { + for (i = 0; i < 4; ++i) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } else { + for (i = 0; i < 4; ++i) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); + CHECK_BETTER; + } + } + + if (best_site == -1) { + break; + } else { + br += neighbors[best_site].row; + bc += neighbors[best_site].col; + } + } + + best_mv->as_mv.row = br; + best_mv->as_mv.col = bc; + + return bestsad; +} +#undef CHECK_BOUNDS +#undef CHECK_POINT +#undef CHECK_BETTER + +int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + int *num00, vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv) { + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int in_what_stride = pre_stride; + unsigned char *best_address; + + int tot_steps; + int_mv this_mv; + + unsigned int bestsad; + unsigned int thissad; + int best_site = 0; + int last_site = 0; + + int ref_row; + int ref_col; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + + int *mvsadcost[2]; + int_mv fcenter_mv; + + mvsadcost[0] = x->mvsadcost[0]; + mvsadcost[1] = x->mvsadcost[1]; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, + x->mv_row_max); + ref_row = ref_mv->as_mv.row; + ref_col = ref_mv->as_mv.col; + *num00 = 0; + best_mv->as_mv.row = ref_row; + best_mv->as_mv.col = ref_col; + + /* Work out the start point for the search */ + in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + + ref_col); + best_address = in_what; + + /* Check the starting position */ + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + /* search_param determines the length of the initial step and hence + * the number of iterations 0 = initial step (MAX_FIRST_STEP) pel : + * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + */ + ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + + for (step = 0; step < tot_steps; ++step) { + for (j = 0; j < x->searches_per_step; ++j) { + /* Trap illegal vectors */ + this_row_offset = best_mv->as_mv.row + ss[i].mv.row; + this_col_offset = best_mv->as_mv.col + ss[i].mv.col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) + + { + check_here = ss[i].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + + i++; + } + + if (best_site != last_site) { + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); +} + +#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX +int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + int *num00, vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv) { + int i, j, step; + + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int in_what_stride = pre_stride; + unsigned char *best_address; + + int tot_steps; + int_mv this_mv; + + unsigned int bestsad; + unsigned int thissad; + int best_site = 0; + int last_site = 0; + + int ref_row; + int ref_col; + int this_row_offset; + int this_col_offset; + search_site *ss; + + unsigned char *check_here; + + int *mvsadcost[2]; + int_mv fcenter_mv; + + mvsadcost[0] = x->mvsadcost[0]; + mvsadcost[1] = x->mvsadcost[1]; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, + x->mv_row_max); + ref_row = ref_mv->as_mv.row; + ref_col = ref_mv->as_mv.col; + *num00 = 0; + best_mv->as_mv.row = ref_row; + best_mv->as_mv.col = ref_col; + + /* Work out the start point for the search */ + in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + + ref_col); + best_address = in_what; + + /* Check the starting position */ + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + /* search_param determines the length of the initial step and hence the + * number of iterations 0 = initial step (MAX_FIRST_STEP) pel : 1 = + * (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + */ + ss = &x->ss[search_param * x->searches_per_step]; + tot_steps = (x->ss_count / x->searches_per_step) - search_param; + + i = 1; + + for (step = 0; step < tot_steps; ++step) { + int all_in = 1, t; + + /* To know if all neighbor points are within the bounds, 4 bounds + * checking are enough instead of checking 4 bounds for each + * points. + */ + all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min); + all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max); + all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min); + all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max); + + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < x->searches_per_step; j += 4) { + const unsigned char *block_offset[4]; + + for (t = 0; t < 4; ++t) { + block_offset[t] = ss[i + t].offset + best_address; + } + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + if (sad_array[t] < bestsad) { + this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row; + this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col; + sad_array[t] += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + if (sad_array[t] < bestsad) { + bestsad = sad_array[t]; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < x->searches_per_step; ++j) { + /* Trap illegal vectors */ + this_row_offset = best_mv->as_mv.row + ss[i].mv.row; + this_col_offset = best_mv->as_mv.col + ss[i].mv.col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[i].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + + if (best_site != last_site) { + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); +} +#endif // HAVE_SSE2 || HAVE_MSA || HAVE_LSX + +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv) { + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int in_what_stride = pre_stride; + int mv_stride = pre_stride; + unsigned char *bestaddress; + int_mv *best_mv = &d->bmi.mv; + int_mv this_mv; + unsigned int bestsad; + unsigned int thissad; + int r, c; + + unsigned char *check_here; + + int ref_row = ref_mv->as_mv.row; + int ref_col = ref_mv->as_mv.col; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + int *mvsadcost[2]; + int_mv fcenter_mv; + + mvsadcost[0] = x->mvsadcost[0]; + mvsadcost[1] = x->mvsadcost[1]; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + /* Work out the mid point for the search */ + in_what = base_pre + d->offset; + bestaddress = in_what + (ref_row * pre_stride) + ref_col; + + best_mv->as_mv.row = ref_row; + best_mv->as_mv.col = ref_col; + + /* Baseline value at the centre */ + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + /* Apply further limits to prevent us looking using vectors that stretch + * beyond the UMV border + */ + if (col_min < x->mv_col_min) col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) col_max = x->mv_col_max; + + if (row_min < x->mv_row_min) row_min = x->mv_row_min; + + if (row_max > x->mv_row_max) row_max = x->mv_row_max; + + for (r = row_min; r < row_max; ++r) { + this_mv.as_mv.row = r; + check_here = r * mv_stride + in_what + col_min; + + for (c = col_min; c < col_max; ++c) { + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + this_mv.as_mv.col = c; + thissad += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row = r; + best_mv->as_mv.col = c; + bestaddress = check_here; + } + } + + check_here++; + } + } + + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; + + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); +} + +int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int error_per_bit, + int search_range, vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv) { + MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + int i, j; + short this_row_offset, this_col_offset; + + int what_stride = b->src_stride; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int in_what_stride = pre_stride; + unsigned char *what = (*(b->base_src) + b->src); + unsigned char *best_address = + (unsigned char *)(base_pre + d->offset + + (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col); + unsigned char *check_here; + int_mv this_mv; + unsigned int bestsad; + unsigned int thissad; + + int *mvsadcost[2]; + int_mv fcenter_mv; + + mvsadcost[0] = x->mvsadcost[0]; + mvsadcost[1] = x->mvsadcost[1]; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit); + + for (i = 0; i < search_range; ++i) { + int best_site = -1; + + for (j = 0; j < 4; ++j) { + this_row_offset = ref_mv->as_mv.row + neighbors[j].row; + this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->as_mv.row += neighbors[best_site].row; + ref_mv->as_mv.col += neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; + } + } + + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); +} + +#if HAVE_SSE2 || HAVE_MSA +int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int error_per_bit, + int search_range, vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv) { + MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + int i, j; + short this_row_offset, this_col_offset; + + int what_stride = b->src_stride; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int in_what_stride = pre_stride; + unsigned char *what = (*(b->base_src) + b->src); + unsigned char *best_address = + (unsigned char *)(base_pre + d->offset + + (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col); + unsigned char *check_here; + int_mv this_mv; + unsigned int bestsad; + unsigned int thissad; + + int *mvsadcost[2]; + int_mv fcenter_mv; + + mvsadcost[0] = x->mvsadcost[0]; + mvsadcost[1] = x->mvsadcost[1]; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit); + + for (i = 0; i < search_range; ++i) { + int best_site = -1; + int all_in = 1; + + all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min); + all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max); + all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min); + all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max); + + if (all_in) { + unsigned int sad_array[4]; + const unsigned char *block_offset[4]; + block_offset[0] = best_address - in_what_stride; + block_offset[1] = best_address - 1; + block_offset[2] = best_address + 1; + block_offset[3] = best_address + in_what_stride; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (j = 0; j < 4; ++j) { + if (sad_array[j] < bestsad) { + this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row; + this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col; + sad_array[j] += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit); + + if (sad_array[j] < bestsad) { + bestsad = sad_array[j]; + best_site = j; + } + } + } + } else { + for (j = 0; j < 4; ++j) { + this_row_offset = ref_mv->as_mv.row + neighbors[j].row; + this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = j; + } + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->as_mv.row += neighbors[best_site].row; + ref_mv->as_mv.col += neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; + } + } + + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; + + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); +} +#endif // HAVE_SSE2 || HAVE_MSA diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.h b/media/libvpx/libvpx/vp8/encoder/mcomp.h new file mode 100644 index 0000000000..1ee6fe5dd6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mcomp.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_MCOMP_H_ +#define VPX_VP8_ENCODER_MCOMP_H_ + +#include "block.h" +#include "vpx_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* The maximum number of steps in a step search given the largest allowed + * initial step + */ +#define MAX_MVSEARCH_STEPS 8 + +/* Max full pel mv specified in 1 pel units */ +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) + +/* Maximum size of the first step in full pel units */ +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) + +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); + +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int_mv *center_mv); + +typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp8_variance_fn_ptr_t *vfp, + int *mvcost[2], int *distortion, + unsigned int *sse); + +fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +fractional_mv_step_fp vp8_find_best_sub_pixel_step; +fractional_mv_step_fp vp8_find_best_half_pixel_step; +fractional_mv_step_fp vp8_skip_fractional_mv_step; + +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv); + +typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int sad_per_bit, + int distance, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv); + +typedef int (*vp8_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int_mv *best_mv, + int search_param, int sad_per_bit, + int *num00, + vp8_variance_fn_ptr_t *fn_ptr, + int *mvcost[2], int_mv *center_mv); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_MCOMP_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 0000000000..0fd25fcda5 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. + */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + double ff_ph_op1, ff_ph_op3; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x14e808a914e808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op1] \n\t" + "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op3] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip), + [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); + /* clang-format on */ +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0000000100000001 \n\t" + "dmtc1 %[tmp0], %[ff_pw_01] \n\t" + "dli %[tmp0], 0x0000000300000003 \n\t" + "dmtc1 %[tmp0], %[ff_pw_03] \n\t" + "dli %[tmp0], 0x0001000000010000 \n\t" + "dmtc1 %[tmp0], %[ff_pw_mask] \n\t" + MMI_LI(%[tmp0], 0x02) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask), + [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01), + [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03), + [ff_ph_01]"=&f"(ff_ph_01) + : [op]"r"(output), [pitch]"r"((mips_reg)pitch) + : "memory" + ); + /* clang-format on */ +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 0000000000..1986444aa3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + int64_t eob = 0; + double ones; + + __asm__ volatile( + // loop 0 ~ 7 + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pcmpeqh %[ones], %[ones], %[ones] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "dli %[tmp0], 0x0f \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "dli %[tmp0], 0x10 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "dli %[tmp0], 0xaa \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "dli %[tmp0], 0xffff \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c new file mode 100644 index 0000000000..3084667552 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \ + \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \ + PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \ + PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + v8i16 tmp0_m; \ + \ + SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \ + ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \ + } + +#define RET_1_IF_NZERO_H(in0) \ + ({ \ + v8i16 tmp0_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tmp0_m = __msa_ceqi_h(in0, 0); \ + tmp0_m = tmp0_m ^ 255; \ + tmp0_m = one_m & tmp0_m; \ + \ + tmp0_m; \ + }) + +#define RET_1_IF_NZERO_W(in0) \ + ({ \ + v4i32 tmp0_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + tmp0_m = __msa_ceqi_w(in0, 0); \ + tmp0_m = tmp0_m ^ 255; \ + tmp0_m = one_m & tmp0_m; \ + \ + tmp0_m; \ + }) + +#define RET_1_IF_NEG_W(in0) \ + ({ \ + v4i32 tmp0_m; \ + \ + v4i32 one_m = __msa_ldi_w(1); \ + tmp0_m = __msa_clti_s_w(in0, 0); \ + tmp0_m = one_m & tmp0_m; \ + \ + tmp0_m; \ + }) + +void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { + v8i16 in0, in1, in2, in3; + v8i16 temp0, temp1; + v8i16 const0, const1; + v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; + v4i32 out0, out1, out2, out3; + v8i16 zero = { 0 }; + + LD_SH4(input, pitch / 2, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); + SLLI_4V(temp0, temp1, in1, in3, 3); + in0 = temp0 + temp1; + in2 = temp0 - temp1; + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + temp0 = __msa_ilvr_h(in3, in1); + in1 = __msa_splati_h(coeff, 3); + out0 = (v4i32)__msa_ilvev_h(zero, in1); + coeff = __msa_ilvl_h(zero, coeff); + out1 = __msa_splati_w((v4i32)coeff, 0); + DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); + out0 >>= 12; + out1 >>= 12; + PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = temp0 + temp1 + 7; + in2 = temp0 - temp1 + 7; + in0 >>= 4; + in2 >>= 4; + ILVR_H2_SW(zero, in0, zero, in2, out0, out2); + temp1 = RET_1_IF_NZERO_H(in3); + ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); + SPLATI_W2_SW(coeff, 2, out3, out1); + out3 += out1; + out1 = __msa_splati_w((v4i32)coeff, 1); + DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); + out1 >>= 16; + out3 >>= 16; + out1 += (v4i32)temp1; + PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); + ST_SH2(in0, in2, output, 8); +} + +void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) { + v8i16 in0, in1, in2, in3; + v8i16 temp0, temp1, tmp0, tmp1; + v8i16 const0, const1, const2; + v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; + v8i16 zero = { 0 }; + v4i32 vec0_w, vec1_w, vec2_w, vec3_w; + + LD_SH4(input, pitch / 2, in0, in1, in2, in3); + TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); + SLLI_4V(temp0, temp1, in1, in3, 3); + in0 = temp0 + temp1; + in2 = temp0 - temp1; + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __msa_splati_h(coeff, 3); + vec1_w = (v4i32)__msa_ilvev_h(zero, temp0); + coeff = __msa_ilvl_h(zero, coeff); + vec3_w = __msa_splati_w((v4i32)coeff, 0); + ILVRL_H2_SH(in3, in1, tmp1, tmp0); + vec0_w = vec1_w; + vec2_w = vec3_w; + DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, + vec1_w, vec2_w, vec3_w); + SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); + PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); + TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = temp0 + temp1 + 7; + in2 = temp0 - temp1 + 7; + in0 >>= 4; + in2 >>= 4; + SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); + vec3_w += vec1_w; + vec1_w = __msa_splati_w((v4i32)coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + ILVRL_H2_SH(in3, in1, tmp1, tmp0); + vec0_w = vec1_w; + vec2_w = vec3_w; + DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, + vec1_w, vec2_w, vec3_w); + SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); + PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); + in1 += const0; + PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1); + ST_SH2(temp0, temp1, output, 8); + + PCKOD_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output + 16, 8); +} + +void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { + v8i16 in0_h, in1_h, in2_h, in3_h; + v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3; + + LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h); + TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h); + + UNPCK_R_SH_SW(in0_h, in0_w); + UNPCK_R_SH_SW(in1_h, in1_w); + UNPCK_R_SH_SW(in2_h, in2_w); + UNPCK_R_SH_SW(in3_h, in3_w); + BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); + SLLI_4V(temp0, temp1, temp2, temp3, 2); + BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); + temp0 = RET_1_IF_NZERO_W(temp0); + in0_w += temp0; + TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w); + + BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); + BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); + in0_w += RET_1_IF_NEG_W(in0_w); + in1_w += RET_1_IF_NEG_W(in1_w); + in2_w += RET_1_IF_NEG_W(in2_w); + in3_w += RET_1_IF_NEG_W(in3_w); + ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w); + SRA_4V(in0_w, in1_w, in2_w, in3_w, 3); + PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h); + ST_SH2(in0_h, in1_h, output, 8); +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c new file mode 100644 index 0000000000..f8b653a9a7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" +#include "vp8/encoder/denoising.h" + +int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr, + int32_t mc_avg_y_stride, + uint8_t *running_avg_y_ptr, + int32_t avg_y_stride, uint8_t *sig_ptr, + int32_t sig_stride, uint32_t motion_magnitude, + int32_t increase_denoising) { + uint8_t *running_avg_y_start = running_avg_y_ptr; + uint8_t *sig_start = sig_ptr; + int32_t cnt = 0; + int32_t sum_diff = 0; + int32_t shift_inc1 = 3; + int32_t delta = 0; + int32_t sum_diff_thresh; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 mc_running_avg_y0, running_avg_y, sig0; + v16u8 mc_running_avg_y1, running_avg_y1, sig1; + v16u8 coeff0, coeff1; + v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1; + v8i16 adjust0, adjust1, adjust2, adjust3; + v8i16 shift_inc1_vec = { 0 }; + v8i16 col_sum0 = { 0 }; + v8i16 col_sum1 = { 0 }; + v8i16 col_sum2 = { 0 }; + v8i16 col_sum3 = { 0 }; + v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec; + v4i32 temp0_w; + v2i64 temp0_d, temp1_d; + v8i16 zero = { 0 }; + v8i16 one = __msa_ldi_h(1); + v8i16 four = __msa_ldi_h(4); + v8i16 val_127 = __msa_ldi_h(127); + v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 }; + + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + adj_val = __msa_add_a_h(adj_val, one); + if (increase_denoising) { + adj_val = __msa_add_a_h(adj_val, one); + shift_inc1 = 4; + } + + temp0_h = zero - adj_val; + adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val); + } + + adj_val = __msa_insert_h(adj_val, 3, cnt); + adj_val = __msa_insert_h(adj_val, 7, cnt); + shift_inc1_vec = __msa_fill_h(shift_inc1); + + for (cnt = 8; cnt--;) { + v8i16 mask0 = { 0 }; + v8i16 mask1 = { 0 }; + + mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); + sig0 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + + mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); + sig1 = LD_UB(sig_ptr); + + ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1); + HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); + abs_diff0 = __msa_add_a_h(diff0, zero); + abs_diff1 = __msa_add_a_h(diff1, zero); + cmp = __msa_clei_s_h(abs_diff0, 15); + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff0, 7); + cmp = cmp & one; + mask0 += cmp; + cmp = abs_diff0 < shift_inc1_vec; + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff1, 15); + cmp = cmp & one; + mask1 += cmp; + cmp = __msa_clei_s_h(abs_diff1, 7); + cmp = cmp & one; + mask1 += cmp; + cmp = abs_diff1 < shift_inc1_vec; + cmp = cmp & one; + mask1 += cmp; + temp0_h = __msa_clei_s_h(diff0, 0); + temp0_h = temp0_h & four; + mask0 += temp0_h; + temp1_h = __msa_clei_s_h(diff1, 0); + temp1_h = temp1_h & four; + mask1 += temp1_h; + VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0, + adjust1); + temp2_h = __msa_ceqi_h(adjust0, 0); + temp3_h = __msa_ceqi_h(adjust1, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h); + adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h); + ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1); + UNPCK_UB_SH(sig0, temp0_h, temp1_h); + ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h); + MAXI_SH2_SH(temp0_h, temp1_h, 0); + SAT_UH2_SH(temp0_h, temp1_h, 7); + temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h); + running_avg_y = + __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h); + ST_UB(running_avg_y, running_avg_y_ptr); + running_avg_y_ptr += avg_y_stride; + + mask0 = zero; + mask1 = zero; + ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1); + HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); + abs_diff0 = __msa_add_a_h(diff0, zero); + abs_diff1 = __msa_add_a_h(diff1, zero); + cmp = __msa_clei_s_h(abs_diff0, 15); + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff0, 7); + cmp = cmp & one; + mask0 += cmp; + cmp = abs_diff0 < shift_inc1_vec; + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff1, 15); + cmp = cmp & one; + mask1 += cmp; + cmp = __msa_clei_s_h(abs_diff1, 7); + cmp = cmp & one; + mask1 += cmp; + cmp = abs_diff1 < shift_inc1_vec; + cmp = cmp & one; + mask1 += cmp; + temp0_h = __msa_clei_s_h(diff0, 0); + temp0_h = temp0_h & four; + mask0 += temp0_h; + temp1_h = __msa_clei_s_h(diff1, 0); + temp1_h = temp1_h & four; + mask1 += temp1_h; + VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0, + adjust1); + temp2_h = __msa_ceqi_h(adjust0, 0); + temp3_h = __msa_ceqi_h(adjust1, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h); + adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h); + ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1); + UNPCK_UB_SH(sig1, temp0_h, temp1_h); + ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h); + MAXI_SH2_SH(temp0_h, temp1_h, 0); + SAT_UH2_SH(temp0_h, temp1_h, 7); + temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h); + running_avg_y = + __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h); + ST_UB(running_avg_y, running_avg_y_ptr); + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + running_avg_y_ptr += avg_y_stride; + } + + col_sum0 = __msa_min_s_h(col_sum0, val_127); + col_sum1 = __msa_min_s_h(col_sum1, val_127); + temp0_h = col_sum0 + col_sum1; + temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); + temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); + temp1_d = __msa_splati_d(temp0_d, 1); + temp0_d += temp1_d; + sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); + sig_ptr -= sig_stride * 16; + mc_running_avg_y_ptr -= mc_avg_y_stride * 16; + running_avg_y_ptr -= avg_y_stride * 16; + + if (increase_denoising) { + sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + } + + if (abs(sum_diff) > sum_diff_thresh) { + delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + delta_vec = __msa_fill_h(delta); + if (delta < 4) { + for (cnt = 8; cnt--;) { + running_avg_y = LD_UB(running_avg_y_ptr); + mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); + sig0 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + running_avg_y_ptr += avg_y_stride; + mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); + sig1 = LD_UB(sig_ptr); + running_avg_y1 = LD_UB(running_avg_y_ptr); + ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1); + HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); + abs_diff0 = __msa_add_a_h(diff0, zero); + abs_diff1 = __msa_add_a_h(diff1, zero); + temp0_h = abs_diff0 < delta_vec; + temp1_h = abs_diff1 < delta_vec; + abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec, + (v16u8)temp0_h); + abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec, + (v16u8)temp1_h); + SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1); + abs_diff_neg0 = zero - abs_diff0; + abs_diff_neg1 = zero - abs_diff1; + temp0_h = __msa_clei_s_h(diff0, 0); + temp1_h = __msa_clei_s_h(diff1, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0, + (v16u8)temp0_h); + adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1, + (v16u8)temp1_h); + ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h); + ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3); + MAXI_SH2_SH(adjust2, adjust3, 0); + SAT_UH2_SH(adjust2, adjust3, 7); + temp0_h = __msa_ceqi_h(diff0, 0); + temp1_h = __msa_ceqi_h(diff1, 0); + adjust2 = + (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h); + adjust3 = + (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h); + adjust0 = + (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h); + adjust1 = + (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h); + ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2); + ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride); + ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1); + HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); + abs_diff0 = __msa_add_a_h(diff0, zero); + abs_diff1 = __msa_add_a_h(diff1, zero); + temp0_h = abs_diff0 < delta_vec; + temp1_h = abs_diff1 < delta_vec; + abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec, + (v16u8)temp0_h); + abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec, + (v16u8)temp1_h); + SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1); + temp0_h = __msa_clei_s_h(diff0, 0); + temp1_h = __msa_clei_s_h(diff1, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0, + (v16u8)temp0_h); + adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1, + (v16u8)temp1_h); + ILVRL_H2_SH(zero, running_avg_y1, temp2_h, temp3_h); + ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3); + MAXI_SH2_SH(adjust2, adjust3, 0); + SAT_UH2_SH(adjust2, adjust3, 7); + temp0_h = __msa_ceqi_h(diff0, 0); + temp1_h = __msa_ceqi_h(diff1, 0); + adjust2 = + (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h); + adjust3 = + (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h); + adjust0 = + (v8i16)__msa_bmz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h); + adjust1 = + (v8i16)__msa_bmz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h); + ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2); + ST_UB(running_avg_y, running_avg_y_ptr); + running_avg_y_ptr += avg_y_stride; + } + + col_sum2 = __msa_min_s_h(col_sum2, val_127); + col_sum3 = __msa_min_s_h(col_sum3, val_127); + temp0_h = col_sum2 + col_sum3; + temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); + temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); + temp1_d = __msa_splati_d(temp0_d, 1); + temp0_d += (v2i64)temp1_d; + sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); + if (abs(sum_diff) > SUM_DIFF_THRESHOLD) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + + LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6, src7); + sig_start += (8 * sig_stride); + LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13, src14, + src15); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start, + avg_y_stride); + running_avg_y_start += (8 * avg_y_stride); + ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, + running_avg_y_start, avg_y_stride); + + return FILTER_BLOCK; +} + +int32_t vp8_denoiser_filter_uv_msa( + uint8_t *mc_running_avg_y_ptr, int32_t mc_avg_y_stride, + uint8_t *running_avg_y_ptr, int32_t avg_y_stride, uint8_t *sig_ptr, + int32_t sig_stride, uint32_t motion_magnitude, int32_t increase_denoising) { + uint8_t *running_avg_y_start = running_avg_y_ptr; + uint8_t *sig_start = sig_ptr; + int32_t cnt = 0; + int32_t sum_diff = 0; + int32_t shift_inc1 = 3; + int32_t delta = 0; + int32_t sum_block = 0; + int32_t sum_diff_thresh; + int64_t dst0, dst1, src0, src1, src2, src3; + v16u8 mc_running_avg_y0, running_avg_y, sig0; + v16u8 mc_running_avg_y1, running_avg_y1, sig1; + v16u8 sig2, sig3, sig4, sig5, sig6, sig7; + v16u8 coeff0; + v8i16 diff0, abs_diff0, abs_diff_neg0; + v8i16 adjust0, adjust2; + v8i16 shift_inc1_vec = { 0 }; + v8i16 col_sum0 = { 0 }; + v8i16 temp0_h, temp2_h, cmp, delta_vec; + v4i32 temp0_w; + v2i64 temp0_d, temp1_d; + v16i8 zero = { 0 }; + v8i16 one = __msa_ldi_h(1); + v8i16 four = __msa_ldi_h(4); + v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 }; + + sig0 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0); + sig1 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1); + sig2 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2); + sig3 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3); + sig4 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4); + sig5 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5); + sig6 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6); + sig7 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7); + temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); + temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); + temp1_d = __msa_splati_d(temp0_d, 1); + temp0_d += temp1_d; + sum_block = __msa_copy_s_w((v4i32)temp0_d, 0); + sig_ptr -= sig_stride * 8; + + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + adj_val = __msa_add_a_h(adj_val, one); + + if (increase_denoising) { + adj_val = __msa_add_a_h(adj_val, one); + shift_inc1 = 4; + } + + temp0_h = (v8i16)zero - adj_val; + adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val); + } + + adj_val = __msa_insert_h(adj_val, 3, cnt); + adj_val = __msa_insert_h(adj_val, 7, cnt); + shift_inc1_vec = __msa_fill_h(shift_inc1); + for (cnt = 4; cnt--;) { + v8i16 mask0 = { 0 }; + mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); + sig0 = LD_UB(sig_ptr); + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); + sig1 = LD_UB(sig_ptr); + coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0); + diff0 = __msa_hsub_u_h(coeff0, coeff0); + abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); + cmp = __msa_clei_s_h(abs_diff0, 15); + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff0, 7); + cmp = cmp & one; + mask0 += cmp; + cmp = abs_diff0 < shift_inc1_vec; + cmp = cmp & one; + mask0 += cmp; + temp0_h = __msa_clei_s_h(diff0, 0); + temp0_h = temp0_h & four; + mask0 += temp0_h; + adjust0 = __msa_vshf_h(mask0, adj_val, adj_val); + temp2_h = __msa_ceqi_h(adjust0, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h); + col_sum0 += adjust0; + temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0); + temp0_h += adjust0; + temp0_h = __msa_maxi_s_h(temp0_h, 0); + temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7); + temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h); + running_avg_y = + __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h); + dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0); + SD(dst0, running_avg_y_ptr); + running_avg_y_ptr += avg_y_stride; + + mask0 = __msa_ldi_h(0); + coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1); + diff0 = __msa_hsub_u_h(coeff0, coeff0); + abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); + cmp = __msa_clei_s_h(abs_diff0, 15); + cmp = cmp & one; + mask0 += cmp; + cmp = __msa_clei_s_h(abs_diff0, 7); + cmp = cmp & one; + mask0 += cmp; + cmp = abs_diff0 < shift_inc1_vec; + cmp = cmp & one; + mask0 += cmp; + temp0_h = __msa_clei_s_h(diff0, 0); + temp0_h = temp0_h & four; + mask0 += temp0_h; + adjust0 = __msa_vshf_h(mask0, adj_val, adj_val); + temp2_h = __msa_ceqi_h(adjust0, 0); + adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h); + col_sum0 += adjust0; + temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1); + temp0_h += adjust0; + temp0_h = __msa_maxi_s_h(temp0_h, 0); + temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7); + + temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h); + running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h); + running_avg_y = + __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h); + dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0); + SD(dst1, running_avg_y_ptr); + + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + running_avg_y_ptr += avg_y_stride; + } + + temp0_h = col_sum0; + temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); + temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); + temp1_d = __msa_splati_d(temp0_d, 1); + temp0_d += temp1_d; + sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); + sig_ptr -= sig_stride * 8; + mc_running_avg_y_ptr -= mc_avg_y_stride * 8; + running_avg_y_ptr -= avg_y_stride * 8; + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + + if (increase_denoising) { + sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + } + + if (abs(sum_diff) > sum_diff_thresh) { + delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + delta_vec = __msa_fill_h(delta); + if (delta < 4) { + for (cnt = 4; cnt--;) { + running_avg_y = LD_UB(running_avg_y_ptr); + mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); + sig0 = LD_UB(sig_ptr); + /* Update pointers for next iteration. */ + sig_ptr += sig_stride; + mc_running_avg_y_ptr += mc_avg_y_stride; + running_avg_y_ptr += avg_y_stride; + + mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); + sig1 = LD_UB(sig_ptr); + running_avg_y1 = LD_UB(running_avg_y_ptr); + + coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0); + diff0 = __msa_hsub_u_h(coeff0, coeff0); + abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); + temp0_h = delta_vec < abs_diff0; + abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec, + (v16u8)temp0_h); + abs_diff_neg0 = (v8i16)zero - abs_diff0; + temp0_h = __msa_clei_s_h(diff0, 0); + adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0, + (v16u8)temp0_h); + temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y); + adjust2 = temp2_h + adjust0; + adjust2 = __msa_maxi_s_h(adjust2, 0); + adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7); + temp0_h = __msa_ceqi_h(diff0, 0); + adjust2 = + (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h); + adjust0 = + (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h); + col_sum0 += adjust0; + running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2); + dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0); + SD(dst0, running_avg_y_ptr - avg_y_stride); + + coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1); + diff0 = __msa_hsub_u_h(coeff0, coeff0); + abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); + temp0_h = delta_vec < abs_diff0; + abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec, + (v16u8)temp0_h); + abs_diff_neg0 = (v8i16)zero - abs_diff0; + temp0_h = __msa_clei_s_h(diff0, 0); + adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0, + (v16u8)temp0_h); + temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1); + adjust2 = temp2_h + adjust0; + adjust2 = __msa_maxi_s_h(adjust2, 0); + adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7); + temp0_h = __msa_ceqi_h(diff0, 0); + adjust2 = + (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h); + adjust0 = + (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h); + col_sum0 += adjust0; + running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2); + dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0); + SD(dst1, running_avg_y_ptr); + running_avg_y_ptr += avg_y_stride; + } + + temp0_h = col_sum0; + temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); + temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); + temp1_d = __msa_splati_d(temp0_d, 1); + temp0_d += temp1_d; + sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + + LD4(sig_start, sig_stride, src0, src1, src2, src3); + sig_start += (4 * sig_stride); + SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride); + running_avg_y_start += (4 * avg_y_stride); + + LD4(sig_start, sig_stride, src0, src1, src2, src3); + SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride); + + return FILTER_BLOCK; +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c new file mode 100644 index 0000000000..2bcddb6235 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) { + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, dq_coeff, coeff0, coeff1; + v4i32 diff0, diff1; + v2i64 err0 = { 0 }; + v2i64 err1 = { 0 }; + + for (loop_cnt = 2; loop_cnt--;) { + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + coeff_ptr += 8; + dq_coeff_ptr += 8; + } + + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err = __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + return err; +} + +int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff_ptr, *dq_coeff_ptr; + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4; + v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4; + v4i32 diff0, diff1; + v2i64 err0, err1; + v16u8 zero = { 0 }; + v16u8 mask0 = (v16u8)__msa_ldi_b(255); + + if (1 == dc) { + mask0 = (v16u8)__msa_insve_w((v4i32)mask0, 0, (v4i32)zero); + } + + for (loop_cnt = 0; loop_cnt < 8; ++loop_cnt) { + be = &mb->block[2 * loop_cnt]; + bd = &mb->e_mbd.block[2 * loop_cnt]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff2 = LD_SH(coeff_ptr); + dq_coeff2 = LD_SH(dq_coeff_ptr); + be = &mb->block[2 * loop_cnt + 1]; + bd = &mb->e_mbd.block[2 * loop_cnt + 1]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff3 = LD_SH(coeff_ptr); + dq_coeff3 = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff4 = LD_SH(coeff_ptr); + dq_coeff4 = LD_SH(dq_coeff_ptr); + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + } + + return err; +} + +int32_t vp8_mbuverror_msa(MACROBLOCK *mb) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff_ptr, *dq_coeff_ptr; + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4; + v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4; + v4i32 diff0, diff1; + v2i64 err0, err1, err_dup0, err_dup1; + + for (loop_cnt = 16; loop_cnt < 24; loop_cnt += 2) { + be = &mb->block[loop_cnt]; + bd = &mb->e_mbd.block[loop_cnt]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff2 = LD_SH(coeff_ptr); + dq_coeff2 = LD_SH(dq_coeff_ptr); + be = &mb->block[loop_cnt + 1]; + bd = &mb->e_mbd.block[loop_cnt + 1]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff3 = LD_SH(coeff_ptr); + dq_coeff3 = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff4 = LD_SH(coeff_ptr); + dq_coeff4 = LD_SH(dq_coeff_ptr); + + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + + ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err_dup0 = __msa_splati_d(err0, 1); + err_dup1 = __msa_splati_d(err1, 1); + ADD2(err0, err_dup0, err1, err_dup1, err0, err1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err_dup0 = __msa_splati_d(err0, 1); + err_dup1 = __msa_splati_d(err1, 1); + ADD2(err0, err_dup0, err1, err_dup1, err0, err1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + } + + return err; +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c new file mode 100644 index 0000000000..9f5fbd39c8 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" +#include "vp8/encoder/block.h" + +static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round, + int16_t *quant, int16_t *de_quant, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t cnt, eob; + v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; + v8i16 round0, round1; + v8i16 sign_z0, sign_z1; + v8i16 q_coeff0, q_coeff1; + v8i16 x0, x1, de_quant0, de_quant1; + v8i16 coeff0, coeff1, z0, z1; + v8i16 quant0, quant1, quant2, quant3; + v8i16 zero = { 0 }; + v8i16 inv_zig_zag0, inv_zig_zag1; + v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; + v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; + v8i16 temp0_h, temp1_h, temp2_h, temp3_h; + v4i32 temp0_w, temp1_w, temp2_w, temp3_w; + + ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1); + eob = -1; + LD_SH2(coeff_ptr, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0, + z1); + LD_SH2(round, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0, + round1); + LD_SH2(quant, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0, + quant2); + sign_z0 = z0 >> 15; + sign_z1 = z1 >> 15; + x0 = __msa_add_a_h(z0, zero); + x1 = __msa_add_a_h(z1, zero); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h); + ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1); + x0 = x0 ^ sign_z0; + x1 = x1 ^ sign_z1; + SUB2(x0, sign_z0, x1, sign_z1, x0, x1); + VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1); + ST_SH2(q_coeff0, q_coeff1, q_coeff, 8); + LD_SH2(de_quant, 8, de_quant0, de_quant1); + q_coeff0 *= de_quant0; + q_coeff1 *= de_quant1; + ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8); + + for (cnt = 0; cnt < 16; ++cnt) { + if ((cnt <= 7) && (x1[7 - cnt] != 0)) { + eob = (15 - cnt); + break; + } + + if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) { + eob = (7 - (cnt - 8)); + break; + } + } + + return (int8_t)(eob + 1); +} + +static int8_t exact_regular_quantize_b_msa( + int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t cnt, eob; + int16_t *boost_temp = zbin_boost; + v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; + v8i16 round0, round1; + v8i16 sign_z0, sign_z1; + v8i16 q_coeff0, q_coeff1; + v8i16 z_bin0, z_bin1, zbin_o_q; + v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1; + v8i16 coeff0, coeff1, z0, z1; + v8i16 quant0, quant1, quant2, quant3; + v8i16 zero = { 0 }; + v8i16 inv_zig_zag0, inv_zig_zag1; + v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; + v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; + v8i16 temp0_h, temp1_h, temp2_h, temp3_h; + v4i32 temp0_w, temp1_w, temp2_w, temp3_w; + + ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1); + zbin_o_q = __msa_fill_h(zbin_oq_in); + eob = -1; + LD_SH2(coeff_ptr, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0, + z1); + LD_SH2(round, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0, + round1); + LD_SH2(quant, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0, + quant2); + LD_SH2(zbin, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z_bin0, + z_bin1); + sign_z0 = z0 >> 15; + sign_z1 = z1 >> 15; + x0 = __msa_add_a_h(z0, zero); + x1 = __msa_add_a_h(z1, zero); + SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h); + ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h); + LD_SH2(quant_shift, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0, + quant2); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ADD2(x0, round0, x1, round1, x0, x1); + ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h); + ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1); + sign_x0 = x0 ^ sign_z0; + sign_x1 = x1 ^ sign_z1; + SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + for (cnt = 0; cnt < 16; ++cnt) { + if (cnt <= 7) { + if (boost_temp[0] <= z_bin0[cnt]) { + if (x0[cnt]) { + eob = cnt; + boost_temp = zbin_boost; + } else { + boost_temp++; + } + } else { + sign_x0[cnt] = 0; + boost_temp++; + } + } else { + if (boost_temp[0] <= z_bin1[cnt - 8]) { + if (x1[cnt - 8]) { + eob = cnt; + boost_temp = zbin_boost; + } else { + boost_temp++; + } + } else { + sign_x1[cnt - 8] = 0; + boost_temp++; + } + } + } + + VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1, + q_coeff0, q_coeff1); + ST_SH2(q_coeff0, q_coeff1, q_coeff, 8); + LD_SH2(de_quant, 8, de_quant0, de_quant1); + MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1); + ST_SH2(de_quant0, de_quant1, dq_coeff, 8); + + return (int8_t)(eob + 1); +} + +void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) { + int16_t *coeff_ptr = b->coeff; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + + *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr, + qcoeff_ptr, dqcoeff_ptr); +} + +void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) { + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_msa( + zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr); +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c new file mode 100644 index 0000000000..fb83f07bd2 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void temporal_filter_apply_16size_msa( + uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, + int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { + uint32_t row; + v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; + v16u8 frame_l, frame_h; + v16i8 zero = { 0 }; + v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; + v8i16 diff0, diff1, cnt0, cnt1; + v4i32 const3, const16, filter_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frame2_0, frame2_1, frame2_2, frame2_3; + v4i32 acc0, acc1, acc2, acc3; + + filter_wt = __msa_fill_w(filter_wt_in); + strength = __msa_fill_w(strength_in); + const3 = __msa_ldi_w(3); + const16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + frame1_0_b = LD_SB(frame1_ptr); + frame2_0_b = LD_SB(frame2_ptr); + frame1_ptr += stride; + frame2_ptr += 16; + frame1_1_b = LD_SB(frame1_ptr); + frame2_1_b = LD_SB(frame2_ptr); + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + frame1_ptr += stride; + frame2_ptr += 16; + } +} + +static void temporal_filter_apply_8size_msa( + uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, + int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3, f4, f5, f6, f7; + v16i8 frame1 = { 0 }; + v16i8 frame2 = { 0 }; + v16i8 frame3 = { 0 }; + v16i8 frame4 = { 0 }; + v16u8 frame_l, frame_h; + v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; + v8i16 diff0, diff1, cnt0, cnt1; + v4i32 const3, const16; + v4i32 filter_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frame2_0, frame2_1, frame2_2, frame2_3; + v4i32 acc0, acc1, acc2, acc3; + + filter_wt = __msa_fill_w(filter_wt_in); + strength = __msa_fill_w(strength_in); + const3 = __msa_ldi_w(3); + const16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD2(frame1_ptr, stride, f0, f1); + frame1_ptr += (2 * stride); + LD2(frame2_ptr, 8, f2, f3); + frame2_ptr += 16; + LD2(frame1_ptr, stride, f4, f5); + frame1_ptr += (2 * stride); + LD2(frame2_ptr, 8, f6, f7); + frame2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + INSERT_D2_SB(f0, f1, frame1); + INSERT_D2_SB(f2, f3, frame2); + INSERT_D2_SB(f4, f5, frame3); + INSERT_D2_SB(f6, f7, frame4); + ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + } +} + +void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride, + uint8_t *frame2, uint32_t block_size, + int32_t strength, int32_t filter_weight, + uint32_t *accumulator, uint16_t *count) { + if (8 == block_size) { + temporal_filter_apply_8size_msa(frame1, stride, frame2, strength, + filter_weight, accumulator, count); + } else if (16 == block_size) { + temporal_filter_apply_16size_msa(frame1, stride, frame2, strength, + filter_weight, accumulator, count); + } else { + uint32_t i, j, k; + int32_t modifier; + int32_t byte = 0; + const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0; + + for (i = 0, k = 0; i < block_size; ++i) { + for (j = 0; j < block_size; ++j, ++k) { + int src_byte = frame1[byte]; + int pixel_value = *frame2++; + + modifier = src_byte - pixel_value; + modifier *= modifier; + modifier *= 3; + modifier += rounding; + modifier >>= strength; + + if (modifier > 16) modifier = 16; + + modifier = 16 - modifier; + modifier *= filter_weight; + + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + + byte++; + } + + byte += stride - block_size; + } + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/modecosts.c b/media/libvpx/libvpx/vp8/encoder/modecosts.c new file mode 100644 index 0000000000..b1c3120a92 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/modecosts.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/blockd.h" +#include "modecosts.h" +#include "onyx_int.h" +#include "treewriter.h" +#include "vp8/common/entropymode.h" + +void vp8_init_mode_costs(VP8_COMP *c) { + VP8_COMMON *x = &c->common; + struct rd_costs_struct *rd_costs = &c->rd_costs; + + { + const vp8_tree_p T = vp8_bmode_tree; + + int i = 0; + + do { + int j = 0; + + do { + vp8_cost_tokens(rd_costs->bmode_costs[i][j], vp8_kf_bmode_prob[i][j], + T); + } while (++j < VP8_BINTRAMODES); + } while (++i < VP8_BINTRAMODES); + + vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.bmode_prob, T); + } + vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.sub_mv_ref_prob, + vp8_sub_mv_ref_tree); + + vp8_cost_tokens(rd_costs->mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree); + vp8_cost_tokens(rd_costs->mbmode_cost[0], vp8_kf_ymode_prob, + vp8_kf_ymode_tree); + + vp8_cost_tokens(rd_costs->intra_uv_mode_cost[1], x->fc.uv_mode_prob, + vp8_uv_mode_tree); + vp8_cost_tokens(rd_costs->intra_uv_mode_cost[0], vp8_kf_uv_mode_prob, + vp8_uv_mode_tree); +} diff --git a/media/libvpx/libvpx/vp8/encoder/modecosts.h b/media/libvpx/libvpx/vp8/encoder/modecosts.h new file mode 100644 index 0000000000..09ee2b5520 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/modecosts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_MODECOSTS_H_ +#define VPX_VP8_ENCODER_MODECOSTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_init_mode_costs(struct VP8_COMP *c); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_MODECOSTS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.c b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c new file mode 100644 index 0000000000..b1bfb4b54a --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "vpx_config.h" +#include "onyx_int.h" +#include "mr_dissim.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" +#include "vp8/common/common.h" + +void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { + int low_res_w; + + /* Support arbitrary down-sampling factor */ + unsigned int iw = cpi->oxcf.Width * cpi->oxcf.mr_down_sampling_factor.den + + cpi->oxcf.mr_down_sampling_factor.num - 1; + + low_res_w = iw / cpi->oxcf.mr_down_sampling_factor.num; + cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4); +} + +#define GET_MV(x) \ + if (x->mbmi.ref_frame != INTRA_FRAME) { \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + cnt++; \ + } + +#define GET_MV_SIGN(x) \ + if (x->mbmi.ref_frame != INTRA_FRAME) { \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] != \ + cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) { \ + mvx[cnt] *= -1; \ + mvy[cnt] *= -1; \ + } \ + cnt++; \ + } + +void vp8_cal_dissimilarity(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + /* Note: The first row & first column in mip are outside the frame, which + * were initialized to all 0.(ref_frame, mode, mv...) + * Their ref_frame = 0 means they won't be counted in the following + * calculation. + */ + if (cpi->oxcf.mr_total_resolutions > 1 && + cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + LOWER_RES_FRAME_INFO *store_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + + store_info->frame_type = cm->frame_type; + + if (cm->frame_type != KEY_FRAME) { + int i; + store_info->is_frame_dropped = 0; + for (i = 1; i < MAX_REF_FRAMES; ++i) + store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; + } + + if (cm->frame_type != KEY_FRAME) { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip + cm->mode_info_stride; + LOWER_RES_MB_INFO *store_mode_info = store_info->mb_info; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + tmp++; + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int dissim = INT_MAX; + + if (tmp->mbmi.ref_frame != INTRA_FRAME) { + int mvx[8]; + int mvy[8]; + int mmvx; + int mmvy; + int cnt = 0; + const MODE_INFO *here = tmp; + const MODE_INFO *above = here - cm->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + const MODE_INFO *aboveright = NULL; + const MODE_INFO *right = NULL; + const MODE_INFO *belowleft = NULL; + const MODE_INFO *below = NULL; + const MODE_INFO *belowright = NULL; + + /* If alternate reference frame is used, we have to + * check sign of MV. */ + if (cpi->oxcf.play_alternate) { + /* Gather mv of neighboring MBs */ + GET_MV_SIGN(above) + GET_MV_SIGN(left) + GET_MV_SIGN(aboveleft) + + if (mb_col < (cm->mb_cols - 1)) { + right = here + 1; + aboveright = above + 1; + GET_MV_SIGN(right) + GET_MV_SIGN(aboveright) + } + + if (mb_row < (cm->mb_rows - 1)) { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV_SIGN(below) + GET_MV_SIGN(belowleft) + } + + if (mb_col < (cm->mb_cols - 1) && mb_row < (cm->mb_rows - 1)) { + belowright = below + 1; + GET_MV_SIGN(belowright) + } + } else { + /* No alt_ref and gather mv of neighboring MBs */ + GET_MV(above) + GET_MV(left) + GET_MV(aboveleft) + + if (mb_col < (cm->mb_cols - 1)) { + right = here + 1; + aboveright = above + 1; + GET_MV(right) + GET_MV(aboveright) + } + + if (mb_row < (cm->mb_rows - 1)) { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV(below) + GET_MV(belowleft) + } + + if (mb_col < (cm->mb_cols - 1) && mb_row < (cm->mb_rows - 1)) { + belowright = below + 1; + GET_MV(belowright) + } + } + + if (cnt > 0) { + int max_mvx = mvx[0]; + int min_mvx = mvx[0]; + int max_mvy = mvy[0]; + int min_mvy = mvy[0]; + int i; + + if (cnt > 1) { + for (i = 1; i < cnt; ++i) { + if (mvx[i] > max_mvx) + max_mvx = mvx[i]; + else if (mvx[i] < min_mvx) + min_mvx = mvx[i]; + if (mvy[i] > max_mvy) + max_mvy = mvy[i]; + else if (mvy[i] < min_mvy) + min_mvy = mvy[i]; + } + } + + mmvx = VPXMAX(abs(min_mvx - here->mbmi.mv.as_mv.row), + abs(max_mvx - here->mbmi.mv.as_mv.row)); + mmvy = VPXMAX(abs(min_mvy - here->mbmi.mv.as_mv.col), + abs(max_mvy - here->mbmi.mv.as_mv.col)); + dissim = VPXMAX(mmvx, mmvy); + } + } + + /* Store mode info for next resolution encoding */ + store_mode_info->mode = tmp->mbmi.mode; + store_mode_info->ref_frame = tmp->mbmi.ref_frame; + store_mode_info->mv.as_int = tmp->mbmi.mv.as_int; + store_mode_info->dissim = dissim; + tmp++; + store_mode_info++; + } + } + } + } +} + +/* This function is called only when this frame is dropped at current + resolution level. */ +void vp8_store_drop_frame_info(VP8_COMP *cpi) { + /* If the frame is dropped in lower-resolution encoding, this information + is passed to higher resolution level so that the encoder knows there + is no mode & motion info available. + */ + if (cpi->oxcf.mr_total_resolutions > 1 && + cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + LOWER_RES_FRAME_INFO *store_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + + /* Set frame_type to be INTER_FRAME since we won't drop key frame. */ + store_info->frame_type = INTER_FRAME; + store_info->is_frame_dropped = 1; + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.h b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h new file mode 100644 index 0000000000..58f5a97623 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_MR_DISSIM_H_ +#define VPX_VP8_ENCODER_MR_DISSIM_H_ +#include "vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); +extern void vp8_cal_dissimilarity(VP8_COMP *cpi); +extern void vp8_store_drop_frame_info(VP8_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_MR_DISSIM_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c new file mode 100644 index 0000000000..4e128e3c49 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c @@ -0,0 +1,5420 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vp8_rtcd.h" +#include "bitstream.h" +#include "vp8/common/onyxc_int.h" +#include "vp8/common/blockd.h" +#include "onyx_int.h" +#include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/alloccommon.h" +#include "mcomp.h" +#include "firstpass.h" +#include "vpx_dsp/psnr.h" +#include "vpx_scale/vpx_scale.h" +#include "vp8/common/extend.h" +#include "ratectrl.h" +#include "vp8/common/quant_common.h" +#include "segmentation.h" +#if CONFIG_POSTPROC +#include "vp8/common/postproc.h" +#endif +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/swapyv12buffer.h" +#include "vp8/common/threading.h" +#include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_write_yuv_frame.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif +#if CONFIG_MULTI_RES_ENCODING +#include "mr_dissim.h" +#endif +#include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif + +#include +#include +#include +#include + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING +extern int vp8_update_coef_context(VP8_COMP *cpi); +#endif + +extern unsigned int vp8_get_processor_freq(); + +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); + +static void set_default_lf_deltas(VP8_COMP *cpi); + +extern const int vp8_gf_interval_table[101]; + +#if CONFIG_INTERNAL_STATS +#include "math.h" +#include "vpx_dsp/ssim.h" +#endif + +#ifdef OUTPUT_YUV_SRC +FILE *yuv_file; +#endif +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file; +#endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif + +#if 0 +FILE *framepsnr; +FILE *kf_list; +FILE *keyfile; +#endif + +#if 0 +extern int skip_true_count; +extern int skip_false_count; +#endif + +#ifdef SPEEDSTATS +unsigned int frames_at_speed[16] = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +unsigned int tot_pm = 0; +unsigned int cnt_pm = 0; +unsigned int tot_ef = 0; +unsigned int cnt_ef = 0; +#endif + +#ifdef MODE_STATS +extern unsigned __int64 Sectionbits[50]; +extern int y_modes[5]; +extern int uv_modes[4]; +extern int b_modes[10]; + +extern int inter_y_modes[10]; +extern int inter_uv_modes[4]; +extern unsigned int inter_b_modes[15]; +#endif + +extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; + +extern const int qrounding_factors[129]; +extern const int qzbin_factors[129]; +extern void vp8cx_init_quantizer(VP8_COMP *cpi); +extern const int vp8cx_base_skip_false_prob[128]; + +/* Tables relating active max Q to active min Q */ +static const unsigned char kf_low_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 5, 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10, 10, 11, + 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, + 17, 17, 18, 18, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23 +}; +static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, + 16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30 +}; +static const unsigned char gf_low_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, + 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, + 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, + 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, + 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, + 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 +}; +static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, + 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, + 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 18, + 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, + 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, + 37, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 +}; +static const unsigned char gf_high_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, + 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, + 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, + 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, + 40, 41, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80 +}; +static const unsigned char inter_minq[QINDEX_RANGE] = { + 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, + 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24, + 24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53, + 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 +}; + +#ifdef PACKET_TESTING +extern FILE *vpxlogc; +#endif + +void vp8_save_layer_context(VP8_COMP *cpi) { + LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer]; + + /* Save layer dependent coding state */ + lc->target_bandwidth = cpi->target_bandwidth; + lc->starting_buffer_level = cpi->oxcf.starting_buffer_level; + lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level; + lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size; + lc->starting_buffer_level_in_ms = cpi->oxcf.starting_buffer_level_in_ms; + lc->optimal_buffer_level_in_ms = cpi->oxcf.optimal_buffer_level_in_ms; + lc->maximum_buffer_size_in_ms = cpi->oxcf.maximum_buffer_size_in_ms; + lc->buffer_level = cpi->buffer_level; + lc->bits_off_target = cpi->bits_off_target; + lc->total_actual_bits = cpi->total_actual_bits; + lc->worst_quality = cpi->worst_quality; + lc->active_worst_quality = cpi->active_worst_quality; + lc->best_quality = cpi->best_quality; + lc->active_best_quality = cpi->active_best_quality; + lc->ni_av_qi = cpi->ni_av_qi; + lc->ni_tot_qi = cpi->ni_tot_qi; + lc->ni_frames = cpi->ni_frames; + lc->avg_frame_qindex = cpi->avg_frame_qindex; + lc->rate_correction_factor = cpi->rate_correction_factor; + lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor; + lc->gf_rate_correction_factor = cpi->gf_rate_correction_factor; + lc->zbin_over_quant = cpi->mb.zbin_over_quant; + lc->inter_frame_target = cpi->inter_frame_target; + lc->total_byte_count = cpi->total_byte_count; + lc->filter_level = cpi->common.filter_level; + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; + lc->last_frame_percent_intra = cpi->last_frame_percent_intra; + lc->last_q[0] = cpi->last_q[0]; + lc->last_q[1] = cpi->last_q[1]; + + memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, + sizeof(cpi->mb.count_mb_ref_frame_usage)); +} + +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) { + LAYER_CONTEXT *lc = &cpi->layer_context[layer]; + + /* Restore layer dependent coding state */ + cpi->current_layer = layer; + cpi->target_bandwidth = lc->target_bandwidth; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; + cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; + cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; + cpi->oxcf.starting_buffer_level_in_ms = lc->starting_buffer_level_in_ms; + cpi->oxcf.optimal_buffer_level_in_ms = lc->optimal_buffer_level_in_ms; + cpi->oxcf.maximum_buffer_size_in_ms = lc->maximum_buffer_size_in_ms; + cpi->buffer_level = lc->buffer_level; + cpi->bits_off_target = lc->bits_off_target; + cpi->total_actual_bits = lc->total_actual_bits; + cpi->active_worst_quality = lc->active_worst_quality; + cpi->active_best_quality = lc->active_best_quality; + cpi->ni_av_qi = lc->ni_av_qi; + cpi->ni_tot_qi = lc->ni_tot_qi; + cpi->ni_frames = lc->ni_frames; + cpi->avg_frame_qindex = lc->avg_frame_qindex; + cpi->rate_correction_factor = lc->rate_correction_factor; + cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor; + cpi->gf_rate_correction_factor = lc->gf_rate_correction_factor; + cpi->mb.zbin_over_quant = lc->zbin_over_quant; + cpi->inter_frame_target = lc->inter_frame_target; + cpi->total_byte_count = lc->total_byte_count; + cpi->common.filter_level = lc->filter_level; + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; + cpi->last_frame_percent_intra = lc->last_frame_percent_intra; + cpi->last_q[0] = lc->last_q[0]; + cpi->last_q[1] = lc->last_q[1]; + + memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, + sizeof(cpi->mb.count_mb_ref_frame_usage)); +} + +static int rescale(int val, int num, int denom) { + int64_t llnum = num; + int64_t llden = denom; + int64_t llval = val; + + return (int)(llval * llnum / llden); +} + +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate) { + LAYER_CONTEXT *lc = &cpi->layer_context[layer]; + + lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; + lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; + + lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; + lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level; + lc->maximum_buffer_size_in_ms = oxcf->maximum_buffer_size; + + lc->starting_buffer_level = + rescale((int)(oxcf->starting_buffer_level), lc->target_bandwidth, 1000); + + if (oxcf->optimal_buffer_level == 0) { + lc->optimal_buffer_level = lc->target_bandwidth / 8; + } else { + lc->optimal_buffer_level = + rescale((int)(oxcf->optimal_buffer_level), lc->target_bandwidth, 1000); + } + + if (oxcf->maximum_buffer_size == 0) { + lc->maximum_buffer_size = lc->target_bandwidth / 8; + } else { + lc->maximum_buffer_size = + rescale((int)(oxcf->maximum_buffer_size), lc->target_bandwidth, 1000); + } + + /* Work out the average size of a frame within this layer */ + if (layer > 0) { + lc->avg_frame_size_for_layer = + (int)round((cpi->oxcf.target_bitrate[layer] - + cpi->oxcf.target_bitrate[layer - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); + } + + lc->active_worst_quality = cpi->oxcf.worst_allowed_q; + lc->active_best_quality = cpi->oxcf.best_allowed_q; + lc->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + + lc->buffer_level = lc->starting_buffer_level; + lc->bits_off_target = lc->starting_buffer_level; + + lc->total_actual_bits = 0; + lc->ni_av_qi = 0; + lc->ni_tot_qi = 0; + lc->ni_frames = 0; + lc->rate_correction_factor = 1.0; + lc->key_frame_rate_correction_factor = 1.0; + lc->gf_rate_correction_factor = 1.0; + lc->inter_frame_target = 0; +} + +// Upon a run-time change in temporal layers, reset the layer context parameters +// for any "new" layers. For "existing" layers, let them inherit the parameters +// from the previous layer state (at the same layer #). In future we may want +// to better map the previous layer state(s) to the "new" ones. +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int prev_num_layers) { + int i; + double prev_layer_framerate = 0; + const int curr_num_layers = cpi->oxcf.number_of_layers; + // If the previous state was 1 layer, get current layer context from cpi. + // We need this to set the layer context for the new layers below. + if (prev_num_layers == 1) { + cpi->current_layer = 0; + vp8_save_layer_context(cpi); + } + for (i = 0; i < curr_num_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + if (i >= prev_num_layers) { + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + } + // The initial buffer levels are set based on their starting levels. + // We could set the buffer levels based on the previous state (normalized + // properly by the layer bandwidths) but we would need to keep track of + // the previous set of layer bandwidths (i.e., target_bitrate[i]) + // before the layer change. For now, reset to the starting levels. + lc->buffer_level = + cpi->oxcf.starting_buffer_level_in_ms * cpi->oxcf.target_bitrate[i]; + lc->bits_off_target = lc->buffer_level; + // TDOD(marpan): Should we set the rate_correction_factor and + // active_worst/best_quality to values derived from the previous layer + // state (to smooth-out quality dips/rate fluctuation at transition)? + + // We need to treat the 1 layer case separately: oxcf.target_bitrate[i] + // is not set for 1 layer, and the vp8_restore_layer_context/save_context() + // are not called in the encoding loop, so we need to call it here to + // pass the layer context state to |cpi|. + if (curr_num_layers == 1) { + lc->target_bandwidth = cpi->oxcf.target_bandwidth; + lc->buffer_level = + cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000; + lc->bits_off_target = lc->buffer_level; + vp8_restore_layer_context(cpi, 0); + } + prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; + } +} + +static void setup_features(VP8_COMP *cpi) { + // If segmentation enabled set the update flags + if (cpi->mb.e_mbd.segmentation_enabled) { + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; + } else { + cpi->mb.e_mbd.update_mb_segmentation_map = 0; + cpi->mb.e_mbd.update_mb_segmentation_data = 0; + } + + cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; + memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); + memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, + sizeof(cpi->mb.e_mbd.ref_lf_deltas)); + memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, + sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + + set_default_lf_deltas(cpi); +} + +static void dealloc_raw_frame_buffers(VP8_COMP *cpi); + +static void initialize_enc(void) { + vpx_dsp_rtcd(); + vp8_init_intra_predictors(); +} + +void vp8_initialize_enc(void) { once(initialize_enc); } + +static void dealloc_compressor_data(VP8_COMP *cpi) { + vpx_free(cpi->tplist); + cpi->tplist = NULL; + + /* Delete last frame MV storage buffers */ + vpx_free(cpi->lfmv); + cpi->lfmv = 0; + + vpx_free(cpi->lf_ref_frame_sign_bias); + cpi->lf_ref_frame_sign_bias = 0; + + vpx_free(cpi->lf_ref_frame); + cpi->lf_ref_frame = 0; + + /* Delete sementation map */ + vpx_free(cpi->segmentation_map); + cpi->segmentation_map = 0; + + vpx_free(cpi->active_map); + cpi->active_map = 0; + + vp8_de_alloc_frame_buffers(&cpi->common); + + vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame); + vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); + dealloc_raw_frame_buffers(cpi); + + vpx_free(cpi->tok); + cpi->tok = 0; + + /* Structure used to monitor GF usage */ + vpx_free(cpi->gf_active_flags); + cpi->gf_active_flags = 0; + + /* Activity mask based per mb zbin adjustments */ + vpx_free(cpi->mb_activity_map); + cpi->mb_activity_map = 0; + + vpx_free(cpi->mb.pip); + cpi->mb.pip = 0; +} + +static void enable_segmentation(VP8_COMP *cpi) { + /* Set the appropriate feature bit */ + cpi->mb.e_mbd.segmentation_enabled = 1; + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; +} +static void disable_segmentation(VP8_COMP *cpi) { + /* Clear the appropriate feature bit */ + cpi->mb.e_mbd.segmentation_enabled = 0; +} + +/* Valid values for a segment are 0 to 3 + * Segmentation map is arrange as [Rows][Columns] + */ +static void set_segmentation_map(VP8_COMP *cpi, + unsigned char *segmentation_map) { + /* Copy in the new segmentation map */ + memcpy(cpi->segmentation_map, segmentation_map, + (cpi->common.mb_rows * cpi->common.mb_cols)); + + /* Signal that the map should be updated. */ + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; +} + +/* The values given for each segment can be either deltas (from the default + * value chosen for the frame) or absolute values. + * + * Valid range for abs values is: + * (0-127 for MB_LVL_ALT_Q), (0-63 for SEGMENT_ALT_LF) + * Valid range for delta values are: + * (+/-127 for MB_LVL_ALT_Q), (+/-63 for SEGMENT_ALT_LF) + * + * abs_delta = SEGMENT_DELTADATA (deltas) + * abs_delta = SEGMENT_ABSDATA (use the absolute values given). + * + */ +static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, + unsigned char abs_delta) { + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; + memcpy(cpi->segment_feature_data, feature_data, + sizeof(cpi->segment_feature_data)); +} + +/* A simple function to cyclically refresh the background at a lower Q */ +static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { + unsigned char *seg_map = cpi->segmentation_map; + signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + int i; + int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe; + int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols; + + cpi->cyclic_refresh_q = Q / 2; + + if (cpi->oxcf.screen_content_mode) { + // Modify quality ramp-up based on Q. Above some Q level, increase the + // number of blocks to be refreshed, and reduce it below the thredhold. + // Turn-off under certain conditions (i.e., away from key frame, and if + // we are at good quality (low Q) and most of the blocks were + // skipped-encoded + // in previous frame. + int qp_thresh = (cpi->oxcf.screen_content_mode == 2) ? 80 : 100; + if (Q >= qp_thresh) { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 10; + } else if (cpi->frames_since_key > 250 && Q < 20 && + cpi->mb.skip_true_count > (int)(0.95 * mbs_in_frame)) { + cpi->cyclic_refresh_mode_max_mbs_perframe = 0; + } else { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 20; + } + block_count = cpi->cyclic_refresh_mode_max_mbs_perframe; + } + + // Set every macroblock to be eligible for update. + // For key frame this will reset seg map to 0. + memset(cpi->segmentation_map, 0, mbs_in_frame); + + if (cpi->common.frame_type != KEY_FRAME && block_count > 0) { + /* Cycle through the macro_block rows */ + /* MB loop to set local segmentation map */ + i = cpi->cyclic_refresh_mode_index; + assert(i < mbs_in_frame); + do { + /* If the MB is as a candidate for clean up then mark it for + * possible boost/refresh (segment 1) The segment id may get + * reset to 0 later if the MB gets coded anything other than + * last frame 0,0 as only (last frame 0,0) MBs are eligable for + * refresh : that is to say Mbs likely to be background blocks. + */ + if (cpi->cyclic_refresh_map[i] == 0) { + seg_map[i] = 1; + block_count--; + } else if (cpi->cyclic_refresh_map[i] < 0) { + cpi->cyclic_refresh_map[i]++; + } + + i++; + if (i == mbs_in_frame) i = 0; + + } while (block_count && i != cpi->cyclic_refresh_mode_index); + + cpi->cyclic_refresh_mode_index = i; + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive && + Q < (int)cpi->denoiser.denoise_pars.qp_thresh && + (cpi->frames_since_key > + 2 * cpi->denoiser.denoise_pars.consec_zerolast)) { + // Under aggressive denoising, use segmentation to turn off loop + // filter below some qp thresh. The filter is reduced for all + // blocks that have been encoded as ZEROMV LAST x frames in a row, + // where x is set by cpi->denoiser.denoise_pars.consec_zerolast. + // This is to avoid "dot" artifacts that can occur from repeated + // loop filtering on noisy input source. + cpi->cyclic_refresh_q = Q; + // lf_adjustment = -MAX_LOOP_FILTER; + lf_adjustment = -40; + for (i = 0; i < mbs_in_frame; ++i) { + seg_map[i] = (cpi->consec_zero_last[i] > + cpi->denoiser.denoise_pars.consec_zerolast) + ? 1 + : 0; + } + } + } +#endif + } + + /* Activate segmentation. */ + cpi->mb.e_mbd.update_mb_segmentation_map = 1; + cpi->mb.e_mbd.update_mb_segmentation_data = 1; + enable_segmentation(cpi); + + /* Set up the quant segment data */ + feature_data[MB_LVL_ALT_Q][0] = 0; + feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q); + feature_data[MB_LVL_ALT_Q][2] = 0; + feature_data[MB_LVL_ALT_Q][3] = 0; + + /* Set up the loop segment data */ + feature_data[MB_LVL_ALT_LF][0] = 0; + feature_data[MB_LVL_ALT_LF][1] = lf_adjustment; + feature_data[MB_LVL_ALT_LF][2] = 0; + feature_data[MB_LVL_ALT_LF][3] = 0; + + /* Initialise the feature data structure */ + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); +} + +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. + for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + +static void set_default_lf_deltas(VP8_COMP *cpi) { + cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + + memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); + memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + + /* Test of ref frame deltas */ + cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2; + cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0; + cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2; + cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2; + + cpi->mb.e_mbd.mode_lf_deltas[0] = 4; /* BPRED */ + + if (cpi->oxcf.Mode == MODE_REALTIME) { + cpi->mb.e_mbd.mode_lf_deltas[1] = -12; /* Zero */ + } else { + cpi->mb.e_mbd.mode_lf_deltas[1] = -2; /* Zero */ + } + + cpi->mb.e_mbd.mode_lf_deltas[2] = 2; /* New mv */ + cpi->mb.e_mbd.mode_lf_deltas[3] = 4; /* Split mv */ +} + +/* Convenience macros for mapping speed and mode into a continuous + * range + */ +#define GOOD(x) ((x) + 1) +#define RT(x) ((x) + 7) + +static int speed_map(int speed, const int *map) { + int res; + + do { + res = *map++; + } while (speed >= *map++); + return res; +} + +static const int thresh_mult_map_znn[] = { + /* map common to zero, nearest, and near */ + 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX +}; + +static const int thresh_mult_map_vhpred[] = { 1000, GOOD(2), 1500, GOOD(3), + 2000, RT(0), 1000, RT(1), + 2000, RT(7), INT_MAX, INT_MAX }; + +static const int thresh_mult_map_bpred[] = { 2000, GOOD(0), 2500, GOOD(2), + 5000, GOOD(3), 7500, RT(0), + 2500, RT(1), 5000, RT(6), + INT_MAX, INT_MAX }; + +static const int thresh_mult_map_tm[] = { 1000, GOOD(2), 1500, GOOD(3), + 2000, RT(0), 0, RT(1), + 1000, RT(2), 2000, RT(7), + INT_MAX, INT_MAX }; + +static const int thresh_mult_map_new1[] = { 1000, GOOD(2), 2000, + RT(0), 2000, INT_MAX }; + +static const int thresh_mult_map_new2[] = { 1000, GOOD(2), 2000, GOOD(3), + 2500, GOOD(5), 4000, RT(0), + 2000, RT(2), 2500, RT(5), + 4000, INT_MAX }; + +static const int thresh_mult_map_split1[] = { + 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX, + RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX +}; + +static const int thresh_mult_map_split2[] = { + 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX, + RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX +}; + +static const int mode_check_freq_map_zn2[] = { + /* {zero,nearest}{2,3} */ + 0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX +}; + +static const int mode_check_freq_map_vhbpred[] = { 0, GOOD(5), 2, RT(0), + 0, RT(3), 2, RT(5), + 4, INT_MAX }; + +static const int mode_check_freq_map_near2[] = { + 0, GOOD(5), 2, RT(0), 0, RT(3), 2, + RT(10), 1 << 2, RT(11), 1 << 3, RT(12), 1 << 4, INT_MAX +}; + +static const int mode_check_freq_map_new1[] = { + 0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX +}; + +static const int mode_check_freq_map_new2[] = { 0, GOOD(5), 4, RT(0), + 0, RT(3), 4, RT(10), + 1 << 3, RT(11), 1 << 4, RT(12), + 1 << 5, INT_MAX }; + +static const int mode_check_freq_map_split1[] = { 0, GOOD(2), 2, GOOD(3), + 7, RT(1), 2, RT(2), + 7, INT_MAX }; + +static const int mode_check_freq_map_split2[] = { 0, GOOD(1), 2, GOOD(2), + 4, GOOD(3), 15, RT(1), + 4, RT(2), 15, INT_MAX }; + +void vp8_set_speed_features(VP8_COMP *cpi) { + SPEED_FEATURES *sf = &cpi->sf; + int Mode = cpi->compressor_speed; + int Speed = cpi->Speed; + int Speed2; + int i; + VP8_COMMON *cm = &cpi->common; + int last_improved_quant = sf->improved_quant; + int ref_frames; + + /* Initialise default mode frequency sampling variables */ + for (i = 0; i < MAX_MODES; ++i) { + cpi->mode_check_freq[i] = 0; + } + + cpi->mb.mbs_tested_so_far = 0; + cpi->mb.mbs_zero_last_dot_suppress = 0; + + /* best quality defaults */ + sf->RD = 1; + sf->search_method = NSTEP; + sf->improved_quant = 1; + sf->improved_dct = 1; + sf->auto_filter = 1; + sf->recode_loop = 1; + sf->quarter_pixel_search = 1; + sf->half_pixel_search = 1; + sf->iterative_sub_pixel = 1; + sf->optimize_coefficients = 1; + sf->use_fastquant_for_pick = 0; + sf->no_skip_block4x4_search = 1; + + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->improved_mv_pred = 1; + + /* default thresholds to 0 */ + for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = 0; + + /* Count enabled references */ + ref_frames = 1; + if (cpi->ref_frame_flags & VP8_LAST_FRAME) ref_frames++; + if (cpi->ref_frame_flags & VP8_GOLD_FRAME) ref_frames++; + if (cpi->ref_frame_flags & VP8_ALTR_FRAME) ref_frames++; + + /* Convert speed to continuous range, with clamping */ + if (Mode == 0) { + Speed = 0; + } else if (Mode == 2) { + Speed = RT(Speed); + } else { + if (Speed > 5) Speed = 5; + Speed = GOOD(Speed); + } + + sf->thresh_mult[THR_ZERO1] = sf->thresh_mult[THR_NEAREST1] = + sf->thresh_mult[THR_NEAR1] = sf->thresh_mult[THR_DC] = 0; /* always */ + + sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO3] = + sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST3] = + sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR3] = + speed_map(Speed, thresh_mult_map_znn); + + sf->thresh_mult[THR_V_PRED] = sf->thresh_mult[THR_H_PRED] = + speed_map(Speed, thresh_mult_map_vhpred); + sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred); + sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm); + sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1); + sf->thresh_mult[THR_NEW2] = sf->thresh_mult[THR_NEW3] = + speed_map(Speed, thresh_mult_map_new2); + sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1); + sf->thresh_mult[THR_SPLIT2] = sf->thresh_mult[THR_SPLIT3] = + speed_map(Speed, thresh_mult_map_split2); + + // Special case for temporal layers. + // Reduce the thresholds for zero/nearest/near for GOLDEN, if GOLDEN is + // used as second reference. We don't modify thresholds for ALTREF case + // since ALTREF is usually used as long-term reference in temporal layers. + if ((cpi->Speed <= 6) && (cpi->oxcf.number_of_layers > 1) && + (cpi->ref_frame_flags & VP8_LAST_FRAME) && + (cpi->ref_frame_flags & VP8_GOLD_FRAME)) { + if (cpi->closest_reference_frame == GOLDEN_FRAME) { + sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 3; + sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 3; + sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 3; + } else { + sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 1; + sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 1; + sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 1; + } + } + + cpi->mode_check_freq[THR_ZERO1] = cpi->mode_check_freq[THR_NEAREST1] = + cpi->mode_check_freq[THR_NEAR1] = cpi->mode_check_freq[THR_TM] = + cpi->mode_check_freq[THR_DC] = 0; /* always */ + + cpi->mode_check_freq[THR_ZERO2] = cpi->mode_check_freq[THR_ZERO3] = + cpi->mode_check_freq[THR_NEAREST2] = cpi->mode_check_freq[THR_NEAREST3] = + speed_map(Speed, mode_check_freq_map_zn2); + + cpi->mode_check_freq[THR_NEAR2] = cpi->mode_check_freq[THR_NEAR3] = + speed_map(Speed, mode_check_freq_map_near2); + + cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] = + cpi->mode_check_freq[THR_B_PRED] = + speed_map(Speed, mode_check_freq_map_vhbpred); + + // For real-time mode at speed 10 keep the mode_check_freq threshold + // for NEW1 similar to that of speed 9. + Speed2 = Speed; + if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1); + + cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] = + speed_map(Speed, mode_check_freq_map_new2); + + cpi->mode_check_freq[THR_SPLIT1] = + speed_map(Speed, mode_check_freq_map_split1); + cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] = + speed_map(Speed, mode_check_freq_map_split2); + Speed = cpi->Speed; + switch (Mode) { +#if !CONFIG_REALTIME_ONLY + case 0: /* best quality mode */ + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + break; + case 1: + case 3: + if (Speed > 0) { + /* Disable coefficient optimization above speed 0 */ + sf->optimize_coefficients = 0; + sf->use_fastquant_for_pick = 1; + sf->no_skip_block4x4_search = 0; + + sf->first_step = 1; + } + + if (Speed > 2) { + sf->improved_quant = 0; + sf->improved_dct = 0; + + /* Only do recode loop on key frames, golden frames and + * alt ref frames + */ + sf->recode_loop = 2; + } + + if (Speed > 3) { + sf->auto_filter = 1; + sf->recode_loop = 0; /* recode loop off */ + sf->RD = 0; /* Turn rd off */ + } + + if (Speed > 4) { + sf->auto_filter = 0; /* Faster selection of loop filter */ + } + + break; +#endif + case 2: + sf->optimize_coefficients = 0; + sf->recode_loop = 0; + sf->auto_filter = 1; + sf->iterative_sub_pixel = 1; + sf->search_method = NSTEP; + + if (Speed > 0) { + sf->improved_quant = 0; + sf->improved_dct = 0; + + sf->use_fastquant_for_pick = 1; + sf->no_skip_block4x4_search = 0; + sf->first_step = 1; + } + + if (Speed > 2) sf->auto_filter = 0; /* Faster selection of loop filter */ + + if (Speed > 3) { + sf->RD = 0; + sf->auto_filter = 1; + } + + if (Speed > 4) { + sf->auto_filter = 0; /* Faster selection of loop filter */ + sf->search_method = HEX; + sf->iterative_sub_pixel = 0; + } + + if (Speed > 6) { + unsigned int sum = 0; + unsigned int total_mbs = cm->MBs; + int thresh; + unsigned int total_skip; + + int min = 2000; + + if (cpi->oxcf.encode_breakout > 2000) min = cpi->oxcf.encode_breakout; + + min >>= 7; + + for (i = 0; i < min; ++i) { + sum += cpi->mb.error_bins[i]; + } + + total_skip = sum; + sum = 0; + + /* i starts from 2 to make sure thresh started from 2048 */ + for (; i < 1024; ++i) { + sum += cpi->mb.error_bins[i]; + + if (10 * sum >= + (unsigned int)(cpi->Speed - 6) * (total_mbs - total_skip)) { + break; + } + } + + i--; + thresh = (i << 7); + + if (thresh < 2000) thresh = 2000; + + if (ref_frames > 1) { + sf->thresh_mult[THR_NEW1] = thresh; + sf->thresh_mult[THR_NEAREST1] = thresh >> 1; + sf->thresh_mult[THR_NEAR1] = thresh >> 1; + } + + if (ref_frames > 2) { + sf->thresh_mult[THR_NEW2] = thresh << 1; + sf->thresh_mult[THR_NEAREST2] = thresh; + sf->thresh_mult[THR_NEAR2] = thresh; + } + + if (ref_frames > 3) { + sf->thresh_mult[THR_NEW3] = thresh << 1; + sf->thresh_mult[THR_NEAREST3] = thresh; + sf->thresh_mult[THR_NEAR3] = thresh; + } + + sf->improved_mv_pred = 0; + } + + if (Speed > 8) sf->quarter_pixel_search = 0; + + if (cm->version == 0) { + cm->filter_type = NORMAL_LOOPFILTER; + + if (Speed >= 14) cm->filter_type = SIMPLE_LOOPFILTER; + } else { + cm->filter_type = SIMPLE_LOOPFILTER; + } + + /* This has a big hit on quality. Last resort */ + if (Speed >= 15) sf->half_pixel_search = 0; + + memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins)); + + } /* switch */ + + /* Slow quant, dct and trellis not worthwhile for first pass + * so make sure they are always turned off. + */ + if (cpi->pass == 1) { + sf->improved_quant = 0; + sf->optimize_coefficients = 0; + sf->improved_dct = 0; + } + + if (cpi->sf.search_method == NSTEP) { + vp8_init3smotion_compensation(&cpi->mb, + cm->yv12_fb[cm->lst_fb_idx].y_stride); + } else if (cpi->sf.search_method == DIAMOND) { + vp8_init_dsmotion_compensation(&cpi->mb, + cm->yv12_fb[cm->lst_fb_idx].y_stride); + } + + if (cpi->sf.improved_dct) { + cpi->mb.short_fdct8x4 = vp8_short_fdct8x4; + cpi->mb.short_fdct4x4 = vp8_short_fdct4x4; + } else { + /* No fast FDCT defined for any platform at this time. */ + cpi->mb.short_fdct8x4 = vp8_short_fdct8x4; + cpi->mb.short_fdct4x4 = vp8_short_fdct4x4; + } + + cpi->mb.short_walsh4x4 = vp8_short_walsh4x4; + + if (cpi->sf.improved_quant) { + cpi->mb.quantize_b = vp8_regular_quantize_b; + } else { + cpi->mb.quantize_b = vp8_fast_quantize_b; + } + if (cpi->sf.improved_quant != last_improved_quant) vp8cx_init_quantizer(cpi); + + if (cpi->sf.iterative_sub_pixel == 1) { + cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively; + } else if (cpi->sf.quarter_pixel_search) { + cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step; + } else if (cpi->sf.half_pixel_search) { + cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step; + } else { + cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step; + } + + if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1) { + cpi->mb.optimize = 1; + } else { + cpi->mb.optimize = 0; + } + + if (cpi->common.full_pixel) { + cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step; + } + +#ifdef SPEEDSTATS + frames_at_speed[cpi->Speed]++; +#endif +} +#undef GOOD +#undef RT + +static void alloc_raw_frame_buffers(VP8_COMP *cpi) { +#if VP8_TEMPORAL_ALT_REF + int width = (cpi->oxcf.Width + 15) & ~15; + int height = (cpi->oxcf.Height + 15) & ~15; +#endif + + cpi->lookahead = vp8_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height, + cpi->oxcf.lag_in_frames); + if (!cpi->lookahead) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); + } + +#if VP8_TEMPORAL_ALT_REF + + if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer, width, height, + VP8BORDERINPIXELS)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); + } + +#endif +} + +static void dealloc_raw_frame_buffers(VP8_COMP *cpi) { +#if VP8_TEMPORAL_ALT_REF + vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer); +#endif + vp8_lookahead_destroy(cpi->lookahead); +} + +static int vp8_alloc_partition_data(VP8_COMP *cpi) { + vpx_free(cpi->mb.pip); + + cpi->mb.pip = + vpx_calloc((cpi->common.mb_cols + 1) * (cpi->common.mb_rows + 1), + sizeof(PARTITION_INFO)); + if (!cpi->mb.pip) return 1; + + cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1; + + return 0; +} + +void vp8_alloc_compressor_data(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + int width = cm->Width; + int height = cm->Height; + + if (vp8_alloc_frame_buffers(cm, width, height)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + } + + if (vp8_alloc_partition_data(cpi)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate partition data"); + } + + if ((width & 0xf) != 0) width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) height += 16 - (height & 0xf); + + if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame, width, height, + VP8BORDERINPIXELS)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + } + + if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, + VP8BORDERINPIXELS)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + } + + vpx_free(cpi->tok); + + { +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */ +#else + unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; +#endif + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); + } + + /* Data used for real time vc mode to see if gf needs refreshing */ + cpi->zeromv_count = 0; + + /* Structures used to monitor GF usage */ + vpx_free(cpi->gf_active_flags); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->gf_active_flags, + vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); + cpi->gf_active_count = cm->mb_rows * cm->mb_cols; + + vpx_free(cpi->mb_activity_map); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb_activity_map, + vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); + + /* allocate memory for storing last frame's MVs for MV prediction. */ + vpx_free(cpi->lfmv); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); + vpx_free(cpi->lf_ref_frame_sign_bias); + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), + sizeof(*cpi->lf_ref_frame_sign_bias))); + vpx_free(cpi->lf_ref_frame); + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), + sizeof(*cpi->lf_ref_frame))); + + /* Create the encoder segmentation map and set all entries to 0 */ + vpx_free(cpi->segmentation_map); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->segmentation_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); + cpi->cyclic_refresh_mode_index = 0; + vpx_free(cpi->active_map); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); + memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); + +#if CONFIG_MULTITHREAD + if (width < 640) { + cpi->mt_sync_range = 1; + } else if (width <= 1280) { + cpi->mt_sync_range = 4; + } else if (width <= 2560) { + cpi->mt_sync_range = 8; + } else { + cpi->mt_sync_range = 16; + } +#endif + + vpx_free(cpi->tplist); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp8_denoiser_free(&cpi->denoiser); + if (vp8_denoiser_allocate(&cpi->denoiser, width, height, cm->mb_rows, + cm->mb_cols, cpi->oxcf.noise_sensitivity)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } + } +#endif +} + +/* Quant MOD */ +static const int q_trans[] = { + 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127, +}; + +int vp8_reverse_trans(int x) { + int i; + + for (i = 0; i < 64; ++i) { + if (q_trans[i] >= x) return i; + } + + return 63; +} +void vp8_new_framerate(VP8_COMP *cpi, double framerate) { + if (framerate < .1) framerate = 30; + + cpi->framerate = framerate; + cpi->output_framerate = framerate; + cpi->per_frame_bandwidth = + (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate); + cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; + cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); + + /* Set Maximum gf/arf interval */ + cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2); + + if (cpi->max_gf_interval < 12) cpi->max_gf_interval = 12; + + /* Extended interval for genuinely static scenes */ + cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; + + /* Special conditions when altr ref frame enabled in lagged compress mode */ + if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) { + if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1) { + cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1; + } + + if (cpi->twopass.static_scene_max_gf_interval > + cpi->oxcf.lag_in_frames - 1) { + cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1; + } + } + + if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval) { + cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval; + } +} + +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { + VP8_COMMON *cm = &cpi->common; + + cpi->oxcf = *oxcf; + + cpi->auto_gold = 1; + cpi->auto_adjust_gold_quantizer = 1; + + cm->version = oxcf->Version; + vp8_setup_version(cm); + + /* Frame rate is not available on the first frame, as it's derived from + * the observed timestamps. The actual value used here doesn't matter + * too much, as it will adapt quickly. + */ + if (oxcf->timebase.num > 0) { + cpi->framerate = + (double)(oxcf->timebase.den) / (double)(oxcf->timebase.num); + } else { + cpi->framerate = 30; + } + + /* If the reciprocal of the timebase seems like a reasonable framerate, + * then use that as a guess, otherwise use 30. + */ + if (cpi->framerate > 180) cpi->framerate = 30; + + cpi->ref_framerate = cpi->framerate; + + cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME; + + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + cm->refresh_entropy_probs = 1; + + /* change includes all joint functionality */ + vp8_change_config(cpi, oxcf); + + /* Initialize active best and worst q and average q values. */ + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + + /* Initialise the starting buffer levels */ + cpi->buffer_level = cpi->oxcf.starting_buffer_level; + cpi->bits_off_target = cpi->oxcf.starting_buffer_level; + + cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; + + cpi->total_actual_bits = 0; + cpi->total_target_vs_actual = 0; + + /* Temporal scalabilty */ + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + double prev_layer_framerate = 0; + + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + prev_layer_framerate = + cpi->output_framerate / cpi->oxcf.rate_decimator[i]; + } + } + +#if VP8_TEMPORAL_ALT_REF + { + int i; + + cpi->fixed_divide[0] = 0; + + for (i = 1; i < 512; ++i) cpi->fixed_divide[i] = 0x80000 / i; + } +#endif +} + +void vp8_update_layer_contexts(VP8_COMP *cpi) { + VP8_CONFIG *oxcf = &cpi->oxcf; + + /* Update snapshots of the layer contexts to reflect new parameters */ + if (oxcf->number_of_layers > 1) { + unsigned int i; + double prev_layer_framerate = 0; + + assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS); + for (i = 0; i < oxcf->number_of_layers && i < VPX_TS_MAX_LAYERS; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + + lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i]; + lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; + + lc->starting_buffer_level = rescale( + (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000); + + if (oxcf->optimal_buffer_level == 0) { + lc->optimal_buffer_level = lc->target_bandwidth / 8; + } else { + lc->optimal_buffer_level = rescale( + (int)oxcf->optimal_buffer_level_in_ms, lc->target_bandwidth, 1000); + } + + if (oxcf->maximum_buffer_size == 0) { + lc->maximum_buffer_size = lc->target_bandwidth / 8; + } else { + lc->maximum_buffer_size = rescale((int)oxcf->maximum_buffer_size_in_ms, + lc->target_bandwidth, 1000); + } + + /* Work out the average size of a frame within this layer */ + if (i > 0) { + lc->avg_frame_size_for_layer = + (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); + } + + prev_layer_framerate = lc->framerate; + } + } +} + +void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { + VP8_COMMON *cm = &cpi->common; + int last_w, last_h; + unsigned int prev_number_of_layers; + double raw_target_rate; + + if (!cpi) return; + + if (!oxcf) return; + + if (cm->version != oxcf->Version) { + cm->version = oxcf->Version; + vp8_setup_version(cm); + } + + last_w = cpi->oxcf.Width; + last_h = cpi->oxcf.Height; + prev_number_of_layers = cpi->oxcf.number_of_layers; + + cpi->oxcf = *oxcf; + + switch (cpi->oxcf.Mode) { + case MODE_REALTIME: + cpi->pass = 0; + cpi->compressor_speed = 2; + + if (cpi->oxcf.cpu_used < -16) { + cpi->oxcf.cpu_used = -16; + } + + if (cpi->oxcf.cpu_used > 16) cpi->oxcf.cpu_used = 16; + + break; + + case MODE_GOODQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) cpi->oxcf.cpu_used = 5; + + break; + + case MODE_BESTQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 0; + break; + + case MODE_FIRSTPASS: + cpi->pass = 1; + cpi->compressor_speed = 1; + break; + case MODE_SECONDPASS: + cpi->pass = 2; + cpi->compressor_speed = 1; + + if (cpi->oxcf.cpu_used < -5) { + cpi->oxcf.cpu_used = -5; + } + + if (cpi->oxcf.cpu_used > 5) cpi->oxcf.cpu_used = 5; + + break; + case MODE_SECONDPASS_BEST: + cpi->pass = 2; + cpi->compressor_speed = 0; + break; + } + + if (cpi->pass == 0) cpi->auto_worst_q = 1; + + cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; + cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; + cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; + + if (oxcf->fixed_q >= 0) { + if (oxcf->worst_allowed_q < 0) { + cpi->oxcf.fixed_q = q_trans[0]; + } else { + cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q]; + } + + if (oxcf->alt_q < 0) { + cpi->oxcf.alt_q = q_trans[0]; + } else { + cpi->oxcf.alt_q = q_trans[oxcf->alt_q]; + } + + if (oxcf->key_q < 0) { + cpi->oxcf.key_q = q_trans[0]; + } else { + cpi->oxcf.key_q = q_trans[oxcf->key_q]; + } + + if (oxcf->gold_q < 0) { + cpi->oxcf.gold_q = q_trans[0]; + } else { + cpi->oxcf.gold_q = q_trans[oxcf->gold_q]; + } + } + + cpi->ext_refresh_frame_flags_pending = 0; + + cpi->baseline_gf_interval = + cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + + // GF behavior for 1 pass CBR, used when error_resilience is off. + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode == MODE_REALTIME) + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + cpi->oxcf.token_partitions = 3; +#endif + + if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) { + cm->multi_token_partition = (TOKEN_PARTITION)cpi->oxcf.token_partitions; + } + + setup_features(cpi); + + if (!cpi->use_roi_static_threshold) { + int i; + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; + } + } + + /* At the moment the first order values may not be > MAXQ */ + if (cpi->oxcf.fixed_q > MAXQ) cpi->oxcf.fixed_q = MAXQ; + + /* local file playback mode == really big buffer */ + if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) { + cpi->oxcf.starting_buffer_level = 60000; + cpi->oxcf.optimal_buffer_level = 60000; + cpi->oxcf.maximum_buffer_size = 240000; + cpi->oxcf.starting_buffer_level_in_ms = 60000; + cpi->oxcf.optimal_buffer_level_in_ms = 60000; + cpi->oxcf.maximum_buffer_size_in_ms = 240000; + } + + raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 * + cpi->framerate / 1000.0); + if (cpi->oxcf.target_bandwidth > raw_target_rate) + cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate; + /* Convert target bandwidth from Kbit/s to Bit/s */ + cpi->oxcf.target_bandwidth *= 1000; + + cpi->oxcf.starting_buffer_level = rescale( + (int)cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); + + /* Set or reset optimal and maximum buffer levels. */ + if (cpi->oxcf.optimal_buffer_level == 0) { + cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; + } else { + cpi->oxcf.optimal_buffer_level = rescale( + (int)cpi->oxcf.optimal_buffer_level, cpi->oxcf.target_bandwidth, 1000); + } + + if (cpi->oxcf.maximum_buffer_size == 0) { + cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; + } else { + cpi->oxcf.maximum_buffer_size = rescale((int)cpi->oxcf.maximum_buffer_size, + cpi->oxcf.target_bandwidth, 1000); + } + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + cpi->buffer_level = cpi->bits_off_target; + } + + /* Set up frame rate and related parameters rate control values. */ + vp8_new_framerate(cpi, cpi->framerate); + + /* Set absolute upper and lower quality limits */ + cpi->worst_quality = cpi->oxcf.worst_allowed_q; + cpi->best_quality = cpi->oxcf.best_allowed_q; + + /* active values should only be modified if out of new range */ + if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) { + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + } + /* less likely */ + else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) { + cpi->active_worst_quality = cpi->oxcf.best_allowed_q; + } + if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + } + /* less likely */ + else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) { + cpi->active_best_quality = cpi->oxcf.worst_allowed_q; + } + + cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0; + + cpi->cq_target_quality = cpi->oxcf.cq_level; + + /* Only allow dropped frames in buffered mode */ + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + + cpi->target_bandwidth = cpi->oxcf.target_bandwidth; + + // Check if the number of temporal layers has changed, and if so reset the + // pattern counter and set/initialize the temporal layer context for the + // new layer configuration. + if (cpi->oxcf.number_of_layers != prev_number_of_layers) { + // If the number of temporal layers are changed we must start at the + // base of the pattern cycle, so set the layer id to 0 and reset + // the temporal pattern counter. + if (cpi->temporal_layer_id > 0) { + cpi->temporal_layer_id = 0; + } + cpi->temporal_pattern_counter = 0; + vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); + } + + if (!cpi->initial_width) { + cpi->initial_width = cpi->oxcf.Width; + cpi->initial_height = cpi->oxcf.Height; + } + + cm->Width = cpi->oxcf.Width; + cm->Height = cpi->oxcf.Height; + assert(cm->Width <= cpi->initial_width); + assert(cm->Height <= cpi->initial_height); + + /* TODO(jkoleszar): if an internal spatial resampling is active, + * and we downsize the input image, maybe we should clear the + * internal scale immediately rather than waiting for it to + * correct. + */ + + /* VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) */ + if (cpi->oxcf.Sharpness > 7) cpi->oxcf.Sharpness = 7; + + cm->sharpness_level = cpi->oxcf.Sharpness; + + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { + int hr, hs, vr, vs; + + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + /* always go to the next whole number */ + cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; + cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; + } + + if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height) { + cpi->force_next_frame_intra = 1; + } + + if (((cm->Width + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_width || + ((cm->Height + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_height || + cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { + dealloc_raw_frame_buffers(cpi); + alloc_raw_frame_buffers(cpi); + vp8_alloc_compressor_data(cpi); + } + + if (cpi->oxcf.fixed_q >= 0) { + cpi->last_q[0] = cpi->oxcf.fixed_q; + cpi->last_q[1] = cpi->oxcf.fixed_q; + } + + cpi->Speed = cpi->oxcf.cpu_used; + + /* force to allowlag to 0 if lag_in_frames is 0; */ + if (cpi->oxcf.lag_in_frames == 0) { + cpi->oxcf.allow_lag = 0; + } + /* Limit on lag buffers as these are not currently dynamically allocated */ + else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) { + cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; + } + + /* YX Temp */ + cpi->alt_ref_source = NULL; + cpi->is_src_frame_alt_ref = 0; + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc) { + int width = (cpi->oxcf.Width + 15) & ~15; + int height = (cpi->oxcf.Height + 15) & ~15; + if (vp8_denoiser_allocate(&cpi->denoiser, width, height, cm->mb_rows, + cm->mb_cols, cpi->oxcf.noise_sensitivity)) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } + } + } +#endif + +#if 0 + /* Experimental RD Code */ + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif +} + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +static void cal_mvsadcosts(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 300; + mvsadcost[1][0] = 300; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= mvfp_max); +} + +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { + int i; + + VP8_COMP *cpi; + VP8_COMMON *cm; + + cpi = vpx_memalign(32, sizeof(VP8_COMP)); + /* Check that the CPI instance is valid */ + if (!cpi) return 0; + + cm = &cpi->common; + + memset(cpi, 0, sizeof(VP8_COMP)); + + if (setjmp(cm->error.jmp)) { + cpi->common.error.setjmp = 0; + vp8_remove_compressor(&cpi); + return 0; + } + + cpi->common.error.setjmp = 1; + + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); + + vp8_create_common(&cpi->common); + + init_config(cpi, oxcf); + + memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, + sizeof(vp8cx_base_skip_false_prob)); + cpi->common.current_video_frame = 0; + cpi->temporal_pattern_counter = 0; + cpi->temporal_layer_id = -1; + cpi->kf_overspend_bits = 0; + cpi->kf_bitrate_adjustment = 0; + cpi->frames_till_gf_update_due = 0; + cpi->gf_overspend_bits = 0; + cpi->non_gf_bitrate_adjustment = 0; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + cpi->prob_intra_coded = 63; + + /* Prime the recent reference frame usage counters. + * Hereafter they will be maintained as a sort of moving average + */ + cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; + cpi->recent_ref_frame_usage[LAST_FRAME] = 1; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1; + cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1; + + /* Set reference frame sign bias for ALTREF frame to 1 (for now) */ + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + + cpi->twopass.gf_decay_rate = 0; + cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; + + cpi->gold_is_last = 0; + cpi->alt_is_last = 0; + cpi->gold_is_alt = 0; + + cpi->active_map_enabled = 0; + + cpi->use_roi_static_threshold = 0; + +#if 0 + /* Experimental code for lagged and one pass */ + /* Initialise one_pass GF frames stats */ + /* Update stats used for GF selection */ + if (cpi->pass == 0) + { + cpi->one_pass_frame_index = 0; + + for (i = 0; i < MAX_LAG_BUFFERS; ++i) + { + cpi->one_pass_frame_stats[i].frames_so_far = 0; + cpi->one_pass_frame_stats[i].frame_intra_error = 0.0; + cpi->one_pass_frame_stats[i].frame_coded_error = 0.0; + cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0; + cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0; + cpi->one_pass_frame_stats[i].frame_mvr = 0.0; + cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0; + cpi->one_pass_frame_stats[i].frame_mvc = 0.0; + cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0; + } + } +#endif + + cpi->mse_source_denoised = 0; + + /* Should we use the cyclic refresh method. + * Currently there is no external control for this. + * Enable it for error_resilient_mode, or for 1 pass CBR mode. + */ + cpi->cyclic_refresh_mode_enabled = + (cpi->oxcf.error_resilient_mode || + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode <= 2)); + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 7; + if (cpi->oxcf.number_of_layers == 1) { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 20; + } else if (cpi->oxcf.number_of_layers == 2) { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 10; + } + cpi->cyclic_refresh_mode_index = 0; + cpi->cyclic_refresh_q = 32; + + // GF behavior for 1 pass CBR, used when error_resilience is off. + cpi->gf_update_onepass_cbr = 0; + cpi->gf_noboost_onepass_cbr = 0; + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cpi->oxcf.Mode <= 2) { + cpi->gf_update_onepass_cbr = 1; + cpi->gf_noboost_onepass_cbr = 1; + cpi->gf_interval_onepass_cbr = + cpi->cyclic_refresh_mode_max_mbs_perframe > 0 + ? (2 * (cpi->common.mb_rows * cpi->common.mb_cols) / + cpi->cyclic_refresh_mode_max_mbs_perframe) + : 10; + cpi->gf_interval_onepass_cbr = + VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr)); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + } + + if (cpi->cyclic_refresh_mode_enabled) { + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, + vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); + } else { + cpi->cyclic_refresh_map = (signed char *)NULL; + } + + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); + + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, + vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, + vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); + + /*Initialize the feed-forward activity masking.*/ + cpi->activity_avg = 90 << 12; + + /* Give a sensible default for the first frame. */ + cpi->frames_since_key = 8; + cpi->key_frame_frequency = cpi->oxcf.key_freq; + cpi->this_key_frame_forced = 0; + cpi->next_key_frame_forced = 0; + + cpi->source_alt_ref_pending = 0; + cpi->source_alt_ref_active = 0; + cpi->common.refresh_alt_ref_frame = 0; + + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; + cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; + + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; +#if CONFIG_INTERNAL_STATS + cpi->b_calculate_ssimg = 0; + + cpi->count = 0; + cpi->bytes = 0; + + if (cpi->b_calculate_psnr) { + cpi->total_sq_error = 0.0; + cpi->total_sq_error2 = 0.0; + cpi->total_y = 0.0; + cpi->total_u = 0.0; + cpi->total_v = 0.0; + cpi->total = 0.0; + cpi->totalp_y = 0.0; + cpi->totalp_u = 0.0; + cpi->totalp_v = 0.0; + cpi->totalp = 0.0; + cpi->tot_recode_hits = 0; + cpi->summed_quality = 0; + cpi->summed_weights = 0; + } + +#endif + + cpi->first_time_stamp_ever = 0x7FFFFFFF; + + cpi->frames_till_gf_update_due = 0; + cpi->key_frame_count = 1; + + cpi->ni_av_qi = cpi->oxcf.worst_allowed_q; + cpi->ni_tot_qi = 0; + cpi->ni_frames = 0; + cpi->total_byte_count = 0; + + cpi->drop_frame = 0; + + cpi->rate_correction_factor = 1.0; + cpi->key_frame_rate_correction_factor = 1.0; + cpi->gf_rate_correction_factor = 1.0; + cpi->twopass.est_max_qcorrection_factor = 1.0; + + for (i = 0; i < KEY_FRAME_CONTEXT; ++i) { + cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; + } + +#ifdef OUTPUT_YUV_SRC + yuv_file = fopen("bd.yuv", "ab"); +#endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "ab"); +#endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif + +#if 0 + framepsnr = fopen("framepsnr.stt", "a"); + kf_list = fopen("kf_list.stt", "w"); +#endif + + cpi->output_pkt_list = oxcf->output_pkt_list; + +#if !CONFIG_REALTIME_ONLY + + if (cpi->pass == 1) { + vp8_init_first_pass(cpi); + } else if (cpi->pass == 2) { + size_t packet_sz = sizeof(FIRSTPASS_STATS); + int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); + + cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; + cpi->twopass.stats_in = cpi->twopass.stats_in_start; + cpi->twopass.stats_in_end = + (void *)((char *)cpi->twopass.stats_in + (packets - 1) * packet_sz); + vp8_init_second_pass(cpi); + } + +#endif + + if (cpi->compressor_speed == 2) { + cpi->avg_encode_time = 0; + cpi->avg_pick_mode_time = 0; + } + + vp8_set_speed_features(cpi); + + /* Set starting values of RD threshold multipliers (128 = *1) */ + for (i = 0; i < MAX_MODES; ++i) { + cpi->mb.rd_thresh_mult[i] = 128; + } + +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(cpi)) { + vp8_remove_compressor(&cpi); + return 0; + } +#endif + + cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; + cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; + cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16; + cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; + + cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; + cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; + cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8; + cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; + + cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; + cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; + cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16; + cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; + + cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; + cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; + cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8; + cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; + + cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; + cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; + cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4; + cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_8X8].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn; +#endif + + cpi->diamond_search_sad = vp8_diamond_search_sad; + cpi->refining_search_sad = vp8_refining_search_sad; + + /* make sure frame 1 is okay */ + cpi->mb.error_bins[0] = cpi->common.MBs; + + /* vp8cx_init_quantizer() is first called here. Add check in + * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only + * called later when needed. This will avoid unnecessary calls of + * vp8cx_init_quantizer() for every frame. + */ + vp8cx_init_quantizer(cpi); + + vp8_loop_filter_init(cm); + + cpi->common.error.setjmp = 0; + +#if CONFIG_MULTI_RES_ENCODING + + /* Calculate # of MBs in a row in lower-resolution level image. */ + if (cpi->oxcf.mr_encoder_id > 0) vp8_cal_low_res_mb_cols(cpi); + +#endif + + /* setup RD costs to MACROBLOCK struct */ + + cpi->mb.mvcost[0] = &cpi->rd_costs.mvcosts[0][mv_max + 1]; + cpi->mb.mvcost[1] = &cpi->rd_costs.mvcosts[1][mv_max + 1]; + cpi->mb.mvsadcost[0] = &cpi->rd_costs.mvsadcosts[0][mvfp_max + 1]; + cpi->mb.mvsadcost[1] = &cpi->rd_costs.mvsadcosts[1][mvfp_max + 1]; + + cal_mvsadcosts(cpi->mb.mvsadcost); + + cpi->mb.mbmode_cost = cpi->rd_costs.mbmode_cost; + cpi->mb.intra_uv_mode_cost = cpi->rd_costs.intra_uv_mode_cost; + cpi->mb.bmode_costs = cpi->rd_costs.bmode_costs; + cpi->mb.inter_bmode_costs = cpi->rd_costs.inter_bmode_costs; + cpi->mb.token_costs = cpi->rd_costs.token_costs; + + /* setup block ptrs & offsets */ + vp8_setup_block_ptrs(&cpi->mb); + vp8_setup_block_dptrs(&cpi->mb.e_mbd); + + return cpi; +} + +void vp8_remove_compressor(VP8_COMP **comp) { + VP8_COMP *cpi = *comp; + + if (!cpi) return; + + if (cpi && (cpi->common.current_video_frame > 0)) { +#if !CONFIG_REALTIME_ONLY + + if (cpi->pass == 2) { + vp8_end_second_pass(cpi); + } + +#endif + +#if CONFIG_INTERNAL_STATS + + if (cpi->pass != 1) { + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / + 10000000.000; + + if (cpi->b_calculate_psnr) { + if (cpi->oxcf.number_of_layers > 1) { + int i; + + fprintf(f, + "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" + "GLPsnrP\tVPXSSIM\n"); + for (i = 0; i < (int)cpi->oxcf.number_of_layers; ++i) { + double dr = + (double)cpi->bytes_in_layer[i] * 8.0 / 1000.0 / time_encoded; + double samples = 3.0 / 2 * cpi->frames_in_layer[i] * + cpi->common.Width * cpi->common.Height; + double total_psnr = + vpx_sse_to_psnr(samples, 255.0, cpi->total_error2[i]); + double total_psnr2 = + vpx_sse_to_psnr(samples, 255.0, cpi->total_error2_p[i]); + double total_ssim = + 100 * pow(cpi->sum_ssim[i] / cpi->sum_weights[i], 8.0); + + fprintf(f, + "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\n", + i, dr, cpi->sum_psnr[i] / cpi->frames_in_layer[i], + total_psnr, cpi->sum_psnr_p[i] / cpi->frames_in_layer[i], + total_psnr2, total_ssim); + } + } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; + double samples = + 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; + double total_psnr = + vpx_sse_to_psnr(samples, 255.0, cpi->total_sq_error); + double total_psnr2 = + vpx_sse_to_psnr(samples, 255.0, cpi->total_sq_error2); + double total_ssim = + 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); + + fprintf(f, + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" + "GLPsnrP\tVPXSSIM\n"); + fprintf(f, + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\n", + dr, cpi->total / cpi->count, total_psnr, + cpi->totalp / cpi->count, total_psnr2, total_ssim); + } + } + fclose(f); +#if 0 + f = fopen("qskip.stt", "a"); + fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount); + fclose(f); +#endif + } + +#endif + +#ifdef SPEEDSTATS + + if (cpi->compressor_speed == 2) { + int i; + FILE *f = fopen("cxspeed.stt", "a"); + cnt_pm /= cpi->common.MBs; + + for (i = 0; i < 16; ++i) fprintf(f, "%5d", frames_at_speed[i]); + + fprintf(f, "\n"); + fclose(f); + } + +#endif + +#ifdef MODE_STATS + { + extern int count_mb_seg[4]; + FILE *f = fopen("modes.stt", "a"); + double dr = (double)cpi->framerate * (double)bytes * (double)8 / + (double)count / (double)1000; + fprintf(f, "intra_mode in Intra Frames:\n"); + fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], + y_modes[2], y_modes[3], y_modes[4]); + fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], + uv_modes[2], uv_modes[3]); + fprintf(f, "B: "); + { + int i; + + for (i = 0; i < 10; ++i) fprintf(f, "%8d, ", b_modes[i]); + + fprintf(f, "\n"); + } + + fprintf(f, "Modes in Inter Frames:\n"); + fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n", + inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], + inter_y_modes[3], inter_y_modes[4], inter_y_modes[5], + inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], + inter_y_modes[9]); + fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], + inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]); + fprintf(f, "B: "); + { + int i; + + for (i = 0; i < 15; ++i) fprintf(f, "%8d, ", inter_b_modes[i]); + + fprintf(f, "\n"); + } + fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], + count_mb_seg[2], count_mb_seg[3]); + fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], + inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], + inter_b_modes[NEW4X4]); + + fclose(f); + } +#endif + +#if defined(SECTIONBITS_OUTPUT) + + if (0) { + int i; + FILE *f = fopen("tokenbits.stt", "a"); + + for (i = 0; i < 28; ++i) fprintf(f, "%8d", (int)(Sectionbits[i] / 256)); + + fprintf(f, "\n"); + fclose(f); + } + +#endif + +#if 0 + { + printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); + } +#endif + } + +#if CONFIG_MULTITHREAD + vp8cx_remove_encoder_threads(cpi); +#endif + +#if CONFIG_TEMPORAL_DENOISING + vp8_denoiser_free(&cpi->denoiser); +#endif + dealloc_compressor_data(cpi); + vpx_free(cpi->mb.ss); + vpx_free(cpi->tok); + vpx_free(cpi->skin_map); + vpx_free(cpi->cyclic_refresh_map); + vpx_free(cpi->consec_zero_last); + vpx_free(cpi->consec_zero_last_mvbias); + + vp8_remove_common(&cpi->common); + vpx_free(cpi); + *comp = 0; + +#ifdef OUTPUT_YUV_SRC + fclose(yuv_file); +#endif +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif + +#if 0 + + if (keyfile) + fclose(keyfile); + + if (framepsnr) + fclose(framepsnr); + + if (kf_list) + fclose(kf_list); + +#endif +} + +static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, + unsigned char *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row + 16 <= rows; row += 16) { + for (col = 0; col + 16 <= cols; col += 16) { + unsigned int sse; + + vpx_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); + total_sse += sse; + } + + /* Handle odd-sized width */ + if (col < cols) { + unsigned int border_row, border_col; + unsigned char *border_orig = orig; + unsigned char *border_recon = recon; + + for (border_row = 0; border_row < 16; ++border_row) { + for (border_col = col; border_col < cols; ++border_col) { + diff = border_orig[border_col] - border_recon[border_col]; + total_sse += diff * diff; + } + + border_orig += orig_stride; + border_recon += recon_stride; + } + } + + orig += orig_stride * 16; + recon += recon_stride * 16; + } + + /* Handle odd-sized height */ + for (; row < rows; ++row) { + for (col = 0; col < cols; ++col) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + + vpx_clear_system_state(); + return total_sse; +} + +static void generate_psnr_packet(VP8_COMP *cpi) { + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + struct vpx_codec_cx_pkt pkt; + uint64_t sse; + int i; + unsigned int width = cpi->common.Width; + unsigned int height = cpi->common.Height; + + pkt.kind = VPX_CODEC_PSNR_PKT; + sse = calc_plane_error(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, width, height); + pkt.data.psnr.sse[0] = sse; + pkt.data.psnr.sse[1] = sse; + pkt.data.psnr.samples[0] = width * height; + pkt.data.psnr.samples[1] = width * height; + + width = (width + 1) / 2; + height = (height + 1) / 2; + + sse = calc_plane_error(orig->u_buffer, orig->uv_stride, recon->u_buffer, + recon->uv_stride, width, height); + pkt.data.psnr.sse[0] += sse; + pkt.data.psnr.sse[2] = sse; + pkt.data.psnr.samples[0] += width * height; + pkt.data.psnr.samples[2] = width * height; + + sse = calc_plane_error(orig->v_buffer, orig->uv_stride, recon->v_buffer, + recon->uv_stride, width, height); + pkt.data.psnr.sse[0] += sse; + pkt.data.psnr.sse[3] = sse; + pkt.data.psnr.samples[0] += width * height; + pkt.data.psnr.samples[3] = width * height; + + for (i = 0; i < 4; ++i) { + pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0, + (double)(pkt.data.psnr.sse[i])); + } + + vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); +} + +int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags) { + if (ref_frame_flags > 7) return -1; + + cpi->ref_frame_flags = ref_frame_flags; + return 0; +} +int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { + if (ref_frame_flags > 7) return -1; + + cpi->common.refresh_golden_frame = 0; + cpi->common.refresh_alt_ref_frame = 0; + cpi->common.refresh_last_frame = 0; + + if (ref_frame_flags & VP8_LAST_FRAME) cpi->common.refresh_last_frame = 1; + + if (ref_frame_flags & VP8_GOLD_FRAME) cpi->common.refresh_golden_frame = 1; + + if (ref_frame_flags & VP8_ALTR_FRAME) cpi->common.refresh_alt_ref_frame = 1; + + cpi->ext_refresh_frame_flags_pending = 1; + return 0; +} + +int vp8_get_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP8_COMMON *cm = &cpi->common; + int ref_fb_idx; + + if (ref_frame_flag == VP8_LAST_FRAME) { + ref_fb_idx = cm->lst_fb_idx; + } else if (ref_frame_flag == VP8_GOLD_FRAME) { + ref_fb_idx = cm->gld_fb_idx; + } else if (ref_frame_flag == VP8_ALTR_FRAME) { + ref_fb_idx = cm->alt_fb_idx; + } else { + return -1; + } + + vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); + + return 0; +} +int vp8_set_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP8_COMMON *cm = &cpi->common; + + int ref_fb_idx; + + if (ref_frame_flag == VP8_LAST_FRAME) { + ref_fb_idx = cm->lst_fb_idx; + } else if (ref_frame_flag == VP8_GOLD_FRAME) { + ref_fb_idx = cm->gld_fb_idx; + } else if (ref_frame_flag == VP8_ALTR_FRAME) { + ref_fb_idx = cm->alt_fb_idx; + } else { + return -1; + } + + vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]); + + return 0; +} +int vp8_update_entropy(VP8_COMP *cpi, int update) { + VP8_COMMON *cm = &cpi->common; + cm->refresh_entropy_probs = update; + + return 0; +} + +static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + /* are we resizing the image */ + if (cm->horiz_scale != 0 || cm->vert_scale != 0) { +#if CONFIG_SPATIAL_RESAMPLING + int hr, hs, vr, vs; + int tmp_height; + + if (cm->vert_scale == 3) { + tmp_height = 9; + } else { + tmp_height = 11; + } + + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + vpx_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer, + tmp_height, hs, hr, vs, vr, 0); + + vp8_yv12_extend_frame_borders(&cpi->scaled_source); + cpi->Source = &cpi->scaled_source; +#endif + } else { + cpi->Source = sd; + } +} + +static int resize_key_frame(VP8_COMP *cpi) { +#if CONFIG_SPATIAL_RESAMPLING + VP8_COMMON *cm = &cpi->common; + + /* Do we need to apply resampling for one pass cbr. + * In one pass this is more limited than in two pass cbr. + * The test and any change is only made once per key frame sequence. + */ + if (cpi->oxcf.allow_spatial_resampling && + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { + int hr, hs, vr, vs; + int new_width, new_height; + + /* If we are below the resample DOWN watermark then scale down a + * notch. + */ + if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * + cpi->oxcf.optimal_buffer_level / 100)) { + cm->horiz_scale = + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; + } + /* Should we now start scaling back up */ + else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * + cpi->oxcf.optimal_buffer_level / 100)) { + cm->horiz_scale = + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; + } + + /* Get the new height and width */ + Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; + new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs; + + /* If the image size has changed we need to reallocate the buffers + * and resample the source image + */ + if ((cm->Width != new_width) || (cm->Height != new_height)) { + cm->Width = new_width; + cm->Height = new_height; + vp8_alloc_compressor_data(cpi); + scale_and_extend_source(cpi->un_scaled_source, cpi); + return 1; + } + } + +#endif + return 0; +} + +static void update_alt_ref_frame_stats(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + /* Select an interval before next GF or altref */ + if (!cpi->auto_gold) cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL; + + if ((cpi->pass != 2) && cpi->frames_till_gf_update_due) { + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + + /* Set the bits per frame that we should try and recover in + * subsequent inter frames to account for the extra GF spend... + * note that his does not apply for GF updates that occur + * coincident with a key frame as the extra cost of key frames is + * dealt with elsewhere. + */ + cpi->gf_overspend_bits += cpi->projected_frame_size; + cpi->non_gf_bitrate_adjustment = + cpi->gf_overspend_bits / cpi->frames_till_gf_update_due; + } + + /* Update data structure that monitors level of reference to last GF */ + memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cpi->gf_active_count = cm->mb_rows * cm->mb_cols; + + /* this frame refreshes means next frames don't unless specified by user */ + cpi->frames_since_golden = 0; + + /* Clear the alternate reference update pending flag. */ + cpi->source_alt_ref_pending = 0; + + /* Set the alternate reference frame active flag */ + cpi->source_alt_ref_active = 1; +} +static void update_golden_frame_stats(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + /* Update the Golden frame usage counts. */ + if (cm->refresh_golden_frame) { + /* Select an interval before next GF */ + if (!cpi->auto_gold) cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL; + + if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0)) { + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + + /* Set the bits per frame that we should try and recover in + * subsequent inter frames to account for the extra GF spend... + * note that his does not apply for GF updates that occur + * coincident with a key frame as the extra cost of key frames + * is dealt with elsewhere. + */ + if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active) { + /* Calcluate GF bits to be recovered + * Projected size - av frame bits available for inter + * frames for clip as a whole + */ + cpi->gf_overspend_bits += + (cpi->projected_frame_size - cpi->inter_frame_target); + } + + cpi->non_gf_bitrate_adjustment = + cpi->gf_overspend_bits / cpi->frames_till_gf_update_due; + } + + /* Update data structure that monitors level of reference to last GF */ + memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cpi->gf_active_count = cm->mb_rows * cm->mb_cols; + + /* this frame refreshes means next frames don't unless specified by + * user + */ + cm->refresh_golden_frame = 0; + cpi->frames_since_golden = 0; + + cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; + cpi->recent_ref_frame_usage[LAST_FRAME] = 1; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1; + cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1; + + /* ******** Fixed Q test code only ************ */ + /* If we are going to use the ALT reference for the next group of + * frames set a flag to say so. + */ + if (cpi->oxcf.fixed_q >= 0 && cpi->oxcf.play_alternate && + !cpi->common.refresh_alt_ref_frame) { + cpi->source_alt_ref_pending = 1; + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } + + if (!cpi->source_alt_ref_pending) cpi->source_alt_ref_active = 0; + + /* Decrement count down till next gf */ + if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; + + } else if (!cpi->common.refresh_alt_ref_frame) { + /* Decrement count down till next gf */ + if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; + + if (cpi->frames_till_alt_ref_frame) cpi->frames_till_alt_ref_frame--; + + cpi->frames_since_golden++; + + if (cpi->frames_since_golden > 1) { + cpi->recent_ref_frame_usage[INTRA_FRAME] += + cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]; + cpi->recent_ref_frame_usage[LAST_FRAME] += + cpi->mb.count_mb_ref_frame_usage[LAST_FRAME]; + cpi->recent_ref_frame_usage[GOLDEN_FRAME] += + cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME]; + cpi->recent_ref_frame_usage[ALTREF_FRAME] += + cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME]; + } + } +} + +/* This function updates the reference frame probability estimates that + * will be used during mode selection + */ +static void update_rd_ref_frame_probs(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + const int *const rfct = cpi->mb.count_mb_ref_frame_usage; + const int rf_intra = rfct[INTRA_FRAME]; + const int rf_inter = + rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; + + if (cm->frame_type == KEY_FRAME) { + cpi->prob_intra_coded = 255; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } else if (!(rf_intra + rf_inter)) { + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + + /* update reference frame costs since we can do better than what we got + * last frame. + */ + if (cpi->oxcf.number_of_layers == 1) { + if (cpi->common.refresh_alt_ref_frame) { + cpi->prob_intra_coded += 40; + if (cpi->prob_intra_coded > 255) cpi->prob_intra_coded = 255; + cpi->prob_last_coded = 200; + cpi->prob_gf_coded = 1; + } else if (cpi->frames_since_golden == 0) { + cpi->prob_last_coded = 214; + } else if (cpi->frames_since_golden == 1) { + cpi->prob_last_coded = 192; + cpi->prob_gf_coded = 220; + } else if (cpi->source_alt_ref_active) { + cpi->prob_gf_coded -= 20; + + if (cpi->prob_gf_coded < 10) cpi->prob_gf_coded = 10; + } + if (!cpi->source_alt_ref_active) cpi->prob_gf_coded = 255; + } +} + +#if !CONFIG_REALTIME_ONLY +/* 1 = key, 0 = inter */ +static int decide_key_frame(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + int code_key_frame = 0; + + cpi->kf_boost = 0; + + if (cpi->Speed > 11) return 0; + + /* Clear down mmx registers */ + vpx_clear_system_state(); + + if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0)) { + double change = 1.0 * + abs((int)(cpi->mb.intra_error - cpi->last_intra_error)) / + (1 + cpi->last_intra_error); + double change2 = + 1.0 * + abs((int)(cpi->mb.prediction_error - cpi->last_prediction_error)) / + (1 + cpi->last_prediction_error); + double minerror = cm->MBs * 256; + + cpi->last_intra_error = cpi->mb.intra_error; + cpi->last_prediction_error = cpi->mb.prediction_error; + + if (10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15 && + cpi->mb.prediction_error > minerror && + (change > .25 || change2 > .25)) { + /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > + * cpi->last_frame_percent_intra + 3*/ + return 1; + } + + return 0; + } + + /* If the following are true we might as well code a key frame */ + if (((cpi->this_frame_percent_intra == 100) && + (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) || + ((cpi->this_frame_percent_intra > 95) && + (cpi->this_frame_percent_intra >= + (cpi->last_frame_percent_intra + 5)))) { + code_key_frame = 1; + } + /* in addition if the following are true and this is not a golden frame + * then code a key frame Note that on golden frames there often seems + * to be a pop in intra usage anyway hence this restriction is + * designed to prevent spurious key frames. The Intra pop needs to be + * investigated. + */ + else if (((cpi->this_frame_percent_intra > 60) && + (cpi->this_frame_percent_intra > + (cpi->last_frame_percent_intra * 2))) || + ((cpi->this_frame_percent_intra > 75) && + (cpi->this_frame_percent_intra > + (cpi->last_frame_percent_intra * 3 / 2))) || + ((cpi->this_frame_percent_intra > 90) && + (cpi->this_frame_percent_intra > + (cpi->last_frame_percent_intra + 10)))) { + if (!cm->refresh_golden_frame) code_key_frame = 1; + } + + return code_key_frame; +} + +static void Pass1Encode(VP8_COMP *cpi) { + vp8_set_quantizer(cpi, 26); + vp8_first_pass(cpi); +} +#endif + +#if 0 +void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) +{ + + /* write the frame */ + FILE *yframe; + int i; + char filename[255]; + + sprintf(filename, "cx\\y%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->y_height; ++i) + fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe); + + fclose(yframe); + sprintf(filename, "cx\\u%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->uv_height; ++i) + fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe); + + fclose(yframe); + sprintf(filename, "cx\\v%04d.raw", this_frame); + yframe = fopen(filename, "wb"); + + for (i = 0; i < frame->uv_height; ++i) + fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe); + + fclose(yframe); +} +#endif + +#if !CONFIG_REALTIME_ONLY +/* Function to test for conditions that indeicate we should loop + * back and recode a frame. + */ +static int recode_loop_test(VP8_COMP *cpi, int high_limit, int low_limit, int q, + int maxq, int minq) { + int force_recode = 0; + VP8_COMMON *cm = &cpi->common; + + /* Is frame recode allowed at all + * Yes if either recode mode 1 is selected or mode two is selcted + * and the frame is a key frame. golden frame or alt_ref_frame + */ + if ((cpi->sf.recode_loop == 1) || + ((cpi->sf.recode_loop == 2) && + ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || + cm->refresh_alt_ref_frame))) { + /* General over and under shoot tests */ + if (((cpi->projected_frame_size > high_limit) && (q < maxq)) || + ((cpi->projected_frame_size < low_limit) && (q > minq))) { + force_recode = 1; + } + /* Special Constrained quality tests */ + else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + /* Undershoot and below auto cq level */ + if ((q > cpi->cq_target_quality) && + (cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3))) { + force_recode = 1; + } + /* Severe undershoot and between auto and user cq level */ + else if ((q > cpi->oxcf.cq_level) && + (cpi->projected_frame_size < cpi->min_frame_bandwidth) && + (cpi->active_best_quality > cpi->oxcf.cq_level)) { + force_recode = 1; + cpi->active_best_quality = cpi->oxcf.cq_level; + } + } + } + + return force_recode; +} +#endif // !CONFIG_REALTIME_ONLY + +static void update_reference_frames(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb; + + /* At this point the new frame has been encoded. + * If any buffer copy / swapping is signaled it should be done here. + */ + + if (cm->frame_type == KEY_FRAME) { + yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME | VP8_ALTR_FRAME; + + yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; + yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; + + cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx; + + cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; + cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; + } else { + if (cm->refresh_alt_ref_frame) { + assert(!cm->copy_buffer_to_arf); + + cm->yv12_fb[cm->new_fb_idx].flags |= VP8_ALTR_FRAME; + cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; + cm->alt_fb_idx = cm->new_fb_idx; + + cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; + } else if (cm->copy_buffer_to_arf) { + assert(!(cm->copy_buffer_to_arf & ~0x3)); + + if (cm->copy_buffer_to_arf == 1) { + if (cm->alt_fb_idx != cm->lst_fb_idx) { + yv12_fb[cm->lst_fb_idx].flags |= VP8_ALTR_FRAME; + yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; + cm->alt_fb_idx = cm->lst_fb_idx; + + cpi->current_ref_frames[ALTREF_FRAME] = + cpi->current_ref_frames[LAST_FRAME]; + } + } else { + if (cm->alt_fb_idx != cm->gld_fb_idx) { + yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; + yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; + cm->alt_fb_idx = cm->gld_fb_idx; + + cpi->current_ref_frames[ALTREF_FRAME] = + cpi->current_ref_frames[GOLDEN_FRAME]; + } + } + } + + if (cm->refresh_golden_frame) { + assert(!cm->copy_buffer_to_gf); + + cm->yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME; + cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; + cm->gld_fb_idx = cm->new_fb_idx; + + cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; + } else if (cm->copy_buffer_to_gf) { + assert(!(cm->copy_buffer_to_arf & ~0x3)); + + if (cm->copy_buffer_to_gf == 1) { + if (cm->gld_fb_idx != cm->lst_fb_idx) { + yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FRAME; + yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; + cm->gld_fb_idx = cm->lst_fb_idx; + + cpi->current_ref_frames[GOLDEN_FRAME] = + cpi->current_ref_frames[LAST_FRAME]; + } + } else { + if (cm->alt_fb_idx != cm->gld_fb_idx) { + yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; + yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; + cm->gld_fb_idx = cm->alt_fb_idx; + + cpi->current_ref_frames[GOLDEN_FRAME] = + cpi->current_ref_frames[ALTREF_FRAME]; + } + } + } + } + + if (cm->refresh_last_frame) { + cm->yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FRAME; + cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME; + cm->lst_fb_idx = cm->new_fb_idx; + + cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame; + } + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + /* we shouldn't have to keep multiple copies as we know in advance which + * buffer we should start - for now to get something up and running + * I've chosen to copy the buffers + */ + if (cm->frame_type == KEY_FRAME) { + int i; + for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) + vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); + } else { + vp8_yv12_extend_frame_borders( + &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); + + if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf) { + vp8_yv12_copy_frame(&cpi->denoiser.yv12_running_avg[INTRA_FRAME], + &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]); + } + if (cm->refresh_golden_frame || cm->copy_buffer_to_gf) { + vp8_yv12_copy_frame(&cpi->denoiser.yv12_running_avg[INTRA_FRAME], + &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]); + } + if (cm->refresh_last_frame) { + vp8_yv12_copy_frame(&cpi->denoiser.yv12_running_avg[INTRA_FRAME], + &cpi->denoiser.yv12_running_avg[LAST_FRAME]); + } + } + if (cpi->oxcf.noise_sensitivity == 4) + vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source); + } +#endif +} + +static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + VP8_COMP *cpi) { + int i, j; + int Total = 0; + int num_blocks = 0; + int skip = 2; + int min_consec_zero_last = 10; + int tot_num_blocks = (source->y_height * source->y_width) >> 8; + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + + /* Loop through the Y plane, every |skip| blocks along rows and colmumns, + * summing the square differences, and only for blocks that have been + * zero_last mode at least |x| frames in a row. + */ + for (i = 0; i < source->y_height; i += 16 * skip) { + int block_index_row = (i >> 4) * cpi->common.mb_cols; + for (j = 0; j < source->y_width; j += 16 * skip) { + int index = block_index_row + (j >> 4); + if (cpi->consec_zero_last[index] >= min_consec_zero_last) { + unsigned int sse; + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, + dest->y_stride, &sse); + num_blocks++; + } + } + src += 16 * skip * source->y_stride; + dst += 16 * skip * dest->y_stride; + } + // Only return non-zero if we have at least ~1/16 samples for estimate. + if (num_blocks > (tot_num_blocks >> 4)) { + assert(num_blocks != 0); + return (Total / num_blocks); + } else { + return 0; + } +} + +#if CONFIG_TEMPORAL_DENOISING +static void process_denoiser_mode_change(VP8_COMP *cpi) { + const VP8_COMMON *const cm = &cpi->common; + int i, j; + int total = 0; + int num_blocks = 0; + // Number of blocks skipped along row/column in computing the + // nmse (normalized mean square error) of source. + int skip = 2; + // Only select blocks for computing nmse that have been encoded + // as ZERO LAST min_consec_zero_last frames in a row. + // Scale with number of temporal layers. + int min_consec_zero_last = 12 / cpi->oxcf.number_of_layers; + // Decision is tested for changing the denoising mode every + // num_mode_change times this function is called. Note that this + // function called every 8 frames, so (8 * num_mode_change) is number + // of frames where denoising mode change is tested for switch. + int num_mode_change = 20; + // Framerate factor, to compensate for larger mse at lower framerates. + // Use ref_framerate, which is full source framerate for temporal layers. + // TODO(marpan): Adjust this factor. + int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100; + int tot_num_blocks = cm->mb_rows * cm->mb_cols; + int ystride = cpi->Source->y_stride; + unsigned char *src = cpi->Source->y_buffer; + unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer; + static const unsigned char const_source[16] = { 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128 }; + int bandwidth = (int)(cpi->target_bandwidth); + // For temporal layers, use full bandwidth (top layer). + if (cpi->oxcf.number_of_layers > 1) { + LAYER_CONTEXT *lc = &cpi->layer_context[cpi->oxcf.number_of_layers - 1]; + bandwidth = (int)(lc->target_bandwidth); + } + // Loop through the Y plane, every skip blocks along rows and columns, + // summing the normalized mean square error, only for blocks that have + // been encoded as ZEROMV LAST at least min_consec_zero_last least frames in + // a row and have small sum difference between current and previous frame. + // Normalization here is by the contrast of the current frame block. + for (i = 0; i < cm->Height; i += 16 * skip) { + int block_index_row = (i >> 4) * cm->mb_cols; + for (j = 0; j < cm->Width; j += 16 * skip) { + int index = block_index_row + (j >> 4); + if (cpi->consec_zero_last[index] >= min_consec_zero_last) { + unsigned int sse; + const unsigned int var = + vpx_variance16x16(src + j, ystride, dst + j, ystride, &sse); + // Only consider this block as valid for noise measurement + // if the sum_diff average of the current and previous frame + // is small (to avoid effects from lighting change). + if ((sse - var) < 128) { + unsigned int sse2; + const unsigned int act = + vpx_variance16x16(src + j, ystride, const_source, 0, &sse2); + if (act > 0) total += sse / act; + num_blocks++; + } + } + } + src += 16 * skip * ystride; + dst += 16 * skip * ystride; + } + total = total * fac_framerate / 100; + + // Only consider this frame as valid sample if we have computed nmse over + // at least ~1/16 blocks, and Total > 0 (Total == 0 can happen if the + // application inputs duplicate frames, or contrast is all zero). + if (total > 0 && (num_blocks > (tot_num_blocks >> 4))) { + // Update the recursive mean square source_diff. + total = (total << 8) / num_blocks; + if (cpi->denoiser.nmse_source_diff_count == 0) { + // First sample in new interval. + cpi->denoiser.nmse_source_diff = total; + cpi->denoiser.qp_avg = cm->base_qindex; + } else { + // For subsequent samples, use average with weight ~1/4 for new sample. + cpi->denoiser.nmse_source_diff = + (int)((total + 3 * cpi->denoiser.nmse_source_diff) >> 2); + cpi->denoiser.qp_avg = + (int)((cm->base_qindex + 3 * cpi->denoiser.qp_avg) >> 2); + } + cpi->denoiser.nmse_source_diff_count++; + } + // Check for changing the denoiser mode, when we have obtained #samples = + // num_mode_change. Condition the change also on the bitrate and QP. + if (cpi->denoiser.nmse_source_diff_count == num_mode_change) { + // Check for going up: from normal to aggressive mode. + if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) && + (cpi->denoiser.nmse_source_diff > + cpi->denoiser.threshold_aggressive_mode) && + (cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up && + bandwidth > cpi->denoiser.bitrate_threshold)) { + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive); + } else { + // Check for going down: from aggressive to normal mode. + if (((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) && + (cpi->denoiser.nmse_source_diff < + cpi->denoiser.threshold_aggressive_mode)) || + ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) && + (cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down || + bandwidth < cpi->denoiser.bitrate_threshold))) { + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV); + } + } + // Reset metric and counter for next interval. + cpi->denoiser.nmse_source_diff = 0; + cpi->denoiser.qp_avg = 0; + cpi->denoiser.nmse_source_diff_count = 0; + } +} +#endif + +void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { + const FRAME_TYPE frame_type = cm->frame_type; + + int update_any_ref_buffers = 1; + if (cpi->common.refresh_last_frame == 0 && + cpi->common.refresh_golden_frame == 0 && + cpi->common.refresh_alt_ref_frame == 0) { + update_any_ref_buffers = 0; + } + + if (cm->no_lpf) { + cm->filter_level = 0; + } else { + struct vpx_usec_timer timer; + + vpx_clear_system_state(); + + vpx_usec_timer_start(&timer); + if (cpi->sf.auto_filter == 0) { +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) { + // Use the denoised buffer for selecting base loop filter level. + // Denoised signal for current frame is stored in INTRA_FRAME. + // No denoising on key frames. + vp8cx_pick_filter_level_fast( + &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi); + } else { + vp8cx_pick_filter_level_fast(cpi->Source, cpi); + } +#else + vp8cx_pick_filter_level_fast(cpi->Source, cpi); +#endif + } else { +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) { + // Use the denoised buffer for selecting base loop filter level. + // Denoised signal for current frame is stored in INTRA_FRAME. + // No denoising on key frames. + vp8cx_pick_filter_level(&cpi->denoiser.yv12_running_avg[INTRA_FRAME], + cpi); + } else { + vp8cx_pick_filter_level(cpi->Source, cpi); + } +#else + vp8cx_pick_filter_level(cpi->Source, cpi); +#endif + } + + if (cm->filter_level > 0) { + vp8cx_set_alt_lf_level(cpi, cm->filter_level); + } + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ + } +#endif + + // No need to apply loop-filter if the encoded frame does not update + // any reference buffers. + if (cm->filter_level > 0 && update_any_ref_buffers) { + vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type); + } + + vp8_yv12_extend_frame_borders(cm->frame_to_show); +} +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. + * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). + */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} + +static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, + unsigned char *dest, + unsigned char *dest_end, + unsigned int *frame_flags) { + int Q; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + int Loop = 0; + + VP8_COMMON *cm = &cpi->common; + int active_worst_qchanged = 0; + +#if !CONFIG_REALTIME_ONLY + int q_low; + int q_high; + int zbin_oq_high; + int zbin_oq_low = 0; + int top_index; + int bottom_index; + int overshoot_seen = 0; + int undershoot_seen = 0; +#endif + + /* Clear down mmx registers to allow floating point in what follows */ + vpx_clear_system_state(); + + if (cpi->force_next_frame_intra) { + cm->frame_type = KEY_FRAME; /* delayed intra frame */ + cpi->force_next_frame_intra = 0; + } + + /* For an alt ref frame in 2 pass we skip the call to the second pass + * function that sets the target bandwidth + */ + switch (cpi->pass) { +#if !CONFIG_REALTIME_ONLY + case 2: + if (cpi->common.refresh_alt_ref_frame) { + /* Per frame bit target for the alt ref frame */ + cpi->per_frame_bandwidth = cpi->twopass.gf_bits; + /* per second target bitrate */ + cpi->target_bandwidth = + (int)(cpi->twopass.gf_bits * cpi->output_framerate); + } + break; +#endif // !CONFIG_REALTIME_ONLY + default: + cpi->per_frame_bandwidth = + (int)round(cpi->target_bandwidth / cpi->output_framerate); + break; + } + + /* Default turn off buffer to buffer copying */ + cm->copy_buffer_to_gf = 0; + cm->copy_buffer_to_arf = 0; + + /* Clear zbin over-quant value and mode boost values. */ + cpi->mb.zbin_over_quant = 0; + cpi->mb.zbin_mode_boost = 0; + + /* Enable or disable mode based tweaking of the zbin + * For 2 Pass Only used where GF/ARF prediction quality + * is above a threshold + */ + cpi->mb.zbin_mode_boost_enabled = 1; + if (cpi->pass == 2) { + if (cpi->gfu_boost <= 400) { + cpi->mb.zbin_mode_boost_enabled = 0; + } + } + + /* Current default encoder behaviour for the altref sign bias */ + if (cpi->source_alt_ref_active) { + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + } else { + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0; + } + + /* Check to see if a key frame is signaled + * For two pass with auto key frame enabled cm->frame_type may already + * be set, but not for one pass. + */ + if ((cm->current_video_frame == 0) || (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && + (cpi->frames_since_key % cpi->key_frame_frequency == 0))) { + /* Key frame from VFW/auto-keyframe/first frame */ + cm->frame_type = KEY_FRAME; +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity == 4) { + // For adaptive mode, reset denoiser to normal mode on key frame. + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV); + } +#endif + } + +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + + if (cpi->oxcf.mr_encoder_id) { + // Check if lower resolution is available for motion vector reuse. + if (cm->frame_type != KEY_FRAME) { + cpi->mr_low_res_mv_avail = 1; + cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped); + + if (cpi->ref_frame_flags & VP8_LAST_FRAME) + cpi->mr_low_res_mv_avail &= + (cpi->current_ref_frames[LAST_FRAME] == + low_res_frame_info->low_res_ref_frames[LAST_FRAME]); + + if (cpi->ref_frame_flags & VP8_GOLD_FRAME) + cpi->mr_low_res_mv_avail &= + (cpi->current_ref_frames[GOLDEN_FRAME] == + low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]); + + // Don't use altref to determine whether low res is available. + // TODO (marpan): Should we make this type of condition on a + // per-reference frame basis? + /* + if (cpi->ref_frame_flags & VP8_ALTR_FRAME) + cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME] + == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); + */ + } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } + } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; + + // On a key frame: For the lowest resolution, keep track of the key frame + // counter value. For the higher resolutions, reset the current video + // frame counter to that of the lowest resolution. + // This is done to the handle the case where we may stop/start encoding + // higher layer(s). The restart-encoding of higher layer is only signaled + // by a key frame for now. + // TODO (marpan): Add flag to indicate restart-encoding of higher layer. + if (cm->frame_type == KEY_FRAME) { + if (cpi->oxcf.mr_encoder_id) { + // If the initial starting value of the buffer level is zero (this can + // happen because we may have not started encoding this higher stream), + // then reset it to non-zero value based on |starting_buffer_level|. + if (cpi->common.current_video_frame == 0 && cpi->buffer_level == 0) { + unsigned int i; + cpi->bits_off_target = cpi->oxcf.starting_buffer_level; + cpi->buffer_level = cpi->oxcf.starting_buffer_level; + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target = lc->starting_buffer_level; + lc->buffer_level = lc->starting_buffer_level; + } + } + cpi->common.current_video_frame = + low_res_frame_info->key_frame_counter_value; + } else { + low_res_frame_info->key_frame_counter_value = + cpi->common.current_video_frame; + } + } + } +#endif + + // Find the reference frame closest to the current frame. + cpi->closest_reference_frame = LAST_FRAME; + if (cm->frame_type != KEY_FRAME) { + int i; + MV_REFERENCE_FRAME closest_ref = INTRA_FRAME; + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + closest_ref = LAST_FRAME; + } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) { + closest_ref = GOLDEN_FRAME; + } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) { + closest_ref = ALTREF_FRAME; + } + for (i = 1; i <= 3; ++i) { + vpx_ref_frame_type_t ref_frame_type = + (vpx_ref_frame_type_t)((i == 3) ? 4 : i); + if (cpi->ref_frame_flags & ref_frame_type) { + if ((cm->current_video_frame - cpi->current_ref_frames[i]) < + (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) { + closest_ref = i; + } + } + } + cpi->closest_reference_frame = closest_ref; + } + + /* Set various flags etc to special state if it is a key frame */ + if (cm->frame_type == KEY_FRAME) { + int i; + + // Set the loop filter deltas and segmentation map update + setup_features(cpi); + + /* The alternate reference frame cannot be active for a key frame */ + cpi->source_alt_ref_active = 0; + + /* Reset the RD threshold multipliers to default of * 1 (128) */ + for (i = 0; i < MAX_MODES; ++i) { + cpi->mb.rd_thresh_mult[i] = 128; + } + + // Reset the zero_last counter to 0 on key frame. + memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); + memset(cpi->consec_zero_last_mvbias, 0, + (cpi->common.mb_rows * cpi->common.mb_cols)); + } + +#if 0 + /* Experimental code for lagged compress and one pass + * Initialise one_pass GF frames stats + * Update stats used for GF selection + */ + { + cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS; + + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0; + } +#endif + + update_rd_ref_frame_probs(cpi); + + if (vp8_check_drop_buffer(cpi)) { + return; + } + + /* Decide how big to make the frame */ + if (!vp8_pick_frame_size(cpi)) { +/*TODO: 2 drop_frame and return code could be put together. */ +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + return; + } + + /* Reduce active_worst_allowed_q for CBR if our buffer is getting too full. + * This has a knock on effect on active best quality as well. + * For CBR if the buffer reaches its maximum level then we can no longer + * save up bits for later frames so we might as well use them up + * on the current frame. + */ + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && + cpi->buffered_mode) { + /* Max adjustment is 1/4 */ + int Adjustment = cpi->active_worst_quality / 4; + + if (Adjustment) { + int buff_lvl_step; + + if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size) { + buff_lvl_step = (int)((cpi->oxcf.maximum_buffer_size - + cpi->oxcf.optimal_buffer_level) / + Adjustment); + + if (buff_lvl_step) { + Adjustment = + (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / + buff_lvl_step); + } else { + Adjustment = 0; + } + } + + cpi->active_worst_quality -= Adjustment; + + if (cpi->active_worst_quality < cpi->active_best_quality) { + cpi->active_worst_quality = cpi->active_best_quality; + } + } + } + + /* Set an active best quality and if necessary active worst quality + * There is some odd behavior for one pass here that needs attention. + */ + if ((cpi->pass == 2) || (cpi->ni_frames > 150)) { + vpx_clear_system_state(); + + Q = cpi->active_worst_quality; + + if (cm->frame_type == KEY_FRAME) { + if (cpi->pass == 2) { + if (cpi->gfu_boost > 600) { + cpi->active_best_quality = kf_low_motion_minq[Q]; + } else { + cpi->active_best_quality = kf_high_motion_minq[Q]; + } + + /* Special case for key frames forced because we have reached + * the maximum key frame interval. Here force the Q to a range + * based on the ambient Q to reduce the risk of popping + */ + if (cpi->this_key_frame_forced) { + if (cpi->active_best_quality > cpi->avg_frame_qindex * 7 / 8) { + cpi->active_best_quality = cpi->avg_frame_qindex * 7 / 8; + } else if (cpi->active_best_quality < (cpi->avg_frame_qindex >> 2)) { + cpi->active_best_quality = cpi->avg_frame_qindex >> 2; + } + } + } + /* One pass more conservative */ + else { + cpi->active_best_quality = kf_high_motion_minq[Q]; + } + } + + else if (cpi->oxcf.number_of_layers == 1 && + (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)) { + /* Use the lower of cpi->active_worst_quality and recent + * average Q as basis for GF/ARF Q limit unless last frame was + * a key frame. + */ + if ((cpi->frames_since_key > 1) && + (cpi->avg_frame_qindex < cpi->active_worst_quality)) { + Q = cpi->avg_frame_qindex; + } + + /* For constrained quality don't allow Q less than the cq level */ + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (Q < cpi->cq_target_quality)) { + Q = cpi->cq_target_quality; + } + + if (cpi->pass == 2) { + if (cpi->gfu_boost > 1000) { + cpi->active_best_quality = gf_low_motion_minq[Q]; + } else if (cpi->gfu_boost < 400) { + cpi->active_best_quality = gf_high_motion_minq[Q]; + } else { + cpi->active_best_quality = gf_mid_motion_minq[Q]; + } + + /* Constrained quality use slightly lower active best. */ + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + cpi->active_best_quality = cpi->active_best_quality * 15 / 16; + } + } + /* One pass more conservative */ + else { + cpi->active_best_quality = gf_high_motion_minq[Q]; + } + } else { + cpi->active_best_quality = inter_minq[Q]; + + /* For the constant/constrained quality mode we don't want + * q to fall below the cq level. + */ + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (cpi->active_best_quality < cpi->cq_target_quality)) { + /* If we are strongly undershooting the target rate in the last + * frames then use the user passed in cq value not the auto + * cq value. + */ + if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) { + cpi->active_best_quality = cpi->oxcf.cq_level; + } else { + cpi->active_best_quality = cpi->cq_target_quality; + } + } + } + + /* If CBR and the buffer is as full then it is reasonable to allow + * higher quality on the frames to prevent bits just going to waste. + */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + /* Note that the use of >= here elliminates the risk of a divide + * by 0 error in the else if clause + */ + if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { + cpi->active_best_quality = cpi->best_quality; + + } else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level) { + int Fraction = + (int)(((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128) / + (cpi->oxcf.maximum_buffer_size - + cpi->oxcf.optimal_buffer_level)); + int min_qadjustment = + ((cpi->active_best_quality - cpi->best_quality) * Fraction) / 128; + + cpi->active_best_quality -= min_qadjustment; + } + } + } + /* Make sure constrained quality mode limits are adhered to for the first + * few frames of one pass encodes + */ + else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || + cpi->common.refresh_alt_ref_frame) { + cpi->active_best_quality = cpi->best_quality; + } else if (cpi->active_best_quality < cpi->cq_target_quality) { + cpi->active_best_quality = cpi->cq_target_quality; + } + } + + /* Clip the active best and worst quality values to limits */ + if (cpi->active_worst_quality > cpi->worst_quality) { + cpi->active_worst_quality = cpi->worst_quality; + } + + if (cpi->active_best_quality < cpi->best_quality) { + cpi->active_best_quality = cpi->best_quality; + } + + if (cpi->active_worst_quality < cpi->active_best_quality) { + cpi->active_worst_quality = cpi->active_best_quality; + } + + /* Determine initial Q to try */ + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + +#if !CONFIG_REALTIME_ONLY + + /* Set highest allowed value for Zbin over quant */ + if (cm->frame_type == KEY_FRAME) { + zbin_oq_high = 0; + } else if ((cpi->oxcf.number_of_layers == 1) && + ((cm->refresh_alt_ref_frame || + (cm->refresh_golden_frame && !cpi->source_alt_ref_active)))) { + zbin_oq_high = 16; + } else { + zbin_oq_high = ZBIN_OQ_MAX; + } +#endif + + compute_skin_map(cpi); + + /* Setup background Q adjustment for error resilient mode. + * For multi-layer encodes only enable this for the base layer. + */ + if (cpi->cyclic_refresh_mode_enabled) { + // Special case for screen_content_mode with golden frame updates. + int disable_cr_gf = + (cpi->oxcf.screen_content_mode == 2 && cm->refresh_golden_frame); + if (cpi->current_layer == 0 && cpi->force_maxqp == 0 && !disable_cr_gf) { + cyclic_background_refresh(cpi, Q, 0); + } else { + disable_segmentation(cpi); + } + } + + vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, + &frame_over_shoot_limit); + +#if !CONFIG_REALTIME_ONLY + /* Limit Q range for the adaptive loop. */ + bottom_index = cpi->active_best_quality; + top_index = cpi->active_worst_quality; + q_low = cpi->active_best_quality; + q_high = cpi->active_worst_quality; +#endif + + vp8_save_coding_context(cpi); + + scale_and_extend_source(cpi->un_scaled_source, cpi); + +#if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC + // Option to apply spatial blur under the aggressive or adaptive + // (temporal denoising) mode. + if (cpi->oxcf.noise_sensitivity >= 3) { + if (cpi->denoiser.denoise_pars.spatial_blur != 0) { + vp8_de_noise(cm, cpi->Source, cpi->denoiser.denoise_pars.spatial_blur, 1); + } + } +#endif + +#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING) + + if (cpi->oxcf.noise_sensitivity > 0) { + unsigned char *src; + int l = 0; + + switch (cpi->oxcf.noise_sensitivity) { + case 1: l = 20; break; + case 2: l = 40; break; + case 3: l = 60; break; + case 4: l = 80; break; + case 5: l = 100; break; + case 6: l = 150; break; + } + + if (cm->frame_type == KEY_FRAME) { + vp8_de_noise(cm, cpi->Source, l, 1); + } else { + vp8_de_noise(cm, cpi->Source, l, 1); + + src = cpi->Source->y_buffer; + + if (cpi->Source->y_stride < 0) { + src += cpi->Source->y_stride * (cpi->Source->y_height - 1); + } + } + } + +#endif + +#ifdef OUTPUT_YUV_SRC + vpx_write_yuv_frame(yuv_file, cpi->Source); +#endif + + do { + vpx_clear_system_state(); + + vp8_set_quantizer(cpi, Q); + + /* setup skip prob for costing in mode/mv decision */ + if (cpi->common.mb_no_coeff_skip) { + cpi->prob_skip_false = cpi->base_skip_false_prob[Q]; + + if (cm->frame_type != KEY_FRAME) { + if (cpi->common.refresh_alt_ref_frame) { + if (cpi->last_skip_false_probs[2] != 0) { + cpi->prob_skip_false = cpi->last_skip_false_probs[2]; + } + + /* + if(cpi->last_skip_false_probs[2]!=0 && abs(Q- + cpi->last_skip_probs_q[2])<=16 ) + cpi->prob_skip_false = cpi->last_skip_false_probs[2]; + else if (cpi->last_skip_false_probs[2]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + + cpi->prob_skip_false ) / 2; + */ + } else if (cpi->common.refresh_golden_frame) { + if (cpi->last_skip_false_probs[1] != 0) { + cpi->prob_skip_false = cpi->last_skip_false_probs[1]; + } + + /* + if(cpi->last_skip_false_probs[1]!=0 && abs(Q- + cpi->last_skip_probs_q[1])<=16 ) + cpi->prob_skip_false = cpi->last_skip_false_probs[1]; + else if (cpi->last_skip_false_probs[1]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + + cpi->prob_skip_false ) / 2; + */ + } else { + if (cpi->last_skip_false_probs[0] != 0) { + cpi->prob_skip_false = cpi->last_skip_false_probs[0]; + } + + /* + if(cpi->last_skip_false_probs[0]!=0 && abs(Q- + cpi->last_skip_probs_q[0])<=16 ) + cpi->prob_skip_false = cpi->last_skip_false_probs[0]; + else if(cpi->last_skip_false_probs[0]!=0) + cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + + cpi->prob_skip_false ) / 2; + */ + } + + /* as this is for cost estimate, let's make sure it does not + * go extreme eitehr way + */ + if (cpi->prob_skip_false < 5) cpi->prob_skip_false = 5; + + if (cpi->prob_skip_false > 250) cpi->prob_skip_false = 250; + + if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref) { + cpi->prob_skip_false = 1; + } + } + +#if 0 + + if (cpi->pass != 1) + { + FILE *f = fopen("skip.stt", "a"); + fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false); + fclose(f); + } + +#endif + } + + if (cm->frame_type == KEY_FRAME) { + if (resize_key_frame(cpi)) { + /* If the frame size has changed, need to reset Q, quantizer, + * and background refresh. + */ + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + if (cpi->cyclic_refresh_mode_enabled) { + if (cpi->current_layer == 0) { + cyclic_background_refresh(cpi, Q, 0); + } else { + disable_segmentation(cpi); + } + } + // Reset the zero_last counter to 0 on key frame. + memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); + memset(cpi->consec_zero_last_mvbias, 0, + (cpi->common.mb_rows * cpi->common.mb_cols)); + vp8_set_quantizer(cpi, Q); + } + + vp8_setup_key_frame(cpi); + } + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + { + if (cpi->oxcf.error_resilient_mode) cm->refresh_entropy_probs = 0; + + if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS) { + if (cm->frame_type == KEY_FRAME) cm->refresh_entropy_probs = 1; + } + + if (cm->refresh_entropy_probs == 0) { + /* save a copy for later refresh */ + memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc)); + } + + vp8_update_coef_context(cpi); + + vp8_update_coef_probs(cpi); + + /* transform / motion compensation build reconstruction frame + * +pack coef partitions + */ + vp8_encode_frame(cpi); + + /* cpi->projected_frame_size is not needed for RT mode */ + } +#else + /* transform / motion compensation build reconstruction frame */ + vp8_encode_frame(cpi); + + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { + if (vp8_drop_encodedframe_overshoot(cpi, Q)) { + vpx_clear_system_state(); + return; + } + if (cm->frame_type != KEY_FRAME) + cpi->last_pred_err_mb = + (int)(cpi->mb.prediction_error / cpi->common.MBs); + } + + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); + cpi->projected_frame_size = + (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; +#endif + vpx_clear_system_state(); + + /* Test to see if the stats generated for this frame indicate that + * we should have coded a key frame (assuming that we didn't)! + */ + + if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME && + cpi->compressor_speed != 2) { +#if !CONFIG_REALTIME_ONLY + if (decide_key_frame(cpi)) { + /* Reset all our sizing numbers and recode */ + cm->frame_type = KEY_FRAME; + + vp8_pick_frame_size(cpi); + + /* Clear the Alt reference frame active flag when we have + * a key frame + */ + cpi->source_alt_ref_active = 0; + + // Set the loop filter deltas and segmentation map update + setup_features(cpi); + + vp8_restore_coding_context(cpi); + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, + &frame_over_shoot_limit); + + /* Limit Q range for the adaptive loop. */ + bottom_index = cpi->active_best_quality; + top_index = cpi->active_worst_quality; + q_low = cpi->active_best_quality; + q_high = cpi->active_worst_quality; + + Loop = 1; + + continue; + } +#endif + } + + vpx_clear_system_state(); + + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + /* Are we are overshooting and up against the limit of active max Q. */ + if (!cpi->rt_always_update_correction_factor && + ((cpi->pass != 2) || + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) && + (Q == cpi->active_worst_quality) && + (cpi->active_worst_quality < cpi->worst_quality) && + (cpi->projected_frame_size > frame_over_shoot_limit)) { + int over_size_percent = + ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / + frame_over_shoot_limit; + + /* If so is there any scope for relaxing it */ + while ((cpi->active_worst_quality < cpi->worst_quality) && + (over_size_percent > 0)) { + cpi->active_worst_quality++; + /* Assume 1 qstep = about 4% on frame size. */ + over_size_percent = (int)(over_size_percent * 0.96); + } +#if !CONFIG_REALTIME_ONLY + top_index = cpi->active_worst_quality; +#endif // !CONFIG_REALTIME_ONLY + /* If we have updated the active max Q do not call + * vp8_update_rate_correction_factors() this loop. + */ + active_worst_qchanged = 1; + } else { + active_worst_qchanged = 0; + } + +#if CONFIG_REALTIME_ONLY + Loop = 0; +#else + /* Special case handling for forced key frames */ + if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { + int last_q = Q; + int kf_err = vp8_calc_ss_err(cpi->Source, &cm->yv12_fb[cm->new_fb_idx]); + + /* The key frame is not good enough */ + if (kf_err > ((cpi->ambient_err * 7) >> 3)) { + /* Lower q_high */ + q_high = (Q > q_low) ? (Q - 1) : q_low; + + /* Adjust Q */ + Q = (q_high + q_low) >> 1; + } + /* The key frame is much better than the previous frame */ + else if (kf_err < (cpi->ambient_err >> 1)) { + /* Raise q_low */ + q_low = (Q < q_high) ? (Q + 1) : q_high; + + /* Adjust Q */ + Q = (q_high + q_low + 1) >> 1; + } + + /* Clamp Q to upper and lower limits: */ + if (Q > q_high) { + Q = q_high; + } else if (Q < q_low) { + Q = q_low; + } + + Loop = Q != last_q; + } + + /* Is the projected frame size out of range and are we allowed + * to attempt to recode. + */ + else if (recode_loop_test(cpi, frame_over_shoot_limit, + frame_under_shoot_limit, Q, top_index, + bottom_index)) { + int last_q = Q; + int Retries = 0; + + /* Frame size out of permitted range. Update correction factor + * & compute new Q to try... + */ + + /* Frame is too large */ + if (cpi->projected_frame_size > cpi->this_frame_target) { + /* Raise Qlow as to at least the current value */ + q_low = (Q < q_high) ? (Q + 1) : q_high; + + /* If we are using over quant do the same for zbin_oq_low */ + if (cpi->mb.zbin_over_quant > 0) { + zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) + ? (cpi->mb.zbin_over_quant + 1) + : zbin_oq_high; + } + + if (undershoot_seen) { + /* Update rate_correction_factor unless + * cpi->active_worst_quality has changed. + */ + if (!active_worst_qchanged) { + vp8_update_rate_correction_factors(cpi, 1); + } + + Q = (q_high + q_low + 1) / 2; + + /* Adjust cpi->zbin_over_quant (only allowed when Q + * is max) + */ + if (Q < MAXQ) { + cpi->mb.zbin_over_quant = 0; + } else { + zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) + ? (cpi->mb.zbin_over_quant + 1) + : zbin_oq_high; + cpi->mb.zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; + } + } else { + /* Update rate_correction_factor unless + * cpi->active_worst_quality has changed. + */ + if (!active_worst_qchanged) { + vp8_update_rate_correction_factors(cpi, 0); + } + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + while (((Q < q_low) || (cpi->mb.zbin_over_quant < zbin_oq_low)) && + (Retries < 10)) { + vp8_update_rate_correction_factors(cpi, 0); + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + Retries++; + } + } + + overshoot_seen = 1; + } + /* Frame is too small */ + else { + if (cpi->mb.zbin_over_quant == 0) { + /* Lower q_high if not using over quant */ + q_high = (Q > q_low) ? (Q - 1) : q_low; + } else { + /* else lower zbin_oq_high */ + zbin_oq_high = (cpi->mb.zbin_over_quant > zbin_oq_low) + ? (cpi->mb.zbin_over_quant - 1) + : zbin_oq_low; + } + + if (overshoot_seen) { + /* Update rate_correction_factor unless + * cpi->active_worst_quality has changed. + */ + if (!active_worst_qchanged) { + vp8_update_rate_correction_factors(cpi, 1); + } + + Q = (q_high + q_low) / 2; + + /* Adjust cpi->zbin_over_quant (only allowed when Q + * is max) + */ + if (Q < MAXQ) { + cpi->mb.zbin_over_quant = 0; + } else { + cpi->mb.zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; + } + } else { + /* Update rate_correction_factor unless + * cpi->active_worst_quality has changed. + */ + if (!active_worst_qchanged) { + vp8_update_rate_correction_factors(cpi, 0); + } + + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + + /* Special case reset for qlow for constrained quality. + * This should only trigger where there is very substantial + * undershoot on a frame and the auto cq level is above + * the user passsed in value. + */ + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (Q < q_low)) { + q_low = Q; + } + + while (((Q > q_high) || (cpi->mb.zbin_over_quant > zbin_oq_high)) && + (Retries < 10)) { + vp8_update_rate_correction_factors(cpi, 0); + Q = vp8_regulate_q(cpi, cpi->this_frame_target); + Retries++; + } + } + + undershoot_seen = 1; + } + + /* Clamp Q to upper and lower limits: */ + if (Q > q_high) { + Q = q_high; + } else if (Q < q_low) { + Q = q_low; + } + + /* Clamp cpi->zbin_over_quant */ + cpi->mb.zbin_over_quant = + (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low + : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high + : cpi->mb.zbin_over_quant; + + Loop = Q != last_q; + } else { + Loop = 0; + } +#endif // CONFIG_REALTIME_ONLY + + if (cpi->is_src_frame_alt_ref) Loop = 0; + + if (Loop == 1) { + vp8_restore_coding_context(cpi); +#if CONFIG_INTERNAL_STATS + cpi->tot_recode_hits++; +#endif + } + } while (Loop == 1); + +#if defined(DROP_UNCODED_FRAMES) + /* if there are no coded macroblocks at all drop this frame */ + if (cpi->common.MBs == cpi->mb.skip_true_count && + (cpi->drop_frame_count & 7) != 7 && cm->frame_type != KEY_FRAME) { + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->drop_frame_count++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + return; + } + cpi->drop_frame_count = 0; +#endif + +#if 0 + /* Experimental code for lagged and one pass + * Update stats used for one pass GF selection + */ + { + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error; + cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0; + } +#endif + + /* Special case code to reduce pulsing when key frames are forced at a + * fixed interval. Note the reconstruction error if it is the frame before + * the force key frame + */ + if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { + cpi->ambient_err = + vp8_calc_ss_err(cpi->Source, &cm->yv12_fb[cm->new_fb_idx]); + } + +/* This frame's MVs are saved and will be used in next frame's MV predictor. + * Last frame has one more line(add to bottom) and one more column(add to + * right) than cm->mip. The edge elements are initialized to 0. + */ +#if CONFIG_MULTI_RES_ENCODING + if (!cpi->oxcf.mr_encoder_id && cm->show_frame) +#else + if (cm->show_frame) /* do not save for altref frame */ +#endif + { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip; + + if (cm->frame_type != KEY_FRAME) { + for (mb_row = 0; mb_row < cm->mb_rows + 1; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols + 1; ++mb_col) { + if (tmp->mbmi.ref_frame != INTRA_FRAME) { + cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = + tmp->mbmi.mv.as_int; + } + + cpi->lf_ref_frame_sign_bias[mb_col + + mb_row * (cm->mode_info_stride + 1)] = + cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]; + cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = + tmp->mbmi.ref_frame; + tmp++; + } + } + } + } + + /* Count last ref frame 0,0 usage on current encoded frame. */ + { + int mb_row; + int mb_col; + /* Point to beginning of MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mi; + + cpi->zeromv_count = 0; + + if (cm->frame_type != KEY_FRAME) { + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + if (tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME) { + cpi->zeromv_count++; + } + tmp++; + } + tmp++; + } + } + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_cal_dissimilarity(cpi); +#endif + + /* Update the GF usage maps. + * This is done after completing the compression of a frame when all + * modes etc. are finalized but before loop filter + */ + if (cpi->oxcf.number_of_layers == 1) { + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); + } + + if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; + +#if 0 + { + FILE *f = fopen("gfactive.stt", "a"); + fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame); + fclose(f); + } +#endif + + /* For inter frames the current default behavior is that when + * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer + * This is purely an encoder decision at present. + * Avoid this behavior when refresh flags are set by the user. + */ + if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame && + !cpi->ext_refresh_frame_flags_pending) { + cm->copy_buffer_to_arf = 2; + } else { + cm->copy_buffer_to_arf = 0; + } + + cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + +#if CONFIG_TEMPORAL_DENOISING + // Get some measure of the amount of noise, by measuring the (partial) mse + // between source and denoised buffer, for y channel. Partial refers to + // computing the sse for a sub-sample of the frame (i.e., skip x blocks along + // row/column), + // and only for blocks in that set that are consecutive ZEROMV_LAST mode. + // Do this every ~8 frames, to further reduce complexity. + // TODO(marpan): Keep this for now for the case cpi->oxcf.noise_sensitivity < + // 4, + // should be removed in favor of the process_denoiser_mode_change() function + // below. + if (cpi->oxcf.noise_sensitivity > 0 && cpi->oxcf.noise_sensitivity < 4 && + !cpi->oxcf.screen_content_mode && cpi->frames_since_key % 8 == 0 && + cm->frame_type != KEY_FRAME) { + cpi->mse_source_denoised = measure_square_diff_partial( + &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi->Source, cpi); + } + + // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse + // of source diff (between current and previous frame), and determine if we + // should switch the denoiser mode. Sampling refers to computing the mse for + // a sub-sample of the frame (i.e., skip x blocks along row/column), and + // only for blocks in that set that have used ZEROMV LAST, along with some + // constraint on the sum diff between blocks. This process is called every + // ~8 frames, to further reduce complexity. + if (cpi->oxcf.noise_sensitivity == 4 && !cpi->oxcf.screen_content_mode && + cpi->frames_since_key % 8 == 0 && cm->frame_type != KEY_FRAME) { + process_denoiser_mode_change(cpi); + } +#endif + +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + /* start loopfilter in separate thread */ + sem_post(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + sem_wait(&cpi->h_event_end_lpf); + } else +#endif + { + vp8_loopfilter_frame(cpi, cm); + } + + update_reference_frames(cpi); + +#ifdef OUTPUT_YUV_DENOISED + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); +#endif + +#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + if (cpi->oxcf.error_resilient_mode) { + cm->refresh_entropy_probs = 0; + } +#endif + + /* build the bitstream */ + vp8_pack_bitstream(cpi, dest, dest_end, size); + + /* Move storing frame_type out of the above loop since it is also + * needed in motion search besides loopfilter */ + cm->last_frame_type = cm->frame_type; + + /* Update rate control heuristics */ + cpi->total_byte_count += (*size); + cpi->projected_frame_size = (int)(*size) << 3; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + cpi->layer_context[i].total_byte_count += (*size); + } + } + + if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); + + cpi->last_q[cm->frame_type] = cm->base_qindex; + + if (cm->frame_type == KEY_FRAME) { + vp8_adjust_key_frame_context(cpi); + } + + /* Keep a record of ambient average Q. */ + if (cm->frame_type != KEY_FRAME) { + cpi->avg_frame_qindex = + (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; + } + + /* Keep a record from which we can calculate the average Q excluding + * GF updates and key frames + */ + if ((cm->frame_type != KEY_FRAME) && + ((cpi->oxcf.number_of_layers > 1) || + (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame))) { + cpi->ni_frames++; + + /* Calculate the average Q for normal inter frames (not key or GFU + * frames). + */ + if (cpi->pass == 2) { + cpi->ni_tot_qi += Q; + cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames); + } else { + /* Damp value for first few frames */ + if (cpi->ni_frames > 150) { + cpi->ni_tot_qi += Q; + cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames); + } + /* For one pass, early in the clip ... average the current frame Q + * value with the worstq entered by the user as a dampening measure + */ + else { + cpi->ni_tot_qi += Q; + cpi->ni_av_qi = + ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2; + } + + /* If the average Q is higher than what was used in the last + * frame (after going through the recode loop to keep the frame + * size within range) then use the last frame value - 1. The -1 + * is designed to stop Q and hence the data rate, from + * progressively falling away during difficult sections, but at + * the same time reduce the number of iterations around the + * recode loop. + */ + if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; + } + } + + /* Update the buffer level variable. */ + /* Non-viewable frames are a special case and are treated as pure overhead. */ + if (!cm->show_frame) { + cpi->bits_off_target -= cpi->projected_frame_size; + } else { + cpi->bits_off_target += + cpi->av_per_frame_bandwidth - cpi->projected_frame_size; + } + + /* Clip the buffer level to the maximum specified buffer size */ + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + + // Don't let the buffer level go below some threshold, given here + // by -|maximum_buffer_size|. For now we only do this for + // screen content input. + if (cpi->oxcf.screen_content_mode && + cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size; + } + + /* Rolling monitors of whether we are over or underspending used to + * help regulate min and Max Q in two pass. + */ + cpi->rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->rolling_target_bits * 3 + cpi->this_frame_target, 2); + cpi->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->rolling_actual_bits * 3 + cpi->projected_frame_size, 2); + cpi->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->long_rolling_target_bits * 31 + cpi->this_frame_target, 5); + cpi->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->long_rolling_actual_bits * 31 + cpi->projected_frame_size, + 5); + + /* Actual bits spent */ + cpi->total_actual_bits += cpi->projected_frame_size; + +#if 0 && CONFIG_INTERNAL_STATS + /* Debug stats */ + cpi->total_target_vs_actual += + (cpi->this_frame_target - cpi->projected_frame_size); +#endif + + cpi->buffer_level = cpi->bits_off_target; + + /* Propagate values to higher temporal layers */ + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); + + lc->bits_off_target += bits_off_for_this_layer; + + /* Clip buffer level to maximum buffer size for the layer */ + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + + lc->total_actual_bits += cpi->projected_frame_size; + lc->total_target_vs_actual += bits_off_for_this_layer; + lc->buffer_level = lc->bits_off_target; + } + } + + /* Update bits left to the kf and gf groups to account for overshoot + * or undershoot on these frames + */ + if (cm->frame_type == KEY_FRAME) { + cpi->twopass.kf_group_bits += + cpi->this_frame_target - cpi->projected_frame_size; + + if (cpi->twopass.kf_group_bits < 0) cpi->twopass.kf_group_bits = 0; + } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) { + cpi->twopass.gf_group_bits += + cpi->this_frame_target - cpi->projected_frame_size; + + if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; + } + + if (cm->frame_type != KEY_FRAME) { + if (cpi->common.refresh_alt_ref_frame) { + cpi->last_skip_false_probs[2] = cpi->prob_skip_false; + cpi->last_skip_probs_q[2] = cm->base_qindex; + } else if (cpi->common.refresh_golden_frame) { + cpi->last_skip_false_probs[1] = cpi->prob_skip_false; + cpi->last_skip_probs_q[1] = cm->base_qindex; + } else { + cpi->last_skip_false_probs[0] = cpi->prob_skip_false; + cpi->last_skip_probs_q[0] = cm->base_qindex; + + /* update the baseline */ + cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false; + } + } + +#if 0 && CONFIG_INTERNAL_STATS + { + FILE *f = fopen("tmp.stt", "a"); + + vpx_clear_system_state(); + + if (cpi->twopass.total_left_stats.coded_error != 0.0) + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n", + cpi->common.current_video_frame, cpi->this_frame_target, + cpi->projected_frame_size, + (cpi->projected_frame_size - cpi->this_frame_target), + cpi->total_target_vs_actual, + cpi->buffer_level, + (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), + cpi->total_actual_bits, cm->base_qindex, + cpi->active_best_quality, cpi->active_worst_quality, + cpi->ni_av_qi, cpi->cq_target_quality, + cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cm->frame_type, cpi->gfu_boost, + cpi->twopass.est_max_qcorrection_factor, + cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + (double)cpi->twopass.bits_left / + cpi->twopass.total_left_stats.coded_error, + cpi->tot_recode_hits); + else + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %8d\n", + cpi->common.current_video_frame, cpi->this_frame_target, + cpi->projected_frame_size, + (cpi->projected_frame_size - cpi->this_frame_target), + cpi->total_target_vs_actual, + cpi->buffer_level, + (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), + cpi->total_actual_bits, cm->base_qindex, + cpi->active_best_quality, cpi->active_worst_quality, + cpi->ni_av_qi, cpi->cq_target_quality, + cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cm->frame_type, cpi->gfu_boost, + cpi->twopass.est_max_qcorrection_factor, + cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + cpi->tot_recode_hits); + + fclose(f); + + { + FILE *fmodes = fopen("Modes.stt", "a"); + + fprintf(fmodes, "%6d:%1d:%1d:%1d ", + cpi->common.current_video_frame, + cm->frame_type, cm->refresh_golden_frame, + cm->refresh_alt_ref_frame); + + fprintf(fmodes, "\n"); + + fclose(fmodes); + } + } + +#endif + + cpi->ext_refresh_frame_flags_pending = 0; + + if (cm->refresh_golden_frame == 1) { + cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; + } else { + cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_GOLDEN; + } + + if (cm->refresh_alt_ref_frame == 1) { + cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; + } else { + cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_ALTREF; + } + + if (cm->refresh_last_frame & cm->refresh_golden_frame) { /* both refreshed */ + cpi->gold_is_last = 1; + } else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) { + /* 1 refreshed but not the other */ + cpi->gold_is_last = 0; + } + + if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) { /* both refreshed */ + cpi->alt_is_last = 1; + } else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) { + /* 1 refreshed but not the other */ + cpi->alt_is_last = 0; + } + + if (cm->refresh_alt_ref_frame & + cm->refresh_golden_frame) { /* both refreshed */ + cpi->gold_is_alt = 1; + } else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) { + /* 1 refreshed but not the other */ + cpi->gold_is_alt = 0; + } + + cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME; + + if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP8_GOLD_FRAME; + + if (cpi->alt_is_last) cpi->ref_frame_flags &= ~VP8_ALTR_FRAME; + + if (cpi->gold_is_alt) cpi->ref_frame_flags &= ~VP8_ALTR_FRAME; + + if (!cpi->oxcf.error_resilient_mode) { + if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && + (cm->frame_type != KEY_FRAME)) { + /* Update the alternate reference frame stats as appropriate. */ + update_alt_ref_frame_stats(cpi); + } else { + /* Update the Golden frame stats as appropriate. */ + update_golden_frame_stats(cpi); + } + } + + if (cm->frame_type == KEY_FRAME) { + /* Tell the caller that the frame was coded as a key frame */ + *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY; + + /* As this frame is a key frame the next defaults to an inter frame. */ + cm->frame_type = INTER_FRAME; + + cpi->last_frame_percent_intra = 100; + } else { + *frame_flags = cm->frame_flags & ~FRAMEFLAGS_KEY; + + cpi->last_frame_percent_intra = cpi->this_frame_percent_intra; + } + + /* Clear the one shot update flags for segmentation map and mode/ref + * loop filter deltas. + */ + cpi->mb.e_mbd.update_mb_segmentation_map = 0; + cpi->mb.e_mbd.update_mb_segmentation_data = 0; + cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; + + /* Don't increment frame counters if this was an altref buffer update + * not a real frame + */ + if (cm->show_frame) { + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + } + +#if 0 + { + char filename[512]; + FILE *recon_file; + sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); + recon_file = fopen(filename, "wb"); + fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc, + cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file); + fclose(recon_file); + } +#endif + + /* DEBUG */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ +} +#if !CONFIG_REALTIME_ONLY +static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, + unsigned char *dest_end, unsigned int *frame_flags) { + if (!cpi->common.refresh_alt_ref_frame) vp8_second_pass(cpi); + + encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags); + cpi->twopass.bits_left -= 8 * (int)(*size); + + if (!cpi->common.refresh_alt_ref_frame) { + double two_pass_min_rate = + (double)(cpi->oxcf.target_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate); + } +} +#endif + +int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + struct vpx_usec_timer timer; + int res = 0; + + vpx_usec_timer_start(&timer); + + /* Reinit the lookahead buffer if the frame size changes */ + if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height) { + assert(cpi->oxcf.lag_in_frames < 2); + dealloc_raw_frame_buffers(cpi); + alloc_raw_frame_buffers(cpi); + } + + if (vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, + cpi->active_map_enabled ? cpi->active_map : NULL)) { + res = -1; + } + vpx_usec_timer_mark(&timer); + cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); + + return res; +} + +static int frame_is_reference(const VP8_COMP *cpi) { + const VP8_COMMON *cm = &cpi->common; + const MACROBLOCKD *xd = &cpi->mb.e_mbd; + + return cm->frame_type == KEY_FRAME || cm->refresh_last_frame || + cm->refresh_golden_frame || cm->refresh_alt_ref_frame || + cm->copy_buffer_to_gf || cm->copy_buffer_to_arf || + cm->refresh_entropy_probs || xd->mode_ref_lf_delta_update || + xd->update_mb_segmentation_map || xd->update_mb_segmentation_data; +} + +int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, + size_t *size, unsigned char *dest, + unsigned char *dest_end, int64_t *time_stamp, + int64_t *time_end, int flush) { + VP8_COMMON *cm; + struct vpx_usec_timer tsctimer; + struct vpx_usec_timer ticktimer; + struct vpx_usec_timer cmptimer; + YV12_BUFFER_CONFIG *force_src_buffer = NULL; + + if (!cpi) return -1; + + cm = &cpi->common; + + vpx_usec_timer_start(&cmptimer); + + cpi->source = NULL; + +#if !CONFIG_REALTIME_ONLY + /* Should we code an alternate reference frame */ + if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.play_alternate && + cpi->source_alt_ref_pending) { + if ((cpi->source = vp8_lookahead_peek( + cpi->lookahead, cpi->frames_till_gf_update_due, PEEK_FORWARD))) { + cpi->alt_ref_source = cpi->source; + if (cpi->oxcf.arnr_max_frames > 0) { + vp8_temporal_filter_prepare_c(cpi, cpi->frames_till_gf_update_due); + force_src_buffer = &cpi->alt_ref_buffer; + } + cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; + cm->refresh_alt_ref_frame = 1; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 0; + cm->show_frame = 0; + /* Clear Pending alt Ref flag. */ + cpi->source_alt_ref_pending = 0; + cpi->is_src_frame_alt_ref = 0; + } + } +#endif + + if (!cpi->source) { + /* Read last frame source if we are encoding first pass. */ + if (cpi->pass == 1 && cm->current_video_frame > 0) { + if ((cpi->last_source = + vp8_lookahead_peek(cpi->lookahead, 1, PEEK_BACKWARD)) == NULL) { + return -1; + } + } + + if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush))) { + cm->show_frame = 1; + + cpi->is_src_frame_alt_ref = + cpi->alt_ref_source && (cpi->source == cpi->alt_ref_source); + + if (cpi->is_src_frame_alt_ref) cpi->alt_ref_source = NULL; + } + } + + if (cpi->source) { + cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img; + cpi->un_scaled_source = cpi->Source; + *time_stamp = cpi->source->ts_start; + *time_end = cpi->source->ts_end; + *frame_flags = cpi->source->flags; + + if (cpi->pass == 1 && cm->current_video_frame > 0) { + cpi->last_frame_unscaled_source = &cpi->last_source->img; + } + } else { + *size = 0; +#if !CONFIG_REALTIME_ONLY + + if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) { + vp8_end_first_pass(cpi); /* get last stats packet */ + cpi->twopass.first_pass_done = 1; + } + +#endif + + return -1; + } + + if (cpi->source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = cpi->source->ts_start; + cpi->last_end_time_stamp_seen = cpi->source->ts_start; + } + + /* adjust frame rates based on timestamps given */ + if (cm->show_frame) { + int64_t this_duration; + int step = 0; + + if (cpi->source->ts_start == cpi->first_time_stamp_ever) { + this_duration = cpi->source->ts_end - cpi->source->ts_start; + step = 1; + } else { + int64_t last_duration; + + this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; + last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + // Cap this to avoid overflow of (this_duration - last_duration) * 10 + this_duration = VPXMIN(this_duration, INT64_MAX / 10); + /* do a step update if the duration changes by 10% */ + if (last_duration) { + step = (int)(((this_duration - last_duration) * 10 / last_duration)); + } + } + + if (this_duration) { + if (step) { + cpi->ref_framerate = 10000000.0 / this_duration; + } else { + double avg_duration, interval; + + /* Average this frame's rate into the last second's average + * frame rate. If we haven't seen 1 second yet, then average + * over the whole interval seen. + */ + interval = (double)(cpi->source->ts_end - cpi->first_time_stamp_ever); + if (interval > 10000000.0) interval = 10000000; + + avg_duration = 10000000.0 / cpi->ref_framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + + cpi->ref_framerate = 10000000.0 / avg_duration; + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + // Frame rate should be the same for all spatial layers in + // multi-res-encoding (simulcast), so we constrain the frame for + // higher layers to be that of lowest resolution. This is needed + // as he application may decide to skip encoding a high layer and + // then start again, in which case a big jump in time-stamps will + // be received for that high layer, which will yield an incorrect + // frame rate (from time-stamp adjustment in above calculation). + if (cpi->oxcf.mr_encoder_id) { + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; + } else { + // Keep track of frame rate for lowest resolution. + low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; + } + } +#endif + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Update frame rates for each layer */ + assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS); + for (i = 0; i < cpi->oxcf.number_of_layers && i < VPX_TS_MAX_LAYERS; + ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->framerate = cpi->ref_framerate / cpi->oxcf.rate_decimator[i]; + } + } else { + vp8_new_framerate(cpi, cpi->ref_framerate); + } + } + + cpi->last_time_stamp_seen = cpi->source->ts_start; + cpi->last_end_time_stamp_seen = cpi->source->ts_end; + } + + if (cpi->oxcf.number_of_layers > 1) { + int layer; + + vp8_update_layer_contexts(cpi); + + /* Restore layer specific context & set frame rate */ + if (cpi->temporal_layer_id >= 0) { + layer = cpi->temporal_layer_id; + } else { + layer = + cpi->oxcf + .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; + } + vp8_restore_layer_context(cpi, layer); + vp8_new_framerate(cpi, cpi->layer_context[layer].framerate); + } + + if (cpi->compressor_speed == 2) { + vpx_usec_timer_start(&tsctimer); + vpx_usec_timer_start(&ticktimer); + } + + cpi->lf_zeromv_pct = (cpi->zeromv_count * 100) / cm->MBs; + +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + { + int i; + const int num_part = (1 << cm->multi_token_partition); + /* the available bytes in dest */ + const unsigned long dest_size = dest_end - dest; + const int tok_part_buff_size = (dest_size * 9) / (10 * num_part); + + unsigned char *dp = dest; + + cpi->partition_d[0] = dp; + dp += dest_size / 10; /* reserve 1/10 for control partition */ + cpi->partition_d_end[0] = dp; + + for (i = 0; i < num_part; ++i) { + cpi->partition_d[i + 1] = dp; + dp += tok_part_buff_size; + cpi->partition_d_end[i + 1] = dp; + } + } +#endif + + /* start with a 0 size frame */ + *size = 0; + + /* Clear down mmx registers */ + vpx_clear_system_state(); + + cm->frame_type = INTER_FRAME; + cm->frame_flags = *frame_flags; + +#if 0 + + if (cm->refresh_alt_ref_frame) + { + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 0; + } + else + { + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + } + +#endif + /* find a free buffer for the new frame */ + { + int i = 0; + for (; i < NUM_YV12_BUFFERS; ++i) { + if (!cm->yv12_fb[i].flags) { + cm->new_fb_idx = i; + break; + } + } + + assert(i < NUM_YV12_BUFFERS); + } + switch (cpi->pass) { +#if !CONFIG_REALTIME_ONLY + case 1: Pass1Encode(cpi); break; + case 2: Pass2Encode(cpi, size, dest, dest_end, frame_flags); break; +#endif // !CONFIG_REALTIME_ONLY + default: + encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags); + break; + } + + if (cpi->compressor_speed == 2) { + unsigned int duration, duration2; + vpx_usec_timer_mark(&tsctimer); + vpx_usec_timer_mark(&ticktimer); + + duration = (int)(vpx_usec_timer_elapsed(&ticktimer)); + duration2 = (unsigned int)((double)duration / 2); + + if (cm->frame_type != KEY_FRAME) { + if (cpi->avg_encode_time == 0) { + cpi->avg_encode_time = duration; + } else { + cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3; + } + } + + if (duration2) { + { + if (cpi->avg_pick_mode_time == 0) { + cpi->avg_pick_mode_time = duration2; + } else { + cpi->avg_pick_mode_time = + (7 * cpi->avg_pick_mode_time + duration2) >> 3; + } + } + } + } + + if (cm->refresh_entropy_probs == 0) { + memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc)); + } + + /* Save the contexts separately for alt ref, gold and last. */ + /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */ + if (cm->refresh_alt_ref_frame) memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc)); + + if (cm->refresh_golden_frame) memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc)); + + if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); + + /* if it's a dropped frame honor the requests on subsequent frames */ + if (*size > 0) { + cpi->droppable = !frame_is_reference(cpi); + + /* return to normal state */ + cm->refresh_entropy_probs = 1; + cm->refresh_alt_ref_frame = 0; + cm->refresh_golden_frame = 0; + cm->refresh_last_frame = 1; + cm->frame_type = INTER_FRAME; + } + + /* Save layer specific state */ + if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi); + + vpx_usec_timer_mark(&cmptimer); + cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); + + if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) { + generate_psnr_packet(cpi); + } + +#if CONFIG_INTERNAL_STATS + + if (cpi->pass != 1) { + cpi->bytes += *size; + + if (cm->show_frame) { + cpi->common.show_frame_mi = cpi->common.mi; + cpi->count++; + + if (cpi->b_calculate_psnr) { + uint64_t ye, ue, ve; + double frame_psnr; + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + unsigned int y_width = cpi->common.Width; + unsigned int y_height = cpi->common.Height; + unsigned int uv_width = (y_width + 1) / 2; + unsigned int uv_height = (y_height + 1) / 2; + int y_samples = y_height * y_width; + int uv_samples = uv_height * uv_width; + int t_samples = y_samples + 2 * uv_samples; + double sq_error; + + ye = calc_plane_error(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, y_width, y_height); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, recon->u_buffer, + recon->uv_stride, uv_width, uv_height); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, recon->v_buffer, + recon->uv_stride, uv_width, uv_height); + + sq_error = (double)(ye + ue + ve); + + frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error); + + cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye); + cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue); + cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve); + cpi->total_sq_error += sq_error; + cpi->total += frame_psnr; +#if CONFIG_POSTPROC + { + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + double sq_error2; + double frame_psnr2, frame_ssim2 = 0; + double weight = 0; + + vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, + cm->filter_level * 10 / 6); + vpx_clear_system_state(); + + ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer, + pp->y_stride, y_width, y_height); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, pp->u_buffer, + pp->uv_stride, uv_width, uv_height); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, pp->v_buffer, + pp->uv_stride, uv_width, uv_height); + + sq_error2 = (double)(ye + ue + ve); + + frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2); + + cpi->totalp_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye); + cpi->totalp_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue); + cpi->totalp_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve); + cpi->total_sq_error2 += sq_error2; + cpi->totalp += frame_psnr2; + + frame_ssim2 = + vpx_calc_ssim(cpi->Source, &cm->post_proc_buffer, &weight); + + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + for (i = cpi->current_layer; i < cpi->oxcf.number_of_layers; ++i) { + cpi->frames_in_layer[i]++; + + cpi->bytes_in_layer[i] += *size; + cpi->sum_psnr[i] += frame_psnr; + cpi->sum_psnr_p[i] += frame_psnr2; + cpi->total_error2[i] += sq_error; + cpi->total_error2_p[i] += sq_error2; + cpi->sum_ssim[i] += frame_ssim2 * weight; + cpi->sum_weights[i] += weight; + } + } + } +#endif + } + } + } + +#if 0 + + if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q) + { + skiptruecount += cpi->skip_true_count; + skipfalsecount += cpi->skip_false_count; + } + +#endif +#if 0 + + if (cpi->pass != 1) + { + FILE *f = fopen("skip.stt", "a"); + fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size); + + if (cpi->is_src_frame_alt_ref == 1) + fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size); + + fclose(f); + } + +#endif +#endif + + cpi->common.error.setjmp = 0; + +#if CONFIG_MULTITHREAD + /* wait for the lpf thread done */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { + sem_wait(&cpi->h_event_end_lpf); + cpi->b_lpf_running = 0; + } +#endif + + return 0; +} + +int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp8_ppflags_t *flags) { + if (cpi->common.refresh_alt_ref_frame) { + return -1; + } else { + int ret; + +#if CONFIG_POSTPROC + cpi->common.show_frame_mi = cpi->common.mi; + ret = vp8_post_proc_frame(&cpi->common, dest, flags); +#else + (void)flags; + + if (cpi->common.frame_to_show) { + *dest = *cpi->common.frame_to_show; + dest->y_width = cpi->common.Width; + dest->y_height = cpi->common.Height; + dest->uv_height = cpi->common.Height / 2; + ret = 0; + } else { + ret = -1; + } + +#endif + vpx_clear_system_state(); + return ret; + } +} + +int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[4], int delta_lf[4], + unsigned int threshold[4]) { + signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + int internal_delta_q[MAX_MB_SEGMENTS]; + const int range = 63; + int i; + + // Check number of rows and columns match + if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { + return -1; + } + + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range || + delta_lf[i] < -range) { + return -1; + } + } + + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { + disable_segmentation(cpi); + return 0; + } + + // Translate the external delta q values to internal values. + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + internal_delta_q[i] = + (delta_q[i] >= 0) ? q_trans[delta_q[i]] : -q_trans[-delta_q[i]]; + } + + /* Set the segmentation Map */ + set_segmentation_map(cpi, map); + + /* Activate segmentation. */ + enable_segmentation(cpi); + + /* Set up the quant segment data */ + feature_data[MB_LVL_ALT_Q][0] = internal_delta_q[0]; + feature_data[MB_LVL_ALT_Q][1] = internal_delta_q[1]; + feature_data[MB_LVL_ALT_Q][2] = internal_delta_q[2]; + feature_data[MB_LVL_ALT_Q][3] = internal_delta_q[3]; + + /* Set up the loop segment data s */ + feature_data[MB_LVL_ALT_LF][0] = delta_lf[0]; + feature_data[MB_LVL_ALT_LF][1] = delta_lf[1]; + feature_data[MB_LVL_ALT_LF][2] = delta_lf[2]; + feature_data[MB_LVL_ALT_LF][3] = delta_lf[3]; + + cpi->segment_encode_breakout[0] = threshold[0]; + cpi->segment_encode_breakout[1] = threshold[1]; + cpi->segment_encode_breakout[2] = threshold[2]; + cpi->segment_encode_breakout[3] = threshold[3]; + + /* Initialise the feature data structure */ + set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + + return 0; +} + +int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols) { + if ((int)rows == cpi->common.mb_rows && (int)cols == cpi->common.mb_cols) { + if (map) { + memcpy(cpi->active_map, map, rows * cols); + cpi->active_map_enabled = 1; + } else { + cpi->active_map_enabled = 0; + } + + return 0; + } else { + return -1; + } +} + +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { + cpi->common.horiz_scale = horiz_mode; + } else { + return -1; + } + + if (vert_mode <= VP8E_ONETWO) { + cpi->common.vert_scale = vert_mode; + } else { + return -1; + } + + return 0; +} + +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { + int i, j; + int Total = 0; + + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + + /* Loop through the Y plane raw and reconstruction data summing + * (square differences) + */ + for (i = 0; i < source->y_height; i += 16) { + for (j = 0; j < source->y_width; j += 16) { + unsigned int sse; + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, + &sse); + } + + src += 16 * source->y_stride; + dst += 16 * dest->y_stride; + } + + return Total; +} + +int vp8_get_quantizer(VP8_COMP *cpi) { return cpi->common.base_qindex; } diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h new file mode 100644 index 0000000000..1451a27812 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ONYX_INT_H_ +#define VPX_VP8_ENCODER_ONYX_INT_H_ + +#include +#include + +#include "vpx_config.h" +#include "vp8/common/onyx.h" +#include "treewriter.h" +#include "tokenize.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_dsp/variance.h" +#include "encodemb.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/entropy.h" +#include "vp8/common/threading.h" +#include "vpx_ports/mem.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8.h" +#include "mcomp.h" +#include "vp8/common/findnearmv.h" +#include "lookahead.h" +#if CONFIG_TEMPORAL_DENOISING +#include "vp8/encoder/denoising.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MIN_GF_INTERVAL 4 +#define DEFAULT_GF_INTERVAL 7 + +#define KEY_FRAME_CONTEXT 5 + +#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25) + +#define AF_THRESH 25 +#define AF_THRESH2 100 +#define ARF_DECAY_THRESH 12 + +#define MIN_THRESHMULT 32 +#define MAX_THRESHMULT 512 + +#define GF_ZEROMV_ZBIN_BOOST 12 +#define LF_ZEROMV_ZBIN_BOOST 6 +#define MV_ZBIN_BOOST 4 +#define ZBIN_OQ_MAX 192 + +#define VP8_TEMPORAL_ALT_REF !CONFIG_REALTIME_ONLY + +/* vp8 uses 10,000,000 ticks/second as time stamp */ +#define TICKS_PER_SEC 10000000 + +typedef struct { + int kf_indicated; + unsigned int frames_since_key; + unsigned int frames_since_golden; + int filter_level; + int frames_till_gf_update_due; + int recent_ref_frame_usage[MAX_REF_FRAMES]; + + MV_CONTEXT mvc[2]; + int mvcosts[2][MVvals + 1]; + +#ifdef MODE_STATS + int y_modes[5]; + int uv_modes[4]; + int b_modes[10]; + int inter_y_modes[10]; + int inter_uv_modes[4]; + int inter_b_modes[10]; +#endif + + vp8_prob ymode_prob[4], uv_mode_prob[3]; /* interframe intra mode probs */ + vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3]; /* keyframe "" */ + + int ymode_count[5], uv_mode_count[4]; /* intra MB type cts this frame */ + + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + + int this_frame_percent_intra; + int last_frame_percent_intra; + +} CODING_CONTEXT; + +typedef struct { + double frame; + double intra_error; + double coded_error; + double ssim_weighted_pred_err; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double new_mv_count; + double duration; + double count; +} FIRSTPASS_STATS; + +typedef struct { + int frames_so_far; + double frame_intra_error; + double frame_coded_error; + double frame_pcnt_inter; + double frame_pcnt_motion; + double frame_mvr; + double frame_mvr_abs; + double frame_mvc; + double frame_mvc_abs; + +} ONEPASS_FRAMESTATS; + +typedef enum { + THR_ZERO1 = 0, + THR_DC = 1, + + THR_NEAREST1 = 2, + THR_NEAR1 = 3, + + THR_ZERO2 = 4, + THR_NEAREST2 = 5, + + THR_ZERO3 = 6, + THR_NEAREST3 = 7, + + THR_NEAR2 = 8, + THR_NEAR3 = 9, + + THR_V_PRED = 10, + THR_H_PRED = 11, + THR_TM = 12, + + THR_NEW1 = 13, + THR_NEW2 = 14, + THR_NEW3 = 15, + + THR_SPLIT1 = 16, + THR_SPLIT2 = 17, + THR_SPLIT3 = 18, + + THR_B_PRED = 19 +} THR_MODES; + +typedef enum { DIAMOND = 0, NSTEP = 1, HEX = 2 } SEARCH_METHODS; + +typedef struct { + int RD; + SEARCH_METHODS search_method; + int improved_quant; + int improved_dct; + int auto_filter; + int recode_loop; + int iterative_sub_pixel; + int half_pixel_search; + int quarter_pixel_search; + int thresh_mult[MAX_MODES]; + int max_step_search_steps; + int first_step; + int optimize_coefficients; + + int use_fastquant_for_pick; + int no_skip_block4x4_search; + int improved_mv_pred; + +} SPEED_FEATURES; + +typedef struct { + MACROBLOCK mb; + int segment_counts[MAX_MB_SEGMENTS]; + int totalrate; +} MB_ROW_COMP; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; +} TOKENLIST; + +typedef struct { + int ithread; + void *ptr1; + void *ptr2; +} ENCODETHREAD_DATA; +typedef struct { + int ithread; + void *ptr1; +} LPFTHREAD_DATA; + +enum { + BLOCK_16X8, + BLOCK_8X16, + BLOCK_8X8, + BLOCK_4X4, + BLOCK_16X16, + BLOCK_MAX_SEGMENTS +}; + +typedef struct { + /* Layer configuration */ + double framerate; + int target_bandwidth; /* bits per second */ + + /* Layer specific coding parameters */ + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + int64_t starting_buffer_level_in_ms; + int64_t optimal_buffer_level_in_ms; + int64_t maximum_buffer_size_in_ms; + + int avg_frame_size_for_layer; + + int64_t buffer_level; + int64_t bits_off_target; + + int64_t total_actual_bits; + int total_target_vs_actual; + + int worst_quality; + int active_worst_quality; + int best_quality; + int active_best_quality; + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex; + + double rate_correction_factor; + double key_frame_rate_correction_factor; + double gf_rate_correction_factor; + + int zbin_over_quant; + + int inter_frame_target; + int64_t total_byte_count; + + int filter_level; + + int frames_since_last_drop_overshoot; + + int force_maxqp; + + int last_frame_percent_intra; + + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + + int last_q[2]; +} LAYER_CONTEXT; + +typedef struct VP8_COMP { + DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); + + DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); + + DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); + + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]); + + MACROBLOCK mb; + VP8_COMMON common; + vp8_writer bc[9]; /* one boolcoder for each partition */ + + VP8_CONFIG oxcf; + + struct lookahead_ctx *lookahead; + struct lookahead_entry *source; + struct lookahead_entry *alt_ref_source; + struct lookahead_entry *last_source; + + YV12_BUFFER_CONFIG *Source; + YV12_BUFFER_CONFIG *un_scaled_source; + YV12_BUFFER_CONFIG scaled_source; + YV12_BUFFER_CONFIG *last_frame_unscaled_source; + + unsigned int frames_till_alt_ref_frame; + /* frame in src_buffers has been identified to be encoded as an alt ref */ + int source_alt_ref_pending; + /* an alt ref frame has been encoded and is usable */ + int source_alt_ref_active; + /* source of frame to encode is an exact copy of an alt ref frame */ + int is_src_frame_alt_ref; + + /* golden frame same as last frame ( short circuit gold searches) */ + int gold_is_last; + /* Alt reference frame same as last ( short circuit altref search) */ + int alt_is_last; + /* don't do both alt and gold search ( just do gold). */ + int gold_is_alt; + + YV12_BUFFER_CONFIG pick_lf_lvl_frame; + + TOKENEXTRA *tok; + unsigned int tok_count; + + unsigned int frames_since_key; + unsigned int key_frame_frequency; + unsigned int this_key_frame_forced; + unsigned int next_key_frame_forced; + + /* Ambient reconstruction err target for force key frames */ + int ambient_err; + + unsigned int mode_check_freq[MAX_MODES]; + + int rd_baseline_thresh[MAX_MODES]; + + int RDMULT; + int RDDIV; + + CODING_CONTEXT coding_context; + + /* Rate targeting variables */ + int64_t last_prediction_error; + int64_t last_intra_error; + + int this_frame_target; + int projected_frame_size; + int last_q[2]; /* Separate values for Intra/Inter */ + + double rate_correction_factor; + double key_frame_rate_correction_factor; + double gf_rate_correction_factor; + + int frames_since_golden; + /* Count down till next GF */ + int frames_till_gf_update_due; + + /* GF interval chosen when we coded the last GF */ + int current_gf_interval; + + /* Total bits overspent because of GF boost (cumulative) */ + int gf_overspend_bits; + + /* Used in the few frames following a GF to recover the extra bits + * spent in that GF + */ + int non_gf_bitrate_adjustment; + + /* Extra bits spent on key frames that need to be recovered */ + int kf_overspend_bits; + + /* Current number of bit s to try and recover on each inter frame. */ + int kf_bitrate_adjustment; + int max_gf_interval; + int baseline_gf_interval; + int active_arnr_frames; + + int64_t key_frame_count; + int prior_key_frame_distance[KEY_FRAME_CONTEXT]; + /* Current section per frame bandwidth target */ + int per_frame_bandwidth; + /* Average frame size target for clip */ + int av_per_frame_bandwidth; + /* Minimum allocation that should be used for any frame */ + int min_frame_bandwidth; + int inter_frame_target; + double output_framerate; + int64_t last_time_stamp_seen; + int64_t last_end_time_stamp_seen; + int64_t first_time_stamp_ever; + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex; + + int64_t total_byte_count; + + int buffered_mode; + + double framerate; + double ref_framerate; + int64_t buffer_level; + int64_t bits_off_target; + + int rolling_target_bits; + int rolling_actual_bits; + + int long_rolling_target_bits; + int long_rolling_actual_bits; + + int64_t total_actual_bits; + int total_target_vs_actual; /* debug stats */ + + int worst_quality; + int active_worst_quality; + int best_quality; + int active_best_quality; + + int cq_target_quality; + + int drop_frames_allowed; /* Are we permitted to drop frames? */ + int drop_frame; /* Drop this frame? */ +#if defined(DROP_UNCODED_FRAMES) + int drop_frame_count; +#endif + + vp8_prob frame_coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]; + char update_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; + + unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES][2]; + + int gfu_boost; + int kf_boost; + int last_boost; + + int target_bandwidth; /* bits per second */ + struct vpx_codec_pkt_list *output_pkt_list; + +#if 0 + /* Experimental code for lagged and one pass */ + ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS]; + int one_pass_frame_index; +#endif + + int decimation_factor; + int decimation_count; + + /* for real time encoding */ + int avg_encode_time; /* microsecond */ + int avg_pick_mode_time; /* microsecond */ + int Speed; + int compressor_speed; + + int auto_gold; + int auto_adjust_gold_quantizer; + int auto_worst_q; + int cpu_used; + int pass; + + int prob_intra_coded; + int prob_last_coded; + int prob_gf_coded; + int prob_skip_false; + int last_skip_false_probs[3]; + int last_skip_probs_q[3]; + int recent_ref_frame_usage[MAX_REF_FRAMES]; + + int this_frame_percent_intra; + int last_frame_percent_intra; + + int ref_frame_flags; + + SPEED_FEATURES sf; + + /* Count ZEROMV on all reference frames. */ + int zeromv_count; + int lf_zeromv_pct; + + unsigned char *skin_map; + + unsigned char *segmentation_map; + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; + unsigned int segment_encode_breakout[MAX_MB_SEGMENTS]; + + unsigned char *active_map; + unsigned int active_map_enabled; + + /* Video conferencing cyclic refresh mode flags. This is a mode + * designed to clean up the background over time in live encoding + * scenarious. It uses segmentation. + */ + int cyclic_refresh_mode_enabled; + int cyclic_refresh_mode_max_mbs_perframe; + int cyclic_refresh_mode_index; + int cyclic_refresh_q; + signed char *cyclic_refresh_map; + // Count on how many (consecutive) times a macroblock uses ZER0MV_LAST. + unsigned char *consec_zero_last; + // Counter that is reset when a block is checked for a mode-bias against + // ZEROMV_LASTREF. + unsigned char *consec_zero_last_mvbias; + + // Frame counter for the temporal pattern. Counter is rest when the temporal + // layers are changed dynamically (run-time change). + unsigned int temporal_pattern_counter; + // Temporal layer id. + int temporal_layer_id; + + // Measure of average squared difference between source and denoised signal. + int mse_source_denoised; + + int force_maxqp; + int frames_since_last_drop_overshoot; + int last_pred_err_mb; + + // GF update for 1 pass cbr. + int gf_update_onepass_cbr; + int gf_interval_onepass_cbr; + int gf_noboost_onepass_cbr; + +#if CONFIG_MULTITHREAD + /* multithread data */ + vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; + int mt_sync_range; + vpx_atomic_int b_multi_threaded; + int encoding_thread_count; + int b_lpf_running; + + pthread_t *h_encoding_thread; + pthread_t h_filter_thread; + + MB_ROW_COMP *mb_row_ei; + ENCODETHREAD_DATA *en_thread_data; + LPFTHREAD_DATA lpf_thread_data; + + /* events */ + sem_t *h_event_start_encoding; + sem_t *h_event_end_encoding; + sem_t h_event_start_lpf; + sem_t h_event_end_lpf; +#endif + + TOKENLIST *tplist; + unsigned int partition_sz[MAX_PARTITIONS]; + unsigned char *partition_d[MAX_PARTITIONS]; + unsigned char *partition_d_end[MAX_PARTITIONS]; + + fractional_mv_step_fp *find_fractional_mv_step; + vp8_refining_search_fn_t refining_search_sad; + vp8_diamond_search_fn_t diamond_search_sad; + vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS]; + uint64_t time_receive_data; + uint64_t time_compress_data; + uint64_t time_pick_lpf; + uint64_t time_encode_mb_row; + + int base_skip_false_prob[128]; + + FRAME_CONTEXT lfc_n; /* last frame entropy */ + FRAME_CONTEXT lfc_a; /* last alt ref entropy */ + FRAME_CONTEXT lfc_g; /* last gold ref entropy */ + + struct twopass_rc { + unsigned int section_intra_rating; + double section_max_qfactor; + unsigned int next_iiratio; + unsigned int this_iiratio; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + int64_t clip_bits_total; + double avg_iiratio; + double modified_error_total; + double modified_error_used; + double modified_error_left; + double kf_intra_err_min; + double gf_intra_err_min; + int frames_to_key; + int maxq_max_limit; + int maxq_min_limit; + int gf_decay_rate; + int static_scene_max_gf_interval; + int kf_bits; + /* Remaining error from uncoded frames in a gf group. */ + int gf_group_error_left; + /* Projected total bits available for a key frame group of frames */ + int64_t kf_group_bits; + /* Error score of frames still to be coded in kf group */ + int64_t kf_group_error_left; + /* Projected Bits available for a group including 1 GF or ARF */ + int64_t gf_group_bits; + /* Bits for the golden frame or ARF */ + int gf_bits; + int alt_extra_bits; + double est_max_qcorrection_factor; + } twopass; + +#if VP8_TEMPORAL_ALT_REF + YV12_BUFFER_CONFIG alt_ref_buffer; + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int fixed_divide[512]; +#endif + +#if CONFIG_INTERNAL_STATS + int count; + double total_y; + double total_u; + double total_v; + double total; + double total_sq_error; + double totalp_y; + double totalp_u; + double totalp_v; + double totalp; + double total_sq_error2; + int bytes; + double summed_quality; + double summed_weights; + unsigned int tot_recode_hits; + + int b_calculate_ssimg; +#endif + int b_calculate_psnr; + + /* Per MB activity measurement */ + unsigned int activity_avg; + unsigned int *mb_activity_map; + + /* Record of which MBs still refer to last golden frame either + * directly or through 0,0 + */ + unsigned char *gf_active_flags; + int gf_active_count; + + int output_partition; + + /* Store last frame's MV info for next frame MV prediction */ + int_mv *lfmv; + int *lf_ref_frame_sign_bias; + int *lf_ref_frame; + + /* force next frame to intra when kf_auto says so */ + int force_next_frame_intra; + + int droppable; + + int initial_width; + int initial_height; + +#if CONFIG_TEMPORAL_DENOISING + VP8_DENOISER denoiser; +#endif + + /* Coding layer state variables */ + unsigned int current_layer; + LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS]; + + int64_t frames_in_layer[VPX_TS_MAX_LAYERS]; + int64_t bytes_in_layer[VPX_TS_MAX_LAYERS]; + double sum_psnr[VPX_TS_MAX_LAYERS]; + double sum_psnr_p[VPX_TS_MAX_LAYERS]; + double total_error2[VPX_TS_MAX_LAYERS]; + double total_error2_p[VPX_TS_MAX_LAYERS]; + double sum_ssim[VPX_TS_MAX_LAYERS]; + double sum_weights[VPX_TS_MAX_LAYERS]; + + double total_ssimg_y_in_layer[VPX_TS_MAX_LAYERS]; + double total_ssimg_u_in_layer[VPX_TS_MAX_LAYERS]; + double total_ssimg_v_in_layer[VPX_TS_MAX_LAYERS]; + double total_ssimg_all_in_layer[VPX_TS_MAX_LAYERS]; + +#if CONFIG_MULTI_RES_ENCODING + /* Number of MBs per row at lower-resolution level */ + int mr_low_res_mb_cols; + /* Indicate if lower-res mv info is available */ + unsigned char mr_low_res_mv_avail; +#endif + /* The frame number of each reference frames */ + unsigned int current_ref_frames[MAX_REF_FRAMES]; + // Closest reference frame to current frame. + MV_REFERENCE_FRAME closest_reference_frame; + + struct rd_costs_struct { + int mvcosts[2][MVvals + 1]; + int mvsadcosts[2][MVfpvals + 1]; + int mbmode_cost[2][MB_MODE_COUNT]; + int intra_uv_mode_cost[2][MB_MODE_COUNT]; + int bmode_costs[10][10][10]; + int inter_bmode_costs[B_MODE_COUNT]; + int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS]; + } rd_costs; + + // Use the static threshold from ROI settings. + int use_roi_static_threshold; + + int ext_refresh_frame_flags_pending; + + // Always update correction factor used for rate control after each frame for + // realtime encoding. + int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. + int rt_drop_recode_on_overshoot; +} VP8_COMP; + +void vp8_initialize_enc(void); + +void vp8_alloc_compressor_data(VP8_COMP *cpi); +int vp8_reverse_trans(int x); +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int prev_num_layers); +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate); +void vp8_update_layer_contexts(VP8_COMP *cpi); +void vp8_save_layer_context(VP8_COMP *cpi); +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer); +void vp8_new_framerate(VP8_COMP *cpi, double framerate); +void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); + +void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, + unsigned char *dest_end, size_t *size); + +void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); + +void vp8_set_speed_features(VP8_COMP *cpi); + +int vp8_check_drop_buffer(VP8_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_ONYX_INT_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.c b/media/libvpx/libvpx/vp8/encoder/pickinter.c new file mode 100644 index 0000000000..1af8a2f9b2 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/pickinter.c @@ -0,0 +1,1347 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "onyx_int.h" +#include "modecosts.h" +#include "encodeintra.h" +#include "vp8/common/common.h" +#include "vp8/common/entropymode.h" +#include "pickinter.h" +#include "vp8/common/findnearmv.h" +#include "encodemb.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/reconintra4x4.h" +#include "vpx_dsp/variance.h" +#include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" +#include "rdopt.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#if CONFIG_TEMPORAL_DENOISING +#include "denoising.h" +#endif + +#ifdef SPEEDSTATS +extern unsigned int cnt_pm; +#endif + +extern const int vp8_ref_frame_order[MAX_MODES]; +extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; + +static int macroblock_corner_grad(unsigned char *signal, int stride, + int offsetx, int offsety, int sgnx, + int sgny) { + int y1 = signal[offsetx * stride + offsety]; + int y2 = signal[offsetx * stride + offsety + sgny]; + int y3 = signal[(offsetx + sgnx) * stride + offsety]; + int y4 = signal[(offsetx + sgnx) * stride + offsety + sgny]; + return VPXMAX(VPXMAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4)); +} + +static int check_dot_artifact_candidate(VP8_COMP *cpi, MACROBLOCK *x, + unsigned char *target_last, int stride, + unsigned char *last_ref, int mb_row, + int mb_col, int channel) { + int threshold1 = 6; + int threshold2 = 3; + unsigned int max_num = (cpi->common.MBs) / 10; + int grad_last = 0; + int grad_source = 0; + int index = mb_row * cpi->common.mb_cols + mb_col; + // Threshold for #consecutive (base layer) frames using zero_last mode. + int num_frames = 30; + int shift = 15; + if (channel > 0) { + shift = 7; + } + if (cpi->oxcf.number_of_layers > 1) { + num_frames = 20; + } + x->zero_last_dot_suppress = 0; + // Blocks on base layer frames that have been using ZEROMV_LAST repeatedly + // (i.e, at least |x| consecutive frames are candidates for increasing the + // rd adjustment for zero_last mode. + // Only allow this for at most |max_num| blocks per frame. + // Don't allow this for screen content input. + if (cpi->current_layer == 0 && + cpi->consec_zero_last_mvbias[index] > num_frames && + x->mbs_zero_last_dot_suppress < max_num && + !cpi->oxcf.screen_content_mode) { + // If this block is checked here, label it so we don't check it again until + // ~|x| framaes later. + x->zero_last_dot_suppress = 1; + // Dot artifact is noticeable as strong gradient at corners of macroblock, + // for flat areas. As a simple detector for now, we look for a high + // corner gradient on last ref, and a smaller gradient on source. + // Check 4 corners, return if any satisfy condition. + // Top-left: + grad_last = macroblock_corner_grad(last_ref, stride, 0, 0, 1, 1); + grad_source = macroblock_corner_grad(target_last, stride, 0, 0, 1, 1); + if (grad_last >= threshold1 && grad_source <= threshold2) { + x->mbs_zero_last_dot_suppress++; + return 1; + } + // Top-right: + grad_last = macroblock_corner_grad(last_ref, stride, 0, shift, 1, -1); + grad_source = macroblock_corner_grad(target_last, stride, 0, shift, 1, -1); + if (grad_last >= threshold1 && grad_source <= threshold2) { + x->mbs_zero_last_dot_suppress++; + return 1; + } + // Bottom-left: + grad_last = macroblock_corner_grad(last_ref, stride, shift, 0, -1, 1); + grad_source = macroblock_corner_grad(target_last, stride, shift, 0, -1, 1); + if (grad_last >= threshold1 && grad_source <= threshold2) { + x->mbs_zero_last_dot_suppress++; + return 1; + } + // Bottom-right: + grad_last = macroblock_corner_grad(last_ref, stride, shift, shift, -1, -1); + grad_source = + macroblock_corner_grad(target_last, stride, shift, shift, -1, -1); + if (grad_last >= threshold1 && grad_source <= threshold2) { + x->mbs_zero_last_dot_suppress++; + return 1; + } + return 0; + } + return 0; +} + +int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp8_variance_fn_ptr_t *vfp, + int *mvcost[2], int *distortion, + unsigned int *sse) { + (void)b; + (void)d; + (void)ref_mv; + (void)error_per_bit; + (void)vfp; + (void)mb; + (void)mvcost; + (void)distortion; + (void)sse; + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; + return 0; +} + +int vp8_get_inter_mbpred_error(MACROBLOCK *mb, const vp8_variance_fn_ptr_t *vfp, + unsigned int *sse, int_mv this_mv) { + BLOCK *b = &mb->block[0]; + BLOCKD *d = &mb->e_mbd.block[0]; + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + int pre_stride = mb->e_mbd.pre.y_stride; + unsigned char *in_what = mb->e_mbd.pre.y_buffer + d->offset; + int in_what_stride = pre_stride; + int xoffset = this_mv.as_mv.col & 7; + int yoffset = this_mv.as_mv.row & 7; + + in_what += (this_mv.as_mv.row >> 3) * pre_stride + (this_mv.as_mv.col >> 3); + + if (xoffset | yoffset) { + return vfp->svf(in_what, in_what_stride, xoffset, yoffset, what, + what_stride, sse); + } else { + return vfp->vf(what, what_stride, in_what, in_what_stride, sse); + } +} + +static int get_prediction_error(BLOCK *be, BLOCKD *b) { + unsigned char *sptr; + unsigned char *dptr; + sptr = (*(be->base_src) + be->src); + dptr = b->predictor; + + return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16); +} + +static int pick_intra4x4block(MACROBLOCK *x, int ib, + B_PREDICTION_MODE *best_mode, + const int *mode_costs, int *bestrate, + int *bestdistortion) { + BLOCKD *b = &x->e_mbd.block[ib]; + BLOCK *be = &x->block[ib]; + int dst_stride = x->e_mbd.dst.y_stride; + unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset; + B_PREDICTION_MODE mode; + int best_rd = INT_MAX; + int rate; + int distortion; + + unsigned char *Above = dst - dst_stride; + unsigned char *yleft = dst - 1; + unsigned char top_left = Above[-1]; + + for (mode = B_DC_PRED; mode <= B_HE_PRED; ++mode) { + int this_rd; + + rate = mode_costs[mode]; + + vp8_intra4x4_predict(Above, yleft, dst_stride, mode, b->predictor, 16, + top_left); + distortion = get_prediction_error(be, b); + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + } + } + + b->bmi.as_mode = *best_mode; + vp8_encode_intra4x4block(x, ib); + return best_rd; +} + +static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { + MACROBLOCKD *const xd = &mb->e_mbd; + int i; + int cost = mb->mbmode_cost[xd->frame_type][B_PRED]; + int error; + int distortion = 0; + const int *bmode_costs; + + intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16); + + bmode_costs = mb->inter_bmode_costs; + + for (i = 0; i < 16; ++i) { + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, d = 0; + + if (mb->e_mbd.frame_type == KEY_FRAME) { + const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); + const B_PREDICTION_MODE L = left_block_mode(mic, i); + + bmode_costs = mb->bmode_costs[A][L]; + } + + pick_intra4x4block(mb, i, &best_mode, bmode_costs, &r, &d); + + cost += r; + distortion += d; + assert(best_mode != B_MODE_COUNT); + mic->bmi[i].as_mode = best_mode; + + /* Break out case where we have already exceeded best so far value + * that was passed in + */ + if (distortion > *best_dist) break; + } + + *Rate = cost; + + if (i == 16) { + *best_dist = distortion; + error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion); + } else { + *best_dist = INT_MAX; + error = INT_MAX; + } + + return error; +} + +static void pick_intra_mbuv_mode(MACROBLOCK *mb) { + MACROBLOCKD *x = &mb->e_mbd; + unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride; + unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride; + unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src); + unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src); + int uvsrc_stride = mb->block[16].src_stride; + unsigned char uleft_col[8]; + unsigned char vleft_col[8]; + unsigned char utop_left = uabove_row[-1]; + unsigned char vtop_left = vabove_row[-1]; + int i, j; + int expected_udc; + int expected_vdc; + int shift; + int Uaverage = 0; + int Vaverage = 0; + int diff; + int pred_error[4] = { 0, 0, 0, 0 }, best_error = INT_MAX; + MB_PREDICTION_MODE best_mode = MB_MODE_COUNT; + + for (i = 0; i < 8; ++i) { + uleft_col[i] = x->dst.u_buffer[i * x->dst.uv_stride - 1]; + vleft_col[i] = x->dst.v_buffer[i * x->dst.uv_stride - 1]; + } + + if (!x->up_available && !x->left_available) { + expected_udc = 128; + expected_vdc = 128; + } else { + shift = 2; + + if (x->up_available) { + for (i = 0; i < 8; ++i) { + Uaverage += uabove_row[i]; + Vaverage += vabove_row[i]; + } + + shift++; + } + + if (x->left_available) { + for (i = 0; i < 8; ++i) { + Uaverage += uleft_col[i]; + Vaverage += vleft_col[i]; + } + + shift++; + } + + expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; + expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) { + int predu = uleft_col[i] + uabove_row[j] - utop_left; + int predv = vleft_col[i] + vabove_row[j] - vtop_left; + int u_p, v_p; + + u_p = usrc_ptr[j]; + v_p = vsrc_ptr[j]; + + if (predu < 0) predu = 0; + + if (predu > 255) predu = 255; + + if (predv < 0) predv = 0; + + if (predv > 255) predv = 255; + + diff = u_p - expected_udc; + pred_error[DC_PRED] += diff * diff; + diff = v_p - expected_vdc; + pred_error[DC_PRED] += diff * diff; + + diff = u_p - uabove_row[j]; + pred_error[V_PRED] += diff * diff; + diff = v_p - vabove_row[j]; + pred_error[V_PRED] += diff * diff; + + diff = u_p - uleft_col[i]; + pred_error[H_PRED] += diff * diff; + diff = v_p - vleft_col[i]; + pred_error[H_PRED] += diff * diff; + + diff = u_p - predu; + pred_error[TM_PRED] += diff * diff; + diff = v_p - predv; + pred_error[TM_PRED] += diff * diff; + } + + usrc_ptr += uvsrc_stride; + vsrc_ptr += uvsrc_stride; + + if (i == 3) { + usrc_ptr = (mb->block[18].src + *mb->block[18].base_src); + vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src); + } + } + + for (i = DC_PRED; i <= TM_PRED; ++i) { + if (best_error > pred_error[i]) { + best_error = pred_error[i]; + best_mode = (MB_PREDICTION_MODE)i; + } + } + + assert(best_mode != MB_MODE_COUNT); + mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode; +} + +static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { + MACROBLOCKD *xd = &x->e_mbd; + /* Split MV modes currently not supported when RD is nopt enabled, + * therefore, only need to modify MVcount in NEWMV mode. */ + if (xd->mode_info_context->mbmi.mode == NEWMV) { + x->MVcount[0][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.row - + best_ref_mv->as_mv.row) >> + 1)]++; + x->MVcount[1][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.col - + best_ref_mv->as_mv.col) >> + 1)]++; + } +} + +#if CONFIG_MULTI_RES_ENCODING +static void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, + int *dissim, int *parent_ref_frame, + MB_PREDICTION_MODE *parent_mode, + int_mv *parent_ref_mv, int mb_row, + int mb_col) { + LOWER_RES_MB_INFO *store_mode_info = + ((LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info)->mb_info; + unsigned int parent_mb_index; + + /* Consider different down_sampling_factor. */ + { + /* TODO: Removed the loop that supports special down_sampling_factor + * such as 2, 4, 8. Will revisit it if needed. + * Should also try using a look-up table to see if it helps + * performance. */ + int parent_mb_row, parent_mb_col; + + parent_mb_row = mb_row * cpi->oxcf.mr_down_sampling_factor.den / + cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_col = mb_col * cpi->oxcf.mr_down_sampling_factor.den / + cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_index = parent_mb_row * cpi->mr_low_res_mb_cols + parent_mb_col; + } + + /* Read lower-resolution mode & motion result from memory.*/ + *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame; + *parent_mode = store_mode_info[parent_mb_index].mode; + *dissim = store_mode_info[parent_mb_index].dissim; + + /* For highest-resolution encoder, adjust dissim value. Lower its quality + * for good performance. */ + if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1)) + *dissim >>= 1; + + if (*parent_ref_frame != INTRA_FRAME) { + /* Consider different down_sampling_factor. + * The result can be rounded to be more precise, but it takes more time. + */ + (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row * + cpi->oxcf.mr_down_sampling_factor.num / + cpi->oxcf.mr_down_sampling_factor.den; + (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col * + cpi->oxcf.mr_down_sampling_factor.num / + cpi->oxcf.mr_down_sampling_factor.den; + + vp8_clamp_mv2(parent_ref_mv, xd); + } +} +#endif + +static void check_for_encode_breakout(unsigned int sse, MACROBLOCK *x) { + MACROBLOCKD *xd = &x->e_mbd; + + unsigned int threshold = + (xd->block[0].dequant[1] * xd->block[0].dequant[1] >> 4); + + if (threshold < x->encode_breakout) threshold = x->encode_breakout; + + if (sse < threshold) { + /* Check u and v to make sure skip is ok */ + unsigned int sse2 = 0; + + sse2 = VP8_UVSSE(x); + + if (sse2 * 2 < x->encode_breakout) { + x->skip = 1; + } else { + x->skip = 0; + } + } +} + +static int evaluate_inter_mode(unsigned int *sse, int rate2, int *distortion2, + VP8_COMP *cpi, MACROBLOCK *x, int rd_adj) { + MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; + int_mv mv = x->e_mbd.mode_info_context->mbmi.mv; + int this_rd; + int denoise_aggressive = 0; + /* Exit early and don't compute the distortion if this macroblock + * is marked inactive. */ + if (cpi->active_map_enabled && x->active_ptr[0] == 0) { + *sse = 0; + *distortion2 = 0; + x->skip = 1; + return INT_MAX; + } + + if ((this_mode != NEWMV) || !(cpi->sf.half_pixel_search) || + cpi->common.full_pixel == 1) { + *distortion2 = + vp8_get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], sse, mv); + } + + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2); + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + denoise_aggressive = + (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0; + } +#endif + + // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame. + // TODO: We should also add condition on distance of closest to current. + if (!cpi->oxcf.screen_content_mode && this_mode == ZEROMV && + x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME && + (denoise_aggressive || (cpi->closest_reference_frame == LAST_FRAME))) { + // No adjustment if block is considered to be skin area. + if (x->is_skin) rd_adj = 100; + + this_rd = (int)(((int64_t)this_rd) * rd_adj / 100); + } + + check_for_encode_breakout(*sse, x); + return this_rd; +} + +static void calculate_zeromv_rd_adjustment(VP8_COMP *cpi, MACROBLOCK *x, + int *rd_adjustment) { + MODE_INFO *mic = x->e_mbd.mode_info_context; + int_mv mv_l, mv_a, mv_al; + int local_motion_check = 0; + + if (cpi->lf_zeromv_pct > 40) { + /* left mb */ + mic -= 1; + mv_l = mic->mbmi.mv; + + if (mic->mbmi.ref_frame != INTRA_FRAME) { + if (abs(mv_l.as_mv.row) < 8 && abs(mv_l.as_mv.col) < 8) { + local_motion_check++; + } + } + + /* above-left mb */ + mic -= x->e_mbd.mode_info_stride; + mv_al = mic->mbmi.mv; + + if (mic->mbmi.ref_frame != INTRA_FRAME) { + if (abs(mv_al.as_mv.row) < 8 && abs(mv_al.as_mv.col) < 8) { + local_motion_check++; + } + } + + /* above mb */ + mic += 1; + mv_a = mic->mbmi.mv; + + if (mic->mbmi.ref_frame != INTRA_FRAME) { + if (abs(mv_a.as_mv.row) < 8 && abs(mv_a.as_mv.col) < 8) { + local_motion_check++; + } + } + + if (((!x->e_mbd.mb_to_top_edge || !x->e_mbd.mb_to_left_edge) && + local_motion_check > 0) || + local_motion_check > 2) { + *rd_adjustment = 80; + } else if (local_motion_check > 0) { + *rd_adjustment = 90; + } + } +} + +void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col) { + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO best_mbmode; + + int_mv best_ref_mv_sb[2] = { { 0 }, { 0 } }; + int_mv mode_mv_sb[2][MB_MODE_COUNT]; + int_mv best_ref_mv; + int_mv *mode_mv; + MB_PREDICTION_MODE this_mode; + int num00; + int mdcounts[4]; + int best_rd = INT_MAX; + int rd_adjustment = 100; + int best_intra_rd = INT_MAX; + int mode_index; + int rate; + int rate2; + int distortion2; + int bestsme = INT_MAX; + int best_mode_index = 0; + unsigned int sse = UINT_MAX, best_rd_sse = UINT_MAX; +#if CONFIG_TEMPORAL_DENOISING + unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX; +#endif + + int sf_improved_mv_pred = cpi->sf.improved_mv_pred; + +#if CONFIG_MULTI_RES_ENCODING + int dissim = INT_MAX; + int parent_ref_frame = 0; + int_mv parent_ref_mv; + MB_PREDICTION_MODE parent_mode = 0; + int parent_ref_valid = 0; +#endif + + int_mv mvp; + + int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + int saddone = 0; + /* search range got from mv_pred(). It uses step_param levels. (0-7) */ + int sr = 0; + + unsigned char *plane[4][3] = { { 0, 0 } }; + int ref_frame_map[4]; + int sign_bias = 0; + int dot_artifact_candidate = 0; + get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); + + // If the current frame is using LAST as a reference, check for + // biasing the mode selection for dot artifacts. + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + unsigned char *target_y = x->src.y_buffer; + unsigned char *target_u = x->block[16].src + *x->block[16].base_src; + unsigned char *target_v = x->block[20].src + *x->block[20].base_src; + int stride = x->src.y_stride; + int stride_uv = x->block[16].src_stride; +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; + target_y = + cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset; + stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride; + if (uv_denoise) { + target_u = cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + + recon_uvoffset; + target_v = cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + + recon_uvoffset; + stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride; + } + } +#endif + assert(plane[LAST_FRAME][0] != NULL); + dot_artifact_candidate = check_dot_artifact_candidate( + cpi, x, target_y, stride, plane[LAST_FRAME][0], mb_row, mb_col, 0); + // If not found in Y channel, check UV channel. + if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][1] != NULL); + dot_artifact_candidate = check_dot_artifact_candidate( + cpi, x, target_u, stride_uv, plane[LAST_FRAME][1], mb_row, mb_col, 1); + if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][2] != NULL); + dot_artifact_candidate = check_dot_artifact_candidate( + cpi, x, target_v, stride_uv, plane[LAST_FRAME][2], mb_row, mb_col, + 2); + } + } + } + +#if CONFIG_MULTI_RES_ENCODING + // |parent_ref_valid| will be set here if potentially we can do mv resue for + // this higher resol (|cpi->oxcf.mr_encoder_id| > 0) frame. + // |parent_ref_valid| may be reset depending on |parent_ref_frame| for + // the current macroblock below. + parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail; + if (parent_ref_valid) { + int parent_ref_flag; + + get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame, &parent_mode, + &parent_ref_mv, mb_row, mb_col); + + /* TODO(jkoleszar): The references available (ref_frame_flags) to the + * lower res encoder should match those available to this encoder, but + * there seems to be a situation where this mismatch can happen in the + * case of frame dropping and temporal layers. For example, + * GOLD being disallowed in ref_frame_flags, but being returned as + * parent_ref_frame. + * + * In this event, take the conservative approach of disabling the + * lower res info for this MB. + */ + + parent_ref_flag = 0; + // Note availability for mv reuse is only based on last and golden. + if (parent_ref_frame == LAST_FRAME) + parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME); + else if (parent_ref_frame == GOLDEN_FRAME) + parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME); + + // assert(!parent_ref_frame || parent_ref_flag); + + // If |parent_ref_frame| did not match either last or golden then + // shut off mv reuse. + if (parent_ref_frame && !parent_ref_flag) parent_ref_valid = 0; + + // Don't do mv reuse since we want to allow for another mode besides + // ZEROMV_LAST to remove dot artifact. + if (dot_artifact_candidate) parent_ref_valid = 0; + } +#endif + + // Check if current macroblock is in skin area. + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; + } +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + // Under aggressive denoising mode, should we use skin map to reduce + // denoiser + // and ZEROMV bias? Will need to revisit the accuracy of this detection for + // very noisy input. For now keep this as is (i.e., don't turn it off). + // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) + // x->is_skin = 0; + } +#endif + + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = 0; + memset(mode_mv_sb, 0, sizeof(mode_mv_sb)); + memset(&best_mbmode, 0, sizeof(best_mbmode)); + +/* Setup search priorities */ +#if CONFIG_MULTI_RES_ENCODING + if (parent_ref_valid && parent_ref_frame && dissim < 8) { + ref_frame_map[0] = -1; + ref_frame_map[1] = parent_ref_frame; + ref_frame_map[2] = -1; + ref_frame_map[3] = -1; + } else +#endif + get_reference_search_order(cpi, ref_frame_map); + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. + */ + if (ref_frame_map[1] > 0) { + sign_bias = vp8_find_near_mvs_bias( + &x->e_mbd, x->e_mbd.mode_info_context, mode_mv_sb, best_ref_mv_sb, + mdcounts, ref_frame_map[1], cpi->common.ref_frame_sign_bias); + + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int; + } + + /* Count of the number of MBs tested so far this frame */ + x->mbs_tested_so_far++; + + *returnintra = INT_MAX; + x->skip = 0; + + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + /* If the frame has big static background and current MB is in low + * motion area, its mode decision is biased to ZEROMV mode. + * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). + * At such speed settings, ZEROMV is already heavily favored. + */ + if (cpi->Speed < 12) { + calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); + } + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + rd_adjustment = (int)(rd_adjustment * + cpi->denoiser.denoise_pars.pickmode_mv_bias / 100); + } +#endif + + if (dot_artifact_candidate) { + // Bias against ZEROMV_LAST mode. + rd_adjustment = 150; + } + + /* if we encode a new mv this is important + * find the best new motion vector + */ + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { + int frame_cost; + int this_rd = INT_MAX; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; + + if (best_rd <= x->rd_threshes[mode_index]) continue; + + if (this_ref_frame < 0) continue; + + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + + /* everything but intra */ + if (x->e_mbd.mode_info_context->mbmi.ref_frame) { + x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; + x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; + x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; + + if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame]) { + sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame]; + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int; + } + +#if CONFIG_MULTI_RES_ENCODING + if (parent_ref_valid) { + if (vp8_mode_order[mode_index] == NEARESTMV && + mode_mv[NEARESTMV].as_int == 0) + continue; + if (vp8_mode_order[mode_index] == NEARMV && mode_mv[NEARMV].as_int == 0) + continue; + + if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV && + best_ref_mv.as_int == 0) + continue; + else if (vp8_mode_order[mode_index] == NEWMV && dissim == 0 && + best_ref_mv.as_int == parent_ref_mv.as_int) + continue; + } +#endif + } + + /* Check to see if the testing frequency for this mode is at its max + * If so then prevent it from being tested and increase the threshold + * for its testing */ + if (x->mode_test_hit_counts[mode_index] && + (cpi->mode_check_freq[mode_index] > 1)) { + if (x->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * + x->mode_test_hit_counts[mode_index])) { + /* Increase the threshold for coding this mode to make it less + * likely to be chosen */ + x->rd_thresh_mult[mode_index] += 4; + + if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT) { + x->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + } + + x->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + x->rd_thresh_mult[mode_index]; + continue; + } + } + + /* We have now reached the point where we are going to test the current + * mode so increment the counter for the number of times it has been + * tested */ + x->mode_test_hit_counts[mode_index]++; + + rate2 = 0; + distortion2 = 0; + + this_mode = vp8_mode_order[mode_index]; + + x->e_mbd.mode_info_context->mbmi.mode = this_mode; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + + /* Work out the cost assosciated with selecting the reference frame */ + frame_cost = x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; + rate2 += frame_cost; + + /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + * unless ARNR filtering is enabled in which case we want + * an unfiltered alternative */ + if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + if (this_mode != ZEROMV || + x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) { + continue; + } + } + + switch (this_mode) { + case B_PRED: + /* Pass best so far to pick_intra4x4mby_modes to use as breakout */ + distortion2 = best_rd_sse; + pick_intra4x4mby_modes(x, &rate, &distortion2); + + if (distortion2 == INT_MAX) { + this_rd = INT_MAX; + } else { + rate2 += rate; + distortion2 = vpx_variance16x16(*(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + } + + break; + + case SPLITMV: + + /* Split MV modes currently not supported when RD is not enabled. */ + break; + + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: + vp8_build_intra_predictors_mby_s( + xd, xd->dst.y_buffer - xd->dst.y_stride, xd->dst.y_buffer - 1, + xd->dst.y_stride, xd->predictor, 16); + distortion2 = vpx_variance16x16(*(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + rate2 += x->mbmode_cost[x->e_mbd.frame_type] + [x->e_mbd.mode_info_context->mbmi.mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + break; + + case NEWMV: { + int thissme; + int step_param; + int further_steps; + int n = 0; + int sadpb = x->sadperbit16; + int_mv mvp_full; + + int col_min = ((best_ref_mv.as_mv.col + 7) >> 3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row + 7) >> 3) - MAX_FULL_PEL_VAL; + int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8) ? 3 : 2) : 1; + + /* Further step/diamond searches as necessary */ + step_param = cpi->sf.first_step + speed_adjust; + +#if CONFIG_MULTI_RES_ENCODING + /* If lower-res frame is not available for mv reuse (because of + frame dropping or different temporal layer pattern), then higher + resol encoder does motion search without any previous knowledge. + Also, since last frame motion info is not stored, then we can not + use improved_mv_pred. */ + if (cpi->oxcf.mr_encoder_id) sf_improved_mv_pred = 0; + + // Only use parent MV as predictor if this candidate reference frame + // (|this_ref_frame|) is equal to |parent_ref_frame|. + if (parent_ref_valid && (parent_ref_frame == this_ref_frame)) { + /* Use parent MV as predictor. Adjust search range + * accordingly. + */ + mvp.as_int = parent_ref_mv.as_int; + mvp_full.as_mv.col = parent_ref_mv.as_mv.col >> 3; + mvp_full.as_mv.row = parent_ref_mv.as_mv.row >> 3; + + if (dissim <= 32) + step_param += 3; + else if (dissim <= 128) + step_param += 2; + else + step_param += 1; + } else +#endif + { + if (sf_improved_mv_pred) { + if (!saddone) { + vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]); + saddone = 1; + } + + vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp, + x->e_mbd.mode_info_context->mbmi.ref_frame, + cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]); + + sr += speed_adjust; + /* adjust search range according to sr from mv prediction */ + if (sr > step_param) step_param = sr; + + mvp_full.as_mv.col = mvp.as_mv.col >> 3; + mvp_full.as_mv.row = mvp.as_mv.row >> 3; + } else { + mvp.as_int = best_ref_mv.as_int; + mvp_full.as_mv.col = best_ref_mv.as_mv.col >> 3; + mvp_full.as_mv.row = best_ref_mv.as_mv.row >> 3; + } + } + +#if CONFIG_MULTI_RES_ENCODING + if (parent_ref_valid && (parent_ref_frame == this_ref_frame) && + dissim <= 2 && + VPXMAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row), + abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4) { + d->bmi.mv.as_int = mvp_full.as_int; + mode_mv[NEWMV].as_int = mvp_full.as_int; + + cpi->find_fractional_mv_step( + x, b, d, &d->bmi.mv, &best_ref_mv, x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse); + } else +#endif + { + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. */ + if (x->mv_col_min < col_min) x->mv_col_min = col_min; + if (x->mv_col_max > col_max) x->mv_col_max = col_max; + if (x->mv_row_min < row_min) x->mv_row_min = row_min; + if (x->mv_row_max > row_max) x->mv_row_max = row_max; + + further_steps = + (cpi->Speed >= 8) + ? 0 + : (cpi->sf.max_step_search_steps - 1 - step_param); + + if (cpi->sf.search_method == HEX) { +#if CONFIG_MULTI_RES_ENCODING + /* TODO: In higher-res pick_inter_mode, step_param is used to + * modify hex search range. Here, set step_param to 0 not to + * change the behavior in lowest-resolution encoder. + * Will improve it later. + */ + /* Set step_param to 0 to ensure large-range motion search + * when mv reuse if not valid (i.e. |parent_ref_valid| = 0), + * or if this candidate reference frame (|this_ref_frame|) is + * not equal to |parent_ref_frame|. + */ + if (!parent_ref_valid || (parent_ref_frame != this_ref_frame)) + step_param = 0; +#endif + bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, + sadpb, &cpi->fn_ptr[BLOCK_16X16], + x->mvsadcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } else { + bestsme = cpi->diamond_search_sad( + x, b, d, &mvp_full, &d->bmi.mv, step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + /* Further step/diamond searches as necessary */ + n = num00; + num00 = 0; + + while (n < further_steps) { + n++; + + if (num00) { + num00--; + } else { + thissme = cpi->diamond_search_sad( + x, b, d, &mvp_full, &d->bmi.mv, step_param + n, sadpb, + &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } else { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } + } + } + } + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + cpi->find_fractional_mv_step( + x, b, d, &d->bmi.mv, &best_ref_mv, x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse); + } + } + + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + // The clamp below is not necessary from the perspective + // of VP8 bitstream, but is added to improve ChromeCast + // mirroring's robustness. Please do not remove. + vp8_clamp_mv2(&mode_mv[this_mode], xd); + /* mv cost; */ + rate2 += + vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); + } + // fall through + + case NEARESTMV: + case NEARMV: + if (mode_mv[this_mode].as_int == 0) continue; + // fall through + + case ZEROMV: + + /* Trap vectors that reach beyond the UMV borders + * Note that ALL New MV, Nearest MV Near MV and Zero MV code drops + * through to this point because of the lack of break statements + * in the previous two cases. + */ + if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + continue; + } + + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + x->e_mbd.mode_info_context->mbmi.mv.as_int = mode_mv[this_mode].as_int; + this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x, + rd_adjustment); + + break; + default: break; + } + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + /* Store for later use by denoiser. */ + // Don't denoise with GOLDEN OR ALTREF is they are old reference + // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). + int skip_old_reference = ((this_ref_frame != LAST_FRAME) && + (cpi->common.current_video_frame - + cpi->current_ref_frames[this_ref_frame] > + MAX_GF_ARF_DENOISE_RANGE)) + ? 1 + : 0; + if (this_mode == ZEROMV && sse < zero_mv_sse && !skip_old_reference) { + zero_mv_sse = sse; + x->best_zeromv_reference_frame = + x->e_mbd.mode_info_context->mbmi.ref_frame; + } + + // Store the best NEWMV in x for later use in the denoiser. + if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV && sse < best_sse && + !skip_old_reference) { + best_sse = sse; + x->best_sse_inter_mode = NEWMV; + x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv; + x->need_to_clamp_best_mvs = + x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs; + x->best_reference_frame = x->e_mbd.mode_info_context->mbmi.ref_frame; + } + } +#endif + + if (this_rd < best_rd || x->skip) { + /* Note index of best mode */ + best_mode_index = mode_index; + + *returnrate = rate2; + *returndistortion = distortion2; + best_rd_sse = sse; + best_rd = this_rd; + memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, + sizeof(MB_MODE_INFO)); + + /* Testing this mode gave rise to an improvement in best error + * score. Lower threshold a bit for next time + */ + x->rd_thresh_mult[mode_index] = + (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) + ? x->rd_thresh_mult[mode_index] - 2 + : MIN_THRESHMULT; + x->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * + x->rd_thresh_mult[mode_index]; + } + + /* If the mode did not help improve the best error case then raise the + * threshold for testing that mode next time around. + */ + else { + x->rd_thresh_mult[mode_index] += 4; + + if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT) { + x->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + } + + x->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * + x->rd_thresh_mult[mode_index]; + } + + if (x->skip) break; + } + + /* Reduce the activation RD thresholds for the best choice mode */ + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && + (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) { + int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 3); + + x->rd_thresh_mult[best_mode_index] = + (x->rd_thresh_mult[best_mode_index] >= + (MIN_THRESHMULT + best_adjustment)) + ? x->rd_thresh_mult[best_mode_index] - best_adjustment + : MIN_THRESHMULT; + x->rd_threshes[best_mode_index] = + (cpi->rd_baseline_thresh[best_mode_index] >> 7) * + x->rd_thresh_mult[best_mode_index]; + } + + { + int this_rdbin = (*returndistortion >> 7); + + if (this_rdbin >= 1024) { + this_rdbin = 1023; + } + + x->error_bins[this_rdbin]++; + } + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + int reevaluate = 0; + int is_noisy = 0; + if (x->best_sse_inter_mode == DC_PRED) { + /* No best MV found. */ + x->best_sse_inter_mode = best_mbmode.mode; + x->best_sse_mv = best_mbmode.mv; + x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs; + x->best_reference_frame = best_mbmode.ref_frame; + best_sse = best_rd_sse; + } + // For non-skin blocks that have selected ZEROMV for this current frame, + // and have been selecting ZEROMV_LAST (on the base layer frame) at + // least |x~20| consecutive past frames in a row, label the block for + // possible increase in denoising strength. We also condition this + // labeling on there being significant denoising in the scene + if (cpi->oxcf.noise_sensitivity == 4) { + if (cpi->denoiser.nmse_source_diff > + 70 * cpi->denoiser.threshold_aggressive_mode / 100) { + is_noisy = 1; + } + } else { + if (cpi->mse_source_denoised > 1000) is_noisy = 1; + } + x->increase_denoising = 0; + if (!x->is_skin && x->best_sse_inter_mode == ZEROMV && + (x->best_reference_frame == LAST_FRAME || + x->best_reference_frame == cpi->closest_reference_frame) && + cpi->consec_zero_last[block_index] >= 20 && is_noisy) { + x->increase_denoising = 1; + } + x->denoise_zeromv = 0; + vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse, + recon_yoffset, recon_uvoffset, &cpi->common.lf_info, + mb_row, mb_col, block_index, + cpi->consec_zero_last_mvbias[block_index]); + + // Reevaluate ZEROMV after denoising: for large noise content + // (i.e., cpi->mse_source_denoised is above threshold), do this for all + // blocks that did not pick ZEROMV as best mode but are using ZEROMV + // for denoising. Otherwise, always re-evaluate for blocks that picked + // INTRA mode as best mode. + // Avoid blocks that have been biased against ZERO_LAST + // (i.e., dot artifact candidate blocks). + reevaluate = (best_mbmode.ref_frame == INTRA_FRAME) || + (best_mbmode.mode != ZEROMV && x->denoise_zeromv && + cpi->mse_source_denoised > 2000); + if (!dot_artifact_candidate && reevaluate && + x->best_zeromv_reference_frame != INTRA_FRAME) { + int this_rd = 0; + int this_ref_frame = x->best_zeromv_reference_frame; + rd_adjustment = 100; + rate2 = + x->ref_frame_cost[this_ref_frame] + vp8_cost_mv_ref(ZEROMV, mdcounts); + distortion2 = 0; + + /* set up the proper prediction buffers for the frame */ + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; + x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; + x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; + + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + this_rd = + evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x, rd_adjustment); + + if (this_rd < best_rd) { + memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, + sizeof(MB_MODE_INFO)); + } + } + } +#endif + + if (cpi->is_src_frame_alt_ref && + (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = + (cpi->common.mb_no_coeff_skip); + x->e_mbd.mode_info_context->mbmi.partitioning = 0; + + return; + } + + /* set to the best mb mode, this copy can be skip if x->skip since it + * already has the right content */ + if (!x->skip) { + memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, + sizeof(MB_MODE_INFO)); + } + + if (best_mbmode.mode <= B_PRED) { + /* set mode_info_context->mbmi.uv_mode */ + pick_intra_mbuv_mode(x); + } + + if (sign_bias != + cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) { + best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int; + } + + update_mvcount(x, &best_ref_mv); +} + +void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) { + int error4x4, error16x16 = INT_MAX; + int rate_, best_rate = 0, distortion, best_sse; + MB_PREDICTION_MODE mode, best_mode = DC_PRED; + int this_rd; + unsigned int sse; + BLOCK *b = &x->block[0]; + MACROBLOCKD *xd = &x->e_mbd; + + xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + pick_intra_mbuv_mode(x); + + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + xd->mode_info_context->mbmi.mode = mode; + vp8_build_intra_predictors_mby_s(xd, xd->dst.y_buffer - xd->dst.y_stride, + xd->dst.y_buffer - 1, xd->dst.y_stride, + xd->predictor, 16); + distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor, + 16, &sse); + rate_ = x->mbmode_cost[xd->frame_type][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion); + + if (error16x16 > this_rd) { + error16x16 = this_rd; + best_mode = mode; + best_sse = sse; + best_rate = rate_; + } + } + xd->mode_info_context->mbmi.mode = best_mode; + + error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse); + if (error4x4 < error16x16) { + xd->mode_info_context->mbmi.mode = B_PRED; + best_rate = rate_; + } + + *rate = best_rate; +} diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.h b/media/libvpx/libvpx/vp8/encoder/pickinter.h new file mode 100644 index 0000000000..392fb41593 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/pickinter.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_PICKINTER_H_ +#define VPX_VP8_ENCODER_PICKINTER_H_ +#include "vpx_config.h" +#include "vp8/common/onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, + int mb_row, int mb_col); +extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate); + +extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, + const vp8_variance_fn_ptr_t *vfp, + unsigned int *sse, int_mv this_mv); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_PICKINTER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.c b/media/libvpx/libvpx/vp8/encoder/picklpf.c new file mode 100644 index 0000000000..387ac9788b --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/picklpf.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp8/common/onyxc_int.h" +#include "onyx_int.h" +#include "vp8/encoder/picklpf.h" +#include "vp8/encoder/quantize.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" +#include "vp8/common/alloccommon.h" +#include "vp8/common/loopfilter.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif + +extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest); + +static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + unsigned char *src_y, *dst_y; + int yheight; + int ystride; + int yoffset; + int linestocopy; + + yheight = src_ybc->y_height; + ystride = src_ybc->y_stride; + + /* number of MB rows to use in partial filtering */ + linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Copy extra 4 so that full filter context is available if filtering done + * on the copied partial frame and not original. Partial filter does mb + * filtering for top row also, which can modify3 pixels above. + */ + linestocopy += 4; + /* partial image starts at ~middle of frame (macroblock border)*/ + yoffset = ystride * (((yheight >> 5) * 16) - 4); + src_y = src_ybc->y_buffer + yoffset; + dst_y = dst_ybc->y_buffer + yoffset; + + memcpy(dst_y, src_y, ystride * linestocopy); +} + +static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest) { + int i, j; + int Total = 0; + int srcoffset, dstoffset; + unsigned char *src = source->y_buffer; + unsigned char *dst = dest->y_buffer; + + int linestocopy; + + /* number of MB rows to use in partial filtering */ + linestocopy = (source->y_height >> 4) / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* partial image starts at ~middle of frame (macroblock border)*/ + srcoffset = source->y_stride * ((dest->y_height >> 5) * 16); + dstoffset = dest->y_stride * ((dest->y_height >> 5) * 16); + + src += srcoffset; + dst += dstoffset; + + /* Loop through the Y plane raw and reconstruction data summing + * (square differences) + */ + for (i = 0; i < linestocopy; i += 16) { + for (j = 0; j < source->y_width; j += 16) { + unsigned int sse; + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, + &sse); + } + + src += 16 * source->y_stride; + dst += 16 * dest->y_stride; + } + + return Total; +} + +/* Enforce a minimum filter level based upon baseline Q */ +static int get_min_filter_level(VP8_COMP *cpi, int base_qindex) { + int min_filter_level; + + if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && + !cpi->common.refresh_alt_ref_frame) { + min_filter_level = 0; + } else { + if (base_qindex <= 6) { + min_filter_level = 0; + } else if (base_qindex <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (base_qindex / 8); + } + } + + return min_filter_level; +} + +/* Enforce a maximum filter level based upon baseline Q */ +static int get_max_filter_level(VP8_COMP *cpi, int base_qindex) { + /* PGW August 2006: Highest filter values almost always a bad idea */ + + /* jbb chg: 20100118 - not so any more with this overquant stuff allow + * high values with lots of intra coming in. + */ + int max_filter_level = MAX_LOOP_FILTER; + (void)base_qindex; + + if (cpi->twopass.section_intra_rating > 8) { + max_filter_level = MAX_LOOP_FILTER * 3 / 4; + } + + return max_filter_level; +} + +void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + int best_err = 0; + int filt_err = 0; + int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + int filt_val; + int best_filt_val; + YV12_BUFFER_CONFIG *saved_frame = cm->frame_to_show; + + /* Replace unfiltered frame buffer with a new one */ + cm->frame_to_show = &cpi->pick_lf_lvl_frame; + + if (cm->frame_type == KEY_FRAME) { + cm->sharpness_level = 0; + } else { + cm->sharpness_level = cpi->oxcf.Sharpness; + } + + if (cm->sharpness_level != cm->last_sharpness_level) { + vp8_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + } + + /* Start the search at the previous frame filter level unless it is + * now out of range. + */ + if (cm->filter_level < min_filter_level) { + cm->filter_level = min_filter_level; + } else if (cm->filter_level > max_filter_level) { + cm->filter_level = max_filter_level; + } + + filt_val = cm->filter_level; + best_filt_val = filt_val; + + /* Get the err using the previous frame's filter value. */ + + /* Copy the unfiltered / processed recon buffer to the new buffer */ + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); + + best_err = calc_partial_ssl_err(sd, cm->frame_to_show); + + filt_val -= 1 + (filt_val > 10); + + /* Search lower filter levels */ + while (filt_val >= min_filter_level) { + /* Apply the loop filter */ + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); + + /* Get the err for filtered frame */ + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show); + + /* Update the best case record or exit loop. */ + if (filt_err < best_err) { + best_err = filt_err; + best_filt_val = filt_val; + } else { + break; + } + + /* Adjust filter level */ + filt_val -= 1 + (filt_val > 10); + } + + /* Search up (note that we have already done filt_val = cm->filter_level) */ + filt_val = cm->filter_level + 1 + (filt_val > 10); + + if (best_filt_val == cm->filter_level) { + /* Resist raising filter level for very small gains */ + best_err -= (best_err >> 10); + + while (filt_val < max_filter_level) { + /* Apply the loop filter */ + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + + vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); + + /* Get the err for filtered frame */ + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show); + + /* Update the best case record or exit loop. */ + if (filt_err < best_err) { + /* Do not raise filter level if improvement is < 1 part + * in 4096 + */ + best_err = filt_err - (filt_err >> 10); + + best_filt_val = filt_val; + } else { + break; + } + + /* Adjust filter level */ + filt_val += 1 + (filt_val > 10); + } + } + + cm->filter_level = best_filt_val; + + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; +} + +/* Stub function for now Alt LF not used */ +void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val) { + MACROBLOCKD *mbd = &cpi->mb.e_mbd; + (void)filt_val; + + mbd->segment_feature_data[MB_LVL_ALT_LF][0] = + cpi->segment_feature_data[MB_LVL_ALT_LF][0]; + mbd->segment_feature_data[MB_LVL_ALT_LF][1] = + cpi->segment_feature_data[MB_LVL_ALT_LF][1]; + mbd->segment_feature_data[MB_LVL_ALT_LF][2] = + cpi->segment_feature_data[MB_LVL_ALT_LF][2]; + mbd->segment_feature_data[MB_LVL_ALT_LF][3] = + cpi->segment_feature_data[MB_LVL_ALT_LF][3]; +} + +void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + int best_err = 0; + int filt_err = 0; + int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + + int filter_step; + int filt_high = 0; + int filt_mid; + int filt_low = 0; + int filt_best; + int filt_direction = 0; + + /* Bias against raising loop filter and in favor of lowering it */ + int Bias = 0; + + int ss_err[MAX_LOOP_FILTER + 1]; + + YV12_BUFFER_CONFIG *saved_frame = cm->frame_to_show; + + memset(ss_err, 0, sizeof(ss_err)); + + /* Replace unfiltered frame buffer with a new one */ + cm->frame_to_show = &cpi->pick_lf_lvl_frame; + + if (cm->frame_type == KEY_FRAME) { + cm->sharpness_level = 0; + } else { + cm->sharpness_level = cpi->oxcf.Sharpness; + } + + /* Start the search at the previous frame filter level unless it is + * now out of range. + */ + filt_mid = cm->filter_level; + + if (filt_mid < min_filter_level) { + filt_mid = min_filter_level; + } else if (filt_mid > max_filter_level) { + filt_mid = max_filter_level; + } + + /* Define the initial step size */ + filter_step = (filt_mid < 16) ? 4 : filt_mid / 4; + + /* Get baseline error score */ + + /* Copy the unfiltered / processed recon buffer to the new buffer */ + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); + + vp8cx_set_alt_lf_level(cpi, filt_mid); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); + + best_err = vp8_calc_ss_err(sd, cm->frame_to_show); + + ss_err[filt_mid] = best_err; + + filt_best = filt_mid; + + while (filter_step > 0) { + Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if (cpi->twopass.section_intra_rating < 20) { + Bias = Bias * cpi->twopass.section_intra_rating / 20; + } + + filt_high = ((filt_mid + filter_step) > max_filter_level) + ? max_filter_level + : (filt_mid + filter_step); + filt_low = ((filt_mid - filter_step) < min_filter_level) + ? min_filter_level + : (filt_mid - filter_step); + + if ((filt_direction <= 0) && (filt_low != filt_mid)) { + if (ss_err[filt_low] == 0) { + /* Get Low filter error score */ + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_low); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); + ss_err[filt_low] = filt_err; + } else { + filt_err = ss_err[filt_low]; + } + + /* If value is close to the best so far then bias towards a + * lower loop filter value. + */ + if ((filt_err - Bias) < best_err) { + /* Was it actually better than the previous best? */ + if (filt_err < best_err) best_err = filt_err; + + filt_best = filt_low; + } + } + + /* Now look at filt_high */ + if ((filt_direction >= 0) && (filt_high != filt_mid)) { + if (ss_err[filt_high] == 0) { + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); + vp8cx_set_alt_lf_level(cpi, filt_high); + vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); + + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); + ss_err[filt_high] = filt_err; + } else { + filt_err = ss_err[filt_high]; + } + + /* Was it better than the previous best? */ + if (filt_err < (best_err - Bias)) { + best_err = filt_err; + filt_best = filt_high; + } + } + + /* Half the step distance if the best filter value was the same + * as last time + */ + if (filt_best == filt_mid) { + filter_step = filter_step / 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + cm->filter_level = filt_best; + + /* restore unfiltered frame pointer */ + cm->frame_to_show = saved_frame; +} diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.h b/media/libvpx/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 0000000000..03597e5427 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_PICKLPF_H_ +#define VPX_VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_PICKLPF_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/quantize.h b/media/libvpx/libvpx/vp8/encoder/quantize.h new file mode 100644 index 0000000000..78746c0c20 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/quantize.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_QUANTIZE_H_ +#define VPX_VP8_ENCODER_QUANTIZE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; +extern void vp8_quantize_mb(struct macroblock *x); +extern void vp8_quantize_mby(struct macroblock *x); +extern void vp8_quantize_mbuv(struct macroblock *x); +extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q); +extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi); +extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x); +extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, + int ok_to_skip); +extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c new file mode 100644 index 0000000000..fcd4eb04eb --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c @@ -0,0 +1,1591 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "math.h" +#include "vp8/common/common.h" +#include "ratectrl.h" +#include "vp8/common/entropymode.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/systemdependent.h" +#include "encodemv.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/system_state.h" + +#define MIN_BPB_FACTOR 0.01 +#define MAX_BPB_FACTOR 50 + +extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; + +#ifdef MODE_STATS +extern int y_modes[5]; +extern int uv_modes[4]; +extern int b_modes[10]; + +extern int inter_y_modes[10]; +extern int inter_uv_modes[4]; +extern int inter_b_modes[10]; +#endif + +/* Bits Per MB at different Q (Multiplied by 512) */ +#define BPER_MB_NORMBITS 9 + +/* Work in progress recalibration of baseline rate tables based on + * the assumption that bits per mb is inversely proportional to the + * quantizer value. + */ +const int vp8_bits_per_mb[2][QINDEX_RANGE] = { + /* Intra case 450000/Qintra */ + { + 1125000, 900000, 750000, 642857, 562500, 500000, 450000, 450000, 409090, + 375000, 346153, 321428, 300000, 281250, 264705, 264705, 250000, 236842, + 225000, 225000, 214285, 214285, 204545, 204545, 195652, 195652, 187500, + 180000, 180000, 173076, 166666, 160714, 155172, 150000, 145161, 140625, + 136363, 132352, 128571, 125000, 121621, 121621, 118421, 115384, 112500, + 109756, 107142, 104651, 102272, 100000, 97826, 97826, 95744, 93750, + 91836, 90000, 88235, 86538, 84905, 83333, 81818, 80357, 78947, + 77586, 76271, 75000, 73770, 72580, 71428, 70312, 69230, 68181, + 67164, 66176, 65217, 64285, 63380, 62500, 61643, 60810, 60000, + 59210, 59210, 58441, 57692, 56962, 56250, 55555, 54878, 54216, + 53571, 52941, 52325, 51724, 51136, 50561, 49450, 48387, 47368, + 46875, 45918, 45000, 44554, 44117, 43269, 42452, 41666, 40909, + 40178, 39473, 38793, 38135, 36885, 36290, 35714, 35156, 34615, + 34090, 33582, 33088, 32608, 32142, 31468, 31034, 30405, 29801, + 29220, 28662, + }, + /* Inter case 285000/Qinter */ + { + 712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090, 237500, + 219230, 203571, 190000, 178125, 167647, 158333, 150000, 142500, 135714, + 129545, 123913, 118750, 114000, 109615, 105555, 101785, 98275, 95000, + 91935, 89062, 86363, 83823, 81428, 79166, 77027, 75000, 73076, + 71250, 69512, 67857, 66279, 64772, 63333, 61956, 60638, 59375, + 58163, 57000, 55882, 54807, 53773, 52777, 51818, 50892, 50000, + 49137, 47500, 45967, 44531, 43181, 41911, 40714, 39583, 38513, + 37500, 36538, 35625, 34756, 33928, 33139, 32386, 31666, 30978, + 30319, 29687, 29081, 28500, 27941, 27403, 26886, 26388, 25909, + 25446, 25000, 24568, 23949, 23360, 22800, 22265, 21755, 21268, + 20802, 20357, 19930, 19520, 19127, 18750, 18387, 18037, 17701, + 17378, 17065, 16764, 16473, 16101, 15745, 15405, 15079, 14766, + 14467, 14179, 13902, 13636, 13380, 13133, 12895, 12666, 12445, + 12179, 11924, 11632, 11445, 11220, 11003, 10795, 10594, 10401, + 10215, 10035, + } +}; + +static const int kf_boost_qadjustment[QINDEX_RANGE] = { + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, + 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 200, 201, + 201, 202, 203, 203, 203, 204, 204, 205, 205, 206, 206, 207, 207, 208, 208, + 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, + 216, 217, 217, 218, 218, 219, 219, 220, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 220, 220, 220, 220, +}; + +/* #define GFQ_ADJUSTMENT (Q+100) */ +#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q] +const int vp8_gf_boost_qadjustment[QINDEX_RANGE] = { + 80, 82, 84, 86, 88, 90, 92, 94, 96, 97, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, + 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 182, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188, + 188, 189, 189, 190, 190, 191, 191, 192, 192, 193, 193, 194, 194, 194, 194, + 195, 195, 196, 196, 197, 197, 198, 198 +}; + +/* +const int vp8_gf_boost_qadjustment[QINDEX_RANGE] = +{ + 100,101,102,103,104,105,105,106, + 106,107,107,108,109,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,170,171,171,172,172, + 173,173,173,174,174,174,175,175, + 175,176,176,176,177,177,177,177, + 178,178,179,179,180,180,181,181, + 182,182,183,183,184,184,185,185, + 186,186,187,187,188,188,189,189, + 190,190,191,191,192,192,193,193, +}; +*/ + +static const int kf_gf_boost_qlimits[QINDEX_RANGE] = { + 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, + 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, + 300, 305, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, + 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, + 590, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, + 600, 600, 600, 600, 600, 600, 600, 600, +}; + +static const int gf_adjust_table[101] = { + 100, 115, 130, 145, 160, 175, 190, 200, 210, 220, 230, 240, 260, 270, 280, + 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, + 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, +}; + +static const int gf_intra_usage_adjustment[20] = { + 125, 120, 115, 110, 105, 100, 95, 85, 80, 75, + 70, 65, 60, 55, 50, 50, 50, 50, 50, 50, +}; + +static const int gf_interval_table[101] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, +}; + +static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, + 4, 5 }; + +void vp8_save_coding_context(VP8_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + + /* Stores a snapshot of key state variables which can subsequently be + * restored with a call to vp8_restore_coding_context. These functions are + * intended for use in a re-code loop in vp8_compress_frame where the + * quantizer value is adjusted between loop iterations. + */ + + cc->frames_since_key = cpi->frames_since_key; + cc->filter_level = cpi->common.filter_level; + cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due; + cc->frames_since_golden = cpi->frames_since_golden; + + vp8_copy(cc->mvc, cpi->common.fc.mvc); + vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts); + + vp8_copy(cc->ymode_prob, cpi->common.fc.ymode_prob); + vp8_copy(cc->uv_mode_prob, cpi->common.fc.uv_mode_prob); + + vp8_copy(cc->ymode_count, cpi->mb.ymode_count); + vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count); + +/* Stats */ +#ifdef MODE_STATS + vp8_copy(cc->y_modes, y_modes); + vp8_copy(cc->uv_modes, uv_modes); + vp8_copy(cc->b_modes, b_modes); + vp8_copy(cc->inter_y_modes, inter_y_modes); + vp8_copy(cc->inter_uv_modes, inter_uv_modes); + vp8_copy(cc->inter_b_modes, inter_b_modes); +#endif + + cc->this_frame_percent_intra = cpi->this_frame_percent_intra; +} + +void vp8_restore_coding_context(VP8_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + + /* Restore key state variables to the snapshot state stored in the + * previous call to vp8_save_coding_context. + */ + + cpi->frames_since_key = cc->frames_since_key; + cpi->common.filter_level = cc->filter_level; + cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due; + cpi->frames_since_golden = cc->frames_since_golden; + + vp8_copy(cpi->common.fc.mvc, cc->mvc); + + vp8_copy(cpi->rd_costs.mvcosts, cc->mvcosts); + + vp8_copy(cpi->common.fc.ymode_prob, cc->ymode_prob); + vp8_copy(cpi->common.fc.uv_mode_prob, cc->uv_mode_prob); + + vp8_copy(cpi->mb.ymode_count, cc->ymode_count); + vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count); + +/* Stats */ +#ifdef MODE_STATS + vp8_copy(y_modes, cc->y_modes); + vp8_copy(uv_modes, cc->uv_modes); + vp8_copy(b_modes, cc->b_modes); + vp8_copy(inter_y_modes, cc->inter_y_modes); + vp8_copy(inter_uv_modes, cc->inter_uv_modes); + vp8_copy(inter_b_modes, cc->inter_b_modes); +#endif + + cpi->this_frame_percent_intra = cc->this_frame_percent_intra; +} + +void vp8_setup_key_frame(VP8_COMP *cpi) { + /* Setup for Key frame: */ + + vp8_default_coef_probs(&cpi->common); + + memcpy(cpi->common.fc.mvc, vp8_default_mv_context, + sizeof(vp8_default_mv_context)); + { + int flag[2] = { 1, 1 }; + vp8_build_component_cost_table( + cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flag); + } + + /* Make sure we initialize separate contexts for altref,gold, and normal. + * TODO shouldn't need 3 different copies of structure to do this! + */ + memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); + memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc)); + memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc)); + + cpi->common.filter_level = cpi->common.base_qindex * 3 / 8; + + /* Provisional interval before next GF */ + if (cpi->auto_gold) { + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } else { + cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL; + } + + cpi->common.refresh_golden_frame = 1; + cpi->common.refresh_alt_ref_frame = 1; +} + +static int estimate_bits_at_q(int frame_kind, int Q, int MBs, + double correction_factor) { + int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]); + + /* Attempt to retain reasonable accuracy without overflow. The cutoff is + * chosen such that the maximum product of Bpm and MBs fits 31 bits. The + * largest Bpm takes 20 bits. + */ + if (MBs > (1 << 11)) { + return (Bpm >> BPER_MB_NORMBITS) * MBs; + } else { + return (Bpm * MBs) >> BPER_MB_NORMBITS; + } +} + +static void calc_iframe_target_size(VP8_COMP *cpi) { + /* boost defaults to half second */ + int kf_boost; + uint64_t target; + + /* Clear down mmx registers to allow floating point in what follows */ + vpx_clear_system_state(); + + if (cpi->oxcf.fixed_q >= 0) { + int Q = cpi->oxcf.key_q; + + target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs, + cpi->key_frame_rate_correction_factor); + } else if (cpi->pass == 2) { + /* New Two pass RC */ + target = cpi->per_frame_bandwidth; + } + /* First Frame is a special case */ + else if (cpi->common.current_video_frame == 0) { + /* 1 Pass there is no information on which to base size so use + * bandwidth per second * fraction of the initial buffer + * level + */ + target = (uint64_t)cpi->oxcf.starting_buffer_level / 2; + + if (target > cpi->oxcf.target_bandwidth * 3 / 2) { + target = cpi->oxcf.target_bandwidth * 3 / 2; + } + } else { + /* if this keyframe was forced, use a more recent Q estimate */ + int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY) ? cpi->avg_frame_qindex + : cpi->ni_av_qi; + + int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ + /* Boost depends somewhat on frame rate: only used for 1 layer case. */ + if (cpi->oxcf.number_of_layers == 1) { + kf_boost = + VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16)); + } else { + /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ + kf_boost = initial_boost; + } + + /* adjustment up based on q: this factor ranges from ~1.2 to 2.2. */ + kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100; + + /* frame separation adjustment ( down) */ + if (cpi->frames_since_key < cpi->output_framerate / 2) { + kf_boost = + (int)(kf_boost * cpi->frames_since_key / (cpi->output_framerate / 2)); + } + + /* Minimal target size is |2* per_frame_bandwidth|. */ + if (kf_boost < 16) kf_boost = 16; + + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); + } + + if (cpi->oxcf.rc_max_intra_bitrate_pct) { + unsigned int max_rate; + // This product may overflow unsigned int + uint64_t product = cpi->per_frame_bandwidth; + product *= cpi->oxcf.rc_max_intra_bitrate_pct; + product /= 100; + max_rate = (unsigned int)VPXMIN(INT_MAX, product); + + if (target > max_rate) target = max_rate; + } + + cpi->this_frame_target = (int)target; + + /* TODO: if we separate rate targeting from Q targeting, move this. + * Reset the active worst quality to the baseline value for key frames. + */ + if (cpi->pass != 2) cpi->active_worst_quality = cpi->worst_quality; + +#if 0 + { + FILE *f; + + f = fopen("kf_boost.stt", "a"); + fprintf(f, " %8u %10d %10d %10d\n", + cpi->common.current_video_frame, cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending); + + fclose(f); + } +#endif +} + +/* Do the best we can to define the parameters for the next GF based on what + * information we have available. + */ +static void calc_gf_params(VP8_COMP *cpi) { + int Q = + (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + int Boost = 0; + + int gf_frame_usage = 0; /* Golden frame usage since last GF */ + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; + + int pct_gf_active = (100 * cpi->gf_active_count) / + (cpi->common.mb_rows * cpi->common.mb_cols); + + if (tot_mbs) { + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; + } + + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; + + /* Not two pass */ + if (cpi->pass != 2) { + /* Single Pass lagged mode: TBD */ + if (0) { + } + + /* Single Pass compression: Has to use current and historical data */ + else { +#if 0 + /* Experimental code */ + int index = cpi->one_pass_frame_index; + int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS; + + /* ************** Experimental code - incomplete */ + /* + double decay_val = 1.0; + double IIAccumulator = 0.0; + double last_iiaccumulator = 0.0; + double IIRatio; + + cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS; + + for ( i = 0; i < (frames_to_scan - 1); i++ ) + { + if ( index < 0 ) + index = MAX_LAG_BUFFERS; + index --; + + if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 ) + { + IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error; + + if ( IIRatio > 30.0 ) + IIRatio = 30.0; + } + else + IIRatio = 30.0; + + IIAccumulator += IIRatio * decay_val; + + decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter; + + if ( (i > MIN_GF_INTERVAL) && + ((IIAccumulator - last_iiaccumulator) < 2.0) ) + { + break; + } + last_iiaccumulator = IIAccumulator; + } + + Boost = IIAccumulator*100.0/16.0; + cpi->baseline_gf_interval = i; + + */ +#else + + /*************************************************************/ + /* OLD code */ + + /* Adjust boost based upon ambient Q */ + Boost = GFQ_ADJUSTMENT; + + /* Adjust based upon most recently measure intra usage */ + Boost = Boost * + gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) + ? cpi->this_frame_percent_intra + : 14] / + 100; + + /* Adjust gf boost based upon GF usage since last GF */ + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; +#endif + } + + /* golden frame boost without recode loop often goes awry. be + * safe by keeping numbers down. + */ + if (!cpi->sf.recode_loop) { + if (cpi->compressor_speed == 2) Boost = Boost / 2; + } + + /* Apply an upper limit based on Q for 1 pass encodes */ + if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0)) { + Boost = kf_gf_boost_qlimits[Q]; + + /* Apply lower limits to boost. */ + } else if (Boost < 110) { + Boost = 110; + } + + /* Note the boost used */ + cpi->last_boost = Boost; + } + + /* Estimate next interval + * This is updated once the real frame size/boost is known. + */ + if (cpi->oxcf.fixed_q == -1) { + if (cpi->pass == 2) { /* 2 Pass */ + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } else { /* 1 Pass */ + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + + if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; + + if (cpi->last_boost > 1000) cpi->frames_till_gf_update_due++; + + if (cpi->last_boost > 1250) cpi->frames_till_gf_update_due++; + + if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; + + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; + } + + if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { + cpi->frames_till_gf_update_due = cpi->max_gf_interval; + } + } + } else { + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + } + + /* ARF on or off */ + if (cpi->pass != 2) { + /* For now Alt ref is not allowed except in 2 pass modes. */ + cpi->source_alt_ref_pending = 0; + + /*if ( cpi->oxcf.fixed_q == -1) + { + if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + + (AF_THRESH*cpi->frames_till_gf_update_due)) ) ) + cpi->source_alt_ref_pending = 1; + else + cpi->source_alt_ref_pending = 0; + }*/ + } +} + +static void calc_pframe_target_size(VP8_COMP *cpi) { + int min_frame_target; + int old_per_frame_bandwidth = cpi->per_frame_bandwidth; + + if (cpi->current_layer > 0) { + cpi->per_frame_bandwidth = + cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer; + } + + min_frame_target = 0; + + if (cpi->pass == 2) { + min_frame_target = cpi->min_frame_bandwidth; + + if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5)) { + min_frame_target = cpi->av_per_frame_bandwidth >> 5; + } + } else if (min_frame_target < cpi->per_frame_bandwidth / 4) { + min_frame_target = cpi->per_frame_bandwidth / 4; + } + + /* Special alt reference frame case */ + if ((cpi->common.refresh_alt_ref_frame) && + (cpi->oxcf.number_of_layers == 1)) { + if (cpi->pass == 2) { + /* Per frame bit target for the alt ref frame */ + cpi->per_frame_bandwidth = cpi->twopass.gf_bits; + cpi->this_frame_target = cpi->per_frame_bandwidth; + } + + /* One Pass ??? TBD */ + } + + /* Normal frames (gf,and inter) */ + else { + /* 2 pass */ + if (cpi->pass == 2) { + cpi->this_frame_target = cpi->per_frame_bandwidth; + } + /* 1 pass */ + else { + int Adjustment; + /* Make rate adjustment to recover bits spent in key frame + * Test to see if the key frame inter data rate correction + * should still be in force + */ + if (cpi->kf_overspend_bits > 0) { + Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) + ? cpi->kf_bitrate_adjustment + : cpi->kf_overspend_bits; + + if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target)) { + Adjustment = (cpi->per_frame_bandwidth - min_frame_target); + } + + cpi->kf_overspend_bits -= Adjustment; + + /* Calculate an inter frame bandwidth target for the next + * few frames designed to recover any extra bits spent on + * the key frame. + */ + cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment; + + if (cpi->this_frame_target < min_frame_target) { + cpi->this_frame_target = min_frame_target; + } + } else { + cpi->this_frame_target = cpi->per_frame_bandwidth; + } + + /* If appropriate make an adjustment to recover bits spent on a + * recent GF + */ + if ((cpi->gf_overspend_bits > 0) && + (cpi->this_frame_target > min_frame_target)) { + Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) + ? cpi->non_gf_bitrate_adjustment + : cpi->gf_overspend_bits; + + if (Adjustment > (cpi->this_frame_target - min_frame_target)) { + Adjustment = (cpi->this_frame_target - min_frame_target); + } + + cpi->gf_overspend_bits -= Adjustment; + cpi->this_frame_target -= Adjustment; + } + + /* Apply small + and - boosts for non gf frames */ + if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) && + (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1))) { + /* % Adjustment limited to the range 1% to 10% */ + Adjustment = (cpi->last_boost - 100) >> 5; + + if (Adjustment < 1) { + Adjustment = 1; + } else if (Adjustment > 10) { + Adjustment = 10; + } + + /* Convert to bits */ + Adjustment = (cpi->this_frame_target * Adjustment) / 100; + + if (Adjustment > (cpi->this_frame_target - min_frame_target)) { + Adjustment = (cpi->this_frame_target - min_frame_target); + } + + if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1)) { + Adjustment = (cpi->current_gf_interval - 1) * Adjustment; + // Limit adjustment to 10% of current target. + if (Adjustment > (10 * cpi->this_frame_target) / 100) { + Adjustment = (10 * cpi->this_frame_target) / 100; + } + cpi->this_frame_target += Adjustment; + } else { + cpi->this_frame_target -= Adjustment; + } + } + } + } + + /* Sanity check that the total sum of adjustments is not above the + * maximum allowed That is that having allowed for KF and GF penalties + * we have not pushed the current interframe target to low. If the + * adjustment we apply here is not capable of recovering all the extra + * bits we have spent in the KF or GF then the remainder will have to + * be recovered over a longer time span via other buffer / rate control + * mechanisms. + */ + if (cpi->this_frame_target < min_frame_target) { + cpi->this_frame_target = min_frame_target; + } + + if (!cpi->common.refresh_alt_ref_frame) { + /* Note the baseline target data rate for this inter frame. */ + cpi->inter_frame_target = cpi->this_frame_target; + } + + /* One Pass specific code */ + if (cpi->pass == 0) { + /* Adapt target frame size with respect to any buffering constraints: */ + if (cpi->buffered_mode) { + int one_percent_bits = (int)(1 + cpi->oxcf.optimal_buffer_level / 100); + + if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || + (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) { + int percent_low = 0; + + /* Decide whether or not we need to adjust the frame data + * rate target. + * + * If we are are below the optimal buffer fullness level + * and adherence to buffering constraints is important to + * the end usage then adjust the per frame target. + */ + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) { + percent_low = + (int)((cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / + one_percent_bits); + } + /* Are we overshooting the long term clip data rate... */ + else if (cpi->bits_off_target < 0) { + /* Adjust per frame data target downwards to compensate. */ + percent_low = + (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); + } + + if (percent_low > cpi->oxcf.under_shoot_pct) { + percent_low = cpi->oxcf.under_shoot_pct; + } else if (percent_low < 0) { + percent_low = 0; + } + + /* lower the target bandwidth for this frame. */ + cpi->this_frame_target -= + (int)(((int64_t)cpi->this_frame_target * percent_low) / 200); + + /* Are we using allowing control of active_worst_allowed_q + * according to buffer level. + */ + if (cpi->auto_worst_q && cpi->ni_frames > 150) { + int64_t critical_buffer_level; + + /* For streaming applications the most important factor is + * cpi->buffer_level as this takes into account the + * specified short term buffering constraints. However, + * hitting the long term clip data rate target is also + * important. + */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + /* Take the smaller of cpi->buffer_level and + * cpi->bits_off_target + */ + critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) + ? cpi->buffer_level + : cpi->bits_off_target; + } + /* For local file playback short term buffering constraints + * are less of an issue + */ + else { + /* Consider only how we are doing for the clip as a + * whole + */ + critical_buffer_level = cpi->bits_off_target; + } + + /* Set the active worst quality based upon the selected + * buffer fullness number. + */ + if (critical_buffer_level < cpi->oxcf.optimal_buffer_level) { + if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level >> 2)) { + int64_t qadjustment_range = cpi->worst_quality - cpi->ni_av_qi; + int64_t above_base = (critical_buffer_level - + (cpi->oxcf.optimal_buffer_level >> 2)); + + /* Step active worst quality down from + * cpi->ni_av_qi when (critical_buffer_level == + * cpi->optimal_buffer_level) to + * cpi->worst_quality when + * (critical_buffer_level == + * cpi->optimal_buffer_level >> 2) + */ + cpi->active_worst_quality = + cpi->worst_quality - + (int)((qadjustment_range * above_base) / + (cpi->oxcf.optimal_buffer_level * 3 >> 2)); + } else { + cpi->active_worst_quality = cpi->worst_quality; + } + } else { + cpi->active_worst_quality = cpi->ni_av_qi; + } + } else { + cpi->active_worst_quality = cpi->worst_quality; + } + } else { + int percent_high = 0; + int64_t target = cpi->this_frame_target; + + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) { + percent_high = + (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / + one_percent_bits); + } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) { + percent_high = + (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8)); + } + + if (percent_high > cpi->oxcf.over_shoot_pct) { + percent_high = cpi->oxcf.over_shoot_pct; + } else if (percent_high < 0) { + percent_high = 0; + } + + target += (target * percent_high) / 200; + target = VPXMIN(target, INT_MAX); + cpi->this_frame_target = (int)target; + + /* Are we allowing control of active_worst_allowed_q according + * to buffer level. + */ + if (cpi->auto_worst_q && cpi->ni_frames > 150) { + /* When using the relaxed buffer model stick to the + * user specified value + */ + cpi->active_worst_quality = cpi->ni_av_qi; + } else { + cpi->active_worst_quality = cpi->worst_quality; + } + } + + /* Set active_best_quality to prevent quality rising too high */ + cpi->active_best_quality = cpi->best_quality; + + /* Worst quality obviously must not be better than best quality */ + if (cpi->active_worst_quality <= cpi->active_best_quality) { + cpi->active_worst_quality = cpi->active_best_quality + 1; + } + + if (cpi->active_worst_quality > 127) cpi->active_worst_quality = 127; + } + /* Unbuffered mode (eg. video conferencing) */ + else { + /* Set the active worst quality */ + cpi->active_worst_quality = cpi->worst_quality; + } + + /* Special trap for constrained quality mode + * "active_worst_quality" may never drop below cq level + * for any frame type. + */ + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && + cpi->active_worst_quality < cpi->cq_target_quality) { + cpi->active_worst_quality = cpi->cq_target_quality; + } + } + + /* Test to see if we have to drop a frame + * The auto-drop frame code is only used in buffered mode. + * In unbufferd mode (eg vide conferencing) the descision to + * code or drop a frame is made outside the codec in response to real + * world comms or buffer considerations. + */ + if (cpi->drop_frames_allowed && + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + ((cpi->common.frame_type != KEY_FRAME))) { + /* Check for a buffer underun-crisis in which case we have to drop + * a frame + */ + if ((cpi->buffer_level < 0)) { +#if 0 + FILE *f = fopen("dec.stt", "a"); + fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n", + (int) cpi->common.current_video_frame, + cpi->decimation_factor, cpi->common.horiz_scale, + (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level); + fclose(f); +#endif + cpi->drop_frame = 1; + + /* Update the buffer level variable. */ + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size; + } + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + // Propagate bits saved by dropping the frame to higher layers. + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + } + } + + /* Adjust target frame size for Golden Frames: */ + if (cpi->oxcf.error_resilient_mode == 0 && + (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) { + if (!cpi->gf_update_onepass_cbr) { + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; + + int gf_frame_usage = 0; /* Golden frame usage since last GF */ + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; + + int pct_gf_active = (100 * cpi->gf_active_count) / + (cpi->common.mb_rows * cpi->common.mb_cols); + + if (tot_mbs) { + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; + } + + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; + + /* Is a fixed manual GF frequency being used */ + if (cpi->auto_gold) { + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high + */ + if ((cpi->pass == 0) && + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { + cpi->common.refresh_golden_frame = 1; + + /* Two pass GF descision */ + } else if (cpi->pass == 2) { + cpi->common.refresh_golden_frame = 1; + } + } + +#if 0 + + /* Debug stats */ + if (0) { + FILE *f; + + f = fopen("gf_usaget.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->gfu_boost, + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); + fclose(f); + } + +#endif + + if (cpi->common.refresh_golden_frame == 1) { +#if 0 + + if (0) { + FILE *f; + + f = fopen("GFexit.stt", "a"); + fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame); + fclose(f); + } + +#endif + + if (cpi->auto_adjust_gold_quantizer) { + calc_gf_params(cpi); + } + + /* If we are using alternate ref instead of gf then do not apply the + * boost It will instead be applied to the altref update Jims + * modified boost + */ + if (!cpi->source_alt_ref_active) { + if (cpi->oxcf.fixed_q < 0) { + if (cpi->pass == 2) { + /* The spend on the GF is defined in the two pass + * code for two pass encodes + */ + cpi->this_frame_target = cpi->per_frame_bandwidth; + } else { + int Boost = cpi->last_boost; + int frames_in_section = cpi->frames_till_gf_update_due + 1; + int allocation_chunks = (frames_in_section * 100) + (Boost - 100); + int bits_in_section = cpi->inter_frame_target * frames_in_section; + + /* Normalize Altboost and allocations chunck down to + * prevent overflow + */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Avoid loss of precision but avoid overflow */ + if ((bits_in_section >> 7) > allocation_chunks) { + cpi->this_frame_target = + Boost * (bits_in_section / allocation_chunks); + } else { + cpi->this_frame_target = + (Boost * bits_in_section) / allocation_chunks; + } + } + } else { + cpi->this_frame_target = + (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * + cpi->last_boost) / + 100; + } + } else { + /* If there is an active ARF at this location use the minimum + * bits on this frame even if it is a contructed arf. + * The active maximum quantizer insures that an appropriate + * number of bits will be spent if needed for contstructed ARFs. + */ + cpi->this_frame_target = 0; + } + + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + } + } else { + // Special case for 1 pass CBR: fixed gf period. + // TODO(marpan): Adjust this boost/interval logic. + // If gf_cbr_boost_pct is small (below threshold) set the flag + // gf_noboost_onepass_cbr = 1, which forces the gf to use the same + // rate correction factor as last. + cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + // Skip this update if the zero_mvcount is low. + if (cpi->zeromv_count > (cpi->common.MBs >> 1)) { + cpi->common.refresh_golden_frame = 1; + cpi->this_frame_target = + (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100; + } + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + } + } + + cpi->per_frame_bandwidth = old_per_frame_bandwidth; +} + +void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { + int Q = cpi->common.base_qindex; + int correction_factor = 100; + double rate_correction_factor; + double adjustment_limit; + + int projected_size_based_on_q = 0; + + /* Clear down mmx registers to allow floating point in what follows */ + vpx_clear_system_state(); + + if (cpi->common.frame_type == KEY_FRAME) { + rate_correction_factor = cpi->key_frame_rate_correction_factor; + } else { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { + rate_correction_factor = cpi->gf_rate_correction_factor; + } else { + rate_correction_factor = cpi->rate_correction_factor; + } + } + + /* Work out how big we would have expected the frame to be at this Q + * given the current correction factor. Stay in double to avoid int + * overflow when values are large + */ + projected_size_based_on_q = + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * + cpi->common.MBs) / + (1 << BPER_MB_NORMBITS)); + + /* Make some allowance for cpi->zbin_over_quant */ + if (cpi->mb.zbin_over_quant > 0) { + int Z = cpi->mb.zbin_over_quant; + double Factor = 0.99; + double factor_adjustment = 0.01 / 256.0; + + while (Z > 0) { + Z--; + projected_size_based_on_q = (int)(Factor * projected_size_based_on_q); + Factor += factor_adjustment; + + if (Factor >= 0.999) Factor = 0.999; + } + } + + /* Work out a size correction factor. */ + if (projected_size_based_on_q > 0) { + correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) / + projected_size_based_on_q); + } + + /* More heavily damped adjustment used if we have been oscillating + * either side of target + */ + switch (damp_var) { + case 0: adjustment_limit = 0.75; break; + case 1: adjustment_limit = 0.375; break; + case 2: + default: adjustment_limit = 0.25; break; + } + + if (correction_factor > 102) { + /* We are not already at the worst allowable quality */ + correction_factor = + (int)(100.5 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = + ((rate_correction_factor * correction_factor) / 100); + + /* Keep rate_correction_factor within limits */ + if (rate_correction_factor > MAX_BPB_FACTOR) { + rate_correction_factor = MAX_BPB_FACTOR; + } + } else if (correction_factor < 99) { + /* We are not already at the best allowable quality */ + correction_factor = + (int)(100.5 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = + ((rate_correction_factor * correction_factor) / 100); + + /* Keep rate_correction_factor within limits */ + if (rate_correction_factor < MIN_BPB_FACTOR) { + rate_correction_factor = MIN_BPB_FACTOR; + } + } + + if (cpi->common.frame_type == KEY_FRAME) { + cpi->key_frame_rate_correction_factor = rate_correction_factor; + } else { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { + cpi->gf_rate_correction_factor = rate_correction_factor; + } else { + cpi->rate_correction_factor = rate_correction_factor; + } + } +} + +static int limit_q_cbr_inter(int last_q, int current_q) { + int limit_down = 12; + if (last_q - current_q > limit_down) + return (last_q - limit_down); + else + return current_q; +} + +int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { + int Q = cpi->active_worst_quality; + + if (cpi->force_maxqp == 1) { + cpi->active_worst_quality = cpi->worst_quality; + return cpi->worst_quality; + } + /* Reset Zbin OQ value */ + cpi->mb.zbin_over_quant = 0; + + if (cpi->oxcf.fixed_q >= 0) { + Q = cpi->oxcf.fixed_q; + + if (cpi->common.frame_type == KEY_FRAME) { + Q = cpi->oxcf.key_q; + } else if (cpi->oxcf.number_of_layers == 1 && + cpi->common.refresh_alt_ref_frame && + !cpi->gf_noboost_onepass_cbr) { + Q = cpi->oxcf.alt_q; + } else if (cpi->oxcf.number_of_layers == 1 && + cpi->common.refresh_golden_frame && + !cpi->gf_noboost_onepass_cbr) { + Q = cpi->oxcf.gold_q; + } + } else { + int i; + int last_error = INT_MAX; + int target_bits_per_mb; + int bits_per_mb_at_this_q; + double correction_factor; + + /* Select the appropriate correction factor based upon type of frame. */ + if (cpi->common.frame_type == KEY_FRAME) { + correction_factor = cpi->key_frame_rate_correction_factor; + } else { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { + correction_factor = cpi->gf_rate_correction_factor; + } else { + correction_factor = cpi->rate_correction_factor; + } + } + + /* Calculate required scaling factor based on target frame size and + * size of frame produced using previous Q + */ + if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) { + /* Case where we would overflow int */ + target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) + << BPER_MB_NORMBITS; + } else { + target_bits_per_mb = + (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; + } + + i = cpi->active_best_quality; + + do { + bits_per_mb_at_this_q = + (int)(.5 + + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]); + + if (bits_per_mb_at_this_q <= target_bits_per_mb) { + if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) { + Q = i; + } else { + Q = i - 1; + } + + break; + } else { + last_error = bits_per_mb_at_this_q - target_bits_per_mb; + } + } while (++i <= cpi->active_worst_quality); + + /* If we are at MAXQ then enable Q over-run which seeks to claw + * back additional bits through things like the RD multiplier + * and zero bin size. + */ + if (Q >= MAXQ) { + int zbin_oqmax; + + double Factor = 0.99; + double factor_adjustment = 0.01 / 256.0; + + if (cpi->common.frame_type == KEY_FRAME) { + zbin_oqmax = 0; + } else if (cpi->oxcf.number_of_layers == 1 && + !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + (cpi->common.refresh_golden_frame && + !cpi->source_alt_ref_active))) { + zbin_oqmax = 16; + } else { + zbin_oqmax = ZBIN_OQ_MAX; + } + + /*{ + double Factor = + (double)target_bits_per_mb/(double)bits_per_mb_at_this_q; + double Oq; + + Factor = Factor/1.2683; + + Oq = pow( Factor, (1.0/-0.165) ); + + if ( Oq > zbin_oqmax ) + Oq = zbin_oqmax; + + cpi->zbin_over_quant = (int)Oq; + }*/ + + /* Each incrment in the zbin is assumed to have a fixed effect + * on bitrate. This is not of course true. The effect will be + * highly clip dependent and may well have sudden steps. The + * idea here is to acheive higher effective quantizers than the + * normal maximum by expanding the zero bin and hence + * decreasing the number of low magnitude non zero coefficients. + */ + while (cpi->mb.zbin_over_quant < zbin_oqmax) { + cpi->mb.zbin_over_quant++; + + if (cpi->mb.zbin_over_quant > zbin_oqmax) { + cpi->mb.zbin_over_quant = zbin_oqmax; + } + + /* Adjust bits_per_mb_at_this_q estimate */ + bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q); + Factor += factor_adjustment; + + if (Factor >= 0.999) Factor = 0.999; + + /* Break out if we get down to the target rate */ + if (bits_per_mb_at_this_q <= target_bits_per_mb) break; + } + } + } + + // Limit decrease in Q for 1 pass CBR screen content mode. + if (cpi->common.frame_type != KEY_FRAME && cpi->pass == 0 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.screen_content_mode) + Q = limit_q_cbr_inter(cpi->last_q[1], Q); + + return Q; +} + +static int estimate_keyframe_frequency(VP8_COMP *cpi) { + int i; + + /* Average key frame frequency */ + int av_key_frame_frequency = 0; + + /* First key frame at start of sequence is a special case. We have no + * frequency data. + */ + if (cpi->key_frame_count == 1) { + /* Assume a default of 1 kf every 2 seconds, or the max kf interval, + * whichever is smaller. + */ + int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1; + av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2; + + if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) { + av_key_frame_frequency = key_freq; + } + + cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = + av_key_frame_frequency; + } else { + unsigned int total_weight = 0; + int last_kf_interval = + (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1; + + /* reset keyframe context and calculate weighted average of last + * KEY_FRAME_CONTEXT keyframes + */ + for (i = 0; i < KEY_FRAME_CONTEXT; ++i) { + if (i < KEY_FRAME_CONTEXT - 1) { + cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i + 1]; + } else { + cpi->prior_key_frame_distance[i] = last_kf_interval; + } + + av_key_frame_frequency += + prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i]; + total_weight += prior_key_frame_weight[i]; + } + + av_key_frame_frequency /= total_weight; + } + // TODO (marpan): Given the checks above, |av_key_frame_frequency| + // should always be above 0. But for now we keep the sanity check in. + if (av_key_frame_frequency == 0) av_key_frame_frequency = 1; + return av_key_frame_frequency; +} + +void vp8_adjust_key_frame_context(VP8_COMP *cpi) { + /* Clear down mmx registers to allow floating point in what follows */ + vpx_clear_system_state(); + + /* Do we have any key frame overspend to recover? */ + /* Two-pass overspend handled elsewhere. */ + if ((cpi->pass != 2) && + (cpi->projected_frame_size > cpi->per_frame_bandwidth)) { + int overspend; + + /* Update the count of key frame overspend to be recovered in + * subsequent frames. A portion of the KF overspend is treated as gf + * overspend (and hence recovered more quickly) as the kf is also a + * gf. Otherwise the few frames following each kf tend to get more + * bits allocated than those following other gfs. + */ + overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth); + + if (cpi->oxcf.number_of_layers > 1) { + cpi->kf_overspend_bits += overspend; + } else { + cpi->kf_overspend_bits += overspend * 7 / 8; + cpi->gf_overspend_bits += overspend * 1 / 8; + } + + /* Work out how much to try and recover per frame. */ + cpi->kf_bitrate_adjustment = + cpi->kf_overspend_bits / estimate_keyframe_frequency(cpi); + } + + cpi->frames_since_key = 0; + cpi->key_frame_count++; +} + +void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, + int *frame_over_shoot_limit) { + /* Set-up bounds on acceptable frame size: */ + if (cpi->oxcf.fixed_q >= 0) { + /* Fixed Q scenario: frame size never outranges target + * (there is no target!) + */ + *frame_under_shoot_limit = 0; + *frame_over_shoot_limit = INT_MAX; + } else { + const int64_t this_frame_target = cpi->this_frame_target; + int64_t over_shoot_limit, under_shoot_limit; + + if (cpi->common.frame_type == KEY_FRAME) { + over_shoot_limit = this_frame_target * 9 / 8; + under_shoot_limit = this_frame_target * 7 / 8; + } else { + if (cpi->oxcf.number_of_layers > 1 || cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame) { + over_shoot_limit = this_frame_target * 9 / 8; + under_shoot_limit = this_frame_target * 7 / 8; + } else { + /* For CBR take buffer fullness into account */ + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + + cpi->oxcf.maximum_buffer_size) >> + 1)) { + /* Buffer is too full so relax overshoot and tighten + * undershoot + */ + over_shoot_limit = this_frame_target * 12 / 8; + under_shoot_limit = this_frame_target * 6 / 8; + } else if (cpi->buffer_level <= + (cpi->oxcf.optimal_buffer_level >> 1)) { + /* Buffer is too low so relax undershoot and tighten + * overshoot + */ + over_shoot_limit = this_frame_target * 10 / 8; + under_shoot_limit = this_frame_target * 4 / 8; + } else { + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 5 / 8; + } + } + /* VBR and CQ mode */ + /* Note that tighter restrictions here can help quality + * but hurt encode speed + */ + else { + /* Stron overshoot limit for constrained quality */ + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 2 / 8; + } else { + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 5 / 8; + } + } + } + } + + /* For very small rate targets where the fractional adjustment + * (eg * 7/8) may be tiny make sure there is at least a minimum + * range. + */ + over_shoot_limit += 200; + under_shoot_limit -= 200; + if (under_shoot_limit < 0) under_shoot_limit = 0; + if (under_shoot_limit > INT_MAX) under_shoot_limit = INT_MAX; + if (over_shoot_limit > INT_MAX) over_shoot_limit = INT_MAX; + *frame_under_shoot_limit = (int)under_shoot_limit; + *frame_over_shoot_limit = (int)over_shoot_limit; + } +} + +/* return of 0 means drop frame */ +int vp8_pick_frame_size(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + + if (cm->frame_type == KEY_FRAME) { + calc_iframe_target_size(cpi); + } else { + calc_pframe_target_size(cpi); + + /* Check if we're dropping the frame: */ + if (cpi->drop_frame) { + cpi->drop_frame = 0; + return 0; + } + } + return 1; +} +// If this just encoded frame (mcomp/transform/quant, but before loopfilter and +// pack_bitstream) has large overshoot, and was not being encoded close to the +// max QP, then drop this frame and force next frame to be encoded at max QP. +// Allow this for screen_content_mode = 2, or if drop frames is allowed. +// TODO(marpan): Should do this exit condition during the encode_frame +// (i.e., halfway during the encoding of the frame) to save cycles. +int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (8.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { + // Note: the "projected_frame_size" from encode_frame() only gives estimate + // of mode/motion vector rate (in non-rd mode): so below we only require + // that projected_frame_size is somewhat greater than per-frame-bandwidth, + // but add additional condition with high threshold on prediction residual. + + // QP threshold: only allow dropping if we are not close to qp_max. + int thresh_qp = 3 * cpi->worst_quality >> 2; + // Rate threshold, in bytes. + int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); + // Threshold for the average (over all macroblocks) of the pixel-sum + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); + int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); + // Reduce/ignore thresh_rate if pred_err_mb much larger than its threshold, + // give more weight to pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb && + pred_err_mb > 2 * cpi->last_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; + double new_correction_factor; + int target_bits_per_mb; + const int target_size = cpi->av_per_frame_bandwidth; + // Flag to indicate we will force next frame to be encoded at max QP. + cpi->force_maxqp = 1; + // Reset the buffer levels. + cpi->buffer_level = cpi->oxcf.optimal_buffer_level; + cpi->bits_off_target = cpi->oxcf.optimal_buffer_level; + // Compute a new rate correction factor, corresponding to the current + // target frame size and max_QP, and adjust the rate correction factor + // upwards, if needed. + // This is to prevent a bad state where the re-encoded frame at max_QP + // undershoots significantly, and then we end up dropping every other + // frame because the QP/rate_correction_factor may have been too low + // before the drop and then takes too long to come up. + if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = (target_size / cpi->common.MBs) + << BPER_MB_NORMBITS; + } else { + target_bits_per_mb = + (target_size << BPER_MB_NORMBITS) / cpi->common.MBs; + } + // Rate correction factor based on target_size_per_mb and max_QP. + new_correction_factor = + (double)target_bits_per_mb / + (double)vp8_bits_per_mb[INTER_FRAME][cpi->worst_quality]; + if (new_correction_factor > cpi->rate_correction_factor) { + cpi->rate_correction_factor = + VPXMIN(2.0 * cpi->rate_correction_factor, new_correction_factor); + } + if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { + cpi->rate_correction_factor = MAX_BPB_FACTOR; + } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected. + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif + return 1; + } + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif + return 0; + } + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif + return 0; +} diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.h b/media/libvpx/libvpx/vp8/encoder/ratectrl.h new file mode 100644 index 0000000000..844c72cb86 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_RATECTRL_H_ +#define VPX_VP8_ENCODER_RATECTRL_H_ + +#include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_save_coding_context(VP8_COMP *cpi); +extern void vp8_restore_coding_context(VP8_COMP *cpi); + +extern void vp8_setup_key_frame(VP8_COMP *cpi); +extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var); +extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame); +extern void vp8_adjust_key_frame_context(VP8_COMP *cpi); +extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); + +/* return of 0 means drop frame */ +extern int vp8_pick_frame_size(VP8_COMP *cpi); + +extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.c b/media/libvpx/libvpx/vp8/encoder/rdopt.c new file mode 100644 index 0000000000..5d539ef30c --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/rdopt.c @@ -0,0 +1,2394 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" +#include "tokenize.h" +#include "treewriter.h" +#include "onyx_int.h" +#include "modecosts.h" +#include "encodeintra.h" +#include "pickinter.h" +#include "vp8/common/common.h" +#include "vp8/common/entropymode.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/reconintra4x4.h" +#include "vp8/common/findnearmv.h" +#include "vp8/common/quant_common.h" +#include "encodemb.h" +#include "vp8/encoder/quantize.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/system_state.h" +#include "mcomp.h" +#include "rdopt.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/systemdependent.h" +#if CONFIG_TEMPORAL_DENOISING +#include "denoising.h" +#endif +extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x); + +#define MAXF(a, b) (((a) > (b)) ? (a) : (b)) + +typedef struct rate_distortion_struct { + int rate2; + int rate_y; + int rate_uv; + int distortion2; + int distortion_uv; +} RATE_DISTORTION; + +typedef struct best_mode_struct { + int yrd; + int rd; + int intra_rd; + MB_MODE_INFO mbmode; + union b_mode_info bmodes[16]; + PARTITION_INFO partition; +} BEST_MODE; + +static const int auto_speed_thresh[17] = { 1000, 200, 150, 130, 150, 125, + 120, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 105 }; + +const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] = { + ZEROMV, DC_PRED, + + NEARESTMV, NEARMV, + + ZEROMV, NEARESTMV, + + ZEROMV, NEARESTMV, + + NEARMV, NEARMV, + + V_PRED, H_PRED, TM_PRED, + + NEWMV, NEWMV, NEWMV, + + SPLITMV, SPLITMV, SPLITMV, + + B_PRED, +}; + +/* This table determines the search order in reference frame priority order, + * which may not necessarily match INTRA,LAST,GOLDEN,ARF + */ +const int vp8_ref_frame_order[MAX_MODES] = { + 1, 0, + + 1, 1, + + 2, 2, + + 3, 3, + + 2, 3, + + 0, 0, 0, + + 1, 2, 3, + + 1, 2, 3, + + 0, +}; + +static void fill_token_costs( + int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS], + const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]) { + int i, j, k; + + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < COEF_BANDS; ++j) { + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + /* check for pt=0 and band > 1 if block type 0 + * and 0 if blocktype 1 + */ + if (k == 0 && j > (i == 0)) { + vp8_cost_tokens2(c[i][j][k], p[i][j][k], vp8_coef_tree, 2); + } else { + vp8_cost_tokens(c[i][j][k], p[i][j][k], vp8_coef_tree); + } + } + } + } +} + +static const int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* values are now correlated to quantizer */ +static const int sad_per_bit16lut[QINDEX_RANGE] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14 +}; +static const int sad_per_bit4lut[QINDEX_RANGE] = { + 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, + 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, +}; + +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex) { + cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex]; + cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex]; +} + +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue) { + int q; + int i; + double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0; + double rdconst = 2.80; + + vpx_clear_system_state(); + + /* Further tests required to see if optimum is different + * for key frames, golden frames and arf frames. + */ + cpi->RDMULT = (int)(rdconst * (capped_q * capped_q)); + + /* Extend rate multiplier along side quantizer zbin increases */ + if (cpi->mb.zbin_over_quant > 0) { + double oq_factor; + double modq; + + /* Experimental code using the same basic equation as used for Q above + * The units of cpi->mb.zbin_over_quant are 1/128 of Q bin size + */ + oq_factor = 1.0 + ((double)0.0015625 * cpi->mb.zbin_over_quant); + modq = (int)((double)capped_q * oq_factor); + cpi->RDMULT = (int)(rdconst * (modq * modq)); + } + + if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + if (cpi->twopass.next_iiratio > 31) { + cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4; + } else { + cpi->RDMULT += + (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; + } + } + + cpi->mb.errorperbit = (cpi->RDMULT / 110); + cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + + vp8_set_speed_features(cpi); + + for (i = 0; i < MAX_MODES; ++i) { + x->mode_test_hit_counts[i] = 0; + } + + q = (int)pow(Qvalue, 1.25); + + if (q < 8) q = 8; + + if (cpi->RDMULT > 1000) { + cpi->RDDIV = 1; + cpi->RDMULT /= 100; + + for (i = 0; i < MAX_MODES; ++i) { + if (cpi->sf.thresh_mult[i] < INT_MAX) { + x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100; + } else { + x->rd_threshes[i] = INT_MAX; + } + + cpi->rd_baseline_thresh[i] = x->rd_threshes[i]; + } + } else { + cpi->RDDIV = 100; + + for (i = 0; i < MAX_MODES; ++i) { + if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) { + x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q; + } else { + x->rd_threshes[i] = INT_MAX; + } + + cpi->rd_baseline_thresh[i] = x->rd_threshes[i]; + } + } + + { + /* build token cost array for the type of frame we have now */ + FRAME_CONTEXT *l = &cpi->lfc_n; + + if (cpi->common.refresh_alt_ref_frame) { + l = &cpi->lfc_a; + } else if (cpi->common.refresh_golden_frame) { + l = &cpi->lfc_g; + } + + fill_token_costs(cpi->mb.token_costs, + (const vp8_prob(*)[8][3][11])l->coef_probs); + /* + fill_token_costs( + cpi->mb.token_costs, + (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs); + */ + + /* TODO make these mode costs depend on last,alt or gold too. (jbb) */ + vp8_init_mode_costs(cpi); + } +} + +void vp8_auto_select_speed(VP8_COMP *cpi) { + int milliseconds_for_compress = (int)(1000000 / cpi->framerate); + + milliseconds_for_compress = + milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16; + +#if 0 + + if (0) + { + FILE *f; + + f = fopen("speed.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time); + fclose(f); + } + +#endif + + if (cpi->avg_pick_mode_time < milliseconds_for_compress && + (cpi->avg_encode_time - cpi->avg_pick_mode_time) < + milliseconds_for_compress) { + if (cpi->avg_pick_mode_time == 0) { + cpi->Speed = 4; + } else { + if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) { + cpi->Speed += 2; + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + + if (cpi->Speed > 16) { + cpi->Speed = 16; + } + } + + if (milliseconds_for_compress * 100 > + cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) { + cpi->Speed -= 1; + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + + /* In real-time mode, cpi->speed is in [4, 16]. */ + if (cpi->Speed < 4) { + cpi->Speed = 4; + } + } + } + } else { + cpi->Speed += 4; + + if (cpi->Speed > 16) cpi->Speed = 16; + + cpi->avg_pick_mode_time = 0; + cpi->avg_encode_time = 0; + } +} + +int vp8_block_error_c(short *coeff, short *dqcoeff) { + int i; + int error = 0; + + for (i = 0; i < 16; ++i) { + int this_diff = coeff[i] - dqcoeff[i]; + error += this_diff * this_diff; + } + + return error; +} + +int vp8_mbblock_error_c(MACROBLOCK *mb, int dc) { + BLOCK *be; + BLOCKD *bd; + int i, j; + int berror, error = 0; + + for (i = 0; i < 16; ++i) { + be = &mb->block[i]; + bd = &mb->e_mbd.block[i]; + + berror = 0; + + for (j = dc; j < 16; ++j) { + int this_diff = be->coeff[j] - bd->dqcoeff[j]; + berror += this_diff * this_diff; + } + + error += berror; + } + + return error; +} + +int vp8_mbuverror_c(MACROBLOCK *mb) { + BLOCK *be; + BLOCKD *bd; + + int i; + int error = 0; + + for (i = 16; i < 24; ++i) { + be = &mb->block[i]; + bd = &mb->e_mbd.block[i]; + + error += vp8_block_error_c(be->coeff, bd->dqcoeff); + } + + return error; +} + +int VP8_UVSSE(MACROBLOCK *x) { + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src); + unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src); + int uv_stride = x->block[16].src_stride; + + unsigned int sse1 = 0; + unsigned int sse2 = 0; + int mv_row = x->e_mbd.mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->e_mbd.mode_info_context->mbmi.mv.as_mv.col; + int offset; + int pre_stride = x->e_mbd.pre.uv_stride; + + if (mv_row < 0) { + mv_row -= 1; + } else { + mv_row += 1; + } + + if (mv_col < 0) { + mv_col -= 1; + } else { + mv_col += 1; + } + + mv_row /= 2; + mv_col /= 2; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->e_mbd.pre.u_buffer + offset; + vptr = x->e_mbd.pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) { + vpx_sub_pixel_variance8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, + upred_ptr, uv_stride, &sse2); + vpx_sub_pixel_variance8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, + vpred_ptr, uv_stride, &sse1); + sse2 += sse1; + } else { + vpx_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2); + vpx_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1); + sse2 += sse1; + } + return sse2; +} + +static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + int c = !type; /* start at coef 0, unless Y with Y2 */ + int eob = (int)(*b->eob); + int pt; /* surrounding block/prev coef predictor */ + int cost = 0; + short *qcoeff_ptr = b->qcoeff; + + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + assert(eob <= 16); + for (; c < eob; ++c) { + const int v = qcoeff_ptr[vp8_default_zig_zag1d[c]]; + const int t = vp8_dct_value_tokens_ptr[v].Token; + cost += mb->token_costs[type][vp8_coef_bands[c]][pt][t]; + cost += vp8_dct_value_cost_ptr[v]; + pt = vp8_prev_token_class[t]; + } + + if (c < 16) { + cost += mb->token_costs[type][vp8_coef_bands[c]][pt][DCT_EOB_TOKEN]; + } + + pt = (c != !type); /* is eob first coefficient; */ + *a = *l = pt; + + return cost; +} + +static int vp8_rdcost_mby(MACROBLOCK *mb) { + int cost = 0; + int b; + MACROBLOCKD *x = &mb->e_mbd; + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + + memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + for (b = 0; b < 16; ++b) { + cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_NO_DC, + ta + vp8_block2above[b], tl + vp8_block2left[b]); + } + + cost += cost_coeffs(mb, x->block + 24, PLANE_TYPE_Y2, + ta + vp8_block2above[24], tl + vp8_block2left[24]); + + return cost; +} + +static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion) { + int b; + MACROBLOCKD *const x = &mb->e_mbd; + BLOCK *const mb_y2 = mb->block + 24; + BLOCKD *const x_y2 = x->block + 24; + short *Y2DCPtr = mb_y2->src_diff; + BLOCK *beptr; + int d; + + vp8_subtract_mby(mb->src_diff, *(mb->block[0].base_src), + mb->block[0].src_stride, mb->e_mbd.predictor, 16); + + /* Fdct and building the 2nd order block */ + for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) { + mb->short_fdct8x4(beptr->src_diff, beptr->coeff, 32); + *Y2DCPtr++ = beptr->coeff[0]; + *Y2DCPtr++ = beptr->coeff[16]; + } + + /* 2nd order fdct */ + mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8); + + /* Quantization */ + for (b = 0; b < 16; ++b) { + mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]); + } + + /* DC predication and Quantization of 2nd Order block */ + mb->quantize_b(mb_y2, x_y2); + + /* Distortion */ + d = vp8_mbblock_error(mb, 1) << 2; + d += vp8_block_error(mb_y2->coeff, x_y2->dqcoeff); + + *Distortion = (d >> 4); + + /* rate */ + *Rate = vp8_rdcost_mby(mb); +} + +static void copy_predictor(unsigned char *dst, const unsigned char *predictor) { + const unsigned int *p = (const unsigned int *)predictor; + unsigned int *d = (unsigned int *)dst; + d[0] = p[0]; + d[4] = p[4]; + d[8] = p[8]; + d[12] = p[12]; +} +static int rd_pick_intra4x4block(MACROBLOCK *x, BLOCK *be, BLOCKD *b, + B_PREDICTION_MODE *best_mode, + const int *bmode_costs, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + + int *bestrate, int *bestratey, + int *bestdistortion) { + B_PREDICTION_MODE mode; + int best_rd = INT_MAX; + int rate = 0; + int distortion; + + ENTROPY_CONTEXT ta = *a, tempa = *a; + ENTROPY_CONTEXT tl = *l, templ = *l; + /* + * The predictor buffer is a 2d buffer with a stride of 16. Create + * a temp buffer that meets the stride requirements, but we are only + * interested in the left 4x4 block + * */ + DECLARE_ALIGNED(16, unsigned char, best_predictor[16 * 4]); + DECLARE_ALIGNED(16, short, best_dqcoeff[16]); + int dst_stride = x->e_mbd.dst.y_stride; + unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset; + + unsigned char *Above = dst - dst_stride; + unsigned char *yleft = dst - 1; + unsigned char top_left = Above[-1]; + + for (mode = B_DC_PRED; mode <= B_HU_PRED; ++mode) { + int this_rd; + int ratey; + + rate = bmode_costs[mode]; + + vp8_intra4x4_predict(Above, yleft, dst_stride, mode, b->predictor, 16, + top_left); + vp8_subtract_b(be, b, 16); + x->short_fdct4x4(be->src_diff, be->coeff, 32); + x->quantize_b(be, b); + + tempa = ta; + templ = tl; + + ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ); + rate += ratey; + distortion = vp8_block_error(be->coeff, b->dqcoeff) >> 2; + + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + *a = tempa; + *l = templ; + copy_predictor(best_predictor, b->predictor); + memcpy(best_dqcoeff, b->dqcoeff, 32); + } + } + b->bmi.as_mode = *best_mode; + + vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, dst, dst_stride); + + return best_rd; +} + +static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, + int *Distortion, int best_rd) { + MACROBLOCKD *const xd = &mb->e_mbd; + int i; + int cost = mb->mbmode_cost[xd->frame_type][B_PRED]; + int distortion = 0; + int tot_rate_y = 0; + int64_t total_rd = 0; + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + const int *bmode_costs; + + memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16); + + bmode_costs = mb->inter_bmode_costs; + + for (i = 0; i < 16; ++i) { + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, ry = 0, d = 0; + + if (mb->e_mbd.frame_type == KEY_FRAME) { + const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); + const B_PREDICTION_MODE L = left_block_mode(mic, i); + + bmode_costs = mb->bmode_costs[A][L]; + } + + total_rd += rd_pick_intra4x4block( + mb, mb->block + i, xd->block + i, &best_mode, bmode_costs, + ta + vp8_block2above[i], tl + vp8_block2left[i], &r, &ry, &d); + + cost += r; + distortion += d; + tot_rate_y += ry; + + assert(best_mode != B_MODE_COUNT); + mic->bmi[i].as_mode = best_mode; + + if (total_rd >= (int64_t)best_rd) break; + } + + if (total_rd >= (int64_t)best_rd) return INT_MAX; + + *Rate = cost; + *rate_y = tot_rate_y; + *Distortion = distortion; + + return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); +} + +static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, + int *Distortion) { + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; + int rate, ratey; + int distortion; + int best_rd = INT_MAX; + int this_rd; + MACROBLOCKD *xd = &x->e_mbd; + + /* Y Search for 16x16 intra prediction mode */ + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + xd->mode_info_context->mbmi.mode = mode; + + vp8_build_intra_predictors_mby_s(xd, xd->dst.y_buffer - xd->dst.y_stride, + xd->dst.y_buffer - 1, xd->dst.y_stride, + xd->predictor, 16); + + macro_block_yrd(x, &ratey, &distortion); + rate = ratey + + x->mbmode_cost[xd->frame_type][xd->mode_info_context->mbmi.mode]; + + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + mode_selected = mode; + best_rd = this_rd; + *Rate = rate; + *rate_y = ratey; + *Distortion = distortion; + } + } + + assert(mode_selected != MB_MODE_COUNT); + xd->mode_info_context->mbmi.mode = mode_selected; + return best_rd; +} + +static int rd_cost_mbuv(MACROBLOCK *mb) { + int b; + int cost = 0; + MACROBLOCKD *x = &mb->e_mbd; + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + + memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + + for (b = 16; b < 24; ++b) { + cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_UV, + ta + vp8_block2above[b], tl + vp8_block2left[b]); + } + + return cost; +} + +static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, + int *distortion, int fullpixel) { + (void)cpi; + (void)fullpixel; + + vp8_build_inter16x16_predictors_mbuv(&x->e_mbd); + vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->src.uv_stride, &x->e_mbd.predictor[256], + &x->e_mbd.predictor[320], 8); + + vp8_transform_mbuv(x); + vp8_quantize_mbuv(x); + + *rate = rd_cost_mbuv(x); + *distortion = vp8_mbuverror(x) / 4; + + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); +} + +static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, + int *distortion, int fullpixel) { + (void)cpi; + (void)fullpixel; + + vp8_build_inter4x4_predictors_mbuv(&x->e_mbd); + vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->src.uv_stride, &x->e_mbd.predictor[256], + &x->e_mbd.predictor[320], 8); + + vp8_transform_mbuv(x); + vp8_quantize_mbuv(x); + + *rate = rd_cost_mbuv(x); + *distortion = vp8_mbuverror(x) / 4; + + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); +} + +static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, + int *rate_tokenonly, int *distortion) { + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; + int best_rd = INT_MAX; + int d = 0, r = 0; + int rate_to; + MACROBLOCKD *xd = &x->e_mbd; + + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int this_rate; + int this_distortion; + int this_rd; + + xd->mode_info_context->mbmi.uv_mode = mode; + + vp8_build_intra_predictors_mbuv_s( + xd, xd->dst.u_buffer - xd->dst.uv_stride, + xd->dst.v_buffer - xd->dst.uv_stride, xd->dst.u_buffer - 1, + xd->dst.v_buffer - 1, xd->dst.uv_stride, &xd->predictor[256], + &xd->predictor[320], 8); + + vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->src.uv_stride, &xd->predictor[256], + &xd->predictor[320], 8); + vp8_transform_mbuv(x); + vp8_quantize_mbuv(x); + + rate_to = rd_cost_mbuv(x); + this_rate = + rate_to + x->intra_uv_mode_cost[xd->frame_type] + [xd->mode_info_context->mbmi.uv_mode]; + + this_distortion = vp8_mbuverror(x) / 4; + + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + + if (this_rd < best_rd) { + best_rd = this_rd; + d = this_distortion; + r = this_rate; + *rate_tokenonly = rate_to; + mode_selected = mode; + } + } + + *rate = r; + *distortion = d; + + assert(mode_selected != MB_MODE_COUNT); + xd->mode_info_context->mbmi.uv_mode = mode_selected; +} + +int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]) { + vp8_prob p[VP8_MVREFS - 1]; + assert(NEARESTMV <= m && m <= SPLITMV); + vp8_mv_ref_probs(p, near_mv_ref_ct); + return vp8_cost_token(vp8_mv_ref_tree, p, + vp8_mv_ref_encoding_array + (m - NEARESTMV)); +} + +void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { + x->e_mbd.mode_info_context->mbmi.mode = mb; + x->e_mbd.mode_info_context->mbmi.mv.as_int = mv->as_int; +} + +static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, + B_PREDICTION_MODE this_mode, int_mv *this_mv, + int_mv *best_ref_mv, int *mvcost[2]) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mode_info_context; + const int mis = xd->mode_info_stride; + + int cost = 0; + int thismvcost = 0; + + /* We have to be careful retrieving previously-encoded motion vectors. + Ones from this macroblock have to be pulled from the BLOCKD array + as they have not yet made it to the bmi array in our MB_MODE_INFO. */ + + int i = 0; + + do { + BLOCKD *const d = xd->block + i; + const int row = i >> 2, col = i & 3; + + B_PREDICTION_MODE m; + + if (labelings[i] != which_label) continue; + + if (col && labelings[i] == labelings[i - 1]) { + m = LEFT4X4; + } else if (row && labelings[i] == labelings[i - 4]) { + m = ABOVE4X4; + } else { + /* the only time we should do costing for new motion vector + * or mode is when we are on a new label (jbb May 08, 2007) + */ + switch (m = this_mode) { + case NEW4X4: + thismvcost = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102); + break; + case LEFT4X4: + this_mv->as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i); + break; + case ABOVE4X4: + this_mv->as_int = + row ? d[-4].bmi.mv.as_int : above_block_mv(mic, i, mis); + break; + case ZERO4X4: this_mv->as_int = 0; break; + default: break; + } + + if (m == ABOVE4X4) { /* replace above with left if same */ + int_mv left_mv; + + left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i); + + if (left_mv.as_int == this_mv->as_int) m = LEFT4X4; + } + + cost = x->inter_bmode_costs[m]; + } + + d->bmi.mv.as_int = this_mv->as_int; + + x->partition_info->bmi[i].mode = m; + x->partition_info->bmi[i].mv.as_int = this_mv->as_int; + + } while (++i < 16); + + cost += thismvcost; + return cost; +} + +static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels, + int which_label, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl) { + int cost = 0; + int b; + MACROBLOCKD *x = &mb->e_mbd; + + for (b = 0; b < 16; ++b) { + if (labels[b] == which_label) { + cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_WITH_DC, + ta + vp8_block2above[b], tl + vp8_block2left[b]); + } + } + + return cost; +} +static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, + int const *labels, + int which_label) { + int i; + unsigned int distortion = 0; + int pre_stride = x->e_mbd.pre.y_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + + for (i = 0; i < 16; ++i) { + if (labels[i] == which_label) { + BLOCKD *bd = &x->e_mbd.block[i]; + BLOCK *be = &x->block[i]; + + vp8_build_inter_predictors_b(bd, 16, base_pre, pre_stride, + x->e_mbd.subpixel_predict); + vp8_subtract_b(be, bd, 16); + x->short_fdct4x4(be->src_diff, be->coeff, 32); + x->quantize_b(be, bd); + + distortion += vp8_block_error(be->coeff, bd->dqcoeff); + } + } + + return distortion; +} + +static const unsigned int segmentation_to_sseshift[4] = { 3, 3, 2, 0 }; + +typedef struct { + int_mv *ref_mv; + int_mv mvp; + + int segment_rd; + int segment_num; + int r; + int d; + int segment_yrate; + B_PREDICTION_MODE modes[16]; + int_mv mvs[16]; + unsigned char eobs[16]; + + int mvthresh; + int *mdcounts; + + int_mv sv_mvp[4]; /* save 4 mvp from 8x8 */ + int sv_istep[2]; /* save 2 initial step_param for 16x8/8x16 */ + +} BEST_SEG_INFO; + +static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, + unsigned int segmentation) { + int i; + int const *labels; + int br = 0; + int bd = 0; + B_PREDICTION_MODE this_mode; + + int label_count; + int this_segment_rd = 0; + int label_mv_thresh; + int rate = 0; + int sbr = 0; + int sbd = 0; + int segmentyrate = 0; + + vp8_variance_fn_ptr_t *v_fn_ptr; + + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; + + memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + vp8_zero(t_above_b); + vp8_zero(t_left_b); + + br = 0; + bd = 0; + + v_fn_ptr = &cpi->fn_ptr[segmentation]; + labels = vp8_mbsplits[segmentation]; + label_count = vp8_mbsplit_count[segmentation]; + + /* 64 makes this threshold really big effectively making it so that we + * very rarely check mvs on segments. setting this to 1 would make mv + * thresh roughly equal to what it is for macroblocks + */ + label_mv_thresh = 1 * bsi->mvthresh / label_count; + + /* Segmentation method overheads */ + rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, + vp8_mbsplit_encodings + segmentation); + rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts); + this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0); + br += rate; + + for (i = 0; i < label_count; ++i) { + int_mv mode_mv[B_MODE_COUNT] = { { 0 }, { 0 } }; + int best_label_rd = INT_MAX; + B_PREDICTION_MODE mode_selected = ZERO4X4; + int bestlabelyrate = 0; + + /* search for the best motion vector on this segment */ + for (this_mode = LEFT4X4; this_mode <= NEW4X4; ++this_mode) { + int this_rd; + int distortion; + int labelyrate; + ENTROPY_CONTEXT_PLANES t_above_s, t_left_s; + ENTROPY_CONTEXT *ta_s; + ENTROPY_CONTEXT *tl_s; + + memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta_s = (ENTROPY_CONTEXT *)&t_above_s; + tl_s = (ENTROPY_CONTEXT *)&t_left_s; + + if (this_mode == NEW4X4) { + int sseshift; + int num00; + int step_param = 0; + int further_steps; + int n; + int thissme; + int bestsme = INT_MAX; + int_mv temp_mv; + BLOCK *c; + BLOCKD *e; + + /* Is the best so far sufficiently good that we can't justify + * doing a new motion search. + */ + if (best_label_rd < label_mv_thresh) break; + + if (cpi->compressor_speed) { + if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8) { + bsi->mvp.as_int = bsi->sv_mvp[i].as_int; + if (i == 1 && segmentation == BLOCK_16X8) { + bsi->mvp.as_int = bsi->sv_mvp[2].as_int; + } + + step_param = bsi->sv_istep[i]; + } + + /* use previous block's result as next block's MV + * predictor. + */ + if (segmentation == BLOCK_4X4 && i > 0) { + bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.mv.as_int; + if (i == 4 || i == 8 || i == 12) { + bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.mv.as_int; + } + step_param = 2; + } + } + + further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + + { + int sadpb = x->sadperbit4; + int_mv mvp_full; + + mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3; + mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3; + + /* find first label */ + n = vp8_mbsplit_offset[segmentation][i]; + + c = &x->block[n]; + e = &x->e_mbd.block[n]; + + { + bestsme = cpi->diamond_search_sad( + x, c, e, &mvp_full, &mode_mv[NEW4X4], step_param, sadpb, &num00, + v_fn_ptr, x->mvcost, bsi->ref_mv); + + n = num00; + num00 = 0; + + while (n < further_steps) { + n++; + + if (num00) { + num00--; + } else { + thissme = cpi->diamond_search_sad( + x, c, e, &mvp_full, &temp_mv, step_param + n, sadpb, &num00, + v_fn_ptr, x->mvcost, bsi->ref_mv); + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEW4X4].as_int = temp_mv.as_int; + } + } + } + } + + sseshift = segmentation_to_sseshift[segmentation]; + + /* Should we do a full search (best quality only) */ + if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) { + /* Check if mvp_full is within the range. */ + vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, + x->mv_row_max); + + thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16, + v_fn_ptr, x->mvcost, bsi->ref_mv); + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEW4X4].as_int = e->bmi.mv.as_int; + } else { + /* The full search result is actually worse so + * re-instate the previous best vector + */ + e->bmi.mv.as_int = mode_mv[NEW4X4].as_int; + } + } + } + + if (bestsme < INT_MAX) { + int disto; + unsigned int sse; + cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], bsi->ref_mv, + x->errorperbit, v_fn_ptr, x->mvcost, + &disto, &sse); + } + } /* NEW4X4 */ + + rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode], + bsi->ref_mv, x->mvcost); + + /* Trap vectors that reach beyond the UMV borders */ + if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + continue; + } + + distortion = vp8_encode_inter_mb_segment(x, labels, i) / 4; + + labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s); + rate += labelyrate; + + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_label_rd) { + sbr = rate; + sbd = distortion; + bestlabelyrate = labelyrate; + mode_selected = this_mode; + best_label_rd = this_rd; + + memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); + } + } /*for each 4x4 mode*/ + + memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); + + labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], + bsi->ref_mv, x->mvcost); + + br += sbr; + bd += sbd; + segmentyrate += bestlabelyrate; + this_segment_rd += best_label_rd; + + if (this_segment_rd >= bsi->segment_rd) break; + + } /* for each label */ + + if (this_segment_rd < bsi->segment_rd) { + bsi->r = br; + bsi->d = bd; + bsi->segment_yrate = segmentyrate; + bsi->segment_rd = this_segment_rd; + bsi->segment_num = segmentation; + + /* store everything needed to come back to this!! */ + for (i = 0; i < 16; ++i) { + bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv; + bsi->modes[i] = x->partition_info->bmi[i].mode; + bsi->eobs[i] = x->e_mbd.eobs[i]; + } + } +} + +static void vp8_cal_step_param(int sr, int *sp) { + int step = 0; + + if (sr > MAX_FIRST_STEP) { + sr = MAX_FIRST_STEP; + } else if (sr < 1) { + sr = 1; + } + + while (sr >>= 1) step++; + + *sp = MAX_MVSEARCH_STEPS - 1 - step; +} + +static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, + int_mv *best_ref_mv, int best_rd, + int *mdcounts, int *returntotrate, + int *returnyrate, + int *returndistortion, + int mvthresh) { + int i; + BEST_SEG_INFO bsi; + + memset(&bsi, 0, sizeof(bsi)); + + bsi.segment_rd = best_rd; + bsi.ref_mv = best_ref_mv; + bsi.mvp.as_int = best_ref_mv->as_int; + bsi.mvthresh = mvthresh; + bsi.mdcounts = mdcounts; + + for (i = 0; i < 16; ++i) { + bsi.modes[i] = ZERO4X4; + } + + if (cpi->compressor_speed == 0) { + /* for now, we will keep the original segmentation order + when in best quality mode */ + rd_check_segment(cpi, x, &bsi, BLOCK_16X8); + rd_check_segment(cpi, x, &bsi, BLOCK_8X16); + rd_check_segment(cpi, x, &bsi, BLOCK_8X8); + rd_check_segment(cpi, x, &bsi, BLOCK_4X4); + } else { + int sr; + + rd_check_segment(cpi, x, &bsi, BLOCK_8X8); + + if (bsi.segment_rd < best_rd) { + int col_min = ((best_ref_mv->as_mv.col + 7) >> 3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv->as_mv.row + 7) >> 3) - MAX_FULL_PEL_VAL; + int col_max = (best_ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + /* Get intersection of UMV window and valid MV window to reduce # of + * checks in diamond search. */ + if (x->mv_col_min < col_min) x->mv_col_min = col_min; + if (x->mv_col_max > col_max) x->mv_col_max = col_max; + if (x->mv_row_min < row_min) x->mv_row_min = row_min; + if (x->mv_row_max > row_max) x->mv_row_max = row_max; + + /* Get 8x8 result */ + bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int; + bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int; + bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int; + bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int; + + /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range + * according to the closeness of 2 MV. */ + /* block 8X16 */ + { + sr = + MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3, + (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3); + vp8_cal_step_param(sr, &bsi.sv_istep[0]); + + sr = + MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3, + (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3); + vp8_cal_step_param(sr, &bsi.sv_istep[1]); + + rd_check_segment(cpi, x, &bsi, BLOCK_8X16); + } + + /* block 16X8 */ + { + sr = + MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3, + (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3); + vp8_cal_step_param(sr, &bsi.sv_istep[0]); + + sr = + MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3, + (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3); + vp8_cal_step_param(sr, &bsi.sv_istep[1]); + + rd_check_segment(cpi, x, &bsi, BLOCK_16X8); + } + + /* If 8x8 is better than 16x8/8x16, then do 4x4 search */ + /* Not skip 4x4 if speed=0 (good quality) */ + if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8) + /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */ + { + bsi.mvp.as_int = bsi.sv_mvp[0].as_int; + rd_check_segment(cpi, x, &bsi, BLOCK_4X4); + } + + /* restore UMV window */ + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + } + } + + /* set it to the best */ + for (i = 0; i < 16; ++i) { + BLOCKD *bd = &x->e_mbd.block[i]; + + bd->bmi.mv.as_int = bsi.mvs[i].as_int; + *bd->eob = bsi.eobs[i]; + } + + *returntotrate = bsi.r; + *returndistortion = bsi.d; + *returnyrate = bsi.segment_yrate; + + /* save partitions */ + x->e_mbd.mode_info_context->mbmi.partitioning = bsi.segment_num; + x->partition_info->count = vp8_mbsplit_count[bsi.segment_num]; + + for (i = 0; i < x->partition_info->count; ++i) { + int j; + + j = vp8_mbsplit_offset[bsi.segment_num][i]; + + x->partition_info->bmi[i].mode = bsi.modes[j]; + x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv; + } + /* + * used to set x->e_mbd.mode_info_context->mbmi.mv.as_int + */ + x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int; + + return bsi.segment_rd; +} + +/* The improved MV prediction */ +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]) { + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[8]; + int near_ref[8]; + int_mv mv; + int vcnt = 0; + int find = 0; + int mb_offset; + + int mvx[8]; + int mvy[8]; + int i; + + mv.as_int = 0; + + if (here->mbmi.ref_frame != INTRA_FRAME) { + near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = + near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = + near_mvs[6].as_int = near_mvs[7].as_int = 0; + near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = + near_ref[5] = near_ref[6] = near_ref[7] = 0; + + /* read in 3 nearby block's MVs from current frame as prediction + * candidates. + */ + if (above->mbmi.ref_frame != INTRA_FRAME) { + near_mvs[vcnt].as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = above->mbmi.ref_frame; + } + vcnt++; + if (left->mbmi.ref_frame != INTRA_FRAME) { + near_mvs[vcnt].as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = left->mbmi.ref_frame; + } + vcnt++; + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) { + near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = aboveleft->mbmi.ref_frame; + } + vcnt++; + + /* read in 5 nearby block's MVs from last frame. */ + if (cpi->common.last_frame_type != KEY_FRAME) { + mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1) + + (-xd->mb_to_left_edge / 128 + 1); + + /* current in last frame */ + if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) { + near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int; + mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = cpi->lf_ref_frame[mb_offset]; + } + vcnt++; + + /* above in last frame */ + if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] != + INTRA_FRAME) { + near_mvs[vcnt].as_int = + cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int; + mv_bias( + cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1], + refframe, &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = + cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1]; + } + vcnt++; + + /* left in last frame */ + if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) { + near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int; + mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1]; + } + vcnt++; + + /* right in last frame */ + if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) { + near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int; + mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe, + &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1]; + } + vcnt++; + + /* below in last frame */ + if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] != + INTRA_FRAME) { + near_mvs[vcnt].as_int = + cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int; + mv_bias( + cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1], + refframe, &near_mvs[vcnt], ref_frame_sign_bias); + near_ref[vcnt] = + cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1]; + } + vcnt++; + } + + for (i = 0; i < vcnt; ++i) { + if (near_ref[near_sadidx[i]] != INTRA_FRAME) { + if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) { + mv.as_int = near_mvs[near_sadidx[i]].as_int; + find = 1; + if (i < 3) { + *sr = 3; + } else { + *sr = 2; + } + break; + } + } + } + + if (!find) { + for (i = 0; i < vcnt; ++i) { + mvx[i] = near_mvs[i].as_mv.row; + mvy[i] = near_mvs[i].as_mv.col; + } + + insertsortmv(mvx, vcnt); + insertsortmv(mvy, vcnt); + mv.as_mv.row = mvx[vcnt / 2]; + mv.as_mv.col = mvy[vcnt / 2]; + + /* sr is set to 0 to allow calling function to decide the search + * range. + */ + *sr = 0; + } + } + + /* Set up return values */ + mvp->as_int = mv.as_int; + vp8_clamp_mv2(mvp, xd); +} + +void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, + int recon_yoffset, int near_sadidx[]) { + /* near_sad indexes: + * 0-cf above, 1-cf left, 2-cf aboveleft, + * 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below + */ + int near_sad[8] = { 0 }; + BLOCK *b = &x->block[0]; + unsigned char *src_y_ptr = *(b->base_src); + + /* calculate sad for current frame 3 nearby MBs. */ + if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) { + near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX; + } else if (xd->mb_to_top_edge == + 0) { /* only has left MB for sad calculation. */ + near_sad[0] = near_sad[2] = INT_MAX; + near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, xd->dst.y_buffer - 16, xd->dst.y_stride); + } else if (xd->mb_to_left_edge == + 0) { /* only has left MB for sad calculation. */ + near_sad[1] = near_sad[2] = INT_MAX; + near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride * 16, + xd->dst.y_stride); + } else { + near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride * 16, + xd->dst.y_stride); + near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, xd->dst.y_buffer - 16, xd->dst.y_stride); + near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride * 16 - 16, + xd->dst.y_stride); + } + + if (cpi->common.last_frame_type != KEY_FRAME) { + /* calculate sad for last frame 5 nearby MBs. */ + unsigned char *pre_y_buffer = + cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset; + int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride; + + if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX; + if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX; + if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX; + if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX; + + if (near_sad[4] != INT_MAX) { + near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride * 16, + pre_y_stride); + } + if (near_sad[5] != INT_MAX) { + near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride); + } + near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, + pre_y_buffer, pre_y_stride); + if (near_sad[6] != INT_MAX) { + near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride); + } + if (near_sad[7] != INT_MAX) { + near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf( + src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride * 16, + pre_y_stride); + } + } + + if (cpi->common.last_frame_type != KEY_FRAME) { + insertsortsad(near_sad, near_sadidx, 8); + } else { + insertsortsad(near_sad, near_sadidx, 3); + } +} + +static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { + if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV) { + int i; + + for (i = 0; i < x->partition_info->count; ++i) { + if (x->partition_info->bmi[i].mode == NEW4X4) { + x->MVcount[0][mv_max + ((x->partition_info->bmi[i].mv.as_mv.row - + best_ref_mv->as_mv.row) >> + 1)]++; + x->MVcount[1][mv_max + ((x->partition_info->bmi[i].mv.as_mv.col - + best_ref_mv->as_mv.col) >> + 1)]++; + } + } + } else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV) { + x->MVcount[0][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row - + best_ref_mv->as_mv.row) >> + 1)]++; + x->MVcount[1][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col - + best_ref_mv->as_mv.col) >> + 1)]++; + } +} + +static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd, + int *disable_skip, VP8_COMP *cpi, + MACROBLOCK *x) { + MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; + BLOCK *b = &x->block[0]; + MACROBLOCKD *xd = &x->e_mbd; + int distortion; + vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) { + x->skip = 1; + } else if (x->encode_breakout) { + unsigned int sse; + unsigned int var; + unsigned int threshold = + (xd->block[0].dequant[1] * xd->block[0].dequant[1] >> 4); + + if (threshold < x->encode_breakout) threshold = x->encode_breakout; + + var = vpx_variance16x16(*(b->base_src), b->src_stride, x->e_mbd.predictor, + 16, &sse); + + if (sse < threshold) { + unsigned int q2dc = xd->block[24].dequant[0]; + /* If theres is no codeable 2nd order dc + or a very small uniform pixel change change */ + if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { + /* Check u and v to make sure skip is ok */ + unsigned int sse2 = VP8_UVSSE(x); + if (sse2 * 2 < threshold) { + x->skip = 1; + rd->distortion2 = sse + sse2; + rd->rate2 = 500; + + /* for best_yrd calculation */ + rd->rate_uv = 0; + rd->distortion_uv = sse2; + + *disable_skip = 1; + return RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2); + } + } + } + } + + /* Add in the Mv/mode cost */ + rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + + /* Y cost and distortion */ + macro_block_yrd(x, &rd->rate_y, &distortion); + rd->rate2 += rd->rate_y; + rd->distortion2 += distortion; + + /* UV cost and distortion */ + rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv, + cpi->common.full_pixel); + rd->rate2 += rd->rate_uv; + rd->distortion2 += rd->distortion_uv; + return INT_MAX; +} + +static int calculate_final_rd_costs(int this_rd, RATE_DISTORTION *rd, + int *other_cost, int disable_skip, + int uv_intra_tteob, int intra_rd_penalty, + VP8_COMP *cpi, MACROBLOCK *x) { + MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; + + /* Where skip is allowable add in the default per mb cost for the no + * skip case. where we then decide to skip we have to delete this and + * replace it with the cost of signalling a skip + */ + if (cpi->common.mb_no_coeff_skip) { + *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0); + rd->rate2 += *other_cost; + } + + /* Estimate the reference frame signaling cost and add it + * to the rolling cost variable. + */ + rd->rate2 += x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; + + if (!disable_skip) { + /* Test for the condition where skip block will be activated + * because there are no non zero coefficients and make any + * necessary adjustment for rate + */ + if (cpi->common.mb_no_coeff_skip) { + int i; + int tteob; + int has_y2_block = (this_mode != SPLITMV && this_mode != B_PRED); + + tteob = 0; + if (has_y2_block) tteob += x->e_mbd.eobs[24]; + + for (i = 0; i < 16; ++i) tteob += (x->e_mbd.eobs[i] > has_y2_block); + + if (x->e_mbd.mode_info_context->mbmi.ref_frame) { + for (i = 16; i < 24; ++i) tteob += x->e_mbd.eobs[i]; + } else { + tteob += uv_intra_tteob; + } + + if (tteob == 0) { + rd->rate2 -= (rd->rate_y + rd->rate_uv); + /* for best_yrd calculation */ + rd->rate_uv = 0; + + /* Back out no skip flag costing and add in skip flag costing */ + if (cpi->prob_skip_false) { + int prob_skip_cost; + + prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1); + prob_skip_cost -= (int)vp8_cost_bit(cpi->prob_skip_false, 0); + rd->rate2 += prob_skip_cost; + *other_cost += prob_skip_cost; + } + } + } + /* Calculate the final RD estimate for this mode */ + this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2); + if (this_rd < INT_MAX && + x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + this_rd += intra_rd_penalty; + } + } + return this_rd; +} + +static void update_best_mode(BEST_MODE *best_mode, int this_rd, + RATE_DISTORTION *rd, int other_cost, + MACROBLOCK *x) { + MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; + + other_cost += x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; + + /* Calculate the final y RD estimate for this mode */ + best_mode->yrd = + RDCOST(x->rdmult, x->rddiv, (rd->rate2 - rd->rate_uv - other_cost), + (rd->distortion2 - rd->distortion_uv)); + + best_mode->rd = this_rd; + memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, + sizeof(MB_MODE_INFO)); + memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO)); + + if ((this_mode == B_PRED) || (this_mode == SPLITMV)) { + int i; + for (i = 0; i < 16; ++i) { + best_mode->bmodes[i] = x->e_mbd.block[i].bmi; + } + } +} + +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col) { + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + int_mv best_ref_mv_sb[2]; + int_mv mode_mv_sb[2][MB_MODE_COUNT]; + int_mv best_ref_mv; + int_mv *mode_mv; + MB_PREDICTION_MODE this_mode; + int num00; + int best_mode_index = 0; + BEST_MODE best_mode; + + int i; + int mode_index; + int mdcounts[4]; + int rate; + RATE_DISTORTION rd; + int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly; + int uv_intra_tteob = 0; + int uv_intra_done = 0; + + MB_PREDICTION_MODE uv_intra_mode = 0; + int_mv mvp; + int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + int saddone = 0; + /* search range got from mv_pred(). It uses step_param levels. (0-7) */ + int sr = 0; + + unsigned char *plane[4][3] = { { 0, 0 } }; + int ref_frame_map[4]; + int sign_bias = 0; + + int intra_rd_penalty = + 10 * vp8_dc_quant(cpi->common.base_qindex, cpi->common.y1dc_delta_q); + +#if CONFIG_TEMPORAL_DENOISING + unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX, + best_rd_sse = UINT_MAX; +#endif + + // _uv variables are not set consistantly before calling update_best_mode. + rd.rate_uv = 0; + rd.distortion_uv = 0; + + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = 0; + best_mode.rd = INT_MAX; + best_mode.yrd = INT_MAX; + best_mode.intra_rd = INT_MAX; + memset(mode_mv_sb, 0, sizeof(mode_mv_sb)); + memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode)); + memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes)); + + /* Setup search priorities */ + get_reference_search_order(cpi, ref_frame_map); + + /* Check to see if there is at least 1 valid reference frame that we need + * to calculate near_mvs. + */ + if (ref_frame_map[1] > 0) { + sign_bias = vp8_find_near_mvs_bias( + &x->e_mbd, x->e_mbd.mode_info_context, mode_mv_sb, best_ref_mv_sb, + mdcounts, ref_frame_map[1], cpi->common.ref_frame_sign_bias); + + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int; + } + + get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); + + *returnintra = INT_MAX; + /* Count of the number of MBs tested so far this frame */ + x->mbs_tested_so_far++; + + x->skip = 0; + + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { + int this_rd = INT_MAX; + int disable_skip = 0; + int other_cost = 0; + int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]]; + + /* Test best rd so far against threshold for trying this mode. */ + if (best_mode.rd <= x->rd_threshes[mode_index]) continue; + + if (this_ref_frame < 0) continue; + + /* These variables hold are rolling total cost and distortion for + * this mode + */ + rd.rate2 = 0; + rd.distortion2 = 0; + + this_mode = vp8_mode_order[mode_index]; + + x->e_mbd.mode_info_context->mbmi.mode = this_mode; + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + + /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + * unless ARNR filtering is enabled in which case we want + * an unfiltered alternative + */ + if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + if (this_mode != ZEROMV || + x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) { + continue; + } + } + + /* everything but intra */ + if (x->e_mbd.mode_info_context->mbmi.ref_frame) { + assert(plane[this_ref_frame][0] != NULL && + plane[this_ref_frame][1] != NULL && + plane[this_ref_frame][2] != NULL); + x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; + x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; + x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; + + if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame]) { + sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame]; + mode_mv = mode_mv_sb[sign_bias]; + best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int; + } + } + + /* Check to see if the testing frequency for this mode is at its + * max If so then prevent it from being tested and increase the + * threshold for its testing + */ + if (x->mode_test_hit_counts[mode_index] && + (cpi->mode_check_freq[mode_index] > 1)) { + if (x->mbs_tested_so_far <= cpi->mode_check_freq[mode_index] * + x->mode_test_hit_counts[mode_index]) { + /* Increase the threshold for coding this mode to make it + * less likely to be chosen + */ + x->rd_thresh_mult[mode_index] += 4; + + if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT) { + x->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + } + + x->rd_threshes[mode_index] = + (cpi->rd_baseline_thresh[mode_index] >> 7) * + x->rd_thresh_mult[mode_index]; + + continue; + } + } + + /* We have now reached the point where we are going to test the + * current mode so increment the counter for the number of times + * it has been tested + */ + x->mode_test_hit_counts[mode_index]++; + + /* Experimental code. Special case for gf and arf zeromv modes. + * Increase zbin size to supress noise + */ + if (x->zbin_mode_boost_enabled) { + if (this_ref_frame == INTRA_FRAME) { + x->zbin_mode_boost = 0; + } else { + if (vp8_mode_order[mode_index] == ZEROMV) { + if (this_ref_frame != LAST_FRAME) { + x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; + } else { + x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; + } + } else if (vp8_mode_order[mode_index] == SPLITMV) { + x->zbin_mode_boost = 0; + } else { + x->zbin_mode_boost = MV_ZBIN_BOOST; + } + } + + vp8_update_zbin_extra(cpi, x); + } + + if (!uv_intra_done && this_ref_frame == INTRA_FRAME) { + rd_pick_intra_mbuv_mode(x, &uv_intra_rate, &uv_intra_rate_tokenonly, + &uv_intra_distortion); + uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode; + + /* + * Total of the eobs is used later to further adjust rate2. Since uv + * block's intra eobs will be overwritten when we check inter modes, + * we need to save uv_intra_tteob here. + */ + for (i = 16; i < 24; ++i) uv_intra_tteob += x->e_mbd.eobs[i]; + + uv_intra_done = 1; + } + + switch (this_mode) { + case B_PRED: { + int tmp_rd; + + /* Note the rate value returned here includes the cost of + * coding the BPRED mode: x->mbmode_cost[x->e_mbd.frame_type][BPRED] + */ + int distortion; + tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, + best_mode.yrd); + rd.rate2 += rate; + rd.distortion2 += distortion; + + if (tmp_rd < best_mode.yrd) { + assert(uv_intra_done); + rd.rate2 += uv_intra_rate; + rd.rate_uv = uv_intra_rate_tokenonly; + rd.distortion2 += uv_intra_distortion; + rd.distortion_uv = uv_intra_distortion; + } else { + this_rd = INT_MAX; + disable_skip = 1; + } + break; + } + + case SPLITMV: { + int tmp_rd; + int this_rd_thresh; + int distortion; + + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) + ? x->rd_threshes[THR_NEW1] + : x->rd_threshes[THR_NEW3]; + this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) + ? x->rd_threshes[THR_NEW2] + : this_rd_thresh; + + tmp_rd = vp8_rd_pick_best_mbsegmentation( + cpi, x, &best_ref_mv, best_mode.yrd, mdcounts, &rate, &rd.rate_y, + &distortion, this_rd_thresh); + + rd.rate2 += rate; + rd.distortion2 += distortion; + + /* If even the 'Y' rd value of split is higher than best so far + * then don't bother looking at UV + */ + if (tmp_rd < best_mode.yrd) { + /* Now work out UV cost and add it in */ + rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, + cpi->common.full_pixel); + rd.rate2 += rd.rate_uv; + rd.distortion2 += rd.distortion_uv; + } else { + this_rd = INT_MAX; + disable_skip = 1; + } + break; + } + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: { + int distortion; + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + vp8_build_intra_predictors_mby_s( + xd, xd->dst.y_buffer - xd->dst.y_stride, xd->dst.y_buffer - 1, + xd->dst.y_stride, xd->predictor, 16); + macro_block_yrd(x, &rd.rate_y, &distortion); + rd.rate2 += rd.rate_y; + rd.distortion2 += distortion; + rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type] + [x->e_mbd.mode_info_context->mbmi.mode]; + assert(uv_intra_done); + rd.rate2 += uv_intra_rate; + rd.rate_uv = uv_intra_rate_tokenonly; + rd.distortion2 += uv_intra_distortion; + rd.distortion_uv = uv_intra_distortion; + break; + } + + case NEWMV: { + int thissme; + int bestsme = INT_MAX; + int step_param = cpi->sf.first_step; + int further_steps; + int n; + /* If last step (1-away) of n-step search doesn't pick the center point + as the best match, we will do a final 1-away diamond refining search + */ + int do_refine = 1; + + int sadpb = x->sadperbit16; + int_mv mvp_full; + + int col_min = ((best_ref_mv.as_mv.col + 7) >> 3) - MAX_FULL_PEL_VAL; + int row_min = ((best_ref_mv.as_mv.row + 7) >> 3) - MAX_FULL_PEL_VAL; + int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + if (!saddone) { + vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]); + saddone = 1; + } + + vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp, + x->e_mbd.mode_info_context->mbmi.ref_frame, + cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]); + + mvp_full.as_mv.col = mvp.as_mv.col >> 3; + mvp_full.as_mv.row = mvp.as_mv.row >> 3; + + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. + */ + if (x->mv_col_min < col_min) x->mv_col_min = col_min; + if (x->mv_col_max > col_max) x->mv_col_max = col_max; + if (x->mv_row_min < row_min) x->mv_row_min = row_min; + if (x->mv_row_max > row_max) x->mv_row_max = row_max; + + /* adjust search range according to sr from mv prediction */ + if (sr > step_param) step_param = sr; + + /* Initial step/diamond search */ + { + bestsme = cpi->diamond_search_sad( + x, b, d, &mvp_full, &d->bmi.mv, step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + /* Further step/diamond searches as necessary */ + further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + /* If there won't be more n-step search, check to see if refining + * search is needed. */ + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + n++; + + if (num00) { + num00--; + } else { + thissme = cpi->diamond_search_sad( + x, b, d, &mvp_full, &d->bmi.mv, step_param + n, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + + /* check to see if refining search is needed. */ + if (num00 > (further_steps - n)) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } else { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } + } + } + } + + /* final 1-away diamond refining search */ + if (do_refine == 1) { + int search_range; + + search_range = 8; + + thissme = cpi->refining_search_sad( + x, b, d, &d->bmi.mv, sadpb, search_range, + &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } else { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } + } + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + cpi->find_fractional_mv_step( + x, b, d, &d->bmi.mv, &best_ref_mv, x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse); + } + + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + /* Add the new motion vector cost to our rolling cost variable */ + rd.rate2 += + vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96); + } + // fall through + + case NEARESTMV: + case NEARMV: + /* Clip "next_nearest" so that it does not extend to far out + * of image + */ + vp8_clamp_mv2(&mode_mv[this_mode], xd); + + /* Do not bother proceeding if the vector (from newmv, nearest + * or near) is 0,0 as this should then be coded using the zeromv + * mode. + */ + if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && + (mode_mv[this_mode].as_int == 0)) { + continue; + } + // fall through + + case ZEROMV: + + /* Trap vectors that reach beyond the UMV borders + * Note that ALL New MV, Nearest MV Near MV and Zero MV code + * drops through to this point because of the lack of break + * statements in the previous two cases. + */ + if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + continue; + } + + vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); + this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x); + break; + + default: break; + } + + this_rd = + calculate_final_rd_costs(this_rd, &rd, &other_cost, disable_skip, + uv_intra_tteob, intra_rd_penalty, cpi, x); + + /* Keep record of best intra distortion */ + if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) && + (this_rd < best_mode.intra_rd)) { + best_mode.intra_rd = this_rd; + *returnintra = rd.distortion2; + } +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + unsigned int sse; + vp8_get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse, + mode_mv[this_mode]); + + if (sse < best_rd_sse) best_rd_sse = sse; + + /* Store for later use by denoiser. */ + if (this_mode == ZEROMV && sse < zero_mv_sse) { + zero_mv_sse = sse; + x->best_zeromv_reference_frame = + x->e_mbd.mode_info_context->mbmi.ref_frame; + } + + /* Store the best NEWMV in x for later use in the denoiser. */ + if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV && sse < best_sse) { + best_sse = sse; + vp8_get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &best_sse, + mode_mv[this_mode]); + x->best_sse_inter_mode = NEWMV; + x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv; + x->need_to_clamp_best_mvs = + x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs; + x->best_reference_frame = x->e_mbd.mode_info_context->mbmi.ref_frame; + } + } +#endif + + /* Did this mode help.. i.i is it the new best mode */ + if (this_rd < best_mode.rd || x->skip) { + /* Note index of best mode so far */ + best_mode_index = mode_index; + *returnrate = rd.rate2; + *returndistortion = rd.distortion2; + if (this_mode <= B_PRED) { + x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode; + /* required for left and above block mv */ + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + } + update_best_mode(&best_mode, this_rd, &rd, other_cost, x); + + /* Testing this mode gave rise to an improvement in best error + * score. Lower threshold a bit for next time + */ + x->rd_thresh_mult[mode_index] = + (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) + ? x->rd_thresh_mult[mode_index] - 2 + : MIN_THRESHMULT; + } + + /* If the mode did not help improve the best error case then raise + * the threshold for testing that mode next time around. + */ + else { + x->rd_thresh_mult[mode_index] += 4; + + if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT) { + x->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + } + } + x->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * + x->rd_thresh_mult[mode_index]; + + if (x->skip) break; + } + + /* Reduce the activation RD thresholds for the best choice mode */ + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && + (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) { + int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 2); + + x->rd_thresh_mult[best_mode_index] = + (x->rd_thresh_mult[best_mode_index] >= + (MIN_THRESHMULT + best_adjustment)) + ? x->rd_thresh_mult[best_mode_index] - best_adjustment + : MIN_THRESHMULT; + x->rd_threshes[best_mode_index] = + (cpi->rd_baseline_thresh[best_mode_index] >> 7) * + x->rd_thresh_mult[best_mode_index]; + } + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + if (x->best_sse_inter_mode == DC_PRED) { + /* No best MV found. */ + x->best_sse_inter_mode = best_mode.mbmode.mode; + x->best_sse_mv = best_mode.mbmode.mv; + x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs; + x->best_reference_frame = best_mode.mbmode.ref_frame; + best_sse = best_rd_sse; + } + vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse, + recon_yoffset, recon_uvoffset, &cpi->common.lf_info, + mb_row, mb_col, block_index, 0); + + /* Reevaluate ZEROMV after denoising. */ + if (best_mode.mbmode.ref_frame == INTRA_FRAME && + x->best_zeromv_reference_frame != INTRA_FRAME) { + int this_rd = INT_MAX; + int disable_skip = 0; + int other_cost = 0; + int this_ref_frame = x->best_zeromv_reference_frame; + rd.rate2 = + x->ref_frame_cost[this_ref_frame] + vp8_cost_mv_ref(ZEROMV, mdcounts); + rd.distortion2 = 0; + + /* set up the proper prediction buffers for the frame */ + x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame; + x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; + x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; + x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; + + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + + this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x); + this_rd = + calculate_final_rd_costs(this_rd, &rd, &other_cost, disable_skip, + uv_intra_tteob, intra_rd_penalty, cpi, x); + if (this_rd < best_mode.rd || x->skip) { + *returnrate = rd.rate2; + *returndistortion = rd.distortion2; + update_best_mode(&best_mode, this_rd, &rd, other_cost, x); + } + } + } +#endif + + if (cpi->is_src_frame_alt_ref && + (best_mode.mbmode.mode != ZEROMV || + best_mode.mbmode.ref_frame != ALTREF_FRAME)) { + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = + (cpi->common.mb_no_coeff_skip); + x->e_mbd.mode_info_context->mbmi.partitioning = 0; + return; + } + + /* macroblock modes */ + memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, + sizeof(MB_MODE_INFO)); + + if (best_mode.mbmode.mode == B_PRED) { + for (i = 0; i < 16; ++i) { + xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode; + } + } + + if (best_mode.mbmode.mode == SPLITMV) { + for (i = 0; i < 16; ++i) { + xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int; + } + + memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO)); + + x->e_mbd.mode_info_context->mbmi.mv.as_int = + x->partition_info->bmi[15].mv.as_int; + } + + if (sign_bias != + cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) { + best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int; + } + + rd_update_mvcount(x, &best_ref_mv); +} + +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) { + int error4x4, error16x16; + int rate4x4, rate16x16 = 0, rateuv; + int dist4x4, dist16x16, distuv; + int rate_; + int rate4x4_tokenonly = 0; + int rate16x16_tokenonly = 0; + int rateuv_tokenonly = 0; + + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv); + rate_ = rateuv; + + error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly, + &dist16x16); + + error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly, + &dist4x4, error16x16); + + if (error4x4 < error16x16) { + x->e_mbd.mode_info_context->mbmi.mode = B_PRED; + rate_ += rate4x4; + } else { + rate_ += rate16x16; + } + + *rate = rate_; +} diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.h b/media/libvpx/libvpx/vp8/encoder/rdopt.h new file mode 100644 index 0000000000..cc3db8197c --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/rdopt.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_RDOPT_H_ +#define VPX_VP8_ENCODER_RDOPT_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) + +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + +static INLINE void insertsortmv(int arr[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (arr[j] > arr[i]) { + int temp; + + temp = arr[i]; + + for (k = i; k > j; k--) arr[k] = arr[k - 1]; + + arr[j] = temp; + } + } + } +} + +static INLINE void insertsortsad(int arr[], int idx[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (arr[j] > arr[i]) { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for (k = i; k > j; k--) { + arr[k] = arr[k - 1]; + idx[k] = idx[k - 1]; + } + + arr[j] = temp; + idx[j] = tempi; + } + } + } +} + +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col); +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); + +static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb, + unsigned char *plane[3], + unsigned int recon_yoffset, + unsigned int recon_uvoffset) { + plane[0] = fb->y_buffer + recon_yoffset; + plane[1] = fb->u_buffer + recon_uvoffset; + plane[2] = fb->v_buffer + recon_uvoffset; +} + +static INLINE void get_predictor_pointers(const VP8_COMP *cpi, + unsigned char *plane[4][3], + unsigned int recon_yoffset, + unsigned int recon_uvoffset) { + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx], + plane[LAST_FRAME], recon_yoffset, recon_uvoffset); + } + + if (cpi->ref_frame_flags & VP8_GOLD_FRAME) { + get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx], + plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset); + } + + if (cpi->ref_frame_flags & VP8_ALTR_FRAME) { + get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx], + plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset); + } +} + +static INLINE void get_reference_search_order(const VP8_COMP *cpi, + int ref_frame_map[4]) { + int i = 0; + + ref_frame_map[i++] = INTRA_FRAME; + if (cpi->ref_frame_flags & VP8_LAST_FRAME) ref_frame_map[i++] = LAST_FRAME; + if (cpi->ref_frame_flags & VP8_GOLD_FRAME) ref_frame_map[i++] = GOLDEN_FRAME; + if (cpi->ref_frame_flags & VP8_ALTR_FRAME) ref_frame_map[i++] = ALTREF_FRAME; + for (; i < 4; ++i) ref_frame_map[i] = -1; +} + +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]); +void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, + int recon_yoffset, int near_sadidx[]); +int VP8_UVSSE(MACROBLOCK *x); +int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); +void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_RDOPT_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.c b/media/libvpx/libvpx/vp8/encoder/segmentation.c new file mode 100644 index 0000000000..2127258111 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/segmentation.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "segmentation.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { + int mb_row, mb_col; + + MODE_INFO *this_mb_mode_info = cm->mi; + + x->gf_active_ptr = (signed char *)cpi->gf_active_flags; + + if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { + /* Reset Gf usage monitors */ + memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); + cpi->gf_active_count = cm->mb_rows * cm->mb_cols; + } else { + /* for each macroblock row in image */ + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + /* for each macroblock col in image */ + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + /* If using golden then set GF active flag if not already set. + * If using last frame 0,0 mode then leave flag as it is + * else if using non 0,0 motion or intra modes then clear + * flag if it is currently set + */ + if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || + (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) { + if (*(x->gf_active_ptr) == 0) { + *(x->gf_active_ptr) = 1; + cpi->gf_active_count++; + } + } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && + *(x->gf_active_ptr)) { + *(x->gf_active_ptr) = 0; + cpi->gf_active_count--; + } + + x->gf_active_ptr++; /* Step onto next entry */ + this_mb_mode_info++; /* skip to next mb */ + } + + /* this is to account for the border */ + this_mb_mode_info++; + } + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.h b/media/libvpx/libvpx/vp8/encoder/segmentation.h new file mode 100644 index 0000000000..0fecfc2212 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/segmentation.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_SEGMENTATION_H_ +#define VPX_VP8_ENCODER_SEGMENTATION_H_ + +#include "string.h" +#include "vp8/common/blockd.h" +#include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_SEGMENTATION_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.c b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c new file mode 100644 index 0000000000..1c1a55fde6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/onyxc_int.h" +#include "onyx_int.h" +#include "vp8/common/systemdependent.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/alloccommon.h" +#include "mcomp.h" +#include "firstpass.h" +#include "vpx_scale/vpx_scale.h" +#include "vp8/common/extend.h" +#include "ratectrl.h" +#include "vp8/common/quant_common.h" +#include "segmentation.h" +#include "temporal_filter.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/common/swapyv12buffer.h" +#include "vp8/common/threading.h" +#include "vpx_ports/vpx_timer.h" + +#include +#include + +#define ALT_REF_MC_ENABLED 1 /* toggle MC in AltRef filtering */ +#define ALT_REF_SUBPEL_ENABLED 1 /* toggle subpel in MC AltRef filtering */ + +#if VP8_TEMPORAL_ALT_REF + +static void vp8_temporal_filter_predictors_mb_c( + MACROBLOCKD *x, unsigned char *y_mb_ptr, unsigned char *u_mb_ptr, + unsigned char *v_mb_ptr, int stride, int mv_row, int mv_col, + unsigned char *pred) { + int offset; + unsigned char *yptr, *uptr, *vptr; + + /* Y */ + yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) { + x->subpixel_predict16x16(yptr, stride, mv_col & 7, mv_row & 7, &pred[0], + 16); + } else { + vp8_copy_mem16x16(yptr, stride, &pred[0], 16); + } + + /* U & V */ + mv_row >>= 1; + mv_col >>= 1; + stride = (stride + 1) >> 1; + offset = (mv_row >> 3) * stride + (mv_col >> 3); + uptr = u_mb_ptr + offset; + vptr = v_mb_ptr + offset; + + if ((mv_row | mv_col) & 7) { + x->subpixel_predict8x8(uptr, stride, mv_col & 7, mv_row & 7, &pred[256], 8); + x->subpixel_predict8x8(vptr, stride, mv_col & 7, mv_row & 7, &pred[320], 8); + } else { + vp8_copy_mem8x8(uptr, stride, &pred[256], 8); + vp8_copy_mem8x8(vptr, stride, &pred[320], 8); + } +} +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, + unsigned char *frame2, unsigned int block_size, + int strength, int filter_weight, + unsigned int *accumulator, + unsigned short *count) { + unsigned int i, j, k; + int modifier; + int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + for (i = 0, k = 0; i < block_size; ++i) { + for (j = 0; j < block_size; j++, k++) { + int src_byte = frame1[byte]; + int pixel_value = *frame2++; + + modifier = src_byte - pixel_value; + /* This is an integer approximation of: + * float coeff = (3.0 * modifer * modifier) / pow(2, strength); + * modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff); + */ + modifier *= modifier; + modifier *= 3; + modifier += rounding; + modifier >>= strength; + + if (modifier > 16) modifier = 16; + + modifier = 16 - modifier; + modifier *= filter_weight; + + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + + byte++; + } + + byte += stride - block_size; + } +} + +#if ALT_REF_MC_ENABLED + +static int vp8_temporal_filter_find_matching_mb_c(VP8_COMP *cpi, + YV12_BUFFER_CONFIG *arf_frame, + YV12_BUFFER_CONFIG *frame_ptr, + int mb_offset, + int error_thresh) { + MACROBLOCK *x = &cpi->mb; + int step_param; + int sadpb = x->sadperbit16; + int bestsme = INT_MAX; + + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + int_mv best_ref_mv1; + int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + /* Save input state */ + unsigned char **base_src = b->base_src; + int src = b->src; + int src_stride = b->src_stride; + unsigned char *base_pre = x->e_mbd.pre.y_buffer; + int pre = d->offset; + int pre_stride = x->e_mbd.pre.y_stride; + + (void)error_thresh; + + best_ref_mv1.as_int = 0; + best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3; + best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3; + + /* Setup frame pointers */ + b->base_src = &arf_frame->y_buffer; + b->src_stride = arf_frame->y_stride; + b->src = mb_offset; + + x->e_mbd.pre.y_buffer = frame_ptr->y_buffer; + x->e_mbd.pre.y_stride = frame_ptr->y_stride; + d->offset = mb_offset; + + /* Further step/diamond searches as necessary */ + if (cpi->Speed < 8) { + step_param = cpi->sf.first_step + (cpi->Speed > 5); + } else { + step_param = cpi->sf.first_step + 2; + } + + /* TODO Check that the 16x16 vf & sdf are selected here */ + /* Ignore mv costing by sending NULL cost arrays */ + bestsme = + vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv, step_param, sadpb, + &cpi->fn_ptr[BLOCK_16X16], NULL, &best_ref_mv1); + (void)bestsme; // Ignore unused return value. + +#if ALT_REF_SUBPEL_ENABLED + /* Try sub-pixel MC? */ + { + int distortion; + unsigned int sse; + /* Ignore mv costing by sending NULL cost array */ + bestsme = cpi->find_fractional_mv_step( + x, b, d, &d->bmi.mv, &best_ref_mv1, x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], NULL, &distortion, &sse); + } +#endif + + /* Save input state */ + b->base_src = base_src; + b->src = src; + b->src_stride = src_stride; + x->e_mbd.pre.y_buffer = base_pre; + d->offset = pre; + x->e_mbd.pre.y_stride = pre_stride; + + return bestsme; +} +#endif + +static void vp8_temporal_filter_iterate_c(VP8_COMP *cpi, int frame_count, + int alt_ref_index, int strength) { + int byte; + int frame; + int mb_col, mb_row; + unsigned int filter_weight; + int mb_cols = cpi->common.mb_cols; + int mb_rows = cpi->common.mb_rows; + int mb_y_offset = 0; + int mb_uv_offset = 0; + DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 + 8 * 8 + 8 * 8]); + DECLARE_ALIGNED(16, unsigned short, count[16 * 16 + 8 * 8 + 8 * 8]); + MACROBLOCKD *mbd = &cpi->mb.e_mbd; + YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index]; + unsigned char *dst1, *dst2; + DECLARE_ALIGNED(16, unsigned char, predictor[16 * 16 + 8 * 8 + 8 * 8]); + + /* Save input state */ + unsigned char *y_buffer = mbd->pre.y_buffer; + unsigned char *u_buffer = mbd->pre.u_buffer; + unsigned char *v_buffer = mbd->pre.v_buffer; + + for (mb_row = 0; mb_row < mb_rows; ++mb_row) { +#if ALT_REF_MC_ENABLED + /* Source frames are extended to 16 pixels. This is different than + * L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS) + * A 6 tap filter is used for motion search. This requires 2 pixels + * before and 3 pixels after. So the largest Y mv on a border would + * then be 16 - 3. The UV blocks are half the size of the Y and + * therefore only extended by 8. The largest mv that a UV block + * can support is 8 - 3. A UV mv is half of a Y mv. + * (16 - 3) >> 1 == 6 which is greater than 8 - 3. + * To keep the mv in play for both Y and UV planes the max that it + * can be on a border is therefore 16 - 5. + */ + cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5)); + cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16) + (16 - 5); +#endif + + for (mb_col = 0; mb_col < mb_cols; ++mb_col) { + int i, j, k; + int stride; + + memset(accumulator, 0, 384 * sizeof(unsigned int)); + memset(count, 0, 384 * sizeof(unsigned short)); + +#if ALT_REF_MC_ENABLED + cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5)); + cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16) + (16 - 5); +#endif + + for (frame = 0; frame < frame_count; ++frame) { + if (cpi->frames[frame] == NULL) continue; + + mbd->block[0].bmi.mv.as_mv.row = 0; + mbd->block[0].bmi.mv.as_mv.col = 0; + + if (frame == alt_ref_index) { + filter_weight = 2; + } else { + int err = 0; +#if ALT_REF_MC_ENABLED +#define THRESH_LOW 10000 +#define THRESH_HIGH 20000 + /* Find best match in this frame by MC */ + err = vp8_temporal_filter_find_matching_mb_c( + cpi, cpi->frames[alt_ref_index], cpi->frames[frame], mb_y_offset, + THRESH_LOW); +#endif + /* Assign higher weight to matching MB if it's error + * score is lower. If not applying MC default behavior + * is to weight all MBs equal. + */ + filter_weight = err < THRESH_LOW ? 2 : err < THRESH_HIGH ? 1 : 0; + } + + if (filter_weight != 0) { + /* Construct the predictors */ + vp8_temporal_filter_predictors_mb_c( + mbd, cpi->frames[frame]->y_buffer + mb_y_offset, + cpi->frames[frame]->u_buffer + mb_uv_offset, + cpi->frames[frame]->v_buffer + mb_uv_offset, + cpi->frames[frame]->y_stride, mbd->block[0].bmi.mv.as_mv.row, + mbd->block[0].bmi.mv.as_mv.col, predictor); + + /* Apply the filter (YUV) */ + vp8_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, strength, filter_weight, + accumulator, count); + + vp8_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, 8, strength, filter_weight, + accumulator + 256, count + 256); + + vp8_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 320, 8, strength, filter_weight, + accumulator + 320, count + 320); + } + } + + /* Normalize filter output to produce AltRef frame */ + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; ++i) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= cpi->fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (unsigned char)pval; + + /* move to next pixel */ + byte++; + } + + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < 8; ++i) { + for (j = 0; j < 8; j++, k++) { + int m = k + 64; + + /* U */ + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= cpi->fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (unsigned char)pval; + + /* V */ + pval = accumulator[m] + (count[m] >> 1); + pval *= cpi->fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (unsigned char)pval; + + /* move to next pixel */ + byte++; + } + + byte += stride - 8; + } + + mb_y_offset += 16; + mb_uv_offset += 8; + } + + mb_y_offset += 16 * (f->y_stride - mb_cols); + mb_uv_offset += 8 * (f->uv_stride - mb_cols); + } + + /* Restore input state */ + mbd->pre.y_buffer = y_buffer; + mbd->pre.u_buffer = u_buffer; + mbd->pre.v_buffer = v_buffer; +} + +void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance) { + int frame = 0; + + int num_frames_backward = 0; + int num_frames_forward = 0; + int frames_to_blur_backward = 0; + int frames_to_blur_forward = 0; + int frames_to_blur = 0; + int start_frame = 0; + + int strength = cpi->oxcf.arnr_strength; + + int blur_type = cpi->oxcf.arnr_type; + + int max_frames = cpi->active_arnr_frames; + + num_frames_backward = distance; + num_frames_forward = + vp8_lookahead_depth(cpi->lookahead) - (num_frames_backward + 1); + + switch (blur_type) { + case 1: + /* Backward Blur */ + + frames_to_blur_backward = num_frames_backward; + + if (frames_to_blur_backward >= max_frames) { + frames_to_blur_backward = max_frames - 1; + } + + frames_to_blur = frames_to_blur_backward + 1; + break; + + case 2: + /* Forward Blur */ + + frames_to_blur_forward = num_frames_forward; + + if (frames_to_blur_forward >= max_frames) { + frames_to_blur_forward = max_frames - 1; + } + + frames_to_blur = frames_to_blur_forward + 1; + break; + + case 3: + default: + /* Center Blur */ + frames_to_blur_forward = num_frames_forward; + frames_to_blur_backward = num_frames_backward; + + if (frames_to_blur_forward > frames_to_blur_backward) { + frames_to_blur_forward = frames_to_blur_backward; + } + + if (frames_to_blur_backward > frames_to_blur_forward) { + frames_to_blur_backward = frames_to_blur_forward; + } + + /* When max_frames is even we have 1 more frame backward than forward */ + if (frames_to_blur_forward > (max_frames - 1) / 2) { + frames_to_blur_forward = ((max_frames - 1) / 2); + } + + if (frames_to_blur_backward > (max_frames / 2)) { + frames_to_blur_backward = (max_frames / 2); + } + + frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; + break; + } + + start_frame = distance + frames_to_blur_forward; + + /* Setup frame pointers, NULL indicates frame not included in filter */ + memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); + for (frame = 0; frame < frames_to_blur; ++frame) { + int which_buffer = start_frame - frame; + struct lookahead_entry *buf = + vp8_lookahead_peek(cpi->lookahead, which_buffer, PEEK_FORWARD); + cpi->frames[frames_to_blur - 1 - frame] = &buf->img; + } + + vp8_temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, + strength); +} +#endif diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.h b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 0000000000..fd39f5cb87 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.c b/media/libvpx/libvpx/vp8/encoder/tokenize.c new file mode 100644 index 0000000000..c3d7026607 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include "onyx_int.h" +#include "tokenize.h" +#include "vpx_mem/vpx_mem.h" + +/* Global event counters used for accumulating statistics across several + compressions, then generating context.c = initial stats. */ + +void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +void vp8_fix_contexts(MACROBLOCKD *x); + +#include "dct_value_tokens.h" +#include "dct_value_cost.h" + +const TOKENVALUE *const vp8_dct_value_tokens_ptr = + dct_value_tokens + DCT_MAX_VALUE; +const short *const vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; + +#if 0 +int skip_true_count = 0; +int skip_false_count = 0; +#endif + +/* function used to generate dct_value_tokens and dct_value_cost tables */ +/* +static void fill_value_tokens() +{ + + TOKENVALUE *t = dct_value_tokens + DCT_MAX_VALUE; + const vp8_extra_bit_struct *e = vp8_extra_bits; + + int i = -DCT_MAX_VALUE; + int sign = 1; + + do + { + if (!i) + sign = 0; + + { + const int a = sign ? -i : i; + int eb = sign; + + if (a > 4) + { + int j = 4; + + while (++j < 11 && e[j].base_val <= a) {} + + t[i].Token = --j; + eb |= (a - e[j].base_val) << 1; + } + else + t[i].Token = a; + + t[i].Extra = eb; + } + + // initialize the cost for extra bits for all possible coefficient +value. + { + int cost = 0; + const vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token; + + if (p->base_val) + { + const int extra = t[i].Extra; + const int Length = p->Len; + + if (Length) + cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, +Length); + + cost += vp8_cost_bit(vp8_prob_half, extra & 1); // sign + dct_value_cost[i + DCT_MAX_VALUE] = cost; + } + + } + + } + while (++i < DCT_MAX_VALUE); + + vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE; + vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; +} +*/ + +static void tokenize2nd_order_b(MACROBLOCK *x, TOKENEXTRA **tp, VP8_COMP *cpi) { + MACROBLOCKD *xd = &x->e_mbd; + int pt; /* near block/prev token context index */ + int c; /* start at DC */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + const BLOCKD *b; + const short *qcoeff_ptr; + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int band, rc, v, token; + int eob; + + b = xd->block + 24; + qcoeff_ptr = b->qcoeff; + a = (ENTROPY_CONTEXT *)xd->above_context + 8; + l = (ENTROPY_CONTEXT *)xd->left_context + 8; + eob = xd->eobs[24]; + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + if (!eob) { + /* c = band for this case */ + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[1][0][pt]; + t->skip_eob_node = 0; + + ++x->coef_counts[1][0][pt][DCT_EOB_TOKEN]; + t++; + *tp = t; + *a = *l = 0; + return; + } + + v = qcoeff_ptr[0]; + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + t->Token = token; + + t->context_tree = cpi->common.fc.coef_probs[1][0][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[1][0][pt][token]; + pt = vp8_prev_token_class[token]; + t++; + c = 1; + + for (; c < eob; ++c) { + rc = vp8_default_zig_zag1d[c]; + band = vp8_coef_bands[c]; + v = qcoeff_ptr[rc]; + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + + t->Token = token; + t->context_tree = cpi->common.fc.coef_probs[1][band][pt]; + + t->skip_eob_node = ((pt == 0)); + + ++x->coef_counts[1][band][pt][token]; + + pt = vp8_prev_token_class[token]; + t++; + } + if (c < 16) { + band = vp8_coef_bands[c]; + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[1][band][pt]; + + t->skip_eob_node = 0; + + ++x->coef_counts[1][band][pt][DCT_EOB_TOKEN]; + + t++; + } + + *tp = t; + *a = *l = 1; +} + +static void tokenize1st_order_b( + MACROBLOCK *x, TOKENEXTRA **tp, + int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */ + VP8_COMP *cpi) { + MACROBLOCKD *xd = &x->e_mbd; + unsigned int block; + const BLOCKD *b; + int pt; /* near block/prev token context index */ + int c; + int token; + TOKENEXTRA *t = *tp; /* store tokens starting here */ + const short *qcoeff_ptr; + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int band, rc, v; + int tmp1, tmp2; + + b = xd->block; + /* Luma */ + for (block = 0; block < 16; block++, b++) { + const int eob = *b->eob; + tmp1 = vp8_block2above[block]; + tmp2 = vp8_block2left[block]; + qcoeff_ptr = b->qcoeff; + a = (ENTROPY_CONTEXT *)xd->above_context + tmp1; + l = (ENTROPY_CONTEXT *)xd->left_context + tmp2; + + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + c = type ? 0 : 1; + + if (c >= eob) { + /* c = band for this case */ + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[type][c][pt]; + t->skip_eob_node = 0; + + ++x->coef_counts[type][c][pt][DCT_EOB_TOKEN]; + t++; + *tp = t; + *a = *l = 0; + continue; + } + + v = qcoeff_ptr[c]; + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + t->Token = token; + + t->context_tree = cpi->common.fc.coef_probs[type][c][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[type][c][pt][token]; + pt = vp8_prev_token_class[token]; + t++; + c++; + + assert(eob <= 16); + for (; c < eob; ++c) { + rc = vp8_default_zig_zag1d[c]; + band = vp8_coef_bands[c]; + v = qcoeff_ptr[rc]; + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + + t->Token = token; + t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; + + t->skip_eob_node = (pt == 0); + ++x->coef_counts[type][band][pt][token]; + + pt = vp8_prev_token_class[token]; + t++; + } + if (c < 16) { + band = vp8_coef_bands[c]; + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; + + t->skip_eob_node = 0; + ++x->coef_counts[type][band][pt][DCT_EOB_TOKEN]; + + t++; + } + *tp = t; + *a = *l = 1; + } + + /* Chroma */ + for (block = 16; block < 24; block++, b++) { + const int eob = *b->eob; + tmp1 = vp8_block2above[block]; + tmp2 = vp8_block2left[block]; + qcoeff_ptr = b->qcoeff; + a = (ENTROPY_CONTEXT *)xd->above_context + tmp1; + l = (ENTROPY_CONTEXT *)xd->left_context + tmp2; + + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + if (!eob) { + /* c = band for this case */ + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[2][0][pt]; + t->skip_eob_node = 0; + + ++x->coef_counts[2][0][pt][DCT_EOB_TOKEN]; + t++; + *tp = t; + *a = *l = 0; + continue; + } + + v = qcoeff_ptr[0]; + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + t->Token = token; + + t->context_tree = cpi->common.fc.coef_probs[2][0][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[2][0][pt][token]; + pt = vp8_prev_token_class[token]; + t++; + c = 1; + + assert(eob <= 16); + for (; c < eob; ++c) { + rc = vp8_default_zig_zag1d[c]; + band = vp8_coef_bands[c]; + v = qcoeff_ptr[rc]; + + t->Extra = vp8_dct_value_tokens_ptr[v].Extra; + token = vp8_dct_value_tokens_ptr[v].Token; + + t->Token = token; + t->context_tree = cpi->common.fc.coef_probs[2][band][pt]; + + t->skip_eob_node = (pt == 0); + + ++x->coef_counts[2][band][pt][token]; + + pt = vp8_prev_token_class[token]; + t++; + } + if (c < 16) { + band = vp8_coef_bands[c]; + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[2][band][pt]; + + t->skip_eob_node = 0; + + ++x->coef_counts[2][band][pt][DCT_EOB_TOKEN]; + + t++; + } + *tp = t; + *a = *l = 1; + } +} + +static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block) { + int skip = 1; + int i = 0; + + if (has_y2_block) { + for (i = 0; i < 16; ++i) skip &= (x->eobs[i] < 2); + } + + for (; i < 24 + has_y2_block; ++i) skip &= (!x->eobs[i]); + + return skip; +} + +void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) { + MACROBLOCKD *xd = &x->e_mbd; + int plane_type; + int has_y2_block; + + has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV); + + xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(xd, has_y2_block); + if (xd->mode_info_context->mbmi.mb_skip_coeff) { + if (!cpi->common.mb_no_coeff_skip) { + vp8_stuff_mb(cpi, x, t); + } else { + vp8_fix_contexts(xd); + x->skip_true_count++; + } + + return; + } + + plane_type = 3; + if (has_y2_block) { + tokenize2nd_order_b(x, t, cpi); + plane_type = 0; + } + + tokenize1st_order_b(x, t, plane_type, cpi); +} + +static void stuff2nd_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, VP8_COMP *cpi, MACROBLOCK *x) { + int pt; /* near block/prev token context index */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[1][0][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[1][0][pt][DCT_EOB_TOKEN]; + ++t; + + *tp = t; + pt = 0; + *a = *l = pt; +} + +static void stuff1st_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, int type, VP8_COMP *cpi, + MACROBLOCK *x) { + int pt; /* near block/prev token context index */ + int band; + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + band = type ? 0 : 1; + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[type][band][pt][DCT_EOB_TOKEN]; + ++t; + *tp = t; + pt = 0; /* 0 <-> all coeff data is zero */ + *a = *l = pt; +} + +static void stuff1st_order_buv(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, VP8_COMP *cpi, + MACROBLOCK *x) { + int pt; /* near block/prev token context index */ + TOKENEXTRA *t = *tp; /* store tokens starting here */ + VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); + + t->Token = DCT_EOB_TOKEN; + t->context_tree = cpi->common.fc.coef_probs[2][0][pt]; + t->skip_eob_node = 0; + ++x->coef_counts[2][0][pt][DCT_EOB_TOKEN]; + ++t; + *tp = t; + pt = 0; /* 0 <-> all coeff data is zero */ + *a = *l = pt; +} + +void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) { + MACROBLOCKD *xd = &x->e_mbd; + ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context; + ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context; + int plane_type; + int b; + plane_type = 3; + if ((xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV)) { + stuff2nd_order_b(t, A + vp8_block2above[24], L + vp8_block2left[24], cpi, + x); + plane_type = 0; + } + + for (b = 0; b < 16; ++b) { + stuff1st_order_b(t, A + vp8_block2above[b], L + vp8_block2left[b], + plane_type, cpi, x); + } + + for (b = 16; b < 24; ++b) { + stuff1st_order_buv(t, A + vp8_block2above[b], L + vp8_block2left[b], cpi, + x); + } +} +void vp8_fix_contexts(MACROBLOCKD *x) { + /* Clear entropy contexts for Y2 blocks */ + if (x->mode_info_context->mbmi.mode != B_PRED && + x->mode_info_context->mbmi.mode != SPLITMV) { + memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + } else { + memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1); + memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1); + } +} diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h new file mode 100644 index 0000000000..47b5be17f1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_TOKENIZE_H_ +#define VPX_VP8_ENCODER_TOKENIZE_H_ + +#include "vp8/common/entropy.h" +#include "block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_tokenize_initialize(); + +typedef struct { + short Token; + short Extra; +} TOKENVALUE; + +typedef struct { + const vp8_prob *context_tree; + short Extra; + unsigned char Token; + unsigned char skip_eob_node; +} TOKENEXTRA; + +int rd_cost_mby(MACROBLOCKD *); + +extern const short *const vp8_dct_value_cost_ptr; +/* TODO: The Token field should be broken out into a separate char array to + * improve cache locality, since it's needed for costing when the rest of the + * fields are not. + */ +extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_TOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/treewriter.c b/media/libvpx/libvpx/vp8/encoder/treewriter.c new file mode 100644 index 0000000000..f055f05229 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/treewriter.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "treewriter.h" + +static void cost(int *const C, vp8_tree T, const vp8_prob *const P, int i, + int c) { + const vp8_prob p = P[i >> 1]; + + do { + const vp8_tree_index j = T[i]; + const int d = c + vp8_cost_bit(p, i & 1); + + if (j <= 0) { + C[-j] = d; + } else { + cost(C, T, P, j, d); + } + } while (++i & 1); +} +void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t) { + cost(c, t, p, 0, 0); +} +void vp8_cost_tokens2(int *c, const vp8_prob *p, vp8_tree t, int start) { + cost(c, t, p, start, 0); +} diff --git a/media/libvpx/libvpx/vp8/encoder/treewriter.h b/media/libvpx/libvpx/vp8/encoder/treewriter.h new file mode 100644 index 0000000000..4e9ed6af17 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/treewriter.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_TREEWRITER_H_ +#define VPX_VP8_ENCODER_TREEWRITER_H_ + +/* Trees map alphabets into huffman-like codes suitable for an arithmetic + bit coder. Timothy S Murphy 11 October 2004 */ + +#include + +#include "./vpx_config.h" +#include "vp8/common/treecoder.h" + +#include "boolhuff.h" /* for now */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef BOOL_CODER vp8_writer; + +#define vp8_write vp8_encode_bool +#define vp8_write_literal vp8_encode_value +#define vp8_write_bit(W, V) vp8_write(W, V, vp8_prob_half) + +#define vp8bc_write vp8bc_write_bool +#define vp8bc_write_literal vp8bc_write_bits +#define vp8bc_write_bit(W, V) vp8bc_write_bits(W, V, 1) + +/* Approximate length of an encoded bool in 256ths of a bit at given prob */ + +#define vp8_cost_zero(x) (vp8_prob_cost[x]) +#define vp8_cost_one(x) vp8_cost_zero(vp8_complement(x)) + +#define vp8_cost_bit(x, b) vp8_cost_zero((b) ? vp8_complement(x) : (x)) + +/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */ + +/* Both of these return bits, not scaled bits. */ + +static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], + vp8_prob p) { + /* Imitate existing calculation */ + + return (unsigned int)(((((uint64_t)ct[0]) * vp8_cost_zero(p)) + + (((uint64_t)ct[1]) * vp8_cost_one(p))) >> + 8); +} + +/* Small functions to write explicit values and tokens, as well as + estimate their lengths. */ + +static void vp8_treed_write(vp8_writer *const w, vp8_tree t, + const vp8_prob *const p, int v, + int n) { /* number of bits in v, assumed nonzero */ + vp8_tree_index i = 0; + + do { + const int b = (v >> --n) & 1; + vp8_write(w, b, p[i >> 1]); + i = t[i + b]; + } while (n); +} +static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, + const vp8_prob *const p, + vp8_token *const x) { + vp8_treed_write(w, t, p, x->value, x->Len); +} + +static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, + int n) { /* number of bits in v, assumed nonzero */ + int c = 0; + vp8_tree_index i = 0; + + do { + const int b = (v >> --n) & 1; + c += vp8_cost_bit(p[i >> 1], b); + i = t[i + b]; + } while (n); + + return c; +} +static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p, + vp8_token *const x) { + return vp8_treed_cost(t, p, x->value, x->Len); +} + +/* Fill array of costs for all possible token values. */ + +void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree); + +void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_ENCODER_TREEWRITER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c new file mode 100644 index 0000000000..8b9b22babe --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_ports/bitops.h" +#include "vpx_mem/vpx_mem.h" + +#include "onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) { + int i, rc, eob; + int x, y, z, sz; + short *coeff_ptr = b->coeff; + short *round_ptr = b->round; + short *quant_ptr = b->quant_fast; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = d->dequant; + + eob = -1; + for (i = 0; i < 16; ++i) { + rc = vp8_default_zig_zag1d[i]; + z = coeff_ptr[rc]; + + sz = (z >> 31); /* sign of z */ + x = (z ^ sz) - sz; /* x = abs(z) */ + + y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */ + x = (y ^ sz) - sz; /* get the sign back */ + qcoeff_ptr[rc] = x; /* write to destination */ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */ + + if (y) { + eob = i; /* last nonzero coeffs */ + } + } + *d->eob = (char)(eob + 1); +} + +void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d) { + int i, rc, eob; + int zbin; + int x, y, z, sz; + short *zbin_boost_ptr = b->zrun_zbin_boost; + short *coeff_ptr = b->coeff; + short *zbin_ptr = b->zbin; + short *round_ptr = b->round; + short *quant_ptr = b->quant; + short *quant_shift_ptr = b->quant_shift; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = d->dequant; + short zbin_oq_value = b->zbin_extra; + + memset(qcoeff_ptr, 0, 32); + memset(dqcoeff_ptr, 0, 32); + + eob = -1; + + for (i = 0; i < 16; ++i) { + rc = vp8_default_zig_zag1d[i]; + z = coeff_ptr[rc]; + + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + + zbin_boost_ptr++; + sz = (z >> 31); /* sign of z */ + x = (z ^ sz) - sz; /* x = abs(z) */ + + if (x >= zbin) { + x += round_ptr[rc]; + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> + 16; /* quantize (x) */ + x = (y ^ sz) - sz; /* get the sign back */ + qcoeff_ptr[rc] = x; /* write to destination */ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */ + + if (y) { + eob = i; /* last nonzero coeffs */ + zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */ + } + } + } + + *d->eob = (char)(eob + 1); +} + +void vp8_quantize_mby(MACROBLOCK *x) { + int i; + int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 16; ++i) x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + + if (has_2nd_order) x->quantize_b(&x->block[24], &x->e_mbd.block[24]); +} + +void vp8_quantize_mb(MACROBLOCK *x) { + int i; + int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); + + for (i = 0; i < 24 + has_2nd_order; ++i) { + x->quantize_b(&x->block[i], &x->e_mbd.block[i]); + } +} + +void vp8_quantize_mbuv(MACROBLOCK *x) { + int i; + + for (i = 16; i < 24; ++i) x->quantize_b(&x->block[i], &x->e_mbd.block[i]); +} + +static const int qrounding_factors[129] = { + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 +}; + +static const int qzbin_factors[129] = { + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 +}; + +static const int qrounding_factors_y2[129] = { + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 +}; + +static const int qzbin_factors_y2[129] = { + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 +}; + +static void invert_quant(int improved_quant, short *quant, short *shift, + short d) { + if (improved_quant) { + unsigned int t; + int l, m; + t = (unsigned int)d; + l = get_msb(t); + m = 1 + (1 << (16 + l)) / d; + *quant = (short)(m - (1 << 16)); + *shift = l; + /* use multiplication and constant shift by 16 */ + *shift = 1 << (16 - *shift); + } else { + *quant = (1 << 16) / d; + *shift = 0; + } +} + +void vp8cx_init_quantizer(VP8_COMP *cpi) { + int i; + int quant_val; + int Q; + + int zbin_boost[16] = { 0, 0, 8, 10, 12, 14, 16, 20, + 24, 28, 32, 36, 40, 44, 44, 44 }; + + for (Q = 0; Q < QINDEX_RANGE; ++Q) { + /* dc values */ + quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q); + cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0, + cpi->Y1quant_shift[Q] + 0, quant_val); + cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][0] = quant_val; + cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q); + cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0, + cpi->Y2quant_shift[Q] + 0, quant_val); + cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][0] = quant_val; + cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q); + cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0, + cpi->UVquant_shift[Q] + 0, quant_val); + cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][0] = quant_val; + cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; + + /* all the ac values = ; */ + quant_val = vp8_ac_yquant(Q); + cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1, + cpi->Y1quant_shift[Q] + 1, quant_val); + cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.Y1dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); + cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1, + cpi->Y2quant_shift[Q] + 1, quant_val); + cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; + cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7; + cpi->common.Y2dequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); + cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val; + invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1, + cpi->UVquant_shift[Q] + 1, quant_val); + cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; + cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7; + cpi->common.UVdequant[Q][1] = quant_val; + cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7; + + for (i = 2; i < 16; ++i) { + cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1]; + cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1]; + cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1]; + cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1]; + cpi->Y1round[Q][i] = cpi->Y1round[Q][1]; + cpi->zrun_zbin_boost_y1[Q][i] = + (cpi->common.Y1dequant[Q][1] * zbin_boost[i]) >> 7; + + cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1]; + cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1]; + cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1]; + cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1]; + cpi->Y2round[Q][i] = cpi->Y2round[Q][1]; + cpi->zrun_zbin_boost_y2[Q][i] = + (cpi->common.Y2dequant[Q][1] * zbin_boost[i]) >> 7; + + cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1]; + cpi->UVquant[Q][i] = cpi->UVquant[Q][1]; + cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1]; + cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1]; + cpi->UVround[Q][i] = cpi->UVround[Q][1]; + cpi->zrun_zbin_boost_uv[Q][i] = + (cpi->common.UVdequant[Q][1] * zbin_boost[i]) >> 7; + } + } +} + +#define ZBIN_EXTRA_Y \ + ((cpi->common.Y1dequant[QIndex][1] * \ + (x->zbin_over_quant + x->zbin_mode_boost + x->act_zbin_adj)) >> \ + 7) + +#define ZBIN_EXTRA_UV \ + ((cpi->common.UVdequant[QIndex][1] * \ + (x->zbin_over_quant + x->zbin_mode_boost + x->act_zbin_adj)) >> \ + 7) + +#define ZBIN_EXTRA_Y2 \ + ((cpi->common.Y2dequant[QIndex][1] * \ + ((x->zbin_over_quant / 2) + x->zbin_mode_boost + x->act_zbin_adj)) >> \ + 7) + +void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { + int i; + int QIndex; + MACROBLOCKD *xd = &x->e_mbd; + int zbin_extra; + + /* Select the baseline MB Q index. */ + if (xd->segmentation_enabled) { + /* Abs Value */ + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] + [xd->mode_info_context->mbmi.segment_id]; + /* Delta Value */ + } else { + QIndex = cpi->common.base_qindex + + xd->segment_feature_data[MB_LVL_ALT_Q] + [xd->mode_info_context->mbmi.segment_id]; + /* Clamp to valid range */ + QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; + } + } else { + QIndex = cpi->common.base_qindex; + } + + /* This initialization should be called at least once. Use ok_to_skip to + * decide if it is ok to skip. + * Before encoding a frame, this function is always called with ok_to_skip + * =0, which means no skiping of calculations. The "last" values are + * initialized at that time. + */ + if (!ok_to_skip || QIndex != x->q_index) { + xd->dequant_y1_dc[0] = 1; + xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0]; + xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0]; + xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0]; + + for (i = 1; i < 16; ++i) { + xd->dequant_y1_dc[i] = xd->dequant_y1[i] = + cpi->common.Y1dequant[QIndex][1]; + xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1]; + xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1]; + } +#if 1 + /*TODO: Remove dequant from BLOCKD. This is a temporary solution until + * the quantizer code uses a passed in pointer to the dequant constants. + * This will also require modifications to the x86 and neon assembly. + * */ + for (i = 0; i < 16; ++i) x->e_mbd.block[i].dequant = xd->dequant_y1; + for (i = 16; i < 24; ++i) x->e_mbd.block[i].dequant = xd->dequant_uv; + x->e_mbd.block[24].dequant = xd->dequant_y2; +#endif + + /* Y */ + zbin_extra = ZBIN_EXTRA_Y; + + for (i = 0; i < 16; ++i) { + x->block[i].quant = cpi->Y1quant[QIndex]; + x->block[i].quant_fast = cpi->Y1quant_fast[QIndex]; + x->block[i].quant_shift = cpi->Y1quant_shift[QIndex]; + x->block[i].zbin = cpi->Y1zbin[QIndex]; + x->block[i].round = cpi->Y1round[QIndex]; + x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex]; + x->block[i].zbin_extra = (short)zbin_extra; + } + + /* UV */ + zbin_extra = ZBIN_EXTRA_UV; + + for (i = 16; i < 24; ++i) { + x->block[i].quant = cpi->UVquant[QIndex]; + x->block[i].quant_fast = cpi->UVquant_fast[QIndex]; + x->block[i].quant_shift = cpi->UVquant_shift[QIndex]; + x->block[i].zbin = cpi->UVzbin[QIndex]; + x->block[i].round = cpi->UVround[QIndex]; + x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex]; + x->block[i].zbin_extra = (short)zbin_extra; + } + + /* Y2 */ + zbin_extra = ZBIN_EXTRA_Y2; + + x->block[24].quant_fast = cpi->Y2quant_fast[QIndex]; + x->block[24].quant = cpi->Y2quant[QIndex]; + x->block[24].quant_shift = cpi->Y2quant_shift[QIndex]; + x->block[24].zbin = cpi->Y2zbin[QIndex]; + x->block[24].round = cpi->Y2round[QIndex]; + x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex]; + x->block[24].zbin_extra = (short)zbin_extra; + + /* save this macroblock QIndex for vp8_update_zbin_extra() */ + x->q_index = QIndex; + + x->last_zbin_over_quant = x->zbin_over_quant; + x->last_zbin_mode_boost = x->zbin_mode_boost; + x->last_act_zbin_adj = x->act_zbin_adj; + + } else if (x->last_zbin_over_quant != x->zbin_over_quant || + x->last_zbin_mode_boost != x->zbin_mode_boost || + x->last_act_zbin_adj != x->act_zbin_adj) { + /* Y */ + zbin_extra = ZBIN_EXTRA_Y; + + for (i = 0; i < 16; ++i) x->block[i].zbin_extra = (short)zbin_extra; + + /* UV */ + zbin_extra = ZBIN_EXTRA_UV; + + for (i = 16; i < 24; ++i) x->block[i].zbin_extra = (short)zbin_extra; + + /* Y2 */ + zbin_extra = ZBIN_EXTRA_Y2; + x->block[24].zbin_extra = (short)zbin_extra; + + x->last_zbin_over_quant = x->zbin_over_quant; + x->last_zbin_mode_boost = x->zbin_mode_boost; + x->last_act_zbin_adj = x->act_zbin_adj; + } +} + +void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x) { + int i; + int QIndex = x->q_index; + int zbin_extra; + + /* Y */ + zbin_extra = ZBIN_EXTRA_Y; + + for (i = 0; i < 16; ++i) x->block[i].zbin_extra = (short)zbin_extra; + + /* UV */ + zbin_extra = ZBIN_EXTRA_UV; + + for (i = 16; i < 24; ++i) x->block[i].zbin_extra = (short)zbin_extra; + + /* Y2 */ + zbin_extra = ZBIN_EXTRA_Y2; + x->block[24].zbin_extra = (short)zbin_extra; +} +#undef ZBIN_EXTRA_Y +#undef ZBIN_EXTRA_UV +#undef ZBIN_EXTRA_Y2 + +void vp8cx_frame_init_quantizer(VP8_COMP *cpi) { + /* Clear Zbin mode boost for default case */ + cpi->mb.zbin_mode_boost = 0; + + /* MB level quantizer setup */ + vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0); +} + +void vp8_set_quantizer(struct VP8_COMP *cpi, int Q) { + VP8_COMMON *cm = &cpi->common; + MACROBLOCKD *mbd = &cpi->mb.e_mbd; + int update = 0; + int new_delta_q; + int new_uv_delta_q; + cm->base_qindex = Q; + + /* if any of the delta_q values are changing update flag has to be set */ + /* currently only y2dc_delta_q may change */ + + cm->y1dc_delta_q = 0; + cm->y2ac_delta_q = 0; + + if (Q < 4) { + new_delta_q = 4 - Q; + } else { + new_delta_q = 0; + } + + update |= cm->y2dc_delta_q != new_delta_q; + cm->y2dc_delta_q = new_delta_q; + + new_uv_delta_q = 0; + // For screen content, lower the q value for UV channel. For now, select + // conservative delta; same delta for dc and ac, and decrease it with lower + // Q, and set to 0 below some threshold. May want to condition this in + // future on the variance/energy in UV channel. + if (cpi->oxcf.screen_content_mode && Q > 40) { + new_uv_delta_q = -(int)(0.15 * Q); + // Check range: magnitude of delta is 4 bits. + if (new_uv_delta_q < -15) { + new_uv_delta_q = -15; + } + } + update |= cm->uvdc_delta_q != new_uv_delta_q; + cm->uvdc_delta_q = new_uv_delta_q; + cm->uvac_delta_q = new_uv_delta_q; + + /* Set Segment specific quatizers */ + mbd->segment_feature_data[MB_LVL_ALT_Q][0] = + cpi->segment_feature_data[MB_LVL_ALT_Q][0]; + mbd->segment_feature_data[MB_LVL_ALT_Q][1] = + cpi->segment_feature_data[MB_LVL_ALT_Q][1]; + mbd->segment_feature_data[MB_LVL_ALT_Q][2] = + cpi->segment_feature_data[MB_LVL_ALT_Q][2]; + mbd->segment_feature_data[MB_LVL_ALT_Q][3] = + cpi->segment_feature_data[MB_LVL_ALT_Q][3]; + + /* quantizer has to be reinitialized for any delta_q changes */ + if (update) vp8cx_init_quantizer(cpi); +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm new file mode 100644 index 0000000000..200b4ccfe6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) +globalsym(vp8_block_error_sse2) +sym(vp8_block_error_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prologue + + mov rsi, arg(0) ;coeff_ptr + mov rdi, arg(1) ;dcoef_ptr + + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + psubw xmm0, xmm1 + psubw xmm2, xmm3 + + pmaddwd xmm0, xmm0 + pmaddwd xmm2, xmm2 + + paddd xmm0, xmm2 + + pxor xmm5, xmm5 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + psrldq xmm0, 8 + paddd xmm0, xmm1 + + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +globalsym(vp8_mbblock_error_sse2_impl) +sym(vp8_mbblock_error_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 6 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm6, xmm6 + + mov rdi, arg(1) ;dcoef_ptr + pxor xmm4, xmm4 + + movd xmm5, dword ptr arg(2) ;dc + por xmm5, xmm4 + + pcmpeqw xmm5, xmm6 + mov rcx, 16 + +.mberror_loop: + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + + psubw xmm2, xmm3 + pmaddwd xmm2, xmm2 + + psubw xmm0, xmm1 + pand xmm0, xmm5 + + pmaddwd xmm0, xmm0 + add rsi, 32 + + add rdi, 32 + + sub rcx, 1 + paddd xmm4, xmm2 + + paddd xmm4, xmm0 + jnz .mberror_loop + + movdqa xmm0, xmm4 + punpckldq xmm0, xmm6 + + punpckhdq xmm4, xmm6 + paddd xmm0, xmm4 + + movdqa xmm1, xmm0 + psrldq xmm0, 8 + + paddd xmm0, xmm1 + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +globalsym(vp8_mbuverror_sse2_impl) +sym(vp8_mbuverror_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor xmm3, xmm3 + +.mbuverror_loop: + + movdqa xmm1, [rsi] + movdqa xmm2, [rdi] + + psubw xmm1, xmm2 + pmaddwd xmm1, xmm1 + + paddd xmm3, xmm1 + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz .mbuverror_loop + + pxor xmm0, xmm0 + movdqa xmm1, xmm3 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + paddd xmm1, xmm2 + + movdqa xmm2, xmm1 + + psrldq xmm1, 8 + paddd xmm1, xmm2 + + movq rax, xmm1 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm new file mode 100644 index 0000000000..fe78da398e --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm @@ -0,0 +1,94 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +globalsym(vp8_copy32xn_sse2) +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm new file mode 100644 index 0000000000..c40b2d8bf6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define max_sad arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_sad [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define max_sad r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define max_sad + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +SECTION .text + +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +globalsym(vp8_copy32xn_sse3) +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +.block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge .block_copy_sse3_loopx4 + + ;Check to see if there is more rows need to be copied. + cmp height, 0 + je .copy_is_done + +.block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne .block_copy_sse3_loop + +.copy_is_done: + STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm new file mode 100644 index 0000000000..3c28cb902e --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -0,0 +1,434 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE 0 +%if ABI_IS_32BIT + %define input rsi + %define output rdi + %define pitch rax + push rbp + mov rbp, rsp + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + + movsxd rax, dword ptr arg(2) + lea rcx, [rsi + rax*2] +%else + %if LIBVPX_YASM_WIN64 + %define input rcx + %define output rdx + %define pitch r8 + SAVE_XMM 7, u + %else + %define input rdi + %define output rsi + %define pitch rdx + %endif +%endif +%endmacro + +%macro STACK_FRAME_DESTROY 0 + %define input + %define output + %define pitch + +%if ABI_IS_32BIT + pop rdi + pop rsi + RESTORE_GOT + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +SECTION .text + +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_fdct4x4_sse2) +sym(vp8_short_fdct4x4_sse2): + + STACK_FRAME_CREATE + + movq xmm0, MMWORD PTR[input ] ;03 02 01 00 + movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 + lea input, [input+2*pitch] + movq xmm1, MMWORD PTR[input ] ;23 22 21 20 + movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 + + punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 + punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 + punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00 + pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx + pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx + + punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03 + movdqa xmm3, xmm0 + paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1 + psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 + psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 + psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352 + + paddd xmm3, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_7500)] + psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm0, xmm1 ;op[2] op[0] + packssdw xmm3, xmm4 ;op[3] op[1] + ; 23 22 21 20 03 02 01 00 + ; + ; 33 32 31 30 13 12 11 10 + ; + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30 + + movdqa xmm3, xmm0 + punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00 + punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01 + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] + pshufd xmm2, xmm2, 04eh + movdqa xmm3, xmm0 + paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1 + psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1 + + pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1 + movdqa xmm2, xmm3 ;save d1 for compare + pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1 + pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1 + pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1 + pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1 + pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1 + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + + pxor xmm4, xmm4 ;zero out for compare + paddd xmm0, xmm5 + paddd xmm1, xmm5 + pcmpeqw xmm2, xmm4 + psrad xmm0, 4 ;(a1 + b1 + 7)>>4 + psrad xmm1, 4 ;(a1 - b1 + 7)>>4 + pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper, + ;and keep bit 0 of lower + + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352 + paddd xmm3, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_51000)] + packssdw xmm0, xmm1 ;op[8] op[0] + psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16 + + packssdw xmm3, xmm4 ;op[12] op[4] + movdqa xmm1, xmm0 + paddw xmm3, xmm2 ;op[4] += (d1!=0) + punpcklqdq xmm0, xmm3 ;op[4] op[0] + punpckhqdq xmm1, xmm3 ;op[12] op[8] + + movdqa XMMWORD PTR[output + 0], xmm0 + movdqa XMMWORD PTR[output + 16], xmm1 + + STACK_FRAME_DESTROY + +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_fdct8x4_sse2) +sym(vp8_short_fdct8x4_sse2): + + STACK_FRAME_CREATE + + ; read the input data + movdqa xmm0, [input ] + movdqa xmm2, [input+ pitch] + lea input, [input+2*pitch] + movdqa xmm4, [input ] + movdqa xmm3, [input+ pitch] + + ; transpose for the first stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm2 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + psllw xmm5, 3 + psllw xmm4, 3 + + psllw xmm0, 3 + psllw xmm1, 3 + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; op[0] = a1 + b1 + psubw xmm2, xmm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] + paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] + + psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm1, xmm4 ; op[1] + packssdw xmm3, xmm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 + + movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 + punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 + + movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 + + ; xmm0 0 + ; xmm1 4 + ; xmm2 1 + ; xmm3 3 + + movdqa xmm5, xmm0 + movdqa xmm2, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm4 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + pxor xmm6, xmm6 ; zero out for compare + + pcmpeqw xmm6, xmm5 ; d1 != 0 + + pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; a1 + b1 + psubw xmm2, xmm1 ; a1 - b1 + + paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] + paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] + + psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] + paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] + + psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + + packssdw xmm1, xmm4 ; op[4] + packssdw xmm3, xmm5 ; op[12] + + paddw xmm1, xmm6 ; op[4] += (d1!=0) + + movdqa xmm4, xmm0 + movdqa xmm5, xmm2 + + punpcklqdq xmm0, xmm1 + punpckhqdq xmm4, xmm1 + + punpcklqdq xmm2, xmm3 + punpckhqdq xmm5, xmm3 + + movdqa XMMWORD PTR[output + 0 ], xmm0 + movdqa XMMWORD PTR[output + 16], xmm2 + movdqa XMMWORD PTR[output + 32], xmm4 + movdqa XMMWORD PTR[output + 48], xmm5 + + STACK_FRAME_DESTROY + +SECTION_RODATA +align 16 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 16 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 16 +_mult_add: + times 8 dw 1 +align 16 +_cmp_mask: + times 4 dw 1 + times 4 dw 0 +align 16 +_cmp_mask8x4: + times 8 dw 1 +align 16 +_mult_sub: + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 +align 16 +_7: + times 4 dd 7 +align 16 +_7w: + times 8 dw 7 +align 16 +_14500: + times 4 dd 14500 +align 16 +_7500: + times 4 dd 7500 +align 16 +_12000: + times 4 dd 12000 +align 16 +_51000: + times 4 dd 51000 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c new file mode 100644 index 0000000000..f35b930169 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/encoder/denoising.h" +#include "vp8/common/reconinter.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8_rtcd.h" + +#include +#include "vpx_ports/emmintrin_compat.h" + +/* Compute the sum of all pixel differences of this MB. */ +static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); + + return sum_diff; +} + +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, + int mc_avg_y_stride, unsigned char *running_avg_y, + int avg_y_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_y_start = running_avg_y; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + for (r = 0; r < 16; ++r) { + /* Calculate differences */ + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + + { + /* Compute the sum of all pixel differences of this MB. */ + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_avg_y_stride * 16; + running_avg_y -= avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + __m128i v_running_avg_y = + _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, + int mc_avg_stride, unsigned char *running_avg, + int avg_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_start = running_avg; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) + ? 1 + : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + { + const __m128i k_1 = _mm_set1_epi16(1); + __m128i vec_sum_block = _mm_setzero_si128(); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); + const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); + vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + const int sum_block = _mm_cvtsi128_si32(hgfedcba); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + for (r = 0; r < 4; ++r) { + /* Calculate differences */ + const __m128i v_sig_low = + _mm_castpd_si128(_mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd( + _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + __m128i v_running_avg; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg = _mm_adds_epu8(v_sig, padj); + v_running_avg = _mm_subs_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + + { + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; + for (r = 0; r < 4; ++r) { + // Calculate differences. + const __m128i v_sig_low = + _mm_castpd_si128(_mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd( + _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + const __m128i v_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&running_avg[0]))); + __m128i v_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low), + (double *)(&running_avg[avg_stride]))); + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg = _mm_subs_epu8(v_running_avg, padj); + v_running_avg = _mm_adds_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm new file mode 100644 index 0000000000..938fc173ff --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -0,0 +1,166 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_walsh4x4_sse2) +sym(vp8_short_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output + movsxd rdx, dword ptr arg(2) ; pitch + + ; first for loop + movq xmm0, MMWORD PTR [rsi] ; load input + movq xmm1, MMWORD PTR [rsi + rdx] + lea rsi, [rsi + rdx*2] + movq xmm2, MMWORD PTR [rsi] + movq xmm3, MMWORD PTR [rsi + rdx] + + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ; ip[1] ip[0] + punpckhdq xmm1, xmm2 ; ip[3] ip[2] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + psllw xmm0, 2 ; d1 a1 + psllw xmm2, 2 ; c1 b1 + + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 ; b1 a1 + punpckhqdq xmm1, xmm2 ; c1 d1 + + pxor xmm6, xmm6 + movq xmm6, xmm0 + pxor xmm7, xmm7 + pcmpeqw xmm7, xmm6 + paddw xmm7, [GLOBAL(c1)] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 ; b1+c1 a1+d1 + psubw xmm2, xmm1 ; b1-c1 a1-d1 + paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) + + ; second for loop + ; input: 13 9 5 1 12 8 4 0 (xmm0) + ; 14 10 6 2 15 11 7 3 (xmm2) + ; after shuffle: + ; 13 5 9 1 12 4 8 0 (xmm0) + ; 14 6 10 2 15 7 11 3 (xmm1) + pshuflw xmm3, xmm0, 0xd8 + pshufhw xmm0, xmm3, 0xd8 + pshuflw xmm3, xmm2, 0xd8 + pshufhw xmm1, xmm3, 0xd8 + + movdqa xmm2, xmm0 + pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 + pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 + movdqa xmm3, xmm1 + pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 + pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 + + pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 + pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 + pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 + pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 + + movdqa xmm0, xmm4 + punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 + punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 + movdqa xmm1, xmm6 + punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 + punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 + + movdqa xmm2, xmm0 + paddd xmm0, xmm4 ; b21 b20 a21 a20 + psubd xmm2, xmm4 ; c21 c20 d21 d20 + movdqa xmm3, xmm1 + paddd xmm1, xmm6 ; b23 b22 a23 a22 + psubd xmm3, xmm6 ; c23 c22 d23 d22 + + pxor xmm4, xmm4 + movdqa xmm5, xmm4 + pcmpgtd xmm4, xmm0 + pcmpgtd xmm5, xmm2 + pand xmm4, [GLOBAL(cd1)] + pand xmm5, [GLOBAL(cd1)] + + pxor xmm6, xmm6 + movdqa xmm7, xmm6 + pcmpgtd xmm6, xmm1 + pcmpgtd xmm7, xmm3 + pand xmm6, [GLOBAL(cd1)] + pand xmm7, [GLOBAL(cd1)] + + paddd xmm0, xmm4 + paddd xmm2, xmm5 + paddd xmm0, [GLOBAL(cd3)] + paddd xmm2, [GLOBAL(cd3)] + paddd xmm1, xmm6 + paddd xmm3, xmm7 + paddd xmm1, [GLOBAL(cd3)] + paddd xmm3, [GLOBAL(cd3)] + + psrad xmm0, 3 + psrad xmm1, 3 + psrad xmm2, 3 + psrad xmm3, 3 + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 + punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 + movdqa xmm5, xmm2 + punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 + punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 + + packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 + packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm2 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +c1: + dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 +align 16 +cn1: + dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff +align 16 +cd1: + dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 +align 16 +cd3: + dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c new file mode 100644 index 0000000000..4c2d24cc27 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include /* SSE4.1 */ + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" +#include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" + +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. +VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { + int eob = -1; + short *zbin_boost_ptr = b->zrun_zbin_boost; + __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); + __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); + __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; + uint32_t mask, ymask; + DECLARE_ALIGNED(16, static const uint8_t, + zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); + + { + const __m128i zig_zag_i16_0 = + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); + const __m128i zig_zag_i16_1 = + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); + + /* The first part of the zig zag needs a value + * from x_minus_zbin1 and vice versa. */ + t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); + t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); + t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); + x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); + x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); + } + + /* Check if y is nonzero and put it in zig zag order. */ + t0 = _mm_packs_epi16(y0, y1); + t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); + t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); + ymask = _mm_movemask_epi8(t0) ^ 0xffff; + + for (;;) { + t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); + t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); + t0 = _mm_packs_epi16(t0, t1); + mask = _mm_movemask_epi8(t0); + mask = ~mask & ymask; + if (!mask) break; + /* |eob| will contain the index of the next found element where: + * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ + eob = get_lsb(mask); + /* Need to clear the mask from processed elements so that + * they are no longer counted in the next iteration. */ + ymask &= ~1U << eob; + /* It's safe to read ahead of this buffer if struct VP8_COMP has at + * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). + * Any data read outside of the buffer is masked by the updated |ymask|. */ + zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); + zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); + qcoeff[zig_zag_mask[eob]] = 0xffff; + } + + qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); + qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); + qcoeff0 = _mm_and_si128(qcoeff0, y0); + qcoeff1 = _mm_and_si128(qcoeff1, y1); + + _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); + + dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0); + dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); + + *d->eob = eob + 1; +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm new file mode 100644 index 0000000000..67102064a1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -0,0 +1,209 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +; void vp8_temporal_filter_apply_sse2 | arg +; (unsigned char *frame1, | 0 +; unsigned int stride, | 1 +; unsigned char *frame2, | 2 +; unsigned int block_size, | 3 +; int strength, | 4 +; int filter_weight, | 5 +; unsigned int *accumulator, | 6 +; unsigned short *count) | 7 +globalsym(vp8_temporal_filter_apply_sse2) +sym(vp8_temporal_filter_apply_sse2): + + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ALIGN_STACK 16, rax + %define block_size 0 + %define strength 16 + %define filter_weight 32 + %define rounding_bit 48 + %define rbp_backup 64 + %define stack_size 80 + sub rsp, stack_size + mov [rsp + rbp_backup], rbp + ; end prolog + + mov rdx, arg(3) + mov [rsp + block_size], rdx + movd xmm6, arg(4) + movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read + + ; calculate the rounding bit outside the loop + ; 0x8000 >> (16 - strength) + mov rdx, 16 + sub rdx, arg(4) ; 16 - strength + movq xmm4, rdx ; can't use rdx w/ shift + movdqa xmm5, [GLOBAL(_const_top_bit)] + psrlw xmm5, xmm4 + movdqa [rsp + rounding_bit], xmm5 + + mov rsi, arg(0) ; src/frame1 + mov rdx, arg(2) ; predictor frame + mov rdi, arg(6) ; accumulator + mov rax, arg(7) ; count + + ; dup the filter weight and store for later + movd xmm0, arg(5) ; filter_weight + pshuflw xmm0, xmm0, 0 + punpcklwd xmm0, xmm0 + movdqa [rsp + filter_weight], xmm0 + + mov rbp, arg(1) ; stride + pxor xmm7, xmm7 ; zero for extraction + + lea rcx, [rdx + 16*16*1] + cmp dword ptr [rsp + block_size], 8 + jne .temporal_filter_apply_load_16 + lea rcx, [rdx + 8*8*1] + +.temporal_filter_apply_load_8: + movq xmm0, [rsi] ; first row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm0, xmm7 ; src[ 0- 7] + movq xmm1, [rsi] ; second row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm1, xmm7 ; src[ 8-15] + jmp .temporal_filter_apply_load_finished + +.temporal_filter_apply_load_16: + movdqa xmm0, [rsi] ; src (frame1) + lea rsi, [rsi + rbp] ; += stride + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; src[ 0- 7] + punpckhbw xmm1, xmm7 ; src[ 8-15] + +.temporal_filter_apply_load_finished: + movdqa xmm2, [rdx] ; predictor (frame2) + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm7 ; pred[ 0- 7] + punpckhbw xmm3, xmm7 ; pred[ 8-15] + + ; modifier = src_byte - pixel_value + psubw xmm0, xmm2 ; src - pred[ 0- 7] + psubw xmm1, xmm3 ; src - pred[ 8-15] + + ; modifier *= modifier + pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 + pmullw xmm1, xmm1 ; modifer[ 8-15]^2 + + ; modifier *= 3 + pmullw xmm0, [GLOBAL(_const_3w)] + pmullw xmm1, [GLOBAL(_const_3w)] + + ; modifer += 0x8000 >> (16 - strength) + paddw xmm0, [rsp + rounding_bit] + paddw xmm1, [rsp + rounding_bit] + + ; modifier >>= strength + psrlw xmm0, [rsp + strength] + psrlw xmm1, [rsp + strength] + + ; modifier = 16 - modifier + ; saturation takes care of modifier > 16 + movdqa xmm3, [GLOBAL(_const_16w)] + movdqa xmm2, [GLOBAL(_const_16w)] + psubusw xmm3, xmm1 + psubusw xmm2, xmm0 + + ; modifier *= filter_weight + pmullw xmm2, [rsp + filter_weight] + pmullw xmm3, [rsp + filter_weight] + + ; count + movdqa xmm4, [rax] + movdqa xmm5, [rax+16] + ; += modifier + paddw xmm4, xmm2 + paddw xmm5, xmm3 + ; write back + movdqa [rax], xmm4 + movdqa [rax+16], xmm5 + lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) + + ; load and extract the predictor up to shorts + pxor xmm7, xmm7 + movdqa xmm0, [rdx] + lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; pred[ 0- 7] + punpckhbw xmm1, xmm7 ; pred[ 8-15] + + ; modifier *= pixel_value + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + ; expand to double words + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm7 ; [ 0- 3] + punpckhwd xmm2, xmm7 ; [ 4- 7] + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm7 ; [ 8-11] + punpckhwd xmm3, xmm7 ; [12-15] + + ; accumulator + movdqa xmm4, [rdi] + movdqa xmm5, [rdi+16] + movdqa xmm6, [rdi+32] + movdqa xmm7, [rdi+48] + ; += modifier + paddd xmm4, xmm0 + paddd xmm5, xmm2 + paddd xmm6, xmm1 + paddd xmm7, xmm3 + ; write back + movdqa [rdi], xmm4 + movdqa [rdi+16], xmm5 + movdqa [rdi+32], xmm6 + movdqa [rdi+48], xmm7 + lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) + + cmp rdx, rcx + je .temporal_filter_apply_epilog + pxor xmm7, xmm7 ; zero for extraction + cmp dword ptr [rsp + block_size], 16 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 + +.temporal_filter_apply_epilog: + ; begin epilog + mov rbp, [rsp + rbp_backup] + add rsp, stack_size + pop rsp + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +_const_3w: + times 8 dw 3 +align 16 +_const_top_bit: + times 8 dw 1<<15 +align 16 +_const_16w: + times 8 dw 16 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c new file mode 100644 index 0000000000..d0752453ee --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vp8/encoder/block.h" + +int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) { + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_sse2(MACROBLOCK *mb) { + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_sse2_impl(s_ptr, d_ptr); +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c new file mode 100644 index 0000000000..581d2565ee --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/encoder/block.h" +#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ + +#include /* MMX */ +#include /* SSE */ +#include /* SSE2 */ + +#define SELECT_EOB(i, z) \ + do { \ + short boost = *zbin_boost_ptr; \ + int cmp = (x[z] < boost) | (y[z] == 0); \ + zbin_boost_ptr++; \ + if (cmp) break; \ + qcoeff_ptr[z] = y[z]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } while (0) + +void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) { + char eob = 0; + short *zbin_boost_ptr; + short *qcoeff_ptr = d->qcoeff; + DECLARE_ALIGNED(16, short, x[16]); + DECLARE_ALIGNED(16, short, y[16]); + + __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + memset(qcoeff_ptr, 0, 32); + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* Sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + _mm_store_si128((__m128i *)(x), x_minus_zbin0); + _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Return the sign: (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + y0 = _mm_sub_epi16(y0, sz0); + y1 = _mm_sub_epi16(y1, sz1); + + _mm_store_si128((__m128i *)(y), y0); + _mm_store_si128((__m128i *)(y + 8), y1); + + zbin_boost_ptr = b->zrun_zbin_boost; + + /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ + SELECT_EOB(1, 0); + SELECT_EOB(2, 1); + SELECT_EOB(3, 4); + SELECT_EOB(4, 8); + SELECT_EOB(5, 5); + SELECT_EOB(6, 2); + SELECT_EOB(7, 3); + SELECT_EOB(8, 6); + SELECT_EOB(9, 9); + SELECT_EOB(10, 12); + SELECT_EOB(11, 13); + SELECT_EOB(12, 10); + SELECT_EOB(13, 7); + SELECT_EOB(14, 11); + SELECT_EOB(15, 14); + SELECT_EOB(16, 15); + + y0 = _mm_load_si128((__m128i *)(d->qcoeff)); + y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); + + /* dqcoeff = qcoeff * dequant */ + y0 = _mm_mullo_epi16(y0, dequant0); + y1 = _mm_mullo_epi16(y1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), y0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); + + *d->eob = eob; +} + +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i inv_zig_zag0 = + _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); + __m128i inv_zig_zag1 = + _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); + + __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* x = abs(y) = (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + xdq0 = _mm_mullo_epi16(x0, dequant0); + xdq1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); + + /* build a mask for the zig zag */ + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpeq_epi16(x0, zeros); + x1 = _mm_cmpeq_epi16(x1, zeros); + + ones = _mm_cmpeq_epi16(zeros, zeros); + + x0 = _mm_xor_si128(x0, ones); + x1 = _mm_xor_si128(x1, ones); + + x0 = _mm_and_si128(x0, inv_zig_zag0); + x1 = _mm_and_si128(x1, inv_zig_zag1); + + x0 = _mm_max_epi16(x0, x1); + + /* now down to 8 */ + x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* only 4 left */ + x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* okay, just 2! */ + x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 + + x0 = _mm_max_epi16(x0, x1); + + *d->eob = 0xFF & _mm_cvtsi128_si32(x0); +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c new file mode 100644 index 0000000000..f6df146f08 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include /* SSSE3 */ + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" +#include "vpx_ports/bitops.h" /* get_msb */ + +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { + int eob, mask; + + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; + + DECLARE_ALIGNED(16, const uint8_t, + pshufb_zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* ASM saves Y for EOB */ + /* I think we can ignore that because adding the sign doesn't change anything + * and multiplying 0 by dequant is OK as well */ + abs0 = y0; + abs1 = y1; + + /* Restore the sign bit. */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + x0 = _mm_mullo_epi16(x0, dequant0); + x1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), x0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1); + + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpgt_epi16(abs0, zeros); + x1 = _mm_cmpgt_epi16(abs1, zeros); + + x = _mm_packs_epi16(x0, x1); + + x = _mm_shuffle_epi8(x, zig_zag); + + mask = _mm_movemask_epi8(x); + + /* x2 is needed to increase the result from non-zero masks by 1, + * +1 is needed to mask undefined behavior for a null argument, + * the result of get_msb(1) is 0 */ + eob = get_msb(mask * 2 + 1); + + *d->eob = eob; +} diff --git a/media/libvpx/libvpx/vp8/exports_dec b/media/libvpx/libvpx/vp8/exports_dec new file mode 100644 index 0000000000..100ac5c27d --- /dev/null +++ b/media/libvpx/libvpx/vp8/exports_dec @@ -0,0 +1,2 @@ +data vpx_codec_vp8_dx_algo +text vpx_codec_vp8_dx diff --git a/media/libvpx/libvpx/vp8/exports_enc b/media/libvpx/libvpx/vp8/exports_enc new file mode 100644 index 0000000000..29ff35ef7b --- /dev/null +++ b/media/libvpx/libvpx/vp8/exports_enc @@ -0,0 +1,2 @@ +data vpx_codec_vp8_cx_algo +text vpx_codec_vp8_cx diff --git a/media/libvpx/libvpx/vp8/vp8_common.mk b/media/libvpx/libvpx/vp8/vp8_common.mk new file mode 100644 index 0000000000..d485965d3d --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_common.mk @@ -0,0 +1,149 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +VP8_COMMON_SRCS-yes += vp8_common.mk +VP8_COMMON_SRCS-yes += common/ppflags.h +VP8_COMMON_SRCS-yes += common/onyx.h +VP8_COMMON_SRCS-yes += common/onyxd.h +VP8_COMMON_SRCS-yes += common/alloccommon.c +VP8_COMMON_SRCS-yes += common/blockd.c +VP8_COMMON_SRCS-yes += common/coefupdateprobs.h +# VP8_COMMON_SRCS-yes += common/debugmodes.c +VP8_COMMON_SRCS-yes += common/default_coef_probs.h +VP8_COMMON_SRCS-yes += common/dequantize.c +VP8_COMMON_SRCS-yes += common/entropy.c +VP8_COMMON_SRCS-yes += common/entropymode.c +VP8_COMMON_SRCS-yes += common/entropymv.c +VP8_COMMON_SRCS-yes += common/extend.c +VP8_COMMON_SRCS-yes += common/filter.c +VP8_COMMON_SRCS-yes += common/filter.h +VP8_COMMON_SRCS-yes += common/findnearmv.c +VP8_COMMON_SRCS-yes += common/generic/systemdependent.c +VP8_COMMON_SRCS-yes += common/idct_blk.c +VP8_COMMON_SRCS-yes += common/idctllm.c +VP8_COMMON_SRCS-yes += common/alloccommon.h +VP8_COMMON_SRCS-yes += common/blockd.h +VP8_COMMON_SRCS-yes += common/common.h +VP8_COMMON_SRCS-yes += common/entropy.h +VP8_COMMON_SRCS-yes += common/entropymode.h +VP8_COMMON_SRCS-yes += common/entropymv.h +VP8_COMMON_SRCS-yes += common/extend.h +VP8_COMMON_SRCS-yes += common/findnearmv.h +VP8_COMMON_SRCS-yes += common/header.h +VP8_COMMON_SRCS-yes += common/invtrans.h +VP8_COMMON_SRCS-yes += common/loopfilter.h +VP8_COMMON_SRCS-yes += common/modecont.h +VP8_COMMON_SRCS-yes += common/mv.h +VP8_COMMON_SRCS-yes += common/onyxc_int.h +VP8_COMMON_SRCS-yes += common/quant_common.h +VP8_COMMON_SRCS-yes += common/reconinter.h +VP8_COMMON_SRCS-yes += common/reconintra.h +VP8_COMMON_SRCS-yes += common/reconintra4x4.h +VP8_COMMON_SRCS-yes += common/rtcd.c +VP8_COMMON_SRCS-yes += common/rtcd_defs.pl +VP8_COMMON_SRCS-yes += common/setupintrarecon.h +VP8_COMMON_SRCS-yes += common/swapyv12buffer.h +VP8_COMMON_SRCS-yes += common/systemdependent.h +VP8_COMMON_SRCS-yes += common/threading.h +VP8_COMMON_SRCS-yes += common/treecoder.h +VP8_COMMON_SRCS-yes += common/vp8_loopfilter.c +VP8_COMMON_SRCS-yes += common/loopfilter_filters.c +VP8_COMMON_SRCS-yes += common/mbpitch.c +VP8_COMMON_SRCS-yes += common/modecont.c +VP8_COMMON_SRCS-yes += common/quant_common.c +VP8_COMMON_SRCS-yes += common/reconinter.c +VP8_COMMON_SRCS-yes += common/reconintra.c +VP8_COMMON_SRCS-yes += common/reconintra4x4.c +VP8_COMMON_SRCS-yes += common/setupintrarecon.c +VP8_COMMON_SRCS-yes += common/swapyv12buffer.c +VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h + + + +VP8_COMMON_SRCS-yes += common/treecoder.c + +VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/vp8_asm_stubs.c +VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/loopfilter_x86.c +VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c +VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h +VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm + +ifeq ($(CONFIG_POSTPROC),yes) +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm +endif + +ifeq ($(VPX_ARCH_X86_64),yes) +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm +endif + +# common (c) +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idctllm_dspr2.c +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/filter_dspr2.c +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/reconinter_dspr2.c +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c +VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c + +# common (c) +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/bilinear_filter_msa.c +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h + +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + +ifeq ($(CONFIG_POSTPROC),yes) +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c +endif + +# common (loongarch LSX intrinsics) +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c + +# common (neon intrinsics) +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.h +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_loopfilter_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c + +$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c new file mode 100644 index 0000000000..1f16cc53d3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -0,0 +1,1398 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/vpx_encoder.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_version.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/static_assert.h" +#include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif +#include "vp8/encoder/onyx_int.h" +#include "vpx/vp8cx.h" +#include "vp8/encoder/firstpass.h" +#include "vp8/common/onyx.h" +#include "vp8/common/common.h" +#include +#include + +struct vp8_extracfg { + struct vpx_codec_pkt_list *pkt_list; + int cpu_used; /** available cpu percentage in 1/16*/ + /** if encoder decides to uses alternate reference frame */ + unsigned int enable_auto_alt_ref; + unsigned int noise_sensitivity; + unsigned int Sharpness; + unsigned int static_thresh; + unsigned int token_partitions; + unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */ + unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */ + unsigned int arnr_type; /* alt_ref filter type */ + vp8e_tuning tuning; + unsigned int cq_level; /* constrained quality level */ + unsigned int rc_max_intra_bitrate_pct; + unsigned int gf_cbr_boost_pct; + unsigned int screen_content_mode; +}; + +static struct vp8_extracfg default_extracfg = { + NULL, +#if !(CONFIG_REALTIME_ONLY) + 0, /* cpu_used */ +#else + 4, /* cpu_used */ +#endif + 0, /* enable_auto_alt_ref */ + 0, /* noise_sensitivity */ + 0, /* Sharpness */ + 0, /* static_thresh */ +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + VP8_EIGHT_TOKENPARTITION, +#else + VP8_ONE_TOKENPARTITION, /* token_partitions */ +#endif + 0, /* arnr_max_frames */ + 3, /* arnr_strength */ + 3, /* arnr_type*/ + 0, /* tuning*/ + 10, /* cq_level */ + 0, /* rc_max_intra_bitrate_pct */ + 0, /* gf_cbr_boost_pct */ + 0, /* screen_content_mode */ +}; + +struct vpx_codec_alg_priv { + vpx_codec_priv_t base; + vpx_codec_enc_cfg_t cfg; + struct vp8_extracfg vp8_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; + VP8_CONFIG oxcf; + struct VP8_COMP *cpi; + unsigned char *cx_data; + unsigned int cx_data_sz; + vpx_image_t preview_img; + unsigned int next_frame_flag; + vp8_postproc_cfg_t preview_ppcfg; + /* pkt_list size depends on the maximum number of lagged frames allowed. */ + vpx_codec_pkt_list_decl(64) pkt_list; + unsigned int fixed_kf_cntr; + vpx_enc_frame_flags_t control_frame_flags; +}; + +// Called by vp8e_set_config() and vp8e_encode() only. Must not be called +// by vp8e_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). +static vpx_codec_err_t update_error_state( + vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { + const vpx_codec_err_t res = error->error_code; + + if (res != VPX_CODEC_OK) + ctx->base.err_detail = error->has_detail ? error->detail : NULL; + + return res; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_HI(p, memb, hi) \ + do { \ + if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_LO(p, memb, lo) \ + do { \ + if (!((p)->memb >= (lo))) ERROR(#memb " out of range [" #lo "..]"); \ + } while (0) + +#define RANGE_CHECK_BOOL(p, memb) \ + do { \ + if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ + } while (0) + +static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg, + const struct vp8_extracfg *vp8_cfg, + int finalize) { + RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ + RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ + RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); + RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); + RANGE_CHECK_HI(cfg, g_profile, 3); + RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); + RANGE_CHECK_HI(cfg, g_threads, 64); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); +#elif CONFIG_MULTI_RES_ENCODING + if (ctx->base.enc.total_encoders > 1) RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); +#else + RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); +#endif + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); + +/* TODO: add spatial re-sampling support and frame dropping in + * multi-res-encoder.*/ +#if CONFIG_MULTI_RES_ENCODING + if (ctx->base.enc.total_encoders > 1) + RANGE_CHECK_HI(cfg, rc_resize_allowed, 0); +#else + RANGE_CHECK_BOOL(cfg, rc_resize_allowed); +#endif + RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); + RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); + RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); + +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#elif CONFIG_MULTI_RES_ENCODING + if (ctx->base.enc.total_encoders > 1) + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif + + /* VP8 does not support a lower bound on the keyframe interval in + * automatic keyframe placement mode. + */ + if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && + cfg->kf_min_dist > 0) + ERROR( + "kf_min_dist not supported in auto mode, use 0 " + "or kf_max_dist instead."); + + RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref); + RANGE_CHECK(vp8_cfg, cpu_used, -16, 16); + +#if CONFIG_REALTIME_ONLY && !CONFIG_TEMPORAL_DENOISING + RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 0); +#else + RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6); +#endif + + RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, + VP8_EIGHT_TOKENPARTITION); + RANGE_CHECK_HI(vp8_cfg, Sharpness, 7); + RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15); + RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); + RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); + RANGE_CHECK(vp8_cfg, cq_level, 0, 63); + RANGE_CHECK_HI(vp8_cfg, screen_content_mode, 2); + if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q)) + RANGE_CHECK(vp8_cfg, cq_level, cfg->rc_min_quantizer, + cfg->rc_max_quantizer); + +#if !(CONFIG_REALTIME_ONLY) + if (cfg->g_pass == VPX_RC_LAST_PASS) { + size_t packet_sz = sizeof(FIRSTPASS_STATS); + int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + FIRSTPASS_STATS *stats; + + if (!cfg->rc_twopass_stats_in.buf) + ERROR("rc_twopass_stats_in.buf not set."); + + if (cfg->rc_twopass_stats_in.sz % packet_sz) + ERROR("rc_twopass_stats_in.sz indicates truncated packet."); + + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = (void *)((char *)cfg->rc_twopass_stats_in.buf + + (n_packets - 1) * packet_sz); + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } +#endif + + RANGE_CHECK(cfg, ts_number_layers, 1, 5); + + if (cfg->ts_number_layers > 1) { + unsigned int i; + RANGE_CHECK_HI(cfg, ts_periodicity, 16); + + for (i = 1; i < cfg->ts_number_layers; ++i) { + if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i - 1] && + cfg->rc_target_bitrate > 0) + ERROR("ts_target_bitrate entries are not strictly increasing"); + } + + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1); + for (i = cfg->ts_number_layers - 2; i > 0; i--) { + if (cfg->ts_rate_decimator[i - 1] != 2 * cfg->ts_rate_decimator[i]) + ERROR("ts_rate_decimator factors are not powers of 2"); + } + + RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers - 1); + } + +#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) + if (cfg->g_threads > (1 << vp8_cfg->token_partitions)) + ERROR("g_threads cannot be bigger than number of token partitions"); +#endif + + // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_NV12: break; + default: + ERROR( + "Invalid image format. Only YV12, I420 and NV12 images are " + "supported"); + } + + if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h)) + ERROR("Image size must match encoder init configuration size"); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, + vpx_codec_enc_cfg_t cfg, + struct vp8_extracfg vp8_cfg, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { + oxcf->multi_threaded = cfg.g_threads; + oxcf->Version = cfg.g_profile; + + oxcf->Width = cfg.g_w; + oxcf->Height = cfg.g_h; + oxcf->timebase = cfg.g_timebase; + + oxcf->error_resilient_mode = cfg.g_error_resilient; + + switch (cfg.g_pass) { + case VPX_RC_ONE_PASS: oxcf->Mode = MODE_BESTQUALITY; break; + case VPX_RC_FIRST_PASS: oxcf->Mode = MODE_FIRSTPASS; break; + case VPX_RC_LAST_PASS: oxcf->Mode = MODE_SECONDPASS_BEST; break; + } + + if (cfg.g_pass == VPX_RC_FIRST_PASS || cfg.g_pass == VPX_RC_ONE_PASS) { + oxcf->allow_lag = 0; + oxcf->lag_in_frames = 0; + } else { + oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; + oxcf->lag_in_frames = cfg.g_lag_in_frames; + } + + oxcf->allow_df = (cfg.rc_dropframe_thresh > 0); + oxcf->drop_frames_water_mark = cfg.rc_dropframe_thresh; + + oxcf->allow_spatial_resampling = cfg.rc_resize_allowed; + oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh; + oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh; + + if (cfg.rc_end_usage == VPX_VBR) { + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + } else if (cfg.rc_end_usage == VPX_CBR) { + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + } else if (cfg.rc_end_usage == VPX_CQ) { + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + } else if (cfg.rc_end_usage == VPX_Q) { + oxcf->end_usage = USAGE_CONSTANT_QUALITY; + } + + // Cap the target rate to 1000 Mbps to avoid some integer overflows in + // target bandwidth calculations. + oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000); + oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; + + oxcf->best_allowed_q = cfg.rc_min_quantizer; + oxcf->worst_allowed_q = cfg.rc_max_quantizer; + oxcf->cq_level = vp8_cfg.cq_level; + oxcf->fixed_q = -1; + + oxcf->under_shoot_pct = cfg.rc_undershoot_pct; + oxcf->over_shoot_pct = cfg.rc_overshoot_pct; + + oxcf->maximum_buffer_size_in_ms = cfg.rc_buf_sz; + oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz; + oxcf->optimal_buffer_level_in_ms = cfg.rc_buf_optimal_sz; + + oxcf->maximum_buffer_size = cfg.rc_buf_sz; + oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; + oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; + + oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct; + oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct; + + oxcf->auto_key = + cfg.kf_mode == VPX_KF_AUTO && cfg.kf_min_dist != cfg.kf_max_dist; + oxcf->key_freq = cfg.kf_max_dist; + + oxcf->number_of_layers = cfg.ts_number_layers; + oxcf->periodicity = cfg.ts_periodicity; + + if (oxcf->number_of_layers > 1) { + memcpy(oxcf->target_bitrate, cfg.ts_target_bitrate, + sizeof(cfg.ts_target_bitrate)); + memcpy(oxcf->rate_decimator, cfg.ts_rate_decimator, + sizeof(cfg.ts_rate_decimator)); + memcpy(oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); + } + +#if CONFIG_MULTI_RES_ENCODING + /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id + * are both memset to 0, which ensures the correct logic under this + * situation. + */ + if (mr_cfg) { + oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; + oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; + oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; + oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; + } +#else + (void)mr_cfg; +#endif + + oxcf->cpu_used = vp8_cfg.cpu_used; + if (cfg.g_pass == VPX_RC_FIRST_PASS) { + oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used); + } + oxcf->encode_breakout = vp8_cfg.static_thresh; + oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; + oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; + oxcf->Sharpness = vp8_cfg.Sharpness; + oxcf->token_partitions = vp8_cfg.token_partitions; + + oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; + oxcf->output_pkt_list = vp8_cfg.pkt_list; + + oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; + oxcf->arnr_strength = vp8_cfg.arnr_strength; + oxcf->arnr_type = vp8_cfg.arnr_type; + + oxcf->tuning = vp8_cfg.tuning; + + oxcf->screen_content_mode = vp8_cfg.screen_content_mode; + + /* + printf("Current VP8 Settings: \n"); + printf("target_bandwidth: %d\n", oxcf->target_bandwidth); + printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity); + printf("Sharpness: %d\n", oxcf->Sharpness); + printf("cpu_used: %d\n", oxcf->cpu_used); + printf("Mode: %d\n", oxcf->Mode); + printf("auto_key: %d\n", oxcf->auto_key); + printf("key_freq: %d\n", oxcf->key_freq); + printf("end_usage: %d\n", oxcf->end_usage); + printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct); + printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct); + printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level); + printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level); + printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size); + printf("fixed_q: %d\n", oxcf->fixed_q); + printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q); + printf("best_allowed_q: %d\n", oxcf->best_allowed_q); + printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling); + printf("resample_down_water_mark: %d\n", oxcf->resample_down_water_mark); + printf("resample_up_water_mark: %d\n", oxcf->resample_up_water_mark); + printf("allow_df: %d\n", oxcf->allow_df); + printf("drop_frames_water_mark: %d\n", oxcf->drop_frames_water_mark); + printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); + printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); + printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("allow_lag: %d\n", oxcf->allow_lag); + printf("lag_in_frames: %d\n", oxcf->lag_in_frames); + printf("play_alternate: %d\n", oxcf->play_alternate); + printf("Version: %d\n", oxcf->Version); + printf("multi_threaded: %d\n", oxcf->multi_threaded); + printf("encode_breakout: %d\n", oxcf->encode_breakout); + */ + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { + vpx_codec_err_t res; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + ERROR("Cannot increase width or height larger than their initial values"); + } + + /* Prevent increasing lag_in_frames. This check is stricter than it needs + * to be -- the limit is not increasing past the first lag_in_frames + * value, but we don't track the initial config, only the last successful + * config. + */ + if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)) + ERROR("Cannot increase lag_in_frames"); + + res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; + + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; + } + + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; +#endif + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp8_get_quantizer(ctx->cpi); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t get_quantizer64(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp8_reverse_trans(vp8_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx, + const struct vp8_extracfg *extra_cfg) { + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg, 0); + if (res == VPX_CODEC_OK) { + ctx->vp8_cfg = *extra_cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} + +static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + // Use fastest speed setting (speed 16 or -16) if it's set beyond the range. + extra_cfg.cpu_used = VPXMIN(16, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-16, extra_cfg.cpu_used); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_sharpness(vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.Sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_token_partitions(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.token_partitions = CAST(VP8E_SET_TOKEN_PARTITIONS, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_type(vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_tuning(vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_cq_level(vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.screen_content_mode = CAST(VP8E_SET_SCREEN_CONTENT_MODE, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP8_COMP *cpi = ctx->cpi; + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); + if (data) { + cpi->cyclic_refresh_mode_enabled = 0; + cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc) { + vpx_codec_err_t res = VPX_CODEC_OK; + +#if CONFIG_MULTI_RES_ENCODING + LOWER_RES_FRAME_INFO *shared_mem_loc; + int mb_rows = ((cfg->g_w + 15) >> 4); + int mb_cols = ((cfg->g_h + 15) >> 4); + + shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO)); + if (!shared_mem_loc) { + return VPX_CODEC_MEM_ERROR; + } + + shared_mem_loc->mb_info = + calloc(mb_rows * mb_cols, sizeof(LOWER_RES_MB_INFO)); + if (!(shared_mem_loc->mb_info)) { + free(shared_mem_loc); + res = VPX_CODEC_MEM_ERROR; + } else { + *mem_loc = (void *)shared_mem_loc; + res = VPX_CODEC_OK; + } +#else + (void)cfg; + (void)mem_loc; +#endif + return res; +} + +static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { + vpx_codec_err_t res = VPX_CODEC_OK; + + vp8_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + + if (!ctx->priv) { + struct vpx_codec_alg_priv *priv = + (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv)); + + if (!priv) { + return VPX_CODEC_MEM_ERROR; + } + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + + if (ctx->config.enc) { + /* Update the reference to the config structure to an + * internal copy. + */ + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; + } + + priv->vp8_cfg = default_extracfg; + priv->vp8_cfg.pkt_list = &priv->pkt_list.head; + + priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; + + if (priv->cx_data_sz < 32768) priv->cx_data_sz = 32768; + + priv->cx_data = malloc(priv->cx_data_sz); + + if (!priv->cx_data) { + return VPX_CODEC_MEM_ERROR; + } + + if (mr_cfg) { + ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions; + } else { + ctx->priv->enc.total_encoders = 1; + } + + vp8_initialize_enc(); + + res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); + + if (!res) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); + priv->cpi = vp8_create_compressor(&priv->oxcf); + if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; + } + } + + return res; +} + +static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { +#if CONFIG_MULTI_RES_ENCODING + /* Free multi-encoder shared memory */ + if (ctx->oxcf.mr_total_resolutions > 0 && + (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions - 1)) { + LOWER_RES_FRAME_INFO *shared_mem_loc = + (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info; + free(shared_mem_loc->mb_info); + free(ctx->oxcf.mr_low_res_mode_info); + } +#endif + + free(ctx->cx_data); + vp8_remove_compressor(&ctx->cpi); + vpx_free(ctx); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + const int y_w = img->d_w; + const int y_h = img->d_h; + const int uv_w = (img->d_w + 1) / 2; + const int uv_h = (img->d_h + 1) / 2; + vpx_codec_err_t res = VPX_CODEC_OK; + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = y_w; + yv12->y_crop_height = y_h; + yv12->y_width = y_w; + yv12->y_height = y_h; + yv12->uv_crop_width = uv_w; + yv12->uv_crop_height = uv_h; + yv12->uv_width = uv_w; + yv12->uv_height = uv_h; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + + yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; + return res; +} + +static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { + int new_qc; + +#if !(CONFIG_REALTIME_ONLY) + /* Use best quality mode if no deadline is given. */ + new_qc = MODE_BESTQUALITY; + + if (deadline) { + /* Convert duration parameter from stream timebase to microseconds */ + uint64_t duration_us; + + VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + + /* If the deadline is more that the duration this frame is to be shown, + * use good quality mode. Otherwise use realtime mode. + */ + new_qc = (deadline > duration_us) ? MODE_GOODQUALITY : MODE_REALTIME; + } + +#else + (void)duration; + new_qc = MODE_REALTIME; +#endif + + if (deadline == VPX_DL_REALTIME) { + new_qc = MODE_REALTIME; + } else if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS) { + new_qc = MODE_FIRSTPASS; + } else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS) { + new_qc = + (new_qc == MODE_BESTQUALITY) ? MODE_SECONDPASS_BEST : MODE_SECONDPASS; + } + + if (ctx->oxcf.Mode != new_qc) { + ctx->oxcf.Mode = new_qc; + vp8_change_config(ctx->cpi, &ctx->oxcf); + } +} + +static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, + vpx_enc_frame_flags_t flags) { + /* Handle Flags */ + if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) || + ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) { + ctx->base.err_detail = "Conflicting flags."; + return VPX_CODEC_INVALID_PARAM; + } + + if (flags & + (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF)) { + int ref = 7; + + if (flags & VP8_EFLAG_NO_REF_LAST) ref ^= VP8_LAST_FRAME; + + if (flags & VP8_EFLAG_NO_REF_GF) ref ^= VP8_GOLD_FRAME; + + if (flags & VP8_EFLAG_NO_REF_ARF) ref ^= VP8_ALTR_FRAME; + + vp8_use_as_reference(ctx->cpi, ref); + } + + if (flags & + (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_FORCE_GF | VP8_EFLAG_FORCE_ARF)) { + int upd = 7; + + if (flags & VP8_EFLAG_NO_UPD_LAST) upd ^= VP8_LAST_FRAME; + + if (flags & VP8_EFLAG_NO_UPD_GF) upd ^= VP8_GOLD_FRAME; + + if (flags & VP8_EFLAG_NO_UPD_ARF) upd ^= VP8_ALTR_FRAME; + + vp8_update_reference(ctx->cpi, upd); + } + + if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { + vp8_update_entropy(ctx->cpi, 0); + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t enc_flags, + vpx_enc_deadline_t deadline) { + volatile vpx_codec_err_t res = VPX_CODEC_OK; + // Make a copy as volatile to avoid -Wclobbered with longjmp. + volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts_val = pts; + + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } + + if (img) res = validate_img(ctx, img); + + if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + pts_val -= ctx->pts_offset; + + pick_quickcompress_mode(ctx, duration, deadline); + vpx_codec_pkt_list_init(&ctx->pkt_list); + + // If no flags are set in the encode call, then use the frame flags as + // defined via the control function: vp8e_set_frame_flags. + if (!flags) { + flags = ctx->control_frame_flags; + } + ctx->control_frame_flags = 0; + + if (!res) res = set_reference_and_update(ctx, flags); + + /* Handle fixed keyframe intervals */ + if (ctx->cfg.kf_mode == VPX_KF_AUTO && + ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { + if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) { + flags |= VPX_EFLAG_FORCE_KF; + ctx->fixed_kf_cntr = 1; + } + } + + /* Initialize the encoder instance on the first frame*/ + if (!res && ctx->cpi) { + unsigned int lib_flags; + YV12_BUFFER_CONFIG sd; + int64_t dst_time_stamp, dst_end_time_stamp; + size_t size, cx_data_sz; + unsigned char *cx_data; + unsigned char *cx_data_end; + int comp_data_state = 0; + + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &ctx->cpi->common.error); + vpx_clear_system_state(); + return res; + } + ctx->cpi->common.error.setjmp = 1; + + /* Set up internal flags */ + if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { + ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; + } + + if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION) { + ((VP8_COMP *)ctx->cpi)->output_partition = 1; + } + + /* Convert API flags to internal codec lib flags */ + lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + + dst_time_stamp = + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + dst_end_time_stamp = (pts_val + (int64_t)duration) * + ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + + if (img != NULL) { + res = image2yuvconfig(img, &sd); + + if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { + /* from vpx_encoder.h for g_w/g_h: + "Note that the frames passed as input to the encoder must have this + resolution" + */ + ctx->base.err_detail = "Invalid input frame resolution"; + res = VPX_CODEC_INVALID_PARAM; + } else { + if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, + &sd, dst_time_stamp, dst_end_time_stamp)) { + VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; + res = update_error_state(ctx, &cpi->common.error); + } + } + + /* reset for next frame */ + ctx->next_frame_flag = 0; + } + + cx_data = ctx->cx_data; + cx_data_sz = ctx->cx_data_sz; + cx_data_end = ctx->cx_data + cx_data_sz; + lib_flags = 0; + + while (cx_data_sz >= ctx->cx_data_sz / 2) { + comp_data_state = vp8_get_compressed_data( + ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, + &dst_end_time_stamp, !img); + + if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) { + return VPX_CODEC_CORRUPT_FRAME; + } else if (comp_data_state == -1) { + break; + } + + if (size) { + vpx_codec_pts_t round, delta; + vpx_codec_cx_pkt_t pkt; + VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; + + /* Add the frame packet to the list of returned packets. */ + round = (vpx_codec_pts_t)ctx->timestamp_ratio.num / 2; + if (round > 0) --round; + delta = (dst_end_time_stamp - dst_time_stamp); + pkt.kind = VPX_CODEC_CX_FRAME_PKT; + pkt.data.frame.pts = + (dst_time_stamp * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num + + ctx->pts_offset; + pkt.data.frame.duration = + (unsigned long)((delta * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num); + pkt.data.frame.flags = lib_flags << 16; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; + pkt.data.frame.spatial_layer_encoded[0] = 1; + + if (lib_flags & FRAMEFLAGS_KEY) { + pkt.data.frame.flags |= VPX_FRAME_IS_KEY; + } + + if (!cpi->common.show_frame) { + pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE; + + /* This timestamp should be as close as possible to the + * prior PTS so that if a decoder uses pts to schedule when + * to do this, we start right after last frame was decoded. + * Invisible frames have no duration. + */ + pkt.data.frame.pts = + ((cpi->last_time_stamp_seen * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num) + + ctx->pts_offset + 1; + pkt.data.frame.duration = 0; + } + + if (cpi->droppable) pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE; + + if (cpi->output_partition) { + int i; + const int num_partitions = + (1 << cpi->common.multi_token_partition) + 1; + + pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT; + + for (i = 0; i < num_partitions; ++i) { +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + pkt.data.frame.buf = cpi->partition_d[i]; +#else + pkt.data.frame.buf = cx_data; + cx_data += cpi->partition_sz[i]; + cx_data_sz -= cpi->partition_sz[i]; +#endif + pkt.data.frame.sz = cpi->partition_sz[i]; + pkt.data.frame.partition_id = i; + /* don't set the fragment bit for the last partition */ + if (i == (num_partitions - 1)) { + pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT; + } + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + } +#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING + /* In lagged mode the encoder can buffer multiple frames. + * We don't want this in partitioned output because + * partitions are spread all over the output buffer. + * So, force an exit! + */ + cx_data_sz -= ctx->cx_data_sz / 2; +#endif + } else { + pkt.data.frame.buf = cx_data; + pkt.data.frame.sz = size; + pkt.data.frame.partition_id = -1; + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + cx_data += size; + cx_data_sz -= size; + } + } + } + ctx->cpi->common.error.setjmp = 0; + } + + return res; +} + +static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { + return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter); +} + +static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + vp8_set_reference(ctx->cpi, frame->frame_type, &sd); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + vp8_get_reference(ctx->cpi, frame->frame_type, &sd); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC + vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + + if (data) { + ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) { + YV12_BUFFER_CONFIG sd; + vp8_ppflags_t flags; + vp8_zero(flags); + + if (ctx->preview_ppcfg.post_proc_flag) { + flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; + flags.deblocking_level = ctx->preview_ppcfg.deblocking_level; + flags.noise_level = ctx->preview_ppcfg.noise_level; + } + + if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags)) { + /* + vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12, + sd.y_width + 2*VP8BORDERINPIXELS, + sd.y_height + 2*VP8BORDERINPIXELS, + 1, + sd.buffer_alloc); + vpx_img_set_rect(&ctx->preview_img, + VP8BORDERINPIXELS, VP8BORDERINPIXELS, + sd.y_width, sd.y_height); + */ + + ctx->preview_img.bps = 12; + ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer; + ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer; + ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer; + + ctx->preview_img.fmt = VPX_IMG_FMT_I420; + ctx->preview_img.x_chroma_shift = 1; + ctx->preview_img.y_chroma_shift = 1; + + ctx->preview_img.d_w = sd.y_width; + ctx->preview_img.d_h = sd.y_height; + ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride; + ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride; + ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride; + ctx->preview_img.w = sd.y_width; + ctx->preview_img.h = sd.y_height; + + return &ctx->preview_img; + } else { + return NULL; + } +} + +static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx, + va_list args) { + int frame_flags = va_arg(args, int); + ctx->control_frame_flags = frame_flags; + return set_reference_and_update(ctx, frame_flags); +} + +static vpx_codec_err_t vp8e_set_temporal_layer_id(vpx_codec_alg_priv_t *ctx, + va_list args) { + int layer_id = va_arg(args, int); + if (layer_id < 0 || layer_id >= (int)ctx->cfg.ts_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } + ctx->cpi->temporal_layer_id = layer_id; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); + + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + + if (!vp8_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->static_threshold)) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_active_map_t *data = va_arg(args, vpx_active_map_t *); + + if (data) { + vpx_active_map_t *map = (vpx_active_map_t *)data; + + if (!vp8_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols)) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *); + + if (data) { + int res; + vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); + + if (!res) { + /*force next frame a key frame to effect scaling mode */ + ctx->next_frame_flag |= FRAMEFLAGS_KEY; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { + { VP8_SET_REFERENCE, vp8e_set_reference }, + { VP8_COPY_REFERENCE, vp8e_get_reference }, + { VP8_SET_POSTPROC, vp8e_set_previewpp }, + { VP8E_SET_FRAME_FLAGS, vp8e_set_frame_flags }, + { VP8E_SET_TEMPORAL_LAYER_ID, vp8e_set_temporal_layer_id }, + { VP8E_SET_ROI_MAP, vp8e_set_roi_map }, + { VP8E_SET_ACTIVEMAP, vp8e_set_activemap }, + { VP8E_SET_SCALEMODE, vp8e_set_scalemode }, + { VP8E_SET_CPUUSED, set_cpu_used }, + { VP8E_SET_NOISE_SENSITIVITY, set_noise_sensitivity }, + { VP8E_SET_ENABLEAUTOALTREF, set_enable_auto_alt_ref }, + { VP8E_SET_SHARPNESS, set_sharpness }, + { VP8E_SET_STATIC_THRESHOLD, set_static_thresh }, + { VP8E_SET_TOKEN_PARTITIONS, set_token_partitions }, + { VP8E_GET_LAST_QUANTIZER, get_quantizer }, + { VP8E_GET_LAST_QUANTIZER_64, get_quantizer64 }, + { VP8E_SET_ARNR_MAXFRAMES, set_arnr_max_frames }, + { VP8E_SET_ARNR_STRENGTH, set_arnr_strength }, + { VP8E_SET_ARNR_TYPE, set_arnr_type }, + { VP8E_SET_TUNING, set_tuning }, + { VP8E_SET_CQ_LEVEL, set_cq_level }, + { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, + { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, + { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, + { -1, NULL }, +}; + +static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { + { 0, + { + 0, /* g_usage (unused) */ + 0, /* g_threads */ + 0, /* g_profile */ + + 320, /* g_width */ + 240, /* g_height */ + VPX_BITS_8, /* g_bit_depth */ + 8, /* g_input_bit_depth */ + + { 1, 30 }, /* g_timebase */ + + 0, /* g_error_resilient */ + + VPX_RC_ONE_PASS, /* g_pass */ + + 0, /* g_lag_in_frames */ + + 0, /* rc_dropframe_thresh */ + 0, /* rc_resize_allowed */ + 1, /* rc_scaled_width */ + 1, /* rc_scaled_height */ + 60, /* rc_resize_down_thresh */ + 30, /* rc_resize_up_thresh */ + + VPX_VBR, /* rc_end_usage */ + { NULL, 0 }, /* rc_twopass_stats_in */ + { NULL, 0 }, /* rc_firstpass_mb_stats_in */ + 256, /* rc_target_bitrate */ + 4, /* rc_min_quantizer */ + 63, /* rc_max_quantizer */ + 100, /* rc_undershoot_pct */ + 100, /* rc_overshoot_pct */ + + 6000, /* rc_max_buffer_size */ + 4000, /* rc_buffer_initial_size; */ + 5000, /* rc_buffer_optimal_size; */ + + 50, /* rc_two_pass_vbrbias */ + 0, /* rc_two_pass_vbrmin_section */ + 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only has meaningfull for VP9) + + /* keyframing settings (kf) */ + VPX_KF_AUTO, /* g_kfmode*/ + 0, /* kf_min_dist */ + 128, /* kf_max_dist */ + + VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + { 0 }, + { 0 }, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + { 0 }, /* ts_target_bitrate */ + { 0 }, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + { 0 }, /* ts_layer_id */ + { 0 }, /* layer_target_bitrate */ + 0, /* temporal_layering_mode */ + 0, /* use_vizier_rc_params */ + { 1, 1 }, /* active_wq_factor */ + { 1, 1 }, /* err_per_mb_factor */ + { 1, 1 }, /* sr_default_decay_limit */ + { 1, 1 }, /* sr_diff_factor */ + { 1, 1 }, /* kf_err_per_mb_factor */ + { 1, 1 }, /* kf_frame_min_boost_factor */ + { 1, 1 }, /* kf_frame_max_boost_first_factor */ + { 1, 1 }, /* kf_frame_max_boost_subs_factor */ + { 1, 1 }, /* kf_max_total_boost_factor */ + { 1, 1 }, /* gf_max_total_boost_factor */ + { 1, 1 }, /* gf_frame_max_boost_factor */ + { 1, 1 }, /* zm_factor */ + { 1, 1 }, /* rd_mult_inter_qp_fac */ + { 1, 1 }, /* rd_mult_arf_qp_fac */ + { 1, 1 }, /* rd_mult_key_qp_fac */ + } }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp8_cx) = { + "WebM Project VP8 Encoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, + VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR | VPX_CODEC_CAP_OUTPUT_PARTITION, + /* vpx_codec_caps_t caps; */ + vp8e_init, /* vpx_codec_init_fn_t init; */ + vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */ + vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ + { + NULL, /* vpx_codec_peek_si_fn_t peek_si; */ + NULL, /* vpx_codec_get_si_fn_t get_si; */ + NULL, /* vpx_codec_decode_fn_t decode; */ + NULL, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, /* vpx_codec_set_fb_fn_t set_fb_fn; */ + }, + { + 1, /* 1 cfg map */ + vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ + vp8e_encode, /* vpx_codec_encode_fn_t encode; */ + vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, + } /* encoder functions */ +}; diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c new file mode 100644 index 0000000000..e81deaf4ea --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/vpx_decoder.h" +#include "vpx/vp8dx.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_version.h" +#include "common/alloccommon.h" +#include "common/common.h" +#include "common/onyxc_int.h" +#include "common/onyxd.h" +#include "decoder/onyxd_int.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" +#if CONFIG_ERROR_CONCEALMENT +#include "decoder/error_concealment.h" +#endif +#include "decoder/decoderthreading.h" + +#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) +#define VP8_CAP_ERROR_CONCEALMENT \ + (CONFIG_ERROR_CONCEALMENT ? VPX_CODEC_CAP_ERROR_CONCEALMENT : 0) + +typedef vpx_codec_stream_info_t vp8_stream_info_t; + +/* Structures for handling memory allocations */ +typedef enum { VP8_SEG_ALG_PRIV = 256, VP8_SEG_MAX } mem_seg_id_t; +#define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0]))) + +struct vpx_codec_alg_priv { + vpx_codec_priv_t base; + vpx_codec_dec_cfg_t cfg; + vp8_stream_info_t si; + int decoder_init; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif + int postproc_cfg_set; + vp8_postproc_cfg_t postproc_cfg; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + vpx_image_t img; + int img_setup; + struct frame_buffers yv12_frame_buffers; + void *user_priv; + FRAGMENT_DATA fragments; +}; + +static int vp8_init_ctx(vpx_codec_ctx_t *ctx) { + vpx_codec_alg_priv_t *priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + if (!priv) return 1; + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + + priv->si.sz = sizeof(priv->si); + priv->decrypt_cb = NULL; + priv->decrypt_state = NULL; + + if (ctx->config.dec) { + /* Update the reference to the config structure to an internal copy. */ + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; + } + + return 0; +} + +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_err_t res = VPX_CODEC_OK; + (void)data; + + vp8_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + + /* This function only allocates space for the vpx_codec_alg_priv_t + * structure. More memory may be required at the time the stream + * information becomes known. + */ + if (!ctx->priv) { + vpx_codec_alg_priv_t *priv; + + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; + + priv = (vpx_codec_alg_priv_t *)ctx->priv; + + /* initialize number of fragments to zero */ + priv->fragments.count = 0; + /* is input fragments enabled? */ + priv->fragments.enabled = + (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); + + /*post processing level initialized to do nothing */ + } + + return res; +} + +static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { + vp8_remove_decoder_instances(&ctx->yv12_frame_buffers); + + vpx_free(ctx); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + vpx_codec_err_t res = VPX_CODEC_OK; + + assert(data != NULL); + + if (data + data_sz <= data) { + res = VPX_CODEC_INVALID_PARAM; + } else { + /* Parse uncompresssed part of key frame header. + * 3 bytes:- including version, frame type and an offset + * 3 bytes:- sync code (0x9d, 0x01, 0x2a) + * 4 bytes:- including image width and height in the lowest 14 bits + * of each 2-byte value. + */ + uint8_t clear_buffer[10]; + const uint8_t *clear = data; + if (decrypt_cb) { + int n = VPXMIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, n); + clear = clear_buffer; + } + si->is_kf = 0; + + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ + si->is_kf = 1; + + /* vet via sync code */ + if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a) { + return VPX_CODEC_UNSUP_BITSTREAM; + } + + si->w = (clear[6] | (clear[7] << 8)) & 0x3fff; + si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; + + /*printf("w=%d, h=%d\n", si->w, si->h);*/ + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } + } else { + res = VPX_CODEC_UNSUP_BITSTREAM; + } + } + + return res; +} + +static vpx_codec_err_t vp8_peek_si(const uint8_t *data, unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return vp8_peek_si_internal(data, data_sz, si, NULL, NULL); +} + +static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si) { + unsigned int sz; + + if (si->sz >= sizeof(vp8_stream_info_t)) { + sz = sizeof(vp8_stream_info_t); + } else { + sz = sizeof(vpx_codec_stream_info_t); + } + + memcpy(si, &ctx->si, sz); + si->sz = sz; + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t update_error_state( + vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { + vpx_codec_err_t res; + + if ((res = error->error_code)) { + ctx->base.err_detail = error->has_detail ? error->detail : NULL; + } + + return res; +} + +static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + img->fmt = VPX_IMG_FMT_I420; + img->w = yv12->y_stride; + img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; + img->d_w = img->r_w = yv12->y_width; + img->d_h = img->r_h = yv12->y_height; + img->x_chroma_shift = 1; + img->y_chroma_shift = 1; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bit_depth = 8; + img->bps = 12; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, + unsigned int data_sz, + volatile vpx_codec_err_t *res) { + *res = VPX_CODEC_OK; + + if (ctx->fragments.count == 0) { + /* New frame, reset fragment pointers and sizes */ + memset((void *)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs)); + memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes)); + } + if (ctx->fragments.enabled && !(data == NULL && data_sz == 0)) { + /* Store a pointer to this fragment and return. We haven't + * received the complete frame yet, so we will wait with decoding. + */ + if (ctx->fragments.count >= MAX_PARTITIONS) { + ctx->fragments.count = 0; + *res = VPX_CODEC_INVALID_PARAM; + return -1; + } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; + return 0; + } + + if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { + return 0; + } + + if (!ctx->fragments.enabled) { + ctx->fragments.ptrs[0] = data; + ctx->fragments.sizes[0] = data_sz; + ctx->fragments.count = 1; + } + + return 1; +} + +static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, unsigned int data_sz, + void *user_priv) { + volatile vpx_codec_err_t res; + volatile unsigned int resolution_change = 0; + volatile unsigned int w, h; + + if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { + return 0; + } + + /* Update the input fragment data */ + if (update_fragments(ctx, data, data_sz, &res) <= 0) return res; + + /* Determine the stream parameters. Note that we rely on peek_si to + * validate that we have a buffer that does not wrap around the top + * of the heap. + */ + w = ctx->si.w; + h = ctx->si.h; + + res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0], + &ctx->si, ctx->decrypt_cb, ctx->decrypt_state); + + if ((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf) { + /* the peek function returns an error for non keyframes, however for + * this case, it is not an error */ + res = VPX_CODEC_OK; + } + + if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; + +#if CONFIG_MULTITHREAD + if (!res && ctx->restart_threads) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; + if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; + vp8_decoder_remove_threads(pbi); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; + } + pbi->common.error.setjmp = 1; + pbi->max_threads = ctx->cfg.threads; + vp8_decoder_create_threads(pbi); + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, pc->mb_rows); + } + ctx->restart_threads = 0; + pbi->common.error.setjmp = 0; + } +#endif + /* Initialize the decoder instance on the first frame*/ + if (!res && !ctx->decoder_init) { + VP8D_CONFIG oxcf; + + oxcf.Width = ctx->si.w; + oxcf.Height = ctx->si.h; + oxcf.Version = 9; + oxcf.postprocess = 0; + oxcf.max_threads = ctx->cfg.threads; + oxcf.error_concealment = + (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT); + + /* If postprocessing was enabled by the application and a + * configuration has not been provided, default it. + */ + if (!ctx->postproc_cfg_set && + (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) { + ctx->postproc_cfg.post_proc_flag = + VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE; + ctx->postproc_cfg.deblocking_level = 4; + ctx->postproc_cfg.noise_level = 0; + } + + res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } + } + + /* Set these even if already initialized. The caller may have changed the + * decrypt config between frames. + */ + if (ctx->decoder_init) { + ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb; + ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state; + } + + if (!res) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; + if (resolution_change) { + MACROBLOCKD *const xd = &pbi->mb; +#if CONFIG_MULTITHREAD + int i; +#endif + pc->Width = ctx->si.w; + pc->Height = ctx->si.h; + { + if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + vpx_clear_system_state(); + /* same return value as used in vp8dx_receive_compressed_data */ + return -1; + } + + pbi->common.error.setjmp = 1; + + if (pc->Width <= 0) { + pc->Width = w; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame width"); + } + + if (pc->Height <= 0) { + pc->Height = h; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame height"); + } + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows); + } +#endif + + if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) { + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + } + + xd->pre = pc->yv12_fb[pc->lst_fb_idx]; + xd->dst = pc->yv12_fb[pc->new_fb_idx]; + +#if CONFIG_MULTITHREAD + for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { + pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx]; + vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd); + } +#endif + vp8_build_block_doffsets(&pbi->mb); + +/* allocate memory for last frame MODE_INFO array */ +#if CONFIG_ERROR_CONCEALMENT + + if (pbi->ec_enabled) { + /* old prev_mip was released by vp8_de_alloc_frame_buffers() + * called in vp8_alloc_frame_buffers() */ + pc->prev_mip = vpx_calloc((pc->mb_cols + 1) * (pc->mb_rows + 1), + sizeof(MODE_INFO)); + + if (!pc->prev_mip) { + vp8_de_alloc_frame_buffers(pc); + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate" + "last frame MODE_INFO array"); + } + + pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1; + + if (vp8_alloc_overlap_lists(pbi)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate overlap lists " + "for error concealment"); + } + +#endif + +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, 0); + } +#endif + } + + pbi->common.error.setjmp = 0; + + /* required to get past the first get_free_fb() call */ + pbi->common.fb_idx_ref_cnt[0] = 0; + } + + if (setjmp(pbi->common.error.jmp)) { + vpx_clear_system_state(); + /* We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + pc->yv12_fb[pc->lst_fb_idx].corrupted = 1; + + if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) { + pc->fb_idx_ref_cnt[pc->new_fb_idx]--; + } + pc->error.setjmp = 0; +#if CONFIG_MULTITHREAD + if (pbi->restart_threads) { + ctx->si.w = 0; + ctx->si.h = 0; + ctx->restart_threads = 1; + } +#endif + res = update_error_state(ctx, &pbi->common.error); + return res; + } + + pbi->common.error.setjmp = 1; + + /* update the pbi fragment data */ + pbi->fragments = ctx->fragments; +#if CONFIG_MULTITHREAD + pbi->restart_threads = 0; +#endif + ctx->user_priv = user_priv; + if (vp8dx_receive_compressed_data(pbi)) { + res = update_error_state(ctx, &pbi->common.error); + } + + /* get ready for the next series of fragments */ + ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; + } + + return res; +} + +static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { + vpx_image_t *img = NULL; + + /* iter acts as a flip flop, so an image is only returned on the first + * call to get_frame. + */ + if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) { + YV12_BUFFER_CONFIG sd; + vp8_ppflags_t flags; + vp8_zero(flags); + + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) { + flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag; + flags.deblocking_level = ctx->postproc_cfg.deblocking_level; + flags.noise_level = ctx->postproc_cfg.noise_level; + } + + if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) { + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + + img = &ctx->img; + *iter = img; + } + } + + return img; +} + +static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + const int y_w = img->d_w; + const int y_h = img->d_h; + const int uv_w = (img->d_w + 1) / 2; + const int uv_h = (img->d_h + 1) / 2; + vpx_codec_err_t res = VPX_CODEC_OK; + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = y_w; + yv12->y_crop_height = y_h; + yv12->y_width = y_w; + yv12->y_height = y_h; + yv12->uv_crop_width = uv_w; + yv12->uv_crop_height = uv_h; + yv12->uv_width = uv_w; + yv12->uv_height = uv_h; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + + yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2; + return res; +} + +static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + + return vp8dx_set_reference(ctx->yv12_frame_buffers.pbi[0], + frame->frame_type, &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + + return vp8dx_get_reference(ctx->yv12_frame_buffers.pbi[0], + frame->frame_type, &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME; + *arg = vp8dx_get_quantizer(pbi); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC + vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + + if (data) { + ctx->postproc_cfg_set = 1; + ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *update_info = va_arg(args, int *); + + if (update_info) { + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME; + + *update_info = pbi->common.refresh_alt_ref_frame * (int)VP8_ALTR_FRAME + + pbi->common.refresh_golden_frame * (int)VP8_GOLD_FRAME + + pbi->common.refresh_last_frame * (int)VP8_LAST_FRAME; + + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *ref_info = va_arg(args, int *); + + if (ref_info) { + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + if (pbi) { + VP8_COMMON *oci = &pbi->common; + *ref_info = + (vp8dx_references_buffer(oci, ALTREF_FRAME) ? VP8_ALTR_FRAME : 0) | + (vp8dx_references_buffer(oci, GOLDEN_FRAME) ? VP8_GOLD_FRAME : 0) | + (vp8dx_references_buffer(oci, LAST_FRAME) ? VP8_LAST_FRAME : 0); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_CORRUPT_FRAME; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + + if (corrupted && pbi) { + const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + + if (init) { + ctx->decrypt_cb = init->decrypt_cb; + ctx->decrypt_state = init->decrypt_state; + } else { + ctx->decrypt_cb = NULL; + ctx->decrypt_state = NULL; + } + return VPX_CODEC_OK; +} + +static vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { + { VP8_SET_REFERENCE, vp8_set_reference }, + { VP8_COPY_REFERENCE, vp8_get_reference }, + { VP8_SET_POSTPROC, vp8_set_postproc }, + { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates }, + { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted }, + { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame }, + { VPXD_GET_LAST_QUANTIZER, vp8_get_quantizer }, + { VPXD_SET_DECRYPTOR, vp8_set_decryptor }, + { -1, NULL }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp8_dx) = { + "WebM Project VP8 Decoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, + VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT | + VPX_CODEC_CAP_INPUT_FRAGMENTS, + /* vpx_codec_caps_t caps; */ + vp8_init, /* vpx_codec_init_fn_t init; */ + vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ + vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ + { + vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ + vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ + vp8_decode, /* vpx_codec_decode_fn_t decode; */ + vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, + }, + { + /* encoder functions */ + 0, NULL, /* vpx_codec_enc_cfg_map_t */ + NULL, /* vpx_codec_encode_fn_t */ + NULL, /* vpx_codec_get_cx_data_fn_t */ + NULL, /* vpx_codec_enc_config_set_fn_t */ + NULL, /* vpx_codec_get_global_headers_fn_t */ + NULL, /* vpx_codec_get_preview_frame_fn_t */ + NULL /* vpx_codec_enc_mr_get_mem_loc_fn_t */ + } +}; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc new file mode 100644 index 0000000000..261c316fd1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "vp8/common/common.h" +#include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/ratectrl.h" +#include "vpx_ports/system_state.h" + +namespace libvpx { +/* Quant MOD */ +static const int kQTrans[] = { + 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127, +}; + +static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, + 16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30 +}; + +static const unsigned char inter_minq[QINDEX_RANGE] = { + 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, + 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24, + 24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53, + 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 +}; + +static int rescale(int val, int num, int denom) { + int64_t llnum = num; + int64_t llden = denom; + int64_t llval = val; + + return (int)(llval * llnum / llden); +} + +std::unique_ptr VP8RateControlRTC::Create( + const VP8RateControlRtcConfig &cfg) { + std::unique_ptr rc_api(new (std::nothrow) + VP8RateControlRTC()); + if (!rc_api) return nullptr; + rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); + if (!rc_api->cpi_) return nullptr; + vp8_zero(*rc_api->cpi_); + + if (!rc_api->InitRateControl(cfg)) return nullptr; + + return rc_api; +} + +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + +bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + cpi_->pass = 0; + cm->show_frame = 1; + oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + cpi_->auto_gold = 1; + cpi_->key_frame_count = 1; + cpi_->rate_correction_factor = 1.0; + cpi_->key_frame_rate_correction_factor = 1.0; + cpi_->cyclic_refresh_mode_enabled = 0; + cpi_->auto_worst_q = 1; + cpi_->kf_overspend_bits = 0; + cpi_->kf_bitrate_adjustment = 0; + cpi_->gf_overspend_bits = 0; + cpi_->non_gf_bitrate_adjustment = 0; + if (!UpdateRateControl(rc_cfg)) return false; + cpi_->buffer_level = oxcf->starting_buffer_level; + cpi_->bits_off_target = oxcf->starting_buffer_level; + return true; +} + +bool VP8RateControlRTC::UpdateRateControl( + const VP8RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) { + return false; + } + + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + const unsigned int prev_number_of_layers = oxcf->number_of_layers; + vpx_clear_system_state(); + cm->Width = rc_cfg.width; + cm->Height = rc_cfg.height; + oxcf->Width = rc_cfg.width; + oxcf->Height = rc_cfg.height; + oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer]; + oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer]; + cpi_->worst_quality = oxcf->worst_allowed_q; + cpi_->best_quality = oxcf->best_allowed_q; + cpi_->output_framerate = rc_cfg.framerate; + oxcf->target_bandwidth = + static_cast(1000 * rc_cfg.target_bandwidth); + cpi_->ref_framerate = cpi_->output_framerate; + oxcf->fixed_q = -1; + oxcf->error_resilient_mode = 1; + oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz; + oxcf->starting_buffer_level = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size = rc_cfg.buf_sz; + oxcf->number_of_layers = rc_cfg.ts_number_layers; + cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; + oxcf->under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; + cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { + cpi_->prior_key_frame_distance[i] = + static_cast(cpi_->output_framerate); + } + oxcf->screen_content_mode = rc_cfg.is_screen; + if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { + memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, + sizeof(rc_cfg.layer_target_bitrate)); + memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator, + sizeof(rc_cfg.ts_rate_decimator)); + if (cm->current_video_frame == 0) { + double prev_layer_framerate = 0; + for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + } + } else if (oxcf->number_of_layers != prev_number_of_layers) { + // The number of temporal layers has changed, so reset/initialize the + // temporal layer context for the new layer configuration: this means + // calling vp8_reset_temporal_layer_change() below. + + // Start at the base of the pattern cycle, so set the layer id to 0 and + // reset the temporal pattern counter. + // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls + // the layer_id) so remove. + if (cpi_->temporal_layer_id > 0) { + cpi_->temporal_layer_id = 0; + } + cpi_->temporal_pattern_counter = 0; + + vp8_reset_temporal_layer_change(cpi_, oxcf, + static_cast(prev_number_of_layers)); + } + } + + cpi_->total_actual_bits = 0; + cpi_->total_target_vs_actual = 0; + + cm->mb_rows = cm->Height >> 4; + cm->mb_cols = cm->Width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + + // For temporal layers: starting/maximum/optimal_buffer_level is already set + // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change(). + if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) { + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. */ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level, + oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } + } + + if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { + cpi_->bits_off_target = oxcf->maximum_buffer_size; + cpi_->buffer_level = cpi_->bits_off_target; + } + + vp8_new_framerate(cpi_, cpi_->framerate); + vpx_clear_system_state(); + return true; +} + +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { + VP8_COMMON *const cm = &cpi_->common; + vpx_clear_system_state(); + if (cpi_->oxcf.number_of_layers > 1) { + cpi_->temporal_layer_id = frame_params.temporal_layer_id; + const int layer = frame_params.temporal_layer_id; + vp8_update_layer_contexts(cpi_); + /* Restore layer specific context & set frame rate */ + vp8_restore_layer_context(cpi_, layer); + vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); + } + cm->frame_type = static_cast(frame_params.frame_type); + cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { + cpi_->common.frame_flags |= FRAMEFLAGS_KEY; + } + + cpi_->per_frame_bandwidth = static_cast( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && + cpi_->buffered_mode) { + /* Max adjustment is 1/4 */ + int Adjustment = cpi_->active_worst_quality / 4; + if (Adjustment) { + int buff_lvl_step; + if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) { + buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level) / + Adjustment); + if (buff_lvl_step) { + Adjustment = + (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) / + buff_lvl_step); + } else { + Adjustment = 0; + } + } + cpi_->active_worst_quality -= Adjustment; + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + } + } + + if (cpi_->ni_frames > 150) { + int q = cpi_->active_worst_quality; + if (cm->frame_type == KEY_FRAME) { + cpi_->active_best_quality = kf_high_motion_minq[q]; + } else { + cpi_->active_best_quality = inter_minq[q]; + } + + if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) { + cpi_->active_best_quality = cpi_->best_quality; + + } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) { + int Fraction = + (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) / + (cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level)); + int min_qadjustment = + ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128; + + cpi_->active_best_quality -= min_qadjustment; + } + } + + /* Clip the active best and worst quality values to limits */ + if (cpi_->active_worst_quality > cpi_->worst_quality) { + cpi_->active_worst_quality = cpi_->worst_quality; + } + if (cpi_->active_best_quality < cpi_->best_quality) { + cpi_->active_best_quality = cpi_->best_quality; + } + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + + q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); + vp8_set_quantizer(cpi_, q_); + vpx_clear_system_state(); + return FrameDropDecision::kOk; +} + +int VP8RateControlRTC::GetQP() const { return q_; } + +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + +void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { + VP8_COMMON *const cm = &cpi_->common; + vpx_clear_system_state(); + cpi_->total_byte_count += encoded_frame_size; + cpi_->projected_frame_size = static_cast(encoded_frame_size << 3); + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + cpi_->layer_context[i].total_byte_count += encoded_frame_size; + } + } + + vp8_update_rate_correction_factors(cpi_, 2); + + cpi_->last_q[cm->frame_type] = cm->base_qindex; + + if (cm->frame_type == KEY_FRAME) { + vp8_adjust_key_frame_context(cpi_); + } + + /* Keep a record of ambient average Q. */ + if (cm->frame_type != KEY_FRAME) { + cpi_->avg_frame_qindex = + (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2; + } + /* Keep a record from which we can calculate the average Q excluding + * key frames. + */ + if (cm->frame_type != KEY_FRAME) { + cpi_->ni_frames++; + /* Damp value for first few frames */ + if (cpi_->ni_frames > 150) { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames); + } else { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = + ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2; + } + + /* If the average Q is higher than what was used in the last + * frame (after going through the recode loop to keep the frame + * size within range) then use the last frame value - 1. The -1 + * is designed to stop Q and hence the data rate, from + * progressively falling away during difficult sections, but at + * the same time reduce the number of itterations around the + * recode loop. + */ + if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1; + } + + cpi_->bits_off_target += + cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size; + if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) { + cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size; + } + + cpi_->total_actual_bits += cpi_->projected_frame_size; + cpi_->buffer_level = cpi_->bits_off_target; + + /* Propagate values to higher temporal layers */ + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi_->layer_context[i]; + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size); + + lc->bits_off_target += bits_off_for_this_layer; + + /* Clip buffer level to maximum buffer size for the layer */ + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + + lc->total_actual_bits += cpi_->projected_frame_size; + lc->total_target_vs_actual += bits_off_for_this_layer; + lc->buffer_level = lc->bits_off_target; + } + } + + cpi_->common.current_video_frame++; + cpi_->frames_since_key++; + + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + vpx_clear_system_state(); +} +} // namespace libvpx diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h new file mode 100644 index 0000000000..59fb607526 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_RATECTRL_RTC_H_ +#define VPX_VP8_RATECTRL_RTC_H_ + +#include +#include +#include + +#include "vpx/internal/vpx_ratectrl_rtc.h" + +struct VP8_COMP; + +namespace libvpx { +struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { + public: + VP8RateControlRtcConfig() { + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); + } +}; + +struct VP8FrameParamsQpRTC { + RcFrameType frame_type; + int temporal_layer_id; +}; + +class VP8RateControlRTC { + public: + static std::unique_ptr Create( + const VP8RateControlRtcConfig &cfg); + ~VP8RateControlRTC(); + + bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + // GetQP() needs to be called after ComputeQP() to get the latest QP + int GetQP() const; + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated from frame qp. + int GetLoopfilterLevel() const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // Feedback to rate control with the size of current encoded frame + void PostEncodeUpdate(uint64_t encoded_frame_size); + + private: + VP8RateControlRTC() {} + bool InitRateControl(const VP8RateControlRtcConfig &cfg); + struct VP8_COMP *cpi_; + int q_; +}; + +} // namespace libvpx + +#endif // VPX_VP8_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vp8/vp8cx.mk b/media/libvpx/libvpx/vp8/vp8cx.mk new file mode 100644 index 0000000000..b4b3fda9ea --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8cx.mk @@ -0,0 +1,132 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +VP8_CX_EXPORTS += exports_enc + +VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes) +VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no) +VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) +VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) + +VP8_CX_SRCS-yes += vp8cx.mk + +VP8_CX_SRCS-yes += vp8_cx_iface.c + +VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h +VP8_CX_SRCS-yes += encoder/bitstream.c +VP8_CX_SRCS-yes += encoder/boolhuff.c +VP8_CX_SRCS-yes += encoder/copy_c.c +VP8_CX_SRCS-yes += encoder/dct.c +VP8_CX_SRCS-yes += encoder/encodeframe.c +VP8_CX_SRCS-yes += encoder/encodeframe.h +VP8_CX_SRCS-yes += encoder/encodeintra.c +VP8_CX_SRCS-yes += encoder/encodemb.c +VP8_CX_SRCS-yes += encoder/encodemv.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h +VP8_CX_SRCS-yes += encoder/firstpass.c +VP8_CX_SRCS-yes += encoder/block.h +VP8_CX_SRCS-yes += encoder/boolhuff.h +VP8_CX_SRCS-yes += encoder/bitstream.h +VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h +VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c +VP8_CX_SRCS-yes += encoder/encodeintra.h +VP8_CX_SRCS-yes += encoder/encodemb.h +VP8_CX_SRCS-yes += encoder/encodemv.h +VP8_CX_SRCS-yes += encoder/firstpass.h +VP8_CX_SRCS-yes += encoder/lookahead.c +VP8_CX_SRCS-yes += encoder/lookahead.h +VP8_CX_SRCS-yes += encoder/mcomp.h +VP8_CX_SRCS-yes += encoder/modecosts.h +VP8_CX_SRCS-yes += encoder/onyx_int.h +VP8_CX_SRCS-yes += encoder/pickinter.h +VP8_CX_SRCS-yes += encoder/quantize.h +VP8_CX_SRCS-yes += encoder/ratectrl.h +VP8_CX_SRCS-yes += encoder/rdopt.h +VP8_CX_SRCS-yes += encoder/tokenize.h +VP8_CX_SRCS-yes += encoder/treewriter.h +VP8_CX_SRCS-yes += encoder/mcomp.c +VP8_CX_SRCS-yes += encoder/modecosts.c +VP8_CX_SRCS-yes += encoder/onyx_if.c +VP8_CX_SRCS-yes += encoder/pickinter.c +VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h +VP8_CX_SRCS-yes += encoder/vp8_quantize.c +VP8_CX_SRCS-yes += encoder/ratectrl.c +VP8_CX_SRCS-yes += encoder/rdopt.c +VP8_CX_SRCS-yes += encoder/segmentation.c +VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h +VP8_CX_SRCS-yes += encoder/tokenize.c +VP8_CX_SRCS-yes += encoder/dct_value_cost.h +VP8_CX_SRCS-yes += encoder/dct_value_tokens.h +VP8_CX_SRCS-yes += encoder/treewriter.c +VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h +VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c +VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h + +ifeq ($(CONFIG_REALTIME_ONLY),yes) +VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h +endif + +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse3.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c + +ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c +endif + +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/block_error_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c + +ifeq ($(CONFIG_REALTIME_ONLY),yes) +VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm +endif + +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c + +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c + +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + +ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c +endif + +ifeq ($(CONFIG_REALTIME_ONLY),yes) +VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +endif + +# common (loongarch LSX intrinsics) +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c + +VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vp8/vp8dx.mk b/media/libvpx/libvpx/vp8/vp8dx.mk new file mode 100644 index 0000000000..892ed70f52 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8dx.mk @@ -0,0 +1,39 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +VP8_DX_EXPORTS += exports_dec + +VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes) +VP8_DX_SRCS-no += $(VP8_COMMON_SRCS-no) +VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) +VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) + +VP8_DX_SRCS-yes += vp8dx.mk + +VP8_DX_SRCS-yes += vp8_dx_iface.c + +VP8_DX_SRCS-yes += decoder/dboolhuff.c +VP8_DX_SRCS-yes += decoder/decodemv.c +VP8_DX_SRCS-yes += decoder/decodeframe.c +VP8_DX_SRCS-yes += decoder/detokenize.c +VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h +VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h +VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.c +VP8_DX_SRCS-yes += decoder/dboolhuff.h +VP8_DX_SRCS-yes += decoder/decodemv.h +VP8_DX_SRCS-yes += decoder/decoderthreading.h +VP8_DX_SRCS-yes += decoder/detokenize.h +VP8_DX_SRCS-yes += decoder/onyxd_int.h +VP8_DX_SRCS-yes += decoder/treereader.h +VP8_DX_SRCS-yes += decoder/onyxd_if.c +VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c + +VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c new file mode 100644 index 0000000000..b43d7fa4f9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +// Use macros to make sure argument lane is passed in as a constant integer. + +#define vmull_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlal_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlsl_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +static INLINE int32x4x2_t +highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS)); + return out; +} + +#define highbd_iadst_half_butterfly(in, c, lane, out) \ + do { \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ + } while (0) + +#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ + do { \ + vmull_lane_s32_dual(in0, c, lane0, s0); \ + vmull_lane_s32_dual(in0, c, lane1, s1); \ + vmlal_lane_s32_dual(in1, c, lane1, s0); \ + vmlsl_lane_s32_dual(in1, c, lane0, s1); \ + } while (0) + +static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vaddq_s32(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vaddq_s64(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vsubq_s32(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vsubq_s64(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0, + const int32x2x2_t in1) { + int32x4x2_t out; + out.val[0] = vcombine_s32(in0.val[0], in1.val[0]); + out.val[1] = vcombine_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { + int32x4x2_t out; + out.val[0] = vnegq_s32(in.val[0]); + out.val[1] = vnegq_s32(in.val[1]); + return out; +} + +static void highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t c_1_31_5_27 = + create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int32x4_t c_9_23_13_19 = + create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int32x4_t c_17_15_21_11 = + create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int32x4_t c_25_7_29_3 = + create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int32x4_t c_4_28_20_12 = + create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int32x4_t c_16_n16_8_24 = + create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int32x4x2_t in[16], out[16]; + int32x4x2_t x[16], t[12]; + int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1); + highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3); + highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5); + highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7); + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14, + s15); + + x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8); + x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9); + x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10); + x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13); + x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14); + x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15); + x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8); + x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9); + x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10); + x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11); + x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13, + s12); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[4]); + x[1] = vaddq_s32_dual(t[1], t[5]); + x[2] = vaddq_s32_dual(t[2], t[6]); + x[3] = vaddq_s32_dual(t[3], t[7]); + x[4] = vsubq_s32_dual(t[0], t[4]); + x[5] = vsubq_s32_dual(t[1], t[5]); + x[6] = vsubq_s32_dual(t[2], t[6]); + x[7] = vsubq_s32_dual(t[3], t[7]); + x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12); + x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13); + x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14); + x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15); + x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4, + s5); + highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7, + s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[2]); + x[1] = vaddq_s32_dual(t[1], t[3]); + x[2] = vsubq_s32_dual(t[0], t[2]); + x[3] = vsubq_s32_dual(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7); + x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6); + x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s32_dual(t[8], t[10]); + x[9] = vaddq_s32_dual(t[9], t[11]); + x[10] = vsubq_s32_dual(t[8], t[10]); + x[11] = vsubq_s32_dual(t[9], t[11]); + x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14); + x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15); + x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + { + const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]); + const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]); + const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]); + const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]); + const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]); + } + + out[0] = x[0]; + out[1] = vnegq_s32_dual(x[8]); + out[2] = x[12]; + out[3] = vnegq_s32_dual(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s32_dual(x[13]); + out[14] = x[9]; + out[15] = vnegq_s32_dual(x[1]); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd); + +typedef struct { + highbd_iht_1d cols, rows; // vertical and horizontal +} highbd_iht_2d; + +void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + if (bd == 8) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 1); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 1); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1); // right 8 columns + } else { + static const highbd_iht_2d IHT_16[] = { + { vpx_highbd_idct16x16_256_add_half1d, + vpx_highbd_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { highbd_iadst16_neon, + vpx_highbd_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_highbd_idct16x16_256_add_half1d, + highbd_iadst16_neon }, // DCT_ADST = 2 + { highbd_iadst16_neon, highbd_iadst16_neon } // ADST_ADST = 3 + }; + const highbd_iht_2d ht = IHT_16[tx_type]; + int32_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, bd); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, bd); // left 8 columns + ht.cols(row_output + 8 * 16, NULL, dest + 8, stride, + bd); // right 8 columns + } +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 0000000000..52c4f1937d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void highbd_iadst4(int32x4_t *const io) { + const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; + const int32x4_t sinpi = vld1q_s32(sinpis); + int64x2x2_t s[7], t[4]; + int32x4_t s7; + + s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0); + s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0); + s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1); + s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1); + s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0); + s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1); + s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1); + s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0); + s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0); + s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1); + s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1); + s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1); + s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1); + s7 = vsubq_s32(io[0], io[2]); + s7 = vaddq_s32(s7, io[3]); + + s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); + s[3] = s[2]; + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c new file mode 100644 index 0000000000..2232c6841c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x, + const int32x2_t c) { + const int32x4_t sum = vaddq_s32(x[0], x[1]); + const int32x4_t sub = vsubq_s32(x[0], x[1]); + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0); + const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS); + const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS); + const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS); + const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS); + + x[0] = vcombine_s32(out0_lo, out0_hi); + x[1] = vcombine_s32(out1_lo, out1_hi); +} + +static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0); +} + +static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1); +} + +static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]); + const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]); + const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1, + int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, + int32x4_t *const io6, int32x4_t *const io7) { + const int32x4_t c0 = + create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int32x4_t c1 = + create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int32x4_t c2 = + create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int32x4_t x[8], t[4]; + int64x2_t s[8][2]; + + x[0] = *io7; + x[1] = *io0; + x[2] = *io5; + x[3] = *io2; + x[4] = *io3; + x[5] = *io4; + x[6] = *io1; + x[7] = *io6; + + // stage 1 + highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0], + s[1]); + highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2], + s[3]); + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4], + s[5]); + highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6], + s[7]); + + x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]); + x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]); + x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]); + x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]); + x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]); + x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4], + s[5]); + highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7], + s[6]); + + x[0] = vaddq_s32(t[0], t[2]); + x[1] = vaddq_s32(t[1], t[3]); + x[2] = vsubq_s32(t[0], t[2]); + x[3] = vsubq_s32(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]); + x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]); + + // stage 3 + highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2)); + highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2)); + + *io0 = x[0]; + *io1 = vnegq_s32(x[4]); + *io2 = x[6]; + *io3 = vnegq_s32(x[2]); + *io4 = x[3]; + *io5 = vnegq_s32(x[7]); + *io6 = x[5]; + *io7 = vnegq_s32(x[1]); +} + +void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + switch (tx_type) { + case DCT_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + case ADST_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + + case DCT_ADST: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + default: { + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + } + + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + + case DCT_ADST: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + } + + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c new file mode 100644 index 0000000000..db72ff1161 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + int16x8_t in[16], out[16]; + const int16x4_t c_1_31_5_27 = + create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int16x4_t c_9_23_13_19 = + create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int16x4_t c_17_15_21_11 = + create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int16x4_t c_25_7_29_3 = + create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int16x4_t c_4_28_20_12 = + create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int16x4_t c_16_n16_8_24 = + create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int16x8_t x[16], t[12]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7); + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11); + iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13); + iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15); + + x[0] = add_dct_const_round_shift_low_8(s0, s8); + x[1] = add_dct_const_round_shift_low_8(s1, s9); + x[2] = add_dct_const_round_shift_low_8(s2, s10); + x[3] = add_dct_const_round_shift_low_8(s3, s11); + x[4] = add_dct_const_round_shift_low_8(s4, s12); + x[5] = add_dct_const_round_shift_low_8(s5, s13); + x[6] = add_dct_const_round_shift_low_8(s6, s14); + x[7] = add_dct_const_round_shift_low_8(s7, s15); + x[8] = sub_dct_const_round_shift_low_8(s0, s8); + x[9] = sub_dct_const_round_shift_low_8(s1, s9); + x[10] = sub_dct_const_round_shift_low_8(s2, s10); + x[11] = sub_dct_const_round_shift_low_8(s3, s11); + x[12] = sub_dct_const_round_shift_low_8(s4, s12); + x[13] = sub_dct_const_round_shift_low_8(s5, s13); + x[14] = sub_dct_const_round_shift_low_8(s6, s14); + x[15] = sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11); + iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14); + + x[0] = vaddq_s16(t[0], t[4]); + x[1] = vaddq_s16(t[1], t[5]); + x[2] = vaddq_s16(t[2], t[6]); + x[3] = vaddq_s16(t[3], t[7]); + x[4] = vsubq_s16(t[0], t[4]); + x[5] = vsubq_s16(t[1], t[5]); + x[6] = vsubq_s16(t[2], t[6]); + x[7] = vsubq_s16(t[3], t[7]); + x[8] = add_dct_const_round_shift_low_8(s8, s12); + x[9] = add_dct_const_round_shift_low_8(s9, s13); + x[10] = add_dct_const_round_shift_low_8(s10, s14); + x[11] = add_dct_const_round_shift_low_8(s11, s15); + x[12] = sub_dct_const_round_shift_low_8(s8, s12); + x[13] = sub_dct_const_round_shift_low_8(s9, s13); + x[14] = sub_dct_const_round_shift_low_8(s10, s14); + x[15] = sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s16(t[8], t[10]); + x[9] = vaddq_s16(t[9], t[11]); + x[10] = vsubq_s16(t[8], t[10]); + x[11] = vsubq_s16(t[9], t[11]); + x[12] = add_dct_const_round_shift_low_8(s12, s14); + x[13] = add_dct_const_round_shift_low_8(s13, s15); + x[14] = sub_dct_const_round_shift_low_8(s12, s14); + x[15] = sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24); + iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24); + + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 0); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 0); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0); // right 8 columns +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c new file mode 100644 index 0000000000..4f0a90f215 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; + + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c new file mode 100644 index 0000000000..46ee632e01 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); + + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + + switch (tx_type) { + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + break; + + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); + break; + + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + break; + + default: + assert(tx_type == ADST_ADST); + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); + break; + } + + idct8x8_add8x8_neon(a, dest, stride); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 0000000000..c64822e27c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} + +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} + +static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); +} + +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} + +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = vnegq_s16(x[1]); +} + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef void (*iht_1d)(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef struct { + iht_1d cols, rows; // vertical and horizontal +} iht_2d; + +#endif // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 0000000000..e68d01e9fd --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int pitch, + int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_dspr2(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + iadst16_dspr2(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) + for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); + break; + } + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + iadst16_dspr2(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + iadst16_dspr2(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + break; + } + default: printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); break; + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 0000000000..f6b29265e6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + int16_t temp_in[4 * 4], temp_out[4]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + vpx_idct4_rows_dspr2(input, outptr); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + vpx_idct4_rows_dspr2(input, outptr); + + outptr = out; + + for (i = 0; i < 4; ++i) { + iadst4_dspr2(outptr, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); + + outptr += 4; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 4; ++i) { + iadst4_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + temp_in[i * 4 + j] = out[j * 4 + i]; + } + } + vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 4; ++i) { + iadst4_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + iadst4_dspr2(temp_in, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); + } + break; + default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break; + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c new file mode 100644 index 0000000000..b945e307e6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + int16_t temp_in[8 * 8], temp_out[8]; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct8_rows_dspr2(input, outptr, 8); + + for (i = 0; i < 8; ++i) { + iadst8_dspr2(&out[i * 8], temp_out); + + for (j = 0; j < 8; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 8; ++i) { + iadst8_dspr2(input, outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) { + temp_in[i * 8 + j] = out[j * 8 + i]; + } + } + idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 8; ++i) { + iadst8_dspr2(input, outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + + iadst8_dspr2(temp_in, temp_out); + + for (j = 0; j < 8; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); + } + break; + default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break; + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c new file mode 100644 index 0000000000..c031322806 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *out_ptr = &out[0]; + + switch (tx_type) { + case DCT_DCT: + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } + break; + case ADST_DCT: + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } + break; + case DCT_ADST: + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } + break; + case ADST_ADST: + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), + (dst + (i << 3)), dst_stride); + } + break; + default: assert(0); break; + } +} diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c new file mode 100644 index 0000000000..aaccd5ca7b --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + switch (tx_type) { + case DCT_DCT: + /* DCT in horizontal */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* DCT in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + /* DCT in horizontal */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* ADST in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + /* ADST in horizontal */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* DCT in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + /* ADST in horizontal */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* ADST in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: assert(0); break; + } + + /* final rounding (add 2^3, divide by 2^4) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + /* add block and store 4x4 */ + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c new file mode 100644 index 0000000000..76d15ff8c0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + switch (tx_type) { + case DCT_DCT: + /* DCT in horizontal */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* DCT in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + break; + case ADST_DCT: + /* DCT in horizontal */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* ADST in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case DCT_ADST: + /* ADST in horizontal */ + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + /* DCT in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + break; + case ADST_ADST: + /* ADST in horizontal */ + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + /* ADST in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + default: assert(0); break; + } + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c new file mode 100644 index 0000000000..2c3840958e --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_dsp/mips/macros_msa.h" + +static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) { + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src0); + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst0, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + } +} + +static void filter_by_weight16x16_msa(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) { + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + } +} + +void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int src_weight) { + filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight); +} + +void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight); +} diff --git a/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 0000000000..e861596ad4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c new file mode 100644 index 0000000000..9e73e40ea0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_onyxc_int.h" + +void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width, + int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + *mi_cols = aligned_width >> MI_SIZE_LOG2; + *mi_rows = aligned_height >> MI_SIZE_LOG2; + *mi_stride = calc_mi_size(*mi_cols); +} + +void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows, + int mi_cols) { + *mb_cols = (mi_cols + 1) >> 1; + *mb_rows = (mi_rows + 1) >> 1; + *mb_num = (*mb_rows) * (*mb_cols); +} + +void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { + vp9_set_mi_size(&cm->mi_rows, &cm->mi_cols, &cm->mi_stride, width, height); + vp9_set_mb_size(&cm->mb_rows, &cm->mb_cols, &cm->MBs, cm->mi_rows, + cm->mi_cols); +} + +static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { + int i; + + for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { + cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1); + if (cm->seg_map_array[i] == NULL) return 1; + } + cm->seg_map_alloc_size = seg_map_size; + + // Init the index. + cm->seg_map_idx = 0; + cm->prev_seg_map_idx = 1; + + cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + + return 0; +} + +static void free_seg_map(VP9_COMMON *cm) { + int i; + + for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { + vpx_free(cm->seg_map_array[i]); + cm->seg_map_array[i] = NULL; + } + cm->seg_map_alloc_size = 0; + + cm->current_frame_seg_map = NULL; + cm->last_frame_seg_map = NULL; +} + +void vp9_free_ref_frame_buffers(BufferPool *pool) { + int i; + + if (!pool) return; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (!pool->frame_bufs[i].released && + pool->frame_bufs[i].raw_frame_buffer.data != NULL) { + pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; + } + vpx_free(pool->frame_bufs[i].mvs); + pool->frame_bufs[i].mvs = NULL; + vpx_free_frame_buffer(&pool->frame_bufs[i].buf); + } +} + +void vp9_free_postproc_buffers(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vpx_free_frame_buffer(&cm->post_proc_buffer); + vpx_free_frame_buffer(&cm->post_proc_buffer_int); + vpx_free(cm->postproc_state.limits); + cm->postproc_state.limits = NULL; + vpx_free(cm->postproc_state.generated_noise); + cm->postproc_state.generated_noise = NULL; +#else + (void)cm; +#endif +} + +void vp9_free_context_buffers(VP9_COMMON *cm) { + if (cm->free_mi) cm->free_mi(cm); + free_seg_map(cm); + vpx_free(cm->above_context); + cm->above_context = NULL; + vpx_free(cm->above_seg_context); + cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; + vpx_free(cm->lf.lfm); + cm->lf.lfm = NULL; +} + +int vp9_alloc_loop_filter(VP9_COMMON *cm) { + vpx_free(cm->lf.lfm); + // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region. The + // stride and rows are rounded up / truncated to a multiple of 8. + cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3; + cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc( + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride, + sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + return 0; +} + +int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { + int new_mi_size; + + vp9_set_mb_mi(cm, width, height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + cm->free_mi(cm); + if (cm->alloc_mi(cm, new_mi_size)) goto fail; + } + if (cm->above_context_alloc_cols < cm->mi_cols) { + vpx_free(cm->above_context); + cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( + 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; + + vpx_free(cm->above_seg_context); + cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( + mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; + cm->above_context_alloc_cols = cm->mi_cols; + } + + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + + if (vp9_alloc_loop_filter(cm)) goto fail; + + return 0; + +fail: + // clear the mi_* values to force a realloc on resync + vp9_set_mb_mi(cm, 0, 0); + vp9_free_context_buffers(cm); + return 1; +} + +void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif + vp9_free_context_buffers(cm); + + vpx_free(cm->fc); + cm->fc = NULL; + vpx_free(cm->frame_contexts); + cm->frame_contexts = NULL; +} + +void vp9_init_context_buffers(VP9_COMMON *cm) { + cm->setup_mi(cm); + if (cm->last_frame_seg_map) + memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); +} + +void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) { + // Swap indices. + const int tmp = cm->seg_map_idx; + cm->seg_map_idx = cm->prev_seg_map_idx; + cm->prev_seg_map_idx = tmp; + + cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h new file mode 100644 index 0000000000..90cbb093d7 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ + +#define INVALID_IDX (-1) // Invalid buffer index. + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct BufferPool; + +void vp9_remove_common(struct VP9Common *cm); + +int vp9_alloc_loop_filter(struct VP9Common *cm); +int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height); +void vp9_init_context_buffers(struct VP9Common *cm); +void vp9_free_context_buffers(struct VP9Common *cm); + +void vp9_free_ref_frame_buffers(struct BufferPool *pool); +void vp9_free_postproc_buffers(struct VP9Common *cm); + +int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_state_buffers(struct VP9Common *cm); + +void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width, + int height); +void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows, + int mi_cols); + +void vp9_set_mb_mi(struct VP9Common *cm, int width, int height); + +void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.c b/media/libvpx/libvpx/vp9/common/vp9_blockd.c new file mode 100644 index 0000000000..4327599510 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_blockd.h" + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { + if (b == 0 || b == 2) { + if (!left_mi || is_inter_block(left_mi)) return DC_PRED; + + return get_y_mode(left_mi, b + 1); + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; + } +} + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(above_mi)) return DC_PRED; + + return get_y_mode(above_mi, b + 2); + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; + } +} + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi[0]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); + int i = 0, r, c; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + const int extra_step = ((num_4x4_w - max_blocks_wide) >> tx_size) * step; + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) { + visit(plane, i, r, c, plane_bsize, tx_size, arg); + i += step; + } + i += extra_step; + } +} + +void vp9_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) a[i] = 0; + } else { + memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) l[i] = 0; + } else { + memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.h b/media/libvpx/libvpx/vp9/common/vp9_blockd.h new file mode 100644 index 0000000000..aa13d8a0d5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_ +#define VPX_VP9_COMMON_VP9_BLOCKD_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_common_data.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_scale.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_MB_PLANE 3 + +typedef enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + FRAME_TYPES, +} FRAME_TYPE; + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= NEARESTMV && mode <= NEWMV; +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + PREDICTION_MODE as_mode; + int_mv as_mv[2]; // first, second inter predictor motion vectors +} b_mode_info; + +// Note that the rate-distortion optimization loop, bit-stream writer, and +// decoder implementation modules critically rely on the defined entry values +// specified herein. They should be refactored concurrently. + +#define NO_REF_FRAME (-1) +#define INTRA_FRAME 0 +#define LAST_FRAME 1 +#define GOLDEN_FRAME 2 +#define ALTREF_FRAME 3 +#define MAX_REF_FRAMES 4 +#define MAX_INTER_REF_FRAMES 3 + +typedef int8_t MV_REFERENCE_FRAME; + +static INLINE int mv_ref_frame_to_inter_ref_idx( + MV_REFERENCE_FRAME mv_ref_frame) { + assert(mv_ref_frame >= LAST_FRAME && mv_ref_frame < MAX_REF_FRAMES); + return mv_ref_frame - 1; +} + +// This structure now relates to 8x8 block regions. +typedef struct MODE_INFO { + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + TX_SIZE tx_size; + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled + + // Only for INTRA blocks + PREDICTION_MODE uv_mode; + + // Only for INTER blocks + INTERP_FILTER interp_filter; + + // if ref_frame[idx] is equal to ALTREF_FRAME then + // MACROBLOCKD::block_ref[idx] is an altref + MV_REFERENCE_FRAME ref_frame[2]; + + // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead. + int_mv mv[2]; + + b_mode_info bmi[4]; +} MODE_INFO; + +static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { + return mi->sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode : mi->mode; +} + +static INLINE int is_inter_block(const MODE_INFO *mi) { + return mi->ref_frame[0] > INTRA_FRAME; +} + +static INLINE int has_second_ref(const MODE_INFO *mi) { + return mi->ref_frame[1] > INTRA_FRAME; +} + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); + +enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; + +struct buf_2d { + uint8_t *buf; + int stride; +}; + +struct macroblockd_plane { + tran_low_t *dqcoeff; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_context; + ENTROPY_CONTEXT *left_context; + int16_t seg_dequant[MAX_SEGMENTS][2]; + + // number of 4x4s in current block + uint16_t n4_w, n4_h; + // log2 of n4_w, n4_h + uint8_t n4_wl, n4_hl; + + // encoder + const int16_t *dequant; + + int *eob; +}; + +#define BLOCK_OFFSET(x, i) ((x) + (i)*16) + +typedef struct RefBuffer { + // TODO(dkovalev): idx is not really required and should be removed, now it + // is used in vp9_onyxd_if.c + int idx; + YV12_BUFFER_CONFIG *buf; + struct scale_factors sf; +} RefBuffer; + +typedef struct macroblockd { + struct macroblockd_plane plane[MAX_MB_PLANE]; + uint8_t bmode_blocks_wl; + uint8_t bmode_blocks_hl; + + FRAME_COUNTS *counts; + TileInfo tile; + + int mi_stride; + + // Grid of 8x8 cells is placed over the block. + // If some of them belong to the same mbtree-block + // they will just have same mi[i][j] value + MODE_INFO **mi; + MODE_INFO *left_mi; + MODE_INFO *above_mi; + + unsigned int max_blocks_wide; + unsigned int max_blocks_high; + + const vpx_prob (*partition_probs)[PARTITION_TYPES - 1]; + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + FRAME_CONTEXT *fc; + + /* pointers to reference frames */ + const RefBuffer *block_refs[2]; + + /* pointer to current frame */ + const YV12_BUFFER_CONFIG *cur_buf; + + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; + +#if CONFIG_VP9_HIGHBITDEPTH + /* Bit depth: 8, 10, 12 */ + int bd; +#endif + + int lossless; + int corrupted; + + struct vpx_internal_error_info *error_info; + + PARTITION_TYPE *partition; +} MACROBLOCKD; + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (PLANE_TYPE)(plane > 0); +} + +static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + return subsize_lookup[partition][bsize]; +} + +extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; + +static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mi[0]; + + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi)) + return DCT_DCT; + + return intra_mode_to_tx_type_lookup[mi->mode]; +} + +static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, int ib) { + const MODE_INFO *const mi = xd->mi[0]; + + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi)) + return DCT_DCT; + + return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); + +static INLINE TX_SIZE get_uv_tx_size(const MODE_INFO *mi, + const struct macroblockd_plane *pd) { + assert(mi->sb_type < BLOCK_8X8 || + ss_size_lookup[mi->sb_type][pd->subsampling_x][pd->subsampling_y] != + BLOCK_INVALID); + return uv_txsize_lookup[mi->sb_type][mi->tx_size][pd->subsampling_x] + [pd->subsampling_y]; +} + +static INLINE BLOCK_SIZE +get_plane_block_size(BLOCK_SIZE bsize, const struct macroblockd_plane *pd) { + return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; +} + +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + memset(pd->above_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); + memset(pd->left_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); + } +} + +static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + +typedef void (*foreach_transformed_block_visitor)(int plane, int block, int row, + int col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg); + +void vp9_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg); + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff); + +#if CONFIG_MISMATCH_DEBUG +#define TX_UNIT_SIZE_LOG2 2 +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col << MI_SIZE_LOG2) >> subsampling_x) + + (tx_blk_col << TX_UNIT_SIZE_LOG2); + *pixel_r = ((mi_row << MI_SIZE_LOG2) >> subsampling_y) + + (tx_blk_row << TX_UNIT_SIZE_LOG2); +} + +static INLINE int get_block_width(BLOCK_SIZE bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + return 4 * num_4x4_w; +} + +static INLINE int get_block_height(BLOCK_SIZE bsize) { + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + return 4 * num_4x4_h; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_BLOCKD_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_common.h b/media/libvpx/libvpx/vp9/common/vp9_common.h new file mode 100644 index 0000000000..d63bad93d1 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_common.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_COMMON_H_ +#define VPX_VP9_COMMON_VP9_COMMON_H_ + +/* Interface header for common constant data structures and lookup tables */ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/bitops.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Only need this for fixed-size arrays, for structs just assign. +#define vp9_copy(dest, src) \ + do { \ + assert(sizeof(dest) == sizeof(src)); \ + memcpy(dest, src, sizeof(src)); \ + } while (0) + +// Use this for variably-sized arrays. +#define vp9_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, (n) * sizeof(*(src))); \ + } + +#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define vp9_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest))) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; +} + +#define VP9_SYNC_CODE_0 0x49 +#define VP9_SYNC_CODE_1 0x83 +#define VP9_SYNC_CODE_2 0x42 + +#define VP9_FRAME_MARKER 0x2 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.c b/media/libvpx/libvpx/vp9/common/vp9_common_data.c new file mode 100644 index 0000000000..809d7317ce --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common_data.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Log 2 conversion lookup tables for block width and height +const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, 2, + 2, 3, 3, 3, 4, 4 }; +const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, 2, + 3, 2, 3, 4, 3, 4 }; +const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 2, 2, 2, 4, 4, + 4, 8, 8, 8, 16, 16 }; +const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = { 1, 2, 1, 2, 4, 2, 4, + 8, 4, 8, 16, 8, 16 }; +// Log 2 conversion lookup tables for modeinfo width and height +const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 0, 0, 0, 1, 1, + 1, 2, 2, 2, 3, 3 }; +const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2, + 2, 4, 4, 4, 8, 8 }; +const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2, + 4, 2, 4, 8, 4, 8 }; + +// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize))) +const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2, + 2, 2, 3, 3, 3, 3 }; + +const uint8_t num_pels_log2_lookup[BLOCK_SIZES] = { 4, 5, 5, 6, 7, 7, 8, + 9, 9, 10, 11, 11, 12 }; + +const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { + { // 4X4 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID }, + { // 8X8 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID }, + { // 16X16 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID }, + { // 32X32 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, + PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID }, + { // 64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, + PARTITION_NONE } +}; + +const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, + BLOCK_64X32, BLOCK_64X64 }, + { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32 }, + { // PARTITION_VERT + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64 }, + { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32 } +}; + +const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, TX_8X8, TX_8X8, TX_8X8, TX_16X16, + TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 +}; + +const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 +}; + +const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_8X8, // ALLOW_8X8 + TX_16X16, // ALLOW_16X16 + TX_32X32, // ALLOW_32X32 + TX_32X32, // TX_MODE_SELECT +}; + +const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_INVALID } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_INVALID } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, +}; + +const TX_SIZE uv_txsize_lookup[BLOCK_SIZES][TX_SIZES][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { + // BLOCK_4X4 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + }, + { + // BLOCK_4X8 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + }, + { + // BLOCK_8X4 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + }, + { + // BLOCK_8X8 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, + }, + { + // BLOCK_8X16 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, + }, + { + // BLOCK_16X8 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, + { { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } }, + { { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } }, + }, + { + // BLOCK_16X16 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } }, + }, + { + // BLOCK_16X32 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, + }, + { + // BLOCK_32X16 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, + { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, + }, + { + // BLOCK_32X32 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, + { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } }, + }, + { + // BLOCK_32X64 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, + { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } }, + }, + { + // BLOCK_64X32 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, + { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } }, + }, + { + // BLOCK_64X64 + { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, + { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } }, + { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } }, + { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } }, + }, +}; + +// Generates 4 bit field in which each bit set to 1 represents +// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 +// and 8x8. 1000 means we just split the 64x64 to 32x32 +const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES] = { + { 15, 15 }, // 4X4 - {0b1111, 0b1111} + { 15, 14 }, // 4X8 - {0b1111, 0b1110} + { 14, 15 }, // 8X4 - {0b1110, 0b1111} + { 14, 14 }, // 8X8 - {0b1110, 0b1110} + { 14, 12 }, // 8X16 - {0b1110, 0b1100} + { 12, 14 }, // 16X8 - {0b1100, 0b1110} + { 12, 12 }, // 16X16 - {0b1100, 0b1100} + { 12, 8 }, // 16X32 - {0b1100, 0b1000} + { 8, 12 }, // 32X16 - {0b1000, 0b1100} + { 8, 8 }, // 32X32 - {0b1000, 0b1000} + { 8, 0 }, // 32X64 - {0b1000, 0b0000} + { 0, 8 }, // 64X32 - {0b0000, 0b1000} + { 0, 0 }, // 64X64 - {0b0000, 0b0000} +}; + +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +const uint8_t need_top_left[INTRA_MODES] = { + 0, // DC_PRED + 0, // V_PRED + 0, // H_PRED + 0, // D45_PRED + 1, // D135_PRED + 1, // D117_PRED + 1, // D153_PRED + 0, // D207_PRED + 0, // D63_PRED + 1, // TM_PRED +}; +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.h b/media/libvpx/libvpx/vp9/common/vp9_common_data.h new file mode 100644 index 0000000000..a533c5f058 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_ +#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_ + +#include "vp9/common/vp9_enums.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint8_t b_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t b_height_log2_lookup[BLOCK_SIZES]; +extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t size_group_lookup[BLOCK_SIZES]; +extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; +extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES]; +extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; +extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; +extern const TX_SIZE uv_txsize_lookup[BLOCK_SIZES][TX_SIZES][2][2]; +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +extern const uint8_t need_top_left[INTRA_MODES]; +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c new file mode 100644 index 0000000000..28cd4a1924 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" + +static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, + cm->show_frame, cm->base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. + */ +static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, + size_t member_offset) { + int mi_row, mi_col; + MODE_INFO **mi = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + char prefix = descriptor[0]; + + log_frame_info(cm, descriptor, file); + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); + mi++; + } + fprintf(file, "\n"); + mi += 8; + } + fprintf(file, "\n"); +} + +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { + int mi_row; + int mi_col; + FILE *mvs = fopen(file, "a"); + MODE_INFO **mi = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MODE_INFO, sb_type)); + print_mi_data(cm, mvs, "Modes:", offsetof(MODE_INFO, mode)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MODE_INFO, tx_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MODE_INFO, uv_mode)); + + // output skip infomation. + log_frame_info(cm, "Skips:", mvs); + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "S "); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[0]->skip); + mi++; + } + fprintf(mvs, "\n"); + mi += 8; + } + fprintf(mvs, "\n"); + + // output motion vectors. + log_frame_info(cm, "Vectors ", mvs); + mi = cm->mi_grid_visible; + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "V "); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); + mi++; + } + fprintf(mvs, "\n"); + mi += 8; + } + fprintf(mvs, "\n"); + + fclose(mvs); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.c b/media/libvpx/libvpx/vp9/common/vp9_entropy.c new file mode 100644 index 0000000000..430b917b8f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.c @@ -0,0 +1,1100 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymode.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" + +// Unconstrained Node Tree +/* clang-format off */ +const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + 2, 6, // 0 = LOW_VAL + -TWO_TOKEN, 4, // 1 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE + 8, 10, // 3 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE + 12, 14, // 5 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE +}; +/* clang-format on */ + +const vpx_prob vp9_cat1_prob[] = { 159 }; +const vpx_prob vp9_cat2_prob[] = { 165, 145 }; +const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 }; +const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 }; +const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 }; +const vpx_prob vp9_cat6_prob[] = { 254, 254, 254, 252, 249, 243, 230, + 196, 177, 153, 140, 133, 130, 129 }; +#if CONFIG_VP9_HIGHBITDEPTH +const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, + 254, 252, 249, 243, 230, 196, + 177, 153, 140, 133, 130, 129 }; +#endif + +/* clang-format off */ +const uint8_t vp9_coefband_trans_8x8plus[1024] = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + // beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +}; +/* clang-format on */ + +const uint8_t vp9_coefband_trans_4x4[16] = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, +}; + +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, + 4, 5, 5, 5, 5, 5 }; + +// Model obtained from a 2-sided zero-centerd distribuition derived +// from a Pareto distribution. The cdf of the distribution is: +// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] +// +// For a given beta and a given probablity of the 1-node, the alpha +// is first solved, and then the {alpha, beta} pair is used to generate +// the probabilities for the rest of the nodes. + +// beta = 8 + +// Every odd line in this table can be generated from the even lines +// by averaging : +// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] + +// vp9_pareto8_full[l+1][node] ) >> 1; +const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { + { 3, 86, 128, 6, 86, 23, 88, 29 }, + { 6, 86, 128, 11, 87, 42, 91, 52 }, + { 9, 86, 129, 17, 88, 61, 94, 76 }, + { 12, 86, 129, 22, 88, 77, 97, 93 }, + { 15, 87, 129, 28, 89, 93, 100, 110 }, + { 17, 87, 129, 33, 90, 105, 103, 123 }, + { 20, 88, 130, 38, 91, 118, 106, 136 }, + { 23, 88, 130, 43, 91, 128, 108, 146 }, + { 26, 89, 131, 48, 92, 139, 111, 156 }, + { 28, 89, 131, 53, 93, 147, 114, 163 }, + { 31, 90, 131, 58, 94, 156, 117, 171 }, + { 34, 90, 131, 62, 94, 163, 119, 177 }, + { 37, 90, 132, 66, 95, 171, 122, 184 }, + { 39, 90, 132, 70, 96, 177, 124, 189 }, + { 42, 91, 132, 75, 97, 183, 127, 194 }, + { 44, 91, 132, 79, 97, 188, 129, 198 }, + { 47, 92, 133, 83, 98, 193, 132, 202 }, + { 49, 92, 133, 86, 99, 197, 134, 205 }, + { 52, 93, 133, 90, 100, 201, 137, 208 }, + { 54, 93, 133, 94, 100, 204, 139, 211 }, + { 57, 94, 134, 98, 101, 208, 142, 214 }, + { 59, 94, 134, 101, 102, 211, 144, 216 }, + { 62, 94, 135, 105, 103, 214, 146, 218 }, + { 64, 94, 135, 108, 103, 216, 148, 220 }, + { 66, 95, 135, 111, 104, 219, 151, 222 }, + { 68, 95, 135, 114, 105, 221, 153, 223 }, + { 71, 96, 136, 117, 106, 224, 155, 225 }, + { 73, 96, 136, 120, 106, 225, 157, 226 }, + { 76, 97, 136, 123, 107, 227, 159, 228 }, + { 78, 97, 136, 126, 108, 229, 160, 229 }, + { 80, 98, 137, 129, 109, 231, 162, 231 }, + { 82, 98, 137, 131, 109, 232, 164, 232 }, + { 84, 98, 138, 134, 110, 234, 166, 233 }, + { 86, 98, 138, 137, 111, 235, 168, 234 }, + { 89, 99, 138, 140, 112, 236, 170, 235 }, + { 91, 99, 138, 142, 112, 237, 171, 235 }, + { 93, 100, 139, 145, 113, 238, 173, 236 }, + { 95, 100, 139, 147, 114, 239, 174, 237 }, + { 97, 101, 140, 149, 115, 240, 176, 238 }, + { 99, 101, 140, 151, 115, 241, 177, 238 }, + { 101, 102, 140, 154, 116, 242, 179, 239 }, + { 103, 102, 140, 156, 117, 242, 180, 239 }, + { 105, 103, 141, 158, 118, 243, 182, 240 }, + { 107, 103, 141, 160, 118, 243, 183, 240 }, + { 109, 104, 141, 162, 119, 244, 185, 241 }, + { 111, 104, 141, 164, 119, 244, 186, 241 }, + { 113, 104, 142, 166, 120, 245, 187, 242 }, + { 114, 104, 142, 168, 121, 245, 188, 242 }, + { 116, 105, 143, 170, 122, 246, 190, 243 }, + { 118, 105, 143, 171, 122, 246, 191, 243 }, + { 120, 106, 143, 173, 123, 247, 192, 244 }, + { 121, 106, 143, 175, 124, 247, 193, 244 }, + { 123, 107, 144, 177, 125, 248, 195, 244 }, + { 125, 107, 144, 178, 125, 248, 196, 244 }, + { 127, 108, 145, 180, 126, 249, 197, 245 }, + { 128, 108, 145, 181, 127, 249, 198, 245 }, + { 130, 109, 145, 183, 128, 249, 199, 245 }, + { 132, 109, 145, 184, 128, 249, 200, 245 }, + { 134, 110, 146, 186, 129, 250, 201, 246 }, + { 135, 110, 146, 187, 130, 250, 202, 246 }, + { 137, 111, 147, 189, 131, 251, 203, 246 }, + { 138, 111, 147, 190, 131, 251, 204, 246 }, + { 140, 112, 147, 192, 132, 251, 205, 247 }, + { 141, 112, 147, 193, 132, 251, 206, 247 }, + { 143, 113, 148, 194, 133, 251, 207, 247 }, + { 144, 113, 148, 195, 134, 251, 207, 247 }, + { 146, 114, 149, 197, 135, 252, 208, 248 }, + { 147, 114, 149, 198, 135, 252, 209, 248 }, + { 149, 115, 149, 199, 136, 252, 210, 248 }, + { 150, 115, 149, 200, 137, 252, 210, 248 }, + { 152, 115, 150, 201, 138, 252, 211, 248 }, + { 153, 115, 150, 202, 138, 252, 212, 248 }, + { 155, 116, 151, 204, 139, 253, 213, 249 }, + { 156, 116, 151, 205, 139, 253, 213, 249 }, + { 158, 117, 151, 206, 140, 253, 214, 249 }, + { 159, 117, 151, 207, 141, 253, 215, 249 }, + { 161, 118, 152, 208, 142, 253, 216, 249 }, + { 162, 118, 152, 209, 142, 253, 216, 249 }, + { 163, 119, 153, 210, 143, 253, 217, 249 }, + { 164, 119, 153, 211, 143, 253, 217, 249 }, + { 166, 120, 153, 212, 144, 254, 218, 250 }, + { 167, 120, 153, 212, 145, 254, 219, 250 }, + { 168, 121, 154, 213, 146, 254, 220, 250 }, + { 169, 121, 154, 214, 146, 254, 220, 250 }, + { 171, 122, 155, 215, 147, 254, 221, 250 }, + { 172, 122, 155, 216, 147, 254, 221, 250 }, + { 173, 123, 155, 217, 148, 254, 222, 250 }, + { 174, 123, 155, 217, 149, 254, 222, 250 }, + { 176, 124, 156, 218, 150, 254, 223, 250 }, + { 177, 124, 156, 219, 150, 254, 223, 250 }, + { 178, 125, 157, 220, 151, 254, 224, 251 }, + { 179, 125, 157, 220, 151, 254, 224, 251 }, + { 180, 126, 157, 221, 152, 254, 225, 251 }, + { 181, 126, 157, 221, 152, 254, 225, 251 }, + { 183, 127, 158, 222, 153, 254, 226, 251 }, + { 184, 127, 158, 223, 154, 254, 226, 251 }, + { 185, 128, 159, 224, 155, 255, 227, 251 }, + { 186, 128, 159, 224, 155, 255, 227, 251 }, + { 187, 129, 160, 225, 156, 255, 228, 251 }, + { 188, 130, 160, 225, 156, 255, 228, 251 }, + { 189, 131, 160, 226, 157, 255, 228, 251 }, + { 190, 131, 160, 226, 158, 255, 228, 251 }, + { 191, 132, 161, 227, 159, 255, 229, 251 }, + { 192, 132, 161, 227, 159, 255, 229, 251 }, + { 193, 133, 162, 228, 160, 255, 230, 252 }, + { 194, 133, 162, 229, 160, 255, 230, 252 }, + { 195, 134, 163, 230, 161, 255, 231, 252 }, + { 196, 134, 163, 230, 161, 255, 231, 252 }, + { 197, 135, 163, 231, 162, 255, 231, 252 }, + { 198, 135, 163, 231, 162, 255, 231, 252 }, + { 199, 136, 164, 232, 163, 255, 232, 252 }, + { 200, 136, 164, 232, 164, 255, 232, 252 }, + { 201, 137, 165, 233, 165, 255, 233, 252 }, + { 201, 137, 165, 233, 165, 255, 233, 252 }, + { 202, 138, 166, 233, 166, 255, 233, 252 }, + { 203, 138, 166, 233, 166, 255, 233, 252 }, + { 204, 139, 166, 234, 167, 255, 234, 252 }, + { 205, 139, 166, 234, 167, 255, 234, 252 }, + { 206, 140, 167, 235, 168, 255, 235, 252 }, + { 206, 140, 167, 235, 168, 255, 235, 252 }, + { 207, 141, 168, 236, 169, 255, 235, 252 }, + { 208, 141, 168, 236, 170, 255, 235, 252 }, + { 209, 142, 169, 237, 171, 255, 236, 252 }, + { 209, 143, 169, 237, 171, 255, 236, 252 }, + { 210, 144, 169, 237, 172, 255, 236, 252 }, + { 211, 144, 169, 237, 172, 255, 236, 252 }, + { 212, 145, 170, 238, 173, 255, 237, 252 }, + { 213, 145, 170, 238, 173, 255, 237, 252 }, + { 214, 146, 171, 239, 174, 255, 237, 253 }, + { 214, 146, 171, 239, 174, 255, 237, 253 }, + { 215, 147, 172, 240, 175, 255, 238, 253 }, + { 215, 147, 172, 240, 175, 255, 238, 253 }, + { 216, 148, 173, 240, 176, 255, 238, 253 }, + { 217, 148, 173, 240, 176, 255, 238, 253 }, + { 218, 149, 173, 241, 177, 255, 239, 253 }, + { 218, 149, 173, 241, 178, 255, 239, 253 }, + { 219, 150, 174, 241, 179, 255, 239, 253 }, + { 219, 151, 174, 241, 179, 255, 239, 253 }, + { 220, 152, 175, 242, 180, 255, 240, 253 }, + { 221, 152, 175, 242, 180, 255, 240, 253 }, + { 222, 153, 176, 242, 181, 255, 240, 253 }, + { 222, 153, 176, 242, 181, 255, 240, 253 }, + { 223, 154, 177, 243, 182, 255, 240, 253 }, + { 223, 154, 177, 243, 182, 255, 240, 253 }, + { 224, 155, 178, 244, 183, 255, 241, 253 }, + { 224, 155, 178, 244, 183, 255, 241, 253 }, + { 225, 156, 178, 244, 184, 255, 241, 253 }, + { 225, 157, 178, 244, 184, 255, 241, 253 }, + { 226, 158, 179, 244, 185, 255, 242, 253 }, + { 227, 158, 179, 244, 185, 255, 242, 253 }, + { 228, 159, 180, 245, 186, 255, 242, 253 }, + { 228, 159, 180, 245, 186, 255, 242, 253 }, + { 229, 160, 181, 245, 187, 255, 242, 253 }, + { 229, 160, 181, 245, 187, 255, 242, 253 }, + { 230, 161, 182, 246, 188, 255, 243, 253 }, + { 230, 162, 182, 246, 188, 255, 243, 253 }, + { 231, 163, 183, 246, 189, 255, 243, 253 }, + { 231, 163, 183, 246, 189, 255, 243, 253 }, + { 232, 164, 184, 247, 190, 255, 243, 253 }, + { 232, 164, 184, 247, 190, 255, 243, 253 }, + { 233, 165, 185, 247, 191, 255, 244, 253 }, + { 233, 165, 185, 247, 191, 255, 244, 253 }, + { 234, 166, 185, 247, 192, 255, 244, 253 }, + { 234, 167, 185, 247, 192, 255, 244, 253 }, + { 235, 168, 186, 248, 193, 255, 244, 253 }, + { 235, 168, 186, 248, 193, 255, 244, 253 }, + { 236, 169, 187, 248, 194, 255, 244, 253 }, + { 236, 169, 187, 248, 194, 255, 244, 253 }, + { 236, 170, 188, 248, 195, 255, 245, 253 }, + { 236, 170, 188, 248, 195, 255, 245, 253 }, + { 237, 171, 189, 249, 196, 255, 245, 254 }, + { 237, 172, 189, 249, 196, 255, 245, 254 }, + { 238, 173, 190, 249, 197, 255, 245, 254 }, + { 238, 173, 190, 249, 197, 255, 245, 254 }, + { 239, 174, 191, 249, 198, 255, 245, 254 }, + { 239, 174, 191, 249, 198, 255, 245, 254 }, + { 240, 175, 192, 249, 199, 255, 246, 254 }, + { 240, 176, 192, 249, 199, 255, 246, 254 }, + { 240, 177, 193, 250, 200, 255, 246, 254 }, + { 240, 177, 193, 250, 200, 255, 246, 254 }, + { 241, 178, 194, 250, 201, 255, 246, 254 }, + { 241, 178, 194, 250, 201, 255, 246, 254 }, + { 242, 179, 195, 250, 202, 255, 246, 254 }, + { 242, 180, 195, 250, 202, 255, 246, 254 }, + { 242, 181, 196, 250, 203, 255, 247, 254 }, + { 242, 181, 196, 250, 203, 255, 247, 254 }, + { 243, 182, 197, 251, 204, 255, 247, 254 }, + { 243, 183, 197, 251, 204, 255, 247, 254 }, + { 244, 184, 198, 251, 205, 255, 247, 254 }, + { 244, 184, 198, 251, 205, 255, 247, 254 }, + { 244, 185, 199, 251, 206, 255, 247, 254 }, + { 244, 185, 199, 251, 206, 255, 247, 254 }, + { 245, 186, 200, 251, 207, 255, 247, 254 }, + { 245, 187, 200, 251, 207, 255, 247, 254 }, + { 246, 188, 201, 252, 207, 255, 248, 254 }, + { 246, 188, 201, 252, 207, 255, 248, 254 }, + { 246, 189, 202, 252, 208, 255, 248, 254 }, + { 246, 190, 202, 252, 208, 255, 248, 254 }, + { 247, 191, 203, 252, 209, 255, 248, 254 }, + { 247, 191, 203, 252, 209, 255, 248, 254 }, + { 247, 192, 204, 252, 210, 255, 248, 254 }, + { 247, 193, 204, 252, 210, 255, 248, 254 }, + { 248, 194, 205, 252, 211, 255, 248, 254 }, + { 248, 194, 205, 252, 211, 255, 248, 254 }, + { 248, 195, 206, 252, 212, 255, 249, 254 }, + { 248, 196, 206, 252, 212, 255, 249, 254 }, + { 249, 197, 207, 253, 213, 255, 249, 254 }, + { 249, 197, 207, 253, 213, 255, 249, 254 }, + { 249, 198, 208, 253, 214, 255, 249, 254 }, + { 249, 199, 209, 253, 214, 255, 249, 254 }, + { 250, 200, 210, 253, 215, 255, 249, 254 }, + { 250, 200, 210, 253, 215, 255, 249, 254 }, + { 250, 201, 211, 253, 215, 255, 249, 254 }, + { 250, 202, 211, 253, 215, 255, 249, 254 }, + { 250, 203, 212, 253, 216, 255, 249, 254 }, + { 250, 203, 212, 253, 216, 255, 249, 254 }, + { 251, 204, 213, 253, 217, 255, 250, 254 }, + { 251, 205, 213, 253, 217, 255, 250, 254 }, + { 251, 206, 214, 254, 218, 255, 250, 254 }, + { 251, 206, 215, 254, 218, 255, 250, 254 }, + { 252, 207, 216, 254, 219, 255, 250, 254 }, + { 252, 208, 216, 254, 219, 255, 250, 254 }, + { 252, 209, 217, 254, 220, 255, 250, 254 }, + { 252, 210, 217, 254, 220, 255, 250, 254 }, + { 252, 211, 218, 254, 221, 255, 250, 254 }, + { 252, 212, 218, 254, 221, 255, 250, 254 }, + { 253, 213, 219, 254, 222, 255, 250, 254 }, + { 253, 213, 220, 254, 222, 255, 250, 254 }, + { 253, 214, 221, 254, 223, 255, 250, 254 }, + { 253, 215, 221, 254, 223, 255, 250, 254 }, + { 253, 216, 222, 254, 224, 255, 251, 254 }, + { 253, 217, 223, 254, 224, 255, 251, 254 }, + { 253, 218, 224, 254, 225, 255, 251, 254 }, + { 253, 219, 224, 254, 225, 255, 251, 254 }, + { 254, 220, 225, 254, 225, 255, 251, 254 }, + { 254, 221, 226, 254, 225, 255, 251, 254 }, + { 254, 222, 227, 255, 226, 255, 251, 254 }, + { 254, 223, 227, 255, 226, 255, 251, 254 }, + { 254, 224, 228, 255, 227, 255, 251, 254 }, + { 254, 225, 229, 255, 227, 255, 251, 254 }, + { 254, 226, 230, 255, 228, 255, 251, 254 }, + { 254, 227, 230, 255, 229, 255, 251, 254 }, + { 255, 228, 231, 255, 230, 255, 251, 254 }, + { 255, 229, 232, 255, 230, 255, 251, 254 }, + { 255, 230, 233, 255, 231, 255, 252, 254 }, + { 255, 231, 234, 255, 231, 255, 252, 254 }, + { 255, 232, 235, 255, 232, 255, 252, 254 }, + { 255, 233, 236, 255, 232, 255, 252, 254 }, + { 255, 235, 237, 255, 233, 255, 252, 254 }, + { 255, 236, 238, 255, 234, 255, 252, 254 }, + { 255, 238, 240, 255, 235, 255, 252, 255 }, + { 255, 239, 241, 255, 235, 255, 252, 254 }, + { 255, 241, 243, 255, 236, 255, 252, 254 }, + { 255, 243, 245, 255, 237, 255, 252, 254 }, + { 255, 246, 247, 255, 239, 255, 253, 255 }, +}; + +static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 195, 29, 183 }, + { 84, 49, 136 }, + { 8, 42, 71 } }, + { // Band 1 + { 31, 107, 169 }, + { 35, 99, 159 }, + { 17, 82, 140 }, + { 8, 66, 114 }, + { 2, 44, 76 }, + { 1, 19, 32 } }, + { // Band 2 + { 40, 132, 201 }, + { 29, 114, 187 }, + { 13, 91, 157 }, + { 7, 75, 127 }, + { 3, 58, 95 }, + { 1, 28, 47 } }, + { // Band 3 + { 69, 142, 221 }, + { 42, 122, 201 }, + { 15, 91, 159 }, + { 6, 67, 121 }, + { 1, 42, 77 }, + { 1, 17, 31 } }, + { // Band 4 + { 102, 148, 228 }, + { 67, 117, 204 }, + { 17, 82, 154 }, + { 6, 59, 114 }, + { 2, 39, 75 }, + { 1, 15, 29 } }, + { // Band 5 + { 156, 57, 233 }, + { 119, 57, 212 }, + { 58, 48, 163 }, + { 29, 40, 124 }, + { 12, 30, 81 }, + { 3, 12, 31 } } }, + { // Inter + { // Band 0 + { 191, 107, 226 }, + { 124, 117, 204 }, + { 25, 99, 155 } }, + { // Band 1 + { 29, 148, 210 }, + { 37, 126, 194 }, + { 8, 93, 157 }, + { 2, 68, 118 }, + { 1, 39, 69 }, + { 1, 17, 33 } }, + { // Band 2 + { 41, 151, 213 }, + { 27, 123, 193 }, + { 3, 82, 144 }, + { 1, 58, 105 }, + { 1, 32, 60 }, + { 1, 13, 26 } }, + { // Band 3 + { 59, 159, 220 }, + { 23, 126, 198 }, + { 4, 88, 151 }, + { 1, 66, 114 }, + { 1, 38, 71 }, + { 1, 18, 34 } }, + { // Band 4 + { 114, 136, 232 }, + { 51, 114, 207 }, + { 11, 83, 155 }, + { 3, 56, 105 }, + { 1, 33, 65 }, + { 1, 17, 34 } }, + { // Band 5 + { 149, 65, 234 }, + { 121, 57, 215 }, + { 61, 49, 166 }, + { 28, 36, 114 }, + { 12, 25, 76 }, + { 3, 16, 42 } } } }, + { // UV plane + { // Intra + { // Band 0 + { 214, 49, 220 }, + { 132, 63, 188 }, + { 42, 65, 137 } }, + { // Band 1 + { 85, 137, 221 }, + { 104, 131, 216 }, + { 49, 111, 192 }, + { 21, 87, 155 }, + { 2, 49, 87 }, + { 1, 16, 28 } }, + { // Band 2 + { 89, 163, 230 }, + { 90, 137, 220 }, + { 29, 100, 183 }, + { 10, 70, 135 }, + { 2, 42, 81 }, + { 1, 17, 33 } }, + { // Band 3 + { 108, 167, 237 }, + { 55, 133, 222 }, + { 15, 97, 179 }, + { 4, 72, 135 }, + { 1, 45, 85 }, + { 1, 19, 38 } }, + { // Band 4 + { 124, 146, 240 }, + { 66, 124, 224 }, + { 17, 88, 175 }, + { 4, 58, 122 }, + { 1, 36, 75 }, + { 1, 18, 37 } }, + { // Band 5 + { 141, 79, 241 }, + { 126, 70, 227 }, + { 66, 58, 182 }, + { 30, 44, 136 }, + { 12, 34, 96 }, + { 2, 20, 47 } } }, + { // Inter + { // Band 0 + { 229, 99, 249 }, + { 143, 111, 235 }, + { 46, 109, 192 } }, + { // Band 1 + { 82, 158, 236 }, + { 94, 146, 224 }, + { 25, 117, 191 }, + { 9, 87, 149 }, + { 3, 56, 99 }, + { 1, 33, 57 } }, + { // Band 2 + { 83, 167, 237 }, + { 68, 145, 222 }, + { 10, 103, 177 }, + { 2, 72, 131 }, + { 1, 41, 79 }, + { 1, 20, 39 } }, + { // Band 3 + { 99, 167, 239 }, + { 47, 141, 224 }, + { 10, 104, 178 }, + { 2, 73, 133 }, + { 1, 44, 85 }, + { 1, 22, 47 } }, + { // Band 4 + { 127, 145, 243 }, + { 71, 129, 228 }, + { 17, 93, 177 }, + { 3, 61, 124 }, + { 1, 41, 84 }, + { 1, 21, 52 } }, + { // Band 5 + { 157, 78, 244 }, + { 140, 72, 231 }, + { 69, 58, 184 }, + { 31, 44, 137 }, + { 14, 38, 105 }, + { 8, 23, 61 } } } } +}; + +static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 125, 34, 187 }, + { 52, 41, 133 }, + { 6, 31, 56 } }, + { // Band 1 + { 37, 109, 153 }, + { 51, 102, 147 }, + { 23, 87, 128 }, + { 8, 67, 101 }, + { 1, 41, 63 }, + { 1, 19, 29 } }, + { // Band 2 + { 31, 154, 185 }, + { 17, 127, 175 }, + { 6, 96, 145 }, + { 2, 73, 114 }, + { 1, 51, 82 }, + { 1, 28, 45 } }, + { // Band 3 + { 23, 163, 200 }, + { 10, 131, 185 }, + { 2, 93, 148 }, + { 1, 67, 111 }, + { 1, 41, 69 }, + { 1, 14, 24 } }, + { // Band 4 + { 29, 176, 217 }, + { 12, 145, 201 }, + { 3, 101, 156 }, + { 1, 69, 111 }, + { 1, 39, 63 }, + { 1, 14, 23 } }, + { // Band 5 + { 57, 192, 233 }, + { 25, 154, 215 }, + { 6, 109, 167 }, + { 3, 78, 118 }, + { 1, 48, 69 }, + { 1, 21, 29 } } }, + { // Inter + { // Band 0 + { 202, 105, 245 }, + { 108, 106, 216 }, + { 18, 90, 144 } }, + { // Band 1 + { 33, 172, 219 }, + { 64, 149, 206 }, + { 14, 117, 177 }, + { 5, 90, 141 }, + { 2, 61, 95 }, + { 1, 37, 57 } }, + { // Band 2 + { 33, 179, 220 }, + { 11, 140, 198 }, + { 1, 89, 148 }, + { 1, 60, 104 }, + { 1, 33, 57 }, + { 1, 12, 21 } }, + { // Band 3 + { 30, 181, 221 }, + { 8, 141, 198 }, + { 1, 87, 145 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 20 } }, + { // Band 4 + { 32, 186, 224 }, + { 7, 142, 198 }, + { 1, 86, 143 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 22 } }, + { // Band 5 + { 57, 192, 227 }, + { 20, 143, 204 }, + { 3, 96, 154 }, + { 1, 68, 112 }, + { 1, 42, 69 }, + { 1, 19, 32 } } } }, + { // UV plane + { // Intra + { // Band 0 + { 212, 35, 215 }, + { 113, 47, 169 }, + { 29, 48, 105 } }, + { // Band 1 + { 74, 129, 203 }, + { 106, 120, 203 }, + { 49, 107, 178 }, + { 19, 84, 144 }, + { 4, 50, 84 }, + { 1, 15, 25 } }, + { // Band 2 + { 71, 172, 217 }, + { 44, 141, 209 }, + { 15, 102, 173 }, + { 6, 76, 133 }, + { 2, 51, 89 }, + { 1, 24, 42 } }, + { // Band 3 + { 64, 185, 231 }, + { 31, 148, 216 }, + { 8, 103, 175 }, + { 3, 74, 131 }, + { 1, 46, 81 }, + { 1, 18, 30 } }, + { // Band 4 + { 65, 196, 235 }, + { 25, 157, 221 }, + { 5, 105, 174 }, + { 1, 67, 120 }, + { 1, 38, 69 }, + { 1, 15, 30 } }, + { // Band 5 + { 65, 204, 238 }, + { 30, 156, 224 }, + { 7, 107, 177 }, + { 2, 70, 124 }, + { 1, 42, 73 }, + { 1, 18, 34 } } }, + { // Inter + { // Band 0 + { 225, 86, 251 }, + { 144, 104, 235 }, + { 42, 99, 181 } }, + { // Band 1 + { 85, 175, 239 }, + { 112, 165, 229 }, + { 29, 136, 200 }, + { 12, 103, 162 }, + { 6, 77, 123 }, + { 2, 53, 84 } }, + { // Band 2 + { 75, 183, 239 }, + { 30, 155, 221 }, + { 3, 106, 171 }, + { 1, 74, 128 }, + { 1, 44, 76 }, + { 1, 17, 28 } }, + { // Band 3 + { 73, 185, 240 }, + { 27, 159, 222 }, + { 2, 107, 172 }, + { 1, 75, 127 }, + { 1, 42, 73 }, + { 1, 17, 29 } }, + { // Band 4 + { 62, 190, 238 }, + { 21, 159, 222 }, + { 2, 107, 172 }, + { 1, 72, 122 }, + { 1, 40, 71 }, + { 1, 18, 32 } }, + { // Band 5 + { 61, 199, 240 }, + { 27, 161, 226 }, + { 4, 113, 180 }, + { 1, 76, 129 }, + { 1, 46, 80 }, + { 1, 23, 41 } } } } +}; + +static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 7, 27, 153 }, + { 5, 30, 95 }, + { 1, 16, 30 } }, + { // Band 1 + { 50, 75, 127 }, + { 57, 75, 124 }, + { 27, 67, 108 }, + { 10, 54, 86 }, + { 1, 33, 52 }, + { 1, 12, 18 } }, + { // Band 2 + { 43, 125, 151 }, + { 26, 108, 148 }, + { 7, 83, 122 }, + { 2, 59, 89 }, + { 1, 38, 60 }, + { 1, 17, 27 } }, + { // Band 3 + { 23, 144, 163 }, + { 13, 112, 154 }, + { 2, 75, 117 }, + { 1, 50, 81 }, + { 1, 31, 51 }, + { 1, 14, 23 } }, + { // Band 4 + { 18, 162, 185 }, + { 6, 123, 171 }, + { 1, 78, 125 }, + { 1, 51, 86 }, + { 1, 31, 54 }, + { 1, 14, 23 } }, + { // Band 5 + { 15, 199, 227 }, + { 3, 150, 204 }, + { 1, 91, 146 }, + { 1, 55, 95 }, + { 1, 30, 53 }, + { 1, 11, 20 } } }, + { // Inter + { // Band 0 + { 19, 55, 240 }, + { 19, 59, 196 }, + { 3, 52, 105 } }, + { // Band 1 + { 41, 166, 207 }, + { 104, 153, 199 }, + { 31, 123, 181 }, + { 14, 101, 152 }, + { 5, 72, 106 }, + { 1, 36, 52 } }, + { // Band 2 + { 35, 176, 211 }, + { 12, 131, 190 }, + { 2, 88, 144 }, + { 1, 60, 101 }, + { 1, 36, 60 }, + { 1, 16, 28 } }, + { // Band 3 + { 28, 183, 213 }, + { 8, 134, 191 }, + { 1, 86, 142 }, + { 1, 56, 96 }, + { 1, 30, 53 }, + { 1, 12, 20 } }, + { // Band 4 + { 20, 190, 215 }, + { 4, 135, 192 }, + { 1, 84, 139 }, + { 1, 53, 91 }, + { 1, 28, 49 }, + { 1, 11, 20 } }, + { // Band 5 + { 13, 196, 216 }, + { 2, 137, 192 }, + { 1, 86, 143 }, + { 1, 57, 99 }, + { 1, 32, 56 }, + { 1, 13, 24 } } } }, + { // UV plane + { // Intra + { // Band 0 + { 211, 29, 217 }, + { 96, 47, 156 }, + { 22, 43, 87 } }, + { // Band 1 + { 78, 120, 193 }, + { 111, 116, 186 }, + { 46, 102, 164 }, + { 15, 80, 128 }, + { 2, 49, 76 }, + { 1, 18, 28 } }, + { // Band 2 + { 71, 161, 203 }, + { 42, 132, 192 }, + { 10, 98, 150 }, + { 3, 69, 109 }, + { 1, 44, 70 }, + { 1, 18, 29 } }, + { // Band 3 + { 57, 186, 211 }, + { 30, 140, 196 }, + { 4, 93, 146 }, + { 1, 62, 102 }, + { 1, 38, 65 }, + { 1, 16, 27 } }, + { // Band 4 + { 47, 199, 217 }, + { 14, 145, 196 }, + { 1, 88, 142 }, + { 1, 57, 98 }, + { 1, 36, 62 }, + { 1, 15, 26 } }, + { // Band 5 + { 26, 219, 229 }, + { 5, 155, 207 }, + { 1, 94, 151 }, + { 1, 60, 104 }, + { 1, 36, 62 }, + { 1, 16, 28 } } }, + { // Inter + { // Band 0 + { 233, 29, 248 }, + { 146, 47, 220 }, + { 43, 52, 140 } }, + { // Band 1 + { 100, 163, 232 }, + { 179, 161, 222 }, + { 63, 142, 204 }, + { 37, 113, 174 }, + { 26, 89, 137 }, + { 18, 68, 97 } }, + { // Band 2 + { 85, 181, 230 }, + { 32, 146, 209 }, + { 7, 100, 164 }, + { 3, 71, 121 }, + { 1, 45, 77 }, + { 1, 18, 30 } }, + { // Band 3 + { 65, 187, 230 }, + { 20, 148, 207 }, + { 2, 97, 159 }, + { 1, 68, 116 }, + { 1, 40, 70 }, + { 1, 14, 29 } }, + { // Band 4 + { 40, 194, 227 }, + { 8, 147, 204 }, + { 1, 94, 155 }, + { 1, 65, 112 }, + { 1, 39, 66 }, + { 1, 14, 26 } }, + { // Band 5 + { 16, 208, 228 }, + { 3, 151, 207 }, + { 1, 98, 160 }, + { 1, 67, 117 }, + { 1, 41, 74 }, + { 1, 17, 31 } } } } +}; + +static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 17, 38, 140 }, + { 7, 34, 80 }, + { 1, 17, 29 } }, + { // Band 1 + { 37, 75, 128 }, + { 41, 76, 128 }, + { 26, 66, 116 }, + { 12, 52, 94 }, + { 2, 32, 55 }, + { 1, 10, 16 } }, + { // Band 2 + { 50, 127, 154 }, + { 37, 109, 152 }, + { 16, 82, 121 }, + { 5, 59, 85 }, + { 1, 35, 54 }, + { 1, 13, 20 } }, + { // Band 3 + { 40, 142, 167 }, + { 17, 110, 157 }, + { 2, 71, 112 }, + { 1, 44, 72 }, + { 1, 27, 45 }, + { 1, 11, 17 } }, + { // Band 4 + { 30, 175, 188 }, + { 9, 124, 169 }, + { 1, 74, 116 }, + { 1, 48, 78 }, + { 1, 30, 49 }, + { 1, 11, 18 } }, + { // Band 5 + { 10, 222, 223 }, + { 2, 150, 194 }, + { 1, 83, 128 }, + { 1, 48, 79 }, + { 1, 27, 45 }, + { 1, 11, 17 } } }, + { // Inter + { // Band 0 + { 36, 41, 235 }, + { 29, 36, 193 }, + { 10, 27, 111 } }, + { // Band 1 + { 85, 165, 222 }, + { 177, 162, 215 }, + { 110, 135, 195 }, + { 57, 113, 168 }, + { 23, 83, 120 }, + { 10, 49, 61 } }, + { // Band 2 + { 85, 190, 223 }, + { 36, 139, 200 }, + { 5, 90, 146 }, + { 1, 60, 103 }, + { 1, 38, 65 }, + { 1, 18, 30 } }, + { // Band 3 + { 72, 202, 223 }, + { 23, 141, 199 }, + { 2, 86, 140 }, + { 1, 56, 97 }, + { 1, 36, 61 }, + { 1, 16, 27 } }, + { // Band 4 + { 55, 218, 225 }, + { 13, 145, 200 }, + { 1, 86, 141 }, + { 1, 57, 99 }, + { 1, 35, 61 }, + { 1, 13, 22 } }, + { // Band 5 + { 15, 235, 212 }, + { 1, 132, 184 }, + { 1, 84, 139 }, + { 1, 57, 97 }, + { 1, 34, 56 }, + { 1, 14, 23 } } } }, + { // UV plane + { // Intra + { // Band 0 + { 181, 21, 201 }, + { 61, 37, 123 }, + { 10, 38, 71 } }, + { // Band 1 + { 47, 106, 172 }, + { 95, 104, 173 }, + { 42, 93, 159 }, + { 18, 77, 131 }, + { 4, 50, 81 }, + { 1, 17, 23 } }, + { // Band 2 + { 62, 147, 199 }, + { 44, 130, 189 }, + { 28, 102, 154 }, + { 18, 75, 115 }, + { 2, 44, 65 }, + { 1, 12, 19 } }, + { // Band 3 + { 55, 153, 210 }, + { 24, 130, 194 }, + { 3, 93, 146 }, + { 1, 61, 97 }, + { 1, 31, 50 }, + { 1, 10, 16 } }, + { // Band 4 + { 49, 186, 223 }, + { 17, 148, 204 }, + { 1, 96, 142 }, + { 1, 53, 83 }, + { 1, 26, 44 }, + { 1, 11, 17 } }, + { // Band 5 + { 13, 217, 212 }, + { 2, 136, 180 }, + { 1, 78, 124 }, + { 1, 50, 83 }, + { 1, 29, 49 }, + { 1, 14, 23 } } }, + { // Inter + { // Band 0 + { 197, 13, 247 }, + { 82, 17, 222 }, + { 25, 17, 162 } }, + { // Band 1 + { 126, 186, 247 }, + { 234, 191, 243 }, + { 176, 177, 234 }, + { 104, 158, 220 }, + { 66, 128, 186 }, + { 55, 90, 137 } }, + { // Band 2 + { 111, 197, 242 }, + { 46, 158, 219 }, + { 9, 104, 171 }, + { 2, 65, 125 }, + { 1, 44, 80 }, + { 1, 17, 91 } }, + { // Band 3 + { 104, 208, 245 }, + { 39, 168, 224 }, + { 3, 109, 162 }, + { 1, 79, 124 }, + { 1, 50, 102 }, + { 1, 43, 102 } }, + { // Band 4 + { 84, 220, 246 }, + { 31, 177, 231 }, + { 2, 115, 180 }, + { 1, 79, 134 }, + { 1, 55, 77 }, + { 1, 60, 79 } }, + { // Band 5 + { 43, 243, 240 }, + { 8, 180, 217 }, + { 1, 115, 166 }, + { 1, 84, 121 }, + { 1, 51, 67 }, + { 1, 16, 6 } } } } +}; + +static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) { + assert(p != 0); + memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob)); +} + +void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) { + if (full != model) + memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES); + extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); +} + +void vp9_default_coef_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32); +} + +#define COEF_COUNT_SAT 24 +#define COEF_MAX_UPDATE_FACTOR 112 +#define COEF_COUNT_SAT_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_KEY 112 +#define COEF_COUNT_SAT_AFTER_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 + +static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, + unsigned int count_sat, + unsigned int update_factor) { + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size]; + const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; + unsigned int(*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = + cm->counts.eob_branch[tx_size]; + int i, j, k, l, m; + + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + const int n0 = counts[i][j][k][l][ZERO_TOKEN]; + const int n1 = counts[i][j][k][l][ONE_TOKEN]; + const int n2 = counts[i][j][k][l][TWO_TOKEN]; + const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN]; + const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = { + { neob, eob_counts[i][j][k][l] - neob }, { n0, n1 + n2 }, { n1, n2 } + }; + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + probs[i][j][k][l][m] = + merge_probs(pre_probs[i][j][k][l][m], branch_ct[m], count_sat, + update_factor); + } +} + +void vp9_adapt_coef_probs(VP9_COMMON *cm) { + TX_SIZE t; + unsigned int count_sat, update_factor; + + if (frame_is_intra_only(cm)) { + update_factor = COEF_MAX_UPDATE_FACTOR_KEY; + count_sat = COEF_COUNT_SAT_KEY; + } else if (cm->last_frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ + count_sat = COEF_COUNT_SAT_AFTER_KEY; + } else { + update_factor = COEF_MAX_UPDATE_FACTOR; + count_sat = COEF_COUNT_SAT; + } + for (t = TX_4X4; t <= TX_32X32; t++) + adapt_coef_probs(cm, t, count_sat, update_factor); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.h b/media/libvpx/libvpx/vp9/common/vp9_entropy.h new file mode 100644 index 0000000000..d026651df7 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_ +#define VPX_VP9_COMMON_VP9_ENTROPY_H_ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/prob.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DIFF_UPDATE_PROB 252 + +// Coefficient token alphabet +#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 +#define ONE_TOKEN 1 // 1 Extra Bits 0+1 +#define TWO_TOKEN 2 // 2 Extra Bits 0+1 +#define THREE_TOKEN 3 // 3 Extra Bits 0+1 +#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 +#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 +#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 +#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 +#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 +#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 +#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 +#define EOB_TOKEN 11 // EOB Extra Bits 0+0 + +#define ENTROPY_TOKENS 12 + +#define ENTROPY_NODES 11 + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); + +#define CAT1_MIN_VAL 5 +#define CAT2_MIN_VAL 7 +#define CAT3_MIN_VAL 11 +#define CAT4_MIN_VAL 19 +#define CAT5_MIN_VAL 35 +#define CAT6_MIN_VAL 67 + +// Extra bit probabilities. +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]); + +#if CONFIG_VP9_HIGHBITDEPTH +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high10[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high10[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high10[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high10[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high10[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high10[16]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high12[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high12[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high12[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high12[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high12[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define EOB_MODEL_TOKEN 3 + +#define DCT_MAX_VALUE 16384 +#if CONFIG_VP9_HIGHBITDEPTH +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 +#endif // CONFIG_VP9_HIGHBITDEPTH + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +#define REF_TYPES 2 // intra=0, inter=1 + +/* Middle dimension reflects the coefficient position within the transform. */ +#define COEF_BANDS 6 + +/* Inside dimension is measure of nearby complexity, that reflects the energy + of nearby coefficients are nonzero. For the first coefficient (DC, unless + block type is 0), we look at the (already encoded) blocks above and to the + left of the current block. The context index is then the number (0,1,or 2) + of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is determined by the size of the + most recently decoded coefficient. + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + +#define COEFF_CONTEXTS 6 +#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS) + +// #define ENTROPY_STATS + +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_NODES][2]; + +#define SUBEXP_PARAM 4 /* Subexponential code parameter */ +#define MODULUS_PARAM 13 /* Modulus parameter */ + +struct VP9Common; +void vp9_default_coef_probs(struct VP9Common *cm); +void vp9_adapt_coef_probs(struct VP9Common *cm); + +// This is the index in the scan order beyond which all coefficients for +// 8x8 transform and above are in the top band. +// This macro is currently unused but may be used by certain implementations +#define MAXBAND_INDEX 21 + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); + +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { + return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 + : vp9_coefband_trans_8x8plus; +} + +// 128 lists of probabilities are stored for the following ONE node probs: +// 1, 3, 5, 7, ..., 253, 255 +// In between probabilities are interpolated linearly +#define COEFF_PROB_MODELS 255 + +#define UNCONSTRAINED_NODES 3 + +#define PIVOT_NODE 2 // which node is pivot + +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) +extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)]; +extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; + +typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [UNCONSTRAINED_NODES]; + +typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS] + [UNCONSTRAINED_NODES + 1]; + +void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full); + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + default: assert(0 && "Invalid transform size."); break; + } + + return combine_entropy_contexts(above_ec, left_ec); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ENTROPY_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.c b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c new file mode 100644 index 0000000000..9289fc9e1f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_seg_common.h" + +const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = { + { + // above = dc + { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc + { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v + { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h + { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45 + { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135 + { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117 + { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153 + { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207 + { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63 + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm + }, + { + // above = v + { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc + { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v + { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h + { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45 + { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135 + { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117 + { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153 + { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207 + { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63 + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm + }, + { + // above = h + { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc + { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v + { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h + { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45 + { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135 + { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117 + { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153 + { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207 + { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63 + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm + }, + { + // above = d45 + { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc + { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v + { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h + { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45 + { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135 + { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117 + { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153 + { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207 + { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63 + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm + }, + { + // above = d135 + { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc + { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v + { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h + { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45 + { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135 + { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117 + { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153 + { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207 + { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63 + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm + }, + { + // above = d117 + { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc + { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v + { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h + { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45 + { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135 + { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117 + { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153 + { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207 + { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63 + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm + }, + { + // above = d153 + { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc + { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v + { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h + { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45 + { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135 + { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117 + { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153 + { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207 + { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63 + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm + }, + { + // above = d207 + { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc + { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v + { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h + { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45 + { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135 + { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117 + { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153 + { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207 + { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63 + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm + }, + { + // above = d63 + { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc + { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v + { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h + { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45 + { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135 + { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117 + { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153 + { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207 + { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63 + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm + }, + { + // above = tm + { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc + { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v + { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h + { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45 + { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135 + { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117 + { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153 + { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207 + { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63 + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm + } +}; + +const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = { + { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc + { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v + { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h + { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45 + { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135 + { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117 + { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153 + { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207 + { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63 + { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm +}; + +static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = { + { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8 + { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16 + { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32 + { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32 +}; + +static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { + { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc + { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v + { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h + { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45 + { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135 + { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117 + { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153 + { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207 + { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63 + { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm +}; + +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split + }; + +static const vpx_prob + default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split + }; + +static const vpx_prob + default_inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] = { + { 2, 173, 34 }, // 0 = both zero mv + { 7, 145, 85 }, // 1 = one zero mv + one a predicted mv + { 7, 166, 63 }, // 2 = two predicted mvs + { 7, 94, 66 }, // 3 = one predicted/zero and one new mv + { 8, 64, 46 }, // 4 = two new mvs + { 17, 81, 31 }, // 5 = one intra neighbour + x + { 25, 29, 30 }, // 6 = two intra neighbours + }; + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ +const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { + -DC_PRED, 2, /* 0 = DC_NODE */ + -TM_PRED, 4, /* 1 = TM_NODE */ + -V_PRED, 6, /* 2 = V_NODE */ + 8, 12, /* 3 = COM_NODE */ + -H_PRED, 10, /* 4 = H_NODE */ + -D135_PRED, -D117_PRED, /* 5 = D135_NODE */ + -D45_PRED, 14, /* 6 = D45_NODE */ + -D63_PRED, 16, /* 7 = D63_NODE */ + -D153_PRED, -D207_PRED /* 8 = D153_NODE */ +}; + +const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { + -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV), + -INTER_OFFSET(NEWMV) +}; + +const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { + -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT +}; + +static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, + 187, + 225 }; + +static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { 239, 183, + 119, 96, + 41 }; + +static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221, + 226 }; + +static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = { + { 33, 16 }, { 77, 74 }, { 142, 142 }, { 172, 170 }, { 238, 247 } +}; + +static const struct tx_probs default_tx_probs = { { { 3, 136, 37 }, + { 5, 52, 13 } }, + + { { 20, 152 }, { 15, 101 } }, + + { { 100 }, { 66 } } }; + +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]) { + ct_32x32p[0][0] = tx_count_32x32p[TX_4X4]; + ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] + tx_count_32x32p[TX_16X16] + + tx_count_32x32p[TX_32X32]; + ct_32x32p[1][0] = tx_count_32x32p[TX_8X8]; + ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] + tx_count_32x32p[TX_32X32]; + ct_32x32p[2][0] = tx_count_32x32p[TX_16X16]; + ct_32x32p[2][1] = tx_count_32x32p[TX_32X32]; +} + +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]) { + ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; + ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; + ct_16x16p[1][0] = tx_count_16x16p[TX_8X8]; + ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; +} + +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]) { + ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; + ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; +} + +static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 }; + +static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1] = { + { 235, 162 }, + { 36, 255 }, + { 34, 3 }, + { 149, 144 }, + }; + +static void init_mode_probs(FRAME_CONTEXT *fc) { + vp9_copy(fc->uv_mode_prob, default_if_uv_probs); + vp9_copy(fc->y_mode_prob, default_if_y_probs); + vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(fc->partition_prob, default_partition_probs); + vp9_copy(fc->intra_inter_prob, default_intra_inter_p); + vp9_copy(fc->comp_inter_prob, default_comp_inter_p); + vp9_copy(fc->comp_ref_prob, default_comp_ref_p); + vp9_copy(fc->single_ref_prob, default_single_ref_p); + fc->tx_probs = default_tx_probs; + vp9_copy(fc->skip_probs, default_skip_probs); + vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); +} + +const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE( + SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; + +void vp9_adapt_mode_probs(VP9_COMMON *cm) { + int i, j; + FRAME_CONTEXT *fc = cm->fc; + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_COUNTS *counts = &cm->counts; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i], + counts->intra_inter[i]); + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + fc->comp_inter_prob[i] = + mode_mv_merge_probs(pre_fc->comp_inter_prob[i], counts->comp_inter[i]); + for (i = 0; i < REF_CONTEXTS; i++) + fc->comp_ref_prob[i] = + mode_mv_merge_probs(pre_fc->comp_ref_prob[i], counts->comp_ref[i]); + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + fc->single_ref_prob[i][j] = mode_mv_merge_probs( + pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], fc->inter_mode_probs[i]); + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], fc->y_mode_prob[i]); + + for (i = 0; i < INTRA_MODES; ++i) + vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], fc->uv_mode_prob[i]); + + for (i = 0; i < PARTITION_CONTEXTS; i++) + vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], fc->partition_prob[i]); + + if (cm->interp_filter == SWITCHABLE) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + vpx_tree_merge_probs( + vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], fc->switchable_interp_prob[i]); + } + + if (cm->tx_mode == TX_MODE_SELECT) { + unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; + unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; + unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); + for (j = 0; j < TX_SIZES - 3; ++j) + fc->tx_probs.p8x8[i][j] = + mode_mv_merge_probs(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); + + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); + for (j = 0; j < TX_SIZES - 2; ++j) + fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); + + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); + for (j = 0; j < TX_SIZES - 1; ++j) + fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); + } + } + + for (i = 0; i < SKIP_CONTEXTS; ++i) + fc->skip_probs[i] = + mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]); +} + +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + lf->ref_deltas[INTRA_FRAME] = 1; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -1; + lf->ref_deltas[ALTREF_FRAME] = -1; + + lf->mode_deltas[0] = 0; + lf->mode_deltas[1] = 0; +} + +void vp9_setup_past_independence(VP9_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + struct loopfilter *const lf = &cm->lf; + + int i; + vp9_clearall_segfeatures(&cm->seg); + cm->seg.abs_delta = SEGMENT_DELTADATA; + + if (cm->last_frame_seg_map) + memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + + if (cm->current_frame_seg_map) + memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + + // Reset the mode ref deltas for loop filter + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); + set_default_lf_deltas(lf); + + // To force update of the sharpness + lf->last_sharpness_level = -1; + + vp9_default_coef_probs(cm); + init_mode_probs(cm->fc); + vp9_init_mv_probs(cm); + cm->fc->initialized = 1; + + if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || + cm->reset_frame_context == 3) { + // Reset all frame contexts. + for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; + } else if (cm->reset_frame_context == 2) { + // Reset only the frame context specified in the frame header. + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + } + + // prev_mip will only be allocated in encoder. + if (frame_is_intra_only(cm) && cm->prev_mip) + memset(cm->prev_mip, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); + + vp9_zero(cm->ref_frame_sign_bias); + + cm->frame_context_idx = 0; +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.h b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h new file mode 100644 index 0000000000..a756c8d0b8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/vpx_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 2 + +#define INTER_OFFSET(mode) ((mode)-NEARESTMV) + +struct VP9Common; + +struct tx_probs { + vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3]; +}; + +struct tx_counts { + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + unsigned int tx_totals[TX_SIZES]; +}; + +typedef struct frame_contexts { + vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; + vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES]; + vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1]; + vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; + vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vpx_prob single_ref_prob[REF_CONTEXTS][2]; + vpx_prob comp_ref_prob[REF_CONTEXTS]; + struct tx_probs tx_probs; + vpx_prob skip_probs[SKIP_CONTEXTS]; + nmv_context nmvc; + int initialized; +} FRAME_CONTEXT; + +typedef struct FRAME_COUNTS { + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES]; + unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS]; + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int skip[SKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + +extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; +extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] + [INTRA_MODES - 1]; +extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; +extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; +extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; +extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; +extern const vpx_tree_index + vp9_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)]; + +void vp9_setup_past_independence(struct VP9Common *cm); + +void vp9_adapt_mode_probs(struct VP9Common *cm); + +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]); +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]); +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.c b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c new file mode 100644 index 0000000000..b6f052d088 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymv.h" + +const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { + -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ +}; + +const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { + -MV_CLASS_0, 2, -MV_CLASS_1, 4, 6, + 8, -MV_CLASS_2, -MV_CLASS_3, 10, 12, + -MV_CLASS_4, -MV_CLASS_5, -MV_CLASS_6, 14, 16, + 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, +}; + +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; + +const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, + 4, -2, -3 }; + +static const nmv_context default_nmv_context = { + { 32, 64, 96 }, + { { + // Vertical component + 128, // sign + { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, // class + { 216 }, // class0 + { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits + { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp + { 64, 96, 64 }, // fp + 160, // class0_hp bit + 128, // hp + }, + { + // Horizontal component + 128, // sign + { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, // class + { 208 }, // class0 + { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits + { { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp + { 64, 96, 64 }, // fp + 160, // class0_hp bit + 128, // hp + } }, +}; + +static const uint8_t log_in_base_2[] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 +}; + +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) + ? MV_CLASS_10 + : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; + if (offset) *offset = z - mv_class_base(c); + return c; +} + +static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, + int usehp) { + int s, z, c, o, d, e, f; + assert(v != 0); /* should not be zero */ + s = v < 0; + comp_counts->sign[s] += incr; + z = (s ? -v : v) - 1; /* magnitude - 1 */ + + c = vp9_get_mv_class(z, &o); + comp_counts->classes[c] += incr; + + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + + if (c == MV_CLASS_0) { + comp_counts->class0[d] += incr; + comp_counts->class0_fp[d][f] += incr; + comp_counts->class0_hp[e] += usehp * incr; + } else { + int i; + int b = c + CLASS0_BITS - 1; // number of bits + for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr; + comp_counts->fp[f] += incr; + comp_counts->hp[e] += usehp * incr; + } +} + +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { + if (counts != NULL) { + const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); + ++counts->joints[j]; + + if (mv_joint_vertical(j)) { + inc_mv_component(mv->row, &counts->comps[0], 1, 1); + } + + if (mv_joint_horizontal(j)) { + inc_mv_component(mv->col, &counts->comps[1], 1, 1); + } + } +} + +void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { + int i, j; + + nmv_context *fc = &cm->fc->nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; + + vpx_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); + + for (i = 0; i < 2; ++i) { + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign); + vpx_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + vpx_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, + comp->class0); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]); + + for (j = 0; j < CLASS0_SIZE; ++j) + vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], + c->class0_fp[j], comp->class0_fp[j]); + + vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); + + if (allow_hp) { + comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp); + comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp); + } + } +} + +void vp9_init_mv_probs(VP9_COMMON *cm) { cm->fc->nmvc = default_nmv_context; } diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.h b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h new file mode 100644 index 0000000000..ee9d37973f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#include "vp9/common/vp9_mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; + +void vp9_init_mv_probs(struct VP9Common *cm); + +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); + +static INLINE int use_mv_hp(const MV *ref) { + const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv + return abs(ref->row) < kMvRefThresh && abs(ref->col) < kMvRefThresh; +} + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +typedef enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} MV_JOINT_TYPE; + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +typedef enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} MV_CLASS_TYPE; + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP ((1 << MV_IN_USE_BITS) - 1) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +extern const vpx_tree_index vp9_mv_joint_tree[]; +extern const vpx_tree_index vp9_mv_class_tree[]; +extern const vpx_tree_index vp9_mv_class0_tree[]; +extern const vpx_tree_index vp9_mv_fp_tree[]; + +typedef struct { + vpx_prob sign; + vpx_prob classes[MV_CLASSES - 1]; + vpx_prob class0[CLASS0_SIZE - 1]; + vpx_prob bits[MV_OFFSET_BITS]; + vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; + vpx_prob fp[MV_FP_SIZE - 1]; + vpx_prob class0_hp; + vpx_prob hp; +} nmv_component; + +typedef struct { + vpx_prob joints[MV_JOINTS - 1]; + nmv_component comps[2]; +} nmv_context; + +static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); + +typedef struct { + unsigned int sign[2]; + unsigned int classes[MV_CLASSES]; + unsigned int class0[CLASS0_SIZE]; + unsigned int bits[MV_OFFSET_BITS][2]; + unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; + unsigned int fp[MV_FP_SIZE]; + unsigned int class0_hp[2]; + unsigned int hp[2]; +} nmv_component_counts; + +typedef struct { + unsigned int joints[MV_JOINTS]; + nmv_component_counts comps[2]; +} nmv_context_counts; + +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_enums.h b/media/libvpx/libvpx/vp9/common/vp9_enums.h new file mode 100644 index 0000000000..b33a3a2978 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_enums.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_ +#define VPX_VP9_COMMON_VP9_ENUMS_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MI_SIZE_LOG2 3 +#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 + +#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block + +#define MI_MASK (MI_BLOCK_SIZE - 1) + +// Bitstream profiles indicated by 2-3 bits in the uncompressed header. +// 00: Profile 0. 8-bit 4:2:0 only. +// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0. +// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling. +// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0 +// sampling. +// 111: Undefined profile. +typedef enum BITSTREAM_PROFILE { + PROFILE_0, + PROFILE_1, + PROFILE_2, + PROFILE_3, + MAX_PROFILES +} BITSTREAM_PROFILE; + +typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG; + +#define BLOCK_4X4 0 +#define BLOCK_4X8 1 +#define BLOCK_8X4 2 +#define BLOCK_8X8 3 +#define BLOCK_8X16 4 +#define BLOCK_16X8 5 +#define BLOCK_16X16 6 +#define BLOCK_16X32 7 +#define BLOCK_32X16 8 +#define BLOCK_32X32 9 +#define BLOCK_32X64 10 +#define BLOCK_64X32 11 +#define BLOCK_64X64 12 +#define BLOCK_SIZES 13 +#define BLOCK_INVALID BLOCK_SIZES +typedef uint8_t BLOCK_SIZE; + +typedef enum PARTITION_TYPE { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_TYPES, + PARTITION_INVALID = PARTITION_TYPES +} PARTITION_TYPE; + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) + +// block transform size +typedef uint8_t TX_SIZE; +#define TX_4X4 ((TX_SIZE)0) // 4x4 transform +#define TX_8X8 ((TX_SIZE)1) // 8x8 transform +#define TX_16X16 ((TX_SIZE)2) // 16x16 transform +#define TX_32X32 ((TX_SIZE)3) // 32x32 transform +#define TX_SIZES ((TX_SIZE)4) + +// frame transform mode +typedef enum { + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block + TX_MODES = 5, +} TX_MODE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3, // ADST in both directions + TX_TYPES = 4 +} TX_TYPE; + +typedef enum { + VP9_LAST_FLAG = 1 << 0, + VP9_GOLD_FLAG = 1 << 1, + VP9_ALT_FLAG = 1 << 2, +} VP9_REFFRAME; + +typedef enum { PLANE_TYPE_Y = 0, PLANE_TYPE_UV = 1, PLANE_TYPES } PLANE_TYPE; + +#define DC_PRED 0 // Average of above and left pixels +#define V_PRED 1 // Vertical +#define H_PRED 2 // Horizontal +#define D45_PRED 3 // Directional 45 deg = round(arctan(1/1) * 180/pi) +#define D135_PRED 4 // Directional 135 deg = 180 - 45 +#define D117_PRED 5 // Directional 117 deg = 180 - 63 +#define D153_PRED 6 // Directional 153 deg = 180 - 27 +#define D207_PRED 7 // Directional 207 deg = 180 + 27 +#define D63_PRED 8 // Directional 63 deg = round(arctan(2/1) * 180/pi) +#define TM_PRED 9 // True-motion +#define NEARESTMV 10 +#define NEARMV 11 +#define ZEROMV 12 +#define NEWMV 13 +#define MB_MODE_COUNT 14 +typedef uint8_t PREDICTION_MODE; + +#define INTRA_MODES (TM_PRED + 1) + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 5 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ENUMS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.c b/media/libvpx/libvpx/vp9/common/vp9_filter.c new file mode 100644 index 0000000000..adbda6c825 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_filter.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_filter.h" + +DECLARE_ALIGNED(256, static const InterpKernel, + bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +// Lagrangian interpolation filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 }, + { -1, 3, -10, 122, 18, -6, 2, 0 }, { -1, 4, -13, 118, 27, -9, 3, -1 }, + { -1, 4, -16, 112, 37, -11, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 }, + { -1, 5, -19, 97, 58, -16, 5, -1 }, { -1, 6, -19, 88, 68, -18, 5, -1 }, + { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 5, -18, 68, 88, -19, 6, -1 }, + { -1, 5, -16, 58, 97, -19, 5, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 }, + { -1, 4, -11, 37, 112, -16, 4, -1 }, { -1, 3, -9, 27, 118, -13, 4, -1 }, + { 0, 2, -6, 18, 122, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 } +}; + +// DCT based filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8s[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 }, + { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 }, + { -4, 9, -20, 115, 37, -13, 6, -2 }, { -4, 10, -23, 108, 48, -16, 8, -3 }, + { -4, 10, -24, 100, 59, -19, 9, -3 }, { -4, 11, -24, 90, 70, -21, 10, -4 }, + { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -21, 70, 90, -24, 11, -4 }, + { -3, 9, -19, 59, 100, -24, 10, -4 }, { -3, 8, -16, 48, 108, -23, 10, -4 }, + { -2, 6, -13, 37, 115, -20, 9, -4 }, { -2, 5, -10, 27, 121, -17, 7, -3 }, + { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 } +}; + +// freqmultiplier = 0.5 +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -3, -1, 32, 64, 38, 1, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 26, 63, 43, 4, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 21, 60, 49, 7, -4, 0 }, + { -1, -4, 18, 59, 51, 9, -4, 0 }, { -1, -4, 16, 57, 53, 12, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 12, 53, 57, 16, -4, -1 }, + { 0, -4, 9, 51, 59, 18, -4, -1 }, { 0, -4, 7, 49, 60, 21, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 4, 43, 63, 26, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } +}; + +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 114, 28, -6, 0, 0 }, + { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 }, + { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 }, + { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 +}; diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.h b/media/libvpx/libvpx/vp9/common/vp9_filter.h new file mode 100644 index 0000000000..0382c88e7c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_filter.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_FILTER_H_ +#define VPX_VP9_COMMON_VP9_FILTER_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EIGHTTAP 0 +#define EIGHTTAP_SMOOTH 1 +#define EIGHTTAP_SHARP 2 +#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ +#define BILINEAR 3 +#define FOURTAP 4 +// The codec can operate in four possible inter prediction filter mode: +// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) +#define SWITCHABLE 4 /* should be the last one */ + +typedef uint8_t INTERP_FILTER; + +extern const InterpKernel *vp9_filter_kernels[5]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_FILTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c new file mode 100644 index 0000000000..889b809e50 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb)); + if (list->int_fb) { + list->num_internal_frame_buffers = num_buffers; + return 0; + } + return -1; +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; + list->num_internal_frame_buffers = 0; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) break; + } + + if (i == int_fb_list->num_internal_frame_buffers) return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + vpx_free(int_fb_list->int_fb[i].data); + // The data must be zeroed to fix a valgrind error from the C loop filter + // due to access uninitialized memory in frame border. It could be + // skipped if border were totally removed. + int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size); + if (!int_fb_list->int_fb[i].data) return -1; + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + if (int_fb) int_fb->in_use = 0; + return 0; +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h new file mode 100644 index 0000000000..11be838c02 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.c b/media/libvpx/libvpx/vp9/common/vp9_idct.c new file mode 100644 index 0000000000..71be0f310d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_idct.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + const transform_2d IHT_4[] = { + { idct4_c, idct4_c }, // DCT_DCT = 0 + { iadst4_c, idct4_c }, // ADST_DCT = 1 + { idct4_c, iadst4_c }, // DCT_ADST = 2 + { iadst4_c, iadst4_c } // ADST_ADST = 3 + }; + + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +static const transform_2d IHT_8[] = { + { idct8_c, idct8_c }, // DCT_DCT = 0 + { iadst8_c, idct8_c }, // ADST_DCT = 1 + { idct8_c, iadst8_c }, // DCT_ADST = 2 + { iadst8_c, iadst8_c } // ADST_ADST = 3 +}; + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + const transform_2d ht = IHT_8[tx_type]; + + // inverse transform row vectors + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr); + input += 8; + outptr += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +static const transform_2d IHT_16[] = { + { idct16_c, idct16_c }, // DCT_DCT = 0 + { iadst16_c, idct16_c }, // ADST_DCT = 1 + { idct16_c, iadst16_c }, // DCT_ADST = 2 + { iadst16_c, iadst16_c } // ADST_ADST = 3 +}; + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + const transform_2d ht = IHT_16[tx_type]; + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +// idct +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vpx_idct4x4_16_add(input, dest, stride); + else + vpx_idct4x4_1_add(input, dest, stride); +} + +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vpx_iwht4x4_16_add(input, dest, stride); + else + vpx_iwht4x4_1_add(input, dest, stride); +} + +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + if (eob == 1) + // DC only DCT coefficient + vpx_idct8x8_1_add(input, dest, stride); + else if (eob <= 12) + vpx_idct8x8_12_add(input, dest, stride); + else + vpx_idct8x8_64_add(input, dest, stride); +} + +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + assert(((intptr_t)input) % 32 == 0); + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob == 1) /* DC only DCT coefficient. */ + vpx_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vpx_idct16x16_10_add(input, dest, stride); + else if (eob <= 38) + vpx_idct16x16_38_add(input, dest, stride); + else + vpx_idct16x16_256_add(input, dest, stride); +} + +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + assert(((intptr_t)input) % 32 == 0); + if (eob == 1) + vpx_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vpx_idct32x32_34_add(input, dest, stride); + else if (eob <= 135) + // non-zero coeff only in upper-left 16x16 + vpx_idct32x32_135_add(input, dest, stride); + else + vpx_idct32x32_1024_add(input, dest, stride); +} + +// iht +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) + vp9_idct4x4_add(input, dest, stride, eob); + else + vp9_iht4x4_16_add(input, dest, stride, tx_type); +} + +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct8x8_add(input, dest, stride, eob); + } else { + vp9_iht8x8_64_add(input, dest, stride, tx_type); + } +} + +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct16x16_add(input, dest, stride, eob); + } else { + vp9_iht16x16_256_add(input, dest, stride, tx_type); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const highbd_transform_2d IHT_4[] = { + { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 + { vpx_highbd_iadst4_c, vpx_highbd_idct4_c }, // ADST_DCT = 1 + { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 + { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 + }; + + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Inverse transform row vectors. + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Inverse transform column vectors. + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +static const highbd_transform_2d HIGH_IHT_8[] = { + { vpx_highbd_idct8_c, vpx_highbd_idct8_c }, // DCT_DCT = 0 + { vpx_highbd_iadst8_c, vpx_highbd_idct8_c }, // ADST_DCT = 1 + { vpx_highbd_idct8_c, vpx_highbd_iadst8_c }, // DCT_ADST = 2 + { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 +}; + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; + + // Inverse transform row vectors. + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Inverse transform column vectors. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +static const highbd_transform_2d HIGH_IHT_16[] = { + { vpx_highbd_idct16_c, vpx_highbd_idct16_c }, // DCT_DCT = 0 + { vpx_highbd_iadst16_c, vpx_highbd_idct16_c }, // ADST_DCT = 1 + { vpx_highbd_idct16_c, vpx_highbd_iadst16_c }, // DCT_ADST = 2 + { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 +}; + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +// idct +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + vpx_highbd_idct4x4_16_add(input, dest, stride, bd); + else + vpx_highbd_idct4x4_1_add(input, dest, stride, bd); +} + +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); + else + vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); +} + +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // DC only DCT coefficient + if (eob == 1) { + vpx_highbd_idct8x8_1_add(input, dest, stride, bd); + } else if (eob <= 12) { + vpx_highbd_idct8x8_12_add(input, dest, stride, bd); + } else { + vpx_highbd_idct8x8_64_add(input, dest, stride, bd); + } +} + +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd) { + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to separate different cases. + // DC only DCT coefficient. + if (eob == 1) { + vpx_highbd_idct16x16_1_add(input, dest, stride, bd); + } else if (eob <= 10) { + vpx_highbd_idct16x16_10_add(input, dest, stride, bd); + } else if (eob <= 38) { + vpx_highbd_idct16x16_38_add(input, dest, stride, bd); + } else { + vpx_highbd_idct16x16_256_add(input, dest, stride, bd); + } +} + +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd) { + // Non-zero coeff only in upper-left 8x8 + if (eob == 1) { + vpx_highbd_idct32x32_1_add(input, dest, stride, bd); + } else if (eob <= 34) { + vpx_highbd_idct32x32_34_add(input, dest, stride, bd); + } else if (eob <= 135) { + vpx_highbd_idct32x32_135_add(input, dest, stride, bd); + } else { + vpx_highbd_idct32x32_1024_add(input, dest, stride, bd); + } +} + +// iht +void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) + vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); + else + vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd); +} + +void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) { + vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); + } else { + vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd); + } +} + +void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) { + vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); + } else { + vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.h b/media/libvpx/libvpx/vp9/common/vp9_idct.h new file mode 100644 index 0000000000..94eeaf599e --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_idct.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_IDCT_H_ +#define VPX_VP9_COMMON_VP9_IDCT_H_ + +#include + +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d)(const tran_low_t *, tran_low_t *); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*highbd_transform_1d)(const tran_low_t *, tran_low_t *, int bd); + +typedef struct { + highbd_transform_1d cols, rows; // vertical and horizontal +} highbd_transform_2d; +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); + +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); +void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd); +void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd); +void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, + uint16_t *dest, int stride, int eob, int bd); +#endif // CONFIG_VP9_HIGHBITDEPTH +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_IDCT_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c new file mode 100644 index 0000000000..1a9d45ae77 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c @@ -0,0 +1,1633 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_seg_common.h" + +// 64 bit masks for left transform size. Each 1 represents a position where +// we should apply a loop filter across the left border of an 8x8 block +// boundary. +// +// In the case of TX_16X16-> ( in low order byte first we end up with +// a mask that looks like this +// +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// +// A loopfilter should be applied to every other 8x8 horizontally. +static const uint64_t left_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x5555555555555555ULL, // TX_16x16 + 0x1111111111111111ULL, // TX_32x32 +}; + +// 64 bit masks for above transform size. Each 1 represents a position where +// we should apply a loop filter across the top border of an 8x8 block +// boundary. +// +// In the case of TX_32x32 -> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// +// A loopfilter should be applied to every other 4 the row vertically. +static const uint64_t above_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x00ff00ff00ff00ffULL, // TX_16x16 + 0x000000ff000000ffULL, // TX_32x32 +}; + +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// where left border of an 8x8 block. These are aligned to the right most +// appropriate bit, and then shifted into place. +// +// In the case of TX_16x32 -> ( low order byte first ) we end up with +// a mask that looks like this : +// +// 10000000 +// 10000000 +// 10000000 +// 10000000 +// 00000000 +// 00000000 +// 00000000 +// 00000000 +static const uint64_t left_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4, + 0x0000000000000001ULL, // BLOCK_4X8, + 0x0000000000000001ULL, // BLOCK_8X4, + 0x0000000000000001ULL, // BLOCK_8X8, + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000001ULL, // BLOCK_16X8, + 0x0000000000000101ULL, // BLOCK_16X16, + 0x0000000001010101ULL, // BLOCK_16X32, + 0x0000000000000101ULL, // BLOCK_32X16, + 0x0000000001010101ULL, // BLOCK_32X32, + 0x0101010101010101ULL, // BLOCK_32X64, + 0x0000000001010101ULL, // BLOCK_64X32, + 0x0101010101010101ULL, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each prediction size. +static const uint64_t above_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000001ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000003ULL, // BLOCK_16X16 + 0x0000000000000003ULL, // BLOCK_16X32, + 0x000000000000000fULL, // BLOCK_32X16, + 0x000000000000000fULL, // BLOCK_32X32, + 0x000000000000000fULL, // BLOCK_32X64, + 0x00000000000000ffULL, // BLOCK_64X32, + 0x00000000000000ffULL, // BLOCK_64X64 +}; +// 64 bit mask to shift and set for each prediction size. A bit is set for +// each 8x8 block that would be in the left most block of the given block +// size in the 64x64 block. +static const uint64_t size_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000303ULL, // BLOCK_16X16 + 0x0000000003030303ULL, // BLOCK_16X32, + 0x0000000000000f0fULL, // BLOCK_32X16, + 0x000000000f0f0f0fULL, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64, + 0x00000000ffffffffULL, // BLOCK_64X32, + 0xffffffffffffffffULL, // BLOCK_64X64 +}; + +// These are used for masking the left and above borders. +static const uint64_t left_border = 0x1111111111111111ULL; +static const uint64_t above_border = 0x000000ff000000ffULL; + +// 16 bit masks for uv transform sizes. +static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 +}; + +static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 +}; + +// 16 bit left mask to shift and set for each uv prediction size. +static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 +}; +// 16 bit above mask to shift and set for uv each prediction size. +static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each uv prediction size +static const uint16_t size_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 +}; +static const uint16_t left_border_uv = 0x1111; +static const uint16_t above_border_uv = 0x000f; + +static const int mode_lf_lut[MB_MODE_COUNT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness. + int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, + const MODE_INFO *mi) { + return lfi_n->lvl[mi->segment_id][mi->ref_frame[0]][mode_lf_lut[mi->mode]]; +} + +void vp9_loop_filter_init(VP9_COMMON *cm) { + loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; + int lvl; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { + int seg_id; + // n_shift is the multiplier for lf_deltas + // the multiplier is 1 for when filter_lvl is between 0 and 31; + // 2 when filter_lvl is between 32 and 63 + const int scale = 1 << (default_filt_lvl >> 5); + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update limits if sharpness has changed + if (lf->last_sharpness_level != lf->sharpness_level) { + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; + } + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + int lvl_seg = default_filt_lvl; + if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF); + lvl_seg = clamp( + seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, 0, + MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); + } else { + int ref, mode; + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } +} + +static void filter_selectively_vert_row2( + int subsampling_factor, uint8_t *s, int pitch, unsigned int mask_16x16, + unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; + const int lfl_forward = subsampling_factor ? 4 : 8; + const unsigned int dual_one = 1 | (1 << lfl_forward); + unsigned int mask; + uint8_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); + } + } + + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual( + ss[0] + 4, pitch, lfis[0]->mblim, lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); + } + } + } + + ss[0] += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, unsigned int mask_16x16, + unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; + const int lfl_forward = subsampling_factor ? 4 : 8; + const unsigned int dual_one = 1 | (1 << lfl_forward); + unsigned int mask; + uint16_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_8_dual( + ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual( + ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual( + ss[0] + 4, pitch, lfis[0]->mblim, lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, + lfi->mblim, lfi->lim, lfi->hev_thr, bd); + } + } + } + + ss[0] += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void filter_selectively_horiz( + uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, + unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { + unsigned int mask; + int count; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; + mask >>= count) { + count = 1; + if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vpx_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + count = 2; + } else { + vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + + if ((mask_4x4_int & 3) == 3) { + vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); + } else { + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + else if (mask_4x4_int & 2) + vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr); + } + count = 2; + } else { + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + if ((mask_4x4_int & 3) == 3) { + vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); + } else { + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + else if (mask_4x4_int & 2) + vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr); + } + count = 2; + } else { + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } else { + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + s += 8 * count; + lfl += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, + unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { + unsigned int mask; + int count; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; + mask >>= count) { + count = 1; + if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vpx_highbd_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + count = 2; + } else { + vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + + if ((mask_4x4_int & 3) == 3) { + vpx_highbd_lpf_horizontal_4_dual( + s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, lfin->hev_thr, bd); + } else { + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } else if (mask_4x4_int & 2) { + vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + } + count = 2; + } else { + vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + if ((mask_4x4_int & 3) == 3) { + vpx_highbd_lpf_horizontal_4_dual( + s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, lfin->hev_thr, bd); + } else { + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } else if (mask_4x4_int & 2) { + vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + } + count = 2; + } else { + vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + } else { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + s += 8 * count; + lfl += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// This function ors into the current lfm structure, where to do loop +// filters for the specific mi we are looking at. It uses information +// including the block_size_type (32x16, 32x32, etc.), the transform size, +// whether there were any coefficients encoded, and the loop filter strength +// block we are currently looking at. Shift is used to position the +// 1's we produce. +static void build_masks(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + const int shift_uv, LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const TX_SIZE tx_size_uv = uv_txsize_lookup[block_size][tx_size_y][1][1]; + const int filter_level = get_filter_level(lfi_n, mi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; + int i; + + // If filter level is 0 we don't loop filter. + if (!filter_level) { + return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set: + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and V set things on a 16 bit scale. + // + *above_y |= above_prediction_mask[block_size] << shift_y; + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_y |= left_prediction_mask[block_size] << shift_y; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (mi->skip && is_inter_block(mi)) return; + + // Here we are adding a mask for the transform size. The transform + // size mask is set to be correct for a 64x64 prediction block size. We + // mask to match the size of the block we are working on and then shift it + // into place.. + *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y]) + << shift_y; + *above_uv |= + (size_mask_uv[block_size] & above_64x64_txform_mask_uv[tx_size_uv]) + << shift_uv; + + *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y]) + << shift_y; + *left_uv |= (size_mask_uv[block_size] & left_64x64_txform_mask_uv[tx_size_uv]) + << shift_uv; + + // Here we are trying to determine what to do with the internal 4x4 block + // boundaries. These differ from the 4x4 boundaries on the outside edge of + // an 8x8 in that the internal ones can be skipped and don't depend on + // the prediction block size. + if (tx_size_y == TX_4X4) *int_4x4_y |= size_mask[block_size] << shift_y; + + if (tx_size_uv == TX_4X4) + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; +} + +// This function does the same thing as the one above with the exception that +// it only affects the y masks. It exists because for blocks < 16x16 in size, +// we only update u and v masks on the first block. +static void build_y_mask(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const int filter_level = get_filter_level(lfi_n, mi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + int i; + + if (!filter_level) { + return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } + + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (mi->skip && is_inter_block(mi)) return; + + *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y]) + << shift_y; + + *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y]) + << shift_y; + + if (tx_size_y == TX_4X4) *int_4x4_y |= size_mask[block_size] << shift_y; +} + +void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + LOOP_FILTER_MASK *lfm) { + int i; + + // The largest loopfilter we have is 16x16 so we use the 16x16 mask + // for 32x32 transforms also. + lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; + lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; + lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; + lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; + + // We do at least 8 tap filter on every 32x32 even if the transform size + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // remove it from the 4x4. + lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; + lfm->left_y[TX_4X4] &= ~left_border; + lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border; + lfm->above_y[TX_4X4] &= ~above_border; + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv; + lfm->left_uv[TX_4X4] &= ~left_border_uv; + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv; + lfm->above_uv[TX_4X4] &= ~above_border_uv; + + // We do some special edge handling. + if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) { + const uint64_t rows = cm->mi_rows - mi_row; + + // Each pixel inside the border gets a 1, + const uint64_t mask_y = (((uint64_t)1 << (rows << 3)) - 1); + const uint16_t mask_uv = (((uint16_t)1 << (((rows + 1) >> 1) << 2)) - 1); + + // Remove values completely outside our border. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv; + + // We don't apply a wide loop filter on the last uv block row. If set + // apply the shorter one instead. + if (rows == 1) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; + lfm->above_uv[TX_16X16] = 0; + } + if (rows == 5) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00; + lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00); + } + } + + if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) { + const uint64_t columns = cm->mi_cols - mi_col; + + // Each pixel inside the border gets a 1, the multiply copies the border + // to where we need it. + const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL; + const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; + + // Internal edges are not applied on the last column of the image so + // we mask 1 more for the internal edges + const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111; + + // Remove the bits outside the image edge. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv_int; + + // We don't apply a wide loop filter on the last uv column. If set + // apply the shorter one instead. + if (columns == 1) { + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; + lfm->left_uv[TX_16X16] = 0; + } + if (columns == 5) { + lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc); + lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); + } + } + // We don't apply a loop filter on the first column in the image, mask that + // out. + if (mi_col == 0) { + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= 0xfefefefefefefefeULL; + lfm->left_uv[i] &= 0xeeee; + } + } + + // Assert if we try to apply 2 different loop filters at the same position. + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8])); + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4])); + assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16])); + assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8])); + assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4])); + assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4])); + assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4])); + assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); +} + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { + int idx_32, idx_16, idx_8; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; + + // These are offsets to the next mi in the 64x64 block. It is what gets + // added to the mi ptr as we go through each loop. It helps us to avoid + // setting up special row and column counters for each index. The last step + // brings us out back to the starting position. + const int offset_32[] = { 4, (mode_info_stride << 2) - 4, 4, + -(mode_info_stride << 2) - 4 }; + const int offset_16[] = { 2, (mode_info_stride << 1) - 2, 2, + -(mode_info_stride << 1) - 2 }; + const int offset[] = { 1, mode_info_stride - 1, 1, -mode_info_stride - 1 }; + + // Following variables represent shifts to position the current block + // mask over the appropriate block. A shift of 36 to the left will move + // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left + // 4 rows to the appropriate spot. + const int shift_32_y[] = { 0, 4, 32, 36 }; + const int shift_16_y[] = { 0, 2, 16, 18 }; + const int shift_8_y[] = { 0, 1, 8, 9 }; + const int shift_32_uv[] = { 0, 2, 8, 10 }; + const int shift_16_uv[] = { 0, 1, 4, 5 }; + const int max_rows = + (mi_row + MI_BLOCK_SIZE > cm->mi_rows ? cm->mi_rows - mi_row + : MI_BLOCK_SIZE); + const int max_cols = + (mi_col + MI_BLOCK_SIZE > cm->mi_cols ? cm->mi_cols - mi_col + : MI_BLOCK_SIZE); + + vp9_zero(*lfm); + assert(mip[0] != NULL); + + switch (mip[0]->sb_type) { + case BLOCK_64X64: build_masks(lfi_n, mip[0], 0, 0, lfm); break; + case BLOCK_64X32: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + mode_info_stride * 4; + if (4 >= max_rows) break; + build_masks(lfi_n, mip2[0], 32, 8, lfm); + break; + case BLOCK_32X64: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + 4; + if (4 >= max_cols) break; + build_masks(lfi_n, mip2[0], 4, 2, lfm); + break; + default: + for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; + const int mi_32_col_offset = ((idx_32 & 1) << 2); + const int mi_32_row_offset = ((idx_32 >> 1) << 2); + if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) + continue; + switch (mip[0]->sb_type) { + case BLOCK_32X32: + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); + break; + case BLOCK_32X16: + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); + if (mi_32_row_offset + 2 >= max_rows) continue; + mip2 = mip + mode_info_stride * 2; + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); + break; + case BLOCK_16X32: + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); + if (mi_32_col_offset + 2 >= max_cols) continue; + mip2 = mip + 2; + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); + break; + default: + for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { + const int shift_y_16 = shift_y_32 + shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; + const int mi_16_col_offset = + mi_32_col_offset + ((idx_16 & 1) << 1); + const int mi_16_row_offset = + mi_32_row_offset + ((idx_16 >> 1) << 1); + + if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows) + continue; + + switch (mip[0]->sb_type) { + case BLOCK_16X16: + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); + break; + case BLOCK_16X8: + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); + if (mi_16_row_offset + 1 >= max_rows) continue; + mip2 = mip + mode_info_stride; + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); + break; + case BLOCK_8X16: + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); + if (mi_16_col_offset + 1 >= max_cols) continue; + mip2 = mip + 1; + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); + break; + default: { + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); + mip += offset[0]; + for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; + const int mi_8_col_offset = + mi_16_col_offset + ((idx_8 & 1)); + const int mi_8_row_offset = + mi_16_row_offset + ((idx_8 >> 1)); + + if (mi_8_col_offset >= max_cols || + mi_8_row_offset >= max_rows) + continue; + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); + } + break; + } + } + } + break; + } + } + break; + } +} + +static void filter_selectively_vert( + uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, + unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; + mask >>= 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_8x8 & 1) { + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_4x4 & 1) { + vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + if (mask_4x4_int & 1) + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_vert( + uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, + unsigned int mask_4x4, unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; + mask >>= 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } else if (mask_8x8 & 1) { + vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } else if (mask_4x4 & 1) { + vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } + if (mask_4x4_int & 1) + vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, int mi_row, int mi_col) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_y; + const int col_step = 1 << ss_x; + const int row_step_stride = cm->mi_stride * row_step; + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + unsigned int mask_16x16[MI_BLOCK_SIZE]; + unsigned int mask_8x8[MI_BLOCK_SIZE]; + unsigned int mask_4x4[MI_BLOCK_SIZE]; + unsigned int mask_4x4_int[MI_BLOCK_SIZE]; + uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; + int r, c; + + vp9_zero(mask_16x16); + vp9_zero(mask_8x8); + vp9_zero(mask_4x4); + vp9_zero(mask_4x4_int); + vp9_zero(lfl); + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + unsigned int mask_16x16_c = 0; + unsigned int mask_8x8_c = 0; + unsigned int mask_4x4_c = 0; + unsigned int border_mask; + + // Determine the vertical edges that need filtering + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO *mi = mi_8x8[c]; + const BLOCK_SIZE sb_type = mi[0].sb_type; + const int skip_this = mi[0].skip && is_inter_block(mi); + // left edge of current unit is block/partition edge -> no skip + const int block_edge_left = + (num_4x4_blocks_wide_lookup[sb_type] > 1) + ? !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) + : 1; + const int skip_this_c = skip_this && !block_edge_left; + // top edge of current unit is block/partition edge -> no skip + const int block_edge_above = + (num_4x4_blocks_high_lookup[sb_type] > 1) + ? !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) + : 1; + const int skip_this_r = skip_this && !block_edge_above; + const TX_SIZE tx_size = get_uv_tx_size(mi, plane); + const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + + // Filter level can vary per MI + if (!(lfl[(r << 3) + (c >> ss_x)] = get_filter_level(&cm->lf_info, mi))) + continue; + + // Build masks based on the transform size of each block + if (tx_size == TX_32X32) { + if (!skip_this_c && ((c >> ss_x) & 3) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 3) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else if (tx_size == TX_16X16) { + if (!skip_this_c && ((c >> ss_x) & 1) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 1) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else { + // force 8x8 filtering on 32x32 boundaries + if (!skip_this_c) { + if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0) + mask_8x8_c |= 1 << (c >> ss_x); + else + mask_4x4_c |= 1 << (c >> ss_x); + } + + if (!skip_this_r) { + if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0) + mask_8x8[r] |= 1 << (c >> ss_x); + else + mask_4x4[r] |= 1 << (c >> ss_x); + } + + if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c) + mask_4x4_int[r] |= 1 << (c >> ss_x); + } + } + + // Disable filtering on the leftmost column + border_mask = ~(mi_col == 0 ? 1u : 0u); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_vert( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, + mask_16x16_c & border_mask, mask_8x8_c & border_mask, + mask_4x4_c & border_mask, mask_4x4_int[r], cm->lf_info.lfthr, + &lfl[r << 3], (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_vert(dst->buf, dst->stride, mask_16x16_c & border_mask, + mask_8x8_c & border_mask, + mask_4x4_c & border_mask, mask_4x4_int[r], + cm->lf_info.lfthr, &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 8 * dst->stride; + mi_8x8 += row_step_stride; + } + + // Now do horizontal pass + dst->buf = dst0; + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; + + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16[r]; + mask_8x8_r = mask_8x8[r]; + mask_4x4_r = mask_4x4[r]; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, &lfl[r << 3], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, + &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 8 * dst->stride; + } +} + +void vp9_filter_block_plane_ss00(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r; + uint64_t mask_16x16 = lfm->left_y[TX_16X16]; + uint64_t mask_8x8 = lfm->left_y[TX_8X8]; + uint64_t mask_4x4 = lfm->left_y[TX_4X4]; + uint64_t mask_4x4_int = lfm->int_4x4_y; + + assert(plane->subsampling_x == 0 && plane->subsampling_y == 0); + + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + // Disable filtering on the leftmost column. + highbd_filter_selectively_vert_row2( + plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, + (unsigned int)mask_16x16, (unsigned int)mask_8x8, + (unsigned int)mask_4x4, (unsigned int)mask_4x4_int, cm->lf_info.lfthr, + &lfm->lfl_y[r << 3], (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + // Disable filtering on the leftmost column. + filter_selectively_vert_row2( + plane->subsampling_x, dst->buf, dst->stride, (unsigned int)mask_16x16, + (unsigned int)mask_8x8, (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 16 * dst->stride; + mask_16x16 >>= 16; + mask_8x8 >>= 16; + mask_4x4 >>= 16; + mask_4x4_int >>= 16; + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_y[TX_16X16]; + mask_8x8 = lfm->above_y[TX_8X8]; + mask_4x4 = lfm->above_y[TX_4X4]; + mask_4x4_int = lfm->int_4x4_y; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xff; + mask_8x8_r = mask_8x8 & 0xff; + mask_4x4_r = mask_4x4 & 0xff; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int & 0xff, cm->lf_info.lfthr, + &lfm->lfl_y[r << 3], (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int & 0xff, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } +} + +void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r, c; + uint8_t lfl_uv[16]; + + uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; + uint16_t mask_8x8 = lfm->left_uv[TX_8X8]; + uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; + uint16_t mask_4x4_int = lfm->int_4x4_uv; + + vp9_zero(lfl_uv); + + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); + + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { + lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)]; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + // Disable filtering on the leftmost column. + highbd_filter_selectively_vert_row2( + plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, + (unsigned int)mask_16x16, (unsigned int)mask_8x8, + (unsigned int)mask_4x4, (unsigned int)mask_4x4_int, cm->lf_info.lfthr, + &lfl_uv[r << 1], (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + // Disable filtering on the leftmost column. + filter_selectively_vert_row2( + plane->subsampling_x, dst->buf, dst->stride, (unsigned int)mask_16x16, + (unsigned int)mask_8x8, (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, cm->lf_info.lfthr, &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_uv[TX_16X16]; + mask_8x8 = lfm->above_uv[TX_8X8]; + mask_4x4 = lfm->above_uv[TX_4X4]; + mask_4x4_int = lfm->int_4x4_uv; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = + skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf); + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xf; + mask_8x8_r = mask_8x8 & 0xf; + mask_4x4_r = mask_4x4 & 0xf; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, &lfl_uv[r << 1], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, + &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 4; + mask_8x8 >>= 4; + mask_4x4 >>= 4; + mask_4x4_int >>= 4; + } +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only) { + const int num_planes = y_only ? 1 : MAX_MB_PLANE; + enum lf_path path; + int mi_row, mi_col; + + if (y_only) + path = LF_PATH_444; + else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) + path = LF_PATH_420; + else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) + path = LF_PATH_444; + else + path = LF_PATH_SLOW; + + for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); + + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) { + int plane; + + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + // TODO(jimbankoski): For 444 only need to do y mask. + vp9_adjust_mask(cm, mi_row, mi_col, lfm); + + vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } + } + } +} + +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, + MACROBLOCKD *xd, int frame_filter_level, int y_only, + int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + if (!frame_filter_level) return; + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); +} + +// Used by the encoder to build the loopfilter masks. +// TODO(slavarnway): Do the encoder the same way the decoder does it and +// build the masks in line as part of the encode process. +void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level, + int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + int mi_col, mi_row; + if (!frame_filter_level) return; + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + + vp9_loop_filter_frame_init(cm, frame_filter_level); + + for (mi_row = start_mi_row; mi_row < end_mi_row; mi_row += MI_BLOCK_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + // vp9_setup_mask() zeros lfm + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + get_lfm(&cm->lf, mi_row, mi_col)); + } + } +} + +// 8x8 blocks in a superblock. A "1" represents the first block in a 16x16 +// or greater area. +static const uint8_t first_block_in_16x16[8][8] = { + { 1, 0, 1, 0, 1, 0, 1, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 1, 0, 1, 0, 1, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 1, 0, 1, 0, 1, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 1, 0, 1, 0, 1, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } +}; + +// This function sets up the bit masks for a block represented +// by mi_row, mi_col in a 64x64 region. +// TODO(SJL): This function only works for yv12. +void vp9_build_mask(VP9_COMMON *cm, const MODE_INFO *mi, int mi_row, int mi_col, + int bw, int bh) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + const int filter_level = get_filter_level(lfi_n, mi); + const TX_SIZE tx_size_uv = uv_txsize_lookup[block_size][tx_size_y][1][1]; + LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; + const int row_in_sb = (mi_row & 7); + const int col_in_sb = (mi_col & 7); + const int shift_y = col_in_sb + (row_in_sb << 3); + const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2); + const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb]; + + if (!filter_level) { + return; + } else { + int index = shift_y; + int i; + for (i = 0; i < bh; i++) { + memset(&lfm->lfl_y[index], filter_level, bw); + index += 8; + } + } + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set: + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and V set things on a 16 bit scale. + // + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (build_uv) { + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + } + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (mi->skip && is_inter_block(mi)) return; + + // Add a mask for the transform size. The transform size mask is set to + // be correct for a 64x64 prediction block size. Mask to match the size of + // the block we are working on and then shift it into place. + *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y]) + << shift_y; + *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y]) + << shift_y; + + if (build_uv) { + *above_uv |= + (size_mask_uv[block_size] & above_64x64_txform_mask_uv[tx_size_uv]) + << shift_uv; + + *left_uv |= + (size_mask_uv[block_size] & left_64x64_txform_mask_uv[tx_size_uv]) + << shift_uv; + } + + // Try to determine what to do with the internal 4x4 block boundaries. These + // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the + // internal ones can be skipped and don't depend on the prediction block size. + if (tx_size_y == TX_4X4) *int_4x4_y |= size_mask[block_size] << shift_y; + + if (build_uv && tx_size_uv == TX_4X4) + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; +} + +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) { + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->start = 0; + lf_data->stop = 0; + lf_data->y_only = 0; + memcpy(lf_data->planes, planes, sizeof(lf_data->planes)); +} + +void vp9_reset_lfm(VP9_COMMON *const cm) { + if (cm->lf.filter_level) { + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); + } +} + +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; + (void)unused; + loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only); + return 1; +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h new file mode 100644 index 0000000000..39648a72c3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_ +#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; + uint8_t lfl_y[64]; +} LOOP_FILTER_MASK; + +struct loopfilter { + int filter_level; + int last_filt_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; + + LOOP_FILTER_MASK *lfm; + int lfm_stride; +}; + +/* assorted loopfilter functions which get used elsewhere */ +struct VP9Common; +struct macroblockd; +struct VP9LfSyncData; + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, + const int mi_col, MODE_INFO **mi8x8, + const int mode_info_stride, LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_ss00(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_ss11(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_non420(struct VP9Common *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, int mi_row, int mi_col); + +void vp9_loop_filter_init(struct VP9Common *cm); + +// Update the loop filter for the current frame. +// This should be called before vp9_loop_filter_frame(), vp9_build_mask_frame() +// calls this function directly. +void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); + +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, + struct macroblockd *xd, int frame_filter_level, + int y_only, int partial_frame); + +// Get the superblock lfm for a given mi_row, mi_col. +static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf, + const int mi_row, const int mi_col) { + return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)]; +} + +void vp9_build_mask(struct VP9Common *cm, const MODE_INFO *mi, int mi_row, + int mi_col, int bw, int bh); +void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row, + const int mi_col, LOOP_FILTER_MASK *lfm); +void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level, + int partial_frame); +void vp9_reset_lfm(struct VP9Common *const cm); + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct VP9Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + + int start; + int stop; + int y_only; +} LFWorkerData; + +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); + +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.c b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c new file mode 100644 index 0000000000..cf60fa40fd --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_postproc.h" + +// TODO(jackychen): Replace this function with SSE2 code. There is +// one SSE2 implementation in vp8, so will consider how to share it +// between vp8 and vp9. +static void filter_by_weight(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int block_size, int src_weight) { + const int dst_weight = (1 << MFQE_PRECISION) - src_weight; + const int rounding_bit = 1 << (MFQE_PRECISION - 1); + int r, c; + + for (r = 0; r < block_size; r++) { + for (c = 0; c < block_size; c++) { + dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit) >> + MFQE_PRECISION; + } + src += src_stride; + dst += dst_stride; + } +} + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight); +} + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight); +} + +static void filter_by_weight32x32(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int weight) { + vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight); + vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride, weight); + vp9_filter_by_weight16x16(src + src_stride * 16, src_stride, + dst + dst_stride * 16, dst_stride, weight); + vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride, + dst + dst_stride * 16 + 16, dst_stride, weight); +} + +static void filter_by_weight64x64(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int weight) { + filter_by_weight32x32(src, src_stride, dst, dst_stride, weight); + filter_by_weight32x32(src + 32, src_stride, dst + 32, dst_stride, weight); + filter_by_weight32x32(src + src_stride * 32, src_stride, + dst + dst_stride * 32, dst_stride, weight); + filter_by_weight32x32(src + src_stride * 32 + 32, src_stride, + dst + dst_stride * 32 + 32, dst_stride, weight); +} + +static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd, + int yd_stride, const uint8_t *u, const uint8_t *v, + int uv_stride, uint8_t *ud, uint8_t *vd, + int uvd_stride, BLOCK_SIZE block_size, int weight) { + if (block_size == BLOCK_16X16) { + vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight); + vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight); + vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight); + } else if (block_size == BLOCK_32X32) { + filter_by_weight32x32(y, y_stride, yd, yd_stride, weight); + vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight); + vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight); + } else if (block_size == BLOCK_64X64) { + filter_by_weight64x64(y, y_stride, yd, yd_stride, weight); + filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight); + filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight); + } +} + +// TODO(jackychen): Determine whether replace it with assembly code. +static void copy_mem8x8(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride) { + int r; + for (r = 0; r < 8; r++) { + memcpy(dst, src, 8); + src += src_stride; + dst += dst_stride; + } +} + +static void copy_mem16x16(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride) { + int r; + for (r = 0; r < 16; r++) { + memcpy(dst, src, 16); + src += src_stride; + dst += dst_stride; + } +} + +static void copy_mem32x32(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride) { + copy_mem16x16(src, src_stride, dst, dst_stride); + copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride); + copy_mem16x16(src + src_stride * 16, src_stride, dst + dst_stride * 16, + dst_stride); + copy_mem16x16(src + src_stride * 16 + 16, src_stride, + dst + dst_stride * 16 + 16, dst_stride); +} + +static void copy_mem64x64(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride) { + copy_mem32x32(src, src_stride, dst, dst_stride); + copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride); + copy_mem32x32(src + src_stride * 32, src_stride, dst + src_stride * 32, + dst_stride); + copy_mem32x32(src + src_stride * 32 + 32, src_stride, + dst + src_stride * 32 + 32, dst_stride); +} + +static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud, + uint8_t *vd, int yd_stride, int uvd_stride, + BLOCK_SIZE bs) { + if (bs == BLOCK_16X16) { + copy_mem16x16(y, y_stride, yd, yd_stride); + copy_mem8x8(u, uv_stride, ud, uvd_stride); + copy_mem8x8(v, uv_stride, vd, uvd_stride); + } else if (bs == BLOCK_32X32) { + copy_mem32x32(y, y_stride, yd, yd_stride); + copy_mem16x16(u, uv_stride, ud, uvd_stride); + copy_mem16x16(v, uv_stride, vd, uvd_stride); + } else { + copy_mem64x64(y, y_stride, yd, yd_stride); + copy_mem32x32(u, uv_stride, ud, uvd_stride); + copy_mem32x32(v, uv_stride, vd, uvd_stride); + } +} + +static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) { + const int adj = qdiff >> MFQE_PRECISION; + if (bs == BLOCK_16X16) { + *sad_thr = 7 + adj; + } else if (bs == BLOCK_32X32) { + *sad_thr = 6 + adj; + } else { // BLOCK_64X64 + *sad_thr = 5 + adj; + } + *vdiff_thr = 125 + qdiff; +} + +static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, + const uint8_t *v, int y_stride, int uv_stride, + uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride, + int uvd_stride, int qdiff) { + int sad, sad_thr, vdiff, vdiff_thr; + uint32_t sse; + + get_thr(bs, qdiff, &sad_thr, &vdiff_thr); + + if (bs == BLOCK_16X16) { + vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; + sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; + } else if (bs == BLOCK_32X32) { + vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; + sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; + } else /* if (bs == BLOCK_64X64) */ { + vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; + sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; + } + + // vdiff > sad * 3 means vdiff should not be too small, otherwise, + // it might be a lighting change in smooth area. When there is a + // lighting change in smooth area, it is dangerous to do MFQE. + if (sad > 1 && vdiff > sad * 3) { + const int weight = 1 << MFQE_PRECISION; + int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr); + // When ifactor equals weight, no MFQE is done. + if (ifactor > weight) { + ifactor = weight; + } + apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, + uvd_stride, bs, ifactor); + } else { + // Copy the block from current frame (i.e., no mfqe is done). + copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, uvd_stride, + bs); + } +} + +static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) { + // Check the motion in current block(for inter frame), + // or check the motion in the correlated block in last frame (for keyframe). + const int mv_len_square = mi->mv[0].as_mv.row * mi->mv[0].as_mv.row + + mi->mv[0].as_mv.col * mi->mv[0].as_mv.col; + const int mv_threshold = 100; + return mi->mode >= NEARESTMV && // Not an intra block + cur_bs >= BLOCK_16X16 && mv_len_square <= mv_threshold; +} + +// Process each partiton in a super block, recursively. +static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, + const uint8_t *y, const uint8_t *u, const uint8_t *v, + int y_stride, int uv_stride, uint8_t *yd, + uint8_t *ud, uint8_t *vd, int yd_stride, + int uvd_stride) { + int mi_offset, y_offset, uv_offset; + const BLOCK_SIZE cur_bs = mi->sb_type; + const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex; + const int bsl = b_width_log2_lookup[bs]; + PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; + const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; + + if (cur_bs < BLOCK_8X8) { + // If there are blocks smaller than 8x8, it must be on the boundary. + return; + } + // No MFQE on blocks smaller than 16x16 + if (bs == BLOCK_16X16) { + partition = PARTITION_NONE; + } + if (bs == BLOCK_64X64) { + mi_offset = 4; + y_offset = 32; + uv_offset = 16; + } else { + mi_offset = 2; + y_offset = 16; + uv_offset = 8; + } + switch (partition) { + case PARTITION_HORZ: + if (bs == BLOCK_64X64) { + mfqe_bs = BLOCK_64X32; + bs_tmp = BLOCK_32X32; + } else { + mfqe_bs = BLOCK_32X16; + bs_tmp = BLOCK_16X16; + } + if (mfqe_decision(mi, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, + uvd_stride, qdiff); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride, + uv_stride, yd + y_offset, ud + uv_offset, vd + uv_offset, + yd_stride, uvd_stride, qdiff); + } + if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, + v + uv_offset * uv_stride, y_stride, uv_stride, + yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, + vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, + u + uv_offset * uv_stride + uv_offset, + v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride, + yd + y_offset * yd_stride + y_offset, + ud + uv_offset * uvd_stride + uv_offset, + vd + uv_offset * uvd_stride + uv_offset, yd_stride, + uvd_stride, qdiff); + } + break; + case PARTITION_VERT: + if (bs == BLOCK_64X64) { + mfqe_bs = BLOCK_32X64; + bs_tmp = BLOCK_32X32; + } else { + mfqe_bs = BLOCK_16X32; + bs_tmp = BLOCK_16X16; + } + if (mfqe_decision(mi, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, + uvd_stride, qdiff); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, + v + uv_offset * uv_stride, y_stride, uv_stride, + yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, + vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff); + } + if (mfqe_decision(mi + mi_offset, mfqe_bs)) { + // Do mfqe on the first square partition. + mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride, + uv_stride, yd + y_offset, ud + uv_offset, vd + uv_offset, + yd_stride, uvd_stride, qdiff); + // Do mfqe on the second square partition. + mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, + u + uv_offset * uv_stride + uv_offset, + v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride, + yd + y_offset * yd_stride + y_offset, + ud + uv_offset * uvd_stride + uv_offset, + vd + uv_offset * uvd_stride + uv_offset, yd_stride, + uvd_stride, qdiff); + } + break; + case PARTITION_NONE: + if (mfqe_decision(mi, cur_bs)) { + // Do mfqe on this partition. + mfqe_block(cur_bs, y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, + uvd_stride, qdiff); + } else { + // Copy the block from current frame(i.e., no mfqe is done). + copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, yd_stride, + uvd_stride, bs); + } + break; + case PARTITION_SPLIT: + // Recursion on four square partitions, e.g. if bs is 64X64, + // then look into four 32X32 blocks in it. + mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd, + yd_stride, uvd_stride); + mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset, + v + uv_offset, y_stride, uv_stride, yd + y_offset, + ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride); + mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize, + y + y_offset * y_stride, u + uv_offset * uv_stride, + v + uv_offset * uv_stride, y_stride, uv_stride, + yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, + vd + uv_offset * uvd_stride, yd_stride, uvd_stride); + mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset, subsize, + y + y_offset * y_stride + y_offset, + u + uv_offset * uv_stride + uv_offset, + v + uv_offset * uv_stride + uv_offset, y_stride, uv_stride, + yd + y_offset * yd_stride + y_offset, + ud + uv_offset * uvd_stride + uv_offset, + vd + uv_offset * uvd_stride + uv_offset, yd_stride, + uvd_stride); + break; + default: assert(0); + } +} + +void vp9_mfqe(VP9_COMMON *cm) { + int mi_row, mi_col; + // Current decoded frame. + const YV12_BUFFER_CONFIG *show = cm->frame_to_show; + // Last decoded frame and will store the MFQE result. + YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer; + // Loop through each super block. + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + MODE_INFO *mi; + MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col); + // Motion Info in last frame. + MODE_INFO *mi_prev = + cm->postproc_state.prev_mi + (mi_row * cm->mi_stride + mi_col); + const uint32_t y_stride = show->y_stride; + const uint32_t uv_stride = show->uv_stride; + const uint32_t yd_stride = dest->y_stride; + const uint32_t uvd_stride = dest->uv_stride; + const uint32_t row_offset_y = mi_row << 3; + const uint32_t row_offset_uv = mi_row << 2; + const uint32_t col_offset_y = mi_col << 3; + const uint32_t col_offset_uv = mi_col << 2; + const uint8_t *y = + show->y_buffer + row_offset_y * y_stride + col_offset_y; + const uint8_t *u = + show->u_buffer + row_offset_uv * uv_stride + col_offset_uv; + const uint8_t *v = + show->v_buffer + row_offset_uv * uv_stride + col_offset_uv; + uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y; + uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride + col_offset_uv; + uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride + col_offset_uv; + if (frame_is_intra_only(cm)) { + mi = mi_prev; + } else { + mi = mi_local; + } + mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud, + vd, yd_stride, uvd_stride); + } + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.h b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h new file mode 100644 index 0000000000..f53e1c2f9d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_MFQE_H_ +#define VPX_VP9_COMMON_VP9_MFQE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Multiframe Quality Enhancement. +// The aim for MFQE is to replace pixel blocks in the current frame with +// the correlated pixel blocks (with higher quality) in the last frame. +// The replacement can only be taken in stationary blocks by checking +// the motion of the blocks and other conditions such as the SAD of +// the current block and correlated block, the variance of the block +// difference, etc. +void vp9_mfqe(struct VP9Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_MFQE_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mv.h b/media/libvpx/libvpx/vp9/common/vp9_mv.h new file mode 100644 index 0000000000..76f93cf0ba --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_mv.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_MV_H_ +#define VPX_VP9_COMMON_VP9_MV_H_ + +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_MV 0x80008000 + +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, + int max_row) { + mv->col = clamp(mv->col, min_col, max_col); + mv->row = clamp(mv->row, min_row, max_row); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_MV_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.c b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.c new file mode 100644 index 0000000000..70f77aba1f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.c @@ -0,0 +1,199 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_mvref_common.h" + +// This function searches the neighborhood of a given MB/SB +// to try and find candidate reference vectors. +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int block, int mi_row, + int mi_col, uint8_t *mode_context) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type]; + int different_ref_found = 0; + int context_counter = 0; + const MV_REF *const prev_frame_mvs = + cm->use_prev_frame_mvs + ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col + : NULL; + const TileInfo *const tile = &xd->tile; + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate_mi->mode]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + } + } + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate_mi->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Check the last frame's mode and mv info. + if (cm->use_prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); + } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + + // Since we still don't have a candidate we'll try the last frame. + if (cm->use_prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] != ref_frame && + prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { + int_mv mv = prev_frame_mvs->mv[0]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done); + } + + if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && + prev_frame_mvs->ref_frame[1] != ref_frame && + prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) { + int_mv mv = prev_frame_mvs->mv[1]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done); + } + } + +Done: + + mode_context[ref_frame] = counter_to_context[context_counter]; + + // Clamp vectors + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); +} + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col, + uint8_t *mode_context) { + find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col, + mode_context); +} + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, + int_mv *nearest_mv, int_mv *near_mv) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, + int ref, int mi_row, int mi_col, + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + + assert(MAX_MV_REF_CANDIDATES == 2); + + find_mv_refs_idx(cm, xd, mi, mi->ref_frame[ref], mv_list, block, mi_row, + mi_col, mode_context); + + near_mv->as_int = 0; + switch (block) { + case 0: + nearest_mv->as_int = mv_list[0].as_int; + near_mv->as_int = mv_list[1].as_int; + break; + case 1: + case 2: + nearest_mv->as_int = bmi[0].as_mv[ref].as_int; + for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) + if (nearest_mv->as_int != mv_list[n].as_int) { + near_mv->as_int = mv_list[n].as_int; + break; + } + break; + case 3: { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + + nearest_mv->as_int = bmi[2].as_mv[ref].as_int; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (nearest_mv->as_int != candidates[n].as_int) { + near_mv->as_int = candidates[n].as_int; + break; + } + break; + } + default: assert(0 && "Invalid block index."); + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h new file mode 100644 index 0000000000..5db6772dca --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN \ + ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) + +#define MVREF_NEIGHBOURS 8 + +typedef struct position { + int row; + int col; +} POSITION; + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D207_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 9 is an invalid counter that's never used. +static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + { { -1, 0 }, + { 0, -1 }, + { -1, -1 }, + { -2, 0 }, + { 0, -2 }, + { -2, -1 }, + { -1, -2 }, + { -2, -2 } }, + // 4X8 + { { -1, 0 }, + { 0, -1 }, + { -1, -1 }, + { -2, 0 }, + { 0, -2 }, + { -2, -1 }, + { -1, -2 }, + { -2, -2 } }, + // 8X4 + { { -1, 0 }, + { 0, -1 }, + { -1, -1 }, + { -2, 0 }, + { 0, -2 }, + { -2, -1 }, + { -1, -2 }, + { -2, -2 } }, + // 8X8 + { { -1, 0 }, + { 0, -1 }, + { -1, -1 }, + { -2, 0 }, + { 0, -2 }, + { -2, -1 }, + { -1, -2 }, + { -2, -2 } }, + // 8X16 + { { 0, -1 }, + { -1, 0 }, + { 1, -1 }, + { -1, -1 }, + { 0, -2 }, + { -2, 0 }, + { -2, -1 }, + { -1, -2 } }, + // 16X8 + { { -1, 0 }, + { 0, -1 }, + { -1, 1 }, + { -1, -1 }, + { -2, 0 }, + { 0, -2 }, + { -1, -2 }, + { -2, -1 } }, + // 16X16 + { { -1, 0 }, + { 0, -1 }, + { -1, 1 }, + { 1, -1 }, + { -1, -1 }, + { -3, 0 }, + { 0, -3 }, + { -3, -3 } }, + // 16X32 + { { 0, -1 }, + { -1, 0 }, + { 2, -1 }, + { -1, -1 }, + { -1, 1 }, + { 0, -3 }, + { -3, 0 }, + { -3, -3 } }, + // 32X16 + { { -1, 0 }, + { 0, -1 }, + { -1, 2 }, + { -1, -1 }, + { 1, -1 }, + { -3, 0 }, + { 0, -3 }, + { -3, -3 } }, + // 32X32 + { { -1, 1 }, + { 1, -1 }, + { -1, 2 }, + { 2, -1 }, + { -1, -1 }, + { -3, 0 }, + { 0, -3 }, + { -3, -3 } }, + // 32X64 + { { 0, -1 }, + { -1, 0 }, + { 4, -1 }, + { -1, 2 }, + { -1, -1 }, + { 0, -3 }, + { -3, 0 }, + { 2, -1 } }, + // 64X32 + { { -1, 0 }, + { 0, -1 }, + { -1, 4 }, + { 2, -1 }, + { -1, -1 }, + { -3, 0 }, + { 0, -3 }, + { -1, 2 } }, + // 64X64 + { { -1, 3 }, + { 3, -1 }, + { -1, 4 }, + { 4, -1 }, + { -1, -1 }, + { -1, 0 }, + { 0, -1 }, + { -1, 6 } } +}; + +static const int idx_n_column_to_subblock[4][2] = { + { 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 } +}; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); +} + +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, + int search_col, int block_idx) { + return block_idx >= 0 && candidate->sb_type < BLOCK_8X8 + ? candidate + ->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mv[which_mv]; +} + +// Performs mv sign inversion if indicated by the reference frame combination. +static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv mv = mi->mv[ref]; + if (ref_sign_bias[mi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + return mv; +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to Done! +#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != (mv_ref_list)[0].as_int) { \ + (mv_ref_list)[(refmv_count)] = (mv); \ + goto Done; \ + } \ + } else { \ + (mv_ref_list)[(refmv_count)++] = (mv); \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \ + mv_ref_list, Done) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != (ref_frame)) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != (ref_frame) && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + } \ + } while (0) + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, + int mi_rows, const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); +} + +static INLINE void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1); + } +} + +typedef void (*find_mv_refs_sync)(void *const data, int mi_row); +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col, + uint8_t *mode_context); + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, + int_mv *nearest_mv, int_mv *near_mv); + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, + int ref, int mi_row, int mi_col, + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h new file mode 100644 index 0000000000..1cfc12f6fa --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_ +#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_ + +#include "./vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_util/vpx_thread.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" + +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define REFS_PER_FRAME 3 + +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) + +// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on +// the encoder. +#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME) + +#define FRAME_CONTEXTS_LOG2 2 +#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) + +#define NUM_PING_PONG_BUFFERS 2 + +extern const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]; + +typedef enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} REFERENCE_MODE; + +typedef struct { + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frame[2]; +} MV_REF; + +typedef struct { + int ref_count; + MV_REF *mvs; + int mi_rows; + int mi_cols; + uint8_t released; + + // Note that frame_index/frame_coding_index are only set by set_frame_index() + // on the encoder side. + + // TODO(angiebird): Set frame_index/frame_coding_index on the decoder side + // properly. + int frame_index; // Display order in the video, it's equivalent to the + // show_idx defined in EncodeFrameInfo. + int frame_coding_index; // The coding order (starting from zero) of this + // frame. + vpx_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; +} RefCntBuffer; + +typedef struct BufferPool { + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + // Frame buffers allocated internally by the codec. + InternalFrameBufferList int_frame_buffers; +} BufferPool; + +typedef struct VP9Common { + struct vpx_internal_error_info error; + vpx_color_space_t color_space; + vpx_color_range_t color_range; + int width; + int height; + int render_width; + int render_height; + int last_width; + int last_height; + + // TODO(jkoleszar): this implies chroma ss right now, but could vary per + // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to + // support additional planes. + int subsampling_x; + int subsampling_y; + +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; // Marks if we need to use 16bit frame buffers. +#endif + + YV12_BUFFER_CONFIG *frame_to_show; + RefCntBuffer *prev_frame; + + // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ + + // Prepare ref_frame_map for the next frame. + // Only used in frame parallel decode. + int next_ref_frame_map[REF_FRAMES]; + + // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and + // roll new_fb_idx into it. + + // Each frame can reference REFS_PER_FRAME buffers + RefBuffer frame_refs[REFS_PER_FRAME]; + + int new_fb_idx; + + int cur_show_frame_fb_idx; + +#if CONFIG_VP9_POSTPROC + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG post_proc_buffer_int; +#endif + + FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ + FRAME_TYPE frame_type; + + int show_frame; + int last_show_frame; + int show_existing_frame; + + // Flag signaling that the frame is encoded using only INTRA modes. + uint8_t intra_only; + uint8_t last_intra_only; + + int allow_high_precision_mv; + + // Flag signaling that the frame context should be reset to default values. + // 0 or 1 implies don't reset, 2 reset just the context specified in the + // frame header, 3 reset all contexts. + int reset_frame_context; + + // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in + // MODE_INFO (8-pixel) units. + int MBs; + int mb_rows, mi_rows; + int mb_cols, mi_cols; + int mi_stride; + + /* profile settings */ + TX_MODE tx_mode; + + int base_qindex; + int y_dc_delta_q; + int uv_dc_delta_q; + int uv_ac_delta_q; + int16_t y_dequant[MAX_SEGMENTS][2]; + int16_t uv_dequant[MAX_SEGMENTS][2]; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + int mi_alloc_size; + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + + // TODO(agrange): Move prev_mi into encoder structure. + // prev_mip and prev_mi will only be allocated in VP9 encoder. + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + + // Separate mi functions between encoder and decoder. + int (*alloc_mi)(struct VP9Common *cm, int mi_size); + void (*free_mi)(struct VP9Common *cm); + void (*setup_mi)(struct VP9Common *cm); + + // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible + // area will be NULL. + MODE_INFO **mi_grid_base; + MODE_INFO **mi_grid_visible; + MODE_INFO **prev_mi_grid_base; + MODE_INFO **prev_mi_grid_visible; + + // Whether to use previous frame's motion vectors for prediction. + int use_prev_frame_mvs; + + // Persistent mb segment id map used in prediction. + int seg_map_idx; + int prev_seg_map_idx; + + uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS]; + uint8_t *last_frame_seg_map; + uint8_t *current_frame_seg_map; + int seg_map_alloc_size; + + INTERP_FILTER interp_filter; + + loop_filter_info_n lf_info; + + int refresh_frame_context; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + struct loopfilter lf; + struct segmentation seg; + + // Context probabilities for reference frame prediction + MV_REFERENCE_FRAME comp_fixed_ref; + MV_REFERENCE_FRAME comp_var_ref[2]; + REFERENCE_MODE reference_mode; + + FRAME_CONTEXT *fc; /* this frame entropy */ + FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS + unsigned int frame_context_idx; /* Context to use/update */ + FRAME_COUNTS counts; + + // TODO(angiebird): current_video_frame/current_frame_coding_index into a + // structure + unsigned int current_video_frame; + // Each show or no show frame is assigned with a coding index based on its + // coding order (starting from zero). + + // Current frame's coding index. + int current_frame_coding_index; + BITSTREAM_PROFILE profile; + + // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. + vpx_bit_depth_t bit_depth; + vpx_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer + +#if CONFIG_VP9_POSTPROC + struct postproc_state postproc_state; +#endif + + int error_resilient_mode; + int frame_parallel_decoding_mode; + + int log2_tile_cols, log2_tile_rows; + int byte_alignment; + int skip_loop_filter; + + // External BufferPool passed from outside. + BufferPool *buffer_pool; + + PARTITION_CONTEXT *above_seg_context; + ENTROPY_CONTEXT *above_context; + int above_context_alloc_cols; + + int lf_row; +} VP9_COMMON; + +static INLINE void init_frame_indexes(VP9_COMMON *cm) { + cm->current_video_frame = 0; + cm->current_frame_coding_index = 0; +} + +static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) { + if (show_frame) { + // Don't increment frame counters if this was an altref buffer + // update not a real frame + ++cm->current_video_frame; + } + ++cm->current_frame_coding_index; +} + +typedef struct { + int frame_width; + int frame_height; + int render_frame_width; + int render_frame_height; + int mi_rows; + int mi_cols; + int mb_rows; + int mb_cols; + int num_mbs; + vpx_bit_depth_t bit_depth; +} FRAME_INFO; + +static INLINE void init_frame_info(FRAME_INFO *frame_info, + const VP9_COMMON *cm) { + frame_info->frame_width = cm->width; + frame_info->frame_height = cm->height; + frame_info->render_frame_width = cm->render_width; + frame_info->render_frame_height = cm->render_height; + frame_info->mi_cols = cm->mi_cols; + frame_info->mi_rows = cm->mi_rows; + frame_info->mb_cols = cm->mb_cols; + frame_info->mb_rows = cm->mb_rows; + frame_info->num_mbs = cm->MBs; + frame_info->bit_depth = cm->bit_depth; + // TODO(angiebird): Figure out how to get subsampling_x/y here +} + +static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= FRAME_BUFFERS) return NULL; + if (cm->error.error_code != VPX_CODEC_OK) return NULL; + return &cm->buffer_pool->frame_bufs[index].buf; +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) return NULL; + if (cm->ref_frame_map[index] < 0) return NULL; + assert(cm->ref_frame_map[index] < FRAME_BUFFERS); + return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf; +} + +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf; +} + +static INLINE int get_free_fb(VP9_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + for (i = 0; i < FRAME_BUFFERS; ++i) + if (frame_bufs[i].ref_count == 0) break; + + if (i != FRAME_BUFFERS) { + frame_bufs[i].ref_count = 1; + } else { + // Reset i to be INVALID_IDX to indicate no free buffer found. + i = INVALID_IDX; + } + + return i; +} + +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { + const int ref_index = *idx; + + if (ref_index >= 0 && bufs[ref_index].ref_count > 0) + bufs[ref_index].ref_count--; + + *idx = new_idx; + + bufs[new_idx].ref_count++; +} + +static INLINE int mi_cols_aligned_to_sb(int n_mis) { + return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); +} + +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE void set_partition_probs(const VP9_COMMON *const cm, + MACROBLOCKD *const xd) { + xd->partition_probs = + frame_is_intra_only(cm) + ? &vp9_kf_partition_probs[0] + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; +} + +static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + xd->above_context[i] = + cm->above_context + + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); + + if (get_plane_type(i) == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant)); + } else { + memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant)); + } + xd->fc = cm->fc; + } + + xd->above_seg_context = cm->above_seg_context; + xd->mi_stride = cm->mi_stride; + xd->error_info = &cm->error; + + set_partition_probs(cm, xd); +} + +static INLINE const vpx_prob *get_partition_probs(const MACROBLOCKD *xd, + int ctx) { + return xd->partition_probs[ctx]; +} + +static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { + const int above_idx = mi_col * 2; + const int left_idx = (mi_row * 2) & 15; + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; + pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. + return len + MI_BLOCK_SIZE; +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; + + // Are edges available for intra prediction? + xd->above_mi = (mi_row != 0) ? xd->mi[-xd->mi_stride] : NULL; + xd->left_mi = (mi_col > tile->mi_col_start) ? xd->mi[-1] : NULL; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + + // num_4x4_blocks_wide_lookup[bsize] / 2 + const int bs = num_8x8_blocks_wide_lookup[bsize]; + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. + memset(above_ctx, partition_context_lookup[subsize].above, bs); + memset(left_ctx, partition_context_lookup[subsize].left, bs); +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + const int bsl = mi_width_log2_lookup[bsize]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.c b/media/libvpx/libvpx/vp9/common/vp9_postproc.c new file mode 100644 index 0000000000..96519f0051 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "./vp9_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/postproc.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_postproc.h" + +#if CONFIG_VP9_POSTPROC + +static const uint8_t q_diff_thresh = 20; +static const uint8_t last_q_thresh = 170; +extern const int16_t vpx_rv[]; + +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t kernel5[] = { 1, 1, 4, 1, 1 }; + +void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int rows, + int cols, int flimit) { + uint16_t const *p_src; + uint16_t *p_dst; + int row, col, i, v, kernel; + int pitch = src_pixels_per_line; + uint16_t d[8]; + + for (row = 0; row < rows; row++) { + // post_proc_down for one row. + p_src = src_ptr; + p_dst = dst_ptr; + + for (col = 0; col < cols; col++) { + kernel = 4; + v = p_src[col]; + + for (i = -2; i <= 2; i++) { + if (abs(v - p_src[col + i * pitch]) > flimit) goto down_skip_convolve; + + kernel += kernel5[2 + i] * p_src[col + i * pitch]; + } + + v = (kernel >> 3); + + down_skip_convolve: + p_dst[col] = v; + } + + /* now post_proc_across */ + p_src = dst_ptr; + p_dst = dst_ptr; + + for (i = 0; i < 8; i++) d[i] = p_src[i]; + + for (col = 0; col < cols; col++) { + kernel = 4; + v = p_src[col]; + + d[col & 7] = v; + + for (i = -2; i <= 2; i++) { + if (abs(v - p_src[col + i]) > flimit) goto across_skip_convolve; + + kernel += kernel5[2 + i] * p_src[col + i]; + } + + d[col & 7] = (kernel >> 3); + + across_skip_convolve: + if (col >= 2) p_dst[col - 2] = d[(col - 2) & 7]; + } + + /* handle the last two pixels */ + p_dst[col - 2] = d[(col - 2) & 7]; + p_dst[col - 1] = d[(col - 1) & 7]; + + /* next row */ + src_ptr += pitch; + dst_ptr += dst_pixels_per_line; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, + int cols, int flimit) { + int r, c, i; + + uint16_t *s = src; + uint16_t d[16]; + + for (r = 0; r < rows; r++) { + int sumsq = 0; + int sum = 0; + + for (i = -8; i <= 6; i++) { + sumsq += s[i] * s[i]; + sum += s[i]; + d[i + 8] = 0; + } + + for (c = 0; c < cols + 8; c++) { + int x = s[c + 7] - s[c - 8]; + int y = s[c + 7] + s[c - 8]; + + sum += x; + sumsq += x * y; + + d[c & 15] = s[c]; + + if (sumsq * 15 - sum * sum < flimit) { + d[c & 15] = (8 + sum + s[c]) >> 4; + } + + s[c - 8] = d[(c - 8) & 15]; + } + + s += pitch; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols, + int flimit) { + int r, c, i; + const int16_t *rv3 = &vpx_rv[63 & rand()]; // NOLINT + + for (c = 0; c < cols; c++) { + uint16_t *s = &dst[c]; + int sumsq = 0; + int sum = 0; + uint16_t d[16]; + const int16_t *rv2 = rv3 + ((c * 17) & 127); + + for (i = -8; i <= 6; i++) { + sumsq += s[i * pitch] * s[i * pitch]; + sum += s[i * pitch]; + } + + for (r = 0; r < rows + 8; r++) { + sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch]; + sum += s[7 * pitch] - s[-8 * pitch]; + d[r & 15] = s[0]; + + if (sumsq * 15 - sum * sum < flimit) { + d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + } + + s[-8 * pitch] = d[(r - 8) & 15]; + s += pitch; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void deblock_and_de_macro_block(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, int q, + int low_var_thresh, int flag, + uint8_t *limits) { + (void)low_var_thresh; + (void)flag; +#if CONFIG_VP9_HIGHBITDEPTH + if (source->flags & YV12_FLAG_HIGHBITDEPTH) { + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + vp9_highbd_post_proc_down_and_across( + CONVERT_TO_SHORTPTR(source->y_buffer), + CONVERT_TO_SHORTPTR(post->y_buffer), source->y_stride, post->y_stride, + source->y_height, source->y_width, ppl); + + vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer), + post->y_stride, post->y_height, + post->y_width, q2mbl(q)); + + vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer), + post->y_stride, post->y_height, post->y_width, + q2mbl(q)); + + vp9_highbd_post_proc_down_and_across( + CONVERT_TO_SHORTPTR(source->u_buffer), + CONVERT_TO_SHORTPTR(post->u_buffer), source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); + vp9_highbd_post_proc_down_and_across( + CONVERT_TO_SHORTPTR(source->v_buffer), + CONVERT_TO_SHORTPTR(post->v_buffer), source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_deblock(cm, source, post, q, limits); + vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); + vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, + post->y_width, q2mbl(q)); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) { + const int ppl = + (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + 0.0065 + 0.5); +#if CONFIG_VP9_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + int i; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, + src->uv_stride }; + const int src_widths[3] = { src->y_width, src->uv_width, src->uv_width }; + const int src_heights[3] = { src->y_height, src->uv_height, + src->uv_height }; + + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, + dst->uv_stride }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + vp9_highbd_post_proc_down_and_across( + CONVERT_TO_SHORTPTR(srcs[i]), CONVERT_TO_SHORTPTR(dsts[i]), + src_strides[i], dst_strides[i], src_heights[i], src_widths[i], ppl); + } + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + int mbr; + const int mb_rows = cm->mb_rows; + const int mb_cols = cm->mb_cols; + memset(limits, (unsigned char)ppl, 16 * mb_cols); + + for (mbr = 0; mbr < mb_rows; mbr++) { + vpx_post_proc_down_and_across_mb_row( + src->y_buffer + 16 * mbr * src->y_stride, + dst->y_buffer + 16 * mbr * dst->y_stride, src->y_stride, + dst->y_stride, src->y_width, limits, 16); + vpx_post_proc_down_and_across_mb_row( + src->u_buffer + 8 * mbr * src->uv_stride, + dst->u_buffer + 8 * mbr * dst->uv_stride, src->uv_stride, + dst->uv_stride, src->uv_width, limits, 8); + vpx_post_proc_down_and_across_mb_row( + src->v_buffer + 8 * mbr * src->uv_stride, + dst->v_buffer + 8 * mbr * dst->uv_stride, src->uv_stride, + dst->uv_stride, src->uv_width, limits, 8); + } +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) { + vp9_deblock(cm, src, dst, q, limits); +} + +static void swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO *temp = cm->postproc_state.prev_mip; + cm->postproc_state.prev_mip = cm->mip; + cm->mip = temp; + + // Update the upper left visible macroblock ptrs. + cm->mi = cm->mip + cm->mi_stride + 1; + cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1; +} + +int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *ppflags, int unscaled_width) { + const int q = VPXMIN(105, cm->lf.filter_level * 2); + const int flags = ppflags->post_proc_flag; + YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; + struct postproc_state *const ppstate = &cm->postproc_state; + + if (!cm->frame_to_show) return -1; + + if (!flags) { + *dest = *cm->frame_to_show; + return 0; + } + + vpx_clear_system_state(); + + // Alloc memory for prev_mip in the first frame. + if (cm->current_video_frame == 1) { + ppstate->last_base_qindex = cm->base_qindex; + ppstate->last_frame_valid = 1; + } + + if ((flags & VP9D_MFQE) && ppstate->prev_mip == NULL) { + ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip)); + if (!ppstate->prev_mip) { + return 1; + } + ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1; + } + + // Allocate post_proc_buffer_int if needed. + if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) { + if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { + const int width = ALIGN_POWER_OF_TWO(cm->width, 4); + const int height = ALIGN_POWER_OF_TWO(cm->height, 4); + + if (vpx_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif // CONFIG_VP9_HIGHBITDEPTH + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment) < 0) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate MFQE framebuffer"); + } + + // Ensure that postproc is set to all 0s so that post proc + // doesn't pull random data in from edge. + memset(cm->post_proc_buffer_int.buffer_alloc, 128, + cm->post_proc_buffer.frame_size); + } + } + + if (vpx_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL) < 0) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate post-processing buffer"); + + if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { + if (!cm->postproc_state.limits) { + cm->postproc_state.limits = + vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); + if (!cm->postproc_state.limits) return 1; + } + } + + if (flags & VP9D_ADDNOISE) { + if (!cm->postproc_state.generated_noise) { + cm->postproc_state.generated_noise = vpx_calloc( + cm->width + 256, sizeof(*cm->postproc_state.generated_noise)); + if (!cm->postproc_state.generated_noise) return 1; + } + } + + if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 && + ppstate->last_frame_valid && cm->bit_depth == 8 && + ppstate->last_base_qindex <= last_q_thresh && + cm->base_qindex - ppstate->last_base_qindex >= q_diff_thresh) { + vp9_mfqe(cm); + // TODO(jackychen): Consider whether enable deblocking by default + // if mfqe is enabled. Need to take both the quality and the speed + // into consideration. + if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + } + if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { + deblock_and_de_macro_block(cm, &cm->post_proc_buffer_int, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, 1, 0, + cm->postproc_state.limits); + } else if (flags & VP9D_DEBLOCK) { + vp9_deblock(cm, &cm->post_proc_buffer_int, ppbuf, q, + cm->postproc_state.limits); + } else { + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + } + } else if (flags & VP9D_DEMACROBLOCK) { + deblock_and_de_macro_block(cm, cm->frame_to_show, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, 1, 0, + cm->postproc_state.limits); + } else if (flags & VP9D_DEBLOCK) { + vp9_deblock(cm, cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); + } else { + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); + } + + ppstate->last_base_qindex = cm->base_qindex; + ppstate->last_frame_valid = 1; + if (flags & VP9D_ADDNOISE) { + const int noise_level = ppflags->noise_level; + if (ppstate->last_q != q || ppstate->last_noise != noise_level) { + double sigma; + vpx_clear_system_state(); + sigma = noise_level + .5 + .6 * q / 63.0; + ppstate->clamp = + vpx_setup_noise(sigma, ppstate->generated_noise, cm->width + 256); + ppstate->last_q = q; + ppstate->last_noise = noise_level; + } + vpx_plane_add_noise(ppbuf->y_buffer, ppstate->generated_noise, + ppstate->clamp, ppstate->clamp, ppbuf->y_width, + ppbuf->y_height, ppbuf->y_stride); + } + + *dest = *ppbuf; + + /* handle problem with extending borders */ + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = dest->y_width >> cm->subsampling_x; + dest->uv_height = dest->y_height >> cm->subsampling_y; + + if (flags & VP9D_MFQE) swap_mi_and_prev_mi(cm); + return 0; +} +#endif // CONFIG_VP9_POSTPROC diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.h b/media/libvpx/libvpx/vp9/common/vp9_postproc.h new file mode 100644 index 0000000000..bbe3aed835 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_ +#define VPX_VP9_COMMON_VP9_POSTPROC_H_ + +#include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_mfqe.h" +#include "vp9/common/vp9_ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct postproc_state { + int last_q; + int last_noise; + int last_base_qindex; + int last_frame_valid; + MODE_INFO *prev_mip; + MODE_INFO *prev_mi; + int clamp; + uint8_t *limits; + int8_t *generated_noise; +}; + +struct VP9Common; + +#define MFQE_PRECISION 4 + +int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *ppflags, int unscaled_width); + +void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); + +void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_ppflags.h b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h new file mode 100644 index 0000000000..a0e3017626 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_ +#define VPX_VP9_COMMON_VP9_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + VP9D_NOFILTERING = 0, + VP9D_DEBLOCK = 1 << 0, + VP9D_DEMACROBLOCK = 1 << 1, + VP9D_ADDNOISE = 1 << 2, + VP9D_MFQE = 1 << 3 +}; + +typedef struct { + int post_proc_flag; + int deblocking_level; + int noise_level; +} vp9_ppflags_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.c b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c new file mode 100644 index 0000000000..375cb4d76c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c @@ -0,0 +1,316 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_seg_common.h" + +int vp9_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; + + return 0; +} + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int ctx; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mi) && !has_second_ref(left_mi)) + // neither edge uses comp pred (0/1) + ctx = (above_mi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mi->ref_frame[0] == cm->comp_fixed_ref); + else if (!has_second_ref(above_mi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (above_mi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(above_mi)); + else if (!has_second_ref(left_mi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (left_mi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(left_mi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + + if (!has_second_ref(edge_mi)) + // edge does not use comp pred (0/1) + ctx = edge_mi->ref_frame[0] == cm->comp_fixed_ref; + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +// Returns a context number for the given MB prediction signal +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_in_image = !!above_mi; + const int left_in_image = !!left_mi; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int var_ref_idx = !fix_ref_idx; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra (2) + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi; + + if (!has_second_ref(edge_mi)) // single pred (1/3) + pred_context = 1 + 2 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]); + else // comp pred (1/3) + pred_context = + 1 + 2 * (edge_mi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]); + } else { // inter/inter + const int l_sg = !has_second_ref(left_mi); + const int a_sg = !has_second_ref(above_mi); + const MV_REFERENCE_FRAME vrfa = + a_sg ? above_mi->ref_frame[0] : above_mi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfl = + l_sg ? left_mi->ref_frame[0] : left_mi->ref_frame[var_ref_idx]; + + if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { + pred_context = 0; + } else if (l_sg && a_sg) { // single/single + if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || + (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) + pred_context = 4; + else if (vrfa == vrfl) + pred_context = 3; + else + pred_context = 1; + } else if (l_sg || a_sg) { // single/comp + const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) + pred_context = 1; + else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) + pred_context = 2; + else + pred_context = 4; + } else if (vrfa == vrfl) { // comp/comp + pred_context = 4; + } else { + pred_context = 2; + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge_mi = above_in_image ? above_mi : left_mi; + + if (!is_inter_block(edge_mi)) { + pred_context = 2; + } else { + if (has_second_ref(edge_mi)) + pred_context = + 4 * (edge_mi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]); + else + pred_context = 3 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]); + } + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + + return pred_context; +} + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter or inter/intra + const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi; + if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME || + edge_mi->ref_frame[1] == LAST_FRAME); + } else { // inter/inter + const int above_has_second = has_second_ref(above_mi); + const int left_has_second = has_second_ref(left_mi); + const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1]; + + if (above_has_second && left_has_second) { + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; + + if (rfs == LAST_FRAME) + pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + else + pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); + } + } + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + if (!is_inter_block(edge_mi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME || + edge_mi->ref_frame[1] == LAST_FRAME); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter or inter/intra + const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi; + if (!has_second_ref(edge_mi)) { + if (edge_mi->ref_frame[0] == LAST_FRAME) + pred_context = 3; + else + pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); + } + } else { // inter/inter + const int above_has_second = has_second_ref(above_mi); + const int left_has_second = has_second_ref(left_mi); + const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1]; + + if (above_has_second && left_has_second) { + if (above0 == left0 && above1 == left1) + pred_context = + 3 * (above0 == GOLDEN_FRAME || above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || left1 == GOLDEN_FRAME); + else + pred_context = 2; + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; + + if (rfs == GOLDEN_FRAME) + pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + else if (rfs == ALTREF_FRAME) + pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; + else + pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { + pred_context = 3; + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = + (above0 == LAST_FRAME) ? left0 : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); + } else { + pred_context = + 2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME); + } + } + } + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + + if (!is_inter_block(edge_mi) || + (edge_mi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mi))) + pred_context = 2; + else if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); + else + pred_context = 3 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.h b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h new file mode 100644 index 0000000000..ee59669359 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_ +#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE int get_segment_id(const VP9_COMMON *cm, + const uint8_t *segment_ids, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y, segment_id = MAX_SEGMENTS; + + for (y = 0; y < ymis; ++y) + for (x = 0; x < xmis; ++x) + segment_id = + VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0; + + return above_sip + left_sip; +} + +static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, + const MACROBLOCKD *xd) { + return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; +} + +static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_skip = (above_mi != NULL) ? above_mi->skip : 0; + const int left_skip = (left_mi != NULL) ? left_mi->skip : 0; + return above_skip + left_skip; +} + +static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->skip_probs[vp9_get_skip_context(xd)]; +} + +// Returns a context number for the given MB prediction signal +static INLINE int get_pred_context_switchable_interp(const MACROBLOCKD *xd) { + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + const MODE_INFO *const left_mi = xd->left_mi; + const int left_type = left_mi ? left_mi->interp_filter : SWITCHABLE_FILTERS; + const MODE_INFO *const above_mi = xd->above_mi; + const int above_type = + above_mi ? above_mi->interp_filter : SWITCHABLE_FILTERS; + + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS) + return above_type; + else if (above_type == SWITCHABLE_FILTERS) + return left_type; + else + return SWITCHABLE_FILTERS; +} + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +static INLINE int get_intra_inter_context(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? above_mi : left_mi); + } + return 0; +} + +static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->intra_inter_prob[get_intra_inter_context(xd)]; +} + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; +} + +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd); + return cm->fc->comp_ref_prob[pred_context]; +} + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; +} + +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; +} + +int vp9_compound_reference_allowed(const VP9_COMMON *cm); + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm); + +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { + const int max_tx_size = max_txsize_lookup[xd->mi[0]->sb_type]; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + int above_ctx = + (has_above && !above_mi->skip) ? (int)above_mi->tx_size : max_tx_size; + int left_ctx = + (has_left && !left_mi->skip) ? (int)left_mi->tx_size : max_tx_size; + if (!has_left) left_ctx = above_ctx; + + if (!has_above) above_ctx = left_ctx; + + return (above_ctx + left_ctx) > max_tx_size; +} + +static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { + switch (max_tx_size) { + case TX_8X8: return tx_probs->p8x8[ctx]; + case TX_16X16: return tx_probs->p16x16[ctx]; + case TX_32X32: return tx_probs->p32x32[ctx]; + default: assert(0 && "Invalid max_tx_size."); return NULL; + } +} + +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { + switch (max_tx_size) { + case TX_8X8: return tx_counts->p8x8[ctx]; + case TX_16X16: return tx_counts->p16x16[ctx]; + case TX_32X32: return tx_counts->p32x32[ctx]; + default: assert(0 && "Invalid max_tx_size."); return NULL; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_quant_common.c b/media/libvpx/libvpx/vp9/common/vp9_quant_common.c new file mode 100644 index 0000000000..1dc18dc6df --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_quant_common.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +static const int16_t dc_qlookup[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, +}; + +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t dc_qlookup_10[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, +}; +#endif + +static const int16_t ac_qlookup[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t ac_qlookup_10[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, + 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, + 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, + 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, + 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, + 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, + 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, + 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, + 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, + 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, + 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, + 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, + 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, + 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, + 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, + 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, + 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, + 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, + 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, + 28143, 28687, 29247, +}; +#endif + +int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void)bit_depth; + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif +} + +int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void)bit_depth; + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif +} + +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + const int seg_qindex = + seg->abs_delta == SEGMENT_ABSDATA ? data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); + } else { + return base_qindex; + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_quant_common.h b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h new file mode 100644 index 0000000000..ec8b9f4c6a --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ + +#include "vpx/vpx_codec.h" +#include "vp9/common/vp9_seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MINQ 0 +#define MAXQ 255 +#define QINDEX_RANGE (MAXQ - MINQ + 1) +#define QINDEX_BITS 8 + +int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth); +int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth); + +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.c b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c new file mode 100644 index 0000000000..0a60b853d8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_scale_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" + +#include "vpx/vpx_integer.h" +#include "vpx_scale/yv12config.h" + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_build_inter_predictor( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, + const InterpKernel *kernel, enum mv_precision precision, int x, int y, + int bd) { + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, + bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + int ref, const InterpKernel *kernel, + enum mv_precision precision, int x, int y) { + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf, w, + h, ref, kernel, sf->x_step_q4, sf->y_step_q4); +} + +static INLINE int round_mv_comp_q4(int value) { + return (value < 0 ? value - 2 : value + 2) / 4; +} + +static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; + return res; +} + +static INLINE int round_mv_comp_q2(int value) { + return (value < 0 ? value - 1 : value + 1) / 2; +} + +static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) { + MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row + + mi->bmi[block1].as_mv[idx].as_mv.row), + round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col + + mi->bmi[block1].as_mv[idx].as_mv.col) }; + return res; +} + +// TODO(jkoleszar): yet another mv clamping function :-( +MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, + int bh, int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { (short)(src_mv->row * (1 << (1 - ss_y))), + (short)(src_mv->col * (1 << (1 - ss_x))) }; + assert(ss_x <= 1); + assert(ss_y <= 1); + + clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom); + + return clamped_mv; +} + +MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi, + int ref, int block) { + const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0); + MV res = { 0, 0 }; + switch (ss_idx) { + case 0: res = mi->bmi[block].as_mv[ref].as_mv; break; + case 1: res = mi_mv_pred_q2(mi, ref, block, block + 2); break; + case 2: res = mi_mv_pred_q2(mi, ref, block, block + 1); break; + case 3: res = mi_mv_pred_q4(mi, ref); break; + default: assert(ss_idx <= 3 && ss_idx >= 0); + } + return res; +} + +static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, int x, int y, int w, int h, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + (int64_t)dst_buf->stride * y + x; + const MV mv = mi->sb_type < BLOCK_8X8 + ? average_split_mvs(pd, mi, ref, block) + : mi->mv[ref].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. + // mv_precision precision is MV_PRECISION_Q4. + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + + uint8_t *pre; + MV32 scaled_mv; + int xs, ys, subpel_x, subpel_y; + const int is_scaled = vp9_is_scaled(sf); + + if (is_scaled) { + // Co-ordinate of containing block to pixel precision. + const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; +#if 0 // CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->sb_type != BLOCK_4X8 && + xd->mi[0]->sb_type != BLOCK_8X4); + assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); +#endif + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; + + pre_buf->buf += + scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); + pre = pre_buf->buf; + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + pre = pre_buf->buf + ((int64_t)y * pre_buf->stride + x); + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + + (scaled_mv.col >> SUBPEL_BITS); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, + xd->bd); + } else { + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + +static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col, + int plane_from, int plane_to) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi[0]->sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4, + mi_x, mi_y); + } else { + build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh, mi_x, mi_y); + } + } +} + +void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0); +} + +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane); +} + +void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1, + MAX_MB_PLANE - 1); +} + +void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, + MAX_MB_PLANE - 1); +} + +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, + int mi_col) { + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride }; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &planes[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf) { + if (src != NULL) { + int i; + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, + sf, pd->subsampling_x, pd->subsampling_y); + } + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.h b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h new file mode 100644 index 0000000000..12b545831a --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_ +#define VPX_VP9_COMMON_VP9_RECONINTER_H_ + +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int subpel_x, const int subpel_y, + const struct scale_factors *sf, int w, int h, + int ref, const InterpKernel *kernel, int xs, + int ys) { + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_inter_predictor( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const int subpel_x, const int subpel_y, const struct scale_factors *sf, + int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { + sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi, + int ref, int block); + +MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, + int bh, int ss_x, int ss_y); + +void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane); + +void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + int ref, const InterpKernel *kernel, + enum mv_precision precision, int x, int y); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_build_inter_predictor( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, + const InterpKernel *kernel, enum mv_precision precision, int x, int y, + int bd); +#endif + +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { + const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; + const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; + return (int64_t)y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, + int stride, int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->stride = stride; +} + +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, + int mi_col); + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_RECONINTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconintra.c b/media/libvpx/libvpx/vp9/common/vp9_reconintra.c new file mode 100644 index 0000000000..3e5ed616d3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_reconintra.c @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_once.h" + +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_onyxc_int.h" + +const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D207 + ADST_DCT, // D63 + ADST_ADST, // TM +}; + +enum { + NEED_LEFT = 1 << 1, + NEED_ABOVE = 1 << 2, + NEED_ABOVERIGHT = 1 << 3, +}; + +static const uint8_t extend_modes[INTRA_MODES] = { + NEED_ABOVE | NEED_LEFT, // DC + NEED_ABOVE, // V + NEED_LEFT, // H + NEED_ABOVERIGHT, // D45 + NEED_LEFT | NEED_ABOVE, // D135 + NEED_LEFT | NEED_ABOVE, // D117 + NEED_LEFT | NEED_ABOVE, // D153 + NEED_LEFT, // D207 + NEED_ABOVERIGHT, // D63 + NEED_LEFT | NEED_ABOVE, // TM +}; + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[INTRA_MODES][TX_SIZES]; +static intra_pred_fn dc_pred[2][2][TX_SIZES]; + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][4]; +static intra_high_pred_fn dc_pred_high[2][2][4]; +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void vp9_init_intra_predictors_internal(void) { +#define INIT_ALL_SIZES(p, type) \ + p[TX_4X4] = vpx_##type##_predictor_4x4; \ + p[TX_8X8] = vpx_##type##_predictor_8x8; \ + p[TX_16X16] = vpx_##type##_predictor_16x16; \ + p[TX_32X32] = vpx_##type##_predictor_32x32 + + INIT_ALL_SIZES(pred[V_PRED], v); + INIT_ALL_SIZES(pred[H_PRED], h); + INIT_ALL_SIZES(pred[D207_PRED], d207); + INIT_ALL_SIZES(pred[D45_PRED], d45); + INIT_ALL_SIZES(pred[D63_PRED], d63); + INIT_ALL_SIZES(pred[D117_PRED], d117); + INIT_ALL_SIZES(pred[D135_PRED], d135); + INIT_ALL_SIZES(pred[D153_PRED], d153); + INIT_ALL_SIZES(pred[TM_PRED], tm); + + INIT_ALL_SIZES(dc_pred[0][0], dc_128); + INIT_ALL_SIZES(dc_pred[0][1], dc_top); + INIT_ALL_SIZES(dc_pred[1][0], dc_left); + INIT_ALL_SIZES(dc_pred[1][1], dc); + +#if CONFIG_VP9_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); + INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207); + INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45); + INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63); + INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117); + INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135); + INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153); + INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm); + + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#undef intra_pred_allsizes +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void build_intra_predictors_high( + const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, + int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, + int left_available, int right_available, int x, int y, int plane, int bd) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_col[32]); + DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]); + uint16_t *above_row = above_data + 16; + const uint16_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT; + int base = 128 << (bd - 8); + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1. + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // NEED_LEFT + if (need_left) { + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) left_col[i] = ref[i * ref_stride - 1]; + } + } else { + vpx_memset16(left_col, base + 1, bs); + } + } + + // NEED_ABOVE + if (need_above) { + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + bs <= frame_width) { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + } + } + above_row[-1] = left_available ? above_ref[-1] : (base + 1); + } else { + vpx_memset16(above_row, base - 1, bs); + above_row[-1] = base - 1; + } + } + + // NEED_ABOVERIGHT + if (need_aboveright) { + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0])); + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + above_row[-1] = left_available ? above_ref[-1] : (base + 1); + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + if (bs == 4 && right_available) + memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0])); + else + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? above_ref[-1] : (base + 1); + } + } + } else { + vpx_memset16(above_row, base - 1, bs * 2); + above_row[-1] = base - 1; + } + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[left_available][up_available][tx_size]( + dst, dst_stride, const_above_row, left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col, + xd->bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, + int up_available, int left_available, + int right_available, int x, int y, + int plane) { + int i; + DECLARE_ALIGNED(16, uint8_t, left_col[32]); + DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]); + uint8_t *above_row = above_data + 16; + const uint8_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // NEED_LEFT + if (extend_modes[mode] & NEED_LEFT) { + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) left_col[i] = ref[i * ref_stride - 1]; + } + } else { + memset(left_col, 129, bs); + } + } + + // NEED_ABOVE + if (extend_modes[mode] & NEED_ABOVE) { + if (up_available) { + const uint8_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + bs <= frame_width) { + memcpy(above_row, above_ref, bs); + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs); + } + } + above_row[-1] = left_available ? above_ref[-1] : 129; + } else { + memset(above_row, 127, bs); + above_row[-1] = 127; + } + } + + // NEED_ABOVERIGHT + if (extend_modes[mode] & NEED_ABOVERIGHT) { + if (up_available) { + const uint8_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + memcpy(above_row, above_ref, 2 * bs); + } else { + memcpy(above_row, above_ref, bs); + memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); + } else { + memcpy(above_row, above_ref, bs); + memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs); + if (bs == 4 && right_available) + memcpy(above_row + bs, above_ref + bs, bs); + else + memset(above_row + bs, above_row[bs - 1], bs); + } + } + above_row[-1] = left_available ? above_ref[-1] : 129; + } else { + memset(above_row, 127, bs * 2); + above_row[-1] = 127; + } + } + + // predict + if (mode == DC_PRED) { + dc_pred[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, left_col); + } else { + pred[mode][tx_size](dst, dst_stride, const_above_row, left_col); + } +} + +void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, + PREDICTION_MODE mode, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int aoff, int loff, int plane) { + const int bw = (1 << bwl_in); + const int txw = (1 << tx_size); + const int have_top = loff || (xd->above_mi != NULL); + const int have_left = aoff || (xd->left_mi != NULL); + const int have_right = (aoff + txw) < bw; + const int x = aoff * 4; + const int y = loff * 4; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode, + tx_size, have_top, have_left, have_right, x, y, + plane, xd->bd); + return; + } +#endif + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right, x, y, plane); +} + +void vp9_init_intra_predictors(void) { + once(vp9_init_intra_predictors_internal); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconintra.h b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h new file mode 100644 index 0000000000..426a35ebfa --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_ +#define VPX_VP9_COMMON_VP9_RECONINTRA_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_init_intra_predictors(void); + +void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, + PREDICTION_MODE mode, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int aoff, int loff, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c new file mode 100644 index 0000000000..37762ca15a --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vp9_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vp9_rtcd() { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl new file mode 100644 index 0000000000..3ecbd5417f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -0,0 +1,228 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +sub vp9_common_forward_decls() { +print <x_scale_fp >> REF_SCALE_SHIFT); +} + +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT); +} + +static int unscaled_value(int val, const struct scale_factors *sf) { + (void)sf; + return val; +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return (other_size << REF_SCALE_SHIFT) / this_size; +} + +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK; + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK; + const MV32 res = { scaled_y(mv->row, sf) + y_off_q4, + scaled_x(mv->col, sf) + x_off_q4 }; + return res; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h, + int use_highbd) { +#else +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h) { +#endif + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + sf->x_step_q4 = scaled_x(16, sf); + sf->y_step_q4 = scaled_y(16, sf); + + if (vp9_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; + } else { + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; + } + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { + // No scaling in either direction. + sf->predict[0][0][0] = vpx_convolve_copy; + sf->predict[0][0][1] = vpx_convolve_avg; + sf->predict[0][1][0] = vpx_convolve8_vert; + sf->predict[0][1][1] = vpx_convolve8_avg_vert; + sf->predict[1][0][0] = vpx_convolve8_horiz; + sf->predict[1][0][1] = vpx_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + sf->predict[0][0][0] = vpx_scaled_vert; + sf->predict[0][0][1] = vpx_scaled_avg_vert; + sf->predict[0][1][0] = vpx_scaled_vert; + sf->predict[0][1][1] = vpx_scaled_avg_vert; + sf->predict[1][0][0] = vpx_scaled_2d; + sf->predict[1][0][1] = vpx_scaled_avg_2d; + } + } else { + if (sf->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + sf->predict[0][0][0] = vpx_scaled_horiz; + sf->predict[0][0][1] = vpx_scaled_avg_horiz; + sf->predict[0][1][0] = vpx_scaled_2d; + sf->predict[0][1][1] = vpx_scaled_avg_2d; + sf->predict[1][0][0] = vpx_scaled_horiz; + sf->predict[1][0][1] = vpx_scaled_avg_horiz; + } else { + // Must always scale in both directions. + sf->predict[0][0][0] = vpx_scaled_2d; + sf->predict[0][0][1] = vpx_scaled_avg_2d; + sf->predict[0][1][0] = vpx_scaled_2d; + sf->predict[0][1][1] = vpx_scaled_avg_2d; + sf->predict[1][0][0] = vpx_scaled_2d; + sf->predict[1][0][1] = vpx_scaled_avg_2d; + } + } + + // 2D subpel motion always gets filtered in both directions + + if ((sf->x_step_q4 != 16) || (sf->y_step_q4 != 16)) { + sf->predict[1][1][0] = vpx_scaled_2d; + sf->predict[1][1][1] = vpx_scaled_avg_2d; + } else { + sf->predict[1][1][0] = vpx_convolve8; + sf->predict[1][1][1] = vpx_convolve8_avg; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd) { + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { + // No scaling in either direction. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; + } + } else { + if (sf->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions. + sf->highbd_predict[1][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg; + } +#endif +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_scale.h b/media/libvpx/libvpx/vp9/common/vp9_scale.h new file mode 100644 index 0000000000..2f3b609483 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_scale.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_SCALE_H_ +#define VPX_VP9_COMMON_VP9_SCALE_H_ + +#include "vp9/common/vp9_mv.h" +#include "vpx_dsp/vpx_convolve.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE (-1) + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); + + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +#if CONFIG_VP9_HIGHBITDEPTH + highbd_convolve_fn_t highbd_predict[2][2][2]; // horiz, vert, avg +#endif +}; + +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h, + int use_highbd); +#else +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h); +#endif + +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { + return vp9_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && 2 * this_height >= ref_height && + this_width <= 16 * ref_width && this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_SCALE_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_scan.c b/media/libvpx/libvpx/vp9/common/vp9_scan.c new file mode 100644 index 0000000000..adacb7ef96 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_scan.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_scan.h" + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { + 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = { + 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { + 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 36, 43, 29, 7, 14, 50, 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = { + 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 34, 4, 49, 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = { + 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, + 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, + 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, + 129, 38, 69, 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, + 101, 131, 160, 146, 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, + 176, 162, 87, 56, 25, 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, + 178, 88, 57, 134, 149, 119, 26, 164, 73, 104, 193, 42, 179, 208, 11, + 135, 89, 165, 120, 150, 58, 194, 180, 27, 74, 209, 105, 151, 136, 43, + 90, 224, 166, 195, 181, 121, 210, 59, 12, 152, 106, 167, 196, 75, 137, + 225, 211, 240, 182, 122, 91, 28, 197, 13, 226, 168, 183, 153, 44, 212, + 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, 242, 76, 213, 154, 45, + 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, 77, 155, 30, 15, + 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, 230, 62, 216, + 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, 63, 232, + 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, 219, + 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, + 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, + 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, + 68, 115, 21, 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, + 116, 193, 147, 85, 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, + 7, 148, 194, 86, 179, 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, + 195, 118, 149, 71, 180, 24, 87, 226, 134, 165, 211, 40, 103, 56, 72, + 150, 196, 242, 119, 9, 181, 227, 88, 166, 25, 135, 41, 104, 212, 57, + 151, 197, 120, 73, 243, 182, 136, 167, 213, 89, 10, 228, 105, 152, 198, + 26, 42, 121, 183, 244, 168, 58, 137, 229, 74, 214, 90, 153, 199, 184, + 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, 200, 138, 185, 246, 75, + 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, 60, 247, 232, 76, + 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, 233, 171, 61, + 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, 62, 172, + 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, 126, + 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, + 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, + 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, + 83, 97, 69, 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, + 41, 56, 114, 100, 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, + 116, 14, 87, 130, 102, 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, + 103, 132, 146, 118, 74, 160, 89, 133, 104, 29, 59, 147, 119, 44, 161, + 148, 90, 105, 134, 162, 120, 176, 75, 135, 149, 30, 60, 163, 177, 45, + 121, 91, 106, 164, 178, 150, 192, 136, 165, 179, 31, 151, 193, 76, 122, + 61, 137, 194, 107, 152, 180, 208, 46, 166, 167, 195, 92, 181, 138, 209, + 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, 197, 62, 154, 225, 183, + 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, 124, 155, 199, 78, + 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, 156, 229, 243, + 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, 157, 245, + 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188, + 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, + 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, + 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6, + 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, + 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71, + 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, + 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, + 293, 41, 417, 199, 136, 262, 387, 448, 325, 356, 10, 73, 418, + 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, + 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, 481, 358, + 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, + 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, + 453, 139, 44, 234, 484, 297, 360, 171, 76, 515, 545, 266, 329, + 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, + 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, 608, 14, 299, + 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, + 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, + 238, 48, 143, 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, + 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, + 641, 548, 517, 424, 393, 300, 269, 176, 145, 52, 21, 704, 673, + 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, + 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, + 241, 210, 179, 117, 86, 55, 738, 707, 614, 583, 490, 459, 366, + 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, + 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, + 304, 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, + 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, + 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, + 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, + 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, + 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, + 743, 619, 495, 371, 247, 123, 896, 772, 648, 524, 400, 276, 152, + 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, + 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, 650, + 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, + 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, + 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, + 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, 342, 311, + 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, + 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, + 499, 375, 251, 127, 900, 776, 652, 528, 404, 280, 156, 932, 901, + 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, + 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, 530, 468, 437, + 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, + 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, + 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, + 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, + 315, 253, 222, 191, 998, 967, 874, 843, 750, 719, 626, 595, 502, + 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, + 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, + 440, 409, 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, + 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, + 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, + 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, + 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, + 971, 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, + 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, + 692, 661, 568, 537, 444, 413, 972, 941, 910, 848, 817, 786, 724, + 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, + 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, + 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, + 479, 1007, 883, 759, 635, 511, 912, 788, 664, 540, 944, 913, 820, + 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, + 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, 760, + 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, + 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, + 607, 1011, 887, 763, 639, 916, 792, 668, 948, 917, 824, 793, 700, + 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, + 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, 951, 889, + 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, + 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, + 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, 893, 862, + 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, + 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, +}; + +// Neighborhood 2-tuples for various scans and blocksizes, +// in {top, left} order for each position in corresponding scan order. +DECLARE_ALIGNED(16, static const int16_t, + default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, + 2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, + 9, 2, 2, 6, 6, 2, 2, 3, 3, 10, 10, 7, 7, 11, 11, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, + 8, 6, 6, 8, 8, 9, 9, 12, 12, 10, 10, 13, 13, 14, 14, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, + 32, 17, 17, 2, 2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, + 48, 48, 11, 11, 26, 26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, + 12, 49, 49, 42, 42, 20, 20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, + 13, 13, 36, 36, 5, 5, 21, 21, 51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, + 6, 37, 37, 52, 52, 22, 22, 7, 7, 30, 30, 45, 45, 15, 15, 38, 38, 23, 23, + 53, 53, 31, 31, 46, 46, 39, 39, 54, 54, 47, 47, 55, 55, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, + 16, 10, 10, 16, 16, 4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, + 5, 5, 12, 12, 19, 19, 32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, + 27, 40, 40, 13, 13, 34, 34, 40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, + 42, 42, 14, 14, 48, 48, 36, 36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, + 50, 57, 57, 44, 44, 37, 37, 51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, + 38, 38, 60, 60, 46, 46, 53, 53, 54, 54, 61, 61, 62, 62, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, + 2, 10, 17, 17, 24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, + 4, 11, 26, 33, 19, 26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, + 41, 20, 27, 13, 20, 5, 5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, + 6, 13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, + 29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, + 31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, + 64, 17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, + 65, 65, 18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, + 128, 3, 3, 97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, + 113, 113, 3, 3, 51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, + 67, 20, 20, 83, 83, 114, 114, 36, 36, 176, 176, 4, 4, 145, 145, + 52, 52, 99, 99, 5, 5, 130, 130, 68, 68, 192, 192, 161, 161, 21, + 21, 115, 115, 84, 84, 37, 37, 146, 146, 208, 208, 53, 53, 5, 5, + 100, 100, 177, 177, 131, 131, 69, 69, 6, 6, 224, 224, 116, 116, 22, + 22, 162, 162, 85, 85, 147, 147, 38, 38, 193, 193, 101, 101, 54, 54, + 6, 6, 132, 132, 178, 178, 70, 70, 163, 163, 209, 209, 7, 7, 117, + 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194, 225, 225, 39, 39, + 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8, 71, 71, 210, + 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40, 56, 56, + 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211, 72, + 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41, + 135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, + 151, 197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, + 10, 10, 26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, + 121, 213, 213, 58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, + 10, 10, 90, 90, 229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, + 27, 199, 199, 43, 43, 184, 184, 122, 122, 169, 169, 230, 230, 59, 59, + 11, 11, 75, 75, 138, 138, 200, 200, 215, 215, 91, 91, 12, 12, 28, + 28, 185, 185, 107, 107, 154, 154, 44, 44, 231, 231, 216, 216, 60, 60, + 123, 123, 12, 12, 76, 76, 201, 201, 170, 170, 232, 232, 139, 139, 92, + 92, 13, 13, 108, 108, 29, 29, 186, 186, 217, 217, 155, 155, 45, 45, + 13, 13, 61, 61, 124, 124, 14, 14, 233, 233, 77, 77, 14, 14, 171, + 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109, 46, 46, 156, 156, + 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78, 31, 31, 172, + 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63, 110, 110, + 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219, 142, + 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220, 220, + 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221, 175, + 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223, 223, + 239, 239, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, + 17, 16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, + 19, 19, 48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, + 7, 35, 35, 64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, + 65, 65, 51, 51, 22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, + 52, 23, 23, 81, 81, 67, 67, 80, 80, 38, 38, 10, 10, 53, 53, + 82, 82, 96, 96, 68, 68, 24, 24, 97, 97, 83, 83, 39, 39, 96, + 96, 54, 54, 11, 11, 69, 69, 98, 98, 112, 112, 84, 84, 25, 25, + 40, 40, 55, 55, 113, 113, 99, 99, 12, 12, 70, 70, 112, 112, 85, + 85, 26, 26, 114, 114, 100, 100, 128, 128, 41, 41, 56, 56, 71, 71, + 115, 115, 13, 13, 86, 86, 129, 129, 101, 101, 128, 128, 72, 72, 130, + 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87, 42, 42, 144, 144, + 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144, 88, 88, 132, + 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43, 160, 160, + 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160, 74, + 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44, + 120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, + 135, 164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, + 60, 60, 136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, + 45, 165, 165, 166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, + 122, 122, 152, 152, 208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, + 181, 224, 224, 107, 107, 196, 196, 61, 61, 153, 153, 224, 224, 182, 182, + 168, 168, 210, 210, 46, 46, 138, 138, 92, 92, 183, 183, 225, 225, 211, + 211, 240, 240, 197, 197, 169, 169, 123, 123, 154, 154, 198, 198, 77, 77, + 212, 212, 184, 184, 108, 108, 226, 226, 199, 199, 62, 62, 227, 227, 241, + 241, 139, 139, 213, 213, 170, 170, 185, 185, 155, 155, 228, 228, 242, 242, + 124, 124, 93, 93, 200, 200, 243, 243, 214, 214, 215, 215, 229, 229, 140, + 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109, 156, 156, 244, 244, + 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125, 202, 202, 246, + 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157, 157, 187, 187, + 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188, 203, 203, 142, + 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219, 219, 174, 174, + 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235, 206, 206, 236, + 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238, 238, 253, 253, + 254, 254, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, + 32, 2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, + 64, 64, 34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, + 80, 35, 50, 4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, + 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, + 67, 112, 112, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 6, 6, + 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 128, 114, 129, 69, + 84, 38, 53, 7, 22, 7, 7, 129, 144, 23, 38, 54, 69, 100, 115, + 85, 100, 115, 130, 144, 144, 130, 145, 39, 54, 70, 85, 8, 23, 55, + 70, 116, 131, 101, 116, 145, 160, 24, 39, 8, 8, 86, 101, 131, 146, + 160, 160, 146, 161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161, + 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 9, 9, 176, 176, + 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, 10, 25, 148, + 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192, 10, 10, + 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, 164, + 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42, + 74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, + 58, 11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, + 209, 224, 195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, + 196, 12, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, + 122, 137, 91, 106, 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, + 183, 211, 226, 153, 168, 226, 241, 60, 75, 197, 212, 138, 153, 29, 44, + 76, 91, 13, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154, + 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 14, 14, + 184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77, 92, 30, 45, 170, + 185, 155, 170, 185, 200, 93, 108, 124, 139, 214, 229, 46, 61, 200, 215, + 229, 244, 15, 30, 109, 124, 62, 77, 140, 155, 215, 230, 31, 46, 171, + 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140, 47, 62, 216, 231, + 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217, 187, 202, 110, + 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141, 203, 218, + 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234, + 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250, + 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236, + 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238, + 239, 254, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, + 33, 64, 2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, + 97, 128, 3, 34, 66, 97, 3, 3, 35, 66, 98, 129, 129, 160, + 160, 160, 4, 35, 67, 98, 192, 192, 4, 4, 130, 161, 161, 192, + 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 224, + 131, 162, 37, 68, 100, 131, 5, 5, 194, 225, 225, 256, 256, 256, + 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6, 195, 226, + 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289, + 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, + 165, 196, 39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, + 352, 352, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322, + 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 8, 8, + 384, 384, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103, + 261, 292, 9, 40, 385, 416, 167, 198, 104, 135, 230, 261, 355, 386, + 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417, 199, 230, + 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262, + 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, + 294, 325, 200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, + 74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326, 169, 200, + 11, 42, 106, 137, 480, 480, 450, 481, 358, 389, 264, 295, 201, 232, + 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 11, 11, 481, 512, + 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 512, + 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, 452, 483, + 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265, + 297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, + 453, 484, 544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, + 140, 171, 515, 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, + 45, 76, 172, 203, 330, 361, 576, 576, 13, 13, 267, 298, 546, 577, + 77, 108, 204, 235, 455, 486, 577, 608, 299, 330, 109, 140, 547, 578, + 14, 45, 14, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204, + 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, 142, 173, + 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142, + 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, + 49, 80, 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, + 51, 82, 83, 114, 608, 608, 484, 515, 360, 391, 236, 267, 112, 143, + 19, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392, + 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 20, 20, 672, 672, + 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, 393, 424, + 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, 145, + 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, + 580, 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, + 363, 394, 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, + 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, + 674, 705, 643, 674, 581, 612, 550, 581, 519, 550, 457, 488, 426, 457, + 395, 426, 333, 364, 302, 333, 271, 302, 209, 240, 178, 209, 147, 178, + 85, 116, 54, 85, 23, 54, 706, 737, 675, 706, 582, 613, 551, 582, + 458, 489, 427, 458, 334, 365, 303, 334, 210, 241, 179, 210, 86, 117, + 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, 87, 118, + 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23, + 768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, + 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, + 800, 800, 769, 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, + 521, 552, 490, 521, 428, 459, 397, 428, 366, 397, 304, 335, 273, 304, + 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 25, 25, + 832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708, 646, 677, + 615, 646, 584, 615, 553, 584, 522, 553, 491, 522, 460, 491, 429, 460, + 398, 429, 367, 398, 336, 367, 305, 336, 274, 305, 243, 274, 212, 243, + 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 26, 26, + 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, 616, + 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337, + 275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, + 834, 865, 803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, + 431, 462, 338, 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, + 835, 866, 711, 742, 587, 618, 463, 494, 339, 370, 215, 246, 91, 122, + 864, 864, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151, + 27, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648, + 524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276, 152, 183, + 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866, 897, 804, 835, + 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, 525, 556, + 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, 277, + 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, + 929, 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, + 712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, + 495, 526, 464, 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, + 278, 309, 247, 278, 216, 247, 185, 216, 154, 185, 123, 154, 92, 123, + 61, 92, 30, 61, 30, 30, 961, 992, 930, 961, 899, 930, 837, 868, + 806, 837, 775, 806, 713, 744, 682, 713, 651, 682, 589, 620, 558, 589, + 527, 558, 465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310, + 217, 248, 186, 217, 155, 186, 93, 124, 62, 93, 31, 62, 962, 993, + 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621, 559, 590, + 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94, 125, + 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, + 219, 250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, + 248, 279, 124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, + 621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, + 156, 187, 125, 156, 932, 963, 901, 932, 870, 901, 808, 839, 777, 808, + 746, 777, 684, 715, 653, 684, 622, 653, 560, 591, 529, 560, 498, 529, + 436, 467, 405, 436, 374, 405, 312, 343, 281, 312, 250, 281, 188, 219, + 157, 188, 126, 157, 964, 995, 933, 964, 902, 933, 871, 902, 840, 871, + 809, 840, 778, 809, 747, 778, 716, 747, 685, 716, 654, 685, 623, 654, + 592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468, 406, 437, + 375, 406, 344, 375, 313, 344, 282, 313, 251, 282, 220, 251, 189, 220, + 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841, 872, 810, 841, + 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593, 531, 562, + 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221, 252, + 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749, + 687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, + 222, 253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, + 347, 378, 223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, + 252, 283, 904, 935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, + 532, 563, 501, 532, 408, 439, 377, 408, 284, 315, 253, 284, 936, 967, + 905, 936, 874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688, + 626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440, 378, 409, + 316, 347, 285, 316, 254, 285, 968, 999, 937, 968, 906, 937, 875, 906, + 844, 875, 813, 844, 782, 813, 751, 782, 720, 751, 689, 720, 658, 689, + 627, 658, 596, 627, 565, 596, 534, 565, 503, 534, 472, 503, 441, 472, + 410, 441, 379, 410, 348, 379, 317, 348, 286, 317, 255, 286, 969, 1000, + 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721, + 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473, 411, 442, + 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815, 846, + 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381, + 319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, + 876, 907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, + 784, 815, 753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, + 381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, + 692, 723, 661, 692, 630, 661, 568, 599, 537, 568, 506, 537, 444, 475, + 413, 444, 382, 413, 972, 1003, 941, 972, 910, 941, 879, 910, 848, 879, + 817, 848, 786, 817, 755, 786, 724, 755, 693, 724, 662, 693, 631, 662, + 600, 631, 569, 600, 538, 569, 507, 538, 476, 507, 445, 476, 414, 445, + 383, 414, 973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818, + 725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570, 477, 508, + 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881, 819, 850, 726, 757, + 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975, 1006, 851, 882, + 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663, 508, 539, + 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540, 571, + 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789, + 696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, + 945, 976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, + 728, 759, 697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, + 511, 542, 977, 1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, + 729, 760, 698, 729, 667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, + 947, 978, 854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606, + 979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791, 636, 667, + 916, 947, 885, 916, 792, 823, 761, 792, 668, 699, 637, 668, 948, 979, + 917, 948, 886, 917, 824, 855, 793, 824, 762, 793, 700, 731, 669, 700, + 638, 669, 980, 1011, 949, 980, 918, 949, 887, 918, 856, 887, 825, 856, + 794, 825, 763, 794, 732, 763, 701, 732, 670, 701, 639, 670, 981, 1012, + 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733, + 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765, 703, 734, + 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889, 920, + 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828, + 766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, + 798, 829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, + 799, 830, 986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, + 892, 923, 924, 955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, + 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, + 959, 990, 991, 1022, 0, 0, +}; + +// Add 1 to iscan values. This represents the EOB position instead of the index. +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { + 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { + 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { + 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { + 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52, + 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57, + 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61, + 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { + 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40, + 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53, + 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60, + 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { + 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45, + 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54, + 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61, + 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { + 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199, + 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213, + 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217, + 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219, + 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224, + 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229, + 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231, + 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236, + 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238, + 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241, + 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245, + 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248, + 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252, + 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254, + 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255, + 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { + 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77, + 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100, + 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104, + 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101, + 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95, + 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97, + 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99, + 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103, + 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109, + 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115, + 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124, + 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130, + 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138, + 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150, + 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153, + 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254, + 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255, + 256, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { + 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167, + 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155, + 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149, + 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128, + 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115, + 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107, + 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100, + 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96, + 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94, + 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95, + 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98, + 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103, + 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112, + 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116, + 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122, + 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255, + 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254, + 256, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { + 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146, + 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357, + 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46, + 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238, + 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4, + 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190, + 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426, + 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74, + 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297, + 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15, + 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215, + 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524, + 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106, + 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374, + 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34, + 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228, + 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616, + 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140, + 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438, + 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57, + 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341, + 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706, + 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178, + 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546, + 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90, + 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396, + 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765, + 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208, + 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639, + 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123, + 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487, + 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834, + 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350, + 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720, + 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170, + 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562, + 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118, + 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416, + 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775, + 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284, + 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692, + 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167, + 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511, + 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846, + 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364, + 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738, + 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207, + 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572, + 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901, + 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462, + 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808, + 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303, + 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699, + 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945, + 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536, + 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855, + 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383, + 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755, + 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972, + 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626, + 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930, + 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477, + 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821, + 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002, + 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710, + 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952, + 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555, + 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861, + 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449, + 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791, + 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985, + 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649, + 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932, + 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498, + 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836, + 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004, + 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725, + 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959, + 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, +}; + +const ScanOrder vp9_default_scan_orders[TX_SIZES] = { + { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, + { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, + { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors }, + { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors }, +}; + +const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = { + { // TX_4X4 + { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, + { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors }, + { col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors }, + { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors } }, + { // TX_8X8 + { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, + { row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors }, + { col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors }, + { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors } }, + { // TX_16X16 + { default_scan_16x16, vp9_default_iscan_16x16, + default_scan_16x16_neighbors }, + { row_scan_16x16, vp9_row_iscan_16x16, row_scan_16x16_neighbors }, + { col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors }, + { default_scan_16x16, vp9_default_iscan_16x16, + default_scan_16x16_neighbors } }, + { // TX_32X32 + { default_scan_32x32, vp9_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, vp9_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, vp9_default_iscan_32x32, + default_scan_32x32_neighbors }, + { default_scan_32x32, vp9_default_iscan_32x32, + default_scan_32x32_neighbors } } +}; diff --git a/media/libvpx/libvpx/vp9/common/vp9_scan.h b/media/libvpx/libvpx/vp9/common/vp9_scan.h new file mode 100644 index 0000000000..3d1dcc66da --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_scan.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_SCAN_H_ +#define VPX_VP9_COMMON_VP9_SCAN_H_ + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_NEIGHBORS 2 + +typedef struct ScanOrder { + const int16_t *scan; + const int16_t *iscan; + const int16_t *neighbors; +} ScanOrder; + +extern const ScanOrder vp9_default_scan_orders[TX_SIZES]; +extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES]; + +static INLINE int get_coef_context(const int16_t *neighbors, + const uint8_t *token_cache, int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> + 1; +} + +static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { + const MODE_INFO *const mi = xd->mi[0]; + + if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_SCAN_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_seg_common.c b/media/libvpx/libvpx/vp9/common/vp9_seg_common.c new file mode 100644 index 0000000000..1c7a1d2e9a --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_seg_common.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_quant_common.h" + +static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; + +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, MAX_LOOP_FILTER, 3, + 0 }; + +// These functions provide access to new segment level features. +// Eventually these function may be "optimized out" but for the moment, +// the coding mechanism is still subject to change so these provide a +// convenient single point of change. + +void vp9_clearall_segfeatures(struct segmentation *seg) { + vp9_zero(seg->feature_data); + vp9_zero(seg->feature_mask); + seg->aq_av_offset = 0; +} + +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] |= 1 << feature_id; +} + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_max[feature_id]; +} + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_signed[feature_id]; +} + +void vp9_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data) { + assert(seg_data <= seg_feature_data_max[feature_id]); + if (seg_data < 0) { + assert(seg_feature_data_signed[feature_id]); + assert(-seg_data <= seg_feature_data_max[feature_id]); + } + + seg->feature_data[segment_id][feature_id] = seg_data; +} + +const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = { + 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7 +}; + +// TBD? Functions to read and write segment data with range / validity checking diff --git a/media/libvpx/libvpx/vp9/common/vp9_seg_common.h b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h new file mode 100644 index 0000000000..5e71c2fca5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_ +#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_ + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS - 1) + +#define PREDICTION_PROBS 3 + +// Segment ID used to skip background encoding +#define BACKGROUND_SEG_SKIP_ID 3 +// Number of frames that don't skip after a key frame +#define FRAMES_NO_SKIPPING_AFTER_KEY 20 + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... + SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of features supported +} SEG_LVL_FEATURES; + +struct segmentation { + uint8_t enabled; + uint8_t update_map; + uint8_t update_data; + uint8_t abs_delta; + uint8_t temporal_update; + + vpx_prob tree_probs[SEG_TREE_PROBS]; + vpx_prob pred_probs[PREDICTION_PROBS]; + + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + uint32_t feature_mask[MAX_SEGMENTS]; + int aq_av_offset; +}; + +static INLINE int segfeature_active(const struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); +} + +void vp9_clearall_segfeatures(struct segmentation *seg); + +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void vp9_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data); + +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} + +extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c new file mode 100644 index 0000000000..8df18af3b8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_loopfilter.h" + +#if CONFIG_MULTITHREAD +static INLINE void mutex_lock(pthread_mutex_t *const mutex) { + const int kMaxTryLocks = 4000; + int locked = 0; + int i; + + for (i = 0; i < kMaxTryLocks; ++i) { + if (!pthread_mutex_trylock(mutex)) { + locked = 1; + break; + } + } + + if (!locked) pthread_mutex_lock(mutex); +} +#endif // CONFIG_MULTITHREAD + +static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1]; + mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, + const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + mutex_lock(&lf_sync->mutex[r]); + + lf_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&lf_sync->cond[r]); + pthread_mutex_unlock(&lf_sync->mutex[r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +// Implement row loopfiltering for each thread. +static INLINE void thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, + int y_only, VP9LfSync *const lf_sync) { + const int num_planes = y_only ? 1 : MAX_MB_PLANE; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + const int num_active_workers = lf_sync->num_active_workers; + int mi_row, mi_col; + enum lf_path path; + if (y_only) + path = LF_PATH_444; + else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) + path = LF_PATH_420; + else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) + path = LF_PATH_444; + else + path = LF_PATH_SLOW; + + assert(num_active_workers > 0); + + for (mi_row = start; mi_row < stop; + mi_row += num_active_workers * MI_BLOCK_SIZE) { + MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); + + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + + sync_read(lf_sync, r, c); + + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + vp9_adjust_mask(cm, mi_row, mi_col, lfm); + + vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } + + sync_write(lf_sync, r, c, sb_cols); + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(void *arg1, void *arg2) { + VP9LfSync *const lf_sync = (VP9LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + return 1; +} + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VPxWorker *workers, int nworkers, + VP9LfSync *lf_sync) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + // Number of superblock rows and cols + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int num_tile_cols = 1 << cm->log2_tile_cols; + // Limit the number of workers to prevent changes in frame dimensions from + // causing incorrect sync calculations when sb_rows < threads/tile_cols. + // Further restrict them by the number of tile columns should the user + // request more as this implementation doesn't scale well beyond that. + const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows)); + int i; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + lf_sync->num_active_workers = num_workers; + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + // Set up loopfilter thread data. + // The decoder is capping num_workers because it has been observed that using + // more threads on the loopfilter than there are cores will hurt performance + // on Android. This is because the system will only schedule the tile decode + // workers on cores equal to the number of tile columns. Then if the decoder + // tries to use more threads for the loopfilter, it will hurt performance + // because of contention. If the multithreading code changes in the future + // then the number of workers used by the loopfilter should be revisited. + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = loop_filter_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + vp9_loop_filter_data_reset(lf_data, frame, cm, planes); + lf_data->start = start + i * MI_BLOCK_SIZE; + lf_data->stop = stop; + lf_data->y_only = y_only; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, int y_only, + int partial_frame, VPxWorker *workers, + int num_workers, VP9LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + if (!frame_filter_level) return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); + + loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only, + workers, num_workers, lf_sync); +} + +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +// Allocate memory for lf row synchronization +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, + vpx_malloc(sizeof(*lf_sync->mutex) * rows)); + if (lf_sync->mutex) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, + vpx_malloc(sizeof(*lf_sync->cond) * rows)); + if (lf_sync->cond) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, + vpx_malloc(sizeof(*lf_sync->lf_mutex))); + pthread_mutex_init(lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, + vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); + lf_sync->num_workers = num_workers; + lf_sync->num_active_workers = lf_sync->num_workers; + + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, + vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { + assert(lf_sync != NULL); + +#if CONFIG_MULTITHREAD + if (lf_sync->mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex[i]); + } + vpx_free(lf_sync->mutex); + } + if (lf_sync->cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond[i]); + } + vpx_free(lf_sync->cond); + } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + if (lf_sync->lf_mutex != NULL) { + pthread_mutex_destroy(lf_sync->lf_mutex); + vpx_free(lf_sync->lf_mutex); + } + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); + } +#endif // CONFIG_MULTITHREAD + + vpx_free(lf_sync->lfdata); + vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vp9_zero(*lf_sync); +} + +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + int cur_row; + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(lf_sync->lf_mutex); + if (lf_sync->corrupted) { + int row = return_val >> MI_BLOCK_SIZE_LOG2; + pthread_mutex_lock(&lf_sync->mutex[row]); + lf_sync->cur_sb_col[row] = INT_MAX; + pthread_cond_signal(&lf_sync->cond[row]); + pthread_mutex_unlock(&lf_sync->mutex[row]); + return_val = -1; + } + pthread_mutex_unlock(lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); +} + +// Accumulate frame counts. +void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, + const FRAME_COUNTS *counts, int is_dec) { + int i, j, k, l, m; + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + for (j = 0; j < INTRA_MODES; j++) + accum->y_mode[i][j] += counts->y_mode[i][j]; + + for (i = 0; i < INTRA_MODES; i++) + for (j = 0; j < INTRA_MODES; j++) + accum->uv_mode[i][j] += counts->uv_mode[i][j]; + + for (i = 0; i < PARTITION_CONTEXTS; i++) + for (j = 0; j < PARTITION_TYPES; j++) + accum->partition[i][j] += counts->partition[i][j]; + + if (is_dec) { + int n; + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) { + accum->eob_branch[i][j][k][l][m] += + counts->eob_branch[i][j][k][l][m]; + for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + accum->coef[i][j][k][l][m][n] += counts->coef[i][j][k][l][m][n]; + } + } else { + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + accum->eob_branch[i][j][k][l][m] += + counts->eob_branch[i][j][k][l][m]; + // In the encoder, coef is only updated at frame + // level, so not need to accumulate it here. + // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + // accum->coef[i][j][k][l][m][n] += + // counts->coef[i][j][k][l][m][n]; + } + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + for (j = 0; j < SWITCHABLE_FILTERS; j++) + accum->switchable_interp[i][j] += counts->switchable_interp[i][j]; + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + for (j = 0; j < INTER_MODES; j++) + accum->inter_mode[i][j] += counts->inter_mode[i][j]; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + accum->intra_inter[i][j] += counts->intra_inter[i][j]; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) accum->comp_inter[i][j] += counts->comp_inter[i][j]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + accum->single_ref[i][j][k] += counts->single_ref[i][j][k]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) accum->comp_ref[i][j] += counts->comp_ref[i][j]; + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + for (j = 0; j < TX_SIZES; j++) + accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j]; + + for (j = 0; j < TX_SIZES - 1; j++) + accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j]; + + for (j = 0; j < TX_SIZES - 2; j++) + accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j]; + } + + for (i = 0; i < TX_SIZES; i++) + accum->tx.tx_totals[i] += counts->tx.tx_totals[i]; + + for (i = 0; i < SKIP_CONTEXTS; i++) + for (j = 0; j < 2; j++) accum->skip[i][j] += counts->skip[i][j]; + + for (i = 0; i < MV_JOINTS; i++) accum->mv.joints[i] += counts->mv.joints[i]; + + for (k = 0; k < 2; k++) { + nmv_component_counts *const comps = &accum->mv.comps[k]; + const nmv_component_counts *const comps_t = &counts->mv.comps[k]; + + for (i = 0; i < 2; i++) { + comps->sign[i] += comps_t->sign[i]; + comps->class0_hp[i] += comps_t->class0_hp[i]; + comps->hp[i] += comps_t->hp[i]; + } + + for (i = 0; i < MV_CLASSES; i++) comps->classes[i] += comps_t->classes[i]; + + for (i = 0; i < CLASS0_SIZE; i++) { + comps->class0[i] += comps_t->class0[i]; + for (j = 0; j < MV_FP_SIZE; j++) + comps->class0_fp[i][j] += comps_t->class0_fp[i][j]; + } + + for (i = 0; i < MV_OFFSET_BITS; i++) + for (j = 0; j < 2; j++) comps->bits[i][j] += comps_t->bits[i][j]; + + for (i = 0; i < MV_FP_SIZE; i++) comps->fp[i] += comps_t->fp[i]; + } +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h new file mode 100644 index 0000000000..5df0117f12 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#include "./vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vpx_util/vpx_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct FRAME_COUNTS; + +// Loopfilter row synchronization +typedef struct VP9LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; + pthread_cond_t *cond; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. + int *cur_sb_col; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; // number of allocated workers. + int num_active_workers; // number of scheduled workers. + +#if CONFIG_MULTITHREAD + pthread_mutex_t *lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; +} VP9LfSync; + +// Allocate memory for loopfilter row synchronization. +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows, + int width, int num_workers); + +// Deallocate loopfilter synchronization related mutex and data. +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); + +// Multi-threaded loopfilter that uses the tile threads. +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, int y_only, + int partial_frame, VPxWorker *workers, + int num_workers, VP9LfSync *lf_sync); + +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, + const struct FRAME_COUNTS *counts, int is_dec); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_tile_common.c b/media/libvpx/libvpx/vp9/common/vp9_tile_common.c new file mode 100644 index 0000000000..672f808adc --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_tile_common.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#define MIN_TILE_WIDTH_B64 4 +#define MAX_TILE_WIDTH_B64 64 + +static int get_tile_offset(int idx, int mis, int log2) { + const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2; + const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2; + return VPXMIN(offset, mis); +} + +void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) { + tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); + tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); +} + +void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) { + tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); + tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); +} + +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + vp9_tile_set_row(tile, cm, row); + vp9_tile_set_col(tile, cm, col); +} + +static int get_min_log2_tile_cols(const int sb64_cols) { + int min_log2 = 0; + while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols) ++min_log2; + return min_log2; +} + +static int get_max_log2_tile_cols(const int sb64_cols) { + int max_log2 = 1; + while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64) ++max_log2; + return max_log2 - 1; +} + +void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, + int *max_log2_tile_cols) { + const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; + *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols); + *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols); + assert(*min_log2_tile_cols <= *max_log2_tile_cols); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_tile_common.h b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h new file mode 100644 index 0000000000..4ccf0a3d5f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_ +#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; + +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; +} TileInfo; + +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, int row, + int col); + +void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row); +void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col); + +void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, + int *max_log2_tile_cols); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 0000000000..57b79a732d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = _mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); + } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c new file mode 100644 index 0000000000..af158536f9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst4_sse4_1(__m128i *const io) { + const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0); + const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0); + const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0); + const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0); + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2]; + __m128i temp[2]; + + transpose_32bit_4x4(io, io); + + extend_64bit(io[0], temp); + s0[0] = _mm_mul_epi32(pair_c1, temp[0]); + s0[1] = _mm_mul_epi32(pair_c1, temp[1]); + s1[0] = _mm_mul_epi32(pair_c2, temp[0]); + s1[1] = _mm_mul_epi32(pair_c2, temp[1]); + + extend_64bit(io[1], temp); + s2[0] = _mm_mul_epi32(pair_c3, temp[0]); + s2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + extend_64bit(io[2], temp); + s3[0] = _mm_mul_epi32(pair_c4, temp[0]); + s3[1] = _mm_mul_epi32(pair_c4, temp[1]); + s4[0] = _mm_mul_epi32(pair_c1, temp[0]); + s4[1] = _mm_mul_epi32(pair_c1, temp[1]); + + extend_64bit(io[3], temp); + s5[0] = _mm_mul_epi32(pair_c2, temp[0]); + s5[1] = _mm_mul_epi32(pair_c2, temp[1]); + s6[0] = _mm_mul_epi32(pair_c4, temp[0]); + s6[1] = _mm_mul_epi32(pair_c4, temp[1]); + + t0[0] = _mm_add_epi64(s0[0], s3[0]); + t0[1] = _mm_add_epi64(s0[1], s3[1]); + t0[0] = _mm_add_epi64(t0[0], s5[0]); + t0[1] = _mm_add_epi64(t0[1], s5[1]); + t1[0] = _mm_sub_epi64(s1[0], s4[0]); + t1[1] = _mm_sub_epi64(s1[1], s4[1]); + t1[0] = _mm_sub_epi64(t1[0], s6[0]); + t1[1] = _mm_sub_epi64(t1[1], s6[1]); + temp[0] = _mm_sub_epi32(io[0], io[2]); + temp[0] = _mm_add_epi32(temp[0], io[3]); + extend_64bit(temp[0], temp); + t2[0] = _mm_mul_epi32(pair_c3, temp[0]); + t2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + s0[0] = _mm_add_epi64(t0[0], s2[0]); + s0[1] = _mm_add_epi64(t0[1], s2[1]); + s1[0] = _mm_add_epi64(t1[0], s2[0]); + s1[1] = _mm_add_epi64(t1[1], s2[1]); + s3[0] = _mm_add_epi64(t0[0], t1[0]); + s3[1] = _mm_add_epi64(t0[1], t1[1]); + s3[0] = _mm_sub_epi64(s3[0], s2[0]); + s3[1] = _mm_sub_epi64(s3[1], s2[1]); + + s0[0] = dct_const_round_shift_64bit(s0[0]); + s0[1] = dct_const_round_shift_64bit(s0[1]); + s1[0] = dct_const_round_shift_64bit(s1[0]); + s1[1] = dct_const_round_shift_64bit(s1[1]); + s2[0] = dct_const_round_shift_64bit(t2[0]); + s2[1] = dct_const_round_shift_64bit(t2[1]); + s3[0] = dct_const_round_shift_64bit(s3[0]); + s3[1] = dct_const_round_shift_64bit(s3[1]); + io[0] = pack_4(s0[0], s0[1]); + io[1] = pack_4(s1[0], s1[1]); + io[2] = pack_4(s2[0], s2[1]); + io[3] = pack_4(s3[0], s3[1]); +} + +void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 0000000000..7d949b6dbc --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c new file mode 100644 index 0000000000..ad693718c0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" + +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[2]; + const __m128i eight = _mm_set1_epi16(8); + + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); + + switch (tx_type) { + case DCT_DCT: + idct4_sse2(in); + idct4_sse2(in); + break; + case ADST_DCT: + idct4_sse2(in); + iadst4_sse2(in); + break; + case DCT_ADST: + iadst4_sse2(in); + idct4_sse2(in); + break; + default: + assert(tx_type == ADST_ADST); + iadst4_sse2(in); + iadst4_sse2(in); + break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + recon_and_store4x4_sse2(in, dest, stride); +} + +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + // load input data + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); + break; + case ADST_DCT: + vpx_idct8_sse2(in); + iadst8_sse2(in); + break; + case DCT_ADST: + iadst8_sse2(in); + vpx_idct8_sse2(in); + break; + default: + assert(tx_type == ADST_ADST); + iadst8_sse2(in); + iadst8_sse2(in); + break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); +} + +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case DCT_DCT: + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + case ADST_DCT: + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + case DCT_ADST: + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + iadst16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm new file mode 100644 index 0000000000..ae7c94ea3f --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -0,0 +1,289 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; This file is a duplicate of mfqe_sse2.asm in VP8. +; TODO(jackychen): Find a way to fix the duplicate. +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp9_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +globalsym(vp9_filter_by_weight16x16_sse2) +sym(vp9_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine: + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +globalsym(vp9_filter_by_weight8x8_sse2) +sym(vp9_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine: + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +globalsym(vp9_variance_and_sad_16x16_sse2) +sym(vp9_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate: + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. + pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c new file mode 100644 index 0000000000..c5892156f4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c @@ -0,0 +1,3063 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // qsort() + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" + +#include "vpx_dsp/bitreader_buffer.h" +#include "vpx_dsp/bitreader.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/mem_ops.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_thread.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" + +#include "vp9/decoder/vp9_decodeframe.h" +#include "vp9/decoder/vp9_detokenize.h" +#include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_job_queue.h" + +#define MAX_VP9_HEADER_SIZE 80 + +typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); + +typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); + +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { + return len != 0 && len <= (size_t)(end - start); +} + +static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) { + const int data = vpx_rb_read_literal(rb, get_unsigned_bits(max)); + return data > max ? max : data; +} + +static TX_MODE read_tx_mode(vpx_reader *r) { + TX_MODE tx_mode = vpx_read_literal(r, 2); + if (tx_mode == ALLOW_32X32) tx_mode += vpx_read_bit(r); + return tx_mode; +} + +static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) { + int i, j; + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 3; ++j) + vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 2; ++j) + vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 1; ++j) + vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); +} + +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) { + int i, j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); +} + +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { + int i, j; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + for (j = 0; j < INTER_MODES - 1; ++j) + vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); +} + +static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, + vpx_reader *r) { + if (vp9_compound_reference_allowed(cm)) { + return vpx_read_bit(r) + ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE) + : SINGLE_REFERENCE; + } else { + return SINGLE_REFERENCE; + } +} + +static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) { + FRAME_CONTEXT *const fc = cm->fc; + int i; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_inter_prob[i]); + + if (cm->reference_mode != COMPOUND_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) { + vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]); + vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]); + } + + if (cm->reference_mode != SINGLE_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_ref_prob[i]); +} + +static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) { + int i; + for (i = 0; i < n; ++i) + if (vpx_read(r, MV_UPDATE_PROB)) p[i] = (vpx_read_literal(r, 7) << 1) | 1; +} + +static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) { + int i, j; + + update_mv_probs(ctx->joints, MV_JOINTS - 1, r); + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->sign, 1, r); + update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r); + update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r); + update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r); + } + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + for (j = 0; j < CLASS0_SIZE; ++j) + update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r); + update_mv_probs(comp_ctx->fp, 3, r); + } + + if (allow_hp) { + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->class0_hp, 1, r); + update_mv_probs(&comp_ctx->hp, 1, r); + } + } +} + +static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, + const TX_SIZE tx_size, uint8_t *dst, + int stride, int eob) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff; + assert(eob > 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); + break; + default: assert(0 && "Invalid transform size"); + } + } + } else { + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: vp9_idct4x4_add(dqcoeff, dst, stride, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, stride, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, dst, stride, eob); break; + case TX_32X32: vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; + default: assert(0 && "Invalid transform size"); return; + } + } + } +#else + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: vp9_idct4x4_add(dqcoeff, dst, stride, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, stride, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, dst, stride, eob); break; + case TX_32X32: vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; + default: assert(0 && "Invalid transform size"); return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (eob == 1) { + dqcoeff[0] = 0; + } else { + if (tx_size <= TX_16X16 && eob <= 10) + memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); + else + memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + } +} + +static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, + const TX_TYPE tx_type, + const TX_SIZE tx_size, uint8_t *dst, + int stride, int eob) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff; + assert(eob > 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); + break; + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); + break; + default: assert(0 && "Invalid transform size"); + } + } + } else { + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); break; + case TX_8X8: vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); break; + case TX_16X16: + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; + default: assert(0 && "Invalid transform size"); return; + } + } + } +#else + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); break; + case TX_8X8: vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); break; + case TX_16X16: + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; + default: assert(0 && "Invalid transform size"); return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (eob == 1) { + dqcoeff[0] = 0; + } else { + if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) + memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); + else + memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + } +} + +static void predict_and_reconstruct_intra_block(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst; + dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + if (eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, eob); + } + } +} + +static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + if (!mi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, + int plane, int row, + int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + if (*pd->eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, *pd->eob); + } + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size, + int mi_row, int mi_col) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (eob > 0) { + inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob); + } +#if CONFIG_MISMATCH_DEBUG + { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, blk_w, + blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#else + (void)mi_row; + (void)mi_col; +#endif + return eob; +} + +static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + + *pd->eob = eob; + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static int reconstruct_inter_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = *pd->eob; + + (void)mi; + if (eob > 0) { + inverse_transform_block_inter( + xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], + pd->dst.stride, eob); + } + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int x, int y, int b_w, int b_h, + int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) memset(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy); + + if (right) memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void high_build_mc_border(const uint8_t *src8, int src_stride, + uint16_t *dst, int dst_stride, int x, int y, + int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) vpx_memset16(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) vpx_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1, + int pre_buf_stride, int x0, int y0, int b_w, + int b_h, int frame_width, int frame_height, + int border_offset, uint8_t *const dst, + int dst_buf_stride, int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, MACROBLOCKD *xd, + int w, int h, int ref, int xs, int ys) { + uint16_t *mc_buf_high = twd->extend_and_predict_buf; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0, + b_w, b_h, frame_width, frame_height); + highbd_inter_predictor(mc_buf_high + border_offset, b_w, + CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0, + y0, b_w, b_h, frame_width, frame_height); + inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst, + dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel, + xs, ys); + } +} +#else +static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1, + int pre_buf_stride, int x0, int y0, int b_w, + int b_h, int frame_width, int frame_height, + int border_offset, uint8_t *const dst, + int dst_buf_stride, int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, int w, int h, + int ref, int xs, int ys) { + uint8_t *mc_buf = (uint8_t *)twd->extend_and_predict_buf; + const uint8_t *buf_ptr; + + build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, x0, y0, b_w, b_h, + frame_width, frame_height); + buf_ptr = mc_buf + border_offset; + + inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf, w, + h, ref, kernel, xs, ys); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void dec_build_inter_predictors( + TileWorkerData *twd, MACROBLOCKD *xd, int plane, int bw, int bh, int x, + int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, + const struct scale_factors *sf, struct buf_2d *pre_buf, + struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, + int is_scaled, int ref) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + + // Get reference frame pointer, width and height. + if (plane == 0) { + frame_width = ref_frame_buf->buf.y_crop_width; + frame_height = ref_frame_buf->buf.y_crop_height; + ref_frame = ref_frame_buf->buf.y_buffer; + } else { + frame_width = ref_frame_buf->buf.uv_crop_width; + frame_height = ref_frame_buf->buf.uv_crop_height; + ref_frame = + plane == 1 ? ref_frame_buf->buf.u_buffer : ref_frame_buf->buf.v_buffer; + } + + if (is_scaled) { + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); +#if 0 // CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->sb_type != BLOCK_4X8 && + xd->mi[0]->sb_type != BLOCK_8X4); + assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x))); +#endif + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + x0 = sf->scale_value_x(x_start + x, sf); + y0 = sf->scale_value_y(y_start + y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y)); + scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x)); + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the + // reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. + buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || + (frame_height & 0x7)) { + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + + // Get reference block bottom right horizontal coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + // Extend the border. + const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0; + const int b_w = x1 - x0 + 1; + const int b_h = y1 - y0 + 1; + const int border_offset = y_pad * 3 * b_w + x_pad * 3; + + extend_and_predict(twd, buf_ptr1, buf_stride, x0, y0, b_w, b_h, + frame_width, frame_height, border_offset, dst, + dst_buf->stride, subpel_x, subpel_y, kernel, sf, +#if CONFIG_VP9_HIGHBITDEPTH + xd, +#endif + w, h, ref, xs, ys); + return; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static void dec_build_inter_predictors_sb(TileWorkerData *twd, + VP9Decoder *const pbi, + MACROBLOCKD *xd, int mi_row, + int mi_col) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const MODE_INFO *mi = xd->mi[0]; + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + const BLOCK_SIZE sb_type = mi->sb_type; + const int is_compound = has_second_ref(mi); + int ref; + int is_scaled; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + RefBuffer *ref_buf = &pbi->common.frame_refs[frame - LAST_FRAME]; + const struct scale_factors *const sf = &ref_buf->sf; + const int idx = ref_buf->idx; + BufferPool *const pool = pbi->common.buffer_pool; + RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx]; + + if (!vp9_is_valid_scale(sf)) + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + is_scaled = vp9_is_scaled(sf); + vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, + is_scaled ? sf : NULL); + xd->block_refs[ref] = ref_buf; + + if (sb_type < BLOCK_8X8) { + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int n4w_x4 = 4 * num_4x4_w; + const int n4h_x4 = 4 * num_4x4_h; + struct buf_2d *const pre_buf = &pd->pre[ref]; + int i = 0, x, y; + for (y = 0; y < num_4x4_h; ++y) { + for (x = 0; x < num_4x4_w; ++x) { + const MV mv = average_split_mvs(pd, mi, ref, i++); + dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 4 * x, + 4 * y, 4, 4, mi_x, mi_y, kernel, sf, + pre_buf, dst_buf, &mv, ref_frame_buf, + is_scaled, ref); + } + } + } + } else { + const MV mv = mi->mv[ref].as_mv; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int n4w_x4 = 4 * num_4x4_w; + const int n4h_x4 = 4 * num_4x4_h; + struct buf_2d *const pre_buf = &pd->pre[ref]; + dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, + dst_buf, &mv, ref_frame_buf, is_scaled, ref); + } + } + } +} + +static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w); + memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h); + } +} + +static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, + int bhl) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x; + xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y; + xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x; + xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y; + } +} + +static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, int bw, int bh, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + xd->mi = cm->mi_grid_visible + offset; + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + +static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, + int bh, int x_mis, int y_mis, int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + int x, y; + const TileInfo *const tile = &xd->tile; + + xd->mi = cm->mi_grid_visible + offset; + xd->mi[0] = &cm->mi[offset]; + // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of + // passing bsize from decode_partition(). + xd->mi[0]->sb_type = bsize; + for (y = 0; y < y_mis; ++y) + for (x = !y; x < x_mis; ++x) { + xd->mi[y * cm->mi_stride + x] = xd->mi[0]; + } + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + +static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + predict_recon_func func) { + int eobtotal = 0; + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += func(twd, mi, plane, row, col, tx_size); + } + return eobtotal; +} + +static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + intra_recon_func func) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + func(twd, mi, plane, row, col, tx_size); + } +} + +static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int less8x8 = bsize < BLOCK_8X8; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + predict_and_reconstruct_intra_block(twd, mi, plane, row, col, + tx_size); + } + } else { + // Prediction + dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col); +#if CONFIG_MISMATCH_DEBUG + { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif + + // Reconstruction + if (!mi->skip) { + int eobtotal = 0; + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += reconstruct_inter_block(twd, mi, plane, row, col, + tx_size, mi_row, mi_col); + } + + if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter + } + } + + xd->corrupted |= vpx_reader_has_error(r); + + if (cm->lf.filter_level) { + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); + } +} + +static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, + predict_and_reconstruct_intra_block_row_mt); + } else { + // Prediction + dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt); + } + } + + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); +} + +static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt); + } else { + if (!mi->skip) { + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int *eob[MAX_MB_PLANE]; + int plane; + int eobtotal; + // Based on eobtotal and bsize, this may be mi->skip may be set to true + // In that case dqcoeff and eob need to be backed up and restored as + // recon_block will not increment these pointers for skip cases + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + dqcoeff[plane] = pd->dqcoeff; + eob[plane] = pd->eob; + } + eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); + + if (bsize >= BLOCK_8X8 && eobtotal == 0) { + mi->skip = 1; // skip loopfilter + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->dqcoeff = dqcoeff[plane]; + pd->eob = eob[plane]; + } + } + } + } + + xd->corrupted |= vpx_reader_has_error(r); +} + +static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, + int mi_col, int bsl) { + const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + twd->xd.left_seg_context + (mi_row & MI_MASK); + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + // assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +static INLINE void dec_update_partition_context(TileWorkerData *twd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + int bw) { + PARTITION_CONTEXT *const above_ctx = twd->xd.above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + twd->xd.left_seg_context + (mi_row & MI_MASK); + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bw); +} + +static PARTITION_TYPE read_partition(TileWorkerData *twd, int mi_row, + int mi_col, int has_rows, int has_cols, + int bsl) { + const int ctx = dec_partition_plane_context(twd, mi_row, mi_col, bsl); + const vpx_prob *const probs = twd->xd.partition_probs[ctx]; + FRAME_COUNTS *counts = twd->xd.counts; + PARTITION_TYPE p; + vpx_reader *r = &twd->bit_reader; + + if (has_rows && has_cols) + p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (counts) ++counts->partition[ctx][p]; + + return p; +} + +// TODO(slavarnway): eliminate bsize and subsize in future commits +static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + decode_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + decode_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + decode_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + decode_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + decode_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + decode_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); + decode_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2); + break; + default: assert(0 && "Invalid partition type"); + } + } + + // update partition context + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); +} + +static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2, int parse_recon_flag, + process_block_fn_t process_block) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (parse_recon_flag & PARSE) { + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + } + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2, parse_recon_flag, process_block); + break; + default: assert(0 && "Invalid partition type"); + } + } + + if (parse_recon_flag & PARSE) { + // update partition context + if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) && + bsize >= BLOCK_8X8) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + } +} + +static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, + size_t read_size, + struct vpx_internal_error_info *error_info, + vpx_reader *r, vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // Validate the calculated partition length. If the buffer described by the + // partition can't be fully read then throw an error. + if (!read_is_valid(data, read_size, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) + vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); +} + +static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, + vpx_reader *r) { + int i, j, k, l, m; + + if (vpx_read_bit(r)) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); +} + +static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, vpx_reader *r) { + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); +} + +static void setup_segmentation(struct segmentation *seg, + struct vpx_read_bit_buffer *rb) { + int i, j; + + seg->update_map = 0; + seg->update_data = 0; + + seg->enabled = vpx_rb_read_bit(rb); + if (!seg->enabled) return; + + // Segmentation map update + seg->update_map = vpx_rb_read_bit(rb); + if (seg->update_map) { + for (i = 0; i < SEG_TREE_PROBS; i++) + seg->tree_probs[i] = + vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) : MAX_PROB; + + seg->temporal_update = vpx_rb_read_bit(rb); + if (seg->temporal_update) { + for (i = 0; i < PREDICTION_PROBS; i++) + seg->pred_probs[i] = + vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) : MAX_PROB; + } else { + for (i = 0; i < PREDICTION_PROBS; i++) seg->pred_probs[i] = MAX_PROB; + } + } + + // Segmentation data update + seg->update_data = vpx_rb_read_bit(rb); + if (seg->update_data) { + seg->abs_delta = vpx_rb_read_bit(rb); + + vp9_clearall_segfeatures(seg); + + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = vpx_rb_read_bit(rb); + if (feature_enabled) { + vp9_enable_segfeature(seg, i, j); + data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); + if (vp9_is_segfeature_signed(j)) + data = vpx_rb_read_bit(rb) ? -data : data; + } + vp9_set_segdata(seg, i, j, data); + } + } + } +} + +static void setup_loopfilter(struct loopfilter *lf, + struct vpx_read_bit_buffer *rb) { + lf->filter_level = vpx_rb_read_literal(rb, 6); + lf->sharpness_level = vpx_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = vpx_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + int i; + + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + if (vpx_rb_read_bit(rb)) + lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6); + + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (vpx_rb_read_bit(rb)) + lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6); + } + } +} + +static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) { + return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0; +} + +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vpx_read_bit_buffer *rb) { + cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS); + cm->y_dc_delta_q = read_delta_q(rb); + cm->uv_dc_delta_q = read_delta_q(rb); + cm->uv_ac_delta_q = read_delta_q(rb); + cm->dequant_bit_depth = cm->bit_depth; + xd->lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->bd = (int)cm->bit_depth; +#endif +} + +static void setup_segmentation_dequant(VP9_COMMON *const cm) { + // Build y/uv dequant values based on segmentation. + if (cm->seg.enabled) { + int i; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = vp9_get_qindex(&cm->seg, i, cm->base_qindex); + cm->y_dequant[i][0] = + vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->y_dequant[i][1] = vp9_ac_quant(qindex, 0, cm->bit_depth); + cm->uv_dequant[i][0] = + vp9_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth); + cm->uv_dequant[i][1] = + vp9_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth); + } + } else { + const int qindex = cm->base_qindex; + // When segmentation is disabled, only the first value is used. The + // remaining are don't cares. + cm->y_dequant[0][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->y_dequant[0][1] = vp9_ac_quant(qindex, 0, cm->bit_depth); + cm->uv_dequant[0][0] = + vp9_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth); + cm->uv_dequant[0][1] = + vp9_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth); + } +} + +static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) { + const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH, EIGHTTAP, + EIGHTTAP_SHARP, BILINEAR }; + return vpx_rb_read_bit(rb) ? SWITCHABLE + : literal_to_filter[vpx_rb_read_literal(rb, 2)]; +} + +static void setup_render_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + cm->render_width = cm->width; + cm->render_height = cm->height; + if (vpx_rb_read_bit(rb)) + vp9_read_frame_size(rb, &cm->render_width, &cm->render_height); +} + +static void resize_mv_buffer(VP9_COMMON *cm) { + vpx_free(cm->cur_frame->mvs); + cm->cur_frame->mi_rows = cm->mi_rows; + cm->cur_frame->mi_cols = cm->mi_cols; + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*cm->cur_frame->mvs))); +} + +static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in vp9_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. + if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { + if (vp9_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. + cm->width = 0; + cm->height = 0; + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } else { + vp9_set_mb_mi(cm, width, height); + } + vp9_init_context_buffers(cm); + cm->width = width; + cm->height = height; + } + if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows || + cm->mi_cols > cm->cur_frame->mi_cols) { + resize_mv_buffer(cm); + } +} + +static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + int width, height; + BufferPool *const pool = cm->buffer_pool; + vp9_read_frame_size(rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + + if (vpx_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, + cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, + &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, + pool->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + + pool->frame_bufs[cm->new_fb_idx].released = 0; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; + pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; +} + +static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + vpx_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; +} + +static void setup_frame_size_with_refs(VP9_COMMON *cm, + struct vpx_read_bit_buffer *rb) { + int width, height; + int found = 0, i; + int has_valid_ref_frame = 0; + BufferPool *const pool = cm->buffer_pool; + for (i = 0; i < REFS_PER_FRAME; ++i) { + if (vpx_rb_read_bit(rb)) { + if (cm->frame_refs[i].idx != INVALID_IDX) { + YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + found = 1; + break; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode frame size"); + } + } + } + + if (!found) vp9_read_frame_size(rb, &width, &height); + + if (width <= 0 || height <= 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of frames that this frame references + // has valid dimensions. + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + has_valid_ref_frame |= + (ref_frame->idx != INVALID_IDX && + valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, width, height)); + } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + if (ref_frame->idx == INVALID_IDX || + !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth, + ref_frame->buf->subsampling_x, + ref_frame->buf->subsampling_y, cm->bit_depth, + cm->subsampling_x, cm->subsampling_y)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color format"); + } + + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + + if (vpx_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, + cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, + &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, + pool->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + + pool->frame_bufs[cm->new_fb_idx].released = 0; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; + pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; +} + +static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + int min_log2_tile_cols, max_log2_tile_cols, max_ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + // columns + max_ones = max_log2_tile_cols - min_log2_tile_cols; + cm->log2_tile_cols = min_log2_tile_cols; + while (max_ones-- && vpx_rb_read_bit(rb)) cm->log2_tile_cols++; + + if (cm->log2_tile_cols > 6) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid number of tile columns"); + + // rows + cm->log2_tile_rows = vpx_rb_read_bit(rb); + if (cm->log2_tile_rows) cm->log2_tile_rows += vpx_rb_read_bit(rb); +} + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. +static void get_tile_buffer(const uint8_t *const data_end, int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data, vpx_decrypt_cb decrypt_cb, + void *decrypt_state, TileBuffer *buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (decrypt_cb) { + uint8_t be_data[4]; + decrypt_cb(decrypt_state, *data, be_data, 4); + size = mem_get_be32(be_data); + } else { + size = mem_get_be32(*data); + } + *data += 4; + + if (size > (size_t)(data_end - *data)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int tile_cols, + int tile_rows, + TileBuffer (*tile_buffers)[1 << 6]) { + int r, c; + + for (r = 0; r < tile_rows; ++r) { + for (c = 0; c < tile_cols; ++c) { + const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); + TileBuffer *const buf = &tile_buffers[r][c]; + buf->col = c; + get_tile_buffer(data_end, is_last, &pbi->common.error, &data, + pbi->decrypt_cb, pbi->decrypt_state, buf); + } + } +} + +static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); + row_mt_worker_data->recon_map[map_idx] = 1; + pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]); + pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + volatile int8_t *map = row_mt_worker_data->recon_map + map_idx; + pthread_mutex_t *const mutex = + &row_mt_worker_data->recon_sync_mutex[sync_idx]; + pthread_mutex_lock(mutex); + while (!(*map)) { + pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex); + } + pthread_mutex_unlock(mutex); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) { + int return_val = 0; +#if CONFIG_MULTITHREAD + int corrupted; + pthread_mutex_lock(lf_sync->lf_mutex); + corrupted = lf_sync->corrupted; + pthread_mutex_unlock(lf_sync->lf_mutex); + if (!corrupted) { + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1; + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); + } +#else + (void)lf_sync; + (void)row; + (void)num_tile_cols; +#endif + return return_val; +} + +static void vp9_tile_done(VP9Decoder *pbi) { +#if CONFIG_MULTITHREAD + int terminate; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int all_parse_done = 1 << pbi->common.log2_tile_cols; + pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex); + row_mt_worker_data->num_tiles_done++; + terminate = all_parse_done == row_mt_worker_data->num_tiles_done; + pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex); + if (terminate) { + vp9_jobq_terminate(&row_mt_worker_data->jobq); + } +#else + (void)pbi; +#endif +} + +static void vp9_jobq_alloc(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job); + + if (jobq_size > row_mt_worker_data->jobq_size) { + vpx_free(row_mt_worker_data->jobq_buf); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); + vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, + jobq_size); + row_mt_worker_data->jobq_size = jobq_size; + } +} + +static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int is_last_row, VP9LfSync *lf_sync, + int cur_tile_col) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int tile_cols = 1 << cm->log2_tile_cols; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int mi_col_start = tile_data->xd.tile.mi_col_start; + int mi_col_end = tile_data->xd.tile.mi_col_end; + int mi_col; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + + // Top Dependency + if (cur_sb_row) { + map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c, + ((cur_sb_row - 1) * tile_cols) + cur_tile_col); + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON, + recon_block); + if (cm->lf.filter_level && !cm->skip_loop_filter) { + // Queue LPF_JOB + int is_lpf_job_ready = 0; + + if (mi_col + MI_BLOCK_SIZE >= mi_col_end) { + // Checks if this row has been decoded in all tiles + is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols); + + if (is_lpf_job_ready) { + Job lpf_job; + lpf_job.job_type = LPF_JOB; + if (cur_sb_row > 0) { + lpf_job.row_num = mi_row - MI_BLOCK_SIZE; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + if (is_last_row) { + lpf_job.row_num = mi_row; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + } + } + } + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + cur_tile_col); + } +} + +static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int cur_tile_col, uint8_t **data_end) { + int mi_col; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + TileInfo *tile = &tile_data->xd.tile; + TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col]; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, cm, 0, cur_tile_col); + + /* Update reader only at the beginning of each row in a tile */ + if (mi_row == 0) { + setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + } + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + tile_data->xd.error_info = &tile_data->error_info; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE, + parse_block); + } +} + +static int row_decode_worker_hook(void *arg1, void *arg2) { + ThreadData *const thread_data = (ThreadData *)arg1; + uint8_t **data_end = (uint8_t **)arg2; + VP9Decoder *const pbi = thread_data->pbi; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + Job job; + LFWorkerData *lf_data = thread_data->lf_data; + VP9LfSync *lf_sync = thread_data->lf_sync; + volatile int corrupted = 0; + TileWorkerData *volatile tile_data_recon = NULL; + + while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) { + int mi_col; + const int mi_row = job.row_num; + + if (job.job_type == LPF_JOB) { + lf_data->start = mi_row; + lf_data->stop = lf_data->start + MI_BLOCK_SIZE; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + mi_row < cm->mi_rows) { + vp9_loopfilter_job(lf_data, lf_sync); + } + } else if (job.job_type == RECON_JOB) { + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int is_last_row = sb_rows - 1 == cur_sb_row; + int mi_col_start, mi_col_end; + if (!tile_data_recon) + CHECK_MEM_ERROR(&cm->error, tile_data_recon, + vpx_memalign(32, sizeof(TileWorkerData))); + + tile_data_recon->xd = pbi->mb; + vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col); + vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff); + mi_col_start = tile_data_recon->xd.tile.mi_col_start; + mi_col_end = tile_data_recon->xd.tile.mi_col_end; + + if (setjmp(tile_data_recon->error_info.jmp)) { + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + tile_data_recon->error_info.setjmp = 0; + corrupted = 1; + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + job.tile_col); + } + if (is_last_row) { + vp9_tile_done(pbi); + } + continue; + } + + tile_data_recon->error_info.setjmp = 1; + tile_data_recon->xd.error_info = &tile_data_recon->error_info; + + recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync, + job.tile_col); + + if (corrupted) + vpx_internal_error(&tile_data_recon->error_info, + VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (is_last_row) { + vp9_tile_done(pbi); + } + } else if (job.job_type == PARSE_JOB) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col]; + + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + corrupted = 1; + vp9_tile_done(pbi); + continue; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts; + + tile_data->error_info.setjmp = 1; + + parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end); + + corrupted |= tile_data->xd.corrupted; + if (corrupted) + vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + /* Queue in the recon_job for this row */ + { + Job recon_job; + recon_job.row_num = mi_row; + recon_job.tile_col = job.tile_col; + recon_job.job_type = RECON_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job, + sizeof(recon_job)); + } + + /* Queue next parse job */ + if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) { + Job parse_job; + parse_job.row_num = mi_row + MI_BLOCK_SIZE; + parse_job.tile_col = job.tile_col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, + sizeof(parse_job)); + } + } + } + + vpx_free(tile_data_recon); + return !corrupted; +} + +static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + TileBuffer tile_buffers[4][1 << 6]; + int tile_row, tile_col; + int mi_row, mi_col; + TileWorkerData *tile_data = NULL; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, + vpx_memalign(32, sizeof(LFWorkerData))); + pbi->lf_worker.hook = vp9_loop_filter_worker; + if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; + // Be sure to sync as we might be resuming after a failed frame decode. + winterface->sync(&pbi->lf_worker); + vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm, + pbi->mb.plane); + } + + assert(tile_rows <= 4); + assert(tile_cols <= (1 << 6)); + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_cols); + + vp9_reset_lfm(cm); + + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); + + // Load all tile information into tile_data. + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + tile_data = pbi->tile_worker_data + tile_cols * tile_row + tile_col; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &cm->counts; + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(&tile_data->xd.tile, cm, tile_row, tile_col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + TileInfo tile; + vp9_tile_set_row(&tile, cm, tile_row); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = + pbi->inv_tile_order ? tile_cols - tile_col - 1 : tile_col; + tile_data = pbi->tile_worker_data + tile_cols * tile_row + col; + vp9_tile_set_col(&tile, cm, col); + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += MI_BLOCK_SIZE) { + if (pbi->row_mt == 1) { + int plane; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + PARSE, parse_block); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + RECON, recon_block); + } else { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } + } + pbi->mb.corrupted |= tile_data->xd.corrupted; + if (pbi->mb.corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + } + // Loopfilter one row. + if (cm->lf.filter_level && !cm->skip_loop_filter) { + const int lf_start = mi_row - MI_BLOCK_SIZE; + LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; + + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; + + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue; + + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (pbi->max_threads > 1) { + winterface->launch(&pbi->lf_worker); + } else { + winterface->execute(&pbi->lf_worker); + } + } + } + } + + // Loopfilter remaining rows in the frame. + if (cm->lf.filter_level && !cm->skip_loop_filter) { + LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + winterface->execute(&pbi->lf_worker); + } + + // Get last tile data. + tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; + + return vpx_reader_find_end(&tile_data->bit_reader); +} + +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + +// On entry 'tile_data->data_end' points to the end of the input frame, on exit +// it is updated to reflect the bitreader position of the final tile column if +// present in the tile buffer group or NULL otherwise. +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + + TileInfo *volatile tile = &tile_data->xd.tile; + const int final_col = (1 << pbi->common.log2_tile_cols) - 1; + const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; + volatile int n = tile_data->buf_start; + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + tile_data->xd.corrupted = 1; + tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } + return 0; + } + tile_data->error_info.setjmp = 1; + + tile_data->xd.corrupted = 0; + + do { + int mi_col; + const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. (So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, &pbi->common, 0, buf->col); + setup_token_decoder(buf->data, tile_data->data_end, buf->size, + &tile_data->error_info, &tile_data->bit_reader, + pbi->decrypt_cb, pbi->decrypt_state); + vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } + } + + if (buf->col == final_col) { + bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader); + } + } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + + tile_data->data_end = bit_reader_end; + return !tile_data->xd.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileBuffer *const buf_a = (const TileBuffer *)a; + const TileBuffer *const buf_b = (const TileBuffer *)b; + return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size); +} + +static INLINE void init_mt(VP9Decoder *pbi) { + int n; + VP9_COMMON *const cm = &pbi->common; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + + if (pbi->num_tile_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, + vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + for (n = 0; n < num_threads; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ++pbi->num_tile_workers; + + winterface->init(worker); + if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Initialize LPF + if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); +} + +static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = pbi->max_threads; + int i, n; + int col; + int corrupted = 0; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + memset(row_mt_worker_data->recon_map, 0, + sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map)); + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n]; + winterface->sync(worker); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + thread_data->lf_sync = lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm, + pbi->mb.plane); + } + + thread_data->pbi = pbi; + + worker->hook = row_decode_worker_hook; + worker->data1 = thread_data; + worker->data2 = (void *)&row_mt_worker_data->data_end; + } + + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + } + + /* Reset the jobq to start of the jobq buffer */ + vp9_jobq_reset(&row_mt_worker_data->jobq); + row_mt_worker_data->num_tiles_done = 0; + row_mt_worker_data->data_end = NULL; + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + vp9_zero(tile_data->counts); + } + } + + // queue parse jobs for 0th row of every tile + for (col = 0; col < tile_cols; ++col) { + Job parse_job; + parse_job.row_num = 0; + parse_job.tile_col = col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job)); + } + + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + worker->had_error = 0; + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + corrupted |= !winterface->sync(worker); + } + + pbi->mb.corrupted = corrupted; + + { + /* Set data end */ + TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1]; + row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader); + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (i = 0; i < tile_cols; ++i) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[i]; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + return row_mt_worker_data->data_end; +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = VPXMIN(pbi->max_threads, tile_cols); + int n; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + TileWorkerData *const tile_data = + &pbi->tile_worker_data[n + pbi->total_tiles]; + winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + worker->hook = tile_worker_hook; + worker->data1 = tile_data; + worker->data2 = pbi; + } + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Sort the buffers based on size in descending order. + qsort(pbi->tile_buffers, tile_cols, sizeof(pbi->tile_buffers[0]), + compare_tile_buffers); + + if (num_workers == tile_cols) { + // Rearrange the tile buffers such that the largest, and + // presumably the most difficult, tile will be decoded in the main thread. + // This should help minimize the number of instances where the main thread + // is waiting for a worker to complete. + const TileBuffer largest = pbi->tile_buffers[0]; + memmove(pbi->tile_buffers, pbi->tile_buffers + 1, + (tile_cols - 1) * sizeof(pbi->tile_buffers[0])); + pbi->tile_buffers[tile_cols - 1] = largest; + } else { + int start = 0, end = tile_cols - 2; + TileBuffer tmp; + + // Interleave the tiles to distribute the load between threads, assuming a + // larger tile implies it is more difficult to decode. + while (start < end) { + tmp = pbi->tile_buffers[start]; + pbi->tile_buffers[start] = pbi->tile_buffers[end]; + pbi->tile_buffers[end] = tmp; + start += 2; + end -= 2; + } + } + + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (n = 0; n < num_workers; ++n) { + TileWorkerData *const tile_data = + (TileWorkerData *)pbi->tile_workers[n].data1; + vp9_zero(tile_data->counts); + } + } + + { + const int base = tile_cols / num_workers; + const int remain = tile_cols % num_workers; + int buf_start = 0; + + for (n = 0; n < num_workers; ++n) { + const int count = base + (remain + n) / num_workers; + VPxWorker *const worker = &pbi->tile_workers[n]; + TileWorkerData *const tile_data = (TileWorkerData *)worker->data1; + + tile_data->buf_start = buf_start; + tile_data->buf_end = buf_start + count - 1; + tile_data->data_end = data_end; + buf_start += count; + + worker->had_error = 0; + if (n == num_workers - 1) { + assert(tile_data->buf_end == tile_cols - 1); + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + TileWorkerData *const tile_data = (TileWorkerData *)worker->data1; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + pbi->mb.corrupted |= !winterface->sync(worker); + if (!bit_reader_end) bit_reader_end = tile_data->data_end; + } + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (n = 0; n < num_workers; ++n) { + TileWorkerData *const tile_data = + (TileWorkerData *)pbi->tile_workers[n].data1; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + assert(bit_reader_end || pbi->mb.corrupted); + return bit_reader_end; +} + +static void error_handler(void *data) { + VP9_COMMON *const cm = (VP9_COMMON *)data; + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm, + struct vpx_read_bit_buffer *rb) { + if (cm->profile >= PROFILE_2) { + cm->bit_depth = vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 1; +#endif + } else { + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif + } + cm->color_space = vpx_rb_read_literal(rb, 3); + if (cm->color_space != VPX_CS_SRGB) { + cm->color_range = (vpx_color_range_t)vpx_rb_read_bit(rb); + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + cm->subsampling_x = vpx_rb_read_bit(rb); + cm->subsampling_y = vpx_rb_read_bit(rb); + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:2:0 color not supported in profile 1 or 3"); + if (vpx_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + cm->subsampling_y = cm->subsampling_x = 1; + } + } else { + cm->color_range = VPX_CR_FULL_RANGE; + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed. + cm->subsampling_y = cm->subsampling_x = 0; + if (vpx_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:4:4 color not supported in profile 0 or 2"); + } + } +} + +static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) { + if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i == cm->new_fb_idx) continue; + frame_bufs[i].ref_count = 0; + if (!frame_bufs[i].released) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer); + frame_bufs[i].released = 1; + } + } + } +} + +static size_t read_uncompressed_header(VP9Decoder *pbi, + struct vpx_read_bit_buffer *rb) { + VP9_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + int i, mask, ref_index = 0; + size_t sz; + + cm->last_frame_type = cm->frame_type; + cm->last_intra_only = cm->intra_only; + + if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame marker"); + + cm->profile = vp9_read_profile(rb); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->profile >= MAX_PROFILES) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Unsupported bitstream profile"); +#else + if (cm->profile >= PROFILE_2) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Unsupported bitstream profile"); +#endif + + cm->show_existing_frame = vpx_rb_read_bit(rb); + if (cm->show_existing_frame) { + // Show an existing frame directly. + const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + } + + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + pbi->refresh_frame_flags = 0; + cm->lf.filter_level = 0; + cm->show_frame = 1; + + return 0; + } + + cm->frame_type = (FRAME_TYPE)vpx_rb_read_bit(rb); + cm->show_frame = vpx_rb_read_bit(rb); + cm->error_resilient_mode = vpx_rb_read_bit(rb); + + if (cm->frame_type == KEY_FRAME) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + + read_bitdepth_colorspace_sampling(cm, rb); + pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (i = 0; i < REFS_PER_FRAME; ++i) { + cm->frame_refs[i].idx = INVALID_IDX; + cm->frame_refs[i].buf = NULL; + } + + setup_frame_size(cm, rb); + if (pbi->need_resync) { + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + flush_all_fb_on_key(cm); + pbi->need_resync = 0; + } + } else { + cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb); + + cm->reset_frame_context = + cm->error_resilient_mode ? 0 : vpx_rb_read_literal(rb, 2); + + if (cm->intra_only) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + if (cm->profile > PROFILE_0) { + read_bitdepth_colorspace_sampling(cm, rb); + } else { + // NOTE: The intra-only frame header does not include the specification + // of either the color format or color sub-sampling in profile 0. VP9 + // specifies that the default color format should be YUV 4:2:0 in this + // case (normative). + cm->color_space = VPX_CS_BT_601; + cm->color_range = VPX_CR_STUDIO_RANGE; + cm->subsampling_y = cm->subsampling_x = 1; + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif + } + + pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES); + setup_frame_size(cm, rb); + if (pbi->need_resync) { + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + pbi->need_resync = 0; + } + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES); + for (i = 0; i < REFS_PER_FRAME; ++i) { + const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2); + const int idx = cm->ref_frame_map[ref]; + RefBuffer *const ref_frame = &cm->frame_refs[i]; + ref_frame->idx = idx; + ref_frame->buf = &frame_bufs[idx].buf; + cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb); + } + + setup_frame_size_with_refs(cm, rb); + + cm->allow_high_precision_mv = vpx_rb_read_bit(rb); + cm->interp_filter = read_interp_filter(rb); + + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_buf = &cm->frame_refs[i]; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &ref_buf->sf, ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame( + &ref_buf->sf, ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, cm->width, cm->height); +#endif + } + } + } +#if CONFIG_VP9_HIGHBITDEPTH + get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; +#endif + get_frame_new_buffer(cm)->color_space = cm->color_space; + get_frame_new_buffer(cm)->color_range = cm->color_range; + get_frame_new_buffer(cm)->render_width = cm->render_width; + get_frame_new_buffer(cm)->render_height = cm->render_height; + + if (pbi->need_resync) { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if (!cm->error_resilient_mode) { + cm->refresh_frame_context = vpx_rb_read_bit(rb); + cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb); + if (!cm->frame_parallel_decoding_mode) vp9_zero(cm->counts); + } else { + cm->refresh_frame_context = 0; + cm->frame_parallel_decoding_mode = 1; + } + + // This flag will be overridden by the call to vp9_setup_past_independence + // below, forcing the use of context 0 for those frame types. + cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); + + // Generate next_ref_frame_map. + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; + ++frame_bufs[cm->new_fb_idx].ref_count; + } else { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + } + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + ++ref_index; + } + + for (; ref_index < REF_FRAMES; ++ref_index) { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + } + pbi->hold_ref_buf = 1; + + if (frame_is_intra_only(cm) || cm->error_resilient_mode) + vp9_setup_past_independence(cm); + + setup_loopfilter(&cm->lf, rb); + setup_quantization(cm, &pbi->mb, rb); + setup_segmentation(&cm->seg, rb); + setup_segmentation_dequant(cm); + + setup_tile_info(cm, rb); + if (pbi->row_mt == 1) { + int num_sbs = 1; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int num_jobs = sb_rows << cm->log2_tile_cols; + + if (pbi->row_mt_worker_data == NULL) { + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, + vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); +#endif + } + + if (pbi->max_threads > 1) { + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + + num_sbs = sb_cols * sb_rows; + } + + if (num_sbs > pbi->row_mt_worker_data->num_sbs || + num_jobs > pbi->row_mt_worker_data->num_jobs) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs, + pbi->max_threads, num_jobs); + } + vp9_jobq_alloc(pbi); + } + sz = vpx_rb_read_literal(rb, 16); + + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); + + return sz; +} + +static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, + size_t partition_size) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + FRAME_CONTEXT *const fc = cm->fc; + vpx_reader r; + int k; + + if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb, + pbi->decrypt_state)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r); + if (cm->tx_mode == TX_MODE_SELECT) read_tx_mode_probs(&fc->tx_probs, &r); + read_coef_probs(fc, cm->tx_mode, &r); + + for (k = 0; k < SKIP_CONTEXTS; ++k) + vp9_diff_update_prob(&r, &fc->skip_probs[k]); + + if (!frame_is_intra_only(cm)) { + nmv_context *const nmvc = &fc->nmvc; + int i, j; + + read_inter_mode_probs(fc, &r); + + if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); + + cm->reference_mode = read_frame_reference_mode(cm, &r); + if (cm->reference_mode != SINGLE_REFERENCE) + vp9_setup_compound_reference_mode(cm); + read_frame_reference_mode_probs(cm, &r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < INTRA_MODES - 1; ++i) + vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]); + + for (j = 0; j < PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); + + read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); + } + + return vpx_reader_has_error(&r); +} + +static struct vpx_read_bit_buffer *init_read_bit_buffer( + VP9Decoder *pbi, struct vpx_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end, uint8_t clear_data[MAX_VP9_HEADER_SIZE]) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + if (pbi->decrypt_cb) { + const int n = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data); + pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n); + rb->bit_buffer = clear_data; + rb->bit_buffer_end = clear_data + n; + } else { + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + } + return rb; +} + +//------------------------------------------------------------------------------ + +int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) { + return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && + vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && + vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; +} + +void vp9_read_frame_size(struct vpx_read_bit_buffer *rb, int *width, + int *height) { + *width = vpx_rb_read_literal(rb, 16) + 1; + *height = vpx_rb_read_literal(rb, 16) + 1; +} + +BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) { + int profile = vpx_rb_read_bit(rb); + profile |= vpx_rb_read_bit(rb) << 1; + if (profile > 2) profile += vpx_rb_read_bit(rb); + return (BITSTREAM_PROFILE)profile; +} + +void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, const uint8_t **p_data_end) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + struct vpx_read_bit_buffer rb; + int context_updated = 0; + uint8_t clear_data[MAX_VP9_HEADER_SIZE]; + const size_t first_partition_size = read_uncompressed_header( + pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); + const int tile_rows = 1 << cm->log2_tile_rows; + const int tile_cols = 1 << cm->log2_tile_cols; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif + xd->cur_buf = new_fb; + + if (!first_partition_size) { + // showing a frame directly + *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2); + return; + } + + data += vpx_rb_bytes_read(&rb); + if (!read_is_valid(data, first_partition_size, data_end)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt header length"); + + cm->use_prev_frame_mvs = + !cm->error_resilient_mode && cm->width == cm->last_width && + cm->height == cm->last_height && !cm->last_intra_only && + cm->last_show_frame && (cm->last_frame_type != KEY_FRAME); + + vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); + + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + if (!cm->fc->initialized) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); + if (new_fb->corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data header is corrupted."); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } + + if (pbi->tile_worker_data == NULL || + (tile_cols * tile_rows) != pbi->total_tiles) { + const int num_tile_workers = + tile_cols * tile_rows + ((pbi->max_threads > 1) ? pbi->max_threads : 0); + const size_t twd_size = num_tile_workers * sizeof(*pbi->tile_worker_data); + // Ensure tile data offsets will be properly aligned. This may fail on + // platforms without DECLARE_ALIGNED(). + assert((sizeof(*pbi->tile_worker_data) % 16) == 0); + vpx_free(pbi->tile_worker_data); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); + pbi->total_tiles = tile_rows * tile_cols; + } + + if (pbi->max_threads > 1 && tile_rows == 1 && + (tile_cols > 1 || pbi->row_mt == 1)) { + if (pbi->row_mt == 1) { + *p_data_end = + decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end); + } else { + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt( + new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0, + pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + } + } + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); + } + + if (!xd->corrupted) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); + + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + + // Non frame parallel update frame context here. + if (cm->refresh_frame_context && !context_updated) + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h new file mode 100644 index 0000000000..ba95e72344 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_ +#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/common/vp9_enums.h" + +struct VP9Decoder; +struct vpx_read_bit_buffer; + +int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb); +void vp9_read_frame_size(struct vpx_read_bit_buffer *rb, int *width, + int *height); +BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb); + +void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, const uint8_t **p_data_end); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c new file mode 100644 index 0000000000..0989cde58d --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c @@ -0,0 +1,850 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_decodeframe.h" + +#include "vpx_dsp/vpx_dsp_common.h" + +static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) { + return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p); +} + +static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, int size_group) { + const PREDICTION_MODE y_mode = + read_intra_mode(r, cm->fc->y_mode_prob[size_group]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->y_mode[size_group][y_mode]; + return y_mode; +} + +static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, + PREDICTION_MODE y_mode) { + const PREDICTION_MODE uv_mode = + read_intra_mode(r, cm->fc->uv_mode_prob[y_mode]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->uv_mode[y_mode][uv_mode]; + return uv_mode; +} + +static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, int ctx) { + const int mode = + vpx_read_tree(r, vp9_inter_mode_tree, cm->fc->inter_mode_probs[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->inter_mode[ctx][mode]; + + return NEARESTMV + mode; +} + +static int read_segment_id(vpx_reader *r, const struct segmentation *seg) { + return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs); +} + +static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + TX_SIZE max_tx_size, vpx_reader *r) { + FRAME_COUNTS *counts = xd->counts; + const int ctx = get_tx_size_context(xd); + const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs); + int tx_size = vpx_read(r, tx_probs[0]); + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { + tx_size += vpx_read(r, tx_probs[1]); + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) + tx_size += vpx_read(r, tx_probs[2]); + } + + if (counts) ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size]; + return (TX_SIZE)tx_size; +} + +static INLINE TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + int allow_select, vpx_reader *r) { + TX_MODE tx_mode = cm->tx_mode; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) + return read_selected_tx_size(cm, xd, max_tx_size, r); + else + return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); +} + +static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, + int mi_offset, int x_mis, int y_mis) { + int x, y, segment_id = INT_MAX; + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + segment_id = + VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static void set_segment_id(VP9_COMMON *cm, int mi_offset, int x_mis, int y_mis, + int segment_id) { + int x, y; + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; +} + +static void copy_segment_id(const VP9_COMMON *cm, + const uint8_t *last_segment_ids, + uint8_t *current_segment_ids, int mi_offset, + int x_mis, int y_mis) { + int x, y; + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + current_segment_ids[mi_offset + y * cm->mi_cols + x] = + last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x] + : 0; +} + +static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset, int x_mis, + int y_mis, vpx_reader *r) { + struct segmentation *const seg = &cm->seg; + int segment_id; + + if (!seg->enabled) return 0; // Default for disabled segmentation + + if (!seg->update_map) { + copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, + mi_offset, x_mis, y_mis); + return 0; + } + + segment_id = read_segment_id(r, seg); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, vpx_reader *r, + int x_mis, int y_mis) { + struct segmentation *const seg = &cm->seg; + MODE_INFO *const mi = xd->mi[0]; + int predicted_segment_id, segment_id; + const int mi_offset = mi_row * cm->mi_cols + mi_col; + + if (!seg->enabled) return 0; // Default for disabled segmentation + + predicted_segment_id = cm->last_frame_seg_map + ? dec_get_segment_id(cm, cm->last_frame_seg_map, + mi_offset, x_mis, y_mis) + : 0; + + if (!seg->update_map) { + copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, + mi_offset, x_mis, y_mis); + return predicted_segment_id; + } + + if (seg->temporal_update) { + const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); + mi->seg_id_predicted = vpx_read(r, pred_prob); + segment_id = + mi->seg_id_predicted ? predicted_segment_id : read_segment_id(r, seg); + } else { + segment_id = read_segment_id(r, seg); + } + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + vpx_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int ctx = vp9_get_skip_context(xd); + const int skip = vpx_read(r, cm->fc->skip_probs[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->skip[ctx][skip]; + return skip; + } +} + +static void read_intra_frame_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, int mi_row, + int mi_col, vpx_reader *r, int x_mis, + int y_mis) { + MODE_INFO *const mi = xd->mi[0]; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const BLOCK_SIZE bsize = mi->sb_type; + int i; + const int mi_offset = mi_row * cm->mi_cols + mi_col; + + mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r); + mi->skip = read_skip(cm, xd, mi->segment_id, r); + mi->tx_size = read_tx_size(cm, xd, 1, r); + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i)); + mi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1)); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2)); + break; + default: + mi->mode = read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + } + + mi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mi->mode]); +} + +static int read_mv_component(vpx_reader *r, const nmv_component *mvcomp, + int usehp) { + int mag, d, fr, hp; + const int sign = vpx_read(r, mvcomp->sign); + const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes); + const int class0 = mv_class == MV_CLASS_0; + + // Integer part + if (class0) { + d = vpx_read(r, mvcomp->class0[0]); + mag = 0; + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + + d = 0; + for (i = 0; i < n; ++i) d |= vpx_read(r, mvcomp->bits[i]) << i; + mag = CLASS0_SIZE << (mv_class + 2); + } + + // Fractional part + fr = vpx_read_tree(r, vp9_mv_fp_tree, + class0 ? mvcomp->class0_fp[d] : mvcomp->fp); + + // High precision part (if hp is not used, the default value of the hp is 1) + hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) : 1; + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? -mag : mag; +} + +static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref, + const nmv_context *ctx, nmv_context_counts *counts, + int allow_hp) { + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints); + const int use_hp = allow_hp && use_mv_hp(ref); + MV diff = { 0, 0 }; + + if (mv_joint_vertical(joint_type)) + diff.row = read_mv_component(r, &ctx->comps[0], use_hp); + + if (mv_joint_horizontal(joint_type)) + diff.col = read_mv_component(r, &ctx->comps[1], use_hp); + + vp9_inc_mv(&diff, counts); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vpx_reader *r) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = vp9_get_reference_mode_context(cm, xd); + const REFERENCE_MODE mode = + (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->comp_inter[ctx][mode]; + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + return cm->reference_mode; + } +} + +// Read the reference frame +static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, + vpx_reader *r, int segment_id, + MV_REFERENCE_FRAME ref_frame[2]) { + FRAME_CONTEXT *const fc = cm->fc; + FRAME_COUNTS *counts = xd->counts; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); + ref_frame[1] = NO_REF_FRAME; + } else { + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); + // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding + if (mode == COMPOUND_REFERENCE) { + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = vpx_read(r, fc->comp_ref_prob[ctx]); + if (counts) ++counts->comp_ref[ctx][bit]; + ref_frame[idx] = cm->comp_fixed_ref; + ref_frame[!idx] = cm->comp_var_ref[bit]; + } else if (mode == SINGLE_REFERENCE) { + const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); + const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]); + if (counts) ++counts->single_ref[ctx0][0][bit0]; + if (bit0) { + const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); + const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]); + if (counts) ++counts->single_ref[ctx1][1][bit1]; + ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME; + } else { + ref_frame[0] = LAST_FRAME; + } + + ref_frame[1] = NO_REF_FRAME; + } else { + assert(0 && "Invalid prediction mode."); + } + } +} + +static INLINE INTERP_FILTER read_switchable_interp_filter(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + vpx_reader *r) { + const int ctx = get_pred_context_switchable_interp(xd); + const INTERP_FILTER type = (INTERP_FILTER)vpx_read_tree( + r, vp9_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->switchable_interp[ctx][type]; + return type; +} + +static void read_intra_block_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, MODE_INFO *mi, + vpx_reader *r) { + const BLOCK_SIZE bsize = mi->sb_type; + int i; + + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0); + mi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd, r, 0); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode_y(cm, xd, r, 0); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd, r, 0); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode_y(cm, xd, r, 0); + break; + default: mi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]); + } + + mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode); + + // Initialize interp_filter here so we do not have to check for inter block + // modes in get_pred_context_switchable_interp() + mi->interp_filter = SWITCHABLE_FILTERS; + + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; +} + +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && + mv->col < MV_UPP; +} + +static INLINE void copy_mv_pair(int_mv *dst, const int_mv *src) { + memcpy(dst, src, sizeof(*dst) * 2); +} + +static INLINE void zero_mv_pair(int_mv *dst) { + memset(dst, 0, sizeof(*dst) * 2); +} + +static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, + PREDICTION_MODE mode, int_mv mv[2], + int_mv ref_mv[2], int_mv near_nearest_mv[2], + int is_compound, int allow_hp, vpx_reader *r) { + int i; + int ret = 1; + + switch (mode) { + case NEWMV: { + FRAME_COUNTS *counts = xd->counts; + nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL; + for (i = 0; i < 1 + is_compound; ++i) { + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts, + allow_hp); + ret = ret && is_mv_valid(&mv[i].as_mv); + } + break; + } + case NEARMV: + case NEARESTMV: { + copy_mv_pair(mv, near_nearest_mv); + break; + } + case ZEROMV: { + zero_mv_pair(mv); + break; + } + default: { + return 0; + } + } + return ret; +} + +static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int segment_id, vpx_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; + } else { + const int ctx = get_intra_inter_context(xd); + const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->intra_inter[ctx][is_inter]; + return is_inter; + } +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector or early_break +// it will also skip all additional processing and jump to Done! +#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != (mv_ref_list)[0].as_int) { \ + (mv_ref_list)[(refmv_count)] = (mv); \ + refmv_count++; \ + goto Done; \ + } \ + } else { \ + (mv_ref_list)[(refmv_count)++] = (mv); \ + if (early_break) goto Done; \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \ + refmv_count, mv_ref_list, Done) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + } \ + } while (0) + +// This function searches the neighborhood of a given MB/SB +// to try and find candidate reference vectors. +static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, + const POSITION *const mv_ref_search, + int_mv *mv_ref_list, int mi_row, int mi_col, + int block) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + int different_ref_found = 0; + const MV_REF *const prev_frame_mvs = + cm->use_prev_frame_mvs + ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col + : NULL; + const TileInfo *const tile = &xd->tile; + // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop + // searching after the first mv is found. + const int early_break = (mode != NEARMV); + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + i = 0; + if (block >= 0) { + // If the size < 8x8 we get the mv from the bmi substructure for the + // nearest two blocks. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST_EB( + get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST_EB( + get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + } + } + } + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Check the last frame's mode and mv info. + if (prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); + } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + + // Since we still don't have a candidate we'll try the last frame. + if (prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] != ref_frame && + prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { + int_mv mv = prev_frame_mvs->mv[0]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + + if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && + prev_frame_mvs->ref_frame[1] != ref_frame && + prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) { + int_mv mv = prev_frame_mvs->mv[1]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + } + + if (mode == NEARMV) + refmv_count = MAX_MV_REF_CANDIDATES; + else + // we only care about the nearestmv for the remaining modes + refmv_count = 1; + +Done: + // Clamp vectors + for (i = 0; i < refmv_count; ++i) clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return refmv_count; +} + +static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const POSITION *const mv_ref_search, + PREDICTION_MODE b_mode, int block, + int ref, int mi_row, int mi_col, + int_mv *best_sub8x8) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + int refmv_count; + + assert(MAX_MV_REF_CANDIDATES == 2); + + switch (block) { + case 0: + refmv_count = + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); + best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; + break; + case 1: + case 2: + if (b_mode == NEARESTMV) { + best_sub8x8->as_int = bmi[0].as_mv[ref].as_int; + } else { + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); + best_sub8x8->as_int = 0; + for (n = 0; n < 2; ++n) + if (bmi[0].as_mv[ref].as_int != mv_list[n].as_int) { + best_sub8x8->as_int = mv_list[n].as_int; + break; + } + } + break; + case 3: + if (b_mode == NEARESTMV) { + best_sub8x8->as_int = bmi[2].as_mv[ref].as_int; + } else { + best_sub8x8->as_int = 0; + if (bmi[2].as_mv[ref].as_int != bmi[1].as_mv[ref].as_int) { + best_sub8x8->as_int = bmi[1].as_mv[ref].as_int; + break; + } + if (bmi[2].as_mv[ref].as_int != bmi[0].as_mv[ref].as_int) { + best_sub8x8->as_int = bmi[0].as_mv[ref].as_int; + break; + } + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); + for (n = 0; n < 2; ++n) + if (bmi[2].as_mv[ref].as_int != mv_list[n].as_int) { + best_sub8x8->as_int = mv_list[n].as_int; + break; + } + } + break; + default: assert(0 && "Invalid block index."); + } +} + +static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const POSITION *const mv_ref_search, int mi_row, + int mi_col) { + int i; + int context_counter = 0; + const TileInfo *const tile = &xd->tile; + + // Get mode count from nearest 2 blocks + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + } + } + + return counter_to_context[context_counter]; +} + +static void read_inter_block_mode_info(VP9Decoder *const pbi, + MACROBLOCKD *const xd, + MODE_INFO *const mi, int mi_row, + int mi_col, vpx_reader *r) { + VP9_COMMON *const cm = &pbi->common; + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + int_mv best_ref_mvs[2] = { { 0 }, { 0 } }; + int ref, is_compound; + uint8_t inter_mode_ctx; + const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; + + read_ref_frames(cm, xd, r, mi->segment_id, mi->ref_frame); + is_compound = has_second_ref(mi); + inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col); + + if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) { + mi->mode = ZEROMV; + if (bsize < BLOCK_8X8) { + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid usage of segment feature on small blocks"); + return; + } + } else { + if (bsize >= BLOCK_8X8) + mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx); + } + + mi->interp_filter = (cm->interp_filter == SWITCHABLE) + ? read_switchable_interp_filter(cm, xd, r) + : cm->interp_filter; + + if (bsize < BLOCK_8X8) { + const int num_4x4_w = 1 << xd->bmode_blocks_wl; + const int num_4x4_h = 1 << xd->bmode_blocks_hl; + int idx, idy; + PREDICTION_MODE b_mode; + int got_mv_refs_for_new = 0; + int_mv best_sub8x8[2]; + const uint32_t invalid_mv = 0x80008000; + // Initialize the 2nd element as even though it won't be used meaningfully + // if is_compound is false, copying/clamping it may trigger a MSan warning. + best_sub8x8[1].as_int = invalid_mv; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx); + + if (b_mode == NEARESTMV || b_mode == NEARMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref, + mi_row, mi_col, &best_sub8x8[ref]); + } else if (b_mode == NEWMV && !got_mv_refs_for_new) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + + dec_find_mv_refs(cm, xd, NEWMV, frame, mv_ref_search, tmp_mvs, + mi_row, mi_col, -1); + + lower_mv_precision(&tmp_mvs[0].as_mv, allow_hp); + best_ref_mvs[ref] = tmp_mvs[0]; + got_mv_refs_for_new = 1; + } + } + + if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, + best_sub8x8, is_compound, allow_hp, r)) { + xd->corrupted |= 1; + return; + } + + if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; + if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j]; + } + } + + mi->mode = b_mode; + + copy_mv_pair(mi->mv, mi->bmi[3].as_mv); + } else { + if (mi->mode != ZEROMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + int refmv_count = + dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, + mi_row, mi_col, -1); + lower_mv_precision(&tmp_mvs[refmv_count - 1].as_mv, allow_hp); + best_ref_mvs[ref] = tmp_mvs[refmv_count - 1]; + } + } + xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs, + best_ref_mvs, is_compound, allow_hp, r); + } +} + +static void read_inter_frame_mode_info(VP9Decoder *const pbi, + MACROBLOCKD *const xd, int mi_row, + int mi_col, vpx_reader *r, int x_mis, + int y_mis) { + VP9_COMMON *const cm = &pbi->common; + MODE_INFO *const mi = xd->mi[0]; + int inter_block; + + mi->segment_id = + read_inter_segment_id(cm, xd, mi_row, mi_col, r, x_mis, y_mis); + mi->skip = read_skip(cm, xd, mi->segment_id, r); + inter_block = read_is_inter_block(cm, xd, mi->segment_id, r); + mi->tx_size = read_tx_size(cm, xd, !mi->skip || !inter_block, r); + + if (inter_block) + read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r); + else + read_intra_block_mode_info(cm, xd, mi, r); +} + +static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst, + const MV_REFERENCE_FRAME *src) { + memcpy(dst, src, sizeof(*dst) * 2); +} + +void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, int x_mis, int y_mis) { + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + VP9_COMMON *const cm = &pbi->common; + MODE_INFO *const mi = xd->mi[0]; + MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis); + } else { + // Cache mi->ref_frame and mi->mv so that the compiler can prove that they + // are constant for the duration of the loop and avoids reloading them. + MV_REFERENCE_FRAME mi_ref_frame[2]; + int_mv mi_mv[2]; + + read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + + copy_ref_frame_pair(mi_ref_frame, mi->ref_frame); + copy_mv_pair(mi_mv, mi->mv); + + for (h = 0; h < y_mis; ++h) { + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mvs + w; + copy_ref_frame_pair(mv->ref_frame, mi_ref_frame); + copy_mv_pair(mv->mv, mi_mv); + } + frame_mvs += cm->mi_cols; + } + } +#if 0 // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && + (xd->above_mi == NULL || xd->left_mi == NULL) && + !is_inter_block(mi) && need_top_left[mi->uv_mode]) + assert(0); +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h new file mode 100644 index 0000000000..11b45ace06 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_ +#define VPX_VP9_DECODER_VP9_DECODEMV_H_ + +#include "vpx_dsp/bitreader.h" + +#include "vp9/decoder/vp9_decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, int x_mis, int y_mis); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_DECODER_VP9_DECODEMV_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c new file mode 100644 index 0000000000..5a7e9f9ab3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" + +#include "vp9/decoder/vp9_decodeframe.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_detokenize.h" + +static void initialize_dec(void) { + static volatile int init_done = 0; + + if (!init_done) { + vp9_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + vp9_init_intra_predictors(); + init_done = 1; + } +} + +static void vp9_dec_setup_mi(VP9_COMMON *cm) { + cm->mi = cm->mip + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + memset(cm->mi_grid_base, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); +} + +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs) { + int plane; + const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * + sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_jobs = num_jobs; +#if CONFIG_MULTITHREAD + { + int i; + CHECK_MEM_ERROR( + &cm->error, row_mt_worker_data->recon_sync_mutex, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); + if (row_mt_worker_data->recon_sync_mutex) { + for (i = 0; i < num_jobs; ++i) { + pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR( + &cm->error, row_mt_worker_data->recon_sync_cond, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); + if (row_mt_worker_data->recon_sync_cond) { + for (i = 0; i < num_jobs; ++i) { + pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL); + } + } + } +#endif + row_mt_worker_data->num_sbs = num_sbs; + for (plane = 0; plane < 3; ++plane) { + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(32, dqcoeff_size)); + memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], + vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, + sizeof(*row_mt_worker_data->eob[plane]))); + } + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, + vpx_calloc(num_sbs * PARTITIONS_PER_SB, + sizeof(*row_mt_worker_data->partition))); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, + vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); + + // allocate memory for thread_data + if (row_mt_worker_data->thread_data == NULL) { + const size_t thread_size = + max_threads * sizeof(*row_mt_worker_data->thread_data); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, + vpx_memalign(32, thread_size)); + } +} + +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { + if (row_mt_worker_data != NULL) { + int plane; +#if CONFIG_MULTITHREAD + int i; + if (row_mt_worker_data->recon_sync_mutex != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]); + } + vpx_free(row_mt_worker_data->recon_sync_mutex); + row_mt_worker_data->recon_sync_mutex = NULL; + } + if (row_mt_worker_data->recon_sync_cond != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]); + } + vpx_free(row_mt_worker_data->recon_sync_cond); + row_mt_worker_data->recon_sync_cond = NULL; + } +#endif + for (plane = 0; plane < 3; ++plane) { + vpx_free(row_mt_worker_data->eob[plane]); + row_mt_worker_data->eob[plane] = NULL; + vpx_free(row_mt_worker_data->dqcoeff[plane]); + row_mt_worker_data->dqcoeff[plane] = NULL; + } + vpx_free(row_mt_worker_data->partition); + row_mt_worker_data->partition = NULL; + vpx_free(row_mt_worker_data->recon_map); + row_mt_worker_data->recon_map = NULL; + vpx_free(row_mt_worker_data->thread_data); + row_mt_worker_data->thread_data = NULL; + } +} + +static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { + cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); + if (!cm->mip) return 1; + cm->mi_alloc_size = mi_size; + cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + if (!cm->mi_grid_base) return 1; + return 0; +} + +static void vp9_dec_free_mi(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + // MFQE allocates an additional mip and swaps it with cm->mip. + vpx_free(cm->postproc_state.prev_mip); + cm->postproc_state.prev_mip = NULL; +#endif + vpx_free(cm->mip); + cm->mip = NULL; + vpx_free(cm->mi_grid_base); + cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; +} + +VP9Decoder *vp9_decoder_create(BufferPool *const pool) { + VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi)); + VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL; + + if (!cm) return NULL; + + vp9_zero(*pbi); + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + vp9_decoder_remove(pbi); + return NULL; + } + + cm->error.setjmp = 1; + + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + &cm->error, cm->frame_contexts, + (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); + + pbi->need_resync = 1; + once(initialize_dec); + + // Initialize the references to not point to any frame buffers. + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); + + init_frame_indexes(cm); + pbi->ready_for_new_data = 1; + pbi->common.buffer_pool = pool; + + cm->bit_depth = VPX_BITS_8; + cm->dequant_bit_depth = VPX_BITS_8; + + cm->alloc_mi = vp9_dec_alloc_mi; + cm->free_mi = vp9_dec_free_mi; + cm->setup_mi = vp9_dec_setup_mi; + + vp9_loop_filter_init(cm); + + cm->error.setjmp = 0; + + vpx_get_worker_interface()->init(&pbi->lf_worker); + + return pbi; +} + +void vp9_decoder_remove(VP9Decoder *pbi) { + int i; + + if (!pbi) return; + + vpx_get_worker_interface()->end(&pbi->lf_worker); + vpx_free(pbi->lf_worker.data1); + + for (i = 0; i < pbi->num_tile_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + vpx_get_worker_interface()->end(worker); + } + + vpx_free(pbi->tile_worker_data); + vpx_free(pbi->tile_workers); + + if (pbi->num_tile_workers > 0) { + vp9_loop_filter_dealloc(&pbi->lf_row_sync); + } + + if (pbi->row_mt == 1) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + if (pbi->row_mt_worker_data != NULL) { + vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq); + vpx_free(pbi->row_mt_worker_data->jobq_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex); +#endif + } + vpx_free(pbi->row_mt_worker_data); + } + + vp9_remove_common(&pbi->common); + vpx_free(pbi); +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width; +} + +vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP9_COMMON *cm = &pbi->common; + + /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + * encoder is using the frame buffers for. This is just a stub to keep the + * vpxenc --test-decode functionality working, and will be replaced in a + * later commit that adds VP9-specific controls for this functionality. + */ + if (ref_frame_flag == VP9_LAST_FLAG) { + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0); + if (cfg == NULL) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "No 'last' reference frame"); + return VPX_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vpx_yv12_copy_frame(cfg, sd); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); + } + + return cm->error.error_code; +} + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + int idx; + YV12_BUFFER_CONFIG *ref_buf = NULL; + + // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + // encoder is using the frame buffers for. This is just a stub to keep the + // vpxenc --test-decode functionality working, and will be replaced in a + // later commit that adds VP9-specific controls for this functionality. + // (Yunqing) The set_reference control depends on the following setting in + // encoder. + // cpi->lst_fb_idx = 0; + // cpi->gld_fb_idx = 1; + // cpi->alt_fb_idx = 2; + if (ref_frame_flag == VP9_LAST_FLAG) { + idx = cm->ref_frame_map[0]; + } else if (ref_frame_flag == VP9_GOLD_FLAG) { + idx = cm->ref_frame_map[1]; + } else if (ref_frame_flag == VP9_ALT_FLAG) { + idx = cm->ref_frame_map[2]; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); + return cm->error.error_code; + } + + if (idx < 0 || idx >= FRAME_BUFFERS) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Invalid reference frame map"); + return cm->error.error_code; + } + + // Get the destination reference buffer. + ref_buf = &cm->buffer_pool->frame_bufs[idx].buf; + + if (!equal_dimensions(ref_buf, sd)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer. + vpx_yv12_copy_frame(sd, ref_buf); + } + + return cm->error.error_code; +} + +/* If any buffer updating is signaled it should be done here. */ +static void swap_frame_buffers(VP9Decoder *pbi) { + int ref_index = 0, mask; + VP9_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + } + pbi->hold_ref_buf = 0; + cm->frame_to_show = get_frame_new_buffer(cm); + + --frame_bufs[cm->new_fb_idx].ref_count; + + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < 3; ref_index++) + cm->frame_refs[ref_index].idx = -1; +} + +static void release_fb_on_decoder_exit(VP9Decoder *pbi) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } +} + +int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, + const uint8_t **psource) { + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + const uint8_t *source = *psource; + int retcode = 0; + cm->error.error_code = VPX_CODEC_OK; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + if (cm->frame_refs[0].idx > 0) { + assert(cm->frame_refs[0].buf != NULL); + cm->frame_refs[0].buf->corrupted = 1; + } + } + + pbi->ready_for_new_data = 0; + + // Check if the previous frame was a frame without any references to it. + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { + pool->release_fb_cb(pool->cb_priv, + &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + + // Find a free frame buffer. Return error if can not find any. + cm->new_fb_idx = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) { + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + vpx_clear_system_state(); + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + return cm->error.error_code; + } + + // Assign a MV array to the frame buffer. + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; + + pbi->hold_ref_buf = 0; + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + // Release current frame. + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + vpx_clear_system_state(); + return -1; + } + + cm->error.setjmp = 1; + vp9_decode_frame(pbi, source, source + size, psource); + + swap_frame_buffers(pbi); + + vpx_clear_system_state(); + + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); + } + + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + + // Update progress in frame parallel decode. + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; + } + + cm->error.setjmp = 0; + return retcode; +} + +int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags) { + VP9_COMMON *const cm = &pbi->common; + int ret = -1; +#if !CONFIG_VP9_POSTPROC + (void)*flags; +#endif + + if (pbi->ready_for_new_data == 1) return ret; + + pbi->ready_for_new_data = 1; + + /* no raw frame to show!!! */ + if (!cm->show_frame) return ret; + + pbi->ready_for_new_data = 1; + +#if CONFIG_VP9_POSTPROC + if (!cm->show_existing_frame) { + ret = vp9_post_proc_frame(cm, sd, flags, cm->width); + } else { + *sd = *cm->frame_to_show; + ret = 0; + } +#else + *sd = *cm->frame_to_show; + ret = 0; +#endif /*!CONFIG_POSTPROC*/ + vpx_clear_system_state(); + return ret; +} + +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + uint8_t marker; + + assert(data_sz); + marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1); + *count = 0; + + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = + read_marker(decrypt_cb, decrypt_state, data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + + // Frames has a maximum of 8 and mag has a maximum of 4. + uint8_t clear_buffer[32]; + assert(sizeof(clear_buffer) >= frames * mag); + if (decrypt_cb) { + decrypt_cb(decrypt_state, x, clear_buffer, frames * mag); + x = clear_buffer; + } + + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= ((uint32_t)(*x++)) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h new file mode 100644 index 0000000000..2e198d552e --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_DECODER_H_ +#define VPX_VP9_DECODER_VP9_DECODER_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_codec.h" +#include "vpx_dsp/bitreader.h" +#include "vpx_scale/yv12config.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_ppflags.h" +#include "./vp9_job_queue.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOBS_PER_SB_LOG2 8 +#define DQCOEFFS_PER_SB_LOG2 12 +#define PARTITIONS_PER_SB 85 + +typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType; + +typedef struct ThreadData { + struct VP9Decoder *pbi; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; +} ThreadData; + +typedef struct TileBuffer { + const uint8_t *data; + size_t size; + int col; // only used with multi-threaded decoding +} TileBuffer; + +typedef struct TileWorkerData { + const uint8_t *data_end; + int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive + vpx_reader bit_reader; + FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + /* dqcoeff are shared by all the planes. So planes must be decoded serially */ + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); + struct vpx_internal_error_info error_info; +} TileWorkerData; + +typedef void (*process_block_fn_t)(TileWorkerData *twd, + struct VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, + int bhl); + +typedef struct RowMTWorkerData { + int num_sbs; + int *eob[MAX_MB_PLANE]; + PARTITION_TYPE *partition; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int8_t *recon_map; + const uint8_t *data_end; + uint8_t *jobq_buf; + JobQueueRowMt jobq; + size_t jobq_size; + int num_tiles_done; + int num_jobs; +#if CONFIG_MULTITHREAD + pthread_mutex_t recon_done_mutex; + pthread_mutex_t *recon_sync_mutex; + pthread_cond_t *recon_sync_cond; +#endif + ThreadData *thread_data; +} RowMTWorkerData; + +/* Structure to queue and dequeue row decode jobs */ +typedef struct Job { + int row_num; + int tile_col; + JobType job_type; +} Job; + +typedef struct VP9Decoder { + DECLARE_ALIGNED(16, MACROBLOCKD, mb); + + DECLARE_ALIGNED(16, VP9_COMMON, common); + + int ready_for_new_data; + + int refresh_frame_flags; + + // TODO(hkuang): Combine this with cur_buf in macroblockd as they are + // the same. + RefCntBuffer *cur_buf; // Current decoding frame buffer. + + VPxWorker lf_worker; + VPxWorker *tile_workers; + TileWorkerData *tile_worker_data; + TileBuffer tile_buffers[64]; + int num_tile_workers; + int total_tiles; + + VP9LfSync lf_row_sync; + + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + + int max_threads; + int inv_tile_order; + int need_resync; // wait for key/intra-only frame. + int hold_ref_buf; // hold the reference buffer. + + int row_mt; + int lpf_mt_opt; + RowMTWorkerData *row_mt_worker_data; +} VP9Decoder; + +int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, + const uint8_t **psource); + +int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags); + +vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb, + void *decrypt_state, const uint8_t *data) { + if (decrypt_cb) { + uint8_t marker; + decrypt_cb(decrypt_state, data, &marker, 1); + return marker; + } + return *data; +} + +// This function is exposed for use in tests, as well as the inlined function +// "read_marker". +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state); + +struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); + +void vp9_decoder_remove(struct VP9Decoder *pbi); + +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs); +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); + +static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, + BufferPool *const pool) { + if (idx >= 0 && frame_bufs[idx].ref_count > 0) { + --frame_bufs[idx].ref_count; + // A worker may only get a free framebuffer index when calling get_free_fb. + // But the private buffer is not set up until finish decoding header. + // So any error happens during decoding header, the frame_bufs will not + // have valid priv buffer. + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && + frame_bufs[idx].raw_frame_buffer.priv) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; + } + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_DECODER_VP9_DECODER_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c new file mode 100644 index 0000000000..d957dc34e3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#include "vp9/common/vp9_idct.h" +#endif + +#include "vp9/decoder/vp9_detokenize.h" + +#define EOB_CONTEXT_NODE 0 +#define ZERO_CONTEXT_NODE 1 +#define ONE_CONTEXT_NODE 2 + +#define INCREMENT_COUNT(token) \ + do { \ + if (counts) ++coef_counts[band][ctx][token]; \ + } while (0) + +static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, + int *count, unsigned int *range) { + const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT; + const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); +#if CONFIG_BITSTREAM_DEBUG + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } +#endif + + if (*count < 0) { + r->value = *value; + r->count = *count; + vpx_reader_fill(r); + *value = r->value; + *count = r->count; + } + + if (*value >= bigsplit) { + *range = *range - split; + *value = *value - bigsplit; + { + const int shift = vpx_norm[*range]; + *range <<= shift; + *value <<= shift; + *count -= shift; + } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 1; + if (bit != ref_result) { + fprintf( + stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif + return 1; + } + *range = split; + { + const int shift = vpx_norm[*range]; + *range <<= shift; + *value <<= shift; + *count -= shift; + } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 0; + if (bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif + return 0; +} + +static INLINE int read_coeff(vpx_reader *r, const vpx_prob *probs, int n, + BD_VALUE *value, int *count, unsigned int *range) { + int i, val = 0; + for (i = 0; i < n; ++i) + val = (val << 1) | read_bool(r, probs[i], value, count, range); + return val; +} + +static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, + tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, + int ctx, const int16_t *scan, const int16_t *nb, + vpx_reader *r) { + FRAME_COUNTS *counts = xd->counts; + const int max_eob = 16 << (tx_size << 1); + const FRAME_CONTEXT *const fc = xd->fc; + const int ref = is_inter_block(xd->mi[0]); + int band, c = 0; + const vpx_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + fc->coef_probs[tx_size][type][ref]; + const vpx_prob *prob; + unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; + unsigned int(*eob_branch_count)[COEFF_CONTEXTS]; + uint8_t token_cache[32 * 32]; + const uint8_t *band_translate = get_band_translate(tx_size); + const int dq_shift = (tx_size == TX_32X32); + int v; + int16_t dqv = dq[0]; + const uint8_t *const cat6_prob = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 + : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 + : +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_cat6_prob; + const int cat6_bits = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 + : +#endif // CONFIG_VP9_HIGHBITDEPTH + 14; + // Keep value, range, and count as locals. The compiler produces better + // results with the locals than using r directly. + BD_VALUE value = r->value; + unsigned int range = r->range; + int count = r->count; + + if (counts) { + coef_counts = counts->coef[tx_size][type][ref]; + eob_branch_count = counts->eob_branch[tx_size][type][ref]; + } + + while (c < max_eob) { + int val = -1; + band = *band_translate++; + prob = coef_probs[band][ctx]; + if (counts) ++eob_branch_count[band][ctx]; + if (!read_bool(r, prob[EOB_CONTEXT_NODE], &value, &count, &range)) { + INCREMENT_COUNT(EOB_MODEL_TOKEN); + break; + } + + while (!read_bool(r, prob[ZERO_CONTEXT_NODE], &value, &count, &range)) { + INCREMENT_COUNT(ZERO_TOKEN); + dqv = dq[1]; + token_cache[scan[c]] = 0; + ++c; + if (c >= max_eob) { + r->value = value; + r->range = range; + r->count = count; + return c; // zero tokens at the end (no eob token) + } + ctx = get_coef_context(nb, token_cache, c); + band = *band_translate++; + prob = coef_probs[band][ctx]; + } + + if (read_bool(r, prob[ONE_CONTEXT_NODE], &value, &count, &range)) { + const vpx_prob *p = vp9_pareto8_full[prob[PIVOT_NODE] - 1]; + INCREMENT_COUNT(TWO_TOKEN); + if (read_bool(r, p[0], &value, &count, &range)) { + if (read_bool(r, p[3], &value, &count, &range)) { + token_cache[scan[c]] = 5; + if (read_bool(r, p[5], &value, &count, &range)) { + if (read_bool(r, p[7], &value, &count, &range)) { + val = CAT6_MIN_VAL + + read_coeff(r, cat6_prob, cat6_bits, &value, &count, &range); + } else { + val = CAT5_MIN_VAL + + read_coeff(r, vp9_cat5_prob, 5, &value, &count, &range); + } + } else if (read_bool(r, p[6], &value, &count, &range)) { + val = CAT4_MIN_VAL + + read_coeff(r, vp9_cat4_prob, 4, &value, &count, &range); + } else { + val = CAT3_MIN_VAL + + read_coeff(r, vp9_cat3_prob, 3, &value, &count, &range); + } + } else { + token_cache[scan[c]] = 4; + if (read_bool(r, p[4], &value, &count, &range)) { + val = CAT2_MIN_VAL + + read_coeff(r, vp9_cat2_prob, 2, &value, &count, &range); + } else { + val = CAT1_MIN_VAL + + read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range); + } + } +#if CONFIG_VP9_HIGHBITDEPTH + // val may use 18-bits + v = (int)(((int64_t)val * dqv) >> dq_shift); +#else + v = (val * dqv) >> dq_shift; +#endif + } else { + if (read_bool(r, p[1], &value, &count, &range)) { + token_cache[scan[c]] = 3; + v = ((3 + read_bool(r, p[2], &value, &count, &range)) * dqv) >> + dq_shift; + } else { + token_cache[scan[c]] = 2; + v = (2 * dqv) >> dq_shift; + } + } + } else { + INCREMENT_COUNT(ONE_TOKEN); + token_cache[scan[c]] = 1; + v = dqv >> dq_shift; + } +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff[scan[c]] = highbd_check_range( + read_bool(r, 128, &value, &count, &range) ? -v : v, xd->bd); +#else + dqcoeff[scan[c]] = + check_range(read_bool(r, 128, &value, &count, &range) ? -v : v); +#endif // CONFIG_VP9_HIGHBITDEPTH +#else + if (read_bool(r, 128, &value, &count, &range)) { + dqcoeff[scan[c]] = (tran_low_t)-v; + } else { + dqcoeff[scan[c]] = (tran_low_t)v; + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + ++c; + ctx = get_coef_context(nb, token_cache, c); + dqv = dq[1]; + } + + r->value = value; + r->range = range; + r->count = count; + return c; +} + +static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, + int x, int y, unsigned int tx_size_in_blocks) { + if (xd->max_blocks_wide) { + if (tx_size_in_blocks + x > xd->max_blocks_wide) + *ctx_shift_a = (tx_size_in_blocks - (xd->max_blocks_wide - x)) * 8; + } + if (xd->max_blocks_high) { + if (tx_size_in_blocks + y > xd->max_blocks_high) + *ctx_shift_l = (tx_size_in_blocks - (xd->max_blocks_high - y)) * 8; + } +} + +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant[seg_id]; + int eob; + ENTROPY_CONTEXT *a = pd->above_context + x; + ENTROPY_CONTEXT *l = pd->left_context + y; + int ctx; + int ctx_shift_a = 0; + int ctx_shift_l = 0; + + switch (tx_size) { + case TX_4X4: + ctx = a[0] != 0; + ctx += l[0] != 0; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + a[0] = l[0] = (eob > 0); + break; + case TX_8X8: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_8X8); + ctx = !!*(const uint16_t *)a; + ctx += !!*(const uint16_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint16_t *)a = ((eob > 0) * 0x0101) >> ctx_shift_a; + *(uint16_t *)l = ((eob > 0) * 0x0101) >> ctx_shift_l; + break; + case TX_16X16: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_16X16); + ctx = !!*(const uint32_t *)a; + ctx += !!*(const uint32_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint32_t *)a = ((eob > 0) * 0x01010101) >> ctx_shift_a; + *(uint32_t *)l = ((eob > 0) * 0x01010101) >> ctx_shift_l; + break; + case TX_32X32: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_32X32); + // NOTE: casting to uint64_t here is safe because the default memory + // alignment is at least 8 bytes and the TX_32X32 is aligned on 8 byte + // boundaries. + ctx = !!*(const uint64_t *)a; + ctx += !!*(const uint64_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint64_t *)a = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_a; + *(uint64_t *)l = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_l; + break; + default: + assert(0 && "Invalid transform size."); + eob = 0; + break; + } + + return eob; +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h new file mode 100644 index 0000000000..a8e47021b8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_ +#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_ + +#include "vpx_dsp/bitreader.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/common/vp9_scan.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.c b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.c new file mode 100644 index 0000000000..126ba0b96e --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.c @@ -0,0 +1,72 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_entropy.h" + +#include "vp9/decoder/vp9_dsubexp.h" + +static int inv_recenter_nonneg(int v, int m) { + if (v > 2 * m) return v; + + return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1); +} + +static int decode_uniform(vpx_reader *r) { + const int l = 8; + const int m = (1 << l) - 191; + const int v = vpx_read_literal(r, l - 1); + return v < m ? v : (v << 1) - m + vpx_read_bit(r); +} + +static int inv_remap_prob(int v, int m) { + static uint8_t inv_map_table[MAX_PROB] = { + 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, + 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, + 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, + 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, + 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, + 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, + 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253 + }; + assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]))); + v = inv_map_table[v]; + m--; + if ((m << 1) <= MAX_PROB) { + return 1 + inv_recenter_nonneg(v, m); + } else { + return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m); + } +} + +static int decode_term_subexp(vpx_reader *r) { + if (!vpx_read_bit(r)) return vpx_read_literal(r, 4); + if (!vpx_read_bit(r)) return vpx_read_literal(r, 4) + 16; + if (!vpx_read_bit(r)) return vpx_read_literal(r, 5) + 32; + return decode_uniform(r) + 64; +} + +void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p) { + if (vpx_read(r, DIFF_UPDATE_PROB)) { + const int delp = decode_term_subexp(r); + *p = (vpx_prob)inv_remap_prob(delp, *p); + } +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h new file mode 100644 index 0000000000..b0c7750736 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_ +#define VPX_VP9_DECODER_VP9_DSUBEXP_H_ + +#include "vpx_dsp/bitreader.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c new file mode 100644 index 0000000000..9a31f5a6d0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx/vpx_integer.h" + +#include "vp9/decoder/vp9_job_queue.h" + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) { +#if CONFIG_MULTITHREAD + pthread_mutex_init(&jobq->mutex, NULL); + pthread_cond_init(&jobq->cond, NULL); +#endif + jobq->buf_base = buf; + jobq->buf_wr = buf; + jobq->buf_rd = buf; + jobq->buf_end = buf + buf_size; + jobq->terminate = 0; +} + +void vp9_jobq_reset(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->buf_wr = jobq->buf_base; + jobq->buf_rd = jobq->buf_base; + jobq->terminate = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +void vp9_jobq_deinit(JobQueueRowMt *jobq) { + vp9_jobq_reset(jobq); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&jobq->mutex); + pthread_cond_destroy(&jobq->cond); +#endif +} + +void vp9_jobq_terminate(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->terminate = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(&jobq->cond); + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_wr + job_size) { + memcpy(jobq->buf_wr, job, job_size); + jobq->buf_wr = jobq->buf_wr + job_size; +#if CONFIG_MULTITHREAD + pthread_cond_signal(&jobq->cond); +#endif + ret = 0; + } else { + /* Wrap around case is not supported */ + assert(0); + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + return ret; +} + +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_rd + job_size) { + while (1) { + if (jobq->buf_wr >= jobq->buf_rd + job_size) { + memcpy(job, jobq->buf_rd, job_size); + jobq->buf_rd = jobq->buf_rd + job_size; + ret = 0; + break; + } else { + /* If all the entries have been dequeued, then break and return */ + if (jobq->terminate == 1) { + ret = 1; + break; + } + if (blocking == 1) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(&jobq->cond, &jobq->mutex); +#endif + } else { + /* If there is no job available, + * and this is non blocking call then return fail */ + ret = 1; + break; + } + } + } + } else { + /* Wrap around case is not supported */ + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + + return ret; +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h new file mode 100644 index 0000000000..bc23bf9c2c --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ + +#include "vpx_util/vpx_thread.h" + +typedef struct { + // Pointer to buffer base which contains the jobs + uint8_t *buf_base; + + // Pointer to current address where new job can be added + uint8_t *volatile buf_wr; + + // Pointer to current address from where next job can be obtained + uint8_t *volatile buf_rd; + + // Pointer to end of job buffer + uint8_t *buf_end; + + int terminate; + +#if CONFIG_MULTITHREAD + pthread_mutex_t mutex; + pthread_cond_t cond; +#endif +} JobQueueRowMt; + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size); +void vp9_jobq_reset(JobQueueRowMt *jobq); +void vp9_jobq_deinit(JobQueueRowMt *jobq); +void vp9_jobq_terminate(JobQueueRowMt *jobq); +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size); +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking); + +#endif // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 0000000000..997b5477e1 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,2173 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" + +static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, + int stride) { + // { 0, 1, 1, 1 }; + const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3); + // { 1, 0, 0, 0 }; + const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3); + int16x4_t mask; + + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a)); + input_0 = vadd_s16(input_0, mask); + input_0 = vadd_s16(input_0, nonzero_bias_b); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { + const int16x8_t one_s16 = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], one_s16); + res[1] = vaddq_s16(res[1], one_s16); + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + store_s16q_to_tran_low(output + 0 * 8, res[0]); + store_s16q_to_tran_low(output + 1 * 8, res[1]); +} + +static INLINE void fadst4x4_neon(int16x8_t *in) { + int32x4_t u[4], t[4]; + int16x4_t s[4], out[4]; + + s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t[0] = vmull_n_s16(s[0], sinpi_1_9); + t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9); + t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t[1] = vmull_n_s16(s[0], sinpi_3_9); + t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9); + t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t[2] = vmull_n_s16(s[0], sinpi_4_9); + t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9); + t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t[3] = vmull_n_s16(s[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u[0] = vaddq_s32(t[0], t[3]); + u[1] = t[1]; + u[2] = vsubq_s32(t[2], t[3]); + u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]); + + // fdct_round_shift + out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS); + out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS); + out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS); + out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = vcombine_s16(out[0], out[1]); + in[1] = vcombine_s16(out[2], out[3]); +} + +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[2]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + } +} + +static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in, + int stride) { + in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2); + in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2); + in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2); + in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2); + in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2); + in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2); + in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2); + in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void right_shift_8x8(int16x8_t *res, const int bit) { + int16x8_t sign0 = vshrq_n_s16(res[0], 15); + int16x8_t sign1 = vshrq_n_s16(res[1], 15); + int16x8_t sign2 = vshrq_n_s16(res[2], 15); + int16x8_t sign3 = vshrq_n_s16(res[3], 15); + int16x8_t sign4 = vshrq_n_s16(res[4], 15); + int16x8_t sign5 = vshrq_n_s16(res[5], 15); + int16x8_t sign6 = vshrq_n_s16(res[6], 15); + int16x8_t sign7 = vshrq_n_s16(res[7], 15); + + if (bit == 2) { + const int16x8_t const_rounding = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], const_rounding); + res[1] = vaddq_s16(res[1], const_rounding); + res[2] = vaddq_s16(res[2], const_rounding); + res[3] = vaddq_s16(res[3], const_rounding); + res[4] = vaddq_s16(res[4], const_rounding); + res[5] = vaddq_s16(res[5], const_rounding); + res[6] = vaddq_s16(res[6], const_rounding); + res[7] = vaddq_s16(res[7], const_rounding); + } + + res[0] = vsubq_s16(res[0], sign0); + res[1] = vsubq_s16(res[1], sign1); + res[2] = vsubq_s16(res[2], sign2); + res[3] = vsubq_s16(res[3], sign3); + res[4] = vsubq_s16(res[4], sign4); + res[5] = vsubq_s16(res[5], sign5); + res[6] = vsubq_s16(res[6], sign6); + res[7] = vsubq_s16(res[7], sign7); + + if (bit == 1) { + res[0] = vshrq_n_s16(res[0], 1); + res[1] = vshrq_n_s16(res[1], 1); + res[2] = vshrq_n_s16(res[2], 1); + res[3] = vshrq_n_s16(res[3], 1); + res[4] = vshrq_n_s16(res[4], 1); + res[5] = vshrq_n_s16(res[5], 1); + res[6] = vshrq_n_s16(res[6], 1); + res[7] = vshrq_n_s16(res[7], 1); + } else { + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + res[2] = vshrq_n_s16(res[2], 2); + res[3] = vshrq_n_s16(res[3], 2); + res[4] = vshrq_n_s16(res[4], 2); + res[5] = vshrq_n_s16(res[5], 2); + res[6] = vshrq_n_s16(res[6], 2); + res[7] = vshrq_n_s16(res[7], 2); + } +} + +static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, + int stride) { + store_s16q_to_tran_low(output + 0 * stride, res[0]); + store_s16q_to_tran_low(output + 1 * stride, res[1]); + store_s16q_to_tran_low(output + 2 * stride, res[2]); + store_s16q_to_tran_low(output + 3 * stride, res[3]); + store_s16q_to_tran_low(output + 4 * stride, res[4]); + store_s16q_to_tran_low(output + 5 * stride, res[5]); + store_s16q_to_tran_low(output + 6 * stride, res[6]); + store_s16q_to_tran_low(output + 7 * stride, res[7]); +} + +static INLINE void fadst8x8_neon(int16x8_t *in) { + int16x4_t x_lo[8], x_hi[8]; + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + + x_lo[0] = vget_low_s16(in[7]); + x_hi[0] = vget_high_s16(in[7]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[5]); + x_hi[2] = vget_high_s16(in[5]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[3]); + x_hi[4] = vget_high_s16(in[3]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[1]); + x_hi[6] = vget_high_s16(in[1]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_2_64, cospi_30_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_10_64, cospi_22_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_18_64, cospi_14_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_26_64, cospi_6_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); + + // fdct_round_shift + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6], + &s_lo[7], &s_hi[7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); + // s4 + s6 + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); + // s5 + s7 + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); + // s4 - s6 + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); + // s5 - s7 + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); + + // stage 3 + // cospi_16_64 * (x2 + x3) + // cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3], + cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + + // cospi_16_64 * (x6 + x7) + // cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); + + // final fdct_round_shift + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + + // x0, x1, x4, x5 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[2] = vcombine_s16(x_lo[6], x_hi[6]); + in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2])); + in[4] = vcombine_s16(x_lo[3], x_hi[3]); + in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7])); + in[6] = vcombine_s16(x_lo[5], x_hi[5]); + in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); + + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); +} + +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0, + int16x8_t *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0, + int16x8_t *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(int16x8_t *in) { + // perform 16x16 1-D DCT for 8 columns + int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; + int16x4_t t_lo[8], t_hi[8]; + int32x4_t u_lo[8], u_hi[8]; + + // stage 1 + i[0] = vaddq_s16(in[0], in[15]); + i[1] = vaddq_s16(in[1], in[14]); + i[2] = vaddq_s16(in[2], in[13]); + i[3] = vaddq_s16(in[3], in[12]); + i[4] = vaddq_s16(in[4], in[11]); + i[5] = vaddq_s16(in[5], in[10]); + i[6] = vaddq_s16(in[6], in[9]); + i[7] = vaddq_s16(in[7], in[8]); + + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); + transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); + + // step 2 + s1[0] = vsubq_s16(in[7], in[8]); + s1[1] = vsubq_s16(in[6], in[9]); + s1[2] = vsubq_s16(in[5], in[10]); + s1[3] = vsubq_s16(in[4], in[11]); + s1[4] = vsubq_s16(in[3], in[12]); + s1[5] = vsubq_s16(in[2], in[13]); + s1[6] = vsubq_s16(in[1], in[14]); + s1[7] = vsubq_s16(in[0], in[15]); + + t[2] = vsubq_s16(s1[5], s1[2]); + t[3] = vsubq_s16(s1[4], s1[3]); + t[4] = vaddq_s16(s1[4], s1[3]); + t[5] = vaddq_s16(s1[5], s1[2]); + + t_lo[2] = vget_low_s16(t[2]); + t_hi[2] = vget_high_s16(t[2]); + t_lo[3] = vget_low_s16(t[3]); + t_hi[3] = vget_high_s16(t[3]); + t_lo[4] = vget_low_s16(t[4]); + t_hi[4] = vget_high_s16(t[4]); + t_lo[5] = vget_low_s16(t[5]); + t_hi[5] = vget_high_s16(t[5]); + + u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64); + u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64); + u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64); + u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64); + u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64); + u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64); + u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); + u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); + + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[3] = vcombine_s16(t_lo[3], t_hi[3]); + s2[4] = vcombine_s16(t_lo[4], t_hi[4]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + + // step 3 + s3[0] = vaddq_s16(s1[0], s2[3]); + s3[1] = vaddq_s16(s1[1], s2[2]); + s3[2] = vsubq_s16(s1[1], s2[2]); + s3[3] = vsubq_s16(s1[0], s2[3]); + s3[4] = vsubq_s16(s1[7], s2[4]); + s3[5] = vsubq_s16(s1[6], s2[5]); + s3[6] = vaddq_s16(s1[6], s2[5]); + s3[7] = vaddq_s16(s1[7], s2[4]); + + // step 4 + t_lo[0] = vget_low_s16(s3[0]); + t_hi[0] = vget_high_s16(s3[0]); + t_lo[1] = vget_low_s16(s3[1]); + t_hi[1] = vget_high_s16(s3[1]); + t_lo[2] = vget_low_s16(s3[2]); + t_hi[2] = vget_high_s16(s3[2]); + t_lo[3] = vget_low_s16(s3[3]); + t_hi[3] = vget_high_s16(s3[3]); + t_lo[4] = vget_low_s16(s3[4]); + t_hi[4] = vget_high_s16(s3[4]); + t_lo[5] = vget_low_s16(s3[5]); + t_hi[5] = vget_high_s16(s3[5]); + t_lo[6] = vget_low_s16(s3[6]); + t_hi[6] = vget_high_s16(s3[6]); + t_lo[7] = vget_low_s16(s3[7]); + t_hi[7] = vget_high_s16(s3[7]); + + // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6] + // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6] + butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6], + -cospi_8_64, cospi_24_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2] + // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2] + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + -cospi_24_64, cospi_8_64, &u_lo[5], + &u_hi[5], &u_lo[2], &u_hi[2]); + + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + + s2[1] = vcombine_s16(t_lo[1], t_hi[1]); + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + s2[6] = vcombine_s16(t_lo[6], t_hi[6]); + + // step 5 + s1[0] = vaddq_s16(s3[0], s2[1]); + s1[1] = vsubq_s16(s3[0], s2[1]); + s1[2] = vaddq_s16(s3[3], s2[2]); + s1[3] = vsubq_s16(s3[3], s2[2]); + s1[4] = vsubq_s16(s3[4], s2[5]); + s1[5] = vaddq_s16(s3[4], s2[5]); + s1[6] = vsubq_s16(s3[7], s2[6]); + s1[7] = vaddq_s16(s3[7], s2[6]); + + // step 6 + t_lo[0] = vget_low_s16(s1[0]); + t_hi[0] = vget_high_s16(s1[0]); + t_lo[1] = vget_low_s16(s1[1]); + t_hi[1] = vget_high_s16(s1[1]); + t_lo[2] = vget_low_s16(s1[2]); + t_hi[2] = vget_high_s16(s1[2]); + t_lo[3] = vget_low_s16(s1[3]); + t_hi[3] = vget_high_s16(s1[3]); + t_lo[4] = vget_low_s16(s1[4]); + t_hi[4] = vget_high_s16(s1[4]); + t_lo[5] = vget_low_s16(s1[5]); + t_hi[5] = vget_high_s16(s1[5]); + t_lo[6] = vget_low_s16(s1[6]); + t_hi[6] = vget_high_s16(s1[6]); + t_lo[7] = vget_low_s16(s1[7]); + t_hi[7] = vget_high_s16(s1[7]); + + // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0], + cospi_2_64, cospi_30_64, &u_lo[0], + &u_hi[0], &u_lo[7], &u_hi[7]); + + // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1], + cospi_18_64, cospi_14_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + cospi_10_64, cospi_22_64, &u_lo[2], + &u_hi[2], &u_lo[5], &u_hi[5]); + + // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3], + cospi_26_64, cospi_6_64, &u_lo[3], + &u_hi[3], &u_lo[4], &u_hi[4]); + + // final fdct_round_shift + t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS); + + in[0] = i[0]; + in[2] = i[1]; + in[4] = i[2]; + in[6] = i[3]; + in[8] = i[4]; + in[10] = i[5]; + in[12] = i[6]; + in[14] = i[7]; + in[1] = vcombine_s16(t_lo[0], t_hi[0]); + in[3] = vcombine_s16(t_lo[4], t_hi[4]); + in[5] = vcombine_s16(t_lo[2], t_hi[2]); + in[7] = vcombine_s16(t_lo[6], t_hi[6]); + in[9] = vcombine_s16(t_lo[1], t_hi[1]); + in[11] = vcombine_s16(t_lo[5], t_hi[5]); + in[13] = vcombine_s16(t_lo[3], t_hi[3]); + in[15] = vcombine_s16(t_lo[7], t_hi[7]); +} + +static void fadst16_8col(int16x8_t *in) { + // perform 16x16 1-D ADST for 8 columns + int16x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + + x_lo[0] = vget_low_s16(in[15]); + x_hi[0] = vget_high_s16(in[15]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[13]); + x_hi[2] = vget_high_s16(in[13]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[11]); + x_hi[4] = vget_high_s16(in[11]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[9]); + x_hi[6] = vget_high_s16(in[9]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + x_lo[8] = vget_low_s16(in[7]); + x_hi[8] = vget_high_s16(in[7]); + x_lo[9] = vget_low_s16(in[8]); + x_hi[9] = vget_high_s16(in[8]); + x_lo[10] = vget_low_s16(in[5]); + x_hi[10] = vget_high_s16(in[5]); + x_lo[11] = vget_low_s16(in[10]); + x_hi[11] = vget_high_s16(in[10]); + x_lo[12] = vget_low_s16(in[3]); + x_hi[12] = vget_high_s16(in[3]); + x_lo[13] = vget_low_s16(in[12]); + x_hi[13] = vget_high_s16(in[12]); + x_lo[14] = vget_low_s16(in[1]); + x_hi[14] = vget_high_s16(in[1]); + x_lo[15] = vget_low_s16(in[14]); + x_hi[15] = vget_high_s16(in[14]); + + // stage 1 + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_1_64, cospi_31_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_5_64, cospi_27_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_9_64, cospi_23_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_13_64, cospi_19_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9], + cospi_17_64, cospi_15_64, &s_lo[8], + &s_hi[8], &s_lo[9], &s_hi[9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11], + cospi_21_64, cospi_11_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13], + cospi_25_64, cospi_7_64, &s_lo[12], + &s_hi[12], &s_lo[13], &s_hi[13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15], + cospi_29_64, cospi_3_64, &s_lo[14], + &s_hi[14], &s_lo[15], &s_hi[15]); + + // fdct_round_shift + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9], + cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8], + &s_lo[9], &s_hi[9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11], + cospi_20_64, cospi_12_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12], + cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13], + &s_lo[12], &s_hi[12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_12_64, cospi_20_64, &s_lo[15], + &s_hi[15], &s_lo[14], &s_hi[14]); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]); + t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]); + t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]); + // s0 - s4 + t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]); + t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]); + // s1 - s7 + t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]); + t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]); + // s8 + s12 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]); + t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]); + t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]); + // s8 + s12 + t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]); + t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]); + t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); + + t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7], + &s_lo[6], &s_hi[6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13], + cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12], + &s_lo[13], &s_hi[13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15], + &s_lo[14], &s_hi[14]); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); + // s0 - s4 + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); + // s4 + s6 + t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]); + t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]); + // s5 + s7 + t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]); + t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]); + // s4 - s6 + t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]); + // s5 - s7 + t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]); + // s8 + s10 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]); + // s12 + s14 + t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]); + t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]); + // s13 + s15 + t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]); + t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]); + // s12 - s14 + t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]); + // s13 - s15 + t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); + + t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 4 + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11], + &s_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15], + &s_hi[15]); + + // final fdct_round_shift + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS); + x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS); + x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS); + x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS); + x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS); + x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS); + x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS); + x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS); + + // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + x_lo[8] = vmovn_s32(t_lo[8]); + x_hi[8] = vmovn_s32(t_hi[8]); + x_lo[9] = vmovn_s32(t_lo[9]); + x_hi[9] = vmovn_s32(t_hi[9]); + x_lo[12] = vmovn_s32(t_lo[12]); + x_hi[12] = vmovn_s32(t_hi[12]); + x_lo[13] = vmovn_s32(t_lo[13]); + x_hi[13] = vmovn_s32(t_hi[13]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8])); + in[2] = vcombine_s16(x_lo[12], x_hi[12]); + in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[4] = vcombine_s16(x_lo[6], x_hi[6]); + in[5] = vcombine_s16(x_lo[14], x_hi[14]); + in[6] = vcombine_s16(x_lo[10], x_hi[10]); + in[7] = vcombine_s16(x_lo[2], x_hi[2]); + in[8] = vcombine_s16(x_lo[3], x_hi[3]); + in[9] = vcombine_s16(x_lo[11], x_hi[11]); + in[10] = vcombine_s16(x_lo[15], x_hi[15]); + in[11] = vcombine_s16(x_lo[7], x_hi[7]); + in[12] = vcombine_s16(x_lo[5], x_hi[5]); + in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13])); + in[14] = vcombine_s16(x_lo[9], x_hi[9]); + in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); +} + +static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) { + // Left half. + fdct16_8col(in0); + // Right half. + fdct16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fdct16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_load_buffer_4x4(const int16_t *input, + int32x4_t *in /*[4]*/, int stride) { + // { 0, 1, 1, 1 }; + const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3); + // { 1, 0, 0, 0 }; + const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3); + int32x4_t mask; + + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a)); + in[0] = vaddq_s32(in[0], mask); + in[0] = vaddq_s32(in[0], nonzero_bias_b); +} + +static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) { + const int32x4_t one = vdupq_n_s32(1); + res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2); + res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2); + res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2); + res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2); + vst1q_s32(output + 0 * 4, res[0]); + vst1q_s32(output + 1 * 4, res[1]); + vst1q_s32(output + 2 * 4, res[2]); + vst1q_s32(output + 3 * 4, res[3]); +} + +static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) { + int32x2_t s_lo[4], s_hi[4]; + int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4]; + + s_lo[0] = vget_low_s32(in[0]); + s_hi[0] = vget_high_s32(in[0]); + s_lo[1] = vget_low_s32(in[1]); + s_hi[1] = vget_high_s32(in[1]); + s_lo[2] = vget_low_s32(in[2]); + s_hi[2] = vget_high_s32(in[2]); + s_lo[3] = vget_low_s32(in[3]); + s_hi[3] = vget_high_s32(in[3]); + + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9); + t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9); + t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9); + t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9); + t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9); + t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9); + t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9); + t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9); + t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9); + t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9); + t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9); + t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9); + t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]); + u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]); + u_lo[1] = t_lo[1]; + u_hi[1] = t_hi[1]; + u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]); + u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]); + u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]); + u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]); + + // fdct_round_shift + in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[0], DCT_CONST_BITS)); + in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[1], DCT_CONST_BITS)); + in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[2], DCT_CONST_BITS)); + in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[3], DCT_CONST_BITS)); + + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); +} + +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t in[4]; + // int i; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_write_buffer_4x4(output, in); + break; + case DCT_ADST: + highbd_load_buffer_4x4(input, in, stride); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + } +} + +static INLINE void highbd_load_buffer_8x8(const int16_t *input, + int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/, int stride) { + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 0000000000..d631cd437d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if VPX_ARCH_AARCH64 + return vaddlvq_s8(v_sum_diff_total); +#else + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +#endif +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 0000000000..b82b3f9db5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. * + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); + + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if VPX_ARCH_AARCH64 + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + // Starting position + unsigned int best_sad = start_mv_sad; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if VPX_ARCH_AARCH64 + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if VPX_ARCH_AARCH64 + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. + { +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if VPX_ARCH_AARCH64 + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if VPX_ARCH_AARCH64 + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if VPX_ARCH_AARCH64 + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c new file mode 100644 index 0000000000..0cf0bf250e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before + // accumulating them in 64-bits. + err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can store 2 15-bit diff before accumulating into 64-bits. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 0000000000..bc8dd4a341 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. + const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c new file mode 100644 index 0000000000..d9b183472d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c new file mode 100644 index 0000000000..c3aef3c865 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const uint16x8_t a_reg = vld1q_u16(a); + const uint16x8_t b_reg = vld1q_u16(b); + + uint16x8_t dist = vabdq_u16(a_reg, b_reg); + uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist)); + uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist)); + + vst1q_u32(dst, dist_first); + vst1q_u32(dst + 4, dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) { + uint32x4_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u32(dist); + dist_left = vld1q_u32(dist - 1); + dist_right = vld1q_u32(dist + 1); + + *sum = vaddq_u32(dist_reg, dist_left); + *sum = vaddq_u32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first, + uint32x4_t *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum, + const uint32x4_t *mul_constants, + const int strength, const int rounding, + const int weight) { + const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32); + const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32); + const uint32x4_t weight_u32 = vdupq_n_u32(weight); + const uint32x4_t sixteen = vdupq_n_u32(16); + uint32x4_t sum2; + + // modifier * 3 / index; + uint64x2_t sum_lo = + vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants)); + uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum), + vget_high_u32(*mul_constants)); + + // we cannot use vshrn_n_u64 as strength is not known at compile time. + sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint32x4_t u_reg, v_reg; + uint32x4x2_t pair; + + highbd_read_dist_4(u_dist, &u_reg); + + pair = vzipq_u32(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + highbd_read_dist_4(v_dist, &v_reg); + + pair = vzipq_u32(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +static void highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_first, mul_second; + + uint32x4_t sum_row_1_first, sum_row_1_second; + uint32x4_t sum_row_2_first, sum_row_2_second; + uint32x4_t sum_row_3_first, sum_row_3_second; + + uint32x4_t u_first, u_second; + uint32x4_t v_first, v_second; + + uint32x4_t sum_row_first; + uint32x4_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u32(neighbors_first[1]); + mul_second = vld1q_u32(neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 0000000000..96d0614367 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" + +static VPX_FORCE_INLINE void calculate_dqcoeff_and_store( + const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, + int16x8_t v_eobmax, + uint16x8_t v_nz_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if VPX_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif // VPX_ARCH_AARCH64 +} + +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); + *dequant = vld1q_s16(dequant_ptr); +} + +static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, + int16x8_t *v_quant, + int16x8_t *v_dequant) { +#if VPX_ARCH_AARCH64 + *v_round = vdupq_laneq_s16(*v_round, 1); + *v_quant = vdupq_laneq_s16(*v_quant, 1); + *v_dequant = vdupq_laneq_s16(*v_dequant, 1); +#else + *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1); + *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1); + *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int16x8_t *v_eobmax) { + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_round, v_quant, v_dequant; + const int16_t *iscan = scan_order->iscan; + + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); + // process dc and the first seven ac coeffs + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, + dqcoeff_ptr, &v_eobmax); + + // now process the rest of the ac coeffs + update_fp_values(&v_round, &v_quant, &v_dequant); + for (i = 8; i < n_coeffs; i += 8) { + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, + qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); + } + + *eob_ptr = get_max_eob(v_eobmax); +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void quantize_fp_32x32_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const int16x8_t *dequant_thresh, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_coeff_abs = vabsq_s16(v_coeff); + const int16x8_t v_thr_mask = + vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh)); + const int16x8_t v_tmp_rnd = + vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask); + const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0)); + + int32x4_t dqcoeff_0, dqcoeff_1; + dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant)); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); + vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); +#else + store_s16q_to_tran_low(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), + vshrn_n_s32(dqcoeff_1, 1))); +#endif + + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int16x8_t eob_max = vdupq_n_s16(-1); + // ROUND_POWER_OF_TWO(round_ptr[], 1) + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); + int16x8_t dequant = vld1q_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + int i; + const int16_t *iscan = scan_order->iscan; + + (void)n_coeffs; + + // Process dc and the first seven ac coeffs. + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); + + update_fp_values(&round, &quant, &dequant); + dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1); + + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + // Process the rest of the ac coeffs. + for (i = 8; i < 32 * 32; i += 8) { + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); + + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + *eob_ptr = get_max_eob(eob_max); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); + // const int abs_qcoeff = (int)((tmp * quant) >> 16); + const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + const int16_t *iscan = scan_order->iscan; + + // DC and first 3 AC + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} + +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_round = + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + const int16_t *iscan = scan_order->iscan; + + // DC and first 3 AC + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, + dqcoeff_ptr + 4, v_quant_s32, + v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 0000000000..a651a15d90 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x8_t a_reg = vld1_u8(a); + const uint8x8_t b_reg = vld1_u8(b); + + uint16x8_t dist_first = vabdl_u8(a_reg, b_reg); + dist_first = vmulq_u16(dist_first, dist_first); + + vst1q_u16(dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x16_t a_reg = vld1q_u8(a); + const uint8x16_t b_reg = vld1q_u8(b); + + uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg)); + uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg)); + dist_first = vmulq_u16(dist_first, dist_first); + dist_second = vmulq_u16(dist_second, dist_second); + + vst1q_u16(dst, dist_first); + vst1q_u16(dst + 8, dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) { + *dist_reg = vld1q_u16(dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first, + uint16x8_t *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE uint16x8_t average_8(uint16x8_t sum, + const uint16x8_t *mul_constants, + const int strength, const int rounding, + const uint16x8_t *weight) { + const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16); + const uint16x8_t weight_u16 = *weight; + const uint16x8_t sixteen = vdupq_n_u16(16); + const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16); + + // modifier * 3 / index; + uint32x4_t sum_hi = + vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants)); + uint32x4_t sum_lo = + vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants)); + + sum_lo = vqaddq_u32(sum_lo, rounding_u32); + sum_hi = vqaddq_u32(sum_hi, rounding_u32); + + // we cannot use vshrn_n_u32 as strength is not known at compile time. + sum_lo = vshlq_u32(sum_lo, strength_u32); + sum_hi = vshlq_u32(sum_hi, strength_u32); + + sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo)); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = vminq_u16(sum, sixteen); + sum = vsubq_u16(sixteen, sum); + return vmulq_u16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const uint16x8_t sum_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred)); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16)); + accum_1_u32 = + vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16, + const uint16x8_t sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint8x16_t pred_u8 = vld1q_u8(pred); + uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8)); + uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8)); + uint16x8_t count_0_u16 = vld1q_u16(count); + uint16x8_t count_1_u16 = vld1q_u16(count + 8); + uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16); + vst1q_u16(count, count_0_u16); + count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16); + vst1q_u16(count + 8, count_1_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + accum_2_u32 = vld1q_u32(accumulator + 8); + accum_3_u32 = vld1q_u32(accumulator + 12); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16)); + accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16), + vget_high_u16(pred_0_u16)); + accum_2_u32 = + vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16)); + accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16), + vget_high_u16(pred_1_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); + vst1q_u32(accumulator + 8, accum_2_u32); + vst1q_u32(accumulator + 12, accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) { + uint16x8_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u16(y_dist); + dist_left = vld1q_u16(y_dist - 1); + dist_right = vld1q_u16(y_dist + 1); + + *sum = vqaddq_u16(dist_reg, dist_left); + *sum = vqaddq_u16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first, + uint16x8_t *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + uint16x8_t *u_first, + uint16x8_t *u_second, + uint16x8_t *v_first, + uint16x8_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c new file mode 100644 index 0000000000..61786d8f66 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ + static int64_t block_error_##BSize##size_msa( \ + const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \ + sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ + } + +#if !CONFIG_VP9_HIGHBITDEPTH +BLOCK_ERROR_BLOCKSIZE_MSA(16); +BLOCK_ERROR_BLOCKSIZE_MSA(64); +BLOCK_ERROR_BLOCKSIZE_MSA(256); +BLOCK_ERROR_BLOCKSIZE_MSA(1024); + +int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr, + const tran_low_t *dq_coeff_ptr, intptr_t blk_size, + int64_t *ssz) { + int64_t err; + const int16_t *coeff = (const int16_t *)coeff_ptr; + const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr; + + switch (blk_size) { + case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break; + case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break; + case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break; + case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break; + default: + err = vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz); + break; + } + + return err; +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c new file mode 100644 index 0000000000..efbbe830db --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" +#include "vpx_dsp/mips/fwd_txfm_msa.h" + +static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, + const int32_t *const0, int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r15 = LD_SH(input + 15 * stride); + r7 = LD_SH(input + 7 * stride); + r8 = LD_SH(input + 8 * stride); + SLLI_4V(r0, r15, r7, r8, 2); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 8, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * stride); + r4 = LD_SH(input + 4 * stride); + r11 = LD_SH(input + 11 * stride); + r12 = LD_SH(input + 12 * stride); + SLLI_4V(r3, r4, r11, r12, 2); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp2, int_buf, 8); + ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + ST_SH2(h0, h1, int_buf + 8 * 8, 8); + ST_SH2(h3, h2, int_buf + 12 * 8, 8); + + r9 = LD_SH(input + 9 * stride); + r6 = LD_SH(input + 6 * stride); + r1 = LD_SH(input + stride); + r14 = LD_SH(input + 14 * stride); + SLLI_4V(r9, r6, r1, r14, 2); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r13 = LD_SH(input + 13 * stride); + r2 = LD_SH(input + 2 * stride); + r5 = LD_SH(input + 5 * stride); + r10 = LD_SH(input + 10 * stride); + SLLI_4V(r13, r2, r5, r10, 2); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 128; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); + LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); + out += 64; + + /* load input data */ + input += 128; + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); +} + +static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, + int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r7 = LD_SH(input + 7 * 8); + r8 = LD_SH(input + 8 * 8); + r15 = LD_SH(input + 15 * 8); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 4 * 2, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * 8); + r4 = LD_SH(input + 4 * 8); + r11 = LD_SH(input + 11 * 8); + r12 = LD_SH(input + 12 * 8); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp1, int_buf, 4 * 8); + ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); + ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); + + r1 = LD_SH(input + 8); + r6 = LD_SH(input + 6 * 8); + r9 = LD_SH(input + 9 * 8); + r14 = LD_SH(input + 14 * 8); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r2 = LD_SH(input + 2 * 8); + r5 = LD_SH(input + 5 * 8); + r10 = LD_SH(input + 10 * 8); + r13 = LD_SH(input + 13 * 8); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 8; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + g13 = LD_SH(int_buf + 3 * 8); + g15 = LD_SH(int_buf + 7 * 8); + g5 = LD_SH(int_buf + 11 * 8); + g7 = LD_SH(int_buf + 15 * 8); + + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); + out += 16 * 8; + + /* load input data */ + input += 128; + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); +} + +static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { + int16_t *temp = intermediate; + int16_t *out = output; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; + v8i16 in12, in13, in14, in15; + + LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); + temp = intermediate + 8; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(in0, in1); + FDCT_POSTPROC_2V_NEG_H(in2, in3); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + FDCT_POSTPROC_2V_NEG_H(in6, in7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + temp = intermediate; + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + temp = intermediate; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + out = output + 8; + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); +} + +void vp9_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + DECLARE_ALIGNED(32, int16_t, tmp[256]); + DECLARE_ALIGNED(32, int16_t, trans_buf[256]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); + int32_t i; + int16_t *ptmpbuf = &tmp_buf[0]; + int16_t *trans = &trans_buf[0]; + const int32_t const_arr[29 * 4] = { + 52707308, 52707308, 52707308, 52707308, -1072430300, + -1072430300, -1072430300, -1072430300, 795618043, 795618043, + 795618043, 795618043, -721080468, -721080468, -721080468, + -721080468, 459094491, 459094491, 459094491, 459094491, + -970646691, -970646691, -970646691, -970646691, 1010963856, + 1010963856, 1010963856, 1010963856, -361743294, -361743294, + -361743294, -361743294, 209469125, 209469125, 209469125, + 209469125, -1053094788, -1053094788, -1053094788, -1053094788, + 1053160324, 1053160324, 1053160324, 1053160324, 639644520, + 639644520, 639644520, 639644520, -862444000, -862444000, + -862444000, -862444000, 1062144356, 1062144356, 1062144356, + 1062144356, -157532337, -157532337, -157532337, -157532337, + 260914709, 260914709, 260914709, 260914709, -1041559667, + -1041559667, -1041559667, -1041559667, 920985831, 920985831, + 920985831, 920985831, -551995675, -551995675, -551995675, + -551995675, 596522295, 596522295, 596522295, 596522295, + 892853362, 892853362, 892853362, 892853362, -892787826, + -892787826, -892787826, -892787826, 410925857, 410925857, + 410925857, 410925857, -992012162, -992012162, -992012162, + -992012162, 992077698, 992077698, 992077698, 992077698, + 759246145, 759246145, 759246145, 759246145, -759180609, + -759180609, -759180609, -759180609, -759222975, -759222975, + -759222975, -759222975, 759288511, 759288511, 759288511, + 759288511 + }; + + switch (tx_type) { + case DCT_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case ADST_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case DCT_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + case ADST_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + default: assert(0); break; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c new file mode 100644 index 0000000000..9c5cc12ef0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +void vp9_fwht4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + in0 += in1; + in3 -= in2; + in4 = (in0 - in3) >> 1; + SUB2(in4, in1, in4, in2, in1, in2); + in0 -= in2; + in3 += in1; + + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + + in0 += in2; + in1 -= in3; + in4 = (in0 - in1) >> 1; + SUB2(in4, in2, in4, in3, in2, in3); + in0 -= in3; + in1 += in2; + + SLLI_4V(in0, in1, in2, in3, 2); + + TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); + + ST4x2_UB(in0, output, 4); + ST4x2_UB(in3, output + 4, 4); + ST4x2_UB(in1, output + 8, 4); + ST4x2_UB(in2, output + 12, 4); +} + +void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 temp, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + temp = __msa_ceqi_h(in0, 0); + temp = (v8i16)__msa_xori_b((v16u8)temp, 255); + temp = mask & temp; + in0 += temp; + } + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: assert(0); break; + } + + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c new file mode 100644 index 0000000000..26d81aa9ef --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_DCT: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case DCT_ADST: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_ADST: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + default: assert(0); break; + } + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h new file mode 100644 index 0000000000..fa1af2fc57 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ + +#include "vpx_dsp/mips/fwd_txfm_msa.h" +#include "vpx_dsp/mips/txfm_macros_msa.h" +#include "vpx_ports/mem.h" + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ + v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ + \ + UNPCK_R_SH_SW(in0, in0_r_m); \ + UNPCK_R_SH_SW(in1, in1_r_m); \ + UNPCK_R_SH_SW(in2, in2_r_m); \ + UNPCK_R_SH_SW(in3, in3_r_m); \ + \ + constant_m = __msa_fill_w(sinpi_4_9); \ + MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ + \ + constant_m = __msa_fill_w(sinpi_1_9); \ + s0_m += in0_r_m * constant_m; \ + s1_m -= in1_r_m * constant_m; \ + \ + constant_m = __msa_fill_w(sinpi_2_9); \ + s0_m += in1_r_m * constant_m; \ + s1_m += in3_r_m * constant_m; \ + \ + s2_m = in0_r_m + in1_r_m - in3_r_m; \ + \ + constant_m = __msa_fill_w(sinpi_3_9); \ + MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ + \ + in0_r_m = s0_m + s3_m; \ + s2_m = s1_m - s3_m; \ + s3_m = s1_m - s0_m + s3_m; \ + \ + SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ + out0, out1, out2, out3); \ + } +#endif // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c new file mode 100644 index 0000000000..4d31558471 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +#include "./vp9_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 +// Note: Because this is done in 2 operations, a and b cannot both be UINT16_MIN +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + + (void)scan; + + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from round and quant + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + + // Remove DC value from dequant + dequant = vec_splat(dequant, 1); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per + // loop iteration. + // for 8x8: 16 + 2 x 24 = 64 + // for 16x16: 16 + 10 x 24 = 256 + if (n_coeffs > 16) { + int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2; + + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + + do { + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + + qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + qcoeff2 = vec_sign(qcoeff2, coeff2); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} + +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); + int16x8_t abs_coeff0 = vec_abs(coeff0); + int16x8_t abs_coeff1 = vec_abs(coeff1); + + (void)scan; + (void)n_coeffs; + + mask0 = vec_cmpge(abs_coeff0, thres); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff0 = vec_and(qcoeff0, mask0); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from thres, round, quant and dequant + thres = vec_splat(thres, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + + mask1 = vec_cmpge(abs_coeff1, thres); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = + vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16); + qcoeff1 = vec_and(qcoeff1, mask1); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + do { + int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2, mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + abs_coeff0 = vec_abs(coeff0); + abs_coeff1 = vec_abs(coeff1); + abs_coeff2 = vec_abs(coeff2); + + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16); + qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16); + + mask0 = vec_cmpge(abs_coeff0, thres); + mask1 = vec_cmpge(abs_coeff1, thres); + mask2 = vec_cmpge(abs_coeff2, thres); + + qcoeff0 = vec_and(qcoeff0, mask0); + qcoeff1 = vec_and(qcoeff1, mask1); + qcoeff2 = vec_and(qcoeff2, mask2); + + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + + qcoeff0 = vec_sign(qcoeff0, coeff0); + qcoeff1 = vec_sign(qcoeff1, coeff1); + qcoeff2 = vec_sign(qcoeff2, coeff2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c new file mode 100644 index 0000000000..acc3764c7a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file in the root of the source tree. An additional + * intellectual property rights grant can be found in the file PATENTS. + * All contributing project authors may be found in the AUTHORS file in + * the root of the source tree. + */ + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_alt_ref_aq.h" + +struct ALT_REF_AQ { + int dummy; +}; + +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { + return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); +} + +void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self) { vpx_free(self); } + +void vp9_alt_ref_aq_upload_map(struct ALT_REF_AQ *const self, + const struct MATX_8U *segmentation_map) { + (void)self; + (void)segmentation_map; +} + +void vp9_alt_ref_aq_set_nsegments(struct ALT_REF_AQ *const self, + int nsegments) { + (void)self; + (void)nsegments; +} + +void vp9_alt_ref_aq_setup_mode(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi) { + (void)cpi; + (void)self; +} + +// set basic segmentation to the altref's one +void vp9_alt_ref_aq_setup_map(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi) { + (void)cpi; + (void)self; +} + +// restore cpi->aq_mode +void vp9_alt_ref_aq_unset_all(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi) { + (void)cpi; + (void)self; +} + +int vp9_alt_ref_aq_disable_if(const struct ALT_REF_AQ *self, + int segmentation_overhead, int bandwidth) { + (void)bandwidth; + (void)self; + (void)segmentation_overhead; + + return 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h new file mode 100644 index 0000000000..22a657e035 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file in the root of the source tree. An additional + * intellectual property rights grant can be found in the file PATENTS. + * All contributing project authors may be found in the AUTHORS file in + * the root of the source tree. + */ + +/* + * \file vp9_alt_ref_aq.h + * + * This file contains public interface for setting up adaptive segmentation + * for altref frames. Go to alt_ref_aq_private.h for implmentation details. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ + +#include "vpx/vpx_integer.h" + +// Where to disable segmentation +#define ALT_REF_AQ_LOW_BITRATE_BOUNDARY 150 + +// Last frame always has overall quality = 0, +// so it is questionable if I can process it +#define ALT_REF_AQ_APPLY_TO_LAST_FRAME 1 + +// If I should try to compare gain +// against segmentation overhead +#define ALT_REF_AQ_PROTECT_GAIN 0 + +// Threshold to disable segmentation +#define ALT_REF_AQ_PROTECT_GAIN_THRESH 0.5 + +#ifdef __cplusplus +extern "C" { +#endif + +// Simple structure for storing images +struct MATX_8U { + int rows; + int cols; + int stride; + + uint8_t *data; +}; + +struct VP9_COMP; +struct ALT_REF_AQ; + +/*!\brief Constructor + * + * \return Instance of the class + */ +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); + +/*!\brief Upload segmentation_map to self object + * + * \param self Instance of the class + * \param segmentation_map Segmentation map to upload + */ +void vp9_alt_ref_aq_upload_map(struct ALT_REF_AQ *const self, + const struct MATX_8U *segmentation_map); + +/*!\brief Return pointer to the altref segmentation map + * + * \param self Instance of the class + * \param segmentation_overhead Segmentation overhead in bytes + * \param bandwidth Current frame bandwidth in bytes + * + * \return Boolean value to disable segmentation + */ +int vp9_alt_ref_aq_disable_if(const struct ALT_REF_AQ *self, + int segmentation_overhead, int bandwidth); + +/*!\brief Set number of segments + * + * It is used for delta quantizer computations + * and thus it can be larger than + * maximum value of the segmentation map + * + * \param self Instance of the class + * \param nsegments Maximum number of segments + */ +void vp9_alt_ref_aq_set_nsegments(struct ALT_REF_AQ *const self, int nsegments); + +/*!\brief Set up LOOKAHEAD_AQ segmentation mode + * + * Set up segmentation mode to LOOKAHEAD_AQ + * (expected future frames prediction + * quality refering to the current frame). + * + * \param self Instance of the class + * \param cpi Encoder context + */ +void vp9_alt_ref_aq_setup_mode(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi); + +/*!\brief Set up LOOKAHEAD_AQ segmentation map and delta quantizers + * + * \param self Instance of the class + * \param cpi Encoder context + */ +void vp9_alt_ref_aq_setup_map(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi); + +/*!\brief Restore main segmentation map mode and reset the class variables + * + * \param self Instance of the class + * \param cpi Encoder context + */ +void vp9_alt_ref_aq_unset_all(struct ALT_REF_AQ *const self, + struct VP9_COMP *const cpi); + +/*!\brief Destructor + * + * \param self Instance of the class + */ +void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c new file mode 100644 index 0000000000..dba017ffcc --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +#include "vp9/encoder/vp9_aq_360.h" +#include "vp9/encoder/vp9_aq_variance.h" + +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_segmentation.h" + +static const double rate_ratio[MAX_SEGMENTS] = { 1.0, 0.75, 0.6, 0.5, + 0.4, 0.3, 0.25 }; + +// Sets segment id 0 for the equatorial region, 1 for temperate region +// and 2 for the polar regions +unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows) { + if (mi_row < mi_rows / 8 || mi_row > mi_rows - mi_rows / 8) + return 2; + else if (mi_row < mi_rows / 4 || mi_row > mi_rows - mi_rows / 4) + return 1; + else + return 0; +} + +void vp9_360aq_frame_setup(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *seg = &cm->seg; + int i; + + if (frame_is_intra_only(cm) || cpi->force_update_segmentation || + cm->error_resilient_mode) { + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + + seg->abs_delta = SEGMENT_DELTADATA; + + vpx_clear_system_state(); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + int qindex_delta = + vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + rate_ratio[i], cm->bit_depth); + + // We don't allow qindex 0 in a segment if the base value is not 0. + // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + + // No need to enable SEG_LVL_ALT_Q for this segment. + if (rate_ratio[i] == 1.0) { + continue; + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h new file mode 100644 index 0000000000..749d3c198a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_ +#define VPX_VP9_ENCODER_VP9_AQ_360_H_ + +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows); +void vp9_360aq_frame_setup(VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_AQ_360_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c new file mode 100644 index 0000000000..ef3423f8eb --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/system_state.h" + +#include "vp9/encoder/vp9_aq_complexity.h" +#include "vp9/encoder/vp9_aq_variance.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/encoder/vp9_segmentation.h" + +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 1.75, 1.25, 1.05, 1.00, 0.90 }, + { 2.00, 1.50, 1.15, 1.00, 0.85 }, + { 2.50, 1.75, 1.25, 1.00, 0.80 } +}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 0.15, 0.30, 0.55, 2.00, 100.0 }, + { 0.20, 0.40, 0.65, 2.00, 100.0 }, + { 0.25, 0.50, 0.75, 2.00, 100.0 } +}; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { -4.0, -3.0, -2.0, 100.00, 100.0 }, + { -3.5, -2.5, -1.5, 100.00, 100.0 }, + { -3.0, -2.0, -1.0, 100.00, 100.0 } +}; + +static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) { + // Approximate base quatizer (truncated to int) + const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4; + return (base_quant > 10) + (base_quant > 25); +} + +void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + + // Make SURE use of floating point in this function is safe. + vpx_clear_system_state(); + + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cpi->refresh_alt_ref_frame || cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int segment; + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + + // Clear down the segment map. + memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols); + + vp9_clearall_segfeatures(seg); + + // Segmentation only makes sense if the target bits per SB is above a + // threshold. Below this the overheads will usually outweigh any benefit. + if (cpi->rc.sb64_target_rate < 256) { + vp9_disable_segmentation(seg); + return; + } + + vp9_enable_segmentation(seg); + + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); + + // Use some of the segments for in frame Q adjustment. + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) continue; + + qindex_delta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, cm->base_qindex, + aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); + + // For AQ complexity mode, we don't allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + if ((cm->base_qindex + qindex_delta) > 0) { + vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } + } + } +} + +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +// Select a segment for the current block. +// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average and its spatial complexity. +void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { + VP9_COMMON *const cm = &cpi->common; + + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]); + const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]); + int x, y; + int i; + unsigned char segment; + + if (0) { + segment = DEFAULT_AQ2_SEG; + } else { + // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). + // It is converted to bits * 256 units. + const int target_rate = + (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); + double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + + vpx_clear_system_state(); + low_var_thresh = (cpi->oxcf.pass == 2) ? VPXMAX(cpi->twopass.mb_av_energy, + MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; + + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); + logvar = vp9_log_block_var(cpi, mb, bs); + + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold. + // Increasing segment number (higher variance and complexity) = higher Q. + if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; + break; + } + } + } + + // Fill in the entires in the segment map corresponding to this SB64. + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment; + } + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h new file mode 100644 index 0000000000..d3cb34c013 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/common/vp9_enums.h" + +struct VP9_COMP; +struct macroblock; + +// Select a segment for the current Block. +void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, int mi_row, int mi_col, + int projected_rate); + +// This function sets up a set of segments with delta Q values around +// the baseline frame quantizer. +void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c new file mode 100644 index 0000000000..28ab10a13b --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/system_state.h" + +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" + +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_segmentation.h" + +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + +CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; + CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); + if (cr == NULL) return NULL; + + cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map)); + if (cr->map == NULL) { + vp9_cyclic_refresh_free(cr); + return NULL; + } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + vp9_cyclic_refresh_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->counter_encode_maxq_scene_change = 0; + cr->content_mode = 1; + return cr; +} + +void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { + if (cr != NULL) { + vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); + vpx_free(cr); + } +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, const MODE_INFO *mi, + int64_t rate, int64_t dist, int bsize) { + MV mv = mi->mv[0].as_mv; + // Reject the block for lower-qp coding if projected distortion + // is above the threshold, and any of the following is true: + // 1) mode uses large mv + // 2) mode is an intra-mode + // Otherwise accept for refresh. + if (dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mi))) + return CR_SEGMENT_ID_BASE; + else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && + is_inter_block(mi) && mi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BOOST1; +} + +// Compute delta-q for the segment. +static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const RATE_CONTROL *const rc = &cpi->rc; + int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, + rate_factor, cpi->common.bit_depth); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +// For the just encoded frame, estimate the bits, incorporating the delta-q +// from non-base segment. For now ignore effect of multiple segments +// (with different delta-q). Note this function is called in the postencode +// (called from rc_update_rate_correction_factors()). +int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, + double correction_factor) { + const VP9_COMMON *const cm = &cpi->common; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int estimated_bits; + int mbs = cm->MBs; + int num8x8bl = mbs << 2; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame. Note number of blocks here is in 8x8 units. + double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl; + double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl; + // Take segment weighted average for estimated bits. + estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, + correction_factor, cm->bit_depth) + + weight_segment1 * + vp9_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[1], + mbs, correction_factor, cm->bit_depth) + + weight_segment2 * + vp9_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[2], + mbs, correction_factor, cm->bit_depth)); + return estimated_bits; +} + +// Prior to encoding the frame, estimate the bits per mb, for a given q = i and +// a corresponding delta-q (for segment 1). This function is called in the +// rc_regulate_q() to set the base qp index. +// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or +// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding. +int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, + double correction_factor) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int deltaq = 0; + if (cpi->oxcf.speed < 8) + deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + else + deltaq = -(cr->max_qdelta_perc * i) / 200; + // Take segment weighted average for bits per mb. + bits_per_mb = (int)((1.0 - cr->weight_segment) * + vp9_rc_bits_per_mb(cm->frame_type, i, + correction_factor, cm->bit_depth) + + cr->weight_segment * + vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, + correction_factor, cm->bit_depth)); + return bits_per_mb; +} + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, MODE_INFO *const mi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip, + struct macroblock_plane *const p) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + int refresh_this_block = candidate_refresh_aq(cr, mi, rate, dist, bsize); + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + int x = 0; + int y = 0; + + int is_skin = 0; + if (refresh_this_block == 0 && bsize <= BLOCK_16X16 && + cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(p[0].src.buf, p[1].src.buf, p[2].src.buf, + p[0].src.stride, p[1].src.stride, bsize, 0, 0); + if (is_skin) refresh_this_block = 1; + } + + if (cpi->oxcf.rc_mode == VPX_VBR && mi->ref_frame[0] == GOLDEN_FRAME) + refresh_this_block = 0; + + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (cpi->sf.use_nonrd_pick_mode && + cyclic_refresh_segment_id_boosted(mi->segment_id)) { + mi->segment_id = refresh_this_block; + // Reset segment_id if it will be skipped. + if (skip) mi->segment_id = CR_SEGMENT_ID_BASE; + } + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean. The magnitude of the -ve influences how long before we consider + // it for refresh again. + if (cyclic_refresh_segment_id_boosted(mi->segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as block that is not candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->segmentation_map[map_offset] = mi->segment_id; + } +} + +void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi, + const MODE_INFO *const mi, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + int x, y; + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_cols + x; + // Inter skip blocks were clearly not coded at the current qindex, so + // don't update the map for them. For cases where motion is non-zero or + // the reference frame isn't the previous frame, the previous value in + // the map for this spatial location is not entirely correct. + if ((!is_inter_block(mi) || !mi->skip) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + cr->last_coded_q_map[map_offset] = + clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ); + } else if (is_inter_block(mi) && mi->skip && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + cr->last_coded_q_map[map_offset] = VPXMIN( + clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ), + cr->last_coded_q_map[map_offset]); + } + } +} + +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. +void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi = cm->mi_grid_visible; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + RATE_CONTROL *const rc = &cpi->rc; + unsigned char *const seg_map = cpi->segmentation_map; + double fraction_low = 0.0; + int force_gf_refresh = 0; + int low_content_frame = 0; + int mi_row, mi_col; + cr->actual_num_seg1_blocks = 0; + cr->actual_num_seg2_blocks = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + MV mv = mi[0]->mv[0].as_mv; + int map_index = mi_row * cm->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1) + cr->actual_num_seg1_blocks++; + else if (cyclic_refresh_segment_id(seg_map[map_index]) == + CR_SEGMENT_ID_BOOST2) + cr->actual_num_seg2_blocks++; + // Accumulate low_content_frame. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + low_content_frame++; + mi++; + } + mi += 8; + } + // Check for golden frame update: only for non-SVC and non-golden boost. + if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && + !cpi->oxcf.gf_cbr_boost_pct) { + // Force this frame as a golden update frame if this frame changes the + // resolution (resize_pending != 0). + if (cpi->resize_pending != 0) { + vp9_cyclic_refresh_set_golden_update(cpi); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + force_gf_refresh = 1; + } + // Update average of low content/motion in the frame. + fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols); + cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; + if (!force_gf_refresh && cpi->refresh_golden_frame == 1 && + rc->frames_since_key > rc->frames_since_golden + 1) { + // Don't update golden reference if the amount of low_content for the + // current encoded frame is small, or if the recursive average of the + // low_content over the update interval window falls below threshold. + if (fraction_low < 0.65 || cr->low_content_avg < 0.6) { + cpi->refresh_golden_frame = 0; + } + // Reset for next internal. + cr->low_content_avg = fraction_low; + } + } +} + +// Set golden frame update interval, for non-svc 1 pass CBR mode. +void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + if (cr->percent_refresh > 0) + rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40); + else + rc->baseline_gf_interval = 40; + if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 && + cr->content_mode) + rc->baseline_gf_interval = 10; +} + +static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index, + int sb_col_index) { + unsigned int source_variance; + const uint8_t *src_y = cpi->Source->y_buffer; + const int ystride = cpi->Source->y_stride; + unsigned int sse; + const BLOCK_SIZE bsize = BLOCK_64X64; + src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + source_variance = + cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse); + if (source_variance == 0) { + uint64_t block_sad; + const uint8_t *last_src_y = cpi->Last_Source->y_buffer; + const int last_ystride = cpi->Last_Source->y_stride; + last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + block_sad = + cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride); + if (block_sad == 0) return 1; + } + return 0; +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock). +static void cyclic_refresh_update_map(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->segmentation_map; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y; + int consec_zero_mv_thresh = 0; + int qindex_thresh = 0; + int count_sel = 0; + int count_tot = 0; + memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); + sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + sbs_in_frame = sb_cols * sb_rows; + // Number of target blocks to get the q delta (segment 1). + block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + // Set the segmentation map: cycle through the superblocks, starting at + // cr->mb_index, and stopping when either block_count blocks have been found + // to be refreshed, or we have passed through whole frame. + assert(cr->sb_index < sbs_in_frame); + i = cr->sb_index; + cr->target_num_seg_blocks = 0; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) { + consec_zero_mv_thresh = 100; + } + qindex_thresh = + cpi->oxcf.content == VP9E_CONTENT_SCREEN + ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) + : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); + // More aggressive settings for noisy content. + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + cr->content_mode) { + consec_zero_mv_thresh = 60; + qindex_thresh = + VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), + cm->base_qindex); + } + do { + int sum_map = 0; + int consec_zero_mv_thresh_block = consec_zero_mv_thresh; + // Get the mi_row/mi_col corresponding to superblock index i. + int sb_row_index = (i / sb_cols); + int sb_col_index = i - sb_row_index * sb_cols; + int mi_row = sb_row_index * MI_BLOCK_SIZE; + int mi_col = sb_col_index * MI_BLOCK_SIZE; + int flat_static_blocks = 0; + int compute_content = 1; + assert(mi_row >= 0 && mi_row < cm->mi_rows); + assert(mi_col >= 0 && mi_col < cm->mi_cols); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) compute_content = 0; +#endif + if (cr->content_mode == 0 || cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + compute_content = 0; + bl_index = mi_row * cm->mi_cols + mi_col; + // Loop through all 8x8 blocks in superblock and update map. + xmis = + VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); + ymis = + VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + (xmis <= 2 || ymis <= 2)) + consec_zero_mv_thresh_block = 4; + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + const int bl_index2 = bl_index + y * cm->mi_cols + x; + // If the block is as a candidate for clean up then mark it + // for possible boost/refresh (segment 1). The segment id may get + // reset to 0 later depending on the coding mode. + if (cr->map[bl_index2] == 0) { + count_tot++; + if (cr->content_mode == 0 || + cr->last_coded_q_map[bl_index2] > qindex_thresh || + cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { + sum_map++; + count_sel++; + } + } else if (cr->map[bl_index2] < 0) { + cr->map[bl_index2]++; + } + } + } + // Enforce constant segment over superblock. + // If segment is at least half of superblock, set to 1. + if (sum_map >= xmis * ymis / 2) { + // This superblock is a candidate for refresh: + // compute spatial variance and exclude blocks that are spatially flat + // and stationary. Note: this is currently only done for screne content + // mode. + if (compute_content && cr->skip_flat_static_blocks) + flat_static_blocks = + is_superblock_flat_static(cpi, sb_row_index, sb_col_index); + if (!flat_static_blocks) { + // Label this superblock as segment 1. + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } + } + i++; + if (i == sbs_in_frame) { + i = 0; + } + } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); + cr->sb_index = i; + cr->reduce_refresh = 0; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1; +} + +// Set cyclic refresh parameters. +void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num8x8bl = cm->MBs << 2; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int thresh_low_motion = 20; + int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20, + rc->best_quality << 1); + int qp_max_thresh = 117 * MAXQ >> 7; + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 || + is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (cpi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || + (!cpi->use_svc && cr->content_mode && + rc->avg_frame_low_motion < thresh_low_motion && + rc->frames_since_key > 40) || + (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && + rc->frames_since_key > 20) || + (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) { + cr->apply_cyclic_refresh = 0; + return; + } + cr->percent_refresh = 10; + if (cr->reduce_refresh) cr->percent_refresh = 5; + cr->max_qdelta_perc = 60; + cr->time_for_refresh = 0; + cr->motion_thresh = 32; + cr->rate_boost_fac = 15; + // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) + // periods of the refresh cycle, after a key frame. + // Account for larger interval on base layer for temporal layers. + if (cr->percent_refresh > 0 && + rc->frames_since_key < + (4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) { + cr->rate_ratio_qdelta = 3.0; + } else { + cr->rate_ratio_qdelta = 2.0; + if (cr->content_mode && cpi->noise_estimate.enabled && + cpi->noise_estimate.level >= kMedium) { + // Reduce the delta-qp if the estimated source noise is above threshold. + cr->rate_ratio_qdelta = 1.7; + cr->rate_boost_fac = 13; + } + } + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + // Only enable feature of skipping flat_static blocks for top layer + // under screen content mode. + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cr->skip_flat_static_blocks = 1; + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10; + // Increase the amount of refresh on scene change that is encoded at max Q, + // increase for a few cycles of the refresh period (~100 / percent_refresh). + if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30) + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; + } + // Adjust some parameters for low resolutions. + if (cm->width * cm->height <= 352 * 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 64; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); + } + } + if (cpi->oxcf.rc_mode == VPX_VBR) { + // To be adjusted for VBR mode, e.g., based on gf period and boost. + // For now use smaller qp-delta (than CBR), no second boosted seg, and + // turn-off (no refresh) on golden refresh (since it's already boosted). + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 1.5; + cr->rate_boost_fac = 10; + if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) { + cr->percent_refresh = 0; + cr->rate_ratio_qdelta = 1.0; + } + } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. To be used for setting the base qp for the + // frame in vp9_rc_regulate_q. + target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num8x8bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num8x8bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + // For screen-content: don't include target for the weight segment, + // since for all flat areas the segment is reset, so its more accurate + // to just use the previous actual number of seg blocks for the weight. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + weight_segment = + (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / + num8x8bl; + cr->weight_segment = weight_segment; + if (cr->content_mode == 0) { + cr->actual_num_seg1_blocks = + cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + cr->actual_num_seg2_blocks = 0; + cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl; + } +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; + // Reset if resoluton change has occurred. + if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) || + scene_change_detected) { + // Set segmentation map to 0 and disable. + unsigned char *const seg_map = cpi->segmentation_map; + memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + vp9_disable_segmentation(&cm->seg); + if (cm->frame_type == KEY_FRAME || scene_change_detected) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + cr->reduce_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; + } + return; + } else { + int qindex_delta = 0; + int qindex2; + const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + cr->counter_encode_maxq_scene_change++; + vpx_clear_system_state(); + // Set rate threshold to some multiple (set to 2 for now) of the target + // rate (target is given by sb64_target_rate and scaled by 256). + cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + // q will not exceed 457, so (q * q) is within 32bit; see: + // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[]. + cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; + + // Set up segmentation. + // Clear down the segment map. + vp9_enable_segmentation(&cm->seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in vp9_choose_segmap_coding_method(), + // based on the coding cost of each method. For error_resilient mode on the + // last_frame_seg_map is set to 0, so if temporal coding is used, it is + // relative to 0 previous map. + // seg->temporal_update = 0; + + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); + + // Set the q delta for segment BOOST1. + qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); + cr->qindex_delta[1] = qindex_delta; + + // Compute rd-mult for segment BOOST1. + qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ); + + cr->rdmult = vp9_compute_rd_mult(cpi, qindex2); + + vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq( + cpi, cm->base_qindex, + VPXMIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; + vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); + + // Update the segmentation and refresh map. + cyclic_refresh_update_map(cpi); + } +} + +int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { + return cr->rdmult; +} + +void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + cr->counter_encode_maxq_scene_change = 0; +} + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h new file mode 100644 index 0000000000..c74cee4743 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_skin_detection.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +struct CYCLIC_REFRESH { + // Percentage of blocks per frame that are targeted as candidates + // for cyclic refresh. + int percent_refresh; + // Maximum q-delta as percentage of base q. + int max_qdelta_perc; + // Superblock starting index for cycling through the frame. + int sb_index; + // Controls how long block will need to wait to be refreshed again, in + // excess of the cycle time, i.e., in the case of all zero motion, block + // will be refreshed every (100/percent_refresh + time_for_refresh) frames. + int time_for_refresh; + // Target number of (8x8) blocks that are set for delta-q. + int target_num_seg_blocks; + // Actual number of (8x8) blocks that were applied delta-q. + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + // RD mult. parameters for segment 1. + int rdmult; + // Cyclic refresh map. + signed char *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; + // Thresholds applied to the projected rate/distortion of the coding block, + // when deciding whether block should be refreshed. + int64_t thresh_rate_sb; + int64_t thresh_dist_sb; + // Threshold applied to the motion vector (in units of 1/8 pel) of the + // coding block, when deciding whether block should be refreshed. + int16_t motion_thresh; + // Rate target ratio to set q delta. + double rate_ratio_qdelta; + // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + int rate_boost_fac; + double low_content_avg; + int qindex_delta[3]; + int reduce_refresh; + double weight_segment; + int apply_cyclic_refresh; + int counter_encode_maxq_scene_change; + int skip_flat_static_blocks; + int content_mode; +}; + +struct VP9_COMP; + +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +// Estimate the bits, incorporating the delta-q from segment 1, after encoding +// the frame. +int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi, + double correction_factor); + +// Estimate the bits per mb, for a given q = i and a corresponding delta-q +// (for segment 1), prior to encoding the frame. +int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i, + double correction_factor); + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi, + MODE_INFO *const mi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip, + struct macroblock_plane *const p); + +void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, + const MODE_INFO *const mi, + int mi_row, int mi_col, + BLOCK_SIZE bsize); + +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. +void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi); + +// Set golden frame update interval, for non-svc 1 pass CBR mode. +void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi); + +// Set/update global/frame level refresh parameters. +void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi); + +// Setup cyclic background refresh: set delta q and segmentation map. +void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi); + +int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); + +void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi); + +static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { + return segment_id == CR_SEGMENT_ID_BOOST1 || + segment_id == CR_SEGMENT_ID_BOOST2; +} + +static INLINE int cyclic_refresh_segment_id(int segment_id) { + if (segment_id == CR_SEGMENT_ID_BOOST1) + return CR_SEGMENT_ID_BOOST1; + else if (segment_id == CR_SEGMENT_ID_BOOST2) + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BASE; +} + +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c new file mode 100644 index 0000000000..9e5f3bfb28 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +#include "vp9/encoder/vp9_aq_variance.h" + +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_segmentation.h" + +#define ENERGY_MIN (-4) +#define ENERGY_MAX (1) +#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) +#define ENERGY_IN_BOUNDS(energy) \ + assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) + +static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, + 0.75, 1.0, 1.0, 1.0 }; +static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; + +#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] + +DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; +#if CONFIG_VP9_HIGHBITDEPTH +DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = { 0 }; +#endif + +unsigned int vp9_vaq_segment_id(int energy) { + ENERGY_IN_BOUNDS(energy); + return SEGMENT_ID(energy); +} + +void vp9_vaq_frame_setup(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *seg = &cm->seg; + int i; + + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cpi->refresh_alt_ref_frame || cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + + seg->abs_delta = SEGMENT_DELTADATA; + + vpx_clear_system_state(); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + int qindex_delta = + vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + rate_ratio[i], cm->bit_depth); + + // We don't allow qindex 0 in a segment if the base value is not 0. + // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + + // No need to enable SEG_LVL_ALT_Q for this segment. + if (rate_ratio[i] == 1.0) { + continue; + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + } +} + +/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions + * of variance() and highbd_8_variance(). It should not. + */ +static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void aq_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + unsigned int var, sse; + int right_overflow = + (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; + int bottom_overflow = + (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; + + if (right_overflow || bottom_overflow) { + const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow; + const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow; + int avg; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint64_t sse64 = 0; + int64_t sum64 = 0; + aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, + &sse64, &sum64); + sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8))); + avg = (int)(sum64 >> (xd->bd - 8)); + } else { + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, + bw, bh, &sse, &avg); + } +#else + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, + bw, bh, &sse, &avg); +#endif // CONFIG_VP9_HIGHBITDEPTH + var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); + return (unsigned int)(((uint64_t)256 * var) / (bw * bh)); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, &sse); + } else { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); +#endif // CONFIG_VP9_HIGHBITDEPTH + return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]); + } +} + +double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int var = block_variance(cpi, x, bs); + vpx_clear_system_state(); + return log(var + 1.0); +} + +#define DEFAULT_E_MIDPOINT 10.0 +static int scale_block_energy(VP9_COMP *cpi, unsigned int block_var) { + double energy; + double energy_midpoint; + energy_midpoint = + (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT; + energy = log(block_var + 1.0) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} +#undef DEFAULT_E_MIDPOINT + +// Get the range of sub block energy values; +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e) { + VP9_COMMON *const cm = &cpi->common; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y; + + if (xmis < bw || ymis < bh) { + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); + *min_e = vp9_block_energy(cpi, mb, bsize); + *max_e = *min_e; + } else { + unsigned int var; + // Because scale_block_energy is non-decreasing, we can find the min/max + // block variance and scale afterwards. This avoids a costly scaling at + // every iteration. + unsigned int min_var = UINT_MAX; + unsigned int max_var = 0; + + for (y = 0; y < ymis; ++y) { + for (x = 0; x < xmis; ++x) { + vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x); + vpx_clear_system_state(); + var = block_variance(cpi, mb, BLOCK_8X8); + vpx_clear_system_state(); + min_var = VPXMIN(min_var, var); + max_var = VPXMAX(max_var, var); + } + } + *min_e = scale_block_energy(cpi, min_var); + *max_e = scale_block_energy(cpi, max_var); + } + + // Re-instate source pointers back to what they should have been on entry. + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); +} + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int var; + vpx_clear_system_state(); + var = block_variance(cpi, x, bs); + vpx_clear_system_state(); + return scale_block_energy(cpi, var); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h new file mode 100644 index 0000000000..a4f872879d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ + +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vp9_vaq_segment_id(int energy); +void vp9_vaq_frame_setup(VP9_COMP *cpi); + +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e); +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + +double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c new file mode 100644 index 0000000000..ca56d14aa1 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c @@ -0,0 +1,1387 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx/vpx_encoder.h" +#include "vpx_dsp/bitwriter_buffer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" +#include "vpx_ports/system_state.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_bitstream.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_subexp.h" +#include "vp9/encoder/vp9_tokenize.h" + +static const struct vp9_token intra_mode_encodings[INTRA_MODES] = { + { 0, 1 }, { 6, 3 }, { 28, 5 }, { 30, 5 }, { 58, 6 }, + { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 } +}; +static const struct vp9_token + switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 }, + { 2, 2 }, + { 3, 2 } }; +static const struct vp9_token partition_encodings[PARTITION_TYPES] = { + { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } +}; +static const struct vp9_token inter_mode_encodings[INTER_MODES] = { + { 2, 2 }, { 6, 3 }, { 0, 1 }, { 7, 3 } +}; + +static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode, + const vpx_prob *probs) { + vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]); +} + +static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode, + const vpx_prob *probs) { + assert(is_inter_mode(mode)); + vp9_write_token(w, vp9_inter_mode_tree, probs, + &inter_mode_encodings[INTER_OFFSET(mode)]); +} + +static void encode_unsigned_max(struct vpx_write_bit_buffer *wb, int data, + int max) { + vpx_wb_write_literal(wb, data, get_unsigned_bits(max)); +} + +static void prob_diff_update(const vpx_tree_index *tree, + vpx_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], int n, + vpx_writer *w) { + int i; + unsigned int branch_ct[32][2]; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + + vp9_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) + vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); +} + +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *const xd, vpx_writer *w) { + TX_SIZE tx_size = xd->mi[0]->tx_size; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + const vpx_prob *const tx_probs = + get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs); + vpx_write(w, tx_size != TX_4X4, tx_probs[0]); + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { + vpx_write(w, tx_size != TX_8X8, tx_probs[1]); + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) + vpx_write(w, tx_size != TX_16X16, tx_probs[2]); + } +} + +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd, + int segment_id, const MODE_INFO *mi, vpx_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip = mi->skip; + vpx_write(w, skip, vp9_get_skip_prob(cm, xd)); + return skip; + } +} + +static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w, + FRAME_COUNTS *counts) { + int k; + + for (k = 0; k < SKIP_CONTEXTS; ++k) + vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]); +} + +static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w, + FRAME_COUNTS *counts) { + int j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + prob_diff_update(vp9_switchable_interp_tree, + cm->fc->switchable_interp_prob[j], + counts->switchable_interp[j], SWITCHABLE_FILTERS, w); +} + +static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, + const TOKENEXTRA *const stop, + vpx_bit_depth_t bit_depth) { + const TOKENEXTRA *p; + const vp9_extra_bit *const extra_bits = +#if CONFIG_VP9_HIGHBITDEPTH + (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 + : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 + : vp9_extra_bits; +#else + vp9_extra_bits; + (void)bit_depth; +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) { + if (p->token == EOB_TOKEN) { + vpx_write(w, 0, p->context_tree[0]); + continue; + } + vpx_write(w, 1, p->context_tree[0]); + while (p->token == ZERO_TOKEN) { + vpx_write(w, 0, p->context_tree[1]); + ++p; + if (p == stop || p->token == EOSB_TOKEN) { + *tp = (TOKENEXTRA *)(uintptr_t)p + (p->token == EOSB_TOKEN); + return; + } + } + + { + const int t = p->token; + const vpx_prob *const context_tree = p->context_tree; + assert(t != ZERO_TOKEN); + assert(t != EOB_TOKEN); + assert(t != EOSB_TOKEN); + vpx_write(w, 1, context_tree[1]); + if (t == ONE_TOKEN) { + vpx_write(w, 0, context_tree[2]); + vpx_write_bit(w, p->extra & 1); + } else { // t >= TWO_TOKEN && t < EOB_TOKEN + const struct vp9_token *const a = &vp9_coef_encodings[t]; + int v = a->value; + int n = a->len; + const int e = p->extra; + vpx_write(w, 1, context_tree[2]); + vp9_write_tree(w, vp9_coef_con_tree, + vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v, + n - UNCONSTRAINED_NODES, 0); + if (t >= CATEGORY1_TOKEN) { + const vp9_extra_bit *const b = &extra_bits[t]; + const unsigned char *pb = b->prob; + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero + do { + const int bb = (v >> --n) & 1; + vpx_write(w, bb, *pb++); + } while (n); + } + vpx_write_bit(w, e & 1); + } + } + } + *tp = (TOKENEXTRA *)(uintptr_t)p + (p->token == EOSB_TOKEN); +} + +static void write_segment_id(vpx_writer *w, const struct segmentation *seg, + int segment_id) { + if (seg->enabled && seg->update_map) + vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0); +} + +// This function encodes the reference frame +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, + vpx_writer *w) { + const MODE_INFO *const mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); + const int segment_id = mi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd)); + } else { + assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1], + vp9_get_pred_prob_comp_ref_p(cm, xd)); + } else { + const int bit0 = mi->ref_frame[0] != LAST_FRAME; + vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd)); + if (bit0) { + const int bit1 = mi->ref_frame[0] != GOLDEN_FRAME; + vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd)); + } + } + } +} + +static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, + vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { + VP9_COMMON *const cm = &cpi->common; + const nmv_context *nmvc = &cm->fc->nmvc; + const struct segmentation *const seg = &cm->seg; + const MODE_INFO *const mi = xd->mi[0]; + const PREDICTION_MODE mode = mi->mode; + const int segment_id = mi->segment_id; + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + const int is_inter = is_inter_block(mi); + const int is_compound = has_second_ref(mi); + int skip, ref; + + if (seg->update_map) { + if (seg->temporal_update) { + const int pred_flag = mi->seg_id_predicted; + vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); + vpx_write(w, pred_flag, pred_prob); + if (!pred_flag) write_segment_id(w, seg, segment_id); + } else { + write_segment_id(w, seg, segment_id); + } + } + + skip = write_skip(cm, xd, segment_id, mi, w); + + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); + + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && + !(is_inter && skip)) { + write_selected_tx_size(cm, xd, w); + } + + if (!is_inter) { + if (bsize >= BLOCK_8X8) { + write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]); + } else { + int idx, idy; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; + write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]); + } + } + } + write_intra_mode(w, mi->uv_mode, cm->fc->uv_mode_prob[mode]); + } else { + const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]]; + const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx]; + write_ref_frames(cm, xd, w); + + // If segment skip is not enabled code the mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (bsize >= BLOCK_8X8) { + write_inter_mode(w, mode, inter_probs); + } + } + + if (cm->interp_filter == SWITCHABLE) { + const int ctx = get_pred_context_switchable_interp(xd); + vp9_write_token(w, vp9_switchable_interp_tree, + cm->fc->switchable_interp_prob[ctx], + &switchable_interp_encodings[mi->interp_filter]); + ++interp_filter_selected[0][mi->interp_filter]; + } else { + assert(mi->interp_filter == cm->interp_filter); + } + + if (bsize < BLOCK_8X8) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + write_inter_mode(w, b_mode, inter_probs); + if (b_mode == NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, + &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, + nmvc, allow_hp, max_mv_magnitude); + } + } + } + } else { + if (mode == NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv, + &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc, + allow_hp, max_mv_magnitude); + } + } + } +} + +static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, + vpx_writer *w) { + const struct segmentation *const seg = &cm->seg; + const MODE_INFO *const mi = xd->mi[0]; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const BLOCK_SIZE bsize = mi->sb_type; + + if (seg->update_map) write_segment_id(w, seg, mi->segment_id); + + write_skip(cm, xd, mi->segment_id, mi, w); + + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) + write_selected_tx_size(cm, xd, w); + + if (bsize >= BLOCK_8X8) { + write_intra_mode(w, mi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int block = idy * 2 + idx; + write_intra_mode(w, mi->bmi[block].as_mode, + get_y_mode_probs(mi, above_mi, left_mi, block)); + } + } + } + + write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); +} + +static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { + const VP9_COMMON *const cm = &cpi->common; + const MB_MODE_INFO_EXT *const mbmi_ext = + cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); + MODE_INFO *m; + + xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + m = xd->mi[0]; + + set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type], + mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows, + cm->mi_cols); + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cm, xd, w); + } else { + pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude, + interp_filter_selected); + } + + assert(*tok < tok_end); + pack_mb_tokens(w, tok, tok_end, cm->bit_depth); +} + +static void write_partition(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, int mi_row, + int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, + vpx_writer *w) { + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + const vpx_prob *const probs = xd->partition_probs[ctx]; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { + vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + vpx_write(w, p == PARTITION_SPLIT, probs[1]); + } else if (has_rows && !has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + vpx_write(w, p == PARTITION_SPLIT, probs[2]); + } else { + assert(p == PARTITION_SPLIT); + } +} + +static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { + const VP9_COMMON *const cm = &cpi->common; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) / 4; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const MODE_INFO *m = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + + partition = partition_lookup[bsl][m->sb_type]; + write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); + } else { + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); + break; + case PARTITION_HORZ: + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); + if (mi_row + bs < cm->mi_rows) + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + max_mv_magnitude, interp_filter_selected); + break; + case PARTITION_VERT: + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); + if (mi_col + bs < cm->mi_cols) + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + max_mv_magnitude, interp_filter_selected); + break; + default: + assert(partition == PARTITION_SPLIT); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, + max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); + break; + } + } + + // update partition context + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} + +static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, int tile_row, + int tile_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { + const VP9_COMMON *const cm = &cpi->common; + int mi_row, mi_col, tile_sb_row; + TOKENEXTRA *tok = NULL; + TOKENEXTRA *tok_end = NULL; + + set_partition_probs(cm, xd); + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start; + tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count; + + vp9_zero(xd->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) + write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col, + BLOCK_64X64, max_mv_magnitude, interp_filter_selected); + + assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop); + } +} + +static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size, + vp9_coeff_stats *coef_branch_ct, + vp9_coeff_probs_model *coef_probs) { + vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size]; + unsigned int(*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = + cpi->common.counts.eob_branch[tx_size]; + int i, j, k, l, m; + + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + vp9_tree_probs_from_distribution(vp9_coef_tree, + coef_branch_ct[i][j][k][l], + coef_counts[i][j][k][l]); + coef_branch_ct[i][j][k][l][0][1] = + eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + coef_probs[i][j][k][l][m] = + get_binary_prob(coef_branch_ct[i][j][k][l][m][0], + coef_branch_ct[i][j][k][l][m][1]); + } + } + } + } +} + +static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, + TX_SIZE tx_size, + vp9_coeff_stats *frame_branch_ct, + vp9_coeff_probs_model *new_coef_probs) { + vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; + const vpx_prob upd = DIFF_UPDATE_PROB; + const int entropy_nodes_update = UNCONSTRAINED_NODES; + int i, j, k, l, t; + int stepsize = cpi->sf.coeff_prob_appx_step; + + switch (cpi->sf.use_fast_coef_updates) { + case TWO_LOOP: { + /* dry run to see if there is any update at all needed */ + int64_t savings = 0; + int update[2] = { 0, 0 }; + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + for (t = 0; t < entropy_nodes_update; ++t) { + vpx_prob newp = new_coef_probs[i][j][k][l][t]; + const vpx_prob oldp = old_coef_probs[i][j][k][l][t]; + int64_t s; + int u = 0; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], oldp, &newp, upd, + stepsize); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); + if (s > 0 && newp != oldp) u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); + update[u]++; + } + } + } + } + } + + // printf("Update %d %d, savings %d\n", update[0], update[1], savings); + /* Is coef updated at all */ + if (update[1] == 0 || savings < 0) { + vpx_write_bit(bc, 0); + return; + } + vpx_write_bit(bc, 1); + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vpx_prob newp = new_coef_probs[i][j][k][l][t]; + vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; + int64_t s; + int u = 0; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, + stepsize); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); + if (s > 0 && newp != *oldp) u = 1; + vpx_write(bc, u, upd); + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + return; + } + + default: { + int updates = 0; + int noupdates_before_first = 0; + assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED); + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vpx_prob newp = new_coef_probs[i][j][k][l][t]; + vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; + int64_t s; + int u = 0; + + if (t == PIVOT_NODE) { + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, + stepsize); + } else { + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); + } + + if (s > 0 && newp != *oldp) u = 1; + updates += u; + if (u == 0 && updates == 0) { + noupdates_before_first++; + continue; + } + if (u == 1 && updates == 1) { + int v; + // first update + vpx_write_bit(bc, 1); + for (v = 0; v < noupdates_before_first; ++v) + vpx_write(bc, 0, upd); + } + vpx_write(bc, u, upd); + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + if (updates == 0) { + vpx_write_bit(bc, 0); // no updates + } + return; + } + } +} + +static void update_coef_probs(VP9_COMP *cpi, vpx_writer *w) { + const TX_MODE tx_mode = cpi->common.tx_mode; + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) { + vp9_coeff_stats frame_branch_ct[PLANE_TYPES]; + vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES]; + if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 || + (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) { + vpx_write_bit(w, 0); + } else { + build_tree_distribution(cpi, tx_size, frame_branch_ct, frame_coef_probs); + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, + frame_coef_probs); + } + } +} + +static void encode_loopfilter(struct loopfilter *lf, + struct vpx_write_bit_buffer *wb) { + int i; + + // Encode the loop filter level and type + vpx_wb_write_literal(wb, lf->filter_level, 6); + vpx_wb_write_literal(wb, lf->sharpness_level, 3); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled). + vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + if (lf->mode_ref_delta_enabled) { + vpx_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { + for (i = 0; i < MAX_REF_LF_DELTAS; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != lf->last_ref_deltas[i]; + vpx_wb_write_bit(wb, changed); + if (changed) { + lf->last_ref_deltas[i] = delta; + vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6); + vpx_wb_write_bit(wb, delta < 0); + } + } + + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != lf->last_mode_deltas[i]; + vpx_wb_write_bit(wb, changed); + if (changed) { + lf->last_mode_deltas[i] = delta; + vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6); + vpx_wb_write_bit(wb, delta < 0); + } + } + } + } +} + +static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) { + if (delta_q != 0) { + vpx_wb_write_bit(wb, 1); + vpx_wb_write_literal(wb, abs(delta_q), 4); + vpx_wb_write_bit(wb, delta_q < 0); + } else { + vpx_wb_write_bit(wb, 0); + } +} + +static void encode_quantization(const VP9_COMMON *const cm, + struct vpx_write_bit_buffer *wb) { + vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); + write_delta_q(wb, cm->y_dc_delta_q); + write_delta_q(wb, cm->uv_dc_delta_q); + write_delta_q(wb, cm->uv_ac_delta_q); +} + +static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, + struct vpx_write_bit_buffer *wb) { + int i, j; + + const struct segmentation *seg = &cm->seg; + + vpx_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Segmentation map + vpx_wb_write_bit(wb, seg->update_map); + if (seg->update_map) { + // Select the coding strategy (temporal or spatial) + vp9_choose_segmap_coding_method(cm, xd); + // Write out probabilities used to decode unpredicted macro-block segments + for (i = 0; i < SEG_TREE_PROBS; i++) { + const int prob = seg->tree_probs[i]; + const int update = prob != MAX_PROB; + vpx_wb_write_bit(wb, update); + if (update) vpx_wb_write_literal(wb, prob, 8); + } + + // Write out the chosen coding method. + vpx_wb_write_bit(wb, seg->temporal_update); + if (seg->temporal_update) { + for (i = 0; i < PREDICTION_PROBS; i++) { + const int prob = seg->pred_probs[i]; + const int update = prob != MAX_PROB; + vpx_wb_write_bit(wb, update); + if (update) vpx_wb_write_literal(wb, prob, 8); + } + } + } + + // Segmentation data + vpx_wb_write_bit(wb, seg->update_data); + if (seg->update_data) { + vpx_wb_write_bit(wb, seg->abs_delta); + + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + const int active = segfeature_active(seg, i, j); + vpx_wb_write_bit(wb, active); + if (active) { + const int data = get_segdata(seg, i, j); + const int data_max = vp9_seg_feature_data_max(j); + + if (vp9_is_segfeature_signed(j)) { + encode_unsigned_max(wb, abs(data), data_max); + vpx_wb_write_bit(wb, data < 0); + } else { + encode_unsigned_max(wb, data, data_max); + } + } + } + } + } +} + +static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w, + FRAME_COUNTS *counts) { + // Mode + vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2); + if (cm->tx_mode >= ALLOW_32X32) + vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT); + + // Probabilities + if (cm->tx_mode == TX_MODE_SELECT) { + int i, j; + unsigned int ct_8x8p[TX_SIZES - 3][2]; + unsigned int ct_16x16p[TX_SIZES - 2][2]; + unsigned int ct_32x32p[TX_SIZES - 1][2]; + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p); + for (j = 0; j < TX_SIZES - 3; j++) + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]); + } + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p); + for (j = 0; j < TX_SIZES - 2; j++) + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j], + ct_16x16p[j]); + } + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p); + for (j = 0; j < TX_SIZES - 1; j++) + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j], + ct_32x32p[j]); + } + } +} + +static void write_interp_filter(INTERP_FILTER filter, + struct vpx_write_bit_buffer *wb) { + const int filter_to_literal[] = { 1, 0, 2, 3 }; + + vpx_wb_write_bit(wb, filter == SWITCHABLE); + if (filter != SWITCHABLE) + vpx_wb_write_literal(wb, filter_to_literal[filter], 2); +} + +static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) { + if (cm->interp_filter == SWITCHABLE) { + // Check to see if only one of the filters is actually used + int count[SWITCHABLE_FILTERS]; + int i, j, c = 0; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + count[i] = 0; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + count[i] += counts->switchable_interp[j][i]; + c += (count[i] > 0); + } + if (c == 1) { + // Only one filter is used. So set the filter at frame level + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + if (count[i]) { + cm->interp_filter = i; + break; + } + } + } + } +} + +static void write_tile_info(const VP9_COMMON *const cm, + struct vpx_write_bit_buffer *wb) { + int min_log2_tile_cols, max_log2_tile_cols, ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + // columns + ones = cm->log2_tile_cols - min_log2_tile_cols; + while (ones--) vpx_wb_write_bit(wb, 1); + + if (cm->log2_tile_cols < max_log2_tile_cols) vpx_wb_write_bit(wb, 0); + + // rows + vpx_wb_write_bit(wb, cm->log2_tile_rows != 0); + if (cm->log2_tile_rows != 0) vpx_wb_write_bit(wb, cm->log2_tile_rows != 1); +} + +int vp9_get_refresh_mask(VP9_COMP *cpi) { + if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + int arf_idx = cpi->alt_fb_idx; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->multi_layer_arf) { + for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) { + if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx && + arf_idx != cpi->gld_fb_idx) { + int idx; + for (idx = 0; idx < gf_group->stack_size; ++idx) + if (arf_idx == gf_group->arf_index_stack[idx]) break; + if (idx == gf_group->stack_size) break; + } + } + } + cpi->twopass.gf_group.top_arf_idx = arf_idx; + + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id]; + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } +} + +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; + MACROBLOCKD *const xd = &data->xd; + const int tile_row = 0; + vpx_start_encode(&data->bit_writer, data->dest); + write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, + &data->bit_writer, tile_row, data->tile_idx, + &data->max_mv_magnitude, data->interp_filter_selected); + vpx_stop_encode(&data->bit_writer); + return 1; +} + +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { + if (cpi->vp9_bitstream_worker_data) { + int i; + for (i = 1; i < cpi->num_workers; ++i) { + vpx_free(cpi->vp9_bitstream_worker_data[i].dest); + } + vpx_free(cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = NULL; + } +} + +static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int i; + const size_t worker_data_size = + cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, + vpx_memalign(16, worker_data_size)); + memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); + for (i = 1; i < cpi->num_workers; ++i) { + cpi->vp9_bitstream_worker_data[i].dest_size = + cpi->oxcf.width * cpi->oxcf.height; + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); + } +} + +static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = cpi->num_workers; + size_t total_size = 0; + int tile_col = 0; + + if (!cpi->vp9_bitstream_worker_data || + cpi->vp9_bitstream_worker_data[1].dest_size > + (cpi->oxcf.width * cpi->oxcf.height)) { + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + encode_tiles_buffer_alloc(cpi); + } + + while (tile_col < tile_cols) { + int i, j; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VPxWorker *const worker = &cpi->workers[i]; + VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i]; + + // Populate the worker data. + data->xd = cpi->td.mb.e_mbd; + data->tile_idx = tile_col; + data->max_mv_magnitude = cpi->max_mv_magnitude; + memset(data->interp_filter_selected, 0, + sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE); + + // First thread can directly write into the output buffer. + if (i == 0) { + // If this worker happens to be for the last tile, then do not offset it + // by 4 for the tile size. + data->dest = + data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4); + } + worker->data1 = cpi; + worker->data2 = data; + worker->hook = encode_tile_worker; + worker->had_error = 0; + + if (i < num_workers - 1) { + winterface->launch(worker); + } else { + winterface->execute(worker); + } + ++tile_col; + } + for (j = 0; j < i; ++j) { + VPxWorker *const worker = &cpi->workers[j]; + VP9BitstreamWorkerData *const data = + (VP9BitstreamWorkerData *)worker->data2; + uint32_t tile_size; + int k; + + if (!winterface->sync(worker)) return 0; + tile_size = data->bit_writer.pos; + + // Aggregate per-thread bitstream stats. + cpi->max_mv_magnitude = + VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude); + for (k = 0; k < SWITCHABLE; ++k) { + cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k]; + } + + // Prefix the size of the tile on all but the last. + if (tile_col != tile_cols || j < i - 1) { + mem_put_be32(data_ptr + total_size, tile_size); + total_size += 4; + } + if (j > 0) { + memcpy(data_ptr + total_size, data->dest, tile_size); + } + total_size += tile_size; + } + } + return total_size; +} + +static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + vpx_writer residual_bc; + int tile_row, tile_col; + size_t total_size = 0; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); + + // Encoding tiles in parallel is done only for realtime mode now. In other + // modes the speed up is insignificant and requires further testing to ensure + // that it does not make the overall process worse in any case. + if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 && + tile_cols > 1) { + return encode_tiles_mt(cpi, data_ptr); + } + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_idx = tile_row * tile_cols + tile_col; + + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) + vpx_start_encode(&residual_bc, data_ptr + total_size + 4); + else + vpx_start_encode(&residual_bc, data_ptr + total_size); + + write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc, + tile_row, tile_col, &cpi->max_mv_magnitude, + cpi->interp_filter_selected); + + vpx_stop_encode(&residual_bc); + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { + // size of this tile + mem_put_be32(data_ptr + total_size, residual_bc.pos); + total_size += 4; + } + + total_size += residual_bc.pos; + } + } + return total_size; +} + +static void write_render_size(const VP9_COMMON *cm, + struct vpx_write_bit_buffer *wb) { + const int scaling_active = + cm->width != cm->render_width || cm->height != cm->render_height; + vpx_wb_write_bit(wb, scaling_active); + if (scaling_active) { + vpx_wb_write_literal(wb, cm->render_width - 1, 16); + vpx_wb_write_literal(wb, cm->render_height - 1, 16); + } +} + +static void write_frame_size(const VP9_COMMON *cm, + struct vpx_write_bit_buffer *wb) { + vpx_wb_write_literal(wb, cm->width - 1, 16); + vpx_wb_write_literal(wb, cm->height - 1, 16); + + write_render_size(cm, wb); +} + +static void write_frame_size_with_refs(VP9_COMP *cpi, + struct vpx_write_bit_buffer *wb) { + VP9_COMMON *const cm = &cpi->common; + int found = 0; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); + + // Set "found" to 0 for temporal svc and for spatial svc key frame + if (cpi->use_svc && + ((cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.rc_mode == VPX_CBR) || + (cpi->svc.number_spatial_layers > 1 && + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { + found = 0; + } else if (cfg != NULL) { + found = + cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; + } + vpx_wb_write_bit(wb, found); + if (found) { + break; + } + } + + if (!found) { + vpx_wb_write_literal(wb, cm->width - 1, 16); + vpx_wb_write_literal(wb, cm->height - 1, 16); + } + + write_render_size(cm, wb); +} + +static void write_sync_code(struct vpx_write_bit_buffer *wb) { + vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8); + vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8); + vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8); +} + +static void write_profile(BITSTREAM_PROFILE profile, + struct vpx_write_bit_buffer *wb) { + switch (profile) { + case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break; + case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break; + case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break; + default: + assert(profile == PROFILE_3); + vpx_wb_write_literal(wb, 6, 3); + break; + } +} + +static void write_bitdepth_colorspace_sampling( + VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) { + if (cm->profile >= PROFILE_2) { + assert(cm->bit_depth > VPX_BITS_8); + vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1); + } + vpx_wb_write_literal(wb, cm->color_space, 3); + if (cm->color_space != VPX_CS_SRGB) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + vpx_wb_write_bit(wb, cm->color_range); + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); + vpx_wb_write_bit(wb, cm->subsampling_x); + vpx_wb_write_bit(wb, cm->subsampling_y); + vpx_wb_write_bit(wb, 0); // unused + } else { + assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } + } else { + assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); + vpx_wb_write_bit(wb, 0); // unused + } +} + +static void write_uncompressed_header(VP9_COMP *cpi, + struct vpx_write_bit_buffer *wb) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2); + + write_profile(cm->profile, wb); + + // If to use show existing frame. + vpx_wb_write_bit(wb, cm->show_existing_frame); + if (cm->show_existing_frame) { + vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3); + return; + } + + vpx_wb_write_bit(wb, cm->frame_type); + vpx_wb_write_bit(wb, cm->show_frame); + vpx_wb_write_bit(wb, cm->error_resilient_mode); + + if (cm->frame_type == KEY_FRAME) { + write_sync_code(wb); + write_bitdepth_colorspace_sampling(cm, wb); + write_frame_size(cm, wb); + } else { + if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only); + + if (!cm->error_resilient_mode) + vpx_wb_write_literal(wb, cm->reset_frame_context, 2); + + if (cm->intra_only) { + write_sync_code(wb); + + // Note for profile 0, 420 8bpp is assumed. + if (cm->profile > PROFILE_0) { + write_bitdepth_colorspace_sampling(cm, wb); + } + + vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); + write_frame_size(cm, wb); + } else { + MV_REFERENCE_FRAME ref_frame; + vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); + vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); + } + + write_frame_size_with_refs(cpi, wb); + + vpx_wb_write_bit(wb, cm->allow_high_precision_mv); + + fix_interp_filter(cm, cpi->td.counts); + write_interp_filter(cm->interp_filter, wb); + } + } + + if (!cm->error_resilient_mode) { + vpx_wb_write_bit(wb, cm->refresh_frame_context); + vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode); + } + + vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); + + encode_loopfilter(&cm->lf, wb); + encode_quantization(cm, wb); + encode_segmentation(cm, xd, wb); + + write_tile_info(cm, wb); +} + +static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + FRAME_CONTEXT *const fc = cm->fc; + FRAME_COUNTS *counts = cpi->td.counts; + vpx_writer header_bc; + + vpx_start_encode(&header_bc, data); + + if (xd->lossless) + cm->tx_mode = ONLY_4X4; + else + encode_txfm_probs(cm, &header_bc, counts); + + update_coef_probs(cpi, &header_bc); + update_skip_probs(cm, &header_bc, counts); + + if (!frame_is_intra_only(cm)) { + int i; + + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i], + counts->inter_mode[i], INTER_MODES, &header_bc); + + if (cm->interp_filter == SWITCHABLE) + update_switchable_interp_probs(cm, &header_bc, counts); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], + counts->intra_inter[i]); + + if (cpi->allow_comp_inter_inter) { + const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; + const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; + + vpx_write_bit(&header_bc, use_compound_pred); + if (use_compound_pred) { + vpx_write_bit(&header_bc, use_hybrid_pred); + if (use_hybrid_pred) + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], + counts->comp_inter[i]); + } + } + + if (cm->reference_mode != COMPOUND_REFERENCE) { + for (i = 0; i < REF_CONTEXTS; i++) { + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], + counts->single_ref[i][0]); + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], + counts->single_ref[i][1]); + } + } + + if (cm->reference_mode != SINGLE_REFERENCE) + for (i = 0; i < REF_CONTEXTS; i++) + vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], + counts->comp_ref[i]); + + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i], + counts->y_mode[i], INTRA_MODES, &header_bc); + + for (i = 0; i < PARTITION_CONTEXTS; ++i) + prob_diff_update(vp9_partition_tree, fc->partition_prob[i], + counts->partition[i], PARTITION_TYPES, &header_bc); + + vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc, + &counts->mv); + } + + vpx_stop_encode(&header_bc); + assert(header_bc.pos <= 0xffff); + + return header_bc.pos; +} + +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { + uint8_t *data = dest; + size_t first_part_size, uncompressed_hdr_size; + struct vpx_write_bit_buffer wb = { data, 0 }; + struct vpx_write_bit_buffer saved_wb; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + + write_uncompressed_header(cpi, &wb); + + // Skip the rest coding process if use show existing frame. + if (cpi->common.show_existing_frame) { + uncompressed_hdr_size = vpx_wb_bytes_written(&wb); + data += uncompressed_hdr_size; + *size = data - dest; + return; + } + + saved_wb = wb; + vpx_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size + + uncompressed_hdr_size = vpx_wb_bytes_written(&wb); + data += uncompressed_hdr_size; + + vpx_clear_system_state(); + + first_part_size = write_compressed_header(cpi, data); + data += first_part_size; + // TODO(jbb): Figure out what to do if first_part_size > 16 bits. + vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16); + + data += encode_tiles(cpi, data); + + *size = data - dest; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h new file mode 100644 index 0000000000..208651dc22 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_ +#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/encoder/vp9_encoder.h" + +typedef struct VP9BitstreamWorkerData { + uint8_t *dest; + int dest_size; + vpx_writer bit_writer; + int tile_idx; + unsigned int max_mv_magnitude; + // The size of interp_filter_selected in VP9_COMP is actually + // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do + // is increment the very first index (index 0) for the first dimension. Hence + // this is sufficient. + int interp_filter_selected[1][SWITCHABLE]; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} VP9BitstreamWorkerData; + +int vp9_get_refresh_mask(VP9_COMP *cpi); + +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); + +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); + +static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { + return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref && + !cpi->use_svc; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_BITSTREAM_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h new file mode 100644 index 0000000000..7fa00cd194 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_ +#define VPX_VP9_ENCODER_VP9_BLOCK_H_ + +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_entropy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + unsigned int sse; + int sum; + unsigned int var; +} Diff; + +struct macroblock_plane { + DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); + tran_low_t *qcoeff; + tran_low_t *coeff; + uint16_t *eobs; + struct buf_2d src; + + // Quantizer settings + int16_t *round_fp; + int16_t *quant_fp; + int16_t *quant; + int16_t *quant_shift; + int16_t *zbin; + int16_t *round; + + int64_t quant_thred[2]; +}; + +/* The [2] dimension is for whether we skip the EOB node (i.e. if previous + * coefficient in this block was zero) or not. */ +typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2] + [COEFF_CONTEXTS][ENTROPY_TOKENS]; + +typedef struct { + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + uint8_t mode_context[MAX_REF_FRAMES]; +} MB_MODE_INFO_EXT; + +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} MvLimits; + +typedef struct macroblock MACROBLOCK; +struct macroblock { +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if defined(_MSC_VER) && _MSC_VER < 1900 + int64_t bsse[MAX_MB_PLANE << 2]; +#endif + + struct macroblock_plane plane[MAX_MB_PLANE]; + + MACROBLOCKD e_mbd; + MB_MODE_INFO_EXT *mbmi_ext; + MB_MODE_INFO_EXT *mbmi_ext_base; + int skip_block; + int select_tx_size; + int skip_recode; + int skip_optimize; + int q_index; + double log_block_src_var; + int block_tx_domain; + + // The equivalent error at the current rdmult of one whole bit (not one + // bitcost unit). + int errorperbit; + // The equivalent SAD error of one (whole) bit at the current quantizer + // for large blocks. + int sadperbit16; + // The equivalent SAD error of one (whole) bit at the current quantizer + // for sub-8x8 blocks. + int sadperbit4; + int rddiv; + int rdmult; + int cb_rdmult; + int segment_id; + int mb_energy; + + // These are set to their default values at the beginning, and then adjusted + // further in the encoding process. + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; + + int mv_best_ref_index[MAX_REF_FRAMES]; + unsigned int max_mv_context[MAX_REF_FRAMES]; + unsigned int source_variance; + unsigned int pred_sse[MAX_REF_FRAMES]; + int pred_mv_sad[MAX_REF_FRAMES]; + + int nmvjointcost[MV_JOINTS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int **mvcost; + + int nmvjointsadcost[MV_JOINTS]; + int *nmvsadcost[2]; + int *nmvsadcost_hp[2]; + int **mvsadcost; + + // sharpness is used to disable skip mode and change rd_mult + int sharpness; + + // aq mode is used to adjust rd based on segment. + int adjust_rdmult_by_segment; + + // These define limits to motion vector components to prevent them + // from extending outside the UMV borders + MvLimits mv_limits; + + // Notes transform blocks where no coefficients are coded. + // Set during mode selection. Read during block encoding. + uint8_t zcoeff_blk[TX_SIZES][256]; + + // Accumulate the tx block eobs in a partition block. + int32_t sum_y_eobs[TX_SIZES]; + + int skip; + + int encode_breakout; + + // note that token_costs is the cost when eob node is skipped + vp9_coeff_cost token_costs[TX_SIZES]; + + int optimize; + + // indicate if it is in the rd search loop or encoding process + int use_lp32x32fdct; + int skip_encode; + + // In first pass, intra prediction is done based on source pixels + // at tile boundaries + int fp_src_pred; + + // use fast quantization process + int quant_fp; + + // skip forward transform and quantization + uint8_t skip_txfm[MAX_MB_PLANE << 2]; +#define SKIP_TXFM_NONE 0 +// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely +// since it increases risks of bad perceptual quality. +// https://crbug.com/webm/1729 +#define SKIP_TXFM_AC_DC 1 +#define SKIP_TXFM_AC_ONLY 2 + +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if !defined(_MSC_VER) || _MSC_VER >= 1900 + int64_t bsse[MAX_MB_PLANE << 2]; +#endif + + // Used to store sub partition's choices. + MV pred_mv[MAX_REF_FRAMES]; + + // Strong color activity detection. Used in RTC coding mode to enhance + // the visual quality at the boundary of moving color objects. + uint8_t color_sensitivity[2]; + + uint8_t sb_is_skin; + + uint8_t skip_low_source_sad; + + uint8_t lowvar_highsumdiff; + + uint8_t last_sb_high_content; + + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + + int zero_temp_sad_source; + + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) + // based on source sad, prior to encoding the frame. + uint8_t content_state_sb; + + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for + // 32x32, 9~24 for 16x16. + uint8_t variance_low[25]; + + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +#if CONFIG_VP9_HIGHBITDEPTH + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); +#endif + DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]); + + struct scale_factors *me_sf; +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_BLOCK_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c new file mode 100644 index 0000000000..da68a3c3c3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/system_state.h" +#include "vp9/encoder/vp9_blockiness.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = abs(blockiness from reconstructed buffer - +// blockiness from source buffer,0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value +// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame currently by +// looking at all borders in steps of 4. +double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, + const uint8_t *img2, int img2_pitch, int width, + int height) { + double blockiness = 0; + int i, j; + vpx_clear_system_state(); + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4) { + if (i > 0 && i < height && j > 0 && j < width) { + blockiness += + blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); + blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, + img2_pitch, 4); + } + } + } + blockiness /= width * height / 16; + return blockiness; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h new file mode 100644 index 0000000000..e840cb2518 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ +#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, + const uint8_t *img2, int img2_pitch, int width, + int height); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c new file mode 100644 index 0000000000..42073f756c --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encoder.h" + +static const BLOCK_SIZE square[] = { + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, +}; + +static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, + PICK_MODE_CONTEXT *ctx) { + const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk); + const int num_pix = num_blk << 4; + int i, k; + ctx->num_4x4_blk = num_blk; + + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], + vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], + vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], + vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], + vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); + ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; + ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; + ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; + ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; + } + } +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i, k; + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + vpx_free(ctx->coeff[i][k]); + ctx->coeff[i][k] = 0; + vpx_free(ctx->qcoeff[i][k]); + ctx->qcoeff[i][k] = 0; + vpx_free(ctx->dqcoeff[i][k]); + ctx->dqcoeff[i][k] = 0; + vpx_free(ctx->eobs[i][k]); + ctx->eobs[i][k] = 0; + } + } +} + +static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree, + int num_4x4_blk) { + alloc_mode_context(cm, num_4x4_blk, &tree->none); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]); + + if (num_4x4_blk > 4) { + alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]); + } else { + memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1])); + memset(&tree->vertical[1], 0, sizeof(tree->vertical[1])); + } +} + +static void free_tree_contexts(PC_TREE *tree) { + free_mode_context(&tree->none); + free_mode_context(&tree->horizontal[0]); + free_mode_context(&tree->horizontal[1]); + free_mode_context(&tree->vertical[0]); + free_mode_context(&tree->vertical[1]); +} + +// This function sets up a tree of contexts such that at each square +// partition level. There are contexts for none, horizontal, vertical, and +// split. Along with a block_size value and a selected block_size which +// represents the state of our search. +void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { + int i, j; + const int leaf_nodes = 64; + const int tree_nodes = 64 + 16 + 4 + 1; + int pc_tree_index = 0; + PC_TREE *this_pc; + PICK_MODE_CONTEXT *this_leaf; + int square_index = 1; + int nodes; + + vpx_free(td->leaf_tree); + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, + vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); + vpx_free(td->pc_tree); + CHECK_MEM_ERROR(&cm->error, td->pc_tree, + vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); + + this_pc = &td->pc_tree[0]; + this_leaf = &td->leaf_tree[0]; + + // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same + // context so we only need to allocate 1 for each 8x8 block. + for (i = 0; i < leaf_nodes; ++i) alloc_mode_context(cm, 1, &td->leaf_tree[i]); + + // Sets up all the leaf nodes in the tree. + for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + tree->block_size = square[0]; + alloc_tree_contexts(cm, tree, 4); + tree->leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. + for (nodes = 16; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + td->pc_root = &td->pc_tree[tree_nodes - 1]; + td->pc_root[0].none.best_mode_index = 2; +} + +void vp9_free_pc_tree(ThreadData *td) { + int i; + + if (td == NULL) return; + + if (td->leaf_tree != NULL) { + // Set up all 4x4 mode contexts + for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]); + vpx_free(td->leaf_tree); + td->leaf_tree = NULL; + } + + if (td->pc_tree != NULL) { + const int tree_nodes = 64 + 16 + 4 + 1; + // Sets up all the leaf nodes in the tree. + for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + vpx_free(td->pc_tree); + td->pc_tree = NULL; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h new file mode 100644 index 0000000000..4e301cc17d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9_COMP; +struct VP9Common; +struct ThreadData; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MODE_INFO mic; + MB_MODE_INFO_EXT mbmi_ext; + uint8_t *zcoeff_blk; + tran_low_t *coeff[MAX_MB_PLANE][3]; + tran_low_t *qcoeff[MAX_MB_PLANE][3]; + tran_low_t *dqcoeff[MAX_MB_PLANE][3]; + uint16_t *eobs[MAX_MB_PLANE][3]; + + // dual buffer pointers, 0: in use, 1: best in store + tran_low_t *coeff_pbuf[MAX_MB_PLANE][3]; + tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; + + int is_coded; + int num_4x4_blk; + int skip; + int pred_pixel_ready; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 0. + int skippable; + uint8_t skip_txfm[MAX_MB_PLANE << 2]; + int best_mode_index; + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + + // TODO(jingning) Use RD_COST struct here instead. This involves a boarder + // scope of refactoring. + int rate; + int64_t dist; + int64_t rdcost; + +#if CONFIG_VP9_TEMPORAL_DENOISING + unsigned int newmv_sse; + unsigned int zeromv_sse; + unsigned int zeromv_lastref_sse; + PREDICTION_MODE best_sse_inter_mode; + int_mv best_sse_mv; + MV_REFERENCE_FRAME best_reference_frame; + MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; +#endif + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[MAX_REF_FRAMES]; + INTERP_FILTER pred_interp_filter; + + // Used for the machine learning-based early termination + int32_t sum_y_eobs; + // Skip certain ref frames during RD search of rectangular partitions. + uint8_t skip_ref_frame_mask; +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + int index; + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; + union { + struct PC_TREE *split[4]; + PICK_MODE_CONTEXT *leaf_split[4]; + }; + // Obtained from a simple motion search. Used by the ML based partition search + // speed feature. + MV mv; +} PC_TREE; + +void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td); +void vp9_free_pc_tree(struct ThreadData *td); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.c b/media/libvpx/libvpx/vp9/encoder/vp9_cost.c new file mode 100644 index 0000000000..81581a80c2 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "vp9/encoder/vp9_cost.h" + +/* round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT)) + Begins with a bogus entry for simpler addressing. */ +const uint16_t vp9_prob_cost[256] = { + 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260, + 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718, + 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409, + 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192, + 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024, + 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887, + 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772, + 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673, + 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585, + 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506, + 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, + 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, + 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, + 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256, + 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205, + 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157, + 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112, + 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, + 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, + 26, 23, 20, 18, 15, 12, 9, 6, 3 +}; + +static void cost(int *costs, vpx_tree tree, const vpx_prob *probs, int i, + int c) { + const vpx_prob prob = probs[i / 2]; + int b; + + assert(prob != 0); + for (b = 0; b <= 1; ++b) { + const int cc = c + vp9_cost_bit(prob, b); + const vpx_tree_index ii = tree[i + b]; + + if (ii <= 0) + costs[-ii] = cc; + else + cost(costs, tree, probs, ii, cc); + } +} + +void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) { + cost(costs, tree, probs, 0, 0); +} + +void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) { + assert(tree[0] <= 0 && tree[1] > 0); + + costs[-tree[0]] = vp9_cost_bit(probs[0], 0); + cost(costs, tree, probs, 2, 0); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h new file mode 100644 index 0000000000..ee0033fa31 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_COST_H_ +#define VPX_VP9_ENCODER_VP9_COST_H_ + +#include "vpx_dsp/prob.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint16_t vp9_prob_cost[256]; + +// The factor to scale from cost in bits to cost in vp9_prob_cost units. +#define VP9_PROB_COST_SHIFT 9 + +#define vp9_cost_zero(prob) (vp9_prob_cost[prob]) + +#define vp9_cost_one(prob) vp9_cost_zero(256 - (prob)) + +#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob)) + +static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) { + return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p); +} + +static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits, + int len) { + int cost = 0; + vpx_tree_index i = 0; + + do { + const int bit = (bits >> --len) & 1; + cost += vp9_cost_bit(probs[i >> 1], bit); + i = tree[i + bit]; + } while (len); + + return cost; +} + +void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree); +void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_COST_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c new file mode 100644 index 0000000000..2f42c6afc2 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c @@ -0,0 +1,687 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/fwd_txfm.h" +#include "vpx_ports/mem.h" + +static void fdct4(const tran_low_t *input, tran_low_t *output) { + tran_high_t step[4]; + tran_high_t temp1, temp2; + + step[0] = input[0] + input[3]; + step[1] = input[1] + input[2]; + step[2] = input[1] - input[2]; + step[3] = input[0] - input[3]; + + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + output[0] = (tran_low_t)fdct_round_shift(temp1); + output[2] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + output[1] = (tran_low_t)fdct_round_shift(temp1); + output[3] = (tran_low_t)fdct_round_shift(temp2); +} + +static void fdct8(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = (tran_low_t)fdct_round_shift(t0); + t3 = (tran_low_t)fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = (tran_low_t)fdct_round_shift(t0); + output[3] = (tran_low_t)fdct_round_shift(t2); + output[5] = (tran_low_t)fdct_round_shift(t1); + output[7] = (tran_low_t)fdct_round_shift(t3); +} + +static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t input[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 + + // step 1 + input[0] = in[0] + in[15]; + input[1] = in[1] + in[14]; + input[2] = in[2] + in[13]; + input[3] = in[3] + in[12]; + input[4] = in[4] + in[11]; + input[5] = in[5] + in[10]; + input[6] = in[6] + in[9]; + input[7] = in[7] + in[8]; + + step1[0] = in[7] - in[8]; + step1[1] = in[6] - in[9]; + step1[2] = in[5] - in[10]; + step1[3] = in[4] - in[11]; + step1[4] = in[3] - in[12]; + step1[5] = in[2] - in[13]; + step1[6] = in[1] - in[14]; + step1[7] = in[0] - in[15]; + + // fdct8(step, step); + { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = (tran_low_t)fdct_round_shift(t0); + out[4] = (tran_low_t)fdct_round_shift(t2); + out[8] = (tran_low_t)fdct_round_shift(t1); + out[12] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = (tran_low_t)fdct_round_shift(t0); + out[6] = (tran_low_t)fdct_round_shift(t2); + out[10] = (tran_low_t)fdct_round_shift(t1); + out[14] = (tran_low_t)fdct_round_shift(t3); + } + + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = fdct_round_shift(temp1); + step2[3] = fdct_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = fdct_round_shift(temp1); + step2[5] = fdct_round_shift(temp2); + + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + step2[1] = fdct_round_shift(temp1); + step2[2] = fdct_round_shift(temp2); + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = fdct_round_shift(temp1); + step2[6] = fdct_round_shift(temp2); + + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[9] = (tran_low_t)fdct_round_shift(temp2); + + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = (tran_low_t)fdct_round_shift(temp1); + out[13] = (tran_low_t)fdct_round_shift(temp2); + + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = (tran_low_t)fdct_round_shift(temp1); + out[11] = (tran_low_t)fdct_round_shift(temp2); + + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = (tran_low_t)fdct_round_shift(temp1); + out[15] = (tran_low_t)fdct_round_shift(temp2); +} + +static void fadst4(const tran_low_t *input, tran_low_t *output) { + tran_high_t x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_4_9 * x0; + s2 = sinpi_2_9 * x1; + s3 = sinpi_1_9 * x1; + s4 = sinpi_3_9 * x2; + s5 = sinpi_4_9 * x3; + s6 = sinpi_2_9 * x3; + s7 = x0 + x1 - x3; + + x0 = s0 + s2 + s5; + x1 = sinpi_3_9 * s7; + x2 = s1 - s3 + s6; + x3 = s4; + + s0 = x0 + x3; + s1 = x1; + s2 = x2 - x3; + s3 = x2 - x0 + x3; + + // 1-D transform scaling factor is sqrt(2). + output[0] = (tran_low_t)fdct_round_shift(s0); + output[1] = (tran_low_t)fdct_round_shift(s1); + output[2] = (tran_low_t)fdct_round_shift(s2); + output[3] = (tran_low_t)fdct_round_shift(s3); +} + +static void fadst8(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = fdct_round_shift(s0 + s4); + x1 = fdct_round_shift(s1 + s5); + x2 = fdct_round_shift(s2 + s6); + x3 = fdct_round_shift(s3 + s7); + x4 = fdct_round_shift(s0 - s4); + x5 = fdct_round_shift(s1 - s5); + x6 = fdct_round_shift(s2 - s6); + x7 = fdct_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = fdct_round_shift(s4 + s6); + x5 = fdct_round_shift(s5 + s7); + x6 = fdct_round_shift(s4 - s6); + x7 = fdct_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = fdct_round_shift(s2); + x3 = fdct_round_shift(s3); + x6 = fdct_round_shift(s6); + x7 = fdct_round_shift(s7); + + output[0] = (tran_low_t)x0; + output[1] = (tran_low_t)-x4; + output[2] = (tran_low_t)x6; + output[3] = (tran_low_t)-x2; + output[4] = (tran_low_t)x3; + output[5] = (tran_low_t)-x7; + output[6] = (tran_low_t)x5; + output[7] = (tran_low_t)-x1; +} + +static void fadst16(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = fdct_round_shift(s0 + s8); + x1 = fdct_round_shift(s1 + s9); + x2 = fdct_round_shift(s2 + s10); + x3 = fdct_round_shift(s3 + s11); + x4 = fdct_round_shift(s4 + s12); + x5 = fdct_round_shift(s5 + s13); + x6 = fdct_round_shift(s6 + s14); + x7 = fdct_round_shift(s7 + s15); + x8 = fdct_round_shift(s0 - s8); + x9 = fdct_round_shift(s1 - s9); + x10 = fdct_round_shift(s2 - s10); + x11 = fdct_round_shift(s3 - s11); + x12 = fdct_round_shift(s4 - s12); + x13 = fdct_round_shift(s5 - s13); + x14 = fdct_round_shift(s6 - s14); + x15 = fdct_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = fdct_round_shift(s8 + s12); + x9 = fdct_round_shift(s9 + s13); + x10 = fdct_round_shift(s10 + s14); + x11 = fdct_round_shift(s11 + s15); + x12 = fdct_round_shift(s8 - s12); + x13 = fdct_round_shift(s9 - s13); + x14 = fdct_round_shift(s10 - s14); + x15 = fdct_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = fdct_round_shift(s4 + s6); + x5 = fdct_round_shift(s5 + s7); + x6 = fdct_round_shift(s4 - s6); + x7 = fdct_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = fdct_round_shift(s12 + s14); + x13 = fdct_round_shift(s13 + s15); + x14 = fdct_round_shift(s12 - s14); + x15 = fdct_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = fdct_round_shift(s2); + x3 = fdct_round_shift(s3); + x6 = fdct_round_shift(s6); + x7 = fdct_round_shift(s7); + x10 = fdct_round_shift(s10); + x11 = fdct_round_shift(s11); + x14 = fdct_round_shift(s14); + x15 = fdct_round_shift(s15); + + output[0] = (tran_low_t)x0; + output[1] = (tran_low_t)-x8; + output[2] = (tran_low_t)x12; + output[3] = (tran_low_t)-x4; + output[4] = (tran_low_t)x6; + output[5] = (tran_low_t)x14; + output[6] = (tran_low_t)x10; + output[7] = (tran_low_t)x2; + output[8] = (tran_low_t)x3; + output[9] = (tran_low_t)x11; + output[10] = (tran_low_t)x15; + output[11] = (tran_low_t)x7; + output[12] = (tran_low_t)x5; + output[13] = (tran_low_t)-x13; + output[14] = (tran_low_t)x9; + output[15] = (tran_low_t)-x1; +} + +static const transform_2d FHT_4[] = { + { fdct4, fdct4 }, // DCT_DCT = 0 + { fadst4, fdct4 }, // ADST_DCT = 1 + { fdct4, fadst4 }, // DCT_ADST = 2 + { fadst4, fadst4 } // ADST_ADST = 3 +}; + +static const transform_2d FHT_8[] = { + { fdct8, fdct8 }, // DCT_DCT = 0 + { fadst8, fdct8 }, // ADST_DCT = 1 + { fdct8, fadst8 }, // DCT_ADST = 2 + { fadst8, fadst8 } // ADST_ADST = 3 +}; + +static const transform_2d FHT_16[] = { + { fdct16, fdct16 }, // DCT_DCT = 0 + { fadst16, fdct16 }, // ADST_DCT = 1 + { fdct16, fadst16 }, // DCT_ADST = 2 + { fadst16, fadst16 } // ADST_ADST = 3 +}; + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + if (tx_type == DCT_DCT) { + vpx_fdct4x4_c(input, output, stride); + } else { + tran_low_t out[4 * 4]; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + const transform_2d ht = FHT_4[tx_type]; + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16; + if (i == 0 && temp_in[0]) temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2; + } + } +} + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + if (tx_type == DCT_DCT) { + vpx_fdct8x8_c(input, output, stride); + } else { + tran_low_t out[64]; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + } +} + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. */ +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; + } +} + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + if (tx_type == DCT_DCT) { + vpx_fdct16x16_c(input, output, stride); + } else { + tran_low_t out[256]; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + } + + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j]; + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + vp9_fht4x4_c(input, output, stride, tx_type); +} + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + vp9_fht8x8_c(input, output, stride, tx_type); +} + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + vp9_fwht4x4_c(input, output, stride); +} + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + vp9_fht16x16_c(input, output, stride, tx_type); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c new file mode 100644 index 0000000000..e5dffa90a8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c @@ -0,0 +1,839 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vp9/encoder/vp9_encoder.h" + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv); +#endif + +static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + return 3 + (increase_denoising ? 1 : 0); +} + +static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 4; +} + +static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 625; +} + +static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40); +} + +static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, + int motion_magnitude) { + if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) { + if (increase_denoising) + return (1 << num_pels_log2_lookup[bs]) << 2; + else + return 0; + } else { + return (1 << num_pels_log2_lookup[bs]) << 4; + } +} + +static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} + +// TODO(jackychen): If increase_denoising is enabled in the future, +// we might need to update the code for calculating 'total_adj' in +// case the C code is not bit-exact with corresponding sse2 code. +int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + int r, c; + const uint8_t *sig_start = sig; + const uint8_t *mc_avg_start = mc_avg; + uint8_t *avg_start = avg; + int diff, adj, absdiff, delta; + int adj_val[] = { 3, 4, 6 }; + int total_adj = 0; + int shift_inc = 1; + + // If motion_magnitude is small, making the denoiser more aggressive by + // increasing the adjustment for each level. Add another increment for + // blocks that are labeled for increase denoising. + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc = 2; + } + adj_val[0] += shift_inc; + adj_val[1] += shift_inc; + adj_val[2] += shift_inc; + } + + // First attempt to apply a strong temporal denoising filter. + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) { + diff = mc_avg[c] - sig[c]; + absdiff = abs(diff); + + if (absdiff <= absdiff_thresh(bs, increase_denoising)) { + avg[c] = mc_avg[c]; + total_adj += diff; + } else { + switch (absdiff) { + case 4: + case 5: + case 6: + case 7: adj = adj_val[0]; break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: adj = adj_val[1]; break; + default: adj = adj_val[2]; + } + if (diff > 0) { + avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj); + total_adj += adj; + } else { + avg[c] = VPXMAX(0, sig[c] - adj); + total_adj -= adj; + } + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // If the strong filter did not modify the signal too much, we're all set. + if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + + // Otherwise, we try to dampen the filter if the delta is not too high. + delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >> + num_pels_log2_lookup[bs]) + + 1; + + if (delta >= delta_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + + mc_avg = mc_avg_start; + avg = avg_start; + sig = sig_start; + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { + for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) { + diff = mc_avg[c] - sig[c]; + adj = abs(diff); + if (adj > delta) { + adj = delta; + } + if (diff > 0) { + // Diff positive means we made positive adjustment above + // (in first try/attempt), so now make negative adjustment to bring + // denoised signal down. + avg[c] = VPXMAX(0, avg[c] - adj); + total_adj -= adj; + } else { + // Diff negative means we made negative adjustment above + // (in first try/attempt), so now make positive adjustment to bring + // denoised signal up. + avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj); + total_adj += adj; + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // We can use the filter if it has been sufficiently dampened + if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + return COPY_BLOCK; +} + +static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, + int mi_col) { + return framebuf + (stride * mi_row << 3) + (mi_col << 3); +} + +static VP9_DENOISER_DECISION perform_motion_compensation( + VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, + int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer, int use_gf_temporal_ref) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; + MACROBLOCKD *filter_mbd = &mb->e_mbd; + MODE_INFO *mi = filter_mbd->mi[0]; + MODE_INFO saved_mi; + int i; + struct buf_2d saved_dst[MAX_MB_PLANE]; + struct buf_2d saved_pre[MAX_MB_PLANE]; + const RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; + + frame = ctx->best_reference_frame; + + saved_mi = *mi; + + if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; + + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) + return COPY_BLOCK; + + // If the best reference frame uses inter-prediction and there is enough of a + // difference in sum-squared-error, use it. + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && + sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { + mi->ref_frame[0] = ctx->best_reference_frame; + mi->mode = ctx->best_sse_inter_mode; + mi->mv[0] = ctx->best_sse_mv; + } else { + // Otherwise, use the zero reference frame. + frame = ctx->best_zeromv_reference_frame; + ctx->newmv_sse = ctx->zeromv_sse; + // Bias to last reference. + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || + (frame == GOLDEN_FRAME && use_gf_temporal_ref) || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || + denoiser->denoising_level >= kDenHigh))) { + frame = LAST_FRAME; + ctx->newmv_sse = ctx->zeromv_lastref_sse; + } + mi->ref_frame[0] = frame; + mi->mode = ZEROMV; + mi->mv[0].as_int = 0; + ctx->best_sse_inter_mode = ZEROMV; + ctx->best_sse_mv.as_int = 0; + *zeromv_filter = 1; + if (denoiser->denoising_level > kDenMedium) { + motion_magnitude = 0; + } + } + + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. + if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. + if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + // We will restore these after motion compensation. + for (i = 0; i < MAX_MB_PLANE; ++i) { + saved_pre[i] = filter_mbd->plane[i].pre[0]; + saved_dst[i] = filter_mbd->plane[i].dst; + } + saved_block_refs[0] = filter_mbd->block_refs[0]; + + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser + // struct. + filter_mbd->plane[0].pre[0].buf = + block_start(denoiser->running_avg_y[frame].y_buffer, + denoiser->running_avg_y[frame].y_stride, mi_row, mi_col); + filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride; + filter_mbd->plane[1].pre[0].buf = + block_start(denoiser->running_avg_y[frame].u_buffer, + denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; + filter_mbd->plane[2].pre[0].buf = + block_start(denoiser->running_avg_y[frame].v_buffer, + denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; + + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); + vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); + + // Restore everything to its original state + *mi = saved_mi; + filter_mbd->block_refs[0] = saved_block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; ++i) { + filter_mbd->plane[i].pre[0] = saved_pre[i]; + filter_mbd->plane[i].dst = saved_dst[i]; + } + + return FILTER_BLOCK; +} + +void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, + BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { + int mv_col, mv_row; + int motion_magnitude = 0; + int zeromv_filter = 0; + VP9_DENOISER *denoiser = &cpi->denoiser; + VP9_DENOISER_DECISION decision = COPY_BLOCK; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; + uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + + uint8_t *mc_avg_start = + block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); + struct buf_2d src = mb->plane[0].src; + int is_skin = 0; + int increase_denoising = 0; + int consec_zeromv = 0; + int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG; + mv_col = ctx->best_sse_mv.as_mv.col; + mv_row = ctx->best_sse_mv.as_mv.row; + motion_magnitude = mv_row * mv_row + mv_col * mv_col; + + if (cpi->use_skin_detection && bs <= BLOCK_32X32 && + denoiser->denoising_level < kDenHigh) { + int motion_level = (motion_magnitude < 16) ? 0 : 1; + // If motion for current block is small/zero, compute consec_zeromv for + // skin detection (early exit in skin detection is done for large + // consec_zeromv when current block has small/zero motion). + consec_zeromv = 0; + if (motion_level == 0) { + VP9_COMMON *const cm = &cpi->common; + int j, i; + // Loop through the 8x8 sub-blocks. + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + consec_zeromv = 100; + for (i = 0; i < ymis; i++) { + for (j = 0; j < xmis; j++) { + int bl_index = block_index + i * cm->mi_cols + j; + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); + // No need to keep checking 8x8 blocks if any of the sub-blocks + // has small consec_zeromv (since threshold for no_skin based on + // zero/small motion in skin detection is high, i.e., > 4). + if (consec_zeromv < 4) { + i = ymis; + break; + } + } + } + } + // TODO(marpan): Compute skin detection over sub-blocks. + is_skin = vp9_compute_skin_block( + mb->plane[0].src.buf, mb->plane[1].src.buf, mb->plane[2].src.buf, + mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv, + motion_level); + } + if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; + + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. + if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) + decision = perform_motion_compensation( + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id, + use_gf_temporal_ref); + + if (decision == FILTER_BLOCK) { + decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); + } + + if (decision == FILTER_BLOCK) { + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, + num_4x4_blocks_high_lookup[bs] << 2); + } else { // COPY_BLOCK + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, + num_4x4_blocks_high_lookup[bs] << 2); + } + *denoiser_decision = decision; + if (decision == FILTER_BLOCK && zeromv_filter == 1) + *denoiser_decision = FILTER_ZEROMV_BLOCK; +} + +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + int r; + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} + +static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, + YV12_BUFFER_CONFIG *const src) { + uint8_t *tmp_buf = dest->y_buffer; + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + dest->y_buffer = src->y_buffer; + src->y_buffer = tmp_buf; +} + +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; + // Copy source into denoised reference buffers on KEY_FRAME or + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_refresh_denoiser_buffers) { + int i; + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } + denoiser->reset = 0; + return; + } + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + // If more than one refresh occurs, must copy frame buffer. + if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } + } +} + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { + ctx->zeromv_sse = UINT_MAX; + ctx->newmv_sse = UINT_MAX; + ctx->zeromv_lastref_sse = UINT_MAX; + ctx->best_sse_mv.as_int = 0; +} + +void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx) { + if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { + ctx->zeromv_sse = sse; + ctx->best_zeromv_reference_frame = mi->ref_frame[0]; + if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse; + } + + if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) { + ctx->newmv_sse = sse; + ctx->best_sse_inter_mode = mode; + ctx->best_sse_mv = mi->mv[0]; + ctx->best_reference_frame = mi->ref_frame[0]; + } +} + +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border) { + int i, layer, fail, init_num_ref_frames; + const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } + assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + &cm->error, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); + + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + + // denoiser->last_source only used for noise_estimation, so only for top + // layer. + fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + denoiser->frame_buffer_initialized = 1; + denoiser->denoising_level = kDenMedium; + denoiser->prev_denoising_level = kDenMedium; + denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; + return 0; +} + +void vp9_denoiser_free(VP9_DENOISER *denoiser) { + int i; + if (denoiser == NULL) { + return; + } + denoiser->frame_buffer_initialized = 0; + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->running_avg_y[i]); + } + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; + vpx_free_frame_buffer(&denoiser->last_source); +} + +static void force_refresh_longterm_ref(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} + +void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) { + VP9_DENOISER *const denoiser = &cpi->denoiser; + denoiser->denoising_level = noise_level; + if (denoiser->denoising_level > kDenLowLow && + denoiser->prev_denoising_level == kDenLowLow) { + denoiser->reset = 1; + force_refresh_longterm_ref(cpi); + } else { + denoiser->reset = 0; + } + denoiser->prev_denoising_level = denoiser->denoising_level; +} + +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) || + (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + return threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 10 + : 6; + else + return threshold; +} + +void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) { + if (vp9_denoise_svc_non_key(cpi) && + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; + force_refresh_longterm_ref(cpi); + } +} + +void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; + cpi->denoiser.current_denoiser_frame++; + if (cpi->use_svc) { + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } + vp9_denoiser_update_frame_info( + &cpi->denoiser, *cpi->Source, svc, frame_type, + cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers, + denoise_svc_second_layer); + } +} + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { + int r, c; + uint8_t *u = yuv->u_buffer; + uint8_t *v = yuv->v_buffer; + + for (r = 0; r < yuv->uv_height; ++r) { + for (c = 0; c < yuv->uv_width; ++c) { + u[c] = UINT8_MAX / 2; + v[c] = UINT8_MAX / 2; + } + u += yuv->uv_stride; + v += yuv->uv_stride; + } +} +#endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h new file mode 100644 index 0000000000..1973e98988 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_ +#define VPX_VP9_ENCODER_VP9_DENOISER_H_ + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MOTION_MAGNITUDE_THRESHOLD (8 * 3) + +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. [0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + +typedef enum vp9_denoiser_decision { + COPY_BLOCK, + FILTER_BLOCK, + FILTER_ZEROMV_BLOCK +} VP9_DENOISER_DECISION; + +typedef enum vp9_denoiser_level { + kDenLowLow, + kDenLow, + kDenMedium, + kDenHigh +} VP9_DENOISER_LEVEL; + +typedef struct vp9_denoiser { + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; + YV12_BUFFER_CONFIG last_source; + int frame_buffer_initialized; + int reset; + int num_ref_frames; + int num_layers; + unsigned int current_denoiser_frame; + VP9_DENOISER_LEVEL denoising_level; + VP9_DENOISER_LEVEL prev_denoising_level; +} VP9_DENOISER; + +typedef struct { + int64_t zero_last_cost_orig; + int *ref_frame_cost; + int_mv (*frame_mv)[MAX_REF_FRAMES]; + int reuse_inter_pred; + TX_SIZE best_tx_size; + PREDICTION_MODE best_mode; + MV_REFERENCE_FRAME best_ref_frame; + INTERP_FILTER best_pred_filter; + uint8_t best_mode_skip_txfm; +} VP9_PICKMODE_CTX_DEN; + +struct VP9_COMP; +struct SVC; + +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer); + +void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); + +void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx); + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border); + +#if CONFIG_VP9_TEMPORAL_DENOISING +// This function is used by both c and sse2 denoiser implementations. +// Define it as a static function within the scope where vp9_denoiser.h +// is referenced. +static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, + int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} +#endif + +void vp9_denoiser_free(VP9_DENOISER *denoiser); + +void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level); + +void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi); + +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id); + +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + +void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_DENOISER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c new file mode 100644 index 0000000000..46291f4868 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c @@ -0,0 +1,6581 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_ports/system_state.h" + +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_360.h" +#include "vp9/encoder/vp9_aq_complexity.h" +#endif +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_partition_models.h" +#include "vp9/encoder/vp9_pickmode.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_tokenize.h" + +static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, + int output_enabled, int mi_row, int mi_col, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); + +// This is used as a reference when computing the source variance for the +// purpose of activity masking. +// Eventually this should be replaced by custom no-reference routines, +// which will be faster. +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +}; + +static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16 +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { + unsigned int sse; + const unsigned int var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); + return var; +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + unsigned int var, sse; + switch (bd) { + case 10: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10), 0, &sse); + break; + case 12: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12), 0, &sse); + break; + case 8: + default: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); + break; + } + return var; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd), + num_pels_log2_lookup[bs]); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int segment_index) { + VP9_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + + // Initialize the segmentation index as 0. + mi->segment_id = 0; + + // Skip the rest if AQ mode is disabled. + if (!seg->enabled) return; + + switch (aq_mode) { + case CYCLIC_REFRESH_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#if !CONFIG_REALTIME_ONLY + case VARIANCE_AQ: + if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int min_energy; + int max_energy; + // Get sub block energy range + if (bsize >= BLOCK_32X32) { + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + } else { + min_energy = bsize <= BLOCK_16X16 ? x->mb_energy + : vp9_block_energy(cpi, x, bsize); + } + mi->segment_id = vp9_vaq_segment_id(min_energy); + } else { + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + break; + case EQUATOR360_AQ: + if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) + mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); + else + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#endif + case LOOKAHEAD_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case PSNR_AQ: mi->segment_id = segment_index; break; + case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break; + default: + // NO_AQ or PSNR_AQ + break; + } + + // Set segment index if ROI map or active_map is enabled. + if (cpi->roi.enabled || cpi->active_map.enabled) + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + + vp9_init_plane_quantizers(cpi, x); +} + +// Lighter version of set_offsets that only sets the mode info +// pointers. +static INLINE void set_mode_info_offsets(VP9_COMMON *const cm, + MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + const int idx_str = xd->mi_stride * mi_row + mi_col; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; + x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); +} + +static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const VP9_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_16X16; + const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base]; + const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + const int num_bcols = + (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w; + const int num_brows = + (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + assert(cpi->oxcf.tuning == VP8_TUNE_SSIM); + + for (row = mi_row / num_8x8_w; + row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) { + for (col = mi_col / num_8x8_h; + col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale); + *rdmult = VPXMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + vpx_clear_system_state(); +} + +static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + MvLimits *const mv_limits = &x->mv_limits; + + set_skip_context(xd, mi_row, mi_col); + + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + + // Set up destination pointers. + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + mv_limits->row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND); + mv_limits->col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND); + mv_limits->row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND; + mv_limits->col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND; + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows, + cm->mi_cols); + + // Set up source buffers. + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + + // R/D setup. + x->rddiv = cpi->rd.RDDIV; + x->rdmult = cpi->rd.RDMULT; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } + + // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs() + xd->tile = *tile; +} + +static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int block_width = + VPXMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col); + const int block_height = + VPXMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row); + const int mi_stride = xd->mi_stride; + MODE_INFO *const src_mi = xd->mi[0]; + int i, j; + + for (j = 0; j < block_height; ++j) + for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi; +} + +static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { + set_mode_info_offsets(&cpi->common, x, xd, mi_row, mi_col); + xd->mi[0]->sb_type = bsize; + } +} + +typedef struct { + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} Var; + +typedef struct { + Var none; + Var horz[2]; + Var vert[2]; +} partition_variance; + +typedef struct { + partition_variance part_variances; + Var split[4]; +} v4x4; + +typedef struct { + partition_variance part_variances; + v4x4 split[4]; +} v8x8; + +typedef struct { + partition_variance part_variances; + v8x8 split[4]; +} v16x16; + +typedef struct { + partition_variance part_variances; + v16x16 split[4]; +} v32x32; + +typedef struct { + partition_variance part_variances; + v32x32 split[4]; +} v64x64; + +typedef struct { + partition_variance *part_variances; + Var *split[4]; +} variance_node; + +typedef enum { + V16X16, + V32X32, + V64X64, +} TREE_LEVEL; + +static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { + int i; + node->part_variances = NULL; + switch (bsize) { + case BLOCK_64X64: { + v64x64 *vt = (v64x64 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_32X32: { + v32x32 *vt = (v32x32 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_16X16: { + v16x16 *vt = (v16x16 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_8X8: { + v8x8 *vt = (v8x8 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + default: { + v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; + break; + } + } +} + +// Set variance values given sum square error, sum error, count. +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static void get_variance(Var *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static void sum_2_variances(const Var *a, const Var *b, Var *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + int force_split) { + VP9_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = num_8x8_blocks_wide_lookup[bsize]; + const int block_height = num_8x8_blocks_high_lookup[bsize]; + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (force_split == 1) return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + + // Check vertical split. + if (mi_row + block_height / 2 < cm->mi_rows) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. + if (mi_col + block_width / 2 < cm->mi_cols) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + + return 0; + } + return 0; +} + +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + if (speed >= 8) { + if (width <= 640 && height <= 480) + return (5 * threshold_base) >> 2; + else if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) + return (5 * threshold_base) >> 2; + } else if (speed == 7) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { + return (5 * threshold_base) >> 2; + } + } + return threshold_base; +} + +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, +// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, + int content_state) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = + is_key_frame ? 20 : cpi->sf.variance_part_thresh_mult; + int64_t threshold_base = + (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); + + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base >> 2; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base << 2; + } else { + // Increase base variance threshold based on estimated noise level. + if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) { + NOISE_LEVEL noise_level = + vp9_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level == kHigh) + threshold_base = 3 * threshold_base; + else if (noise_level == kMedium) + threshold_base = threshold_base << 1; + else if (noise_level < kLow) + threshold_base = (7 * threshold_base) >> 3; + } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + threshold_base = + vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); + else + threshold_base = + scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width, + cm->height, content_state); +#else + // Increase base variance threshold based on content_state/sum_diff level. + threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); +#endif + thresholds[0] = threshold_base; + thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; + if (cm->width <= 352 && cm->height <= 288) { + thresholds[0] = threshold_base >> 3; + thresholds[1] = threshold_base >> 1; + thresholds[2] = threshold_base << 3; + if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 220) + thresholds[2] = thresholds[2] << 2; + else if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 200) + thresholds[2] = thresholds[2] << 1; + } else if (cm->width < 1280 && cm->height < 720) { + thresholds[1] = (5 * threshold_base) >> 2; + } else if (cm->width < 1920 && cm->height < 1080) { + thresholds[1] = threshold_base << 1; + } else { + thresholds[1] = (5 * threshold_base) >> 1; + } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; + } +} + +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, + int content_state) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + const int is_key_frame = frame_is_intra_only(cm); + if (sf->partition_search_type != VAR_BASED_PARTITION && + sf->partition_search_type != REFERENCE_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state); + // The thresholds below are not changed locally. + if (is_key_frame) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + cpi->vbp_bsize_min = BLOCK_8X8; + } else { + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_sad = 10; + else + cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 + ? (cpi->y_dequant[q][1] << 1) + : 1000; + cpi->vbp_bsize_min = BLOCK_16X16; + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_copy = 4000; + else if (cm->width <= 640 && cm->height <= 360) + cpi->vbp_threshold_copy = 8000; + else + cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 + ? (cpi->y_dequant[q][1] << 3) + : 8000; + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + } + } + cpi->vbp_threshold_minmax = 15 + (q >> 3); + } +} + +// Compute the minmax over the 8x8 subblocks. +static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, +#if CONFIG_VP9_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high) { + int k; + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { +#if CONFIG_VP9_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + d + y8_idx * dp + x8_idx, dp, &min, &max); + } else { + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, + dp, &min, &max); + } +#else + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, + &min, &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x8_idx, int y8_idx, v8x8 *vst, +#if CONFIG_VP9_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x4_idx = x8_idx + ((k & 1) << 2); + int y4_idx = y8_idx + ((k >> 1) << 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_VP9_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } else { + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } +#else + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); +#endif + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, v16x16 *vst, +#if CONFIG_VP9_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_VP9_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) + d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } else { + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } +#else + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); +#endif + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +// Check if most of the superblock is skin content, and if so, force split to +// 32x32, and set x->sb_is_skin for use in mode selection. +static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row, + int mi_col, int *force_split) { + VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return 0; +#endif + // Avoid checking superblocks on/near boundary and avoid low resolutions. + // Note superblock may still pick 64X64 if y_sad is very small + // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is. + if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 && + mi_row + 8 < cm->mi_rows)) { + int num_16x16_skin = 0; + int num_16x16_nonskin = 0; + const int block_index = mi_row * cm->mi_cols + mi_col; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + // Loop through the 16x16 sub-blocks. + int i, j; + for (i = 0; i < ymis; i += 2) { + for (j = 0; j < xmis; j += 2) { + int bl_index = block_index + i * cm->mi_cols + j; + int is_skin = cpi->skin_map[bl_index]; + num_16x16_skin += is_skin; + num_16x16_nonskin += (1 - is_skin); + if (num_16x16_nonskin > 3) { + // Exit loop if at least 4 of the 16x16 blocks are not skin. + i = ymis; + break; + } + } + } + if (num_16x16_skin > 12) { + *force_split = 1; + return 1; + } + } + return 0; +} + +static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + v64x64 *vt, int64_t thresholds[], + MV_REFERENCE_FRAME ref_frame_partition, + int mi_col, int mi_row) { + int i, j; + VP9_COMMON *const cm = &cpi->common; + const int mv_thr = cm->width > 640 ? 8 : 4; + // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and + // int_pro mv is small. If the temporal variance is small set the flag + // variance_low for the block. The variance threshold can be adjusted, the + // higher the more aggressive. + if (ref_frame_partition == LAST_FRAME && + (cpi->sf.short_circuit_low_temp_var == 1 || + (xd->mi[0]->mv[0].as_mv.col < mv_thr && + xd->mi[0]->mv[0].as_mv.col > -mv_thr && + xd->mi[0]->mv[0].as_mv.row < mv_thr && + xd->mi[0]->mv[0].as_mv.row > -mv_thr))) { + if (xd->mi[0]->sb_type == BLOCK_64X64) { + if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_64X32) { + for (i = 0; i < 2; i++) { + if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 1] = 1; + } + } else if (xd->mi[0]->sb_type == BLOCK_32X64) { + for (i = 0; i < 2; i++) { + if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 3] = 1; + } + } else { + for (i = 0; i < 4; i++) { + const int idx[4][2] = { { 0, 0 }, { 0, 4 }, { 4, 0 }, { 4, 4 } }; + const int idx_str = + cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1]; + MODE_INFO **this_mi = cm->mi_grid_visible + idx_str; + + if (cm->mi_cols <= mi_col + idx[i][1] || + cm->mi_rows <= mi_row + idx[i][0]) + continue; + + if ((*this_mi)->sb_type == BLOCK_32X32) { + int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) + ? ((5 * thresholds[1]) >> 3) + : (thresholds[1] >> 1); + if (vt->split[i].part_variances.none.variance < threshold_32x32) + x->variance_low[i + 5] = 1; + } else if (cpi->sf.short_circuit_low_temp_var >= 2) { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. + if ((*this_mi)->sb_type == BLOCK_16X16 || + (*this_mi)->sb_type == BLOCK_32X16 || + (*this_mi)->sb_type == BLOCK_16X32) { + for (j = 0; j < 4; j++) { + if (vt->split[i].split[j].part_variances.none.variance < + (thresholds[2] >> 8)) + x->variance_low[(i << 2) + j + 9] = 1; + } + } + } + } + } + } +} + +static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = partition_lookup[bsl][prev_part[start_pos]]; + subsize = get_subsize(bsize, partition); + + if (subsize < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + } else { + switch (partition) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); + break; + default: + assert(partition == PARTITION_SPLIT); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); + break; + } + } +} + +static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int mi_row, int mi_col, int segment_id, + int sb_offset) { + int svc_copy_allowed = 1; + int frames_since_key_thresh = 1; + if (cpi->use_svc) { + // For SVC, don't allow copy if base spatial layer is key frame, or if + // frame is not a temporal enhancement layer frame. + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; + frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; + } + if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && + !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE && + cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) { + if (cpi->prev_partition != NULL) { + copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col); + cpi->copied_frame_cnt[sb_offset] += 1; + memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]), + sizeof(x->variance_low)); + return 1; + } + } + + return 0; +} + +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 1, 0, + 1, 1, 0, 1, 1, + 0, 1, 0 }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 2, 2, + 0, 2, 2, 0, 2, + 2, 0, 0 }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] || + mi_col >= svc->mi_cols[svc->spatial_layer_id - 1]) + return 0; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + // Scale up blocks on boundary. + if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } + + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, + subsize_high); + break; + default: + assert(partition_high == PARTITION_SPLIT); + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; + break; + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + default: + assert(partition == PARTITION_SPLIT); + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: prev_part[start_pos] = bsize; break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + default: + assert(partition == PARTITION_SPLIT); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); + break; + } + } +} + +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + +static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, + unsigned int y_sad, int is_key_frame, + int scene_change_detected) { + int i; + MACROBLOCKD *xd = &x->e_mbd; + int shift = 2; + + if (is_key_frame) return; + + // For speed > 8, avoid the chroma check if y_sad is above threshold. + if (cpi->oxcf.speed > 8) { + if (y_sad > cpi->vbp_thresholds[1] && + (!cpi->noise_estimate.enabled || + vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium)) + return; + } + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && scene_change_detected) + shift = 5; + + for (i = 1; i <= 2; ++i) { + unsigned int uv_sad = UINT_MAX; + struct macroblock_plane *p = &x->plane[i]; + struct macroblockd_plane *pd = &xd->plane[i]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + + if (bs != BLOCK_INVALID) + uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); + + // TODO(marpan): Investigate if we should lower this threshold if + // superblock is detected as skin. + x->color_sensitivity[i - 1] = uv_sad > (y_sad >> shift); + } +} + +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { + unsigned int tmp_sse; + uint64_t tmp_sad; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->Source->y_buffer; + int src_ystride = cpi->Source->y_stride; + uint8_t *last_src_y = cpi->Last_Source->y_buffer; + int last_src_ystride = cpi->Last_Source->y_stride; + uint64_t avg_source_sad_threshold = 10000; + uint64_t avg_source_sad_threshold2 = 12000; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif + src_y += shift; + last_src_y += shift; + tmp_sad = + cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); + tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_sad < avg_source_sad_threshold) + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff + : kLowSadHighSumdiff; + else + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff + : kHighSadHighSumdiff; + + // Detect large lighting change. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; + + if (cpi->content_state_sb_fd != NULL) { + if (tmp_sad < avg_source_sad_threshold2) { + // Cap the increment to 255. + if (cpi->content_state_sb_fd[sb_offset] < 255) + cpi->content_state_sb_fd[sb_offset]++; + } else { + cpi->content_state_sb_fd[sb_offset] = 0; + } + } + if (tmp_sad == 0) x->zero_temp_sad_source = 1; + return tmp_sad; +} + +// This function chooses partitioning based on the variance between source and +// reconstructed last, where variance is computed for down-sampled inputs. +static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int i, j, k, m; + v64x64 vt; + v16x16 *vt2 = NULL; + int force_split[21]; + int avg_32x32; + int max_var_32x32 = 0; + int min_var_32x32 = INT_MAX; + int var_32x32; + int avg_16x16[4]; + int maxvar_16x16[4]; + int minvar_16x16[4]; + int64_t threshold_4x4avg; + NOISE_LEVEL noise_level = kLow; + int content_state = 0; + uint8_t *s; + const uint8_t *d; + int sp; + int dp; + int compute_minmax_variance = 1; + unsigned int y_sad = UINT_MAX; + BLOCK_SIZE bsize = BLOCK_64X64; + // Ref frame used in partitioning. + MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; + int pixels_wide = 64, pixels_high = 64; + int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], + cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] }; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + int force_64_split = scene_change_detected || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->compute_source_sad_onepass && + cpi->sf.use_source_sad && !x->zero_temp_sad_source); + + // For the variance computation under SVC mode, we treat the frame as key if + // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). + int is_key_frame = + (frame_is_intra_only(cm) || + (is_one_pass_svc(cpi) && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + + // Always use 4x4 partition for key frame. + const int use_4x4_partition = frame_is_intra_only(cm); + const int low_res = (cm->width <= 352 && cm->height <= 288); + int variance4x4downsample[16]; + int segment_id; + int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); + + // For SVC: check if LAST frame is NULL or if the resolution of LAST is + // different than the current frame resolution, and if so, treat this frame + // as a key frame, for the purpose of the superblock partitioning. + // LAST == NULL can happen in some cases where enhancement spatial layers are + // enabled dyanmically in the stream and the only reference is the spatial + // reference (GOLDEN). + if (cpi->use_svc) { + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME); + if (ref == NULL || ref->y_crop_height != cm->height || + ref->y_crop_width != cm->width) + is_key_frame = 1; + } + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); + segment_id = xd->mi[0]->segment_id; + + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (cpi->sf.use_source_sad && !is_key_frame) { + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + content_state = x->content_state_sb; + x->skip_low_source_sad = (content_state == kLowSadLowSumdiff || + content_state == kLowSadHighSumdiff) + ? 1 + : 0; + x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; + if (cpi->content_state_sb_fd != NULL) + x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } + // If source_sad is low copy the partition without computing the y_sad. + if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + !force_64_split && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; + } + } + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id)) { + int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); + } + // Decrease 32x32 split threshold for screen on base layer, for scene + // change/high motion frames. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == 0 && force_64_split) + thresholds[1] = 3 * thresholds[1] >> 2; + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + s = x->plane[0].src.buf; + sp = x->plane[0].src.stride; + + // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, + // 5-20 for the 16x16 blocks. + force_split[0] = force_64_split; + + if (!is_key_frame) { + // In the case of spatial/temporal scalable coding, the assumption here is + // that the temporal reference frame will always be of type LAST_FRAME. + // TODO(marpan): If that assumption is broken, we need to revisit this code. + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + + const YV12_BUFFER_CONFIG *yv12_g = NULL; + unsigned int y_sad_g, y_sad_thr, y_sad_last; + bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + + assert(yv12 != NULL); + + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. + if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NO_REF_FRAME; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source && + cm->width > 640 && cm->height > 480) { + // Disable split below 16x16 block size when scroll motion (horz or + // vert) is detected. + // TODO(marpan/jianj): Improve this condition: issue is that search + // range is hard-coded/limited in vp9_int_pro_motion_estimation() so + // scroll motion may not be detected here. + if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) || + (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) && + y_sad < 100000) { + compute_minmax_variance = 0; + thresholds[2] = INT64_MAX; + } + } + } + + y_sad_last = y_sad; + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + + if (cpi->use_skin_detection) + x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split); + + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + + // If the y_sad is very small, take 64x64 as partition and exit. + // Don't check on boosted segment for now, as 64x64 is suppressed there. + if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) { + const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows) { + set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); + x->variance_low[0] = 1; + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } + + // If the y_sad is small enough, copy the partition of the superblock in the + // last frame to current frame only if the last frame is not a keyframe. + // Stop the copy every cpi->max_copied_frame to refresh the partition. + // TODO(jianj) : tune the threshold. + if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; + } + } else { + d = VP9_VAR_OFFS; + dp = 0; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (xd->bd) { + case 10: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10); break; + case 12: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12); break; + case 8: + default: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8); break; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); + // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances + // for splits. + for (i = 0; i < 4; i++) { + const int x32_idx = ((i & 1) << 5); + const int y32_idx = ((i >> 1) << 5); + const int i2 = i << 2; + force_split[i + 1] = 0; + avg_16x16[i] = 0; + maxvar_16x16[i] = 0; + minvar_16x16[i] = INT_MAX; + for (j = 0; j < 4; j++) { + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); + const int split_index = 5 + i2 + j; + v16x16 *vst = &vt.split[i].split[j]; + force_split[split_index] = 0; + variance4x4downsample[i2 + j] = 0; + if (!is_key_frame) { + fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); + get_variance(&vt.split[i].split[j].part_variances.none); + avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i]) + minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i]) + maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) { + // 16X16 variance is above threshold for split, so force split to 8x8 + // for this 16x16 block (this also forces splits for upper levels). + force_split[split_index] = 1; + force_split[i + 1] = 1; + force_split[0] = 1; + } else if (compute_minmax_variance && + vt.split[i].split[j].part_variances.none.variance > + thresholds[1] && + !cyclic_refresh_segment_id_boosted(segment_id)) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above threshold, + // force split to 8x8 block for this 16x16 block. + int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx, +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high); + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { + force_split[split_index] = 1; + force_split[i + 1] = 1; + force_split[0] = 1; + } + } + } + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { + force_split[split_index] = 0; + // Go down to 4x4 down-sampling for variance. + variance4x4downsample[i2 + j] = 1; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k]; + fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2, +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + } + } + } + } + if (cpi->noise_estimate.enabled) + noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); + // Fill the rest of the variance tree by summing split partition values. + avg_32x32 = 0; + for (i = 0; i < 4; i++) { + const int i2 = i << 2; + for (j = 0; j < 4; j++) { + if (variance4x4downsample[i2 + j] == 1) { + v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] : &vt.split[i].split[j]; + for (m = 0; m < 4; m++) fill_variance_tree(&vtemp->split[m], BLOCK_8X8); + fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. + get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[2]) { + force_split[5 + i2 + j] = 1; + force_split[i + 1] = 1; + force_split[0] = 1; + } + } + } + fill_variance_tree(&vt.split[i], BLOCK_32X32); + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, then + // force this block to split. This also forces a split on the upper + // (64x64) level. + if (!force_split[i + 1]) { + get_variance(&vt.split[i].part_variances.none); + var_32x32 = vt.split[i].part_variances.none.variance; + max_var_32x32 = VPXMAX(var_32x32, max_var_32x32); + min_var_32x32 = VPXMIN(var_32x32, min_var_32x32); + if (vt.split[i].part_variances.none.variance > thresholds[1] || + (!is_key_frame && + vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) && + vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) { + force_split[i + 1] = 1; + force_split[0] = 1; + } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 && + (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) && + maxvar_16x16[i] > thresholds[1]) { + force_split[i + 1] = 1; + force_split[0] = 1; + } + avg_32x32 += var_32x32; + } + } + if (!force_split[0]) { + fill_variance_tree(&vt, BLOCK_64X64); + get_variance(&vt.part_variances.none); + // If variance of this 64x64 block is above (some threshold of) the average + // variance over the sub-32x32 blocks, then force this block to split. + // Only checking this for noise level >= medium for now. + if (!is_key_frame && noise_level >= kMedium && + vt.part_variances.none.variance > (9 * avg_32x32) >> 5) + force_split[0] = 1; + // Else if the maximum 32x32 variance minus the miniumum 32x32 variance in + // a 64x64 block is greater than threshold and the maximum 32x32 variance is + // above a miniumum threshold, then force the split of a 64x64 block + // Only check this for low noise. + else if (!is_key_frame && noise_level < kMedium && + (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) && + max_var_32x32 > thresholds[0] >> 1) + force_split[0] = 1; + } + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows || + !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 2); + const int y32_idx = ((i >> 1) << 2); + const int i2 = i << 2; + if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32, + (mi_row + y32_idx), (mi_col + x32_idx), + thresholds[1], BLOCK_16X16, + force_split[i + 1])) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 1); + const int y16_idx = ((j >> 1) << 1); + // For inter frames: if variance4x4downsample[] == 1 for this 16x16 + // block, then the variance is based on 4x4 down-sampling, so use vt2 + // in set_vt_partitioning(), otherwise use vt. + v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) + ? &vt2[i2 + j] + : &vt.split[i].split[j]; + if (!set_vt_partitioning( + cpi, x, xd, vtemp, BLOCK_16X16, mi_row + y32_idx + y16_idx, + mi_col + x32_idx + x16_idx, thresholds[2], cpi->vbp_bsize_min, + force_split[5 + i2 + j])) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + if (use_4x4_partition) { + if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k], + BLOCK_8X8, + mi_row + y32_idx + y16_idx + y8_idx, + mi_col + x32_idx + x16_idx + x8_idx, + thresholds[3], BLOCK_8X8, 0)) { + set_block_size( + cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_4X4); + } + } else { + set_block_size( + cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8); + } + } + } + } + } + } + } + + if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + + if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + + if (cpi->sf.short_circuit_low_temp_var) { + set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, + mi_col, mi_row); + } + + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (vt2) vpx_free(vt2); + return 0; +} + +#if !CONFIG_REALTIME_ONLY +static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int output_enabled) { + int i, x_idx, y; + VP9_COMMON *const cm = &cpi->common; + RD_COUNTS *const rdc = &td->rd_counts; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + MODE_INFO *mi = &ctx->mic; + MODE_INFO *const xdmi = xd->mi[0]; + MODE_INFO *mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + const int bw = num_8x8_blocks_wide_lookup[mi->sb_type]; + const int bh = num_8x8_blocks_high_lookup[mi->sb_type]; + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + const int mis = cm->mi_stride; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + int max_plane; + + assert(mi->sb_type == bsize); + + *mi_addr = *mi; + *x->mbmi_ext = ctx->mbmi_ext; + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + mi_addr->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { + vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip, p); + } + } + + max_plane = is_inter_block(xdmi) ? MAX_MB_PLANE : 1; + for (i = 0; i < max_plane; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; + } + + for (i = max_plane; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][2]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; + p[i].eobs = ctx->eobs_pbuf[i][2]; + } + + // Restore the coding context of the MB to that that was in place + // when the mode was picked for it + for (y = 0; y < mi_height; y++) + for (x_idx = 0; x_idx < mi_width; x_idx++) + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { + xd->mi[x_idx + y * mis] = mi_addr; + } + + if (cpi->oxcf.aq_mode != NO_AQ) vp9_init_plane_quantizers(cpi, x); + + if (is_inter_block(xdmi) && xdmi->sb_type < BLOCK_8X8) { + xdmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + xdmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } + + x->skip = ctx->skip; + memcpy(x->zcoeff_blk[xdmi->tx_size], ctx->zcoeff_blk, + sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + + if (!output_enabled) return; + +#if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { + static const int kf_mode_index[] = { + THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, + THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/, + THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/, + THR_D153_PRED /*D153_PRED*/, THR_D207_PRED /*D207_PRED*/, + THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, + }; + ++cpi->mode_chosen_counts[kf_mode_index[xdmi->mode]]; + } else { + // Note how often each mode chosen as best + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(xdmi)) { + vp9_update_mv_count(td); + + if (cm->interp_filter == SWITCHABLE) { + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; + } + } + + rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; + rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; + rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rdc->filter_diff[i] += ctx->best_filter_diff[i]; + } + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->ref_frame[0]; + mv->ref_frame[1] = mi->ref_frame[1]; + mv->mv[0].as_int = mi->mv[0].as_int; + mv->mv[1].as_int = mi->mv[1].as_int; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; + const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + int i; + + // Set current frame pointer. + x->e_mbd.cur_buf = src; + + for (i = 0; i < MAX_MB_PLANE; i++) + setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col, + NULL, x->e_mbd.plane[i].subsampling_x, + x->e_mbd.plane[i].subsampling_y); +} + +static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, + INTERP_FILTER interp_filter, + RD_COST *rd_cost, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + INTERP_FILTER filter_ref; + + filter_ref = get_pred_context_switchable_interp(xd); + if (interp_filter == BILINEAR) + filter_ref = BILINEAR; + else if (filter_ref == SWITCHABLE_FILTERS) + filter_ref = EIGHTTAP; + + mi->sb_type = bsize; + mi->mode = ZEROMV; + mi->tx_size = + VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[tx_mode]); + mi->skip = 1; + mi->uv_mode = DC_PRED; + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + mi->mv[0].as_int = 0; + mi->interp_filter = filter_ref; + + xd->mi[0]->bmi[0].as_mv[0].as_int = 0; + x->skip = 1; + + vp9_rd_cost_init(rd_cost); +} + +#if !CONFIG_REALTIME_ONLY +static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + + vp9_init_plane_quantizers(cpi, x); + vpx_clear_system_state(); + + if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) { + if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult; + } else if (aq_mode == PERCEPTUAL_AQ) { + x->rdmult = x->cb_rdmult; + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted( + get_segment_id(cm, map, bsize, mi_row, mi_col))) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } else { + x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + } + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +} + +static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int rate_in_best_rd, + int64_t dist_in_best_rd) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + int i, orig_rdmult; + int64_t best_rd = INT64_MAX; + + vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + + // Use the lower precision, but faster, 32x32 fdct for mode selection. + x->use_lp32x32fdct = 1; + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + mi = xd->mi[0]; + mi->sb_type = bsize; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; + } + ctx->is_coded = 0; + ctx->skippable = 0; + ctx->pred_pixel_ready = 0; + x->skip_recode = 0; + + // Set to zero to make sure we do not use the previous encoded frame stats + mi->skip = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->source_variance = vp9_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } +#else + x->source_variance = + vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Save rdmult before it might be changed, so it can be restored later. + orig_rdmult = x->rdmult; + + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { + double logvar = vp9_log_block_var(cpi, x, bsize); + // Check block complexity as part of decision on using pixel or transform + // domain distortion in rd tests. + x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && + (logvar >= cpi->sf.tx_domain_thresh); + + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. + x->log_block_src_var = logvar; + } else { + x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; + x->log_block_src_var = 0.0; + } + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode); + if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) { + best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd, + dist_in_best_rd); + } + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); + } else { + if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif + if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) + vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, + ctx, best_rd); + else + vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, + bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif + vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, + bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif + } + } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. + if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) && + (bsize >= BLOCK_16X16) && + (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { + vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. + if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX) + rd_cost->rdcost = INT64_MAX; + else + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + + x->rdmult = orig_rdmult; + + ctx->rate = rd_cost->rate; + ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif +} +#endif // !CONFIG_REALTIME_ONLY + +static void update_stats(VP9_COMMON *cm, ThreadData *td) { + const MACROBLOCK *x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const MODE_INFO *const mi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const BLOCK_SIZE bsize = mi->sb_type; + + if (!frame_is_intra_only(cm)) { + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mi); + const int seg_ref_active = + segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + counts->intra_inter[get_intra_inter_context(xd)][inter_block]++; + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0]; + if (cm->reference_mode == REFERENCE_MODE_SELECT) + counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] + [has_second_ref(mi)]++; + + if (has_second_ref(mi)) { + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1]; + counts->comp_ref[ctx][bit]++; + } else { + counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] + [ref0 != LAST_FRAME]++; + if (ref0 != LAST_FRAME) + counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] + [ref0 != GOLDEN_FRAME]++; + } + } + } + if (inter_block && + !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) { + const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]]; + if (bsize >= BLOCK_8X8) { + const PREDICTION_MODE mode = mi->mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; + } + } + } + } + } +} + +#if !CONFIG_REALTIME_ONLY +static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col, + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], + BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + for (p = 0; p < MAX_MB_PLANE; p++) { + memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(xd->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_seg_context + mi_col, sa, + sizeof(*xd->above_seg_context) * mi_width); + memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(xd->left_seg_context[0]) * mi_height); +} + +static void save_context(MACROBLOCK *const x, int mi_row, int mi_col, + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], + BLOCK_SIZE bsize) { + const MACROBLOCKD *const xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + + // buffer the above/left context information of the block in search. + for (p = 0; p < MAX_MB_PLANE; ++p) { + memcpy(a + num_4x4_blocks_wide * p, + xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(l + num_4x4_blocks_high * p, + xd->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(sa, xd->above_seg_context + mi_col, + sizeof(*xd->above_seg_context) * mi_width); + memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK), + sizeof(xd->left_seg_context[0]) * mi_height); +} + +static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td, + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCK *const x = &td->mb; + set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + + if (cpi->sf.enable_tpl_model && + (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + x->rdmult = x->cb_rdmult; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } + } + + update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); + encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); + + if (output_enabled) { + update_stats(&cpi->common, td); + + (*tp)->token = EOSB_TOKEN; + (*tp)++; + } +} + +static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + int ctx; + PARTITION_TYPE partition; + BLOCK_SIZE subsize = bsize; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (bsize >= BLOCK_8X8) { + ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + subsize = get_subsize(bsize, pc_tree->partitioning); + } else { + ctx = 0; + subsize = BLOCK_4X4; + } + + partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + td->counts->partition[ctx][partition]++; + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); + break; + case PARTITION_VERT: + encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, + subsize, &pc_tree->vertical[1]); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, + subsize, &pc_tree->horizontal[1]); + } + break; + default: + assert(partition == PARTITION_SPLIT); + if (bsize == BLOCK_8X8) { + encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->leaf_split[0]); + } else { + encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); + encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, + subsize, pc_tree->split[1]); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, + subsize, pc_tree->split[2]); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, + subsize, pc_tree->split[3]); + } + break; + } + + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} +#endif // !CONFIG_REALTIME_ONLY + +// Check to see if the given partition size is allowed for a specified number +// of 8x8 block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + if (rows_left <= 0 || cols_left <= 0) { + return VPXMIN(bsize, BLOCK_8X8); + } else { + for (; bsize > 0; bsize -= 3) { + *bh = num_8x8_blocks_high_lookup[bsize]; + *bw = num_8x8_blocks_wide_lookup[bsize]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return bsize; +} + +static void set_partial_b64x64_partition(MODE_INFO *mi, int mis, int bh_in, + int bw_in, int row8x8_remaining, + int col8x8_remaining, BLOCK_SIZE bsize, + MODE_INFO **mi_8x8) { + int bh = bh_in; + int r, c; + for (r = 0; r < MI_BLOCK_SIZE; r += bh) { + int bw = bw_in; + for (c = 0; c < MI_BLOCK_SIZE; c += bw) { + const int index = r * mis + c; + mi_8x8[index] = mi + index; + mi_8x8[index]->sb_type = find_partition_size( + bsize, row8x8_remaining - r, col8x8_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given SB64 +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. +static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mi_stride; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + int block_row, block_col; + MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; + int bh = num_8x8_blocks_high_lookup[bsize]; + int bw = num_8x8_blocks_wide_lookup[bsize]; + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // Apply the requested partition size to the SB64 if it is all "in image" + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + int index = block_row * mis + block_col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->sb_type = bsize; + } + } + } else { + // Else this is a partial SB64. + set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining, + col8x8_remaining, bsize, mi_8x8); + } +} + +static const struct { + int row; + int col; +} coord_lookup[16] = { + // 32x32 index = 0 + { 0, 0 }, + { 0, 2 }, + { 2, 0 }, + { 2, 2 }, + // 32x32 index = 1 + { 0, 4 }, + { 0, 6 }, + { 2, 4 }, + { 2, 6 }, + // 32x32 index = 2 + { 4, 0 }, + { 4, 2 }, + { 6, 0 }, + { 6, 2 }, + // 32x32 index = 3 + { 4, 4 }, + { 4, 6 }, + { 6, 4 }, + { 6, 6 }, +}; + +static void set_source_var_based_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MACROBLOCK *const x, + MODE_INFO **mi_8x8, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mi_stride; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; + + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // In-image SB64 + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + int i, j; + int index; + Diff d32[4]; + const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); + int is_larger_better = 0; + int use32x32 = 0; + unsigned int thr = cpi->source_var_thresh; + + memset(d32, 0, sizeof(d32)); + + for (i = 0; i < 4; i++) { + Diff *d16[4]; + + for (j = 0; j < 4; j++) { + int b_mi_row = coord_lookup[i * 4 + j].row; + int b_mi_col = coord_lookup[i * 4 + j].col; + int boffset = b_mi_row / 2 * cm->mb_cols + b_mi_col / 2; + + d16[j] = cpi->source_diff_var + offset + boffset; + + index = b_mi_row * mis + b_mi_col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->sb_type = BLOCK_16X16; + + // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition + // size to further improve quality. + } + + is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) && + (d16[2]->var < thr) && (d16[3]->var < thr); + + // Use 32x32 partition + if (is_larger_better) { + use32x32 += 1; + + for (j = 0; j < 4; j++) { + d32[i].sse += d16[j]->sse; + d32[i].sum += d16[j]->sum; + } + + d32[i].var = + (unsigned int)(d32[i].sse - + (unsigned int)(((int64_t)d32[i].sum * d32[i].sum) >> + 10)); + + index = coord_lookup[i * 4].row * mis + coord_lookup[i * 4].col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->sb_type = BLOCK_32X32; + } + } + + if (use32x32 == 4) { + thr <<= 1; + is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) && + (d32[2].var < thr) && (d32[3].var < thr); + + // Use 64x64 partition + if (is_larger_better) { + mi_8x8[0] = mi_upper_left; + mi_8x8[0]->sb_type = BLOCK_64X64; + } + } + } else { // partial in-image SB64 + int bh = num_8x8_blocks_high_lookup[BLOCK_16X16]; + int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16]; + set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining, + col8x8_remaining, BLOCK_16X16, mi_8x8); + } +} + +static void update_state_rt(VP9_COMP *cpi, ThreadData *td, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + int bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblock_plane *const p = x->plane; + const struct segmentation *const seg = &cm->seg; + const int bw = num_8x8_blocks_wide_lookup[mi->sb_type]; + const int bh = num_8x8_blocks_high_lookup[mi->sb_type]; + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + + *(xd->mi[0]) = ctx->mic; + *(x->mbmi_ext) = ctx->mbmi_ext; + + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled || + cpi->active_map.enabled)) { + // Setting segmentation map for cyclic_refresh. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { + vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip, p); + } else { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + vp9_init_plane_quantizers(cpi, x); + } + + if (is_inter_block(mi)) { + vp9_update_mv_count(td); + if (cm->interp_filter == SWITCHABLE) { + const int pred_ctx = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[pred_ctx][mi->interp_filter]; + } + + if (mi->sb_type < BLOCK_8X8) { + mi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } + } + + if (cm->use_prev_frame_mvs || !cm->error_resilient_mode || + (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 && + cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) { + MV_REF *const frame_mvs = + cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->ref_frame[0]; + mv->ref_frame[1] = mi->ref_frame[1]; + mv->mv[0].as_int = mi->mv[0].as_int; + mv->mv[1].as_int = mi->mv[1].as_int; + } + } + } + + x->skip = ctx->skip; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; +} + +static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, + const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, + int mi_col, int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCK *const x = &td->mb; + set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize); + + encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); + update_stats(&cpi->common, td); + + (*tp)->token = EOSB_TOKEN; + (*tp)++; +} + +static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, + const TileInfo *const tile, TOKENEXTRA **tp, + int mi_row, int mi_col, int output_enabled, + BLOCK_SIZE bsize, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + int ctx; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (bsize >= BLOCK_8X8) { + const int idx_str = xd->mi_stride * mi_row + mi_col; + MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; + ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + subsize = mi_8x8[0]->sb_type; + } else { + ctx = 0; + subsize = BLOCK_4X4; + } + + partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + td->counts->partition[ctx][partition]++; + + switch (partition) { + case PARTITION_NONE: + encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); + break; + case PARTITION_VERT: + encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, + subsize, &pc_tree->vertical[1]); + } + break; + case PARTITION_HORZ: + encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, + subsize, &pc_tree->horizontal[1]); + } + break; + default: + assert(partition == PARTITION_SPLIT); + subsize = get_subsize(bsize, PARTITION_SPLIT); + encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); + encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, + subsize, pc_tree->split[1]); + encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, + subsize, pc_tree->split[2]); + encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, + output_enabled, subsize, pc_tree->split[3]); + break; + } + + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} + +#if !CONFIG_REALTIME_ONLY +static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MODE_INFO **mi_8x8, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, + int do_recon, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mis = cm->mi_stride; + const int bsl = b_width_log2_lookup[bsize]; + const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2; + const int bss = (1 << bsl) / 4; + int i, pl; + PARTITION_TYPE partition = PARTITION_NONE; + BLOCK_SIZE subsize; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + RD_COST last_part_rdc, none_rdc, chosen_rdc; + BLOCK_SIZE sub_subsize = BLOCK_4X4; + int splits_below = 0; + BLOCK_SIZE bs_type = mi_8x8[0]->sb_type; + int do_partition_search = 1; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + assert(num_4x4_blocks_wide_lookup[bsize] == + num_4x4_blocks_high_lookup[bsize]); + + vp9_rd_cost_reset(&last_part_rdc); + vp9_rd_cost_reset(&none_rdc); + vp9_rd_cost_reset(&chosen_rdc); + + partition = partition_lookup[bsl][bs_type]; + subsize = get_subsize(bsize, partition); + + pc_tree->partitioning = partition; + save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + x->mb_energy = vp9_block_energy(cpi, x, bsize); + } + + if (do_partition_search && + cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.adjust_partitioning_from_last_frame) { + // Check if any of the sub blocks are further split. + if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { + sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + splits_below = 1; + for (i = 0; i < 4; i++) { + int jj = i >> 1, ii = i & 0x01; + MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss]; + if (this_mi && this_mi->sb_type >= sub_subsize) { + splits_below = 0; + } + } + } + + // If partition is not none try none unless each of the 4 splits are split + // even further.. + if (partition != PARTITION_NONE && !splits_below && + mi_row + (mi_step >> 1) < cm->mi_rows && + mi_col + (mi_step >> 1) < cm->mi_cols) { + pc_tree->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx, + INT_MAX, INT64_MAX); + + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist); + } + + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + mi_8x8[0]->sb_type = bs_type; + pc_tree->partitioning = partition; + } + } + + switch (partition) { + case PARTITION_NONE: + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize, + ctx, INT_MAX, INT64_MAX); + break; + case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + (mi_step >> 1) < cm->mi_rows) { + RD_COST tmp_rdc; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + vp9_rd_cost_init(&tmp_rdc); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, + &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, + INT64_MAX); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + vp9_rd_cost_reset(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + (mi_step >> 1) < cm->mi_cols) { + RD_COST tmp_rdc; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; + vp9_rd_cost_init(&tmp_rdc); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; + rd_pick_sb_modes( + cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + vp9_rd_cost_reset(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + default: + assert(partition == PARTITION_SPLIT); + if (bsize == BLOCK_8X8) { + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX); + break; + } + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * (mi_step >> 1); + int y_idx = (i >> 1) * (mi_step >> 1); + int jj = i >> 1, ii = i & 0x01; + RD_COST tmp_rdc; + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + vp9_rd_cost_init(&tmp_rdc); + rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss, + tp, mi_row + y_idx, mi_col + x_idx, subsize, + &tmp_rdc.rate, &tmp_rdc.dist, i != 3, + pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + vp9_rd_cost_reset(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + } + break; + } + + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += cpi->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist); + } + + if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame && + cpi->sf.partition_search_type == SEARCH_PARTITION && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + mi_step < cm->mi_rows || + mi_row + (mi_step >> 1) == cm->mi_rows) && + (mi_col + mi_step < cm->mi_cols || + mi_col + (mi_step >> 1) == cm->mi_cols)) { + BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * (mi_step >> 1); + int y_idx = (i >> 1) * (mi_step >> 1); + RD_COST tmp_rdc; + + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + pc_tree->split[i]->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &tmp_rdc, split_subsize, &pc_tree->split[i]->none, + INT_MAX, INT64_MAX); + + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + vp9_rd_cost_reset(&chosen_rdc); + break; + } + + chosen_rdc.rate += tmp_rdc.rate; + chosen_rdc.dist += tmp_rdc.dist; + + if (i != 3) + encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, + split_subsize, pc_tree->split[i]); + + pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, + split_subsize); + chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + } + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + if (chosen_rdc.rate < INT_MAX) { + chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + chosen_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist); + } + } + + // If last_part is better set the partitioning to that. + if (last_part_rdc.rdcost < chosen_rdc.rdcost) { + mi_8x8[0]->sb_type = bsize; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; + chosen_rdc = last_part_rdc; + } + // If none was better set the partitioning to that. + if (none_rdc.rdcost < chosen_rdc.rdcost) { + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + chosen_rdc = none_rdc; + } + + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. + if (bsize == BLOCK_64X64) + assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); + + if (do_recon) { + int output_enabled = (bsize == BLOCK_64X64); + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, + pc_tree); + } + + *rate = chosen_rdc.rate; + *dist = chosen_rdc.dist; +} + +static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 +}; + +static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, + BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 +}; + +// Look at all the mode_info entries for blocks that are part of this +// partition and find the min and max values for sb_type. +// At the moment this is designed to work on a 64x64 SB but could be +// adjusted to use a size parameter. +// +// The min and max are assumed to have been initialized prior to calling this +// function so repeat calls can accumulate a min and max of more than one sb64. +static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size, + int bs_hist[BLOCK_SIZES]) { + int sb_width_in_blocks = MI_BLOCK_SIZE; + int sb_height_in_blocks = MI_BLOCK_SIZE; + int i, j; + int index = 0; + + // Check the sb_type for each block that belongs to this region. + for (i = 0; i < sb_height_in_blocks; ++i) { + for (j = 0; j < sb_width_in_blocks; ++j) { + MODE_INFO *mi = mi_8x8[index + j]; + BLOCK_SIZE sb_type = mi ? mi->sb_type : 0; + bs_hist[sb_type]++; + *min_block_size = VPXMIN(*min_block_size, sb_type); + *max_block_size = VPXMAX(*max_block_size, sb_type); + } + index += xd->mi_stride; + } +} + +// Next square block size less or equal than current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, + BLOCK_32X32, BLOCK_32X32, BLOCK_64X64 +}; + +// Look at neighboring blocks and set a min and max partition size based on +// what they chose. +static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi = xd->mi; + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + int bh, bw; + BLOCK_SIZE min_size = BLOCK_4X4; + BLOCK_SIZE max_size = BLOCK_64X64; + int bs_hist[BLOCK_SIZES] = { 0 }; + + // Trap case where we do not have a prediction. + if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) { + // Default "min to max" and "max to min" + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + // NOTE: each call to get_sb_partition_size_range() uses the previous + // passed in values for min and max as a starting point. + // Find the min and max partition used in previous frame at this location + if (cm->frame_type != KEY_FRAME) { + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; + get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist); + } + // Find the min and max partition sizes used in the left SB64 + if (left_in_image) { + MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size, + bs_hist); + } + // Find the min and max partition sizes used in the above SB64. + if (above_in_image) { + MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size, + bs_hist); + } + + // Adjust observed min and max for "relaxed" auto partition case. + if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + } + + // Check border cases where max and min from neighbors may not be legal. + max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining, + &bh, &bw); + // Test for blocks at the edge of the active image. + // This may be the actual edge of the image or where there are formatting + // bars. + if (vp9_active_edge_sb(cpi, mi_row, mi_col)) { + min_size = BLOCK_4X4; + } else { + min_size = + VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size)); + } + + // When use_square_partition_only is true, make sure at least one square + // partition is allowed by selecting the next smaller square size as + // *min_block_size. + if (cpi->sf.use_square_partition_only && + next_square_size[max_size] < min_size) { + min_size = next_square_size[max_size]; + } + + *min_block_size = min_size; + *max_block_size = max_size; +} + +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; + + MODE_INFO *mi; + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str]; + BLOCK_SIZE bs, min_size, max_size; + + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); + } + } + + if (xd->left_mi) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); + } + } + + if (xd->above_mi) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +static void nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + int layer, node, i; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = VPXMAX(val, 0.0f); + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + { + const float *weights = nn_config->weights[num_layers]; + for (node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } + } +} + +#if !CONFIG_REALTIME_ONLY +#define FEATURES 7 +// Machine-learning based partition search early termination. +// Return 1 to skip split and rect partitions. +static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int mag_mv = + abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; + int above_par = 0; // above_partitioning + int left_par = 0; // left_partitioning + int last_par = 0; // last_partitioning + int offset = 0; + int i; + BLOCK_SIZE context_size; + const NN_CONFIG *nn_config = NULL; + const float *mean, *sd, *linear_weights; + float nn_score, linear_score; + float features[FEATURES]; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + vpx_clear_system_state(); + + switch (bsize) { + case BLOCK_64X64: + offset = 0; + nn_config = &vp9_partition_nnconfig_64x64; + break; + case BLOCK_32X32: + offset = 8; + nn_config = &vp9_partition_nnconfig_32x32; + break; + case BLOCK_16X16: + offset = 16; + nn_config = &vp9_partition_nnconfig_16x16; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + + if (above_in_image) { + context_size = xd->above_mi->sb_type; + if (context_size < bsize) + above_par = 2; + else if (context_size == bsize) + above_par = 1; + } + + if (left_in_image) { + context_size = xd->left_mi->sb_type; + if (context_size < bsize) + left_par = 2; + else if (context_size == bsize) + left_par = 1; + } + + if (prev_mi[0]) { + context_size = prev_mi[0]->sb_type; + if (context_size < bsize) + last_par = 2; + else if (context_size == bsize) + last_par = 1; + } + + mean = &vp9_partition_feature_mean[offset]; + sd = &vp9_partition_feature_std[offset]; + features[0] = ((float)ctx->rate - mean[0]) / sd[0]; + features[1] = ((float)ctx->dist - mean[1]) / sd[1]; + features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2]; + features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3]; + features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4]; + features[5] = ((float)cm->base_qindex - mean[5]) * sd[5]; + features[6] = ((float)last_par - mean[6]) * sd[6]; + + // Predict using linear model. + linear_weights = &vp9_partition_linear_weights[offset]; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + if (linear_score > 0.1f) return 0; + + // Predict using neural net model. + nn_predict(features, nn_config, &nn_score); + + if (linear_score < -0.0f && nn_score < 0.1f) return 1; + if (nn_score < -0.0f && linear_score < 0.1f) return 1; + return 0; +} +#undef FEATURES + +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); + const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720; + const int resolution_ctx = is_720p_or_larger ? 1 : 0; + + switch (bsize) { + case BLOCK_64X64: + linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx]; + break; + case BLOCK_32X32: + linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx]; + break; + case BLOCK_16X16: + linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx]; + break; + case BLOCK_8X8: + linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx]; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; + + { // Generate feature values. +#if CONFIG_VP9_HIGHBITDEPTH + const int ac_q = + vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; + +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd); + } else { + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); +#endif + var = var >> num_pels_log2; + + vpx_clear_system_state(); + + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } + + { // Calculate the output score. + int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } + + return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx]; +} +#undef FEATURES + +#define FEATURES 8 +#define LABELS 4 +static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, + const PC_TREE *const pc_tree, + int *allow_horz, int *allow_vert, + int64_t ref_rd) { + const NN_CONFIG *nn_config = NULL; + float score[LABELS] = { + 0.0f, + }; + int thresh = -1; + int i; + (void)x; + + if (ref_rd <= 0 || ref_rd > 1000000000) return; + + switch (bsize) { + case BLOCK_8X8: break; + case BLOCK_16X16: + nn_config = &vp9_rect_part_nnconfig_16; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &vp9_rect_part_nnconfig_32; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &vp9_rect_part_nnconfig_64; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3]; + break; + default: assert(0 && "Unexpected block size."); return; + } + if (!nn_config || thresh < 0) return; + + // Feature extraction and model score calculation. + { + const VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + int feature_index = 0; + float features[FEATURES]; + + features[feature_index++] = logf((float)dc_q + 1.0f); + features[feature_index++] = + (float)(pc_tree->partitioning == PARTITION_NONE); + features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f); + + { + const float norm_factor = 1.0f / ((float)ref_rd + 1.0f); + const int64_t none_rdcost = pc_tree->none.rdcost; + float rd_ratio = 2.0f; + if (none_rdcost > 0 && none_rdcost < 1000000000) + rd_ratio = (float)none_rdcost * norm_factor; + features[feature_index++] = VPXMIN(rd_ratio, 2.0f); + + for (i = 0; i < 4; ++i) { + const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int rd_valid = this_rd > 0 && this_rd < 1000000000; + // Ratio between sub-block RD and whole block RD. + features[feature_index++] = + rd_valid ? (float)this_rd * norm_factor : 1.0f; + } + } + + assert(feature_index == FEATURES); + nn_predict(features, nn_config, score); + } + + // Make decisions based on the model score. + { + int max_score = -1000; + int horz = 0, vert = 0; + int int_score[LABELS]; + for (i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = VPXMAX(int_score[i], max_score); + } + thresh = max_score - thresh; + for (i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) horz = 1; + if ((i >> 1) & 1) vert = 1; + } + } + *allow_horz = *allow_horz && horz; + *allow_vert = *allow_vert && vert; + } +} +#undef FEATURES +#undef LABELS + +// Perform fast and coarse motion search for the given block. This is a +// pre-processing step for the ML based partition search speedup. +static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV ref_mv, MV_REFERENCE_FRAME ref, + uint8_t *const pred_buf) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12; + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; + MV best_mv = { 0, 0 }; + int cost_list[5]; + struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } }; + + if (scaled_ref_frame) { + yv12 = scaled_ref_frame; + // As reported in b/311294795, the reference buffer pointer needs to be + // saved and restored after the search. Otherwise, it causes problems while + // the reference frame scaling happens. + for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0]; + } else { + yv12 = get_ref_frame_buffer(cpi, ref); + } + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); + mi->ref_frame[0] = ref; + mi->ref_frame[1] = NO_REF_FRAME; + mi->sb_type = bsize; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, + &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + // Restore reference buffer pointer. + if (scaled_ref_frame) { + for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i]; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); +} + +// Use a neural net model to prune partition-none and partition-split search. +// Features used: QP; spatial block size contexts; variance of prediction +// residue after simple_motion_search. +#define FEATURES 12 +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + const VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); + uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? (CONVERT_TO_BYTEPTR(pred_buffer)) + : pred_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]); + uint8_t *const pred_buf = pred_buffer; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int speed = cpi->oxcf.speed; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_part_split_nnconfig_64; + thresh = speed > 0 ? 2.8f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_part_split_nnconfig_32; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_part_split_nnconfig_16; + thresh = speed > 0 ? 3.8f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_part_split_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.8f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + MV ref_mv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + // If bsize is 64x64, use zero MV as reference; otherwise, use MV result + // of previous(larger) block as reference. + if (bsize == BLOCK_64X64) + ref_mv.row = ref_mv.col = 0; + else + ref_mv = pc_tree->mv; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf); + pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv; + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)dc_q + 1.0f); + + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + const int has_above = !!xd->above_mi; + const int has_left = !!xd->left_mi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize; + int i; + + features[feature_idx++] = (float)has_above; + features[feature_idx++] = (float)b_width_log2_lookup[above_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[above_bsize]; + features[feature_idx++] = (float)has_left; + features[feature_idx++] = (float)b_width_log2_lookup[left_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[left_bsize]; + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. + if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY + +static double log_wiener_var(int64_t wiener_variance) { + return log(1.0 + wiener_variance) / log(2.0); +} + +static void build_kmeans_segmentation(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + BLOCK_SIZE bsize = BLOCK_64X64; + KMEANS_DATA *kmeans_data; + + vp9_disable_segmentation(&cm->seg); + if (cm->show_frame) { + int mi_row, mi_col; + cpi->kmeans_data_size = 0; + cpi->kmeans_ctr_num = 8; + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = VPXMIN( + (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = VPXMIN( + (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col; + int64_t wiener_variance = 0; + + for (row = mb_row_start; row < mb_row_end; ++row) + for (col = mb_col_start; col < mb_col_end; ++col) + wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col]; + + wiener_variance /= + (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + + kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++]; + kmeans_data->value = log_wiener_var(wiener_variance); + kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + } + } + + vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls, + cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr, + cpi->kmeans_data_size); + + vp9_perceptual_aq_mode_setup(cpi, &cm->seg); + } +} + +#if !CONFIG_REALTIME_ONLY +static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *cm = &cpi->common; + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = + VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = + VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col, idx; + int64_t wiener_variance = 0; + int segment_id; + int8_t seg_hist[MAX_SEGMENTS] = { 0 }; + int8_t max_count = 0, max_index = -1; + + vpx_clear_system_state(); + + assert(cpi->norm_wiener_variance > 0); + + for (row = mb_row_start; row < mb_row_end; ++row) { + for (col = mb_col_start; col < mb_col_end; ++col) { + wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col]; + segment_id = + vp9_get_group_idx(log_wiener_var(wiener_variance), + cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num); + ++seg_hist[segment_id]; + } + } + + for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) { + if (seg_hist[idx] > max_count) { + max_count = seg_hist[idx]; + max_index = idx; + } + } + + assert(max_index >= 0); + segment_id = max_index; + + return segment_id; +} + +static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + const int gf_group_index = cpi->twopass.gf_group.index; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + int mi_wide = num_8x8_blocks_wide_lookup[bsize]; + int mi_high = num_8x8_blocks_high_lookup[bsize]; + int row, col; + + int dr = 0; + double r0, rk, beta; + + TplDepFrame *tpl_frame; + TplDepStats *tpl_stats; + int tpl_stride; + + if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult; + tpl_frame = &cpi->tpl_stats[gf_group_index]; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + tpl_stats = tpl_frame->tpl_stats_ptr; + tpl_stride = tpl_frame->stride; + + if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult; + + for (row = mi_row; row < mi_row + mi_high; ++row) { + for (col = mi_col; col < mi_col + mi_wide; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + + if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue; + + intra_cost += this_stats->intra_cost; + mc_dep_cost += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + r0 = cpi->rd.r0; + rk = (double)intra_cost / mc_dep_cost; + beta = r0 / rk; + dr = vp9_get_adaptive_rdmult(cpi, beta); + + dr = VPXMIN(dr, orig_rdmult * 3 / 2); + dr = VPXMAX(dr, orig_rdmult * 1 / 2); + + dr = VPXMAX(1, dr); + + return dr; +} +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_RATE_CTRL +static void assign_partition_info( + const int row_start_4x4, const int col_start_4x4, const int block_width_4x4, + const int block_height_4x4, const int num_unit_rows, + const int num_unit_cols, PARTITION_INFO *partition_info) { + int i, j; + for (i = 0; i < block_height_4x4; ++i) { + for (j = 0; j < block_width_4x4; ++j) { + const int row_4x4 = row_start_4x4 + i; + const int col_4x4 = col_start_4x4 + j; + const int unit_index = row_4x4 * num_unit_cols + col_4x4; + if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; + partition_info[unit_index].row = row_4x4 << 2; + partition_info[unit_index].column = col_4x4 << 2; + partition_info[unit_index].row_start = row_start_4x4 << 2; + partition_info[unit_index].column_start = col_start_4x4 << 2; + partition_info[unit_index].width = block_width_4x4 << 2; + partition_info[unit_index].height = block_height_4x4 << 2; + } + } +} + +static void assign_motion_vector_info(const int block_width_4x4, + const int block_height_4x4, + const int row_start_4x4, + const int col_start_4x4, + const int num_unit_rows, + const int num_unit_cols, MV *source_mv[2], + MV_REFERENCE_FRAME source_ref_frame[2], + MOTION_VECTOR_INFO *motion_vector_info) { + int i, j; + for (i = 0; i < block_height_4x4; ++i) { + for (j = 0; j < block_width_4x4; ++j) { + const int row_4x4 = row_start_4x4 + i; + const int col_4x4 = col_start_4x4 + j; + const int unit_index = row_4x4 * num_unit_cols + col_4x4; + if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; + if (source_ref_frame[1] == NO_REF_FRAME) { + assert(source_mv[1]->row == 0 && source_mv[1]->col == 0); + } + motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0]; + motion_vector_info[unit_index].ref_frame[1] = source_ref_frame[1]; + motion_vector_info[unit_index].mv[0].as_mv.row = source_mv[0]->row; + motion_vector_info[unit_index].mv[0].as_mv.col = source_mv[0]->col; + motion_vector_info[unit_index].mv[1].as_mv.row = source_mv[1]->row; + motion_vector_info[unit_index].mv[1].as_mv.col = source_mv[1]->col; + } + } +} + +static void store_superblock_info( + const PC_TREE *const pc_tree, MODE_INFO **mi_grid_visible, + const int mi_stride, const int square_size_4x4, const int num_unit_rows, + const int num_unit_cols, const int row_start_4x4, const int col_start_4x4, + PARTITION_INFO *partition_info, MOTION_VECTOR_INFO *motion_vector_info) { + const int subblock_square_size_4x4 = square_size_4x4 >> 1; + if (row_start_4x4 >= num_unit_rows || col_start_4x4 >= num_unit_cols) return; + assert(pc_tree->partitioning != PARTITION_INVALID); + // End node, no split. + if (pc_tree->partitioning == PARTITION_NONE || + pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_VERT || square_size_4x4 == 1) { + const int mi_row = row_start_4x4 >> 1; + const int mi_col = col_start_4x4 >> 1; + const int mi_idx = mi_stride * mi_row + mi_col; + MODE_INFO **mi = mi_grid_visible + mi_idx; + MV *source_mv[2]; + MV_REFERENCE_FRAME source_ref_frame[2]; + + // partition info + const int block_width_4x4 = (pc_tree->partitioning == PARTITION_VERT) + ? square_size_4x4 >> 1 + : square_size_4x4; + const int block_height_4x4 = (pc_tree->partitioning == PARTITION_HORZ) + ? square_size_4x4 >> 1 + : square_size_4x4; + assign_partition_info(row_start_4x4, col_start_4x4, block_width_4x4, + block_height_4x4, num_unit_rows, num_unit_cols, + partition_info); + if (pc_tree->partitioning == PARTITION_VERT) { + assign_partition_info(row_start_4x4, col_start_4x4 + block_width_4x4, + block_width_4x4, block_height_4x4, num_unit_rows, + num_unit_cols, partition_info); + } else if (pc_tree->partitioning == PARTITION_HORZ) { + assign_partition_info(row_start_4x4 + block_height_4x4, col_start_4x4, + block_width_4x4, block_height_4x4, num_unit_rows, + num_unit_cols, partition_info); + } + + // motion vector info + if (pc_tree->partitioning == PARTITION_HORZ) { + int is_valid_second_rectangle = 0; + assert(square_size_4x4 > 1); + // First rectangle. + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->mv[0].as_mv; + source_mv[1] = &mi[0]->mv[1].as_mv; + assign_motion_vector_info(block_width_4x4, block_height_4x4, + row_start_4x4, col_start_4x4, num_unit_rows, + num_unit_cols, source_mv, source_ref_frame, + motion_vector_info); + // Second rectangle. + if (square_size_4x4 == 2) { + is_valid_second_rectangle = 1; + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->bmi[2].as_mv[0].as_mv; + source_mv[1] = &mi[0]->bmi[2].as_mv[1].as_mv; + } else { + const int mi_row_2 = mi_row + (block_height_4x4 >> 1); + const int mi_col_2 = mi_col; + if (mi_row_2 * 2 < num_unit_rows && mi_col_2 * 2 < num_unit_cols) { + const int mi_idx_2 = mi_stride * mi_row_2 + mi_col_2; + is_valid_second_rectangle = 1; + mi = mi_grid_visible + mi_idx_2; + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->mv[0].as_mv; + source_mv[1] = &mi[0]->mv[1].as_mv; + } + } + if (is_valid_second_rectangle) { + assign_motion_vector_info( + block_width_4x4, block_height_4x4, row_start_4x4 + block_height_4x4, + col_start_4x4, num_unit_rows, num_unit_cols, source_mv, + source_ref_frame, motion_vector_info); + } + } else if (pc_tree->partitioning == PARTITION_VERT) { + int is_valid_second_rectangle = 0; + assert(square_size_4x4 > 1); + // First rectangle. + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->mv[0].as_mv; + source_mv[1] = &mi[0]->mv[1].as_mv; + assign_motion_vector_info(block_width_4x4, block_height_4x4, + row_start_4x4, col_start_4x4, num_unit_rows, + num_unit_cols, source_mv, source_ref_frame, + motion_vector_info); + // Second rectangle. + if (square_size_4x4 == 2) { + is_valid_second_rectangle = 1; + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->bmi[1].as_mv[0].as_mv; + source_mv[1] = &mi[0]->bmi[1].as_mv[1].as_mv; + } else { + const int mi_row_2 = mi_row; + const int mi_col_2 = mi_col + (block_width_4x4 >> 1); + if (mi_row_2 * 2 < num_unit_rows && mi_col_2 * 2 < num_unit_cols) { + const int mi_idx_2 = mi_stride * mi_row_2 + mi_col_2; + is_valid_second_rectangle = 1; + mi = mi_grid_visible + mi_idx_2; + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + source_mv[0] = &mi[0]->mv[0].as_mv; + source_mv[1] = &mi[0]->mv[1].as_mv; + } + } + if (is_valid_second_rectangle) { + assign_motion_vector_info( + block_width_4x4, block_height_4x4, row_start_4x4, + col_start_4x4 + block_width_4x4, num_unit_rows, num_unit_cols, + source_mv, source_ref_frame, motion_vector_info); + } + } else { + assert(pc_tree->partitioning == PARTITION_NONE || square_size_4x4 == 1); + source_ref_frame[0] = mi[0]->ref_frame[0]; + source_ref_frame[1] = mi[0]->ref_frame[1]; + if (square_size_4x4 == 1) { + const int sub8x8_row = row_start_4x4 % 2; + const int sub8x8_col = col_start_4x4 % 2; + const int sub8x8_idx = sub8x8_row * 2 + sub8x8_col; + source_mv[0] = &mi[0]->bmi[sub8x8_idx].as_mv[0].as_mv; + source_mv[1] = &mi[0]->bmi[sub8x8_idx].as_mv[1].as_mv; + } else { + source_mv[0] = &mi[0]->mv[0].as_mv; + source_mv[1] = &mi[0]->mv[1].as_mv; + } + assign_motion_vector_info(block_width_4x4, block_height_4x4, + row_start_4x4, col_start_4x4, num_unit_rows, + num_unit_cols, source_mv, source_ref_frame, + motion_vector_info); + } + + return; + } + // recursively traverse partition tree when partition is split. + assert(pc_tree->partitioning == PARTITION_SPLIT); + store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride, + subblock_square_size_4x4, num_unit_rows, num_unit_cols, + row_start_4x4, col_start_4x4, partition_info, + motion_vector_info); + store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride, + subblock_square_size_4x4, num_unit_rows, num_unit_cols, + row_start_4x4, col_start_4x4 + subblock_square_size_4x4, + partition_info, motion_vector_info); + store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride, + subblock_square_size_4x4, num_unit_rows, num_unit_cols, + row_start_4x4 + subblock_square_size_4x4, col_start_4x4, + partition_info, motion_vector_info); + store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride, + subblock_square_size_4x4, num_unit_rows, num_unit_cols, + row_start_4x4 + subblock_square_size_4x4, + col_start_4x4 + subblock_square_size_4x4, + partition_info, motion_vector_info); +} +#endif // CONFIG_RATE_CTRL + +#if !CONFIG_REALTIME_ONLY +// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// unlikely to be selected depending on previous rate-distortion optimization +// results, for encoding speed-up. +static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_COST *rd_cost, RD_COST best_rdc, + PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + TOKENEXTRA *tp_orig = *tp; + PICK_MODE_CONTEXT *const ctx = &pc_tree->none; + int i; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + BLOCK_SIZE subsize; + RD_COST this_rdc, sum_rdc; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + INTERP_FILTER pred_interp_filter; + + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); + const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + BLOCK_SIZE min_size = x->min_partition_size; + BLOCK_SIZE max_size = x->max_partition_size; + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = + !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; + int partition_vert_allowed = + !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; + int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + int must_split = 0; + int should_encode_sb = 0; + + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; + + int partition_mul = x->cb_rdmult; + + (void)*tp_orig; + + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); + + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + + vp9_rd_cost_init(&this_rdc); + vp9_rd_cost_init(&sum_rdc); + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul); + } + vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ && + cpi->oxcf.aq_mode != LOOKAHEAD_AQ) + x->mb_energy = vp9_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = + ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); + } + + // Get sub block energy range + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + must_split = (min_energy < -3) && (max_energy - min_energy > 2); + } + + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. + if (cpi->sf.auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= max_size); + partition_horz_allowed &= + ((bsize <= max_size && bsize > min_size) || force_horz_split); + partition_vert_allowed &= + ((bsize <= max_size && bsize > min_size) || force_vert_split); + do_split &= bsize > min_size; + } + + if (cpi->sf.use_square_partition_only && + (bsize > cpi->sf.use_square_only_thresh_high || + bsize < cpi->sf.use_square_only_thresh_low)) { + if (cpi->use_svc) { + if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_horz_allowed &= force_horz_split; + if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_vert_allowed &= force_vert_split; + } else { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } + } + + save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + + pc_tree->partitioning = PARTITION_NONE; + + if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { + const int do_rd_ml_partition_var_pruning = + partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; + if (do_rd_ml_partition_var_pruning) { + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } else { + vp9_zero(pc_tree->mv); + } + if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. + for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; + } + } + + // PARTITION_NONE + if (partition_none_allowed) { + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, + best_rdc.rate, best_rdc.dist); + ctx->rdcost = this_rdc.rdcost; + if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + if (bsize >= BLOCK_8X8) { + this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + } + + if (this_rdc.rdcost < best_rdc.rdcost) { + MODE_INFO *mi = xd->mi[0]; + + best_rdc = this_rdc; + should_encode_sb = 1; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + + if (cpi->sf.rd_ml_partition.search_early_termination) { + // Currently, the machine-learning based partition search early + // termination is only used while bsize is 16x16, 32x32 or 64x64, + // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. + if (!x->e_mbd.lossless && + !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && + ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { + if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) { + do_split = 0; + do_rect = 0; + } + } + } + + if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) { + const int use_ml_based_breakout = + cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { + do_split = 0; + do_rect = 0; + } + } else { + if (!cpi->sf.rd_ml_partition.search_early_termination) { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } + } + } + } + } + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; + } + + // store estimated motion vector + store_pred_mv(x, ctx); + + // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an + // intra block and used for context purposes. + if (ctx->mic.interp_filter == SWITCHABLE_FILTERS) { + pred_interp_filter = EIGHTTAP; + } else { + pred_interp_filter = ctx->mic.interp_filter; + } + + // PARTITION_SPLIT + // TODO(jingning): use the motion vectors given by the above search as + // the starting point of motion search in the following partition type check. + pc_tree->split[0]->none.rdcost = 0; + pc_tree->split[1]->none.rdcost = 0; + pc_tree->split[2]->none.rdcost = 0; + pc_tree->split[3]->none.rdcost = 0; + if (do_split || must_split) { + subsize = get_subsize(bsize, PARTITION_SPLIT); + load_pred_mv(x, ctx); + if (bsize == BLOCK_8X8) { + i = 4; + if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) + pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, + pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist); + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } + } else { + for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); + ++i) { + const int x_idx = (i & 1) * mi_step; + const int y_idx = (i >> 1) * mi_step; + int found_best_rd = 0; + RD_COST best_rdc_split; + vp9_rd_cost_reset(&best_rdc_split); + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + // A must split test here increases the number of sub + // partitions but hurts metrics results quite a bit, + // so this extra test is commented out pending + // further tests on whether it adds much in terms of + // visual quality. + // (must_split) ? best_rdc.rate + // : best_rdc.rate - sum_rdc.rate, + // (must_split) ? best_rdc.dist + // : best_rdc.dist - sum_rdc.dist, + best_rdc_split.rate = best_rdc.rate - sum_rdc.rate; + best_rdc_split.dist = best_rdc.dist - sum_rdc.dist; + } + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + + pc_tree->split[i]->index = i; + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[i]->none.rate = INT_MAX; + found_best_rd = rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, best_rdc_split, pc_tree->split[i]); + + if (found_best_rd == 0) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + } + } + + if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) { + sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + + if ((sum_rdc.rdcost < best_rdc.rdcost) || + (must_split && (sum_rdc.dist < best_rdc.dist))) { + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_SPLIT; + + // Rate and distortion based partition search termination clause. + if (!cpi->sf.rd_ml_partition.search_early_termination && + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { + do_rect = 0; + } + } + } else { + // skip rectangular partition test when larger block size + // gives better rd cost + if (cpi->sf.less_rectangular_check && + (bsize > cpi->sf.use_square_only_thresh_high || + best_rdc.dist < dist_breakout_thr)) + do_rect &= !partition_none_allowed; + } + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } + + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) { + pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) { + pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) { + pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) { + pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames & 0xff; + } + } + + { + const int do_ml_rect_partition_pruning = + !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split && + (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8; + if (do_ml_rect_partition_pruning) { + ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed, + &partition_vert_allowed, best_rdc.rdcost); + } + } + + // PARTITION_HORZ + if (partition_horz_allowed && + (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ]; + subsize = get_subsize(bsize, PARTITION_HORZ); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, + &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + + if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && + bsize > BLOCK_8X8) { + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; + rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, + subsize, &pc_tree->horizontal[1], + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_HORZ; + + if (cpi->sf.less_rectangular_check && + bsize > cpi->sf.use_square_only_thresh_high) + do_rect = 0; + } + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } + + // PARTITION_VERT + if (partition_vert_allowed && + (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT]; + subsize = get_subsize(bsize, PARTITION_VERT); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[0].pred_interp_filter = pred_interp_filter; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, + &pc_tree->vertical[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && + bsize > BLOCK_8X8) { + update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, + &pc_tree->vertical[0]); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[1].pred_interp_filter = pred_interp_filter; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, + subsize, &pc_tree->vertical[1], + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_VERT; + } + restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } + + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + + *rd_cost = best_rdc; + + if (should_encode_sb && pc_tree->index != 3) { + int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, + pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif +#if CONFIG_RATE_CTRL + if (oxcf->use_simple_encode_api) { + // Store partition, motion vector of the superblock. + if (output_enabled) { + const int num_unit_rows = + get_num_unit_4x4(cpi->frame_info.frame_height); + const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width); + store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride, + num_4x4_blocks_wide_lookup[BLOCK_64X64], + num_unit_rows, num_unit_cols, mi_row << 1, + mi_col << 1, cpi->partition_info, + cpi->motion_vector_info); + } + } +#endif // CONFIG_RATE_CTRL + } + + if (bsize == BLOCK_64X64) { + assert(tp_orig < *tp); + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } + + return should_encode_sb; +} + +static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TOKENEXTRA **tp) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; + const int mi_col_start = tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; + + // Initialize the left context for the new SB row + memset(&xd->left_context, 0, sizeof(xd->left_context)); + memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); + + // Code each SB in the row + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { + const struct segmentation *const seg = &cm->seg; + int dummy_rate; + int64_t dummy_dist; + RD_COST dummy_rdc; + int i; + int seg_skip = 0; + int orig_rdmult = cpi->rd.RDMULT; + + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + + vp9_rd_cost_reset(&dummy_rdc); + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + + if (sf->adaptive_pred_interp_filter) { + for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; + + for (i = 0; i < 64; ++i) { + td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; + } + } + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } + td->pc_root->index = 0; + + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + + x->source_variance = UINT_MAX; + + x->cb_rdmult = orig_rdmult; + + if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { + const BLOCK_SIZE bsize = + seg_skip ? BLOCK_64X64 : sf->always_this_block_size; + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, td->pc_root); + } else if (sf->partition_search_type == VAR_BASED_PARTITION && + cm->frame_type != KEY_FRAME) { + choose_partitioning(cpi, tile_info, x, mi_row, mi_col); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, td->pc_root); + } else { + if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) { + int dr = + get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult); + x->cb_rdmult = dr; + } + + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) { + x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col); + x->cb_rdmult = vp9_compute_rd_mult( + cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex)); + } + + // If required set upper and lower partition size limits + if (sf->auto_min_max_partition_size) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, + &x->min_partition_size, &x->max_partition_size); + } + td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif + } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void init_encode_frame_mb_context(VP9_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + // Copy data over into macro block data structures. + vp9_setup_src_planes(x, cpi->Source, 0, 0); + + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(xd->above_context[0], 0, + sizeof(*xd->above_context[0]) * 2 * aligned_mi_cols * MAX_MB_PLANE); + memset(xd->above_seg_context, 0, + sizeof(*xd->above_seg_context) * aligned_mi_cols); +} + +static int check_dual_ref_flags(VP9_COMP *cpi) { + const int ref_flags = cpi->ref_frame_flags; + + if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { + return 0; + } else { + return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) + + !!(ref_flags & VP9_ALT_FLAG)) >= 2; + } +} + +static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) { + int mi_row, mi_col; + const int mis = cm->mi_stride; + MODE_INFO **mi_ptr = cm->mi_grid_visible; + + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if (mi_ptr[mi_col]->tx_size > max_tx_size) + mi_ptr[mi_col]->tx_size = max_tx_size; + } + } +} + +static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { + if (frame_is_intra_only(&cpi->common)) + return INTRA_FRAME; + else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) + return ALTREF_FRAME; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + return GOLDEN_FRAME; + else + return LAST_FRAME; +} + +static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { + if (xd->lossless) return ONLY_4X4; + if (cpi->common.frame_type == KEY_FRAME && cpi->sf.use_nonrd_pick_mode) + return ALLOW_16X16; + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD || + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; +} + +static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + else + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); +} + +static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF) + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); + else if (bsize >= BLOCK_8X8) + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + +static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + +static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + BLOCK_SIZE bs = VPXMAX(bsize, BLOCK_8X8); // processing unit block size + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; + int plane; + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + + x->skip_recode = 0; + + mi = xd->mi[0]; + mi->sb_type = bsize; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + memcpy(a + num_4x4_blocks_wide * plane, pd->above_context, + (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x); + memcpy(l + num_4x4_blocks_high * plane, pd->left_context, + (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y); + } + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) + if (cyclic_refresh_segment_id_boosted(mi->segment_id)) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + + if (frame_is_intra_only(cm)) + hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) + set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize); + else if (bsize >= BLOCK_8X8) { + if (cpi->rc.hybrid_intra_scene_change) + hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + } else { + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } + + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + memcpy(pd->above_context, a + num_4x4_blocks_wide * plane, + (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x); + memcpy(pd->left_context, l + num_4x4_blocks_high * plane, + (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y); + } + + if (rd_cost->rate == INT_MAX) vp9_rd_cost_reset(rd_cost); + + ctx->rate = rd_cost->rate; + ctx->dist = rd_cost->dist; +} + +static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) { + MACROBLOCKD *xd = &x->e_mbd; + int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_subsize(bsize, partition); + + assert(bsize >= BLOCK_8X8); + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + switch (partition) { + case PARTITION_NONE: + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->none.mic; + *(x->mbmi_ext) = pc_tree->none.mbmi_ext; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + break; + case PARTITION_VERT: + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->vertical[0].mic; + *(x->mbmi_ext) = pc_tree->vertical[0].mbmi_ext; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize); + + if (mi_col + hbs < cm->mi_cols) { + set_mode_info_offsets(cm, x, xd, mi_row, mi_col + hbs); + *(xd->mi[0]) = pc_tree->vertical[1].mic; + *(x->mbmi_ext) = pc_tree->vertical[1].mbmi_ext; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize); + } + break; + case PARTITION_HORZ: + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->horizontal[0].mic; + *(x->mbmi_ext) = pc_tree->horizontal[0].mbmi_ext; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize); + if (mi_row + hbs < cm->mi_rows) { + set_mode_info_offsets(cm, x, xd, mi_row + hbs, mi_col); + *(xd->mi[0]) = pc_tree->horizontal[1].mic; + *(x->mbmi_ext) = pc_tree->horizontal[1].mbmi_ext; + duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize); + } + break; + case PARTITION_SPLIT: { + fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, + pc_tree->split[1]); + fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, + pc_tree->split[2]); + fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, + pc_tree->split[3]); + break; + } + default: break; + } +} + +// Reset the prediction pixel ready flag recursively. +static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { + pc_tree->none.pred_pixel_ready = 0; + pc_tree->horizontal[0].pred_pixel_ready = 0; + pc_tree->horizontal[1].pred_pixel_ready = 0; + pc_tree->vertical[0].pred_pixel_ready = 0; + pc_tree->vertical[1].pred_pixel_ready = 0; + + if (bsize > BLOCK_8X8) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + int i; + for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize); + } +} + +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + + switch (bsize) { + case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh) return PARTITION_SPLIT; + if (score[0] < -thresh) return PARTITION_NONE; + return -1; + } +} +#undef FEATURES +#undef LABELS + +static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_COST *rd_cost, int do_recon, + int64_t best_rd, PC_TREE *pc_tree) { + const SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; + TOKENEXTRA *tp_orig = *tp; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; + int i; + BLOCK_SIZE subsize = bsize; + RD_COST this_rdc, sum_rdc, best_rdc; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + ms >= cm->mi_rows); + const int force_vert_split = (mi_col + ms >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = + !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; + int partition_vert_allowed = + !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; + + (void)*tp_orig; + + // Avoid checking for rectangular partitions for speed >= 5. + if (cpi->oxcf.speed >= 5) do_rect = 0; + + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); + + vp9_rd_cost_init(&sum_rdc); + vp9_rd_cost_reset(&best_rdc); + best_rdc.rdcost = best_rd; + + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. + if (sf->auto_min_max_partition_size) { + partition_none_allowed &= + (bsize <= x->max_partition_size && bsize >= x->min_partition_size); + partition_horz_allowed &= + ((bsize <= x->max_partition_size && bsize > x->min_partition_size) || + force_horz_split); + partition_vert_allowed &= + ((bsize <= x->max_partition_size && bsize > x->min_partition_size) || + force_vert_split); + do_split &= bsize > x->min_partition_size; + } + if (sf->use_square_partition_only) { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } + + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } + } + + if (!partition_none_allowed && !do_split) do_rect = 1; + + ctx->pred_pixel_ready = + !(partition_vert_allowed || partition_horz_allowed || do_split); + + // PARTITION_NONE + if (partition_none_allowed) { + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, + ctx); + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; + ctx->skip_txfm[0] = x->skip_txfm[0]; + ctx->skip = x->skip; + + if (this_rdc.rate != INT_MAX) { + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + this_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + + if (!use_ml_based_partitioning) { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } + } + } + } + } + + // store estimated motion vector + store_pred_mv(x, ctx); + + // PARTITION_SPLIT + if (do_split) { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + subsize = get_subsize(bsize, PARTITION_SPLIT); + for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + const int x_idx = (i & 1) * ms; + const int y_idx = (i >> 1) * ms; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + load_pred_mv(x, ctx); + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, 0, + best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_SPLIT; + } else { + // skip rectangular partition test when larger block size + // gives better rd cost + if (sf->less_rectangular_check) do_rect &= !partition_none_allowed; + } + } + + // PARTITION_HORZ + if (partition_horz_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_HORZ); + load_pred_mv(x, ctx); + pc_tree->horizontal[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, + &pc_tree->horizontal[0]); + + pc_tree->horizontal[0].mic = *xd->mi[0]; + pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + + if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) { + load_pred_mv(x, ctx); + pc_tree->horizontal[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col, &this_rdc, + subsize, &pc_tree->horizontal[1]); + + pc_tree->horizontal[1].mic = *xd->mi[0]; + pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); + } else { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_HORZ; + } else { + pred_pixel_ready_reset(pc_tree, bsize); + } + } + + // PARTITION_VERT + if (partition_vert_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_VERT); + load_pred_mv(x, ctx); + pc_tree->vertical[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, + &pc_tree->vertical[0]); + pc_tree->vertical[0].mic = *xd->mi[0]; + pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) { + load_pred_mv(x, ctx); + pc_tree->vertical[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms, &this_rdc, + subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic = *xd->mi[0]; + pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; + + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); + } else { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_VERT; + } else { + pred_pixel_ready_reset(pc_tree, bsize); + } + } + + *rd_cost = best_rdc; + + if (best_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(rd_cost); + return; + } + + // update mode info array + fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree); + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) { + int output_enabled = (bsize == BLOCK_64X64); + encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, + pc_tree); + } + + if (bsize == BLOCK_64X64 && do_recon) { + assert(tp_orig < *tp); + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} + +static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MODE_INFO **mi, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int output_enabled, + RD_COST *rd_cost, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + const int mis = cm->mi_stride; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16; + + vp9_rd_cost_reset(&this_rdc); + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4; + partition = partition_lookup[bsl][subsize]; + + if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) { + x->max_partition_size = BLOCK_32X32; + x->min_partition_size = BLOCK_16X16; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, + 0, INT64_MAX, pc_tree); + } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && + subsize >= subsize_ref) { + x->max_partition_size = BLOCK_32X32; + x->min_partition_size = BLOCK_8X8; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, + 0, INT64_MAX, pc_tree); + } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) { + x->max_partition_size = BLOCK_16X16; + x->min_partition_size = BLOCK_8X8; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, + 0, INT64_MAX, pc_tree); + } else { + switch (partition) { + case PARTITION_NONE: + pc_tree->none.pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize, + &pc_tree->none); + pc_tree->none.mic = *xd->mi[0]; + pc_tree->none.mbmi_ext = *x->mbmi_ext; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; + pc_tree->none.skip = x->skip; + break; + case PARTITION_VERT: + pc_tree->vertical[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize, + &pc_tree->vertical[0]); + pc_tree->vertical[0].mic = *xd->mi[0]; + pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + if (mi_col + hbs < cm->mi_cols) { + pc_tree->vertical[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, + &this_rdc, subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic = *xd->mi[0]; + pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + } + break; + case PARTITION_HORZ: + pc_tree->horizontal[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize, + &pc_tree->horizontal[0]); + pc_tree->horizontal[0].mic = *xd->mi[0]; + pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + if (mi_row + hbs < cm->mi_rows) { + pc_tree->horizontal[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, + &this_rdc, subsize, &pc_tree->horizontal[1]); + pc_tree->horizontal[1].mic = *xd->mi[0]; + pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + } + break; + default: + assert(partition == PARTITION_SPLIT); + subsize = get_subsize(bsize, PARTITION_SPLIT); + nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + subsize, output_enabled, rd_cost, + pc_tree->split[0]); + nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, + mi_col + hbs, subsize, output_enabled, &this_rdc, + pc_tree->split[1]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp, + mi_row + hbs, mi_col, subsize, output_enabled, + &this_rdc, pc_tree->split[2]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, + mi_row + hbs, mi_col + hbs, subsize, + output_enabled, &this_rdc, pc_tree->split[3]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + break; + } + } + + if (bsize == BLOCK_64X64 && output_enabled) + encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree); +} + +static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MODE_INFO **mi, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int output_enabled, + RD_COST *dummy_cost, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + const int mis = cm->mi_stride; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4; + partition = partition_lookup[bsl][subsize]; + + if (output_enabled && bsize != BLOCK_4X4) { + int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + td->counts->partition[ctx][partition]++; + } + + switch (partition) { + case PARTITION_NONE: + pc_tree->none.pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, + subsize, &pc_tree->none); + pc_tree->none.mic = *xd->mi[0]; + pc_tree->none.mbmi_ext = *x->mbmi_ext; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; + pc_tree->none.skip = x->skip; + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, + subsize, &pc_tree->none); + break; + case PARTITION_VERT: + pc_tree->vertical[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, + subsize, &pc_tree->vertical[0]); + pc_tree->vertical[0].mic = *xd->mi[0]; + pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, + subsize, &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + pc_tree->vertical[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, dummy_cost, + subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic = *xd->mi[0]; + pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs, + output_enabled, subsize, &pc_tree->vertical[1]); + } + break; + case PARTITION_HORZ: + pc_tree->horizontal[0].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, + subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[0].mic = *xd->mi[0]; + pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, + subsize, &pc_tree->horizontal[0]); + + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + pc_tree->horizontal[1].pred_pixel_ready = 1; + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, dummy_cost, + subsize, &pc_tree->horizontal[1]); + pc_tree->horizontal[1].mic = *xd->mi[0]; + pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col, + output_enabled, subsize, &pc_tree->horizontal[1]); + } + break; + default: + assert(partition == PARTITION_SPLIT); + subsize = get_subsize(bsize, PARTITION_SPLIT); + if (bsize == BLOCK_8X8) { + nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, + subsize, pc_tree->leaf_split[0]); + encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, + subsize, pc_tree->leaf_split[0]); + } else { + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, + output_enabled, dummy_cost, pc_tree->split[0]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, + mi_col + hbs, subsize, output_enabled, dummy_cost, + pc_tree->split[1]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, + mi_row + hbs, mi_col, subsize, output_enabled, + dummy_cost, pc_tree->split[2]); + nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, + mi_row + hbs, mi_col + hbs, subsize, output_enabled, + dummy_cost, pc_tree->split[3]); + } + break; + } + + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} + +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. +static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. + if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NO_REF_FRAME; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + +static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TOKENEXTRA **tp) { + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_col_start = tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; + + // Initialize the left context for the new SB row + memset(&xd->left_context, 0, sizeof(xd->left_context)); + memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); + + // Code each SB in the row + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) { + const struct segmentation *const seg = &cm->seg; + RD_COST dummy_rdc; + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; + BLOCK_SIZE bsize = BLOCK_64X64; + int seg_skip = 0; + int i; + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + + x->source_variance = UINT_MAX; + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } + vp9_rd_cost_init(&dummy_rdc); + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; + x->sb_is_skin = 0; + x->skip_low_source_sad = 0; + x->lowvar_highsumdiff = 0; + x->content_state_sb = 0; + x->zero_temp_sad_source = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; + + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; + } + + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + + if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY && + x->content_state_sb > kLowSadLowSumdiff) { + // For ROI with skip, force segment = 0 (no skip) over whole + // superblock to avoid artifacts if temporal change in source_sad is + // not 0. + int xi, yi; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + for (yi = 0; yi < ymis; yi++) + for (xi = 0; xi < xmis; xi++) { + int map_offset = block_index + yi * cm->mi_cols + xi; + cpi->segmentation_map[map_offset] = 0; + } + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); + seg_skip = 0; + } + if (seg_skip) { + partition_search_type = FIXED_PARTITION; + } + } + + // Set the partition type of the 64X64 block + switch (partition_search_type) { + case VAR_BASED_PARTITION: + // TODO(jingning, marpan): The mode decision and encoding process + // support both intra and inter sub8x8 block coding for RTC mode. + // Tune the thresholds accordingly to use sub8x8 block coding for + // coding performance improvement. + choose_partitioning(cpi, tile_info, x, mi_row, mi_col); + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + break; + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + break; + case SOURCE_VAR_BASED_PARTITION: + set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + break; + case FIXED_PARTITION: + if (!seg_skip) bsize = sf->always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + break; + default: + assert(partition_search_type == REFERENCE_PARTITION); + x->sb_pickmode_part = 1; + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + // Use nonrd_pick_partition on scene-cut for VBR mode. + // nonrd_pick_partition does not support 4x4 partition, so avoid it + // on key frame for now. + if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && + cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { + // Use lower max_partition_size for low resolutions. + if (cm->width <= 352 && cm->height <= 288) + x->max_partition_size = BLOCK_32X32; + else + x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + } else { + choose_partitioning(cpi, tile_info, x, mi_row, mi_col); + // TODO(marpan): Seems like nonrd_select_partition does not support + // 4x4 partition. Since 4x4 is used on key frame, use this switch + // for now. + if (frame_is_intra_only(cm)) + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + else + nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + } + + break; + } + + // Update ref_frame usage for inter frame if this group is ARF group. + if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); + } +} +// end RTC play code + +static INLINE uint32_t variance(const Diff *const d) { + return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint32_t variance_highbd(Diff *const d) { + const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); + return (var >= 0) ? (uint32_t)var : 0; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static int set_var_thresh_from_histogram(VP9_COMP *cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9_COMMON *const cm = &cpi->common; + + const uint8_t *src = cpi->Source->y_buffer; + const uint8_t *last_src = cpi->Last_Source->y_buffer; + const int src_stride = cpi->Source->y_stride; + const int last_stride = cpi->Last_Source->y_stride; + + // Pick cutoff threshold + const int cutoff = (VPXMIN(cm->width, cm->height) >= 720) + ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) + : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); + DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); + Diff *var16 = cpi->source_diff_var; + + int sum = 0; + int i, j; + + memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0])); + + for (i = 0; i < cm->mb_rows; i++) { + for (j = 0; j < cm->mb_cols; j++) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: + vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + var16->var = variance(var16); + break; + case VPX_BITS_10: + vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); + break; + default: + assert(cm->bit_depth == VPX_BITS_12); + vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); + break; + } + } else { + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, + &var16->sum); + var16->var = variance(var16); + } +#else + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, + &var16->sum); + var16->var = variance(var16); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (var16->var >= VAR_HIST_MAX_BG_VAR) + hist[VAR_HIST_BINS - 1]++; + else + hist[var16->var / VAR_HIST_FACTOR]++; + + src += 16; + last_src += 16; + var16++; + } + + src = src - cm->mb_cols * 16 + 16 * src_stride; + last_src = last_src - cm->mb_cols * 16 + 16 * last_stride; + } + + cpi->source_var_thresh = 0; + + if (hist[VAR_HIST_BINS - 1] < cutoff) { + for (i = 0; i < VAR_HIST_BINS - 1; i++) { + sum += hist[i]; + + if (sum > cutoff) { + cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR; + return 0; + } + } + } + + return sf->search_type_check_frequency; +} + +static void source_var_based_partition_search_method(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + + if (cm->frame_type == KEY_FRAME) { + // For key frame, use SEARCH_PARTITION. + sf->partition_search_type = SEARCH_PARTITION; + } else if (cm->intra_only) { + sf->partition_search_type = FIXED_PARTITION; + } else { + if (cm->last_width != cm->width || cm->last_height != cm->height) { + if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); + + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); + } + + if (!cpi->frames_till_next_var_check) + cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi); + + if (cpi->frames_till_next_var_check > 0) { + sf->partition_search_type = FIXED_PARTITION; + cpi->frames_till_next_var_check--; + } + } +} + +static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) { + unsigned int intra_count = 0, inter_count = 0; + int j; + + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += td->counts->intra_inter[j][0]; + inter_count += td->counts->intra_inter[j][1]; + } + + return (intra_count << 2) < inter_count && cm->frame_type != KEY_FRAME && + cm->show_frame; +} + +void vp9_init_tile_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; + int tile_tok = 0; + int tplist_count = 0; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); + CHECK_MEM_ERROR( + &cm->error, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + int i, j; + const MV zero_mv = { 0, 0 }; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; + tile_data->mode_map[i][j] = j; + } + } + tile_data->firstpass_top_mv = zero_mv; +#if CONFIG_MULTITHREAD + tile_data->row_base_thresh_freq_fact = NULL; +#endif + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; + if (cpi->sf.adaptive_rd_thresh_row_mt && + this_tile->row_base_thresh_freq_fact == NULL) + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); + vp9_tile_init(tile_info, cm, tile_row, tile_col); + + cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = cpi->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens(*tile_info); + + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } + } +} + +void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int tile_sb_row; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); + cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; + +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); +#endif + + cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; + cpi->tplist[tile_row][tile_col][tile_sb_row].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop - + cpi->tplist[tile_row][tile_col][tile_sb_row].start); + assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <= + get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols)); + + (void)tile_mb_cols; +} + +void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + const int mi_row_start = tile_info->mi_row_start; + const int mi_row_end = tile_info->mi_row_end; + int mi_row; + + for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) + vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); +} + +static void encode_tiles(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) + vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col); +} + +static int compare_kmeans_data(const void *a, const void *b) { + if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { + return 1; + } else if (((const KMEANS_DATA *)a)->value < + ((const KMEANS_DATA *)b)->value) { + return -1; + } else { + return 0; + } +} + +static void compute_boundary_ls(const double *ctr_ls, int k, + double *boundary_ls) { + // boundary_ls[j] is the upper bound of data centered at ctr_ls[j] + int j; + for (j = 0; j < k - 1; ++j) { + boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.; + } + boundary_ls[k - 1] = DBL_MAX; +} + +int vp9_get_group_idx(double value, double *boundary_ls, int k) { + int group_idx = 0; + while (value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + return group_idx; +} + +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + KMEANS_DATA *arr, int size) { + int i, j; + int itr; + int group_idx; + double sum[MAX_KMEANS_GROUPS]; + int count[MAX_KMEANS_GROUPS]; + + vpx_clear_system_state(); + + assert(k >= 2 && k <= MAX_KMEANS_GROUPS); + + qsort(arr, size, sizeof(*arr), compare_kmeans_data); + + // initialize the center points + for (j = 0; j < k; ++j) { + ctr_ls[j] = arr[(size * (2 * j + 1)) / (2 * k)].value; + } + + for (itr = 0; itr < 10; ++itr) { + compute_boundary_ls(ctr_ls, k, boundary_ls); + for (i = 0; i < MAX_KMEANS_GROUPS; ++i) { + sum[i] = 0; + count[i] = 0; + } + + // Both the data and centers are sorted in ascending order. + // As each data point is processed in order, its corresponding group index + // can only increase. So we only need to reset the group index to zero here. + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + // place samples into clusters + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + sum[group_idx] += arr[i].value; + ++count[group_idx]; + } + + for (group_idx = 0; group_idx < k; ++group_idx) { + if (count[group_idx] > 0) + ctr_ls[group_idx] = sum[group_idx] / count[group_idx]; + + sum[group_idx] = 0; + count[group_idx] = 0; + } + } + + // compute group_idx, boundary_ls and count_ls + for (j = 0; j < k; ++j) { + count_ls[j] = 0; + } + compute_boundary_ls(ctr_ls, k, boundary_ls); + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + arr[i].group_idx = group_idx; + ++count_ls[group_idx]; + } +} + +static void encode_frame_internal(VP9_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const int gf_group_index = cpi->twopass.gf_group.index; + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + vp9_zero(*td->counts); + vp9_zero(cpi->td.rd_counts); + + xd->lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + else + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = + xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; +#else + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; +#endif // CONFIG_VP9_HIGHBITDEPTH + x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; + if (xd->lossless) x->optimize = 0; + x->sharpness = cpi->oxcf.sharpness; + x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); + + cm->tx_mode = select_tx_mode(cpi, xd); + + vp9_frame_init_quantizer(cpi); + + vp9_initialize_rd_consts(cpi); + vp9_initialize_me_consts(cpi, x, cm->base_qindex); + init_encode_frame_mb_context(cpi); + cm->use_prev_frame_mvs = + !cm->error_resilient_mode && cm->width == cm->last_width && + cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame; + // Special case: set prev_mi to NULL when the previous mode info + // context cannot be used. + cm->prev_mi = + cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL; + + x->quant_fp = cpi->sf.use_quant_fp; + vp9_zero(x->skip_txfm); + if (sf->use_nonrd_pick_mode) { + // Initialize internal buffer pointers for rtc coding, where non-RD + // mode decision is used and hence no buffer pointer swap needed. + int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; + } + vp9_zero(x->zcoeff_blk); + + if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 && + !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) && + !cpi->use_svc) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + + if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) + source_var_based_partition_search_method(cpi); + } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE && + cpi->sf.enable_tpl_model) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + if (tpl_frame->is_valid) + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; + } + + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + + // Frame segmentation + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); + + { + struct vpx_usec_timer emr_timer; + vpx_usec_timer_start(&emr_timer); + + if (!cpi->row_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. + if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } + + vpx_usec_timer_mark(&emr_timer); + cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); + } + + sf->skip_encode_frame = + sf->skip_encode_sb ? get_skip_encode_frame(cm, td) : 0; + +#if 0 + // Keep record of the total distortion this time around for future use + cpi->last_frame_distortion = cpi->frame_distortion; +#endif +} + +static INTERP_FILTER get_interp_filter( + const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) { + if (!is_alt_ref && threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] && + threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SMOOTH; + } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SHARP; + } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP; + } else { + return SWITCHABLE; + } +} + +static int compute_frame_aq_offset(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + struct segmentation *const seg = &cm->seg; + + int mi_row, mi_col; + int sum_delta = 0; + int qdelta_index; + int segment_id; + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + MODE_INFO **mi_8x8 = mi_8x8_ptr; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi_8x8++) { + segment_id = mi_8x8[0]->segment_id; + qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + sum_delta += qdelta_index; + } + mi_8x8_ptr += cm->mi_stride; + } + + return sum_delta / (cm->mi_rows * cm->mi_cols); +} + +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int tile_idx; + int i, j; + TileDataEnc *tile_data; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} + +void vp9_encode_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + + restore_encode_params(cpi); + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(MAX_MB_PLANE); +#endif + + // In the longer term the encoder should be generalized to match the + // decoder such that we allow compound where one of the 3 buffers has a + // different sign bias and that buffer is then the fixed ref. However, this + // requires further work in the rd loop. For now the only supported encoder + // side behavior is where the ALT ref buffer has opposite sign bias to + // the other two. + if (!frame_is_intra_only(cm)) { + if (vp9_compound_reference_allowed(cm)) { + cpi->allow_comp_inter_inter = 1; + vp9_setup_compound_reference_mode(cm); + } else { + cpi->allow_comp_inter_inter = 0; + } + } + + if (cpi->sf.frame_parameter_update) { + int i; + RD_OPT *const rd_opt = &cpi->rd; + FRAME_COUNTS *counts = cpi->td.counts; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + + // This code does a single RD pass over the whole frame assuming + // either compound, single or hybrid prediction as per whatever has + // worked best for that type of frame in the past. + // It also predicts whether another coding mode would have worked + // better than this coding mode. If that is the case, it remembers + // that for subsequent frames. + // It also does the same analysis for transform size selection. + const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); + int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; + int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type]; + const int is_alt_ref = frame_type == ALTREF_FRAME; + + /* prediction (compound, single or hybrid) mode selection */ + if (is_alt_ref || !cpi->allow_comp_inter_inter) + cm->reference_mode = SINGLE_REFERENCE; + else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && + mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] && + check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + cm->reference_mode = COMPOUND_REFERENCE; + else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) + cm->reference_mode = SINGLE_REFERENCE; + else + cm->reference_mode = REFERENCE_MODE_SELECT; + + if (cm->interp_filter == SWITCHABLE) + cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif + encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif + + for (i = 0; i < REFERENCE_MODES; ++i) + mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } + + if (cm->tx_mode == TX_MODE_SELECT) { + int count4x4 = 0; + int count8x8_lp = 0, count8x8_8x8p = 0; + int count16x16_16x16p = 0, count16x16_lp = 0; + int count32x32 = 0; + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + count4x4 += counts->tx.p32x32[i][TX_4X4]; + count4x4 += counts->tx.p16x16[i][TX_4X4]; + count4x4 += counts->tx.p8x8[i][TX_4X4]; + + count8x8_lp += counts->tx.p32x32[i][TX_8X8]; + count8x8_lp += counts->tx.p16x16[i][TX_8X8]; + count8x8_8x8p += counts->tx.p8x8[i][TX_8X8]; + + count16x16_16x16p += counts->tx.p16x16[i][TX_16X16]; + count16x16_lp += counts->tx.p32x32[i][TX_16X16]; + count32x32 += counts->tx.p32x32[i][TX_32X32]; + } + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && + count32x32 == 0) { + cm->tx_mode = ALLOW_8X8; + reset_skip_tx_size(cm, TX_8X8); + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && + count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + cm->tx_mode = ONLY_4X4; + reset_skip_tx_size(cm, TX_4X4); + } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { + cm->tx_mode = ALLOW_32X32; + } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { + cm->tx_mode = ALLOW_16X16; + reset_skip_tx_size(cm, TX_16X16); + } + } + } else { + FRAME_COUNTS *counts = cpi->td.counts; + cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } + } + + // If segmented AQ is enabled compute the average AQ weighting. + if (cm->seg.enabled && (cpi->oxcf.aq_mode != NO_AQ) && + (cm->seg.update_map || cm->seg.update_data)) { + cm->seg.aq_av_offset = compute_frame_aq_offset(cpi); + } +} + +static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) { + const PREDICTION_MODE y_mode = mi->mode; + const PREDICTION_MODE uv_mode = mi->uv_mode; + const BLOCK_SIZE bsize = mi->sb_type; + + if (bsize < BLOCK_8X8) { + int idx, idy; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) + for (idx = 0; idx < 2; idx += num_4x4_w) + ++counts->y_mode[0][mi->bmi[idy * 2 + idx].as_mode]; + } else { + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; + } + + ++counts->uv_mode[y_mode][uv_mode]; +} + +static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + const VP9_COMMON *const cm = &cpi->common; + MV mv = mi->mv[0].as_mv; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + int x, y; + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_cols + x; + if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + if (abs(mv.row) < 8 && abs(mv.col) < 8) { + if (cpi->consec_zero_mv[map_offset] < 255) + cpi->consec_zero_mv[map_offset]++; + } else { + cpi->consec_zero_mv[map_offset] = 0; + } + } + } +} + +static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, + int output_enabled, int mi_row, int mi_col, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const int seg_skip = + segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP); + x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 && + cpi->oxcf.aq_mode != COMPLEXITY_AQ && + cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && + cpi->sf.allow_skip_recode; + + if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode) + memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + + x->skip_optimize = ctx->is_coded; + ctx->is_coded = 1; + x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; + x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && + x->q_index < QIDX_SKIP_THRESH); + + if (x->skip_encode) return; + + if (!is_inter_block(mi)) { + int plane; +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && + (xd->above_mi == NULL || xd->left_mi == NULL) && + need_top_left[mi->uv_mode]) + assert(0); +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + mi->skip = 1; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1); + if (output_enabled) sum_intra_stats(td->counts, mi); + vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip, + VPXMAX(bsize, BLOCK_8X8)); + } else { + int ref; + const int is_compound = has_second_ref(mi); + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mi->ref_frame[ref]); + assert(cfg != NULL); + vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf); + } + if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip) + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, + VPXMAX(bsize, BLOCK_8X8)); + + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, + VPXMAX(bsize, BLOCK_8X8)); + +#if CONFIG_MISMATCH_DEBUG + if (output_enabled) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif + + vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled); + vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip, + VPXMAX(bsize, BLOCK_8X8)); + } + + if (seg_skip) { + assert(mi->skip); + } + + if (output_enabled) { + if (cm->tx_mode == TX_MODE_SELECT && mi->sb_type >= BLOCK_8X8 && + !(is_inter_block(mi) && mi->skip)) { + ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd), + &td->counts->tx)[mi->tx_size]; + } else { + // The new intra coding scheme requires no change of transform size + if (is_inter_block(mi)) { + mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode], + max_txsize_lookup[bsize]); + } else { + mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4; + } + } + + ++td->counts->tx.tx_totals[mi->tx_size]; + ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; + if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) + vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) + update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h new file mode 100644 index 0000000000..fd0a9c517e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct macroblock; +struct yv12_buffer_config; +struct VP9_COMP; +struct ThreadData; + +// Constants used in SOURCE_VAR_BASED_PARTITION +#define VAR_HIST_MAX_BG_VAR 1000 +#define VAR_HIST_FACTOR 10 +#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) +#define VAR_HIST_LARGE_CUT_OFF 75 +#define VAR_HIST_SMALL_CUT_OFF 45 + +void vp9_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col); + +void vp9_encode_frame(struct VP9_COMP *cpi); + +void vp9_init_tile_data(struct VP9_COMP *cpi); +void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); + +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, + int content_state); + +struct KMEANS_DATA; +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + struct KMEANS_DATA *arr, int size); +int vp9_get_group_idx(double value, double *boundary_ls, int k); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c new file mode 100644 index 0000000000..eded9f5c42 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c @@ -0,0 +1,1061 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/quantize.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + +#include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" + +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_tokenize.h" + +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; +}; + +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; + +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +} + +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 10, 6 }, + { 8, 5 }, +}; + +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))) + +int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + int ctx) { + MACROBLOCKD *const xd = &mb->e_mbd; + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ref = is_inter_block(xd->mi[0]); + uint8_t token_cache[1024]; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE plane_type = get_plane_type(plane); + const int default_eob = 16 << (tx_size << 1); + const int shift = (tx_size == TX_32X32); + const int16_t *const dequant_ptr = pd->dequant; + const uint8_t *const band_translate = get_band_translate(tx_size); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); + const int16_t *const scan = so->scan; + const int16_t *const nb = so->neighbors; + const MODE_INFO *mbmi = xd->mi[0]; + const int sharpness = mb->sharpness; + const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; + const int64_t rdmult = + (sharpness == 0 ? rdadj >> 1 + : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); + + const int64_t rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int64_t rate0, rate1; + int16_t t0, t1; + int i, final_eob; + int count_high_values_after_eob = 0; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); +#else + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); +#endif + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; + + assert((!plane_type && !plane) || (plane_type && plane)); + assert(eob <= default_eob); + + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; + } + final_eob = 0; + + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. + } else { + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; + } else { + distortion1 = distortion_for_zero; + } + { + // Calculate RDCost for current coeff for the two candidates. + const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; + } + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } + } + + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } + + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } + + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + count_high_values_after_eob = 0; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } + } + } + } + if (count_high_values_after_eob > 0) { + final_eob = eob - 1; + for (; final_eob >= 0; final_eob--) { + const int rc = scan[final_eob]; + const int x = qcoeff[rc]; + if (x) { + break; + } + } + final_eob++; + } else { + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } + } + mb->plane[plane].eobs[block] = final_eob; + return final_eob; +} +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE + +static INLINE void fdct32x32(int rd_transform, const int16_t *src, + tran_low_t *dst, int src_stride) { + if (rd_transform) + vpx_fdct32x32_rd(src, dst, src_stride); + else + vpx_fdct32x32(src, dst, src_stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src, + tran_low_t *dst, int src_stride) { + if (rd_transform) + vpx_highbd_fdct32x32_rd(src, dst, src_stride); + else + vpx_highbd_fdct32x32(src, dst, src_stride); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *src_diff; + src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); + break; + case TX_16X16: + vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_8X8: + vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + switch (tx_size) { + case TX_32X32: + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_16X16: + vpx_fdct16x16(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_8X8: + vpx_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + } +} + +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *src_diff; + src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride); + vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); + break; + case TX_16X16: + vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); + vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); + break; + case TX_8X8: + vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); + vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); + break; + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + switch (tx_size) { + case TX_32X32: + vpx_fdct32x32_1(src_diff, coeff, diff_stride); + vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_16X16: + vpx_fdct16x16_1(src_diff, coeff, diff_stride); + vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_8X8: + vpx_fdct8x8_1(src_diff, coeff, diff_stride); + vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + } +} + +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *src_diff; + src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_16X16: + vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_8X8: + vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + switch (tx_size) { + case TX_32X32: + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_16X16: + vpx_fdct16x16(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_8X8: + vpx_fdct8x8(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + } +} + +static void encode_block(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct encode_b_args *const args = arg; +#if CONFIG_MISMATCH_DEBUG + int mi_row = args->mi_row; + int mi_col = args->mi_col; + int output_enabled = args->output_enabled; +#endif + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + a = &args->ta[col]; + l = &args->tl[row]; + + // TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + p->eobs[block] = 0; + *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } + + if (!x->skip_recode) { + if (x->quant_fp) { + // Encoding process for rtc mode + if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } else { + vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size); + } + } else { + if (max_txsize_lookup[plane_bsize] == tx_size) { + int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1)); + if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); + } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } + } else { + vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); + } + } + } + + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const int ctx = combine_entropy_contexts(*a, *l); + *a = *l = vp9_optimize_b(x, plane, block, tx_size, ctx) > 0; + } else { + *a = *l = p->eobs[block] > 0; + } + + if (p->eobs[block]) *(args->skip) = 0; + + if (x->skip_encode || p->eobs[block] == 0) { +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); + switch (tx_size) { + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); + break; + case TX_16X16: + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); + break; + case TX_8X8: + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); + break; + default: + assert(tx_size == TX_4X4); + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); + break; + } +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + switch (tx_size) { + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + case TX_16X16: + vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + case TX_8X8: + vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + default: + assert(tx_size == TX_4X4); + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + } +#if CONFIG_MISMATCH_DEBUG +encode_block_end: + if (output_enabled) { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, + blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif +} + +static void encode_block_pass1(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + MACROBLOCK *const x = (MACROBLOCK *)arg; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *dst; + dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); + + if (p->eobs[block] > 0) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + } +} + +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { + vp9_subtract_plane(x, bsize, 0); + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, x); +} + +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled) { + MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx ctx; + MODE_INFO *mi = xd->mi[0]; + int plane; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip, mi_row, mi_col, output_enabled }; +#else + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; + (void)mi_row; + (void)mi_col; + (void)output_enabled; +#endif + + mi->skip = 1; + + if (x->skip) return; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (!x->skip_recode) vp9_subtract_plane(x, bsize, plane); + + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], + ctx.tl[plane]); + arg.enable_trellis_opt = 1; + } else { + arg.enable_trellis_opt = 0; + } + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); + } +} + +void vp9_encode_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const ScanOrder *scan_order; + TX_TYPE tx_type = DCT_DCT; + PREDICTION_MODE mode; + const int bwl = b_width_log2_lookup[plane_bsize]; + const int diff_stride = 4 * (1 << bwl); + uint8_t *src, *dst; + int16_t *src_diff; + uint16_t *eob = &p->eobs[block]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; + ENTROPY_CONTEXT *a = NULL; + ENTROPY_CONTEXT *l = NULL; + int entropy_ctx = 0; + dst = &pd->dst.buf[4 * (row * dst_stride + col)]; + src = &p->src.buf[4 * (row * src_stride + col)]; + src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + + if (tx_size == TX_4X4) { + tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); + scan_order = &vp9_scan_orders[TX_4X4][tx_type]; + mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode; + } else { + mode = plane == 0 ? mi->mode : mi->uv_mode; + if (tx_size == TX_32X32) { + scan_order = &vp9_default_scan_orders[TX_32X32]; + } else { + tx_type = get_tx_type(get_plane_type(plane), xd); + scan_order = &vp9_scan_orders[tx_size][tx_type]; + } + } + + vp9_predict_intra_block( + xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, + (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, + dst_stride, col, row, plane); + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, + plane_bsize, tx_size, args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); + switch (tx_size) { + case TX_32X32: + if (!x->skip_recode) { + highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) { + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); + } + break; + case TX_16X16: + if (!x->skip_recode) { + if (tx_type == DCT_DCT) + vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); + else + vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) { + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, + xd->bd); + } + break; + case TX_8X8: + if (!x->skip_recode) { + if (tx_type == DCT_DCT) + vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); + else + vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) { + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, + xd->bd); + } + break; + default: + assert(tx_size == TX_4X4); + if (!x->skip_recode) { + if (tx_type != DCT_DCT) + vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); + else + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) { + // this is like vp9_short_idct4x4 but has a special case around + // eob<=1 which is significant (not just an optimization) for the + // lossless case. + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); + } else { + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); + } + } + break; + } + if (*eob) *(args->skip) = 0; + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + switch (tx_size) { + case TX_32X32: + if (!x->skip_recode) { + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) + vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob); + break; + case TX_16X16: + if (!x->skip_recode) { + vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) + vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob); + break; + case TX_8X8: + if (!x->skip_recode) { + vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) + vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); + break; + default: + assert(tx_size == TX_4X4); + if (!x->skip_recode) { + if (tx_type != DCT_DCT) + vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); + else + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); + else + vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); + } + break; + } + if (*eob) *(args->skip) = 0; +} + +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, + int enable_trellis_opt) { + const MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx ctx; +#if CONFIG_MISMATCH_DEBUG + // TODO(angiebird): make mismatch_debug support intra mode + struct encode_b_args arg = { + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; +#endif + + if (enable_trellis_opt && x->optimize && + (!x->skip_recode || !x->skip_optimize)) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = + plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; + vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); + } else { + arg.enable_trellis_opt = 0; + } + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, + vp9_encode_block_intra, &arg); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h new file mode 100644 index 0000000000..1391446bed --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_ + +#include "./vpx_config.h" +#include "vp9/encoder/vp9_block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct encode_b_args { + MACROBLOCK *x; + int enable_trellis_opt; + double trellis_opt_thresh; + int *sse_calc_done; + int64_t *sse; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + int8_t *skip; +#if CONFIG_MISMATCH_DEBUG + int mi_row; + int mi_col; + int output_enabled; +#endif +}; +int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + int ctx); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); + +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); + +void vp9_encode_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, + int enable_trellis_opt); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c new file mode 100644 index 0000000000..023d087c2c --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropymode.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encodemv.h" + +#include "vpx_dsp/vpx_dsp_common.h" + +static struct vp9_token mv_joint_encodings[MV_JOINTS]; +static struct vp9_token mv_class_encodings[MV_CLASSES]; +static struct vp9_token mv_fp_encodings[MV_FP_SIZE]; + +void vp9_entropy_mv_init(void) { + vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree); + vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree); + vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree); +} + +static void encode_mv_component(vpx_writer *w, int comp, + const nmv_component *mvcomp, int usehp) { + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = vp9_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + assert(comp != 0); + + // Sign + vpx_write(w, sign, mvcomp->sign); + + // Class + vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes, + &mv_class_encodings[mv_class]); + + // Integer bits + if (mv_class == MV_CLASS_0) { + vpx_write(w, d, mvcomp->class0[0]); + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (i = 0; i < n; ++i) vpx_write(w, (d >> i) & 1, mvcomp->bits[i]); + } + + // Fractional bits + vp9_write_token(w, vp9_mv_fp_tree, + mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp, + &mv_fp_encodings[fr]); + + // High precision bit + if (usehp) + vpx_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp); +} + +static void build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + int usehp) { + int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; + int bits_cost[MV_OFFSET_BITS][2]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; + int class0_hp_cost[2], hp_cost[2]; + int i; + int c, o; + + sign_cost[0] = vp9_cost_zero(mvcomp->sign); + sign_cost[1] = vp9_cost_one(mvcomp->sign); + vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree); + vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree); + for (i = 0; i < MV_OFFSET_BITS; ++i) { + bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]); + bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]); + } + + for (i = 0; i < CLASS0_SIZE; ++i) + vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree); + vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree); + + // Always build the hp costs to avoid an uninitialized warning from gcc + class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp); + class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp); + hp_cost[0] = vp9_cost_zero(mvcomp->hp); + hp_cost[1] = vp9_cost_one(mvcomp->hp); + + mvcost[0] = 0; + // MV_CLASS_0 + for (o = 0; o < (CLASS0_SIZE << 3); ++o) { + int d, e, f; + int cost = class_cost[MV_CLASS_0]; + int v = o + 1; + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + cost += class0_cost[d]; + cost += class0_fp_cost[d][f]; + if (usehp) { + e = (o & 1); /* high precision mv data */ + cost += class0_hp_cost[e]; + } + mvcost[v] = cost + sign_cost[0]; + mvcost[-v] = cost + sign_cost[1]; + } + for (c = MV_CLASS_1; c < MV_CLASSES; ++c) { + int d; + for (d = 0; d < (1 << c); ++d) { + int f; + int whole_cost = class_cost[c]; + int b = c + CLASS0_BITS - 1; /* number of bits */ + for (i = 0; i < b; ++i) whole_cost += bits_cost[i][((d >> i) & 1)]; + for (f = 0; f < 4; ++f) { + int cost = whole_cost + fp_cost[f]; + int v = (CLASS0_SIZE << (c + 2)) + d * 8 + f * 2 /* + e */ + 1; + if (usehp) { + mvcost[v] = cost + hp_cost[0] + sign_cost[0]; + mvcost[-v] = cost + hp_cost[0] + sign_cost[1]; + if (v + 1 > MV_MAX) break; + mvcost[v + 1] = cost + hp_cost[1] + sign_cost[0]; + mvcost[-v - 1] = cost + hp_cost[1] + sign_cost[1]; + } else { + mvcost[v] = cost + sign_cost[0]; + mvcost[-v] = cost + sign_cost[1]; + if (v + 1 > MV_MAX) break; + mvcost[v + 1] = cost + sign_cost[0]; + mvcost[-v - 1] = cost + sign_cost[1]; + } + } + } + } +} + +static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p, + vpx_prob upd_p) { + const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1; + const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) > + cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + + (7 << VP9_PROB_COST_SHIFT); + vpx_write(w, update, upd_p); + if (update) { + *cur_p = new_p; + vpx_write_literal(w, new_p >> 1, 7); + } + return update; +} + +static void write_mv_update(const vpx_tree_index *tree, + vpx_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], int n, + vpx_writer *w) { + int i; + unsigned int branch_ct[32][2]; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + + vp9_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) + update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB); +} + +void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, + nmv_context_counts *const counts) { + int i, j; + nmv_context *const mvc = &cm->fc->nmvc; + + write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w); + + for (i = 0; i < 2; ++i) { + nmv_component *comp = &mvc->comps[i]; + nmv_component_counts *comp_counts = &counts->comps[i]; + + update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); + write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes, + MV_CLASSES, w); + write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0, + CLASS0_SIZE, w); + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); + } + + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) + write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j], + counts->comps[i].class0_fp[j], MV_FP_SIZE, w); + + write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, + MV_FP_SIZE, w); + } + + if (usehp) { + for (i = 0; i < 2; ++i) { + update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, + MV_UPDATE_PROB); + update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); + } + } +} + +void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); + usehp = usehp && use_mv_hp(ref); + + vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv.auto_mv_step_size) { + const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; + *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude); + } +} + +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, int usehp) { + vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); +} + +static void inc_mvs(const MODE_INFO *mi, const MB_MODE_INFO_EXT *mbmi_ext, + const int_mv mvs[2], nmv_context_counts *counts) { + int i; + + for (i = 0; i < 1 + has_second_ref(mi); ++i) { + const MV *ref = &mbmi_ext->ref_mvs[mi->ref_frame[i]][0].as_mv; + const MV diff = { mvs[i].as_mv.row - ref->row, + mvs[i].as_mv.col - ref->col }; + vp9_inc_mv(&diff, counts); + } +} + +void vp9_update_mv_count(ThreadData *td) { + const MACROBLOCKD *xd = &td->mb.e_mbd; + const MODE_INFO *mi = xd->mi[0]; + const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext; + + if (mi->sb_type < BLOCK_8X8) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[mi->sb_type]; + const int num_4x4_h = num_4x4_blocks_high_lookup[mi->sb_type]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int i = idy * 2 + idx; + if (mi->bmi[i].as_mode == NEWMV) + inc_mvs(mi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv); + } + } + } else { + if (mi->mode == NEWMV) inc_mvs(mi, mbmi_ext, mi->mv, &td->counts->mv); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h new file mode 100644 index 0000000000..2f1be4b233 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_ + +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_entropy_mv_init(void); + +void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, + nmv_context_counts *const counts); + +void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude); + +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, int usehp); + +void vp9_update_mv_count(ThreadData *td); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c new file mode 100644 index 0000000000..fd213f1e6b --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -0,0 +1,7074 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/psnr.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_INTERNAL_STATS +#include "vpx_dsp/ssim.h" +#endif +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" +#include "vpx_ports/vpx_timer.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_idct.h" +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scale.h" +#include "vp9/common/vp9_tile_common.h" + +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_alt_ref_aq.h" +#include "vp9/encoder/vp9_aq_360.h" +#include "vp9/encoder/vp9_aq_complexity.h" +#endif +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif +#include "vp9/encoder/vp9_bitstream.h" +#if CONFIG_INTERNAL_STATS +#include "vp9/encoder/vp9_blockiness.h" +#endif +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mbgraph.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_mcomp.h" +#endif +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_noise_estimate.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_resize.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_skin_detection.h" +#include "vp9/encoder/vp9_speed_features.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vp9/vp9_cx_iface.h" + +#define AM_SEGMENT_ID_INACTIVE 7 +#define AM_SEGMENT_ID_ACTIVE 0 + +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 + +#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold +#define FRAME_RATE_FACTOR 8 + +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file = NULL; +#endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#endif +#ifdef OUTPUT_YUV_SVC_SRC +FILE *yuv_svc_src[3] = { NULL, NULL, NULL }; +#endif + +#if 0 +FILE *framepsnr; +FILE *kf_list; +FILE *keyfile; +#endif + +#ifdef ENABLE_KF_DENOISE +// Test condition for spatial denoise of source. +static int is_spatial_denoise_enabled(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + + return (oxcf->pass != 1) && !is_lossless_requested(&cpi->oxcf) && + frame_is_intra_only(cm); +} +#endif + +#if !CONFIG_REALTIME_ONLY +// compute adaptive threshold for skip recoding +static int compute_context_model_thresh(const VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_size = (cm->width * cm->height) >> 10; + const int bitrate = (int)(oxcf->target_bandwidth >> 10); + const int qindex_factor = cm->base_qindex + (MAXQ >> 1); + + // This equation makes the threshold adaptive to frame size. + // Coding gain obtained by recoding comes from alternate frames of large + // content change. We skip recoding if the difference of previous and current + // frame context probability model is less than a certain threshold. + // The first component is the most critical part to guarantee adaptivity. + // Other parameters are estimated based on normal setting of hd resolution + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + const int thresh = + ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * + qindex_factor) >> + 9; + + return thresh; +} + +// compute the total cost difference between current +// and previous frame context prob model. +static int compute_context_model_diff(const VP9_COMMON *const cm) { + const FRAME_CONTEXT *const pre_fc = + &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT *const cur_fc = cm->fc; + const FRAME_COUNTS *counts = &cm->counts; + vpx_prob pre_last_prob, cur_last_prob; + int diff = 0; + int i, j, k, l, m, n; + + // y_mode_prob + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->y_mode[i][j] * + (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->y_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // uv_mode_prob + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->uv_mode[i][j] * + (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->uv_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // partition_prob + for (i = 0; i < PARTITION_CONTEXTS; ++i) { + for (j = 0; j < PARTITION_TYPES - 1; ++j) { + diff += (int)counts->partition[i][j] * + (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2]; + cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2]; + + diff += (int)counts->partition[i][PARTITION_TYPES - 1] * + (pre_last_prob - cur_last_prob); + } + + // coef_probs + for (i = 0; i < TX_SIZES; ++i) { + for (j = 0; j < PLANE_TYPES; ++j) { + for (k = 0; k < REF_TYPES; ++k) { + for (l = 0; l < COEF_BANDS; ++l) { + for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) { + for (n = 0; n < UNCONSTRAINED_NODES; ++n) { + diff += (int)counts->coef[i][j][k][l][m][n] * + (pre_fc->coef_probs[i][j][k][l][m][n] - + cur_fc->coef_probs[i][j][k][l][m][n]); + } + + pre_last_prob = + MAX_PROB - + pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + cur_last_prob = + MAX_PROB - + cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + + diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] * + (pre_last_prob - cur_last_prob); + } + } + } + } + } + + // switchable_interp_prob + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) { + diff += (int)counts->switchable_interp[i][j] * + (pre_fc->switchable_interp_prob[i][j] - + cur_fc->switchable_interp_prob[i][j]); + } + pre_last_prob = + MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + cur_last_prob = + MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + + diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] * + (pre_last_prob - cur_last_prob); + } + + // inter_mode_probs + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (j = 0; j < INTER_MODES - 1; ++j) { + diff += (int)counts->inter_mode[i][j] * + (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2]; + + diff += (int)counts->inter_mode[i][INTER_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // intra_inter_prob + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + diff += (int)counts->intra_inter[i][0] * + (pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i]; + + diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // comp_inter_prob + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + diff += (int)counts->comp_inter[i][0] * + (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i]; + + diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // single_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < 2; ++j) { + diff += (int)counts->single_ref[i][j][0] * + (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]); + + pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j]; + cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j]; + + diff += + (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob); + } + } + + // comp_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + diff += (int)counts->comp_ref[i][0] * + (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i]; + + diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob); + } + + // tx_probs + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // p32x32 + for (j = 0; j < TX_SIZES - 1; ++j) { + diff += (int)counts->tx.p32x32[i][j] * + (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + + diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] * + (pre_last_prob - cur_last_prob); + + // p16x16 + for (j = 0; j < TX_SIZES - 2; ++j) { + diff += (int)counts->tx.p16x16[i][j] * + (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + + diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] * + (pre_last_prob - cur_last_prob); + + // p8x8 + for (j = 0; j < TX_SIZES - 3; ++j) { + diff += (int)counts->tx.p8x8[i][j] * + (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + + diff += + (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob); + } + + // skip_probs + for (i = 0; i < SKIP_CONTEXTS; ++i) { + diff += (int)counts->skip[i][0] * + (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]); + + pre_last_prob = MAX_PROB - pre_fc->skip_probs[i]; + cur_last_prob = MAX_PROB - cur_fc->skip_probs[i]; + + diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob); + } + + // mv + for (i = 0; i < MV_JOINTS - 1; ++i) { + diff += (int)counts->mv.joints[i] * + (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]); + } + pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2]; + cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2]; + + diff += + (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob); + + for (i = 0; i < 2; ++i) { + const nmv_component_counts *nmv_count = &counts->mv.comps[i]; + const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i]; + const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i]; + + // sign + diff += (int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign); + + pre_last_prob = MAX_PROB - pre_nmv_prob->sign; + cur_last_prob = MAX_PROB - cur_nmv_prob->sign; + + diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob); + + // classes + for (j = 0; j < MV_CLASSES - 1; ++j) { + diff += (int)nmv_count->classes[j] * + (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2]; + + diff += (int)nmv_count->classes[MV_CLASSES - 1] * + (pre_last_prob - cur_last_prob); + + // class0 + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + diff += (int)nmv_count->class0[j] * + (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2]; + + diff += (int)nmv_count->class0[CLASS0_SIZE - 1] * + (pre_last_prob - cur_last_prob); + + // bits + for (j = 0; j < MV_OFFSET_BITS; ++j) { + diff += (int)nmv_count->bits[j][0] * + (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]); + + pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j]; + cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j]; + + diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob); + } + + // class0_fp + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < MV_FP_SIZE - 1; ++k) { + diff += (int)nmv_count->class0_fp[j][k] * + (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + + diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] * + (pre_last_prob - cur_last_prob); + } + + // fp + for (j = 0; j < MV_FP_SIZE - 1; ++j) { + diff += + (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2]; + + diff += + (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob); + + // class0_hp + diff += (int)nmv_count->class0_hp[0] * + (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp; + + diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob); + + // hp + diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->hp; + + diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob); + } + + return -diff; +} +#endif // !CONFIG_REALTIME_ONLY + +// Test for whether to calculate metrics for the frame. +static int is_psnr_calc_enabled(const VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + + return cpi->b_calculate_psnr && (oxcf->pass != 1) && cm->show_frame; +} + +/* clang-format off */ +const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, + // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when + // they are finalized (currently tentative). + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, +}; +/* clang-format on */ + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." +}; + +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { + switch (mode) { + case VP8E_NORMAL: + *hr = 1; + *hs = 1; + break; + case VP8E_FOURFIVE: + *hr = 4; + *hs = 5; + break; + case VP8E_THREEFIVE: + *hr = 3; + *hs = 5; + break; + default: + assert(mode == VP8E_ONETWO); + *hr = 1; + *hs = 2; + break; + } +} + +// Mark all inactive blocks as active. Other segmentation features may be set +// so memset cannot be used, instead only inactive blocks should be reset. +static void suppress_active_map(VP9_COMP *cpi) { + unsigned char *const seg_map = cpi->segmentation_map; + + if (cpi->active_map.enabled || cpi->active_map.update) { + const int rows = cpi->common.mi_rows; + const int cols = cpi->common.mi_cols; + int i; + + for (i = 0; i < rows * cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) + seg_map[i] = AM_SEGMENT_ID_ACTIVE; + } +} + +static void apply_active_map(VP9_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->segmentation_map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + vp9_enable_segmentation(seg); + vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); + // Setting the data to -MAX_LOOP_FILTER will result in the computed loop + // filter level being zero regardless of the value of seg->abs_delta. + vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF, + -MAX_LOOP_FILTER); + } else { + vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + + // TODO(jianj): Investigate why ROI not working in speed < 5 or in non + // realtime mode. + if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method; + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { + valid_ref = 0; + } + // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are + // same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + +static void init_level_info(Vp9LevelInfo *level_info) { + Vp9LevelStats *const level_stats = &level_info->level_stats; + Vp9LevelSpec *const level_spec = &level_info->level_spec; + + memset(level_stats, 0, sizeof(*level_stats)); + memset(level_spec, 0, sizeof(*level_spec)); + level_spec->level = LEVEL_UNKNOWN; + level_spec->min_altref_distance = INT_MAX; +} + +static int check_seg_range(int seg_data[8], int range) { + int i; + for (i = 0; i < 8; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (seg_data[i] > range || seg_data[i] < -range) { + return 0; + } + } + return 1; +} + +VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { + int i; + const Vp9LevelSpec *this_level; + + vpx_clear_system_state(); + + for (i = 0; i < VP9_LEVELS; ++i) { + this_level = &vp9_level_defs[i]; + if ((double)level_spec->max_luma_sample_rate > + (double)this_level->max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P) || + level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || + level_spec->average_bitrate > this_level->average_bitrate || + level_spec->max_cpb_size > this_level->max_cpb_size || + level_spec->compression_ratio < this_level->compression_ratio || + level_spec->max_col_tiles > this_level->max_col_tiles || + level_spec->min_altref_distance < this_level->min_altref_distance || + level_spec->max_ref_frame_buffers > this_level->max_ref_frame_buffers) + continue; + break; + } + return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; +} + +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check number of rows and columns match + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return VPX_CODEC_INVALID_PARAM; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return VPX_CODEC_INVALID_PARAM; + + // Also disable segmentation if no deltas are specified. + if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return VPX_CODEC_OK; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; + + // Copy to ROI structure in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return VPX_CODEC_OK; +} + +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { + unsigned char *const active_map_8x8 = cpi->active_map.map; + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; + cpi->active_map.update = 1; + if (new_map_16x16) { + int r, c; + for (r = 0; r < mi_rows; ++r) { + for (c = 0; c < mi_cols; ++c) { + active_map_8x8[r * mi_cols + c] = + new_map_16x16[(r >> 1) * cols + (c >> 1)] + ? AM_SEGMENT_ID_ACTIVE + : AM_SEGMENT_ID_INACTIVE; + } + } + cpi->active_map.enabled = 1; + } else { + cpi->active_map.enabled = 0; + } + return 0; + } else { + return -1; + } +} + +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols && + new_map_16x16) { + unsigned char *const seg_map_8x8 = cpi->segmentation_map; + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; + memset(new_map_16x16, !cpi->active_map.enabled, rows * cols); + if (cpi->active_map.enabled) { + int r, c; + for (r = 0; r < mi_rows; ++r) { + for (c = 0; c < mi_cols; ++c) { + // Cyclic refresh segments are considered active despite not having + // AM_SEGMENT_ID_ACTIVE + new_map_16x16[(r >> 1) * cols + (c >> 1)] |= + seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE; + } + } + } + return 0; + } else { + return -1; + } +} + +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) { + MACROBLOCK *const mb = &cpi->td.mb; + cpi->common.allow_high_precision_mv = allow_high_precision_mv; + if (cpi->common.allow_high_precision_mv) { + mb->mvcost = mb->nmvcost_hp; + mb->mvsadcost = mb->nmvsadcost_hp; + } else { + mb->mvcost = mb->nmvcost; + mb->mvsadcost = mb->nmvsadcost; + } +} + +static void setup_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + vp9_setup_past_independence(cm); + } else { + if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame; + } + + // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF + // case. Need some further investigation on if we could apply this to single + // layer ARF case as well. + if (cpi->multi_layer_arf && !cpi->use_svc) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const int gf_group_index = gf_group->index; + const int boost_frame = + !cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + + // frame_context_idx Frame Type + // 0 Intra only frame, base layer ARF + // 1 ARFs with layer depth = 2,3 + // 2 ARFs with layer depth > 3 + // 3 Non-boosted frames + if (frame_is_intra_only(cm)) { + cm->frame_context_idx = 0; + } else if (boost_frame) { + if (gf_group->rf_level[gf_group_index] == GF_ARF_STD) + cm->frame_context_idx = 0; + else if (gf_group->layer_depth[gf_group_index] <= 3) + cm->frame_context_idx = 1; + else + cm->frame_context_idx = 2; + } else { + cm->frame_context_idx = 3; + } + } + + if (cm->frame_type == KEY_FRAME) { + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + vp9_zero(cpi->interp_filter_selected); + } else { + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + vp9_zero(cpi->interp_filter_selected[0]); + } +} + +static void vp9_enc_setup_mi(VP9_COMMON *cm) { + int i; + cm->mi = cm->mip + cm->mi_stride + 1; + memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + // Clear top border row + memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride); + // Clear left border column + for (i = 1; i < cm->mi_rows + 1; ++i) + memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip)); + + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + + memset(cm->mi_grid_base, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); +} + +static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { + cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); + if (!cm->mip) return 1; + cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip)); + if (!cm->prev_mip) return 1; + cm->mi_alloc_size = mi_size; + + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (!cm->mi_grid_base) return 1; + cm->prev_mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (!cm->prev_mi_grid_base) return 1; + + return 0; +} + +static void vp9_enc_free_mi(VP9_COMMON *cm) { + vpx_free(cm->mip); + cm->mip = NULL; + vpx_free(cm->prev_mip); + cm->prev_mip = NULL; + vpx_free(cm->mi_grid_base); + cm->mi_grid_base = NULL; + vpx_free(cm->prev_mi_grid_base); + cm->prev_mi_grid_base = NULL; + cm->mi_alloc_size = 0; +} + +static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO **temp_base = cm->prev_mi_grid_base; + MODE_INFO *temp = cm->prev_mip; + + // Skip update prev_mi frame in show_existing_frame mode. + if (cm->show_existing_frame) return; + + cm->prev_mip = cm->mip; + cm->mip = temp; + + // Update the upper left visible macroblock ptrs. + cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp_base; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; +} + +static void initialize_enc(void) { + vp9_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + vp9_init_intra_predictors(); + vp9_init_me_luts(); + vp9_rc_init_minq_luts(); + vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY + vp9_temporal_filter_init(); +#endif +} + +void vp9_initialize_enc(void) { once(initialize_enc); } + +static void dealloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int i; + + vpx_free(cpi->mbmi_ext_base); + cpi->mbmi_ext_base = NULL; + + vpx_free(cpi->tile_data); + cpi->tile_data = NULL; + + vpx_free(cpi->segmentation_map); + cpi->segmentation_map = NULL; + vpx_free(cpi->coding_context.last_frame_seg_map_copy); + cpi->coding_context.last_frame_seg_map_copy = NULL; + + vpx_free(cpi->nmvcosts[0]); + vpx_free(cpi->nmvcosts[1]); + cpi->nmvcosts[0] = NULL; + cpi->nmvcosts[1] = NULL; + + vpx_free(cpi->nmvcosts_hp[0]); + vpx_free(cpi->nmvcosts_hp[1]); + cpi->nmvcosts_hp[0] = NULL; + cpi->nmvcosts_hp[1] = NULL; + + vpx_free(cpi->nmvsadcosts[0]); + vpx_free(cpi->nmvsadcosts[1]); + cpi->nmvsadcosts[0] = NULL; + cpi->nmvsadcosts[1] = NULL; + + vpx_free(cpi->nmvsadcosts_hp[0]); + vpx_free(cpi->nmvsadcosts_hp[1]); + cpi->nmvsadcosts_hp[0] = NULL; + cpi->nmvsadcosts_hp[1] = NULL; + + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; + + vpx_free(cpi->content_state_sb_fd); + cpi->content_state_sb_fd = NULL; + + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + + vp9_cyclic_refresh_free(cpi->cyclic_refresh); + cpi->cyclic_refresh = NULL; + + vpx_free(cpi->active_map.map); + cpi->active_map.map = NULL; + + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + + vpx_free(cpi->consec_zero_mv); + cpi->consec_zero_mv = NULL; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + vpx_free(cpi->mi_ssim_rdmult_scaling_factors); + cpi->mi_ssim_rdmult_scaling_factors = NULL; + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + free_partition_info(cpi); + free_motion_vector_info(cpi); + free_fp_motion_vector_info(cpi); + free_tpl_stats_info(cpi); + } +#endif + + vp9_free_ref_frame_buffers(cm->buffer_pool); +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif + vp9_free_context_buffers(cm); + + vpx_free_frame_buffer(&cpi->last_frame_uf); + vpx_free_frame_buffer(&cpi->scaled_source); + vpx_free_frame_buffer(&cpi->scaled_last_source); + vpx_free_frame_buffer(&cpi->alt_ref_buffer); +#ifdef ENABLE_KF_DENOISE + vpx_free_frame_buffer(&cpi->raw_unscaled_source); + vpx_free_frame_buffer(&cpi->raw_scaled_source); +#endif + + vp9_lookahead_destroy(cpi->lookahead); + + vpx_free(cpi->tile_tok[0][0]); + cpi->tile_tok[0][0] = 0; + + vpx_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + + vp9_free_pc_tree(&cpi->td); + + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i]; + vpx_free(lc->rc_twopass_stats_in.buf); + lc->rc_twopass_stats_in.buf = NULL; + lc->rc_twopass_stats_in.sz = 0; + } + + if (cpi->source_diff_var != NULL) { + vpx_free(cpi->source_diff_var); + cpi->source_diff_var = NULL; + } + + for (i = 0; i < MAX_LAG_BUFFERS; ++i) { + vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]); + } + memset(&cpi->svc.scaled_frames[0], 0, + MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0])); + + vpx_free_frame_buffer(&cpi->svc.scaled_temp); + memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp)); + + vpx_free_frame_buffer(&cpi->svc.empty_frame.img); + memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame)); + + vp9_free_svc_cyclic_refresh(cpi); +} + +static void save_coding_context(VP9_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + VP9_COMMON *cm = &cpi->common; + + // Stores a snapshot of key state variables which can subsequently be + // restored with a call to vp9_restore_coding_context. These functions are + // intended for use in a re-code loop in vp9_compress_frame where the + // quantizer value is adjusted between loop iterations. + vp9_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost); + + memcpy(cc->nmvcosts[0], cpi->nmvcosts[0], + MV_VALS * sizeof(*cpi->nmvcosts[0])); + memcpy(cc->nmvcosts[1], cpi->nmvcosts[1], + MV_VALS * sizeof(*cpi->nmvcosts[1])); + memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0], + MV_VALS * sizeof(*cpi->nmvcosts_hp[0])); + memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1], + MV_VALS * sizeof(*cpi->nmvcosts_hp[1])); + + vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); + + memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, + (cm->mi_rows * cm->mi_cols)); + + vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); + + cc->fc = *cm->fc; +} + +static void restore_coding_context(VP9_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + VP9_COMMON *cm = &cpi->common; + + // Restore key state variables to the snapshot state stored in the + // previous call to vp9_save_coding_context. + vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost); + + memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0])); + memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1])); + memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0], + MV_VALS * sizeof(*cc->nmvcosts_hp[0])); + memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1], + MV_VALS * sizeof(*cc->nmvcosts_hp[1])); + + vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); + + memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy, + (cm->mi_rows * cm->mi_cols)); + + vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); + + *cm->fc = cc->fc; +} + +#if !CONFIG_REALTIME_ONLY +static void configure_static_seg_features(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + int high_q = (int)(rc->avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation + vp9_disable_segmentation(seg); + + // Clear down the segment features. + vp9_clearall_segfeatures(seg); + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation and individual segment features by default + vp9_disable_segmentation(seg); + vp9_clearall_segfeatures(seg); + + // Scan frames from current to arf frame. + // This function re-enables segmentation if appropriate. + vp9_update_mbgraph_stats(cpi); + + // If segmentation was enabled set those features needed for the + // arf itself. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = + vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Where relevant assume segment data is delta data + seg->abs_delta = SEGMENT_DELTADATA; + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + if (rc->source_alt_ref_active) { + seg->update_map = 0; + seg->update_data = 1; + seg->abs_delta = SEGMENT_DELTADATA; + + qi_delta = + vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Segment coding disabled for compred testing + if (high_q || (cpi->static_mb_pct == 100)) { + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + } else { + // Disable segmentation and clear down features if alt ref + // is not active for this group + + vp9_disable_segmentation(seg); + + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + vp9_clearall_segfeatures(seg); + } + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. + seg->update_map = 0; + seg->update_data = 0; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void update_reference_segmentation_map(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->last_frame_seg_map; + int row, col; + + for (row = 0; row < cm->mi_rows; row++) { + MODE_INFO **mi_8x8 = mi_8x8_ptr; + uint8_t *cache = cache_ptr; + for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) + cache[0] = mi_8x8[0]->segment_id; + mi_8x8_ptr += cm->mi_stride; + cache_ptr += cm->mi_cols; + } +} + +static void alloc_raw_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + + if (!cpi->lookahead) + cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + oxcf->lag_in_frames); + if (!cpi->lookahead) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); + + // TODO(agrange) Check if ARF is enabled and skip allocation if not. + if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); +} + +static void alloc_util_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (vpx_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + if (vpx_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate + // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a + // target of 1/4x1/4. number_spatial_layers must be greater than 2. + if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { + cpi->svc.scaled_temp_is_alloc = 1; + if (vpx_realloc_frame_buffer( + &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled_frame for svc "); + } + + if (vpx_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled last source buffer"); +#ifdef ENABLE_KF_DENOISE + if (vpx_realloc_frame_buffer(&cpi->raw_unscaled_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate unscaled raw source frame buffer"); + + if (vpx_realloc_frame_buffer(&cpi->raw_scaled_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled raw source frame buffer"); +#endif +} + +static void alloc_context_buffers_ext(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int mi_size = cm->mi_cols * cm->mi_rows; + + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, + vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); +} + +static void alloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int sb_rows; + + if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + alloc_context_buffers_ext(cpi); + + vpx_free(cpi->tile_tok[0][0]); + + { + unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], + vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); + } + + sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + vpx_free(cpi->tplist[0][0]); + CHECK_MEM_ERROR( + &cm->error, cpi->tplist[0][0], + vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); + + vp9_setup_pc_tree(&cpi->common, &cpi->td); +} + +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + vp9_rc_update_framerate(cpi); +} + +static void set_tile_limits(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + + int min_log2_tile_cols, max_log2_tile_cols; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } +} + +static void update_frame_size(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + vp9_set_mb_mi(cm, cm->width, cm->height); + vp9_init_context_buffers(cm); + vp9_init_macroblockd(cm, xd, NULL); + cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base; + memset(cpi->mbmi_ext_base, 0, + cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); + + set_tile_limits(cpi); +} + +static void init_buffer_indices(VP9_COMP *cpi) { + int ref_frame; + + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + cpi->ref_fb_idx[ref_frame] = ref_frame; + + cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1]; + cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; +} + +static void init_level_constraint(LevelConstraint *lc) { + lc->level_index = -1; + lc->max_cpb_size = INT_MAX; + lc->max_frame_size = INT_MAX; + lc->fail_flag = 0; +} + +static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { + vpx_clear_system_state(); + ls->level_index = level_index; + if (level_index >= 0) { + ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000; + } +} + +static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { + VP9_COMMON *const cm = &cpi->common; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; + cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = oxcf->use_highbitdepth; +#endif + cm->color_space = oxcf->color_space; + cm->color_range = oxcf->color_range; + + cpi->target_level = oxcf->target_level; + cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); + + cm->width = oxcf->width; + cm->height = oxcf->height; + alloc_compressor_data(cpi); + + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + + // Single thread case: use counts in common. + cpi->td.counts = &cm->counts; + + // Spatial scalability. + cpi->svc.number_spatial_layers = oxcf->ss_number_layers; + // Temporal scalability. + cpi->svc.number_temporal_layers = oxcf->ts_number_layers; + + if ((cpi->svc.number_temporal_layers > 1) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass != 1)) { + vp9_init_layer_context(cpi); + } + + // change includes all joint functionality + vp9_change_config(cpi, oxcf); + + cpi->static_mb_pct = 0; + cpi->ref_frame_flags = 0; + + init_buffer_indices(cpi); + + vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); + cpi->fixed_qp_onepass = 0; +} + +void vp9_check_reset_rc_flag(VP9_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + + if (cpi->common.current_video_frame > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } +} + +void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + + const int64_t bandwidth = oxcf->target_bandwidth; + const int64_t starting = oxcf->starting_buffer_level_ms; + const int64_t optimal = oxcf->optimal_buffer_level_ms; + const int64_t maximum = oxcf->maximum_buffer_size_ms; + + rc->starting_buffer_level = starting * bandwidth / 1000; + rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size); +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) + +static void highbd_set_var_fns(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) + break; + + case VPX_BITS_10: + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) + break; + + default: + assert(cm->bit_depth == VPX_BITS_12); + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) + break; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void realloc_segmentation_maps(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + + // Create the encoder segmentation map and set all entries to 0 + vpx_free(cpi->segmentation_map); + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + + // Create a map used for cyclic background refresh. + if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, + vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); + + // Create a map used to mark inactive areas. + vpx_free(cpi->active_map.map); + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + + // And a place holder structure is the coding context + // for use if we want to save and restore it + vpx_free(cpi->coding_context.last_frame_seg_map_copy); + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); +} + +static void alloc_copy_partition_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->prev_partition == NULL) { + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->prev_partition))); + } + if (cpi->prev_segment_id == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->prev_segment_id, + (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->prev_segment_id))); + } + if (cpi->prev_variance_low == NULL) { + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, + sizeof(*cpi->prev_variance_low))); + } + if (cpi->copied_frame_cnt == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->copied_frame_cnt, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->copied_frame_cnt))); + } +} + +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + +void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int last_w = cpi->oxcf.width; + int last_h = cpi->oxcf.height; + + vp9_init_quantizer(cpi); + if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; + cm->color_space = oxcf->color_space; + cm->color_range = oxcf->color_range; + + cpi->target_level = oxcf->target_level; + cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); + + if (cm->profile <= PROFILE_1) + assert(cm->bit_depth == VPX_BITS_8); + else + assert(cm->bit_depth > VPX_BITS_8); + + cpi->oxcf = *oxcf; +#if CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.e_mbd.bd = (int)cm->bit_depth; +#endif // CONFIG_VP9_HIGHBITDEPTH + + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->baseline_gf_interval = FIXED_GF_INTERVAL; + } else { + rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + } + + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; + cm->refresh_frame_context = 1; + cm->reset_frame_context = 0; + + vp9_reset_segment_features(&cm->seg); + vp9_set_high_precision_mv(cpi, 0); + + { + int i; + + for (i = 0; i < MAX_SEGMENTS; i++) + cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; + } + cpi->encode_breakout = cpi->oxcf.encode_breakout; + + vp9_set_rc_buffer_sizes(cpi); + + // Set up frame rate and related parameters rate control values. + vp9_new_framerate(cpi, cpi->framerate); + + // Set absolute upper and lower quality limits + rc->worst_quality = cpi->oxcf.worst_allowed_q; + rc->best_quality = cpi->oxcf.best_allowed_q; + + cm->interp_filter = cpi->sf.default_interp_filter; + + if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { + cm->render_width = cpi->oxcf.render_width; + cm->render_height = cpi->oxcf.render_height; + } else { + cm->render_width = cpi->oxcf.width; + cm->render_height = cpi->oxcf.height; + } + if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { + cm->width = cpi->oxcf.width; + cm->height = cpi->oxcf.height; + cpi->external_resize = 1; + } + + if (cpi->initial_width) { + int new_mi_size = 0; + vp9_set_mb_mi(cm, cm->width, cm->height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + cpi->external_resize = 0; + } else if (cm->mi_alloc_size == new_mi_size && + (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); + } + } + } + + if (cm->current_video_frame == 0 || last_w != cpi->oxcf.width || + last_h != cpi->oxcf.height) + update_frame_size(cpi); + + if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + &cm->error, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + } + + if ((cpi->svc.number_temporal_layers > 1) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass != 1)) { + vp9_update_layer_context_change_config(cpi, + (int)cpi->oxcf.target_bandwidth); + } + + vp9_check_reset_rc_flag(cpi); + + cpi->alt_ref_source = NULL; + rc->is_src_frame_alt_ref = 0; + +#if 0 + // Experimental RD Code + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif + + set_tile_limits(cpi); + + cpi->ext_refresh_frame_flags_pending = 0; + cpi->ext_refresh_frame_context_pending = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + highbd_set_var_fns(cpi); +#endif + + vp9_set_row_mt(cpi); +} + +/*********************************************************************** + * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * + *********************************************************************** + * The following 2 functions ('cal_nmvjointsadcost' and * + * 'cal_nmvsadcosts') are used to calculate cost lookup tables * + * used by 'vp9_diamond_search_sad'. The C implementation of the * + * function is generic, but the NEON intrinsics optimised version * + * relies on the following properties of the computed tables: * + * For cal_nmvjointsadcost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For cal_nmvsadcosts: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then the NEON optimised version of the * + * 'vp9_diamond_search_sad' function cannot be used as it is, in which * + * case you can revert to using the C function instead. * + ***********************************************************************/ + +static void cal_nmvjointsadcost(int *mvjointsadcost) { + /********************************************************************* + * Warning: Read the comments above before modifying this function * + *********************************************************************/ + mvjointsadcost[0] = 600; + mvjointsadcost[1] = 300; + mvjointsadcost[2] = 300; + mvjointsadcost[3] = 300; +} + +static void cal_nmvsadcosts(int *mvsadcost[2]) { + /********************************************************************* + * Warning: Read the comments above before modifying this function * + *********************************************************************/ + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + +static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + +static void init_ref_frame_bufs(VP9_COMMON *cm) { + int i; + BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { + pool->frame_bufs[i].ref_count = 0; + } +} + +static void update_initial_width(VP9_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + VP9_COMMON *const cm = &cpi->common; +#if !CONFIG_VP9_HIGHBITDEPTH + (void)use_highbitdepth; + assert(use_highbitdepth == 0); +#endif + + if (!cpi->initial_width || +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth != use_highbitdepth || +#endif + cm->subsampling_x != subsampling_x || + cm->subsampling_y != subsampling_y) { + cm->subsampling_x = subsampling_x; + cm->subsampling_y = subsampling_y; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = use_highbitdepth; +#endif + alloc_util_frame_buffers(cpi); + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + cpi->initial_mbs = cm->MBs; + } +} + +// TODO(angiebird): Check whether we can move this function to vpx_image.c +static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt, + unsigned int *subsampling_x, + unsigned int *subsampling_y) { + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: *subsampling_x = 1; break; + default: *subsampling_x = 0; break; + } + + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I440: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I44016: *subsampling_y = 1; break; + default: *subsampling_y = 0; break; + } +} + +// TODO(angiebird): Check whether we can move this function to vpx_image.c +static INLINE int vpx_img_use_highbitdepth(vpx_img_fmt_t fmt) { + return fmt & VPX_IMG_FMT_HIGHBITDEPTH; +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +static void setup_denoiser_buffer(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } +} +#endif + +void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) { + const VP9EncoderConfig *oxcf = &cpi->oxcf; + unsigned int subsampling_x, subsampling_y; + const int use_highbitdepth = vpx_img_use_highbitdepth(img_fmt); + vpx_img_chroma_subsampling(img_fmt, &subsampling_x, &subsampling_y); + + update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + + assert(cpi->lookahead == NULL); + cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, subsampling_x, + subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + oxcf->lag_in_frames); + alloc_raw_frame_buffers(cpi); +} + +VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, + BufferPool *const pool) { + unsigned int i; + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); + VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; + + if (!cm) return NULL; + + vp9_zero(*cpi); + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + vp9_remove_compressor(cpi); + return 0; + } + + cm->error.setjmp = 1; + cm->alloc_mi = vp9_enc_alloc_mi; + cm->free_mi = vp9_enc_free_mi; + cm->setup_mi = vp9_enc_setup_mi; + + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + &cm->error, cm->frame_contexts, + (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); + + cpi->compute_frame_low_motion_onepass = 1; + cpi->use_svc = 0; + cpi->resize_state = ORIG; + cpi->external_resize = 0; + cpi->resize_avg_qp = 0; + cpi->resize_buffer_underflow = 0; + cpi->use_skin_detection = 0; + cpi->common.buffer_pool = pool; + init_ref_frame_bufs(cm); + + cpi->force_update_segmentation = 0; + + init_config(cpi, oxcf); + cpi->frame_info = vp9_get_frame_info(oxcf); + + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + vp9_init_rd_parameters(cpi); + + init_frame_indexes(cm); + cpi->tile_data = NULL; + + realloc_segmentation_maps(cpi); + + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); + +#if !CONFIG_REALTIME_ONLY + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); +#endif + + CHECK_MEM_ERROR( + &cm->error, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1], + vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); + + for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); + i++) { + CHECK_MEM_ERROR( + &cm->error, cpi->mbgraph_stats[i].mb_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); + } + + cpi->refresh_alt_ref_frame = 0; + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; + + init_level_info(&cpi->level_info); + init_level_constraint(&cpi->level_constraint); + +#if CONFIG_INTERNAL_STATS + cpi->b_calculate_blockiness = 1; + cpi->b_calculate_consistency = 1; + cpi->total_inconsistency = 0; + cpi->psnr.worst = 100.0; + cpi->worst_ssim = 100.0; + + cpi->count = 0; + cpi->bytes = 0; + + if (cpi->b_calculate_psnr) { + cpi->total_sq_error = 0; + cpi->total_samples = 0; + + cpi->totalp_sq_error = 0; + cpi->totalp_samples = 0; + + cpi->tot_recode_hits = 0; + cpi->summed_quality = 0; + cpi->summed_weights = 0; + cpi->summedp_quality = 0; + cpi->summedp_weights = 0; + } + + cpi->fastssim.worst = 100.0; + + cpi->psnrhvs.worst = 100.0; + + if (cpi->b_calculate_blockiness) { + cpi->total_blockiness = 0; + cpi->worst_blockiness = 0.0; + } + + if (cpi->b_calculate_consistency) { + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->ssim_vars) * 4)); + cpi->worst_consistency = 100.0; + } else { + cpi->ssim_vars = NULL; + } + +#endif + + cpi->first_time_stamp_ever = INT64_MAX; + + /********************************************************************* + * Warning: Read the comments around 'cal_nmvjointsadcost' and * + * 'cal_nmvsadcosts' before modifying how these tables are computed. * + *********************************************************************/ + cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost); + cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; + cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; + cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; + cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; + cal_nmvsadcosts(cpi->td.mb.nmvsadcost); + + cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; + cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX]; + cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; + cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; + cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp); + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "ab"); +#endif +#endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_SVC_SRC + yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb"); + yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb"); + yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb"); +#endif + +#if 0 + framepsnr = fopen("framepsnr.stt", "a"); + kf_list = fopen("kf_list.stt", "w"); +#endif + + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + + { + vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed"); + } + } + +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass == 1) { + vp9_init_first_pass(cpi); + } else if (oxcf->pass == 2) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); + + if (cpi->svc.number_spatial_layers > 1 || + cpi->svc.number_temporal_layers > 1) { + FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; + FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; + int n; + + for (n = 0; n < oxcf->ss_number_layers; ++n) { + FIRSTPASS_STATS *const last_packet_for_layer = + &stats[packets - oxcf->ss_number_layers + n]; + const int layer_id = (int)last_packet_for_layer->spatial_layer_id; + const int packets_in_layer = (int)last_packet_for_layer->count + 1; + if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { + int num_frames; + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id]; + + vpx_free(lc->rc_twopass_stats_in.buf); + + lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, + vpx_malloc(lc->rc_twopass_stats_in.sz)); + lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; + lc->twopass.stats_in = lc->twopass.stats_in_start; + lc->twopass.stats_in_end = + lc->twopass.stats_in_start + packets_in_layer - 1; + // Note the last packet is cumulative first pass stats. + // So the number of frames is packet number minus one + num_frames = packets_in_layer - 1; + fps_init_first_pass_info(&lc->twopass.first_pass_info, + lc->rc_twopass_stats_in.buf, num_frames); + stats_copy[layer_id] = lc->rc_twopass_stats_in.buf; + } + } + + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; + if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && + stats_copy[layer_id] != NULL) { + *stats_copy[layer_id] = stats[n]; + ++stats_copy[layer_id]; + } + } + + vp9_init_second_pass_spatial_svc(cpi); + } else { + int num_frames; + + cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; + cpi->twopass.stats_in = cpi->twopass.stats_in_start; + cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; + // Note the last packet is cumulative first pass stats. + // So the number of frames is packet number minus one + num_frames = packets - 1; + fps_init_first_pass_info(&cpi->twopass.first_pass_info, + oxcf->two_pass_stats_in.buf, num_frames); + + vp9_init_second_pass(cpi); + } + } +#endif // !CONFIG_REALTIME_ONLY + + cpi->mb_wiener_var_cols = 0; + cpi->mb_wiener_var_rows = 0; + cpi->mb_wiener_variance = NULL; + + vp9_set_speed_features_framesize_independent(cpi, oxcf->speed); + vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + { + const int bsize = BLOCK_16X16; + const int w = num_8x8_blocks_wide_lookup[bsize]; + const int h = num_8x8_blocks_high_lookup[bsize]; + const int num_cols = (cm->mi_cols + w - 1) / w; + const int num_rows = (cm->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, + vpx_calloc(num_rows * num_cols, + sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); + } + + cpi->kmeans_data_arr_alloc = 0; +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 0; +#endif // CONFIG_NON_GREEDY_MV + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + } + + // Allocate memory to store variances for a frame. + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var))); + cpi->source_var_thresh = 0; + cpi->frames_till_next_var_check = 0; +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) + +#if CONFIG_VP9_HIGHBITDEPTH + highbd_set_var_fns(cpi); +#endif + + /* vp9_init_quantizer() is first called here. Add check in + * vp9_frame_init_quantizer() so that vp9_init_quantizer is only + * called later when needed. This will avoid unnecessary calls of + * vp9_init_quantizer() for every frame. + */ + vp9_init_quantizer(cpi); + + vp9_loop_filter_init(cm); + + // Set up the unit scaling factor used during motion search. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + + cm->error.setjmp = 0; + +#if CONFIG_RATE_CTRL + encode_command_init(&cpi->encode_command); + if (oxcf->use_simple_encode_api) { + partition_info_init(cpi); + motion_vector_info_init(cpi); + fp_motion_vector_info_init(cpi); + tpl_stats_info_init(cpi); + } +#endif + + return cpi; +} + +#if CONFIG_INTERNAL_STATS +#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) + +#define SNPRINT2(H, T, V) \ + snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) +#endif // CONFIG_INTERNAL_STATS + +void vp9_remove_compressor(VP9_COMP *cpi) { + VP9_COMMON *cm; + unsigned int i; + + if (!cpi) return; + +#if CONFIG_INTERNAL_STATS + vpx_free(cpi->ssim_vars); +#endif + + cm = &cpi->common; + if (cm->current_video_frame > 0) { +#if CONFIG_INTERNAL_STATS + vpx_clear_system_state(); + + if (cpi->oxcf.pass != 1) { + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / + 10000000.000; + double total_encode_time = + (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; + const double dr = + (double)cpi->bytes * (double)8 / (double)1000 / time_encoded; + const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1); + const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (cpi->b_calculate_psnr) { + const double total_psnr = vpx_sse_to_psnr( + (double)cpi->total_samples, peak, (double)cpi->total_sq_error); + const double totalp_psnr = vpx_sse_to_psnr( + (double)cpi->totalp_samples, peak, (double)cpi->totalp_sq_error); + const double total_ssim = + 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); + const double totalp_ssim = + 100 * pow(cpi->summedp_quality / cpi->summedp_weights, 8.0); + + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, + cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, + totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, + cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); + + if (cpi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count); + SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness); + } + + if (cpi->b_calculate_consistency) { + double consistency = + vpx_sse_to_psnr((double)cpi->totalp_samples, peak, + (double)cpi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", cpi->worst_consistency); + } + + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, "\t%7.2f", rate_err); + SNPRINT2(results, "\t%7.2f", fabs(rate_err)); + + fprintf(f, "%s\tAPsnr611\n", headings); + fprintf( + f, "%s\t%7.3f\n", results, + (6 * cpi->psnr.stat[Y] + cpi->psnr.stat[U] + cpi->psnr.stat[V]) / + (cpi->count * 8)); + } + + fclose(f); + } +#endif + +#if 0 + { + printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, + cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, + cpi->time_compress_data / 1000, + (cpi->time_receive_data + cpi->time_compress_data) / 1000); + } +#endif + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + vp9_denoiser_free(&(cpi->denoiser)); +#endif + + if (cpi->kmeans_data_arr_alloc) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&cpi->kmeans_mutex); +#endif + vpx_free(cpi->kmeans_data_arr); + } + + vp9_free_tpl_buffer(cpi); + + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_row_mt_mem_dealloc(cpi); + vp9_encode_free_mt_data(cpi); + +#if !CONFIG_REALTIME_ONLY + vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); +#endif + + dealloc_compressor_data(cpi); + + for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); + ++i) { + vpx_free(cpi->mbgraph_stats[i].mb_stats); + } + + vp9_extrc_delete(&cpi->ext_ratectrl); + + // Help detect use after free of the error detail string. + memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; + + vp9_remove_common(cm); + vp9_free_ref_frame_buffers(cm->buffer_pool); +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif + vpx_free(cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif +#endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif +#ifdef OUTPUT_YUV_REC + fclose(yuv_rec_file); +#endif +#ifdef OUTPUT_YUV_SVC_SRC + fclose(yuv_svc_src[0]); + fclose(yuv_svc_src[1]); + fclose(yuv_svc_src[2]); +#endif + +#if 0 + + if (keyfile) + fclose(keyfile); + + if (framepsnr) + fclose(framepsnr); + + if (kf_list) + fclose(kf_list); + +#endif +} + +int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr) { + if (is_psnr_calc_enabled(cpi)) { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr, + cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); +#else + vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr); +#endif + return 1; + } else { + vp9_zero(*psnr); + return 0; + } +} + +int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) { + if (ref_frame_flags > 7) return -1; + + cpi->ref_frame_flags = ref_frame_flags; + return 0; +} + +void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { + cpi->ext_refresh_golden_frame = (ref_frame_flags & VP9_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & VP9_ALT_FLAG) != 0; + cpi->ext_refresh_last_frame = (ref_frame_flags & VP9_LAST_FLAG) != 0; + cpi->ext_refresh_frame_flags_pending = 1; +} + +static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( + VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; + if (ref_frame_flag == VP9_LAST_FLAG) + ref_frame = LAST_FRAME; + else if (ref_frame_flag == VP9_GOLD_FLAG) + ref_frame = GOLDEN_FRAME; + else if (ref_frame_flag == VP9_ALT_FLAG) + ref_frame = ALTREF_FRAME; + + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); +} + +int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + vpx_yv12_copy_frame(cfg, sd); + return 0; + } else { + return -1; + } +} + +int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + vpx_yv12_copy_frame(sd, cfg); + return 0; + } else { + return -1; + } +} + +int vp9_update_entropy(VP9_COMP *cpi, int update) { + cpi->ext_refresh_frame_context = update; + cpi->ext_refresh_frame_context_pending = 1; + return 0; +} + +#ifdef OUTPUT_YUV_REC +void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { + YV12_BUFFER_CONFIG *s = cm->frame_to_show; + uint8_t *src = s->y_buffer; + int h = cm->height; + +#if CONFIG_VP9_HIGHBITDEPTH + if (s->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); + + do { + fwrite(src16, s->y_width, 2, yuv_rec_file); + src16 += s->y_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->u_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->v_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + do { + fwrite(src, s->y_width, 1, yuv_rec_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); +} +#endif + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { +#endif // CONFIG_VP9_HIGHBITDEPTH + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t + int i; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + const int src_widths[3] = { src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = { src->y_crop_height, src->uv_crop_height, + src->uv_crop_height }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_VP9_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i], + src_strides[i], dsts[i], dst_heights[i], + dst_widths[i], dst_strides[i], bd); + } else { + vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); + } +#else + vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + vpx_extend_frame_borders(dst); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; + int x, y, i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); + } else { + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } + } + } + } + + vpx_extend_frame_borders(dst); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_REALTIME_ONLY +static int scale_down(VP9_COMP *cpi, int q) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int scale = 0; + assert(frame_is_kf_gf_arf(cpi)); + + if (rc->frame_size_selector == UNSCALED && + q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) { + const int max_size_thresh = + (int)(rate_thresh_mult[SCALE_STEP1] * + VPXMAX(rc->this_frame_target, rc->avg_frame_bandwidth)); + scale = rc->projected_frame_size > max_size_thresh ? 1 : 0; + } + return scale; +} + +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } +} + +// test in two pass for the first +static int two_pass_first_group_inter(VP9_COMP *cpi) { + if (cpi->oxcf.pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int gfg_index = gf_group->index; + + if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE; + return gf_group->update_type[gfg_index - 1] != LF_UPDATE && + gf_group->update_type[gfg_index] == LF_UPDATE; + } else { + return 0; + } +} + +// Function to test for conditions that indicate we should loop +// back and recode a frame. +static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, + int maxq, int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || + (two_pass_first_group_inter(cpi) && + (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || + (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { + if (frame_is_kfgfarf && (oxcf->resize_mode == RESIZE_DYNAMIC) && + scale_down(cpi, q)) { + // Code this group at a lower resolution. + cpi->resize_pending = 1; + return 1; + } + + // Force recode for extreme overshoot. + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } + + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_mode == VPX_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} +#endif // !CONFIG_REALTIME_ONLY + +static void update_ref_frames(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->rc.show_arf_as_gld) { + int tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else if (cm->show_existing_frame) { + // Pop ARF. + cpi->lst_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } + + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->new_fb_idx); + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + cm->new_fb_idx); + } else if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // vp9_get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. + int tmp; + + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + cm->new_fb_idx); + + tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else { /* For non key/golden frames */ + if (cpi->refresh_alt_ref_frame) { + int arf_idx = gf_group->top_arf_idx; + + // Push new ARF into stack. + stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx, + gf_group->stack_size); + ++gf_group->stack_size; + + assert(arf_idx < REF_FRAMES); + + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); + memcpy(cpi->interp_filter_selected[ALTREF_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + + cpi->alt_fb_idx = arf_idx; + } + + if (cpi->refresh_golden_frame) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + else + memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[ALTREF_FRAME], + sizeof(cpi->interp_filter_selected[ALTREF_FRAME])); + } + } + + if (cpi->refresh_last_frame) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + update_ref_frames(cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING + vp9_denoiser_update_ref_frame(cpi); +#endif + + if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi); +} + +static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + struct loopfilter *lf = &cm->lf; + int is_reference_frame = + (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (cpi->use_svc && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + is_reference_frame = !cpi->svc.non_reference_frame; + + // Skip loop filter in show_existing_frame mode. + if (cm->show_existing_frame) { + lf->filter_level = 0; + return; + } + + if (cpi->loopfilter_ctrl == NO_LOOPFILTER || + (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) { + lf->filter_level = 0; + vpx_extend_frame_inner_borders(cm->frame_to_show); + return; + } + + if (xd->lossless) { + lf->filter_level = 0; + lf->last_filt_level = 0; + } else { + struct vpx_usec_timer timer; + + vpx_clear_system_state(); + + vpx_usec_timer_start(&timer); + + if (!cpi->rc.is_src_frame_alt_ref) { + if ((cpi->common.frame_type == KEY_FRAME) && + (!cpi->rc.this_key_frame_forced)) { + lf->last_filt_level = 0; + } + vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick); + lf->last_filt_level = lf->filter_level; + } else { + lf->filter_level = 0; + } + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + + if (lf->filter_level > 0 && is_reference_frame) { + vp9_build_mask_frame(cm, lf->filter_level, 0); + + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, cpi->workers, + cpi->num_workers, &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + } + + vpx_extend_frame_inner_borders(cm->frame_to_show); +} + +void vp9_scale_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + const VP9_REFFRAME ref_mask[3] = { VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_buffer(cpi, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + continue; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + RefCntBuffer *new_fb_ptr = NULL; + int force_scaling = 0; + int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; + if (new_fb == INVALID_IDX) { + new_fb = get_free_fb(cm); + force_scaling = 1; + } + if (new_fb == INVALID_IDX) return; + new_fb_ptr = &pool->frame_bufs[new_fb]; + if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || + new_fb_ptr->buf.y_crop_height != cm->height) { + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } +#else + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + RefCntBuffer *new_fb_ptr = NULL; + int force_scaling = 0; + int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; + if (new_fb == INVALID_IDX) { + new_fb = get_free_fb(cm); + force_scaling = 1; + } + if (new_fb == INVALID_IDX) return; + new_fb_ptr = &pool->frame_bufs[new_fb]; + if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || + new_fb_ptr->buf.y_crop_height != cm->height) { + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } else { + int buf_idx; + RefCntBuffer *buf = NULL; + if (cpi->oxcf.pass == 0 && !cpi->use_svc) { + // Check for release of scaled reference. + buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; + --buf->ref_count; + cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + } + } + buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + buf = &pool->frame_bufs[buf_idx]; + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_idx[ref_frame - 1] = buf_idx; + ++buf->ref_count; + } + } else { + if (cpi->oxcf.pass != 0 || cpi->use_svc) + cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + } + } +} + +static void release_scaled_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + if (cpi->oxcf.pass == 0 && !cpi->use_svc) { + // Only release scaled references under certain conditions: + // if reference will be updated, or if scaled reference has same resolution. + int refresh[3]; + refresh[0] = (cpi->refresh_last_frame) ? 1 : 0; + refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0; + refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const int idx = cpi->scaled_ref_idx[i - 1]; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } + } + } + } else { + for (i = 0; i < REFS_PER_FRAME; ++i) { + const int idx = cpi->scaled_ref_idx[i]; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + --buf->ref_count; + cpi->scaled_ref_idx[i] = INVALID_IDX; + } + } + } +} + +static void full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN]; +} + +static void full_to_model_counts(vp9_coeff_count_model *model_count, + vp9_coeff_count *full_count) { + int i, j, k, l; + + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); +} + +#if 0 && CONFIG_INTERNAL_STATS +static void output_frame_level_debug_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); + int64_t recon_err; + + vpx_clear_system_state(); + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + recon_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } else { + recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } +#else + recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + + if (cpi->twopass.total_left_stats.coded_error != 0.0) { + double dc_quant_devisor; +#if CONFIG_VP9_HIGHBITDEPTH + switch (cm->bit_depth) { + case VPX_BITS_8: + dc_quant_devisor = 4.0; + break; + case VPX_BITS_10: + dc_quant_devisor = 16.0; + break; + default: + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; + break; + } +#else + dc_quant_devisor = 4.0; +#endif + + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n", + cpi->common.current_video_frame, + cm->width, cm->height, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, + cpi->rc.source_alt_ref_pending, + cpi->rc.source_alt_ref_active, + cpi->rc.this_frame_target, + cpi->rc.projected_frame_size, + cpi->rc.projected_frame_size / cpi->common.MBs, + (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), + cpi->rc.vbr_bits_off_target, + cpi->rc.vbr_bits_off_target_fast, + cpi->twopass.extend_minq, + cpi->twopass.extend_minq_fast, + cpi->rc.total_target_vs_actual, + (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), + cpi->rc.total_actual_bits, cm->base_qindex, + vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth), + (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / + dc_quant_devisor, + vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality, + cm->bit_depth), + cpi->rc.avg_q, + vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth), + cpi->refresh_last_frame, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, + cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + cpi->twopass.bits_left / + (1 + cpi->twopass.total_left_stats.coded_error), + cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, + cpi->twopass.kf_zeromotion_pct, + cpi->twopass.fr_content_type, + cm->lf.filter_level, + cm->seg.aq_av_offset); + } + fclose(f); + + if (0) { + FILE *const fmodes = fopen("Modes.stt", "a"); + int i; + + fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); + + for (i = 0; i < MAX_MODES; ++i) + fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); + + fprintf(fmodes, "\n"); + + fclose(fmodes); + } +} +#endif + +static void set_mv_search_params(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const unsigned int max_mv_def = VPXMIN(cm->width, cm->height); + + // Default based on max resolution. + cpi->mv_step_param = vp9_init_search_range(max_mv_def); + + if (cpi->sf.mv.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + cpi->mv_step_param = vp9_init_search_range( + VPXMIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + } + cpi->max_mv_magnitude = 0; + } + } +} + +static void set_size_independent_vars(VP9_COMP *cpi) { + vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + vp9_set_rd_speed_thresholds(cpi); + vp9_set_rd_speed_thresholds_sub8x8(cpi); + cpi->common.interp_filter = cpi->sf.default_interp_filter; +} + +static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, + int *top_index) { + VP9_COMMON *const cm = &cpi->common; + + // Setup variables that depend on the dimensions of the frame. + vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); + + // Decide q and q bounds. + *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) { + *q = cpi->rc.worst_quality; + cpi->rc.force_max_q = 0; + } + + if (cpi->use_svc) { + cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q; + } + + if (!frame_is_intra_only(cm)) { + vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); + } + +#if !CONFIG_REALTIME_ONLY + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) + configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) + if (cpi->oxcf.noise_sensitivity > 0) { + int l = 0; + switch (cpi->oxcf.noise_sensitivity) { + case 1: l = 20; break; + case 2: l = 40; break; + case 3: l = 60; break; + case 4: + case 5: l = 100; break; + case 6: l = 150; break; + } + if (!cpi->common.postproc_state.limits) { + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits))); + } + vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l, + cpi->common.postproc_state.limits); + } +#endif // CONFIG_VP9_POSTPROC +} + +static void init_motion_estimation(VP9_COMP *cpi) { + int y_stride = cpi->scaled_source.y_stride; + + if (cpi->sf.mv.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride); + } else if (cpi->sf.mv.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + } +} + +static void set_frame_size(VP9_COMP *cpi) { + int ref_frame; + VP9_COMMON *const cm = &cpi->common; + VP9EncoderConfig *const oxcf = &cpi->oxcf; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && + ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || + (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { + calculate_coded_size(cpi, &oxcf->scaled_frame_width, + &oxcf->scaled_frame_height); + + // There has been a change in frame size. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } +#endif // !CONFIG_REALTIME_ONLY + + if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && + oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { + // For SVC scaled width/height will have been set (svc->resize_set=1) + // in get_svc_params based on the layer width/height. + if (!cpi->use_svc || !cpi->svc.resize_set) { + oxcf->scaled_frame_width = + (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den; + // There has been a change in frame size. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + + vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); +#if CONFIG_VP9_TEMPORAL_DENOISING + // Reset the denoiser on the resized frame. + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_free(&(cpi->denoiser)); + setup_denoiser_buffer(cpi); + // Dynamic resize is only triggered for non-SVC, so we can force + // golden frame update here as temporary fix to denoiser. + cpi->refresh_golden_frame = 1; + } +#endif + } + + if ((oxcf->pass == 2) && !cpi->use_svc) { + vp9_set_target_rate(cpi); + } + + alloc_frame_mvs(cm, cm->new_fb_idx); + + // Reset the frame pointers to the current frame size. + if (vpx_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + alloc_util_frame_buffers(cpi); + init_motion_estimation(cpi); + + int has_valid_ref_frame = 0; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + + ref_buf->idx = buf_idx; + + if (buf_idx != INVALID_IDX) { + YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; + ref_buf->buf = buf; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, + cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0); +#else + vp9_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width, + buf->y_crop_height, cm->width, + cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); + if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); + } else { + ref_buf->buf = NULL; + } + } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); +} + +static void save_encode_params(VP9_COMP *cpi) { + int tile_idx; + int i, j; + TileDataEnc *tile_data; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); + } +} + +static INLINE void set_raw_source_frame(VP9_COMP *cpi) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif +} + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int q = 0, bottom_index = 0, top_index = 0; + int no_drop_scene_change = 0; + const INTERP_FILTER filter_scaler = + (is_one_pass_svc(cpi)) + ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP; + const int phase_scaler = + (is_one_pass_svc(cpi)) + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; + + if (cm->show_existing_frame) { + cpi->rc.this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return 1; + } + + svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe; + + // Flag to check if its valid to compute the source sad (used for + // scene detection and for superblock content state in CBR mode). + // The flag may get reset below based on SVC or resizing state. + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; + + vpx_clear_system_state(); + + set_frame_size(cpi); + + if (is_one_pass_svc(cpi) && + cpi->un_scaled_source->y_width == cm->width << 2 && + cpi->un_scaled_source->y_height == cm->height << 2 && + svc->scaled_temp.y_width == cm->width << 1 && + svc->scaled_temp.y_height == cm->height << 1) { + // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take + // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 + // result will be saved in scaled_temp and might be used later. + const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; + const int phase_scaler2 = svc->downsample_filter_phase[1]; + cpi->Source = vp9_svc_twostage_scale( + cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); + svc->scaled_one_half = 1; + } else if (is_one_pass_svc(cpi) && + cpi->un_scaled_source->y_width == cm->width << 1 && + cpi->un_scaled_source->y_height == cm->height << 1 && + svc->scaled_one_half) { + // If the spatial layer is 1/2x1/2 and the scaling is already done in the + // two-stage scaling, use the result directly. + cpi->Source = &svc->scaled_temp; + svc->scaled_one_half = 0; + } else { + cpi->Source = vp9_scale_if_required( + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); + } +#ifdef OUTPUT_YUV_SVC_SRC + // Write out at most 3 spatial layers. + if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) { + vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); + } +#endif + // Unfiltered raw source used in metrics calculation if the source + // has been filtered. + if (is_psnr_calc_enabled(cpi)) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif + } + + if ((cpi->use_svc && + (svc->spatial_layer_id < svc->number_spatial_layers - 1 || + svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->current_superframe < 1)) || + cpi->resize_pending || cpi->resize_state || cpi->external_resize || + cpi->resize_state != ORIG) { + cpi->compute_source_sad_onepass = 0; + if (cpi->content_state_sb_fd != NULL) + memset(cpi->content_state_sb_fd, 0, + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * + sizeof(*cpi->content_state_sb_fd)); + } + + // Avoid scaling last_source unless its needed. + // Last source is needed if avg_source_sad() is used, or if + // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise + // estimation is enabled. + if (cpi->unscaled_last_source != NULL && + (cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) || + cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || + (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || + cpi->compute_source_sad_onepass)) + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); + + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + cpi->compute_source_sad_onepass = 0; + + if (frame_is_intra_only(cm) || cpi->resize_pending != 0) { + memset(cpi->consec_zero_mv, 0, + cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + vp9_denoiser_reset_on_first_frame(cpi); +#endif + + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now + // (need to check encoding time cost for doing this for speed 8). + cpi->rc.high_source_sad = 0; + cpi->rc.hybrid_intra_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 0; + if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && + (cpi->oxcf.rc_mode == VPX_VBR || + cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) + vp9_scene_detection_onepass(cpi); + + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->high_source_sad_superframe = cpi->rc.high_source_sad; + svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion; + // On scene change reset temporal layer pattern to TL0. + // Note that if the base/lower spatial layers are skipped: instead of + // inserting base layer here, we force max-q for the next superframe + // with lower spatial layers: this is done in vp9_encodedframe_overshoot() + // when max-q is decided for the current layer. + // Only do this reset for bypass/flexible mode. + if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 && + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // rc->high_source_sad will get reset so copy it to restore it. + int tmp_high_source_sad = cpi->rc.high_source_sad; + vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME); + cpi->rc.high_source_sad = tmp_high_source_sad; + } + } + + vp9_update_noise_estimate(cpi); + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, if base layer is key for svc, + // on scene change, or if superframe has layer sync. + if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) && + !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0])) + no_drop_scene_change = 1; + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + !frame_is_intra_only(cm) && !no_drop_scene_change && + !svc->superframe_has_layer_sync && + (!cpi->use_svc || + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + + // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame + // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can + // avoid this frame-level upsampling (for non intra_only frames). + // For SVC single_layer mode, dynamic resize is allowed and we need to + // scale references for this case. + if (frame_is_intra_only(cm) == 0 && + ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || + !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) { + vp9_scale_references(cpi); + } + + set_size_independent_vars(cpi); + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + + // search method and step parameter might be changed in speed settings. + init_motion_estimation(cpi); + + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + + if (cpi->sf.svc_use_lowres_part && + svc->spatial_layer_id == svc->number_spatial_layers - 2) { + if (svc->prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + &cm->error, svc->prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*svc->prev_partition_svc))); + } + } + + // TODO(jianj): Look into issue of skin detection with high bitdepth. + if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + cpi->use_skin_detection = 1; + } + + // Enable post encode frame dropping for CBR on non key frame, when + // ext_use_post_encode_drop is specified by user. + cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop && + cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME; + + vp9_set_quantizer(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + + setup_frame(cpi); + + suppress_active_map(cpi); + + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } + + if (cpi->rc.last_post_encode_dropped_scene_change) { + cpi->rc.high_source_sad = 1; + svc->high_source_sad_superframe = 1; + // For now disable use_source_sad since Last_Source will not be the previous + // encoded but the dropped one. + cpi->sf.use_source_sad = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + } + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + if (vp9_encodedframe_overshoot(cpi, -1, &q)) { + vp9_set_quantizer(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + } + } + +#if !CONFIG_REALTIME_ONLY + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + vp9_360aq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + vp9_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + // it may be pretty bad for rate-control, + // and I should handle it somehow + vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else { +#endif + // If ROI is enabled and skip feature is used for segmentation, apply cyclic + // refresh but not apply ROI for skip for the first 20 frames (defined by + // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality. + if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_setup(cpi); + if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY) + apply_roi_map(cpi); + } else { + apply_roi_map(cpi); + } + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_setup(cpi); + } + +#if !CONFIG_REALTIME_ONLY + } +#endif + + apply_active_map(cpi); + + vp9_encode_frame(cpi); + + // Check if we should re-encode this frame at high Q because of high + // overshoot based on the encoded frame size. Only for frames where + // high temporal-source SAD is detected. + // For SVC: all spatial layers are checked for re-encoding. + if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + int frame_size = 0; + // Get an estimate of the encoded frame size. + save_coding_context(cpi); + vp9_pack_bitstream(cpi, dest, size); + restore_coding_context(cpi); + frame_size = (int)(*size) << 3; + // Check if encoded frame will overshoot too much, and if so, set the q and + // adjust some rate control parameters, and return to re-encode the frame. + if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) { + vpx_clear_system_state(); + vp9_set_quantizer(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + suppress_active_map(cpi); + // Turn-off cyclic refresh for re-encoded frame. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->segmentation_map; + memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + vp9_disable_segmentation(&cm->seg); + } + apply_active_map(cpi); + vp9_encode_frame(cpi); + } + } + + // Update some stats from cyclic refresh, and check for golden frame update. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode) + vp9_cyclic_refresh_postencode(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + vpx_clear_system_state(); + return 1; +} + +static int get_ref_frame_flags(const VP9_COMP *cpi) { + const int *const map = cpi->common.ref_frame_map; + const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; + const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + if (gold_is_last) flags &= ~VP9_GOLD_FLAG; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + (cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers == 1)) + flags &= ~VP9_GOLD_FLAG; + + if (alt_is_last) flags &= ~VP9_ALT_FLAG; + + if (gold_is_alt) flags &= ~VP9_ALT_FLAG; + + return flags; +} + +#if !CONFIG_REALTIME_ONLY +#define MAX_QSTEP_ADJ 4 +static int get_qstep_adj(int rate_excess, int rate_limit) { + int qstep = + rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX; + return VPXMIN(qstep, MAX_QSTEP_ADJ); +} + +#if CONFIG_RATE_CTRL +static void init_rq_history(RATE_QINDEX_HISTORY *rq_history) { + rq_history->recode_count = 0; + rq_history->q_index_high = 255; + rq_history->q_index_low = 0; +} + +static void update_rq_history(RATE_QINDEX_HISTORY *rq_history, int target_bits, + int actual_bits, int q_index) { + rq_history->q_index_history[rq_history->recode_count] = q_index; + rq_history->rate_history[rq_history->recode_count] = actual_bits; + if (actual_bits <= target_bits) { + rq_history->q_index_high = q_index; + } + if (actual_bits >= target_bits) { + rq_history->q_index_low = q_index; + } + rq_history->recode_count += 1; +} + +static int guess_q_index_from_model(const RATE_QSTEP_MODEL *rq_model, + int target_bits) { + // The model predicts bits as follows. + // target_bits = bias - ratio * log2(q_step) + // Given the target_bits, we compute the q_step as follows. + double q_step; + assert(rq_model->ratio > 0); + q_step = pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio); + // TODO(angiebird): Make this function support highbitdepth. + return vp9_convert_q_to_qindex(q_step, VPX_BITS_8); +} + +static int guess_q_index_linear(int prev_q_index, int target_bits, + int actual_bits, int gap) { + int q_index = prev_q_index; + if (actual_bits < target_bits) { + q_index -= gap; + q_index = VPXMAX(q_index, 0); + } else { + q_index += gap; + q_index = VPXMIN(q_index, 255); + } + return q_index; +} + +static double get_bits_percent_diff(int target_bits, int actual_bits) { + double diff; + target_bits = VPXMAX(target_bits, 1); + diff = abs(target_bits - actual_bits) * 1. / target_bits; + return diff * 100; +} + +static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model, + const RATE_QINDEX_HISTORY *rq_history, + int target_bits) { + int q_index = 128; + if (rq_history->recode_count > 0) { + const int actual_bits = + rq_history->rate_history[rq_history->recode_count - 1]; + const int prev_q_index = + rq_history->q_index_history[rq_history->recode_count - 1]; + const double percent_diff = get_bits_percent_diff(target_bits, actual_bits); + if (percent_diff > 50) { + // Binary search. + // When the actual_bits and target_bits are far apart, binary search + // q_index is faster. + q_index = (rq_history->q_index_low + rq_history->q_index_high) / 2; + } else { + if (rq_model->ready) { + q_index = guess_q_index_from_model(rq_model, target_bits); + } else { + // TODO(angiebird): Find a better way to set the gap. + q_index = + guess_q_index_linear(prev_q_index, target_bits, actual_bits, 20); + } + } + } else { + if (rq_model->ready) { + q_index = guess_q_index_from_model(rq_model, target_bits); + } + } + + assert(rq_history->q_index_low <= rq_history->q_index_high); + if (q_index <= rq_history->q_index_low) { + q_index = rq_history->q_index_low + 1; + } + if (q_index >= rq_history->q_index_high) { + q_index = rq_history->q_index_high - 1; + } + return q_index; +} + +static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history, + int target_bits, RATE_QSTEP_MODEL *rq_model) { + const int recode_count = rq_history->recode_count; + const double delta = 0.00001; + if (recode_count >= 2) { + const int q_index1 = rq_history->q_index_history[recode_count - 2]; + const int q_index2 = rq_history->q_index_history[recode_count - 1]; + const int r1 = rq_history->rate_history[recode_count - 2]; + const int r2 = rq_history->rate_history[recode_count - 1]; + int valid = 0; + // lower q_index should yield higher bit rate + if (q_index1 < q_index2) { + valid = r1 > r2; + } else if (q_index1 > q_index2) { + valid = r1 < r2; + } + // Only update the model when the q_index and rate behave normally. + if (valid) { + // Fit the ratio and bias of rq_model based on last two recode histories. + const double s1 = vp9_convert_qindex_to_q(q_index1, VPX_BITS_8); + const double s2 = vp9_convert_qindex_to_q(q_index2, VPX_BITS_8); + if (fabs(log2(s1) - log2(s2)) > delta) { + rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2)); + rq_model->bias = r1 + (rq_model->ratio) * log2(s1); + if (rq_model->ratio > delta && rq_model->bias > delta) { + rq_model->ready = 1; + } + } + } + } else if (recode_count == 1) { + if (rq_model->ready) { + // Update the ratio only when the initial model exists and we only have + // one recode history. + const int prev_q = rq_history->q_index_history[recode_count - 1]; + const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8); + if (fabs(log2(prev_q_step)) > delta) { + const int actual_bits = rq_history->rate_history[recode_count - 1]; + rq_model->ratio = + rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step); + } + } + } +} +#endif // CONFIG_RATE_CTRL + +static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest +#if CONFIG_RATE_CTRL + , + RATE_QINDEX_HISTORY *rq_history +#endif // CONFIG_RATE_CTRL +) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int bottom_index, top_index; + int loop_count = 0; + int loop_at_this_size = 0; + int loop = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + int q = 0, q_low = 0, q_high = 0; + int enable_acl; +#ifdef AGGRESSIVE_VBR + int qrange_adj = 1; +#endif + + // A flag which indicates whether we are recoding the current frame + // when the current frame size is larger than the max frame size in the + // external rate control model. + // This flag doesn't have any impact when external rate control is not used. + int ext_rc_recode = 0; + // Maximal frame size allowed by the external rate control. + // case: 0, we ignore the max frame size limit, and encode with the qindex + // passed in by the external rate control model. + // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex + // and may recode if undershoot/overshoot is seen. + // If the external qindex is not VPX_DEFAULT_Q, we force no recode. + // case: -1, we take libvpx's decision for the max frame size, as well as + // the recode decision. + // Otherwise: if a specific size is given, libvpx's recode decision + // will respect the given size. + int ext_rc_max_frame_size = 0; + // Use VP9's decision of qindex. This flag is in use only in external rate + // control model to help determine whether to recode when + // |ext_rc_max_frame_size| is 0. + int ext_rc_use_default_q = 1; + const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; + +#if CONFIG_RATE_CTRL + RATE_QSTEP_MODEL *rq_model; + { + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + rq_model = &cpi->rq_model[frame_type]; + } + init_rq_history(rq_history); +#endif // CONFIG_RATE_CTRL + + if (cm->show_existing_frame) { + rc->this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return; + } + + set_size_independent_vars(cpi); + + enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) || + (cpi->twopass.gf_group.index == 1) + : 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif + do { + vpx_clear_system_state(); + + set_frame_size(cpi); + + if (loop_count == 0 || cpi->resize_pending != 0) { + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + // Adjustment limits for min and max q + qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); + + bottom_index = + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); + } +#endif + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + + // Reset the loop state for new frame size. + overshoot_seen = 0; + undershoot_seen = 0; + + // Reconfiguration for change in frame size has concluded. + cpi->resize_pending = 0; + + q_low = bottom_index; + q_high = top_index; + + loop_at_this_size = 0; + } + + // Decide frame size bounds first time through. + if (loop_count == 0) { + vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + } + + cpi->Source = + vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + + // Unfiltered raw source used in metrics calculation if the source + // has been filtered. + if (is_psnr_calc_enabled(cpi)) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif + } + + if (cpi->unscaled_last_source != NULL) + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source, + (oxcf->pass == 0), EIGHTTAP, 0); + + if (frame_is_intra_only(cm) == 0) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + vp9_scale_references(cpi); + } + +#if CONFIG_RATE_CTRL + // TODO(angiebird): This is a hack for making sure the encoder use the + // external_quantize_index exactly. Avoid this kind of hack later. + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + q = rq_model_predict_q_index(rq_model, rq_history, + rc->this_frame_target); + } + if (cpi->encode_command.use_external_quantize_index) { + q = cpi->encode_command.external_quantize_index; + } + } +#endif // CONFIG_RATE_CTRL + if (cpi->ext_ratectrl.ready && !ext_rc_recode && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + // If the external model recommends a reserved value, we use + // libvpx's default q. + if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { + q = encode_frame_decision.q_index; + ext_rc_use_default_q = 0; + } + ext_rc_max_frame_size = encode_frame_decision.max_frame_size; + } + + vp9_set_quantizer(cpi, q); + + if (loop_count == 0) setup_frame(cpi); + + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (oxcf->aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (oxcf->aq_mode == EQUATOR360_AQ) { + vp9_360aq_frame_setup(cpi); + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { + vp9_setup_in_frame_q_adj(cpi); + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { + vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (oxcf->aq_mode == PSNR_AQ) { + vp9_psnr_aq_mode_setup(&cm->seg); + } + + vp9_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + + vpx_clear_system_state(); + + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { + save_coding_context(cpi); + if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); + + rc->projected_frame_size = (int)(*size) << 3; + + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + } + + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + // In general, for the external rate control, we take the qindex provided + // as input and encode the frame with this qindex faithfully. However, + // in some extreme scenarios, the provided qindex leads to a massive + // overshoot of frame size. In this case, we fall back to VP9's decision + // to pick a new qindex and recode the frame. We return the new qindex + // through the API to the external model. + if (ext_rc_max_frame_size == 0) { + if (!ext_rc_use_default_q) break; + } else if (ext_rc_max_frame_size == -1) { + // Do nothing, fall back to libvpx's recode decision. + } else { + // Change the max frame size, used in libvpx's recode decision. + rc->max_frame_bandwidth = ext_rc_max_frame_size; + } + ext_rc_recode = 1; + } +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + // This part needs to be after save_coding_context() because + // restore_coding_context will be called in the end of this function. + // TODO(angiebird): This is a hack for making sure the encoder use the + // external_quantize_index exactly. Avoid this kind of hack later. + if (cpi->encode_command.use_external_quantize_index) { + break; + } + + if (cpi->encode_command.use_external_target_frame_bits) { + const double percent_diff = get_bits_percent_diff( + rc->this_frame_target, rc->projected_frame_size); + update_rq_history(rq_history, rc->this_frame_target, + rc->projected_frame_size, q); + loop_count += 1; + + rq_model_update(rq_history, rc->this_frame_target, rq_model); + + // Check if we hit the target bitrate. + if (percent_diff <= + cpi->encode_command.target_frame_bits_error_percent || + rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM || + rq_history->q_index_low >= rq_history->q_index_high) { + break; + } + + loop = 1; + restore_coding_context(cpi); + continue; + } + } +#endif // CONFIG_RATE_CTRL + + if (oxcf->rc_mode == VPX_Q) { + loop = 0; + } else { + if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = q; + int64_t kf_err; + + int64_t high_err_target = cpi->ambient_err; + int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } else { + kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } +#else + kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + q_high = q > q_low ? q - 1 : q_low; + + // Adjust Q + q = (int)((q * high_err_target) / kf_err); + q = VPXMIN(q, (q_high + q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + q_low = q < q_high ? q + 1 : q_high; + + // Adjust Q + q = (int)((q * low_err_target) / kf_err); + q = VPXMIN(q, (q_high + q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else if (recode_loop_test(cpi, frame_over_shoot_limit, + frame_under_shoot_limit, q, + VPXMAX(q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = q; + int retries = 0; + int qstep; + + if (cpi->resize_pending == 1) { + // Change in frame size so go back around the recode loop. + cpi->rc.frame_size_selector = + SCALE_STEP1 - cpi->rc.frame_size_selector; + cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + ++loop_count; + loop = 1; + continue; + } + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (!rc->is_src_frame_alt_ref && + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi))))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); + double q_val_high; + q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); + q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); + q_high = clamp(q_high, rc->best_quality, rc->worst_quality); + } + + // Raise Qlow as to at least the current value + qstep = + get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); + q_low = VPXMIN(q + qstep, q_high); + + if (undershoot_seen || loop_at_this_size > 1) { + // Update rate_correction_factor unless + vp9_rc_update_rate_correction_factors(cpi); + + q = (q_high + q_low + 1) / 2; + } else { + // Update rate_correction_factor unless + vp9_rc_update_rate_correction_factors(cpi); + + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + VPXMAX(q_high, top_index)); + + while (q < q_low && retries < 10) { + vp9_rc_update_rate_correction_factors(cpi); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + VPXMAX(q_high, top_index)); + retries++; + } + } + + overshoot_seen = 1; + } else { + // Frame is too small + qstep = + get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); + q_high = VPXMAX(q - qstep, q_low); + + if (overshoot_seen || loop_at_this_size > 1) { + vp9_rc_update_rate_correction_factors(cpi); + q = (q_high + q_low) / 2; + } else { + vp9_rc_update_rate_correction_factors(cpi); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passed in value. + if (oxcf->rc_mode == VPX_CQ && q < q_low) { + q_low = q; + } + + while (q > q_high && retries < 10) { + vp9_rc_update_rate_correction_factors(cpi); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); + retries++; + } + } + undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = (q != last_q); + } else { + loop = 0; + } + } + + // Special case for overlay frame. + if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) + loop = 0; + + if (loop) { + ++loop_count; + ++loop_at_this_size; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + } + + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) + if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif + } while (loop); + + rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; + +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + cpi->twopass.active_worst_quality = + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { +#endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } + + if (enable_acl) { + // Skip recoding, if model diff is below threshold + const int thresh = compute_context_model_thresh(cpi); + const int diff = compute_context_model_diff(cm); + if (diff >= thresh) { + vp9_encode_frame(cpi); + } + } + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { + vpx_clear_system_state(); + restore_coding_context(cpi); + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void set_ext_overrides(VP9_COMP *cpi) { + // Overrides the defaults with the externally supplied values with + // vp9_update_reference() and vp9_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to encode_frame_to_data_rate() function + if (cpi->ext_refresh_frame_context_pending) { + cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; + cpi->ext_refresh_frame_context_pending = 0; + } + if (cpi->ext_refresh_frame_flags_pending) { + cpi->refresh_last_frame = cpi->ext_refresh_last_frame; + cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; + cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + } +} + +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } +#else + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); +#endif // CONFIG_VP9_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) + if (cm->bit_depth == VPX_BITS_8) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + else + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); +#else + if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); +#endif // CONFIG_VP9_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + +static void set_ref_sign_bias(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + const int cur_frame_index = ref_buffer->frame_index; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + const RefCntBuffer *const ref_cnt_buf = + get_ref_cnt_buffer(&cpi->common, buf_idx); + if (ref_cnt_buf) { + cm->ref_frame_sign_bias[ref_frame] = + cur_frame_index < ref_cnt_buf->frame_index; + } + } +} + +static int setup_interp_filter_search_mask(VP9_COMP *cpi) { + INTERP_FILTER ifilter; + int ref_total[MAX_REF_FRAMES] = { 0 }; + MV_REFERENCE_FRAME ref; + int mask = 0; + if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) + return mask; + for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; + + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) { + if ((ref_total[LAST_FRAME] && + cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && + (ref_total[GOLDEN_FRAME] == 0 || + cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < + ref_total[GOLDEN_FRAME]) && + (ref_total[ALTREF_FRAME] == 0 || + cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50 < + ref_total[ALTREF_FRAME])) + mask |= 1 << ifilter; + } + return mask; +} + +#ifdef ENABLE_KF_DENOISE +// Baseline kernel weights for denoise +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, + 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; + +static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, + uint8_t point_weight, int *sum_val, + int *sum_weight) { + if (abs(centre_val - data_val) <= thresh) { + *sum_weight += point_weight; + *sum_val += (int)data_val * (int)point_weight; + } +} + +static void spatial_denoise_point(uint8_t *src_ptr, const int stride, + const int strength) { + int sum_weight = 0; + int sum_val = 0; + int thresh = strength; + int kernel_size = 5; + int half_k_size = 2; + int i, j; + int max_diff = 0; + uint8_t *tmp_ptr; + uint8_t *kernel_ptr; + + // Find the maximum deviation from the source point in the locale. + tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { + max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); + } + tmp_ptr += stride; + } + + // Select the kernel size. + if (max_diff > (strength + (strength >> 1))) { + kernel_size = 3; + half_k_size = 1; + thresh = thresh >> 1; + } + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; + + // Apply the kernel + tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, + &sum_val, &sum_weight); + ++kernel_ptr; + } + tmp_ptr += stride; + } + + // Update the source value with the new filtered value + *src_ptr = (uint8_t)((sum_val + (sum_weight >> 1)) / sum_weight); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, + const int strength) { + int sum_weight = 0; + int sum_val = 0; + int thresh = strength; + int kernel_size = 5; + int half_k_size = 2; + int i, j; + int max_diff = 0; + uint16_t *tmp_ptr; + uint8_t *kernel_ptr; + + // Find the maximum deviation from the source point in the locale. + tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { + max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); + } + tmp_ptr += stride; + } + + // Select the kernel size. + if (max_diff > (strength + (strength >> 1))) { + kernel_size = 3; + half_k_size = 1; + thresh = thresh >> 1; + } + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; + + // Apply the kernel + tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, + &sum_val, &sum_weight); + ++kernel_ptr; + } + tmp_ptr += stride; + } + + // Update the source value with the new filtered value + *src_ptr = (uint16_t)((sum_val + (sum_weight >> 1)) / sum_weight); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// Apply thresholded spatial noise suppression to a given buffer. +static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer, + const int stride, const int width, + const int height, const int strength) { + VP9_COMMON *const cm = &cpi->common; + uint8_t *src_ptr = buffer; + int row; + int col; + + for (row = 0; row < height; ++row) { + for (col = 0; col < width; ++col) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + highbd_spatial_denoise_point(CONVERT_TO_SHORTPTR(&src_ptr[col]), stride, + strength); + else + spatial_denoise_point(&src_ptr[col], stride, strength); +#else + spatial_denoise_point(&src_ptr[col], stride, strength); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + src_ptr += stride; + } +} + +// Apply thresholded spatial noise suppression to source. +static void spatial_denoise_frame(VP9_COMP *cpi) { + YV12_BUFFER_CONFIG *src = cpi->Source; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + VP9_COMMON *const cm = &cpi->common; + + // Base the filter strength on the current active max Q. + const int q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, + cm->bit_depth)); + int strength = + VPXMAX(oxcf->arnr_strength >> 2, VPXMIN(oxcf->arnr_strength, (q >> 4))); + + // Denoise each of Y,U and V buffers. + spatial_denoise_buffer(cpi, src->y_buffer, src->y_stride, src->y_width, + src->y_height, strength); + + strength += (strength >> 1); + spatial_denoise_buffer(cpi, src->u_buffer, src->uv_stride, src->uv_width, + src->uv_height, strength << 1); + + spatial_denoise_buffer(cpi, src->v_buffer, src->uv_stride, src->uv_width, + src->uv_height, strength << 1); +} +#endif // ENABLE_KF_DENOISE + +#if !CONFIG_REALTIME_ONLY +static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { + if (cpi->common.seg.enabled) + if (ALT_REF_AQ_PROTECT_GAIN) { + size_t nsize = *size; + int overhead; + + // TODO(yuryg): optimize this, as + // we don't really need to repack + + save_coding_context(cpi); + vp9_disable_segmentation(&cpi->common.seg); + vp9_pack_bitstream(cpi, dest, &nsize); + restore_coding_context(cpi); + + overhead = (int)*size - (int)nsize; + + if (vp9_alt_ref_aq_disable_if(cpi->alt_ref_aq, overhead, (int)*size)) + vp9_encode_frame(cpi); + else + vp9_enable_segmentation(&cpi->common.seg); + } +} +#endif + +static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + + if (ref_buffer) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + ref_buffer->frame_index = + cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; + ref_buffer->frame_coding_index = cm->current_frame_coding_index; + } +} + +static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->Source->y_buffer; + const int y_stride = cpi->Source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size]; + const int num_8x8_h = num_8x8_blocks_high_lookup[block_size]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + double log_sum = 0.0; + int row, col; + + // Loop through each 64x64 block. + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + int mi_row, mi_col; + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + for (mi_row = row * num_8x8_h; + mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) { + for (mi_col = col * num_8x8_w; + mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) { + struct buf_2d buf; + const int row_offset_y = mi_row << 3; + const int col_offset_y = mi_col << 3; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + // In order to make SSIM_VAR_SCALE in a same scale for both 8 bit + // and high bit videos, the variance needs to be divided by 2.0 or + // 64.0 separately. + // TODO(sdeng): need to tune for 12bit videos. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + var += vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd); + else +#endif + var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8); + + num_of_var += 1.0; + } + } + var = var / num_of_var / 64.0; + + // Curve fitting with an exponential model on all 16x16 blocks from the + // Midres dataset. + var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; + cpi->mi_ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum; + } + } + + (void)xd; +} + +// Process the wiener variance in 16x16 block basis. +static int qsort_comp(const void *elem1, const void *elem2) { + int a = *((const int *)elem1); + int b = *((const int *)elem2); + if (a > b) return 1; + if (a < b) return -1; + return 0; +} + +static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows && + cpi->mb_wiener_var_cols >= cm->mb_cols) + return; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + CHECK_MEM_ERROR( + &cm->error, cpi->mb_wiener_variance, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); + cpi->mb_wiener_var_rows = cm->mb_rows; + cpi->mb_wiener_var_cols = cm->mb_cols; +} + +static void set_mb_wiener_variance(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + uint8_t *buffer = cpi->Source->y_buffer; + int buf_stride = cpi->Source->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]); + uint8_t *zero_pred; +#else + DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]); +#endif + + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + + int mb_row, mb_col, count = 0; + // Hard coded operating block size + const int block_size = 16; + const int coeff_count = block_size * block_size; + const TX_SIZE tx_size = TX_16X16; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf = cpi->Source; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + zero_pred = CONVERT_TO_BYTEPTR(zero_pred16); + memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count); + } else { + zero_pred = zero_pred8; + memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count); + } +#else + memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count); +#endif + + cpi->norm_wiener_variance = 0; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int idx; + int16_t median_val = 0; + uint8_t *mb_buffer = + buffer + mb_row * block_size * buf_stride + mb_col * block_size; + int64_t wiener_variance = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size, + xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } else { + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } +#else + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); +#endif // CONFIG_VP9_HIGHBITDEPTH + + coeff[0] = 0; + for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]); + + qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp); + + // Noise level estimation + median_val = coeff[coeff_count / 2]; + + // Wiener filter + for (idx = 1; idx < coeff_count; ++idx) { + int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx]; + int64_t tmp_coeff = (int64_t)coeff[idx]; + if (median_val) { + tmp_coeff = (sqr_coeff * coeff[idx]) / + (sqr_coeff + (int64_t)median_val * median_val); + } + wiener_variance += tmp_coeff * tmp_coeff; + } + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] = + wiener_variance / coeff_count; + cpi->norm_wiener_variance += + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col]; + ++count; + } + } + + if (count) cpi->norm_wiener_variance /= count; + cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance); +} + +#if !CONFIG_REALTIME_ONLY +static void update_encode_frame_result_basic( + FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index, + ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = show_idx; + encode_frame_result->update_type = update_type; + encode_frame_result->quantize_index = quantize_index; +} + +#if CONFIG_RATE_CTRL +static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer, + IMAGE_BUFFER *image_buffer) { + const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer, + yv12_buffer->v_buffer }; + const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride, + yv12_buffer->uv_stride }; + const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width, + yv12_buffer->uv_crop_width }; + const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height, + yv12_buffer->uv_crop_height }; + int plane; + for (plane = 0; plane < 3; ++plane) { + const int src_stride = src_stride_ls[plane]; + const int w = w_ls[plane]; + const int h = h_ls[plane]; + const uint8_t *src_buf = src_buf_ls[plane]; + uint8_t *dst_buf = image_buffer->plane_buffer[plane]; + int r; + assert(image_buffer->plane_width[plane] == w); + assert(image_buffer->plane_height[plane] == h); + for (r = 0; r < h; ++r) { + memcpy(dst_buf, src_buf, sizeof(*src_buf) * w); + src_buf += src_stride; + dst_buf += w; + } + } +} +// This function will update extra information specific for simple_encode APIs +static void update_encode_frame_result_simple_encode( + int ref_frame_flags, FRAME_UPDATE_TYPE update_type, + const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index, + uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts, + const PARTITION_INFO *partition_info, + const MOTION_VECTOR_INFO *motion_vector_info, + const TplDepStats *tpl_stats_info, + ENCODE_FRAME_RESULT *encode_frame_result) { + PSNR_STATS psnr; + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth, + input_bit_depth); +#else // CONFIG_VP9_HIGHBITDEPTH + (void)bit_depth; + (void)input_bit_depth; + vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr); +#endif // CONFIG_VP9_HIGHBITDEPTH + encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_result->ref_frame_coding_indexes, + encode_frame_result->ref_frame_valid_list); + + encode_frame_result->psnr = psnr.psnr[0]; + encode_frame_result->sse = psnr.sse[0]; + encode_frame_result->frame_counts = *counts; + encode_frame_result->partition_info = partition_info; + encode_frame_result->motion_vector_info = motion_vector_info; + encode_frame_result->tpl_stats_info = tpl_stats_info; + if (encode_frame_result->coded_frame.allocated) { + yv12_buffer_to_image_buffer(&coded_frame_buf->buf, + &encode_frame_result->coded_frame); + } +} +#endif // CONFIG_RATE_CTRL +#endif // !CONFIG_REALTIME_ONLY + +static void encode_frame_to_data_rate( + VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags, + ENCODE_FRAME_RESULT *encode_frame_result) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + TX_SIZE t; + + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; + + set_ext_overrides(cpi); + vpx_clear_system_state(); + +#ifdef ENABLE_KF_DENOISE + // Spatial denoise of key frame. + if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); +#endif + + if (cm->show_existing_frame == 0) { + // Update frame index + set_frame_index(cpi, cm); + + // Set the arf sign bias for this frame. + set_ref_sign_bias(cpi); + } + + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm)) { + // Reset the loop filter deltas and segmentation map. + vp9_reset_segment_features(&cm->seg); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->rc.source_alt_ref_active = 0; + + cm->error_resilient_mode = oxcf->error_resilient_mode; + cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; + + // By default, encoder assumes decoder can use prev_mi. + if (cm->error_resilient_mode) { + cm->frame_parallel_decoding_mode = 1; + cm->reset_frame_context = 0; + cm->refresh_frame_context = 0; + } else if (cm->intra_only) { + // Only reset the current context. + cm->reset_frame_context = 2; + } + } + + if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); + + if (oxcf->aq_mode == PERCEPTUAL_AQ) { + init_mb_wiener_var_buffer(cpi); + set_mb_wiener_variance(cpi); + } + + vpx_clear_system_state(); + +#if CONFIG_INTERNAL_STATS + memset(cpi->mode_chosen_counts, 0, + MAX_MODES * sizeof(*cpi->mode_chosen_counts)); +#endif + // Backup to ensure consistency between recodes + save_encode_params(cpi); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + int ext_rdmult = VPX_DEFAULT_RDMULT; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_frame_rdmult( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &ext_rdmult); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_frame_rdmult() failed"); + } + cpi->ext_ratectrl.ext_rdmult = ext_rdmult; + } + + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + if (!encode_without_recode_loop(cpi, size, dest)) return; + } else { +#if !CONFIG_REALTIME_ONLY +#if CONFIG_RATE_CTRL + encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif + encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif +#endif // CONFIG_RATE_CTRL +#endif // !CONFIG_REALTIME_ONLY + } + + // TODO(jingning): When using show existing frame mode, we assume that the + // current ARF will be directly used as the final reconstructed frame. This is + // an encoder control scheme. One could in principle explore other + // possibilities to arrange the reference frame buffer and their coding order. + if (cm->show_existing_frame) { + ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx, + cm->ref_frame_map[cpi->alt_fb_idx]); + } + +#if !CONFIG_REALTIME_ONLY + // Disable segmentation if it decrease rate/distortion ratio + if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) + vp9_try_disable_lookahead_aq(cpi, size, dest); +#endif + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); + } +#endif +#endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp9_output_skin_map(cpi, yuv_skinmap_file); + } +#endif + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + cpi->ambient_err = + vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } else { + cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } +#else + cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + // If the encoder forced a KEY_FRAME decision + if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1; + + cm->frame_to_show = get_frame_new_buffer(cm); + cm->frame_to_show->color_space = cm->color_space; + cm->frame_to_show->color_range = cm->color_range; + cm->frame_to_show->render_width = cm->render_width; + cm->frame_to_show->render_height = cm->render_height; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif + // Pick the loop filter level for the frame. + loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif + + if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif + // build the bitstream + vp9_pack_bitstream(cpi, dest, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { + const RefCntBuffer *coded_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( + &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf, + cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_update_encodeframe_result() failed"); + } + } +#if CONFIG_REALTIME_ONLY + (void)encode_frame_result; + assert(encode_frame_result == NULL); +#else // CONFIG_REALTIME_ONLY + if (encode_frame_result != NULL) { + const RefCntBuffer *coded_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + int quantize_index = vp9_get_quantizer(cpi); + get_ref_frame_bufs(cpi, ref_frame_bufs); + // update_encode_frame_result() depends on twopass.gf_group.index and + // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and + // cpi->alt_fb_idx are updated for current frame and have + // not been updated for the next frame yet. + // The update locations are as follows. + // 1) twopass.gf_group.index is initialized at define_gf_group by vp9_zero() + // for the first frame in the gf_group and is updated for the next frame at + // vp9_twopass_postencode_update(). + // 2) cpi->Source is updated at the beginning of vp9_get_compressed_data() + // 3) cm->new_fb_idx is updated at the beginning of + // vp9_get_compressed_data() by get_free_fb(cm). + // 4) cpi->lst_fb_idx/gld_fb_idx/alt_fb_idx will be updated for the next + // frame at vp9_update_reference_frames(). + // This function needs to be called before vp9_update_reference_frames(). + // TODO(angiebird): Improve the codebase to make the update of frame + // dependent variables more robust. + + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + const int ref_frame_flags = get_ref_frame_flags(cpi); + update_encode_frame_result_simple_encode( + ref_frame_flags, + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], + cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index, + cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts, + cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info, + encode_frame_result); + } +#endif // CONFIG_RATE_CTRL + } +#endif // CONFIG_REALTIME_ONLY + + if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality && + cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) { + restore_coding_context(cpi); + return; + } + + cpi->last_frame_dropped = 0; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.num_encoded_top_layer++; + + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. + if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } + + if (cm->seg.update_map) update_reference_segmentation_map(cpi); + + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + vp9_update_reference_frames(cpi); + + if (!cm->show_existing_frame) { + for (t = TX_4X4; t <= TX_32X32; ++t) { + full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + } + + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + vp9_adapt_coef_probs(cm); + } + } + + cpi->ext_refresh_frame_flags_pending = 0; + + if (cpi->refresh_golden_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_GOLDEN; + else + cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; + + if (cpi->refresh_alt_ref_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_ALTREF; + else + cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; + + cpi->ref_frame_flags = get_ref_frame_flags(cpi); + + cm->last_frame_type = cm->frame_type; + + vp9_rc_postencode_update(cpi, *size); + + if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 && + !frame_is_intra_only(cm) && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) { + vp9_compute_frame_low_motion(cpi); + } + + *size = VPXMAX(1, *size); + +#if 0 + output_frame_level_debug_stats(cpi); +#endif + + if (cm->frame_type == KEY_FRAME) { + // Tell the caller that the frame was coded as a key frame + *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY; + } else { + *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; + } + + // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; + + // keep track of the last coded dimensions + cm->last_width = cm->width; + cm->last_height = cm->height; + + // reset to normal state now that we are done. + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + } + + if (cm->show_frame) { + vp9_swap_mi_and_prev_mi(cm); + if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); + } + update_frame_indexes(cm, cm->show_frame); + + if (cpi->use_svc) { + cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .last_frame_type = cm->frame_type; + // Reset layer_sync back to 0 for next frame. + cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0; + } + + cpi->force_update_segmentation = 0; + +#if !CONFIG_REALTIME_ONLY + if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) + vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi); +#endif + + cpi->svc.previous_frame_is_intra_only = cm->intra_only; + cpi->svc.set_intra_only_frame = 0; +} + +static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { + vp9_rc_get_svc_params(cpi); + encode_frame_to_data_rate(cpi, size, dest, frame_flags, + /*encode_frame_result = */ NULL); +} + +static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { + if (cpi->oxcf.rc_mode == VPX_CBR) { + vp9_rc_get_one_pass_cbr_params(cpi); + } else { + vp9_rc_get_one_pass_vbr_params(cpi); + } + encode_frame_to_data_rate(cpi, size, dest, frame_flags, + /*encode_frame_result = */ NULL); +} + +#if !CONFIG_REALTIME_ONLY +static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags, + ENCODE_FRAME_RESULT *encode_frame_result) { + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif + encode_frame_to_data_rate(cpi, size, dest, frame_flags, encode_frame_result); +} +#endif // !CONFIG_REALTIME_ONLY + +int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + VP9_COMMON *const cm = &cpi->common; + struct vpx_usec_timer timer; + int res = 0; + const int subsampling_x = sd->subsampling_x; + const int subsampling_y = sd->subsampling_y; +#if CONFIG_VP9_HIGHBITDEPTH + const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; +#else + const int use_highbitdepth = 0; +#endif + + update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + + alloc_raw_frame_buffers(cpi); + + vpx_usec_timer_start(&timer); + + if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, + use_highbitdepth, frame_flags)) + res = -1; + vpx_usec_timer_mark(&timer); + cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); + + if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + (subsampling_x != 1 || subsampling_y != 1)) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 3"); + res = -1; + } + if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && + (subsampling_x == 1 && subsampling_y == 1)) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "4:2:0 color format requires profile 0 or 2"); + res = -1; + } + + return res; +} + +static int frame_is_reference(const VP9_COMP *cpi) { + const VP9_COMMON *cm = &cpi->common; + + return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || + cm->refresh_frame_context || cm->lf.mode_ref_delta_update || + cm->seg.update_map || cm->seg.update_data; +} + +static void adjust_frame_rate(VP9_COMP *cpi, + const struct lookahead_entry *source) { + int64_t this_duration; + int step = 0; + + if (source->ts_start == cpi->first_time_stamp_ever) { + this_duration = source->ts_end - source->ts_start; + step = 1; + } else { + int64_t last_duration = + cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + + this_duration = source->ts_end - cpi->last_end_time_stamp_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + vp9_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = VPXMIN( + (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + + vp9_new_framerate(cpi, 10000000.0 / avg_duration); + } + } + cpi->last_time_stamp_seen = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_end; +} + +// Returns 0 if this is not an alt ref else the offset of the source frame +// used as the arf midpoint. +static int get_arf_src_index(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int arf_src_index = 0; + if (is_altref_enabled(cpi)) { + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } else if (rc->source_alt_ref_pending) { + arf_src_index = rc->frames_till_gf_update_due; + } + } + return arf_src_index; +} + +static void check_src_altref(VP9_COMP *cpi, + const struct lookahead_entry *source) { + RATE_CONTROL *const rc = &cpi->rc; + + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + rc->is_src_frame_alt_ref = + (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); + } else { + rc->is_src_frame_alt_ref = + cpi->alt_ref_source && (source == cpi->alt_ref_source); + } + + if (rc->is_src_frame_alt_ref) { + // Current frame is an ARF overlay frame. + cpi->alt_ref_source = NULL; + + // Don't refresh the last buffer for an ARF overlay frame. It will + // become the GF so preserve last as an alternative prediction option. + cpi->refresh_last_frame = 0; + } +} + +#if CONFIG_INTERNAL_STATS +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { + s->stat[Y] += y; + s->stat[U] += u; + s->stat[V] += v; + s->stat[ALL] += all; + s->worst = VPXMIN(s->worst, all); +} +#endif // CONFIG_INTERNAL_STATS + +// Adjust the maximum allowable frame size for the target level. +static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) { + RATE_CONTROL *const rc = &cpi->rc; + LevelConstraint *const ls = &cpi->level_constraint; + VP9_COMMON *const cm = &cpi->common; + const double max_cpb_size = ls->max_cpb_size; + vpx_clear_system_state(); + rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size); + if (frame_is_intra_only(cm)) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5)); + } else if (arf_src_index > 0) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4)); + } else { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2)); + } +} + +static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { + VP9_COMMON *const cm = &cpi->common; + Vp9LevelInfo *const level_info = &cpi->level_info; + Vp9LevelSpec *const level_spec = &level_info->level_spec; + Vp9LevelStats *const level_stats = &level_info->level_stats; + int i, idx; + uint64_t luma_samples, dur_end; + const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); + LevelConstraint *const level_constraint = &cpi->level_constraint; + const int8_t level_index = level_constraint->level_index; + double cpb_data_size; + + vpx_clear_system_state(); + + // update level_stats + level_stats->total_compressed_size += *size; + if (cm->show_frame) { + level_stats->total_uncompressed_size += + luma_pic_size + + 2 * (luma_pic_size >> (cm->subsampling_x + cm->subsampling_y)); + level_stats->time_encoded = + (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / + (double)TICKS_PER_SEC; + } + + if (arf_src_index > 0) { + if (!level_stats->seen_first_altref) { + level_stats->seen_first_altref = 1; + } else if (level_stats->frames_since_last_altref < + level_spec->min_altref_distance) { + level_spec->min_altref_distance = level_stats->frames_since_last_altref; + } + level_stats->frames_since_last_altref = 0; + } else { + ++level_stats->frames_since_last_altref; + } + + if (level_stats->frame_window_buffer.len < FRAME_WINDOW_SIZE - 1) { + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len++) % + FRAME_WINDOW_SIZE; + } else { + idx = level_stats->frame_window_buffer.start; + level_stats->frame_window_buffer.start = (idx + 1) % FRAME_WINDOW_SIZE; + } + level_stats->frame_window_buffer.buf[idx].ts = cpi->last_time_stamp_seen; + level_stats->frame_window_buffer.buf[idx].size = (uint32_t)(*size); + level_stats->frame_window_buffer.buf[idx].luma_samples = luma_pic_size; + + if (cm->frame_type == KEY_FRAME) { + level_stats->ref_refresh_map = 0; + } else { + int count = 0; + level_stats->ref_refresh_map |= vp9_get_refresh_mask(cpi); + // Also need to consider the case where the encoder refers to a buffer + // that has been implicitly refreshed after encoding a keyframe. + if (!cm->intra_only) { + level_stats->ref_refresh_map |= (1 << cpi->lst_fb_idx); + level_stats->ref_refresh_map |= (1 << cpi->gld_fb_idx); + level_stats->ref_refresh_map |= (1 << cpi->alt_fb_idx); + } + for (i = 0; i < REF_FRAMES; ++i) { + count += (level_stats->ref_refresh_map >> i) & 1; + } + if (count > level_spec->max_ref_frame_buffers) { + level_spec->max_ref_frame_buffers = count; + } + } + + // update average_bitrate + level_spec->average_bitrate = (double)level_stats->total_compressed_size / + 125.0 / level_stats->time_encoded; + + // update max_luma_sample_rate + luma_samples = 0; + for (i = 0; i < level_stats->frame_window_buffer.len; ++i) { + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + if (i == 0) { + dur_end = level_stats->frame_window_buffer.buf[idx].ts; + } + if (dur_end - level_stats->frame_window_buffer.buf[idx].ts >= + TICKS_PER_SEC) { + break; + } + luma_samples += level_stats->frame_window_buffer.buf[idx].luma_samples; + } + if (luma_samples > level_spec->max_luma_sample_rate) { + level_spec->max_luma_sample_rate = luma_samples; + } + + // update max_cpb_size + cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + if (cpb_data_size > level_spec->max_cpb_size) { + level_spec->max_cpb_size = cpb_data_size; + } + + // update max_luma_picture_size + if (luma_pic_size > level_spec->max_luma_picture_size) { + level_spec->max_luma_picture_size = luma_pic_size; + } + + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + + // update compression_ratio + level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * + cm->bit_depth / + level_stats->total_compressed_size / 8.0; + + // update max_col_tiles + if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) { + level_spec->max_col_tiles = (1 << cm->log2_tile_cols); + } + + if (level_index >= 0 && level_constraint->fail_flag == 0) { + if (level_spec->max_luma_picture_size > + vp9_level_defs[level_index].max_luma_picture_size) { + level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); + } + + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + + if ((double)level_spec->max_luma_sample_rate > + (double)vp9_level_defs[level_index].max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P)) { + level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]); + } + + if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) { + level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_COLUMN_TILE]); + } + + if (level_spec->min_altref_distance < + vp9_level_defs[level_index].min_altref_distance) { + level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[ALTREF_DIST_TOO_SMALL]); + } + + if (level_spec->max_ref_frame_buffers > + vp9_level_defs[level_index].max_ref_frame_buffers) { + level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_REF_BUFFER]); + } + + if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) { + level_constraint->fail_flag |= (1 << CPB_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[CPB_TOO_LARGE]); + } + + // Set an upper bound for the next frame size. It will be used in + // level_rc_framerate() before encoding the next frame. + cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + level_constraint->max_frame_size = + (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) * + 1000.0); + if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1) + level_constraint->max_frame_size >>= 1; + } +} + +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list) { + if (update_type != KF_UPDATE) { + const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG, + VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + assert(ref_frame_bufs[i] != NULL); + ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index; + ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0; + } + } else { + // No reference frame is available when this is a key frame. + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + ref_frame_coding_indexes[i] = -1; + ref_frame_valid_list[i] = 0; + } + } +} + +void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = -1; // Actual encoding doesn't happen. +#if CONFIG_RATE_CTRL + encode_frame_result->frame_coding_index = -1; + vp9_zero(encode_frame_result->coded_frame); + encode_frame_result->coded_frame.allocated = 0; + init_rq_history(&encode_frame_result->rq_history); +#endif // CONFIG_RATE_CTRL +} + +int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush, + ENCODE_FRAME_RESULT *encode_frame_result) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + RATE_CONTROL *const rc = &cpi->rc; + struct vpx_usec_timer cmptimer; + YV12_BUFFER_CONFIG *force_src_buffer = NULL; + struct lookahead_entry *last_source = NULL; + struct lookahead_entry *source = NULL; + int arf_src_index; + const int gf_group_index = cpi->twopass.gf_group.index; + int i; + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); +#endif + + if (is_one_pass_svc(cpi)) { + vp9_one_pass_svc_start_layer(cpi); + } + + vpx_usec_timer_start(&cmptimer); + + vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); + + // Is multi-arf enabled. + // Note that at the moment multi_arf is only configured for 2 pass VBR and + // will not work properly with svc. + // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf" + // is greater than or equal to 2. + if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2)) + cpi->multi_layer_arf = 1; + else + cpi->multi_layer_arf = 0; + + // Normal defaults + cm->reset_frame_context = 0; + cm->refresh_frame_context = 1; + if (!is_one_pass_svc(cpi)) { + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + } + + // Should we encode an arf frame. + arf_src_index = get_arf_src_index(cpi); + + if (arf_src_index) { + for (i = 0; i <= arf_src_index; ++i) { + struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i); + // Avoid creating an alt-ref if there's a forced keyframe pending. + if (e == NULL) { + break; + } else if (e->flags == VPX_EFLAG_FORCE_KF) { + arf_src_index = 0; + flush = 1; + break; + } + } + } + + // Clear arf index stack before group of pictures processing starts. + if (gf_group_index == 1) { + stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2); + cpi->twopass.gf_group.stack_size = 0; + } + + if (arf_src_index) { + assert(arf_src_index <= rc->frames_to_key); + if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cpi->alt_ref_source = source; + +#if !CONFIG_REALTIME_ONLY + if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && + (oxcf->arnr_strength > 0)) { + int bitrate = cpi->rc.avg_frame_bandwidth / 40; + int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY; + + int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); + not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif + // Produce the filtered ARF frame. + vp9_temporal_filter(cpi, arf_src_index); + vpx_extend_frame_borders(&cpi->alt_ref_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif + + // for small bitrates segmentation overhead usually + // eats all bitrate gain from enabling delta quantizers + if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame) + vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi); + + force_src_buffer = &cpi->alt_ref_buffer; + } +#endif + cm->show_frame = 0; + cm->intra_only = 0; + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; + rc->is_src_frame_alt_ref = 0; + rc->source_alt_ref_pending = 0; + } else { + rc->source_alt_ref_pending = 0; + } + } + + if (!source) { + // Get last frame source. + if (cm->current_video_frame > 0) { + if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL) + return -1; + } + + // Read in the source frame. + if (cpi->use_svc || cpi->svc.set_intra_only_frame) + source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); + else + source = vp9_lookahead_pop(cpi->lookahead, flush); + + if (source != NULL) { + cm->show_frame = 1; + cm->intra_only = 0; + // If the flags indicate intra frame, but if the current picture is for + // spatial layer above first_spatial_layer_to_encode, it should not be an + // intra picture. + if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc && + cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { + source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); + } + + // Check to see if the frame should be encoded as an arf overlay. + check_src_altref(cpi, source); + } + } + + if (source) { + cpi->un_scaled_source = cpi->Source = + force_src_buffer ? force_src_buffer : &source->img; + +#ifdef ENABLE_KF_DENOISE + // Copy of raw source for metrics calculation. + if (is_psnr_calc_enabled(cpi)) + vp9_copy_and_extend_frame(cpi->Source, &cpi->raw_unscaled_source); +#endif + + cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + } else { + *size = 0; + return -1; + } + + if (source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_start; + } + + // Clear down mmx registers + vpx_clear_system_state(); + + // adjust frame rates based on timestamps given + if (cm->show_frame) { + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.duration[cpi->svc.spatial_layer_id] > 0) + vp9_svc_adjust_frame_rate(cpi); + else + adjust_frame_rate(cpi, source); + } + + if (is_one_pass_svc(cpi)) { + vp9_update_temporal_layer_framerate(cpi); + vp9_restore_layer_context(cpi); + } + + // Find a free buffer for the new frame, releasing the reference previously + // held. + if (cm->new_fb_idx != INVALID_IDX) { + --pool->frame_bufs[cm->new_fb_idx].ref_count; + } + cm->new_fb_idx = get_free_fb(cm); + + if (cm->new_fb_idx == INVALID_IDX) return -1; + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; + // If the frame buffer for current frame is the same as previous frame, MV in + // the base layer shouldn't be used as it'll cause data race. + if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) { + cpi->svc.use_base_mv = 0; + } + // Start with a 0 size frame. + *size = 0; + + cpi->frame_flags = *frame_flags; + +#if !CONFIG_REALTIME_ONLY + if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif + vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif + } else if (oxcf->pass == 1) { + set_frame_size(cpi); + } +#endif // !CONFIG_REALTIME_ONLY + + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && + cpi->level_constraint.fail_flag == 0) + level_rc_framerate(cpi, arf_src_index); + + if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { + for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + } + + if (cpi->kmeans_data_arr_alloc == 0) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&cpi->kmeans_mutex, NULL); +#endif + CHECK_MEM_ERROR( + &cm->error, cpi->kmeans_data_arr, + vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); + cpi->kmeans_data_stride = mi_cols; + cpi->kmeans_data_arr_alloc = 1; + } + +#if CONFIG_NON_GREEDY_MV + { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); + Status status = vp9_alloc_motion_field_info( + &cpi->motion_field_info, MAX_ARF_GOP_SIZE, mi_rows, mi_cols); + if (status == STATUS_FAILED) { + vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, + "vp9_alloc_motion_field_info failed"); + } + } +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif + if (gf_group_index == 1 && + cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && + cpi->sf.enable_tpl_model) { + vp9_init_tpl_buffer(cpi); + vp9_estimate_qp_gop(cpi); + vp9_setup_tpl_stats(cpi); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); +#endif +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); +#endif + + cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + (void)encode_frame_result; + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY + if (oxcf->pass == 1 && !cpi->use_svc) { + const int lossless = is_lossless_requested(oxcf); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.use_highbitdepth) + cpi->td.mb.fwd_txfm4x4 = + lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + else + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = + lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; +#else + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + vp9_first_pass(cpi, source); + } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif + Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result); + vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif + } else if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#endif // CONFIG_REALTIME_ONLY + + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + + if (cm->refresh_frame_context) + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + + // No frame encoded, or frame was dropped, release scaled references. + if ((*size == 0) && (frame_is_intra_only(cm) == 0)) { + release_scaled_references(cpi); + } + + if (*size > 0) { + cpi->droppable = !frame_is_reference(cpi); + } + + // Save layer specific state. + if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { + vp9_save_layer_context(cpi); + } + + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->fixed_qp_onepass = 0; + + vpx_usec_timer_mark(&cmptimer); + cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); + + if (cpi->keep_level_stats && oxcf->pass != 1) + update_level_info(cpi, size, arf_src_index); + +#if CONFIG_INTERNAL_STATS + + if (oxcf->pass != 1 && !cpi->last_frame_dropped) { + double samples = 0.0; + cpi->bytes += (int)(*size); + + if (cm->show_frame) { + uint32_t bit_depth = 8; + uint32_t in_bit_depth = 8; + cpi->count++; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + in_bit_depth = cpi->oxcf.input_bit_depth; + bit_depth = cm->bit_depth; + } +#endif + + if (cpi->b_calculate_psnr) { + YV12_BUFFER_CONFIG *orig = cpi->raw_source_frame; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + PSNR_STATS psnr; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd, + in_bit_depth); +#else + vpx_calc_psnr(orig, recon, &psnr); +#endif // CONFIG_VP9_HIGHBITDEPTH + + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], + psnr.psnr[0], &cpi->psnr); + cpi->total_sq_error += psnr.sse[0]; + cpi->total_samples += psnr.samples[0]; + samples = psnr.samples[0]; + + { + PSNR_STATS psnr2; + double frame_ssim2 = 0, weight = 0; +#if CONFIG_VP9_POSTPROC + if (vpx_alloc_frame_buffer( + pp, recon->y_crop_width, recon->y_crop_height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment) < 0) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate post processing buffer"); + } + { + vp9_ppflags_t ppflags; + ppflags.post_proc_flag = VP9D_DEBLOCK; + ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame() + ppflags.noise_level = 0; // not used in vp9_post_proc_frame() + vp9_post_proc_frame(cm, pp, &ppflags, + cpi->un_scaled_source->y_width); + } +#endif + vpx_clear_system_state(); + +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd, + cpi->oxcf.input_bit_depth); +#else + vpx_calc_psnr(orig, pp, &psnr2); +#endif // CONFIG_VP9_HIGHBITDEPTH + + cpi->totalp_sq_error += psnr2.sse[0]; + cpi->totalp_samples += psnr2.samples[0]; + adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3], + psnr2.psnr[0], &cpi->psnrp); + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight, bit_depth, + in_bit_depth); + } else { + frame_ssim2 = vpx_calc_ssim(orig, recon, &weight); + } +#else + frame_ssim2 = vpx_calc_ssim(orig, recon, &weight); +#endif // CONFIG_VP9_HIGHBITDEPTH + + cpi->worst_ssim = VPXMIN(cpi->worst_ssim, frame_ssim2); + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + frame_ssim2 = vpx_highbd_calc_ssim(orig, pp, &weight, bit_depth, + in_bit_depth); + } else { + frame_ssim2 = vpx_calc_ssim(orig, pp, &weight); + } +#else + frame_ssim2 = vpx_calc_ssim(orig, pp, &weight); +#endif // CONFIG_VP9_HIGHBITDEPTH + + cpi->summedp_quality += frame_ssim2 * weight; + cpi->summedp_weights += weight; +#if 0 + if (cm->show_frame) { + FILE *f = fopen("q_used.stt", "a"); + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); + fclose(f); + } +#endif + } + } + if (cpi->b_calculate_blockiness) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + double frame_blockiness = vp9_get_blockiness( + cpi->Source->y_buffer, cpi->Source->y_stride, + cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, + cpi->Source->y_width, cpi->Source->y_height); + cpi->worst_blockiness = + VPXMAX(cpi->worst_blockiness, frame_blockiness); + cpi->total_blockiness += frame_blockiness; + } + } + + if (cpi->b_calculate_consistency) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + double this_inconsistency = vpx_get_ssim_metrics( + cpi->Source->y_buffer, cpi->Source->y_stride, + cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, + cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars, + &cpi->metrics, 1); + + const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1); + double consistency = + vpx_sse_to_psnr(samples, peak, (double)cpi->total_inconsistency); + if (consistency > 0.0) + cpi->worst_consistency = + VPXMIN(cpi->worst_consistency, consistency); + cpi->total_inconsistency += this_inconsistency; + } + } + + { + double y, u, v, frame_all; + frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u, + &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + } + { + double y, u, v, frame_all; + frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v, + bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + } + } + } + +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. + if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; + } + } +#endif + + if (is_one_pass_svc(cpi)) { + if (cm->show_frame) { + ++cpi->svc.spatial_layer_to_encode; + if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) + cpi->svc.spatial_layer_to_encode = 0; + } + } + + vpx_clear_system_state(); + return 0; +} + +int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags) { + VP9_COMMON *cm = &cpi->common; +#if !CONFIG_VP9_POSTPROC + (void)flags; +#endif + + if (!cm->show_frame) { + return -1; + } else { + int ret; +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width); +#else + if (cm->frame_to_show) { + *dest = *cm->frame_to_show; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->subsampling_x; + dest->uv_height = cm->height >> cm->subsampling_y; + ret = 0; + } else { + ret = -1; + } +#endif // !CONFIG_VP9_POSTPROC + vpx_clear_system_state(); + return ret; + } +} + +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + VP9_COMMON *cm = &cpi->common; + int hr = 0, hs = 0, vr = 0, vs = 0; + + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; + cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; + if (cm->current_video_frame) { + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + } + + update_frame_size(cpi); + + return 0; +} + +int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, + unsigned int height) { + VP9_COMMON *cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x, + cpi->common.subsampling_y); +#else + update_initial_width(cpi, 0, cpi->common.subsampling_x, + cpi->common.subsampling_y); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + alloc_raw_frame_buffers(cpi); + if (width) { + cm->width = width; + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d\n", cm->width); + } + } + + if (height) { + cm->height = height; + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d\n", cm->height); + } + } + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + + update_frame_size(cpi); + + return 0; +} + +void vp9_set_svc(VP9_COMP *cpi, int use_svc) { + cpi->use_svc = use_svc; + return; +} + +int vp9_get_quantizer(const VP9_COMP *cpi) { return cpi->common.base_qindex; } + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { + if (flags & + (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF)) { + int ref = 7; + + if (flags & VP8_EFLAG_NO_REF_LAST) ref ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_REF_GF) ref ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_REF_ARF) ref ^= VP9_ALT_FLAG; + + vp9_use_as_reference(cpi, ref); + } + + if (flags & + (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_FORCE_GF | VP8_EFLAG_FORCE_ARF)) { + int upd = 7; + + if (flags & VP8_EFLAG_NO_UPD_LAST) upd ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_GF) upd ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_ARF) upd ^= VP9_ALT_FLAG; + + vp9_update_reference(cpi, upd); + } + + if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { + vp9_update_entropy(cpi, 0); + } +} + +void vp9_set_row_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->row_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.row_mt && !cpi->use_svc) + cpi->row_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt && + !cpi->use_svc) + cpi->row_mt = 1; + + // In realtime mode, enable row based multi-threading for all the speed levels + // where non-rd path is used. + if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) { + cpi->row_mt = 1; + } + + if (cpi->row_mt) + cpi->row_mt_bit_exact = 1; + else + cpi->row_mt_bit_exact = 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h new file mode 100644 index 0000000000..91df538821 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h @@ -0,0 +1,1664 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_ +#define VPX_VP9_ENCODER_VP9_ENCODER_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" +#if CONFIG_INTERNAL_STATS +#include "vpx_dsp/ssim.h" +#endif +#include "vpx_dsp/variance.h" +#include "vpx_dsp/psnr.h" +#include "vpx_ports/system_state.h" +#include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_timestamp.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_ppflags.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_alt_ref_aq.h" +#endif +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_job_queue.h" +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_noise_estimate.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_speed_features.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_tokenize.h" + +#if CONFIG_VP9_TEMPORAL_DENOISING +#include "vp9/encoder/vp9_denoiser.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// vp9 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000 + +typedef struct { + int nmvjointcost[MV_JOINTS]; + int nmvcosts[2][MV_VALS]; + int nmvcosts_hp[2][MV_VALS]; + + vpx_prob segment_pred_probs[PREDICTION_PROBS]; + + unsigned char *last_frame_seg_map_copy; + + // 0 = Intra, Last, GF, ARF + signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; + // 0 = ZERO_MV, MV + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT fc; +} CODING_CONTEXT; + +typedef enum { + // encode_breakout is disabled. + ENCODE_BREAKOUT_DISABLED = 0, + // encode_breakout is enabled. + ENCODE_BREAKOUT_ENABLED = 1, + // encode_breakout is enabled with small max_thresh limit. + ENCODE_BREAKOUT_LIMITED = 2 +} ENCODE_BREAKOUT_TYPE; + +typedef enum { + // Good Quality Fast Encoding. The encoder balances quality with the amount of + // time it takes to encode the output. Speed setting controls how fast. + GOOD, + + // The encoder places priority on the quality of the output over encoding + // speed. The output is compressed at the highest possible quality. This + // option takes the longest amount of time to encode. Speed setting ignored. + BEST, + + // Realtime/Live Encoding. This mode is optimized for realtime encoding (for + // example, capturing a television signal or feed from a live camera). Speed + // setting controls how fast. + REALTIME +} MODE; + +typedef enum { + FRAMEFLAGS_KEY = 1 << 0, + FRAMEFLAGS_GOLDEN = 1 << 1, + FRAMEFLAGS_ALTREF = 1 << 2, +} FRAMETYPE_FLAGS; + +typedef enum { + NO_AQ = 0, + VARIANCE_AQ = 1, + COMPLEXITY_AQ = 2, + CYCLIC_REFRESH_AQ = 3, + EQUATOR360_AQ = 4, + PERCEPTUAL_AQ = 5, + PSNR_AQ = 6, + // AQ based on lookahead temporal + // variance (only valid for altref frames) + LOOKAHEAD_AQ = 7, + AQ_MODE_COUNT // This should always be the last member of the enum +} AQ_MODE; + +typedef enum { + RESIZE_NONE = 0, // No frame resizing allowed (except for SVC). + RESIZE_FIXED = 1, // All frames are coded at the specified dimension. + RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. +} RESIZE_TYPE; + +typedef enum { + kInvalid = 0, + kLowSadLowSumdiff = 1, + kLowSadHighSumdiff = 2, + kHighSadLowSumdiff = 3, + kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, + kVeryHighSad = 6, +} CONTENT_STATE_SB; + +typedef enum { + LOOPFILTER_ALL = 0, + LOOPFILTER_REFERENCE = 1, // Disable loopfilter on non reference frames. + NO_LOOPFILTER = 2, // Disable loopfilter on all frames. +} LOOPFILTER_CONTROL; + +typedef struct VP9EncoderConfig { + BITSTREAM_PROFILE profile; + vpx_bit_depth_t bit_depth; // Codec bit-depth. + int width; // width of data passed to the compressor + int height; // height of data passed to the compressor + unsigned int input_bit_depth; // Input bit depth. + double init_framerate; // set to passed in framerate + vpx_rational_t g_timebase; // equivalent to g_timebase in vpx_codec_enc_cfg_t + vpx_rational64_t g_timebase_in_ts; // g_timebase * TICKS_PER_SEC + + int64_t target_bandwidth; // bandwidth to be used in bits per second + + int noise_sensitivity; // pre processing blur: recommendation 0 + int sharpness; // sharpening output: recommendation 0: + int speed; + // maximum allowed bitrate for any intra frame in % of bitrate target. + unsigned int rc_max_intra_bitrate_pct; + // maximum allowed bitrate for any inter frame in % of bitrate target. + unsigned int rc_max_inter_bitrate_pct; + // percent of rate boost for golden frame in CBR mode. + unsigned int gf_cbr_boost_pct; + + MODE mode; + int pass; + + // Key Framing Operations + int auto_key; // autodetect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. + + int lag_in_frames; // how many frames lag before we start encoding + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + enum vpx_rc_mode rc_mode; + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; + + // Frame drop threshold. + int drop_frames_water_mark; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + AQ_MODE aq_mode; // Adaptive Quantization mode + + // Special handling of Adaptive Quantization for AltRef frames + int alt_ref_aq; + + // Internal frame size scaling. + RESIZE_TYPE resize_mode; + int scaled_frame_width; + int scaled_frame_height; + + // Enable feature to reduce the frame quantization every x frames. + int frame_periodic_boost; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + // Spatial and temporal scalability. + int ss_number_layers; // Number of spatial layers. + int ts_number_layers; // Number of temporal layers. + // Bitrate allocation for spatial layers. + int layer_target_bitrate[VPX_MAX_LAYERS]; + int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + int ss_enable_auto_arf[VPX_SS_MAX_LAYERS]; + // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + + int enable_auto_arf; + + int encode_breakout; // early breakout : for video conf recommend 800 + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. + */ + unsigned int frame_parallel_decoding_mode; + + int arnr_max_frames; + int arnr_strength; + + int min_gf_interval; + int max_gf_interval; + + int tile_columns; + int tile_rows; + + int enable_tpl_model; + + int max_threads; + + unsigned int target_level; + + vpx_fixed_buf_t two_pass_stats_in; + + vp8e_tuning tuning; + vp9e_tune_content content; +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; +#endif + vpx_color_space_t color_space; + vpx_color_range_t color_range; + int render_width; + int render_height; + VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + + int row_mt; + unsigned int motion_vector_unit_test; + int delta_q_uv; + int use_simple_encode_api; // Use SimpleEncode APIs or not +} VP9EncoderConfig; + +static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +typedef struct TplDepStats { + int64_t intra_cost; + int64_t inter_cost; + int64_t mc_flow; + int64_t mc_dep_cost; + int64_t mc_ref_cost; + + int ref_frame_index; + int_mv mv; +} TplDepStats; + +#if CONFIG_NON_GREEDY_MV + +#define ZERO_MV_MODE 0 +#define NEW_MV_MODE 1 +#define NEAREST_MV_MODE 2 +#define NEAR_MV_MODE 3 +#define MAX_MV_MODE 4 +#endif + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_qindex; +#if CONFIG_NON_GREEDY_MV + int lambda; + int *mv_mode_arr[3]; + double *rd_diff_arr[3]; +#endif +} TplDepFrame; + +#define TPL_DEP_COST_SCALE_LOG2 4 + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. +typedef struct TileDataEnc { + TileInfo tile_info; + int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; + int8_t mode_map[BLOCK_SIZES][MAX_MODES]; + FIRSTPASS_DATA fp_data; + VP9RowMTSync row_mt_sync; + + // Used for adaptive_rd_thresh with row multithreading + int *row_base_thresh_freq_fact; + MV firstpass_top_mv; +} TileDataEnc; + +typedef struct RowMTInfo { + JobQueueHandle job_queue_hdl; +#if CONFIG_MULTITHREAD + pthread_mutex_t job_mutex; +#endif +} RowMTInfo; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_vert_unit_rows; + + // Frame level params + int num_tile_vert_sbs[MAX_NUM_TILE_ROWS]; + + // Job Queue structure and handles + JobQueue *job_queue; + + int jobs_per_tile_col; + + RowMTInfo row_mt_info[MAX_NUM_TILE_COLS]; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + +typedef struct RD_COUNTS { + vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; + int64_t comp_pred_diff[REFERENCE_MODES]; + int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + + PICK_MODE_CONTEXT *leaf_tree; + PC_TREE *pc_tree; + PC_TREE *pc_root; +} ThreadData; + +struct EncWorkerData; + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +typedef enum { Y, U, V, ALL } STAT_TYPE; + +typedef struct IMAGE_STAT { + double stat[ALL + 1]; + double worst; +} ImageStat; + +// Kf noise filtering currently disabled by default in build. +// #define ENABLE_KF_DENOISE 1 + +#define CPB_WINDOW_SIZE 4 +#define FRAME_WINDOW_SIZE 128 +#define SAMPLE_RATE_GRACE_P 0.015 +#define VP9_LEVELS 14 + +typedef enum { + LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, + LEVEL_1 = 10, + LEVEL_1_1 = 11, + LEVEL_2 = 20, + LEVEL_2_1 = 21, + LEVEL_3 = 30, + LEVEL_3_1 = 31, + LEVEL_4 = 40, + LEVEL_4_1 = 41, + LEVEL_5 = 50, + LEVEL_5_1 = 51, + LEVEL_5_2 = 52, + LEVEL_6 = 60, + LEVEL_6_1 = 61, + LEVEL_6_2 = 62, + LEVEL_MAX = 255 +} VP9_LEVEL; + +typedef struct { + VP9_LEVEL level; + uint64_t max_luma_sample_rate; + uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; + double average_bitrate; // in kilobits per second + double max_cpb_size; // in kilobits + double compression_ratio; + uint8_t max_col_tiles; + uint32_t min_altref_distance; + uint8_t max_ref_frame_buffers; +} Vp9LevelSpec; + +extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]; + +typedef struct { + int64_t ts; // timestamp + uint32_t luma_samples; + uint32_t size; // in bytes +} FrameRecord; + +typedef struct { + FrameRecord buf[FRAME_WINDOW_SIZE]; + uint8_t start; + uint8_t len; +} FrameWindowBuffer; + +typedef struct { + uint8_t seen_first_altref; + uint32_t frames_since_last_altref; + uint64_t total_compressed_size; + uint64_t total_uncompressed_size; + double time_encoded; // in seconds + FrameWindowBuffer frame_window_buffer; + int ref_refresh_map; +} Vp9LevelStats; + +typedef struct { + Vp9LevelStats level_stats; + Vp9LevelSpec level_spec; +} Vp9LevelInfo; + +typedef enum { + BITRATE_TOO_LARGE = 0, + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS +} TARGET_LEVEL_FAIL_ID; + +typedef struct { + int8_t level_index; + uint8_t fail_flag; + int max_frame_size; // in bits + double max_cpb_size; // in bits +} LevelConstraint; + +typedef struct ARNRFilterData { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int strength; + int frame_count; + int alt_ref_index; + struct scale_factors sf; +} ARNRFilterData; + +typedef struct EncFrameBuf { + int mem_valid; + int released; + YV12_BUFFER_CONFIG frame; +} EncFrameBuf; + +// Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. +#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) +#define MAX_KMEANS_GROUPS 8 + +typedef struct KMEANS_DATA { + double value; + int pos; + int group_idx; +} KMEANS_DATA; + +#if CONFIG_RATE_CTRL +typedef struct PARTITION_INFO { + int row; // row pixel offset of current 4x4 block + int column; // column pixel offset of current 4x4 block + int row_start; // row pixel offset of the start of the prediction block + int column_start; // column pixel offset of the start of the prediction block + int width; // prediction block width + int height; // prediction block height +} PARTITION_INFO; + +typedef struct MOTION_VECTOR_INFO { + MV_REFERENCE_FRAME ref_frame[2]; + int_mv mv[2]; +} MOTION_VECTOR_INFO; + +typedef struct GOP_COMMAND { + int use; // use this command to set gop or not. If not, use vp9's decision. + int show_frame_count; + int use_alt_ref; +} GOP_COMMAND; + +static INLINE void gop_command_on(GOP_COMMAND *gop_command, + int show_frame_count, int use_alt_ref) { + gop_command->use = 1; + gop_command->show_frame_count = show_frame_count; + gop_command->use_alt_ref = use_alt_ref; +} + +static INLINE void gop_command_off(GOP_COMMAND *gop_command) { + gop_command->use = 0; + gop_command->show_frame_count = 0; + gop_command->use_alt_ref = 0; +} + +static INLINE int gop_command_coding_frame_count( + const GOP_COMMAND *gop_command) { + if (gop_command->use == 0) { + assert(0); + return -1; + } + return gop_command->show_frame_count + gop_command->use_alt_ref; +} + +// TODO(angiebird): See if we can merge this one with FrameType in +// simple_encode.h +typedef enum ENCODE_FRAME_TYPE { + ENCODE_FRAME_TYPE_KEY, + ENCODE_FRAME_TYPE_INTER, + ENCODE_FRAME_TYPE_ALTREF, + ENCODE_FRAME_TYPE_OVERLAY, + ENCODE_FRAME_TYPE_GOLDEN, + ENCODE_FRAME_TYPES, +} ENCODE_FRAME_TYPE; + +// TODO(angiebird): Merge this function with get_frame_type_from_update_type() +static INLINE ENCODE_FRAME_TYPE +get_encode_frame_type(FRAME_UPDATE_TYPE update_type) { + switch (update_type) { + case KF_UPDATE: return ENCODE_FRAME_TYPE_KEY; + case ARF_UPDATE: return ENCODE_FRAME_TYPE_ALTREF; + case GF_UPDATE: return ENCODE_FRAME_TYPE_GOLDEN; + case OVERLAY_UPDATE: return ENCODE_FRAME_TYPE_OVERLAY; + case LF_UPDATE: return ENCODE_FRAME_TYPE_INTER; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + return ENCODE_FRAME_TYPE_INTER; + } +} + +typedef struct RATE_QSTEP_MODEL { + // The rq model predicts the bit usage as follows. + // rate = bias - ratio * log2(q_step) + int ready; + double bias; + double ratio; +} RATE_QSTEP_MODEL; + +typedef struct ENCODE_COMMAND { + int use_external_quantize_index; + int external_quantize_index; + + int use_external_target_frame_bits; + int target_frame_bits; + double target_frame_bits_error_percent; + + GOP_COMMAND gop_command; +} ENCODE_COMMAND; + +static INLINE void encode_command_set_gop_command( + ENCODE_COMMAND *encode_command, GOP_COMMAND gop_command) { + encode_command->gop_command = gop_command; +} + +static INLINE void encode_command_set_external_quantize_index( + ENCODE_COMMAND *encode_command, int quantize_index) { + encode_command->use_external_quantize_index = 1; + encode_command->external_quantize_index = quantize_index; +} + +static INLINE void encode_command_reset_external_quantize_index( + ENCODE_COMMAND *encode_command) { + encode_command->use_external_quantize_index = 0; + encode_command->external_quantize_index = -1; +} + +static INLINE void encode_command_set_target_frame_bits( + ENCODE_COMMAND *encode_command, int target_frame_bits, + double target_frame_bits_error_percent) { + encode_command->use_external_target_frame_bits = 1; + encode_command->target_frame_bits = target_frame_bits; + encode_command->target_frame_bits_error_percent = + target_frame_bits_error_percent; +} + +static INLINE void encode_command_reset_target_frame_bits( + ENCODE_COMMAND *encode_command) { + encode_command->use_external_target_frame_bits = 0; + encode_command->target_frame_bits = -1; + encode_command->target_frame_bits_error_percent = 0; +} + +static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) { + vp9_zero(*encode_command); + encode_command_reset_external_quantize_index(encode_command); + encode_command_reset_target_frame_bits(encode_command); + gop_command_off(&encode_command->gop_command); +} + +// Returns number of units in size of 4, if not multiple not a multiple of 4, +// round it up. For example, size is 7, return 2. +static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } +// Returns number of units in size of 16, if not multiple not a multiple of 16, +// round it up. For example, size is 17, return 2. +static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } +#endif // CONFIG_RATE_CTRL + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. +typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + +typedef struct VP9_COMP { + FRAME_INFO frame_info; + QUANTS quants; + ThreadData td; + MB_MODE_INFO_EXT *mbmi_ext_base; + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); + VP9_COMMON common; + VP9EncoderConfig oxcf; + struct lookahead_ctx *lookahead; + struct lookahead_entry *alt_ref_source; + + YV12_BUFFER_CONFIG *Source; + YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames + YV12_BUFFER_CONFIG *un_scaled_source; + YV12_BUFFER_CONFIG scaled_source; + YV12_BUFFER_CONFIG *unscaled_last_source; + YV12_BUFFER_CONFIG scaled_last_source; +#ifdef ENABLE_KF_DENOISE + YV12_BUFFER_CONFIG raw_unscaled_source; + YV12_BUFFER_CONFIG raw_scaled_source; +#endif + YV12_BUFFER_CONFIG *raw_source_frame; + + BLOCK_SIZE tpl_bsize; + TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + VpxTplGopStats tpl_gop_stats; + YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; + EncFrameBuf enc_frame_buf[REF_FRAMES]; +#if CONFIG_MULTITHREAD + pthread_mutex_t kmeans_mutex; +#endif + int kmeans_data_arr_alloc; + KMEANS_DATA *kmeans_data_arr; + int kmeans_data_size; + int kmeans_data_stride; + double kmeans_ctr_ls[MAX_KMEANS_GROUPS]; + double kmeans_boundary_ls[MAX_KMEANS_GROUPS]; + int kmeans_count_ls[MAX_KMEANS_GROUPS]; + int kmeans_ctr_num; +#if CONFIG_NON_GREEDY_MV + MotionFieldInfo motion_field_info; + int tpl_ready; + int_mv *select_mv_arr; +#endif + + TileDataEnc *tile_data; + int allocated_tiles; // Keep track of memory allocated for tiles. + + int scaled_ref_idx[REFS_PER_FRAME]; + int lst_fb_idx; + int gld_fb_idx; + int alt_fb_idx; + + int ref_fb_idx[REF_FRAMES]; + + int refresh_last_frame; + int refresh_golden_frame; + int refresh_alt_ref_frame; + + int ext_refresh_frame_flags_pending; + int ext_refresh_last_frame; + int ext_refresh_golden_frame; + int ext_refresh_alt_ref_frame; + + int ext_refresh_frame_context_pending; + int ext_refresh_frame_context; + + int64_t norm_wiener_variance; + int64_t *mb_wiener_variance; + int mb_wiener_var_rows; + int mb_wiener_var_cols; + double *mi_ssim_rdmult_scaling_factors; + + YV12_BUFFER_CONFIG last_frame_uf; + + TOKENEXTRA *tile_tok[4][1 << 6]; + TOKENLIST *tplist[4][1 << 6]; + + // Ambient reconstruction err target for force key frames + int64_t ambient_err; + + RD_CONTROL rd_ctrl; + RD_OPT rd; + + CODING_CONTEXT coding_context; + + int *nmvcosts[2]; + int *nmvcosts_hp[2]; + int *nmvsadcosts[2]; + int *nmvsadcosts_hp[2]; + + int64_t last_time_stamp_seen; + int64_t last_end_time_stamp_seen; + int64_t first_time_stamp_ever; + + RATE_CONTROL rc; + double framerate; + + int interp_filter_selected[REF_FRAMES][SWITCHABLE]; + + struct vpx_codec_pkt_list *output_pkt_list; + + MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; + int mbgraph_n_frames; // number of frames filled in the above + int static_mb_pct; // % forced skip mbs by segmentation + int ref_frame_flags; + + SPEED_FEATURES sf; + + uint32_t max_mv_magnitude; + int mv_step_param; + + int allow_comp_inter_inter; + + // Default value is 1. From first pass stats, encode_breakout may be disabled. + ENCODE_BREAKOUT_TYPE allow_encode_breakout; + + // Get threshold from external input. A suggested threshold is 800 for HD + // clips, and 300 for < HD clips. + int encode_breakout; + + uint8_t *segmentation_map; + + uint8_t *skin_map; + + // segment threshold for encode breakout + int segment_encode_breakout[MAX_SEGMENTS]; + + CYCLIC_REFRESH *cyclic_refresh; + ActiveMap active_map; + + fractional_mv_step_fp *find_fractional_mv_step; + struct scale_factors me_sf; + vp9_diamond_search_fn_t diamond_search_sad; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; + uint64_t time_receive_data; + uint64_t time_compress_data; + uint64_t time_pick_lpf; + uint64_t time_encode_sb_row; + + TWO_PASS twopass; + + // Force recalculation of segment_ids for each mode info + uint8_t force_update_segmentation; + + YV12_BUFFER_CONFIG alt_ref_buffer; + + // class responsible for adaptive + // quantization of altref frames + struct ALT_REF_AQ *alt_ref_aq; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; + + int count; + uint64_t total_sq_error; + uint64_t total_samples; + ImageStat psnr; + + uint64_t totalp_sq_error; + uint64_t totalp_samples; + ImageStat psnrp; + + double total_blockiness; + double worst_blockiness; + + int bytes; + double summed_quality; + double summed_weights; + double summedp_quality; + double summedp_weights; + unsigned int tot_recode_hits; + double worst_ssim; + + ImageStat ssimg; + ImageStat fastssim; + ImageStat psnrhvs; + + int b_calculate_ssimg; + int b_calculate_blockiness; + + int b_calculate_consistency; + + double total_inconsistency; + double worst_consistency; + Ssimv *ssim_vars; + Metrics metrics; +#endif + int b_calculate_psnr; + + int droppable; + + int initial_width; + int initial_height; + int initial_mbs; // Number of MBs in the full-size frame; to be used to + // normalize the firstpass stats. This will differ from the + // number of MBs in the current frame when the frame is + // scaled. + + int last_coded_width; + int last_coded_height; + + int use_svc; + + SVC svc; + + // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. + Diff *source_diff_var; + // The threshold used in SOURCE_VAR_BASED_PARTITION search type. + unsigned int source_var_thresh; + int frames_till_next_var_check; + + int frame_flags; + + search_site_config ss_cfg; + + int mbmode_cost[INTRA_MODES]; + unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; + int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES][INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; + +#if CONFIG_VP9_TEMPORAL_DENOISING + VP9_DENOISER denoiser; +#endif + + int resize_pending; + RESIZE_STATE resize_state; + int external_resize; + int resize_scale_num; + int resize_scale_den; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; + + int use_skin_detection; + + int target_level; + + NOISE_ESTIMATE noise_estimate; + + // Count on how many consecutive times a block uses small/zeromv for encoding. + uint8_t *consec_zero_mv; + + // VAR_BASED_PARTITION thresholds + // 0 - threshold_64x64; 1 - threshold_32x32; + // 2 - threshold_16x16; 3 - vbp_threshold_8x8; + int64_t vbp_thresholds[4]; + int64_t vbp_threshold_minmax; + int64_t vbp_threshold_sad; + // Threshold used for partition copy + int64_t vbp_threshold_copy; + BLOCK_SIZE vbp_bsize_min; + + // Multi-threading + int num_workers; + VPxWorker *workers; + struct EncWorkerData *tile_thr_data; + VP9LfSync lf_row_sync; + struct VP9BitstreamWorkerData *vp9_bitstream_worker_data; + + int keep_level_stats; + Vp9LevelInfo level_info; + MultiThreadHandle multi_thread_ctxt; + void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); + void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + ARNRFilterData arnr_filter_data; + + int row_mt; + unsigned int row_mt_bit_exact; + + // Previous Partition Info + BLOCK_SIZE *prev_partition; + int8_t *prev_segment_id; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for + // 32x32, 9~24 for 16x16. + // This is for the last frame and is copied to the current frame + // when partition copy happens. + uint8_t *prev_variance_low; + uint8_t *copied_frame_cnt; + uint8_t max_copied_frame; + // If the last frame is dropped, we don't copy partition. + uint8_t last_frame_dropped; + + // For each superblock: keeps track of the last time (in frame distance) the + // the superblock did not have low source sad. + uint8_t *content_state_sb_fd; + + int compute_source_sad_onepass; + + int compute_frame_low_motion_onepass; + + LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t *count_lastgolden_frame_usage; + + int multi_layer_arf; + vpx_roi_map_t roi; + + LOOPFILTER_CONTROL loopfilter_ctrl; +#if CONFIG_RATE_CTRL + ENCODE_COMMAND encode_command; + PARTITION_INFO *partition_info; + MOTION_VECTOR_INFO *motion_vector_info; + MOTION_VECTOR_INFO *fp_motion_vector_info; + TplDepStats *tpl_stats_info; + + RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; +#endif + EXT_RATECTRL ext_ratectrl; + + int fixed_qp_onepass; + + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. + */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. + */ + uint64_t frame_component_time[kTimingComponents]; +#endif + // Flag to indicate if QP and GOP for TPL is controlled by external RC. + int tpl_with_external_rc; +} VP9_COMP; + +#if CONFIG_RATE_CTRL +// Allocates memory for the partition information. +// The unit size is each 4x4 block. +// Only called once in vp9_create_compressor(). +static INLINE void partition_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); + const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); + CHECK_MEM_ERROR(&cm->error, cpi->partition_info, + (PARTITION_INFO *)vpx_calloc(unit_width * unit_height, + sizeof(PARTITION_INFO))); + memset(cpi->partition_info, 0, + unit_width * unit_height * sizeof(PARTITION_INFO)); +} + +// Frees memory of the partition information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_partition_info(struct VP9_COMP *cpi) { + vpx_free(cpi->partition_info); + cpi->partition_info = NULL; +} + +static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { + mv_info->ref_frame[0] = NO_REF_FRAME; + mv_info->ref_frame[1] = NO_REF_FRAME; + mv_info->mv[0].as_int = INVALID_MV; + mv_info->mv[1].as_int = INVALID_MV; +} + +// Allocates memory for the motion vector information. +// The unit size is each 4x4 block. +// Only called once in vp9_create_compressor(). +static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); + const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); + CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info, + (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, + sizeof(MOTION_VECTOR_INFO))); + memset(cpi->motion_vector_info, 0, + unit_width * unit_height * sizeof(MOTION_VECTOR_INFO)); +} + +// Frees memory of the motion vector information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { + vpx_free(cpi->motion_vector_info); + cpi->motion_vector_info = NULL; +} + +// Allocates memory for the tpl stats information. +// Only called once in vp9_create_compressor(). +static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + CHECK_MEM_ERROR( + &cm->error, cpi->tpl_stats_info, + (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); + memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); +} + +// Frees memory of the tpl stats information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_tpl_stats_info(struct VP9_COMP *cpi) { + vpx_free(cpi->tpl_stats_info); + cpi->tpl_stats_info = NULL; +} + +// Allocates memory for the first pass motion vector information. +// The unit size is each 16x16 block. +// Only called once in vp9_create_compressor(). +static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); + const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); + CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info, + (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, + sizeof(MOTION_VECTOR_INFO))); +} + +static INLINE void fp_motion_vector_info_reset( + int frame_width, int frame_height, + MOTION_VECTOR_INFO *fp_motion_vector_info) { + const int unit_width = get_num_unit_16x16(frame_width); + const int unit_height = get_num_unit_16x16(frame_height); + int i; + for (i = 0; i < unit_width * unit_height; ++i) { + reset_mv_info(fp_motion_vector_info + i); + } +} + +// Frees memory of the first pass motion vector information. +// Only called once in dealloc_compressor_data(). +static INLINE void free_fp_motion_vector_info(struct VP9_COMP *cpi) { + vpx_free(cpi->fp_motion_vector_info); + cpi->fp_motion_vector_info = NULL; +} + +// This is the c-version counter part of ImageBuffer +typedef struct IMAGE_BUFFER { + int allocated; + int plane_width[3]; + int plane_height[3]; + uint8_t *plane_buffer[3]; +} IMAGE_BUFFER; + +#define RATE_CTRL_MAX_RECODE_NUM 7 + +typedef struct RATE_QINDEX_HISTORY { + int recode_count; + int q_index_history[RATE_CTRL_MAX_RECODE_NUM]; + int rate_history[RATE_CTRL_MAX_RECODE_NUM]; + int q_index_high; + int q_index_low; +} RATE_QINDEX_HISTORY; + +#endif // CONFIG_RATE_CTRL + +typedef struct ENCODE_FRAME_RESULT { + int show_idx; + FRAME_UPDATE_TYPE update_type; +#if CONFIG_RATE_CTRL + int frame_coding_index; + int ref_frame_coding_indexes[MAX_INTER_REF_FRAMES]; + int ref_frame_valid_list[MAX_INTER_REF_FRAMES]; + double psnr; + uint64_t sse; + FRAME_COUNTS frame_counts; + const PARTITION_INFO *partition_info; + const MOTION_VECTOR_INFO *motion_vector_info; + const TplDepStats *tpl_stats_info; + IMAGE_BUFFER coded_frame; + RATE_QINDEX_HISTORY rq_history; +#endif // CONFIG_RATE_CTRL + int quantize_index; +} ENCODE_FRAME_RESULT; + +void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result); + +void vp9_initialize_enc(void); + +void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt); +struct VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, + BufferPool *const pool); +void vp9_remove_compressor(VP9_COMP *cpi); + +void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); + +// receive a frames worth of data. caller can assume that a copy of this +// frame is made and not just a copy of the pointer.. +int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time); + +int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush, + ENCODE_FRAME_RESULT *encode_frame_result); + +int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + +int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags); + +void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags); + +int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int vp9_update_entropy(VP9_COMP *cpi, int update); + +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); + +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); + +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); + +int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, + unsigned int height); + +void vp9_set_svc(VP9_COMP *cpi, int use_svc); + +// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the +// configuration change has a large change in avg_frame_bandwidth. +// For SVC check for resetting based on spatial layer average bandwidth. +// Also reset buffer level to optimal level. +void vp9_check_reset_rc_flag(VP9_COMP *cpi); + +void vp9_set_rc_buffer_sizes(VP9_COMP *cpi); + +static INLINE int stack_pop(int *stack, int stack_size) { + int idx; + const int r = stack[0]; + for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx]; + + return r; +} + +static INLINE int stack_top(const int *stack) { return stack[0]; } + +static INLINE void stack_push(int *stack, int new_item, int stack_size) { + int idx; + for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1]; + stack[0] = new_item; +} + +static INLINE void stack_init(int *stack, int length) { + int idx; + for (idx = 0; idx < length; ++idx) stack[idx] = -1; +} + +int vp9_get_quantizer(const VP9_COMP *cpi); + +static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { + return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); +} + +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + +static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { + if (ref_frame == LAST_FRAME) { + return cpi->lst_fb_idx; + } else if (ref_frame == GOLDEN_FRAME) { + return cpi->gld_fb_idx; + } else { + return cpi->alt_fb_idx; + } +} + +static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi, + int ref_frame) { + const VP9_COMMON *const cm = &cpi->common; + const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_cnt_buffer(const VP9_COMMON *cm, + int fb_idx) { + return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL; +} + +static INLINE void get_ref_frame_bufs( + const VP9_COMP *cpi, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]) { + const VP9_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + int ref_frame_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + int inter_ref_idx = mv_ref_frame_to_inter_ref_idx(ref_frame); + ref_frame_bufs[inter_ref_idx] = get_ref_cnt_buffer(cm, ref_frame_buf_idx); + } +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( + const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) { + const VP9_COMMON *const cm = &cpi->common; + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf + : NULL; +} + +static INLINE int get_token_alloc(int mb_rows, int mb_cols) { + // TODO(JBB): double check we can't exceed this token count if we have a + // 32x32 transform crossing a boundary at a multiple of 16. + // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full + // resolution. We assume up to 1 token per pixel, and then allow + // a head room of 4. + return mb_rows * mb_cols * (16 * 16 * 3 + 4); +} + +// Get the allocated token size for a tile. It does the same calculation as in +// the frame token allocation. +static INLINE int allocated_tokens(TileInfo tile) { + int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1; + int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1; + + return get_token_alloc(tile_mb_rows, tile_mb_cols); +} + +static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + const int mb_row = (mi_row - tile_info->mi_row_start) >> 1; + + *tok = + cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols); +} + +int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_scale_references(VP9_COMP *cpi); + +void vp9_update_reference_frames(VP9_COMP *cpi); + +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list); + +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); + +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); + +static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) { + return (cpi->use_svc && cpi->oxcf.pass == 0); +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); +} +#endif + +#define MIN_LOOKAHEAD_FOR_ARFS 4 +static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { + return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && + cpi->oxcf.enable_auto_arf; +} + +static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd, + MV_REFERENCE_FRAME ref0, + MV_REFERENCE_FRAME ref1) { + xd->block_refs[0] = + &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0]; + xd->block_refs[1] = + &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0]; +} + +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; +} + +static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { + return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; +} + +static INLINE int get_num_vert_units(TileInfo tile, int shift) { + int num_vert_units = + (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift; + return num_vert_units; +} + +static INLINE int get_num_cols(TileInfo tile, int shift) { + int num_cols = + (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift; + return num_cols; +} + +static INLINE int get_level_index(VP9_LEVEL level) { + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (level == vp9_level_defs[i].level) return i; + } + return -1; +} + +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + +VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); + +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); + +void vp9_new_framerate(VP9_COMP *cpi, double framerate); + +void vp9_set_row_mt(VP9_COMP *cpi); + +int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); + +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + +static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, + int subsampling_dim, int blk_dim) { + return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +} + +// Compute the sum of squares on all visible 4x4s in the transform block. +static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + + switch (args->enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) + : 1; + } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c new file mode 100644 index 0000000000..a8d1cb7a7a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_thread_common.h" +#include "vp9/encoder/vp9_bitstream.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vpx_dsp/vpx_dsp_common.h" + +static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + int i, j, k, l, m, n; + + for (i = 0; i < REFERENCE_MODES; i++) + td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i]; + + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + for (n = 0; n < ENTROPY_TOKENS; n++) + td->rd_counts.coef_counts[i][j][k][l][m][n] += + td_t->rd_counts.coef_counts[i][j][k][l][m][n]; +} + +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int t; + + (void)unused; + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + return 0; +} + +static int get_max_tile_cols(VP9_COMP *cpi) { + const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2); + int mi_cols = aligned_width >> MI_SIZE_LOG2; + int min_log2_tile_cols, max_log2_tile_cols; + int log2_tile_cols; + + vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } + return (1 << log2_tile_cols); +} + +static void create_enc_workers(VP9_COMP *cpi, int num_workers) { + VP9_COMMON *const cm = &cpi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; + // While using SVC, we need to allocate threads according to the highest + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. + if (cpi->use_svc && !cpi->row_mt) { + int max_tile_cols = get_max_tile_cols(cpi); + num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); + } + assert(num_workers > 0); + if (num_workers == cpi->num_workers) return; + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_encode_free_mt_data(cpi); + + CHECK_MEM_ERROR(&cm->error, cpi->workers, + vpx_malloc(num_workers * sizeof(*cpi->workers))); + + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; + + ++cpi->num_workers; + winterface->init(worker); + + if (i < num_workers - 1) { + thread_data->cpi = cpi; + + // Allocate thread data. + CHECK_MEM_ERROR(&cm->error, thread_data->td, + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + vp9_setup_pc_tree(cm, thread_data->td); + + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, + vpx_calloc(1, sizeof(*thread_data->td->counts))); + + // Create threads + if (!winterface->reset(worker)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->cpi = cpi; + thread_data->td = &cpi->td; + } + winterface->sync(worker); + } +} + +static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, + int num_workers) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + worker->hook = hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = data2; + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} + +void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { + int t; + for (t = 0; t < cpi->num_workers; ++t) { + VPxWorker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; + + // Deallocate allocated threads. + vpx_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { + vpx_free(thread_data->td->counts); + vp9_free_pc_tree(thread_data->td); + vpx_free(thread_data->td); + } + } + vpx_free(cpi->tile_thr_data); + cpi->tile_thr_data = NULL; + vpx_free(cpi->workers); + cpi->workers = NULL; + cpi->num_workers = 0; +} + +void vp9_encode_tiles_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); + int i; + + vp9_init_tile_data(cpi); + + create_enc_workers(cpi, num_workers); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + + // Handle use_nonrd_pick_mode case. + if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static void accumulate_fp_tile_stat(TileDataEnc *tile_data, + TileDataEnc *tile_data_t) { + tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor; + tile_data->fp_data.brightness_factor += + tile_data_t->fp_data.brightness_factor; + tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error; + tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error; + tile_data->fp_data.frame_noise_energy += + tile_data_t->fp_data.frame_noise_energy; + tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error; + tile_data->fp_data.intercount += tile_data_t->fp_data.intercount; + tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count; + tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count; + tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low; + tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; + tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; + tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; + tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; + tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; + tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; + tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs; + tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs; + tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs; + tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors; + tile_data->fp_data.intra_smooth_count += + tile_data_t->fp_data.intra_smooth_count; + tile_data->fp_data.image_data_start_row = + VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) == INVALID_ROW + ? VPXMAX(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) + : VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row); +} +#endif // !CONFIG_REALTIME_ONLY + +// Allocate memory for row synchronization +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, + vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); + if (row_mt_sync->mutex) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, + vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); + if (row_mt_sync->cond) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, + vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex[i]); + } + vpx_free(row_mt_sync->mutex); + } + if (row_mt_sync->cond != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond[i]); + } + vpx_free(row_mt_sync->cond); + } +#endif // CONFIG_MULTITHREAD + vpx_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + vp9_zero(*row_mt_sync); + } +} + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { + pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync != nsync - 1) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond[r]); + pthread_mutex_unlock(&row_mt_sync->mutex[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +#if !CONFIG_REALTIME_ONLY +static int first_pass_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + FIRSTPASS_DATA fp_acc_data; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_row = proc_job->vert_unit_row_num; + + best_ref_mv = zero_mv; + vp9_zero(fp_acc_data); + fp_acc_data.image_data_start_row = INVALID_ROW; + vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data, + this_tile, &best_ref_mv, mb_row); + } + } + return 0; +} + +void vp9_encode_fp_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + TileDataEnc *first_tile_col; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, FIRST_PASS_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, + num_workers); + + first_tile_col = &cpi->tile_data[0]; + for (i = 1; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + accumulate_fp_tile_stat(first_tile_col, this_tile); + } +} + +static int temporal_filter_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int mb_col_start, mb_col_end; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT; + mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT; + mb_row = proc_job->vert_unit_row_num; + + vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, + mb_col_start, mb_col_end); + } + } + return 0; +} + +void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = cpi->num_workers ? cpi->num_workers : 1; + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ARNR_JOB); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, + num_workers); +} +#endif // !CONFIG_REALTIME_ONLY + +static int enc_row_mt_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 0; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + + // Handle use_nonrd_pick_mode case. + if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, + num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h new file mode 100644 index 0000000000..4c192da515 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#define VPX_VP9_ENCODER_VP9_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_NUM_TILE_COLS (1 << 6) +#define MAX_NUM_TILE_ROWS 4 +#define MAX_NUM_THREADS 80 + +struct VP9_COMP; +struct ThreadData; + +typedef struct EncWorkerData { + struct VP9_COMP *cpi; + struct ThreadData *td; + int start; + int thread_id; + int tile_completion_status[MAX_NUM_TILE_COLS]; +} EncWorkerData; + +// Encoder row synchronization +typedef struct VP9RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; + pthread_cond_t *cond; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} VP9RowMTSync; + +// Frees EncWorkerData related allocations made by vp9_encode_*_mt(). +// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not +// called by this function. +void vp9_encode_free_mt_data(struct VP9_COMP *cpi); + +void vp9_encode_tiles_mt(struct VP9_COMP *cpi); + +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + +void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +// Allocate memory for row based multi-threading synchronization. +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm, + int rows); + +// Deallocate row based multi-threading synchronization related mutex and data. +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync); + +void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c new file mode 100644 index 0000000000..4664e8c5e2 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" + +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + vp9_zero(*ext_ratectrl); + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl) { + vpx_rc_status_t rc_status; + vpx_rc_firstpass_stats_t *rc_firstpass_stats; + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + vp9_extrc_delete(ext_ratectrl); + ext_ratectrl->funcs = funcs; + ext_ratectrl->ratectrl_config = ratectrl_config; + rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, + &ext_ratectrl->ratectrl_config, + &ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; + rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; + rc_firstpass_stats->frame_stats = + vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) * + rc_firstpass_stats->num_frames); + if (rc_firstpass_stats->frame_stats == NULL) { + return VPX_CODEC_MEM_ERROR; + } + ext_ratectrl->ready = 1; + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status = + ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats); + } + return vp9_extrc_init(ext_ratectrl); +} + +static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, + vpx_rc_frame_stats_t *rc_frame_stats) { + rc_frame_stats->frame = stats->frame; + rc_frame_stats->weight = stats->weight; + rc_frame_stats->intra_error = stats->intra_error; + rc_frame_stats->coded_error = stats->coded_error; + rc_frame_stats->sr_coded_error = stats->sr_coded_error; + rc_frame_stats->frame_noise_energy = stats->frame_noise_energy; + rc_frame_stats->pcnt_inter = stats->pcnt_inter; + rc_frame_stats->pcnt_motion = stats->pcnt_motion; + rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref; + rc_frame_stats->pcnt_neutral = stats->pcnt_neutral; + rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low; + rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high; + rc_frame_stats->intra_skip_pct = stats->intra_skip_pct; + rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct; + rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows; + rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols; + rc_frame_stats->MVr = stats->MVr; + rc_frame_stats->mvr_abs = stats->mvr_abs; + rc_frame_stats->MVc = stats->MVc; + rc_frame_stats->mvc_abs = stats->mvc_abs; + rc_frame_stats->MVrv = stats->MVrv; + rc_frame_stats->MVcv = stats->MVcv; + rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; + rc_frame_stats->duration = stats->duration; + rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; +} + +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; + vpx_rc_firstpass_stats_t *rc_firstpass_stats = + &ext_ratectrl->rc_firstpass_stats; + int i; + assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames); + for (i = 0; i < rc_firstpass_stats->num_frames; ++i) { + gen_rc_firstpass_stats(&first_pass_info->stats[i], + &rc_firstpass_stats->frame_stats[i]); + } + rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, + rc_firstpass_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { + // TODO(angiebird): Add unit test to make sure this function behaves like + // get_frame_type_from_update_type() + // TODO(angiebird): Merge this function with get_frame_type_from_update_type() + switch (update_type) { + case KF_UPDATE: return 0; // kFrameTypeKey; + case ARF_UPDATE: return 2; // kFrameTypeAltRef; + case GF_UPDATE: return 4; // kFrameTypeGolden; + case OVERLAY_UPDATE: return 3; // kFrameTypeOverlay; + case LF_UPDATE: return 1; // kFrameTypeInter; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + } +} + +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + vpx_rc_encodeframe_decision_t *encode_frame_decision) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + + rc_status = ext_ratectrl->funcs.get_encodeframe_decision( + ext_ratectrl->model, &encode_frame_info, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth, const int actual_encoding_qindex) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + PSNR_STATS psnr; + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_result_t encode_frame_result; + encode_frame_result.bit_count = bit_count; + encode_frame_result.pixel_count = + source_frame->y_crop_width * source_frame->y_crop_height + + 2 * source_frame->uv_crop_width * source_frame->uv_crop_height; + encode_frame_result.actual_encoding_qindex = actual_encoding_qindex; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, + input_bit_depth); +#else + (void)bit_depth; + (void)input_bit_depth; + vpx_calc_psnr(source_frame, coded_frame, &psnr); +#endif + encode_frame_result.sse = psnr.sse[0]; + rc_status = ext_ratectrl->funcs.update_encodeframe_result( + ext_ratectrl->model, &encode_frame_result); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision) { + vpx_rc_status_t rc_status; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks here. + if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model, + &encode_frame_info, rdmult); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h new file mode 100644 index 0000000000..b04580c1d4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ + +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" +#include "vp9/encoder/vp9_firstpass.h" + +typedef struct EXT_RATECTRL { + int ready; + int ext_rdmult; + vpx_rc_model_t model; + vpx_rc_funcs_t funcs; + vpx_rc_config_t ratectrl_config; + vpx_rc_firstpass_stats_t rc_firstpass_stats; +} EXT_RATECTRL; + +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); + +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + vpx_rc_encodeframe_decision_t *encode_frame_decision); + +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, + const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, + uint32_t input_bit_depth, const int actual_encoding_qindex); + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision); + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult); + +#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c new file mode 100644 index 0000000000..dcb62e8768 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_extend.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right, + int interleave_step) { + int i, j, linesize; + const int step = interleave_step < 1 ? 1 : interleave_step; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + (w - 1) * step; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + memset(dst_ptr1, src_ptr1[0], extend_left); + if (step == 1) { + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + } else { + for (j = 0; j < w; j++) { + dst_ptr1[extend_left + j] = src_ptr1[step * j]; + } + } + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, int w, + int h, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + vpx_memset16(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); + vpx_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + // Altref filtering assumes 16 pixel extension + const int et_y = 16; + const int el_y = 16; + // Motion estimation may use src block variance with the block size up + // to 64x64, so the right and bottom need to be extended to 64 multiple + // or up to 16, whichever is greater. + const int er_y = + VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = + VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; + // detect nv12 colorspace + const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; + +#if CONFIG_VP9_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y, 1); + + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, src->uv_crop_height, + et_uv, el_uv, eb_uv, er_uv, chroma_step); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, src->uv_crop_height, + et_uv, el_uv, eb_uv, er_uv, chroma_step); +} + +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw) { + // If the side is not touching the bounder then don't extend. + const int et_y = srcy ? 0 : dst->border; + const int el_y = srcx ? 0 : dst->border; + const int eb_y = srcy + srch != src->y_height + ? 0 + : dst->border + dst->y_height - src->y_height; + const int er_y = srcx + srcw != src->y_width + ? 0 + : dst->border + dst->y_width - src->y_width; + const int src_y_offset = srcy * src->y_stride + srcx; + const int dst_y_offset = srcy * dst->y_stride + srcx; + + const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); + const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); + const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); + const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); + const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); + const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); + // detect nv12 colorspace + const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; + + copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, + dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch, + et_y, el_y, eb_y, er_y, 1); + + copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, + dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, + srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, + dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, + srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h new file mode 100644 index 0000000000..4ba7fc95e3 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_ +#define VPX_VP9_ENCODER_VP9_EXTEND_H_ + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c new file mode 100644 index 0000000000..a9cdf5353f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c @@ -0,0 +1,3906 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes() +#include "vp9/encoder/vp9_aq_variance.h" +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" +#include "vpx_dsp/variance.h" + +#define OUTPUT_FPF 0 +#define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 + +#define FIRST_PASS_Q 10.0 +#define NORMAL_BOOST 100 +#define MIN_ARF_GF_BOOST 250 +#define MIN_DECAY_FACTOR 0.01 +#define NEW_MV_MODE_PENALTY 32 +#define DARK_THRESH 64 +#define LOW_I_THRESH 24000 + +#define NCOUNT_INTRA_THRESH 8192 +#define NCOUNT_INTRA_FACTOR 3 + +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define LOW_CODED_ERR_PER_MB 10.0 +#define NCOUNT_FRAME_II_THRESH 6.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_FRAME_BOOST 96.0 + +#ifdef AGGRESSIVE_VBR +#define KF_MIN_FRAME_BOOST 40.0 +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MIN_FRAME_BOOST 40.0 +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif + +#define DEFAULT_ZM_FACTOR 0.5 +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +#define AV_WQ_FACTOR 4.0 + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) + +#if ARF_STATS_OUTPUT +unsigned int arf_count = 0; +#endif + +// Resets the first pass file to the given position using a relative seek from +// the current position. +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { + p->stats_in = position; +} + +// Read frame stats at an offset from the current position. +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_in_start)) { + return NULL; + } + + return &p->stats_in[offset]; +} + +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_in_end) return EOF; + + *fps = *p->stats_in; + ++p->stats_in; + return 1; +} + +static void output_stats(FIRSTPASS_STATS *stats) { + (void)stats; +// TEMP debug code +#if OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf" + "%12.4lf" + "\n", + stats->frame, stats->weight, stats->intra_error, stats->coded_error, + stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter, + stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->pcnt_intra_low, stats->pcnt_intra_high, + stats->intra_skip_pct, stats->intra_smooth_pct, + stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, + stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +static void zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->weight = 0.0; + section->intra_error = 0.0; + section->coded_error = 0.0; + section->sr_coded_error = 0.0; + section->frame_noise_energy = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->intra_skip_pct = 0.0; + section->intra_smooth_pct = 0.0; + section->pcnt_intra_low = 0.0; + section->pcnt_intra_high = 0.0; + section->inactive_zone_rows = 0.0; + section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->count = 0.0; + section->duration = 1.0; + section->spatial_layer_id = 0; +} + +static void accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->weight += frame->weight; + section->spatial_layer_id = frame->spatial_layer_id; + section->intra_error += frame->intra_error; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->frame_noise_energy += frame->frame_noise_energy; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->intra_smooth_pct += frame->intra_smooth_pct; + section->pcnt_intra_low += frame->pcnt_intra_low; + section->pcnt_intra_high += frame->pcnt_intra_high; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->count += frame->count; + section->duration += frame->duration; +} + +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->frame_noise_energy -= frame->frame_noise_energy; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->intra_smooth_pct -= frame->intra_smooth_pct; + section->pcnt_intra_low -= frame->pcnt_intra_low; + section->pcnt_intra_high -= frame->pcnt_intra_high; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= frame->new_mv_count; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +// Calculate an active area of the image that discounts formatting +// bars and partially discounts other 0 energy areas. +#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +static double calculate_active_area(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame) { + double active_pct; + + active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +static double calculate_mod_frame_score(const VP9_COMP *cpi, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_score *= pow(calculate_active_area(&cpi->frame_info, this_frame), + ACT_AREA_CORRECTION); + + return modified_score; +} + +static double calc_norm_frame_score(const VP9EncoderConfig *oxcf, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double mean_mod_score, double av_err) { + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0; + const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0; + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_score *= + pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION); + + // Normalize to a midpoint score. + modified_score /= DOUBLE_DIVIDE_CHECK(mean_mod_score); + return fclamp(modified_score, min_score, max_score); +} + +static double calculate_norm_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + return calc_norm_frame_score(oxcf, &cpi->frame_info, this_frame, + twopass->mean_mod_score, av_err); +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const VP9EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +void vp9_init_first_pass(VP9_COMP *cpi) { + zero_stats(&cpi->twopass.total_stats); +} + +void vp9_end_first_pass(VP9_COMP *cpi) { + output_stats(&cpi->twopass.total_stats); + cpi->twopass.first_pass_done = 1; + vpx_free(cpi->twopass.fp_mb_float_stats); + cpi->twopass.fp_mb_float_stats = NULL; +} + +static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return vpx_mse8x8; + case BLOCK_16X8: return vpx_mse16x8; + case BLOCK_8X16: return vpx_mse8x16; + default: return vpx_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const vpx_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return vpx_highbd_8_mse8x8; + case BLOCK_16X8: return vpx_highbd_8_mse16x8; + case BLOCK_8X16: return vpx_highbd_8_mse8x16; + default: return vpx_highbd_8_mse16x16; + } + case 10: + switch (bsize) { + case BLOCK_8X8: return vpx_highbd_10_mse8x8; + case BLOCK_16X8: return vpx_highbd_10_mse16x8; + case BLOCK_8X16: return vpx_highbd_10_mse8x16; + default: return vpx_highbd_10_mse16x16; + } + case 12: + switch (bsize) { + case BLOCK_8X8: return vpx_highbd_12_mse8x8; + case BLOCK_16X8: return vpx_highbd_12_mse16x8; + case BLOCK_8X16: return vpx_highbd_12_mse8x16; + default: return vpx_highbd_12_mse16x16; + } + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. +static int get_search_range(const VP9_COMP *cpi) { + int sr = 0; + const int dim = VPXMIN(cpi->initial_width, cpi->initial_height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +// Reduce limits to keep the motion search within MV_MAX of ref_mv. Not doing +// this can be problematic for big videos (8K) and may cause assert failure +// (or memory violation) in mv_cost. Limits are only modified if they would +// be non-empty. Returns 1 if limits are non-empty. +static int intersect_limits_with_mv_max(MvLimits *mv_limits, const MV *ref_mv) { + const int row_min = + VPXMAX(mv_limits->row_min, (ref_mv->row + 7 - MV_MAX) >> 3); + const int row_max = + VPXMIN(mv_limits->row_max, (ref_mv->row - 1 + MV_MAX) >> 3); + const int col_min = + VPXMAX(mv_limits->col_min, (ref_mv->col + 7 - MV_MAX) >> 3); + const int col_max = + VPXMIN(mv_limits->col_max, (ref_mv->col - 1 + MV_MAX) >> 3); + if (row_min > row_max || col_min > col_max) { + return 0; + } + mv_limits->row_min = row_min; + mv_limits->row_max = row_max; + mv_limits->col_min = col_min; + mv_limits->col_max = col_max; + return 1; +} + +static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, MV *best_mv, + int *best_motion_err) { + MACROBLOCKD *const xd = &x->e_mbd; + MV tmp_mv = { 0, 0 }; + MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int num00, tmp_err, n; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; + + int step_param = 3; + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + const int sr = get_search_range(cpi); + const MvLimits tmp_mv_limits = x->mv_limits; + step_param += sr; + further_steps -= sr; + + if (!intersect_limits_with_mv_max(&x->mv_limits, ref_mv)) { + return; + } + + // Override the default variance function to use MSE. + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; + + // Center the initial step/diamond search on best mv. + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, + &sad_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); + if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = tmp_mv; + } + + // Carry out further step/diamond searches as necessary. + n = num00; + num00 = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + --num00; + } else { + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); + if (tmp_err < INT_MAX - new_mv_mode_penalty) + tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = tmp_mv; + } + } + } + x->mv_limits = tmp_mv_limits; +} + +static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { + if (2 * mb_col + 1 < cm->mi_cols) { + return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_16X16 : BLOCK_16X8; + } else { + return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_8X16 : BLOCK_8X8; + } +} + +static int find_fp_qindex(vpx_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + +static void set_first_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) { + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; +} + +// Scale an sse threshold to account for 8/10/12 bit. +static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { + int ret_val = thresh; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: ret_val = thresh; break; + case VPX_BITS_10: ret_val = thresh << 4; break; + default: + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; + } + } +#else + (void)cm; +#endif // CONFIG_VP9_HIGHBITDEPTH + return ret_val; +} + +// This threshold is used to track blocks where to all intents and purposes +// the intra prediction error 0. Though the metric we test against +// is technically a sse we are mainly interested in blocks where all the pixels +// in the 8 bit domain have an error of <= 1 (where error = sse) so a +// linear scaling for 10 and 12 bit gives similar results. +#define UL_INTRA_THRESH 50 +static int get_ul_intra_threshold(VP9_COMMON *cm) { + int ret_val = UL_INTRA_THRESH; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; + case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; + default: + assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; + } + } +#else + (void)cm; +#endif // CONFIG_VP9_HIGHBITDEPTH + return ret_val; +} + +#define SMOOTH_INTRA_THRESH 4000 +static int get_smooth_intra_threshold(VP9_COMMON *cm) { + int ret_val = SMOOTH_INTRA_THRESH; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; + case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; + default: + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; + } + } +#else + (void)cm; +#endif // CONFIG_VP9_HIGHBITDEPTH + return ret_val; +} + +#define FP_DN_THRESH 8 +#define FP_MAX_DN_THRESH 24 +#define KERNEL_SIZE 3 + +// Baseline Kernel weights for first pass noise metric +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, + 2, 1, 2, 1 }; + +// Estimate noise at a single point based on the impact of a spatial kernel +// on the point value +static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { + int sum_weight = 0; + int sum_val = 0; + int i, j; + int max_diff = 0; + int diff; + int dn_diff; + uint8_t *tmp_ptr; + uint8_t *kernel_ptr; + uint8_t dn_val; + uint8_t centre_val = *src_ptr; + + kernel_ptr = fp_dn_kernel_3; + + // Apply the kernel + tmp_ptr = src_ptr - stride - 1; + for (i = 0; i < KERNEL_SIZE; ++i) { + for (j = 0; j < KERNEL_SIZE; ++j) { + diff = abs((int)centre_val - (int)tmp_ptr[j]); + max_diff = VPXMAX(max_diff, diff); + if (diff <= FP_DN_THRESH) { + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; + } + ++kernel_ptr; + } + tmp_ptr += stride; + } + + if (max_diff < FP_MAX_DN_THRESH) + // Update the source value with the new filtered value + dn_val = (sum_val + (sum_weight >> 1)) / sum_weight; + else + dn_val = *src_ptr; + + // return the noise energy as the square of the difference between the + // denoised and raw value. + dn_diff = (int)*src_ptr - (int)dn_val; + return dn_diff * dn_diff; +} +#if CONFIG_VP9_HIGHBITDEPTH +static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { + int sum_weight = 0; + int sum_val = 0; + int i, j; + int max_diff = 0; + int diff; + int dn_diff; + uint8_t *tmp_ptr; + uint16_t *tmp_ptr16; + uint8_t *kernel_ptr; + uint16_t dn_val; + uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); + + kernel_ptr = fp_dn_kernel_3; + + // Apply the kernel + tmp_ptr = src_ptr - stride - 1; + for (i = 0; i < KERNEL_SIZE; ++i) { + tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); + for (j = 0; j < KERNEL_SIZE; ++j) { + diff = abs((int)centre_val - (int)tmp_ptr16[j]); + max_diff = VPXMAX(max_diff, diff); + if (diff <= FP_DN_THRESH) { + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; + } + ++kernel_ptr; + } + tmp_ptr += stride; + } + + if (max_diff < FP_MAX_DN_THRESH) + // Update the source value with the new filtered value + dn_val = (sum_val + (sum_weight >> 1)) / sum_weight; + else + dn_val = *CONVERT_TO_SHORTPTR(src_ptr); + + // return the noise energy as the square of the difference between the + // denoised and raw value. + dn_diff = (int)(*CONVERT_TO_SHORTPTR(src_ptr)) - (int)dn_val; + return dn_diff * dn_diff; +} +#endif + +// Estimate noise for a block. +static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) { +#if CONFIG_VP9_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; +#endif + uint8_t *src_ptr = &x->plane[0].src.buf[0]; + const int width = num_4x4_blocks_wide_lookup[bsize] * 4; + const int height = num_4x4_blocks_high_lookup[bsize] * 4; + int w, h; + int stride = x->plane[0].src.stride; + int block_noise = 0; + + // Sampled points to reduce cost overhead. + for (h = 0; h < height; h += 2) { + for (w = 0; w < width; w += 2) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + block_noise += fp_highbd_estimate_point_noise(src_ptr, stride); + else + block_noise += fp_estimate_point_noise(src_ptr, stride); +#else + block_noise += fp_estimate_point_noise(src_ptr, stride); +#endif + ++src_ptr; + } + src_ptr += (stride - width); + } + return block_noise << 2; // Scale << 2 to account for sampling. +} + +// This function is called to test the functionality of row based +// multi-threading in unit tests for bit-exactness +static void accumulate_floating_point_stats(VP9_COMP *cpi, + TileDataEnc *first_tile_col) { + VP9_COMMON *const cm = &cpi->common; + int mb_row, mb_col; + first_tile_col->fp_data.intra_factor = 0; + first_tile_col->fp_data.brightness_factor = 0; + first_tile_col->fp_data.neutral_count = 0; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + first_tile_col->fp_data.intra_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor; + first_tile_col->fp_data.brightness_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor; + first_tile_col->fp_data.neutral_count += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count; + } + } +} + +static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, + FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + // The minimum error here insures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) || + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (fp_acc_data->image_data_start_row > 0) { + fp_acc_data->intra_skip_count = + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + } + + fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; + fp_acc_data->brightness_factor = + fp_acc_data->brightness_factor / (double)num_mbs; + fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor; + + fps->frame = cm->current_video_frame; + fps->spatial_layer_id = cpi->svc.spatial_layer_id; + + fps->coded_error = + ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs; + fps->sr_coded_error = + ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs; + fps->intra_error = + ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs; + + fps->frame_noise_energy = + (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; + fps->count = 1.0; + fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs; + fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs; + fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs; + fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs; + fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs; + fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs; + fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs; + fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row); + // Currently set to 0 as most issues relate to letter boxing. + fps->inactive_zone_cols = (double)0; + + if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; + fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; + fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; + fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; + fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount; + fps->MVrv = ((double)(fp_acc_data->sum_mvrs) - + ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->MVcv = ((double)(fp_acc_data->sum_mvcs) - + ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->mv_in_out_count = + (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); + fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; + } else { + fps->new_mv_count = 0.0; + fps->MVr = 0.0; + fps->mvr_abs = 0.0; + fps->MVc = 0.0; + fps->mvc_abs = 0.0; + fps->MVrv = 0.0; + fps->MVcv = 0.0; + fps->mv_in_out_count = 0.0; + fps->pcnt_motion = 0.0; + } +} + +static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, + FIRSTPASS_DATA *fp_acc_data) { + this_tile->fp_data.intra_factor += fp_acc_data->intra_factor; + this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor; + this_tile->fp_data.coded_error += fp_acc_data->coded_error; + this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error; + this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy; + this_tile->fp_data.intra_error += fp_acc_data->intra_error; + this_tile->fp_data.intercount += fp_acc_data->intercount; + this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count; + this_tile->fp_data.neutral_count += fp_acc_data->neutral_count; + this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; + this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; + this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; + this_tile->fp_data.mvcount += fp_acc_data->mvcount; + this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; + this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; + this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc; + this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs; + this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs; + this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs; + this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors; + this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count; + this_tile->fp_data.image_data_start_row = + VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) == INVALID_ROW + ? VPXMAX(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) + : VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row); +} + +#if CONFIG_RATE_CTRL +static void store_fp_motion_vector(VP9_COMP *cpi, const MV *mv, + const int mb_row, const int mb_col, + MV_REFERENCE_FRAME frame_type, + const int mv_idx) { + VP9_COMMON *const cm = &cpi->common; + const int mb_index = mb_row * cm->mb_cols + mb_col; + MOTION_VECTOR_INFO *this_motion_vector_info = + &cpi->fp_motion_vector_info[mb_index]; + this_motion_vector_info->ref_frame[mv_idx] = frame_type; + if (frame_type != INTRA_FRAME) { + this_motion_vector_info->mv[mv_idx].as_mv = *mv; + } +} +#endif // CONFIG_RATE_CTRL + +#define NZ_MOTION_PENALTY 128 +#define INTRA_MODE_PENALTY 1024 +void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + TileDataEnc *tile_data, MV *best_ref_mv, + int mb_row) { + int mb_col; + MACROBLOCK *const x = &td->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo tile = tile_data->tile_info; + const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1); + const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1); + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; + int i, c; + int num_mb_cols = get_num_cols(tile_data->tile_info, 1); + + int recon_yoffset, recon_uvoffset; + const int intrapenalty = INTRA_MODE_PENALTY; + const MV zero_mv = { 0, 0 }; + int recon_y_stride, recon_uv_stride, uv_mb_height; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + MODE_INFO mi_above, mi_left; + + double mb_intra_factor; + double mb_brightness_factor; + double mb_neutral_count; + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); + + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start; + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; + } + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); + + // Reset above block coeffs. + recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16; + recon_uvoffset = + (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + + for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) { + int this_error; + int this_intra_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; + const int mb_index = mb_row * cm->mb_cols + mb_col; + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + + // Adjust to the next column of MBs. + x->plane[0].src.buf = cpi->Source->y_buffer + + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; + x->plane[1].src.buf = cpi->Source->u_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + x->plane[2].src.buf = cpi->Source->v_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + + vpx_clear_system_state(); + + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, + cm->mi_cols); + // Are edges available for intra prediction? + // Since the firstpass does not populate the mi_grid_visible, + // above_mi/left_mi must be overwritten with a nonzero value when edges + // are available. Required by vp9_predict_intra_block(). + xd->above_mi = (mb_row != 0) ? &mi_above : NULL; + xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL; + + // Do intra 16x16 prediction. + x->skip_encode = 0; + x->fp_src_pred = 0; + // Do intra prediction based on source pixels for tile boundaries + if (mb_col == mb_col_start && mb_col != 0) { + xd->left_mi = &mi_left; + x->fp_src_pred = 1; + } + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + // Fix - zero the 16x16 block first. This ensures correct this_error for + // block sizes smaller than 16x16. + vp9_zero_array(x->plane[0].src_diff, 256); + vp9_encode_intra_block_plane(x, bsize, 0, 0); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + this_intra_error = this_error; + + // Keep a record of blocks that have very low intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < get_ul_intra_threshold(cm)) { + ++(fp_acc_data->intra_skip_count); + } else if ((mb_col > 0) && + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = mb_row; + } + + // Blocks that are mainly smooth in the intra domain. + // Some special accounting for CQ but also these are better for testing + // noise levels. + if (this_error < get_smooth_intra_threshold(cm)) { + ++(fp_acc_data->intra_smooth_count); + } + + // Special case noise measurement for first frame. + if (cm->current_video_frame == 0) { + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: break; + case VPX_BITS_10: this_error >>= 4; break; + default: + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + vpx_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) { + mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05); + fp_acc_data->intra_factor += mb_intra_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = + mb_intra_factor; + } else { + fp_acc_data->intra_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else + level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; +#endif + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample)); + fp_acc_data->brightness_factor += mb_brightness_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + mb_brightness_factor; + } else { + fp_acc_data->brightness_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; + + // Accumulate the intra error. + fp_acc_data->intra_error += (int64_t)this_error; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + // Other than for intra-only frame do a motion search. + if (!frame_is_intra_only(cm)) { + int tmp_err, motion_error, this_motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + // Store zero mv as default + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } +#endif // CONFIG_RAGE_CTRL + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + this_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8); + } else { + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is very small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (raw_motion_error > NZ_MOTION_PENALTY) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + this_motion_error = + vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + this_motion_error = + vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); + } + } +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0); + } +#endif // CONFIG_RAGE_CTRL + + // Search in an older reference frame. + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME, + 1); + } +#endif // CONFIG_RAGE_CTRL + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++(fp_acc_data->second_ref_count); + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + fp_acc_data->sr_coded_error += gf_motion_error; + else + fp_acc_data->sr_coded_error += this_error; + } else { + fp_acc_data->sr_coded_error += motion_error; + } + } else { + fp_acc_data->sr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + + if (motion_error <= this_error) { + vpx_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + fp_acc_data->neutral_count += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. + } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + mb_neutral_count = + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); + fp_acc_data->neutral_count += mb_neutral_count; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + mb_neutral_count; + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + fp_acc_data->sum_mvr += mv.row; + fp_acc_data->sum_mvr_abs += abs(mv.row); + fp_acc_data->sum_mvc += mv.col; + fp_acc_data->sum_mvc_abs += abs(mv.col); + fp_acc_data->sum_mvrs += mv.row * mv.row; + fp_acc_data->sum_mvcs += mv.col * mv.col; + ++(fp_acc_data->intercount); + + *best_ref_mv = mv; + + if (!is_zero_mv(&mv)) { + ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; + + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + --(fp_acc_data->sum_in_vectors); + } + + // Does the col vector point inwards or outwards? + if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + --(fp_acc_data->sum_in_vectors); + } + } + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { // Intra < inter error + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + if (this_motion_error < scaled_low_intra_thresh) { + fp_acc_data->intra_count_low += 1.0; + } else { + fp_acc_data->intra_count_high += 1.0; + } + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + fp_acc_data->intra_count_high += 1.0; + } + } + } else { + fp_acc_data->sr_coded_error += (int64_t)this_error; +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0); + } +#endif // CONFIG_RAGE_CTRL + } + fp_acc_data->coded_error += (int64_t)this_error; + + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; + + // Accumulate row level stats to the corresponding tile stats + if (cpi->row_mt && mb_col == mb_col_end - 1) + accumulate_fp_mb_row_stat(tile_data, fp_acc_data); + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, + num_mb_cols); + } + vpx_clear_system_state(); +} + +static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + int mb_row; + TileDataEnc tile_data; + TileInfo *tile = &tile_data.tile_info; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + // Tiling is ignored in the first pass. + vp9_tile_init(tile, cm, 0, 0); + tile_data.firstpass_top_mv = zero_mv; +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + fp_motion_vector_info_reset(cpi->frame_info.frame_width, + cpi->frame_info.frame_height, + cpi->fp_motion_vector_info); + } +#endif + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + best_ref_mv = zero_mv; + vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, + &best_ref_mv, mb_row); + } +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TWO_PASS *twopass = &cpi->twopass; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + BufferPool *const pool = cm->buffer_pool; + + FIRSTPASS_DATA fp_temp_data; + FIRSTPASS_DATA *fp_acc_data = &fp_temp_data; + + vpx_clear_system_state(); + vp9_zero(fp_temp_data); + fp_acc_data->image_data_start_row = INVALID_ROW; + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); + + set_first_pass_params(cpi); + vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth)); + + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + + vp9_setup_src_planes(x, cpi->Source, 0, 0); + vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0); + + if (!frame_is_intra_only(cm)) { + vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + + vp9_frame_init_quantizer(cpi); + + x->skip_recode = 0; + + vp9_init_mv_probs(cm); + vp9_initialize_rd_consts(cpi); + + cm->log2_tile_rows = 0; + + if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) + CHECK_MEM_ERROR( + &cm->error, cpi->twopass.fp_mb_float_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); + + { + FIRSTPASS_STATS fps; + TileDataEnc *first_tile_col; + if (!cpi->row_mt) { + cm->log2_tile_cols = 0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + first_pass_encode(cpi, fp_acc_data); + first_pass_stat_calc(cpi, &fps, fp_acc_data); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + if (cpi->row_mt_bit_exact) { + cm->log2_tile_cols = 0; + vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); + } + vp9_encode_fp_row_mt(cpi); + first_tile_col = &cpi->tile_data[0]; + if (cpi->row_mt_bit_exact) + accumulate_floating_point_stats(cpi, first_tile_col); + first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); + } + + // Don't allow a value of 0 for duration. + // (Section duration is also defaulted to minimum of 1.0). + fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); + + // Don't want to do output stats with a stack variable! + twopass->this_frame_stats = fps; + output_stats(&twopass->this_frame_stats); + accumulate_stats(&twopass->total_stats, &fps); + } + + // Copy the previous Last Frame back into gf and arf buffers if + // the prediction is good enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((cm->current_video_frame > 0) && + (twopass->this_frame_stats.pcnt_inter > 0.20) && + ((twopass->this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { + if (gld_yv12 != NULL) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idx]); + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + vpx_extend_frame_borders(new_yv12); + + // The frame we just compressed now becomes the last frame. + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idx]); + } + + // In the first pass, every frame is considered as a show frame. + update_frame_indexes(cm, /*show_frame=*/1); + if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); +} + +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.85, 0.90, 0.90, + 0.90, 1.00, 1.25 }; + +static double calc_correction_factor(double err_per_mb, double err_divisor, + int q) { + const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor); + const int index = q >> 5; + double power_term; + + assert((index >= 0) && (index < (QINDEX_RANGE >> 5))); + + // Adjustment based on quantizer to the power term. + power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); + + // Calculate correction factor. + if (power_term < 1.0) assert(error_term >= 0.0); + + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +static double wq_err_divisor(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 115.0; + } else if (screen_area < 1280 * 720) { + return 125.0; + } else if (screen_area <= 1920 * 1080) { + return 130.0; + } else if (screen_area < 3840 * 2160) { + return 150.0; + } + + // Fall through to here only for 4K and above. + return 200.0; +} + +#define NOISE_FACTOR_MIN 0.9 +#define NOISE_FACTOR_MAX 1.1 +static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, + double inactive_zone, double section_noise, + int section_target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; + + // Clamp the target rate to VBR min / max limts. + const int target_rate = + vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth); + double noise_factor = pow((section_noise / SECTION_NOISE_DEF), 0.5); + noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + + if (target_rate <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.MBs; + const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone); + const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); + const double av_err_per_mb = section_err / active_pct; + const double speed_term = 1.0 + 0.04 * oxcf->speed; + const int target_norm_bits_per_mb = + (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); + int q; + +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + for (q = rc->best_quality; q < rc->worst_quality; ++q) { + const double factor = + calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q); + const int bits_per_mb = vp9_rc_bits_per_mb( + INTER_FRAME, q, + factor * speed_term * cpi->twopass.bpm_factor * noise_factor, + cpi->common.bit_depth); + if (bits_per_mb <= target_norm_bits_per_mb) break; + } + + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == VPX_CQ) q = VPXMAX(q, oxcf->cq_level); + return q; + } +} + +static void setup_rf_level_maxq(VP9_COMP *cpi) { + int i; + RATE_CONTROL *const rc = &cpi->rc; + for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) { + int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality); + rc->rf_level_maxq[i] = VPXMAX(rc->worst_quality + qdelta, rc->best_quality); + } +} + +static void init_subsampling(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + const int w = cm->width; + const int h = cm->height; + int i; + + for (i = 0; i < FRAME_SCALE_STEPS; ++i) { + // Note: Frames with odd-sized dimensions may result from this scaling. + rc->frame_width[i] = (w * 16) / frame_scale_factor[i]; + rc->frame_height[i] = (h * 16) / frame_scale_factor[i]; + } + + setup_rf_level_maxq(cpi); +} + +void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, + int *scaled_frame_height) { + RATE_CONTROL *const rc = &cpi->rc; + *scaled_frame_width = rc->frame_width[rc->frame_size_selector]; + *scaled_frame_height = rc->frame_height[rc->frame_size_selector]; +} + +void vp9_init_second_pass(VP9_COMP *cpi) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + double frame_rate; + FIRSTPASS_STATS *stats; + + zero_stats(&twopass->total_stats); + zero_stats(&twopass->total_left_stats); + + if (!twopass->stats_in_end) return; + + stats = &twopass->total_stats; + + *stats = *twopass->stats_in_end; + twopass->total_left_stats = *stats; + + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e., a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we don't bother with + // further iterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + // Reset the vbr bits off target counters + rc->vbr_bits_off_target = 0; + rc->vbr_bits_off_target_fast = 0; + rc->rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; + + if (oxcf->resize_mode != RESIZE_NONE) { + init_subsampling(cpi); + } + + // Initialize the arnr strangth adjustment to 0 + twopass->arnr_strength_adjustment = 0; +} + +/* This function considers how the quality of prediction may be deteriorating + * with distance. It compares the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitrate boost calculation for that + * alt-ref or golden frame. + */ +static double get_sr_decay_rate(const TWO_PASS *const twopass, + const FIRSTPASS_STATS *frame) { + double sr_diff = (frame->sr_coded_error - frame->coded_error); + double sr_decay = 1.0; + + // Do nothing if the second ref to last frame error difference is + // very small or even negative. + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + const double sr_diff_part = + twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error); + double modified_pct_inter = frame->pcnt_inter; + double modified_pcnt_intra; + + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { + modified_pct_inter = + frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); + } + return VPXMAX(sr_decay, twopass->sr_default_decay_limit); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_zero_motion_factor(const TWO_PASS *const twopass, + const FIRSTPASS_STATS *frame_stats) { + const double zero_motion_pct = + frame_stats->pcnt_inter - frame_stats->pcnt_motion; + double sr_decay = get_sr_decay_rate(twopass, frame_stats); + return VPXMIN(sr_decay, zero_motion_pct); +} + +static double get_prediction_decay_rate(const TWO_PASS *const twopass, + const FIRSTPASS_STATS *frame_stats) { + const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); + double zero_motion_factor = + twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + + // Check that the zero motion factor is valid + assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0); + + return VPXMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +static int get_show_idx(const TWO_PASS *twopass) { + return (int)(twopass->stats_in - twopass->stats_in_start); +} +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +static int check_transition_to_still(const FIRST_PASS_INFO *first_pass_info, + int show_idx, int still_interval) { + int j; + int num_frames = fps_get_num_frames(first_pass_info); + if (show_idx + still_interval > num_frames) { + return 0; + } + + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = + fps_get_frame_stats(first_pass_info, show_idx + j); + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + + // Only if it does do we signal a transition to still. + return j == still_interval; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // usage or a second ref coded error notabley lower than the last + // frame coded error. + if (frame_stats == NULL) { + return 0; + } + return (frame_stats->sr_coded_error < frame_stats->coded_error) || + ((frame_stats->pcnt_second_ref > frame_stats->pcnt_inter) && + (frame_stats->pcnt_second_ref >= 0.5)); +} + +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + return detect_flash_from_frame_stats(next_frame); +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + double *mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + *mv_in_out = stats->mv_in_out_count * pct; + *mv_in_out_accumulator += *mv_in_out; + *abs_mv_in_out_accumulator += fabs(*mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + *mv_ratio_accumulator += + pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs); + *mv_ratio_accumulator += + pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); + } +} + +static double calc_frame_boost(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + const TWO_PASS *const twopass, + int avg_frame_qindex, + double this_frame_mv_in_out) { + double frame_boost; + const double lq = + vp9_convert_qindex_to_q(avg_frame_qindex, frame_info->bit_depth); + const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); + const double active_area = calculate_active_area(frame_info, this_frame); + + // Frame booost is based on inter error. + frame_boost = (twopass->err_per_mb * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scalling + frame_boost = frame_boost * boost_q_correction; + + return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction); +} + +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double zm_factor) { + TWO_PASS *const twopass = &cpi->twopass; + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + const double active_area = + calculate_active_area(&cpi->frame_info, this_frame); + double max_boost; + + // Frame booost is based on inter error. + frame_boost = (twopass->kf_err_per_mb * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = + (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction; + + // Maximum allowed boost this frame. May be different for first vs subsequent + // key frames. + max_boost = (cpi->common.current_video_frame == 0) + ? twopass->kf_frame_max_boost_first + : twopass->kf_frame_max_boost_subs; + max_boost *= zm_factor * boost_q_correction; + + return VPXMIN(frame_boost, max_boost); +} + +static int compute_arf_boost(const FRAME_INFO *frame_info, + TWO_PASS *const twopass, int arf_show_idx, + int f_frames, int b_frames, int avg_frame_qindex) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + int i; + double boost_score = 0.0; + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + int arf_boost; + int flash_detected = 0; + + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash_from_frame_stats(this_frame) || + detect_flash_from_frame_stats(next_frame); + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); + } + + arf_boost = (int)boost_score; + + // Reset for backward looking loop. + boost_score = 0.0; + mv_ratio_accumulator = 0.0; + decay_accumulator = 1.0; + this_frame_mv_in_out = 0.0; + mv_in_out_accumulator = 0.0; + abs_mv_in_out_accumulator = 0.0; + + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash_from_frame_stats(this_frame) || + detect_flash_from_frame_stats(next_frame); + + // Cumulative effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + boost_score += decay_accumulator * + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); + } + arf_boost += (int)boost_score; + + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); + arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); + + return arf_boost; +} + +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { + const FRAME_INFO *frame_info = &cpi->frame_info; + TWO_PASS *const twopass = &cpi->twopass; + const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME]; + int arf_show_idx = get_show_idx(twopass); + return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, + b_frames, avg_inter_frame_qindex); +} + +// Calculate a section intra ratio used in setting max loop filter. +static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, + double gf_group_err) { + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + const int is_key_frame = frame_is_intra_only(cm); + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { + int key_frame_interval = rc->frames_since_key + rc->frames_to_key; + int distance_from_next_key_frame = + rc->frames_to_key - + (rc->baseline_gf_interval + rc->source_alt_ref_pending); + int max_gf_bits_bias = rc->avg_frame_bandwidth; + double gf_interval_bias_bits_normalize_factor = + (double)rc->baseline_gf_interval / 16; + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + // TODO(ravi): Experiment with different values of max_gf_bits_bias + total_group_bits += + (int64_t)((double)distance_from_next_key_frame / key_frame_interval * + max_gf_bits_bias * gf_interval_bias_bits_normalize_factor); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * gop_frames) + total_group_bits = (int64_t)max_bits * gop_frames; + + return total_group_bits; +} + +// Calculate the number bits extra to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; + + allocation_chunks = (frame_count * NORMAL_BOOST) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return VPXMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We don't ever want to return a 0 score here. + if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + + return score_total; +} + +static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, + int *index_counter, int depth, int start, int end) { + TWO_PASS *twopass = &cpi->twopass; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FIRSTPASS_STATS fpf_frame; + const int mid = (start + end + 1) >> 1; + const int min_frame_interval = 2; + int idx; + + // Process regular P frames + if ((end - start < min_frame_interval) || + (depth > gf_group->allowed_max_layer_depth)) { + for (idx = start; idx <= end; ++idx) { + gf_group->update_type[*index_counter] = LF_UPDATE; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = idx; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + gf_group->gfu_boost[*index_counter] = NORMAL_BOOST; + ++(*index_counter); + } + gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth); + return; + } + + assert(abs(mid - start) >= 1 && abs(mid - end) >= 1); + + // Process ARF frame + gf_group->layer_depth[*index_counter] = depth; + gf_group->update_type[*index_counter] = ARF_UPDATE; + gf_group->arf_src_offset[*index_counter] = mid - start; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = GF_ARF_LOW; + + for (idx = 0; idx <= mid; ++idx) + if (EOF == input_stats(twopass, &fpf_frame)) break; + + gf_group->gfu_boost[*index_counter] = + VPXMAX(MIN_ARF_GF_BOOST, + calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth); + + reset_fpf_position(twopass, start_pos); + + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1); + + gf_group->update_type[*index_counter] = USE_BUF_FRAME; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); +} + +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + gf_group->gfu_boost[frame_index] = NORMAL_BOOST; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + +static void define_gf_group_structure(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frame_index = 0; + int key_frame = cpi->common.frame_type == KEY_FRAME; + int layer_depth = 1; + int gop_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + + gf_group->frame_start = cpi->common.current_video_frame; + gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval; + gf_group->max_layer_depth = 0; + gf_group->allowed_max_layer_depth = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); + + ++frame_index; + + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = layer_depth; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + gf_group->max_layer_depth = 1; + ++frame_index; + ++layer_depth; + gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf; + } + + find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames); + + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + + // Set the frame ops number. + gf_group->gf_group_size = frame_index; +} + +static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, + int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + FIRSTPASS_STATS frame_stats; + int i; + int frame_index = 0; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, oxcf); + int64_t total_group_bits = gf_group_bits; + int mid_frame_idx; + int normal_frames; + int normal_frame_bits; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; + + // Define the GF structure and specify + int gop_frames = gf_group->gf_group_size; + + key_frame = cpi->common.frame_type == KEY_FRAME; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) { + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; + } + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + + ++frame_index; + + // === [frame_index == 1] === + // Store the bits to spend on the ARF if there is one. + if (rc->source_alt_ref_pending) { + gf_group->bit_allocation[frame_index] = gf_arf_bits; + + ++frame_index; + } + + // Define middle frame + mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; + + normal_frames = (rc->baseline_gf_interval - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + gf_group->gfu_boost[1] = rc->gfu_boost; + + if (cpi->multi_layer_arf) { + int idx; + int arf_depth_bits[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_count[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_boost[MAX_ARF_LAYERS] = { 0 }; + int total_arfs = 1; // Account for the base layer ARF. + + for (idx = 0; idx < gop_frames; ++idx) { + if (gf_group->update_type[idx] == ARF_UPDATE) { + arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx]; + ++arf_depth_count[gf_group->layer_depth[idx]]; + } + } + + for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) { + if (arf_depth_boost[idx] == 0) break; + arf_depth_bits[idx] = calculate_boost_bits( + rc->baseline_gf_interval - total_arfs - arf_depth_count[idx], + arf_depth_boost[idx], total_group_bits); + + total_group_bits -= arf_depth_bits[idx]; + total_arfs += arf_depth_count[idx]; + } + + // offset the base layer arf + normal_frames -= (total_arfs - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + target_frame_size = normal_frame_bits; + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + // The first layer ARF has its bit allocation assigned. + for (idx = frame_index; idx < gop_frames; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + gf_group->bit_allocation[idx] = + (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] * + gf_group->gfu_boost[idx]) / + arf_depth_boost[gf_group->layer_depth[idx]]); + break; + case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = target_frame_size; break; + } + } + gf_group->bit_allocation[idx] = 0; + + return; + } + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); + } + + // Allocate bits to the other frames in the group. + for (i = 0; i < normal_frames; ++i) { + if (EOF == input_stats(twopass, &frame_stats)) break; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } + + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + gf_group->bit_allocation[frame_index] = target_frame_size; + ++frame_index; + } + + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + + // Note: + // We need to configure the frame at the end of the sequence + 1 that will be + // the start frame for the next group. Otherwise prior to the call to + // vp9_rc_get_second_pass_params() the data will be undefined. +} + +// Adjusts the ARNF filter for a GF group. +static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, + double section_inter, + double section_motion) { + TWO_PASS *const twopass = &cpi->twopass; + double section_zeromv = section_inter - section_motion; + + twopass->arnr_strength_adjustment = 0; + + if (section_noise < 150) { + twopass->arnr_strength_adjustment -= 1; + if (section_noise < 75) twopass->arnr_strength_adjustment -= 1; + } else if (section_noise > 250) + twopass->arnr_strength_adjustment += 1; + + if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1; +} + +// Analyse and define a gf/arf group. +#define ARF_ABS_ZOOM_THRESH 4.0 + +#define MAX_GF_BOOST 5400 + +typedef struct RANGE { + int min; + int max; +} RANGE; + +/* get_gop_coding_frame_num() depends on several fields in RATE_CONTROL *rc as + * follows. + * Static fields: + * (The following fields will remain unchanged after initialization of encoder.) + * rc->static_scene_max_gf_interval + * rc->min_gf_interval + * twopass->sr_diff_factor + * twopass->sr_default_decay_limit + * twopass->zm_factor + * + * Dynamic fields: + * (The following fields will be updated before or after coding each frame.) + * rc->frames_to_key + * rc->frames_since_key + * rc->source_alt_ref_active + * + * Special case: if CONFIG_RATE_CTRL is true, the external arf indexes will + * determine the arf position. + * + * TODO(angiebird): Separate the dynamic fields and static fields into two + * structs. + */ +static int get_gop_coding_frame_num( + int *use_alt_ref, const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const RATE_CONTROL *rc, + int gf_start_show_idx, const RANGE *active_gf_interval, + double gop_intra_factor, int lag_in_frames, int *end_of_sequence) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + double loop_decay_rate = 1.00; + double mv_ratio_accumulator = 0.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + double sr_accumulator = 0.0; + // Motion breakout threshold for loop below depends on image size. + double mv_ratio_accumulator_thresh = + (frame_info->frame_height + frame_info->frame_width) / 4.0; + double zero_motion_accumulator = 1.0; + int gop_coding_frames; + + *use_alt_ref = 1; + gop_coding_frames = 0; + while (gop_coding_frames < rc->static_scene_max_gf_interval && + gop_coding_frames < rc->frames_to_key) { + const FIRSTPASS_STATS *next_next_frame; + const FIRSTPASS_STATS *next_frame; + int flash_detected; + ++gop_coding_frames; + + next_frame = fps_get_frame_stats(first_pass_info, + gf_start_show_idx + gop_coding_frames); + if (next_frame == NULL) { + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; + break; + } + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + next_next_frame = fps_get_frame_stats( + first_pass_info, gf_start_show_idx + gop_coding_frames + 1); + flash_detected = detect_flash_from_frame_stats(next_next_frame); + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // Monitor for static sections. + if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame)); + } + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + double last_loop_decay_rate = loop_decay_rate; + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + if (gop_coding_frames > rc->min_gf_interval && loop_decay_rate >= 0.999 && + last_loop_decay_rate < 0.9) { + int still_interval = 5; + if (check_transition_to_still(first_pass_info, + gf_start_show_idx + gop_coding_frames, + still_interval)) { + *use_alt_ref = 0; + break; + } + } + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (gop_coding_frames == 1) { + sr_accumulator += next_frame->coded_error; + } else { + sr_accumulator += + (next_frame->sr_coded_error - next_frame->coded_error); + } + } + + // Break out conditions. + // Break at maximum of active_gf_interval->max unless almost totally + // static. + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if ((gop_coding_frames >= active_gf_interval->max) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) { + break; + } + if ( + // Don't break out with a very short interval. + (gop_coding_frames >= active_gf_interval->min) && + // If possible don't break very close to a kf + ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && + (gop_coding_frames & 0x01) && (!flash_detected) && + ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || + (abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH) || + (sr_accumulator > gop_intra_factor * next_frame->intra_error))) { + break; + } + } + *use_alt_ref &= zero_motion_accumulator < 0.995; + *use_alt_ref &= gop_coding_frames < lag_in_frames; + *use_alt_ref &= gop_coding_frames >= rc->min_gf_interval; + return gop_coding_frames; +} + +static RANGE get_active_gf_inverval_range_simple(int min_gf_interval, + int arf_active_or_kf, + int frames_to_key) { + RANGE active_gf_interval; + active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2; + active_gf_interval.max = 16 + arf_active_or_kf; + + if ((active_gf_interval.max <= frames_to_key) && + (active_gf_interval.max >= (frames_to_key - min_gf_interval))) { + active_gf_interval.min = frames_to_key / 2; + active_gf_interval.max = frames_to_key / 2; + } + return active_gf_interval; +} + +static RANGE get_active_gf_inverval_range( + const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, + int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { + RANGE active_gf_interval; + int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality, + frame_info->bit_depth)); + int q_term = (gf_start_show_idx == 0) + ? int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(last_boosted_qindex, + frame_info->bit_depth) / + 6); + active_gf_interval.min = + rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); + active_gf_interval.min = + VPXMIN(active_gf_interval.min, rc->max_gf_interval + arf_active_or_kf); + + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_gf_interval.max = 11 + arf_active_or_kf + VPXMIN(5, q_term); + + // Force max GF interval to be odd. + active_gf_interval.max = active_gf_interval.max | 0x01; + + // We have: active_gf_interval.min <= + // rc->max_gf_interval + arf_active_or_kf. + if (active_gf_interval.max < active_gf_interval.min) { + active_gf_interval.max = active_gf_interval.min; + } else { + active_gf_interval.max = + VPXMIN(active_gf_interval.max, rc->max_gf_interval + arf_active_or_kf); + } + + // Would the active max drop us out just before the near the next kf? + if ((active_gf_interval.max <= rc->frames_to_key) && + (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) { + active_gf_interval.max = rc->frames_to_key / 2; + } + active_gf_interval.max = + VPXMAX(active_gf_interval.max, active_gf_interval.min); + return active_gf_interval; +} + +static int get_arf_layers(int multi_layer_arf, int max_layers, + int coding_frame_num) { + assert(max_layers <= MAX_ARF_LAYERS); + if (multi_layer_arf) { + int layers = 0; + int i; + for (i = coding_frame_num; i > 0; i >>= 1) { + ++layers; + } + layers = VPXMIN(max_layers, layers); + return layers; + } else { + return 1; + } +} + +static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FRAME_INFO *frame_info = &cpi->frame_info; + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + int gop_coding_frames; + + double gf_group_err = 0.0; + double gf_group_raw_error = 0.0; + double gf_group_noise = 0.0; + double gf_group_skip_pct = 0.0; + double gf_group_inactive_zone_rows = 0.0; + double gf_group_inter = 0.0; + double gf_group_motion = 0.0; + + int allow_alt_ref = is_altref_enabled(cpi); + int use_alt_ref; + + int64_t gf_group_bits; + int gf_arf_bits; + const int is_key_frame = frame_is_intra_only(cm); + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int is_alt_ref_flash = 0; + + double gop_intra_factor; + int gop_frames; + RANGE active_gf_interval; + // Whether this is at the end of last GOP of this sequence. + int end_of_sequence = 0; + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (is_key_frame == 0) { + vp9_zero(twopass->gf_group); + ++rc->gop_global_index; + } else { + rc->gop_global_index = 0; + } + + vpx_clear_system_state(); + + if (oxcf->use_simple_encode_api) { + active_gf_interval = get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, gf_start_show_idx, + twopass->active_worst_quality, rc->last_boosted_qindex); + } + + if (cpi->multi_layer_arf) { + int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf, + active_gf_interval.max); + gop_intra_factor = 1.0 + 0.25 * arf_layers; + } else { + gop_intra_factor = 1.0; + } + + gop_coding_frames = get_gop_coding_frame_num( + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); + use_alt_ref &= allow_alt_ref; +#if CONFIG_RATE_CTRL + // If the external gop_command is on, we will override the decisions + // of gop_coding_frames and use_alt_ref. + if (cpi->oxcf.use_simple_encode_api) { + const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command; + assert(allow_alt_ref == 1); + if (gop_command->use) { + gop_coding_frames = gop_command_coding_frame_count(gop_command); + use_alt_ref = gop_command->use_alt_ref; + } + } +#endif + // If the external rate control model for GOP is used, the gop decisions + // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| + // will be overwritten. + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + vpx_rc_gop_info_t gop_info; + gop_info.min_gf_interval = rc->min_gf_interval; + gop_info.max_gf_interval = rc->max_gf_interval; + gop_info.active_min_gf_interval = active_gf_interval.min; + gop_info.active_max_gf_interval = active_gf_interval.max; + gop_info.allow_alt_ref = allow_alt_ref; + gop_info.is_key_frame = is_key_frame; + gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; + gop_info.frames_since_key = rc->frames_since_key; + gop_info.frames_to_key = rc->frames_to_key; + gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; + gop_info.show_index = cm->current_video_frame; + gop_info.coding_index = cm->current_frame_coding_index; + gop_info.gop_global_index = rc->gop_global_index; + + codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, + &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + gop_coding_frames = gop_decision.gop_coding_frames; + use_alt_ref = gop_decision.use_alt_ref; + } + + // Was the group length constrained by the requirement for a new KF? + rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; + + // Should we use the alternate reference frame. + if (use_alt_ref) { + const int f_frames = + (rc->frames_to_key - gop_coding_frames >= gop_coding_frames - 1) + ? gop_coding_frames - 1 + : VPXMAX(0, rc->frames_to_key - gop_coding_frames); + const int b_frames = gop_coding_frames - 1; + const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME]; + // TODO(angiebird): figure out why arf's location is assigned this way + const int arf_show_idx = VPXMIN(gf_start_show_idx + gop_coding_frames + 1, + fps_get_num_frames(first_pass_info)); + + // Calculate the boost for alt ref. + rc->gfu_boost = + compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); + rc->source_alt_ref_pending = 1; + } else { + const int f_frames = gop_coding_frames - 1; + const int b_frames = 0; + const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME]; + // TODO(angiebird): figure out why arf's location is assigned this way + const int gld_show_idx = + VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info)); + const int arf_boost = + compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); + rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost); + rc->source_alt_ref_pending = 0; + } + +#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2 + rc->arf_active_best_quality_adjustment_factor = 1.0; + rc->arf_increase_active_best_quality = 0; + + if (!is_lossless_requested(&cpi->oxcf)) { + if (rc->frames_since_key >= rc->frames_to_key) { + // Increase the active best quality in the second half of key frame + // interval. + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_to_key - gop_coding_frames) / + (VPXMAX(1, ((rc->frames_to_key + rc->frames_since_key) / 2 - + gop_coding_frames))); + rc->arf_increase_active_best_quality = 1; + } else if ((rc->frames_to_key - gop_coding_frames) > 0) { + // Reduce the active best quality in the first half of key frame interval. + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_since_key + gop_coding_frames) / + (VPXMAX(1, (rc->frames_to_key + rc->frames_since_key) / 2 + + gop_coding_frames)); + rc->arf_increase_active_best_quality = -1; + } + } + +#ifdef AGGRESSIVE_VBR + // Limit maximum boost based on interval length. + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 140); +#else + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 200); +#endif + + // Cap the ARF boost when perceptual quality AQ mode is enabled. This is + // designed to improve the perceptual quality of high value content and to + // make consistent quality across consecutive frames. It will hurt objective + // quality. + if (oxcf->aq_mode == PERCEPTUAL_AQ) + rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST); + + rc->baseline_gf_interval = gop_coding_frames - rc->source_alt_ref_pending; + + if (rc->source_alt_ref_pending) + is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval); + + { + const double av_err = get_distribution_av_err(cpi, twopass); + const double mean_mod_score = twopass->mean_mod_score; + // If the first frame is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + int start_idx = arf_active_or_kf ? 1 : 0; + int j; + for (j = start_idx; j < gop_coding_frames; ++j) { + int show_idx = gf_start_show_idx + j; + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, show_idx); + // Accumulate error score of frames in this gf group. + gf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats, + mean_mod_score, av_err); + gf_group_raw_error += frame_stats->coded_error; + gf_group_noise += frame_stats->frame_noise_energy; + gf_group_skip_pct += frame_stats->intra_skip_pct; + gf_group_inactive_zone_rows += frame_stats->inactive_zone_rows; + gf_group_inter += frame_stats->pcnt_inter; + gf_group_motion += frame_stats->pcnt_motion; + } + } + + // Calculate the bits to be allocated to the gf/arf group as a whole + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + + gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; + + // Store the average moise level measured for the group + // TODO(any): Experiment with removal of else condition (gop_frames = 0) so + // that consumption of group noise energy is based on previous gf group + if (gop_frames > 0) + twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames); + else + twopass->gf_group.group_noise_energy = 0; + + // Calculate an estimate of the maxq needed for the group. + // We are more aggressive about correcting for sections + // where there could be significant overshoot than for easier + // sections where we do not wish to risk creating an overshoot + // of the allocated bit budget. + if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) { + const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames); + const double group_av_err = gf_group_raw_error / gop_frames; + const double group_av_noise = gf_group_noise / gop_frames; + const double group_av_skip_pct = gf_group_skip_pct / gop_frames; + const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / + (gop_frames * (double)cm->mb_rows)); + int tmp_q = get_twopass_worst_quality( + cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), + group_av_noise, vbr_group_bits_per_frame); + twopass->active_worst_quality = + (int)((tmp_q + (twopass->active_worst_quality * + (twopass->active_wq_factor - 1))) / + twopass->active_wq_factor); + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif + } + + // Context Adjustment of ARNR filter strength + if (rc->baseline_gf_interval > 1) { + adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames), + (gf_group_inter / gop_frames), + (gf_group_motion / gop_frames)); + } else { + twopass->arnr_strength_adjustment = 0; + } + + // Calculate the extra bits to be used for boosted frame(s) + gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1), + rc->gfu_boost, gf_group_bits); + + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= gf_group_err; + + // Decide GOP structure. + define_gf_group_structure(cpi); + + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); + + if (oxcf->resize_mode == RESIZE_DYNAMIC) { + // Default to starting GF groups at normal frame size. + cpi->rc.next_frame_size_selector = UNSCALED; + } +#if !CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif + rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld; + rc->preserve_next_arf_as_gld = 0; + // If alt ref frame is flash do not set preserve_arf_as_gld + if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc && + cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash) + rc->preserve_next_arf_as_gld = 1; +} + +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// Clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection. +// Tests for case where there is very low error either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + +// This test looks for anomalous changes in the nature of the intra signal +// related to the previous and next frame as an indicator for coding a key +// frame. This test serves to detect some additional scene cuts, +// especially in lowish motion and low contrast sections, that are missed +// by the other tests. +static int intra_step_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + double last_ii_ratio; + double this_ii_ratio; + double next_ii_ratio; + double last_pcnt_intra = 1.0 - last_frame->pcnt_inter; + double this_pcnt_intra = 1.0 - this_frame->pcnt_inter; + double next_pcnt_intra = 1.0 - next_frame->pcnt_inter; + double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral; + + // Calculate ii ratio for this frame last frame and next frame. + last_ii_ratio = + last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error); + this_ii_ratio = + this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + next_ii_ratio = + next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); + + // Return true the intra/inter ratio for the current frame is + // low but better in the next and previous frame and the relative usage of + // intra in the current frame is markedly higher than the last and next frame. + if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && + (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && + (this_pcnt_intra > (3 * next_pcnt_intra)) && + ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) { + return 1; + // Very low inter intra ratio (i.e. not much gain from inter coding), most + // blocks neutral on coding method and better inter prediction either side + } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) && + (this_ii_ratio < last_ii_ratio * 0.9) && + (this_ii_ratio < next_ii_ratio * 0.9)) { + return 1; + } else { + return 0; + } +} + +// Minimum % intra coding observed in first pass (1.0 = 100%) +#define MIN_INTRA_LEVEL 0.25 +// Threshold for use of the lagging second reference frame. Scene cuts do not +// usually have a high second ref usage. +#define SECOND_REF_USAGE_THRESH 0.2 +// Hard threshold where the first pass chooses intra for almost all blocks. +// In such a case even if the frame is not a scene cut coding a key frame +// may be a good option. +#define VERY_LOW_INTER_THRESH 0.05 +// Maximum threshold for the relative ratio of intra error score vs best +// inter error score. +#define KF_II_ERR_THRESHOLD 2.5 +#define KF_II_MAX 128.0 +#define II_FACTOR 12.5 +// Test for very low intra complexity which could cause false key frames +#define V_LOW_INTRA 0.5 + +static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, + int show_idx) { + const FIRSTPASS_STATS *last_frame = + fps_get_frame_stats(first_pass_info, show_idx - 1); + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, show_idx); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, show_idx + 1); + int is_viable_kf = 0; + double pcnt_intra = 1.0 - this_frame->pcnt_inter; + + // Does the frame satisfy the primary criteria of a key frame? + // See above for an explanation of the test criteria. + // If so, then examine how well it predicts subsequent frames. + detect_flash_from_frame_stats(next_frame); + if (!detect_flash_from_frame_stats(this_frame) && + !detect_flash_from_frame_stats(next_frame) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && + ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || + (slide_transition(this_frame, last_frame, next_frame)) || + (intra_step_transition(this_frame, last_frame, next_frame)) || + (((this_frame->coded_error > (next_frame->coded_error * 1.2)) && + (this_frame->coded_error > (last_frame->coded_error * 1.2))) && + (pcnt_intra > MIN_INTRA_LEVEL) && + ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) && + ((this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < + KF_II_ERR_THRESHOLD)))) { + int i; + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, show_idx + 1 + i); + double next_iiratio = (II_FACTOR * frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(frame_stats->coded_error)); + + if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; + + // Cumulative effect of decay in prediction quality. + if (frame_stats->pcnt_inter > 0.85) + decay_accumulator *= frame_stats->pcnt_inter; + else + decay_accumulator *= (0.85 + frame_stats->pcnt_inter) / 2.0; + + // Keep a running total. + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses. + if ((frame_stats->pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((frame_stats->pcnt_inter - frame_stats->pcnt_neutral) < 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 3.0) || + (frame_stats->intra_error < V_LOW_INTRA)) { + break; + } + + old_boost_score = boost_score; + + // Get the next frame details + if (show_idx + 1 + i == fps_get_num_frames(first_pass_info) - 1) break; + } + + // If there is tolerable prediction for at least the next 3 frames then + // break out else discard this potential key frame and move on + if (boost_score > 30.0 && (i > 3)) { + is_viable_kf = 1; + } else { + is_viable_kf = 0; + } + } + + return is_viable_kf; +} + +#define FRAMES_TO_CHECK_DECAY 8 +#define MIN_KF_TOT_BOOST 300 +#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32 +#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48 +#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 +#define KF_ABS_ZOOM_THRESH 6.0 + +int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + int j; + int frames_to_key; + int max_frames_to_key = first_pass_info->num_frames - kf_show_idx; + max_frames_to_key = VPXMIN(max_frames_to_key, oxcf->key_freq); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + // Find the next keyframe. + if (!oxcf->auto_key) { + frames_to_key = max_frames_to_key; + } else { + frames_to_key = 1; + while (frames_to_key < max_frames_to_key) { + // Provided that we are not at the end of the file... + if (kf_show_idx + frames_to_key + 1 < first_pass_info->num_frames) { + double loop_decay_rate; + double decay_accumulator; + const FIRSTPASS_STATS *next_frame = fps_get_frame_stats( + first_pass_info, kf_show_idx + frames_to_key + 1); + + // Check for a scene cut. + if (test_candidate_kf(first_pass_info, kf_show_idx + frames_to_key)) + break; + + // How fast is the prediction quality decaying? + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[(frames_to_key - 1) % FRAMES_TO_CHECK_DECAY] = + loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if ((frames_to_key - 1) > min_gf_interval && loop_decay_rate >= 0.999 && + decay_accumulator < 0.9) { + int still_interval = oxcf->key_freq - (frames_to_key - 1); + // TODO(angiebird): Figure out why we use "+1" here + int show_idx = kf_show_idx + frames_to_key; + if (check_transition_to_still(first_pass_info, show_idx, + still_interval)) { + break; + } + } + } + ++frames_to_key; + } + } + return frames_to_key; +} + +static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { + int i; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + const FRAME_INFO *frame_info = &cpi->frame_info; + const FIRSTPASS_STATS *const start_position = twopass->stats_in; + const FIRSTPASS_STATS *keyframe_stats = + fps_get_frame_stats(first_pass_info, kf_show_idx); + FIRSTPASS_STATS next_frame; + int kf_bits = 0; + int64_t max_kf_bits; + double zero_motion_accumulator = 1.0; + double zero_motion_sum = 0.0; + double zero_motion_avg; + double motion_compensable_sum = 0.0; + double motion_compensable_avg; + int num_frames = 0; + int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST; + double boost_score = 0.0; + double kf_mod_err = 0.0; + double kf_raw_err = 0.0; + double kf_group_err = 0.0; + double sr_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); + const double mean_mod_score = twopass->mean_mod_score; + vp9_zero(next_frame); + + cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; + + // Reset the GF group data structures. + vp9_zero(*gf_group); + + // Is this a forced key frame by interval. + rc->this_key_frame_forced = rc->next_key_frame_forced; + + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + + rc->frames_to_key = 1; + + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0.0; // Group modified error score. + + kf_raw_err = keyframe_stats->intra_error; + kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats, + mean_mod_score, av_err); + + rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx, + rc->min_gf_interval); + + // If there is a max kf interval set by the user we must obey it. + // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. + if (rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; + } else { + rc->next_key_frame_forced = 0; + } + + for (i = 0; i < rc->frames_to_key; ++i) { + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, kf_show_idx + i); + // Accumulate kf group error. + kf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats, + mean_mod_score, av_err); + } + + // Calculate the number of bits that should be assigned to the kf group. + if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = (int64_t)( + twopass->bits_left * (kf_group_err / twopass->normalized_score_left)); + + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + boost_score = 0.0; + + for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1)); + ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion; + motion_compensable_sum += + 1 - (double)next_frame.coded_error / next_frame.intra_error; + num_frames++; + } + + if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) { + zero_motion_avg = zero_motion_sum / num_frames; + motion_compensable_avg = motion_compensable_sum / num_frames; + kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16, + 160 * motion_compensable_avg - 112)); + kf_boost_scan_frames = + VPXMAX(VPXMIN(kf_boost_scan_frames, MAX_SCAN_FRAMES_FOR_KF_BOOST), + MIN_SCAN_FRAMES_FOR_KF_BOOST); + } + reset_fpf_position(twopass, start_position); + + for (i = 0; i < (rc->frames_to_key - 1); ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + // The zero motion test here insures that if we mark a kf group as static + // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) { + double frame_boost; + double zm_factor; + + // Monitor for static sections. + // First frame in kf group the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = + VPXMIN(zero_motion_accumulator, + get_zero_motion_factor(twopass, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } + + // Factor 0.75-1.25 based on how much of frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame or it points to a reference before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = + calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor); + + boost_score += frame_boost; + + // Measure of zoom. Large zoom tends to indicate reduced boost. + abs_mv_in_out_accumulator += + fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + if ((frame_boost < 25.00) || + (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) || + (sr_accumulator > (kf_raw_err * 1.50))) + break; + } else { + break; + } + } + + reset_fpf_position(twopass, start_position); + + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_in_end, rc->frames_to_key); + + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = (int)(twopass->kf_max_total_boost); + } else { + // Apply various clamps for min and max oost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost)); + } + + // Work out how many bits to allocate for the key frame itself. + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, + twopass->kf_group_bits); + // Based on the spatial complexity, increase the bits allocated to key frame. + kf_bits += + (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err)); + max_kf_bits = + twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS; + max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX); + kf_bits = VPXMIN(kf_bits, (int)max_kf_bits); + + twopass->kf_group_bits -= kf_bits; + + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + gf_group->rf_level[0] = KF_STD; + gf_group->layer_depth[0] = 0; + + // Note the total error score of the kf group minus the key frame itself. + twopass->kf_group_error_left = (kf_group_err - kf_mod_err); + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. + twopass->normalized_score_left -= kf_group_err; + + if (oxcf->resize_mode == RESIZE_DYNAMIC) { + // Default to normal-sized frame on keyframes. + cpi->rc.next_frame_size_selector = UNSCALED; + } +} + +// Configure image size specific vizier parameters. +// Later these will be set via additional command line options +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { + // When |use_vizier_rc_params| is 1, we expect the rc parameters below to + // have been initialised on the command line as adjustment factors such + // that a factor of 1.0 will match the default behavior when + // |use_vizier_rc_params| is 0 + if (twopass->use_vizier_rc_params) { + twopass->active_wq_factor *= AV_WQ_FACTOR; + twopass->err_per_mb *= BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + if (twopass->sr_default_decay_limit > 1.0) // > 1.0 here makes no sense + twopass->sr_default_decay_limit = 1.0; + twopass->sr_diff_factor *= 1.0; + twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost *= MAX_GF_BOOST; + // NOTE: In use max boost has precedence over min boost. So even if min is + // somehow set higher than max the final boost value will be clamped to the + // appropriate maximum. + twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; + twopass->zm_factor *= DEFAULT_ZM_FACTOR; + if (twopass->zm_factor > 1.0) // > 1.0 here makes no sense + twopass->zm_factor = 1.0; + + // Correction for the fact that the kf_err_per_mb_factor default is + // already different for different video formats and ensures that a passed + // in value of 1.0 on the vizier command line will still match the current + // default. + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb *= 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb *= 500.0; + } else { + twopass->kf_err_per_mb *= 250.0; + } + } else { + // When |use_vizier_rc_params| is 0, use defaults. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = DEFAULT_ZM_FACTOR; + + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; + } else { + twopass->kf_err_per_mb = 250.0; + } + } +} + +void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + FIRSTPASS_STATS this_frame; + const int show_idx = cm->current_video_frame; + + if (cpi->common.current_frame_coding_index == 0 && + cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } + } + + if (!twopass->stats_in) return; + + // Configure image size specific vizier parameters + if (cm->current_video_frame == 0) { + unsigned int screen_area = (cm->width * cm->height); + + vp9_init_vizier_params(twopass, screen_area); + } + + // If this is an arf frame then we don't want to read the stats file or + // advance the input pointer as we already have what we need. + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + int target_rate; + + vp9_zero(this_frame); + this_frame = + cpi->twopass.stats_in_start[cm->current_video_frame + + gf_group->arf_src_offset[gf_group->index]]; + + vp9_configure_buffer_updates(cpi, gf_group->index); + + target_rate = gf_group->bit_allocation[gf_group->index]; + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + rc->base_frame_target = target_rate; + + cm->frame_type = INTER_FRAME; + + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; + + return; + } + + vpx_clear_system_state(); + + if (cpi->oxcf.rc_mode == VPX_Q) { + twopass->active_worst_quality = cpi->oxcf.cq_level; + } else if (cm->current_video_frame == 0) { + const int frames_left = + (int)(twopass->total_stats.count - cm->current_video_frame); + // Special case code for first frame. + const int section_target_bandwidth = + (int)(twopass->bits_left / frames_left); + const double section_length = twopass->total_left_stats.count; + const double section_error = + twopass->total_left_stats.coded_error / section_length; + const double section_intra_skip = + twopass->total_left_stats.intra_skip_pct / section_length; + const double section_inactive_zone = + (twopass->total_left_stats.inactive_zone_rows * 2) / + ((double)cm->mb_rows * section_length); + const double section_noise = + twopass->total_left_stats.frame_noise_energy / section_length; + int tmp_q; + + tmp_q = get_twopass_worst_quality( + cpi, section_error, section_intra_skip + section_inactive_zone, + section_noise, section_target_bandwidth); + + twopass->active_worst_quality = tmp_q; + twopass->baseline_active_worst_quality = tmp_q; + rc->ni_av_qi = tmp_q; + rc->last_q[INTER_FRAME] = tmp_q; + rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth); + rc->avg_frame_qindex[INTER_FRAME] = tmp_q; + rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2; + rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; + } + vp9_zero(this_frame); + if (EOF == input_stats(twopass, &this_frame)) return; + + // Set the frame content type flag. + if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; + + // Keyframe and section processing. + if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, show_idx); + } else { + cm->frame_type = INTER_FRAME; + } + + // Define a new GF/ARF group. (Should always enter here for key frames). + if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, show_idx); + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + +#if ARF_STATS_OUTPUT + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); + + fclose(fpfile); + } +#endif + } + + vp9_configure_buffer_updates(cpi, gf_group->index); + + rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; + + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; + + // Update the total stats remaining structure. + subtract_stats(&twopass->total_left_stats, &this_frame); +} + +void vp9_twopass_postencode_update(VP9_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + VP9_COMMON *const cm = &cpi->common; + const int bits_used = rc->base_frame_target; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0); + + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->this_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + + // Calculate the pct rc error. + if (rc->total_actual_bits) { + rc->rate_error_estimate = + (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); + } else { + rc->rate_error_estimate = 0; + } + + if (cpi->common.frame_type != KEY_FRAME) { + twopass->kf_group_bits -= bits_used; + twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; + } + twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0); + + // Increment the gf group index ready for the next frame. + ++twopass->gf_group.index; + + // If the rate control is drifting consider adjustment to min or maxq. + if ((cpi->oxcf.rc_mode != VPX_Q) && !cpi->rc.is_src_frame_alt_ref) { + const int maxq_adj_limit = + rc->worst_quality - twopass->active_worst_quality; + const int minq_adj_limit = + (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); + int aq_extend_min = 0; + int aq_extend_max = 0; + + // Extend min or Max Q range to account for imbalance from the base + // value when using AQ. + if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ && + cpi->oxcf.aq_mode != PERCEPTUAL_AQ) { + if (cm->seg.aq_av_offset < 0) { + // The balance of the AQ map tends towarda lowering the average Q. + aq_extend_min = 0; + aq_extend_max = VPXMIN(maxq_adj_limit, -cm->seg.aq_av_offset); + } else { + // The balance of the AQ map tends towards raising the average Q. + aq_extend_min = VPXMIN(minq_adj_limit, cm->seg.aq_av_offset); + aq_extend_max = 0; + } + } + + // Undershoot. + if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) { + --twopass->extend_maxq; + if (rc->rolling_target_bits >= rc->rolling_actual_bits) + ++twopass->extend_minq; + // Overshoot. + } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) { + --twopass->extend_minq; + if (rc->rolling_target_bits < rc->rolling_actual_bits) + ++twopass->extend_maxq; + } else { + // Adjustment for extreme local overshoot. + if (rc->projected_frame_size > (2 * rc->base_frame_target) && + rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) + ++twopass->extend_maxq; + + // Unwind undershoot or overshoot adjustment. + if (rc->rolling_target_bits < rc->rolling_actual_bits) + --twopass->extend_minq; + else if (rc->rolling_target_bits > rc->rolling_actual_bits) + --twopass->extend_maxq; + } + + twopass->extend_minq = + clamp(twopass->extend_minq, aq_extend_min, minq_adj_limit); + twopass->extend_maxq = + clamp(twopass->extend_maxq, aq_extend_max, maxq_adj_limit); + + // If there is a big and undexpected undershoot then feed the extra + // bits back in quickly. One situation where this may happen is if a + // frame is unexpectedly almost perfectly predicted by the ARF or GF + // but not very well predcited by the previous frame. + if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { + int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; + if (rc->projected_frame_size < fast_extra_thresh) { + rc->vbr_bits_off_target_fast += + fast_extra_thresh - rc->projected_frame_size; + rc->vbr_bits_off_target_fast = + VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); + + // Fast adaptation of minQ if necessary to use up the extra bits. + if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = VPXMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = VPXMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } + } +} + +#if CONFIG_RATE_CTRL +void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame, + int *use_alt_ref, int *coding_frame_count, + int *first_show_idx, + int *last_gop_use_alt_ref) { + const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command; + // We make a copy of rc here because we want to get information from the + // encoder without changing its state. + // TODO(angiebird): Avoid copying rc here. + RATE_CONTROL rc = cpi->rc; + const int multi_layer_arf = 0; + const int allow_alt_ref = 1; + // We assume that current_video_frame is updated to the show index of the + // frame we are about to called. Note that current_video_frame is updated at + // the end of encode_frame_to_data_rate(). + // TODO(angiebird): Avoid this kind of fragile style. + *first_show_idx = cpi->common.current_video_frame; + *last_gop_use_alt_ref = rc.source_alt_ref_active; + + *first_is_key_frame = 0; + if (rc.frames_to_key == 0) { + rc.frames_to_key = vp9_get_frames_to_next_key( + &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval); + rc.frames_since_key = 0; + *first_is_key_frame = 1; + } + + if (gop_command->use) { + *coding_frame_count = gop_command_coding_frame_count(gop_command); + *use_alt_ref = gop_command->use_alt_ref; + assert(gop_command->show_frame_count <= rc.frames_to_key); + } else { + *coding_frame_count = vp9_get_gop_coding_frame_count( + &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx, + multi_layer_arf, allow_alt_ref, *first_is_key_frame, + *last_gop_use_alt_ref, use_alt_ref); + } +} + +int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, + const FRAME_INFO *frame_info, + const RATE_CONTROL *rc, int show_idx, + int multi_layer_arf, int allow_alt_ref, + int first_is_key_frame, + int last_gop_use_alt_ref, int *use_alt_ref) { + int frame_count; + double gop_intra_factor; + const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; + RANGE active_gf_interval; + int arf_layers; + int end_of_sequence = 0; + if (oxcf->use_simple_encode_api) { + active_gf_interval = get_active_gf_inverval_range_simple( + rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); + } else { + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0, + /*last_boosted_qindex=*/0); + } + + arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf, + active_gf_interval.max); + if (multi_layer_arf) { + gop_intra_factor = 1.0 + 0.25 * arf_layers; + } else { + gop_intra_factor = 1.0; + } + + frame_count = get_gop_coding_frame_num( + use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence); + *use_alt_ref &= allow_alt_ref; + return frame_count; +} + +// Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of +// coding frames (including show frame and alt ref) can be determined. +int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + int coding_frame_num = 0; + RATE_CONTROL rc; + int gop_coding_frame_count; + int gop_show_frames; + int show_idx = 0; + int last_gop_use_alt_ref = 0; + vp9_rc_init(oxcf, 1, &rc); + + while (show_idx < first_pass_info->num_frames) { + int use_alt_ref; + int first_is_key_frame = 0; + if (rc.frames_to_key == 0) { + rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx, + rc.min_gf_interval); + rc.frames_since_key = 0; + first_is_key_frame = 1; + } + + gop_coding_frame_count = vp9_get_gop_coding_frame_count( + oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf, + allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref); + + rc.source_alt_ref_active = use_alt_ref; + last_gop_use_alt_ref = use_alt_ref; + gop_show_frames = gop_coding_frame_count - use_alt_ref; + rc.frames_to_key -= gop_show_frames; + rc.frames_since_key += gop_show_frames; + show_idx += gop_show_frames; + coding_frame_num += gop_show_frames + use_alt_ref; + } + return coding_frame_num; +} + +void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int *key_frame_map) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + int show_idx = 0; + RATE_CONTROL rc; + vp9_rc_init(oxcf, 1, &rc); + + // key_frame_map points to an int array with size equal to + // first_pass_info->num_frames, which is also the number of show frames in the + // video. + memset(key_frame_map, 0, + sizeof(*key_frame_map) * first_pass_info->num_frames); + while (show_idx < first_pass_info->num_frames) { + int key_frame_group_size; + key_frame_map[show_idx] = 1; + key_frame_group_size = + vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval); + assert(key_frame_group_size > 0); + show_idx += key_frame_group_size; + } + assert(show_idx == first_pass_info->num_frames); +} +#endif // CONFIG_RATE_CTRL + +FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass) { + return twopass->this_frame_stats; +} +FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass) { + return twopass->total_stats; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h new file mode 100644 index 0000000000..a19b04db74 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ + +#include + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_ROW (-1) + +#define MAX_ARF_LAYERS 6 +#define SECTION_NOISE_DEF 250.0 + +typedef struct { + double frame_mb_intra_factor; + double frame_mb_brightness_factor; + double frame_mb_neutral_count; +} FP_MB_FLOAT_STATS; + +typedef struct { + double intra_factor; + double brightness_factor; + int64_t coded_error; + int64_t sr_coded_error; + int64_t frame_noise_energy; + int64_t intra_error; + int intercount; + int second_ref_count; + double neutral_count; + double intra_count_low; // Coded intra but low variance + double intra_count_high; // Coded intra high variance + int intra_skip_count; + int image_data_start_row; + int mvcount; + int sum_mvr; + int sum_mvr_abs; + int sum_mvc; + int sum_mvc_abs; + int64_t sum_mvrs; + int64_t sum_mvcs; + int sum_in_vectors; + int intra_smooth_count; + int new_mv_count; +} FIRSTPASS_DATA; + +typedef enum { + KF_UPDATE = 0, + LF_UPDATE = 1, + GF_UPDATE = 2, + ARF_UPDATE = 3, + OVERLAY_UPDATE = 4, + MID_OVERLAY_UPDATE = 5, + USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update + FRAME_UPDATE_TYPES = 7 +} FRAME_UPDATE_TYPE; + +#define FC_ANIMATION_THRESH 0.15 +typedef enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} FRAME_CONTENT_TYPE; + +typedef struct { + unsigned char index; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2]; + + int frame_start; + int frame_end; + // TODO(jingning): The array size of arf_stack could be reduced. + int arf_index_stack[MAX_LAG_BUFFERS * 2]; + int top_arf_idx; + int stack_size; + int gf_group_size; + int max_layer_depth; + int allowed_max_layer_depth; + int group_noise_energy; +} GF_GROUP; + +typedef struct { + const FIRSTPASS_STATS *stats; + int num_frames; +} FIRST_PASS_INFO; + +static INLINE void fps_init_first_pass_info(FIRST_PASS_INFO *first_pass_info, + const FIRSTPASS_STATS *stats, + int num_frames) { + first_pass_info->stats = stats; + first_pass_info->num_frames = num_frames; +} + +static INLINE int fps_get_num_frames(const FIRST_PASS_INFO *first_pass_info) { + return first_pass_info->num_frames; +} + +static INLINE const FIRSTPASS_STATS *fps_get_frame_stats( + const FIRST_PASS_INFO *first_pass_info, int show_idx) { + if (show_idx < 0 || show_idx >= first_pass_info->num_frames) { + return NULL; + } + return &first_pass_info->stats[show_idx]; +} + +typedef struct { + unsigned int section_intra_rating; + unsigned int key_frame_section_intra_rating; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + const FIRSTPASS_STATS *stats_in; + const FIRSTPASS_STATS *stats_in_start; + const FIRSTPASS_STATS *stats_in_end; + FIRST_PASS_INFO first_pass_info; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + double mean_mod_score; + double normalized_score_left; + double mb_av_energy; + double mb_smooth_pct; + + FP_MB_FLOAT_STATS *fp_mb_float_stats; + + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + double kf_group_error_left; + + double bpm_factor; + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; + + int sr_update_lag; + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int active_worst_quality; + int baseline_active_worst_quality; + int extend_minq; + int extend_maxq; + int extend_minq_fast; + int arnr_strength_adjustment; + int last_qindex_of_arf_layer[MAX_ARF_LAYERS]; + + GF_GROUP gf_group; + + // Vizeir project experimental two pass rate control parameters. + // When |use_vizier_rc_params| is 1, the following parameters will + // be overwritten by pass in values. Otherwise, they are initialized + // by default values. + int use_vizier_rc_params; + double active_wq_factor; + double err_per_mb; + double sr_default_decay_limit; + double sr_diff_factor; + double kf_err_per_mb; + double kf_frame_min_boost; + double kf_frame_max_boost_first; // Max for first kf in a chunk. + double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. + double kf_max_total_boost; + double gf_max_total_boost; + double gf_frame_max_boost; + double zm_factor; +} TWO_PASS; + +struct VP9_COMP; +struct ThreadData; +struct TileDataEnc; + +void vp9_init_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); +void vp9_end_first_pass(struct VP9_COMP *cpi); + +void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, + struct ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + struct TileDataEnc *tile_data, + MV *best_ref_mv, int mb_row); + +void vp9_init_second_pass(struct VP9_COMP *cpi); +void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area); + +// Post encode update of the rate control parameters for 2-pass +void vp9_twopass_postencode_update(struct VP9_COMP *cpi); + +void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, + int *scaled_frame_height); + +struct VP9EncoderConfig; +int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval); +#if CONFIG_RATE_CTRL +/* Call this function to get info about the next group of pictures. + * This function should be called after vp9_create_compressor() when encoding + * starts or after vp9_get_compressed_data() when the encoding process of + * the last group of pictures is just finished. + */ +void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi, + int *first_is_key_frame, int *use_alt_ref, + int *coding_frame_count, int *first_show_idx, + int *last_gop_use_alt_ref); + +/*!\brief Call this function before coding a new group of pictures to get + * information about it. + * \param[in] oxcf Encoder config + * \param[in] twopass Twopass info + * \param[in] frame_info Frame info + * \param[in] rc Rate control state + * \param[in] show_idx Show index of the first frame in the group + * \param[in] multi_layer_arf Is multi-layer alternate reference used + * \param[in] allow_alt_ref Is alternate reference allowed + * \param[in] first_is_key_frame Is the first frame in the group a key frame + * \param[in] last_gop_use_alt_ref Does the last group use alternate reference + * + * \param[out] use_alt_ref Does this group use alternate reference + * + * \return Returns coding frame count + */ +int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, + const FRAME_INFO *frame_info, + const RATE_CONTROL *rc, int show_idx, + int multi_layer_arf, int allow_alt_ref, + int first_is_key_frame, + int last_gop_use_alt_ref, int *use_alt_ref); + +int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, + const FRAME_INFO *frame_info, int multi_layer_arf, + int allow_alt_ref); + +/*!\brief Compute a key frame binary map indicates whether key frames appear + * in the corresponding positions. The passed in key_frame_map must point to an + * integer array with length equal to twopass->first_pass_info.num_frames, + * which is the number of show frames in the video. + */ +void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int *key_frame_map); +#endif // CONFIG_RATE_CTRL + +FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass); +FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 0000000000..01928e7816 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c new file mode 100644 index 0000000000..c74d523246 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx/vpx_codec.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; + int x, y, i; + +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) + // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O + + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This function is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } + } + } + } + + vpx_extend_frame_borders(dst); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h new file mode 100644 index 0000000000..ad09c11198 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ + +typedef enum { + FIRST_PASS_JOB, + ENCODE_JOB, + ARNR_JOB, + NUM_JOB_TYPES, +} JOB_TYPE; + +// Encode job parameters +typedef struct { + int vert_unit_row_num; // Index of the vertical unit row + int tile_col_id; // tile col id within a tile + int tile_row_id; // tile col id within a tile +} JobNode; + +// Job queue element parameters +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Job information context of the module + JobNode job_info; +} JobQueue; + +// Job queue handle +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Counter to store the number of jobs picked up for processing + int num_jobs_acquired; +} JobQueueHandle; + +#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c new file mode 100644 index 0000000000..97838c38e6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "./vpx_config.h" + +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_lookahead.h" + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) { + int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if (++index >= ctx->max_sz) index -= ctx->max_sz; + *idx = index; + return buf; +} + +void vp9_lookahead_destroy(struct lookahead_ctx *ctx) { + if (ctx) { + if (ctx->buf) { + int i; + + for (i = 0; i < ctx->max_sz; i++) vpx_free_frame_buffer(&ctx->buf[i].img); + free(ctx->buf); + } + free(ctx); + } +} + +struct lookahead_ctx *vp9_lookahead_init(unsigned int width, + unsigned int height, + unsigned int subsampling_x, + unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + unsigned int depth) { + struct lookahead_ctx *ctx = NULL; + + // Clamp the lookahead queue depth + depth = clamp(depth, 1, MAX_LAG_BUFFERS); + + // Allocate memory to keep previous source frames available. + depth += MAX_PRE_FRAMES; + + // Allocate the lookahead structures + ctx = calloc(1, sizeof(*ctx)); + if (ctx) { + const int legacy_byte_alignment = 0; + unsigned int i; + ctx->max_sz = depth; + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + ctx->next_show_idx = 0; + if (!ctx->buf) goto bail; + for (i = 0; i < depth; i++) + if (vpx_alloc_frame_buffer( + &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, legacy_byte_alignment)) + goto bail; + } + return ctx; +bail: + vp9_lookahead_destroy(ctx); + return NULL; +} + +#define USE_PARTIAL_COPY 0 +int vp9_lookahead_full(const struct lookahead_ctx *ctx) { + return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz; +} + +int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx) { + return ctx->next_show_idx; +} + +int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + vpx_enc_frame_flags_t flags) { + struct lookahead_entry *buf; +#if USE_PARTIAL_COPY + int row, col, active_end; + int mb_rows = (src->y_height + 15) >> 4; + int mb_cols = (src->y_width + 15) >> 4; +#endif + int width = src->y_crop_width; + int height = src->y_crop_height; + int uv_width = src->uv_crop_width; + int uv_height = src->uv_crop_height; + int subsampling_x = src->subsampling_x; + int subsampling_y = src->subsampling_y; + int larger_dimensions, new_dimensions; +#if !CONFIG_VP9_HIGHBITDEPTH + (void)use_highbitdepth; + assert(use_highbitdepth == 0); +#endif + + if (vp9_lookahead_full(ctx)) return 1; + ctx->sz++; + buf = pop(ctx, &ctx->write_idx); + + new_dimensions = width != buf->img.y_crop_width || + height != buf->img.y_crop_height || + uv_width != buf->img.uv_crop_width || + uv_height != buf->img.uv_crop_height; + larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || + uv_width > buf->img.uv_width || + uv_height > buf->img.uv_height; + assert(!larger_dimensions || new_dimensions); + +#if USE_PARTIAL_COPY + // TODO(jkoleszar): This is disabled for now, as + // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware. + + // Only do this partial copy if the following conditions are all met: + // 1. Lookahead queue has has size of 1. + // 2. Active map is provided. + // 3. This is not a key frame, golden nor altref frame. + if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) { + for (row = 0; row < mb_rows; ++row) { + col = 0; + + while (1) { + // Find the first active macroblock in this row. + for (; col < mb_cols; ++col) { + if (active_map[col]) break; + } + + // No more active macroblock in this row. + if (col == mb_cols) break; + + // Find the end of active region in this row. + active_end = col; + + for (; active_end < mb_cols; ++active_end) { + if (!active_map[active_end]) break; + } + + // Only copy this active region. + vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, + 16, (active_end - col) << 4); + + // Start again from the end of this active region. + col = active_end; + } + + active_map += mb_cols; + } + } else { +#endif + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0)) + return 1; + vpx_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; + } + // Partial copy not implemented yet + vp9_copy_and_extend_frame(src, &buf->img); +#if USE_PARTIAL_COPY + } +#endif + + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->flags = flags; + buf->show_idx = ctx->next_show_idx; + ++ctx->next_show_idx; + return 0; +} + +struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = NULL; + + if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { + buf = pop(ctx, &ctx->read_idx); + ctx->sz--; + } + return buf; +} + +struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx, + int index) { + struct lookahead_entry *buf = NULL; + + if (index >= 0) { + // Forward peek + if (index < ctx->sz) { + index += ctx->read_idx; + if (index >= ctx->max_sz) index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (index < 0) { + // Backward peek + if (-index <= MAX_PRE_FRAMES) { + index += ctx->read_idx; + if (index < 0) index += ctx->max_sz; + buf = ctx->buf + index; + } + } + + return buf; +} + +unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h new file mode 100644 index 0000000000..6ac6736673 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ +#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LAG_BUFFERS 25 + +struct lookahead_entry { + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + int show_idx; /*The show_idx of this frame*/ + vpx_enc_frame_flags_t flags; +}; + +// The max of past frames we want to keep in the queue. +#define MAX_PRE_FRAMES 1 + +struct lookahead_ctx { + int max_sz; /* Absolute size of the queue */ + int sz; /* Number of buffers currently in the queue */ + int read_idx; /* Read index */ + int write_idx; /* Write index */ + int next_show_idx; /* The show_idx that will be assigned to the next frame + being pushed in the queue*/ + struct lookahead_entry *buf; /* Buffer list */ +}; + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + */ +struct lookahead_ctx *vp9_lookahead_init(unsigned int width, + unsigned int height, + unsigned int subsampling_x, + unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + unsigned int depth); + +/**\brief Destroys the lookahead stage + */ +void vp9_lookahead_destroy(struct lookahead_ctx *ctx); + +/**\brief Check if lookahead is full + * + * \param[in] ctx Pointer to the lookahead context + * + * Return 1 if lookahead is full, otherwise return 0. + */ +int vp9_lookahead_full(const struct lookahead_ctx *ctx); + +/**\brief Return the next_show_idx + * + * \param[in] ctx Pointer to the lookahead context + * + * Return the show_idx that will be assigned to the next + * frame pushed by vp9_lookahead_push() + */ +int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx); + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] flags Flags set on this frame + */ +int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + vpx_enc_frame_flags_t flags); + +/**\brief Get the next source buffer to encode + * + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * + * \retval NULL, if drain set and queue is empty + * \retval NULL, if drain not set and queue not of the configured depth + */ +struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx, int drain); + +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * + * \retval NULL, if no buffer exists at the specified index + */ +struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx, + int index); + +/**\brief Get the number of frames currently in the lookahead queue + * + * \param[in] ctx Pointer to the lookahead context + */ +unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c new file mode 100644 index 0000000000..2f20a8fe6d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" + +static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, + MV *dst_mv, int mb_row, + int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS old_search_method = mv_sf->search_method; + const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; + const MvLimits tmp_mv_limits = x->mv_limits; + MV ref_full; + int cost_list[5]; + + // Further step/diamond searches as necessary + int step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, ref_mv); + + ref_full.col = ref_mv->col >> 3; + ref_full.row = ref_mv->row >> 3; + + mv_sf->search_method = HEX; + vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param, + cpi->sf.mv.search_method, x->errorperbit, + cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0); + mv_sf->search_method = old_search_method; + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // Try sub-pixel MC + // if (bestsme > error_thresh && bestsme < INT_MAX) + { + uint32_t distortion; + uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. + cpi->find_fractional_mv_step( + x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &v_fn_ptr, 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, + 0, USE_2_TAPS); + } + + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = *dst_mv; + + vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); + + return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); +} + +static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, + int_mv *dst_mv, int mb_row, int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + unsigned int err, tmp_err; + MV tmp_mv; + + // Try zero MV first + // FIXME should really use something like near/nearest MV and/or MV prediction + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + dst_mv->as_int = 0; + + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search + tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col); + if (tmp_err < err) { + err = tmp_err; + dst_mv->as_mv = tmp_mv; + } + + // If the current best reference mv is not centered on 0,0 then do a 0,0 + // based search as well. + if (ref_mv->row != 0 || ref_mv->col != 0) { + MV zero_ref_mv = { 0, 0 }; + + tmp_err = + do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); + if (tmp_err < err) { + dst_mv->as_mv = tmp_mv; + err = tmp_err; + } + } + + return err; +} + +static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + unsigned int err; + + // Try zero MV first + // FIXME should really use something like near/nearest MV and/or MV prediction + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + + dst_mv->as_int = 0; + + return err; +} +static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + PREDICTION_MODE best_mode = -1, mode; + unsigned int best_err = INT_MAX; + + // calculate SATD for each intra prediction mode; + // we're intentionally not doing 4x4, we just want a rough estimate + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + unsigned int err; + + xd->mi[0]->mode = mode; + vp9_predict_intra_block(xd, 2, TX_16X16, mode, x->plane[0].src.buf, + x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, 0, 0, 0); + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); + + // find best + if (err < best_err) { + best_err = err; + best_mode = mode; + } + } + + if (pbest_mode) *pbest_mode = best_mode; + + return best_err; +} + +static void update_mbgraph_mb_stats(VP9_COMP *cpi, MBGRAPH_MB_STATS *stats, + YV12_BUFFER_CONFIG *buf, int mb_y_offset, + YV12_BUFFER_CONFIG *golden_ref, + const MV *prev_golden_ref_mv, + YV12_BUFFER_CONFIG *alt_ref, int mb_row, + int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + int intra_error; + VP9_COMMON *cm = &cpi->common; + + // FIXME in practice we're completely ignoring chroma here + x->plane[0].src.buf = buf->y_buffer + mb_y_offset; + x->plane[0].src.stride = buf->y_stride; + + xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset; + xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; + + // do intra 16x16 prediction + intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode); + if (intra_error <= 0) intra_error = 1; + stats->ref[INTRA_FRAME].err = intra_error; + + // Golden frame MV search, if it exists and is different than last frame + if (golden_ref) { + int g_motion_error; + xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset; + xd->plane[0].pre[0].stride = golden_ref->y_stride; + g_motion_error = + do_16x16_motion_search(cpi, prev_golden_ref_mv, + &stats->ref[GOLDEN_FRAME].m.mv, mb_row, mb_col); + stats->ref[GOLDEN_FRAME].err = g_motion_error; + } else { + stats->ref[GOLDEN_FRAME].err = INT_MAX; + stats->ref[GOLDEN_FRAME].m.mv.as_int = 0; + } + + // Do an Alt-ref frame MV search, if it exists and is different than + // last/golden frame. + if (alt_ref) { + int a_motion_error; + xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset; + xd->plane[0].pre[0].stride = alt_ref->y_stride; + a_motion_error = + do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv); + + stats->ref[ALTREF_FRAME].err = a_motion_error; + } else { + stats->ref[ALTREF_FRAME].err = INT_MAX; + stats->ref[ALTREF_FRAME].m.mv.as_int = 0; + } +} + +static void update_mbgraph_frame_stats(VP9_COMP *cpi, + MBGRAPH_FRAME_STATS *stats, + YV12_BUFFER_CONFIG *buf, + YV12_BUFFER_CONFIG *golden_ref, + YV12_BUFFER_CONFIG *alt_ref) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + + int mb_col, mb_row, offset = 0; + int mb_y_offset = 0; + MV gld_top_mv = { 0, 0 }; + MODE_INFO mi_local; + MODE_INFO mi_above, mi_left; + + vp9_zero(mi_local); + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. + x->mv_limits.row_min = -BORDER_MV_PIXELS_B16; + x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; + // Signal to vp9_predict_intra_block() that above is not available + xd->above_mi = NULL; + + xd->plane[0].dst.stride = buf->y_stride; + xd->plane[0].pre[0].stride = buf->y_stride; + xd->plane[1].dst.stride = buf->uv_stride; + xd->mi[0] = &mi_local; + mi_local.sb_type = BLOCK_16X16; + mi_local.ref_frame[0] = LAST_FRAME; + mi_local.ref_frame[1] = NO_REF_FRAME; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + MV gld_left_mv = gld_top_mv; + int mb_y_in_offset = mb_y_offset; + + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. + x->mv_limits.col_min = -BORDER_MV_PIXELS_B16; + x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; + // Signal to vp9_predict_intra_block() that left is not available + xd->left_mi = NULL; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col]; + + update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref, + &gld_left_mv, alt_ref, mb_row, mb_col); + gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv; + if (mb_col == 0) { + gld_top_mv = gld_left_mv; + } + // Signal to vp9_predict_intra_block() that left is available + xd->left_mi = &mi_left; + + mb_y_in_offset += 16; + x->mv_limits.col_min -= 16; + x->mv_limits.col_max -= 16; + } + + // Signal to vp9_predict_intra_block() that above is available + xd->above_mi = &mi_above; + + mb_y_offset += buf->y_stride * 16; + x->mv_limits.row_min -= 16; + x->mv_limits.row_max -= 16; + offset += cm->mb_cols; + } +} + +// void separate_arf_mbs_byzz +static void separate_arf_mbs(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int mb_col, mb_row, offset, i; + int mi_row, mi_col; + int ncnt[4] = { 0 }; + int n_frames = cpi->mbgraph_n_frames; + + int *arf_not_zz; + + CHECK_MEM_ERROR( + &cm->error, arf_not_zz, + vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); + + // We are not interested in results beyond the alt ref itself. + if (n_frames > cpi->rc.frames_till_gf_update_due) + n_frames = cpi->rc.frames_till_gf_update_due; + + // defer cost to reference frames + for (i = n_frames - 1; i >= 0; i--) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + + for (offset = 0, mb_row = 0; mb_row < cm->mb_rows; + offset += cm->mb_cols, mb_row++) { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col]; + + int altref_err = mb_stats->ref[ALTREF_FRAME].err; + int intra_err = mb_stats->ref[INTRA_FRAME].err; + int golden_err = mb_stats->ref[GOLDEN_FRAME].err; + + // Test for altref vs intra and gf and that its mv was 0,0. + if (altref_err > 1000 || altref_err > intra_err || + altref_err > golden_err) { + arf_not_zz[offset + mb_col]++; + } + } + } + } + + // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out + // of bound access in segmentation_map + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + // If any of the blocks in the sequence failed then the MB + // goes in segment 0 + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { + ncnt[0]++; + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; + } else { + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1; + ncnt[1]++; + } + } + } + + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else + cpi->static_mb_pct = 0; + + vp9_enable_segmentation(&cm->seg); + + // Free localy allocated storage + vpx_free(arf_not_zz); +} + +void vp9_update_mbgraph_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int i, n_frames = vp9_lookahead_depth(cpi->lookahead); + YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + + assert(golden_ref != NULL); + + // we need to look ahead beyond where the ARF transitions into + // being a GF - so exit if we don't look ahead beyond that + if (n_frames <= cpi->rc.frames_till_gf_update_due) return; + + if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS; + + cpi->mbgraph_n_frames = n_frames; + for (i = 0; i < n_frames; i++) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + memset(frame_stats->mb_stats, 0, + cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats)); + } + + // do motion search to find contribution of each reference to data + // later on in this GF group + // FIXME really, the GF/last MC search should be done forward, and + // the ARF MC search backwards, to get optimal results for MV caching + for (i = 0; i < n_frames; i++) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i); + + assert(q_cur != NULL); + + update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref, + cpi->Source); + } + + vpx_clear_system_state(); + + separate_arf_mbs(cpi); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h new file mode 100644 index 0000000000..7b629861d5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_ +#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + struct { + int err; + union { + int_mv mv; + PREDICTION_MODE mode; + } m; + } ref[MAX_REF_FRAMES]; +} MBGRAPH_MB_STATS; + +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; + +struct VP9_COMP; + +void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c new file mode 100644 index 0000000000..cbe1c40290 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c @@ -0,0 +1,3035 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_reconinter.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_mcomp.h" + +// #define NEW_DIAMOND_SEARCH + +void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { + int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = VPXMAX(col_min, (MV_LOW >> 3) + 1); + row_min = VPXMAX(row_min, (MV_LOW >> 3) + 1); + col_max = VPXMIN(col_max, (MV_UPP >> 3) - 1); + row_max = VPXMIN(row_max, (MV_UPP >> 3) - 1); + + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. + if (mv_limits->col_min < col_min) mv_limits->col_min = col_min; + if (mv_limits->col_max > col_max) mv_limits->col_max = col_max; + if (mv_limits->row_min < row_min) mv_limits->row_min = row_min; + if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; +} + +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv) { + subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8, + ref_mv->col - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8, + ref_mv->col + MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8, + ref_mv->row - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8, + ref_mv->row + MAX_FULL_PEL_VAL * 8); + + subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min); + subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max); + subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min); + subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max); +} + +int vp9_init_search_range(int size) { + int sr = 0; + // Minimum search size no matter what the passed in value. + size = VPXMAX(16, size); + + while ((size << sr) < MAX_FULL_PEL_VAL) sr++; + + sr = VPXMIN(sr, MAX_MVSEARCH_STEPS - 2); + return sr; +} + +int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int weight) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7); +} + +#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int error_per_bit) { + if (mvcost) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + return (int)ROUND64_POWER_OF_TWO( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; +} +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { + int len; + int ss_count = 0; + + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { + // Generate offsets for 4 search sites per step. + const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } }; + int i; + for (i = 0; i < 4; ++i, ++ss_count) { + cfg->ss_mv[ss_count] = ss_mvs[i]; + cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col; + } + } + + cfg->searches_per_step = 4; + cfg->total_steps = ss_count / cfg->searches_per_step; +} + +void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { + int len; + int ss_count = 0; + + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { + // Generate offsets for 8 search sites per step. + const MV ss_mvs[8] = { { -len, 0 }, { len, 0 }, { 0, -len }, + { 0, len }, { -len, -len }, { -len, len }, + { len, -len }, { len, len } }; + int i; + for (i = 0; i < 8; ++i, ++ss_count) { + cfg->ss_mv[ss_count] = ss_mvs[i]; + cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col; + } + } + + cfg->searches_per_step = 8; + cfg->total_steps = ss_count / cfg->searches_per_step; +} + +// convert motion vector component to offset for sv[a]f calc +static INLINE int sp(int x) { return x & 7; } + +static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { + return &buf[(r >> 3) * stride + (c >> 3)]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + } else { \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + } \ + tmpmse = thismse; \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) + +#endif +#define FIRST_LEVEL_CHECKS \ + do { \ + unsigned int left, right, up, down, diag; \ + CHECK_BETTER(left, tr, tc - hstep); \ + CHECK_BETTER(right, tr, tc + hstep); \ + CHECK_BETTER(up, tr - hstep, tc); \ + CHECK_BETTER(down, tr + hstep, tc); \ + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \ + switch (whichdir) { \ + case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \ + case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \ + case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \ + case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \ + } \ + } while (0) + +#define SECOND_LEVEL_CHECKS \ + do { \ + int kr, kc; \ + unsigned int second; \ + if (tr != br && tc != bc) { \ + kr = br - tr; \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + kr, tc + 2 * kc); \ + CHECK_BETTER(second, tr + 2 * kr, tc + kc); \ + } else if (tr == br && tc != bc) { \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \ + CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \ + switch (whichdir) { \ + case 0: \ + case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \ + case 2: \ + case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \ + } \ + } else if (tr != br && tc == bc) { \ + kr = br - tr; \ + CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \ + CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \ + switch (whichdir) { \ + case 0: \ + case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \ + case 1: \ + case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \ + } \ + } \ + } while (0) + +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = UINT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr = ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + MvLimits subpel_mv_limits; \ + \ + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \ + minc = subpel_mv_limits.col_min; \ + maxc = subpel_mv_limits.col_max; \ + minr = subpel_mv_limits.row_min; \ + maxr = subpel_mv_limits.row_max; \ + \ + bestmv->row *= 8; \ + bestmv->col *= 8 + +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, const uint8_t *const y, + int y_stride, const uint8_t *second_pred, int w, int h, int offset, + int *mvjcost, int *mvcost[2], uint32_t *sse1, uint32_t *distortion) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + if (second_pred != NULL) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, CONVERT_TO_SHORTPTR(y + offset), y_stride); + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } + } else { + besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); + } + *distortion = (uint32_t)besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + if (besterr >= UINT_MAX) return UINT_MAX; + return (uint32_t)besterr; +#else + uint32_t besterr; + (void)xd; + if (second_pred != NULL) { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); +} + +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + SETUP_SUBPEL_SEARCH; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, second_pred, w, h, + offset, mvjcost, mvcost, sse1, distortion); + (void)halfiters; + (void)quarteriters; + (void)eighthiters; + (void)whichdir; + (void)allow_hp; + (void)forced_stop; + (void)hstep; + (void)rr; + (void)rc; + (void)minr; + (void)minc; + (void)maxr; + (void)maxc; + (void)tr; + (void)tc; + (void)sse; + (void)thismse; + (void)cost_list; + (void)use_accurate_subpel_search; + + return besterr; +} + +uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + SETUP_SUBPEL_SEARCH; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, second_pred, w, h, + offset, mvjcost, mvcost, sse1, distortion); + (void)halfiters; + (void)quarteriters; + (void)eighthiters; + (void)whichdir; + (void)allow_hp; + (void)forced_stop; + (void)hstep; + (void)use_accurate_subpel_search; + + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { + int ir, ic; + unsigned int minpt = INT_MAX; + get_cost_surf_min(cost_list, &ir, &ic, 2); + if (ir != 0 || ic != 0) { + CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + } + } + + tr = br; + tc = bc; + + if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +uint32_t vp9_find_best_sub_pixel_tree_pruned_more( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, second_pred, w, h, + offset, mvjcost, mvcost, sse1, distortion); + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { + unsigned int minpt; + int ir, ic; + get_cost_surf_min(cost_list, &ir, &ic, 1); + if (ir != 0 || ic != 0) { + CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep); + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + tr = br; + tc = bc; + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) { + tr = br; + tc = bc; + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +uint32_t vp9_find_best_sub_pixel_tree_pruned( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, second_pred, w, h, + offset, mvjcost, mvcost, sse1, distortion); + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX) { + unsigned int left, right, up, down, diag; + whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) + + (cost_list[2] < cost_list[4] ? 0 : 2); + switch (whichdir) { + case 0: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + + if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +/* clang-format off */ +static const MV search_step_table[12] = { + // left, right, up, down + { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 }, + { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 }, + { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 } +}; +/* clang-format on */ + +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. +#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) + +#endif + +uint32_t vp9_find_best_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + const uint8_t *const z = x->plane[0].src.buf; + const uint8_t *const src_address = z; + const int src_stride = x->plane[0].src.stride; + const MACROBLOCKD *xd = &x->e_mbd; + unsigned int besterr = UINT_MAX; + unsigned int sse; + int thismse; + const int y_stride = xd->plane[0].pre[0].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + const uint8_t *const y = xd->plane[0].pre[0].buf; + + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + int iter, round = 3 - forced_stop; + + int minc, maxc, minr, maxr; + int tr = br; + int tc = bc; + const MV *search_step = search_step_table; + int idx, best_idx = -1; + unsigned int cost_array[5]; + int kr, kc; + MvLimits subpel_mv_limits; + + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. + const InterpKernel *kernel = + (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : ((use_accurate_subpel_search == USE_8_TAPS) + ? vp9_filter_kernels[EIGHTTAP] + : vp9_filter_kernels[EIGHTTAP_SHARP])) + : vp9_filter_kernels[BILINEAR]; + + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + minc = subpel_mv_limits.col_min; + maxc = subpel_mv_limits.col_max; + minr = subpel_mv_limits.row_min; + maxr = subpel_mv_limits.row_max; + + if (!(allow_hp && use_mv_hp(ref_mv))) + if (round == 3) round = 2; + + bestmv->row *= 8; + bestmv->col *= 8; + + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, second_pred, w, h, + offset, mvjcost, mvcost, sse1, distortion); + + (void)cost_list; // to silence compiler warning + + for (iter = 0; iter < round; ++iter) { + // Check vertical and horizontal sub-pixel positions. + for (idx = 0; idx < 4; ++idx) { + tr = br + search_step[idx].row; + tc = bc + search_step[idx].col; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv; + this_mv.row = tr; + this_mv.col = tc; + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, + mvcost, error_per_bit); + + if (cost_array[idx] < besterr) { + best_idx = idx; + besterr = cost_array[idx]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = UINT_MAX; + } + } + + // Check diagonal sub-pixel position + kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep); + kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep); + + tc = bc + kc; + tr = br + kr; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); + + if (cost_array[4] < besterr) { + best_idx = 4; + besterr = cost_array[4]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = UINT_MAX; + } + + if (best_idx < 4 && best_idx >= 0) { + br += search_step[best_idx].row; + bc += search_step[best_idx].col; + } else if (best_idx == 4) { + br = tr; + bc = tc; + } + + if (iters_per_step > 0 && best_idx != -1) { + unsigned int second; + const int br0 = br; + const int bc0 = bc; + assert(tr == br || tc == bc); + + if (tr == br && tc != bc) { + kc = bc - tc; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } + } + } else if (tr != br && tc == bc) { + kr = br - tr; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } + } + } + + if (iters_per_step > 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER(second, br0 + kr, bc0 + kc); + } + } + } + } + + search_step += 4; + hstep >>= 1; + best_idx = -1; + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +#undef CHECK_BETTER +#undef CHECK_BETTER1 + +static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { + return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) && + (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max); +} + +#define CHECK_BETTER \ + { \ + if (thissad < bestsad) { \ + if (use_mvcost) \ + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \ + if (thissad < bestsad) { \ + bestsad = thissad; \ + best_site = i; \ + } \ + } \ + } + +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Calculate and return a sad+mvcost list around an integer best pel. +static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, + int sadpb, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *best_mv, int *cost_list) { + static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0]; + const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int br = best_mv->row; + int bc = best_mv->col; + const MV mv = { br, bc }; + int i; + unsigned int sse; + + cost_list[0] = + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride, &sse) + + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); + if (check_bounds(&x->mv_limits, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride, &sse) + + mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, + x->mvcost, x->errorperbit); + } + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) + cost_list[i + 1] = INT_MAX; + else + cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride, &sse) + + mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, + x->mvcost, x->errorperbit); + } + } +} + +// Generic pattern search function that searches over multiple scales. +// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +// +static int vp9_pattern_search( + const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { + const MACROBLOCKD *const xd = &x->e_mbd; + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + int br, bc; + int bestsad = INT_MAX; + int thissad; + int k = -1; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + int best_init_s = search_param_to_steps[search_param]; + // adjust ref_mv to make sure it is within MV range + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + br = ref_mv->row; + bc = ref_mv->col; + + // Work out the start point for the search + bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(&x->mv_limits, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + int best_site = -1; + s = best_init_s; + + do { + // No need to search all 6 points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } while (s--); + } + + best_mv->row = br; + best_mv->col = bc; + + // Returns the one-away integer pel sad values around the best as follows: + // cost_list[0]: cost at the best integer pel + // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel + // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel + // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel + // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel + if (cost_list) { + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); + } + return bestsad; +} + +// A specialized function where the smallest scale search candidates +// are 4 1-away neighbors, and cost_list is non-null +// TODO(debargha): Merge this function with the one above. Also remove +// use_mvcost option since it is always 1, to save unnecessary branches. +static int vp9_pattern_search_sad( + const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { + const MACROBLOCKD *const xd = &x->e_mbd; + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + int br, bc; + int bestsad = INT_MAX; + int thissad; + int k = -1; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + int best_init_s = search_param_to_steps[search_param]; + // adjust ref_mv to make sure it is within MV range + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + br = ref_mv->row; + bc = ref_mv->col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + + // Work out the start point for the search + bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(&x->mv_limits, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + int do_sad = (num_candidates[0] == 4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= do_sad; s--) { + if (!do_init_search || s != best_init_s) { + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } + + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = bestsad; + if (!do_init_search || s != best_init_s) { + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + cost_list[i + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + cost_list[i + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; + cost_list[((k + 2) % 4) + 1] = cost_list[0]; + cost_list[0] = bestsad; + + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + cost_list[next_chkpts_indices[i] + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!is_mv_in(&x->mv_limits, &this_mv)) { + cost_list[next_chkpts_indices[i] + 1] = INT_MAX; + continue; + } + cost_list[next_chkpts_indices[i] + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } + } + } + + // Returns the one-away integer pel sad values around the best as follows: + // cost_list[0]: sad at the best integer pel + // cost_list[1]: sad at delta {0, -1} (left) from the best integer pel + // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel + // cost_list[3]: sad at delta { 0, 1} (right) from the best integer pel + // cost_list[4]: sad at delta {-1, 0} (top) from the best integer pel + if (cost_list) { + static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; + if (cost_list[0] == INT_MAX) { + cost_list[0] = bestsad; + if (check_bounds(&x->mv_limits, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + cost_list[i + 1] = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + } + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) + cost_list[i + 1] = INT_MAX; + else + cost_list[i + 1] = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + } + } + } else { + if (use_mvcost) { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (cost_list[i + 1] != INT_MAX) { + cost_list[i + 1] += + mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + } + } + } + } + } + best_mv->row = br; + best_mv->col = bc; + return bestsad; +} + +int vp9_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + uint32_t unused; +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t err = + vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, &unused); + err += (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); + if (err >= INT_MAX) return INT_MAX; + return (int)err; +#else + return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, &unused) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +#endif +} + +int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} + +static int hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv, MV *best_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, + { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 }, + { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 }, + { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 }, + { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } } + }; + /* clang-format on */ + return vp9_pattern_search( + x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp, + use_mvcost, center_mv, best_mv, hex_num_candidates, hex_candidates); +} + +static int bigdia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv, MV *best_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 }, + { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 }, + { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } } + }; + /* clang-format on */ + return vp9_pattern_search_sad( + x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp, + use_mvcost, center_mv, best_mv, bigdia_num_candidates, bigdia_candidates); +} + +static int square_search(const MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv, MV *best_mv) { + // All scales have 8 closest points in square shape + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 }, + { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 }, + { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 }, + { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } } + }; + /* clang-format on */ + return vp9_pattern_search( + x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp, + use_mvcost, center_mv, best_mv, square_num_candidates, square_candidates); +} + +static int fast_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, + int do_init_search, // must be zero for fast_hex + int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv) { + return hex_search(x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, cost_list, vfp, use_mvcost, + center_mv, best_mv); +} + +static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv, MV *best_mv) { + return bigdia_search(x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, cost_list, vfp, use_mvcost, + center_mv, best_mv); +} + +#undef CHECK_BETTER + +// Exhuastive motion search around a given centre position with a given +// step size. +static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, + int range, int step, int sad_per_bit, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + MV fcenter_mv = { center_mv->row, center_mv->col }; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *best_mv = fcenter_mv; + best_sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) + + mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit); + start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row); + start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col); + end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row); + end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. + if (step > 1) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c }; + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + const unsigned int sad = + sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + } + } + } + + return best_sad; +} + +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +#if CONFIG_NON_GREEDY_MV +static int64_t exhaustive_mesh_search_multi_step( + MV *best_mv, const MV *center_mv, int range, int step, + const struct buf_2d *src, const struct buf_2d *pre, int lambda, + const int_mv *nb_full_mvs, int full_mv_num, const MvLimits *mv_limits, + const vp9_variance_fn_ptr_t *fn_ptr) { + int64_t best_sad; + int r, c; + int start_col, end_col, start_row, end_row; + *best_mv = *center_mv; + best_sad = + ((int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, center_mv), pre->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(center_mv->row - range, mv_limits->row_min); + start_col = VPXMAX(center_mv->col - range, mv_limits->col_min); + end_row = VPXMIN(center_mv->row + range, mv_limits->row_max); + end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += step) { + const MV mv = { r, c }; + int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, &mv), pre->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + return best_sad; +} + +static int64_t exhaustive_mesh_search_single_step( + MV *best_mv, const MV *center_mv, int range, const struct buf_2d *src, + const struct buf_2d *pre, int lambda, const int_mv *nb_full_mvs, + int full_mv_num, const MvLimits *mv_limits, + const vp9_variance_fn_ptr_t *fn_ptr) { + int64_t best_sad; + int r, c, i; + int start_col, end_col, start_row, end_row; + + *best_mv = *center_mv; + best_sad = + ((int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, center_mv), pre->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(center_mv->row - range, mv_limits->row_min); + start_col = VPXMAX(center_mv->col - range, mv_limits->col_min); + end_row = VPXMIN(center_mv->row + range, mv_limits->row_max); + end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); + for (r = start_row; r <= end_row; r += 1) { + c = start_col; + while (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { r, c + i }; + addrs[i] = get_buf_from_mv(pre, &mv); + } + fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads); + + for (i = 0; i < 4; ++i) { + int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; + if (sad < best_sad) { + const MV mv = { r, c + i }; + sad += + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + c += 4; + } + while (c <= end_col) { + const MV mv = { r, c }; + int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, &mv), pre->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + c += 1; + } + } + return best_sad; +} + +static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv, + int range, int step, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *src = &x->plane[0].src; + const struct buf_2d *pre = &xd->plane[0].pre[0]; + assert(step >= 1); + assert(is_mv_in(&x->mv_limits, center_mv)); + if (step == 1) { + return exhaustive_mesh_search_single_step( + best_mv, center_mv, range, src, pre, lambda, nb_full_mvs, full_mv_num, + &x->mv_limits, fn_ptr); + } + return exhaustive_mesh_search_multi_step(best_mv, center_mv, range, step, src, + pre, lambda, nb_full_mvs, + full_mv_num, &x->mv_limits, fn_ptr); +} + +static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, + const vp9_variance_fn_ptr_t *fn_ptr, + MV *dst_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + int64_t bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) { + printf("ERROR: invalid range\n"); + assert(0); + } + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = + exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv, + lambda, nb_full_mvs, full_mv_num); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhaustive_mesh_search_new( + x, &temp_mv, sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs, + full_mv_num); + + if (sf->mesh_patterns[i].interval == 1) break; + } + } + + *dst_mv = temp_mv; + + return bestsme; +} + +static int64_t diamond_search_sad_new(const MACROBLOCK *x, + const search_site_config *cfg, + const MV *init_full_mv, MV *best_full_mv, + int search_param, int lambda, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, + int full_mv_num) { + int i, j, step; + + const MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; + + int64_t bestsad; + int best_site = -1; + int last_site = -1; + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step]; + const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step]; + const int tot_steps = cfg->total_steps - search_param; + vpx_clear_system_state(); + + *best_full_mv = *init_full_mv; + clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *num00 = 0; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride + + best_full_mv->col; + best_address = in_what; + + // Check the starting position + { + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + bestsad = mv_dist + lambda * mv_cost; + } + + i = 0; + + for (step = 0; step < tot_steps; step++) { + int all_in = 1, t; + + // All_in is true if every one of the points we are checking are within + // the bounds of the image. + all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min); + all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max); + all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min); + all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max); + + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity.. + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + const int64_t mv_dist = (int64_t)sad_array[t] << LOG2_PRECISION; + if (mv_dist < bestsad) { + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what, what_stride, check_here, + in_what_stride) + << LOG2_PRECISION; + if (mv_dist < bestsad) { + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + best_full_mv->row += ss_mv[best_site].row; + best_full_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, + int mi_col, int_mv *nb_full_mvs) { + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int nb_full_mv_num = 0; + int i; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0]; + int c = dirs[i][1]; + int brow = mi_row / mi_height + r; + int bcol = mi_col / mi_width + c; + if (brow >= 0 && brow < motion_field->block_rows && bcol >= 0 && + bcol < motion_field->block_cols) { + if (vp9_motion_field_is_mv_set(motion_field, brow, bcol)) { + int_mv mv = vp9_motion_field_get_mv(motion_field, brow, bcol); + nb_full_mvs[nb_full_mv_num].as_mv = get_full_mv(&mv.as_mv); + ++nb_full_mv_num; + } + } + } + return nb_full_mv_num; +} +#endif // CONFIG_NON_GREEDY_MV + +int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, + const MV *center_mv) { + int i, j, step; + + const MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; + + unsigned int bestsad = start_mv_sad; + int best_site = -1; + int last_site = -1; + + int ref_row; + int ref_col; + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step]; + const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step]; + const int tot_steps = cfg->total_steps - search_param; + + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + ref_row = ref_mv->row; + ref_col = ref_mv->col; + *num00 = 0; + best_mv->row = ref_row; + best_mv->col = ref_col; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + best_address = in_what; + + i = 0; + + for (step = 0; step < tot_steps; step++) { + int all_in = 1, t; + + // All_in is true if every one of the points we are checking are within + // the bounds of the image. + all_in &= ((best_mv->row + ss_mv[i].row) > x->mv_limits.row_min); + all_in &= ((best_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max); + all_in &= ((best_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min); + all_in &= ((best_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max); + + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity.. + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + if (sad_array[t] < bestsad) { + const MV this_mv = { best_mv->row + ss_mv[i].row, + best_mv->col + ss_mv[i].col }; + sad_array[t] += + mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (sad_array[t] < bestsad) { + bestsad = sad_array[t]; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_mv->row + ss_mv[i].row, + best_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + unsigned int thissad = + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + best_mv->row += ss_mv[best_site].row; + best_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + const MV this_mv = { best_mv->row + ss_mv[best_site].row, + best_mv->col + ss_mv[best_site].col }; + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[best_site] + best_address; + unsigned int thissad = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + if (thissad < bestsad) { + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->row += ss_mv[best_site].row; + best_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + continue; + } + } + } + break; + } +#endif + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = 4 << bwl; // redundant variable, to be changed in the experiments. + for (d = 0; d <= bw; d += 16) { + this_sad = vpx_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + + return (center - (bw >> 1)); +} + +static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, +}; + +unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; + DECLARE_ALIGNED(16, int16_t, hbuf[128]); + DECLARE_ALIGNED(16, int16_t, vbuf[128]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[64]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[64]); + int idx; + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + MV *tmp_mv = &xd->mi[0]->mv[0].as_mv; + unsigned int best_sad, tmp_sad, this_sad[4]; + MV this_mv; + const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + MvLimits subpel_mv_limits; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + +#if CONFIG_VP9_HIGHBITDEPTH + // TODO(jingning): Implement integral projection functions for high bit-depth + // setting and remove this part of code. + if (xd->bd != 8) { + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); + tmp_mv->row = 0; + tmp_mv->col = 0; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return sad; + } +#endif + + // Set up prediction 1-D reference set + ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); + for (idx = 0; idx < search_width; idx += 16) { + vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + ref_buf += 16; + } + + ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; + for (idx = 0; idx < search_height; ++idx) { + vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor; + ref_buf += ref_stride; + } + + // Set up src 1-D reference set + for (idx = 0; idx < bw; idx += 16) { + src_buf = x->plane[0].src.buf + idx; + vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + } + + src_buf = x->plane[0].src.buf; + for (idx = 0; idx < bh; ++idx) { + src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor; + src_buf += src_stride; + } + + // Find the best match per 1-D search + tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]); + tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]); + + this_mv = *tmp_mv; + src_buf = x->plane[0].src.buf; + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + } + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + tmp_mv->row = search_pos[idx].row + this_mv.row; + tmp_mv->col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + + tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + *tmp_mv = this_mv; + best_sad = tmp_sad; + } + + tmp_mv->row *= 8; + tmp_mv->col *= 8; + + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + return best_sad; +} + +static int get_exhaustive_threshold(int exhaustive_searches_thresh, + BLOCK_SIZE bsize) { + return exhaustive_searches_thresh >> + (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); +} + +#if CONFIG_NON_GREEDY_MV +// Runs sequence of diamond searches in smaller steps for RD. +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int lambda, int do_refine, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv) { + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const SPEED_FEATURES *const sf = &cpi->sf; + int n, num00 = 0; + int thissme; + int bestsme; + const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + const MV center_mv = { 0, 0 }; + vpx_clear_system_state(); + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, lambda, + &n, fn_ptr, nb_full_mvs, full_mv_num); + + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + if (num00) { + num00--; + } else { + MV temp_mv; + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param + n, lambda, &num00, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV temp_mv = *best_mv; + vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize); + if (bestsme > exhaustive_thr) { + full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda, + nb_full_mvs, full_mv_num); + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + } + } + return bestsme; +} +#endif // CONFIG_NON_GREEDY_MV + +// Runs sequence of diamond searches in smaller steps for RD. +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. + const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); + if (bestsme < INT_MAX) + bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + *dst_mv = temp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + &sad_fn_ptr, ref_mv); + if (thissme < INT_MAX) + thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); + if (thissme < INT_MAX) + thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = best_mv; + } + } + + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + + // Return cost list. + if (cost_list) { + calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); + } + return bestsme; +} + +// Runs an limited range exhaustive mesh search using a pattern set +// according to the encode speed profile. +static int full_pixel_exhaustive(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *centre_mv_full, + int sadpb, int *cost_list, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) + return INT_MAX; + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, + sadpb, fn_ptr, &temp_mv); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhaustive_mesh_search( + x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv); + + if (sf->mesh_patterns[i].interval == 1) break; + } + } + + if (bestsme < INT_MAX) + bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + *dst_mv = temp_mv; + + // Return cost list. + if (cost_list) { + calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); + } + return bestsme; +} + +#if CONFIG_NON_GREEDY_MV +int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv); + int64_t best_sad; + int i, j; + vpx_clear_system_state(); + { + const int64_t mv_dist = (int64_t)fn_ptr->sdf(what->buf, what->stride, + best_address, in_what->stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + best_sad = mv_dist + lambda * mv_cost; + } + + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) & + ((best_full_mv->row + 1) < x->mv_limits.row_max) & + ((best_full_mv->col - 1) > x->mv_limits.col_min) & + ((best_full_mv->col + 1) < x->mv_limits.col_max); + + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; + + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + const int64_t mv_dist = (int64_t)sads[j] << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), + in_what->stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_full_mv->row += neighbors[best_site].row; + best_full_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, best_full_mv); + } + } + + return best_sad; +} +#endif // CONFIG_NON_GREEDY_MV + +int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, + int search_range, + const vp9_sad_fn_ptr_t *sad_fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); + unsigned int best_sad = + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); + int i, j; + + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) & + ((ref_mv->row + 1) < x->mv_limits.row_max) & + ((ref_mv->col - 1) > x->mv_limits.col_min) & + ((ref_mv->col + 1) < x->mv_limits.col_max); + + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; + + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); + + for (j = 0; j < 4; ++j) { + if (sads[j] < best_sad) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sads[j] < best_sad) { + best_sad = sads[j]; + best_site = j; + } + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, ref_mv); + } + } + + return best_sad; +} + +// This function is called when we do joint motion search in comp_inter_inter +// mode. +int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, const uint8_t *second_pred) { + const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + unsigned int best_sad = INT_MAX; + int i, j; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = + fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride, second_pred) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); + + for (i = 0; i < search_range; ++i) { + int best_site = -1; + + for (j = 0; j < 8; ++j) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = + fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride, second_pred); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int search_method, int error_per_bit, int *cost_list, + const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = (SEARCH_METHODS)search_method; + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + int var = 0; + int run_exhaustive_search = 0; + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + switch (method) { + case FAST_DIAMOND: + var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + cost_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case FAST_HEX: + var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + cost_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case HEX: + var = hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case SQUARE: + var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case BIGDIA: + var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case NSTEP: + case MESH: + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); + break; + default: assert(0 && "Unknown search method"); + } + + if (method == NSTEP) { + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize); + if (var > exhaustive_thr) { + run_exhaustive_search = 1; + } + } + } else if (method == MESH) { + run_exhaustive_search = 1; + } + + if (run_exhaustive_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list, + fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } + } + + if (method != NSTEP && method != MESH && rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); + + return var; +} + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)z; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)allow_hp; \ + (void)forced_stop; \ + (void)hstep; \ + (void)rr; \ + (void)rc; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ + (void)cost_list; \ + (void)use_accurate_subpel_search + +// Return the maximum MV. +uint32_t vp9_return_max_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + COMMON_MV_TEST; + + (void)minr; + (void)minc; + + bestmv->row = maxr; + bestmv->col = maxc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} +// Return the minimum MV. +uint32_t vp9_return_min_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + COMMON_MV_TEST; + + (void)maxr; + (void)maxc; + + bestmv->row = minr; + bestmv->col = minc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h new file mode 100644 index 0000000000..fd6a8b9aca --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_ +#define VPX_VP9_ENCODER_VP9_MCOMP_H_ + +#include "vp9/encoder/vp9_block.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_non_greedy_mv.h" +#endif // CONFIG_NON_GREEDY_MV +#include "vpx_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) +// Allowed motion vector pixel distance outside image border +// for Block_16x16 +#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) + +typedef struct search_site_config { + // motion search sites + MV ss_mv[8 * MAX_MVSEARCH_STEPS]; // Motion vector + intptr_t ss_os[8 * MAX_MVSEARCH_STEPS]; // Offset + int searches_per_step; + int total_steps; +} search_site_config; + +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); +void vp9_init3smotion_compensation(search_site_config *cfg, int stride); + +void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv); +int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int weight); + +// Utility to compute variance + MV rate cost for a given MV +int vp9_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost); +int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const vp9_variance_fn_ptr_t *vfp, int use_mvcost); + +struct VP9_COMP; +struct SPEED_FEATURES; +struct vp9_sad_table; + +int vp9_init_search_range(int size); + +int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, + int error_per_bit, int search_range, + const struct vp9_sad_table *sad_fn_ptr, + const struct mv *center_mv); + +// Perform integral projection based motion estimation. +unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, + const MV *ref_mv); + +typedef uint32_t(fractional_mv_step_fp)( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search); + +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore; +extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; +extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; +extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; + +typedef int (*vp9_diamond_search_fn_t)( + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); + +int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, const uint8_t *second_pred); + +struct VP9_COMP; + +// "mvp_full" is the MV search starting point; +// "ref_mv" is the context reference MV; +// "tmp_mv" is the searched best MV. +int vp9_full_pixel_search(const struct VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd); + +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv); + +#if CONFIG_NON_GREEDY_MV +struct TplDepStats; +int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num); + +int vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int lambda, int do_refine, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv); + +static INLINE MV get_full_mv(const MV *mv) { + MV out_mv; + out_mv.row = mv->row >> 3; + out_mv.col = mv->col >> 3; + return out_mv; +} +struct TplDepFrame; +int vp9_prepare_nb_full_mvs(const struct MotionField *motion_field, int mi_row, + int mi_col, int_mv *nb_full_mvs); + +static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) { + BLOCK_SIZE square_bsize; + switch (bsize) { + case BLOCK_4X4: + case BLOCK_4X8: + case BLOCK_8X4: square_bsize = BLOCK_4X4; break; + case BLOCK_8X8: + case BLOCK_8X16: + case BLOCK_16X8: square_bsize = BLOCK_8X8; break; + case BLOCK_16X16: + case BLOCK_16X32: + case BLOCK_32X16: square_bsize = BLOCK_16X16; break; + case BLOCK_32X32: + case BLOCK_32X64: + case BLOCK_64X32: + case BLOCK_64X64: square_bsize = BLOCK_32X32; break; + default: + square_bsize = BLOCK_INVALID; + assert(0 && "ERROR: invalid block size"); + break; + } + return square_bsize; +} +#endif // CONFIG_NON_GREEDY_MV +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c new file mode 100644 index 0000000000..0843cd97e4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hdl = NULL; + void *next = NULL; + JobNode *job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_handle = NULL; +#endif + + row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]); + job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex_handle = &row_mt_info->job_mutex; +#endif + +// lock the mutex for queue access +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex_handle); +#endif + next = job_queue_hdl->next; + if (next != NULL) { + JobQueue *job_queue = (JobQueue *)next; + job_info = &job_queue->job_info; + // Update the next job in the queue + job_queue_hdl->next = job_queue->next; + job_queue_hdl->num_jobs_acquired++; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex_handle); +#endif + + return job_info; +} + +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile) { + VP9_COMMON *const cm = &cpi->common; + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + + CHECK_MEM_ERROR( + &cm->error, this_tile->row_base_thresh_freq_fact, + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact)))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; +} + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { + struct VP9Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int jobs_per_tile_col, total_jobs; + + // Allocate memory that is large enough for all row_mt stages. First pass + // uses 16x16 block size. + jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); + // Calculate the total number of jobs + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; + + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); + +#if CONFIG_MULTITHREAD + // Create mutex for each tile + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_init(&row_mt_info->job_mutex, NULL); + } +#endif + + // Allocate memory for row based multi-threading + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + if (cpi->sf.adaptive_rd_thresh_row_mt) { + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); + } + } + + // Assign the sync pointer of tile row zero for every tile row > 0 + for (tile_row = 1; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileDataEnc *this_col_tile = &cpi->tile_data[tile_col]; + this_tile->row_mt_sync = this_col_tile->row_mt_sync; + } + } + + // Calculate the number of vertical units in the given tile row + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols]; + TileInfo *tile_info = &this_tile->tile_info; + multi_thread_ctxt->num_tile_vert_sbs[tile_row] = + get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } +} + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif + + // Deallocate memory for job queue + if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); + +#if CONFIG_MULTITHREAD + // Destroy mutex for each tile + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex); + } +#endif + + // Free row based multi-threading sync memory + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + } + } +#endif +} + +void vp9_multi_thread_tile_init(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + for (i = 0; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col); + vp9_zero(this_tile->fp_data); + this_tile->fp_data.image_data_start_row = INVALID_ROW; + } +} + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers) { + int tile_id = 0; + int i; + + // Allocating the threads for the tiles + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == tile_cols) tile_id = 0; + } +} + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hndl; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; +#endif + int num_jobs_remaining; + + row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id]; + job_queue_hndl = &row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex = &row_mt_info->job_mutex; +#endif + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex); +#endif + num_jobs_remaining = + multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex); +#endif + + return (num_jobs_remaining); +} + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { + VP9_COMMON *const cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + JobQueue *job_queue = multi_thread_ctxt->job_queue; + const int tile_cols = 1 << cm->log2_tile_cols; + int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int tile_col, i; + + switch (job_type) { + case ENCODE_JOB: jobs_per_tile_col = sb_rows; break; + case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break; + case ARNR_JOB: + jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT); + break; + default: assert(0); + } + + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; + // memset the entire job queue buffer to zero + memset(job_queue, 0, total_jobs * sizeof(JobQueue)); + + // Job queue preparation + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col]; + JobQueue *job_queue_curr, *job_queue_temp; + int tile_row = 0; + + tile_ctxt->job_queue_hdl.next = (void *)job_queue; + tile_ctxt->job_queue_hdl.num_jobs_acquired = 0; + + job_queue_curr = job_queue; + job_queue_temp = job_queue; + + // loop over all the vertical rows + for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col; + job_row_num++, jobs_per_tile++) { + job_queue_curr->job_info.vert_unit_row_num = job_row_num; + job_queue_curr->job_info.tile_col_id = tile_col; + job_queue_curr->job_info.tile_row_id = tile_row; + job_queue_curr->next = (void *)(job_queue_temp + 1); + job_queue_curr = ++job_queue_temp; + + if (ENCODE_JOB == job_type) { + if (jobs_per_tile >= + multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) { + tile_row++; + jobs_per_tile = -1; + } + } + } + + // Set the last pointer to NULL + job_queue_curr += -1; + job_queue_curr->next = (void *)NULL; + + // Move to the next tile + job_queue += jobs_per_tile_col; + } + + for (i = 0; i < cpi->num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + thread_data->thread_id = i; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) + thread_data->tile_completion_status[tile_col] = 0; + } +} + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols) { + int tile_col; + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_num_jobs_remaining = 0; + int num_jobs_remaining; + + // Mark the completion to avoid check in the loop + tile_completion_status[*cur_tile_id] = 1; + // Check for the status of all the tiles + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + if (tile_completion_status[tile_col] == 0) { + num_jobs_remaining = + vp9_get_job_queue_status(multi_thread_ctxt, tile_col); + // Mark the completion to avoid checks during future switches across tiles + if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1; + if (num_jobs_remaining > max_num_jobs_remaining) { + max_num_jobs_remaining = num_jobs_remaining; + tile_id = tile_col; + } + } + } + + if (-1 == tile_id) { + return 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + return 0; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h new file mode 100644 index 0000000000..a2276f4fe6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ +#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_job_queue.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id); + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type); + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id); + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers); + +void vp9_multi_thread_tile_init(VP9_COMP *cpi); + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi); + +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile); + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols); + +#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c new file mode 100644 index 0000000000..4ee6e51ba8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_noise_estimate.h" +#include "vp9/encoder/vp9_encoder.h" + +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + +void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { + ne->enabled = 0; + ne->level = (width * height < 1280 * 720) ? kLowLow : kLow; + ne->value = 0; + ne->count = 0; + ne->thresh = 90; + ne->last_w = 0; + ne->last_h = 0; + if (width * height >= 1920 * 1080) { + ne->thresh = 200; + } else if (width * height >= 1280 * 720) { + ne->thresh = 140; + } else if (width * height >= 640 * 360) { + ne->thresh = 115; + } + ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; +} + +static int enable_noise_estimation(VP9_COMP *const cpi) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif +// Enable noise estimation if denoising is on. +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) + return 1; +#endif + // Only allow noise estimate under certain encoding mode. + // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. + // Not enabled for SVC mode and screen_content_mode. + // Not enabled for low resolutions. + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && + cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) + return 1; + else + return 0; +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + int r; + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} +#endif // CONFIG_VP9_TEMPORAL_DENOISING + +NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { + int noise_level = kLowLow; + if (ne->value > (ne->thresh << 1)) { + noise_level = kHigh; + } else { + if (ne->value > ne->thresh) + noise_level = kMedium; + else if (ne->value > (ne->thresh >> 1)) + noise_level = kLow; + else + noise_level = kLowLow; + } + return noise_level; +} + +void vp9_update_noise_estimate(VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); + // Estimate of noise level every frame_period frames. + int frame_period = 8; + int thresh_consec_zeromv = 6; + int frame_counter = cm->current_video_frame; + // Estimate is between current source and last source. + YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. + if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; + } + } +#endif + ne->enabled = enable_noise_estimation(cpi); + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->Source); +#endif + if (last_source != NULL) { + ne->last_w = cm->width; + ne->last_h = cm->height; + } + return; + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { + // Force noise estimation to 0 and denoiser off if content has high motion. + ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { + vp9_denoiser_set_noise_level(cpi, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->Source); + } +#endif + return; + } else { + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; + int bsize = BLOCK_16X16; + // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have + // been encoded as zero/small mv at least x consecutive frames, compute + // the variance to update estimate of noise in the source. + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + const uint8_t *last_src_y = last_source->y_buffer; + const int last_src_ystride = last_source->y_stride; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_uvstride = cpi->Source->uv_stride; + int mi_row, mi_col; + int num_low_motion = 0; + int frame_low_motion = 1; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + int bl_index = mi_row * cm->mi_cols + mi_col; + if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv) + num_low_motion++; + } + } + if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3)) + frame_low_motion = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + // 16x16 blocks, 1/4 sample of frame. + if (mi_row % 4 == 0 && mi_col % 4 == 0 && mi_row < cm->mi_rows - 1 && + mi_col < cm->mi_cols - 1) { + int bl_index = mi_row * cm->mi_cols + mi_col; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + int consec_zeromv = + VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + // Only consider blocks that are likely steady background. i.e., have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance between co-located blocks from current and + // last input frames. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail + } + } + } + src_y += 8; + last_src_y += 8; + src_u += 4; + src_v += 4; + } + src_y += (src_ystride << 3) - (cm->mi_cols << 3); + last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3); + src_u += (src_uvstride << 2) - (cm->mi_cols << 2); + src_v += (src_uvstride << 2) - (cm->mi_cols << 2); + } + ne->last_w = cm->width; + ne->last_h = cm->height; + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; + } + } + + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { + ne->count++; + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = vp9_noise_estimate_extract_level(ne); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + vp9_denoiser_set_noise_level(cpi, ne->level); +#endif + } + } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->Source); +#endif +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h new file mode 100644 index 0000000000..7fc94ff8c9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ +#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_skin_detection.h" +#include "vpx_scale/yv12config.h" + +#if CONFIG_VP9_TEMPORAL_DENOISING +#include "vp9/encoder/vp9_denoiser.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_VAR_HIST_BINS 20 + +typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; + +typedef struct noise_estimate { + int enabled; + NOISE_LEVEL level; + int value; + int thresh; + int adapt_thresh; + int count; + int last_w; + int last_h; + int num_frames_estimate; +} NOISE_ESTIMATE; + +struct VP9_COMP; + +void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height); + +NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne); + +void vp9_update_noise_estimate(struct VP9_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c new file mode 100644 index 0000000000..d52801c845 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_mv.h" +#include "vp9/encoder/vp9_non_greedy_mv.h" +// TODO(angiebird): move non_greedy_mv related functions to this file + +#define LOG2_TABLE_SIZE 1024 +static const int log2_table[LOG2_TABLE_SIZE] = { + 0, // This is a dummy value + 0, 1048576, 1661954, 2097152, 2434718, 2710530, 2943725, + 3145728, 3323907, 3483294, 3627477, 3759106, 3880192, 3992301, + 4096672, 4194304, 4286015, 4372483, 4454275, 4531870, 4605679, + 4676053, 4743299, 4807682, 4869436, 4928768, 4985861, 5040877, + 5093962, 5145248, 5194851, 5242880, 5289431, 5334591, 5378443, + 5421059, 5462508, 5502851, 5542146, 5580446, 5617800, 5654255, + 5689851, 5724629, 5758625, 5791875, 5824409, 5856258, 5887450, + 5918012, 5947969, 5977344, 6006160, 6034437, 6062195, 6089453, + 6116228, 6142538, 6168398, 6193824, 6218829, 6243427, 6267632, + 6291456, 6314910, 6338007, 6360756, 6383167, 6405252, 6427019, + 6448477, 6469635, 6490501, 6511084, 6531390, 6551427, 6571202, + 6590722, 6609993, 6629022, 6647815, 6666376, 6684713, 6702831, + 6720734, 6738427, 6755916, 6773205, 6790299, 6807201, 6823917, + 6840451, 6856805, 6872985, 6888993, 6904834, 6920510, 6936026, + 6951384, 6966588, 6981641, 6996545, 7011304, 7025920, 7040397, + 7054736, 7068940, 7083013, 7096956, 7110771, 7124461, 7138029, + 7151476, 7164804, 7178017, 7191114, 7204100, 7216974, 7229740, + 7242400, 7254954, 7267405, 7279754, 7292003, 7304154, 7316208, + 7328167, 7340032, 7351805, 7363486, 7375079, 7386583, 7398000, + 7409332, 7420579, 7431743, 7442826, 7453828, 7464751, 7475595, + 7486362, 7497053, 7507669, 7518211, 7528680, 7539077, 7549404, + 7559660, 7569847, 7579966, 7590017, 7600003, 7609923, 7619778, + 7629569, 7639298, 7648964, 7658569, 7668114, 7677598, 7687023, + 7696391, 7705700, 7714952, 7724149, 7733289, 7742375, 7751407, + 7760385, 7769310, 7778182, 7787003, 7795773, 7804492, 7813161, + 7821781, 7830352, 7838875, 7847350, 7855777, 7864158, 7872493, + 7880782, 7889027, 7897226, 7905381, 7913492, 7921561, 7929586, + 7937569, 7945510, 7953410, 7961268, 7969086, 7976864, 7984602, + 7992301, 7999960, 8007581, 8015164, 8022709, 8030217, 8037687, + 8045121, 8052519, 8059880, 8067206, 8074496, 8081752, 8088973, + 8096159, 8103312, 8110431, 8117516, 8124569, 8131589, 8138576, + 8145532, 8152455, 8159347, 8166208, 8173037, 8179836, 8186605, + 8193343, 8200052, 8206731, 8213380, 8220001, 8226593, 8233156, + 8239690, 8246197, 8252676, 8259127, 8265550, 8271947, 8278316, + 8284659, 8290976, 8297266, 8303530, 8309768, 8315981, 8322168, + 8328330, 8334467, 8340579, 8346667, 8352730, 8358769, 8364784, + 8370775, 8376743, 8382687, 8388608, 8394506, 8400381, 8406233, + 8412062, 8417870, 8423655, 8429418, 8435159, 8440878, 8446576, + 8452252, 8457908, 8463542, 8469155, 8474748, 8480319, 8485871, + 8491402, 8496913, 8502404, 8507875, 8513327, 8518759, 8524171, + 8529564, 8534938, 8540293, 8545629, 8550947, 8556245, 8561525, + 8566787, 8572031, 8577256, 8582464, 8587653, 8592825, 8597980, + 8603116, 8608236, 8613338, 8618423, 8623491, 8628542, 8633576, + 8638593, 8643594, 8648579, 8653547, 8658499, 8663434, 8668354, + 8673258, 8678145, 8683017, 8687874, 8692715, 8697540, 8702350, + 8707145, 8711925, 8716690, 8721439, 8726174, 8730894, 8735599, + 8740290, 8744967, 8749628, 8754276, 8758909, 8763528, 8768134, + 8772725, 8777302, 8781865, 8786415, 8790951, 8795474, 8799983, + 8804478, 8808961, 8813430, 8817886, 8822328, 8826758, 8831175, + 8835579, 8839970, 8844349, 8848715, 8853068, 8857409, 8861737, + 8866053, 8870357, 8874649, 8878928, 8883195, 8887451, 8891694, + 8895926, 8900145, 8904353, 8908550, 8912734, 8916908, 8921069, + 8925220, 8929358, 8933486, 8937603, 8941708, 8945802, 8949885, + 8953957, 8958018, 8962068, 8966108, 8970137, 8974155, 8978162, + 8982159, 8986145, 8990121, 8994086, 8998041, 9001986, 9005920, + 9009844, 9013758, 9017662, 9021556, 9025440, 9029314, 9033178, + 9037032, 9040877, 9044711, 9048536, 9052352, 9056157, 9059953, + 9063740, 9067517, 9071285, 9075044, 9078793, 9082533, 9086263, + 9089985, 9093697, 9097400, 9101095, 9104780, 9108456, 9112123, + 9115782, 9119431, 9123072, 9126704, 9130328, 9133943, 9137549, + 9141146, 9144735, 9148316, 9151888, 9155452, 9159007, 9162554, + 9166092, 9169623, 9173145, 9176659, 9180165, 9183663, 9187152, + 9190634, 9194108, 9197573, 9201031, 9204481, 9207923, 9211357, + 9214784, 9218202, 9221613, 9225017, 9228412, 9231800, 9235181, + 9238554, 9241919, 9245277, 9248628, 9251971, 9255307, 9258635, + 9261956, 9265270, 9268577, 9271876, 9275169, 9278454, 9281732, + 9285002, 9288266, 9291523, 9294773, 9298016, 9301252, 9304481, + 9307703, 9310918, 9314126, 9317328, 9320523, 9323711, 9326892, + 9330067, 9333235, 9336397, 9339552, 9342700, 9345842, 9348977, + 9352106, 9355228, 9358344, 9361454, 9364557, 9367654, 9370744, + 9373828, 9376906, 9379978, 9383043, 9386102, 9389155, 9392202, + 9395243, 9398278, 9401306, 9404329, 9407345, 9410356, 9413360, + 9416359, 9419351, 9422338, 9425319, 9428294, 9431263, 9434226, + 9437184, 9440136, 9443082, 9446022, 9448957, 9451886, 9454809, + 9457726, 9460638, 9463545, 9466446, 9469341, 9472231, 9475115, + 9477994, 9480867, 9483735, 9486597, 9489454, 9492306, 9495152, + 9497993, 9500828, 9503659, 9506484, 9509303, 9512118, 9514927, + 9517731, 9520530, 9523324, 9526112, 9528895, 9531674, 9534447, + 9537215, 9539978, 9542736, 9545489, 9548237, 9550980, 9553718, + 9556451, 9559179, 9561903, 9564621, 9567335, 9570043, 9572747, + 9575446, 9578140, 9580830, 9583514, 9586194, 9588869, 9591540, + 9594205, 9596866, 9599523, 9602174, 9604821, 9607464, 9610101, + 9612735, 9615363, 9617987, 9620607, 9623222, 9625832, 9628438, + 9631040, 9633637, 9636229, 9638818, 9641401, 9643981, 9646556, + 9649126, 9651692, 9654254, 9656812, 9659365, 9661914, 9664459, + 9666999, 9669535, 9672067, 9674594, 9677118, 9679637, 9682152, + 9684663, 9687169, 9689672, 9692170, 9694665, 9697155, 9699641, + 9702123, 9704601, 9707075, 9709545, 9712010, 9714472, 9716930, + 9719384, 9721834, 9724279, 9726721, 9729159, 9731593, 9734024, + 9736450, 9738872, 9741291, 9743705, 9746116, 9748523, 9750926, + 9753326, 9755721, 9758113, 9760501, 9762885, 9765266, 9767642, + 9770015, 9772385, 9774750, 9777112, 9779470, 9781825, 9784175, + 9786523, 9788866, 9791206, 9793543, 9795875, 9798204, 9800530, + 9802852, 9805170, 9807485, 9809797, 9812104, 9814409, 9816710, + 9819007, 9821301, 9823591, 9825878, 9828161, 9830441, 9832718, + 9834991, 9837261, 9839527, 9841790, 9844050, 9846306, 9848559, + 9850808, 9853054, 9855297, 9857537, 9859773, 9862006, 9864235, + 9866462, 9868685, 9870904, 9873121, 9875334, 9877544, 9879751, + 9881955, 9884155, 9886352, 9888546, 9890737, 9892925, 9895109, + 9897291, 9899469, 9901644, 9903816, 9905985, 9908150, 9910313, + 9912473, 9914629, 9916783, 9918933, 9921080, 9923225, 9925366, + 9927504, 9929639, 9931771, 9933900, 9936027, 9938150, 9940270, + 9942387, 9944502, 9946613, 9948721, 9950827, 9952929, 9955029, + 9957126, 9959219, 9961310, 9963398, 9965484, 9967566, 9969645, + 9971722, 9973796, 9975866, 9977934, 9980000, 9982062, 9984122, + 9986179, 9988233, 9990284, 9992332, 9994378, 9996421, 9998461, + 10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665, + 10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738, + 10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681, + 10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496, + 10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186, + 10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754, + 10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201, + 10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529, + 10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742, + 10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839, + 10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825, + 10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699, + 10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465, + 10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125, + 10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679, + 10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130, + 10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479, + 10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728, + 10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879, + 10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933, + 10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892, + 10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757, + 10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530, + 10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211, + 10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804, + 10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308, + 10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725, + 10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057, + 10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304, + 10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468, + 10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551, + 10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553, + 10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476, + 10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320, + 10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087, + 10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778, + 10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394, + 10452905, 10454414, 10455921, 10457427, 10458932, 10460435, 10461936, + 10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405, + 10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802, + 10484282, +}; + +static int mi_size_to_block_size(int mi_bsize, int mi_num) { + return (mi_num % mi_bsize) ? mi_num / mi_bsize + 1 : mi_num / mi_bsize; +} + +Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, + int frame_num, int mi_rows, int mi_cols) { + int frame_idx, rf_idx, square_block_idx; + if (motion_field_info->allocated) { + // TODO(angiebird): Avoid re-allocate buffer if possible + vp9_free_motion_field_info(motion_field_info); + } + motion_field_info->frame_num = frame_num; + motion_field_info->motion_field_array = + vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array)); + if (!motion_field_info->motion_field_array) return STATUS_FAILED; + for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE bsize = square_block_idx_to_bsize(square_block_idx); + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int block_rows = mi_size_to_block_size(mi_height, mi_rows); + const int block_cols = mi_size_to_block_size(mi_width, mi_cols); + MotionField *motion_field = + &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; + Status status = + vp9_alloc_motion_field(motion_field, bsize, block_rows, block_cols); + if (status == STATUS_FAILED) { + return STATUS_FAILED; + } + } + } + } + motion_field_info->allocated = 1; + return STATUS_OK; +} + +Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize, + int block_rows, int block_cols) { + Status status = STATUS_OK; + motion_field->ready = 0; + motion_field->bsize = bsize; + motion_field->block_rows = block_rows; + motion_field->block_cols = block_cols; + motion_field->block_num = block_rows * block_cols; + motion_field->mf = + vpx_calloc(motion_field->block_num, sizeof(*motion_field->mf)); + if (motion_field->mf == NULL) { + status = STATUS_FAILED; + } + motion_field->set_mv = + vpx_calloc(motion_field->block_num, sizeof(*motion_field->set_mv)); + if (motion_field->set_mv == NULL) { + vpx_free(motion_field->mf); + motion_field->mf = NULL; + status = STATUS_FAILED; + } + motion_field->local_structure = vpx_calloc( + motion_field->block_num, sizeof(*motion_field->local_structure)); + if (motion_field->local_structure == NULL) { + vpx_free(motion_field->mf); + motion_field->mf = NULL; + vpx_free(motion_field->set_mv); + motion_field->set_mv = NULL; + status = STATUS_FAILED; + } + return status; +} + +void vp9_free_motion_field(MotionField *motion_field) { + vpx_free(motion_field->mf); + vpx_free(motion_field->set_mv); + vpx_free(motion_field->local_structure); + vp9_zero(*motion_field); +} + +void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info) { + if (motion_field_info->allocated) { + int frame_idx, rf_idx, square_block_idx; + for (frame_idx = 0; frame_idx < motion_field_info->frame_num; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + MotionField *motion_field = + &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; + vp9_free_motion_field(motion_field); + } + } + } + vpx_free(motion_field_info->motion_field_array); + motion_field_info->motion_field_array = NULL; + motion_field_info->frame_num = 0; + motion_field_info->allocated = 0; + } +} + +MotionField *vp9_motion_field_info_get_motion_field( + MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx, + BLOCK_SIZE bsize) { + int square_block_idx = get_square_block_idx(bsize); + assert(frame_idx < motion_field_info->frame_num); + assert(motion_field_info->allocated == 1); + return &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; +} + +int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow, + int bcol) { + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + return motion_field->set_mv[brow * motion_field->block_cols + bcol]; +} + +int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow, + int bcol) { + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + return motion_field->mf[brow * motion_field->block_cols + bcol]; +} + +int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row, + int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int brow = mi_row / mi_height; + const int bcol = mi_col / mi_width; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + return vp9_motion_field_get_mv(motion_field, brow, bcol); +} + +void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row, + int mi_col, int_mv mv) { + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int brow = mi_row / mi_height; + const int bcol = mi_col / mi_width; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + motion_field->mf[brow * motion_field->block_cols + bcol] = mv; + motion_field->set_mv[brow * motion_field->block_cols + bcol] = 1; +} + +void vp9_motion_field_reset_mvs(MotionField *motion_field) { + memset(motion_field->set_mv, 0, + motion_field->block_num * sizeof(*motion_field->set_mv)); +} + +static int64_t log2_approximation(int64_t v) { + assert(v > 0); + if (v < LOG2_TABLE_SIZE) { + return log2_table[v]; + } else { + // use linear approximation when v >= 2^10 + const int slope = + 1477; // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION) + assert(LOG2_TABLE_SIZE == 1 << 10); + + return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION); + } +} + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs, + int mv_num) { + // The behavior of this function is to compute log2 of mv difference, + // i.e. min log2(1 + row_diff * row_diff + col_diff * col_diff) + // against available neighbor mvs. + // Since the log2 is monotonically increasing, we can compute + // min row_diff * row_diff + col_diff * col_diff first + // then apply log2 in the end. + int i; + int64_t min_abs_diff = INT64_MAX; + int cnt = 0; + assert(mv_num <= NB_MVS_NUM); + for (i = 0; i < mv_num; ++i) { + MV nb_mv = nb_full_mvs[i].as_mv; + const int64_t row_diff = abs(mv->row - nb_mv.row); + const int64_t col_diff = abs(mv->col - nb_mv.col); + const int64_t abs_diff = row_diff * row_diff + col_diff * col_diff; + assert(nb_full_mvs[i].as_int != INVALID_MV); + min_abs_diff = VPXMIN(abs_diff, min_abs_diff); + ++cnt; + } + if (cnt) { + return log2_approximation(1 + min_abs_diff); + } + return 0; +} + +static FloatMV get_smooth_motion_vector(const FloatMV scaled_search_mv, + const FloatMV *tmp_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, int row, int col, + float alpha) { + const FloatMV tmp_mv = tmp_mf[row * cols + col]; + int idx_row, idx_col; + FloatMV avg_nb_mv = { 0.0f, 0.0f }; + FloatMV mv = { 0.0f, 0.0f }; + float filter[3][3] = { { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f }, + { 1.0f / 6.0f, 0.0f, 1.0f / 6.0f }, + { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f } }; + for (idx_row = 0; idx_row < 3; ++idx_row) { + int nb_row = row + idx_row - 1; + for (idx_col = 0; idx_col < 3; ++idx_col) { + int nb_col = col + idx_col - 1; + if (nb_row < 0 || nb_col < 0 || nb_row >= rows || nb_col >= cols) { + avg_nb_mv.row += (tmp_mv.row) * filter[idx_row][idx_col]; + avg_nb_mv.col += (tmp_mv.col) * filter[idx_row][idx_col]; + } else { + const FloatMV nb_mv = tmp_mf[nb_row * cols + nb_col]; + avg_nb_mv.row += (nb_mv.row) * filter[idx_row][idx_col]; + avg_nb_mv.col += (nb_mv.col) * filter[idx_row][idx_col]; + } + } + } + { + // M is the local variance of reference frame + float M00 = M[row * cols + col][0]; + float M01 = M[row * cols + col][1]; + float M10 = M[row * cols + col][2]; + float M11 = M[row * cols + col][3]; + + float det = (M00 + alpha) * (M11 + alpha) - M01 * M10; + + float inv_M00 = (M11 + alpha) / det; + float inv_M01 = -M01 / det; + float inv_M10 = -M10 / det; + float inv_M11 = (M00 + alpha) / det; + + float inv_MM00 = inv_M00 * M00 + inv_M01 * M10; + float inv_MM01 = inv_M00 * M01 + inv_M01 * M11; + float inv_MM10 = inv_M10 * M00 + inv_M11 * M10; + float inv_MM11 = inv_M10 * M01 + inv_M11 * M11; + + mv.row = inv_M00 * avg_nb_mv.row * alpha + inv_M01 * avg_nb_mv.col * alpha + + inv_MM00 * scaled_search_mv.row + inv_MM01 * scaled_search_mv.col; + mv.col = inv_M10 * avg_nb_mv.row * alpha + inv_M11 * avg_nb_mv.col * alpha + + inv_MM10 * scaled_search_mv.row + inv_MM11 * scaled_search_mv.col; + } + return mv; +} + +void vp9_get_smooth_motion_field(const MV *search_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, BLOCK_SIZE bsize, + float alpha, int num_iters, MV *smooth_mf) { + // M is the local variation of reference frame + // build two buffers + FloatMV *input = (FloatMV *)malloc(rows * cols * sizeof(FloatMV)); + FloatMV *output = (FloatMV *)malloc(rows * cols * sizeof(FloatMV)); + int idx; + int row, col; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + if (!(input && output)) goto fail; + // copy search results to input buffer + for (idx = 0; idx < rows * cols; ++idx) { + input[idx].row = (float)search_mf[idx].row / bh; + input[idx].col = (float)search_mf[idx].col / bw; + } + for (idx = 0; idx < num_iters; ++idx) { + FloatMV *tmp; + for (row = 0; row < rows; ++row) { + for (col = 0; col < cols; ++col) { + // note: the scaled_search_mf and smooth_mf are all scaled by macroblock + // size + const MV search_mv = search_mf[row * cols + col]; + FloatMV scaled_search_mv = { (float)search_mv.row / bh, + (float)search_mv.col / bw }; + output[row * cols + col] = get_smooth_motion_vector( + scaled_search_mv, input, M, rows, cols, row, col, alpha); + } + } + // swap buffers + tmp = input; + input = output; + output = tmp; + } + // copy smoothed results to output + for (idx = 0; idx < rows * cols; ++idx) { + smooth_mf[idx].row = (int)(input[idx].row * bh); + smooth_mf[idx].col = (int)(input[idx].col * bw); + } +fail: + free(input); + free(output); +} + +void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame, + const YV12_BUFFER_CONFIG *ref_frame, + const MV *search_mf, + const vp9_variance_fn_ptr_t *fn_ptr, int rows, + int cols, BLOCK_SIZE bsize, + int (*M)[MF_LOCAL_STRUCTURE_SIZE]) { + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int cur_stride = cur_frame->y_stride; + const int ref_stride = ref_frame->y_stride; + const int width = ref_frame->y_width; + const int height = ref_frame->y_height; + int row, col; + for (row = 0; row < rows; ++row) { + for (col = 0; col < cols; ++col) { + int cur_offset = row * bh * cur_stride + col * bw; + uint8_t *center = cur_frame->y_buffer + cur_offset; + int ref_h = row * bh + search_mf[row * cols + col].row; + int ref_w = col * bw + search_mf[row * cols + col].col; + int ref_offset; + uint8_t *target; + uint8_t *nb; + int search_dist; + int nb_dist; + int I_row = 0, I_col = 0; + // TODO(Dan): handle the case that when reference frame block beyond the + // boundary + ref_h = ref_h < 0 ? 0 : (ref_h >= height - bh ? height - bh - 1 : ref_h); + ref_w = ref_w < 0 ? 0 : (ref_w >= width - bw ? width - bw - 1 : ref_w); + // compute search results distortion + // TODO(Dan): maybe need to use vp9 function to find the reference block, + // to compare with the results of my python code, I first use my way to + // compute the reference block + ref_offset = ref_h * ref_stride + ref_w; + target = ref_frame->y_buffer + ref_offset; + search_dist = fn_ptr->sdf(center, cur_stride, target, ref_stride); + // compute target's neighbors' distortions + // TODO(Dan): if using padding, the boundary condition may vary + // up + if (ref_h - bh >= 0) { + nb = target - ref_stride * bh; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_row += nb_dist - search_dist; + } + // down + if (ref_h + bh < height - bh) { + nb = target + ref_stride * bh; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_row += nb_dist - search_dist; + } + if (ref_h - bh >= 0 && ref_h + bh < height - bh) { + I_row /= 2; + } + I_row /= (bw * bh); + // left + if (ref_w - bw >= 0) { + nb = target - bw; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_col += nb_dist - search_dist; + } + // down + if (ref_w + bw < width - bw) { + nb = target + bw; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_col += nb_dist - search_dist; + } + if (ref_w - bw >= 0 && ref_w + bw < width - bw) { + I_col /= 2; + } + I_col /= (bw * bh); + M[row * cols + col][0] = I_row * I_row; + M[row * cols + col][1] = I_row * I_col; + M[row * cols + col][2] = I_col * I_row; + M[row * cols + col][3] = I_col * I_col; + } + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h new file mode 100644 index 0000000000..c2bd69722a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ +#define VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ + +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_scale/yv12config.h" +#include "vpx_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif +#define NB_MVS_NUM 4 +#define LOG2_PRECISION 20 +#define MF_LOCAL_STRUCTURE_SIZE 4 +#define SQUARE_BLOCK_SIZES 4 + +typedef enum Status { STATUS_OK = 0, STATUS_FAILED = 1 } Status; + +typedef struct MotionField { + int ready; + BLOCK_SIZE bsize; + int block_rows; + int block_cols; + int block_num; // block_num == block_rows * block_cols + int (*local_structure)[MF_LOCAL_STRUCTURE_SIZE]; + int_mv *mf; + int *set_mv; + int mv_log_scale; +} MotionField; + +typedef struct MotionFieldInfo { + int frame_num; + int allocated; + MotionField (*motion_field_array)[MAX_INTER_REF_FRAMES][SQUARE_BLOCK_SIZES]; +} MotionFieldInfo; + +typedef struct { + float row, col; +} FloatMV; + +static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4) { + return 0; + } + if (bsize == BLOCK_8X8) { + return 1; + } + if (bsize == BLOCK_16X16) { + return 2; + } + if (bsize == BLOCK_32X32) { + return 3; + } + assert(0 && "ERROR: non-square block size"); + return -1; +} + +static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { + if (square_block_idx == 0) { + return BLOCK_4X4; + } + if (square_block_idx == 1) { + return BLOCK_8X8; + } + if (square_block_idx == 2) { + return BLOCK_16X16; + } + if (square_block_idx == 3) { + return BLOCK_32X32; + } + assert(0 && "ERROR: invalid square_block_idx"); + return BLOCK_INVALID; +} + +Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, + int frame_num, int mi_rows, int mi_cols); + +Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize, + int block_rows, int block_cols); + +void vp9_free_motion_field(MotionField *motion_field); + +void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info); + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs, + int mv_num); + +void vp9_get_smooth_motion_field(const MV *search_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, BLOCK_SIZE bize, + float alpha, int num_iters, MV *smooth_mf); + +void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame, + const YV12_BUFFER_CONFIG *ref_frame, + const MV *search_mf, + const vp9_variance_fn_ptr_t *fn_ptr, int rows, + int cols, BLOCK_SIZE bsize, + int (*M)[MF_LOCAL_STRUCTURE_SIZE]); + +MotionField *vp9_motion_field_info_get_motion_field( + MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx, + BLOCK_SIZE bsize); + +void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row, + int mi_col, int_mv mv); + +void vp9_motion_field_reset_mvs(MotionField *motion_field); + +int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow, + int bcol); +int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row, + int mi_col); +int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow, + int bcol); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h new file mode 100644 index 0000000000..09c0e30a47 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ +#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. + const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Partition search breakout model. +#define FEATURES 4 +#define Q_CTX 3 +#define RESOLUTION_CTX 2 +static const float + vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, + }, + { + { + -0.064309f, + -0.006121f, + 0.000232f, + 0.005778f, + 0.7989465f - 5.0f, + }, + { + -0.314957f, + -0.009346f, + -0.000225f, + 0.010072f, + 2.80695581f - 5.5f, + }, + { + -0.635535f, + -0.015135f, + 0.000091f, + 0.015247f, + 2.90381241f - 5.0f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, + }, + { + { + -0.024629f, + -0.006492f, + -0.000254f, + 0.004895f, + 1.27919173f - 4.5f, + }, + { + -0.083936f, + -0.009827f, + -0.000200f, + 0.010399f, + 2.73731065f - 4.5f, + }, + { + -0.279052f, + -0.013334f, + 0.000289f, + 0.023203f, + 2.43595719f - 3.5f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, + }, + { + { + -0.007636f, + -0.002751f, + -0.000682f, + 0.005968f, + 0.19225763f - 4.5f, + }, + { + -0.047306f, + -0.009113f, + -0.000518f, + 0.016007f, + 2.61068869f - 4.0f, + }, + { + -0.069336f, + -0.010448f, + -0.001120f, + 0.023083f, + 1.47591054f - 5.5f, + }, + }, + }; + +static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX] + [FEATURES + 1] = { + { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, + }, + { + { + -0.003221f, + -0.002125f, + 0.000993f, + 0.012768f, + 0.03541421f - 2.0f, + }, + { + -0.006069f, + -0.007335f, + 0.000229f, + 0.026104f, + 0.17135315f - 1.5f, + }, + { + -0.039894f, + -0.011419f, + 0.000070f, + 0.061817f, + 0.6739977f - 1.5f, + }, + }, + }; +#undef FEATURES +#undef Q_CTX +#undef RESOLUTION_CTX + +// Rectangular partition search pruning model. +#define FEATURES 8 +#define LABELS 4 +#define NODES 16 +static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = { + -0.432522f, 0.133070f, -0.169187f, 0.768340f, 0.891228f, 0.554458f, + 0.356000f, 0.403621f, 0.809165f, 0.778214f, -0.520357f, 0.301451f, + -0.386972f, -0.314402f, 0.021878f, 1.148746f, -0.462258f, -0.175524f, + -0.344589f, -0.475159f, -0.232322f, 0.471147f, -0.489948f, 0.467740f, + -0.391550f, 0.208601f, 0.054138f, 0.076859f, -0.309497f, -0.095927f, + 0.225917f, 0.011582f, -0.520730f, -0.585497f, 0.174036f, 0.072521f, + 0.120771f, -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f, + 0.290584f, 0.038373f, 0.685654f, 0.394019f, 0.759667f, 1.257502f, + -0.610516f, -0.185434f, 0.211997f, -0.172458f, 0.044605f, 0.145316f, + -0.182525f, -0.147376f, 0.578742f, 0.312412f, -0.446135f, -0.389112f, + 0.454033f, 0.260490f, 0.664285f, 0.395856f, -0.231827f, 0.215228f, + 0.014856f, -0.395462f, 0.479646f, -0.391445f, -0.357788f, 0.166238f, + -0.056818f, -0.027783f, 0.060880f, -1.604710f, 0.531268f, 0.282184f, + 0.714944f, 0.093523f, -0.218312f, -0.095546f, -0.285621f, -0.190871f, + -0.448340f, -0.016611f, 0.413913f, -0.286720f, -0.158828f, -0.092635f, + -0.279551f, 0.166509f, -0.088162f, 0.446543f, -0.276830f, -0.065642f, + -0.176346f, -0.984754f, 0.338738f, 0.403809f, 0.738065f, 1.154439f, + 0.750764f, 0.770959f, -0.269403f, 0.295651f, -0.331858f, 0.367144f, + 0.279279f, 0.157419f, -0.348227f, -0.168608f, -0.956000f, -0.647136f, + 0.250516f, 0.858084f, 0.809802f, 0.492408f, 0.804841f, 0.282802f, + 0.079395f, -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f, + -0.483044f, 0.141126f, +}; + +static const float vp9_rect_part_nn_bias_16_layer0[NODES] = { + 0.275384f, -0.053745f, 0.000000f, 0.000000f, -0.178103f, 0.513965f, + -0.161352f, 0.228551f, 0.000000f, 1.013712f, 0.000000f, 0.000000f, + -1.144009f, -0.000006f, -0.241727f, 2.048764f, +}; + +static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = { + -1.435278f, 2.204691f, -0.410718f, 0.202708f, 0.109208f, 1.059142f, + -0.306360f, 0.845906f, 0.489654f, -1.121915f, -0.169133f, -0.003385f, + 0.660590f, -0.018711f, 1.227158f, -2.967504f, 1.407345f, -1.293243f, + -0.386921f, 0.300492f, 0.338824f, -0.083250f, -0.069454f, -1.001827f, + -0.327891f, 0.899353f, 0.367397f, -0.118601f, -0.171936f, -0.420646f, + -0.803319f, 2.029634f, 0.940268f, -0.664484f, 0.339916f, 0.315944f, + 0.157374f, -0.402482f, -0.491695f, 0.595827f, 0.015031f, 0.255887f, + -0.466327f, -0.212598f, 0.136485f, 0.033363f, -0.796921f, 1.414304f, + -0.282185f, -2.673571f, -0.280994f, 0.382658f, -0.350902f, 0.227926f, + 0.062602f, -1.000199f, 0.433731f, 1.176439f, -0.163216f, -0.229015f, + -0.640098f, -0.438852f, -0.947700f, 2.203434f, +}; + +static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = { + -0.875510f, + 0.982408f, + 0.560854f, + -0.415209f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_16_layer0, + vp9_rect_part_nn_weights_16_layer1, + }, + { + vp9_rect_part_nn_bias_16_layer0, + vp9_rect_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = { + -0.147312f, -0.753248f, 0.540206f, 0.661415f, 0.484117f, -0.341609f, + 0.016183f, 0.064177f, 0.781580f, 0.902232f, -0.505342f, 0.325183f, + -0.231072f, -0.120107f, -0.076216f, 0.120038f, 0.403695f, -0.463301f, + -0.192158f, 0.407442f, 0.106633f, 1.072371f, -0.446779f, 0.467353f, + 0.318812f, -0.505996f, -0.008768f, -0.239598f, 0.085480f, 0.284640f, + -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f, -0.228809f, + 0.383651f, -0.196882f, 0.477039f, -0.217978f, -0.506931f, -0.125675f, + 0.050456f, 1.086598f, 0.732128f, 0.326941f, 0.103952f, 0.121769f, + -0.154487f, -0.255514f, 0.030591f, -0.382797f, -0.019981f, -0.326570f, + 0.149691f, -0.435633f, -0.070795f, 0.167691f, 0.251413f, -0.153405f, + 0.160347f, 0.455107f, -0.968580f, -0.575879f, 0.623115f, -0.069793f, + -0.379768f, -0.965807f, -0.062057f, 0.071312f, 0.457098f, 0.350372f, + -0.460659f, -0.985393f, 0.359963f, -0.093677f, 0.404272f, -0.326896f, + -0.277752f, 0.609322f, -0.114193f, -0.230701f, 0.089208f, 0.645381f, + 0.494485f, 0.467876f, -0.166187f, 0.251044f, -0.394661f, 0.192895f, + -0.344777f, -0.041893f, -0.111163f, 0.066347f, 0.378158f, -0.455465f, + 0.339839f, -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f, + 0.361772f, -0.338095f, 0.004564f, -0.398510f, 0.060876f, -2.132504f, + -0.086776f, -0.029166f, 0.039241f, 0.222534f, -0.188565f, -0.288792f, + -0.160789f, -0.123905f, 0.397916f, -0.063779f, 0.167210f, -0.445004f, + 0.056889f, 0.207280f, 0.000101f, 0.384507f, -1.721239f, -2.036402f, + -2.084403f, -2.060483f, +}; + +static const float vp9_rect_part_nn_bias_32_layer0[NODES] = { + -0.859251f, -0.109938f, 0.091838f, 0.187817f, -0.728265f, 0.253080f, + 0.000000f, -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f, + -0.024504f, 1.765711f, 0.000000f, 1.505390f, +}; + +static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = { + 0.680940f, 1.367178f, 0.403075f, 0.029957f, 0.500917f, 1.407776f, + -0.354002f, 0.011667f, 1.663767f, 0.959155f, 0.428323f, -0.205345f, + -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f, + 0.128236f, -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f, + -1.677389f, -0.868332f, -0.063792f, 0.052052f, 0.359591f, 2.739808f, + -0.414304f, 3.036597f, -0.075368f, -1.019680f, 0.642501f, 0.209779f, + -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f, + 0.068734f, 0.508309f, 0.099334f, 1.802239f, -0.333538f, 2.708645f, + -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f, + 0.194572f, 0.431788f, -0.789624f, -0.031962f, 0.358353f, 0.382937f, + 0.232002f, 2.321813f, -0.037523f, 2.104652f, +}; + +static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = { + -0.693383f, + 0.773661f, + 0.426878f, + -0.070619f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_32_layer0, + vp9_rect_part_nn_weights_32_layer1, + }, + { + vp9_rect_part_nn_bias_32_layer0, + vp9_rect_part_nn_bias_32_layer1, + }, +}; +#undef NODES + +#define NODES 24 +static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = { + 0.024671f, -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f, + 0.139782f, 0.063110f, 0.796561f, 0.172868f, -0.662194f, -1.393074f, + 0.085003f, 0.393381f, 0.358477f, -0.187268f, -0.370745f, 0.218287f, + 0.027271f, -0.254089f, -0.048236f, -0.459137f, 0.253171f, 0.122598f, + -0.550107f, -0.568456f, 0.159866f, -0.246534f, 0.096384f, -0.255460f, + 0.077864f, -0.334837f, 0.026921f, -0.697252f, 0.345262f, 1.343578f, + 0.815984f, 1.118211f, 1.574016f, 0.578476f, -0.285967f, -0.508672f, + 0.118137f, 0.037695f, 1.540510f, 1.256648f, 1.163819f, 1.172027f, + 0.661551f, -0.111980f, -0.434204f, -0.894217f, 0.570524f, 0.050292f, + -0.113680f, 0.000784f, -0.211554f, -0.369394f, 0.158306f, -0.512505f, + -0.238696f, 0.091498f, -0.448490f, -0.491268f, -0.353112f, -0.303315f, + -0.428438f, 0.127998f, -0.406790f, -0.401786f, -0.279888f, -0.384223f, + 0.026100f, 0.041621f, -0.315818f, -0.087888f, 0.353497f, 0.163123f, + -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f, + 0.070854f, 0.114627f, -0.050545f, -0.160381f, 0.595294f, 0.492696f, + -0.453858f, -1.154139f, 0.126000f, 0.034550f, 0.456665f, -0.236618f, + -0.112640f, 0.050759f, -0.449162f, 0.110059f, 0.147116f, 0.249358f, + -0.049894f, 0.063351f, -0.004467f, 0.057242f, -0.482015f, -0.174335f, + -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f, -1.243430f, + -0.052963f, 0.112088f, -2.661115f, -2.445893f, -2.688174f, -2.624232f, + 0.030494f, 0.161311f, 0.012136f, 0.207564f, -2.776856f, -2.791940f, + -2.623962f, -2.918820f, 1.231619f, -0.376692f, -0.698078f, 0.110336f, + -0.285378f, 0.258367f, -0.180159f, -0.376608f, -0.034348f, -0.130206f, + 0.160020f, 0.852977f, 0.580573f, 1.450782f, 1.357596f, 0.787382f, + -0.544004f, -0.014795f, 0.032121f, -0.557696f, 0.159994f, -0.540908f, + 0.180380f, -0.398045f, 0.705095f, 0.515103f, -0.511521f, -1.271374f, + -0.231019f, 0.423647f, 0.064907f, -0.255338f, -0.877748f, -0.667205f, + 0.267847f, 0.135229f, 0.617844f, 1.349849f, 1.012623f, 0.730506f, + -0.078571f, 0.058401f, 0.053221f, -2.426146f, -0.098808f, -0.138508f, + -0.153299f, 0.149116f, -0.444243f, 0.301807f, 0.065066f, 0.092929f, + -0.372784f, -0.095540f, 0.192269f, 0.237894f, 0.080228f, -0.214074f, + -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f, +}; + +static const float vp9_rect_part_nn_bias_64_layer0[NODES] = { + 0.000000f, -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f, + -0.076307f, -0.193803f, 0.000000f, -0.066474f, -0.050318f, -0.019832f, + -0.038814f, -0.144184f, 2.652451f, 2.415006f, 0.197464f, -0.729842f, + -0.173774f, 0.239171f, 0.486425f, 2.463304f, -0.175279f, 2.352637f, +}; + +static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = { + -0.063237f, 1.925696f, -0.182145f, -0.226687f, 0.602941f, -0.941140f, + 0.814598f, -0.117063f, 0.282988f, 0.066369f, 0.096951f, 1.049735f, + -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f, 0.417145f, + -0.279849f, 1.335945f, 0.660338f, -2.757938f, -0.115714f, -1.862183f, + -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f, + -0.167290f, 0.141290f, -0.112100f, 0.232761f, 0.252307f, -0.399653f, + 0.353118f, 0.241583f, 2.635241f, 4.026119f, -1.137327f, -0.052446f, + -0.139814f, -1.104256f, -0.759391f, 2.508457f, -0.526297f, 2.095348f, + -0.444473f, -1.090452f, 0.584122f, 0.468729f, -0.368865f, 1.041425f, + -1.079504f, 0.348837f, 0.390091f, 0.416191f, 0.212906f, -0.660255f, + 0.053630f, 0.209476f, 3.595525f, 2.257293f, -0.514030f, 0.074203f, + -0.375862f, -1.998307f, -0.930310f, 1.866686f, -0.247137f, 1.087789f, + 0.100186f, 0.298150f, 0.165265f, 0.050478f, 0.249167f, 0.371789f, + -0.294497f, 0.202954f, 0.037310f, 0.193159f, 0.161551f, 0.301597f, + 0.299286f, 0.185946f, 0.822976f, 2.066130f, -1.724588f, 0.055977f, + -0.330747f, -0.067747f, -0.475801f, 1.555958f, -0.025808f, -0.081516f, +}; + +static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = { + -0.090723f, + 0.894968f, + 0.844754f, + -3.496194f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_64_layer0, + vp9_rect_part_nn_weights_64_layer1, + }, + { + vp9_rect_part_nn_bias_64_layer0, + vp9_rect_part_nn_bias_64_layer1, + }, +}; +#undef FEATURES +#undef LABELS +#undef NODES + +#define FEATURES 7 +// Partition pruning model(neural nets). +static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = { + -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f, + 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f, + -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f, + -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f, + -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f, + -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f, + 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f, + -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f, + -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f, + 0.068039f, 0.010586f, +}; + +static const float vp9_partition_nn_bias_64x64_layer0[8] = { + -3.469882f, 0.683989f, 0.194010f, 0.313782f, + -3.153335f, 2.245849f, -1.946190f, -3.740020f, +}; + +static const float vp9_partition_nn_weights_64x64_layer1[8] = { + -8.058566f, 0.108306f, -0.280620f, -0.818823f, + -6.445117f, 0.865364f, -1.127127f, -8.808660f, +}; + +static const float vp9_partition_nn_bias_64x64_layer1[1] = { + 6.46909416f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_64x64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_64x64_layer0, + vp9_partition_nn_weights_64x64_layer1, + }, + { + vp9_partition_nn_bias_64x64_layer0, + vp9_partition_nn_bias_64x64_layer1, + }, +}; + +static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = { + -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f, + -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f, + 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f, + -0.400747f, -0.394366f, -0.108878f, 0.603027f, 0.455369f, -0.197170f, + 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f, + -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f, + -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f, + -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f, + -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f, + 0.275387f, 1.552226f, +}; + +static const float vp9_partition_nn_bias_32x32_layer0[8] = { + -0.838372f, -2.609089f, -0.055763f, 1.329485f, + -1.297638f, -2.636622f, -0.826909f, 1.012644f, +}; + +static const float vp9_partition_nn_weights_32x32_layer1[8] = { + -1.792632f, -7.322353f, -0.683386f, 0.676564f, + -1.488118f, -7.527719f, 1.240163f, 0.614309f, +}; + +static const float vp9_partition_nn_bias_32x32_layer1[1] = { + 4.97422546f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_32x32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_32x32_layer0, + vp9_partition_nn_weights_32x32_layer1, + }, + { + vp9_partition_nn_bias_32x32_layer0, + vp9_partition_nn_bias_32x32_layer1, + }, +}; + +static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = { + -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f, + 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f, + 1.700638f, -0.073806f, -4.006781f, -1.630999f, -0.064863f, -0.086410f, + -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f, + 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f, + -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f, + -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f, + 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f, + 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f, + -2.197880f, -0.108733f, +}; + +static const float vp9_partition_nn_bias_16x16_layer0[8] = { + 0.411516f, -2.143737f, -3.693192f, 2.123142f, + -1.356910f, -3.561016f, -0.765045f, -2.417082f, +}; + +static const float vp9_partition_nn_weights_16x16_layer1[8] = { + -0.619755f, -2.202391f, -4.337171f, 0.611319f, + 0.377677f, -4.998723f, -1.052235f, 1.949922f, +}; + +static const float vp9_partition_nn_bias_16x16_layer1[1] = { + 3.20981717f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_16x16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_16x16_layer0, + vp9_partition_nn_weights_16x16_layer1, + }, + { + vp9_partition_nn_bias_16x16_layer0, + vp9_partition_nn_bias_16x16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, -2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, +}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 12 +#define LABELS 1 +#define NODES 8 +static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = { + -0.609728f, -0.409099f, -0.472449f, 0.183769f, -0.457740f, 0.081089f, + 0.171003f, 0.578696f, -0.019043f, -0.856142f, 0.557369f, -1.779424f, + -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f, + 0.200620f, 0.038013f, -0.430093f, 0.235083f, -0.487442f, 0.424814f, + -0.232758f, -0.442943f, 0.229397f, -0.540301f, -0.648421f, -0.649747f, + -0.171638f, 0.603824f, 0.468497f, -0.421580f, 0.178840f, -0.533838f, + -0.029471f, -0.076296f, 0.197426f, -0.187908f, -0.003950f, -0.065740f, + 0.085165f, -0.039674f, -5.640702f, 1.909538f, -1.434604f, 3.294606f, + -0.788812f, 0.196864f, 0.057012f, -0.019757f, 0.336233f, 0.075378f, + 0.081503f, 0.491864f, -1.899470f, -1.764173f, -1.888137f, -1.762343f, + 0.845542f, 0.202285f, 0.381948f, -0.150996f, 0.556893f, -0.305354f, + 0.561482f, -0.021974f, -0.703117f, 0.268638f, -0.665736f, 1.191005f, + -0.081568f, -0.115653f, 0.272029f, -0.140074f, 0.072683f, 0.092651f, + -0.472287f, -0.055790f, -0.434425f, 0.352055f, 0.048246f, 0.372865f, + 0.111499f, -0.338304f, 0.739133f, 0.156519f, -0.594644f, 0.137295f, + 0.613350f, -0.165102f, -1.003731f, 0.043070f, -0.887896f, -0.174202f, +}; + +static const float vp9_part_split_nn_bias_64_layer0[NODES] = { + 1.182714f, 0.000000f, 0.902019f, 0.953115f, + -1.372486f, -1.288740f, -0.155144f, -3.041362f, +}; + +static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = { + 0.841214f, 0.456016f, 0.869270f, 1.692999f, + -1.700494f, -0.911761f, 0.030111f, -1.447548f, +}; + +static const float vp9_part_split_nn_bias_64_layer1[LABELS] = { + 1.17782545f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_64_layer0, + vp9_part_split_nn_weights_64_layer1, + }, + { + vp9_part_split_nn_bias_64_layer0, + vp9_part_split_nn_bias_64_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = { + -0.105488f, -0.218662f, 0.010980f, -0.226979f, 0.028076f, 0.743430f, + 0.789266f, 0.031907f, -1.464200f, 0.222336f, -1.068493f, -0.052712f, + -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f, + 0.271346f, 0.133005f, 1.674203f, 0.689567f, 0.657133f, 0.283524f, + 0.115529f, 0.738327f, 0.317184f, -0.179736f, 0.403691f, 0.679350f, + 0.048925f, 0.271338f, -1.538921f, -0.900737f, -1.377845f, 0.084245f, + 0.803122f, -0.107806f, 0.103045f, -0.023335f, -0.098116f, -0.127809f, + 0.037665f, -0.523225f, 1.622185f, 1.903999f, 1.358889f, 1.680785f, + 0.027743f, 0.117906f, -0.158810f, 0.057775f, 0.168257f, 0.062414f, + 0.086228f, -0.087381f, -3.066082f, 3.021855f, -4.092155f, 2.550104f, + -0.230022f, -0.207445f, -0.000347f, 0.034042f, 0.097057f, 0.220088f, + -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f, 2.643355f, + 0.319912f, 0.585531f, -1.018225f, -0.699606f, 1.026490f, 0.169952f, + -0.093579f, -0.142352f, -0.107256f, 0.059598f, 0.043190f, 0.507543f, + -0.138617f, 0.030197f, 0.059574f, -0.634051f, -0.586724f, -0.148020f, + -0.334380f, 0.459547f, 1.620600f, 0.496850f, 0.639480f, -0.465715f, +}; + +static const float vp9_part_split_nn_bias_32_layer0[NODES] = { + -1.125885f, 0.753197f, -0.825808f, 0.004839f, + 0.583920f, 0.718062f, 0.976741f, 0.796188f, +}; + +static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = { + -0.458745f, 0.724624f, -0.479720f, -2.199872f, + 1.162661f, 1.194153f, -0.716896f, 0.824080f, +}; + +static const float vp9_part_split_nn_bias_32_layer1[LABELS] = { + 0.71644074f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_32_layer0, + vp9_part_split_nn_weights_32_layer1, + }, + { + vp9_part_split_nn_bias_32_layer0, + vp9_part_split_nn_bias_32_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = { + -0.003629f, -0.046852f, 0.220428f, -0.033042f, 0.049365f, 0.112818f, + -0.306149f, -0.005872f, 1.066947f, -2.290226f, 2.159505f, -0.618714f, + -0.213294f, 0.451372f, -0.199459f, 0.223730f, -0.321709f, 0.063364f, + 0.148704f, -0.293371f, 0.077225f, -0.421947f, -0.515543f, -0.240975f, + -0.418516f, 1.036523f, -0.009165f, 0.032484f, 1.086549f, 0.220322f, + -0.247585f, -0.221232f, -0.225050f, 0.993051f, 0.285907f, 1.308846f, + 0.707456f, 0.335152f, 0.234556f, 0.264590f, -0.078033f, 0.542226f, + 0.057777f, 0.163471f, 0.039245f, -0.725960f, 0.963780f, -0.972001f, + 0.252237f, -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f, + -0.621108f, 0.486405f, -0.221923f, 1.519426f, -0.857871f, 0.411595f, + 0.947188f, 0.203339f, 0.174526f, 0.016382f, 0.256879f, 0.049818f, + 0.057836f, -0.659096f, 0.459894f, 0.174695f, 0.379359f, 0.062530f, + -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f, + -0.109342f, 0.002455f, -0.078746f, -0.391871f, 0.149892f, -0.239615f, + -0.520709f, 0.118568f, -0.437975f, 0.118116f, -0.565426f, -0.206446f, + 0.113407f, 0.558894f, 0.534627f, 1.154350f, -0.116833f, 1.723311f, +}; + +static const float vp9_part_split_nn_bias_16_layer0[NODES] = { + 0.013109f, -0.034341f, 0.679845f, -0.035781f, + -0.104183f, 0.098055f, -0.041130f, 0.160107f, +}; + +static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = { + 1.499564f, -0.403259f, 1.366532f, -0.469868f, + 0.482227f, -2.076697f, 0.527691f, 0.540495f, +}; + +static const float vp9_part_split_nn_bias_16_layer1[LABELS] = { + 0.01134653f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_16_layer0, + vp9_part_split_nn_weights_16_layer1, + }, + { + vp9_part_split_nn_bias_16_layer0, + vp9_part_split_nn_bias_16_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = { + -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f, + -0.589145f, 0.203704f, -0.051007f, -0.113769f, -0.477511f, -0.122603f, + -1.329890f, 1.403386f, 0.199636f, -0.161139f, 2.182090f, -0.014307f, + 0.015755f, -0.208468f, 0.884353f, 0.815920f, 0.632464f, 0.838225f, + 1.369483f, -0.029068f, 0.570213f, -0.573546f, 0.029617f, 0.562054f, + -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f, + 0.173047f, -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f, + -0.406704f, 0.411230f, -0.144023f, 0.745651f, -0.595091f, 0.111787f, + 0.840651f, 0.030123f, -0.242155f, 0.101486f, -0.017889f, -0.254467f, + -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f, + 1.623949f, -0.286369f, 0.170976f, 0.016442f, -0.598353f, -0.038540f, + 0.202597f, -0.933582f, 0.599510f, 0.362273f, 0.577722f, 0.477603f, + 0.767097f, 0.431532f, 0.457034f, 0.223279f, 0.381349f, 0.033777f, + 0.423923f, -0.664762f, 0.385662f, 0.075744f, 0.182681f, 0.024118f, + 0.319408f, -0.528864f, 0.976537f, -0.305971f, -0.189380f, -0.241689f, + -1.318092f, 0.088647f, -0.109030f, -0.945654f, 1.082797f, 0.184564f, +}; + +static const float vp9_part_split_nn_bias_8_layer0[NODES] = { + -0.237472f, 2.051396f, 0.297062f, -0.730194f, + 0.060472f, -0.565959f, 0.560869f, -0.395448f, +}; + +static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = { + 0.568121f, 1.575915f, -0.544309f, 0.751595f, + -0.117911f, -1.340730f, -0.739671f, 0.661216f, +}; + +static const float vp9_part_split_nn_bias_8_layer1[LABELS] = { + -0.63375306f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_8_layer0, + vp9_part_split_nn_weights_8_layer1, + }, + { + vp9_part_split_nn_bias_8_layer0, + vp9_part_split_nn_bias_8_layer1, + }, +}; +#undef NODES +#undef FEATURES +#undef LABELS + +// Partition pruning model(linear). +static const float vp9_partition_feature_mean[24] = { + 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, + 689.413511f, 162.027012f, 1.478213f, 0.0, + 135382.260230f, 912738.513263f, 28.845217f, 1.515230f, + 544.158492f, 131.807995f, 1.436863f, 0.0f, + 43682.377587f, 208131.711766f, 28.084737f, 1.356677f, + 138.254122f, 119.522553f, 1.252322f, 0.0f, +}; + +static const float vp9_partition_feature_std[24] = { + 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f, + 985.880847f, 0.014638f, 2.001898f, 0.0f, + 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f, + 396.986910f, 0.015657f, 1.332541f, 0.0f, + 55888.847031f, 448587.962714f, 0.017900f, 1.904776f, + 98.652832f, 0.016598f, 1.320992f, 0.0f, +}; + +// Error tolerance: 0.01%-0.0.05%-0.1% +static const float vp9_partition_linear_weights[24] = { + 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f, + 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f, + 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f, + 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c new file mode 100644 index 0000000000..3a620df693 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_scale_rtcd.h" +#include "vpx_dsp/psnr.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_quant_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vp9/encoder/vp9_quantize.h" + +static unsigned int get_section_intra_rating(const VP9_COMP *cpi) { + unsigned int section_intra_rating; + + section_intra_rating = (cpi->common.frame_type == KEY_FRAME) + ? cpi->twopass.key_frame_section_intra_rating + : cpi->twopass.section_intra_rating; + + return section_intra_rating; +} + +static int get_max_filter_level(const VP9_COMP *cpi) { + if (cpi->oxcf.pass == 2) { + unsigned int section_intra_rating = get_section_intra_rating(cpi); + return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + VP9_COMP *const cpi, int filt_level, + int partial_frame) { + VP9_COMMON *const cm = &cpi->common; + int64_t filt_err; + + vp9_build_mask_frame(cm, filt_level, partial_frame); + + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, + filt_level, 1, partial_frame, cpi->workers, + cpi->num_workers, &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + 1, partial_frame); + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show); + } else { + filt_err = vpx_get_y_sse(sd, cm->frame_to_show); + } +#else + filt_err = vpx_get_y_sse(sd, cm->frame_to_show); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Re-instate the unfiltered frame + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, + int partial_frame) { + const VP9_COMMON *const cm = &cpi->common; + const struct loopfilter *const lf = &cm->lf; + const int min_filter_level = 0; + const int max_filter_level = get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int filt_mid = clamp(lf->last_filt_level, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + unsigned int section_intra_rating = get_section_intra_rating(cpi); + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + + // Make a copy of the unfiltered / processed recon buffer + vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > 0) { + const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. + int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20)) + bias = (bias * section_intra_rating) / 20; + + // yx, bias less for large block size + if (cm->tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); + } + // If value is close to the best so far then bias towards a lower loop + // filter value. + if ((ss_err[filt_low] - bias) < best_err) { + // Was it actually better than the previous best? + if (ss_err[filt_low] < best_err) best_err = ss_err[filt_low]; + + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); + } + // Was it better than the previous best? + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Half the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + return filt_best; +} + +void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, + LPF_PICK_METHOD method) { + VP9_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; + + lf->sharpness_level = 0; + + if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { + lf->filter_level = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = get_max_filter_level(cpi); + const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth); +// These values were determined by linear fitting the result of the +// searched level, filt_guess = q * 0.316206 + 3.87252 +#if CONFIG_VP9_HIGHBITDEPTH + int filt_guess; + switch (cm->bit_depth) { + case VPX_BITS_8: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + break; + case VPX_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + default: + assert(cm->bit_depth == VPX_BITS_12); + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + } +#else + int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); +#endif // CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) + filt_guess = 5 * filt_guess >> 3; + + if (cm->frame_type == KEY_FRAME) filt_guess -= 4; + lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); + } else { + lf->filter_level = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h new file mode 100644 index 0000000000..8881b44daa --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_ +#define VPX_VP9_ENCODER_VP9_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/encoder/vp9_encoder.h" + +struct yv12_buffer_config; +struct VP9_COMP; + +void vp9_pick_filter_level(const struct yv12_buffer_config *sd, + struct VP9_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c new file mode 100644 index 0000000000..6f2524b36e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c @@ -0,0 +1,2992 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_codec.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_pickmode.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" + +typedef struct { + uint8_t *data; + int stride; + int in_use; +} PRED_BUFFER; + +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_SIZE best_intra_tx_size; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + INTERP_FILTER best_pred_filter; +} BEST_PICKMODE; + +static const int pos_shift_16x16[4][4] = { + { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } +}; + +static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int_mv *base_mv, int mi_row, + int mi_col, int use_base_mv) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + + const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type]; + + int different_ref_found = 0; + int context_counter = 0; + int const_motion = 0; + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate_mi->mode]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1), + refmv_count, mv_ref_list, Done); + } + } + + const_motion = 1; + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found && !refmv_count) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + if (use_base_mv && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + ref_frame == LAST_FRAME) { + // Get base layer mv. + MV_REF *candidate = + &cm->prev_frame + ->mvs[(mi_col >> 1) + (mi_row >> 1) * (cm->mi_cols >> 1)]; + if (candidate->mv[0].as_int != INVALID_MV) { + base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2); + base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2); + clamp_mv_ref(&base_mv->as_mv, xd); + } else { + base_mv->as_int = INVALID_MV; + } + } + +Done: + + x->mbmi_ext->mode_context[ref_frame] = counter_to_context[context_counter]; + + // Clamp vectors + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return const_motion; +} + +static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv, int *rate_mv, + int64_t best_rd_sofar, int use_base_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; + const int step_param = cpi->sf.mv.fullpel_search_step_param; + const int sadpb = x->sadperbit16; + MV mvp_full; + const int ref = mi->ref_frame[0]; + const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + MV center_mv; + uint32_t dis; + int rate_mode; + const MvLimits tmp_mv_limits = x->mv_limits; + int rv = 0; + int cost_list[5]; + int search_subpel = 1; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + vp9_get_scaled_ref_frame(cpi, ref); + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + + // Limit motion vector for large lightning change. + if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) { + x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10); + x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10); + x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10); + x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10); + } + + assert(x->mv_best_ref_index[ref] <= 2); + if (x->mv_best_ref_index[ref] < 2) + mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; + else + mvp_full = x->pred_mv[ref]; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + if (!use_base_mv) + center_mv = ref_mv; + else + center_mv = tmp_mv->as_mv; + + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + } + + x->mv_limits = tmp_mv_limits; + + // calculate the bit cost on motion vector + mvp_full.row = tmp_mv->as_mv.row * 8; + mvp_full.col = tmp_mv->as_mv.col * 8; + + *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv, x->nmvjointcost, x->mvcost, + MV_COST_WEIGHT); + + rate_mode = + cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref]][INTER_OFFSET(NEWMV)]; + rv = + !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); + + // For SVC on non-reference frame, avoid subpel for (0, 0) motion. + if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } + cpi->find_fractional_mv_step( + x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + } + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return rv; +} + +static void block_variance(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, int block_size, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, vpx_bit_depth_t bd, +#endif + uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { + int i, j, k = 0; + uint32_t k_sqr = 0; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbitdepth) { + switch (bd) { + case VPX_BITS_8: + vpx_highbd_8_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, + &sse8x8[k], &sum8x8[k]); + break; + case VPX_BITS_10: + vpx_highbd_10_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, + &sse8x8[k], &sum8x8[k]); + break; + case VPX_BITS_12: + vpx_highbd_12_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, + &sse8x8[k], &sum8x8[k]); + break; + } + } else { + vpx_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse8x8[k], + &sum8x8[k]); + } +#else + vpx_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse8x8[k], + &sum8x8[k]); +#endif + *sse += sse8x8[k]; + *sum += sum8x8[k]; + k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k]; + k++; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int i, j, k = 0; + uint32_t k_sqr = 0; + + for (i = 0; i < nh; i += 2) { + for (j = 0; j < nw; j += 2) { + sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] + + sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; + sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; + k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k]; + k++; + } + } +} + +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(const int speed, const int width, const int height, + const int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + +static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCKD *const xd, unsigned int var, + unsigned int sse, int64_t ac_thr, + unsigned int source_variance, int is_intra) { + // TODO(marpan): Tune selection for intra-modes, screen content, etc. + TX_SIZE tx_size; + unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1; + int limit_tx = 1; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + (source_variance == 0 || var < var_thresh)) + limit_tx = 0; + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16 && limit_tx) + tx_size = TX_16X16; + // For screen-content force 4X4 tx_size over 8X8, for large variance. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 && + bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr)) + tx_size = TX_4X4; + } else { + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + return tx_size; +} + +static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int row, col; + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, + x->skip_encode ? p->src.buf : pd->dst.buf, + x->skip_encode ? src_stride : dst_stride, + pd->dst.buf, dst_stride, col, row, 0); + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; +} + +static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + unsigned int *var_y, unsigned int *sse_y, + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + int rate; + int64_t dist; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; + unsigned int var; + int sum; + int skip_dc = 0; + + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + const int num8x8 = 1 << (bw + bh - 2); + unsigned int sse8x8[64] = { 0 }; + int sum8x8[64] = { 0 }; + unsigned int var8x8[64] = { 0 }; + TX_SIZE tx_size; + int i, k; + uint32_t sum_sqr; +#if CONFIG_VP9_HIGHBITDEPTH + const vpx_bit_depth_t bd = cpi->common.bit_depth; +#endif + // Calculate variance for whole partition, and also save 8x8 blocks' variance + // to be used in following transform skipping test. + block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + 4 << bw, 4 << bh, &sse, &sum, 8, +#if CONFIG_VP9_HIGHBITDEPTH + cpi->common.use_highbitdepth, bd, +#endif + sse8x8, sum8x8, var8x8); + sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4); + var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse; + + *var_y = var; + *sse_y = sse; + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + (abs(sum) >> (bw + bh)), + cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#endif + + tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, 0); + // The code below for setting skip flag assumes tranform size of at least 8x8, + // so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; + xd->mi[0]->tx_size = tx_size; + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source && + x->source_variance == 0) + dc_thr = dc_thr << 1; + + // Evaluate if the partition block is a skippable block in Y plane. + { + unsigned int sse16x16[16] = { 0 }; + int sum16x16[16] = { 0 }; + unsigned int var16x16[16] = { 0 }; + const int num16x16 = num8x8 >> 2; + + unsigned int sse32x32[4] = { 0 }; + int sum32x32[4] = { 0 }; + unsigned int var32x32[4] = { 0 }; + const int num32x32 = num8x8 >> 4; + + int ac_test = 1; + int dc_test = 1; + const int num = (tx_size == TX_8X8) + ? num8x8 + : ((tx_size == TX_16X16) ? num16x16 : num32x32); + const unsigned int *sse_tx = + (tx_size == TX_8X8) ? sse8x8 + : ((tx_size == TX_16X16) ? sse16x16 : sse32x32); + const unsigned int *var_tx = + (tx_size == TX_8X8) ? var8x8 + : ((tx_size == TX_16X16) ? var16x16 : var32x32); + + // Calculate variance if tx_size > TX_8X8 + if (tx_size >= TX_16X16) + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + if (tx_size == TX_32X32) + calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32, + sse32x32, sum32x32); + + // Skipping test + x->skip_txfm[0] = SKIP_TXFM_NONE; + for (k = 0; k < num; k++) + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + + for (k = 0; k < num; k++) + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + + if (ac_test) { + x->skip_txfm[0] = SKIP_TXFM_AC_ONLY; + + if (dc_test) x->skip_txfm[0] = SKIP_TXFM_AC_DC; + } else if (dc_test) { + skip_dc = 1; + } + } + + if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) { + int skip_uv[2] = { 0 }; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + + *out_rate_sum = 0; + *out_dist_sum = sse << 4; + + // Transform skipping test in UV planes. + for (i = 1; i <= 2; i++) { + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); + int j = i - 1; + + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); + + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } + + // If the transform in YUV planes are skippable, the mode search checks + // fewer inter modes and doesn't check intra modes. + if (skip_uv[0] & skip_uv[1]) { + *early_term = 1; + } + return; + } + + if (!skip_dc) { +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + if (!skip_dc) { + *out_rate_sum = rate >> 1; + *out_dist_sum = dist << 3; + } else { + *out_rate_sum = 0; + *out_dist_sum = (sse - var) << 4; + } + +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], + ac_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3, + &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *out_rate_sum += rate; + *out_dist_sum += dist << 4; +} + +static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, unsigned int *var_y, + unsigned int *sse_y, int is_intra) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + int rate; + int64_t dist; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int64_t dc_thr = p->quant_thred[0] >> 6; + const int64_t ac_thr = p->quant_thred[1] >> 6; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + int skip_dc = 0; + + *var_y = var; + *sse_y = sse; + + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, is_intra); + + // Evaluate if the partition block is a skippable block in Y plane. + { + const BLOCK_SIZE unit_size = txsize_to_bsize[xd->mi[0]->tx_size]; + const unsigned int num_blk_log2 = + (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) + + (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]); + const unsigned int sse_tx = sse >> num_blk_log2; + const unsigned int var_tx = var >> num_blk_log2; + + x->skip_txfm[0] = SKIP_TXFM_NONE; + // Check if all ac coefficients can be quantized to zero. + if (var_tx < ac_thr || var == 0) { + x->skip_txfm[0] = SKIP_TXFM_AC_ONLY; + // Check if dc coefficient can be quantized to zero. + if (sse_tx - var_tx < dc_thr || sse == var) + x->skip_txfm[0] = SKIP_TXFM_AC_DC; + } else { + if (sse_tx - var_tx < dc_thr || sse == var) skip_dc = 1; + } + } + + if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) { + *out_rate_sum = 0; + *out_dist_sum = sse << 4; + return; + } + + if (!skip_dc) { +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + if (!skip_dc) { + *out_rate_sum = rate >> 1; + *out_dist_sum = dist << 3; + } else { + *out_rate_sum = 0; + *out_dist_sum = (sse - var) << 4; + } + +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], + ac_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3, + &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *out_rate_sum += rate; + *out_dist_sum += dist << 4; +} + +static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int *skippable, int64_t *sse, BLOCK_SIZE bsize, + TX_SIZE tx_size, int rd_computed, int is_intra) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + int block = 0, r, c; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y, is_intra); + *sse = INT_MAX; + *skippable = 0; + return; + } + + (void)cpi; + + // The max tx_size passed in is TX_16X16. + assert(tx_size != TX_32X32); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +#endif + *skippable = 1; + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += block_step) { + for (c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = bw; + const int16_t *src_diff; + src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + + switch (tx_size) { + case TX_16X16: + vpx_hadamard_16x16(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + case TX_8X8: + vpx_hadamard_8x8(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; + } + *skippable &= (*eob == 0); + eob_cost += 1; + } + block += step; + } + } + + this_rdc->rate = 0; + if (*sse < INT64_MAX) { + *sse = (*sse << 6) >> 2; + if (*skippable) { + this_rdc->dist = *sse; + return; + } + } + + block = 0; + this_rdc->dist = 0; + for (r = 0; r < max_blocks_high; r += block_step) { + for (c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + + if (*eob == 1) + this_rdc->rate += (int)abs(qcoeff[0]); + else if (*eob > 1) + this_rdc->rate += vpx_satd(qcoeff, step << 4); + + this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2; + } + block += step; + } + } + + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT); +} + +static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_COST *this_rdc, unsigned int *var_y, + unsigned int *sse_y, int start_plane, + int stop_plane) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + int rate; + int64_t dist; + int i; +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t tot_var = *var_y; + uint64_t tot_sse = *sse_y; +#else + uint32_t tot_var = *var_y; + uint32_t tot_sse = *sse_y; +#endif + + this_rdc->rate = 0; + this_rdc->dist = 0; + + for (i = start_plane; i <= stop_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + const BLOCK_SIZE bs = plane_bsize; + unsigned int var; + if (!x->color_sensitivity[i - 1]) continue; + + var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + assert(sse >= var); + tot_var += var; + tot_sse += sse; + +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + + this_rdc->rate += rate >> 1; + this_rdc->dist += dist << 3; + +#if CONFIG_VP9_HIGHBITDEPTH + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], + ac_quant >> (xd->bd - 5), &rate, &dist); +#else + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3, + &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + + this_rdc->rate += rate; + this_rdc->dist += dist << 4; + } + +#if CONFIG_VP9_HIGHBITDEPTH + *var_y = tot_var > UINT32_MAX ? UINT32_MAX : (uint32_t)tot_var; + *sse_y = tot_sse > UINT32_MAX ? UINT32_MAX : (uint32_t)tot_sse; +#else + *var_y = tot_var; + *sse_y = tot_sse; +#endif +} + +static int get_pred_buffer(PRED_BUFFER *p, int len) { + int i; + + for (i = 0; i < len; i++) { + if (!p[i].in_use) { + p[i].in_use = 1; + return i; + } + } + return -1; +} + +static void free_pred_buffer(PRED_BUFFER *p) { + if (p != NULL) p->in_use = 0; +} + +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var = var_y, sse = sse_y; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc. + unsigned int thresh_dc; + int motion_low = 1; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; + if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || + mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) + motion_low = 0; + if (x->encode_breakout > 0 && motion_low == 1) { + // Set a maximum for threshold to avoid big PSNR loss in low bit rate + // case. Use extreme low threshold for static frames to limit + // skipping. + const unsigned int max_thresh = 36000; + // The encode_breakout input + const unsigned int min_thresh = + VPXMIN(((unsigned int)x->encode_breakout << 4), max_thresh); +#if CONFIG_VP9_HIGHBITDEPTH + const int shift = (xd->bd << 1) - 16; +#endif + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3; +#if CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) { + thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust ac threshold according to partition size. + thresh_ac >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); +#if CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) { + thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking for ac and dc. + if (var <= thresh_ac && (sse - var) <= thresh_dc) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + unsigned int thresh_ac_uv = thresh_ac; + unsigned int thresh_dc_uv = thresh_dc; + if (x->sb_is_skin) { + thresh_ac_uv = 0; + thresh_dc_uv = 0; + } + + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { + xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; + xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); + } + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if (((var_u << 2) <= thresh_ac_uv) && (sse_u - var_u <= thresh_dc_uv)) { + var_v = cpi->fn_ptr[uv_size].vf( + x->plane[2].src.buf, x->plane[2].src.stride, xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if (((var_v << 2) <= thresh_ac_uv) && (sse_v - var_v <= thresh_dc_uv)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + + // More on this part of rate + // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency + // domain is 16. Adjust distortion accordingly. + // TODO(yunqingwang): In this function, only y-plane dist is + // calculated. + *dist = (sse << 4); // + ((sse_u + sse_v) << 4); + + // *disable_skip = 1; + } + } + } +} + +struct estimate_block_intra_args { + VP9_COMP *cpi; + MACROBLOCK *x; + PREDICTION_MODE mode; + int skippable; + RD_COST *rdc; +}; + +static void estimate_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct estimate_block_intra_args *const args = arg; + VP9_COMP *const cpi = args->cpi; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + RD_COST this_rdc; + + (void)block; + + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + // Use source buffer as an approximation for the fully reconstructed buffer. + vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size, + args->mode, x->skip_encode ? p->src.buf : pd->dst.buf, + x->skip_encode ? src_stride : dst_stride, pd->dst.buf, + dst_stride, col, row, plane); + + if (plane == 0) { + int64_t this_sse = INT64_MAX; + block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, + VPXMIN(tx_size, TX_16X16), 0, 1); + } else { + unsigned int var = 0; + unsigned int sse = 0; + model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane, + plane); + } + + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + args->rdc->rate += this_rdc.rate; + args->rdc->dist += this_rdc.dist; +} + +static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { + { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM }, + { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV }, + { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG }, + { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA }, +}; + +static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, + TM_PRED }; + +static int mode_offset(const PREDICTION_MODE mode) { + if (mode >= NEARESTMV) { + return INTER_OFFSET(mode); + } else { + switch (mode) { + case DC_PRED: return 0; + case V_PRED: return 1; + case H_PRED: return 2; + case TM_PRED: return 3; + default: return -1; + } + } +} + +static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, + const int *const thresh_fact) { + int is_rd_less_than_thresh; + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + return is_rd_less_than_thresh; +} + +static INLINE void update_thresh_freq_fact_row_mt( + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, + int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx; + int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + +static INLINE void update_thresh_freq_fact( + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, + BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, + PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + +void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + RD_COST this_rdc, best_rdc; + PREDICTION_MODE this_mode; + struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; + const TX_SIZE intra_tx_size = + VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + MODE_INFO *const mic = xd->mi[0]; + int *bmode_costs; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); + bmode_costs = cpi->y_mode_costs[A][L]; + + (void)ctx; + vp9_rd_cost_reset(&best_rdc); + vp9_rd_cost_reset(&this_rdc); + + mi->ref_frame[0] = INTRA_FRAME; + // Initialize interp_filter here so we do not have to check for inter block + // modes in get_pred_context_switchable_interp() + mi->interp_filter = SWITCHABLE_FILTERS; + + mi->mv[0].as_int = INVALID_MV; + mi->uv_mode = DC_PRED; + memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + + // Change the limit of this loop to add other intra prediction + // mode tests. + for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) { + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra, + &args); + if (args.skippable) { + x->skip_txfm[0] = SKIP_TXFM_AC_DC; + this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1); + } else { + x->skip_txfm[0] = SKIP_TXFM_NONE; + this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0); + } + this_rdc.rate += bmode_costs[this_mode]; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + mi->mode = this_mode; + } + } + + *rd_cost = best_rdc; +} + +static void init_ref_frame_cost(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int ref_frame_cost[MAX_REF_FRAMES]) { + vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd); + vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); + vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd); + + ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0); + ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] = + ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1); + + ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0); + ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1); + ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1); + ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0); + ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1); +} + +typedef struct { + MV_REFERENCE_FRAME ref_frame; + PREDICTION_MODE pred_mode; +} REF_MODE; + +#define RT_INTER_MODES 12 +static const REF_MODE ref_mode_set[RT_INTER_MODES] = { + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV }, + { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV }, + { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, + { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } +}; + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, + { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, + { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } +}; + +static INLINE void find_predictors( + VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, + TileDataEnc *tile_data, int mi_row, int mi_col, + struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, + int force_skip_low_temp_var, int comp_pred_allowed) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + TileInfo *const tile_info = &tile_data->tile_info; + // TODO(jingning) placeholder for inter-frame non-RD mode decision. + x->pred_mv_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + // this needs various further optimizations. to be continued.. + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { + int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); + if (cm->use_prev_frame_mvs || comp_pred_allowed) { + vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, + x->mbmi_ext->mode_context); + } else { + const_motion[ref_frame] = + mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, + candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, + (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_mv[NEARESTMV][ref_frame], + &frame_mv[NEARMV][ref_frame]); + // Early exit for golden frame if force_skip_low_temp_var is set. + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) { + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + bsize); + } + } else { + *ref_frame_skip_mask |= (1 << ref_frame); + } +} + +static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, + PREDICTION_MODE this_mode, RD_COST *this_rdc, + BLOCK_SIZE bsize, int mv_row, int mv_col, + int is_last_frame, int lowvar_highsumdiff, + int is_skin) { + // Bias against MVs associated with NEWMV mode that are very different from + // top/left neighbors. + if (this_mode == NEWMV) { + int al_mv_average_row; + int al_mv_average_col; + int left_row, left_col; + int row_diff, col_diff; + int above_mv_valid = 0; + int left_mv_valid = 0; + int above_row = 0; + int above_col = 0; + + if (xd->above_mi) { + above_mv_valid = xd->above_mi->mv[0].as_int != INVALID_MV; + above_row = xd->above_mi->mv[0].as_mv.row; + above_col = xd->above_mi->mv[0].as_mv.col; + } + if (xd->left_mi) { + left_mv_valid = xd->left_mi->mv[0].as_int != INVALID_MV; + left_row = xd->left_mi->mv[0].as_mv.row; + left_col = xd->left_mi->mv[0].as_mv.col; + } + if (above_mv_valid && left_mv_valid) { + al_mv_average_row = (above_row + left_row + 1) >> 1; + al_mv_average_col = (above_col + left_col + 1) >> 1; + } else if (above_mv_valid) { + al_mv_average_row = above_row; + al_mv_average_col = above_col; + } else if (left_mv_valid) { + al_mv_average_row = left_row; + al_mv_average_col = left_col; + } else { + al_mv_average_row = al_mv_average_col = 0; + } + row_diff = (al_mv_average_row - mv_row); + col_diff = (al_mv_average_col - mv_col); + if (row_diff > 48 || row_diff < -48 || col_diff > 48 || col_diff < -48) { + if (bsize > BLOCK_32X32) + this_rdc->rdcost = this_rdc->rdcost << 1; + else + this_rdc->rdcost = 3 * this_rdc->rdcost >> 1; + } + } + // If noise estimation is enabled, and estimated level is above threshold, + // add a bias to LAST reference with small motion, for large blocks. + if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 && + is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); + else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 && + is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 && + mv_col > -16) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +static void vp9_pickmode_ctx_den_update( + VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, + int ref_frame_cost[MAX_REF_FRAMES], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred, + BEST_PICKMODE *bp) { + ctx_den->zero_last_cost_orig = zero_last_cost_orig; + ctx_den->ref_frame_cost = ref_frame_cost; + ctx_den->frame_mv = frame_mv; + ctx_den->reuse_inter_pred = reuse_inter_pred; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; +} + +static void recheck_zeromv_after_denoising( + VP9_COMP *cpi, MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd, + VP9_DENOISER_DECISION decision, VP9_PICKMODE_CTX_DEN *ctx_den, + struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_COST *best_rdc, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on + // denoised result. Only do this under noise conditions, and if rdcost of + // ZEROMV onoriginal source is not significantly higher than rdcost of best + // mode. + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow && + ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && + ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || + (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && + decision == FILTER_ZEROMV_BLOCK))) { + // Check if we should pick ZEROMV on denoised signal. + VP9_COMMON *const cm = &cpi->common; + int rate = 0; + int64_t dist = 0; + uint32_t var_y = UINT_MAX; + uint32_t sse_y = UINT_MAX; + RD_COST this_rdc; + mi->mode = ZEROMV; + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); + mi->mv[0].as_int = 0; + mi->interp_filter = EIGHTTAP; + if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; + xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0); + this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] + + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] + [INTER_OFFSET(ZEROMV)]; + this_rdc.dist = dist; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). + if (this_rdc.rdcost > best_rdc->rdcost) { + this_rdc = *best_rdc; + mi->mode = ctx_den->best_mode; + mi->ref_frame[0] = ctx_den->best_ref_frame; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); + mi->interp_filter = ctx_den->best_pred_filter; + if (ctx_den->best_ref_frame == INTRA_FRAME) { + mi->mv[0].as_int = INVALID_MV; + mi->interp_filter = SWITCHABLE_FILTERS; + } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { + mi->mv[0].as_int = + ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame] + .as_int; + if (ctx_den->reuse_inter_pred) { + xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0]; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } + } + mi->tx_size = ctx_den->best_tx_size; + x->skip_txfm[0] = ctx_den->best_mode_skip_txfm; + } else { + ctx_den->best_ref_frame = LAST_FRAME; + *best_rdc = this_rdc; + } + } +} +#endif // CONFIG_VP9_TEMPORAL_DENOISING + +static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const int i = (mi_row & 0x7) >> 1; + const int j = (mi_col & 0x7) >> 1; + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + if (bsize == BLOCK_64X64) { + force_skip_low_temp_var = variance_low[0]; + } else if (bsize == BLOCK_64X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[1]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[2]; + } + } else if (bsize == BLOCK_32X64) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[3]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[4]; + } + } else if (bsize == BLOCK_32X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[5]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[6]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[7]; + } else if ((mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[8]; + } + } else if (bsize == BLOCK_16X16) { + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]]; + } else if (bsize == BLOCK_32X16) { + // The col shift index for the second 16x16 block. + const int j2 = ((mi_col + 2) & 0x7) >> 1; + // Only if each 16x16 block inside has low temporal variance. + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] && + variance_low[pos_shift_16x16[i][j2]]; + } else if (bsize == BLOCK_16X32) { + // The row shift index for the second 16x16 block. + const int i2 = ((mi_row + 2) & 0x7) >> 1; + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] && + variance_low[pos_shift_16x16[i2][j]]; + } + return force_skip_low_temp_var; +} + +static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int mi_row, int mi_col, PRED_BUFFER *tmp, + BLOCK_SIZE bsize, int reuse_inter_pred, + PRED_BUFFER **this_mode_pred, unsigned int *var_y, + unsigned int *sse_y, int force_smooth_filter, + int *this_early_term, int *flag_preduv_computed, + int use_model_yrd_large) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + + int pf_rate[3] = { 0 }; + int64_t pf_dist[3] = { 0 }; + int curr_rate[3] = { 0 }; + unsigned int pf_var[3] = { 0 }; + unsigned int pf_sse[3] = { 0 }; + TX_SIZE pf_tx_size[3] = { 0 }; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = *this_mode_pred; + uint8_t skip_txfm = SKIP_TXFM_NONE; + int best_early_term = 0; + int best_flag_preduv_computed[2] = { 0 }; + INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP; + INTERP_FILTER filter_end = EIGHTTAP_SMOOTH; + for (filter = filter_start; filter <= filter_end; ++filter) { + int64_t cost; + mi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + // For large partition blocks, extra testing is done. + if (use_model_yrd_large) + model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], + &pf_sse[filter], mi_row, mi_col, this_early_term, + flag_preduv_computed); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], + &pf_var[filter], &pf_sse[filter], 0); + curr_rate[filter] = pf_rate[filter]; + pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); + cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); + pf_tx_size[filter] = mi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + best_early_term = *this_early_term; + best_flag_preduv_computed[0] = flag_preduv_computed[0]; + best_flag_preduv_computed[1] = flag_preduv_computed[1]; + + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + if (filter != filter_end) { + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + } + + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filter = best_filter; + mi->tx_size = pf_tx_size[best_filter]; + this_rdc->rate = curr_rate[best_filter]; + this_rdc->dist = pf_dist[best_filter]; + *var_y = pf_var[best_filter]; + *sse_y = pf_sse[best_filter]; + x->skip_txfm[0] = skip_txfm; + *this_early_term = best_early_term; + flag_preduv_computed[0] = best_flag_preduv_computed[0]; + flag_preduv_computed[1] = best_flag_preduv_computed[1]; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter < filter_end) { + mi->interp_filter = best_filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } +} + +static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][MAX_REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, + unsigned int best_sse_sofar, RD_COST *best_rdc) { + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + SPEED_FEATURES *const sf = &cpi->sf; + + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == VPX_CBR) { + int tmp_sad; + uint32_t dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + cpi->find_fractional_mv_step( + x, &frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + } else if (svc->use_base_mv && svc->spatial_layer_id) { + if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar) + return -1; + if ((base_mv_sse >> 1) < best_sse_sofar) { + // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 1)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_mode = ZEROMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_tx_size = TX_SIZES; + bp->best_intra_tx_size = TX_SIZES; + bp->best_pred_filter = EIGHTTAP; + bp->best_mode_skip_txfm = SKIP_TXFM_NONE; + bp->best_second_ref_frame = NO_REF_FRAME; + bp->best_pred = NULL; +} + +void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + + BEST_PICKMODE best_pickmode; + + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + RD_COST this_rdc, best_rdc; + // var_y and sse_y are saved to be used in skipping checking + unsigned int var_y = UINT_MAX; + unsigned int sse_y = UINT_MAX; + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); + int64_t inter_mode_thresh = + RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); + const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES; + const int *const rd_thresh_freq_fact = + (cpi->sf.adaptive_rd_thresh_row_mt) + ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) + : tile_data->thresh_freq_fact[bsize]; +#if CONFIG_VP9_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; +#endif + INTERP_FILTER filter_ref; + int pred_filter_search = cm->interp_filter == SWITCHABLE; + int const_motion[MAX_REF_FRAMES] = { 0 }; + const int bh = num_4x4_blocks_high_lookup[bsize] << 2; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + // For speed 6, the result of interp filter is reused later in actual encoding + // process. + // tmp[3] points to dst buffer, and the other 3 point to allocated buffers. + PRED_BUFFER tmp[4]; + DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64] VPX_UNINITIALIZED); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64] VPX_UNINITIALIZED); +#endif + struct buf_2d orig_dst = pd->dst; + PRED_BUFFER *this_mode_pred = NULL; + const int pixels_in_block = bh * bw; + int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready; + int ref_frame_skip_mask = 0; + int idx; + int best_pred_sad = INT_MAX; + int best_early_term = 0; + int ref_frame_cost[MAX_REF_FRAMES]; + int svc_force_zero_mode[3] = { 0 }; + int perform_intra_pred = 1; + int use_golden_nonzeromv = 1; + int force_skip_low_temp_var = 0; + int skip_ref_find_pred[4] = { 0 }; + unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; + int gf_temporal_ref = 0; + int force_test_gf_zeromv = 0; +#if CONFIG_VP9_TEMPORAL_DENOISING + VP9_PICKMODE_CTX_DEN ctx_den; + int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; +#endif + INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; + const struct segmentation *const seg = &cm->seg; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; + int no_scaling = 0; + int large_block = 0; + int use_model_yrd_large = 0; + unsigned int thresh_svc_skip_golden = 500; + unsigned int thresh_skip_golden = 500; + int force_smooth_filter = cpi->sf.force_smooth_interpol; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + + init_best_pickmode(&best_pickmode); + + x->encode_breakout = seg->enabled + ? cpi->segment_encode_breakout[mi->segment_id] + : cpi->encode_breakout; + + x->source_variance = UINT_MAX; + if (cpi->sf.default_interp_filter == BILINEAR) { + best_pickmode.best_pred_filter = BILINEAR; + filter_gf_svc = BILINEAR; + } + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) + thresh_svc_skip_golden = 0; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; + + if (!cpi->use_svc || + (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf; + struct scale_factors *const sf_golden = + &cm->frame_refs[GOLDEN_FRAME - 1].sf; + gf_temporal_ref = 1; + // For temporal long term prediction, check that the golden reference + // is same scale as last reference, otherwise disable. + if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) || + (sf_last->y_scale_fp != sf_golden->y_scale_fp)) { + gf_temporal_ref = 0; + } else { + if (cpi->rc.avg_frame_low_motion > 70) + thresh_svc_skip_golden = 500; + else + thresh_svc_skip_golden = 0; + } + } + + init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); + + if (reuse_inter_pred) { + int i; + for (i = 0; i < 3; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]); + else + tmp[i].data = &pred_buf[pixels_in_block * i]; +#else + tmp[i].data = &pred_buf[pixels_in_block * i]; +#endif // CONFIG_VP9_HIGHBITDEPTH + tmp[i].stride = bw; + tmp[i].in_use = 0; + } + tmp[3].data = pd->dst.buf; + tmp[3].stride = pd->dst.stride; + tmp[3].in_use = 0; + } + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + x->skip = 0; + + if (cpi->sf.cb_pred_filter_search) { + const int bsl = mi_width_log2_lookup[bsize]; + pred_filter_search = cm->interp_filter == SWITCHABLE + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1 + : 0; + } + // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign + // filter_ref, we use a less strict condition on assigning filter_ref. + // This is to reduce the probabily of entering the flow of not assigning + // filter_ref and then skip filter search. + filter_ref = cm->interp_filter; + if (cpi->sf.default_interp_filter != BILINEAR) { + if (xd->above_mi && is_inter_block(xd->above_mi)) + filter_ref = xd->above_mi->interp_filter; + else if (xd->left_mi && is_inter_block(xd->left_mi)) + filter_ref = xd->left_mi->interp_filter; + } + + // initialize mode decisions + vp9_rd_cost_reset(&best_rdc); + vp9_rd_cost_reset(rd_cost); + mi->sb_type = bsize; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + + mi->tx_size = + VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); + + if (sf->short_circuit_flat_blocks || sf->limit_newmv_early_exit) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + x->source_variance = vp9_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + else +#endif // CONFIG_VP9_HIGHBITDEPTH + x->source_variance = + vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 && + x->zero_temp_sad_source && x->source_variance == 0) { + mi->segment_id = 0; + vp9_init_plane_quantizers(cpi, x); + } + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi); + if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) + vp9_denoiser_reset_frame_stats(ctx); + } +#endif + + if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { + usable_ref_frame = LAST_FRAME; + } else { + usable_ref_frame = GOLDEN_FRAME; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref) + usable_ref_frame = ALTREF_FRAME; + + if (cpi->rc.is_src_frame_alt_ref) { + skip_ref_find_pred[LAST_FRAME] = 1; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } + } + + // For svc mode, on spatial_layer_id > 0: if the reference has different scale + // constrain the inter mode to only test zero motion. + if (cpi->use_svc && svc->force_zero_mode_spatial_ref && + svc->spatial_layer_id > 0 && !gf_temporal_ref) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { + svc_force_zero_mode[LAST_FRAME - 1] = 1; + inter_layer_ref = LAST_FRAME; + } + } + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + inter_layer_ref = GOLDEN_FRAME; + } + } + } + + if (cpi->sf.short_circuit_low_temp_var) { + force_skip_low_temp_var = + get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3, + // skip golden reference. + if ((cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) && + force_skip_low_temp_var) { + usable_ref_frame = LAST_FRAME; + } + } + + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && + !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) + use_golden_nonzeromv = 0; + + if (cpi->oxcf.speed >= 8 && !cpi->use_svc && + ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) + usable_ref_frame = LAST_FRAME; + + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); + if (!skip_ref_find_pred[ref_frame]) { + find_predictors(cpi, x, ref_frame, frame_mv, const_motion, + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); + } + } + + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + !gf_temporal_ref) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + // For SVC with quality layers, when QP of lower layer is lower + // than current layer: force check of GF-ZEROMV before early exit + // due to skip flag. + if (svc->spatial_layer_id > 0 && no_scaling && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && + cm->base_qindex > svc->lower_layer_qindex + 10) + force_test_gf_zeromv = 1; + + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. + large_block = (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; + use_model_yrd_large = + cpi->oxcf.rc_mode == VPX_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->base_qindex; + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { + int rate_mv = 0; + int mode_rd_thresh; + int mode_index; + int i; + int64_t this_sse; + int is_skippable; + int this_early_term = 0; + int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_mv_inter_layer = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NO_REF_FRAME; + + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes. + this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; + } + + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; + + if (svc->previous_frame_is_intra_only) { + if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). + if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } + + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; + } + + // For CBR mode: skip the golden reference search if sse of zeromv_last is + // below threshold. + if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) || + (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) + continue; + + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; + + // For screen content. If zero_temp_sad source is computed: skip + // non-zero motion check for stationary blocks. If the superblock is + // non-stationary then for flat blocks skip the zero last check (keep golden + // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source + // is not computed) skip non-zero motion check for flat blocks. + // TODO(marpan): Compute zero_temp_sad_source per coding block. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + if ((frame_mv[this_mode][ref_frame].as_int != 0 && + x->zero_temp_sad_source) || + (frame_mv[this_mode][ref_frame].as_int == 0 && + x->source_variance == 0 && ref_frame == LAST_FRAME && + !x->zero_temp_sad_source)) + continue; + } else if (frame_mv[this_mode][ref_frame].as_int != 0 && + x->source_variance == 0) { + continue; + } + } + + if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.is_src_frame_alt_ref && + (ref_frame != ALTREF_FRAME || + frame_mv[this_mode][ref_frame].as_int != 0)) + continue; + + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && + cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + if (const_motion[ref_frame] && this_mode == NEARMV) continue; + + // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && + force_skip_low_temp_var && ref_frame == LAST_FRAME && + this_mode == NEWMV) { + continue; + } + + if (cpi->use_svc) { + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); + } + } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; + } + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; + + // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding + // speed with little/no subjective quality loss. + if (cpi->sf.bias_golden && ref_frame == GOLDEN_FRAME && + cpi->rc.frames_since_golden > 4) + mode_rd_thresh = mode_rd_thresh << 3; + + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; + + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar, + &best_rdc)) + continue; + } + + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no + // need to compute best_pred_sad which is only used to skip golden NEWMV. + if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && + frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3); + best_pred_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); + x->pred_mv_sad[LAST_FRAME] = best_pred_sad; + } + + if (this_mode != NEARESTMV && !comp_pred && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[NEARESTMV][ref_frame].as_int) + continue; + + mi->mode = this_mode; + mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; + + // Search for the best prediction filter type, when the resulting + // motion vector is at sub-pixel accuracy level for luma component, i.e., + // the last three bits are all zeros. + if (reuse_inter_pred) { + if (!this_mode_pred) { + this_mode_pred = &tmp[3]; + } else { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + } + } + + if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && + pred_filter_search && + (ref_frame == LAST_FRAME || + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && + (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && + (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { + rd_computed = 1; + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + force_smooth_filter, &this_early_term, + flag_preduv_computed, use_model_yrd_large); + } else { + mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + svc_force_zero_mode[ref_frame - 1]) + mi->interp_filter = filter_gf_svc; + + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + + // For large partition blocks, extra testing is done. + if (use_model_yrd_large) { + rd_computed = 1; + model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, + &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, + &this_early_term, flag_preduv_computed); + } else { + rd_computed = 1; + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 0); + } + // Save normalized sse (between current and last frame) for (0, 0) motion. + if (ref_frame == LAST_FRAME && + frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_normalized = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; + } + + if (!this_early_term) { + this_sse = (int64_t)sse_y; + block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, + VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0); + x->skip_txfm[0] = is_skippable; + if (is_skippable) { + this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + } else { + if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) < + RDCOST(x->rdmult, x->rddiv, 0, this_sse)) { + this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); + } else { + this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + this_rdc.dist = this_sse; + x->skip_txfm[0] = SKIP_TXFM_AC_DC; + } + } + + if (cm->interp_filter == SWITCHABLE) { + if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) + this_rdc.rate += vp9_get_switchable_rate(cpi, xd); + } + } else { + if (cm->interp_filter == SWITCHABLE) { + if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) + this_rdc.rate += vp9_get_switchable_rate(cpi, xd); + } + this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + } + + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { + RD_COST rdc_uv; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } + model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); + this_rdc.rate += rdc_uv.rate; + this_rdc.dist += rdc_uv.dist; + } + + this_rdc.rate += rate_mv; + this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. + this_rdc.rate += ref_frame_cost[ref_frame]; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + + // Bias against NEWMV that is very different from its neighbors, and bias + // to small motion-lastref for noisy input. + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.speed >= 5 && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { + vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize, + frame_mv[this_mode][ref_frame].as_mv.row, + frame_mv[this_mode][ref_frame].as_mv.col, + ref_frame == LAST_FRAME, x->lowvar_highsumdiff, + x->sb_is_skin); + } + + // Skipping checking: test to see if this block can be reconstructed by + // prediction only. + if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected && + !svc->high_num_blocks_with_motion) { + encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, + var_y, sse_y, yv12_mb, &this_rdc.rate, + &this_rdc.dist, flag_preduv_computed); + if (x->skip) { + this_rdc.rate += rate_mv; + this_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + } + } + + // On spatially flat blocks for screne content: bias against zero-last + // if the sse_y is non-zero. Only on scene change or high motion frames. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + (scene_change_detected || svc->high_num_blocks_with_motion) && + ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 && + svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) { + this_rdc.rdcost = this_rdc.rdcost << 2; + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && + cpi->denoiser.denoising_level > kDenLowLow) { + vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); + // Keep track of zero_last cost. + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) + zero_last_cost_orig = this_rdc.rdcost; + } +#else + (void)ctx; +#endif + + mode_checked[this_mode][ref_frame] = 1; + + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { + best_rdc = this_rdc; + best_early_term = this_early_term; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filter; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_second_ref_frame = second_ref_frame; + + if (reuse_inter_pred) { + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; + } + } else { + if (reuse_inter_pred) free_pred_buffer(this_mode_pred); + } + + if (x->skip && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) + break; + + // If early termination flag is 1 and at least 2 modes are checked, + // the mode search is terminated. + if (best_early_term && idx > 0 && !scene_change_detected && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) { + x->skip = 1; + break; + } + } + + mi->mode = best_pickmode.best_mode; + mi->interp_filter = best_pickmode.best_pred_filter; + mi->tx_size = best_pickmode.best_tx_size; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->mv[0].as_int = + frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int; + xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + + // For spatial enhancemanent layer: perform intra prediction only if base + // layer is chosen as the reference. Always perform intra prediction if + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer. + if (svc->spatial_layer_id && !gf_temporal_ref) { + perform_intra_pred = + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); + inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; + } + if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) || + svc->previous_frame_is_intra_only) + perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + + // Perform intra prediction search, if the best SAD is above a certain + // threshold. + if (best_rdc.rdcost == INT64_MAX || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) || + (scene_change_detected && perform_intra_pred) || + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && + perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && + !x->lowvar_highsumdiff)) { + struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; + int64_t this_sse = INT64_MAX; + int i; + PRED_BUFFER *const best_pred = best_pickmode.best_pred; + TX_SIZE intra_tx_size = + VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + + if (reuse_inter_pred && best_pred != NULL) { + if (best_pred->data == orig_dst.buf) { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, 0, 0, 0, bw, bh, xd->bd); + else + vpx_convolve_copy(best_pred->data, best_pred->stride, + this_mode_pred->data, this_mode_pred->stride, NULL, + 0, 0, 0, 0, bw, bh); +#else + vpx_convolve_copy(best_pred->data, best_pred->stride, + this_mode_pred->data, this_mode_pred->stride, NULL, 0, + 0, 0, 0, bw, bh); +#endif // CONFIG_VP9_HIGHBITDEPTH + best_pickmode.best_pred = this_mode_pred; + } + } + pd->dst = orig_dst; + + for (i = 0; i < 4; ++i) { + const PREDICTION_MODE this_mode = intra_mode_list[i]; + THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; + int mode_rd_thresh = rd_threshes[mode_index]; + // For spatially flat blocks, under short_circuit_flat_blocks flag: + // only check DC mode for stationary blocks, otherwise also check + // H and V mode. + if (sf->short_circuit_flat_blocks && x->source_variance == 0 && + ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) { + continue; + } + + if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize])) + continue; + + if (cpi->sf.rt_intra_dc_only_low_content && this_mode != DC_PRED && + x->content_state_sb != kVeryHighSad) + continue; + + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) { + // Avoid this early exit for screen on base layer, for scene + // changes or high motion frames. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN || + svc->spatial_layer_id > 0 || + (!scene_change_detected && !svc->high_num_blocks_with_motion)) + continue; + } + + mi->mode = this_mode; + mi->ref_frame[0] = INTRA_FRAME; + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + + compute_intra_yprediction(this_mode, bsize, x, xd); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 1); + block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize, + VPXMIN(mi->tx_size, TX_16X16), 1, 1); + + // Check skip cost here since skippable is not set for for uv, this + // mirrors the behavior used by inter + if (args.skippable) { + x->skip_txfm[0] = SKIP_TXFM_AC_DC; + this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1); + } else { + x->skip_txfm[0] = SKIP_TXFM_NONE; + this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0); + } + // Inter and intra RD will mismatch in scale for non-screen content. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + if (x->color_sensitivity[0]) + vp9_foreach_transformed_block_in_plane(xd, bsize, 1, + estimate_block_intra, &args); + if (x->color_sensitivity[1]) + vp9_foreach_transformed_block_in_plane(xd, bsize, 2, + estimate_block_intra, &args); + } + this_rdc.rate += cpi->mbmode_cost[this_mode]; + this_rdc.rate += ref_frame_cost[INTRA_FRAME]; + this_rdc.rate += intra_cost_penalty; + this_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; + mi->uv_mode = this_mode; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + } + } + + // Reset mb_mode_info to the best inter mode. + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; + } else { + mi->tx_size = best_pickmode.best_intra_tx_size; + } + } + + pd->dst = orig_dst; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + + if (!is_inter_block(mi)) { + mi->interp_filter = SWITCHABLE_FILTERS; + } + + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; + if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, + bw, bh, xd->bd); + else + vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); +#else + vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 && + denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && + cpi->denoiser.reset == 0) { + VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. + if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; + vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, + frame_mv, reuse_inter_pred, &best_pickmode); + vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); + if (denoise_recheck_zeromv) + recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, + yv12_mb, &best_rdc, bsize, mi_row, mi_col); + best_pickmode.best_ref_frame = ctx_den.best_ref_frame; + } +#endif + + if (best_pickmode.best_ref_frame == ALTREF_FRAME || + best_pickmode.best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_pickmode.best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; + + if (cpi->sf.adaptive_rd_thresh) { + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; + + if (best_pickmode.best_ref_frame == INTRA_FRAME) { + // Only consider the modes that are included in the intra_mode_list. + int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); + int i; + + // TODO(yunqingwang): Check intra mode mask and only update freq_fact + // for those valid modes. + for (i = 0; i < intra_modes; i++) { + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, INTRA_FRAME, + best_mode_idx, intra_mode_list[i]); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + INTRA_FRAME, best_mode_idx, + intra_mode_list[i]); + } + } else { + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + PREDICTION_MODE this_mode; + if (best_pickmode.best_ref_frame != ref_frame) continue; + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, ref_frame, + best_mode_idx, this_mode); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + ref_frame, best_mode_idx, this_mode); + } + } + } + } + + *rd_cost = best_rdc; +} + +void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct segmentation *const seg = &cm->seg; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; + unsigned char segment_id = mi->segment_id; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + int64_t best_rd = INT64_MAX; + b_mode_info bsi[MAX_REF_FRAMES][4]; + int ref_frame_skip_mask = 0; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + + x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + ctx->pred_pixel_ready = 0; + + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + int_mv dummy_mv[2]; + x->pred_mv_sad[ref_frame] = INT_MAX; + + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { + int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); + vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, + mbmi_ext->mode_context); + + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &dummy_mv[0], &dummy_mv[1]); + } else { + ref_frame_skip_mask |= (1 << ref_frame); + } + } + + mi->sb_type = bsize; + mi->tx_size = TX_4X4; + mi->uv_mode = DC_PRED; + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + mi->interp_filter = + cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + int64_t this_rd = 0; + int plane; + + if (ref_frame_skip_mask & (1 << ref_frame)) continue; + +#if CONFIG_BETTER_HW_COMPATIBILITY + if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) && ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + continue; +#endif + + // TODO(jingning, agrange): Scaling reference frame not supported for + // sub8x8 blocks. Is this supported now? + if (ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + continue; + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + mi->ref_frame[0] = ref_frame; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (plane = 0; plane < MAX_MB_PLANE; plane++) + xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane]; + + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + int_mv b_mv[MB_MODE_COUNT]; + int64_t b_best_rd = INT64_MAX; + const int i = idy * 2 + idx; + PREDICTION_MODE this_mode; + RD_COST this_rdc; + unsigned int var_y, sse_y; + + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + + const struct buf_2d orig_src = p->src; + const struct buf_2d orig_dst = pd->dst; + struct buf_2d orig_pre[2]; + memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre)); + + // set buffer pointers for sub8x8 motion search. + p->src.buf = + &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + pd->dst.buf = + &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; + pd->pre[0].buf = + &pd->pre[0] + .buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)]; + + b_mv[ZEROMV].as_int = 0; + b_mv[NEWMV].as_int = INVALID_MV; + vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col, + &b_mv[NEARESTMV], &b_mv[NEARMV], + mbmi_ext->mode_context); + + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + int b_rate = 0; + xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int; + + if (this_mode == NEWMV) { + const int step_param = cpi->sf.mv.fullpel_search_step_param; + MV mvp_full; + MV tmp_mv; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + uint32_t dummy_dist; + + if (i == 0) { + mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3; + mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3; + } else { + mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3; + mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3; + } + + vp9_set_mv_search_range(&x->mv_limits, + &mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + x->sadperbit4, cond_cost_list(cpi, cost_list), + &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0); + + x->mv_limits = tmp_mv_limits; + + // calculate the bit cost on motion vector + mvp_full.row = tmp_mv.row * 8; + mvp_full.col = tmp_mv.col * 8; + + b_rate += vp9_mv_bit_cost( + &mvp_full, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + + b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] + [INTER_OFFSET(NEWMV)]; + if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd) continue; + + cpi->find_fractional_mv_step( + x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + + xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; + } else { + b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, + &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, + 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, + vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i & 0x01), + mi_row * MI_SIZE + 4 * (i >> 1), xd->bd); + } else { +#endif + vp9_build_inter_predictor( + pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, + &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, + 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, + vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i & 0x01), + mi_row * MI_SIZE + 4 * (i >> 1)); + +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 0); + + this_rdc.rate += b_rate; + this_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < b_best_rd) { + b_best_rd = this_rdc.rdcost; + bsi[ref_frame][i].as_mode = this_mode; + bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv; + } + } // mode search + + // restore source and prediction buffer pointers. + p->src = orig_src; + pd->pre[0] = orig_pre[0]; + pd->dst = orig_dst; + this_rd += b_best_rd; + + xd->mi[0]->bmi[i] = bsi[ref_frame][i]; + if (num_4x4_blocks_wide > 1) xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i]; + if (num_4x4_blocks_high > 1) xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i]; + } + } // loop through sub8x8 blocks + + if (this_rd < best_rd) { + best_rd = this_rd; + best_ref_frame = ref_frame; + } + } // reference frames + + mi->tx_size = TX_4X4; + mi->ref_frame[0] = best_ref_frame; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int block = idy * 2 + idx; + xd->mi[0]->bmi[block] = bsi[best_ref_frame][block]; + if (num_4x4_blocks_wide > 1) + xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block]; + if (num_4x4_blocks_high > 1) + xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block]; + } + } + mi->mode = xd->mi[0]->bmi[3].as_mode; + ctx->mic = *(xd->mi[0]); + ctx->mbmi_ext = *x->mbmi_ext; + ctx->skip_txfm[0] = SKIP_TXFM_NONE; + ctx->skip = 0; + // Dummy assignment for speed -5. No effect in speed -6. + rd_cost->rdcost = best_rd; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h new file mode 100644 index 0000000000..15207e6cf4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_ +#define VPX_VP9_ENCODER_VP9_PICKMODE_H_ + +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); + +void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); + +void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c new file mode 100644 index 0000000000..3f4fe6957b --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, eob = -1; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i; + int eob = -1; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; + } + *eob_ptr = eob + 1; +} +#endif + +// TODO(jingning) Refactor this file and combine functions with similar +// operations. +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, eob = -1; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_quantize_fp_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, eob = -1; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + for (i = 0; i < n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (abs_qcoeff) eob = i; + } + *eob_ptr = eob + 1; +} +#endif + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + unsigned int t; + int l, m; + t = (unsigned int)d; + l = get_msb(t); + m = 1 + (1 << (16 + l)) / d; + *quant = (int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); +} + +static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { + const int quant = vp9_dc_quant(q, 0, bit_depth); +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + default: + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + } +#else + (void)bit_depth; + return q == 0 ? 64 : (quant < 148 ? 84 : 80); +#endif +} + +void vp9_init_quantizer(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + QUANTS *const quants = &cpi->quants; + int i, q, quant; + + for (q = 0; q < QINDEX_RANGE; q++) { + int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + int qrounding_factor = q == 0 ? 64 : 48; + const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7; + + if (cpi->oxcf.sharpness > 0 && q > 0) { + qzbin_factor = 64 + sharpness_adjustment; + qrounding_factor = 64 - sharpness_adjustment; + } + + for (i = 0; i < 2; ++i) { + int qrounding_factor_fp = i == 0 ? 48 : 42; + if (q == 0) qrounding_factor_fp = 64; + if (cpi->oxcf.sharpness > 0) + qrounding_factor_fp = 64 - sharpness_adjustment; + // y + quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) + : vp9_ac_quant(q, 0, cm->bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); + quants->y_quant_fp[q][i] = (1 << 16) / quant; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->y_round[q][i] = (qrounding_factor * quant) >> 7; + cpi->y_dequant[q][i] = quant; + + // uv + quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth) + : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth); + invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], + quant); + quants->uv_quant_fp[q][i] = (1 << 16) / quant; + quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; + cpi->uv_dequant[q][i] = quant; + } + + for (i = 2; i < 8; i++) { + quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; + quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; + quants->y_zbin[q][i] = quants->y_zbin[q][1]; + quants->y_round[q][i] = quants->y_round[q][1]; + cpi->y_dequant[q][i] = cpi->y_dequant[q][1]; + + quants->uv_quant[q][i] = quants->uv_quant[q][1]; + quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; + quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; + quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; + quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; + quants->uv_round[q][i] = quants->uv_round[q][1]; + cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1]; + } + } +} + +void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + QUANTS *const quants = &cpi->quants; + const int segment_id = xd->mi[0]->segment_id; + const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + int i; + + // Y + x->plane[0].quant = quants->y_quant[qindex]; + x->plane[0].quant_fp = quants->y_quant_fp[qindex]; + x->plane[0].round_fp = quants->y_round_fp[qindex]; + x->plane[0].quant_shift = quants->y_quant_shift[qindex]; + x->plane[0].zbin = quants->y_zbin[qindex]; + x->plane[0].round = quants->y_round[qindex]; + xd->plane[0].dequant = cpi->y_dequant[qindex]; + x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0]; + x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1]; + + // UV + for (i = 1; i < 3; i++) { + x->plane[i].quant = quants->uv_quant[qindex]; + x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; + x->plane[i].round_fp = quants->uv_round_fp[qindex]; + x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; + x->plane[i].zbin = quants->uv_zbin[qindex]; + x->plane[i].round = quants->uv_round[qindex]; + xd->plane[i].dequant = cpi->uv_dequant[qindex]; + x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0]; + x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; + } + + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->q_index = qindex; + + set_error_per_bit(x, rdmult); + + vp9_initialize_me_consts(cpi, x, x->q_index); +} + +void vp9_frame_init_quantizer(VP9_COMP *cpi) { + vp9_init_plane_quantizers(cpi, &cpi->td.mb); +} + +void vp9_set_quantizer(VP9_COMP *cpi, int q) { + VP9_COMMON *cm = &cpi->common; + // quantizer has to be reinitialized with vp9_init_quantizer() if any + // delta_q changes. + cm->base_qindex = q; + cm->y_dc_delta_q = 0; + cm->uv_dc_delta_q = 0; + cm->uv_ac_delta_q = 0; + if (cpi->oxcf.delta_q_uv != 0) { + cm->uv_dc_delta_q = cm->uv_ac_delta_q = cpi->oxcf.delta_q_uv; + vp9_init_quantizer(cpi); + } +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int vp9_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int vp9_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h new file mode 100644 index 0000000000..f626f06566 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_ +#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_ + +#include "./vpx_config.h" +#include "vp9/encoder/vp9_block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); +} QUANTS; + +struct VP9_COMP; +struct VP9Common; + +void vp9_frame_init_quantizer(struct VP9_COMP *cpi); + +void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x); + +void vp9_init_quantizer(struct VP9_COMP *cpi); + +void vp9_set_quantizer(struct VP9_COMP *cm, int q); + +int vp9_quantizer_to_qindex(int quantizer); + +int vp9_qindex_to_quantizer(int qindex); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c new file mode 100644 index 0000000000..62d6b93028 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c @@ -0,0 +1,3391 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_ratectrl.h" + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/internal/vpx_codec_internal.h" + +// Max rate per frame for 1080P and below encodes if no level requirement given. +// For larger formats limit to MAX_MB_RATE bits per MB +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4 frame group. +// If a lower level requirement is specified then this may over ride this value. +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 4000000 + +#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 + +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 + +#if CONFIG_VP9_HIGHBITDEPTH +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ + } while (0) +#else +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + (void)bit_depth; \ + name = name##_8; \ + } while (0) +#endif + +// Tables relating active max Q to active min Q +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +#if CONFIG_VP9_HIGHBITDEPTH +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; +static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; +#endif + +#ifdef AGGRESSIVE_VBR +static int gf_high = 2400; +static int gf_low = 400; +static int kf_high = 4000; +static int kf_low = 400; +#else +static int gf_high = 2000; +static int gf_low = 400; +static int kf_high = 4800; +static int kf_low = 300; +#endif + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1, + vpx_bit_depth_t bit_depth) { + int i; + const double minqtarget = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0. + if (minqtarget <= 2.0) return 0; + + for (i = 0; i < QINDEX_RANGE; i++) { + if (minqtarget <= vp9_convert_qindex_to_q(i, bit_depth)) return i; + } + + return QINDEX_RANGE - 1; +} + +static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, + int *arfgf_high, int *inter, int *rtc, + vpx_bit_depth_t bit_depth) { + int i; + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = vp9_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); +#ifdef AGGRESSIVE_VBR + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); +#else + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); +#endif + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); + } +} + +void vp9_rc_init_minq_luts(void) { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, VPX_BITS_8); +#if CONFIG_VP9_HIGHBITDEPTH + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, VPX_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, VPX_BITS_12); +#endif +} + +// These functions use formulaic calculations to make playing with the +// quantizer tables easier. If necessary they can be replaced by lookup +// tables if and when things settle down in the experimental bitstream +double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { +// Convert the index to a real Q value (scaled down to match old Q values) +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; + case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; + default: + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; + } +#else + return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; +#endif +} + +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + +int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, vpx_bit_depth_t bit_depth) { + const double q = vp9_convert_qindex_to_q(qindex, bit_depth); + int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000; + + assert(correction_factor <= MAX_BPB_FACTOR && + correction_factor >= MIN_BPB_FACTOR); + + // q based adjustment to baseline enumerator + enumerator += (int)(enumerator * q) >> 12; + return (int)(enumerator * correction_factor / q); +} + +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, + double correction_factor, + vpx_bit_depth_t bit_depth) { + const int bpm = + (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); + return VPXMAX(FRAME_OVERHEAD_BITS, + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); +} + +int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } + + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + + if (oxcf->rc_max_inter_bitrate_pct) { + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + // target is of type int and VPXMIN cannot evaluate to larger than target + target = (int)VPXMIN(target, max_rate); + } + return target; +} + +int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->rc_max_intra_bitrate_pct) { + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = (int)VPXMIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + return target; +} + +// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same +// way for CBR mode, for the buffering updates below. Look into removing one +// of these (i.e., bits_off_target). +// Update the buffer level before encoding with the per-frame-bandwidth, +void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + rc->bits_off_target += rc->avg_frame_bandwidth; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; +} + +// Update the buffer level before encoding with the per-frame-bandwidth +// for SVC. The current and all upper temporal layers are updated, needed +// for the layered rate control which involves cumulative buffer levels for +// the temporal layers. Allow for using the timestamp(pts) delta for the +// framerate when the set_ref_frame_config is used. +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + // Set this to 1 to use timestamp delta for "framerate" under + // ref_frame_config usage. + int use_timestamp = 1; + const int64_t ts_delta = + svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id]; + for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + if (use_timestamp && cpi->svc.use_set_ref_frame_config && + svc->number_temporal_layers == 1 && ts_delta > 0 && + svc->current_superframe > 0) { + // TODO(marpan): This may need to be modified for temporal layers. + const double framerate_pts = 10000000.0 / ts_delta; + lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts); + } else { + lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate); + } + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + if (i == svc->temporal_layer_id) { + cpi->rc.bits_off_target = lrc->bits_off_target; + cpi->rc.buffer_level = lrc->buffer_level; + } + } +} + +// Update the buffer level for higher temporal layers, given the encoded current +// temporal layer. +static void update_layer_buffer_level_postencode(SVC *svc, + int encoded_frame_size) { + int i = 0; + const int current_temporal_layer = svc->temporal_layer_id; + for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->bits_off_target -= encoded_frame_size; + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } +} + +// Update the buffer level after encoding with encoded frame size. +static void update_buffer_level_postencode(VP9_COMP *cpi, + int encoded_frame_size) { + RATE_CONTROL *const rc = &cpi->rc; + rc->bits_off_target -= encoded_frame_size; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + // For screen-content mode, and if frame-dropper is off, don't let buffer + // level go below threshold, given here as -rc->maximum_ buffer_size. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.drop_frames_water_mark == 0) + rc->bits_off_target = VPXMAX(rc->bits_off_target, -rc->maximum_buffer_size); + + rc->buffer_level = rc->bits_off_target; + + if (is_one_pass_svc(cpi)) { + update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); + } +} + +int vp9_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return VPXMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + return VPXMAX(interval, min_gf_interval); +} + +void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; + + if (pass == 0 && oxcf->rc_mode == VPX_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + rc->avg_frame_qindex[KEY_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + } + + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; + + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; + + rc->total_actual_bits = 0; + rc->total_target_bits = 0; + rc->total_target_vs_actual = 0; + rc->avg_frame_low_motion = 0; + rc->count_last_scene_change = 0; + rc->af_ratio_onepass_vbr = 10; + rc->prev_avg_source_sad_lag = 0; + rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; + rc->high_source_sad_lagindex = -1; + rc->high_num_blocks_with_motion = 0; + rc->hybrid_intra_scene_change = 0; + rc->re_encode_maxq_scene_change = 0; + rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; + rc->fac_active_worst_inter = 150; + rc->fac_active_worst_gf = 100; + rc->force_qpmin = 0; + for (i = 0; i < MAX_LAG_BUFFERS; ++i) rc->avg_source_sad[i] = 0; + rc->frames_to_key = 0; + rc->frames_since_key = 8; // Sensible default for first frame. + rc->this_key_frame_forced = 0; + rc->next_key_frame_forced = 0; + rc->source_alt_ref_pending = 0; + rc->source_alt_ref_active = 0; + + rc->frames_till_gf_update_due = 0; + rc->constrain_gf_key_freq_onepass_vbr = 1; + rc->ni_av_qi = oxcf->worst_allowed_q; + rc->ni_tot_qi = 0; + rc->ni_frames = 0; + + rc->tot_q = 0.0; + rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 1.0; + rc->damped_adjustment[i] = 0; + } + + rc->min_gf_interval = oxcf->min_gf_interval; + rc->max_gf_interval = oxcf->max_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, oxcf->init_framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( + oxcf->init_framerate, rc->min_gf_interval); + rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; + } else { + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; + } + + rc->force_max_q = 0; + rc->last_post_encode_dropped_scene_change = 0; + rc->use_post_encode_drop = 0; + rc->ext_use_post_encode_drop = 0; + rc->disable_overshoot_maxq_cbr = 0; + rc->arf_active_best_quality_adjustment_factor = 1.0; + rc->arf_increase_active_best_quality = 0; + rc->preserve_arf_as_gld = 0; + rc->preserve_next_arf_as_gld = 0; + rc->show_arf_as_gld = 0; +} + +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP): the condition on + // buffer (if its above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level <= drop_mark); + } else { + int i; + // For SVC in the constrained framedrop mode (svc->framedrop_mode = + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if its below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } + } + } + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; + } +} + +int vp9_test_drop(VP9_COMP *cpi) { + const VP9EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + SVC *svc = &cpi->svc; + int drop_frames_water_mark = oxcf->drop_frames_water_mark; + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { + return 0; + } else { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { + // Always drop if buffer is below 0. + return 1; + } else { + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. + int drop_mark = + (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { + --rc->decimation_factor; + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } + } + } +} + +int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) { + size_t frame_size = *size << 3; + int64_t new_buffer_level = + cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; + + // For now we drop if new buffer level (given the encoded frame size) goes + // below 0. + if (new_buffer_level < 0) { + *size = 0; + vp9_rc_postencode_update_drop_frame(cpi); + // Update flag to use for next frame. + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) + cpi->rc.last_post_encode_dropped_scene_change = 1; + // Force max_q on next fame. + cpi->rc.force_max_q = 1; + cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + cpi->last_frame_dropped = 1; + cpi->ext_refresh_frame_flags_pending = 0; + if (cpi->use_svc) { + SVC *svc = &cpi->svc; + int sl = 0; + int tl = 0; + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + // Postencode drop is only checked on base spatial layer, + // for now if max-q is set on base we force it on all layers. + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->force_max_q = 1; + lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + } + } + } + return 1; + } + + cpi->rc.force_max_q = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + return 0; +} + +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != (LAYER_DROP && CONSTRAINED_FROM_ABOVE)), + // if the previous spatial layer was dropped, drop the current spatial layer. + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP && + svc->framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP) || + svc->force_drop_constrained_from_above[svc->spatial_layer_id] || + vp9_test_drop(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP && + svc->force_drop_constrained_from_above[svc->number_spatial_layers - + 1] == 0) || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where full superframe is + // dropped, we don't increment the svc frame counters. + // In particular temporal layer counter (which is incremented in + // vp9_inc_frame_in_layer()) won't be incremented, so on a dropped + // frame we try the same temporal_layer_id on next incoming frame. + // This is to avoid an issue with temporal alignment with full + // superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_limit_q(cpi, &q); + return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + +static double get_rate_correction_factor(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; + double rcf; + + if (frame_is_intra_only(cm)) { + rcf = rc->rate_correction_factors[KF_STD]; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rcf = rc->rate_correction_factors[rf_lvl]; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) + rcf = rc->rate_correction_factors[GF_ARF_STD]; + else + rcf = rc->rate_correction_factors[INTER_NORMAL]; + } + rcf *= rcf_mult[rc->frame_size_selector]; + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); +} + +static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { + RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; + + // Normalize RCF to account for the size-dependent scaling factor. + factor /= rcf_mult[cpi->rc.frame_size_selector]; + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (frame_is_intra_only(cm)) { + rc->rate_correction_factors[KF_STD] = factor; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) + rc->rate_correction_factors[GF_ARF_STD] = factor; + else + rc->rate_correction_factors[INTER_NORMAL] = factor; + } +} + +void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + int correction_factor = 100; + double rate_correction_factor = get_rate_correction_factor(cpi); + double adjustment_limit; + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + + int projected_size_based_on_q = 0; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Clear down mmx registers to allow floating point in what follows + vpx_clear_system_state(); + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. + // Stay in double to avoid int overflow when values are large + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) { + projected_size_based_on_q = + vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); + } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; + projected_size_based_on_q = + vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs, + rate_correction_factor, cm->bit_depth); + } + // Work out a size correction factor. + if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) + correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / + projected_size_based_on_q); + + // Do not use damped adjustment for the first frame of each frame type + if (!cpi->rc.damped_adjustment[rf_lvl]) { + adjustment_limit = 1.0; + cpi->rc.damped_adjustment[rf_lvl] = 1; + } else { + // More heavily damped adjustment used if we have been oscillating either + // side of target. + adjustment_limit = + 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + } + + cpi->rc.q_2_frame = cpi->rc.q_1_frame; + cpi->rc.q_1_frame = cm->base_qindex; + cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; + if (correction_factor > 110) + cpi->rc.rc_1_frame = -1; + else if (correction_factor < 90) + cpi->rc.rc_1_frame = 1; + else + cpi->rc.rc_1_frame = 0; + + // Turn off oscilation detection in the case of massive overshoot. + if (cpi->rc.rc_1_frame == -1 && cpi->rc.rc_2_frame == 1 && + correction_factor > 1000) { + cpi->rc.rc_2_frame = 0; + } + + if (correction_factor > 102) { + // We are not already at the worst allowable quality + correction_factor = + (int)(100 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } else if (correction_factor < 99) { + // We are not already at the best allowable quality + correction_factor = + (int)(100 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + set_rate_correction_factor(cpi, rate_correction_factor); +} + +int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int q = active_worst_quality; + int last_error = INT_MAX; + int i, target_bits_per_mb, bits_per_mb_at_this_q; + const double correction_factor = get_rate_correction_factor(cpi); + + // Calculate required scaling factor based on target frame size and size of + // frame produced using previous Q. + target_bits_per_mb = + (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs); + + i = active_best_quality; + + do { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cr->apply_cyclic_refresh && + (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) { + bits_per_mb_at_this_q = + (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); + } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; + bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb( + frame_type, i, correction_factor, cm->bit_depth); + } + + if (bits_per_mb_at_this_q <= target_bits_per_mb) { + if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) + q = i; + else + q = i - 1; + + break; + } else { + last_error = bits_per_mb_at_this_q - target_bits_per_mb; + } + } while (++i <= active_worst_quality); + + // Adjustment to q for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, + vpx_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality(const VP9_COMP *const cpi, int q, + vpx_bit_depth_t bit_depth) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_CONTROL *const rc = &cpi->rc; + + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : rc->gfu_boost; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const unsigned int curr_frame = cpi->common.current_video_frame; + int active_worst_quality; + + if (cpi->common.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1; + } else { + if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + active_worst_quality = + curr_frame == 1 + ? rc->last_q[KEY_FRAME] * 5 >> 2 + : rc->last_q[INTER_FRAME] * rc->fac_active_worst_gf / 100; + } else { + active_worst_quality = curr_frame == 1 + ? rc->last_q[KEY_FRAME] << 1 + : rc->avg_frame_qindex[INTER_FRAME] * + rc->fac_active_worst_inter / 100; + } + } + return VPXMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; + if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q) + return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. + ambient_qp = (cm->current_video_frame < num_frames_weight_key) + ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME], + rc->avg_frame_qindex[KEY_FRAME]) + : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2); + // For SVC if the current base spatial layer was key frame, use the QP from + // that base layer for ambient_qp. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame) { + const RATE_CONTROL *lrc = &lc->rc; + ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3); + } + } + if (rc->buffer_level > rc->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment ~30%; make it lower for screen content. + int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; + if (max_adjustment_down) { + buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (rc->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = ambient_qp + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + int *rtc_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = vp9_compute_qdelta( + rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else if (cm->current_video_frame > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = get_kf_active_quality( + rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + cpi->oxcf.gf_cbr_boost_pct && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (cm->current_video_frame > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Special case code to try and match quality with forced key frames + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int get_active_cq_level_one_pass(const RATE_CONTROL *rc, + const VP9EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.1; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == VPX_CQ && rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +#define SMOOTH_PCT_MIN 0.1 +#define SMOOTH_PCT_DIV 0.05 +static int get_active_cq_level_two_pass(const TWO_PASS *twopass, + const RATE_CONTROL *rc, + const VP9EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.1; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == VPX_CQ) { + if (twopass->mb_smooth_pct > SMOOTH_PCT_MIN) { + active_cq_level -= + (int)((twopass->mb_smooth_pct - SMOOTH_PCT_MIN) / SMOOTH_PCT_DIV); + active_cq_level = VPXMAX(active_cq_level, 0); + } + if (rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + } + return active_cq_level; +} + +static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level_one_pass(rc, oxcf); + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); + int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + + if (frame_is_intra_only(cm)) { + if (oxcf->rc_mode == VPX_Q) { + int qindex = cq_level; + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else if (rc->this_key_frame_forced) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = vp9_compute_qdelta( + rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = get_kf_active_quality( + rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + } else { + q = rc->avg_frame_qindex[KEY_FRAME]; + } + // For constrained quality don't allow Q less than the cq level + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) q = cq_level; + + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; + + } else if (oxcf->rc_mode == VPX_Q) { + int qindex = cq_level; + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex; + if (cpi->refresh_alt_ref_frame) + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); + else + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + } + } else { + if (oxcf->rc_mode == VPX_Q) { + int qindex = cq_level; + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + int delta_qindex = vp9_compute_qdelta( + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the min of the average Q and active_worst_quality as basis for + // active_best. + if (cm->current_video_frame > 1) { + q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality); + active_best_quality = inter_minq[q]; + } else { + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + } + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + { + int qdelta = 0; + vpx_clear_system_state(); + + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + qdelta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); + } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; + } +#endif + + if (oxcf->rc_mode == VPX_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) { + static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.00, // INTER_HIGH + 1.50, // GF_ARF_LOW + 1.75, // GF_ARF_STD + 2.00, // KF_STD + }; + const VP9_COMMON *const cm = &cpi->common; + + int qdelta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth); + return qdelta; +} + +#define STATIC_MOTION_THRESH 95 + +static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + + if (rc->this_key_frame_forced) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + double last_boosted_q; + int delta_qindex; + int qindex; + + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, cm->bit_depth); + active_worst_quality = + VPXMIN(qindex + delta_qindex, active_worst_quality); + } else { + qindex = rc->last_boosted_qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = + get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 4; + } + + // Don't allow the active min to be lossless (q0) unlesss the max q + // already indicates lossless. + active_best_quality = + VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + *top_index = active_worst_quality; + *bottom_index = active_best_quality; +} + +static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, + int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int is_intra_frame = frame_is_intra_only(cm); + + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + + int q = cq_level; + int active_best_quality = cq_level; + int active_worst_quality = cq_level; + + // Key frame qp decision + if (is_intra_frame && rc->frames_to_key > 1) + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + + // ARF / GF qp decision + if (!is_intra_frame && !rc->is_src_frame_alt_ref && + cpi->refresh_alt_ref_frame) { + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = ((layer_depth - 1) * cq_level + + active_best_quality + layer_depth / 2) / + layer_depth; + } + } + + q = active_best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + return q; +} + +static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index, int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + int *inter_minq; + int arf_active_best_quality_hl; + int *arfgf_high_motion_minq, *arfgf_low_motion_minq; + const int boost_frame = + !rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + + if (oxcf->rc_mode == VPX_Q) + return rc_constant_q(cpi, bottom_index, top_index, gf_group_index); + + if (frame_is_intra_only(cm)) { + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + } else if (boost_frame) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + // For constrained quality don't allow Q less than the cq level + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) q = cq_level; + } + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + arf_active_best_quality_hl = active_best_quality; + + if (rc->arf_increase_active_best_quality == 1) { + ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq); + arf_active_best_quality_hl = arfgf_high_motion_minq[q]; + } else if (rc->arf_increase_active_best_quality == -1) { + ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_low_motion_minq); + arf_active_best_quality_hl = arfgf_low_motion_minq[q]; + } + active_best_quality = + (int)((double)active_best_quality * + rc->arf_active_best_quality_adjustment_factor + + (double)arf_active_best_quality_hl * + (1.0 - rc->arf_active_best_quality_adjustment_factor)); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = + ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) / + layer_depth; + } + } else { + active_best_quality = inter_minq[active_worst_quality]; + + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + + // Extension to max or min Q if undershoot or overshoot is outside + // the permitted range. + if (frame_is_intra_only(cm) || boost_frame) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); + active_worst_quality += (cpi->twopass.extend_maxq / 2); + + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + assert(layer_depth > 1); + active_best_quality = + VPXMAX(active_best_quality, + cpi->twopass.last_qindex_of_arf_layer[layer_depth - 1]); + } + } else { + const int max_layer_depth = gf_group->max_layer_depth; + assert(max_layer_depth > 0); + + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; + active_worst_quality += cpi->twopass.extend_maxq; + + // For normal frames do not allow an active minq lower than the q used for + // the last boosted frame. + active_best_quality = + VPXMAX(active_best_quality, + cpi->twopass.last_qindex_of_arf_layer[max_layer_depth - 1]); + } + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + vpx_clear_system_state(); + // Static forced key frames Q restrictions dealt with elsewhere. + if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced || + cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) { + int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index], + active_worst_quality); + active_worst_quality = + VPXMAX(active_worst_quality + qdelta, active_best_quality); + } +#endif + + // Modify active_best_quality for downscaled normal frames. + if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) { + int qdelta = vp9_compute_qdelta_by_rate( + rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); + active_best_quality = + VPXMAX(active_best_quality + qdelta, rc->best_quality); + } + + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { + // If static since last kf use better of last boosted and last kf q. + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + } else { + q = rc->last_boosted_qindex; + } + } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) { + q = active_best_quality; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > active_worst_quality) { + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target >= rc->max_frame_bandwidth) + active_worst_quality = q; + else + q = active_worst_quality; + } + } + clamp(q, active_best_quality, active_worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, + int *top_index) { + int q; + const int gf_group_index = cpi->twopass.gf_group.index; + if (cpi->oxcf.pass == 0) { + if (cpi->oxcf.rc_mode == VPX_CBR) + q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); + else + q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); + } else { + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); + } + if (cpi->sf.use_nonrd_pick_mode) { + if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; + + if (q < *bottom_index) + *bottom_index = q; + else if (q > *top_index) + *top_index = q; + } + return q; +} + +void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { + VP9_COMMON *cm = &cpi->common; + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + cm->show_existing_frame = 0; + cpi->rc.show_arf_as_gld = 0; + switch (twopass->gf_group.update_type[gf_group_index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + if (cpi->rc.preserve_arf_as_gld) { + cpi->rc.show_arf_as_gld = 1; + cpi->refresh_golden_frame = 0; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + } + break; + case MID_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case USE_BUF_FRAME: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + break; + default: + assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE); + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + } +} + +void vp9_estimate_qp_gop(VP9_COMP *cpi) { + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + for (idx = 1; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + if (cpi->tpl_with_external_rc) { + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + VP9_COMMON *cm = &cpi->common; + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + const int frame_coding_index = cm->current_frame_coding_index + idx - 1; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index, + gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref, + ref_frame_bufs, 0 /*ref_frame_flags is not used*/, + &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + tpl_frame->base_qindex = encode_frame_decision.q_index; + } + } else { + tpl_frame->base_qindex = + rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); + } + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); +} + +void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit) { + if (cpi->oxcf.rc_mode == VPX_Q) { + *frame_under_shoot_limit = 0; + *frame_over_shoot_limit = INT_MAX; + } else { + // For very small rate targets where the fractional adjustment + // may be tiny make sure there is at least a minimum range. + const int tol_low = + (int)(((int64_t)cpi->sf.recode_tolerance_low * frame_target) / 100); + const int tol_high = + (int)(((int64_t)cpi->sf.recode_tolerance_high * frame_target) / 100); + *frame_under_shoot_limit = VPXMAX(frame_target - tol_low - 100, 0); + *frame_over_shoot_limit = + VPXMIN(frame_target + tol_high + 100, cpi->rc.max_frame_bandwidth); + } +} + +void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + rc->this_frame_target = target; + + // Modify frame size target when down-scaling. + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && + rc->frame_size_selector != UNSCALED) { + rc->this_frame_target = (int)(rc->this_frame_target * + rate_thresh_mult[rc->frame_size_selector]); + } + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + if (cpi->encode_command.use_external_target_frame_bits) { + rc->this_frame_target = cpi->encode_command.target_frame_bits; + } + } +#endif // CONFIG_RATE_CTRL + + // Target rate per SB64 (including partial SB64s. + rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) / + (cm->width * cm->height)); +} + +static void update_alt_ref_frame_stats(VP9_COMP *cpi) { + // this frame refreshes means next frames don't unless specified by user + RATE_CONTROL *const rc = &cpi->rc; + rc->frames_since_golden = 0; + + // Mark the alt ref as done (setting to 0 means no further alt refs pending). + rc->source_alt_ref_pending = 0; + + // Set the alternate reference frame active flag + rc->source_alt_ref_active = 1; +} + +static void update_golden_frame_stats(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + + // Update the Golden frame usage counts. + if (cpi->refresh_golden_frame) { + // this frame refreshes means next frames don't unless specified by user + rc->frames_since_golden = 0; + + // If we are not using alt ref in the up and coming group clear the arf + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (cpi->oxcf.pass == 2) { + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } else if (!rc->source_alt_ref_pending) { + rc->source_alt_ref_active = 0; + } + + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + + } else if (!cpi->refresh_alt_ref_frame) { + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + + rc->frames_since_golden++; + + if (rc->show_arf_as_gld) { + rc->frames_since_golden = 0; + // If we are not using alt ref in the up and coming group clear the arf + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } + } +} + +static void update_altref_usage(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + +void vp9_compute_frame_low_motion(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int mi_row, mi_col; + MODE_INFO **mi = cm->mi_grid_visible; + RATE_CONTROL *const rc = &cpi->rc; + const int rows = cm->mi_rows, cols = cm->mi_cols; + int cnt_zeromv = 0; + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + if (mi[0]->ref_frame[0] == LAST_FRAME && + abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) + cnt_zeromv++; + mi++; + } + mi += 8; + } + cnt_zeromv = 100 * cnt_zeromv / (rows * cols); + rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2; + + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. + if (cpi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + for (i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } +} + +void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + const int qindex = cm->base_qindex; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int gf_group_index = cpi->twopass.gf_group.index; + const int layer_depth = gf_group->layer_depth[gf_group_index]; + + // Update rate control heuristics + rc->projected_frame_size = (int)(bytes_used << 3); + + // Post encode loop adjustment of Q prediction. + vp9_rc_update_rate_correction_factors(cpi); + + // Keep a record of last Q and ambient average Q. + if (frame_is_intra_only(cm)) { + rc->last_q[KEY_FRAME] = qindex; + rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + if (cpi->use_svc) { + int i; + for (i = 0; i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->last_q[KEY_FRAME] = rc->last_q[KEY_FRAME]; + lrc->avg_frame_qindex[KEY_FRAME] = rc->avg_frame_qindex[KEY_FRAME]; + } + } + } else { + if ((cpi->use_svc) || + (!rc->is_src_frame_alt_ref && + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { + rc->last_q[INTER_FRAME] = qindex; + rc->avg_frame_qindex[INTER_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + rc->ni_frames++; + rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth); + rc->avg_q = rc->tot_q / rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + } + } + + if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi); + + // Keep record of last boosted (KF/KF/ARF) Q value. + // If the current frame is coded at a lower Q then we also update it. + // If all mbs in this group are skipped only update if the Q value is + // better than that already stored. + // This is used to help set quality in forced key frames to reduce popping + if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) || + (!rc->constrained_gf_group && + (cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { + rc->last_boosted_qindex = qindex; + } + + if ((qindex < cpi->twopass.last_qindex_of_arf_layer[layer_depth]) || + (cm->frame_type == KEY_FRAME) || + (!rc->constrained_gf_group && + (cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { + cpi->twopass.last_qindex_of_arf_layer[layer_depth] = qindex; + } + + if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex; + + update_buffer_level_postencode(cpi, rc->projected_frame_size); + + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. + if (!frame_is_intra_only(cm)) { + rc->rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + rc->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + rc->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5); + rc->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, + 5); + } + + // Actual bits spent + rc->total_actual_bits += rc->projected_frame_size; + rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; + + rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; + + if (!cpi->use_svc) { + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && + (!frame_is_intra_only(cm))) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); + } + + // If second (long term) temporal reference is used for SVC, + // update the golden frame counter, only for base temporal layer. + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id == 0) { + int i = 0; + if (cpi->refresh_golden_frame) + rc->frames_since_golden = 0; + else + rc->frames_since_golden++; + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + // Update the frames_since_golden for all upper temporal layers. + for (i = 1; i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->frames_since_golden = rc->frames_since_golden; + } + } + + if (frame_is_intra_only(cm)) rc->frames_since_key = 0; + if (cm->show_frame) { + rc->frames_since_key++; + rc->frames_to_key--; + } + + // Trigger the resizing of the next frame if it is scaled. + if (oxcf->pass != 0) { + cpi->resize_pending = + rc->next_frame_size_selector != rc->frame_size_selector; + rc->frame_size_selector = rc->next_frame_size_selector; + } + + if (oxcf->pass == 0) { + if (!frame_is_intra_only(cm)) + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; + } + + if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) + svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; +} + +void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { + cpi->common.current_video_frame++; + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; + cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex; + // For SVC on dropped frame when framedrop_mode != LAYER_DROP: + // in this mode the whole superframe may be dropped if only a single layer + // has buffer underflow (below threshold). Since this can then lead to + // increasing buffer levels/overflow for certain layers even though whole + // superframe is dropped, we cap buffer level if its already stable. + if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP && + cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) { + cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; + cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; + } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; +} + +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const int af_ratio = rc->af_ratio_onepass_vbr; + int64_t target = + (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) + ? ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval * + af_ratio) / + (rc->baseline_gf_interval + af_ratio - 1) + : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / + (rc->baseline_gf_interval + af_ratio - 1); + // For SVC: refresh flags are used to define the pattern, so we can't + // use that for boosting the target size here. + // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC. + // For now just use the CBR logic for setting target size. + if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + if (target > INT_MAX) target = INT_MAX; + return vp9_rc_clamp_pframe_target_size(cpi, (int)target); +} + +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) { + RATE_CONTROL *const rc = &cpi->rc; + rc->constrained_gf_group = 0; + // Reset gf interval to make more equal spacing for frame_constraint. + if ((frame_constraint <= 7 * rc->baseline_gf_interval >> 2) && + (frame_constraint > rc->baseline_gf_interval)) { + rc->baseline_gf_interval = frame_constraint >> 1; + if (rc->baseline_gf_interval < 5) + rc->baseline_gf_interval = frame_constraint; + rc->constrained_gf_group = 1; + } else { + // Reset to keep gf_interval <= frame_constraint. + if (rc->baseline_gf_interval > frame_constraint) { + rc->baseline_gf_interval = frame_constraint; + rc->constrained_gf_group = 1; + } + } +} + +void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + VP9_COMMON *const cm = &cpi->common; + if (rc->frames_till_gf_update_due == 0) { + double rate_err = 1.0; + rc->gfu_boost = DEFAULT_GF_BOOST; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) { + vp9_cyclic_refresh_set_golden_update(cpi); + } else { + rc->baseline_gf_interval = VPXMIN( + 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2)); + } + rc->af_ratio_onepass_vbr = 10; + if (rc->rolling_target_bits > 0) + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (cm->current_video_frame > 30) { + if (rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 && + rate_err > 3.5) { + rc->baseline_gf_interval = + VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); + } else if (rc->avg_frame_low_motion > 0 && + rc->avg_frame_low_motion < 20) { + // Decrease gf interval for high motion case. + rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); + } + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). + if (rc->avg_frame_low_motion > 0) + rc->gfu_boost = + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); + else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0) + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; + rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); + } + if (rc->constrain_gf_key_freq_onepass_vbr) + adjust_gfint_frame_constraint(cpi, rc->frames_to_key); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + } + } +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + vp9_set_gf_update_one_pass_vbr(cpi); + if (cm->frame_type == KEY_FRAME) + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + else + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) + vp9_cyclic_refresh_update_parameters(cpi); +} + +int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const SVC *const svc = &cpi->svc; + const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + int min_frame_target = + VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target; + + if (oxcf->gf_cbr_boost_pct) { + const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100; + target = cpi->refresh_golden_frame + ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * + af_ratio_pct) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) + : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = rc->avg_frame_bandwidth; + } + if (is_one_pass_svc(cpi)) { + // Note that for layers, avg_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; + target = lc->avg_frame_size; + min_frame_target = VPXMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (int)(((int64_t)target * pct_low) / 200); + } else if (diff < 0) { + // Increase the target bandwidth for this frame. + const int pct_high = + (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (int)(((int64_t)target * pct_high) / 200); + } + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + target = VPXMIN(target, max_rate); + } + return VPXMAX(min_frame_target, target); +} + +int vp9_calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const SVC *const svc = &cpi->svc; + int target; + if (cpi->common.current_video_frame == 0) { + target = ((rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX + : (int)(rc->starting_buffer_level / 2); + } else { + int kf_boost = 32; + double framerate = cpi->framerate; + if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) { + // Use the layer framerate for temporal layers CBR mode. + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; + framerate = lc->framerate; + } + kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); + } + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +static void set_intra_only_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + // Don't allow intra_only frame for bypass/flexible SVC mode, or if number + // of spatial layers is 1 or if number of spatial or temporal layers > 3. + // Also if intra-only is inserted on very first frame, don't allow if + // if number of temporal layers > 1. This is because on intra-only frame + // only 3 reference buffers can be updated, but for temporal layers > 1 + // we generally need to use buffer slots 4 and 5. + if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || + svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || + svc->number_spatial_layers == 1) + return; + cm->show_frame = 0; + cm->intra_only = 1; + cm->frame_type = INTER_FRAME; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (cm->current_video_frame == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + } else { + int i; + int count = 0; + cpi->lst_fb_idx = -1; + cpi->gld_fb_idx = -1; + cpi->alt_fb_idx = -1; + svc->update_buffer_slot[0] = 0; + // For intra-only frame we need to refresh all slots that were + // being used for the base layer (fb_idx_base[i] == 1). + // Start with assigning last first, then golden and then alt. + for (i = 0; i < REF_FRAMES; ++i) { + if (svc->fb_idx_base[i] == 1) { + svc->update_buffer_slot[0] |= 1 << i; + count++; + } + if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; + if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; + if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; + } + // If golden or alt is not being used for base layer, then set them + // to the lst_fb_idx. + if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; + if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + } + } +} + +void vp9_rc_get_svc_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + int target = rc->avg_frame_bandwidth; + int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (svc->first_spatial_layer_to_encode) + svc->layer_context[svc->temporal_layer_id].is_key_frame = 0; + // Periodic key frames is based on the super-frame counter + // (svc.current_superframe), also only base spatial layer is key frame. + // Key frame is set for any of the following: very first frame, frame flags + // indicates key, superframe counter hits key frequency,(non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. + if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && + (svc->current_superframe % cpi->oxcf.key_freq == 0) && + !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { + cm->frame_type = KEY_FRAME; + rc->source_alt_ref_active = 0; + if (is_one_pass_svc(cpi)) { + if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); + layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + svc->layer_context[layer].is_key_frame = 1; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + // Assumption here is that LAST_FRAME is being updated for a keyframe. + // Thus no change in update flags. + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + } + } else { + cm->frame_type = INTER_FRAME; + if (is_one_pass_svc(cpi)) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + // Add condition current_video_frame > 0 for the case where first frame + // is intra only followed by overlay/copy frame. In this case we don't + // want to reset is_key_frame to 0 on overlay/copy frame. + lc->is_key_frame = + (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; + if (cpi->oxcf.rc_mode == VPX_CBR) { + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + } else { + double rate_err = 0.0; + rc->fac_active_worst_inter = 140; + rc->fac_active_worst_gf = 100; + if (rc->rolling_target_bits > 0) { + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (rate_err < 1.0) + rc->fac_active_worst_inter = 120; + else if (rate_err > 2.0) + // Increase active_worst faster if rate fluctuation is high. + rc->fac_active_worst_inter = 160; + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + } + } + } + + if (svc->simulcast_mode) { + if (svc->spatial_layer_id > 0 && + svc->layer_context[layer].is_key_frame == 1) { + cm->frame_type = KEY_FRAME; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + } + // Set the buffer idx and refresh flags for key frames in simulcast mode. + // Note the buffer slot for long-term reference is set below (line 2255), + // and alt_ref is used for that on key frame. So use last and golden for + // the other two normal slots. + if (cm->frame_type == KEY_FRAME) { + if (svc->number_spatial_layers == 2) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 2; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } + } else if (svc->number_spatial_layers == 3) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 4; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 2) { + cpi->lst_fb_idx = 2; + cpi->gld_fb_idx = 5; + cpi->alt_fb_idx = 7; + } + } + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + + // Check if superframe contains a sync layer request. + vp9_svc_check_spatial_layer_sync(cpi); + + // If long term termporal feature is enabled, set the period of the update. + // The update/refresh of this reference frame is always on base temporal + // layer frame. + if (svc->use_gf_temporal_ref_current_layer) { + // Only use gf long-term prediction on non-key superframes. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // Use golden for this reference, which will be used for prediction. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + // Enable prediction off LAST (last reference) and golden (which will + // generally be further behind/long-term reference). + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + // Check for update/refresh of reference: only refresh on base temporal + // layer. + if (svc->temporal_layer_id == 0) { + if (svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On key frame we update the buffer index used for long term reference. + // Use the alt_ref since it is not used or updated on key frames. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } else if (rc->frames_till_gf_update_due == 0) { + // Set perdiod of next update. Make it a multiple of 10, as the cyclic + // refresh is typically ~10%, and we'd like the update to happen after + // a few cylces of the refresh (so it better quality frame). Note the + // cyclic refresh for SVC only operates on base temporal layer frames. + // Choose 20 as perdiod for now (2 cycles). + rc->baseline_gf_interval = 20; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + cpi->ext_refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + } + } else if (!svc->use_gf_temporal_ref) { + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; + } + if (svc->set_intra_only_frame) { + set_intra_only_frame(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + } + // Overlay frame predicts from LAST (intra-only) + if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; + + // Any update/change of global cyclic refresh parameters (amount/delta-qp) + // should be done here, before the frame qp is selected. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi); + + vp9_rc_set_frame_target(cpi, target); + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); + + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && + svc->spatial_layer_id == svc->first_spatial_layer_to_encode && + svc->temporal_layer_id == 0) { + LAYER_CONTEXT *lc = NULL; + cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); + if (cpi->resize_pending) { + int tl, width, height; + // Apply the same scale to all temporal layers. + for (tl = 0; tl < svc->number_temporal_layers; tl++) { + lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + tl]; + lc->scaling_factor_num_resize = + cpi->resize_scale_num * lc->scaling_factor_num; + lc->scaling_factor_den_resize = + cpi->resize_scale_den * lc->scaling_factor_den; + // Reset rate control for all temporal layers. + lc->rc.buffer_level = lc->rc.optimal_buffer_level; + lc->rc.bits_off_target = lc->rc.optimal_buffer_level; + lc->rc.rate_correction_factors[INTER_FRAME] = + rc->rate_correction_factors[INTER_FRAME]; + } + // Set the size for this current temporal layer. + lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, + lc->scaling_factor_num_resize, + lc->scaling_factor_den_resize, &width, &height); + vp9_set_size_literal(cpi, width, height); + svc->resize_set = 1; + } + } else { + cpi->resize_pending = 0; + svc->resize_set = 0; + } +} + +void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { + cm->frame_type = KEY_FRAME; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_golden_update(cpi); + else + rc->baseline_gf_interval = + (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + + // Any update/change of global cyclic refresh parameters (amount/delta-qp) + // should be done here, before the frame qp is selected. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi); + + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + + vp9_rc_set_frame_target(cpi, target); + + if (cm->show_frame) vp9_update_buffer_level_preencode(cpi); + + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) + cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); + else + cpi->resize_pending = 0; +} + +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + vpx_bit_depth_t bit_depth) { + int start_index = rc->worst_quality; + int target_index = rc->worst_quality; + int i; + + // Convert the average q value to an index. + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + start_index = i; + if (vp9_convert_qindex_to_q(i, bit_depth) >= qstart) break; + } + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + target_index = i; + if (vp9_convert_qindex_to_q(i, bit_depth) >= qtarget) break; + } + + return target_index - start_index; +} + +int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + vpx_bit_depth_t bit_depth) { + int target_index = rc->worst_quality; + int i; + + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + vp9_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth); + + // Find the target bits per mb based on the base value and given ratio. + const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= + target_bits_per_mb) { + target_index = i; + break; + } + } + return target_index - qindex; +} + +void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, + RATE_CONTROL *const rc) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + + // Special case code for 1 pass fixed Q mode tests + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->max_gf_interval = FIXED_GF_INTERVAL; + rc->min_gf_interval = FIXED_GF_INTERVAL; + rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; + } else { + double framerate = cpi->framerate; + // Set Maximum gf/arf interval + rc->max_gf_interval = oxcf->max_gf_interval; + rc->min_gf_interval = oxcf->min_gf_interval; +#if CONFIG_RATE_CTRL + if (oxcf->use_simple_encode_api) { + // In this experiment, we avoid framerate being changed dynamically during + // encoding. + framerate = oxcf->init_framerate; + } +#endif // CONFIG_RATE_CTRL + if (rc->min_gf_interval == 0) { + rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, framerate); + } + if (rc->max_gf_interval == 0) { + rc->max_gf_interval = + vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval); + } + + // Extended max interval for genuinely static scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; + + // Clamp min to max + rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } + } +} + +void vp9_rc_update_framerate(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + int vbr_max_bits; + + rc->avg_frame_bandwidth = + (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX); + rc->min_frame_bandwidth = + (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); + + rc->min_frame_bandwidth = + VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + + // A maximum bitrate for a frame is defined. + // However this limit is extended if a very high rate is given on the command + // line or the rate can not be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value take precedence. + vbr_max_bits = + (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / + 100); + rc->max_frame_bandwidth = + VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + + vp9_rc_set_gf_interval_range(cpi, rc); +} + +#define VBR_PCT_ADJUSTMENT_LIMIT 50 +// For VBR...adjustment to the frame target based on error from previous frames +static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { + RATE_CONTROL *const rc = &cpi->rc; + int64_t vbr_bits_off_target = rc->vbr_bits_off_target; + int max_delta; + int frame_window = VPXMIN(16, ((int)cpi->twopass.total_stats.count - + cpi->common.current_video_frame)); + + // Calcluate the adjustment to rate for this frame. + if (frame_window > 0) { + max_delta = (vbr_bits_off_target > 0) + ? (int)(vbr_bits_off_target / frame_window) + : (int)(-vbr_bits_off_target / frame_window); + + max_delta = VPXMIN(max_delta, + ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); + + // vbr_bits_off_target > 0 means we have extra bits to spend + if (vbr_bits_off_target > 0) { + *this_frame_target += (vbr_bits_off_target > max_delta) + ? max_delta + : (int)vbr_bits_off_target; + } else { + *this_frame_target -= (vbr_bits_off_target < -max_delta) + ? max_delta + : (int)-vbr_bits_off_target; + } + } + + // Fast redistribution of bits arising from massive local undershoot. + // Don't do it for kf,arf,gf or overlay frames. + if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && + rc->vbr_bits_off_target_fast) { + int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); + int fast_extra_bits; + fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = (int)VPXMIN( + fast_extra_bits, + VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8)); + *this_frame_target += (int)fast_extra_bits; + rc->vbr_bits_off_target_fast -= fast_extra_bits; + } +} + +void vp9_set_target_rate(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int target_rate = rc->base_frame_target; + + if (cpi->common.frame_type == KEY_FRAME) + target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate); + else + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } + vp9_rc_set_frame_target(cpi, target_rate); +} + +// Check if we should resize, based on average QP from past x frames. +// Only allow for resize at most one scale down for now, scaling factor is 2. +int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + RESIZE_ACTION resize_action = NO_RESIZE; + int avg_qp_thr1 = 70; + int avg_qp_thr2 = 50; + // Don't allow for resized frame to go below 320x180, resize in steps of 3/4. + int min_width = (320 * 4) / 3; + int min_height = (180 * 4) / 3; + int down_size_on = 1; + int force_downsize_rate = 0; + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 1; + // Don't resize on key frame; reset the counters on key frame. + if (cm->frame_type == KEY_FRAME) { + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + return 0; + } + + // No resizing down if frame size is below some limit. + if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0; + +#if CONFIG_VP9_TEMPORAL_DENOISING + // If denoiser is on, apply a smaller qp threshold. + if (cpi->oxcf.noise_sensitivity > 0) { + avg_qp_thr1 = 60; + avg_qp_thr2 = 40; + } +#endif + + // Force downsize based on per-frame-bandwidth, for extreme case, + // for HD input. + if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) { + if (rc->avg_frame_bandwidth < 300000 / 30) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } else if (rc->avg_frame_bandwidth < 400000 / 30) { + resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR; + cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER; + force_downsize_rate = 1; + } + } else if (cpi->resize_state == THREE_QUARTER && + cm->width * cm->height >= 960 * 540) { + if (rc->avg_frame_bandwidth < 300000 / 30) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } + } + + // Resize based on average buffer underflow and QP over some window. + // Ignore samples close to key frame, since QP is usually high after key. + if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) { + const int window = VPXMIN(30, (int)(2 * cpi->framerate)); + cpi->resize_avg_qp += rc->last_q[INTER_FRAME]; + if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) + ++cpi->resize_buffer_underflow; + ++cpi->resize_count; + // Check for resize action every "window" frames. + if (cpi->resize_count >= window) { + int avg_qp = cpi->resize_avg_qp / cpi->resize_count; + // Resize down if buffer level has underflowed sufficient amount in past + // window, and we are at original or 3/4 of original resolution. + // Resize back up if average QP is low, and we are currently in a resized + // down state, i.e. 1/2 or 3/4 of original resolution. + // Currently, use a flag to turn 3/4 resizing feature on/off. + if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2) && + down_size_on) { + if (cpi->resize_state == THREE_QUARTER) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + } else if (cpi->resize_state == ORIG) { + resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR; + cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER; + } + } else if (cpi->resize_state != ORIG && + avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) { + if (cpi->resize_state == THREE_QUARTER || + avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100 || + ONEHALFONLY_RESIZE) { + resize_action = UP_ORIG; + cpi->resize_state = ORIG; + } else if (cpi->resize_state == ONE_HALF) { + resize_action = UP_THREEFOUR; + cpi->resize_state = THREE_QUARTER; + } + } + // Reset for next window measurement. + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + cpi->resize_buffer_underflow = 0; + } + } + // If decision is to resize, reset some quantities, and check is we should + // reduce rate correction factor, + if (resize_action != NO_RESIZE) { + int target_bits_per_frame; + int active_worst_quality; + int qindex; + int tot_scale_change; + if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) { + cpi->resize_scale_num = 3; + cpi->resize_scale_den = 4; + } else if (resize_action == DOWN_ONEHALF) { + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 2; + } else { // UP_ORIG or anything else + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 1; + } + tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) / + (cpi->resize_scale_num * cpi->resize_scale_num); + // Reset buffer level to optimal, update target size. + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->this_frame_target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + // Get the projected qindex, based on the scaled target frame size (scaled + // so target_bits_per_mb in vp9_rc_regulate_q will be correct target). + target_bits_per_frame = (resize_action >= 0) + ? rc->this_frame_target * tot_scale_change + : rc->this_frame_target / tot_scale_change; + active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + qindex = vp9_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality, + active_worst_quality); + // If resize is down, check if projected q index is close to worst_quality, + // and if so, reduce the rate correction factor (since likely can afford + // lower q for resized frame). + if (resize_action > 0 && qindex > 90 * cpi->rc.worst_quality / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.85; + } + // If resize is back up, check if projected q index is too much above the + // current base_qindex, and if so, reduce the rate correction factor + // (since prefer to keep q for resized frame at least close to previous q). + if (resize_action < 0 && qindex > 130 * cm->base_qindex / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.9; + } + } + return resize_action; +} + +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + int found = 0; + int found2 = 0; + int frame; + int i; + uint64_t avg_source_sad_lag = avg_sad_current; + int high_source_sad_lagindex = -1; + int steady_sad_lagindex = -1; + uint32_t sad_thresh1 = 70000; + uint32_t sad_thresh2 = 120000; + int low_content = 0; + int high_content = 0; + double rate_err = 1.0; + // Get measure of complexity over the future frames, and get the first + // future frame with high_source_sad/scene-change. + int tot_frames = (int)vp9_lookahead_depth(cpi->lookahead) - 1; + for (frame = tot_frames; frame >= 1; --frame) { + const int lagframe_idx = tot_frames - frame + 1; + uint64_t reference_sad = rc->avg_source_sad[0]; + for (i = 1; i < lagframe_idx; ++i) { + if (rc->avg_source_sad[i] > 0) + reference_sad = (3 * reference_sad + rc->avg_source_sad[i]) >> 2; + } + // Detect up-coming scene change. + if (!found && + (rc->avg_source_sad[lagframe_idx] > + VPXMAX(sad_thresh1, (unsigned int)(reference_sad << 1)) || + rc->avg_source_sad[lagframe_idx] > + VPXMAX(3 * sad_thresh1 >> 2, + (unsigned int)(reference_sad << 2)))) { + high_source_sad_lagindex = lagframe_idx; + found = 1; + } + // Detect change from motion to steady. + if (!found2 && lagframe_idx > 1 && lagframe_idx < tot_frames && + rc->avg_source_sad[lagframe_idx - 1] > (sad_thresh1 >> 2)) { + found2 = 1; + for (i = lagframe_idx; i < tot_frames; ++i) { + if (!(rc->avg_source_sad[i] > 0 && + rc->avg_source_sad[i] < (sad_thresh1 >> 2) && + rc->avg_source_sad[i] < + (rc->avg_source_sad[lagframe_idx - 1] >> 1))) { + found2 = 0; + i = tot_frames; + } + } + if (found2) steady_sad_lagindex = lagframe_idx; + } + avg_source_sad_lag += rc->avg_source_sad[lagframe_idx]; + } + if (tot_frames > 0) avg_source_sad_lag = avg_source_sad_lag / tot_frames; + // Constrain distance between detected scene cuts. + if (high_source_sad_lagindex != -1 && + high_source_sad_lagindex != rc->high_source_sad_lagindex - 1 && + abs(high_source_sad_lagindex - rc->high_source_sad_lagindex) < 4) + rc->high_source_sad_lagindex = -1; + else + rc->high_source_sad_lagindex = high_source_sad_lagindex; + // Adjust some factors for the next GF group, ignore initial key frame, + // and only for lag_in_frames not too small. + if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 && + cpi->oxcf.lag_in_frames > 8) { + int frame_constraint; + if (rc->rolling_target_bits > 0) + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + high_content = high_source_sad_lagindex != -1 || + avg_source_sad_lag > (rc->prev_avg_source_sad_lag << 1) || + avg_source_sad_lag > sad_thresh2; + low_content = high_source_sad_lagindex == -1 && + ((avg_source_sad_lag < (rc->prev_avg_source_sad_lag >> 1)) || + (avg_source_sad_lag < sad_thresh1)); + if (low_content) { + rc->gfu_boost = DEFAULT_GF_BOOST; + rc->baseline_gf_interval = + VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); + } else if (high_content) { + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; + rc->baseline_gf_interval = (rate_err > 3.0) + ? VPXMAX(10, rc->baseline_gf_interval >> 1) + : VPXMAX(6, rc->baseline_gf_interval >> 1); + } + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1) + rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1; + // Check for constraining gf_interval for up-coming scene/content changes, + // or for up-coming key frame, whichever is closer. + frame_constraint = rc->frames_to_key; + if (rc->high_source_sad_lagindex > 0 && + frame_constraint > rc->high_source_sad_lagindex) + frame_constraint = rc->high_source_sad_lagindex; + if (steady_sad_lagindex > 3 && frame_constraint > steady_sad_lagindex) + frame_constraint = steady_sad_lagindex; + adjust_gfint_frame_constraint(cpi, frame_constraint); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // Adjust factors for active_worst setting & af_ratio for next gf interval. + rc->fac_active_worst_inter = 150; // corresponds to 3/2 (= 150 /100). + rc->fac_active_worst_gf = 100; + if (rate_err < 2.0 && !high_content) { + rc->fac_active_worst_inter = 120; + rc->fac_active_worst_gf = 90; + } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) { + // Increase active_worst faster at low Q if rate fluctuation is high. + rc->fac_active_worst_inter = 200; + if (rc->avg_frame_qindex[INTER_FRAME] < 8) + rc->fac_active_worst_inter = 400; + } + if (low_content && rc->avg_frame_low_motion > 80) { + rc->af_ratio_onepass_vbr = 15; + } else if (high_content || rc->avg_frame_low_motion < 30) { + rc->af_ratio_onepass_vbr = 5; + rc->gfu_boost = DEFAULT_GF_BOOST >> 2; + } + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. + int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. + if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } + } + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); + } + rc->prev_avg_source_sad_lag = avg_source_sad_lag; +} + +// Compute average source sad (temporal sad: between current source and +// previous source) over a subset of superblocks. Use this is detect big changes +// in content and allow rate control to react. +// This function also handles special case of lag_in_frames, to measure content +// level in #future frames set by the lag_in_frames. +void vp9_scene_detection_onepass(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source; + YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source; + uint8_t *src_y; + int src_ystride; + int src_width; + int src_height; + uint8_t *last_src_y; + int last_src_ystride; + int last_src_width; + int last_src_height; + if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL || + (cpi->use_svc && cpi->svc.current_superframe == 0)) + return; + src_y = unscaled_src->y_buffer; + src_ystride = unscaled_src->y_stride; + src_width = unscaled_src->y_width; + src_height = unscaled_src->y_height; + last_src_y = unscaled_last_src->y_buffer; + last_src_ystride = unscaled_last_src->y_stride; + last_src_width = unscaled_last_src->y_width; + last_src_height = unscaled_last_src->y_height; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return; +#endif + rc->high_source_sad = 0; + rc->high_num_blocks_with_motion = 0; + // For SVC: scene detection is only checked on first spatial layer of + // the superframe using the original/unscaled resolutions. + if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + src_width == last_src_width && src_height == last_src_height) { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; + int num_mi_cols = cm->mi_cols; + int num_mi_rows = cm->mi_rows; + int start_frame = 0; + int frames_to_buffer = 1; + int frame = 0; + int scene_cut_force_key_frame = 0; + int num_zero_temp_sad = 0; + uint64_t avg_sad_current = 0; + uint32_t min_thresh = 20000; // ~5 * 64 * 64 + float thresh = 8.0f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000; + if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f; + if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) { + const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2); + num_mi_cols = aligned_width >> MI_SIZE_LOG2; + num_mi_rows = aligned_height >> MI_SIZE_LOG2; + } + if (cpi->oxcf.lag_in_frames > 0) { + frames_to_buffer = (cm->current_video_frame == 1) + ? (int)vp9_lookahead_depth(cpi->lookahead) - 1 + : 2; + start_frame = (int)vp9_lookahead_depth(cpi->lookahead) - 1; + for (frame = 0; frame < frames_to_buffer; ++frame) { + const int lagframe_idx = start_frame - frame; + if (lagframe_idx >= 0) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, lagframe_idx); + frames[frame] = &buf->img; + } + } + // The avg_sad for this current frame is the value of frame#1 + // (first future frame) from previous frame. + avg_sad_current = rc->avg_source_sad[1]; + if (avg_sad_current > + VPXMAX(min_thresh, + (unsigned int)(rc->avg_source_sad[0] * thresh)) && + cm->current_video_frame > (unsigned int)cpi->oxcf.lag_in_frames) + rc->high_source_sad = 1; + else + rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; + // Update recursive average for current frame. + if (avg_sad_current > 0) + rc->avg_source_sad[0] = + (3 * rc->avg_source_sad[0] + avg_sad_current) >> 2; + // Shift back data, starting at frame#1. + for (frame = 1; frame < cpi->oxcf.lag_in_frames - 1; ++frame) + rc->avg_source_sad[frame] = rc->avg_source_sad[frame + 1]; + } + for (frame = 0; frame < frames_to_buffer; ++frame) { + if (cpi->oxcf.lag_in_frames == 0 || + (frames[frame] != NULL && frames[frame + 1] != NULL && + frames[frame]->y_width == frames[frame + 1]->y_width && + frames[frame]->y_height == frames[frame + 1]->y_height)) { + int sbi_row, sbi_col; + const int lagframe_idx = + (cpi->oxcf.lag_in_frames == 0) ? 0 : start_frame - frame + 1; + const BLOCK_SIZE bsize = BLOCK_64X64; + // Loop over sub-sample of frame, compute average sad over 64x64 blocks. + uint64_t avg_sad = 0; + uint64_t tmp_sad = 0; + int num_samples = 0; + int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + if (cpi->oxcf.lag_in_frames > 0) { + src_y = frames[frame]->y_buffer; + src_ystride = frames[frame]->y_stride; + last_src_y = frames[frame + 1]->y_buffer; + last_src_ystride = frames[frame + 1]->y_stride; + } + num_zero_temp_sad = 0; + for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { + for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { + // Checker-board pattern, ignore boundary. + if (((sbi_row > 0 && sbi_col > 0) && + (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && + ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || + (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { + tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + avg_sad += tmp_sad; + num_samples++; + if (tmp_sad == 0) num_zero_temp_sad++; + } + src_y += 64; + last_src_y += 64; + } + src_y += (src_ystride << 6) - (sb_cols << 6); + last_src_y += (last_src_ystride << 6) - (sb_cols << 6); + } + if (num_samples > 0) avg_sad = avg_sad / num_samples; + // Set high_source_sad flag if we detect very high increase in avg_sad + // between current and previous frame value(s). Use minimum threshold + // for cases where there is small change from content that is completely + // static. + if (lagframe_idx == 0) { + if (avg_sad > + VPXMAX(min_thresh, + (unsigned int)(rc->avg_source_sad[0] * thresh)) && + rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && + num_zero_temp_sad < 3 * (num_samples >> 2)) + rc->high_source_sad = 1; + else + rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; + if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) + rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; + } else { + rc->avg_source_sad[lagframe_idx] = avg_sad; + } + if (num_zero_temp_sad < (3 * num_samples >> 2)) + rc->high_num_blocks_with_motion = 1; + } + } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } + // For VBR, under scene change/high content change, force golden refresh. + if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && + rc->high_source_sad && rc->frames_to_key > 3 && + rc->count_last_scene_change > 4 && + cpi->ext_refresh_frame_flags_pending == 0) { + int target; + cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; + rc->source_alt_ref_pending = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; + rc->baseline_gf_interval = + VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); + adjust_gfint_frame_constraint(cpi, rc->frames_to_key); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); + rc->count_last_scene_change = 0; + } else { + rc->count_last_scene_change++; + } + // If lag_in_frame is used, set the gf boost and interval. + if (cpi->oxcf.lag_in_frames > 0) + adjust_gf_boost_lag_one_pass_vbr(cpi, avg_sad_current); + } +} + +// Test if encoded frame will significantly overshoot the target bitrate, and +// if so, set the QP, reset/adjust some rate control parameters, and return 1. +// frame_size = -1 means frame has not been encoded. +int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + SPEED_FEATURES *const sf = &cpi->sf; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower thresh_qp for video (more overshoot at lower Q) to be + // more conservative for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_qp = 3 * (rc->worst_quality >> 2); + // If this decision is not based on an encoded frame size but just on + // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt == + // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate) + // condition in this case. + // TODO(marpan): Use a better size/rate condition for this case and + // adjust thresholds. + if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ || + frame_size > thresh_rate) && + cm->base_qindex < thresh_qp) { + double rate_correction_factor = + cpi->rc.rate_correction_factors[INTER_NORMAL]; + const int target_size = cpi->rc.avg_frame_bandwidth; + double new_correction_factor; + int target_bits_per_mb; + double q2; + int enumerator; + // Force a re-encode, and for now use max-QP. + *q = cpi->rc.worst_quality; + cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 1; + // If the frame_size is much larger than the threshold (big content change) + // and the encoded frame used alot of Intra modes, then force hybrid_intra + // encoding for the re-encode on this scene change. hybrid_intra will + // use rd-based intra mode selection for small blocks. + if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) { + MODE_INFO **mi = cm->mi_grid_visible; + int sum_intra_usage = 0; + int mi_row, mi_col; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; + mi++; + } + mi += 8; + } + sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols); + if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1; + } + // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as + // these parameters will affect QP selection for subsequent frames. If they + // have settled down to a very different (low QP) state, then not adjusting + // them may cause next frame to select low QP and overshoot again. + cpi->rc.avg_frame_qindex[INTER_FRAME] = *q; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + // Reset rate under/over-shoot flags. + cpi->rc.rc_1_frame = 0; + cpi->rc.rc_2_frame = 0; + // Adjust rate correction factor. + target_bits_per_mb = + (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->MBs); + // Rate correction factor based on target_bits_per_mb and qp (==max_QP). + // This comes from the inverse computation of vp9_rc_bits_per_mb(). + q2 = vp9_convert_qindex_to_q(*q, cm->bit_depth); + enumerator = 1800000; // Factor for inter frame. + enumerator += (int)(enumerator * q2) >> 12; + new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; + if (new_correction_factor > rate_correction_factor) { + rate_correction_factor = + VPXMIN(2.0 * rate_correction_factor, new_correction_factor); + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + } + // For temporal layers, reset the rate control parametes across all + // temporal layers. If the first_spatial_layer_to_encode > 0, then this + // superframe has skipped lower base layers. So in this case we should also + // reset and force max-q for spatial layers < first_spatial_layer_to_encode. + if (cpi->use_svc) { + int tl = 0; + int sl = 0; + SVC *svc = &cpi->svc; + for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = *q; + lrc->buffer_level = lrc->optimal_buffer_level; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + lrc->force_max_q = 1; + } + } + } + return 1; + } else { + return 0; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h new file mode 100644 index 0000000000..48c49e937e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_RATECTRL_H_ + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_lookahead.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Used to control aggressive VBR mode. +// #define AGGRESSIVE_VBR 1 + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + +#define MIN_GF_INTERVAL 4 +#define MAX_GF_INTERVAL 16 +#define FIXED_GF_INTERVAL 8 // Used in some testing modes only +#define ONEHALFONLY_RESIZE 0 + +#define FRAME_OVERHEAD_BITS 200 + +// Threshold used to define a KF group as static (e.g. a slide show). +// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + +typedef enum { + INTER_NORMAL = 0, + INTER_HIGH = 1, + GF_ARF_LOW = 2, + GF_ARF_STD = 3, + KF_STD = 4, + RATE_FACTOR_LEVELS = 5 +} RATE_FACTOR_LEVEL; + +// Internal frame scaling level. +typedef enum { + UNSCALED = 0, // Frame is unscaled. + SCALE_STEP1 = 1, // First-level down-scaling. + FRAME_SCALE_STEPS +} FRAME_SCALE_LEVEL; + +typedef enum { + NO_RESIZE = 0, + DOWN_THREEFOUR = 1, // From orig to 3/4. + DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2. + UP_THREEFOUR = -1, // From 1/2 to 3/4. + UP_ORIG = -2, // From 1/2 or 3/4 to orig. +} RESIZE_ACTION; + +typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE; + +// Frame dimensions multiplier wrt the native frame size, in 1/16ths, +// specified for the scale-up case. +// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is +// intended to match the capabilities of the normative scaling filters, +// giving precedence to the up-scaling accuracy. +static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 }; + +// Multiplier of the target rate to be used as threshold for triggering scaling. +static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; + +// Scale dependent Rate Correction Factor multipliers. Compensates for the +// greater number of bits per pixel generated in down-scaled frames. +static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; + +typedef struct { + // Rate targeting variables + int base_frame_target; // A baseline frame target before adjustment + // for previous under or over shoot. + int this_frame_target; // Actual frame target after rc adjustment. + int projected_frame_size; + int sb64_target_rate; + int last_q[FRAME_TYPES]; // Separate values for Intra/Inter + int last_boosted_qindex; // Last boosted GF/KF/ARF q + int last_kf_qindex; // Q index of the last key frame coded. + + int gfu_boost; + int last_boost; + int kf_boost; + + double rate_correction_factors[RATE_FACTOR_LEVELS]; + + int frames_since_golden; + int frames_till_gf_update_due; + int min_gf_interval; + int max_gf_interval; + int static_scene_max_gf_interval; + int baseline_gf_interval; + int constrained_gf_group; + int frames_to_key; + int frames_since_key; + int this_key_frame_forced; + int next_key_frame_forced; + int source_alt_ref_pending; + int source_alt_ref_active; + int is_src_frame_alt_ref; + + int avg_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame + int max_frame_bandwidth; // Maximum burst rate allowed for a frame. + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex[FRAME_TYPES]; + double tot_q; + double avg_q; + + int64_t buffer_level; + int64_t bits_off_target; + int64_t vbr_bits_off_target; + int64_t vbr_bits_off_target_fast; + + int decimation_factor; + int decimation_count; + + int rolling_target_bits; + int rolling_actual_bits; + + int long_rolling_target_bits; + int long_rolling_actual_bits; + + int rate_error_estimate; + + int64_t total_actual_bits; + int64_t total_target_bits; + int64_t total_target_vs_actual; + + int worst_quality; + int best_quality; + + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + + // rate control history for last frame(1) and the frame before(2). + // -1: undershot + // 1: overshoot + // 0: not initialized. + int rc_1_frame; + int rc_2_frame; + int q_1_frame; + int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; + + // Auto frame-scaling variables. + FRAME_SCALE_LEVEL frame_size_selector; + FRAME_SCALE_LEVEL next_frame_size_selector; + int frame_width[FRAME_SCALE_STEPS]; + int frame_height[FRAME_SCALE_STEPS]; + int rf_level_maxq[RATE_FACTOR_LEVELS]; + + int fac_active_worst_inter; + int fac_active_worst_gf; + uint64_t avg_source_sad[MAX_LAG_BUFFERS]; + uint64_t prev_avg_source_sad_lag; + int high_source_sad_lagindex; + int high_num_blocks_with_motion; + int alt_ref_gf_group; + int last_frame_is_src_altref; + int high_source_sad; + int count_last_scene_change; + int hybrid_intra_scene_change; + int re_encode_maxq_scene_change; + int avg_frame_low_motion; + int af_ratio_onepass_vbr; + int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; + int force_max_q; + // Last frame was dropped post encode on scene change. + int last_post_encode_dropped_scene_change; + // Enable post encode frame dropping for screen content. Only enabled when + // ext_use_post_encode_drop is enabled by user. + int use_post_encode_drop; + // External flag to enable post encode frame dropping, controlled by user. + int ext_use_post_encode_drop; + // Flag to disable CBR feature to increase Q on overshoot detection. + int disable_overshoot_maxq_cbr; + int damped_adjustment[RATE_FACTOR_LEVELS]; + double arf_active_best_quality_adjustment_factor; + int arf_increase_active_best_quality; + + int preserve_arf_as_gld; + int preserve_next_arf_as_gld; + int show_arf_as_gld; + + // Flag to constrain golden frame interval on key frame frequency for 1 pass + // VBR. + int constrain_gf_key_freq_onepass_vbr; + + // The index of the current GOP. Start from zero. + // When a key frame is inserted, it resets to zero. + int gop_global_index; +} RATE_CONTROL; + +struct VP9_COMP; +struct VP9EncoderConfig; + +void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, + RATE_CONTROL *rc); + +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, + double correction_factor, vpx_bit_depth_t bit_depth); + +double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); + +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth); + +void vp9_rc_init_minq_luts(void); + +int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); +// Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to +// be passed in to ensure that the max_gf_interval returned is at least as big +// as that. +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); + +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// vp9_rc_get_one_pass_vbr_params() +// vp9_rc_get_one_pass_cbr_params() +// vp9_rc_get_svc_params() +// vp9_rc_get_first_pass_params() +// vp9_rc_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by one of: +// vp9_rc_postencode_update() +// vp9_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the vp9_rc_get_..._params() functions and +// updated during the vp9_rc_postencode_update...() functions. +// The only exceptions are vp9_rc_drop_frame() and +// vp9_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. +void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi); +void vp9_rc_get_svc_params(struct VP9_COMP *cpi); + +// Post encode update of the rate control parameters based +// on bytes used +void vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); + +// Updates rate correction factors +// Changes only the rate correction factors in the rate control structure. +void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi); + +// Post encode drop for CBR mode. +int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size); + +int vp9_test_drop(struct VP9_COMP *cpi); + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int vp9_rc_drop_frame(struct VP9_COMP *cpi); + +// Computes frame size bounds. +void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, + int frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); + +// Picks q and q bounds given the target for bits +int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi, int *bottom_index, + int *top_index); + +// Estimates q to achieve a target bits per frame +int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality); + +// Estimates bits per mb for a given qindex and correction factor. +int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, vpx_bit_depth_t bit_depth); + +// Clamping utilities for bitrate targets for iframes and pframes. +int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi, + int target); +int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi, + int target); +// Utility to set frame_target into the RATE_CONTROL structure +// This function is called only from the vp9_rc_get_..._params() functions. +void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a target q value +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + vpx_bit_depth_t bit_depth); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a value that should equate to the given rate ratio. +int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + vpx_bit_depth_t bit_depth); + +int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q); + +void vp9_rc_update_framerate(struct VP9_COMP *cpi); + +void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi, + RATE_CONTROL *const rc); + +void vp9_set_target_rate(struct VP9_COMP *cpi); + +int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); + +void vp9_scene_detection_onepass(struct VP9_COMP *cpi); + +int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); + +void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); + +void vp9_estimate_qp_gop(struct VP9_COMP *cpi); + +void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); + +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c new file mode 100644 index 0000000000..95c95971c5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vp9_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/bitops.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_tokenize.h" + +#define RD_THRESH_POW 1.25 + +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + +void vp9_rd_cost_reset(RD_COST *rd_cost) { + rd_cost->rate = INT_MAX; + rd_cost->dist = INT64_MAX; + rd_cost->rdcost = INT64_MAX; +} + +void vp9_rd_cost_init(RD_COST *rd_cost) { + rd_cost->rate = 0; + rd_cost->dist = 0; + rd_cost->rdcost = 0; +} + +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) { + assert(mult >= 0); + assert(div > 0); + if (rate >= 0 && dist >= 0) { + return RDCOST(mult, div, rate, dist); + } + if (rate >= 0 && dist < 0) { + return RDCOST_NEG_D(mult, div, rate, -dist); + } + if (rate < 0 && dist >= 0) { + return RDCOST_NEG_R(mult, div, -rate, dist); + } + return -RDCOST(mult, div, -rate, -dist); +} + +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) { + rd_cost->rdcost = + vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist); + } else { + vp9_rd_cost_reset(rd_cost); + } +} + +// The baseline rd thresholds for breaking out of the rd loop for +// certain modes are assumed to be based on 8x8 blocks. +// This table is used to correct for block size. +// The factors here are << 2 (2 = x0.5, 32 = x8 etc). +static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32 +}; + +static void fill_mode_costs(VP9_COMP *cpi) { + const FRAME_CONTEXT *const fc = cpi->common.fc; + int i, j; + + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { + vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], + vp9_intra_mode_tree); + } + } + + vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); + for (i = 0; i < INTRA_MODES; ++i) { + vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i], + vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree); + vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i], + fc->uv_mode_prob[i], vp9_intra_mode_tree); + } + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + vp9_cost_tokens(cpi->switchable_interp_costs[i], + fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } +} + +static void fill_token_costs(vp9_coeff_cost *c, + vp9_coeff_probs_model (*p)[PLANE_TYPES]) { + int i, j, k, l; + TX_SIZE t; + for (t = TX_4X4; t <= TX_32X32; ++t) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + vpx_prob probs[ENTROPY_NODES]; + vp9_model_to_full_probs(p[t][i][j][k][l], probs); + vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree); + vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, + vp9_coef_tree); + assert(c[t][i][j][k][0][l][EOB_TOKEN] == + c[t][i][j][k][1][l][EOB_TOKEN]); + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit16lut_8[QINDEX_RANGE]; +static int sad_per_bit4lut_8[QINDEX_RANGE]; + +#if CONFIG_VP9_HIGHBITDEPTH +static int sad_per_bit16lut_10[QINDEX_RANGE]; +static int sad_per_bit4lut_10[QINDEX_RANGE]; +static int sad_per_bit16lut_12[QINDEX_RANGE]; +static int sad_per_bit4lut_12[QINDEX_RANGE]; +#endif + +static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, + vpx_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. + for (i = 0; i < range; i++) { + const double q = vp9_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + bit4lut[i] = (int)(0.063 * q + 2.742); + } +} + +void vp9_init_me_luts(void) { + init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, + VPX_BITS_8); +#if CONFIG_VP9_HIGHBITDEPTH + init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE, + VPX_BITS_10); + init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE, + VPX_BITS_12); +#endif +} + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; + +// Note that the element below for frame type "USE_BUF_FRAME", which indicates +// that the show frame flag is set, should not be used as no real frame +// is encoded so we should not reach here. However, a dummy value +// is inserted here to make sure the data structure has the right number +// of values assigned. +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144 }; + +// Configure Vizier RD parameters. +// Later this function will use passed in command line values. +void vp9_init_rd_parameters(VP9_COMP *cpi) { + RD_CONTROL *const rdc = &cpi->rd_ctrl; + + // When |use_vizier_rc_params| is 1, we expect the rd parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 1, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (cpi->twopass.use_vizier_rc_params) return; + + // Make sure this function is floating point safe. + vpx_clear_system_state(); + + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_arf_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; +} + +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 4.15 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 4.25 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 4.35 + (0.001 * (double)qindex); +} + +int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { + const RD_CONTROL *rdc = &cpi->rd_ctrl; + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); + // largest dc_quant is 21387, therefore rdmult should fit in int32_t + int rdmult = q * q; + + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + + // Make sure this function is floating point safe. + vpx_clear_system_state(); + + if (cpi->common.frame_type == KEY_FRAME) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac); + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac); + } + +#if CONFIG_VP9_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: break; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + return rdmult > 0 ? rdmult : 1; +} + +static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { + int64_t rdmult_64 = rdmult; + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : cpi->rc.gfu_boost; + const int boost_index = VPXMIN(15, (gfu_boost / 100)); + + rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7; + rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7); + } + return (int)rdmult_64; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + return modulate_rdmult(cpi, rdmult); +} + +int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) { + int rdmult = + vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex); + rdmult = (int)((double)rdmult / beta); + rdmult = rdmult > 0 ? rdmult : 1; + return modulate_rdmult(cpi, rdmult); +} + +static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { + double q; +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; + case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; + default: + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; + } +#else + (void)bit_depth; + q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; +#endif // CONFIG_VP9_HIGHBITDEPTH + // TODO(debargha): Adjust the function below. + return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case VPX_BITS_8: + x->sadperbit16 = sad_per_bit16lut_8[qindex]; + x->sadperbit4 = sad_per_bit4lut_8[qindex]; + break; + case VPX_BITS_10: + x->sadperbit16 = sad_per_bit16lut_10[qindex]; + x->sadperbit4 = sad_per_bit4lut_10[qindex]; + break; + default: + assert(cpi->common.bit_depth == VPX_BITS_12); + x->sadperbit16 = sad_per_bit16lut_12[qindex]; + x->sadperbit4 = sad_per_bit4lut_12[qindex]; + break; + } +#else + (void)cpi; + x->sadperbit16 = sad_per_bit16lut_8[qindex]; + x->sadperbit4 = sad_per_bit4lut_8[qindex]; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = + clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) + + cm->y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + if (bsize >= BLOCK_8X8) { + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } else { + for (i = 0; i < MAX_REFS; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult_sub8x8[i] < thresh_max + ? rd->thresh_mult_sub8x8[i] * t / 4 + : INT_MAX; + } + } + } +} + +void vp9_build_inter_mode_cost(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + int i; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i], + vp9_inter_mode_tree); + } +} + +void vp9_initialize_rd_consts(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + RD_OPT *const rd = &cpi->rd; + int i; + + vpx_clear_system_state(); + + rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128). + rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + + set_error_per_bit(x, rd->RDMULT); + + x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) + ? 0 + : 1; + + set_block_thresholds(cm, rd); + set_partition_probs(cm, xd); + + if (cpi->oxcf.pass == 1) { + if (!frame_is_intra_only(cm)) + vp9_build_nmv_cost_table( + x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, + &cm->fc->nmvc, cm->allow_high_precision_mv); + } else { + if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) + fill_token_costs(x->token_costs, cm->fc->coef_probs); + + if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || + cm->frame_type == KEY_FRAME) { + for (i = 0; i < PARTITION_CONTEXTS; ++i) + vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i), + vp9_partition_tree); + } + + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || + cm->frame_type == KEY_FRAME) { + fill_mode_costs(cpi); + + if (!frame_is_intra_only(cm)) { + vp9_build_nmv_cost_table( + x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, + &cm->fc->nmvc, cm->allow_high_precision_mv); + vp9_build_inter_mode_cost(cpi); + } + } + } +} + +// NOTE: The tables below must be of the same size. + +// The functions described below are sampled at the four most significant +// bits of x^2 + 8 / 256. + +// Normalized rate: +// This table models the rate for a Laplacian source with given variance +// when quantized with a uniform quantizer with given stepsize. The +// closed form expression is: +// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], +// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), +// and H(x) is the binary entropy function. +static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044, + 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037, + 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179, + 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398, + 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745, + 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213, + 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21, + 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0, +}; + +// Normalized distortion: +// This table models the normalized distortion for a Laplacian source +// with given variance when quantized with a uniform quantizer +// with given stepsize. The closed form expression is: +// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) +// where x = qpstep / sqrt(variance). +// Note the actual distortion is Dn * variance. +static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, + 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21, + 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69, + 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200, + 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458, + 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823, + 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001, + 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, +}; +static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, +}; + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +static const uint32_t MAX_XSQ_Q10 = 245727; + +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +// Disable gcc 12.2 false positive warning. +// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + + int i; + switch (tx_size) { + case TX_4X4: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + default: + assert(tx_size == TX_32X32); + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + } +} +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + +void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + int i; + int zero_seen = 0; + int best_index = 0; + int best_sad = INT_MAX; + int this_sad = INT_MAX; + int max_mv = 0; + int near_same_nearest; + uint8_t *src_y_ptr = x->plane[0].src.buf; + uint8_t *ref_y_ptr; + const int num_mv_refs = + MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size); + + MV pred_mv[3]; + pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; + pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; + pred_mv[2] = x->pred_mv[ref_frame]; + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == + x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + + // Get the sad for each candidate reference mv. + for (i = 0; i < num_mv_refs; ++i) { + const MV *this_mv = &pred_mv[i]; + int fp_row, fp_col; + if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue; + if (i == 1 && near_same_nearest) continue; + fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, + ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + best_index = i; + } + } + + // Note the index of the mv that worked best in the reference list. + x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void vp9_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + int i; + + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, + i ? scale_uv : scale, xd->plane[i].subsampling_x, + xd->plane[i].subsampling_y); + } +} + +int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, + int stride) { + const int bw = b_width_log2_lookup[plane_bsize]; + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; +} + +int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block, + int16_t *base) { + const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + return base + vp9_raster_block_offset(plane_bsize, raster_block, stride); +} + +YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, + int ref_frame) { + const VP9_COMMON *const cm = &cpi->common; + const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; + const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) + ? &cm->buffer_pool->frame_bufs[scaled_idx].buf + : NULL; +} + +int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) { + const MODE_INFO *const mi = xd->mi[0]; + const int ctx = get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + cpi->switchable_interp_costs[ctx][mi->interp_filter]; +} + +void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) { + int i; + RD_OPT *const rd = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + + // Set baseline threshold values. + for (i = 0; i < MAX_MODES; ++i) + rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0; + + if (sf->adaptive_rd_thresh) { + rd->thresh_mult[THR_NEARESTMV] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + rd->thresh_mult[THR_NEARESTA] = 300; + } else { + rd->thresh_mult[THR_NEARESTMV] = 0; + rd->thresh_mult[THR_NEARESTG] = 0; + rd->thresh_mult[THR_NEARESTA] = 0; + } + + rd->thresh_mult[THR_DC] += 1000; + + rd->thresh_mult[THR_NEWMV] += 1000; + rd->thresh_mult[THR_NEWA] += 1000; + rd->thresh_mult[THR_NEWG] += 1000; + + rd->thresh_mult[THR_NEARMV] += 1000; + rd->thresh_mult[THR_NEARA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTLA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTGA] += 1000; + + rd->thresh_mult[THR_TM] += 1000; + + rd->thresh_mult[THR_COMP_NEARLA] += 1500; + rd->thresh_mult[THR_COMP_NEWLA] += 2000; + rd->thresh_mult[THR_NEARG] += 1000; + rd->thresh_mult[THR_COMP_NEARGA] += 1500; + rd->thresh_mult[THR_COMP_NEWGA] += 2000; + + rd->thresh_mult[THR_ZEROMV] += 2000; + rd->thresh_mult[THR_ZEROG] += 2000; + rd->thresh_mult[THR_ZEROA] += 2000; + rd->thresh_mult[THR_COMP_ZEROLA] += 2500; + rd->thresh_mult[THR_COMP_ZEROGA] += 2500; + + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D45_PRED] += 2500; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D153_PRED] += 2500; + rd->thresh_mult[THR_D207_PRED] += 2500; + rd->thresh_mult[THR_D63_PRED] += 2500; +} + +void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { + static const int thresh_mult[2][MAX_REFS] = { + { 2500, 2500, 2500, 4500, 4500, 2500 }, + { 2000, 2000, 2000, 4000, 4000, 2000 } + }; + RD_OPT *const rd = &cpi->rd; + const int idx = cpi->oxcf.mode == BEST; + memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx])); +} + +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index) { + if (rd_thresh > 0) { + const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; + int mode; + for (mode = 0; mode < top_mode; ++mode) { + const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4); + const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64); + BLOCK_SIZE bs; + for (bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> 4); + } else { + *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT); + } + } + } + } +} + +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. + return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h new file mode 100644 index 0000000000..6c61ae514a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_RD_H_ +#define VPX_VP9_ENCODER_VP9_RD_H_ + +#include + +#include "vp9/common/vp9_blockd.h" + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_cost.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDDIV_BITS 7 +#define RD_EPB_SHIFT 6 + +#define RDCOST(RM, DM, R, D) \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM)) +#define RDCOST_NEG_R(RM, DM, R, D) \ + ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) +#define RDCOST_NEG_D(RM, DM, R, D) \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM)) + +#define QIDX_SKIP_THRESH 115 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 + +#define MAX_MODES 30 +#define MAX_REFS 6 + +#define RD_THRESH_INIT_FACT 32 +#define RD_THRESH_MAX_FACT 64 +#define RD_THRESH_INC 1 + +#define VP9_DIST_SCALE_LOG2 4 +#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2) + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. +typedef enum { + THR_NEARESTMV, + THR_NEARESTA, + THR_NEARESTG, + + THR_DC, + + THR_NEWMV, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARA, + THR_NEARG, + + THR_ZEROMV, + THR_ZEROG, + THR_ZEROA, + + THR_COMP_NEARESTLA, + THR_COMP_NEARESTGA, + + THR_TM, + + THR_COMP_NEARLA, + THR_COMP_NEWLA, + THR_COMP_NEARGA, + THR_COMP_NEWGA, + + THR_COMP_ZEROLA, + THR_COMP_ZEROGA, + + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D207_PRED, + THR_D153_PRED, + THR_D63_PRED, + THR_D117_PRED, + THR_D45_PRED, +} THR_MODES; + +typedef enum { + THR_LAST, + THR_GOLD, + THR_ALTR, + THR_COMP_LA, + THR_COMP_GA, + THR_INTRA, +} THR_MODES_SUB8X8; + +typedef struct { + // RD multiplier control factors added for Vizier project. + double rd_mult_inter_qp_fac; + double rd_mult_arf_qp_fac; + double rd_mult_key_qp_fac; +} RD_CONTROL; + +typedef struct RD_OPT { + // Thresh_mult is used to set a threshold for the rd score. A higher value + // means that we will accept the best mode so far more often. This number + // is used in combination with the current block size, and thresh_freq_fact to + // pick a threshold. + int thresh_mult[MAX_MODES]; + int thresh_mult_sub8x8[MAX_REFS]; + + int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; + + int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; + + int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; + int RDMULT; + int RDDIV; + double r0; +} RD_OPT; + +typedef struct RD_COST { + int rate; + int64_t dist; + int64_t rdcost; +} RD_COST; + +// Reset the rate distortion cost values to maximum (invalid) value. +void vp9_rd_cost_reset(RD_COST *rd_cost); +// Initialize the rate distortion cost values to zero. +void vp9_rd_cost_init(RD_COST *rd_cost); +// It supports negative rate and dist, which is different from RDCOST(). +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist); +// Update the cost value based on its rate and distortion. +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost); + +struct TileInfo; +struct TileDataEnc; +struct VP9_COMP; +struct macroblock; + +void vp9_init_rd_parameters(struct VP9_COMP *cpi); + +int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); + +int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); + +int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta); + +void vp9_initialize_rd_consts(struct VP9_COMP *cpi); + +void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); + +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, + unsigned int qstep, int *rate, int64_t *dist); + +int vp9_get_switchable_rate(const struct VP9_COMP *cpi, + const MACROBLOCKD *const xd); + +int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, + int stride); + +int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block, + int16_t *base); + +YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi, + int ref_frame); + +void vp9_init_me_luts(void); + +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]); + +void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); + +void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); + +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index); + +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + const int *const thresh_fact) { + return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; +} + +static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { + x->errorperbit = rdmult >> RD_EPB_SHIFT; + x->errorperbit += (x->errorperbit == 0); +} + +void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size); + +void vp9_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv); + +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); + +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); +unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs); +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); +unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd); +#endif + +void vp9_build_inter_mode_cost(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_RD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c new file mode 100644 index 0000000000..974e43c90f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c @@ -0,0 +1,4923 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/common/vp9_seg_common.h" + +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_rdopt.h" + +#define LAST_FRAME_MODE_MASK \ + ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) +#define GOLDEN_FRAME_MODE_MASK \ + ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) +#define ALT_REF_MODE_MASK \ + ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME)) + +#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) + +#define MIN_EARLY_TERM_INDEX 3 +#define NEW_MV_DISCOUNT_FACTOR 8 + +typedef struct { + PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame[2]; +} MODE_DEFINITION; + +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; + +struct rdcost_block_args { + const VP9_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; + int64_t best_rd; + int exit_early; + int use_fast_coef_costing; + const ScanOrder *so; + uint8_t skippable; + struct buf_2d *this_recon; +}; + +#define LAST_NEW_MV_INDEX 6 + +#if !CONFIG_REALTIME_ONLY +static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, + + { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + + { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { H_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, +}; + +static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, +}; +#endif // !CONFIG_REALTIME_ONLY + +static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, + int min_plane, int max_plane) { + int i; + + for (i = min_plane; i < max_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[i]; + + p->coeff = ctx->coeff_pbuf[i][m]; + p->qcoeff = ctx->qcoeff_pbuf[i][m]; + pd->dqcoeff = ctx->dqcoeff_pbuf[i][m]; + p->eobs = ctx->eobs_pbuf[i][m]; + + ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n]; + ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n]; + ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n]; + ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n]; + + ctx->coeff_pbuf[i][n] = p->coeff; + ctx->qcoeff_pbuf[i][n] = p->qcoeff; + ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff; + ctx->eobs_pbuf[i][n] = p->eobs; + } +} + +#if !CONFIG_REALTIME_ONLY +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + int i; + int64_t rate_sum = 0; + int64_t dist_sum = 0; + const int ref = xd->mi[0]->ref_frame[0]; + unsigned int sse; + unsigned int var = 0; + int64_t total_sse = 0; + int skip_flag = 1; + const int shift = 6; + const int dequant_shift = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : +#endif // CONFIG_VP9_HIGHBITDEPTH + 3; + + x->pred_sse[ref] = 0; + + // Build prediction signal, compute stats and RD cost on per-plane basis + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; + const int64_t dc_thr = p->quant_thred[0] >> shift; + const int64_t ac_thr = p->quant_thred[1] >> shift; + unsigned int sum_sse = 0; + // The low thresholds are used to measure if the prediction errors are + // low enough so that we can skip the mode search. + const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2); + const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2); + int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int idx, idy; + int lw = b_width_log2_lookup[unit_size] + 2; + int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); + uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh); + int block_idx = (idy << 1) + idx; + int low_err_skip = 0; + + var = cpi->fn_ptr[unit_size].vf(src, p->src.stride, dst, pd->dst.stride, + &sse); + x->bsse[(i << 2) + block_idx] = sse; + sum_sse += sse; + + x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE; + if (!x->select_tx_size) { + // Check if all ac coefficients can be quantized to zero. + if (var < ac_thr || var == 0) { + x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY; + + // Check if dc coefficient can be quantized to zero. + if (sse - var < dc_thr || sse == var) { + x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC; + + if (!sse || (var < low_ac_thr && sse - var < low_dc_thr)) + low_err_skip = 1; + } + } + } + + if (skip_flag && !low_err_skip) skip_flag = 0; + + if (i == 0) x->pred_sse[ref] += sse; + } + } + + total_sse += sum_sse; + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; + + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { + int64_t rate; + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); + else + rate = 0; + dist = ((int64_t)sum_sse * qstep) >> 8; + rate_sum += rate; + } else { + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; + } + } + *skip_txfm_sb = skip_flag; + *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; +} +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} + +static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, + int64_t *ssz, int bd) { + if (bd == 8) { + return vp9_block_error(coeff, dqcoeff, block_size, ssz); + } else { + return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + int block_size) { + int i; + int64_t error = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +/* The trailing '0' is a terminator which is used inside cost_coeffs() to + * decide whether to include cost of a trailing EOB node or not (i.e. we + * can skip this if the last coefficient in this transform block, e.g. the + * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block, + * were non-zero). */ +static const int16_t band_counts[TX_SIZES][8] = { + { 1, 2, 3, 4, 3, 16 - 13, 0 }, + { 1, 2, 3, 4, 11, 64 - 21, 0 }, + { 1, 2, 3, 4, 11, 256 - 21, 0 }, + { 1, 2, 3, 4, 11, 1024 - 21, 0 }, +}; +static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, + int pt, const int16_t *scan, const int16_t *nb, + int use_fast_coef_costing) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const struct macroblock_plane *p = &x->plane[plane]; + const PLANE_TYPE type = get_plane_type(plane); + const int16_t *band_count = &band_counts[tx_size][1]; + const int eob = p->eobs[block]; + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + x->token_costs[tx_size][type][is_inter_block(mi)]; + uint8_t token_cache[32 * 32]; + int cost; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); +#else + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); +#endif + + // Check for consistency of tx_size with mode info + assert(type == PLANE_TYPE_Y + ? mi->tx_size == tx_size + : get_uv_tx_size(mi, &xd->plane[plane]) == tx_size); + + if (eob == 0) { + // single eob token + cost = token_costs[0][0][pt][EOB_TOKEN]; + } else { + if (use_fast_coef_costing) { + int band_left = *band_count++; + int c; + + // dc token + int v = qcoeff[0]; + int16_t prev_t; + cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost); + cost += (*token_costs)[0][pt][prev_t]; + + token_cache[0] = vp9_pt_energy_class[prev_t]; + ++token_costs; + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + int16_t t; + + v = qcoeff[rc]; + cost += vp9_get_token_cost(v, &t, cat6_high_cost); + cost += (*token_costs)[!prev_t][!prev_t][t]; + prev_t = t; + if (!--band_left) { + band_left = *band_count++; + ++token_costs; + } + } + + // eob token + if (band_left) cost += (*token_costs)[0][!prev_t][EOB_TOKEN]; + + } else { // !use_fast_coef_costing + int band_left = *band_count++; + int c; + + // dc token + int v = qcoeff[0]; + int16_t tok; + unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS]; + cost = vp9_get_token_cost(v, &tok, cat6_high_cost); + cost += (*token_costs)[0][pt][tok]; + + token_cache[0] = vp9_pt_energy_class[tok]; + ++token_costs; + + tok_cost_ptr = &((*token_costs)[!tok]); + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + + v = qcoeff[rc]; + cost += vp9_get_token_cost(v, &tok, cat6_high_cost); + pt = get_coef_context(nb, token_cache, c); + cost += (*tok_cost_ptr)[pt][tok]; + token_cache[rc] = vp9_pt_energy_class[tok]; + if (!--band_left) { + band_left = *band_count++; + ++token_costs; + } + tok_cost_ptr = &((*token_costs)[!tok]); + } + + // eob token + if (band_left) { + pt = get_coef_context(nb, token_cache, c); + cost += (*token_costs)[0][pt][EOB_TOKEN]; + } + } + } + + return cost; +} + +// Copy all visible 4x4s in the transform block. +static void copy_block_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, + pd->subsampling_x, blk_col); + int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, + pd->subsampling_y, blk_row); + const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + const int w = tx_4x4_w << 2; + const int h = tx_4x4_h << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0, + 0, 0, w, h, xd->bd); + } else { +#endif + vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w, + h); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } else { + int r, c; + int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4; + uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + NULL, 0, 0, 0, 0, 4, 4, xd->bd); + } else { +#endif + vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0, + 0, 0, 0, 4, 4); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } + } + } + (void)is_highbd; +} + +// Compute the pixel domain sum square error on all visible 4x4s in the +// transform block. +static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + unsigned int sse = 0; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, + pd->subsampling_x, blk_col); + int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, + pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + } else { + const vpx_variance_fn_t vf_4x4 = cpi->fn_ptr[BLOCK_4X4].vf; + int r, c; + unsigned this_sse = 0; + int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + vf_4x4(src + r * src_stride * 4 + c * 4, src_stride, + dst + r * dst_stride * 4 + c * 4, dst_stride, &this_sse); + sse += this_sse; + } + } + } + return sse; +} + +static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, + BLOCK_SIZE plane_bsize, int block, int blk_row, + int blk_col, TX_SIZE tx_size, int64_t *out_dist, + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + + if (!out_recon && x->block_tx_domain && eob) { + const int ss_txfrm_size = tx_size << 1; + int64_t this_sse; + const int shift = tx_size == TX_32X32 ? 0 : 2; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); +#if CONFIG_VP9_HIGHBITDEPTH + const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; + *out_dist = vp9_highbd_block_error_dispatch( + coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >> + shift; +#else + *out_dist = + vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> + shift; +#endif // CONFIG_VP9_HIGHBITDEPTH + *out_sse = this_sse >> shift; + + if (x->skip_encode && !is_inter_block(xd->mi[0])) { + // TODO(jingning): tune the model to better capture the distortion. + const int64_t mean_quant_error = + (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> +#if CONFIG_VP9_HIGHBITDEPTH + (shift + 2 + (bd - 8) * 2); +#else + (shift + 2); +#endif // CONFIG_VP9_HIGHBITDEPTH + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; + } + } else { + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int src_idx = 4 * (blk_row * src_stride + blk_col); + const int dst_idx = 4 * (blk_row * dst_stride + blk_col); + const uint8_t *src = &p->src.buf[src_idx]; + const uint8_t *dst = &pd->dst.buf[dst_idx]; + uint8_t *out_recon_ptr = 0; + + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + unsigned int tmp; + + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } + *out_sse = (int64_t)tmp * 16; + if (out_recon) { + const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); + out_recon_ptr = &out_recon->buf[out_recon_idx]; + copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr, + out_recon->stride, blk_row, blk_col, plane_bsize, + tx_bsize); + } + + if (eob) { +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon16[1024]); + uint8_t *recon = (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon[1024]); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); + break; + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); + break; + } + } + recon = CONVERT_TO_BYTEPTR(recon16); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); + switch (tx_size) { + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); + // this is like vp9_short_idct4x4 but has a special case around + // eob<=1, which is significant (not just an optimization) for + // the lossless case. + x->inv_txfm_add(dqcoeff, recon, 32, eob); + break; + } +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col, + plane_bsize, tx_bsize); + if (out_recon) { + copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } + } + + *out_dist = (int64_t)tmp * 16; + } +} + +static int rate_block(int plane, int block, TX_SIZE tx_size, int coeff_ctx, + struct rdcost_block_args *args) { + return cost_coeffs(args->x, plane, block, tx_size, coeff_ctx, args->so->scan, + args->so->neighbors, args->use_fast_coef_costing); +} + +static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + int64_t rd1, rd2, rd; + int rate; + int64_t dist = INT64_MAX; + int64_t sse = INT64_MAX; + const int coeff_ctx = + combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); + struct buf_2d *recon = args->this_recon; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; +#endif + + if (args->exit_early) return; + + if (!is_inter_block(mi)) { + vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + &encode_b_arg); + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } + if (x->block_tx_domain) { + dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); + } else { + const struct macroblock_plane *const p = &x->plane[plane]; + const int src_stride = p->src.stride; + const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; + unsigned int tmp; + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } +#if CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) + sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); +#endif // CONFIG_VP9_HIGHBITDEPTH + sse = sse * 16; + tmp = pixel_sse(args->cpi, xd, pd, src, src_stride, dst, dst_stride, + blk_row, blk_col, plane_bsize, tx_bsize); + dist = (int64_t)tmp * 16; + } + } else { + int skip_txfm_flag = SKIP_TXFM_NONE; + if (max_txsize_lookup[plane_bsize] == tx_size) + skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + + // This reduces the risk of bad perceptual quality due to bad prediction. + // We always force the encoder to perform transform and quantization. + if (!args->cpi->sf.allow_skip_txfm_ac_dc && + skip_txfm_flag == SKIP_TXFM_AC_DC) { + skip_txfm_flag = SKIP_TXFM_NONE; + } + + if (skip_txfm_flag == SKIP_TXFM_NONE || + (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int use_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); + // full forward transform and quantization + vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, + tx_size, &dist, &sse, recon, sse_calc_done); + } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { + // compute DC coefficient + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize, + tx_size); + sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; + dist = sse; + if (x->plane[plane].eobs[block]) { + const int64_t orig_sse = (int64_t)coeff[0] * coeff[0]; + const int64_t resd_sse = coeff[0] - dqcoeff[0]; + int64_t dc_correct = orig_sse - resd_sse * resd_sse; +#if CONFIG_VP9_HIGHBITDEPTH + dc_correct >>= ((xd->bd - 8) * 2); +#endif + if (tx_size != TX_32X32) dc_correct >>= 2; + + dist = VPXMAX(0, sse - dc_correct); + } + } else { + assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC."); + } + } + + rd = RDCOST(x->rdmult, x->rddiv, 0, dist); + if (args->this_rd + rd > args->best_rd) { + args->exit_early = 1; + return; + } + + rate = rate_block(plane, block, tx_size, coeff_ctx, args); + args->t_above[blk_col] = (x->plane[plane].eobs[block] > 0) ? 1 : 0; + args->t_left[blk_row] = (x->plane[plane].eobs[block] > 0) ? 1 : 0; + rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse); + + // TODO(jingning): temporarily enabled only for luma component + rd = VPXMIN(rd1, rd2); + if (plane == 0) { + x->zcoeff_blk[tx_size][block] = + !x->plane[plane].eobs[block] || + (x->sharpness == 0 && rd1 > rd2 && !xd->lossless); + x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; + } + + args->this_rate += rate; + args->this_dist += dist; + args->this_sse += sse; + args->this_rd += rd; + + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } + + args->skippable &= !x->plane[plane].eobs[block]; +} + +static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skippable, int64_t *sse, + int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, int use_fast_coef_costing, + struct buf_2d *recon) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + vp9_zero(args); + args.cpi = cpi; + args.x = x; + args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_costing; + args.skippable = 1; + args.this_recon = recon; + + if (plane == 0) xd->mi[0]->tx_size = tx_size; + + vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + + args.so = get_scan(xd, tx_size, get_plane_type(plane), 0); + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, + &args); + if (args.exit_early) { + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } else { + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; + *skippable = args.skippable; + } +} + +static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, int64_t *sse, + int64_t ref_best_rd, BLOCK_SIZE bs, + struct buf_2d *recon) { + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + VP9_COMMON *const cm = &cpi->common; + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + + mi->tx_size = VPXMIN(max_tx_size, largest_tx_size); + + txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs, + mi->tx_size, cpi->sf.use_fast_coef_costing, recon); +} + +static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, int64_t ref_best_rd, + BLOCK_SIZE bs, struct buf_2d *recon) { + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + vpx_prob skip_prob = vp9_get_skip_prob(cm, xd); + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; + int64_t rd[TX_SIZES][2] = { { INT64_MAX, INT64_MAX }, + { INT64_MAX, INT64_MAX }, + { INT64_MAX, INT64_MAX }, + { INT64_MAX, INT64_MAX } }; + int n; + int s0, s1; + int64_t best_rd = ref_best_rd; + TX_SIZE best_tx = max_tx_size; + int start_tx, end_tx; + const int tx_size_ctx = get_tx_size_context(xd); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]); + uint8_t *recon_buf[TX_SIZES]; + for (n = 0; n < TX_SIZES; ++n) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]); + } else { + recon_buf[n] = (uint8_t *)recon_buf16[n]; + } + } +#else + DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + assert(skip_prob > 0); + s0 = vp9_cost_bit(skip_prob, 0); + s1 = vp9_cost_bit(skip_prob, 1); + + if (cm->tx_mode == TX_MODE_SELECT) { + start_tx = max_tx_size; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); + } else { + TX_SIZE chosen_tx_size = + VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); + start_tx = chosen_tx_size; + end_tx = chosen_tx_size; + } + + for (n = start_tx; n >= end_tx; n--) { + const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; + if (recon) { + struct buf_2d this_recon; + this_recon.buf = recon_buf[n]; + this_recon.stride = recon->stride; + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, &this_recon); + } else { + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, 0); + } + r[n][1] = r[n][0]; + if (r[n][0] < INT_MAX) { + r[n][1] += r_tx_size; + } + if (d[n] == INT64_MAX || r[n][0] == INT_MAX) { + rd[n][0] = rd[n][1] = INT64_MAX; + } else if (s[n]) { + if (is_inter_block(mi)) { + rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]); + r[n][1] -= r_tx_size; + } else { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]); + } + } else { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); + } + + if (is_inter_block(mi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) { + rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n])); + rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n])); + } + + // Early termination in transform size search. + if (cpi->sf.tx_size_search_breakout && + (rd[n][1] == INT64_MAX || + (n < (int)max_tx_size && rd[n][1] > rd[n + 1][1]) || s[n] == 1)) + break; + + if (rd[n][1] < best_rd) { + best_tx = n; + best_rd = rd[n][1]; + } + } + mi->tx_size = best_tx; + + *distortion = d[mi->tx_size]; + *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT]; + *skip = s[mi->tx_size]; + *psse = sse[mi->tx_size]; + if (recon) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + memcpy(CONVERT_TO_SHORTPTR(recon->buf), + CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]), + 64 * 64 * sizeof(uint16_t)); + } else { +#endif + memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } +} + +static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, int64_t *psse, + BLOCK_SIZE bs, int64_t ref_best_rd, + struct buf_2d *recon) { + MACROBLOCKD *xd = &x->e_mbd; + int64_t sse; + int64_t *ret_sse = psse ? psse : &sse; + + assert(bs == xd->mi[0]->sb_type); + + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { + choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, + bs, recon); + } else { + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, + bs, recon); + } +} + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D117_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D63_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D207_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D153_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, + int col, PREDICTION_MODE *best_mode, + const int *bmode_costs, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, int *bestrate, + int *bestratey, int64_t *bestdistortion, + BLOCK_SIZE bsize, int64_t rd_thresh) { + PREDICTION_MODE mode; + MACROBLOCKD *const xd = &x->e_mbd; + int64_t best_rd = rd_thresh; + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4]; + uint8_t *dst_init = &pd->dst.buf[row * 4 * src_stride + col * 4]; + ENTROPY_CONTEXT ta[2], tempa[2]; + ENTROPY_CONTEXT tl[2], templ[2]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + uint8_t best_dst[8 * 8]; +#if CONFIG_VP9_HIGHBITDEPTH + uint16_t best_dst16[8 * 8]; +#endif + memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0])); + memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0])); + + xd->mi[0]->tx_size = TX_4X4; + + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int64_t this_rd; + int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; + + if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue; + + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) continue; + } + + memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0])); + memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0])); + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { + const int block = (row + idy) * 2 + (col + idx); + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); + int16_t *const src_diff = + vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + xd->mi[0]->bmi[block].as_mode = mode; + vp9_predict_intra_block(xd, 1, TX_4X4, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, dst, + dst_stride, col + idx, row + idy, 0); + vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, + dst_stride, xd->bd); + if (xd->lossless) { + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); + vp9_highbd_fwht4x4(src_diff, coeff, 8); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); + ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); + tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, + dst_stride, p->eobs[block], xd->bd); + } else { + int64_t unused; + const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); + if (tx_type == DCT_DCT) + vpx_highbd_fdct4x4(src_diff, coeff, 8); + else + vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); + ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); + distortion += vp9_highbd_block_error_dispatch( + coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, + &unused, xd->bd) >> + 2; + tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; + vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), + dst16, dst_stride, p->eobs[block], xd->bd); + } + } + } + + rate += ratey; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0])); + memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0])); + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) { + memcpy(best_dst16 + idy * 8, + CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + num_4x4_blocks_wide * 4 * sizeof(uint16_t)); + } + } + next_highbd : {} + } + if (best_rd >= rd_thresh || x->skip_encode) return best_rd; + + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) { + memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + best_dst16 + idy * 8, num_4x4_blocks_wide * 4 * sizeof(uint16_t)); + } + + return best_rd; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int64_t this_rd; + int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; + + if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue; + + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) continue; + } + + memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0])); + memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0])); + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { + const int block = (row + idy) * 2 + (col + idx); + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + int16_t *const src_diff = + vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + xd->mi[0]->bmi[block].as_mode = mode; + vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, dst, + dst_stride, col + idx, row + idy, 0); + vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); + + if (xd->lossless) { + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); + vp9_fwht4x4(src_diff, coeff, 8); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); + ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); + tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next; + vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride, + p->eobs[block]); + } else { + int64_t unused; + const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); + vp9_fht4x4(src_diff, coeff, 8, tx_type); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); + ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); + tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), + 16, &unused) >> + 2; + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next; + vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst, + dst_stride, p->eobs[block]); + } + } + } + + rate += ratey; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0])); + memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0])); + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) + memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, + num_4x4_blocks_wide * 4); + } + next : {} + } + + if (best_rd >= rd_thresh || x->skip_encode) return best_rd; + + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) + memcpy(dst_init + idy * dst_stride, best_dst + idy * 8, + num_4x4_blocks_wide * 4); + + return best_rd; +} + +static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb, + int *rate, int *rate_y, + int64_t *distortion, + int64_t best_rd) { + int i, j; + const MACROBLOCKD *const xd = &mb->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + int cost = 0; + int64_t total_distortion = 0; + int tot_rate_y = 0; + int64_t total_rd = 0; + const int *bmode_costs = cpi->mbmode_cost; + + // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + PREDICTION_MODE best_mode = DC_PRED; + int r = INT_MAX, ry = INT_MAX; + int64_t d = INT64_MAX, this_rd = INT64_MAX; + i = idy * 2 + idx; + if (cpi->common.frame_type == KEY_FRAME) { + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i); + + bmode_costs = cpi->y_mode_costs[A][L]; + } + + this_rd = rd_pick_intra4x4block( + cpi, mb, idy, idx, &best_mode, bmode_costs, + xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, + &ry, &d, bsize, best_rd - total_rd); + + if (this_rd >= best_rd - total_rd) return INT64_MAX; + + total_rd += this_rd; + cost += r; + total_distortion += d; + tot_rate_y += ry; + + mic->bmi[i].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_high; ++j) + mic->bmi[i + j * 2].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_wide; ++j) + mic->bmi[i + j].as_mode = best_mode; + + if (total_rd >= best_rd) return INT64_MAX; + } + } + + *rate = cost; + *rate_y = tot_rate_y; + *distortion = total_distortion; + mic->mode = mic->bmi[3].as_mode; + + return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); +} + +// This function is used only for intra_only frames +static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int64_t best_rd) { + PREDICTION_MODE mode; + PREDICTION_MODE mode_selected = DC_PRED; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd; + TX_SIZE best_tx = TX_4X4; + int *bmode_costs; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); + bmode_costs = cpi->y_mode_costs[A][L]; + + memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); + /* Y Search for intra prediction mode */ + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + if (cpi->sf.use_nonrd_pick_mode) { + // These speed features are turned on in hybrid non-RD and RD mode + // for key frame coding in the context of real-time setting. + if (conditional_skipintra(mode, mode_selected)) continue; + if (*skippable) break; + } + + mic->mode = mode; + + super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, + bsize, best_rd, /*recon=*/NULL); + + if (this_rate_tokenonly == INT_MAX) continue; + + this_rate = this_rate_tokenonly + bmode_costs[mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + + if (this_rd < best_rd) { + mode_selected = mode; + best_rd = this_rd; + best_tx = mic->tx_size; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + *skippable = s; + } + } + + mic->mode = mode_selected; + mic->tx_size = best_tx; + + return best_rd; +} + +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skippable, int64_t *sse, + BLOCK_SIZE bsize, int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const TX_SIZE uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]); + int plane; + int pnrate = 0, pnskip = 1; + int64_t pndist = 0, pnsse = 0; + int is_cost_valid = 1; + + if (ref_best_rd < 0) is_cost_valid = 0; + + if (is_inter_block(mi) && is_cost_valid) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + vp9_subtract_plane(x, bsize, plane); + } + + *rate = 0; + *distortion = 0; + *sse = 0; + *skippable = 1; + + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, + plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, + /*recon=*/NULL); + if (pnrate == INT_MAX) { + is_cost_valid = 0; + break; + } + *rate += pnrate; + *distortion += pndist; + *sse += pnsse; + *skippable &= pnskip; + } + + if (!is_cost_valid) { + // reset cost value + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } + + return is_cost_valid; +} + +static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + PREDICTION_MODE mode; + PREDICTION_MODE mode_selected = DC_PRED; + int64_t best_rd = INT64_MAX, this_rd; + int this_rate_tokenonly, this_rate, s; + int64_t this_distortion, this_sse; + + memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue; +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && + (xd->above_mi == NULL || xd->left_mi == NULL) && need_top_left[mode]) + continue; +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + + xd->mi[0]->uv_mode = mode; + + if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, + &this_sse, bsize, best_rd)) + continue; + this_rate = + this_rate_tokenonly + + cpi->intra_uv_mode_cost[cpi->common.frame_type][xd->mi[0]->mode][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + + if (this_rd < best_rd) { + mode_selected = mode; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + *skippable = s; + if (!x->select_tx_size) swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE); + } + } + + xd->mi[0]->uv_mode = mode_selected; + return best_rd; +} + +#if !CONFIG_REALTIME_ONLY +static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize) { + const VP9_COMMON *cm = &cpi->common; + int64_t unused; + + x->e_mbd.mi[0]->uv_mode = DC_PRED; + memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); + super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable, &unused, + bsize, INT64_MAX); + *rate = + *rate_tokenonly + + cpi->intra_uv_mode_cost[cm->frame_type][x->e_mbd.mi[0]->mode][DC_PRED]; + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); +} + +static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x, + PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, + TX_SIZE max_tx_size, int *rate_uv, + int *rate_uv_tokenonly, int64_t *dist_uv, + int *skip_uv, PREDICTION_MODE *mode_uv) { + // Use an estimated rd for uv_intra based on DC_PRED if the + // appropriate speed flag is set. + if (cpi->sf.use_uv_intra_rd_estimate) { + rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + // Else do a proper rd search for each possible transform size that may + // be considered in the main rd loop. + } else { + rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv, + skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, + max_tx_size); + } + *mode_uv = x->e_mbd.mi[0]->uv_mode; +} + +static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode, + int mode_context) { + assert(is_inter_mode(mode)); + return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; +} + +static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int i, PREDICTION_MODE mode, int_mv this_mv[2], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int_mv seg_mvs[MAX_REF_FRAMES], + int_mv *best_ref_mv[2], const int *mvjcost, + int *mvcost[2]) { + MODE_INFO *const mi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + int thismvcost = 0; + int idx, idy; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type]; + const int is_compound = has_second_ref(mi); + + switch (mode) { + case NEWMV: + this_mv[0].as_int = seg_mvs[mi->ref_frame[0]].as_int; + thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + if (is_compound) { + this_mv[1].as_int = seg_mvs[mi->ref_frame[1]].as_int; + thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + } + break; + case NEARMV: + case NEARESTMV: + this_mv[0].as_int = frame_mv[mode][mi->ref_frame[0]].as_int; + if (is_compound) + this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; + break; + default: + assert(mode == ZEROMV); + this_mv[0].as_int = 0; + if (is_compound) this_mv[1].as_int = 0; + break; + } + + mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; + if (is_compound) mi->bmi[i].as_mv[1].as_int = this_mv[1].as_int; + + mi->bmi[i].as_mode = mode; + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) + memmove(&mi->bmi[i + idy * 2 + idx], &mi->bmi[i], sizeof(mi->bmi[i])); + + return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mi->ref_frame[0]]) + + thismvcost; +} + +static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, + int64_t best_yrd, int i, int *labelyrate, + int64_t *distortion, int64_t *sse, + ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, + int mi_row, int mi_col) { + int k; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + MODE_INFO *const mi = xd->mi[0]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->sb_type, pd); + const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize]; + int idx, idy; + + const uint8_t *const src = + &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + uint8_t *const dst = + &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; + int64_t thisdistortion = 0, thissse = 0; + int thisrate = 0, ref; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; + const int is_compound = has_second_ref(mi); + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + + assert(!x->skip_block); + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const int bw = b_width_log2_lookup[BLOCK_8X8]; + const int h = 4 * (i >> bw); + const int w = 4 * (i & ((1 << bw) - 1)); + const struct scale_factors *sf = &xd->block_refs[ref]->sf; + int y_stride = pd->pre[ref].stride; + uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w); + + if (vp9_is_scaled(sf)) { + const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + y_stride = xd->block_refs[ref]->buf->y_stride; + pre = xd->block_refs[ref]->buf->y_buffer; + pre += scaled_buffer_offset(x_start + w, y_start + h, y_stride, sf); + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst), + pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), + xd->bd); + } else { + vp9_build_inter_predictor( + pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2)); + } +#else + vp9_build_inter_predictor( + pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block( + height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); + } else { + vpx_subtract_block(height, width, + vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); + } +#else + vpx_subtract_block(height, width, + vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); +#endif // CONFIG_VP9_HIGHBITDEPTH + + k = i; + for (idy = 0; idy < height / 4; ++idy) { + for (idx = 0; idx < width / 4; ++idx) { +#if CONFIG_VP9_HIGHBITDEPTH + const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; +#endif + int64_t ssz, rd, rd1, rd2; + tran_low_t *coeff, *qcoeff, *dqcoeff; + uint16_t *eob; + int coeff_ctx; + k += (idy * 2 + idx); + coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); + coeff = BLOCK_OFFSET(p->coeff, k); + qcoeff = BLOCK_OFFSET(p->qcoeff, k); + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k); + eob = &p->eobs[k]; + + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); + thisdistortion += vp9_highbd_block_error_dispatch( + coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); +#else + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); + thisdistortion += + vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); +#endif // CONFIG_VP9_HIGHBITDEPTH + thissse += ssz; + thisrate += cost_coeffs(x, 0, k, TX_4X4, coeff_ctx, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); + ta[k & 1] = tl[k >> 1] = (x->plane[0].eobs[k] > 0) ? 1 : 0; + rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2); + rd = VPXMIN(rd1, rd2); + if (rd >= best_yrd) return INT64_MAX; + } + } + + *distortion = thisdistortion >> 2; + *labelyrate = thisrate; + *sse = thissse >> 2; + + return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); +} +#endif // !CONFIG_REALTIME_ONLY + +typedef struct { + int eobs; + int brate; + int byrate; + int64_t bdist; + int64_t bsse; + int64_t brdcost; + int_mv mvs[2]; + ENTROPY_CONTEXT ta[2]; + ENTROPY_CONTEXT tl[2]; +} SEG_RDSTAT; + +typedef struct { + int_mv *ref_mv[2]; + int_mv mvp; + + int64_t segment_rd; + int r; + int64_t d; + int64_t sse; + int segment_yrate; + PREDICTION_MODE modes[4]; + SEG_RDSTAT rdstat[4][INTER_MODES]; + int mvthresh; +} BEST_SEG_INFO; + +#if !CONFIG_REALTIME_ONLY +static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { + return (mv->row >> 3) < mv_limits->row_min || + (mv->row >> 3) > mv_limits->row_max || + (mv->col >> 3) < mv_limits->col_min || + (mv->col >> 3) > mv_limits->col_max; +} + +static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { + MODE_INFO *const mi = x->e_mbd.mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + + p->src.buf = + &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); + pd->pre[0].buf = + &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)]; + if (has_second_ref(mi)) + pd->pre[1].buf = + &pd->pre[1] + .buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[1].stride)]; +} + +static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, + struct buf_2d orig_pre[2]) { + MODE_INFO *mi = x->e_mbd.mi[0]; + x->plane[0].src = orig_src; + x->e_mbd.plane[0].pre[0] = orig_pre[0]; + if (has_second_ref(mi)) x->e_mbd.plane[0].pre[1] = orig_pre[1]; +} + +static INLINE int mv_has_subpel(const MV *mv) { + return (mv->row & 0x0F) || (mv->col & 0x0F); +} + +// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion. +// TODO(aconverse): Find out if this is still productive then clean up or remove +static int check_best_zero_mv(const VP9_COMP *cpi, + const uint8_t mode_context[MAX_REF_FRAMES], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int this_mode, + const MV_REFERENCE_FRAME ref_frames[2]) { + if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && + frame_mv[this_mode][ref_frames[0]].as_int == 0 && + (ref_frames[1] == NO_REF_FRAME || + frame_mv[this_mode][ref_frames[1]].as_int == 0)) { + int rfc = mode_context[ref_frames[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) return 0; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) return 0; + } else { + assert(this_mode == ZEROMV); + if (ref_frames[1] == NO_REF_FRAME) { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) + return 0; + } else { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARMV][ref_frames[1]].as_int == 0)) + return 0; + } + } + } + return 1; +} + +static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. + if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + +#define MAX_JOINT_MV_SEARCH_ITERS 4 +static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) { + int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0 + if (sf_level >= 2) + num_iters = 0; + else if (sf_level >= 1) + num_iters = bsize < BLOCK_8X8 + ? 0 + : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS); + return num_iters; +} + +static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int_mv *frame_mv, int mi_row, int mi_col, + int_mv single_newmv[MAX_REF_FRAMES], + int *rate_mv, int num_iters) { + const VP9_COMMON *const cm = &cpi->common; + const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; + const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const int refs[2] = { mi->ref_frame[0], + mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; + int_mv ref_mv[2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; + int ite, ref; + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + struct scale_factors sf; + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + uint32_t last_besterr[2] = { UINT_MAX, UINT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]), + vp9_get_scaled_ref_frame(cpi, mi->ref_frame[1]) + }; + +// Prediction buffer from second frame. +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); + uint8_t *second_pred; +#else + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Check number of iterations do not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; + + if (scaled_ref_frame[ref]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL); + } + + frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; + } + +// Since we have scaled the reference frames to match the size of the current +// frame we must use a unit scaling factor during mode selection. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height, cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < num_iters; ite++) { + struct buf_2d ref_yv12[2]; + uint32_t bestsme = UINT_MAX; + int sadpb = x->sadperbit16; + MV tmp_mv; + int search_range = 3; + + const MvLimits tmp_mv_limits = x->mv_limits; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + + // Skip further iterations of search if in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + + // Initialized here because of compiler problem in Visual Studio. + ref_yv12[0] = xd->plane[0].pre[0]; + ref_yv12[1] = xd->plane[0].pre[1]; + +// Get the prediction block from the 'other' reference frame. +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride, + second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, + kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); + } else { + second_pred = (uint8_t *)second_pred_alloc_16; + vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &frame_mv[refs[!id]].as_mv, + &sf, pw, ph, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + } +#else + vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &frame_mv[refs[!id]].as_mv, &sf, + pw, ph, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Do compound motion search on the current reference frame. + if (id) xd->plane[0].pre[0] = ref_yv12[id]; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); + + // Use the mv result from the single mode as mv predictor. + tmp_mv = frame_mv[refs[id]].as_mv; + + tmp_mv.col >>= 3; + tmp_mv.row >>= 3; + + // Small-range full-pixel motion search. + bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, search_range, + &cpi->fn_ptr[bsize], &ref_mv[id].as_mv, + second_pred); + if (bestsme < UINT_MAX) + bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv, + second_pred, &cpi->fn_ptr[bsize], 1); + + x->mv_limits = tmp_mv_limits; + + if (bestsme < UINT_MAX) { + uint32_t dis; /* TODO: use dis in distortion calculation later. */ + uint32_t sse; + bestsme = cpi->find_fractional_mv_step( + x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, + cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); + } + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[0].pre[0] = ref_yv12[0]; + + if (bestsme < last_besterr[id]) { + frame_mv[refs[id]].as_mv = tmp_mv; + last_besterr[id] = bestsme; + } else { + break; + } + if (ite < num_iters - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Restore the prediction frame pointers to their unscaled versions. + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } +} + +static int64_t rd_pick_best_sub8x8_mode( + VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, + int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, + int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, + int filter_idx, int mi_row, int mi_col) { + int i; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + int mode_idx; + int k, br = 0, idx, idy; + int64_t bd = 0, block_sse = 0; + PREDICTION_MODE this_mode; + VP9_COMMON *cm = &cpi->common; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int label_count = 4; + int64_t this_segment_rd = 0; + int label_mv_thresh; + int segmentyrate = 0; + const BLOCK_SIZE bsize = mi->sb_type; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; + ENTROPY_CONTEXT t_above[2], t_left[2]; + int subpelmv = 1, have_ref = 0; + SPEED_FEATURES *const sf = &cpi->sf; + const int has_second_rf = has_second_ref(mi); + const int inter_mode_mask = sf->inter_mode_mask[bsize]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + + vp9_zero(*bsi); + + bsi->segment_rd = best_rd_so_far; + bsi->ref_mv[0] = best_ref_mv; + bsi->ref_mv[1] = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; + + for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV; + + memcpy(t_above, pd->above_context, sizeof(t_above)); + memcpy(t_left, pd->left_context, sizeof(t_left)); + + // 64 makes this threshold really big effectively + // making it so that we very rarely check mvs on + // segments. setting this to 1 would make mv thresh + // roughly equal to what it is for macroblocks + label_mv_thresh = 1 * bsi->mvthresh / label_count; + + // Segmentation method overheads + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + // TODO(jingning,rbultje): rewrite the rate-distortion optimization + // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop + int_mv mode_mv[MB_MODE_COUNT][2]; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + PREDICTION_MODE mode_selected = ZEROMV; + int64_t best_rd = INT64_MAX; + const int block = idy * 2 + idx; + int ref; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + frame_mv[ZEROMV][frame].as_int = 0; + vp9_append_sub8x8_mvs_for_idx( + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame], mbmi_ext->mode_context); + } + + // search for the best motion vector on this segment + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + const struct buf_2d orig_src = x->plane[0].src; + struct buf_2d orig_pre[2]; + + mode_idx = INTER_OFFSET(this_mode); + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; + if (!(inter_mode_mask & (1 << this_mode))) continue; + + if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, + this_mode, mi->ref_frame)) + continue; + + memcpy(orig_pre, pd->pre, sizeof(orig_pre)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); + + // motion search for newmv (single predictor case only) + if (!has_second_rf && this_mode == NEWMV && + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { + MV *const new_mv = &mode_mv[NEWMV][0].as_mv; + int step_param = 0; + uint32_t bestsme = UINT_MAX; + int sadpb = x->sadperbit4; + MV mvp_full; + int max_mv; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + /* Is the best so far sufficiently good that we can't justify doing + * and new motion search. */ + if (best_rd < label_mv_thresh) break; + + if (cpi->oxcf.mode != BEST) { + // use previous block's result as next block's MV predictor. + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; + } + } + if (block == 0) + max_mv = x->max_mv_context[mi->ref_frame[0]]; + else + max_mv = + VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3; + + if (sf->mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and the best ref mvs of the current block for + // the given reference. + step_param = + (vp9_init_search_range(max_mv) + cpi->mv_step_param) / 2; + } else { + step_param = cpi->mv_step_param; + } + + mvp_full.row = bsi->mvp.as_mv.row >> 3; + mvp_full.col = bsi->mvp.as_mv.col >> 3; + + if (sf->adaptive_motion_search) { + if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX && + x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) { + mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + } + step_param = VPXMAX(step_param, 8); + } + + // adjust src pointer for this block + mi_buf_shift(x, block); + + vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); + + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + sadpb, + sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, + &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); + + x->mv_limits = tmp_mv_limits; + + if (bestsme < UINT_MAX) { + uint32_t distortion; + cpi->find_fractional_mv_step( + x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, + sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &distortion, + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); + + // save motion search result for use in compound prediction + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; + } + + x->pred_mv[mi->ref_frame[0]] = *new_mv; + + // restore src pointers + mi_buf_restore(x, orig_src, orig_pre); + } + + if (has_second_rf) { + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) + continue; + } + + if (has_second_rf && this_mode == NEWMV && + mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // adjust src pointers + mi_buf_shift(x, block); + if (num_joint_search_iters) { + int rate_mv; + joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); + seg_mvs[block][mi->ref_frame[0]].as_int = + frame_mv[this_mode][mi->ref_frame[0]].as_int; + seg_mvs[block][mi->ref_frame[1]].as_int = + frame_mv[this_mode][mi->ref_frame[1]].as_int; + } + // restore src pointers + mi_buf_restore(x, orig_src, orig_pre); + } + + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + bsi->rdstat[block][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; + } + + // Trap vectors that reach beyond the UMV borders + if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) || + (has_second_rf && + mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv))) + continue; + + if (filter_idx > 0) { + BEST_SEG_INFO *ref_bsi = bsi_buf; + subpelmv = 0; + have_ref = 1; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; + } + + if (filter_idx > 1 && !subpelmv && !have_ref) { + ref_bsi = bsi_buf + 1; + have_ref = 1; + for (ref = 0; ref < 1 + has_second_rf; ++ref) + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; + } + + if (!subpelmv && have_ref && + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + memcpy(&bsi->rdstat[block][mode_idx], + &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT)); + if (num_4x4_blocks_wide > 1) + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; + if (num_4x4_blocks_high > 1) + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; + + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { + mode_selected = this_mode; + best_rd = bsi->rdstat[block][mode_idx].brdcost; + } + continue; + } + } + + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; + if (num_4x4_blocks_high > 1) + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; + } + + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { + mode_selected = this_mode; + best_rd = bsi->rdstat[block][mode_idx].brdcost; + } + } /*for each 4x4 mode*/ + + if (best_rd == INT64_MAX) { + int iy, midx; + for (iy = block + 1; iy < 4; ++iy) + for (midx = 0; midx < INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return INT64_MAX; + } + + mode_idx = INTER_OFFSET(mode_selected); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); + + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); + + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; + + if (this_segment_rd > bsi->segment_rd) { + int iy, midx; + for (iy = block + 1; iy < 4; ++iy) + for (midx = 0; midx < INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return INT64_MAX; + } + } + } /* for each label */ + + bsi->r = br; + bsi->d = bd; + bsi->segment_yrate = segmentyrate; + bsi->segment_rd = this_segment_rd; + bsi->sse = block_sse; + + // update the coding decisions + for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; + + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; + /* set it to the best */ + for (i = 0; i < 4; i++) { + mode_idx = INTER_OFFSET(bsi->modes[i]); + mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; + if (has_second_ref(mi)) + mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; + x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; + mi->bmi[i].as_mode = bsi->modes[i]; + } + + /* + * used to set mbmi->mv.as_int + */ + *returntotrate = bsi->r; + *returndistortion = bsi->d; + *returnyrate = bsi->segment_yrate; + *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0); + *psse = bsi->sse; + mi->mode = bsi->modes[3]; + + return bsi->segment_rd; +} + +static void estimate_ref_frame_costs(const VP9_COMMON *cm, + const MACROBLOCKD *xd, int segment_id, + unsigned int *ref_costs_single, + unsigned int *ref_costs_comp, + vpx_prob *comp_mode_p) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); + memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); + *comp_mode_p = 128; + } else { + vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd); + vpx_prob comp_inter_p = 128; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + comp_inter_p = vp9_get_reference_mode_prob(cm, xd); + *comp_mode_p = comp_inter_p; + } else { + *comp_mode_p = 128; + } + + ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0); + + if (cm->reference_mode != COMPOUND_REFERENCE) { + vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); + vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd); + unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + base_cost += vp9_cost_bit(comp_inter_p, 0); + + ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] = + ref_costs_single[ALTREF_FRAME] = base_cost; + ref_costs_single[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0); + ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1); + ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1); + ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0); + ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1); + } else { + ref_costs_single[LAST_FRAME] = 512; + ref_costs_single[GOLDEN_FRAME] = 512; + ref_costs_single[ALTREF_FRAME] = 512; + } + if (cm->reference_mode != SINGLE_REFERENCE) { + vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd); + unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + base_cost += vp9_cost_bit(comp_inter_p, 1); + + ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0); + ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1); + } else { + ref_costs_comp[LAST_FRAME] = 512; + ref_costs_comp[GOLDEN_FRAME] = 512; + } + } +} + +static void store_coding_context( + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, + int64_t comp_pred_diff[REFERENCE_MODES], + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS], int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->skip = x->skip; + ctx->skippable = skippable; + ctx->best_mode_index = mode_index; + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; + ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; + ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; + ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; + + memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); +} + +static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, + MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, int mi_row, int mi_col, + int_mv frame_nearest_mv[MAX_REF_FRAMES], + int_mv frame_near_mv[MAX_REF_FRAMES], + struct buf_2d yv12_mb[4][MAX_MB_PLANE]) { + const VP9_COMMON *cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + + assert(yv12 != NULL); + + // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this + // use the UV scaling factors. + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); + + // Gets an initial list of candidate vectors from neighbours and orders them + vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col, + mbmi_ext->mode_context); + + // Candidate refinement carried out at encoder and decoder + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_nearest_mv[ref_frame], + &frame_near_mv[ref_frame]); + + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the centre point for subsequent searches. + // The current implementation doesn't support scaling. + if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8) + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + block_size); +} + +#if CONFIG_NON_GREEDY_MV +static int ref_frame_to_gf_rf_idx(int ref_frame) { + if (ref_frame == GOLDEN_FRAME) { + return 0; + } + if (ref_frame == LAST_FRAME) { + return 1; + } + if (ref_frame == ALTREF_FRAME) { + return 2; + } + assert(0); + return -1; +} +#endif + +static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *tmp_mv, + int *rate_mv) { + MACROBLOCKD *xd = &x->e_mbd; + const VP9_COMMON *cm = &cpi->common; + MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; + int step_param; + MV mvp_full; + int ref = mi->ref_frame[0]; + MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + const MvLimits tmp_mv_limits = x->mv_limits; + int cost_list[5]; + const int best_predmv_idx = x->mv_best_ref_index[ref]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + vp9_get_scaled_ref_frame(cpi, ref); + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + MV pred_mv[3]; + + int bestsme = INT_MAX; +#if CONFIG_NON_GREEDY_MV + int gf_group_idx = cpi->twopass.gf_group.index; + int gf_rf_idx = ref_frame_to_gf_rf_idx(ref); + BLOCK_SIZE square_bsize = get_square_block_size(bsize); + int_mv nb_full_mvs[NB_MVS_NUM] = { 0 }; + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, gf_group_idx, gf_rf_idx, square_bsize); + const int nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + const int lambda = (pw * ph) / 4; + assert(pw * ph == lambda << 2); +#else // CONFIG_NON_GREEDY_MV + int sadpb = x->sadperbit16; +#endif // CONFIG_NON_GREEDY_MV + + pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; + pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref]; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is VPXMAX >> 1 etc. + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = + (vp9_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) / + 2; + } else { + step_param = cpi->mv_step_param; + } + + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { + const int boffset = + 2 * (b_width_log2_lookup[BLOCK_64X64] - + VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); + step_param = VPXMAX(step_param, boffset); + } + + if (cpi->sf.adaptive_motion_search) { + int bwl = b_width_log2_lookup[bsize]; + int bhl = b_height_log2_lookup[bsize]; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); + + if (tlevel < 5) step_param += 2; + + // prev_mv_sad is not setup for dynamically scaled frames. + if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) { + int i; + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = INT16_MAX; + x->pred_mv[ref].col = INT16_MAX; + tmp_mv->as_int = INVALID_MV; + + if (scaled_ref_frame) { + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; + } + return; + } + } + } + } + + // Note: MV limits are modified here. Always restore the original values + // after full-pixel motion search. + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + + mvp_full = pred_mv[best_predmv_idx]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; + +#if CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_diamond_new(cpi, x, bsize, &mvp_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, + &tmp_mv->as_mv); +#else // CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + + if (cpi->sf.enhanced_full_pixel_motion_search) { + int i; + for (i = 0; i < 3; ++i) { + int this_me; + MV this_mv; + int diff_row; + int diff_col; + int step; + + if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue; + if (i == best_predmv_idx) continue; + + diff_row = ((int)pred_mv[i].row - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >> + 3; + diff_col = ((int)pred_mv[i].col - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].col) >> + 3; + if (diff_row == 0 && diff_col == 0) continue; + if (diff_row < 0) diff_row = -diff_row; + if (diff_col < 0) diff_col = -diff_col; + step = get_msb((diff_row + diff_col + 1) >> 1); + if (step <= 0) continue; + + mvp_full = pred_mv[i]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_diamond_new( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), lambda, 1, nb_full_mvs, + nb_full_mv_num, &this_mv); +#else // CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &this_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + if (this_me < bestsme) { + tmp_mv->as_mv = this_mv; + bestsme = this_me; + } + } + } + + x->mv_limits = tmp_mv_limits; + + if (bestsme < INT_MAX) { + uint32_t dis; /* TODO: use dis in distortion calculation later. */ + cpi->find_fractional_mv_step( + x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); + } + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + + x->pred_mv[ref] = tmp_mv->as_mv; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } +} + +static INLINE void restore_dst_buf(MACROBLOCKD *xd, + uint8_t *orig_dst[MAX_MB_PLANE], + int orig_dst_stride[MAX_MB_PLANE]) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } +} + +// In some situations we want to discount tha pparent cost of a new motion +// vector. Where there is a subtle motion field and especially where there is +// low spatial complexity then it can be hard to cover the cost of a new motion +// vector in a single block, even if that motion vector reduces distortion. +// However, once established that vector may be usable through the nearest and +// near mv modes to reduce distortion in subsequent blocks and also improve +// visual quality. +static int discount_newmv_test(VP9_COMP *cpi, int this_mode, int_mv this_mv, + int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame, + int mi_row, int mi_col, BLOCK_SIZE bsize) { +#if CONFIG_NON_GREEDY_MV + (void)mode_mv; + (void)this_mv; + if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) { + const int gf_group_idx = cpi->twopass.gf_group.index; + const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame); + const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx]; + const MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, gf_group_idx, gf_rf_idx, cpi->tpl_bsize); + const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize]; + const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize]; + const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h); + const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w); + const int mv_mode = + tpl_frame + .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col]; + if (mv_mode == NEW_MV_MODE) { + int_mv tpl_new_mv = + vp9_motion_field_mi_get_mv(motion_field, tpl_mi_row, tpl_mi_col); + int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row); + int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col); + if (VPXMAX(row_diff, col_diff) <= 8) { + return 1; + } else { + return 0; + } + } else { + return 0; + } + } else { + return 0; + } +#else + (void)mi_row; + (void)mi_col; + (void)bsize; + return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && + (this_mv.as_int != 0) && + ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || + (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && + ((mode_mv[NEARMV][ref_frame].as_int == 0) || + (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +#endif +} + +static int64_t handle_inter_mode( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2, + int64_t *distortion, int *skippable, int *rate_y, int *rate_uv, + struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], + int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t filter_cache[], int best_mode_index) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int is_comp_pred = has_second_ref(mi); + const int this_mode = mi->mode; + int_mv *frame_mv = mode_mv[this_mode]; + int i; + int refs[2] = { mi->ref_frame[0], + (mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]) }; + int_mv cur_mv[2]; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]); + uint8_t *tmp_buf; +#else + DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH + int intpel_mv; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; + int best_needs_copy = 0; + uint8_t *orig_dst[MAX_MB_PLANE]; + int orig_dst_stride[MAX_MB_PLANE]; + int rs = 0; + INTERP_FILTER best_filter = SWITCHABLE; + uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; + int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; + + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; + + int skip_txfm_sb = 0; + int64_t skip_sse_sb = INT64_MAX; + int64_t distortion_y = 0, distortion_uv = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16); + } else { + tmp_buf = (uint8_t *)tmp_buf16; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (pred_filter_search) { + INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; + if (xd->above_mi && is_inter_block(xd->above_mi)) + af = xd->above_mi->interp_filter; + if (xd->left_mi && is_inter_block(xd->left_mi)) + lf = xd->left_mi->interp_filter; + + if ((this_mode != NEWMV) || (af == lf)) best_filter = af; + } + + if (is_comp_pred) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + + if (cpi->sf.adaptive_mode_search) { + if (single_filter[this_mode][refs[0]] == + single_filter[this_mode][refs[1]]) + best_filter = single_filter[this_mode][refs[0]]; + } + } + + if (this_mode == NEWMV) { + int rate_mv; + if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + + // Initialize mv using single prediction mode result. + frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + + if (num_joint_search_iters) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif + joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, + single_newmv, &rate_mv, num_joint_search_iters); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif + } else { + rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } + *rate2 += rate_mv; + } else { + int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif + single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif + if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; + + frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = + tmp_mv.as_int; + single_newmv[refs[0]].as_int = tmp_mv.as_int; + + // Estimate the rate implications of a new mv but discount this + // under certain circumstances where we want to help initiate a weak + // motion field, where the distortion gain for a single block may not + // be enough to overcome the cost of a new mv. + if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row, + mi_col, bsize)) { + *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); + } else { + *rate2 += rate_mv; + } + } + } + + for (i = 0; i < is_comp_pred + 1; ++i) { + cur_mv[i] = frame_mv[refs[i]]; + // Clip "next_nearest" so that it does not extend to far out of image + if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); + + if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; + mi->mv[i].as_int = cur_mv[i].as_int; + } + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < MAX_MB_PLANE; i++) { + orig_dst[i] = xd->plane[i].dst.buf; + orig_dst_stride[i] = xd->plane[i].dst.stride; + } + + // We don't include the cost of the second reference here, because there + // are only two options: Last/ARF or Golden/ARF; The second one is always + // known, which is ARF. + // + // Under some circumstances we discount the cost of new mv mode to encourage + // initiation of a motion field. + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0], + mi_row, mi_col, bsize)) { + *rate2 += + VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]), + cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]])); + } else { + *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); + } + + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. + if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) { + // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes + // are not early terminated. This ensures all single modes are not getting + // skipped when the speed feature is enabled. + assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); + return INT64_MAX; + } + } + if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && + mi->mode != NEARESTMV) + return INT64_MAX; + + // Are all MVs integer pel for Y and UV + intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); + if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif + // Search for best switchable filter by checking the variance of + // pred error irrespective of whether the filter will be used + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; + + if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { + best_filter = EIGHTTAP; + } else if (best_filter == SWITCHABLE && enable_interp_search) { + int newbest; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; + + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + int j; + int64_t rs_rd; + int tmp_skip_sb = 0; + int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; + + mi->interp_filter = i; + rs = vp9_get_switchable_rate(cpi, xd); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + + if (i > 0 && intpel_mv) { + rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum); + filter_cache[i] = rd; + filter_cache[SWITCHABLE_FILTERS] = + VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + if (cm->interp_filter == SWITCHABLE) rd += rs_rd; + *mask_filter = VPXMAX(*mask_filter, rd); + } else { + int rate_sum = 0; + int64_t dist_sum = 0; + if (i > 0 && cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << i))) { + rate_sum = INT_MAX; + dist_sum = INT64_MAX; + continue; + } + + if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) || + (cm->interp_filter != SWITCHABLE && + (cm->interp_filter == mi->interp_filter || + (i == 0 && intpel_mv)))) { + restore_dst_buf(xd, orig_dst, orig_dst_stride); + } else { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; + xd->plane[j].dst.stride = 64; + } + } + + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } + + rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); + filter_cache[i] = rd; + filter_cache[SWITCHABLE_FILTERS] = + VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + if (cm->interp_filter == SWITCHABLE) rd += rs_rd; + *mask_filter = VPXMAX(*mask_filter, rd); + + if (i == 0 && intpel_mv) { + tmp_rate_sum = rate_sum; + tmp_dist_sum = dist_sum; + } + } + + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, orig_dst_stride); + return INT64_MAX; + } + } + newbest = i == 0 || rd < best_rd; + + if (newbest) { + best_rd = rd; + best_filter = mi->interp_filter; + if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) + best_needs_copy = !best_needs_copy; + } + + if ((cm->interp_filter == SWITCHABLE && newbest) || + (cm->interp_filter != SWITCHABLE && + cm->interp_filter == mi->interp_filter)) { + tmp_rd = best_rd; + + skip_txfm_sb = tmp_skip_sb; + skip_sse_sb = tmp_skip_sse; + memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + memcpy(bsse, x->bsse, sizeof(bsse)); + } + } + restore_dst_buf(xd, orig_dst, orig_dst_stride); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif + // Set the appropriate filter + mi->interp_filter = + cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; + rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; + + if (tmp_rd != INT64_MAX) { + if (best_needs_copy) { + // again temporarily set the buffers to local memory to prevent a memcpy + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = tmp_buf + i * 64 * 64; + xd->plane[i].dst.stride = 64; + } + } + rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0); + } else { + int tmp_rate; + int64_t tmp_dist; + // Handles the special case when a filter that is not in the + // switchable list (ex. bilinear) is indicated at the frame level, or + // skip condition holds. + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + memcpy(bsse, x->bsse, sizeof(bsse)); + } + + if (!is_comp_pred) single_filter[this_mode][refs[0]] = mi->interp_filter; + + if (cpi->sf.adaptive_mode_search) + if (is_comp_pred) + if (single_skippable[this_mode][refs[0]] && + single_skippable[this_mode][refs[1]]) + memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm)); + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, orig_dst_stride); + return INT64_MAX; + } + } + + if (cm->interp_filter == SWITCHABLE) *rate2 += rs; + + memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); + memcpy(x->bsse, bsse, sizeof(bsse)); + + if (!skip_txfm_sb || xd->lossless) { + int skippable_y, skippable_uv; + int64_t sseuv = INT64_MAX; + int64_t rdcosty = INT64_MAX; + + // Y cost and distortion + vp9_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize, + ref_best_rd, recon); + + if (*rate_y == INT_MAX) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + restore_dst_buf(xd, orig_dst, orig_dst_stride); + return INT64_MAX; + } + + *rate2 += *rate_y; + *distortion += distortion_y; + + rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); + rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); + + if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv, + &sseuv, bsize, ref_best_rd - rdcosty)) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + restore_dst_buf(xd, orig_dst, orig_dst_stride); + return INT64_MAX; + } + + *psse += sseuv; + *rate2 += *rate_uv; + *distortion += distortion_uv; + *skippable = skippable_y && skippable_uv; + } else { + x->skip = 1; + *disable_skip = 1; + + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + *distortion = skip_sse_sb; + } + + if (!is_comp_pred) single_skippable[this_mode][refs[0]] = *skippable; + + restore_dst_buf(xd, orig_dst, orig_dst_stride); + return 0; // The rate-distortion cost will be re-calculated by caller. +} +#endif // !CONFIG_REALTIME_ONLY + +void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = xd->plane; + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int y_skip = 0, uv_skip = 0; + int64_t dist_y = 0, dist_uv = 0; + TX_SIZE max_uv_tx_size; + x->skip_encode = 0; + ctx->skip = 0; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; + // Initialize interp_filter here so we do not have to check for inter block + // modes in get_pred_context_switchable_interp() + xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; + + if (bsize >= BLOCK_8X8) { + if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd) >= best_rd) { + rd_cost->rate = INT_MAX; + return; + } + } else { + y_skip = 0; + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, best_rd) >= best_rd) { + rd_cost->rate = INT_MAX; + return; + } + } + max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->tx_size] + [pd[1].subsampling_x][pd[1].subsampling_y]; + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, + &uv_skip, VPXMAX(BLOCK_8X8, bsize), max_uv_tx_size); + + if (y_skip && uv_skip) { + rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + rd_cost->dist = dist_y + dist_uv; + } else { + rd_cost->rate = + rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); + rd_cost->dist = dist_y + dist_uv; + } + + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); +} + +#if !CONFIG_REALTIME_ONLY +// This function is designed to apply a bias or adjustment to an rd value based +// on the relative variance of the source and reconstruction. +#define LOW_VAR_THRESH 250 +#define VAR_MULT 250 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 }; + +static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int64_t *this_rd, + struct buf_2d *recon, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame, + PREDICTION_MODE this_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; + unsigned int var_diff = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + unsigned int low_var_thresh = LOW_VAR_THRESH; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + vp9e_tune_content content_type = cpi->oxcf.content; + + if (*this_rd == INT64_MAX) return; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } else { + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } +#else + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Scale based on area in 8x8 blocks + rec_variance /= (bw * bh); + src_variance /= (bw * bh); + + if (content_type == VP9E_CONTENT_FILM) { + if (cpi->oxcf.pass == 2) { + // Adjust low variance threshold based on estimated group noise enegry. + double noise_factor = + (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF; + low_var_thresh = (unsigned int)(low_var_thresh * noise_factor); + + if (ref_frame == INTRA_FRAME) { + low_var_thresh *= 2; + if (this_mode == DC_PRED) low_var_thresh *= 5; + } else if (second_ref_frame > INTRA_FRAME) { + low_var_thresh *= 2; + } + } + } else { + low_var_thresh = LOW_VAR_THRESH / 2; + } + + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. + src_rec_min = VPXMIN(src_variance, rec_variance); + + if (src_rec_min > low_var_thresh) return; + + // We care more when the reconstruction has lower variance so give this case + // a stronger weighting. + var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2 + : (rec_variance - src_variance) / 2; + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); + + if ((content_type == VP9E_CONTENT_FILM) && + ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) { + var_factor *= 2; + } + + *this_rd += (*this_rd * var_factor) / 100; + + (void)xd; +} +#endif // !CONFIG_REALTIME_ONLY + +// Do we have an internal image edge (e.g. formatting bars). +int vp9_internal_image_edge(VP9_COMP *cpi) { + return (cpi->oxcf.pass == 2) && + ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || + (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); +} + +// Checks to see if a super block is on a horizontal image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) { + int top_edge = 0; + int bottom_edge = cpi->common.mi_rows; + int is_active_h_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + + bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + bottom_edge = VPXMAX(top_edge, bottom_edge); + } + + if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || + ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { + is_active_h_edge = 1; + } + return is_active_h_edge; +} + +// Checks to see if a super block is on a vertical image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) { + int left_edge = 0; + int right_edge = cpi->common.mi_cols; + int is_active_v_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + + right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + right_edge = VPXMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +// Checks to see if a super block is at the edge of the active image. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) { + return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) || + vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE); +} + +#if !CONFIG_REALTIME_ONLY +void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + RD_OPT *const rd_opt = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct segmentation *const seg = &cm->seg; + PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mi->segment_id; + int comp_pred, i, k; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; + INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; + int64_t best_rd = best_rd_so_far; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + MODE_INFO best_mbmode; + int best_mode_skippable = 0; + int midx, best_mode_index = -1; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vpx_prob comp_mode_p; + int64_t best_intra_rd = INT64_MAX; + unsigned int best_pred_sse = UINT_MAX; + PREDICTION_MODE best_intra_mode = DC_PRED; + int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; + int64_t dist_uv[TX_SIZES]; + int skip_uv[TX_SIZES]; + PREDICTION_MODE mode_uv[TX_SIZES]; + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); + int best_skip2 = 0; + uint8_t ref_frame_skip_mask[2] = { 0, 1 }; + uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; + int mode_skip_start = sf->mode_skip_start + 1; + const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + int64_t mode_threshold[MAX_MODES]; + int8_t *tile_mode_map = tile_data->mode_map[bsize]; + int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map + const int mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; + int64_t mask_filter = 0; + int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + + struct buf_2d *recon; + struct buf_2d recon_buf; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]); + recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH + ? CONVERT_TO_BYTEPTR(recon16) + : (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]); + recon_buf.buf = recon8; +#endif // CONFIG_VP9_HIGHBITDEPTH + recon_buf.stride = 64; + recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0; + + vp9_zero(best_mbmode); + + x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = INT64_MAX; + for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; + for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = 0; i < MB_MODE_COUNT; ++i) { + for (k = 0; k < MAX_REF_FRAMES; ++k) { + single_inter_filter[i][k] = SWITCHABLE; + single_skippable[i][k] = 0; + } + } + + rd_cost->rate = INT_MAX; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { + assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + // Skip checking missing references in both single and compound reference + // modes. Note that a mode will be skipped if both reference frames + // are masked out. + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } else if (sf->reference_masking) { + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { + mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + break; + } + } + } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } + } + + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME); + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); + if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->alt_ref_search_fp) { + mode_skip_mask[ALTREF_FRAME] = 0; + ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME) & 0xff; + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + } + } + + if (sf->alt_ref_search_fp) + if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) + if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) + mode_skip_mask[ALTREF_FRAME] |= INTER_ALL; + + if (sf->adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1)) + mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; + } + + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { + ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); + ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); + } + + mode_skip_mask[INTRA_FRAME] |= + (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + + for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; + + midx = sf->schedule_mode_search ? mode_skip_start : 0; + + while (midx > 4) { + uint8_t end_pos = 0; + for (i = 5; i < midx; ++i) { + if (mode_threshold[tile_mode_map[i - 1]] > + mode_threshold[tile_mode_map[i]]) { + uint8_t tmp = tile_mode_map[i]; + tile_mode_map[i] = tile_mode_map[i - 1]; + tile_mode_map[i - 1] = tmp; + end_pos = i; + } + } + midx = end_pos; + } + + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + + for (midx = 0; midx < MAX_MODES; ++midx) { + int mode_index = mode_map[midx]; + int mode_excluded = 0; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int compmode_cost = 0; + int rate2 = 0, rate_y = 0, rate_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int skippable = 0; + int this_skip2 = 0; + int64_t total_sse = INT64_MAX; + int early_term = 0; + + this_mode = vp9_mode_order[mode_index].mode; + ref_frame = vp9_mode_order[mode_index].ref_frame[0]; + second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + + vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; + + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (midx == mode_skip_start && best_mode_index >= 0) { + switch (best_mbmode.ref_frame[0]) { + case INTRA_FRAME: break; + case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break; + case GOLDEN_FRAME: + ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; + case NO_REF_FRAME: + case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; + } + } + + if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && + (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame)))) + continue; + + if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; + + // Test best rd so far against threshold for trying this mode. + if (best_mode_skippable && sf->schedule_mode_search) + mode_threshold[mode_index] <<= 1; + + if (best_rd < mode_threshold[mode_index]) continue; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + + if (sf->motion_field_mode_search) { + const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize], + tile_info->mi_col_end - mi_col); + const int mi_height = VPXMIN(num_8x8_blocks_high_lookup[bsize], + tile_info->mi_row_end - mi_row); + const int bsl = mi_width_log2_lookup[bsize]; + int cb_partition_search_ctrl = + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + MODE_INFO *ref_mi; + int const_motion = 1; + int skip_ref_frame = !cb_partition_search_ctrl; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if ((mi_row - 1) >= tile_info->mi_row_start) { + ref_mv = xd->mi[-xd->mi_stride]->mv[0]; + rf = xd->mi[-xd->mi_stride]->ref_frame[0]; + for (i = 0; i < mi_width; ++i) { + ref_mi = xd->mi[-xd->mi_stride + i]; + const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && + (ref_frame == ref_mi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mi->ref_frame[0]); + } + } + + if ((mi_col - 1) >= tile_info->mi_col_start) { + if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; + for (i = 0; i < mi_height; ++i) { + ref_mi = xd->mi[i * xd->mi_stride - 1]; + const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && + (ref_frame == ref_mi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mi->ref_frame[0]); + } + } + + if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV) + if (rf > INTRA_FRAME) + if (ref_frame != rf) continue; + + if (const_motion) + if (this_mode == NEARMV || this_mode == ZEROMV) continue; + } + + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; + + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME) + continue; + + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + } else { + if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; + } + + if (ref_frame == INTRA_FRAME) { + if (sf->adaptive_mode_search) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse) + continue; + + if (this_mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = + (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64; + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + continue; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, best_intra_mode)) continue; + } + } + } else { + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; + if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, this_mode, + ref_frames)) + continue; + } + + mi->mode = this_mode; + mi->uv_mode = DC_PRED; + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = second_ref_frame; + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + mi->interp_filter = + cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + mi->mv[0].as_int = mi->mv[1].as_int = 0; + + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + if (ref_frame == INTRA_FRAME) { + TX_SIZE uv_tx; + struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif + memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, + best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif + if (rate_y == INT_MAX) continue; + + uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] + [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], + &skip_uv[uv_tx], &mode_uv[uv_tx]); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif + rate_uv = rate_uv_tokenonly[uv_tx]; + distortion_uv = dist_uv[uv_tx]; + skippable = skippable && skip_uv[uv_tx]; + mi->uv_mode = mode_uv[uv_tx]; + + rate2 = rate_y + cpi->mbmode_cost[mi->mode] + rate_uv_intra[uv_tx]; + if (this_mode != DC_PRED && this_mode != TM_PRED) + rate2 += intra_cost_penalty; + distortion2 = distortion_y + distortion_uv; + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif + this_rd = handle_inter_mode( + cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, + recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif + if (this_rd == INT64_MAX) continue; + + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; + } + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + if (comp_pred) { + rate2 += ref_costs_comp[ref_frame]; + } else { + rate2 += ref_costs_single[ref_frame]; + } + + if (!disable_skip) { + const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd); + const int skip_cost0 = vp9_cost_bit(skip_prob, 0); + const int skip_cost1 = vp9_cost_bit(skip_prob, 1); + + if (skippable) { + // Back out the coefficient coding costs + rate2 -= (rate_y + rate_uv); + + // Cost the skip mb case + rate2 += skip_cost1; + } else if (ref_frame != INTRA_FRAME && !xd->lossless && + !cpi->oxcf.sharpness) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0, + distortion2) < + RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) { + // Add in the cost of the no skip flag. + rate2 += skip_cost0; + } else { + // FIXME(rbultje) make this work for splitmv also + assert(total_sse >= 0); + + rate2 += skip_cost1; + distortion2 = total_sse; + rate2 -= (rate_y + rate_uv); + this_skip2 = 1; + } + } else { + // Add in the cost of the no skip flag. + rate2 += skip_cost0; + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + } + + if (recon) { + // In film mode bias against DC pred and other intra if there is a + // significant difference between the variance of the sub blocks in the + // the source. Also apply some bias against compound modes which also + // tend to blur fine texture such as film grain over time. + // + // The sub block test here acts in the case where one or more sub + // blocks have high relatively variance but others relatively low + // variance. Here the high variance sub blocks may push the + // total variance for the current block size over the thresholds + // used in rd_variance_adjustment() below. + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + if (max_energy > min_energy) { + if (ref_frame == INTRA_FRAME) { + if (this_mode == DC_PRED) + this_rd += (this_rd * (max_energy - min_energy)); + else + this_rd += (this_rd * (max_energy - min_energy)) / 4; + } else if (second_ref_frame > INTRA_FRAME) { + this_rd += this_rd / 4; + } + } + } + } + // Apply an adjustment to the rd value based on the similarity of the + // source variance and reconstructed variance. + rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame, + second_ref_frame, this_mode); + } + + if (ref_frame == INTRA_FRAME) { + // Keep record of best intra rd + if (this_rd < best_intra_rd) { + best_intra_rd = this_rd; + best_intra_mode = mi->mode; + } + } + + if (!disable_skip && ref_frame == INTRA_FRAME) { + for (i = 0; i < REFERENCE_MODES; ++i) + best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd); + } + + // Did this mode help.. i.e. is it the new best mode + if (this_rd < best_rd || x->skip) { + int max_plane = MAX_MB_PLANE; + if (!mode_excluded) { + // Note index of best mode so far + best_mode_index = mode_index; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mi->mv[0].as_int = 0; + max_plane = 1; + // Initialize interp_filter here so we do not have to check for + // inter block modes in get_pred_context_switchable_interp() + mi->interp_filter = SWITCHABLE_FILTERS; + } else { + best_pred_sse = x->pred_sse[ref_frame]; + } + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + best_mbmode = *mi; + best_skip2 = this_skip2; + best_mode_skippable = skippable; + + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); + memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size], + sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size]; + + // TODO(debargha): enhance this test with a better distortion prediction + // based on qp, activity mask and history + if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + (mode_index > MIN_EARLY_TERM_INDEX)) { + int qstep = xd->plane[0].dequant[1]; + // TODO(debargha): Enhance this by specializing for each mode_index + int scale = 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + qstep >>= (xd->bd - 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + if (x->source_variance < UINT_MAX) { + const int var_adjust = (x->source_variance < 16); + scale -= var_adjust; + } + if (ref_frame > INTRA_FRAME && distortion2 * scale < qstep * qstep) { + early_term = 1; + } + } + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (!comp_pred) { + if (single_rd < best_pred_rd[SINGLE_REFERENCE]) + best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else { + if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) + best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + + /* keep record of best filter type */ + if (!mode_excluded && cm->interp_filter != BILINEAR) { + int64_t ref = + filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS + : cm->interp_filter]; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int64_t adj_rd; + if (ref == INT64_MAX) + adj_rd = 0; + else if (filter_cache[i] == INT64_MAX) + // when early termination is triggered, the encoder does not have + // access to the rate-distortion cost. it only knows that the cost + // should be above the maximum valid value. hence it takes the known + // maximum plus an arbitrary constant as the rate-distortion cost. + adj_rd = mask_filter - ref + 10; + else + adj_rd = filter_cache[i] - ref; + + adj_rd += this_rd; + best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd); + } + } + } + + if (early_term) break; + + if (x->skip && !comp_pred) break; + } + + // The inter modes' rate costs are not calculated precisely in some cases. + // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and + // ZEROMV. Here, checks are added for those cases, and the mode decisions + // are corrected. + if (best_mbmode.mode == NEWMV) { + const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], + best_mbmode.ref_frame[1] }; + int comp_pred_mode = refs[1] > INTRA_FRAME; + + if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && + frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) || + !comp_pred_mode)) + best_mbmode.mode = NEARESTMV; + else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && + frame_mv[NEARMV][refs[1]].as_int == best_mbmode.mv[1].as_int) || + !comp_pred_mode)) + best_mbmode.mode = NEARMV; + else if (best_mbmode.mv[0].as_int == 0 && + ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || + !comp_pred_mode)) + best_mbmode.mode = ZEROMV; + } + + if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + // If we used an estimate for the uv intra rd in the loop above... + if (sf->use_uv_intra_rd_estimate) { + // Do Intra UV best rd mode selection if best mode choice above was intra. + if (best_mbmode.ref_frame[0] == INTRA_FRAME) { + TX_SIZE uv_tx_size; + *mi = best_mbmode; + uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]); + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], + &rate_uv_tokenonly[uv_tx_size], + &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, + uv_tx_size); + } + } + + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref) + vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, + sf->adaptive_rd_thresh, bsize, best_mode_index); + + // macroblock modes + *mi = best_mbmode; + x->skip |= best_skip2; + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (best_pred_rd[i] == INT64_MAX) + best_pred_diff[i] = INT_MIN; + else + best_pred_diff[i] = best_rd - best_pred_rd[i]; + } + + if (!x->skip) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + if (best_filter_rd[i] == INT64_MAX) + best_filter_diff[i] = 0; + else + best_filter_diff[i] = best_rd - best_filter_rd[i]; + } + if (cm->interp_filter == SWITCHABLE) + assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); + } else { + vp9_zero(best_filter_diff); + } + + // TODO(yunqingwang): Moving this line in front of the above best_filter_diff + // updating code causes PSNR loss. Need to figure out the confliction. + x->skip |= best_mode_skippable; + + if (!x->skip && !x->select_tx_size) { + int has_high_freq_coeff = 0; + int plane; + int max_plane = is_inter_block(xd->mi[0]) ? MAX_MB_PLANE : 1; + for (plane = 0; plane < max_plane; ++plane) { + x->plane[plane].eobs = ctx->eobs_pbuf[plane][1]; + has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane); + } + + for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) { + x->plane[plane].eobs = ctx->eobs_pbuf[plane][2]; + has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane); + } + + best_mode_skippable |= !has_high_freq_coeff; + } + + assert(best_mode_index >= 0); + + store_coding_context(x, ctx, best_mode_index, best_pred_diff, + best_filter_diff, best_mode_skippable); +} + +void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, RD_COST *rd_cost, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + unsigned char segment_id = mi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vpx_prob comp_mode_p; + INTERP_FILTER best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mi->mode = ZEROMV; + mi->uv_mode = DC_PRED; + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; + mi->mv[0].as_int = 0; + x->skip = 1; + + ctx->sum_y_eobs = 0; + + if (cm->interp_filter != BILINEAR) { + best_filter = EIGHTTAP; + if (cm->interp_filter == SWITCHABLE && + x->source_variance >= cpi->sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mi->interp_filter = i; + rs = vp9_get_switchable_rate(cpi, xd); + if (rs < best_rs) { + best_rs = rs; + best_filter = mi->interp_filter; + } + } + } + } + // Set the appropriate filter + if (cm->interp_filter == SWITCHABLE) { + mi->interp_filter = best_filter; + rate2 += vp9_get_switchable_rate(cpi, xd); + } else { + mi->interp_filter = cm->interp_filter; + } + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += vp9_cost_bit(comp_mode_p, comp_pred); + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == mi->interp_filter)); + + vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, + cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + + vp9_zero(best_pred_diff); + vp9_zero(best_filter_diff); + + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE); + store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, best_filter_diff, 0); +} + +void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mi->segment_id; + int comp_pred, i; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + int64_t best_rd = best_rd_so_far; + int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + MODE_INFO best_mbmode; + int ref_index, best_ref_index = 0; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vpx_prob comp_mode_p; + INTERP_FILTER tmp_best_filter = SWITCHABLE; + int rate_uv_intra, rate_uv_tokenonly; + int64_t dist_uv; + int skip_uv; + PREDICTION_MODE mode_uv = DC_PRED; + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); + int_mv seg_mvs[4][MAX_REF_FRAMES]; + b_mode_info best_bmodes[4]; + int best_skip2 = 0; + int ref_frame_skip_mask[2] = { 0 }; + int64_t mask_filter = 0; + int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + int internal_active_edge = + vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + + x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + memset(x->zcoeff_blk[TX_4X4], 0, 4); + vp9_zero(best_mbmode); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; + + for (i = 0; i < 4; i++) { + int j; + for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV; + } + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = INT64_MAX; + rate_uv_intra = INT_MAX; + + rd_cost->rate = INT_MAX; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + } else { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + } + + for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { + int mode_excluded = 0; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int compmode_cost = 0; + int rate2 = 0, rate_y = 0, rate_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int skippable = 0; + int this_skip2 = 0; + int64_t total_sse = INT_MAX; + int early_term = 0; + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + + ref_frame = vp9_ref_order[ref_index].ref_frame[0]; + second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; + + vp9_zero(x->sum_y_eobs); + +#if CONFIG_BETTER_HW_COMPATIBILITY + // forbid 8X4 and 4X8 partitions if any reference frame is scaled. + if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { + int ref_scaled = ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + if (second_ref_frame > INTRA_FRAME) + ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); + if (ref_scaled) continue; + } +#endif + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) { + if (ref_index == 3) { + switch (best_mbmode.ref_frame[0]) { + case INTRA_FRAME: break; + case LAST_FRAME: + ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; + case GOLDEN_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; + case ALTREF_FRAME: + ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); + break; + case NO_REF_FRAME: + case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; + } + } + } + + if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && + (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame)))) + continue; + + // Test best rd so far against threshold for trying this mode. + if (!internal_active_edge && + rd_less_than_thresh(best_rd, + rd_opt->threshes[segment_id][bsize][ref_index], + &rd_thresh_freq_fact[ref_index])) + continue; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + + comp_pred = second_ref_frame > INTRA_FRAME; + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + + if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mbmode.ref_frame[0] == INTRA_FRAME) + continue; + } + + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + continue; + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + continue; + } + + mi->tx_size = TX_4X4; + mi->uv_mode = DC_PRED; + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = second_ref_frame; + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + mi->interp_filter = + cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + if (ref_frame == INTRA_FRAME) { + int rate; + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, + best_rd) >= best_rd) + continue; + rate2 += rate; + rate2 += intra_cost_penalty; + distortion2 += distortion_y; + + if (rate_uv_intra == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra, + &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv); + } + rate2 += rate_uv_intra; + rate_uv = rate_uv_tokenonly; + distortion2 += dist_uv; + distortion_uv = dist_uv; + mi->uv_mode = mode_uv; + } else { + int rate; + int64_t distortion; + int64_t this_rd_thresh; + int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; + int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; + int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; + int tmp_best_skippable = 0; + int switchable_filter_index; + int_mv *second_ref = + comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL; + b_mode_info tmp_best_bmodes[16]; + MODE_INFO tmp_best_mbmode; + BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; + int pred_exists = 0; + int uv_skippable; + + YV12_BUFFER_CONFIG *scaled_ref_frame[2] = { NULL, NULL }; + int ref; + + for (ref = 0; ref < 2; ++ref) { + scaled_ref_frame[ref] = + mi->ref_frame[ref] > INTRA_FRAME + ? vp9_get_scaled_ref_frame(cpi, mi->ref_frame[ref]) + : NULL; + + if (scaled_ref_frame[ref]) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL); + } + } + + this_rd_thresh = (ref_frame == LAST_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_LAST] + : rd_opt->threshes[segment_id][bsize][THR_ALTR]; + this_rd_thresh = (ref_frame == GOLDEN_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_GOLD] + : this_rd_thresh; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + filter_cache[i] = INT64_MAX; + + if (cm->interp_filter != BILINEAR) { + tmp_best_filter = EIGHTTAP; + if (x->source_variance < sf->disable_filter_search_var_thresh) { + tmp_best_filter = EIGHTTAP; + } else if (sf->adaptive_pred_interp_filter == 1 && + ctx->pred_interp_filter < SWITCHABLE) { + tmp_best_filter = ctx->pred_interp_filter; + } else if (sf->adaptive_pred_interp_filter == 2) { + tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE + ? ctx->pred_interp_filter + : 0; + } else { + for (switchable_filter_index = 0; + switchable_filter_index < SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int newbest, rs; + int64_t rs_rd; + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + mi->interp_filter = switchable_filter_index; + tmp_rd = rd_pick_best_sub8x8_mode( + cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, + &rate, &rate_y, &distortion, &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, bsi, switchable_filter_index, + mi_row, mi_col); + + if (tmp_rd == INT64_MAX) continue; + rs = vp9_get_switchable_rate(cpi, xd); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + filter_cache[switchable_filter_index] = tmp_rd; + filter_cache[SWITCHABLE_FILTERS] = + VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd); + if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd; + + mask_filter = VPXMAX(mask_filter, tmp_rd); + + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { + tmp_best_filter = mi->interp_filter; + tmp_best_rd = tmp_rd; + } + if ((newbest && cm->interp_filter == SWITCHABLE) || + (mi->interp_filter == cm->interp_filter && + cm->interp_filter != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + tmp_best_skippable = skippable; + tmp_best_mbmode = *mi; + x->sum_y_eobs[TX_4X4] = 0; + for (i = 0; i < 4; i++) { + tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; + x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; + } + pred_exists = 1; + if (switchable_filter_index == 0 && sf->use_rd_breakout && + best_rd < INT64_MAX) { + if (tmp_best_rdu / 2 > best_rd) { + // skip searching the other filters if the first is + // already substantially larger than the best so far + tmp_best_filter = mi->interp_filter; + tmp_best_rdu = INT64_MAX; + break; + } + } + } + } // switchable_filter_index loop + } + } + + if (tmp_best_rdu == INT64_MAX && pred_exists) continue; + + mi->interp_filter = (cm->interp_filter == SWITCHABLE ? tmp_best_filter + : cm->interp_filter); + if (!pred_exists) { + // Handles the special case when a filter that is not in the + // switchable list (bilinear, 6-tap) is indicated at the frame level + tmp_rd = rd_pick_best_sub8x8_mode( + cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, + &rate, &rate_y, &distortion, &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col); + if (tmp_rd == INT64_MAX) continue; + x->sum_y_eobs[TX_4X4] = 0; + for (i = 0; i < 4; i++) { + x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; + } + } else { + total_sse = tmp_best_sse; + rate = tmp_best_rate; + rate_y = tmp_best_ratey; + distortion = tmp_best_distortion; + skippable = tmp_best_skippable; + *mi = tmp_best_mbmode; + for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i]; + } + + rate2 += rate; + distortion2 += distortion; + + if (cm->interp_filter == SWITCHABLE) + rate2 += vp9_get_switchable_rate(cpi, xd); + + if (!mode_excluded) + mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE + : cm->reference_mode == COMPOUND_REFERENCE; + + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); + + tmp_best_rdu = + best_rd - VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), + RDCOST(x->rdmult, x->rddiv, 0, total_sse)); + + if (tmp_best_rdu > 0) { + // If even the 'Y' rd value of split is higher than best so far + // then don't bother looking at UV + vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); + memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); + if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, + &uv_sse, BLOCK_8X8, tmp_best_rdu)) { + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + for (i = 0; i < MAX_MB_PLANE; ++i) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + } + continue; + } + + rate2 += rate_uv; + distortion2 += distortion_uv; + skippable = skippable && uv_skippable; + total_sse += uv_sse; + } + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Restore the prediction frame pointers to their unscaled versions. + for (i = 0; i < MAX_MB_PLANE; ++i) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + } + } + + if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + if (second_ref_frame > INTRA_FRAME) { + rate2 += ref_costs_comp[ref_frame]; + } else { + rate2 += ref_costs_single[ref_frame]; + } + + if (!disable_skip) { + const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd); + const int skip_cost0 = vp9_cost_bit(skip_prob, 0); + const int skip_cost1 = vp9_cost_bit(skip_prob, 1); + + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. + if (ref_frame != INTRA_FRAME && !xd->lossless) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0, + distortion2) < + RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) { + // Add in the cost of the no skip flag. + rate2 += skip_cost0; + } else { + // FIXME(rbultje) make this work for splitmv also + rate2 += skip_cost1; + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + this_skip2 = 1; + } + } else { + // Add in the cost of the no skip flag. + rate2 += skip_cost0; + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + } + + if (!disable_skip && ref_frame == INTRA_FRAME) { + for (i = 0; i < REFERENCE_MODES; ++i) + best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd); + } + + // Did this mode help.. i.e. is it the new best mode + if (this_rd < best_rd || x->skip) { + if (!mode_excluded) { + int max_plane = MAX_MB_PLANE; + // Note index of best mode so far + best_ref_index = ref_index; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mi->mv[0].as_int = 0; + max_plane = 1; + // Initialize interp_filter here so we do not have to check for + // inter block modes in get_pred_context_switchable_interp() + mi->interp_filter = SWITCHABLE_FILTERS; + } + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + best_yrd = + best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); + best_mbmode = *mi; + best_skip2 = this_skip2; + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); + memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], + sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4]; + + for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; + + // TODO(debargha): enhance this test with a better distortion prediction + // based on qp, activity mask and history + if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + (ref_index > MIN_EARLY_TERM_INDEX)) { + int qstep = xd->plane[0].dequant[1]; + // TODO(debargha): Enhance this by specializing for each mode_index + int scale = 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + qstep >>= (xd->bd - 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + if (x->source_variance < UINT_MAX) { + const int var_adjust = (x->source_variance < 16); + scale -= var_adjust; + } + if (ref_frame > INTRA_FRAME && distortion2 * scale < qstep * qstep) { + early_term = 1; + } + } + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) + best_pred_rd[SINGLE_REFERENCE] = single_rd; + else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) + best_pred_rd[COMPOUND_REFERENCE] = single_rd; + + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + } + + /* keep record of best filter type */ + if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && + cm->interp_filter != BILINEAR) { + int64_t ref = + filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS + : cm->interp_filter]; + int64_t adj_rd; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + if (ref == INT64_MAX) + adj_rd = 0; + else if (filter_cache[i] == INT64_MAX) + // when early termination is triggered, the encoder does not have + // access to the rate-distortion cost. it only knows that the cost + // should be above the maximum valid value. hence it takes the known + // maximum plus an arbitrary constant as the rate-distortion cost. + adj_rd = mask_filter - ref + 10; + else + adj_rd = filter_cache[i] - ref; + + adj_rd += this_rd; + best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd); + } + } + + if (early_term) break; + + if (x->skip && !comp_pred) break; + } + + if (best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + // If we used an estimate for the uv intra rd in the loop above... + if (sf->use_uv_intra_rd_estimate) { + // Do Intra UV best rd mode selection if best mode choice above was intra. + if (best_mbmode.ref_frame[0] == INTRA_FRAME) { + *mi = best_mbmode; + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra, &rate_uv_tokenonly, + &dist_uv, &skip_uv, BLOCK_8X8, TX_4X4); + } + } + + if (best_rd == INT64_MAX) { + rd_cost->rate = INT_MAX; + rd_cost->dist = INT64_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); + + vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh, + bsize, best_ref_index); + + // macroblock modes + *mi = best_mbmode; + x->skip |= best_skip2; + if (!is_inter_block(&best_mbmode)) { + for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode; + } else { + for (i = 0; i < 4; ++i) + memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); + + mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; + mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; + } + // If the second reference does not exist, set the corresponding mv to zero. + if (mi->ref_frame[1] == NO_REF_FRAME) { + mi->mv[1].as_int = 0; + for (i = 0; i < 4; ++i) { + mi->bmi[i].as_mv[1].as_int = 0; + } + } + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (best_pred_rd[i] == INT64_MAX) + best_pred_diff[i] = INT_MIN; + else + best_pred_diff[i] = best_rd - best_pred_rd[i]; + } + + if (!x->skip) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + if (best_filter_rd[i] == INT64_MAX) + best_filter_diff[i] = 0; + else + best_filter_diff[i] = best_rd - best_filter_rd[i]; + } + if (cm->interp_filter == SWITCHABLE) + assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); + } else { + vp9_zero(best_filter_diff); + } + + store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff, + 0); +} +#endif // !CONFIG_REALTIME_ONLY diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h new file mode 100644 index 0000000000..e1147ff943 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_ +#define VPX_VP9_ENCODER_VP9_RDOPT_H_ + +#include "vp9/common/vp9_blockd.h" + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_context_tree.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct TileInfo; +struct VP9_COMP; +struct macroblock; +struct RD_COST; + +void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, + struct RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); + +#if !CONFIG_REALTIME_ONLY +void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, + struct RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); + +void vp9_rd_pick_inter_mode_sb_seg_skip( + struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, + struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); +#endif + +int vp9_internal_image_edge(struct VP9_COMP *cpi); +int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step); +int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step); +int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col); + +#if !CONFIG_REALTIME_ONLY +void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, + struct RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c new file mode 100644 index 0000000000..ca55ec9886 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_resize.h" + +#define FILTER_BITS 7 + +#define INTERP_TAPS 8 +#define SUBPEL_BITS 5 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define INTERP_PRECISION_BITS 32 + +typedef int16_t interp_kernel[INTERP_TAPS]; + +// Filters for interpolation (0.5-band) - note this also filters integer pels. +static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = { + { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, -1, 34, 64, 36, 1, -3, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, { -2, -2, 31, 63, 39, 2, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 28, 63, 42, 3, -4, 0 }, + { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 50, 8, -4, -1 }, + { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 17, 58, 52, 10, -4, 0 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 56, 54, 13, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 54, 56, 15, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, { 0, -4, 10, 52, 58, 17, -4, -1 }, + { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 8, 50, 60, 20, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, + { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 42, 63, 28, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 39, 63, 31, -2, -2 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 36, 64, 34, -1, -3 } +}; + +// Filters for interpolation (0.625-band) - note this also filters integer pels. +static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = { + { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 30, 80, 35, -8, -1, 1 }, + { -1, -8, 28, 80, 37, -7, -2, 1 }, { 0, -8, 26, 79, 39, -7, -2, 1 }, + { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 22, 78, 43, -6, -2, 1 }, + { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 18, 77, 48, -5, -3, 1 }, + { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 75, 52, -3, -4, 1 }, + { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 11, 73, 56, -2, -4, 1 }, + { 0, -7, 10, 71, 58, -1, -4, 1 }, { 1, -7, 8, 70, 60, 0, -5, 1 }, + { 1, -6, 6, 68, 62, 1, -5, 1 }, { 1, -6, 5, 67, 63, 2, -5, 1 }, + { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -5, 2, 63, 67, 5, -6, 1 }, + { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 0, 60, 70, 8, -7, 1 }, + { 1, -4, -1, 58, 71, 10, -7, 0 }, { 1, -4, -2, 56, 73, 11, -7, 0 }, + { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 52, 75, 15, -8, 0 }, + { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 48, 77, 18, -8, 0 }, + { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 43, 78, 22, -8, 0 }, + { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 39, 79, 26, -8, 0 }, + { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -1, -8, 35, 80, 30, -8, -1 }, +}; + +// Filters for interpolation (0.75-band) - note this also filters integer pels. +static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = { + { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 22, 96, 28, -11, 2, 0 }, + { 2, -10, 19, 95, 31, -11, 2, 0 }, { 2, -10, 17, 95, 34, -12, 2, 0 }, + { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -8, 12, 93, 40, -12, 1, 0 }, + { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -7, 7, 91, 46, -12, 1, 0 }, + { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 3, 88, 52, -12, 0, 1 }, + { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, -1, 84, 58, -11, 0, 1 }, + { 2, -4, -2, 82, 61, -11, -1, 1 }, { 2, -4, -4, 80, 64, -10, -1, 1 }, + { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 75, 70, -8, -2, 1 }, + { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 70, 75, -6, -3, 1 }, + { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 64, 80, -4, -4, 2 }, + { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 58, 84, -1, -5, 2 }, + { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 52, 88, 3, -6, 2 }, + { 0, 1, -12, 49, 90, 5, -7, 2 }, { 0, 1, -12, 46, 91, 7, -7, 2 }, + { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 40, 93, 12, -8, 2 }, + { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 34, 95, 17, -10, 2 }, + { 0, 2, -11, 31, 95, 19, -10, 2 }, { 0, 2, -11, 28, 96, 22, -11, 2 } +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. +static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = { + { 3, -8, 13, 112, 13, -8, 3, 0 }, { 3, -7, 10, 112, 17, -9, 3, -1 }, + { 2, -6, 7, 111, 21, -9, 3, -1 }, { 2, -5, 4, 111, 24, -10, 3, -1 }, + { 2, -4, 1, 110, 28, -11, 3, -1 }, { 1, -3, -1, 108, 32, -12, 4, -1 }, + { 1, -2, -3, 106, 36, -13, 4, -1 }, { 1, -1, -6, 105, 40, -14, 4, -1 }, + { 1, -1, -7, 102, 44, -14, 4, -1 }, { 1, 0, -9, 100, 48, -15, 4, -1 }, + { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -12, 95, 57, -16, 4, -1 }, + { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 88, 65, -16, 4, -1 }, + { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 81, 73, -16, 3, 0 }, + { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 73, 81, -16, 3, 0 }, + { 0, 4, -17, 69, 84, -15, 3, 0 }, { -1, 4, -16, 65, 88, -14, 2, 0 }, + { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 57, 95, -12, 1, 0 }, + { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 48, 100, -9, 0, 1 }, + { -1, 4, -14, 44, 102, -7, -1, 1 }, { -1, 4, -14, 40, 105, -6, -1, 1 }, + { -1, 4, -13, 36, 106, -3, -2, 1 }, { -1, 4, -12, 32, 108, -1, -3, 1 }, + { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -10, 24, 111, 4, -5, 2 }, + { -1, 3, -9, 21, 111, 7, -6, 2 }, { -1, 3, -9, 17, 112, 10, -7, 3 } +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -3, 128, 3, -1, 0, 0 }, + { -1, 2, -6, 127, 7, -2, 1, 0 }, { -1, 3, -9, 126, 12, -4, 1, 0 }, + { -1, 4, -12, 125, 16, -5, 1, 0 }, { -1, 4, -14, 123, 20, -6, 2, 0 }, + { -1, 5, -15, 120, 25, -8, 2, 0 }, { -1, 5, -17, 118, 30, -9, 3, -1 }, + { -1, 6, -18, 114, 35, -10, 3, -1 }, { -1, 6, -19, 111, 41, -12, 3, -1 }, + { -1, 6, -20, 107, 46, -13, 4, -1 }, { -1, 6, -21, 103, 52, -14, 4, -1 }, + { -1, 6, -21, 99, 57, -16, 5, -1 }, { -1, 6, -21, 94, 63, -17, 5, -1 }, + { -1, 6, -20, 89, 68, -18, 5, -1 }, { -1, 6, -20, 84, 73, -19, 6, -1 }, + { -1, 6, -20, 79, 79, -20, 6, -1 }, { -1, 6, -19, 73, 84, -20, 6, -1 }, + { -1, 5, -18, 68, 89, -20, 6, -1 }, { -1, 5, -17, 63, 94, -21, 6, -1 }, + { -1, 5, -16, 57, 99, -21, 6, -1 }, { -1, 4, -14, 52, 103, -21, 6, -1 }, + { -1, 4, -13, 46, 107, -20, 6, -1 }, { -1, 3, -12, 41, 111, -19, 6, -1 }, + { -1, 3, -10, 35, 114, -18, 6, -1 }, { -1, 3, -9, 30, 118, -17, 5, -1 }, + { 0, 2, -8, 25, 120, -15, 5, -1 }, { 0, 2, -6, 20, 123, -14, 4, -1 }, + { 0, 1, -5, 16, 125, -12, 4, -1 }, { 0, 1, -4, 12, 126, -9, 3, -1 }, + { 0, 1, -2, 7, 127, -6, 2, -1 }, { 0, 0, -1, 3, 128, -3, 1, 0 } +}; + +// Filters for factor of 2 downsampling. +static const int16_t vp9_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t vp9_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +static const interp_kernel *choose_interp_filter(int inlength, int outlength) { + int outlength16 = outlength * 16; + if (outlength16 >= inlength * 16) + return filteredinterp_filters1000; + else if (outlength16 >= inlength * 13) + return filteredinterp_filters875; + else if (outlength16 >= inlength * 11) + return filteredinterp_filters750; + else if (outlength16 >= inlength * 9) + return filteredinterp_filters625; + else + return filteredinterp_filters500; +} + +static void interpolate(const uint8_t *const input, int inlength, + uint8_t *output, int outlength) { + const int64_t delta = + (((uint64_t)inlength << 32) + outlength / 2) / outlength; + const int64_t offset = + inlength > outlength + ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) / + outlength + : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / + outlength; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int64_t y; + + const interp_kernel *interp_filters = + choose_interp_filter(inlength, outlength); + + x = 0; + y = offset; + while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = outlength - 1; + y = delta * x + offset; + while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >= + inlength) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) { + const int pk = int_pel - INTERP_TAPS / 2 + 1 + k; + sum += filter[k] * + input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset; x < x1; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 + ? 0 + : int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength + ? inlength - 1 + : int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + const int16_t *filter = vp9_down2_symeven_half_filter; + const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + const int16_t *filter = vp9_down2_symodd_half_filter; + const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static int get_down2_length(int length, int steps) { + int s; + for (s = 0; s < steps; ++s) length = (length + 1) >> 1; + return length; +} + +static int get_down2_steps(int in_length, int out_length) { + int steps = 0; + int proj_in_length; + while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { + ++steps; + in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. + break; + } + } + return steps; +} + +static void resize_multistep(const uint8_t *const input, int length, + uint8_t *output, int olength, uint8_t *otmp) { + int steps; + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + steps = get_down2_steps(length, olength); + + if (steps > 0) { + int s; + uint8_t *out = NULL; + uint8_t *otmp2; + int filteredlength = length; + + assert(otmp != NULL); + otmp2 = otmp + get_down2_length(length, 1); + for (s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint8_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + } else { + interpolate(input, length, output, olength); + } +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void vp9_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { + int i; + uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf)); + uint8_t *tmpbuf = + (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf)); + uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf)); + uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2)); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, + tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + free(intbuf); + free(tmpbuf); + free(arrbuf); + free(arrbuf2); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_interpolate(const uint16_t *const input, int inlength, + uint16_t *output, int outlength, int bd) { + const int64_t delta = + (((uint64_t)inlength << 32) + outlength / 2) / outlength; + const int64_t offset = + inlength > outlength + ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) / + outlength + : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / + outlength; + uint16_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int64_t y; + + const interp_kernel *interp_filters = + choose_interp_filter(inlength, outlength); + + x = 0; + y = offset; + while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = outlength - 1; + y = delta * x + offset; + while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >= + inlength) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) { + const int pk = int_pel - INTERP_TAPS / 2 + 1 + k; + sum += filter[k] * + input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } else { + // Initial part. + for (x = 0, y = offset; x < x1; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) { + assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength); + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 + ? 0 + : int_pel - INTERP_TAPS / 2 + 1 + k)]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // End part. + for (; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength + ? inlength - 1 + : int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } +} + +static void highbd_down2_symeven(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = vp9_down2_symeven_half_filter; + const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_down2_symodd(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = vp9_down2_symodd_half_filter; + const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_resize_multistep(const uint16_t *const input, int length, + uint16_t *output, int olength, + uint16_t *otmp, int bd) { + int steps; + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + steps = get_down2_steps(length, olength); + + if (steps > 0) { + int s; + uint16_t *out = NULL; + uint16_t *otmp2; + int filteredlength = length; + + assert(otmp != NULL); + otmp2 = otmp + get_down2_length(length, 1); + for (s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint16_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? otmp2 : otmp); + if (filteredlength & 1) + highbd_down2_symodd(in, filteredlength, out, bd); + else + highbd_down2_symeven(in, filteredlength, out, bd); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + highbd_interpolate(out, filteredlength, output, olength, bd); + } + } else { + highbd_interpolate(input, length, output, olength, bd); + } +} + +static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { + int i; + uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height); + uint16_t *tmpbuf = + (uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width)); + uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height); + uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) { + highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, + intbuf + width2 * i, width2, tmpbuf, bd); + } + for (i = 0; i < width2; ++i) { + highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); + highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); + highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, + arrbuf2); + } + +Error: + free(intbuf); + free(tmpbuf); + free(arrbuf); + free(arrbuf2); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride); + vp9_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride); +} + +void vp9_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, + ouv_stride); + vp9_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, + ouv_stride); +} + +void vp9_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride); + vp9_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); +} + +void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + vp9_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + vp9_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); +} + +void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + vp9_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + vp9_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h new file mode 100644 index 0000000000..5d4ce97eba --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_ +#define VPX_VP9_ENCODER_VP9_RESIZE_H_ + +#include +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride); +void vp9_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); +void vp9_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); +void vp9_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd); +void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); +void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); +void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c new file mode 100644 index 0000000000..d75488a8e6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_segmentation.h" + +void vp9_enable_segmentation(struct segmentation *seg) { + seg->enabled = 1; + seg->update_map = 1; + seg->update_data = 1; +} + +void vp9_disable_segmentation(struct segmentation *seg) { + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; +} + +void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, + unsigned char abs_delta) { + seg->abs_delta = abs_delta; + + memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); +} +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] &= ~(1u << feature_id); +} + +void vp9_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_data[segment_id][feature_id] = 0; +} + +void vp9_psnr_aq_mode_setup(struct segmentation *seg) { + int i; + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < MAX_SEGMENTS; ++i) { + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2))); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg) { + const VP9_COMMON *cm = &cpi->common; + const int seg_counts = cpi->kmeans_ctr_num; + const int base_qindex = cm->base_qindex; + const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth); + const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2]; + const double var_diff_scale = 4.0; + int i; + + assert(seg_counts <= MAX_SEGMENTS); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < seg_counts / 2; ++i) { + double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i]; + double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + + for (; i < seg_counts; ++i) { + double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr; + double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + +// Based on set of segment counts calculate a probability tree +static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) { + // Work out probabilities of each segment + const int c01 = segcounts[0] + segcounts[1]; + const int c23 = segcounts[2] + segcounts[3]; + const int c45 = segcounts[4] + segcounts[5]; + const int c67 = segcounts[6] + segcounts[7]; + + segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67); + segment_tree_probs[1] = get_binary_prob(c01, c23); + segment_tree_probs[2] = get_binary_prob(c45, c67); + segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]); + segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]); + segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]); + segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]); +} + +// Based on set of segment counts and probabilities calculate a cost estimate +static int cost_segmap(int *segcounts, vpx_prob *probs) { + const int c01 = segcounts[0] + segcounts[1]; + const int c23 = segcounts[2] + segcounts[3]; + const int c45 = segcounts[4] + segcounts[5]; + const int c67 = segcounts[6] + segcounts[7]; + const int c0123 = c01 + c23; + const int c4567 = c45 + c67; + + // Cost the top node of the tree + int cost = c0123 * vp9_cost_zero(probs[0]) + c4567 * vp9_cost_one(probs[0]); + + // Cost subsequent levels + if (c0123 > 0) { + cost += c01 * vp9_cost_zero(probs[1]) + c23 * vp9_cost_one(probs[1]); + + if (c01 > 0) + cost += segcounts[0] * vp9_cost_zero(probs[3]) + + segcounts[1] * vp9_cost_one(probs[3]); + if (c23 > 0) + cost += segcounts[2] * vp9_cost_zero(probs[4]) + + segcounts[3] * vp9_cost_one(probs[4]); + } + + if (c4567 > 0) { + cost += c45 * vp9_cost_zero(probs[2]) + c67 * vp9_cost_one(probs[2]); + + if (c45 > 0) + cost += segcounts[4] * vp9_cost_zero(probs[5]) + + segcounts[5] * vp9_cost_one(probs[5]); + if (c67 > 0) + cost += segcounts[6] * vp9_cost_zero(probs[6]) + + segcounts[7] * vp9_cost_one(probs[6]); + } + + return cost; +} + +static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, + int *no_pred_segcounts, + int (*temporal_predictor_count)[2], + int *t_unpred_seg_counts, int bw, int bh, int mi_row, + int mi_col) { + int segment_id; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + xd->mi = mi; + segment_id = xd->mi[0]->segment_id; + + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + // Count the number of hits on each segment with no prediction + no_pred_segcounts[segment_id]++; + + // Temporal prediction not allowed on key frames + if (cm->frame_type != KEY_FRAME) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + // Test to see if the segment id matches the predicted value. + const int pred_segment_id = + get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); + const int pred_flag = pred_segment_id == segment_id; + const int pred_context = vp9_get_pred_context_seg_id(xd); + + // Store the prediction status for this mb and update counts + // as appropriate + xd->mi[0]->seg_id_predicted = pred_flag; + temporal_predictor_count[pred_context][pred_flag]++; + + // Update the "unpredicted" segment count + if (!pred_flag) t_unpred_seg_counts[segment_id]++; + } +} + +static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, + int *no_pred_segcounts, + int (*temporal_predictor_count)[2], + int *t_unpred_seg_counts, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int mis = cm->mi_stride; + int bw, bh; + const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + bw = num_8x8_blocks_wide_lookup[mi[0]->sb_type]; + bh = num_8x8_blocks_high_lookup[mi[0]->sb_type]; + + if (bw == bs && bh == bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, bs, mi_row, mi_col); + } else if (bw == bs && bh < bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + } else if (bw < bs && bh == bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, + mi_col + hbs); + } else { + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; + int n; + + assert(bw < bs && bh < bs); + + for (n = 0; n < 4; n++) { + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); + + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row + mi_dr, mi_col + mi_dc, subsize); + } + } +} + +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) { + struct segmentation *seg = &cm->seg; + + int no_pred_cost; + int t_pred_cost = INT_MAX; + + int i, tile_col, mi_row, mi_col; + + int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } }; + int no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; + + vpx_prob no_pred_tree[SEG_TREE_PROBS]; + vpx_prob t_pred_tree[SEG_TREE_PROBS]; + vpx_prob t_nopred_prob[PREDICTION_PROBS]; + + // Set default state for the segment tree probabilities and the + // temporal coding probabilities + memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); + memset(seg->pred_probs, 255, sizeof(seg->pred_probs)); + + // First of all generate stats regarding how well the last segment map + // predicts this one + for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { + TileInfo tile; + MODE_INFO **mi_ptr; + vp9_tile_init(&tile, cm, 0, tile_col); + + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; + for (mi_row = 0; mi_row < cm->mi_rows; + mi_row += 8, mi_ptr += 8 * cm->mi_stride) { + MODE_INFO **mi = mi_ptr; + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += 8, mi += 8) + count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, mi_row, + mi_col, BLOCK_64X64); + } + } + + // Work out probability tree for coding segments without prediction + // and the cost. + calc_segtree_probs(no_pred_segcounts, no_pred_tree); + no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); + + // Key frames cannot use temporal prediction + if (!frame_is_intra_only(cm)) { + // Work out probability tree for coding those segments not + // predicted using the temporal method and the cost. + calc_segtree_probs(t_unpred_seg_counts, t_pred_tree); + t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); + + // Add in the cost of the signaling for each prediction context. + for (i = 0; i < PREDICTION_PROBS; i++) { + const int count0 = temporal_predictor_count[i][0]; + const int count1 = temporal_predictor_count[i][1]; + + t_nopred_prob[i] = get_binary_prob(count0, count1); + + // Add in the predictor signaling cost + t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) + + count1 * vp9_cost_one(t_nopred_prob[i]); + } + } + + // Now choose which coding method to use. + if (t_pred_cost < no_pred_cost) { + seg->temporal_update = 1; + memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree)); + memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); + } else { + seg->temporal_update = 0; + memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree)); + } +} + +void vp9_reset_segment_features(struct segmentation *seg) { + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); + vp9_clearall_segfeatures(seg); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h new file mode 100644 index 0000000000..9404c38bc8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ +#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_enable_segmentation(struct segmentation *seg); +void vp9_disable_segmentation(struct segmentation *seg); + +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); +void vp9_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_psnr_aq_mode_setup(struct segmentation *seg); + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg); + +// The values given for each segment can be either deltas (from the default +// value chosen for the frame) or absolute values. +// +// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for +// SEGMENT_ALT_LF) +// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for +// SEGMENT_ALT_LF) +// +// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use +// the absolute values given). +void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, + unsigned char abs_delta); + +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd); + +void vp9_reset_segment_features(struct segmentation *seg); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c new file mode 100644 index 0000000000..cc6c967767 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_skin_detection.h" + +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize, + int consec_zeromv, int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + // Take center pixel in block to determine is_skin. + const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1; + const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1; + const int uv_width_shift = y_width_shift >> 1; + const int uv_height_shift = y_height_shift >> 1; + const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; + const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; + const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + return vpx_skin_pixel(ysource, usource, vsource, motion); + } +} + +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; + VP9_COMMON *const cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. + if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. + for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mi_row, mi_col, num_bl; + VP9_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); + if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, + cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment)) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + // Ignore rightmost/bottom boundary blocks. + for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { + num_bl = 0; + for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; + for (i = 0; i < y_bsize; i++) { + for (j = 0; j < y_bsize; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += y_bsize; + src_y += y_bsize; + } + y += (src_ystride << shy) - (num_bl << shy); + src_y += (src_ystride << shy) - (num_bl << shy); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h new file mode 100644 index 0000000000..46a722af9b --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ +#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9_COMP; + +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize, + int consec_zeromv, int curr_motion_magn); + +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c new file mode 100644 index 0000000000..56fb5f94f4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c @@ -0,0 +1,1093 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_speed_features.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Mesh search patters for various speed settings +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, +}; + +#if !CONFIG_REALTIME_ONLY +// Define 3 mesh density levels to control the number of searches. +#define MESH_DENSITY_LEVELS 3 +static MESH_PATTERN + good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = { + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + }; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const VP9_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +// Sets a partition size down to which the auto partition code will always +// search (can go lower), based on the image dimensions. The logic here +// is that the extent to which ringing artefacts are offensive, depends +// partly on the screen area that over which they propagate. Propagation is +// limited by transform block size but the screen area take up by a given block +// size will be larger for a small image format stretched to full screen. +static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { + unsigned int screen_area = (cm->width * cm->height); + + // Select block size based on image format size. + if (screen_area < 1280 * 720) { + // Formats smaller in area than 720P + return BLOCK_4X4; + } else if (screen_area < 1920 * 1080) { + // Format >= 720P and < 1080P + return BLOCK_8X8; + } else { + // Formats 1080P and up + return BLOCK_16X16; + } +} + +static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + VP9_COMMON *const cm = &cpi->common; + const int min_frame_size = VPXMIN(cm->width, cm->height); + const int is_480p_or_larger = min_frame_size >= 480; + const int is_720p_or_larger = min_frame_size >= 720; + const int is_1080p_or_larger = min_frame_size >= 1080; + const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); + + // speed 0 features + sf->partition_search_breakout_thr.dist = (1 << 20); + sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; + + if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; + } else { + sf->use_square_only_thresh_high = BLOCK_32X32; + } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } + + if (!is_1080p_or_larger) { + sf->rd_ml_partition.search_breakout = 1; + if (is_720p_or_larger) { + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f; + } else { + sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f; + sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f; + sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f; + } + } + + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + + if (speed >= 1) { + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 1; + if (is_480p_or_larger) + sf->use_square_only_thresh_high = BLOCK_64X64; + else + sf->use_square_only_thresh_high = BLOCK_32X32; + sf->use_square_only_thresh_low = BLOCK_16X16; + if (is_720p_or_larger) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f; + } else { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) { + sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + if (speed >= 2) { + sf->use_square_only_thresh_high = BLOCK_4X4; + sf->use_square_only_thresh_low = BLOCK_SIZES; + if (is_720p_or_larger) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; + sf->partition_search_breakout_thr.dist = (1 << 24); + sf->partition_search_breakout_thr.rate = 120; + sf->rd_ml_partition.search_breakout = 0; + } else { + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->partition_search_breakout_thr.rate = 100; + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f; + } + sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); + + // Use a set of speed features for 4k videos. + if (is_2160p_or_larger) { + sf->use_square_partition_only = 1; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; + sf->alt_ref_search_fp = 1; + sf->cb_pred_filter_search = 2; + sf->adaptive_interp_filter_search = 1; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + } + } + + if (speed >= 3) { + sf->rd_ml_partition.search_breakout = 0; + if (is_720p_or_larger) { + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; + sf->partition_search_breakout_thr.dist = (1 << 25); + sf->partition_search_breakout_thr.rate = 200; + } else { + sf->max_intra_bsize = BLOCK_32X32; + sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; + sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.rate = 120; + } + } + + // If this is a two pass clip that fits the criteria for animated or + // graphics content then reset disable_split_mask for speeds 1-4. + // Also if the image edge is internal to the coded area. + if ((speed >= 1) && (cpi->oxcf.pass == 2) && + ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || + (vp9_internal_image_edge(cpi)))) { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + } + + if (speed >= 4) { + sf->partition_search_breakout_thr.rate = 300; + if (is_720p_or_larger) { + sf->partition_search_breakout_thr.dist = (1 << 26); + } else { + sf->partition_search_breakout_thr.dist = (1 << 24); + } + sf->disable_split_mask = DISABLE_ALL_SPLIT; + } + + if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 500; + } +} + +static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 }; +static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 }; + +static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, + VP9_COMMON *cm, + SPEED_FEATURES *sf, + int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int boosted = frame_is_boosted(cpi); + int i; + + sf->adaptive_interp_filter_search = 1; + sf->adaptive_pred_interp_filter = 1; + sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; + sf->allow_skip_recode = 1; + sf->less_rectangular_check = 1; + sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; + sf->prune_ref_frame_for_rect_partitions = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; + + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; + + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + + sf->rd_ml_partition.var_pruning = 1; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = 350; + sf->rd_ml_partition.prune_rect_thresh[2] = 325; + sf->rd_ml_partition.prune_rect_thresh[3] = 250; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + sf->exhaustive_searches_thresh = (1 << 22); + } else { + sf->exhaustive_searches_thresh = INT_MAX; + } + + for (i = 0; i < MAX_MESH_STEP; ++i) { + const int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + + if (speed >= 1) { + sf->rd_ml_partition.var_pruning = !boosted; + sf->rd_ml_partition.prune_rect_thresh[1] = 225; + sf->rd_ml_partition.prune_rect_thresh[2] = 225; + sf->rd_ml_partition.prune_rect_thresh[3] = 225; + + if (oxcf->pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || + vp9_internal_image_edge(cpi)) { + sf->use_square_partition_only = !boosted; + } else { + sf->use_square_partition_only = !frame_is_intra_only(cm); + } + } else { + sf->use_square_partition_only = !frame_is_intra_only(cm); + } + + sf->allow_txfm_domain_distortion = 1; + sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5]; + sf->less_rectangular_check = 1; + sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; + sf->adaptive_rd_thresh = 2; + sf->mv.subpel_search_level = 1; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; + sf->allow_acl = 0; + + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) { + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + + sf->recode_tolerance_low = 15; + sf->recode_tolerance_high = 30; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) + : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; + } + + if (speed >= 2) { + sf->rd_ml_partition.var_pruning = 0; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + + sf->tx_size_search_method = + frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; + + sf->mode_search_skip_flags = + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_iter_level = 2; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->recode_tolerance_high = 45; + sf->enhanced_full_pixel_motion_search = 0; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->mv.subpel_search_level = 0; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 1; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } + + sf->use_accurate_subpel_search = USE_2_TAPS; + } + + if (speed >= 3) { + sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->tx_size_search_method = + frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 1; + sf->cb_partition_search = !boosted; + sf->cb_pred_filter_search = 2; + sf->alt_ref_search_fp = 1; + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->adaptive_rd_thresh = 3; + sf->mode_skip_start = 6; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 2; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } + } + + if (speed >= 4) { + sf->use_square_partition_only = 1; + sf->tx_size_search_method = USE_LARGESTALL; + sf->mv.search_method = BIGDIA; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->adaptive_rd_thresh = 4; + if (cm->frame_type != KEY_FRAME) + sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE; + sf->disable_filter_search_var_thresh = 200; + sf->use_lp32x32fdct = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; + sf->motion_field_mode_search = !boosted; + } + + if (speed >= 5) { + sf->optimize_coefficients = 0; + sf->mv.search_method = HEX; + sf->disable_filter_search_var_thresh = 500; + for (i = 0; i < TX_SIZES; ++i) { + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; + } + sf->mv.reduce_first_step_size = 1; + sf->simple_model_rd_from_var = 1; + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + VP9_COMMON *const cm = &cpi->common; + + if (speed >= 1) { + if (VPXMIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + } else { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + } + } + + if (speed >= 2) { + if (VPXMIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + } else { + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + } + } + + if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 200; + if (VPXMIN(cm->width, cm->height) >= 720) { + sf->partition_search_breakout_thr.dist = (1 << 25); + } else { + sf->partition_search_breakout_thr.dist = (1 << 23); + } + } + + if (speed >= 7) { + sf->encode_breakout_thresh = + (VPXMIN(cm->width, cm->height) >= 720) ? 800 : 300; + } +} + +static void set_rt_speed_feature_framesize_independent( + VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + const int is_keyframe = cm->frame_type == KEY_FRAME; + const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; + sf->static_segmentation = 0; + sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; + sf->use_fast_coef_costing = 1; + sf->exhaustive_searches_thresh = INT_MAX; + sf->allow_acl = 0; + sf->copy_partition_flag = 0; + sf->use_source_sad = 0; + sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; + sf->overshoot_detection_cbr_rt = NO_DETECTION; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; + sf->enable_tpl_model = 0; + sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; + sf->nonrd_use_ml_partition = 0; + sf->variance_part_thresh_mult = 1; + sf->cb_pred_filter_search = 0; + sf->force_smooth_interpol = 0; + sf->rt_intra_dc_only_low_content = 0; + sf->mv.enable_adaptive_subpel_force_stop = 0; + + if (speed >= 1) { + sf->allow_txfm_domain_distortion = 1; + sf->tx_domain_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; + sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->less_rectangular_check = 1; + sf->tx_size_search_method = + frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; + + sf->use_rd_breakout = 1; + + sf->adaptive_motion_search = 1; + sf->adaptive_pred_interp_filter = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 2; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + + if (speed >= 2) { + sf->mode_search_skip_flags = + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; + sf->adaptive_pred_interp_filter = 2; + + // Reference masking only enabled for 1 spatial layer, and if none of the + // references have been scaled. The latter condition needs to be checked + // for external or internal dynamic resize. + sf->reference_masking = (svc->number_spatial_layers == 1); + if (sf->reference_masking == 1 && + (cpi->external_resize == 1 || + cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; + } + } + } + + sf->disable_filter_search_var_thresh = 50; + sf->comp_inter_joint_search_iter_level = 2; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + sf->use_lp32x32fdct = 1; + sf->mode_skip_start = 11; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + + if (speed >= 3) { + sf->use_square_partition_only = 1; + sf->disable_filter_search_var_thresh = 100; + sf->use_uv_intra_rd_estimate = 1; + sf->skip_encode_sb = 1; + sf->mv.subpel_search_level = 0; + sf->adaptive_rd_thresh = 4; + sf->mode_skip_start = 6; + sf->allow_skip_recode = 0; + sf->optimize_coefficients = 0; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->lpf_pick = LPF_PICK_FROM_Q; + } + + if (speed >= 4) { + int i; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; + sf->mv.subpel_force_stop = QUARTER_PEL; + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[i] = INTRA_DC; + } + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->frame_parameter_update = 0; + sf->mv.search_method = FAST_HEX; + sf->allow_skip_recode = 0; + sf->max_intra_bsize = BLOCK_32X32; + sf->use_fast_coef_costing = 0; + sf->use_quant_fp = !is_keyframe; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; + sf->adaptive_rd_thresh = 2; + sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; + sf->partition_search_type = VAR_BASED_PARTITION; + } + + if (speed >= 5) { + sf->use_altref_onepass = 0; + sf->use_quant_fp = !is_keyframe; + sf->auto_min_max_partition_size = + is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; + sf->default_max_partition_size = BLOCK_32X32; + sf->default_min_partition_size = BLOCK_8X8; + sf->force_frame_boost = + is_keyframe || + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); + sf->max_delta_qindex = is_keyframe ? 20 : 15; + sf->partition_search_type = REFERENCE_PARTITION; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref) { + sf->partition_search_type = VAR_BASED_PARTITION; + } + sf->use_nonrd_pick_mode = 1; + sf->allow_skip_recode = 0; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; + sf->adaptive_rd_thresh = 2; + // This feature is only enabled when partition search is disabled. + sf->reuse_inter_pred_sby = 1; + sf->coeff_prob_appx_step = 4; + sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; + sf->simple_model_rd_from_var = 1; + if (cpi->oxcf.rc_mode == VPX_VBR) sf->mv.search_method = NSTEP; + + if (!is_keyframe) { + int i; + if (content == VP9E_CONTENT_SCREEN) { + for (i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V; + else + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; + } else { + for (i = 0; i < BLOCK_SIZES; ++i) + if (i > BLOCK_16X16) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC; + else + // Use H and V intra mode for block sizes <= 16X16. + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V; + } + } + if (content == VP9E_CONTENT_SCREEN) { + sf->short_circuit_flat_blocks = 1; + } + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { + sf->limit_newmv_early_exit = 1; + if (!cpi->use_svc) sf->bias_golden = 1; + } + // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent + // increase in encoding time. + if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) { + if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) + sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ; + else + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; + if (!cpi->external_resize) sf->use_source_sad = 1; + } + + if (speed >= 6) { + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } + sf->partition_search_type = VAR_BASED_PARTITION; + sf->mv.search_method = NSTEP; + sf->mv.reduce_first_step_size = 1; + sf->skip_encode_sb = 0; + + if (sf->use_source_sad) { + sf->adapt_partition_source_sad = 1; + sf->adapt_partition_thresh = + (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; + if (cpi->content_state_sb_fd == NULL && + (!cpi->use_svc || + svc->spatial_layer_id == svc->number_spatial_layers - 1)) { + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(uint8_t))); + } + } + if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { + // Enable short circuit for low temporal variance. + sf->short_circuit_low_temp_var = 1; + } + if (svc->temporal_layer_id > 0) { + sf->adaptive_rd_thresh = 4; + sf->limit_newmv_early_exit = 0; + sf->base_mv_aggressive = 1; + } + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + } + + if (speed >= 7) { + sf->adapt_partition_source_sad = 0; + sf->adaptive_rd_thresh = 3; + sf->mv.search_method = FAST_DIAMOND; + sf->mv.fullpel_search_step_param = 10; + // For SVC: use better mv search on base temporal layer, and only + // on base spatial layer if highest resolution is above 640x360. + if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 && + (svc->spatial_layer_id == 0 || + cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { + sf->mv.search_method = NSTEP; + sf->mv.fullpel_search_step_param = 6; + } + if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) { + sf->use_simple_block_yrd = 1; + if (svc->non_reference_frame) + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; + } + if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable partition copy. For SVC only enabled for top spatial resolution + // layer. + cpi->max_copied_frame = 0; + if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && + !cpi->external_resize && + (!cpi->use_svc || + (svc->spatial_layer_id == svc->number_spatial_layers - 1 && + !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) { + sf->copy_partition_flag = 1; + cpi->max_copied_frame = 2; + // The top temporal enhancement layer (for number of temporal layers > 1) + // are non-reference frames, so use large/max value for max_copied_frame. + if (svc->number_temporal_layers > 1 && + svc->temporal_layer_id == svc->number_temporal_layers - 1) + cpi->max_copied_frame = 255; + } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && svc->use_partition_reuse && + svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; + // For SVC when golden is used as second temporal reference: to avoid + // encode time increase only use this feature on base temporal layer. + // (i.e remove golden flag from frame_flags for temporal_layer_id > 0). + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id > 0) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; + } + + if (speed >= 8) { + sf->adaptive_rd_thresh = 4; + sf->skip_encode_sb = 1; + if (cpi->svc.number_spatial_layers > 1 && !cpi->svc.simulcast_mode) + sf->nonrd_keyframe = 0; + else + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable ML based partition for low res. + if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) { + sf->nonrd_use_ml_partition = 1; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->nonrd_use_ml_partition = 0; +#endif + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL; + sf->rt_intra_dc_only_low_content = 1; + if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && + content != VP9E_CONTENT_SCREEN) { + // More aggressive short circuit for speed 8. + sf->short_circuit_low_temp_var = 3; + // Use level 2 for noisey cases as there is a regression in some + // noisy clips with level 3. + if (cpi->noise_estimate.enabled && cm->width >= 1280 && + cm->height >= 720) { + NOISE_LEVEL noise_level = + vp9_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2; + } + // Since the short_circuit_low_temp_var is used, reduce the + // adaptive_rd_thresh level. + if (cm->width * cm->height > 352 * 288) + sf->adaptive_rd_thresh = 1; + else + sf->adaptive_rd_thresh = 2; + } + sf->limit_newmv_early_exit = 0; + sf->use_simple_block_yrd = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2; + } + + if (speed >= 9) { + // Only keep INTRA_DC mode for speed 9. + if (!is_keyframe) { + int i = 0; + for (i = 0; i < BLOCK_SIZES; ++i) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC; + } + sf->cb_pred_filter_search = 2; + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; + sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR; + if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2; + } + + // Disable split to 8x8 for low-resolution at very high Q. + // For variance partition (speed >= 6). Ignore the first few frames + // as avg_frame_qindex starts at max_q (worst_quality). + if (cm->frame_type != KEY_FRAME && cm->width * cm->height <= 320 * 240 && + sf->partition_search_type == VAR_BASED_PARTITION && + cpi->rc.avg_frame_qindex[INTER_FRAME] > 208 && + cpi->common.current_video_frame > 8) + sf->disable_16x16part_nonkey = 1; + + if (sf->nonrd_use_ml_partition) + sf->partition_search_type = ML_BASED_PARTITION; + + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->count_arf_frame_usage, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage))); + } + if (cpi->count_lastgolden_frame_usage == NULL) + CHECK_MEM_ERROR( + &cm->error, cpi->count_lastgolden_frame_usage, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage))); + } + if (svc->previous_frame_is_intra_only) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + // Special case for screen content: increase motion search on base spatial + // layer when high motion is detected or previous SL0 frame was dropped. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 && + (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) { + sf->mv.search_method = NSTEP; + // TODO(marpan/jianj): Tune this setting for screensharing. For now use + // small step_param for all spatial layers. + sf->mv.fullpel_search_step_param = 2; + } + // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it + // off for now. + if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; +} + +void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RD_OPT *const rd = &cpi->rd; + int i; + + // best quality defaults + // Some speed-up features even for best quality as minimal impact on quality. + sf->partition_search_breakout_thr.dist = (1 << 19); + sf->partition_search_breakout_thr.rate = 80; + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 0; + + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_dependent(cpi, sf, speed); +#endif + + if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { + sf->adaptive_pred_interp_filter = 0; + } + + if (cpi->encode_breakout && oxcf->mode == REALTIME && + sf->encode_breakout_thresh > cpi->encode_breakout) { + cpi->encode_breakout = sf->encode_breakout_thresh; + } + + // Check for masked out split cases. + for (i = 0; i < MAX_REFS; ++i) { + if (sf->disable_split_mask & (1 << i)) { + rd->thresh_mult_sub8x8[i] = INT_MAX; + } + } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; +} + +void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; +#if !CONFIG_REALTIME_ONLY + VP9_COMMON *const cm = &cpi->common; +#endif + MACROBLOCK *const x = &cpi->td.mb; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + // best quality defaults + sf->frame_parameter_update = 1; + sf->mv.search_method = NSTEP; + sf->recode_loop = ALLOW_RECODE_FIRST; + sf->mv.subpel_search_method = SUBPEL_TREE; + sf->mv.subpel_search_level = 2; + sf->mv.subpel_force_stop = EIGHTH_PEL; + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); + sf->mv.reduce_first_step_size = 0; + sf->coeff_prob_appx_step = 1; + sf->mv.auto_mv_step_size = 0; + sf->mv.fullpel_search_step_param = 6; + sf->mv.use_downsampled_sad = 0; + sf->comp_inter_joint_search_iter_level = 0; + sf->tx_size_search_method = USE_FULL_RD; + sf->use_lp32x32fdct = 0; + sf->adaptive_motion_search = 0; + sf->enhanced_full_pixel_motion_search = 1; + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; + sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; + sf->cb_partition_search = 0; + sf->motion_field_mode_search = 0; + sf->alt_ref_search_fp = 0; + sf->use_quant_fp = 0; + sf->reference_masking = 0; + sf->partition_search_type = SEARCH_PARTITION; + sf->less_rectangular_check = 0; + sf->use_square_partition_only = 0; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; + sf->auto_min_max_partition_size = NOT_IN_USE; + sf->rd_auto_partition_min_limit = BLOCK_4X4; + sf->default_max_partition_size = BLOCK_64X64; + sf->default_min_partition_size = BLOCK_4X4; + sf->adjust_partitioning_from_last_frame = 0; + sf->last_partitioning_redo_frequency = 4; + sf->disable_split_mask = 0; + sf->mode_search_skip_flags = 0; + sf->force_frame_boost = 0; + sf->max_delta_qindex = 0; + sf->disable_filter_search_var_thresh = 0; + sf->adaptive_interp_filter_search = 0; + sf->allow_txfm_domain_distortion = 0; + sf->tx_domain_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; + sf->allow_acl = 1; + sf->enable_tpl_model = oxcf->enable_tpl_model; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->temporal_filter_search_method = MESH; + sf->allow_skip_txfm_ac_dc = 0; + + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; + } + sf->use_rd_breakout = 0; + sf->skip_encode_sb = 0; + sf->use_uv_intra_rd_estimate = 0; + sf->allow_skip_recode = 0; + sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + sf->use_fast_coef_updates = TWO_LOOP; + sf->use_fast_coef_costing = 0; + sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set + sf->schedule_mode_search = 0; + sf->use_nonrd_pick_mode = 0; + for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL; + sf->max_intra_bsize = BLOCK_64X64; + sf->reuse_inter_pred_sby = 0; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + sf->always_this_block_size = BLOCK_16X16; + sf->search_type_check_frequency = 50; + sf->encode_breakout_thresh = 0; + // Recode loop tolerance %. + sf->recode_tolerance_low = 12; + sf->recode_tolerance_high = 25; + sf->default_interp_filter = SWITCHABLE; + sf->simple_model_rd_from_var = 0; + sf->short_circuit_flat_blocks = 0; + sf->short_circuit_low_temp_var = 0; + sf->limit_newmv_early_exit = 0; + sf->bias_golden = 0; + sf->base_mv_aggressive = 0; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->rd_ml_partition.var_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; + + // Some speed-up features even for best quality as minimal impact on quality. + sf->adaptive_rd_thresh = 1; + sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) + : INT_MAX; + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1; + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; + } + } + + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_independent(cpi, cm, sf, speed); +#endif + + cpi->diamond_search_sad = vp9_diamond_search_sad; + + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (oxcf->pass == 1) sf->optimize_coefficients = 0; + + // No recode for 1 pass. + if (oxcf->pass == 0) { + sf->recode_loop = DISALLOW_RECODE; + sf->optimize_coefficients = 0; + } + + if (sf->mv.subpel_force_stop == FULL_PEL) { + // Whole pel only + cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore; + } + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; + + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; + + x->min_partition_size = sf->default_min_partition_size; + x->max_partition_size = sf->default_max_partition_size; + + if (!cpi->oxcf.frame_periodic_boost) { + sf->max_delta_qindex = 0; + } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h new file mode 100644 index 0000000000..941de639ac --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ + +#include "vp9/common/vp9_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | + (1 << D207_PRED) | (1 << D63_PRED) | (1 << TM_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = + (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV), + INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV), + INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV), +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +typedef enum { + DIAMOND = 0, + NSTEP = 1, + HEX = 2, + BIGDIA = 3, + SQUARE = 4, + FAST_HEX = 5, + FAST_DIAMOND = 6, + MESH = 7 +} SEARCH_METHODS; + +typedef enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. + ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for ARF/GF/KF and first normal frame in each group. + ALLOW_RECODE_FIRST = 3, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 4, +} RECODE_LOOP_TYPE; + +typedef enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches + SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively + SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches + // Other methods to come +} SUBPEL_SEARCH_METHODS; + +typedef enum { + NO_MOTION_THRESHOLD = 0, + LOW_MOTION_THRESHOLD = 7 +} MOTION_THRESHOLD; + +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTALL, + USE_TX_8X8 +} TX_SIZE_SEARCH_METHOD; + +typedef enum { + NOT_IN_USE = 0, + RELAXED_NEIGHBORING_MIN_MAX = 1, + STRICT_NEIGHBORING_MIN_MAX = 2 +} AUTO_MIN_MAX_MODE; + +typedef enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} LPF_PICK_METHOD; + +typedef enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} MODE_SEARCH_SKIP_LOGIC; + +typedef enum { + FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP, + FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, + FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP, +} INTERP_FILTER_MASK; + +typedef enum { + // Search partitions using RD/NONRD criterion. + SEARCH_PARTITION, + + // Always use a fixed size partition. + FIXED_PARTITION, + + REFERENCE_PARTITION, + + // Use an arbitrary partitioning scheme based on source variance within + // a 64X64 SB. + VAR_BASED_PARTITION, + + // Use non-fixed partitions based on source variance. + SOURCE_VAR_BASED_PARTITION, + + // Make partition decisions with machine learning models. + ML_BASED_PARTITION +} PARTITION_SEARCH_TYPE; + +typedef enum { + // Does a dry run to see if any of the contexts need to be updated or not, + // before the final run. + TWO_LOOP = 0, + + // No dry run, also only half the coef contexts and bands are updated. + // The rest are not updated at all. + ONE_LOOP_REDUCED = 1 +} FAST_COEFF_UPDATE; + +typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP; + +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + SUBPEL_FORCE_STOP force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + SUBPEL_FORCE_STOP force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // This parameter controls which step in the n-step process we start at. + // It's changed adaptively based on circumstances. + int reduce_first_step_size; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHODS subpel_search_method; + + // Subpel MV search level. Can take values 0 - 2. Higher values mean more + // extensive subpel search. + int subpel_search_level; + + // When to stop subpel motion search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; + + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; + + // This variable sets the step_param used in full pel motion search. + int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. + int use_downsampled_sad; +} MV_SPEED_FEATURES; + +typedef struct PARTITION_SEARCH_BREAKOUT_THR { + int64_t dist; + int rate; +} PARTITION_SEARCH_BREAKOUT_THR; + +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +typedef enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q (max_q set by user) based only on the + // detected slide/scene change and current/past Q. + FAST_DETECTION_MAXQ = 1, + + // Based on (first pass) encoded frame, if large frame size is detected + // then set to higher Q for the second re-encode. This involves 2 pass + // encoding on slide change, so slower than 1, but more accurate for + // detecting overshoot. + RE_ENCODE_MAXQ = 2 +} OVERSHOOT_DETECTION_CBR_RT; + +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, + USE_8_TAPS_SHARP, +} SUBPEL_SEARCH_TYPE; + +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + +typedef struct SPEED_FEATURES { + MV_SPEED_FEATURES mv; + + // Frame level coding parameter update + int frame_parameter_update; + + RECODE_LOOP_TYPE recode_loop; + + // Trellis (dynamic programming) optimization of quantized values (+1, 0). + int optimize_coefficients; + + // Always set to 0. If on it enables 0 cost background transmission + // (except for the initial transmission of the segmentation). The feature is + // disabled because the addition of very large block sizes make the + // backgrounds very to cheap to encode, and the segmentation we have + // adds overhead. + int static_segmentation; + + // The best compound predictor is found using an iterative log search process + // that searches for best ref0 mv using error of combined predictor and then + // searches for best ref1 mv. This sf determines the number of iterations of + // this process based on block size. The sf becomes more aggressive from level + // 0 to 2. The following table indicates the number of iterations w.r.t bsize: + // ----------------------------------------------- + // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 | + // | 0 | 4 | 4 | 4 | + // | 1 | 0 | 2 | 4 | + // | 2 | 0 | 0 | 0 | + // ----------------------------------------------- + // Here, 0 iterations indicate using the best single motion vector selected + // for each ref frame without any iterative refinement. + int comp_inter_joint_search_iter_level; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. + // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0). + int adaptive_rd_thresh; + + // Flag to use adaptive_rd_thresh when row-mt it enabled, only for non-rd + // pickmode. + int adaptive_rd_thresh_row_mt; + + // Enables skipping the reconstruction step (idct, recon) in the + // intermediate steps assuming the last frame didn't have too many intra + // blocks and the q is less than a threshold. + int skip_encode_sb; + int skip_encode_frame; + // Speed feature to allow or disallow skipping of recode at block + // level within a frame. + int allow_skip_recode; + + // Coefficient probability model approximation step size + int coeff_prob_appx_step; + + // Enable uniform quantizer followed by trellis coefficient optimization + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; + + // Enable asymptotic closed-loop encoding decision for key frame and + // alternate reference frames. + int allow_acl; + + // Temporal dependency model based encoding mode optimization + int enable_tpl_model; + + // Use transform domain distortion. Use pixel domain distortion in speed 0 + // and certain situations in higher speed to improve the RD model precision. + int allow_txfm_domain_distortion; + double tx_domain_thresh; + + // The threshold is to determine how slow the motino is, it is used when + // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION + MOTION_THRESHOLD lf_motion_threshold; + + // Determine which method we use to determine transform size. We can choose + // between options like full rd, largest for prediction size, largest + // for intra and model coefs for the rest. + TX_SIZE_SEARCH_METHOD tx_size_search_method; + + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less + // precise but significantly faster than the non lp version. + int use_lp32x32fdct; + + // After looking at the first set of modes (set by index here), skip + // checking modes for reference frames that don't match the reference frame + // of the best so far. + int mode_skip_start; + + // TODO(JBB): Remove this. + int reference_masking; + + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_SIZE_PARTITION + BLOCK_SIZE always_this_block_size; + + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. + int less_rectangular_check; + + // Disable testing non square partitions(eg 16x32) for block sizes larger than + // use_square_only_thresh_high or smaller than use_square_only_thresh_low. + int use_square_partition_only; + BLOCK_SIZE use_square_only_thresh_high; + BLOCK_SIZE use_square_only_thresh_low; + + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; + + // Sets min and max partition sizes for this 64x64 region based on the + // same 64x64 in last encoded frame, and the left and above neighbor. + AUTO_MIN_MAX_MODE auto_min_max_partition_size; + // Ensures the rd based auto partition search will always + // go down at least to the specified level. + BLOCK_SIZE rd_auto_partition_min_limit; + + // Min and max partition size we enable (block_size) as per auto + // min max, but also used by adjust partitioning, and pick_partitioning. + BLOCK_SIZE default_min_partition_size; + BLOCK_SIZE default_max_partition_size; + + // Whether or not we allow partitions one smaller or one greater than the last + // frame's partitioning. Only used if use_lastframe_partitioning is set. + int adjust_partitioning_from_last_frame; + + // How frequently we re do the partitioning from scratch. Only used if + // use_lastframe_partitioning is set. + int last_partitioning_redo_frequency; + + // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable + // it always, to allow it for only Last frame and Intra, disable it for all + // inter modes or to enable it always. + int disable_split_mask; + + // TODO(jingning): combine the related motion search speed features + // This allows us to use motion search at other sizes as a starting + // point for this motion search and limits the search range around it. + int adaptive_motion_search; + + // Do extra full pixel motion search to obtain better motion vector. + int enhanced_full_pixel_motion_search; + + // Threshold for allowing exhaistive motion search. + int exhaustive_searches_thresh; + + // Pattern to be used for any exhaustive mesh searches. + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + int schedule_mode_search; + + // Allows sub 8x8 modes to use the prediction filter that was determined + // best for 8x8 mode. If set to 0 we always re check all the filters for + // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter + // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. + int adaptive_pred_interp_filter; + + // Adaptive prediction mode search + int adaptive_mode_search; + + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search + int cb_pred_filter_search; + + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + + int cb_partition_search; + + int motion_field_mode_search; + + int alt_ref_search_fp; + + // Fast quantization process path + int use_quant_fp; + + // Use finer quantizer in every other few frames that run variable block + // partition type search. + int force_frame_boost; + + // Maximally allowed base quantization index fluctuation. + int max_delta_qindex; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // These bit masks allow you to enable or disable intra modes for each + // prediction block size separately. + int intra_y_mode_bsize_mask[BLOCK_SIZES]; + + // This variable enables an early break out of mode testing if the model for + // rd built from the prediction signal indicates a value that's much + // higher than the best rd we've seen so far. + int use_rd_breakout; + + // This enables us to use an estimate for intra rd based on dc mode rather + // than choosing an actual uv mode in the stage of encoding before the actual + // final encode. + int use_uv_intra_rd_estimate; + + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // This feature limits the number of coefficients updates we actually do + // by only looking at counts from 1/2 the bands. + FAST_COEFF_UPDATE use_fast_coef_updates; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; + + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; + + // This feature controls the tolerence vs target used in deciding whether to + // recode a frame. It has no meaning if recode is disabled. + int recode_tolerance_low; + int recode_tolerance_high; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // The frequency that we check if SOURCE_VAR_BASED_PARTITION or + // FIXED_PARTITION search type should be used. + int search_type_check_frequency; + + // When partition is pre-set, the inter prediction result from pick_inter_mode + // can be reused in final block encoding process. It is enabled only for real- + // time mode speed 6. + int reuse_inter_pred_sby; + + // This variable sets the encode_breakout threshold. Currently, it is only + // enabled in real time mode. + int encode_breakout_thresh; + + // default interp filter choice + INTERP_FILTER default_interp_filter; + + // Early termination in transform size search, which only applies while + // tx_size_search_method is USE_FULL_RD. + int tx_size_search_breakout; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; + + // mask for skip evaluation of certain interp_filter type. + INTERP_FILTER_MASK interp_filter_search_mask; + + // Partition search early breakout thresholds. + PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; + + struct { + // Use ML-based partition search early breakout. + int search_breakout; + // Higher values mean more aggressiveness for partition search breakout that + // results in better encoding speed but worse compression performance. + float search_breakout_thresh[3]; + + // Machine-learning based partition search early termination + int search_early_termination; + + // Machine-learning based partition search pruning using prediction residue + // variance. + int var_pruning; + + // Threshold values used for ML based rectangular partition search pruning. + // If < 0, the feature is turned off. + // Higher values mean more aggressiveness to skip rectangular partition + // search that results in better encoding speed but worse coding + // performance. + int prune_rect_thresh[4]; + } rd_ml_partition; + + // Fast approximation of vp9_model_rd_from_var_lapndz + int simple_model_rd_from_var; + + // Skip a number of expensive mode evaluations for blocks with zero source + // variance. + int short_circuit_flat_blocks; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: + // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. + // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL + // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and + // 32x16. + // 3: Same as (2), but also skip golden zeromv. + int short_circuit_low_temp_var; + + // Limits the rd-threshold update for early exit for the newmv-last mode, + // for non-rd mode. + int limit_newmv_early_exit; + + // Adds a bias against golden reference, for non-rd mode. + int bias_golden; + + // Bias to use base mv and skip 1/4 subpel search when use base mv in + // enhancement layer. + int base_mv_aggressive; + + // Global flag to enable partition copy from the previous frame. + int copy_partition_flag; + + // Compute the source sad for every superblock of the frame, + // prior to encoding the frame, to be used to bypass some encoder decisions. + int use_source_sad; + + int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; + + // Flag to indicate process for handling overshoot on slide/scene change, + // for real-time CBR mode. + OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. + int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve accurate motion search result. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Search method used by temporal filtering in full_pixel_motion_search. + SEARCH_METHODS temporal_filter_search_method; + + // Use machine learning based partition search. + int nonrd_use_ml_partition; + + // Multiplier for base threshold for variance partitioning. + int variance_part_thresh_mult; + + // Force subpel motion filter to always use SMOOTH_FILTER. + int force_smooth_interpol; + + // For real-time mode: force DC only under intra search when content + // does not have high souce SAD. + int rt_intra_dc_only_low_content; + + // The encoder has a feature that skips forward transform and quantization + // based on a model rd estimation to reduce encoding time. + // However, this feature is dangerous since it could lead to bad perceptual + // quality. This flag is added to guard the feature. + int allow_skip_txfm_ac_dc; +} SPEED_FEATURES; + +struct VP9_COMP; + +void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi, + int speed); +void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi, + int speed); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c new file mode 100644 index 0000000000..3953253dbb --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vpx_dsp/bitwriter.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_subexp.h" + +static const uint8_t update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 0, +}; +#define MIN_DELP_BITS 5 + +static int recenter_nonneg(int v, int m) { + if (v > (m << 1)) + return v; + else if (v >= m) + return ((v - m) << 1); + else + return ((m - v) << 1) - 1; +} + +static int remap_prob(int v, int m) { + int i; + static const uint8_t map_table[MAX_PROB - 1] = { + // generated by: + // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, + 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, + 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, + 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, + }; + v--; + m--; + if ((m << 1) <= MAX_PROB) + i = recenter_nonneg(v, m) - 1; + else + i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + + assert(i >= 0 && (size_t)i < sizeof(map_table)); + i = map_table[i]; + return i; +} + +static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) { + int delp = remap_prob(newp, oldp); + return update_bits[delp] << VP9_PROB_COST_SHIFT; +} + +static void encode_uniform(vpx_writer *w, int v) { + const int l = 8; + const int m = (1 << l) - 191; + if (v < m) { + vpx_write_literal(w, v, l - 1); + } else { + vpx_write_literal(w, m + ((v - m) >> 1), l - 1); + vpx_write_literal(w, (v - m) & 1, 1); + } +} + +static INLINE int write_bit_gte(vpx_writer *w, int word, int test) { + vpx_write_literal(w, word >= test, 1); + return word >= test; +} + +static void encode_term_subexp(vpx_writer *w, int word) { + if (!write_bit_gte(w, word, 16)) { + vpx_write_literal(w, word, 4); + } else if (!write_bit_gte(w, word, 32)) { + vpx_write_literal(w, word - 16, 4); + } else if (!write_bit_gte(w, word, 64)) { + vpx_write_literal(w, word - 32, 5); + } else { + encode_uniform(w, word - 64); + } +} + +void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) { + const int delp = remap_prob(newp, oldp); + encode_term_subexp(w, delp); +} + +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd) { + const int64_t old_b = cost_branch256(ct, oldp); + int64_t bestsavings = 0; + vpx_prob newp, bestnewp = oldp; + const int step = *bestp > oldp ? -1 : 1; + const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); + + if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) { + for (newp = *bestp; newp != oldp; newp += step) { + const int64_t new_b = cost_branch256(ct, newp); + const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int64_t savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + } + *bestp = bestnewp; + return bestsavings; +} + +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize) { + int64_t i, old_b, new_b, update_b, savings, bestsavings; + int64_t newp; + const int64_t step_sign = *bestp > oldp ? -1 : 1; + const int64_t step = stepsize * step_sign; + const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); + const vpx_prob *newplist, *oldplist; + vpx_prob bestnewp; + oldplist = vp9_pareto8_full[oldp - 1]; + old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp); + for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) + old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]); + + bestsavings = 0; + bestnewp = oldp; + + assert(stepsize > 0); + + if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) { + for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) { + if (newp < 1 || newp > 255) continue; + newplist = vp9_pareto8_full[newp - 1]; + new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp); + for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) + new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]); + update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost; + savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = (vpx_prob)newp; + } + } + } + + *bestp = bestnewp; + return bestsavings; +} + +void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp, + const unsigned int ct[2]) { + const vpx_prob upd = DIFF_UPDATE_PROB; + vpx_prob newp = get_binary_prob(ct[0], ct[1]); + const int64_t savings = + vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd); + assert(newp >= 1); + if (savings > 0) { + vpx_write(w, 1, upd); + vp9_write_prob_diff_update(w, newp, *oldp); + *oldp = newp; + } else { + vpx_write(w, 0, upd); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h new file mode 100644 index 0000000000..2d016d24c5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_ +#define VPX_VP9_ENCODER_VP9_SUBEXP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vpx_dsp/prob.h" + +struct vpx_writer; + +void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp, + vpx_prob oldp); + +void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp, + const unsigned int ct[2]); + +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd); + +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c new file mode 100644 index 0000000000..fff6d25de0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -0,0 +1,1376 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_extend.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#define SMALL_FRAME_WIDTH 32 +#define SMALL_FRAME_HEIGHT 16 + +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + +void vp9_init_layer_context(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + int mi_rows = cpi->common.mi_rows; + int mi_cols = cpi->common.mi_cols; + int sl, tl, i; + int alt_ref_idx = svc->number_spatial_layers; + + svc->spatial_layer_id = 0; + svc->temporal_layer_id = 0; + svc->force_zero_mode_spatial_ref = 0; + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + svc->use_gf_temporal_ref = 1; + svc->use_gf_temporal_ref_current_layer = 0; + svc->scaled_temp_is_alloc = 0; + svc->scaled_one_half = 0; + svc->current_superframe = 0; + svc->non_reference_frame = 0; + svc->skip_enhancement_layer = 0; + svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; + svc->framedrop_mode = CONSTRAINED_LAYER_DROP; + svc->set_intra_only_frame = 0; + svc->previous_frame_is_intra_only = 0; + svc->superframe_has_layer_sync = 0; + svc->use_set_ref_frame_config = 0; + svc->num_encoded_top_layer = 0; + svc->simulcast_mode = 0; + svc->single_layer_svc = 0; + svc->resize_set = 0; + + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = 0xff; + svc->fb_idx_temporal_layer_id[i] = 0xff; + svc->fb_idx_base[i] = 0; + } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + svc->last_layer_dropped[sl] = 0; + svc->drop_spatial_layer[sl] = 0; + svc->ext_frame_flags[sl] = 0; + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. + svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; + svc->spatial_layer_sync[sl] = 0; + svc->force_drop_constrained_from_above[sl] = 0; + } + svc->max_consec_drop = INT_MAX; + + svc->buffer_gf_temporal_ref[1].idx = 7; + svc->buffer_gf_temporal_ref[0].idx = 6; + svc->buffer_gf_temporal_ref[1].is_used = 0; + svc->buffer_gf_temporal_ref[0].is_used = 0; + + if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { + if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, + SMALL_FRAME_HEIGHT, cpi->common.subsampling_x, + cpi->common.subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cpi->common.use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, + cpi->common.byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate empty frame for multiple frame " + "contexts"); + + memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80, + cpi->svc.empty_frame.img.buffer_alloc_sz); + } + + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->current_video_frame_in_layer = 0; + lc->layer_size = 0; + lc->frames_from_key_frame = 0; + lc->last_frame_type = FRAME_TYPES; + lrc->ni_av_qi = oxcf->worst_allowed_q; + lrc->total_actual_bits = 0; + lrc->total_target_vs_actual = 0; + lrc->ni_tot_qi = 0; + lrc->tot_q = 0.0; + lrc->avg_q = 0.0; + lrc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + lrc->worst_quality = oxcf->worst_allowed_q; + lrc->best_quality = oxcf->best_allowed_q; + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + lrc->rate_correction_factors[i] = 1.0; + } + + if (cpi->oxcf.rc_mode == VPX_CBR) { + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + } else { + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + lrc->avg_frame_qindex[INTER_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + if (oxcf->ss_enable_auto_arf[sl]) + lc->alt_ref_idx = alt_ref_idx++; + else + lc->alt_ref_idx = INVALID_IDX; + lc->gold_ref_idx = INVALID_IDX; + } + + lrc->buffer_level = + oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000; + lrc->bits_off_target = lrc->buffer_level; + + // Initialize the cyclic refresh parameters. If spatial layers are used + // (i.e., ss_number_layers > 1), these need to be updated per spatial + // layer. + // Cyclic refresh is only applied on base temporal layer. + if (oxcf->ss_number_layers > 1 && tl == 0) { + size_t last_coded_q_map_size; + size_t consec_zero_mv_size; + VP9_COMMON *const cm = &cpi->common; + lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + CHECK_MEM_ERROR(&cm->error, lc->map, + vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); + memset(lc->map, 0, mi_rows * mi_cols); + last_coded_q_map_size = + mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, + vpx_malloc(last_coded_q_map_size)); + assert(MAXQ <= 255); + memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); + consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, + vpx_malloc(consec_zero_mv_size)); + memset(lc->consec_zero_mv, 0, consec_zero_mv_size); + } + } + } + + // Still have extra buffer for base layer golden frame + if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) && + alt_ref_idx < REF_FRAMES) + svc->layer_context[0].gold_ref_idx = alt_ref_idx; +} + +// Update the layer context from a change_config() call. +void vp9_update_layer_context_change_config(VP9_COMP *const cpi, + const int target_bandwidth) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const RATE_CONTROL *const rc = &cpi->rc; + int sl, tl, layer = 0, spatial_layer_target; + float bitrate_alloc = 1.0; + int num_spatial_layers_nonzero_rate = 0; + + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); + svc->layer_context[layer].target_bandwidth = + oxcf->layer_target_bitrate[layer]; + } + + layer = LAYER_IDS_TO_IDX( + sl, + ((oxcf->ts_number_layers - 1) < 0 ? 0 : (oxcf->ts_number_layers - 1)), + oxcf->ts_number_layers); + spatial_layer_target = svc->layer_context[layer].target_bandwidth = + oxcf->layer_target_bitrate[layer]; + + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * oxcf->ts_number_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + + lc->spatial_layer_target_bandwidth = spatial_layer_target; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } + } + } else { + int layer_end; + + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + layer_end = svc->number_temporal_layers; + } else { + layer_end = svc->number_spatial_layers; + } + + for (layer = 0; layer < layer_end; ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } + // Update buffer-related quantities. + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); + // Update framerate-related quantities. + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + } else { + lc->framerate = cpi->framerate; + } + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + // Update qp-related quantities. + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } + } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + // Check bitrate of spatia layer. + layer = LAYER_IDS_TO_IDX(sl, oxcf->ts_number_layers - 1, + oxcf->ts_number_layers); + if (oxcf->layer_target_bitrate[layer] > 0) + num_spatial_layers_nonzero_rate += 1; + } + if (num_spatial_layers_nonzero_rate == 1) + svc->single_layer_svc = 1; + else + svc->single_layer_svc = 0; +} + +static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { + if (is_one_pass_svc(cpi)) + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + else + return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) + ? &cpi->svc.layer_context[cpi->svc.temporal_layer_id] + : &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; +} + +void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + // Index into spatial+temporal arrays. + const int st_idx = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id; + const int tl = svc->temporal_layer_id; + + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (tl == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + const double prev_layer_framerate = + cpi->framerate / oxcf->ts_rate_decimator[tl - 1]; + const int prev_layer_target_bandwidth = + oxcf->layer_target_bitrate[st_idx - 1]; + lc->avg_frame_size = + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + + lc->framerate = framerate; + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); + lrc->min_frame_bandwidth = + (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); + lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * + oxcf->two_pass_vbrmax_section) / + 100); + vp9_rc_set_gf_interval_range(cpi, lrc); +} + +void vp9_restore_layer_context(VP9_COMP *const cpi) { + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop; + + cpi->rc = lc->rc; + cpi->twopass = lc->twopass; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->alt_ref_source = lc->alt_ref_source; + // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode + // does not use speed = 0). + if (is_one_pass_svc(cpi) && lc->speed > 0) { + cpi->oxcf.speed = lc->speed; + } + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + if (cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) { + cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + } + cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv); + cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; + } +} + +void vp9_save_layer_context(VP9_COMP *const cpi) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + + lc->rc = cpi->rc; + lc->twopass = cpi->twopass; + lc->target_bandwidth = (int)oxcf->target_bandwidth; + lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; + + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + signed char *temp = lc->map; + uint8_t *temp2 = lc->last_coded_q_map; + uint8_t *temp3 = lc->consec_zero_mv; + lc->map = cr->map; + cr->map = temp; + lc->last_coded_q_map = cr->last_coded_q_map; + cr->last_coded_q_map = temp2; + lc->consec_zero_mv = cpi->consec_zero_mv; + cpi->consec_zero_mv = temp3; + lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; + } +} + +#if !CONFIG_REALTIME_ONLY +void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + + for (i = 0; i < svc->number_spatial_layers; ++i) { + TWO_PASS *const twopass = &svc->layer_context[i].twopass; + + svc->spatial_layer_id = i; + vp9_init_second_pass(cpi); + + twopass->total_stats.spatial_layer_id = i; + twopass->total_left_stats.spatial_layer_id = i; + } + svc->spatial_layer_id = 0; +} +#endif // !CONFIG_REALTIME_ONLY + +void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { + LAYER_CONTEXT *const lc = + &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers]; + ++lc->current_video_frame_in_layer; + ++lc->frames_from_key_frame; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + ++cpi->svc.current_superframe; +} + +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { + int w, h; + + if (width_out == NULL || height_out == NULL || den == 0) return; + + w = width_org * num / den; + h = height_org * num / den; + + // make height and width even to make chrome player happy + w += w % 2; + h += h % 2; + + *width_out = w; + *height_out = h; +} + +static void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & VP9_ALT_FLAG) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + +// Never refresh any reference frame buffers on top temporal layers in +// simulcast mode, which has interlayer prediction disabled. +static void non_reference_frame_simulcast(VP9_COMP *const cpi) { + if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 && + cpi->svc.temporal_layer_id > 0) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + } +} + +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering +// scheme. +static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { + int frame_num_within_temporal_struct = 0; + int spatial_id, temporal_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + frame_num_within_temporal_struct = + cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers] + .current_video_frame_in_layer % + 4; + temporal_id = cpi->svc.temporal_layer_id = + (frame_num_within_temporal_struct & 1) + ? 2 + : (frame_num_within_temporal_struct >> 1); + cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame = + cpi->ext_refresh_alt_ref_frame = 0; + if (!temporal_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[temporal_id].is_key_frame) { + // base layer is a key frame. + cpi->ref_frame_flags = VP9_LAST_FLAG; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 1; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else if (temporal_id == 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else { + if (frame_num_within_temporal_struct == 1) { + // the first tl2 picture + if (spatial_id == cpi->svc.number_spatial_layers - 1) { // top layer + cpi->ext_refresh_frame_flags_pending = 1; + if (!spatial_id) + cpi->ref_frame_flags = VP9_LAST_FLAG; + else + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } else if (!spatial_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (spatial_id < cpi->svc.number_spatial_layers - 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else { + // The second tl2 picture + if (spatial_id == cpi->svc.number_spatial_layers - 1) { // top layer + cpi->ext_refresh_frame_flags_pending = 1; + if (!spatial_id) + cpi->ref_frame_flags = VP9_LAST_FLAG; + else + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } else if (!spatial_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG; + cpi->ext_refresh_alt_ref_frame = 1; + } else { // top layer + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + } + if (temporal_id == 0) { + cpi->lst_fb_idx = spatial_id; + if (spatial_id) { + if (cpi->svc.layer_context[temporal_id].is_key_frame) { + cpi->lst_fb_idx = spatial_id - 1; + cpi->gld_fb_idx = spatial_id; + } else { + cpi->gld_fb_idx = spatial_id - 1; + } + } else { + cpi->gld_fb_idx = 0; + } + cpi->alt_fb_idx = 0; + } else if (temporal_id == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } else if (frame_num_within_temporal_struct == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } else { + cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); +} + +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 2 - that does 0-1-0-1 temporal layering +// scheme. +static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { + int spatial_id, temporal_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + temporal_id = cpi->svc.temporal_layer_id = + cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers] + .current_video_frame_in_layer & + 1; + cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame = + cpi->ext_refresh_alt_ref_frame = 0; + if (!temporal_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[temporal_id].is_key_frame) { + // base layer is a key frame. + cpi->ref_frame_flags = VP9_LAST_FLAG; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 1; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else if (temporal_id == 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } + + if (temporal_id == 0) { + cpi->lst_fb_idx = spatial_id; + if (spatial_id) { + if (cpi->svc.layer_context[temporal_id].is_key_frame) { + cpi->lst_fb_idx = spatial_id - 1; + cpi->gld_fb_idx = spatial_id; + } else { + cpi->gld_fb_idx = spatial_id - 1; + } + } else { + cpi->gld_fb_idx = 0; + } + cpi->alt_fb_idx = 0; + } else if (temporal_id == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); +} + +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 0 - that has no temporal layering. +static void set_flags_and_fb_idx_for_temporal_mode_noLayering( + VP9_COMP *const cpi) { + int spatial_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame = + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[0].is_key_frame) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 1; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + cpi->lst_fb_idx = spatial_id; + if (spatial_id) { + if (cpi->svc.layer_context[0].is_key_frame) { + cpi->lst_fb_idx = spatial_id - 1; + cpi->gld_fb_idx = spatial_id; + } else { + cpi->gld_fb_idx = spatial_id - 1; + } + } else { + cpi->gld_fb_idx = 0; + } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); +} + +static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( + VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; + cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG; + if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG; + if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG; +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the + // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + int ref; + for (ref = 0; ref < REF_FRAMES; ++ref) { + svc->update_buffer_slot[sl] &= ~(1 << ref); + if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) || + (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) || + (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame)) + svc->update_buffer_slot[sl] |= (1 << ref); + } + } + + // TODO(jianj): Remove these 3, deprecated. + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); +} + +int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { + int width = 0, height = 0; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + int scaling_factor_num = 1; + int scaling_factor_den = 1; + svc->skip_enhancement_layer = 0; + + if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF && + svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers <= 3) + svc->simulcast_mode = 1; + else + svc->simulcast_mode = 0; + + if (svc->number_spatial_layers > 1) { + svc->use_base_mv = 1; + svc->use_partition_reuse = 1; + } + svc->force_zero_mode_spatial_ref = 1; + svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride; + svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows; + svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols; + + // For constrained_from_above drop mode: before encoding superframe (i.e., + // at SL0 frame) check all spatial layers (starting from top) for possible + // drop, and if so, set a flag to force drop of that layer and all its lower + // layers. + if (svc->spatial_layer_to_encode == svc->first_spatial_layer_to_encode) { + int sl; + for (sl = 0; sl < svc->number_spatial_layers; sl++) + svc->force_drop_constrained_from_above[sl] = 0; + if (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP) { + for (sl = svc->number_spatial_layers - 1; + sl >= svc->first_spatial_layer_to_encode; sl--) { + int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; + if (vp9_test_drop(cpi)) { + int sl2; + // Set flag to force drop in encoding for this mode. + for (sl2 = sl; sl2 >= svc->first_spatial_layer_to_encode; sl2--) + svc->force_drop_constrained_from_above[sl2] = 1; + break; + } + } + } + } + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + set_flags_and_fb_idx_for_temporal_mode3(cpi); + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { + set_flags_and_fb_idx_for_temporal_mode2(cpi); + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi); + } + + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx) + svc->buffer_gf_temporal_ref[0].is_used = 1; + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx) + svc->buffer_gf_temporal_ref[1].is_used = 1; + + // For the fixed (non-flexible/bypass) SVC mode: + // If long term temporal reference is enabled at the sequence level + // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames), + // we can use golden as a second temporal reference + // (since the spatial/inter-layer reference is disabled). + // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is + // unused (slot 7 and 6 should be available for 3-3 layer system). + // For now usage of this second temporal reference will only be used for + // highest and next to highest spatial layer (i.e., top and middle layer for + // 3 spatial layers). + svc->use_gf_temporal_ref_current_layer = 0; + if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used && + !svc->buffer_gf_temporal_ref[1].is_used && + svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON && + svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 && + svc->spatial_layer_id >= svc->number_spatial_layers - 2) { + // Enable the second (long-term) temporal reference at the frame-level. + svc->use_gf_temporal_ref_current_layer = 1; + } + + // Check if current superframe has any layer sync, only check once on + // base layer. + if (svc->spatial_layer_id == 0) { + int sl = 0; + // Default is no sync. + svc->superframe_has_layer_sync = 0; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1; + } + } + + // Reset the drop flags for all spatial layers, on the + // first_spatial_layer_to_encode. + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + vp9_zero(svc->drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + // These are set by API before the superframe is encoded and they are + // passed to encoder layer by layer. Don't reset them on layer 0 in bypass + // mode. + vp9_zero(svc->update_buffer_slot); + vp9_zero(svc->reference_last); + vp9_zero(svc->reference_golden); + vp9_zero(svc->reference_altref); + // TODO(jianj): Remove these 3, deprecated. + vp9_zero(svc->update_last); + vp9_zero(svc->update_golden); + vp9_zero(svc->update_altref); + } + } + + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + + // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS, + // only for non-BYPASS mode for now. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->use_set_ref_frame_config) { + RATE_CONTROL *const lrc = &lc->rc; + lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); + lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } + } + + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && + svc->spatial_layer_id == svc->first_spatial_layer_to_encode && + cpi->resize_state != ORIG) { + scaling_factor_num = lc->scaling_factor_num_resize; + scaling_factor_den = lc->scaling_factor_den_resize; + } else { + scaling_factor_num = lc->scaling_factor_num; + scaling_factor_den = lc->scaling_factor_den; + } + + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, scaling_factor_num, + scaling_factor_den, &width, &height); + + // Use Eightap_smooth for low resolutions. + if (width * height <= 320 * 240) + svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; + // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel + // to source pixel). + if (scaling_factor_num > (3 * scaling_factor_den) >> 2) + svc->downsample_filter_phase[svc->spatial_layer_id] = 0; + + // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2. + // For now, turn off use of base motion vectors and partition reuse if the + // spatial scale factors for any layers are not 2, + // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. + // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. + if (svc->number_spatial_layers > 1) { + int sl; + for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) { + lc = &svc->layer_context[sl * svc->number_temporal_layers + + svc->temporal_layer_id]; + if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && + !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && + svc->number_spatial_layers == 3)) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + break; + } + } + // For non-zero spatial layers: if the previous spatial layer was dropped + // disable the base_mv and partition_reuse features. + if (svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + } + } + + svc->non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) + svc->non_reference_frame = 1; + // For flexible mode, where update_buffer_slot is used, need to check if + // all buffer slots are not refreshed. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) + svc->non_reference_frame = 0; + } + + if (svc->spatial_layer_id == 0) { + svc->high_source_sad_superframe = 0; + svc->high_num_blocks_with_motion = 0; + } + + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->last_layer_dropped[svc->spatial_layer_id] && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). + cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id]; + } + + if (vp9_set_size_literal(cpi, width, height) != 0) + return VPX_CODEC_INVALID_PARAM; + + return 0; +} + +struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = NULL; + if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { + buf = vp9_lookahead_peek(ctx, 0); + if (buf != NULL) { + // Only remove the buffer when pop the highest layer. + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + vp9_lookahead_pop(ctx, drain); + } + } + } + return buf; +} + +void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) { + int sl, tl; + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->map) vpx_free(lc->map); + if (lc->last_coded_q_map) vpx_free(lc->last_coded_q_map); + if (lc->consec_zero_mv) vpx_free(lc->consec_zero_mv); + } + } +} + +// Reset on key frame: reset counters, references and buffer updates. +void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) { + int sl, tl; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; + lc->current_video_frame_in_layer = 0; + if (is_key) lc->frames_from_key_frame = 0; + } + } + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + set_flags_and_fb_idx_for_temporal_mode3(cpi); + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { + set_flags_and_fb_idx_for_temporal_mode2(cpi); + } + vp9_update_temporal_layer_framerate(cpi); + vp9_restore_layer_context(cpi); +} + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + const int sl = svc->spatial_layer_id; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled + // on key frames or if any spatial layer is a sync layer. + if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + !svc->superframe_has_layer_sync) || + svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + svc->drop_spatial_layer[sl - 1]) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); + // Point golden/altref frame buffer index to last. + if (!svc->simulcast_mode) { + if (ref_frame == GOLDEN_FRAME) + cpi->gld_fb_idx = cpi->lst_fb_idx; + else if (ref_frame == ALTREF_FRAME) + cpi->alt_fb_idx = cpi->lst_fb_idx; + } + } + } + } + } + // For fixed/non-flexible SVC: check for disabling inter-layer prediction. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we disable + // inter-layer prediction. Only need to check when inter_layer prediction is + // not set to OFF mode. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. + // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. + int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int disable = 1; + if (fb_idx < 0) continue; + if ((fb_idx == svc->lst_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->gld_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->alt_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx)))) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the following constraint are expected, + // when inter-layer prediction is on (default). + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always temporal reference, GOLDEN is + // spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0 && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } else if (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For the usage of golden as second long term reference: the + // temporal_layer_id of that reference must be base temporal layer 0, and + // spatial_layer_id of that reference must be same as current + // spatial_layer_id. If not, disable feature. + // TODO(marpan): Investigate when this can happen, and maybe put this check + // and reset in a different place. + if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] != + svc->spatial_layer_id || + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0) + svc->use_gf_temporal_ref_current_layer = 0; + } +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(VP9_COMP *const cpi) { + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + return denoise_svc(cpi) && !lc->is_key_frame; +} +#endif + +void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Only for superframes whose base is not key, as those are + // already sync frames. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + if (svc->spatial_layer_id == 0) { + // On base spatial layer: if the current superframe has a layer sync then + // reset the pattern counters and reset to base temporal layer. + if (svc->superframe_has_layer_sync) + vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME); + } + // If the layer sync is set for this current spatial layer then + // disable the temporal reference. + if (svc->spatial_layer_id > 0 && + svc->spatial_layer_sync[svc->spatial_layer_id]) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + // If golden is used as second reference: need to remove it from + // prediction, reset refresh period to 0, and update the reference. + svc->use_gf_temporal_ref_current_layer = 0; + cpi->rc.baseline_gf_interval = 0; + cpi->rc.frames_till_gf_update_due = 0; + // On layer sync frame we must update the buffer index used for long + // term reference. Use the alt_ref since it is not used or updated on + // sync frames. + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + } +} + +void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int i = 0; + // Update the usage of frame buffer index for base spatial layers. + if (svc->spatial_layer_id == 0) { + if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) + svc->fb_idx_base[cpi->lst_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame) + svc->fb_idx_base[cpi->gld_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) + svc->fb_idx_base[cpi->alt_fb_idx] = 1; + // For bypass/flexible mode: check for refresh slots. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + for (i = 0; i < REF_FRAMES; ++i) + if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1; + } + } +} + +static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) { + // For non-flexible/bypass SVC mode: check for refreshing other buffer + // slots. + SVC *const svc = &cpi->svc; + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < REF_FRAMES; i++) { + if ((cm->frame_type == KEY_FRAME && !svc->simulcast_mode) || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + } + } +} + +void vp9_svc_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BufferPool *const pool = cm->buffer_pool; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + vp9_svc_update_ref_frame_bypass_mode(cpi); + } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) { + // Keep track of frame index for each reference frame. + int i; + // On key frame update all reference frame slots. + for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + // LAST/GOLDEN/ALTREF is already updated above. + if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } + } else { + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } + } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); + vp9_svc_update_ref_frame_buffer_idx(cpi); +} + +void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) { + int64_t this_duration = + cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id]; + vp9_new_framerate(cpi, 10000000.0 / this_duration); +} + +void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + RATE_CONTROL *const rc = &cpi->rc; + // On key frames in CBR mode: reset the avg_frame_index for base layer + // (to level closer to worst_quality) if the overshoot is significant. + // Reset it for all temporal layers on base spatial layer. + if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + !svc->simulcast_mode && + rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) { + int tl; + rc->avg_frame_qindex[INTER_FRAME] = + VPXMAX(rc->avg_frame_qindex[INTER_FRAME], + (cm->base_qindex + rc->worst_quality) >> 1); + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME]; + } + } +} + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. +int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h new file mode 100644 index 0000000000..388a02789d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ + +#include "vpx/vpx_encoder.h" + +#include "vp9/encoder/vp9_ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +typedef struct BUFFER_LONGTERM_REF { + int idx; + int is_used; +} BUFFER_LONGTERM_REF; + +typedef struct { + RATE_CONTROL rc; + int target_bandwidth; + int spatial_layer_target_bandwidth; // Target for the spatial layer. + double framerate; + int avg_frame_size; + int max_q; + int min_q; + int scaling_factor_num; + int scaling_factor_den; + // Scaling factors used for internal resize scaling for single layer SVC. + int scaling_factor_num_resize; + int scaling_factor_den_resize; + TWO_PASS twopass; + vpx_fixed_buf_t rc_twopass_stats_in; + unsigned int current_video_frame_in_layer; + int is_key_frame; + int frames_from_key_frame; + FRAME_TYPE last_frame_type; + struct lookahead_entry *alt_ref_source; + int alt_ref_idx; + int gold_ref_idx; + int has_alt_frame; + size_t layer_size; + // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct. + int sb_index; + signed char *map; + uint8_t *last_coded_q_map; + uint8_t *consec_zero_mv; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; + int qindex_delta[3]; + uint8_t speed; + int loopfilter_ctrl; + int frame_qp; + int MBs; +} LAYER_CONTEXT; + +typedef struct SVC { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + + int spatial_layer_to_encode; + + // Workaround for multiple frame contexts + enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state; + struct lookahead_entry empty_frame; + int encode_intra_empty_frame; + + // Store scaled source frames to be used for temporal filter to generate + // a alt ref frame. + YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS]; + // Temp buffer used for 2-stage down-sampling, for real-time mode. + YV12_BUFFER_CONFIG scaled_temp; + int scaled_one_half; + int scaled_temp_is_alloc; + + // Layer context used for rate control in one pass temporal CBR mode or + // two pass spatial mode. + LAYER_CONTEXT layer_context[VPX_MAX_LAYERS]; + // Indicates what sort of temporal layering is used. + // Currently, this only works for CBR mode. + VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + // Frame flags and buffer indexes for each spatial layer, set by the + // application (external settings). + int ext_frame_flags[VPX_MAX_LAYERS]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; + int force_zero_mode_spatial_ref; + // Sequence level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref; + // Frame level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref_current_layer; + // Allow second reference for at most 2 top highest resolution layers. + BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2]; + int current_superframe; + int non_reference_frame; + int use_base_mv; + int use_partition_reuse; + // Used to control the downscaling filter for source scaling, for 1 pass CBR. + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter. + // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + int mi_rows[VPX_MAX_LAYERS]; + int mi_cols[VPX_MAX_LAYERS]; + + int first_layer_denoise; + + int skip_enhancement_layer; + + int lower_layer_qindex; + + int last_layer_dropped[VPX_MAX_LAYERS]; + int drop_spatial_layer[VPX_MAX_LAYERS]; + int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int force_drop_constrained_from_above[VPX_MAX_LAYERS]; + int max_consec_drop; + SVC_LAYER_DROP_MODE framedrop_mode; + + INTER_LAYER_PRED disable_inter_layer_pred; + + // Flag to indicate scene change and high num of motion blocks at current + // superframe, scene detection is currently checked for each superframe prior + // to encoding, on the full resolution source. + int high_source_sad_superframe; + int high_num_blocks_with_motion; + + // Flags used to get SVC pattern info. + int update_buffer_slot[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + // TODO(jianj): Remove these last 3, deprecated. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. + uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; + + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + // Quantizer for each spatial layer. + int base_qindex[VPX_SS_MAX_LAYERS]; + uint8_t set_intra_only_frame; + uint8_t previous_frame_is_intra_only; + uint8_t superframe_has_layer_sync; + + uint8_t fb_idx_base[REF_FRAMES]; + + int use_set_ref_frame_config; + + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; + + int first_spatial_layer_to_encode; + + // Parameters for allowing framerate per spatial layer, and buffer + // update based on timestamps. + int64_t duration[VPX_SS_MAX_LAYERS]; + int64_t timebase_fac; + int64_t time_stamp_superframe; + int64_t time_stamp_prev[VPX_SS_MAX_LAYERS]; + + int num_encoded_top_layer; + + // Every spatial layer on a superframe whose base is key is key too. + int simulcast_mode; + + // Flag to indicate SVC is dynamically switched to a single layer. + int single_layer_svc; + int resize_set; +} SVC; + +struct VP9_COMP; + +// Initialize layer context data from init_config(). +void vp9_init_layer_context(struct VP9_COMP *const cpi); + +// Update the layer context from a change_config() call. +void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi, + const int target_bandwidth); + +// Prior to encoding the frame, update framerate-related quantities +// for the current temporal layer. +void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi); + +// Update framerate-related quantities for the current spatial layer. +void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi, + double framerate); + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +void vp9_restore_layer_context(struct VP9_COMP *const cpi); + +// Save the layer context after encoding the frame. +void vp9_save_layer_context(struct VP9_COMP *const cpi); + +// Initialize second pass rc for spatial svc. +void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); + +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + +// Increment number of video frames in layer +void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); + +// Check if current layer is key frame in spatial upper layer +int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi); + +// Get the next source buffer to encode +struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain); + +// Start a frame and initialize svc parameters +int vp9_svc_start_frame(struct VP9_COMP *const cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); +#endif + +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + +int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi); + +void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); + +void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key); + +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + +void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c new file mode 100644 index 0000000000..986553a4a8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -0,0 +1,1205 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/vpx_scale.h" + +static int fixed_divide[512]; +static unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t highbd_index_mult[14] = { 0U, 0U, 0U, + 0U, 3221225472U, 2576980378U, + 2147483648U, 1840700270U, 1610612736U, + 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void temporal_filter_predictors_mb_c( + MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, + int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, + uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs, + int use_32x32) { + const int which_mv = 0; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP]; + int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); + + enum mv_precision mv_precision_uv; + int uv_stride; + if (uv_block_width == (BW >> 1)) { + uv_stride = (stride + 1) >> 1; + mv_precision_uv = MV_PRECISION_Q4; + } else { + uv_stride = stride; + mv_precision_uv = MV_PRECISION_Q3; + } +#if !CONFIG_VP9_HIGHBITDEPTH + (void)xd; +#endif + + if (use_32x32) { + const MV mv = { mv_row, mv_col }; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), BW, &mv, + scale, BW, BH, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, + y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv, + scale, uv_block_width, uv_block_height, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, which_mv, kernel, MV_PRECISION_Q3, x, y); + + vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[BLK_PELS], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + return; + } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. + // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride, + CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd); + } else { + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); + } +#else + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } + + // U and V predictors + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; + + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y, + xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]), + uv_block_width, &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + } else { + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); + } +#else + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, &mv, + scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } +} + +void vp9_temporal_filter_init(void) { + int i; + + fixed_divide[0] = 0; + for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; +} + +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + mod = + ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int highbd_mod_index(int sum_dist, int index, int rounding, + int strength, int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(highbd_index_mult[index] != 0); + + mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, + const int *const blk_fw, int use_32x32) { + // blk_fw[0] ~ blk_fw[3] are the same. + if (use_32x32) { + return blk_fw[0]; + } + + if (i < block_height / 2) { + if (j < block_width / 2) { + return blk_fw[0]; + } + + return blk_fw[1]; + } + + if (j < block_width / 2) { + return blk_fw[2]; + } + + return blk_fw[3]; +} + +void vp9_apply_temporal_filter_c( + const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, + int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, + int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, + int uv_buf_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { + unsigned int i, j, k, m; + int modifier; + const int rounding = (1 << strength) >> 1; + const unsigned int uv_block_width = block_width >> ss_x; + const unsigned int uv_block_height = block_height >> ss_y; + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); + + int idx = 0, idy; + + assert(strength >= 0); + assert(strength <= 6); + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + + // Calculate diff^2 for each pixel of the 16x16 block. + // TODO(yunqing): the following code needs to be optimized. + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int16_t diff = + y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j]; + y_diff_sse[idx++] = diff * diff; + } + } + idx = 0; + for (i = 0; i < uv_block_height; i++) { + for (j = 0; j < uv_block_width; j++) { + const int16_t diffu = + u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j]; + const int16_t diffv = + v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j]; + u_diff_sse[idx] = diffu * diffu; + v_diff_sse[idx] = diffv * diffv; + idx++; + } + } + + for (i = 0, k = 0, m = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = y_pred[i * y_buf_stride + j]; + const int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); + + // non-local mean approach + int y_index = 0; + + const int uv_r = i >> ss_y; + const int uv_c = j >> ss_x; + modifier = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = (int)i + idy; + const int col = (int)j + idx; + + if (row >= 0 && row < (int)block_height && col >= 0 && + col < (int)block_width) { + modifier += y_diff_sse[row * (int)block_width + col]; + ++y_index; + } + } + } + + assert(y_index > 0); + + modifier += u_diff_sse[uv_r * uv_block_width + uv_c]; + modifier += v_diff_sse[uv_r * uv_block_width + uv_c]; + + y_index += 2; + + modifier = + mod_index(modifier, y_index, rounding, strength, filter_weight); + + y_count[k] += modifier; + y_accumulator[k] += modifier * pixel_value; + + ++k; + + // Process chroma component + if (!(i & ss_y) && !(j & ss_x)) { + const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c]; + const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c]; + + // non-local mean approach + int cr_index = 0; + int u_mod = 0, v_mod = 0; + int y_diff = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = uv_r + idy; + const int col = uv_c + idx; + + if (row >= 0 && row < (int)uv_block_height && col >= 0 && + col < (int)uv_block_width) { + u_mod += u_diff_sse[row * uv_block_width + col]; + v_mod += v_diff_sse[row * uv_block_width + col]; + ++cr_index; + } + } + } + + assert(cr_index > 0); + + for (idy = 0; idy < 1 + ss_y; ++idy) { + for (idx = 0; idx < 1 + ss_x; ++idx) { + const int row = (uv_r << ss_y) + idy; + const int col = (uv_c << ss_x) + idx; + y_diff += y_diff_sse[row * (int)block_width + col]; + ++cr_index; + } + } + + u_mod += y_diff; + v_mod += y_diff; + + u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); + v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); + + u_count[m] += u_mod; + u_accumulator[m] += u_mod * u_pixel_value; + v_count[m] += v_mod; + v_accumulator[m] += v_mod * v_pixel_value; + + ++m; + } // Complete YUV pixel + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_apply_temporal_filter_c( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x; + const int uv_block_height = block_height >> ss_y; + const int y_diff_stride = BW; + const int uv_diff_stride = BW; + + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); + + const int rounding = (1 << strength) >> 1; + + // Loop variables + int row, col; + int uv_row, uv_col; + int row_step, col_step; + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + + // Get the square diffs + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_diff_sse[row * y_diff_stride + col] = diff * diff; + } + } + + for (row = 0; row < uv_block_height; row++) { + for (col = 0; col < uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int filter_weight = get_filter_weight( + row, col, block_height, block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; + y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * block_width + col] += y_mod; + y_accum[row * block_width + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (uv_row = 0; uv_row < uv_block_height; uv_row++) { + for (uv_col = 0; uv_col < uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = get_filter_weight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (row_step = 0; row_step < 1 + ss_y; row_step++) { + for (col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * uv_block_width + uv_col] += u_mod; + u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel; + v_count[uv_row * uv_block_width + uv_col] += v_mod; + v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static uint32_t temporal_filter_find_matching_mb_c( + VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs, + int *blk_bestsme) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = MESH; + const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + // Save input state + struct buf_2d src = x->plane[0].src; + struct buf_2d pre = xd->plane[0].pre[0]; + int i, j, k = 0; + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, ref_mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // ref_mv. + bestsme = cpi->find_fractional_mv_step( + x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW, + BH, USE_8_TAPS_SHARP); + + // DO motion search on 4 16x16 sub_blocks. + best_ref_mv1.row = ref_mv->row; + best_ref_mv1.col = ref_mv->col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, search_method_16, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, + &blk_mvs[k], 0, 0); + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0, + mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP); + k++; + } + } + + // Restore input state + x->plane[0].src = src; + xd->plane[0].pre[0] = pre; + + return bestsme; +} + +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end) { + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int frame_count = arnr_filter_data->frame_count; + int alt_ref_index = arnr_filter_data->alt_ref_index; + int strength = arnr_filter_data->strength; + struct scale_factors *scale = &arnr_filter_data->sf; + int byte; + int frame; + int mb_col; + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; + DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); + MACROBLOCKD *mbd = &td->mb.e_mbd; + YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; + uint8_t *dst1, *dst2; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]); +#endif + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; + // Addition of the tile col level offsets + int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start; + int mb_uv_offset = + mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + predictor = CONVERT_TO_BYTEPTR(predictor16); + } else { + predictor = predictor8; + } +#endif + + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) + // A 6/8 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the + // Y and therefore only extended by 8. The largest mv that a UV block + // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. + // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than + // 8 - VP9_INTERP_EXTEND. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). + td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND); + + for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { + int i, j, k; + int stride; + MV ref_mv; + + vp9_zero_array(accumulator, BLK_PELS * 3); + vp9_zero_array(count, BLK_PELS * 3); + + td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND); + + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; + + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd); + } else { + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); + } +#else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) { + strength = VPXMAX(0, arnr_filter_data->strength - 2); + } + } + + for (frame = 0; frame < frame_count; frame++) { + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. + int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; + + if (frames[frame] == NULL) continue; + + ref_mv.row = 0; + ref_mv.col = 0; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; + + if (frame == alt_ref_index) { + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; + } else { + const int thresh_low = 10000; + const int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + // Find best match in this frame by MC + int err = temporal_filter_find_matching_mb_c( + cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + &ref_mv, blk_mvs, blk_bestsme); + + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } + + if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 + : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low ? 2 + : blk_bestsme[k] < thresh_high ? 1 + : 0; + } + + for (k = 0; k < 4; k++) { + switch (abs(frame - alt_ref_index)) { + case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break; + case 2: + case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break; + default: break; + } + } + } + + if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, + mb_col * BW, mb_row * BH, blk_mvs, use_32x32); + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + vp9_highbd_apply_temporal_filter( + CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride, + CONVERT_TO_SHORTPTR(predictor), BW, + CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset), + CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride, + CONVERT_TO_SHORTPTR(predictor + BLK_PELS), + CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW, + BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y, + adj_strength, blk_fw, use_32x32, accumulator, count, + accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); + } else { + // Apply the filter (YUV) + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); + } +#else + // Apply the filter (YUV) + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - BW; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + BLK_PELS; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1_16[byte] = (uint16_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - mb_uv_width; + } + } else { + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - BW; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + BLK_PELS; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } + } +#else + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - BW; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + BLK_PELS; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + mb_y_offset += BW; + mb_uv_offset += mb_uv_width; + } +} + +static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT; + const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT; + const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT; + const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT; + int mb_row; + + for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { + vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start, + mb_col_end); + } +} + +static void temporal_filter_iterate_c(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_row, tile_col; + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); + } + } +} + +// Apply buffer limits and context specific adjustments to arnr filter. +static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, + int *arnr_frames, int *arnr_strength) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const int frames_after_arf = + vp9_lookahead_depth(cpi->lookahead) - distance - 1; + int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd; + int q, frames, base_strength, strength; + + // Context dependent two pass adjustment to strength. + if (oxcf->pass == 2) { + base_strength = oxcf->arnr_strength + cpi->twopass.arnr_strength_adjustment; + // Clip to allowed range. + base_strength = VPXMIN(6, VPXMAX(0, base_strength)); + } else { + base_strength = oxcf->arnr_strength; + } + + // Define the forward and backwards filter limits for this arnr group. + if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; + if (frames_fwd > distance) frames_fwd = distance; + + frames_bwd = frames_fwd; + + // For even length filter there is one more frame backward + // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1; + + // Set the baseline active filter size. + frames = frames_bwd + 1 + frames_fwd; + + // Adjust the strength based on active max q. + if (cpi->common.current_video_frame > 1) + q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME], + cpi->common.bit_depth)); + else + q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME], + cpi->common.bit_depth)); + if (q > 16) { + strength = base_strength; + } else { + strength = base_strength - ((16 - q) / 2); + if (strength < 0) strength = 0; + } + + // Adjust number of frames in filter and strength based on gf boost level. + if (frames > group_boost / 150) { + frames = group_boost / 150; + frames += !(frames & 1); + } + + if (strength > group_boost / 300) { + strength = group_boost / 300; + } + + // Adjustments for second level arf in multi arf case. + // Leave commented out place holder for possible filtering adjustment with + // new multi-layer arf code. + // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) + // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1; + + // TODO(jingning): Skip temporal filtering for intermediate frames that will + // be used as show_existing_frame. Need to further explore the possibility to + // apply certain filter. + if (gf_group->arf_src_offset[gf_group->index] < + cpi->rc.baseline_gf_interval - 1) + frames = 1; + + *arnr_frames = frames; + *arnr_strength = strength; +} + +void vp9_temporal_filter(VP9_COMP *cpi, int distance) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + int frame; + int frames_to_blur; + int start_frame; + int strength; + int frames_to_blur_backward; + int frames_to_blur_forward; + struct scale_factors *sf = &arnr_filter_data->sf; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int rdmult; + + // Apply context specific adjustments to the arnr filter parameters. + adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); + frames_to_blur_backward = (frames_to_blur / 2); + frames_to_blur_forward = ((frames_to_blur - 1) / 2); + start_frame = distance + frames_to_blur_forward; + + arnr_filter_data->strength = strength; + arnr_filter_data->frame_count = frames_to_blur; + arnr_filter_data->alt_ref_index = frames_to_blur_backward; + + // Setup frame pointers, NULL indicates frame not included in filter. + for (frame = 0; frame < frames_to_blur; ++frame) { + const int which_buffer = start_frame - frame; + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; + } + + if (frames_to_blur > 0) { + // Setup scaling factors. Scaling on each of the arnr frames is not + // supported. + if (cpi->use_svc) { + // In spatial svc the scaling factors might be less then 1/2. + // So we will use non-normative scaling. + int frame_used = 0; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + sf, get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame( + sf, get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (frame = 0; frame < frames_to_blur; ++frame) { + if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || + cm->mi_rows * MI_SIZE != frames[frame]->y_height) { + if (vpx_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], + cm->width, cm->height, cm->subsampling_x, + cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); + } + frames[frame] = vp9_scale_if_required( + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); + ++frame_used; + } + } + cm->mi = cm->mip + cm->mi_stride + 1; + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + } else { +// ARF is produced at the native frame size and resized when coded. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + frames[0]->y_crop_width, frames[0]->y_crop_height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame( + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + frames[0]->y_crop_width, frames[0]->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + // Initialize errorperbit and sabperbit. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); + + if (!cpi->row_mt) + temporal_filter_iterate_c(cpi); + else + vp9_temporal_filter_row_mt(cpi); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h new file mode 100644 index 0000000000..553a468280 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARNR_FILT_QINDEX 128 +static const MV kZeroMv = { 0, 0 }; + +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS ((BH) * (BW)) // Pixels in the block +#define TF_SHIFT 2 +#define TF_ROUND 3 +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 + +void vp9_temporal_filter_init(void); +void vp9_temporal_filter(VP9_COMP *cpi, int distance); + +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h new file mode 100644 index 0000000000..8776dfc068 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#include "./vpx_config.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32758 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define DIST_STRIDE ((BW) + 2) + +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c new file mode 100644 index 0000000000..6c6c04493f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_scan.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tokenize.h" + +static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { + { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 }, + { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 }, + { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 }, + { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 }, + { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 }, + { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 }, + { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 }, + { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 }, + { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 }, + { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, + { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 }, + { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 }, + { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 }, + { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 }, + { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 }, + { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 }, + { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 }, + { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 }, + { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 } +}; +const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = + dct_cat_lt_10_value_tokens + + (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) / + 2; +// The corresponding costs of the extrabits for the tokens in the above table +// are stored in the table below. The values are obtained from looking up the +// entry for the specified extrabits in the table corresponding to the token +// (as defined in cost element vp9_extra_bits) +// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1] +static const int dct_cat_lt_10_value_cost[] = { + 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282, + 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772, + 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742, + 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195, + 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512, + 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652, + 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476, + 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622, + 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136, + 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681, + 3704, 3750, 3773, +}; +const int *vp9_dct_cat_lt_10_value_cost = + dct_cat_lt_10_value_cost + + (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2; + +// Array indices are identical to previously-existing CONTEXT_NODE indices +/* clang-format off */ +const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + -EOB_TOKEN, 2, // 0 = EOB + -ZERO_TOKEN, 4, // 1 = ZERO + -ONE_TOKEN, 6, // 2 = ONE + 8, 12, // 3 = LOW_VAL + -TWO_TOKEN, 10, // 4 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE + 14, 16, // 6 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE + 18, 20, // 8 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE +}; +/* clang-format on */ + +static const int16_t zero_cost[] = { 0 }; +static const int16_t sign_cost[1] = { 512 }; +static const int16_t cat1_cost[1 << 1] = { 864, 1229 }; +static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 }; +static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023, + 2195, 2334, 2427, 2566 }; +static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476, + 2534, 2615, 2661, 2742, 2800, 2881, + 2977, 3058, 3116, 3197 }; +static const int16_t cat5_cost[1 << 5] = { + 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, + 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363, + 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773 +}; +const int16_t vp9_cat6_low_cost[256] = { + 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574, + 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822, + 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053, + 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301, + 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253, + 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461, + 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708, + 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, + 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, + 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, + 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, + 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455, + 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722, + 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945, + 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886, + 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094, + 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352, + 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609, + 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, + 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 +}; +const uint16_t vp9_cat6_high_cost[64] = { + 88, 2251, 2727, 4890, 3148, 5311, 5787, 7950, 3666, 5829, 6305, + 8468, 6726, 8889, 9365, 11528, 3666, 5829, 6305, 8468, 6726, 8889, + 9365, 11528, 7244, 9407, 9883, 12046, 10304, 12467, 12943, 15106, 3666, + 5829, 6305, 8468, 6726, 8889, 9365, 11528, 7244, 9407, 9883, 12046, + 10304, 12467, 12943, 15106, 7244, 9407, 9883, 12046, 10304, 12467, 12943, + 15106, 10822, 12985, 13461, 15624, 13882, 16045, 16521, 18684 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +const uint16_t vp9_cat6_high10_high_cost[256] = { + 94, 2257, 2733, 4896, 3154, 5317, 5793, 7956, 3672, 5835, 6311, + 8474, 6732, 8895, 9371, 11534, 3672, 5835, 6311, 8474, 6732, 8895, + 9371, 11534, 7250, 9413, 9889, 12052, 10310, 12473, 12949, 15112, 3672, + 5835, 6311, 8474, 6732, 8895, 9371, 11534, 7250, 9413, 9889, 12052, + 10310, 12473, 12949, 15112, 7250, 9413, 9889, 12052, 10310, 12473, 12949, + 15112, 10828, 12991, 13467, 15630, 13888, 16051, 16527, 18690, 4187, 6350, + 6826, 8989, 7247, 9410, 9886, 12049, 7765, 9928, 10404, 12567, 10825, + 12988, 13464, 15627, 7765, 9928, 10404, 12567, 10825, 12988, 13464, 15627, + 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 7765, 9928, 10404, + 12567, 10825, 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, + 17042, 19205, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, + 17084, 17560, 19723, 17981, 20144, 20620, 22783, 4187, 6350, 6826, 8989, + 7247, 9410, 9886, 12049, 7765, 9928, 10404, 12567, 10825, 12988, 13464, + 15627, 7765, 9928, 10404, 12567, 10825, 12988, 13464, 15627, 11343, 13506, + 13982, 16145, 14403, 16566, 17042, 19205, 7765, 9928, 10404, 12567, 10825, + 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, + 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, 17084, 17560, + 19723, 17981, 20144, 20620, 22783, 8280, 10443, 10919, 13082, 11340, 13503, + 13979, 16142, 11858, 14021, 14497, 16660, 14918, 17081, 17557, 19720, 11858, + 14021, 14497, 16660, 14918, 17081, 17557, 19720, 15436, 17599, 18075, 20238, + 18496, 20659, 21135, 23298, 11858, 14021, 14497, 16660, 14918, 17081, 17557, + 19720, 15436, 17599, 18075, 20238, 18496, 20659, 21135, 23298, 15436, 17599, + 18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074, + 24237, 24713, 26876 +}; +const uint16_t vp9_cat6_high12_high_cost[1024] = { + 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, + 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, + 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, + 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, + 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955, + 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, 4193, 6356, + 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, + 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, + 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, + 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, + 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, + 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356, 6832, 8995, + 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, + 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, + 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, + 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, + 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, + 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, + 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, + 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, + 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, + 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, + 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, + 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, + 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, + 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, + 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, + 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, + 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, + 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, + 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, + 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, + 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, + 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, + 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 8286, + 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, + 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, + 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, + 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, + 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, + 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, + 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, + 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, + 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, + 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, + 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, + 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832, 8995, 7253, + 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, + 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, + 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, + 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, + 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, + 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, + 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, + 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, + 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, + 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, + 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, + 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, + 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, + 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, + 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, + 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, + 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, + 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, + 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, + 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, + 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, + 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, + 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 8286, 10449, + 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, + 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, + 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, + 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, + 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, + 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, 17181, + 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, + 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, + 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, + 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, + 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, + 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, 15439, 17602, + 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, + 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, + 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, + 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, + 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, + 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, 22171, 24334, + 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, 22213, 22689, + 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, 26688, 28851, + 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, + 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, 26267, 28430, + 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, 32429, 32905, + 35068 +}; +#endif + +const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = { + { 0, 0, 0, zero_cost }, // ZERO_TOKEN + { 0, 0, 1, sign_cost }, // ONE_TOKEN + { 0, 0, 2, sign_cost }, // TWO_TOKEN + { 0, 0, 3, sign_cost }, // THREE_TOKEN + { 0, 0, 4, sign_cost }, // FOUR_TOKEN + { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN + { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN + { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN + { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN + { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN + { vp9_cat6_prob, 14, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN + { 0, 0, 0, zero_cost } // EOB_TOKEN +}; + +#if CONFIG_VP9_HIGHBITDEPTH +const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = { + { 0, 0, 0, zero_cost }, // ZERO + { 0, 0, 1, sign_cost }, // ONE + { 0, 0, 2, sign_cost }, // TWO + { 0, 0, 3, sign_cost }, // THREE + { 0, 0, 4, sign_cost }, // FOUR + { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CAT1 + { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CAT2 + { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CAT3 + { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CAT4 + { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CAT5 + { vp9_cat6_prob_high12 + 2, 16, CAT6_MIN_VAL, 0 }, // CAT6 + { 0, 0, 0, zero_cost } // EOB +}; +const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = { + { 0, 0, 0, zero_cost }, // ZERO + { 0, 0, 1, sign_cost }, // ONE + { 0, 0, 2, sign_cost }, // TWO + { 0, 0, 3, sign_cost }, // THREE + { 0, 0, 4, sign_cost }, // FOUR + { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CAT1 + { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CAT2 + { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CAT3 + { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CAT4 + { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CAT5 + { vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0 }, // CAT6 + { 0, 0, 0, zero_cost } // EOB +}; +#endif + +const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = { + { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 }, + { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 } +}; + +struct tokenize_b_args { + VP9_COMP *cpi; + ThreadData *td; + TOKENEXTRA **tp; +}; + +static void set_entropy_context_b(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct tokenize_b_args *const args = arg; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, col, row); +} + +static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree, + int16_t token, EXTRABIT extra, + unsigned int *counts) { + (*t)->context_tree = context_tree; + (*t)->token = token; + (*t)->extra = extra; + (*t)++; + ++counts[token]; +} + +static INLINE void add_token_no_extra(TOKENEXTRA **t, + const vpx_prob *context_tree, + int16_t token, unsigned int *counts) { + (*t)->context_tree = context_tree; + (*t)->token = token; + (*t)++; + ++counts[token]; +} + +static void tokenize_b(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + VP9_COMP *cpi = args->cpi; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TOKENEXTRA **tp = args->tp; + uint8_t token_cache[32 * 32]; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + MODE_INFO *mi = xd->mi[0]; + int pt; /* near block/prev token context index */ + int c; + TOKENEXTRA *t = *tp; /* store tokens starting here */ + int eob = p->eobs[block]; + const PLANE_TYPE type = get_plane_type(plane); + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const int16_t *scan, *nb; + const ScanOrder *so; + const int ref = is_inter_block(mi); + unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = + td->rd_counts.coef_counts[tx_size][type][ref]; + vpx_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + cpi->common.fc->coef_probs[tx_size][type][ref]; + unsigned int(*const eob_branch)[COEFF_CONTEXTS] = + td->counts->eob_branch[tx_size][type][ref]; + const uint8_t *const band = get_band_translate(tx_size); + const int tx_eob = 16 << (tx_size << 1); + int16_t token; + EXTRABIT extra; + pt = get_entropy_context(tx_size, pd->above_context + col, + pd->left_context + row); + so = get_scan(xd, tx_size, type, block); + scan = so->scan; + nb = so->neighbors; + c = 0; + + while (c < eob) { + int v = 0; + v = qcoeff[scan[c]]; + ++eob_branch[band[c]][pt]; + + while (!v) { + add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, + counts[band[c]][pt]); + + token_cache[scan[c]] = 0; + ++c; + pt = get_coef_context(nb, token_cache, c); + v = qcoeff[scan[c]]; + } + + vp9_get_token_extra(v, &token, &extra); + + add_token(&t, coef_probs[band[c]][pt], token, extra, counts[band[c]][pt]); + + token_cache[scan[c]] = vp9_pt_energy_class[token]; + ++c; + pt = get_coef_context(nb, token_cache, c); + } + if (c < tx_eob) { + ++eob_branch[band[c]][pt]; + add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, + counts[band[c]][pt]); + } + + *tp = t; + + vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, col, row); +} + +struct is_skippable_args { + uint16_t *eobs; + int *skippable; +}; + +static void is_skippable(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { + struct is_skippable_args *args = argv; + (void)plane; + (void)plane_bsize; + (void)tx_size; + (void)row; + (void)col; + args->skippable[0] &= (!args->eobs[block]); +} + +// TODO(yaowu): rewrite and optimize this function to remove the usage of +// vp9_foreach_transform_block() and simplify is_skippable(). +int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + int result = 1; + struct is_skippable_args args = { x->plane[plane].eobs, &result }; + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, + &args); + return result; +} + +static void has_high_freq_coeff(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *argv) { + struct is_skippable_args *args = argv; + int eobs = (tx_size == TX_4X4) ? 3 : 10; + (void)plane; + (void)plane_bsize; + (void)row; + (void)col; + *(args->skippable) |= (args->eobs[block] > eobs); +} + +int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + int result = 0; + struct is_skippable_args args = { x->plane[plane].eobs, &result }; + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, + has_high_freq_coeff, &args); + return result; +} + +void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run, + int seg_skip, BLOCK_SIZE bsize) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const int ctx = vp9_get_skip_context(xd); + struct tokenize_b_args arg = { cpi, td, t }; + + if (seg_skip) { + assert(mi->skip); + } + + if (mi->skip) { + if (!dry_run && !seg_skip) ++td->counts->skip[ctx][1]; + reset_skip_context(xd, bsize); + return; + } + + if (!dry_run) { + ++td->counts->skip[ctx][0]; + vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg); + } else { + vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h new file mode 100644 index 0000000000..6407ff9237 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_ +#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_ + +#include "vp9/common/vp9_entropy.h" + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_treewriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOSB_TOKEN 127 // Not signalled, encoder only + +#if CONFIG_VP9_HIGHBITDEPTH +typedef int32_t EXTRABIT; +#else +typedef int16_t EXTRABIT; +#endif + +typedef struct { + int16_t token; + EXTRABIT extra; +} TOKENVALUE; + +typedef struct { + const vpx_prob *context_tree; + int16_t token; + EXTRABIT extra; +} TOKENEXTRA; + +extern const vpx_tree_index vp9_coef_tree[]; +extern const vpx_tree_index vp9_coef_con_tree[]; +extern const struct vp9_token vp9_coef_encodings[]; + +int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); +int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); + +struct VP9_COMP; +struct ThreadData; + +void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td, + TOKENEXTRA **t, int dry_run, int seg_skip, + BLOCK_SIZE bsize); + +typedef struct { + const vpx_prob *prob; + int len; + int base_val; + const int16_t *cost; +} vp9_extra_bit; + +// indexed by token value +extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS]; +#if CONFIG_VP9_HIGHBITDEPTH +extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS]; +extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS]; +#endif // CONFIG_VP9_HIGHBITDEPTH + +extern const int16_t *vp9_dct_value_cost_ptr; +/* TODO: The Token field should be broken out into a separate char array to + * improve cache locality, since it's needed for costing when the rest of the + * fields are not. + */ +extern const TOKENVALUE *vp9_dct_value_tokens_ptr; +extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; +extern const int *vp9_dct_cat_lt_10_value_cost; +extern const int16_t vp9_cat6_low_cost[256]; +extern const uint16_t vp9_cat6_high_cost[64]; +extern const uint16_t vp9_cat6_high10_high_cost[256]; +extern const uint16_t vp9_cat6_high12_high_cost[1024]; + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { + return bit_depth == 8 ? vp9_cat6_high_cost + : (bit_depth == 10 ? vp9_cat6_high10_high_cost + : vp9_cat6_high12_high_cost); +} +#else +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { + (void)bit_depth; + return vp9_cat6_high_cost; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { + *token = CATEGORY6_TOKEN; + if (v >= CAT6_MIN_VAL) + *extra = 2 * v - 2 * CAT6_MIN_VAL; + else + *extra = -2 * v - 2 * CAT6_MIN_VAL + 1; + return; + } + *token = vp9_dct_cat_lt_10_value_tokens[v].token; + *extra = vp9_dct_cat_lt_10_value_tokens[v].extra; +} +static INLINE int16_t vp9_get_token(int v) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10; + return vp9_dct_cat_lt_10_value_tokens[v].token; +} + +static INLINE int vp9_get_token_cost(int v, int16_t *token, + const uint16_t *cat6_high_table) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { + EXTRABIT extrabits; + *token = CATEGORY6_TOKEN; + extrabits = abs(v) - CAT6_MIN_VAL; + return vp9_cat6_low_cost[extrabits & 0xff] + + cat6_high_table[extrabits >> 8]; + } + *token = vp9_dct_cat_lt_10_value_tokens[v].token; + return vp9_dct_cat_lt_10_value_cost[v]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 0000000000..b8910370e0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1541 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_codec.h" + +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } + + return extend_frame_count; +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].height; + const int mi_cols = tpl_stats[frame_idx].width; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. + const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int stride, int64_t recon_error, + int64_t rate_cost, int ref_frame_idx) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + VpxTplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8; + tpl_block_stats_ptr->col = mi_col * 8; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; + tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse, uint16_t *eob) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. + switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse, int *ref_frame_idx) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? &mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + uint16_t eob = 0; + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. + for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } + } +#endif // CONFIG_NON_GREEDY_MV + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse, &ref_frame_idx); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_store_before_propagation( + tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost, ref_frame_idx); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + &cm->error, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +void vp9_free_tpl_buffer(VP9_COMP *cpi) { + int frame; +#if CONFIG_NON_GREEDY_MV + vp9_free_motion_field_info(&cpi->motion_field_info); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } + free_tpl_frame_stats_list(&cpi->tpl_gop_stats); +} + +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats for each frame in the current group of picture. + for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + int extended_frame_count; + cpi->tpl_bsize = BLOCK_32X32; + + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames, + cpi->common.width, cpi->common.height); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } + + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 0000000000..04beb22610 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +#define TPL_DEP_COST_SCALE_LOG2 4 + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c new file mode 100644 index 0000000000..0fc078e0a7 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_treewriter.h" + +static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree, + int i, int v, int l) { + v += v; + ++l; + + do { + const vpx_tree_index j = tree[i++]; + if (j <= 0) { + tokens[-j].value = v; + tokens[-j].len = l; + } else { + tree2tok(tokens, tree, j, v, l); + } + } while (++v & 1); +} + +void vp9_tokens_from_tree(struct vp9_token *tokens, + const vpx_tree_index *tree) { + tree2tok(tokens, tree, 0, 0, 0); +} + +static unsigned int convert_distribution(unsigned int i, vpx_tree tree, + unsigned int branch_ct[][2], + const unsigned int num_events[]) { + unsigned int left, right; + + if (tree[i] <= 0) + left = num_events[-tree[i]]; + else + left = convert_distribution(tree[i], tree, branch_ct, num_events); + + if (tree[i + 1] <= 0) + right = num_events[-tree[i + 1]]; + else + right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); + + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; + return left + right; +} + +void vp9_tree_probs_from_distribution(vpx_tree tree, + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { + convert_distribution(0, tree, branch_ct, num_events); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h new file mode 100644 index 0000000000..86c5fa2244 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_ +#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_ + +#include "vpx_dsp/bitwriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_tree_probs_from_distribution(vpx_tree tree, + unsigned int branch_ct[/* n - 1 */][2], + const unsigned int num_events[/* n */]); + +struct vp9_token { + int value; + int len; +}; + +void vp9_tokens_from_tree(struct vp9_token *, const vpx_tree_index *); + +static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree, + const vpx_prob *probs, int bits, int len, + vpx_tree_index i) { + do { + const int bit = (bits >> --len) & 1; + vpx_write(w, bit, probs[i >> 1]); + i = tree[i + bit]; + } while (len); +} + +static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree, + const vpx_prob *probs, + const struct vp9_token *token) { + vp9_write_tree(w, tree, probs, token->value, token->len, 0); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 0000000000..97f182c660 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu16_epi32(a_reg); + const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero); + const __m128i b_first = _mm_cvtepu16_epi32(b_reg); + const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi32(a_first, b_first); + dist_second = _mm_sub_epi32(a_second, b_second); + dist_first = _mm_mullo_epi32(dist_first, dist_first); + dist_second = _mm_mullo_epi32(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 4), dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)dist); + dist_left = _mm_loadu_si128((const __m128i *)(dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(dist + 1)); + + *sum = _mm_add_epi32(dist_reg, dist_left); + *sum = _mm_add_epi32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first, + __m128i *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(__m128i *output, const __m128i *sum, + const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void vp9_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_sse4_1( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); + + vp9_highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000000..7571bfccac --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const __m128i *weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = *weight; + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + __m128i weight_first, weight_second; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); + } else { + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The blockwidth is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + __m128i weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + + // First row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = _mm_load_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); + } else { + weight = _mm_set1_epi16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c new file mode 100644 index 0000000000..e9943447fd --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -0,0 +1,1537 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // SSE2 + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride) { + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i mask; + + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + + in[0] = _mm_slli_epi16(in[0], 4); + in[1] = _mm_slli_epi16(in[1], 4); + in[2] = _mm_slli_epi16(in[2], 4); + in[3] = _mm_slli_epi16(in[3], 4); + + mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); + in[0] = _mm_add_epi16(in[0], mask); + in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { + const __m128i kOne = _mm_set1_epi16(1); + __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); + __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); + __m128i out01 = _mm_add_epi16(in01, kOne); + __m128i out23 = _mm_add_epi16(in23, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + store_output(&out01, (output + 0 * 8)); + store_output(&out23, (output + 1 * 8)); +} + +static INLINE void transpose_4x4(__m128i *res) { + // Combine and transpose + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + // 00 10 20 30 01 11 21 31 + // 02 12 22 32 03 13 23 33 + // only use the first 4 16-bit integers + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +static void fdct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u[4], v[4]; + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[3], in[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 + u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 + u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 + u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + transpose_4x4(in); +} + +static void fadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); + const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); + const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); + const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_setzero_si128(); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + __m128i in7 = _mm_add_epi16(in[0], in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[2], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + transpose_4x4(in); +} + +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[4]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + } +} + +// load 8x8 array +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + in[0] = _mm_slli_epi16(in[0], 2); + in[1] = _mm_slli_epi16(in[1], 2); + in[2] = _mm_slli_epi16(in[2], 2); + in[3] = _mm_slli_epi16(in[3], 2); + in[4] = _mm_slli_epi16(in[4], 2); + in[5] = _mm_slli_epi16(in[5], 2); + in[6] = _mm_slli_epi16(in[6], 2); + in[7] = _mm_slli_epi16(in[7], 2); +} + +// right shift and rounding +static INLINE void right_shift_8x8(__m128i *res, const int bit) { + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + if (bit == 2) { + const __m128i const_rounding = _mm_set1_epi16(1); + res[0] = _mm_add_epi16(res[0], const_rounding); + res[1] = _mm_add_epi16(res[1], const_rounding); + res[2] = _mm_add_epi16(res[2], const_rounding); + res[3] = _mm_add_epi16(res[3], const_rounding); + res[4] = _mm_add_epi16(res[4], const_rounding); + res[5] = _mm_add_epi16(res[5], const_rounding); + res[6] = _mm_add_epi16(res[6], const_rounding); + res[7] = _mm_add_epi16(res[7], const_rounding); + } + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = _mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + + if (bit == 1) { + res[0] = _mm_srai_epi16(res[0], 1); + res[1] = _mm_srai_epi16(res[1], 1); + res[2] = _mm_srai_epi16(res[2], 1); + res[3] = _mm_srai_epi16(res[3], 1); + res[4] = _mm_srai_epi16(res[4], 1); + res[5] = _mm_srai_epi16(res[5], 1); + res[6] = _mm_srai_epi16(res[6], 1); + res[7] = _mm_srai_epi16(res[7], 1); + } else { + res[0] = _mm_srai_epi16(res[0], 2); + res[1] = _mm_srai_epi16(res[1], 2); + res[2] = _mm_srai_epi16(res[2], 2); + res[3] = _mm_srai_epi16(res[3], 2); + res[4] = _mm_srai_epi16(res[4], 2); + res[5] = _mm_srai_epi16(res[5], 2); + res[6] = _mm_srai_epi16(res[6], 2); + res[7] = _mm_srai_epi16(res[7], 2); + } +} + +// write 8x8 array +static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, + int stride) { + store_output(&res[0], (output + 0 * stride)); + store_output(&res[1], (output + 1 * stride)); + store_output(&res[2], (output + 2 * stride)); + store_output(&res[3], (output + 3 * stride)); + store_output(&res[4], (output + 4 * stride)); + store_output(&res[5], (output + 5 * stride)); + store_output(&res[6], (output + 6 * stride)); + store_output(&res[7], (output + 7 * stride)); +} + +static void fdct8_sse2(__m128i *in) { + // constants + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 1 + s0 = _mm_add_epi16(in[0], in[7]); + s1 = _mm_add_epi16(in[1], in[6]); + s2 = _mm_add_epi16(in[2], in[5]); + s3 = _mm_add_epi16(in[3], in[4]); + s4 = _mm_sub_epi16(in[3], in[4]); + s5 = _mm_sub_epi16(in[2], in[5]); + s6 = _mm_sub_epi16(in[1], in[6]); + s7 = _mm_sub_epi16(in[0], in[7]); + + u0 = _mm_add_epi16(s0, s3); + u1 = _mm_add_epi16(s1, s2); + u2 = _mm_sub_epi16(s1, s2); + u3 = _mm_sub_epi16(s0, s3); + // interleave and perform butterfly multiplication/addition + v0 = _mm_unpacklo_epi16(u0, u1); + v1 = _mm_unpackhi_epi16(u0, u1); + v2 = _mm_unpacklo_epi16(u2, u3); + v3 = _mm_unpackhi_epi16(u2, u3); + + u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); + u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); + u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); + u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); + u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); + u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); + + // shift and rounding + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u1); + in[2] = _mm_packs_epi32(u4, u5); + in[4] = _mm_packs_epi32(u2, u3); + in[6] = _mm_packs_epi32(u6, u7); + + // stage 2 + // interleave and perform butterfly multiplication/addition + u0 = _mm_unpacklo_epi16(s6, s5); + u1 = _mm_unpackhi_epi16(s6, s5); + v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + + u0 = _mm_packs_epi32(v0, v1); + u1 = _mm_packs_epi32(v2, v3); + + // stage 3 + s0 = _mm_add_epi16(s4, u0); + s1 = _mm_sub_epi16(s4, u0); + s2 = _mm_sub_epi16(s7, u1); + s3 = _mm_add_epi16(s7, u1); + + // stage 4 + u0 = _mm_unpacklo_epi16(s0, s3); + u1 = _mm_unpackhi_epi16(s0, s3); + u2 = _mm_unpacklo_epi16(s1, s2); + u3 = _mm_unpackhi_epi16(s1, s2); + + v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); + v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); + v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); + v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); + v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); + v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); + v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); + v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v0, v1); + in[3] = _mm_packs_epi32(v4, v5); + in[5] = _mm_packs_epi32(v2, v3); + in[7] = _mm_packs_epi32(v6, v7); + + // transpose + transpose_16bit_8x8(in, in); +} + +static void fadst8_sse2(__m128i *in) { + // Constants + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_setzero_si128(); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + // FIXME(jingning): do subtract using bit inversion? + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); + + // transpose + transpose_16bit_8x8(in, in); +} + +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, + __m128i *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, + __m128i *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(__m128i *in) { + // perform 16x16 1-D DCT for 8 columns + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = _mm_add_epi16(in[0], in[15]); + i[1] = _mm_add_epi16(in[1], in[14]); + i[2] = _mm_add_epi16(in[2], in[13]); + i[3] = _mm_add_epi16(in[3], in[12]); + i[4] = _mm_add_epi16(in[4], in[11]); + i[5] = _mm_add_epi16(in[5], in[10]); + i[6] = _mm_add_epi16(in[6], in[9]); + i[7] = _mm_add_epi16(in[7], in[8]); + + s[0] = _mm_sub_epi16(in[7], in[8]); + s[1] = _mm_sub_epi16(in[6], in[9]); + s[2] = _mm_sub_epi16(in[5], in[10]); + s[3] = _mm_sub_epi16(in[4], in[11]); + s[4] = _mm_sub_epi16(in[3], in[12]); + s[5] = _mm_sub_epi16(in[2], in[13]); + s[6] = _mm_sub_epi16(in[1], in[14]); + s[7] = _mm_sub_epi16(in[0], in[15]); + + p[0] = _mm_add_epi16(i[0], i[7]); + p[1] = _mm_add_epi16(i[1], i[6]); + p[2] = _mm_add_epi16(i[2], i[5]); + p[3] = _mm_add_epi16(i[3], i[4]); + p[4] = _mm_sub_epi16(i[3], i[4]); + p[5] = _mm_sub_epi16(i[2], i[5]); + p[6] = _mm_sub_epi16(i[1], i[6]); + p[7] = _mm_sub_epi16(i[0], i[7]); + + u[0] = _mm_add_epi16(p[0], p[3]); + u[1] = _mm_add_epi16(p[1], p[2]); + u[2] = _mm_sub_epi16(p[1], p[2]); + u[3] = _mm_sub_epi16(p[0], p[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpackhi_epi16(u[0], u[1]); + v[2] = _mm_unpacklo_epi16(u[2], u[3]); + v[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[4] = _mm_packs_epi32(u[4], u[5]); + in[8] = _mm_packs_epi32(u[2], u[3]); + in[12] = _mm_packs_epi32(u[6], u[7]); + + u[0] = _mm_unpacklo_epi16(p[5], p[6]); + u[1] = _mm_unpackhi_epi16(p[5], p[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[2], v[3]); + + t[0] = _mm_add_epi16(p[4], u[0]); + t[1] = _mm_sub_epi16(p[4], u[0]); + t[2] = _mm_sub_epi16(p[7], u[1]); + t[3] = _mm_add_epi16(p[7], u[1]); + + u[0] = _mm_unpacklo_epi16(t[0], t[3]); + u[1] = _mm_unpackhi_epi16(t[0], t[3]); + u[2] = _mm_unpacklo_epi16(t[1], t[2]); + u[3] = _mm_unpackhi_epi16(t[1], t[2]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + in[2] = _mm_packs_epi32(v[0], v[1]); + in[6] = _mm_packs_epi32(v[4], v[5]); + in[10] = _mm_packs_epi32(v[2], v[3]); + in[14] = _mm_packs_epi32(v[6], v[7]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); + u[1] = _mm_unpackhi_epi16(s[2], s[5]); + u[2] = _mm_unpacklo_epi16(s[3], s[4]); + u[3] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[2] = _mm_packs_epi32(v[0], v[1]); + t[3] = _mm_packs_epi32(v[2], v[3]); + t[4] = _mm_packs_epi32(v[4], v[5]); + t[5] = _mm_packs_epi32(v[6], v[7]); + + // stage 3 + p[0] = _mm_add_epi16(s[0], t[3]); + p[1] = _mm_add_epi16(s[1], t[2]); + p[2] = _mm_sub_epi16(s[1], t[2]); + p[3] = _mm_sub_epi16(s[0], t[3]); + p[4] = _mm_sub_epi16(s[7], t[4]); + p[5] = _mm_sub_epi16(s[6], t[5]); + p[6] = _mm_add_epi16(s[6], t[5]); + p[7] = _mm_add_epi16(s[7], t[4]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); + u[1] = _mm_unpackhi_epi16(p[1], p[6]); + u[2] = _mm_unpacklo_epi16(p[2], p[5]); + u[3] = _mm_unpackhi_epi16(p[2], p[5]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[1] = _mm_packs_epi32(v[0], v[1]); + t[2] = _mm_packs_epi32(v[2], v[3]); + t[5] = _mm_packs_epi32(v[4], v[5]); + t[6] = _mm_packs_epi32(v[6], v[7]); + + // stage 5 + s[0] = _mm_add_epi16(p[0], t[1]); + s[1] = _mm_sub_epi16(p[0], t[1]); + s[2] = _mm_add_epi16(p[3], t[2]); + s[3] = _mm_sub_epi16(p[3], t[2]); + s[4] = _mm_sub_epi16(p[4], t[5]); + s[5] = _mm_add_epi16(p[4], t[5]); + s[6] = _mm_sub_epi16(p[7], t[6]); + s[7] = _mm_add_epi16(p[7], t[6]); + + // stage 6 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); + u[1] = _mm_unpackhi_epi16(s[0], s[7]); + u[2] = _mm_unpacklo_epi16(s[1], s[6]); + u[3] = _mm_unpackhi_epi16(s[1], s[6]); + u[4] = _mm_unpacklo_epi16(s[2], s[5]); + u[5] = _mm_unpackhi_epi16(s[2], s[5]); + u[6] = _mm_unpacklo_epi16(s[3], s[4]); + u[7] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); + in[13] = _mm_packs_epi32(v[6], v[7]); + in[3] = _mm_packs_epi32(v[8], v[9]); + in[11] = _mm_packs_epi32(v[10], v[11]); + in[7] = _mm_packs_epi32(v[12], v[13]); + in[15] = _mm_packs_epi32(v[14], v[15]); +} + +static void fadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_setzero_si128(); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +static void fdct16_sse2(__m128i *in0, __m128i *in1) { + fdct16_8col(in0); + fdct16_8col(in1); + transpose_16bit_16x16(in0, in1); +} + +static void fadst16_sse2(__m128i *in0, __m128i *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + transpose_16bit_16x16(in0, in1); +} + +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm new file mode 100644 index 0000000000..8152dce864 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + STORE_TRAN_LOW 0, outputq, 0, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 + + RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c new file mode 100644 index 0000000000..5930bf491e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + return _mm_cvtsi128_si32(hgfedcba); +} + +// Denoise a 16x1 vector. +static INLINE __m128i vp9_denoiser_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, + const __m128i *k_16, const __m128i *l3, const __m128i *l32, + const __m128i *l21, __m128i acc_diff) { + // Calculate differences + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); + // Clamp absolute difference to 16 to be used to get mask. Doing this + // allows us to use _mm_cmpgt_epi8, which operates on signed byte. + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); + // Get masks for l2 l1 and l0 adjustments. + const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); + // Get adjustments for l2, l1, and l0. + __m128i adj2 = _mm_and_si128(mask2, *l32); + const __m128i adj1 = _mm_and_si128(mask1, *l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + // Combine the adjustments and get absolute adjustments. + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(*l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + // Restore the sign and get positive and negative adjustments. + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + // Calculate filtered value. + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Adjustments <=7, and each element in acc_diff can fit in signed + // char. + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise a 16x1 vector with a weaker filter. +static INLINE __m128i vp9_denoiser_adj_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { + __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude, int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], + running_buffer[r], &k_0, &k_4, &k_8, + &k_16, &l3, &l32, &l21, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + acc_diff = vp9_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, + k_delta, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, + width); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh, r, c, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + __m128i acc_diff[4][4]; + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r] = _mm_setzero_si128(); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2( + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, + &l32, &l21, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = + vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, + k_0, k_delta, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } else { + return COPY_BLOCK; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 0000000000..99fef31d16 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 0000000000..7beec130ab --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,115 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if VPX_ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET + +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, +; intptr_t block_size) + +INIT_XMM sse2 +cglobal block_error_fp, 3, 3, 6, uqc, dqc, size + pxor m4, m4 ; sse accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + ; accumulate in 64bit + punpckldq m3, m0, m5 + punpckhdq m0, m5 + paddq m4, m3 + paddq m4, m0 + jnz .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + paddq m4, m5 +%if VPX_ARCH_X86_64 + movq rax, m4 +%else + pshufd m5, m4, 0x1 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c new file mode 100644 index 0000000000..94506aad0f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSSE3 + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_scale/yv12config.h" + +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} + +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi16(0x00FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. + + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs kConvolve8Funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. + f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. +static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); + } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); +} + +void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (dst_w * 2 == src_w && dst_h * 2 == src_h) { + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c new file mode 100644 index 0000000000..d7aafe7b01 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + int i, j, test; + uint32_t temp[4]; + __m128i max, min, cmp0, cmp1, cmp2, cmp3; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32((int32_t)0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 0000000000..bf44b08674 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +// Zero fill 8 positions in the output buffer. +static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static VPX_FORCE_INLINE void load_fp_values_avx2( + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_setzero_si256(); + + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +// Enable this flag when matching the optimized code to +// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). +// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); + { + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); + } + +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); +#else + quantize_fp_32x32_16_no_nzflag( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); +#endif + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = highbd_init_256(mb_plane->quant_fp); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_srli_epi32(dequant, 2); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); + quant = _mm256_slli_epi32(quant, 1); + round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); + + highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp_32x32( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c new file mode 100644 index 0000000000..2481eb366e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; + + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c new file mode 100644 index 0000000000..98decae749 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; + + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; + + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. + round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vp9/exports_dec b/media/libvpx/libvpx/vp9/exports_dec new file mode 100644 index 0000000000..0a61fde398 --- /dev/null +++ b/media/libvpx/libvpx/vp9/exports_dec @@ -0,0 +1,2 @@ +data vpx_codec_vp9_dx_algo +text vpx_codec_vp9_dx diff --git a/media/libvpx/libvpx/vp9/exports_enc b/media/libvpx/libvpx/vp9/exports_enc new file mode 100644 index 0000000000..2a0fef3eaf --- /dev/null +++ b/media/libvpx/libvpx/vp9/exports_enc @@ -0,0 +1,2 @@ +data vpx_codec_vp9_cx_algo +text vpx_codec_vp9_cx diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc new file mode 100644 index 0000000000..fd81bce7b5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/ratectrl_rtc.h" + +#include + +#include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" + +namespace libvpx { + +std::unique_ptr VP9RateControlRTC::Create( + const VP9RateControlRtcConfig &cfg) { + std::unique_ptr rc_api(new (std::nothrow) + VP9RateControlRTC()); + if (!rc_api) return nullptr; + rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); + if (!rc_api->cpi_) return nullptr; + vp9_zero(*rc_api->cpi_); + + if (!rc_api->InitRateControl(cfg)) return nullptr; + if (cfg.aq_mode) { + VP9_COMP *const cpi = rc_api->cpi_; + cpi->segmentation_map = static_cast( + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->segmentation_map))); + if (!cpi->segmentation_map) return nullptr; + cpi->cyclic_refresh = + vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); + cpi->cyclic_refresh->content_mode = 0; + } + return rc_api; +} + +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + +bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { + VP9_COMMON *cm = &cpi_->common; + VP9EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + cm->profile = PROFILE_0; + cm->bit_depth = VPX_BITS_8; + cm->show_frame = 1; + oxcf->profile = cm->profile; + oxcf->bit_depth = cm->bit_depth; + oxcf->rc_mode = rc_cfg.rc_mode; + oxcf->pass = 0; + oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; + oxcf->content = VP9E_CONTENT_DEFAULT; + oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + rc->kf_boost = DEFAULT_KF_BOOST; + + if (!UpdateRateControl(rc_cfg)) return false; + vp9_set_mb_mi(cm, cm->width, cm->height); + + cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + ? 1 + : 0; + + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + vp9_rc_init_minq_luts(); + vp9_rc_init(oxcf, 0, rc); + rc->constrain_gf_key_freq_onepass_vbr = 0; + cpi_->sf.use_nonrd_pick_mode = 1; + return true; +} + +bool VP9RateControlRTC::UpdateRateControl( + const VP9RateControlRtcConfig &rc_cfg) { + // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5) + // and VPX_TS_MAX_LAYERS (5), check all three. + if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS || + rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) { + return false; + } + + VP9_COMMON *cm = &cpi_->common; + VP9EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + + cm->width = rc_cfg.width; + cm->height = rc_cfg.height; + oxcf->width = rc_cfg.width; + oxcf->height = rc_cfg.height; + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(rc_cfg.max_quantizer); + oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer); + rc->worst_quality = oxcf->worst_allowed_q; + rc->best_quality = oxcf->best_allowed_q; + oxcf->init_framerate = rc_cfg.framerate; + oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; + oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; + oxcf->under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; + oxcf->ss_number_layers = rc_cfg.ss_number_layers; + oxcf->ts_number_layers = rc_cfg.ts_number_layers; + oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( + (rc_cfg.ts_number_layers > 1) ? rc_cfg.ts_number_layers : 0); + + cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; + cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + oxcf->layer_target_bitrate[layer] = + 1000 * rc_cfg.layer_target_bitrate[layer]; + lrc->worst_quality = + vp9_quantizer_to_qindex(rc_cfg.max_quantizers[layer]); + lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); + lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; + lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; + } + } + vp9_set_rc_buffer_sizes(cpi_); + vp9_new_framerate(cpi_, cpi_->framerate); + if (cpi_->svc.number_temporal_layers > 1 || + cpi_->svc.number_spatial_layers > 1) { + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } + vp9_update_layer_context_change_config(cpi_, + (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; + } + vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; + return true; +} + +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { + VP9_COMMON *const cm = &cpi_->common; + int width, height; + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1) { + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + get_layer_resolution(cpi_->oxcf.width, cpi_->oxcf.height, + lc->scaling_factor_num, lc->scaling_factor_den, &width, + &height); + cm->width = width; + cm->height = height; + } + vp9_set_mb_mi(cm, cm->width, cm->height); + cm->frame_type = static_cast(frame_params.frame_type); + // This is needed to ensure key frame does not get unset in rc_get_svc_params. + cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; + cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + cpi_->sf.use_nonrd_pick_mode = 1; + if (cpi_->svc.number_spatial_layers == 1 && + cpi_->svc.number_temporal_layers == 1) { + int target = 0; + if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + } else if (cpi_->oxcf.rc_mode == VPX_VBR) { + if (cm->frame_type == KEY_FRAME) { + cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0; + cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; + } + vp9_set_gf_update_one_pass_vbr(cpi_); + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_); + } + vp9_rc_set_frame_target(cpi_, target); + vp9_update_buffer_level_preencode(cpi_); + } else { + vp9_update_temporal_layer_framerate(cpi_); + vp9_restore_layer_context(cpi_); + vp9_rc_get_svc_params(cpi_); + } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. + // Never drop on key frame, or if base layer is key for svc, + if (!frame_is_intra_only(cm) && + (!cpi_->use_svc || + !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi_)) { + // For FULL_SUPERFRAME_DROP mode (the only mode considered here): + // if the superframe drop is decided we need to save the layer context for + // all spatial layers, and call update_buffer_level and postencode_drop + // for all spatial layers. + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_save_layer_context(cpi_); + for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) { + cpi_->svc.spatial_layer_id = sl; + vp9_restore_layer_context(cpi_); + vp9_update_buffer_level_svc_preencode(cpi_); + vp9_rc_postencode_update_drop_frame(cpi_); + vp9_save_layer_context(cpi_); + } + } + return FrameDropDecision::kDrop; + } + } + // Compute the QP for the frame. + int bottom_index, top_index; + cpi_->common.base_qindex = + vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); + + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + + cpi_->last_frame_dropped = 0; + cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0; + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->svc.num_encoded_top_layer++; + + return FrameDropDecision::kOk; +} + +int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } + +int VP9RateControlRTC::GetLoopfilterLevel() const { + struct loopfilter *const lf = &cpi_->common.lf; + vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q); + return lf->filter_level; +} + +bool VP9RateControlRTC::GetSegmentationData( + VP9SegmentationData *segmentation_data) const { + if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false; + + segmentation_data->segmentation_map = cpi_->segmentation_map; + segmentation_data->segmentation_map_size = + cpi_->common.mi_cols * cpi_->common.mi_rows; + segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; + segmentation_data->delta_q_size = 3u; + return true; +} + +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } + vp9_rc_postencode_update(cpi_, encoded_frame_size); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + cpi_->common.current_video_frame++; +} + +} // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h new file mode 100644 index 0000000000..85005c5474 --- /dev/null +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_RATECTRL_RTC_H_ +#define VPX_VP9_RATECTRL_RTC_H_ + +#include +#include + +#include "vp9/common/vp9_enums.h" +#include "vp9/vp9_iface_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/vp9_cx_iface.h" +#include "vpx/internal/vpx_ratectrl_rtc.h" +#include "vpx_mem/vpx_mem.h" + +struct VP9_COMP; + +namespace libvpx { +struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { + public: + VP9RateControlRtcConfig() { + ss_number_layers = 1; + vp9_zero(max_quantizers); + vp9_zero(min_quantizers); + vp9_zero(scaling_factor_den); + vp9_zero(scaling_factor_num); + vp9_zero(layer_target_bitrate); + vp9_zero(ts_rate_decimator); + scaling_factor_num[0] = 1; + scaling_factor_den[0] = 1; + max_quantizers[0] = max_quantizer; + min_quantizers[0] = min_quantizer; + max_consec_drop = INT_MAX; + } + + // Number of spatial layers + int ss_number_layers; + int max_quantizers[VPX_MAX_LAYERS]; + int min_quantizers[VPX_MAX_LAYERS]; + int scaling_factor_num[VPX_SS_MAX_LAYERS]; + int scaling_factor_den[VPX_SS_MAX_LAYERS]; + // This is only for SVC for now. + int max_consec_drop; +}; + +struct VP9FrameParamsQpRTC { + RcFrameType frame_type; + int spatial_layer_id; + int temporal_layer_id; +}; + +struct VP9SegmentationData { + const uint8_t *segmentation_map; + size_t segmentation_map_size; + const int *delta_q; + size_t delta_q_size; +}; + +// This interface allows using VP9 real-time rate control without initializing +// the encoder. To use this interface, you need to link with libvpxrc.a. +// +// #include "vp9/ratectrl_rtc.h" +// VP9RateControlRtcConfig cfg; +// VP9FrameParamsQpRTC frame_params; +// +// YourFunctionToInitializeConfig(cfg); +// std::unique_ptr rc_api = VP9RateControlRTC::Create(cfg); +// // start encoding +// while (frame_to_encode) { +// if (config_changed) +// rc_api->UpdateRateControl(cfg); +// YourFunctionToFillFrameParams(frame_params); +// rc_api->ComputeQP(frame_params); +// YourFunctionToUseQP(rc_api->GetQP()); +// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel()); +// // After encoding +// rc_api->PostEncode(encoded_frame_size, frame_params); +// } +class VP9RateControlRTC { + public: + static std::unique_ptr Create( + const VP9RateControlRtcConfig &cfg); + ~VP9RateControlRTC(); + + bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); + // GetQP() needs to be called after ComputeQP() to get the latest QP + int GetQP() const; + int GetLoopfilterLevel() const; + bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called (vp9_rc_postencode_update_drop_frame is already + // called via ComputeQP if drop is decided). + FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); + // Feedback to rate control with the size of current encoded frame + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); + + private: + VP9RateControlRTC() {} + bool InitRateControl(const VP9RateControlRtcConfig &cfg); + struct VP9_COMP *cpi_; +}; + +} // namespace libvpx + +#endif // VPX_VP9_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc new file mode 100644 index 0000000000..2e6f9a4513 --- /dev/null +++ b/media/libvpx/libvpx/vp9/simple_encode.cc @@ -0,0 +1,1332 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./ivfenc.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/vp9_iface_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/simple_encode.h" +#include "vp9/vp9_cx_iface.h" + +namespace vp9 { + +static int get_plane_height(vpx_img_fmt_t img_fmt, int frame_height, + int plane) { + assert(plane < 3); + if (plane == 0) { + return frame_height; + } + switch (img_fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I440: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I44016: return (frame_height + 1) >> 1; + default: return frame_height; + } +} + +static int get_plane_width(vpx_img_fmt_t img_fmt, int frame_width, int plane) { + assert(plane < 3); + if (plane == 0) { + return frame_width; + } + switch (img_fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: return (frame_width + 1) >> 1; + default: return frame_width; + } +} + +// TODO(angiebird): Merge this function with vpx_img_plane_width() +static int img_plane_width(const vpx_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +// TODO(angiebird): Merge this function with vpx_img_plane_height() +static int img_plane_height(const vpx_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +// TODO(angiebird): Merge this function with vpx_img_read() +static int img_read(vpx_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = img_plane_width(img, plane) * + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + if (fread(buf, 1, w, file) != (size_t)w) return 0; + buf += stride; + } + } + + return 1; +} + +// Assume every config in VP9EncoderConfig is less than 100 characters. +#define ENCODE_CONFIG_BUF_SIZE 100 +struct EncodeConfig { + char name[ENCODE_CONFIG_BUF_SIZE]; + char value[ENCODE_CONFIG_BUF_SIZE]; +}; + +class SimpleEncode::EncodeImpl { + public: + VP9_COMP *cpi; + vpx_img_fmt_t img_fmt; + vpx_image_t tmp_img; + std::vector first_pass_stats; + std::vector encode_config_list; +}; + +static VP9_COMP *init_encoder(const VP9EncoderConfig *oxcf, + vpx_img_fmt_t img_fmt) { + VP9_COMP *cpi; + BufferPool *buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(*buffer_pool)); + if (!buffer_pool) return NULL; + vp9_initialize_enc(); + cpi = vp9_create_compressor(oxcf, buffer_pool); + vp9_update_compressor_with_img_fmt(cpi, img_fmt); + return cpi; +} + +static void free_encoder(VP9_COMP *cpi) { + BufferPool *buffer_pool = cpi->common.buffer_pool; + vp9_remove_compressor(cpi); + // buffer_pool needs to be free after cpi because buffer_pool contains + // allocated buffers that will be free in vp9_remove_compressor() + vpx_free(buffer_pool); +} + +static INLINE vpx_rational_t make_vpx_rational(int num, int den) { + vpx_rational_t v; + v.num = num; + v.den = den; + return v; +} + +static INLINE FrameType +get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) { + switch (update_type) { + case KF_UPDATE: return kFrameTypeKey; + case ARF_UPDATE: return kFrameTypeAltRef; + case GF_UPDATE: return kFrameTypeGolden; + case OVERLAY_UPDATE: return kFrameTypeOverlay; + case LF_UPDATE: return kFrameTypeInter; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + } +} + +static void update_partition_info(const PARTITION_INFO *input_partition_info, + const int num_rows_4x4, + const int num_cols_4x4, + PartitionInfo *output_partition_info) { + const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; + for (int i = 0; i < num_units_4x4; ++i) { + output_partition_info[i].row = input_partition_info[i].row; + output_partition_info[i].column = input_partition_info[i].column; + output_partition_info[i].row_start = input_partition_info[i].row_start; + output_partition_info[i].column_start = + input_partition_info[i].column_start; + output_partition_info[i].width = input_partition_info[i].width; + output_partition_info[i].height = input_partition_info[i].height; + } +} + +// translate MV_REFERENCE_FRAME to RefFrameType +static RefFrameType mv_ref_frame_to_ref_frame_type( + MV_REFERENCE_FRAME mv_ref_frame) { + switch (mv_ref_frame) { + case LAST_FRAME: return kRefFrameTypeLast; + case GOLDEN_FRAME: return kRefFrameTypePast; + case ALTREF_FRAME: return kRefFrameTypeFuture; + default: return kRefFrameTypeNone; + } +} + +static void update_motion_vector_info( + const MOTION_VECTOR_INFO *input_motion_vector_info, const int num_rows_4x4, + const int num_cols_4x4, MotionVectorInfo *output_motion_vector_info, + int motion_vector_scale) { + const int num_units_4x4 = num_rows_4x4 * num_cols_4x4; + for (int i = 0; i < num_units_4x4; ++i) { + const MV_REFERENCE_FRAME *in_ref_frame = + input_motion_vector_info[i].ref_frame; + output_motion_vector_info[i].mv_count = + (in_ref_frame[0] == INTRA_FRAME) + ? 0 + : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2); + if (in_ref_frame[0] == NO_REF_FRAME) { + fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n"); + abort(); + } + output_motion_vector_info[i].ref_frame[0] = + mv_ref_frame_to_ref_frame_type(in_ref_frame[0]); + output_motion_vector_info[i].ref_frame[1] = + mv_ref_frame_to_ref_frame_type(in_ref_frame[1]); + output_motion_vector_info[i].mv_row[0] = + (double)input_motion_vector_info[i].mv[0].as_mv.row / + motion_vector_scale; + output_motion_vector_info[i].mv_column[0] = + (double)input_motion_vector_info[i].mv[0].as_mv.col / + motion_vector_scale; + output_motion_vector_info[i].mv_row[1] = + (double)input_motion_vector_info[i].mv[1].as_mv.row / + motion_vector_scale; + output_motion_vector_info[i].mv_column[1] = + (double)input_motion_vector_info[i].mv[1].as_mv.col / + motion_vector_scale; + } +} + +static void update_tpl_stats_info(const TplDepStats *input_tpl_stats_info, + const int show_frame_count, + TplStatsInfo *output_tpl_stats_info) { + int frame_idx; + for (frame_idx = 0; frame_idx < show_frame_count; ++frame_idx) { + output_tpl_stats_info[frame_idx].intra_cost = + input_tpl_stats_info[frame_idx].intra_cost; + output_tpl_stats_info[frame_idx].inter_cost = + input_tpl_stats_info[frame_idx].inter_cost; + output_tpl_stats_info[frame_idx].mc_flow = + input_tpl_stats_info[frame_idx].mc_flow; + output_tpl_stats_info[frame_idx].mc_dep_cost = + input_tpl_stats_info[frame_idx].mc_dep_cost; + output_tpl_stats_info[frame_idx].mc_ref_cost = + input_tpl_stats_info[frame_idx].mc_ref_cost; + } +} + +static void update_frame_counts(const FRAME_COUNTS *input_counts, + FrameCounts *output_counts) { + // Init array sizes. + output_counts->y_mode.resize(BLOCK_SIZE_GROUPS); + for (int i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + output_counts->y_mode[i].resize(INTRA_MODES); + } + + output_counts->uv_mode.resize(INTRA_MODES); + for (int i = 0; i < INTRA_MODES; ++i) { + output_counts->uv_mode[i].resize(INTRA_MODES); + } + + output_counts->partition.resize(PARTITION_CONTEXTS); + for (int i = 0; i < PARTITION_CONTEXTS; ++i) { + output_counts->partition[i].resize(PARTITION_TYPES); + } + + output_counts->coef.resize(TX_SIZES); + output_counts->eob_branch.resize(TX_SIZES); + for (int i = 0; i < TX_SIZES; ++i) { + output_counts->coef[i].resize(PLANE_TYPES); + output_counts->eob_branch[i].resize(PLANE_TYPES); + for (int j = 0; j < PLANE_TYPES; ++j) { + output_counts->coef[i][j].resize(REF_TYPES); + output_counts->eob_branch[i][j].resize(REF_TYPES); + for (int k = 0; k < REF_TYPES; ++k) { + output_counts->coef[i][j][k].resize(COEF_BANDS); + output_counts->eob_branch[i][j][k].resize(COEF_BANDS); + for (int l = 0; l < COEF_BANDS; ++l) { + output_counts->coef[i][j][k][l].resize(COEFF_CONTEXTS); + output_counts->eob_branch[i][j][k][l].resize(COEFF_CONTEXTS); + for (int m = 0; m < COEFF_CONTEXTS; ++m) { + output_counts->coef[i][j][k][l][m].resize(UNCONSTRAINED_NODES + 1); + } + } + } + } + } + + output_counts->switchable_interp.resize(SWITCHABLE_FILTER_CONTEXTS); + for (int i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + output_counts->switchable_interp[i].resize(SWITCHABLE_FILTERS); + } + + output_counts->inter_mode.resize(INTER_MODE_CONTEXTS); + for (int i = 0; i < INTER_MODE_CONTEXTS; ++i) { + output_counts->inter_mode[i].resize(INTER_MODES); + } + + output_counts->intra_inter.resize(INTRA_INTER_CONTEXTS); + for (int i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + output_counts->intra_inter[i].resize(2); + } + + output_counts->comp_inter.resize(COMP_INTER_CONTEXTS); + for (int i = 0; i < COMP_INTER_CONTEXTS; ++i) { + output_counts->comp_inter[i].resize(2); + } + + output_counts->single_ref.resize(REF_CONTEXTS); + for (int i = 0; i < REF_CONTEXTS; ++i) { + output_counts->single_ref[i].resize(2); + for (int j = 0; j < 2; ++j) { + output_counts->single_ref[i][j].resize(2); + } + } + + output_counts->comp_ref.resize(REF_CONTEXTS); + for (int i = 0; i < REF_CONTEXTS; ++i) { + output_counts->comp_ref[i].resize(2); + } + + output_counts->skip.resize(SKIP_CONTEXTS); + for (int i = 0; i < SKIP_CONTEXTS; ++i) { + output_counts->skip[i].resize(2); + } + + output_counts->tx.p32x32.resize(TX_SIZE_CONTEXTS); + output_counts->tx.p16x16.resize(TX_SIZE_CONTEXTS); + output_counts->tx.p8x8.resize(TX_SIZE_CONTEXTS); + for (int i = 0; i < TX_SIZE_CONTEXTS; i++) { + output_counts->tx.p32x32[i].resize(TX_SIZES); + output_counts->tx.p16x16[i].resize(TX_SIZES - 1); + output_counts->tx.p8x8[i].resize(TX_SIZES - 2); + } + output_counts->tx.tx_totals.resize(TX_SIZES); + + output_counts->mv.joints.resize(MV_JOINTS); + output_counts->mv.comps.resize(2); + for (int i = 0; i < 2; ++i) { + output_counts->mv.comps[i].sign.resize(2); + output_counts->mv.comps[i].classes.resize(MV_CLASSES); + output_counts->mv.comps[i].class0.resize(CLASS0_SIZE); + output_counts->mv.comps[i].bits.resize(MV_OFFSET_BITS); + for (int j = 0; j < MV_OFFSET_BITS; ++j) { + output_counts->mv.comps[i].bits[j].resize(2); + } + output_counts->mv.comps[i].class0_fp.resize(CLASS0_SIZE); + for (int j = 0; j < CLASS0_SIZE; ++j) { + output_counts->mv.comps[i].class0_fp[j].resize(MV_FP_SIZE); + } + output_counts->mv.comps[i].fp.resize(MV_FP_SIZE); + output_counts->mv.comps[i].class0_hp.resize(2); + output_counts->mv.comps[i].hp.resize(2); + } + + // Populate counts. + for (int i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (int j = 0; j < INTRA_MODES; ++j) { + output_counts->y_mode[i][j] = input_counts->y_mode[i][j]; + } + } + for (int i = 0; i < INTRA_MODES; ++i) { + for (int j = 0; j < INTRA_MODES; ++j) { + output_counts->uv_mode[i][j] = input_counts->uv_mode[i][j]; + } + } + for (int i = 0; i < PARTITION_CONTEXTS; ++i) { + for (int j = 0; j < PARTITION_TYPES; ++j) { + output_counts->partition[i][j] = input_counts->partition[i][j]; + } + } + for (int i = 0; i < TX_SIZES; ++i) { + for (int j = 0; j < PLANE_TYPES; ++j) { + for (int k = 0; k < REF_TYPES; ++k) { + for (int l = 0; l < COEF_BANDS; ++l) { + for (int m = 0; m < COEFF_CONTEXTS; ++m) { + output_counts->eob_branch[i][j][k][l][m] = + input_counts->eob_branch[i][j][k][l][m]; + for (int n = 0; n < UNCONSTRAINED_NODES + 1; n++) { + output_counts->coef[i][j][k][l][m][n] = + input_counts->coef[i][j][k][l][m][n]; + } + } + } + } + } + } + for (int i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (int j = 0; j < SWITCHABLE_FILTERS; ++j) { + output_counts->switchable_interp[i][j] = + input_counts->switchable_interp[i][j]; + } + } + for (int i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (int j = 0; j < INTER_MODES; ++j) { + output_counts->inter_mode[i][j] = input_counts->inter_mode[i][j]; + } + } + for (int i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + for (int j = 0; j < 2; ++j) { + output_counts->intra_inter[i][j] = input_counts->intra_inter[i][j]; + } + } + for (int i = 0; i < COMP_INTER_CONTEXTS; ++i) { + for (int j = 0; j < 2; ++j) { + output_counts->comp_inter[i][j] = input_counts->comp_inter[i][j]; + } + } + for (int i = 0; i < REF_CONTEXTS; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + output_counts->single_ref[i][j][k] = input_counts->single_ref[i][j][k]; + } + } + } + for (int i = 0; i < REF_CONTEXTS; ++i) { + for (int j = 0; j < 2; ++j) { + output_counts->comp_ref[i][j] = input_counts->comp_ref[i][j]; + } + } + for (int i = 0; i < SKIP_CONTEXTS; ++i) { + for (int j = 0; j < 2; ++j) { + output_counts->skip[i][j] = input_counts->skip[i][j]; + } + } + for (int i = 0; i < TX_SIZE_CONTEXTS; i++) { + for (int j = 0; j < TX_SIZES; j++) { + output_counts->tx.p32x32[i][j] = input_counts->tx.p32x32[i][j]; + } + for (int j = 0; j < TX_SIZES - 1; j++) { + output_counts->tx.p16x16[i][j] = input_counts->tx.p16x16[i][j]; + } + for (int j = 0; j < TX_SIZES - 2; j++) { + output_counts->tx.p8x8[i][j] = input_counts->tx.p8x8[i][j]; + } + } + for (int i = 0; i < TX_SIZES; i++) { + output_counts->tx.tx_totals[i] = input_counts->tx.tx_totals[i]; + } + for (int i = 0; i < MV_JOINTS; i++) { + output_counts->mv.joints[i] = input_counts->mv.joints[i]; + } + for (int k = 0; k < 2; k++) { + const nmv_component_counts *const comps_t = &input_counts->mv.comps[k]; + for (int i = 0; i < 2; i++) { + output_counts->mv.comps[k].sign[i] = comps_t->sign[i]; + output_counts->mv.comps[k].class0_hp[i] = comps_t->class0_hp[i]; + output_counts->mv.comps[k].hp[i] = comps_t->hp[i]; + } + for (int i = 0; i < MV_CLASSES; i++) { + output_counts->mv.comps[k].classes[i] = comps_t->classes[i]; + } + for (int i = 0; i < CLASS0_SIZE; i++) { + output_counts->mv.comps[k].class0[i] = comps_t->class0[i]; + for (int j = 0; j < MV_FP_SIZE; j++) { + output_counts->mv.comps[k].class0_fp[i][j] = comps_t->class0_fp[i][j]; + } + } + for (int i = 0; i < MV_OFFSET_BITS; i++) { + for (int j = 0; j < 2; j++) { + output_counts->mv.comps[k].bits[i][j] = comps_t->bits[i][j]; + } + } + for (int i = 0; i < MV_FP_SIZE; i++) { + output_counts->mv.comps[k].fp[i] = comps_t->fp[i]; + } + } +} + +void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file) { + for (int plane = 0; plane < 3; ++plane) { + const int w = image_buffer.plane_width[plane]; + const int h = image_buffer.plane_height[plane]; + const uint8_t *buf = image_buffer.plane_buffer[plane].get(); + fprintf(out_file, "%d %d\n", h, w); + for (int i = 0; i < w * h; ++i) { + fprintf(out_file, "%d ", (int)buf[i]); + } + fprintf(out_file, "\n"); + } +} + +static bool init_image_buffer(ImageBuffer *image_buffer, int frame_width, + int frame_height, vpx_img_fmt_t img_fmt) { + for (int plane = 0; plane < 3; ++plane) { + const int w = get_plane_width(img_fmt, frame_width, plane); + const int h = get_plane_height(img_fmt, frame_height, plane); + image_buffer->plane_width[plane] = w; + image_buffer->plane_height[plane] = h; + image_buffer->plane_buffer[plane].reset(new (std::nothrow) uint8_t[w * h]); + if (image_buffer->plane_buffer[plane].get() == nullptr) { + return false; + } + } + return true; +} + +static void ImageBuffer_to_IMAGE_BUFFER(const ImageBuffer &image_buffer, + IMAGE_BUFFER *image_buffer_c) { + image_buffer_c->allocated = 1; + for (int plane = 0; plane < 3; ++plane) { + image_buffer_c->plane_width[plane] = image_buffer.plane_width[plane]; + image_buffer_c->plane_height[plane] = image_buffer.plane_height[plane]; + image_buffer_c->plane_buffer[plane] = + image_buffer.plane_buffer[plane].get(); + } +} + +static size_t get_max_coding_data_byte_size(int frame_width, int frame_height) { + return frame_width * frame_height * 3; +} + +static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result, + int frame_width, int frame_height, + vpx_img_fmt_t img_fmt) { + const size_t max_coding_data_byte_size = + get_max_coding_data_byte_size(frame_width, frame_height); + + encode_frame_result->coding_data.reset( + new (std::nothrow) uint8_t[max_coding_data_byte_size]); + + encode_frame_result->num_rows_4x4 = get_num_unit_4x4(frame_height); + encode_frame_result->num_cols_4x4 = get_num_unit_4x4(frame_width); + encode_frame_result->partition_info.resize(encode_frame_result->num_rows_4x4 * + encode_frame_result->num_cols_4x4); + encode_frame_result->motion_vector_info.resize( + encode_frame_result->num_rows_4x4 * encode_frame_result->num_cols_4x4); + encode_frame_result->tpl_stats_info.resize(MAX_LAG_BUFFERS); + + if (encode_frame_result->coding_data.get() == nullptr) { + return false; + } + return init_image_buffer(&encode_frame_result->coded_frame, frame_width, + frame_height, img_fmt); +} + +static void encode_frame_result_update_rq_history( + const RATE_QINDEX_HISTORY *rq_history, + EncodeFrameResult *encode_frame_result) { + encode_frame_result->recode_count = rq_history->recode_count; + for (int i = 0; i < encode_frame_result->recode_count; ++i) { + const int q_index = rq_history->q_index_history[i]; + const int rate = rq_history->rate_history[i]; + encode_frame_result->q_index_history.push_back(q_index); + encode_frame_result->rate_history.push_back(rate); + } +} + +static void update_encode_frame_result( + EncodeFrameResult *encode_frame_result, const int show_frame_count, + const ENCODE_FRAME_RESULT *encode_frame_info) { + encode_frame_result->coding_data_bit_size = + encode_frame_result->coding_data_byte_size * 8; + encode_frame_result->show_idx = encode_frame_info->show_idx; + encode_frame_result->coding_idx = encode_frame_info->frame_coding_index; + assert(kRefFrameTypeMax == MAX_INTER_REF_FRAMES); + for (int i = 0; i < kRefFrameTypeMax; ++i) { + encode_frame_result->ref_frame_info.coding_indexes[i] = + encode_frame_info->ref_frame_coding_indexes[i]; + encode_frame_result->ref_frame_info.valid_list[i] = + encode_frame_info->ref_frame_valid_list[i]; + } + encode_frame_result->frame_type = + get_frame_type_from_update_type(encode_frame_info->update_type); + encode_frame_result->psnr = encode_frame_info->psnr; + encode_frame_result->sse = encode_frame_info->sse; + encode_frame_result->quantize_index = encode_frame_info->quantize_index; + update_partition_info(encode_frame_info->partition_info, + encode_frame_result->num_rows_4x4, + encode_frame_result->num_cols_4x4, + &encode_frame_result->partition_info[0]); + update_motion_vector_info(encode_frame_info->motion_vector_info, + encode_frame_result->num_rows_4x4, + encode_frame_result->num_cols_4x4, + &encode_frame_result->motion_vector_info[0], + kMotionVectorSubPixelPrecision); + update_frame_counts(&encode_frame_info->frame_counts, + &encode_frame_result->frame_counts); + if (encode_frame_result->frame_type == kFrameTypeAltRef) { + update_tpl_stats_info(encode_frame_info->tpl_stats_info, show_frame_count, + &encode_frame_result->tpl_stats_info[0]); + } + encode_frame_result_update_rq_history(&encode_frame_info->rq_history, + encode_frame_result); +} + +static void IncreaseGroupOfPictureIndex(GroupOfPicture *group_of_picture) { + ++group_of_picture->next_encode_frame_index; +} + +static int IsGroupOfPictureFinished(const GroupOfPicture &group_of_picture) { + return static_cast(group_of_picture.next_encode_frame_index) == + group_of_picture.encode_frame_list.size(); +} + +bool operator==(const RefFrameInfo &a, const RefFrameInfo &b) { + bool match = true; + for (int i = 0; i < kRefFrameTypeMax; ++i) { + match &= a.coding_indexes[i] == b.coding_indexes[i]; + match &= a.valid_list[i] == b.valid_list[i]; + } + return match; +} + +static void InitRefFrameInfo(RefFrameInfo *ref_frame_info) { + for (int i = 0; i < kRefFrameTypeMax; ++i) { + ref_frame_info->coding_indexes[i] = -1; + ref_frame_info->valid_list[i] = 0; + } +} + +// After finishing coding a frame, this function will update the coded frame +// into the ref_frame_info based on the frame_type and the coding_index. +static void PostUpdateRefFrameInfo(FrameType frame_type, int frame_coding_index, + RefFrameInfo *ref_frame_info) { + // This part is written based on the logics in vp9_configure_buffer_updates() + // and update_ref_frames() + int *ref_frame_coding_indexes = ref_frame_info->coding_indexes; + switch (frame_type) { + case kFrameTypeKey: + ref_frame_coding_indexes[kRefFrameTypeLast] = frame_coding_index; + ref_frame_coding_indexes[kRefFrameTypePast] = frame_coding_index; + ref_frame_coding_indexes[kRefFrameTypeFuture] = frame_coding_index; + break; + case kFrameTypeInter: + ref_frame_coding_indexes[kRefFrameTypeLast] = frame_coding_index; + break; + case kFrameTypeAltRef: + ref_frame_coding_indexes[kRefFrameTypeFuture] = frame_coding_index; + break; + case kFrameTypeOverlay: + // Reserve the past coding_index in the future slot. This logic is from + // update_ref_frames() with condition vp9_preserve_existing_gf() == 1 + // TODO(angiebird): Invetegate why we need this. + ref_frame_coding_indexes[kRefFrameTypeFuture] = + ref_frame_coding_indexes[kRefFrameTypePast]; + ref_frame_coding_indexes[kRefFrameTypePast] = frame_coding_index; + break; + case kFrameTypeGolden: + ref_frame_coding_indexes[kRefFrameTypePast] = frame_coding_index; + ref_frame_coding_indexes[kRefFrameTypeLast] = frame_coding_index; + break; + } + + // This part is written based on the logics in get_ref_frame_flags() but we + // rename the flags alt, golden to future, past respectively. Mark + // non-duplicated reference frames as valid. The priorities are + // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture. + const int last_index = ref_frame_coding_indexes[kRefFrameTypeLast]; + const int past_index = ref_frame_coding_indexes[kRefFrameTypePast]; + const int future_index = ref_frame_coding_indexes[kRefFrameTypeFuture]; + + int *ref_frame_valid_list = ref_frame_info->valid_list; + for (int ref_frame_idx = 0; ref_frame_idx < kRefFrameTypeMax; + ++ref_frame_idx) { + ref_frame_valid_list[ref_frame_idx] = 1; + } + + if (past_index == last_index) { + ref_frame_valid_list[kRefFrameTypePast] = 0; + } + + if (future_index == last_index) { + ref_frame_valid_list[kRefFrameTypeFuture] = 0; + } + + if (future_index == past_index) { + ref_frame_valid_list[kRefFrameTypeFuture] = 0; + } +} + +static void SetGroupOfPicture(int first_is_key_frame, int use_alt_ref, + int coding_frame_count, int first_show_idx, + int last_gop_use_alt_ref, int start_coding_index, + const RefFrameInfo &start_ref_frame_info, + GroupOfPicture *group_of_picture) { + // Clean up the state of previous group of picture. + group_of_picture->encode_frame_list.clear(); + group_of_picture->next_encode_frame_index = 0; + group_of_picture->show_frame_count = coding_frame_count - use_alt_ref; + group_of_picture->start_show_index = first_show_idx; + group_of_picture->start_coding_index = start_coding_index; + group_of_picture->first_is_key_frame = first_is_key_frame; + group_of_picture->use_alt_ref = use_alt_ref; + group_of_picture->last_gop_use_alt_ref = last_gop_use_alt_ref; + + // We need to make a copy of start reference frame info because we + // use it to simulate the ref frame update. + RefFrameInfo ref_frame_info = start_ref_frame_info; + + { + // First frame in the group of pictures. It's either key frame or show inter + // frame. + EncodeFrameInfo encode_frame_info; + // Set frame_type + if (first_is_key_frame) { + encode_frame_info.frame_type = kFrameTypeKey; + } else { + if (last_gop_use_alt_ref) { + encode_frame_info.frame_type = kFrameTypeOverlay; + } else { + encode_frame_info.frame_type = kFrameTypeGolden; + } + } + + encode_frame_info.show_idx = first_show_idx; + encode_frame_info.coding_index = start_coding_index; + + encode_frame_info.ref_frame_info = ref_frame_info; + PostUpdateRefFrameInfo(encode_frame_info.frame_type, + encode_frame_info.coding_index, &ref_frame_info); + + group_of_picture->encode_frame_list.push_back(encode_frame_info); + } + + const int show_frame_count = coding_frame_count - use_alt_ref; + if (use_alt_ref) { + // If there is alternate reference, it is always coded at the second place. + // Its show index (or timestamp) is at the last of this group + EncodeFrameInfo encode_frame_info; + encode_frame_info.frame_type = kFrameTypeAltRef; + encode_frame_info.show_idx = first_show_idx + show_frame_count; + encode_frame_info.coding_index = start_coding_index + 1; + + encode_frame_info.ref_frame_info = ref_frame_info; + PostUpdateRefFrameInfo(encode_frame_info.frame_type, + encode_frame_info.coding_index, &ref_frame_info); + + group_of_picture->encode_frame_list.push_back(encode_frame_info); + } + + // Encode the rest show inter frames. + for (int i = 1; i < show_frame_count; ++i) { + EncodeFrameInfo encode_frame_info; + encode_frame_info.frame_type = kFrameTypeInter; + encode_frame_info.show_idx = first_show_idx + i; + encode_frame_info.coding_index = start_coding_index + use_alt_ref + i; + + encode_frame_info.ref_frame_info = ref_frame_info; + PostUpdateRefFrameInfo(encode_frame_info.frame_type, + encode_frame_info.coding_index, &ref_frame_info); + + group_of_picture->encode_frame_list.push_back(encode_frame_info); + } +} + +// Gets group of picture information from VP9's decision, and update +// |group_of_picture| accordingly. +// This is called at the starting of encoding of each group of picture. +static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index, + const RefFrameInfo &start_ref_frame_info, + GroupOfPicture *group_of_picture) { + int first_is_key_frame; + int use_alt_ref; + int coding_frame_count; + int first_show_idx; + int last_gop_use_alt_ref; + vp9_get_next_group_of_picture(cpi, &first_is_key_frame, &use_alt_ref, + &coding_frame_count, &first_show_idx, + &last_gop_use_alt_ref); + SetGroupOfPicture(first_is_key_frame, use_alt_ref, coding_frame_count, + first_show_idx, last_gop_use_alt_ref, start_coding_index, + start_ref_frame_info, group_of_picture); +} + +#define SET_STRUCT_VALUE(config, structure, ret, field) \ + do { \ + if (strcmp(config.name, #field) == 0) { \ + structure->field = atoi(config.value); \ + ret = 1; \ + } \ + } while (false) + +static void UpdateEncodeConfig(const EncodeConfig &config, + VP9EncoderConfig *oxcf) { + int ret = 0; + SET_STRUCT_VALUE(config, oxcf, ret, key_freq); + SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmin_section); + SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmax_section); + SET_STRUCT_VALUE(config, oxcf, ret, under_shoot_pct); + SET_STRUCT_VALUE(config, oxcf, ret, over_shoot_pct); + SET_STRUCT_VALUE(config, oxcf, ret, max_threads); + SET_STRUCT_VALUE(config, oxcf, ret, frame_parallel_decoding_mode); + SET_STRUCT_VALUE(config, oxcf, ret, tile_columns); + SET_STRUCT_VALUE(config, oxcf, ret, arnr_max_frames); + SET_STRUCT_VALUE(config, oxcf, ret, arnr_strength); + SET_STRUCT_VALUE(config, oxcf, ret, lag_in_frames); + SET_STRUCT_VALUE(config, oxcf, ret, encode_breakout); + SET_STRUCT_VALUE(config, oxcf, ret, enable_tpl_model); + SET_STRUCT_VALUE(config, oxcf, ret, enable_auto_arf); + if (strcmp(config.name, "rc_mode") == 0) { + int rc_mode = atoi(config.value); + if (rc_mode >= VPX_VBR && rc_mode <= VPX_Q) { + oxcf->rc_mode = (enum vpx_rc_mode)rc_mode; + ret = 1; + } else { + fprintf(stderr, "Invalid rc_mode value: %d\n", rc_mode); + } + } + SET_STRUCT_VALUE(config, oxcf, ret, cq_level); + if (ret == 0) { + fprintf(stderr, "Ignored unsupported encode_config %s\n", config.name); + } +} + +static VP9EncoderConfig GetEncodeConfig( + int frame_width, int frame_height, vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, int target_level, + vpx_enc_pass enc_pass, + const std::vector &encode_config_list) { + VP9EncoderConfig oxcf = vp9_get_encoder_config( + frame_width, frame_height, frame_rate, target_bitrate, encode_speed, + target_level, enc_pass); + for (const auto &config : encode_config_list) { + UpdateEncodeConfig(config, &oxcf); + } + if (enc_pass == VPX_RC_FIRST_PASS) { + oxcf.lag_in_frames = 0; + } + oxcf.use_simple_encode_api = 1; + return oxcf; +} + +SimpleEncode::SimpleEncode(int frame_width, int frame_height, + int frame_rate_num, int frame_rate_den, + int target_bitrate, int num_frames, int target_level, + const char *infile_path, const char *outfile_path) { + impl_ptr_ = std::unique_ptr(new EncodeImpl()); + frame_width_ = frame_width; + frame_height_ = frame_height; + frame_rate_num_ = frame_rate_num; + frame_rate_den_ = frame_rate_den; + target_bitrate_ = target_bitrate; + num_frames_ = num_frames; + encode_speed_ = 0; + target_level_ = target_level; + + frame_coding_index_ = 0; + show_frame_count_ = 0; + + key_frame_group_index_ = 0; + key_frame_group_size_ = 0; + + // TODO(angirbid): Should we keep a file pointer here or keep the file_path? + assert(infile_path != nullptr); + in_file_ = fopen(infile_path, "r"); + if (outfile_path != nullptr) { + out_file_ = fopen(outfile_path, "w"); + } else { + out_file_ = nullptr; + } + impl_ptr_->cpi = nullptr; + impl_ptr_->img_fmt = VPX_IMG_FMT_I420; + + InitRefFrameInfo(&ref_frame_info_); +} + +void SimpleEncode::SetEncodeSpeed(int encode_speed) { + encode_speed_ = encode_speed; +} + +StatusCode SimpleEncode::SetEncodeConfig(const char *name, const char *value) { + if (name == nullptr || value == nullptr) { + fprintf(stderr, "SetEncodeConfig: null pointer, name %p value %p\n", name, + value); + return StatusError; + } + EncodeConfig config; + snprintf(config.name, ENCODE_CONFIG_BUF_SIZE, "%s", name); + snprintf(config.value, ENCODE_CONFIG_BUF_SIZE, "%s", value); + impl_ptr_->encode_config_list.push_back(config); + return StatusOk; +} + +StatusCode SimpleEncode::DumpEncodeConfigs(int pass, FILE *fp) { + if (fp == nullptr) { + fprintf(stderr, "DumpEncodeConfigs: null pointer, fp %p\n", fp); + return StatusError; + } + vpx_enc_pass enc_pass; + if (pass == 1) { + enc_pass = VPX_RC_FIRST_PASS; + } else { + enc_pass = VPX_RC_LAST_PASS; + } + const vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, enc_pass, impl_ptr_->encode_config_list); + vp9_dump_encoder_config(&oxcf, fp); + return StatusOk; +} + +void SimpleEncode::ComputeFirstPassStats() { + vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list); + impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead; + int i; + int use_highbitdepth = 0; + const int num_rows_16x16 = get_num_unit_16x16(frame_height_); + const int num_cols_16x16 = get_num_unit_16x16(frame_width_); +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth; +#endif + vpx_image_t img; + vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); + rewind(in_file_); + impl_ptr_->first_pass_stats.clear(); + for (i = 0; i < num_frames_; ++i) { + assert(!vp9_lookahead_full(lookahead)); + if (img_read(&img, in_file_)) { + int next_show_idx = vp9_lookahead_next_show_idx(lookahead); + int64_t ts_start = + timebase_units_to_ticks(&oxcf.g_timebase_in_ts, next_show_idx); + int64_t ts_end = + timebase_units_to_ticks(&oxcf.g_timebase_in_ts, next_show_idx + 1); + YV12_BUFFER_CONFIG sd; + image2yuvconfig(&img, &sd); + vp9_lookahead_push(lookahead, &sd, ts_start, ts_end, use_highbitdepth, 0); + { + int64_t time_stamp; + int64_t time_end; + int flush = 1; // Makes vp9_get_compressed_data process a frame + size_t size; + unsigned int frame_flags = 0; + ENCODE_FRAME_RESULT encode_frame_info; + vp9_init_encode_frame_result(&encode_frame_info); + // TODO(angiebird): Call vp9_first_pass directly + vp9_get_compressed_data(impl_ptr_->cpi, &frame_flags, &size, nullptr, + &time_stamp, &time_end, flush, + &encode_frame_info); + // vp9_get_compressed_data only generates first pass stats not + // compresses data + assert(size == 0); + // Get vp9 first pass motion vector info. + std::vector mv_info(num_rows_16x16 * num_cols_16x16); + update_motion_vector_info( + impl_ptr_->cpi->fp_motion_vector_info, num_rows_16x16, + num_cols_16x16, mv_info.data(), kMotionVectorFullPixelPrecision); + fp_motion_vector_info_.push_back(mv_info); + } + impl_ptr_->first_pass_stats.push_back( + vp9_get_frame_stats(&impl_ptr_->cpi->twopass)); + } + } + // TODO(angiebird): Store the total_stats apart form first_pass_stats + impl_ptr_->first_pass_stats.push_back( + vp9_get_total_stats(&impl_ptr_->cpi->twopass)); + vp9_end_first_pass(impl_ptr_->cpi); + + // Generate key_frame_map based on impl_ptr_->first_pass_stats. + key_frame_map_ = ComputeKeyFrameMap(); + + free_encoder(impl_ptr_->cpi); + impl_ptr_->cpi = nullptr; + rewind(in_file_); + vpx_img_free(&img); +} + +std::vector> SimpleEncode::ObserveFirstPassStats() { + std::vector> output_stats; + // TODO(angiebird): This function make several assumptions of + // FIRSTPASS_STATS. 1) All elements in FIRSTPASS_STATS are double except the + // last one. 2) The last entry of first_pass_stats is the total_stats. + // Change the code structure, so that we don't have to make these assumptions + + // Note the last entry of first_pass_stats is the total_stats, we don't need + // it. + for (size_t i = 0; i < impl_ptr_->first_pass_stats.size() - 1; ++i) { + double *buf_start = + reinterpret_cast(&impl_ptr_->first_pass_stats[i]); + // We use - 1 here because the last member in FIRSTPASS_STATS is not double + double *buf_end = + buf_start + sizeof(impl_ptr_->first_pass_stats[i]) / sizeof(*buf_end) - + 1; + std::vector this_stats(buf_start, buf_end); + output_stats.push_back(this_stats); + } + return output_stats; +} + +std::vector> +SimpleEncode::ObserveFirstPassMotionVectors() { + return fp_motion_vector_info_; +} + +void SimpleEncode::SetExternalGroupOfPicturesMap(int *gop_map, + int gop_map_size) { + for (int i = 0; i < gop_map_size; ++i) { + gop_map_.push_back(gop_map[i]); + } + // The following will check and modify gop_map_ to make sure the + // gop_map_ satisfies the constraints. + // 1) Each key frame position should be at the start of a gop. + // 2) The last gop should not use an alt ref. + assert(gop_map_.size() == key_frame_map_.size()); + int last_gop_start = 0; + for (int i = 0; static_cast(i) < gop_map_.size(); ++i) { + if (key_frame_map_[i] == 1 && gop_map_[i] == 0) { + fprintf(stderr, "Add an extra gop start at show_idx %d\n", i); + // Insert a gop start at key frame location. + gop_map_[i] |= kGopMapFlagStart; + gop_map_[i] |= kGopMapFlagUseAltRef; + } + if (gop_map_[i] & kGopMapFlagStart) { + last_gop_start = i; + } + } + if (gop_map_[last_gop_start] & kGopMapFlagUseAltRef) { + fprintf(stderr, + "Last group of pictures starting at show_idx %d shouldn't use alt " + "ref\n", + last_gop_start); + gop_map_[last_gop_start] &= ~kGopMapFlagUseAltRef; + } +} + +std::vector SimpleEncode::ObserveExternalGroupOfPicturesMap() { + return gop_map_; +} + +template +T *GetVectorData(const std::vector &v) { + if (v.empty()) { + return nullptr; + } + return const_cast(v.data()); +} + +static GOP_COMMAND GetGopCommand(const std::vector &gop_map, + int start_show_index) { + GOP_COMMAND gop_command; + if (static_cast(start_show_index) < gop_map.size()) { + assert((gop_map[start_show_index] & kGopMapFlagStart) != 0); + int end_show_index = start_show_index + 1; + // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is + // the start of a gop. + while (static_cast(end_show_index) < gop_map.size() && + (gop_map[end_show_index] & kGopMapFlagStart) == 0) { + ++end_show_index; + } + const int show_frame_count = end_show_index - start_show_index; + int use_alt_ref = (gop_map[start_show_index] & kGopMapFlagUseAltRef) != 0; + if (static_cast(end_show_index) == gop_map.size()) { + // This is the last gop group, there must be no altref. + use_alt_ref = 0; + } + gop_command_on(&gop_command, show_frame_count, use_alt_ref); + } else { + gop_command_off(&gop_command); + } + return gop_command; +} + +void SimpleEncode::StartEncode() { + assert(impl_ptr_->first_pass_stats.size() > 0); + vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + + vpx_fixed_buf_t stats; + stats.buf = GetVectorData(impl_ptr_->first_pass_stats); + stats.sz = sizeof(impl_ptr_->first_pass_stats[0]) * + impl_ptr_->first_pass_stats.size(); + + vp9_set_first_pass_stats(&oxcf, &stats); + assert(impl_ptr_->cpi == nullptr); + impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_, + frame_height_, 1); + + frame_coding_index_ = 0; + show_frame_count_ = 0; + + assert(impl_ptr_->cpi != nullptr); + FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&impl_ptr_->cpi->twopass, screen_area); + + UpdateKeyFrameGroup(show_frame_count_); + + const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_); + encode_command_set_gop_command(&impl_ptr_->cpi->encode_command, gop_command); + UpdateGroupOfPicture(impl_ptr_->cpi, frame_coding_index_, ref_frame_info_, + &group_of_picture_); + rewind(in_file_); + + if (out_file_ != nullptr) { + const char *fourcc = "VP90"; + // In SimpleEncode, we use time_base = 1 / TICKS_PER_SEC. + // Based on that, the ivf_timestamp for each image is set to + // show_idx * TICKS_PER_SEC / frame_rate + // such that each image's actual timestamp in seconds can be computed as + // ivf_timestamp * time_base == show_idx / frame_rate + // TODO(angiebird): 1) Add unit test for ivf timestamp. + // 2) Simplify the frame_rate setting process. + vpx_rational_t time_base = make_vpx_rational(1, TICKS_PER_SEC); + ivf_write_file_header_with_video_info(out_file_, *(const uint32_t *)fourcc, + num_frames_, frame_width_, + frame_height_, time_base); + } +} + +void SimpleEncode::EndEncode() { + free_encoder(impl_ptr_->cpi); + impl_ptr_->cpi = nullptr; + vpx_img_free(&impl_ptr_->tmp_img); + rewind(in_file_); +} + +void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) { + const VP9_COMP *cpi = impl_ptr_->cpi; + key_frame_group_index_ = 0; + key_frame_group_size_ = vp9_get_frames_to_next_key( + &cpi->oxcf, &cpi->twopass, key_frame_show_index, cpi->rc.min_gf_interval); + assert(key_frame_group_size_ > 0); + // Init the reference frame info when a new key frame group appears. + InitRefFrameInfo(&ref_frame_info_); +} + +void SimpleEncode::PostUpdateKeyFrameGroupIndex(FrameType frame_type) { + if (frame_type != kFrameTypeAltRef) { + // key_frame_group_index_ only counts show frames + ++key_frame_group_index_; + } +} + +int SimpleEncode::GetKeyFrameGroupSize() const { return key_frame_group_size_; } + +GroupOfPicture SimpleEncode::ObserveGroupOfPicture() const { + return group_of_picture_; +} + +EncodeFrameInfo SimpleEncode::GetNextEncodeFrameInfo() const { + return group_of_picture_ + .encode_frame_list[group_of_picture_.next_encode_frame_index]; +} + +void SimpleEncode::PostUpdateState( + const EncodeFrameResult &encode_frame_result) { + // This function needs to be called before the increament of + // frame_coding_index_ + PostUpdateRefFrameInfo(encode_frame_result.frame_type, frame_coding_index_, + &ref_frame_info_); + ++frame_coding_index_; + if (encode_frame_result.frame_type != kFrameTypeAltRef) { + // Only kFrameTypeAltRef is not a show frame + ++show_frame_count_; + } + + PostUpdateKeyFrameGroupIndex(encode_frame_result.frame_type); + if (key_frame_group_index_ == key_frame_group_size_) { + UpdateKeyFrameGroup(show_frame_count_); + } + + IncreaseGroupOfPictureIndex(&group_of_picture_); + if (IsGroupOfPictureFinished(group_of_picture_)) { + const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_); + encode_command_set_gop_command(&impl_ptr_->cpi->encode_command, + gop_command); + // This function needs to be called after ref_frame_info_ is updated + // properly in PostUpdateRefFrameInfo() and UpdateKeyFrameGroup(). + UpdateGroupOfPicture(impl_ptr_->cpi, frame_coding_index_, ref_frame_info_, + &group_of_picture_); + } +} + +void SimpleEncode::EncodeFrame(EncodeFrameResult *encode_frame_result) { + VP9_COMP *cpi = impl_ptr_->cpi; + struct lookahead_ctx *lookahead = cpi->lookahead; + int use_highbitdepth = 0; +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth = cpi->common.use_highbitdepth; +#endif + // The lookahead's size is set to oxcf->lag_in_frames. + // We want to fill lookahead to it's max capacity if possible so that the + // encoder can construct alt ref frame in time. + // In the other words, we hope vp9_get_compressed_data to encode a frame + // every time in the function + while (!vp9_lookahead_full(lookahead)) { + // TODO(angiebird): Check whether we can move this file read logics to + // lookahead + if (img_read(&impl_ptr_->tmp_img, in_file_)) { + int next_show_idx = vp9_lookahead_next_show_idx(lookahead); + int64_t ts_start = + timebase_units_to_ticks(&cpi->oxcf.g_timebase_in_ts, next_show_idx); + int64_t ts_end = timebase_units_to_ticks(&cpi->oxcf.g_timebase_in_ts, + next_show_idx + 1); + YV12_BUFFER_CONFIG sd; + image2yuvconfig(&impl_ptr_->tmp_img, &sd); + vp9_lookahead_push(lookahead, &sd, ts_start, ts_end, use_highbitdepth, 0); + } else { + break; + } + } + + if (init_encode_frame_result(encode_frame_result, frame_width_, frame_height_, + impl_ptr_->img_fmt)) { + int64_t time_stamp; + int64_t time_end; + int flush = 1; // Make vp9_get_compressed_data encode a frame + unsigned int frame_flags = 0; + ENCODE_FRAME_RESULT encode_frame_info; + vp9_init_encode_frame_result(&encode_frame_info); + ImageBuffer_to_IMAGE_BUFFER(encode_frame_result->coded_frame, + &encode_frame_info.coded_frame); + vp9_get_compressed_data(cpi, &frame_flags, + &encode_frame_result->coding_data_byte_size, + encode_frame_result->coding_data.get(), &time_stamp, + &time_end, flush, &encode_frame_info); + if (out_file_ != nullptr) { + ivf_write_frame_header(out_file_, time_stamp, + encode_frame_result->coding_data_byte_size); + fwrite(encode_frame_result->coding_data.get(), 1, + encode_frame_result->coding_data_byte_size, out_file_); + } + + // vp9_get_compressed_data is expected to encode a frame every time, so the + // data size should be greater than zero. + if (encode_frame_result->coding_data_byte_size <= 0) { + fprintf(stderr, "Coding data size <= 0\n"); + abort(); + } + const size_t max_coding_data_byte_size = + get_max_coding_data_byte_size(frame_width_, frame_height_); + if (encode_frame_result->coding_data_byte_size > + max_coding_data_byte_size) { + fprintf(stderr, "Coding data size exceeds the maximum.\n"); + abort(); + } + + const GroupOfPicture group_of_picture = this->ObserveGroupOfPicture(); + const int show_frame_count = group_of_picture.show_frame_count; + update_encode_frame_result(encode_frame_result, show_frame_count, + &encode_frame_info); + PostUpdateState(*encode_frame_result); + } else { + // TODO(angiebird): Clean up encode_frame_result. + fprintf(stderr, "init_encode_frame_result() failed.\n"); + this->EndEncode(); + } +} + +void SimpleEncode::EncodeFrameWithQuantizeIndex( + EncodeFrameResult *encode_frame_result, int quantize_index) { + encode_command_set_external_quantize_index(&impl_ptr_->cpi->encode_command, + quantize_index); + EncodeFrame(encode_frame_result); + encode_command_reset_external_quantize_index(&impl_ptr_->cpi->encode_command); +} + +void SimpleEncode::EncodeFrameWithTargetFrameBits( + EncodeFrameResult *encode_frame_result, int target_frame_bits, + double percent_diff) { + encode_command_set_target_frame_bits(&impl_ptr_->cpi->encode_command, + target_frame_bits, percent_diff); + EncodeFrame(encode_frame_result); + encode_command_reset_target_frame_bits(&impl_ptr_->cpi->encode_command); +} + +static int GetCodingFrameNumFromGopMap(const std::vector &gop_map) { + int start_show_index = 0; + int coding_frame_count = 0; + while (static_cast(start_show_index) < gop_map.size()) { + const GOP_COMMAND gop_command = GetGopCommand(gop_map, start_show_index); + start_show_index += gop_command.show_frame_count; + coding_frame_count += gop_command_coding_frame_count(&gop_command); + } + assert(static_cast(start_show_index) == gop_map.size()); + return coding_frame_count; +} + +int SimpleEncode::GetCodingFrameNum() const { + assert(impl_ptr_->first_pass_stats.size() > 0); + if (gop_map_.size() > 0) { + return GetCodingFrameNumFromGopMap(gop_map_); + } + + // These are the default settings for now. + TWO_PASS twopass; + const int multi_layer_arf = 0; + const int allow_alt_ref = 1; + vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + FRAME_INFO frame_info = vp9_get_frame_info(&oxcf); + fps_init_first_pass_info(&twopass.first_pass_info, + GetVectorData(impl_ptr_->first_pass_stats), + num_frames_); + unsigned int screen_area = frame_info.frame_width * frame_info.frame_height; + vp9_init_vizier_params(&twopass, screen_area); + return vp9_get_coding_frame_num(&oxcf, &twopass, &frame_info, multi_layer_arf, + allow_alt_ref); +} + +std::vector SimpleEncode::ComputeKeyFrameMap() const { + // The last entry of first_pass_stats is the overall stats. + assert(impl_ptr_->first_pass_stats.size() == + static_cast(num_frames_) + 1); + vpx_rational_t frame_rate = + make_vpx_rational(frame_rate_num_, frame_rate_den_); + const VP9EncoderConfig oxcf = GetEncodeConfig( + frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_, + target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list); + TWO_PASS twopass; + fps_init_first_pass_info(&twopass.first_pass_info, + GetVectorData(impl_ptr_->first_pass_stats), + num_frames_); + std::vector key_frame_map(num_frames_, 0); + vp9_get_key_frame_map(&oxcf, &twopass, GetVectorData(key_frame_map)); + return key_frame_map; +} + +std::vector SimpleEncode::ObserveKeyFrameMap() const { + return key_frame_map_; +} + +uint64_t SimpleEncode::GetFramePixelCount() const { + assert(frame_width_ % 2 == 0); + assert(frame_height_ % 2 == 0); + switch (impl_ptr_->img_fmt) { + case VPX_IMG_FMT_I420: return frame_width_ * frame_height_ * 3 / 2; + case VPX_IMG_FMT_I422: return frame_width_ * frame_height_ * 2; + case VPX_IMG_FMT_I444: return frame_width_ * frame_height_ * 3; + case VPX_IMG_FMT_I440: return frame_width_ * frame_height_ * 2; + case VPX_IMG_FMT_I42016: return frame_width_ * frame_height_ * 3 / 2; + case VPX_IMG_FMT_I42216: return frame_width_ * frame_height_ * 2; + case VPX_IMG_FMT_I44416: return frame_width_ * frame_height_ * 3; + case VPX_IMG_FMT_I44016: return frame_width_ * frame_height_ * 2; + default: return 0; + } +} + +SimpleEncode::~SimpleEncode() { + if (in_file_ != nullptr) { + fclose(in_file_); + } + if (out_file_ != nullptr) { + fclose(out_file_); + } +} + +} // namespace vp9 diff --git a/media/libvpx/libvpx/vp9/simple_encode.h b/media/libvpx/libvpx/vp9/simple_encode.h new file mode 100644 index 0000000000..d610a5e159 --- /dev/null +++ b/media/libvpx/libvpx/vp9/simple_encode.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_SIMPLE_ENCODE_H_ +#define VPX_VP9_SIMPLE_ENCODE_H_ + +#include +#include +#include +#include +#include + +namespace vp9 { + +enum StatusCode { + StatusOk = 0, + StatusError, +}; + +// TODO(angiebird): Add description for each frame type. +enum FrameType { + kFrameTypeKey = 0, + kFrameTypeInter = 1, + kFrameTypeAltRef = 2, + kFrameTypeOverlay = 3, + kFrameTypeGolden = 4, +}; + +// TODO(angiebird): Add description for each reference frame type. +// This enum numbers have to be contiguous and start from zero except +// kNoneRefFrame. +enum RefFrameType { + kRefFrameTypeLast = 0, + kRefFrameTypePast = 1, + kRefFrameTypeFuture = 2, + kRefFrameTypeMax = 3, + kRefFrameTypeNone = -1, +}; + +enum VP9_LEVEL { + LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, + LEVEL_1 = 10, + LEVEL_1_1 = 11, + LEVEL_2 = 20, + LEVEL_2_1 = 21, + LEVEL_3 = 30, + LEVEL_3_1 = 31, + LEVEL_4 = 40, + LEVEL_4_1 = 41, + LEVEL_5 = 50, + LEVEL_5_1 = 51, + LEVEL_5_2 = 52, + LEVEL_6 = 60, + LEVEL_6_1 = 61, + LEVEL_6_2 = 62, + LEVEL_MAX = 255 +}; + +enum GopMapFlag { + kGopMapFlagStart = + 1 << 0, // Indicate this location is the start of a group of pictures. + kGopMapFlagUseAltRef = + 1 << 1, // Indicate this group of pictures will use an alt ref. Only set + // this flag when kGopMapFlagStart is set. +}; + +// The frame is split to 4x4 blocks. +// This structure contains the information of each 4x4 block. +struct PartitionInfo { + int row; // row pixel offset of current 4x4 block + int column; // column pixel offset of current 4x4 block + int row_start; // row pixel offset of the start of the prediction block + int column_start; // column pixel offset of the start of the prediction block + int width; // prediction block width + int height; // prediction block height +}; + +constexpr int kMotionVectorSubPixelPrecision = 8; +constexpr int kMotionVectorFullPixelPrecision = 1; + +// In the first pass. The frame is split to 16x16 blocks. +// This structure contains the information of each 16x16 block. +// In the second pass. The frame is split to 4x4 blocks. +// This structure contains the information of each 4x4 block. +struct MotionVectorInfo { + // Number of valid motion vectors, always 0 if this block is in the key frame. + // For inter frames, it could be 1 or 2. + int mv_count; + // The reference frame for motion vectors. If the second motion vector does + // not exist (mv_count = 1), the reference frame is kNoneRefFrame. + // Otherwise, the reference frame is either kRefFrameTypeLast, or + // kRefFrameTypePast, or kRefFrameTypeFuture. + RefFrameType ref_frame[2]; + // The row offset of motion vectors in the unit of pixel. + // If the second motion vector does not exist, the value is 0. + double mv_row[2]; + // The column offset of motion vectors in the unit of pixel. + // If the second motion vector does not exist, the value is 0. + double mv_column[2]; +}; + +// Accumulated tpl stats of all blocks in one frame. +// For each frame, the tpl stats are computed per 32x32 block. +struct TplStatsInfo { + // Intra complexity: the sum of absolute transform difference (SATD) of + // intra predicted residuals. + int64_t intra_cost; + // Inter complexity: the SATD of inter predicted residuals. + int64_t inter_cost; + // Motion compensated information flow. It measures how much information + // is propagated from the current frame to other frames. + int64_t mc_flow; + // Motion compensated dependency cost. It equals to its own intra_cost + // plus the mc_flow. + int64_t mc_dep_cost; + // Motion compensated reference cost. + int64_t mc_ref_cost; +}; + +struct RefFrameInfo { + int coding_indexes[kRefFrameTypeMax]; + + // Indicate whether the reference frames are available or not. + // When the reference frame type is not valid, it means either the to-be-coded + // frame is a key frame or the reference frame already appears in other + // reference frame type. vp9 always keeps three types of reference frame + // available. However, the duplicated reference frames will not be + // chosen by the encoder. The priorities of choosing reference frames are + // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture. + // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the + // same frame, kRefFrameTypePast will be set to invalid. + // 1: the ref frame type is available 0: the ref frame type is not available + int valid_list[kRefFrameTypeMax]; +}; + +bool operator==(const RefFrameInfo &a, const RefFrameInfo &b); + +struct EncodeFrameInfo { + int show_idx; + + // Each show or no show frame is assigned with a coding index based on its + // coding order (starting from zero) in the coding process of the entire + // video. The coding index for each frame is unique. + int coding_index; + RefFrameInfo ref_frame_info; + FrameType frame_type; +}; + +// This structure is a copy of vp9 |nmv_component_counts|. +struct NewMotionvectorComponentCounts { + std::vector sign; + std::vector classes; + std::vector class0; + std::vector> bits; + std::vector> class0_fp; + std::vector fp; + std::vector class0_hp; + std::vector hp; +}; + +// This structure is a copy of vp9 |nmv_context_counts|. +struct NewMotionVectorContextCounts { + std::vector joints; + std::vector comps; +}; + +using UintArray2D = std::vector>; +using UintArray3D = std::vector>>; +using UintArray5D = std::vector< + std::vector>>>>; +using UintArray6D = std::vector>>>>>; + +// This structure is a copy of vp9 |tx_counts|. +struct TransformSizeCounts { + // Transform size found in blocks of partition size 32x32. + // First dimension: transform size contexts (2). + // Second dimension: transform size type (3: 32x32, 16x16, 8x8) + UintArray2D p32x32; + // Transform size found in blocks of partition size 16x16. + // First dimension: transform size contexts (2). + // Second dimension: transform size type (2: 16x16, 8x8) + UintArray2D p16x16; + // Transform size found in blocks of partition size 8x8. + // First dimension: transform size contexts (2). + // Second dimension: transform size type (1: 8x8) + UintArray2D p8x8; + // Overall transform size count. + std::vector tx_totals; +}; + +// This structure is a copy of vp9 |FRAME_COUNTS|. +struct FrameCounts { + // Intra prediction mode for luma plane. First dimension: block size (4). + // Second dimension: intra prediction mode (10). + UintArray2D y_mode; + // Intra prediction mode for chroma plane. First and second dimension: + // intra prediction mode (10). + UintArray2D uv_mode; + // Partition type. First dimension: partition contexts (16). + // Second dimension: partition type (4). + UintArray2D partition; + // Transform coefficient. + UintArray6D coef; + // End of block (the position of the last non-zero transform coefficient) + UintArray5D eob_branch; + // Interpolation filter type. First dimension: switchable filter contexts (4). + // Second dimension: filter types (3). + UintArray2D switchable_interp; + // Inter prediction mode (the motion vector type). + // First dimension: inter mode contexts (7). + // Second dimension: mode type (4). + UintArray2D inter_mode; + // Block is intra or inter predicted. First dimension: contexts (4). + // Second dimension: type (0 for intra, 1 for inter). + UintArray2D intra_inter; + // Block is compound predicted (predicted from average of two blocks). + // First dimension: contexts (5). + // Second dimension: type (0 for single, 1 for compound prediction). + UintArray2D comp_inter; + // Type of the reference frame. Only one reference frame. + // First dimension: context (5). Second dimension: context (2). + // Third dimension: count (2). + UintArray3D single_ref; + // Type of the two reference frames. + // First dimension: context (5). Second dimension: count (2). + UintArray2D comp_ref; + // Block skips transform and quantization, uses prediction as reconstruction. + // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip). + UintArray2D skip; + // Transform size. + TransformSizeCounts tx; + // New motion vector. + NewMotionVectorContextCounts mv; +}; + +struct ImageBuffer { + // The image data is stored in raster order, + // i.e. image[plane][r][c] = + // plane_buffer[plane][r * plane_width[plane] + plane_height[plane]]. + std::unique_ptr plane_buffer[3]; + int plane_width[3]; + int plane_height[3]; +}; + +void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file); + +struct EncodeFrameResult { + int show_idx; + FrameType frame_type; + int coding_idx; + RefFrameInfo ref_frame_info; + size_t coding_data_bit_size; + size_t coding_data_byte_size; + // The EncodeFrame will allocate a buffer, write the coding data into the + // buffer and give the ownership of the buffer to coding_data. + std::unique_ptr coding_data; + double psnr; + uint64_t sse; + int quantize_index; + FrameCounts frame_counts; + int num_rows_4x4; // number of row units, in size of 4. + int num_cols_4x4; // number of column units, in size of 4. + // A vector of the partition information of the frame. + // The number of elements is |num_rows_4x4| * |num_cols_4x4|. + // The frame is divided 4x4 blocks of |num_rows_4x4| rows and + // |num_cols_4x4| columns. + // Each 4x4 block contains the current pixel position (|row|, |column|), + // the start pixel position of the partition (|row_start|, |column_start|), + // and the |width|, |height| of the partition. + // The current pixel position can be the same as the start pixel position + // if the 4x4 block is the top-left block in the partition. Otherwise, they + // are different. + // Within the same partition, all 4x4 blocks have the same |row_start|, + // |column_start|, |width| and |height|. + // For example, if the frame is partitioned to a 32x32 block, + // starting at (0, 0). Then, there're 64 4x4 blocks within this partition. + // They all have the same |row_start|, |column_start|, |width|, |height|, + // which can be used to figure out the start of the current partition and + // the start of the next partition block. + // Horizontal next: |column_start| + |width|, + // Vertical next: |row_start| + |height|. + std::vector partition_info; + // A vector of the motion vector information of the frame. + // The number of elements is |num_rows_4x4| * |num_cols_4x4|. + // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and + // |num_cols_4x4| columns. + // Each 4x4 block contains 0 motion vector if this is an intra predicted + // frame (for example, the key frame). If the frame is inter predicted, + // each 4x4 block contains either 1 or 2 motion vectors. + // Similar to partition info, all 4x4 blocks inside the same partition block + // share the same motion vector information. + std::vector motion_vector_info; + // A vector of the tpl stats information. + // The tpl stats measure the complexity of a frame, as well as the + // information propagated along the motion trajectory between frames, in + // the reference frame structure. + // The tpl stats could be used as a more accurate spatial and temporal + // complexity measure in addition to the first pass stats. + // The vector contains tpl stats for all show frames in a GOP. + // The tpl stats stored in the vector is according to the encoding order. + // For example, suppose there are N show frames for the current GOP. + // Then tpl_stats_info[0] stores the information of the first frame to be + // encoded for this GOP, i.e., the AltRef frame. + std::vector tpl_stats_info; + ImageBuffer coded_frame; + + // recode_count, q_index_history and rate_history are only available when + // EncodeFrameWithTargetFrameBits() is used. + int recode_count; + std::vector q_index_history; + std::vector rate_history; +}; + +struct GroupOfPicture { + // This list will be updated internally in StartEncode() and + // EncodeFrame()/EncodeFrameWithQuantizeIndex(). + // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be + // triggered when the coded frame is the last one in the previous group of + // pictures. + std::vector encode_frame_list; + + // Indicates the index of the next coding frame in encode_frame_list. + // In other words, EncodeFrameInfo of the next coding frame can be + // obtained with encode_frame_list[next_encode_frame_index]. + // Internally, next_encode_frame_index will be set to zero after the last + // frame of the group of pictures is coded. Otherwise, next_encode_frame_index + // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex() + // call. + int next_encode_frame_index; + + // Number of show frames in this group of pictures. + int show_frame_count; + + // The show index/timestamp of the earliest show frame in the group of + // pictures. + int start_show_index; + + // The coding index of the first coding frame in the group of pictures. + int start_coding_index; + + // Indicates whether this group of pictures starts with a key frame. + int first_is_key_frame; + + // Indicates whether this group of pictures uses an alt ref. + int use_alt_ref; + + // Indicates whether previous group of pictures used an alt ref. + int last_gop_use_alt_ref; +}; + +class SimpleEncode { + public: + // When outfile_path is set, the encoder will output the bitstream in ivf + // format. + SimpleEncode(int frame_width, int frame_height, int frame_rate_num, + int frame_rate_den, int target_bitrate, int num_frames, + int target_level, const char *infile_path, + const char *outfile_path = nullptr); + ~SimpleEncode(); + SimpleEncode(SimpleEncode &) = delete; + SimpleEncode &operator=(const SimpleEncode &) = delete; + + // Adjusts the encoder's coding speed. + // If this function is not called, the encoder will use default encode_speed + // 0. Call this function before ComputeFirstPassStats() if needed. + // The encode_speed is equivalent to --cpu-used of the vpxenc command. + // The encode_speed's range should be [0, 9]. + // Setting the encode_speed to a higher level will yield faster coding + // at the cost of lower compression efficiency. + void SetEncodeSpeed(int encode_speed); + + // Set encoder config + // The following configs in VP9EncoderConfig are allowed to change in this + // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each + // config's meaning. + // Configs in VP9EncoderConfig: Equivalent configs in ffmpeg: + // 1 key_freq -g + // 2 two_pass_vbrmin_section -minrate * 100LL / bit_rate + // 3 two_pass_vbrmax_section -maxrate * 100LL / bit_rate + // 4 under_shoot_pct -undershoot-pct + // 5 over_shoot_pct -overshoot-pct + // 6 max_threads -threads + // 7 frame_parallel_decoding_mode -frame-parallel + // 8 tile_column -tile-columns + // 9 arnr_max_frames -arnr-maxframes + // 10 arnr_strength -arnr-strength + // 11 lag_in_frames -rc_lookahead + // 12 encode_breakout -static-thresh + // 13 enable_tpl_model -enable-tpl + // 14 enable_auto_arf -auto-alt-ref + // 15 rc_mode + // Possible Settings: + // 0 - Variable Bit Rate (VPX_VBR) -b:v + // 1 - Constant Bit Rate (VPX_CBR) -b:v -minrate + // -maxrate + // two_pass_vbrmin_section == 100 i.e. bit_rate == minrate == maxrate + // two_pass_vbrmax_section == 100 + // 2 - Constrained Quality (VPX_CQ) -crf -b:v bit_rate + // 3 - Constant Quality (VPX_Q) -crf -b:v 0 + // See https://trac.ffmpeg.org/wiki/Encode/VP9 for more details. + // 16 cq_level see rc_mode for details. + StatusCode SetEncodeConfig(const char *name, const char *value); + + // A debug function that dumps configs from VP9EncoderConfig + // pass = 1: first pass, pass = 2: second pass + // fp: file pointer for dumping config + StatusCode DumpEncodeConfigs(int pass, FILE *fp); + + // Makes encoder compute the first pass stats and store it at + // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the + // first pass stats. + void ComputeFirstPassStats(); + + // Outputs the first pass stats represented by a 2-D vector. + // One can use the frame index at first dimension to retrieve the stats for + // each video frame. The stats of each video frame is a vector of 25 double + // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h + std::vector> ObserveFirstPassStats(); + + // Outputs the first pass motion vectors represented by a 2-D vector. + // One can use the frame index at first dimension to retrieve the mvs for + // each video frame. The frame is divided into 16x16 blocks. The number of + // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). + std::vector> ObserveFirstPassMotionVectors(); + + // Ouputs a copy of key_frame_map_, a binary vector with size equal to the + // number of show frames in the video. For each entry in the vector, 1 + // indicates the position is a key frame and 0 indicates it's not a key frame. + // This function should be called after ComputeFirstPassStats() + std::vector ObserveKeyFrameMap() const; + + // Sets group of pictures map for coding the entire video. + // Each entry in the gop_map corresponds to a show frame in the video. + // Therefore, the size of gop_map should equal to the number of show frames in + // the entire video. + // If a given entry's kGopMapFlagStart is set, it means this is the start of a + // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to + // indicate whether this gop use altref. + // If a given entry is zero, it means it's in the middle of a gop. + // This function should be called only once after ComputeFirstPassStats(), + // before StartEncode(). + // This API will check and modify the gop_map to satisfy the following + // constraints. + // 1) Each key frame position should be at the start of a gop. + // 2) The last gop should not use an alt ref. + void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size); + + // Observe the group of pictures map set through + // SetExternalGroupOfPicturesMap(). This function should be called after + // SetExternalGroupOfPicturesMap(). + std::vector ObserveExternalGroupOfPicturesMap(); + + // Initializes the encoder for actual encoding. + // This function should be called after ComputeFirstPassStats(). + void StartEncode(); + + // Frees the encoder. + // This function should be called after StartEncode() or EncodeFrame(). + void EndEncode(); + + // The key frame group size includes one key frame plus the number of + // following inter frames. Note that the key frame group size only counts the + // show frames. The number of no show frames like alternate refereces are not + // counted. + int GetKeyFrameGroupSize() const; + + // Provides the group of pictures that the next coding frame is in. + // Only call this function between StartEncode() and EndEncode() + GroupOfPicture ObserveGroupOfPicture() const; + + // Gets encode_frame_info for the next coding frame. + // Only call this function between StartEncode() and EndEncode() + EncodeFrameInfo GetNextEncodeFrameInfo() const; + + // Encodes a frame + // This function should be called after StartEncode() and before EndEncode(). + void EncodeFrame(EncodeFrameResult *encode_frame_result); + + // Encodes a frame with a specific quantize index. + // This function should be called after StartEncode() and before EndEncode(). + void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result, + int quantize_index); + + // Encode a frame with target frame bits usage. + // The encoder will find a quantize index to make the actual frame bits usage + // match the target. EncodeFrameWithTargetFrameBits() will recode the frame + // up to 7 times to find a q_index to make the actual_frame_bits satisfy the + // following inequality. |actual_frame_bits - target_frame_bits| * 100 / + // target_frame_bits + // <= percent_diff. + void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, + int target_frame_bits, + double percent_diff); + + // Gets the number of coding frames for the video. The coding frames include + // show frame and no show frame. + // This function should be called after ComputeFirstPassStats(). + int GetCodingFrameNum() const; + + // Gets the total number of pixels of YUV planes per frame. + uint64_t GetFramePixelCount() const; + + private: + // Compute the key frame locations of the video based on first pass stats. + // The results are returned as a binary vector with 1s indicating keyframes + // and 0s indicating non keyframes. + // It has to be called after impl_ptr_->first_pass_stats is computed. + std::vector ComputeKeyFrameMap() const; + + // Updates key_frame_group_size_, reset key_frame_group_index_ and init + // ref_frame_info_. + void UpdateKeyFrameGroup(int key_frame_show_index); + + // Update key_frame_group_index_. + void PostUpdateKeyFrameGroupIndex(FrameType frame_type); + + void PostUpdateState(const EncodeFrameResult &encode_frame_result); + + class EncodeImpl; + + int frame_width_; // frame width in pixels. + int frame_height_; // frame height in pixels. + int frame_rate_num_; + int frame_rate_den_; + int target_bitrate_; + int num_frames_; + int encode_speed_; + int target_level_; + + std::FILE *in_file_; + std::FILE *out_file_; + std::unique_ptr impl_ptr_; + + std::vector key_frame_map_; + std::vector gop_map_; + GroupOfPicture group_of_picture_; + + // The key frame group size includes one key frame plus the number of + // following inter frames. Note that the key frame group size only counts the + // show frames. The number of no show frames like alternate references are not + // counted. + int key_frame_group_size_; + + // The index for the to-be-coded show frame in the key frame group. + int key_frame_group_index_; + + // Each show or no show frame is assigned with a coding index based on its + // coding order (starting from zero) in the coding process of the entire + // video. The coding index of the to-be-coded frame. + int frame_coding_index_; + + // Number of show frames we have coded so far. + int show_frame_count_; + + // TODO(angiebird): Do we need to reset ref_frames_info_ when the next key + // frame appears? + // Reference frames info of the to-be-coded frame. + RefFrameInfo ref_frame_info_; + + // A 2-D vector of motion vector information of the frame collected + // from the first pass. The first dimension is the frame index. + // Each frame is divided into 16x16 blocks. The number of elements is + // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). + // Each 16x16 block contains 0 motion vector if this is an intra predicted + // frame (for example, the key frame). If the frame is inter predicted, + // each 16x16 block contains either 1 or 2 motion vectors. + // The first motion vector is always from the LAST_FRAME. + // The second motion vector is always from the GOLDEN_FRAME. + std::vector> fp_motion_vector_info_; +}; + +} // namespace vp9 + +#endif // VPX_VP9_SIMPLE_ENCODE_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_common.mk b/media/libvpx/libvpx/vp9/vp9_common.mk new file mode 100644 index 0000000000..5ef2f891a8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_common.mk @@ -0,0 +1,99 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +VP9_COMMON_SRCS-yes += vp9_common.mk +VP9_COMMON_SRCS-yes += vp9_iface_common.h +VP9_COMMON_SRCS-yes += vp9_iface_common.c +VP9_COMMON_SRCS-yes += common/vp9_ppflags.h +VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c +VP9_COMMON_SRCS-yes += common/vp9_blockd.c +# VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c +VP9_COMMON_SRCS-yes += common/vp9_entropy.c +VP9_COMMON_SRCS-yes += common/vp9_entropymode.c +VP9_COMMON_SRCS-yes += common/vp9_entropymv.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h +VP9_COMMON_SRCS-yes += common/vp9_idct.c +VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h +VP9_COMMON_SRCS-yes += common/vp9_blockd.h +VP9_COMMON_SRCS-yes += common/vp9_common.h +VP9_COMMON_SRCS-yes += common/vp9_entropy.h +VP9_COMMON_SRCS-yes += common/vp9_entropymode.h +VP9_COMMON_SRCS-yes += common/vp9_entropymv.h +VP9_COMMON_SRCS-yes += common/vp9_enums.h +VP9_COMMON_SRCS-yes += common/vp9_filter.h +VP9_COMMON_SRCS-yes += common/vp9_filter.c +VP9_COMMON_SRCS-yes += common/vp9_idct.h +VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h +VP9_COMMON_SRCS-yes += common/vp9_thread_common.h +VP9_COMMON_SRCS-yes += common/vp9_mv.h +VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h +VP9_COMMON_SRCS-yes += common/vp9_pred_common.h +VP9_COMMON_SRCS-yes += common/vp9_pred_common.c +VP9_COMMON_SRCS-yes += common/vp9_quant_common.h +VP9_COMMON_SRCS-yes += common/vp9_reconinter.h +VP9_COMMON_SRCS-yes += common/vp9_reconintra.h +VP9_COMMON_SRCS-yes += common/vp9_rtcd.c +VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.pl +VP9_COMMON_SRCS-yes += common/vp9_scale.h +VP9_COMMON_SRCS-yes += common/vp9_scale.c +VP9_COMMON_SRCS-yes += common/vp9_seg_common.h +VP9_COMMON_SRCS-yes += common/vp9_seg_common.c +VP9_COMMON_SRCS-yes += common/vp9_tile_common.h +VP9_COMMON_SRCS-yes += common/vp9_tile_common.c +VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c +VP9_COMMON_SRCS-yes += common/vp9_thread_common.c +VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c +VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h +VP9_COMMON_SRCS-yes += common/vp9_quant_common.c +VP9_COMMON_SRCS-yes += common/vp9_reconinter.c +VP9_COMMON_SRCS-yes += common/vp9_reconintra.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.h +VP9_COMMON_SRCS-yes += common/vp9_scan.c +VP9_COMMON_SRCS-yes += common/vp9_scan.h + +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h + +ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm +endif + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c +endif + +$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c new file mode 100644 index 0000000000..8df04f29f0 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -0,0 +1,2432 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/psnr.h" +#include "vpx_ports/static_assert.h" +#include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "./vpx_version.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx/vp8cx.h" +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_scale.h" +#include "vp9/vp9_cx_iface.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/vp9_cx_iface.h" +#include "vp9/vp9_iface_common.h" + +#include "vpx/vpx_tpl.h" + +typedef struct vp9_extracfg { + int cpu_used; // available cpu percentage in 1/16 + unsigned int enable_auto_alt_ref; + unsigned int noise_sensitivity; + unsigned int sharpness; + unsigned int static_thresh; + unsigned int tile_columns; + unsigned int tile_rows; + unsigned int enable_tpl_model; + unsigned int arnr_max_frames; + unsigned int arnr_strength; + unsigned int min_gf_interval; + unsigned int max_gf_interval; + vp8e_tuning tuning; + unsigned int cq_level; // constrained quality level + unsigned int rc_max_intra_bitrate_pct; + unsigned int rc_max_inter_bitrate_pct; + unsigned int gf_cbr_boost_pct; + unsigned int lossless; + unsigned int target_level; + unsigned int frame_parallel_decoding_mode; + AQ_MODE aq_mode; + int alt_ref_aq; + unsigned int frame_periodic_boost; + vpx_bit_depth_t bit_depth; + vp9e_tune_content content; + vpx_color_space_t color_space; + vpx_color_range_t color_range; + int render_width; + int render_height; + unsigned int row_mt; + unsigned int motion_vector_unit_test; + int delta_q_uv; +} vp9_extracfg; + +static struct vp9_extracfg default_extra_cfg = { +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif + 1, // enable_auto_alt_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 6, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + VP8_TUNE_PSNR, // tuning + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 255, // target_level + 1, // frame_parallel_decoding_mode + NO_AQ, // aq_mode + 0, // alt_ref_aq + 0, // frame_periodic_delta_q + VPX_BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT, // content + VPX_CS_UNKNOWN, // color space + 0, // color range + 0, // render width + 0, // render height + 0, // row_mt + 0, // motion_vector_unit_test + 0, // delta_q_uv +}; + +struct vpx_codec_alg_priv { + vpx_codec_priv_t base; + vpx_codec_enc_cfg_t cfg; + struct vp9_extracfg extra_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; + VP9EncoderConfig oxcf; + VP9_COMP *cpi; + unsigned char *cx_data; + size_t cx_data_sz; + unsigned char *pending_cx_data; + size_t pending_cx_data_sz; + int pending_frame_count; + size_t pending_frame_sizes[8]; + size_t pending_frame_magnitude; + vpx_image_t preview_img; + vpx_enc_frame_flags_t next_frame_flags; + vp8_postproc_cfg_t preview_ppcfg; + vpx_codec_pkt_list_decl(256) pkt_list; + unsigned int fixed_kf_cntr; + vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb; + // BufferPool that holds all reference frames. + BufferPool *buffer_pool; +}; + +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). +static vpx_codec_err_t update_error_state( + vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { + const vpx_codec_err_t res = error->error_code; + + if (res != VPX_CODEC_OK) + ctx->base.err_detail = error->has_detail ? error->detail : NULL; + + return res; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_HI(p, memb, hi) \ + do { \ + if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_LO(p, memb, lo) \ + do { \ + if (!((p)->memb >= (lo))) ERROR(#memb " out of range [" #lo "..]"); \ + } while (0) + +#define RANGE_CHECK_BOOL(p, memb) \ + do { \ + if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ + } while (0) + +static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg, + const struct vp9_extracfg *extra_cfg) { + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); + RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); + RANGE_CHECK_HI(cfg, g_profile, 3); + + RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); + RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode); + RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); + RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); + RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); + RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); + RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); + RANGE_CHECK_BOOL(cfg, rc_resize_allowed); + RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); + RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); + RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif + RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); + RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); + if (extra_cfg->max_gf_interval > 0) { + RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1)); + } + if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) { + RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval, + (MAX_LAG_BUFFERS - 1)); + } + + // For formation of valid ARF groups lag_in _frames should be 0 or greater + // than the max_gf_interval + 2 + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + + if (cfg->rc_resize_allowed == 1) { + RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); + RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); + } + + RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); + RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + + { + unsigned int level = extra_cfg->target_level; + if (level != LEVEL_1 && level != LEVEL_1_1 && level != LEVEL_2 && + level != LEVEL_2_1 && level != LEVEL_3 && level != LEVEL_3_1 && + level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && + level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && + level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && + level != LEVEL_AUTO && level != LEVEL_MAX) + ERROR("target_level is invalid"); + } + + if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS) + ERROR("ss_number_layers * ts_number_layers is out of range"); + if (cfg->ts_number_layers > 1) { + unsigned int sl, tl; + for (sl = 1; sl < cfg->ss_number_layers; ++sl) { + for (tl = 1; tl < cfg->ts_number_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(sl, tl, cfg->ts_number_layers); + if (cfg->layer_target_bitrate[layer] < + cfg->layer_target_bitrate[layer - 1]) + ERROR("ts_target_bitrate entries are not increasing"); + } + } + + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1); + for (tl = cfg->ts_number_layers - 2; tl > 0; --tl) + if (cfg->ts_rate_decimator[tl - 1] != 2 * cfg->ts_rate_decimator[tl]) + ERROR("ts_rate_decimator factors are not powers of 2"); + } + + // VP9 does not support a lower bound on the keyframe interval in + // automatic keyframe placement mode. + if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && + cfg->kf_min_dist > 0) + ERROR( + "kf_min_dist not supported in auto mode, use 0 " + "or kf_max_dist instead."); + + RANGE_CHECK(extra_cfg, row_mt, 0, 1); + RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); + RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); + RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); + RANGE_CHECK(extra_cfg, tile_columns, 0, 6); + RANGE_CHECK(extra_cfg, tile_rows, 0, 2); + RANGE_CHECK_HI(extra_cfg, sharpness, 7); + RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15); + RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK(extra_cfg, cq_level, 0, 63); + RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT, + VP9E_CONTENT_INVALID - 1); + +#if !CONFIG_REALTIME_ONLY + if (cfg->g_pass == VPX_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const FIRSTPASS_STATS *stats; + + if (cfg->rc_twopass_stats_in.buf == NULL) + ERROR("rc_twopass_stats_in.buf not set."); + + if (cfg->rc_twopass_stats_in.sz % packet_sz) + ERROR("rc_twopass_stats_in.sz indicates truncated packet."); + + if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) { + int i; + unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = { 0 }; + + stats = cfg->rc_twopass_stats_in.buf; + for (i = 0; i < n_packets; ++i) { + const int layer_id = (int)stats[i].spatial_layer_id; + if (layer_id >= 0 && layer_id < (int)cfg->ss_number_layers) { + ++n_packets_per_layer[layer_id]; + } + } + + for (i = 0; i < (int)cfg->ss_number_layers; ++i) { + unsigned int layer_id; + if (n_packets_per_layer[i] < 2) { + ERROR( + "rc_twopass_stats_in requires at least two packets for each " + "layer."); + } + + stats = (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + + n_packets - cfg->ss_number_layers + i; + layer_id = (int)stats->spatial_layer_id; + + if (layer_id >= cfg->ss_number_layers || + (unsigned int)(stats->count + 0.5) != + n_packets_per_layer[layer_id] - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + } else { + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = + (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + } +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_VP9_HIGHBITDEPTH + if (cfg->g_profile > (unsigned int)PROFILE_1) { + ERROR("Profile > 1 not supported in this build configuration"); + } +#endif + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_bit_depth > VPX_BITS_8) { + ERROR("Codec high bit-depth not supported in profile < 2"); + } + if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_input_bit_depth > 8) { + ERROR("Source high bit-depth not supported in profile < 2"); + } + if (cfg->g_profile > (unsigned int)PROFILE_1 && + cfg->g_bit_depth == VPX_BITS_8) { + ERROR("Codec bit-depth 8 not supported in profile > 1"); + } + RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB); + RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE); + + // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_NV12: break; + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I444: + case VPX_IMG_FMT_I440: + if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) { + ERROR( + "Invalid image format. I422, I444, I440 images are not supported " + "in profile."); + } + break; + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44416: + case VPX_IMG_FMT_I44016: + if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 && + ctx->cfg.g_profile != (unsigned int)PROFILE_3) { + ERROR( + "Invalid image format. 16-bit I422, I444, I440 images are " + "not supported in profile."); + } + break; + default: + ERROR( + "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 " + "images are supported."); + break; + } + + if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) + ERROR("Image size must match encoder init configuration size"); + + return VPX_CODEC_OK; +} + +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + case VPX_IMG_FMT_I440: return 16; + case VPX_IMG_FMT_I42016: return 24; + case VPX_IMG_FMT_I42216: return 32; + case VPX_IMG_FMT_I44416: return 48; + case VPX_IMG_FMT_I44016: return 32; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + +// Modify the encoder config for the target level. +static void config_target_level(VP9EncoderConfig *oxcf) { + double max_average_bitrate; // in bits per second + int max_over_shoot_pct; + const int target_level_index = get_level_index(oxcf->target_level); + + vpx_clear_system_state(); + assert(target_level_index >= 0); + assert(target_level_index < VP9_LEVELS); + + // Maximum target bit-rate is level_limit * 80%. + max_average_bitrate = + vp9_level_defs[target_level_index].average_bitrate * 800.0; + if ((double)oxcf->target_bandwidth > max_average_bitrate) + oxcf->target_bandwidth = (int64_t)(max_average_bitrate); + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + + // Adjust max over-shoot percentage. + max_over_shoot_pct = + (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) * + 100 / (double)(oxcf->target_bandwidth)); + if (oxcf->over_shoot_pct > max_over_shoot_pct) + oxcf->over_shoot_pct = max_over_shoot_pct; + + // Adjust worst allowed quantizer. + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); + + // Adjust minimum art-ref distance. + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { + oxcf->min_gf_interval = + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } + + // Adjust maximum column tiles. + if (vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) { + while (oxcf->tile_columns > 0 && + vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) + --oxcf->tile_columns; + } +} + +static vpx_rational64_t get_g_timebase_in_ts(vpx_rational_t g_timebase) { + vpx_rational64_t g_timebase_in_ts; + g_timebase_in_ts.den = g_timebase.den; + g_timebase_in_ts.num = g_timebase.num; + g_timebase_in_ts.num *= TICKS_PER_SEC; + reduce_ratio(&g_timebase_in_ts); + return g_timebase_in_ts; +} + +static vpx_codec_err_t set_encoder_config( + VP9EncoderConfig *oxcf, vpx_codec_enc_cfg_t *cfg, + const struct vp9_extracfg *extra_cfg) { + const int is_vbr = cfg->rc_end_usage == VPX_VBR; + int sl, tl; + unsigned int raw_target_rate; + oxcf->profile = cfg->g_profile; + oxcf->max_threads = (int)cfg->g_threads; + oxcf->width = cfg->g_w; + oxcf->height = cfg->g_h; + oxcf->bit_depth = cfg->g_bit_depth; + oxcf->input_bit_depth = cfg->g_input_bit_depth; + // TODO(angiebird): Figure out if we can just use g_timebase to indicate the + // inverse of framerate + // guess a frame rate if out of whack, use 30 + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (oxcf->init_framerate > 180) oxcf->init_framerate = 30; + oxcf->g_timebase = cfg->g_timebase; + oxcf->g_timebase_in_ts = get_g_timebase_in_ts(oxcf->g_timebase); + + oxcf->mode = GOOD; + + switch (cfg->g_pass) { + case VPX_RC_ONE_PASS: oxcf->pass = 0; break; + case VPX_RC_FIRST_PASS: oxcf->pass = 1; break; + case VPX_RC_LAST_PASS: oxcf->pass = 2; break; + } + + oxcf->lag_in_frames = + cfg->g_pass == VPX_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames; + oxcf->rc_mode = cfg->rc_end_usage; + + raw_target_rate = + (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 * + oxcf->init_framerate / 1000); + // Cap target bitrate to raw rate or 1000Mbps, whichever is less + cfg->rc_target_bitrate = + VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000); + + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate; + oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; + oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; + oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; + + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->cq_level = vp9_quantizer_to_qindex(extra_cfg->cq_level); + oxcf->fixed_q = -1; + + oxcf->under_shoot_pct = cfg->rc_undershoot_pct; + oxcf->over_shoot_pct = cfg->rc_overshoot_pct; + + oxcf->scaled_frame_width = cfg->rc_scaled_width; + oxcf->scaled_frame_height = cfg->rc_scaled_height; + if (cfg->rc_resize_allowed == 1) { + oxcf->resize_mode = + (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) + ? RESIZE_DYNAMIC + : RESIZE_FIXED; + } else { + oxcf->resize_mode = RESIZE_NONE; + } + + oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; + + oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; + + oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; + oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; + + oxcf->auto_key = + cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; + + oxcf->key_freq = cfg->kf_max_dist; + + oxcf->speed = abs(extra_cfg->cpu_used); + oxcf->encode_breakout = extra_cfg->static_thresh; + oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; + if (oxcf->bit_depth == VPX_BITS_8) { + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + } else { + // Disable denoiser for high bitdepth since vp9_denoiser_filter only works + // for 8 bits. + oxcf->noise_sensitivity = 0; + } + oxcf->sharpness = extra_cfg->sharpness; + + vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in); + + oxcf->color_space = extra_cfg->color_space; + oxcf->color_range = extra_cfg->color_range; + oxcf->render_width = extra_cfg->render_width; + oxcf->render_height = extra_cfg->render_height; + oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; + oxcf->arnr_strength = extra_cfg->arnr_strength; + oxcf->min_gf_interval = extra_cfg->min_gf_interval; + oxcf->max_gf_interval = extra_cfg->max_gf_interval; + + oxcf->tuning = extra_cfg->tuning; + oxcf->content = extra_cfg->content; + + oxcf->tile_columns = extra_cfg->tile_columns; + + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + + // TODO(yunqing): The dependencies between row tiles cause error in multi- + // threaded encoding. For now, tile_rows is forced to be 0 in this case. + // The further fix can be done by adding synchronizations after a tile row + // is encoded. But this will hurt multi-threaded encoder performance. So, + // it is recommended to use tile-rows=0 while encoding with threads > 1. + if (oxcf->max_threads > 1 && oxcf->tile_columns > 0) + oxcf->tile_rows = 0; + else + oxcf->tile_rows = extra_cfg->tile_rows; + + oxcf->error_resilient_mode = cfg->g_error_resilient; + oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; + + oxcf->aq_mode = extra_cfg->aq_mode; + oxcf->alt_ref_aq = extra_cfg->alt_ref_aq; + + oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; + + oxcf->ss_number_layers = cfg->ss_number_layers; + oxcf->ts_number_layers = cfg->ts_number_layers; + oxcf->temporal_layering_mode = + (enum vp9e_temporal_layering_mode)cfg->temporal_layering_mode; + + oxcf->target_level = extra_cfg->target_level; + + oxcf->row_mt = extra_cfg->row_mt; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + + oxcf->delta_q_uv = extra_cfg->delta_q_uv; + + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; + } + } + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + } + if (oxcf->ts_number_layers > 1) { + for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { + oxcf->ts_rate_decimator[tl] = + cfg->ts_rate_decimator[tl] ? cfg->ts_rate_decimator[tl] : 1; + } + } else if (oxcf->ts_number_layers == 1) { + oxcf->ts_rate_decimator[0] = 1; + } + + if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); + oxcf->use_simple_encode_api = 0; + // vp9_dump_encoder_config(oxcf, stderr); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK; + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; + + // The values set here are factors that will be applied to default values + // to get the final value used in the two pass code. Hence 1.0 will + // match the default behaviour when not using passed in values. + // We also apply limits here to prevent the user from applying settings + // that make no sense. + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + if (cpi->twopass.active_wq_factor < 0.25) + cpi->twopass.active_wq_factor = 0.25; + else if (cpi->twopass.active_wq_factor > 16.0) + cpi->twopass.active_wq_factor = 16.0; + + cpi->twopass.err_per_mb = + (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; + if (cpi->twopass.err_per_mb < 0.25) + cpi->twopass.err_per_mb = 0.25; + else if (cpi->twopass.err_per_mb > 4.0) + cpi->twopass.err_per_mb = 4.0; + + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + if (cpi->twopass.sr_default_decay_limit < 0.25) + cpi->twopass.sr_default_decay_limit = 0.25; + // If the default changes this will need to change. + else if (cpi->twopass.sr_default_decay_limit > 1.33) + cpi->twopass.sr_default_decay_limit = 1.33; + + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + if (cpi->twopass.sr_diff_factor < 0.25) + cpi->twopass.sr_diff_factor = 0.25; + else if (cpi->twopass.sr_diff_factor > 4.0) + cpi->twopass.sr_diff_factor = 4.0; + + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / + (double)cfg->kf_err_per_mb_factor.den; + if (cpi->twopass.kf_err_per_mb < 0.25) + cpi->twopass.kf_err_per_mb = 0.25; + else if (cpi->twopass.kf_err_per_mb > 4.0) + cpi->twopass.kf_err_per_mb = 4.0; + + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / + (double)cfg->kf_frame_min_boost_factor.den; + if (cpi->twopass.kf_frame_min_boost < 0.25) + cpi->twopass.kf_frame_min_boost = 0.25; + else if (cpi->twopass.kf_frame_min_boost > 4.0) + cpi->twopass.kf_frame_min_boost = 4.0; + + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first_factor.num / + (double)cfg->kf_frame_max_boost_first_factor.den; + if (cpi->twopass.kf_frame_max_boost_first < 0.25) + cpi->twopass.kf_frame_max_boost_first = 0.25; + else if (cpi->twopass.kf_frame_max_boost_first > 4.0) + cpi->twopass.kf_frame_max_boost_first = 4.0; + + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs_factor.num / + (double)cfg->kf_frame_max_boost_subs_factor.den; + if (cpi->twopass.kf_frame_max_boost_subs < 0.25) + cpi->twopass.kf_frame_max_boost_subs = 0.25; + else if (cpi->twopass.kf_frame_max_boost_subs > 4.0) + cpi->twopass.kf_frame_max_boost_subs = 4.0; + + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / + (double)cfg->kf_max_total_boost_factor.den; + if (cpi->twopass.kf_max_total_boost < 0.25) + cpi->twopass.kf_max_total_boost = 0.25; + else if (cpi->twopass.kf_max_total_boost > 4.0) + cpi->twopass.kf_max_total_boost = 4.0; + + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / + (double)cfg->gf_max_total_boost_factor.den; + if (cpi->twopass.gf_max_total_boost < 0.25) + cpi->twopass.gf_max_total_boost = 0.25; + else if (cpi->twopass.gf_max_total_boost > 4.0) + cpi->twopass.gf_max_total_boost = 4.0; + + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / + (double)cfg->gf_frame_max_boost_factor.den; + if (cpi->twopass.gf_frame_max_boost < 0.25) + cpi->twopass.gf_frame_max_boost = 0.25; + else if (cpi->twopass.gf_frame_max_boost > 4.0) + cpi->twopass.gf_frame_max_boost = 4.0; + + cpi->twopass.zm_factor = + (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + if (cpi->twopass.zm_factor < 0.25) + cpi->twopass.zm_factor = 0.25; + else if (cpi->twopass.zm_factor > 2.0) + cpi->twopass.zm_factor = 2.0; + + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / + (double)cfg->rd_mult_inter_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0; + + cpi->rd_ctrl.rd_mult_arf_qp_fac = + (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0; + + cpi->rd_ctrl.rd_mult_key_qp_fac = + (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0; + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { + vpx_codec_err_t res; + volatile int force_key = 0; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. To make sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. + if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height && + !valid_ref_frame_size(ctx->cpi->last_coded_width, + ctx->cpi->last_coded_height, cfg->g_w, + cfg->g_h)) || + (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && + (int)cfg->g_h > ctx->cpi->initial_height)) { + force_key = 1; + } + } + + // Prevent increasing lag_in_frames. This check is stricter than it needs + // to be -- the limit is not increasing past the first lag_in_frames + // value, but we don't track the initial config, only the last successful + // config. + if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) + ERROR("Cannot increase lag_in_frames"); + + res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; + + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; + } + ctx->cpi->common.error.setjmp = 1; + + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); + + if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; + + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp9_get_quantizer(ctx->cpi); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp9_qindex_to_quantizer(vp9_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + int i; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) { + arg[i] = ctx->cpi->svc.base_qindex[i]; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->cpi->common.lf.filter_level; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, + const struct vp9_extracfg *extra_cfg) { + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == VPX_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + vp9_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} + +static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + // Use fastest speed setting (speed 9 or -9) if it's set beyond the range. + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + extra_cfg.cpu_used = VPXMIN(9, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-9, extra_cfg.cpu_used); +#if CONFIG_REALTIME_ONLY + if (extra_cfg.cpu_used > -5 && extra_cfg.cpu_used < 5) + extra_cfg.cpu_used = (extra_cfg.cpu_used > 0) ? 5 : -5; +#endif + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(VP9E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx, + va_list args) { + (void)ctx; + (void)args; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_inter_bitrate_pct = + CAST(VP9E_SET_MAX_INTER_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(VP9E_SET_GF_CBR_BOOST_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(VP9E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0; + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_alt_ref_aq(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.alt_ref_aq = CAST(VP9E_SET_ALT_REF_AQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_min_gf_interval(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_gf_interval = CAST(VP9E_SET_MIN_GF_INTERVAL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_max_gf_interval(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_gf_interval = CAST(VP9E_SET_MAX_GF_INTERVAL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.target_level = CAST(VP9E_SET_TARGET_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = CAST(VP9E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + if (data) { + cpi->compute_frame_low_motion_onepass = 0; + cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; + cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.motion_vector_unit_test = + CAST(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = (int)vp9_get_level(&ctx->cpi->level_info.level_spec); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_err_t res = VPX_CODEC_OK; + (void)data; + + if (ctx->priv == NULL) { + vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) return VPX_CODEC_MEM_ERROR; + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + ctx->priv->enc.total_encoders = 1; + priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); + if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; + + if (ctx->config.enc) { + // Update the reference to the config structure to an internal copy. + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; + } + + priv->extra_cfg = default_extra_cfg; + vp9_initialize_enc(); + + res = validate_config(priv, &priv->cfg, &priv->extra_cfg); + + if (res == VPX_CODEC_OK) { + priv->pts_offset_initialized = 0; + // TODO(angiebird): Replace priv->timestamp_ratio by + // oxcf->g_timebase_in_ts + priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase); + + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); +#if CONFIG_VP9_HIGHBITDEPTH + priv->oxcf.use_highbitdepth = + (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0; +#endif + priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); + if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); + } + } + + return res; +} + +static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { + free(ctx->cx_data); + vp9_remove_compressor(ctx->cpi); + vpx_free(ctx->buffer_pool); + vpx_free(ctx); + return VPX_CODEC_OK; +} + +static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { + MODE new_mode = BEST; + +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else + switch (ctx->cfg.g_pass) { + case VPX_RC_ONE_PASS: + if (deadline > 0) { + // Convert duration parameter from stream timebase to microseconds. + uint64_t duration_us; + + VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + + // If the deadline is more that the duration this frame is to be shown, + // use good quality mode. Otherwise use realtime mode. + new_mode = (deadline > duration_us) ? GOOD : REALTIME; + } else { + new_mode = BEST; + } + break; + case VPX_RC_FIRST_PASS: break; + case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? GOOD : BEST; break; + } +#endif // CONFIG_REALTIME_ONLY + + if (deadline == VPX_DL_REALTIME) { + ctx->oxcf.pass = 0; + new_mode = REALTIME; + } + + if (ctx->oxcf.mode != new_mode) { + ctx->oxcf.mode = new_mode; + vp9_change_config(ctx->cpi, &ctx->oxcf); + } +} + +// Turn on to test if supplemental superframe data breaks decoding +// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA +static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { + uint8_t marker = 0xc0; + unsigned int mask; + int mag, index_sz; + + assert(ctx->pending_frame_count); + assert(ctx->pending_frame_count <= 8); + + // Add the number of frames to the marker byte + marker |= ctx->pending_frame_count - 1; + + // Choose the magnitude + for (mag = 0, mask = 0xff; mag < 4; mag++) { + if (ctx->pending_frame_magnitude < mask) break; + mask <<= 8; + mask |= 0xff; + } + marker |= mag << 3; + + // Write the index + index_sz = 2 + (mag + 1) * ctx->pending_frame_count; + if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) { + uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz; + int i, j; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + uint8_t marker_test = 0xc0; + int mag_test = 2; // 1 - 4 + int frames_test = 4; // 1 - 8 + int index_sz_test = 2 + mag_test * frames_test; + marker_test |= frames_test - 1; + marker_test |= (mag_test - 1) << 3; + *x++ = marker_test; + for (i = 0; i < mag_test * frames_test; ++i) + *x++ = 0; // fill up with arbitrary data + *x++ = marker_test; + ctx->pending_cx_data_sz += index_sz_test; + printf("Added supplemental superframe data\n"); +#endif + + *x++ = marker; + for (i = 0; i < ctx->pending_frame_count; i++) { + unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i]; + + for (j = 0; j <= mag; j++) { + *x++ = this_sz & 0xff; + this_sz >>= 8; + } + } + *x++ = marker; + ctx->pending_cx_data_sz += index_sz; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + index_sz += index_sz_test; +#endif + } + return index_sz; +} + +static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, + unsigned int lib_flags) { + vpx_codec_frame_flags_t flags = lib_flags << 16; + + if (lib_flags & FRAMEFLAGS_KEY || + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) + flags |= VPX_FRAME_IS_KEY; + + if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; + + return flags; +} + +static INLINE vpx_codec_cx_pkt_t get_psnr_pkt(const PSNR_STATS *psnr) { + vpx_codec_cx_pkt_t pkt; + pkt.kind = VPX_CODEC_PSNR_PKT; + pkt.data.psnr = *psnr; + return pkt; +} + +#if !CONFIG_REALTIME_ONLY +static INLINE vpx_codec_cx_pkt_t +get_first_pass_stats_pkt(FIRSTPASS_STATS *stats) { + // WARNNING: This function assumes that stats will + // exist and not be changed until the packet is processed + // TODO(angiebird): Refactor the code to avoid using the assumption. + vpx_codec_cx_pkt_t pkt; + pkt.kind = VPX_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(*stats); + return pkt; +} +#endif + +const size_t kMinCompressedSize = 8192; +static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts_val, + unsigned long duration, + vpx_enc_frame_flags_t enc_flags, + vpx_enc_deadline_t deadline) { + volatile vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts = pts_val; + VP9_COMP *const cpi = ctx->cpi; + const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio; + size_t data_sz; + vpx_codec_cx_pkt_t pkt; + memset(&pkt, 0, sizeof(pkt)); + + if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + + cpi->last_coded_width = ctx->oxcf.width; + cpi->last_coded_height = ctx->oxcf.height; + + if (img != NULL) { + res = validate_img(ctx, img); + if (res == VPX_CODEC_OK) { + // There's no codec control for multiple alt-refs so check the encoder + // instance for its status to determine the compressed data size. + data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * + (cpi->multi_layer_arf ? 8 : 2); + if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; + if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { + ctx->cx_data_sz = data_sz; + free(ctx->cx_data); + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } + } + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + pts -= ctx->pts_offset; + + pick_quickcompress_mode(ctx, duration, deadline); + vpx_codec_pkt_list_init(&ctx->pkt_list); + + // Handle Flags + if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) || + ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) { + ctx->base.err_detail = "Conflicting flags."; + return VPX_CODEC_INVALID_PARAM; + } + + if (setjmp(cpi->common.error.jmp)) { + cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &cpi->common.error); + vpx_clear_system_state(); + return res; + } + cpi->common.error.setjmp = 1; + + if (res == VPX_CODEC_OK) vp9_apply_encoding_flags(cpi, flags); + + // Handle fixed keyframe intervals + if (ctx->cfg.kf_mode == VPX_KF_AUTO && + ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { + if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) { + flags |= VPX_EFLAG_FORCE_KF; + ctx->fixed_kf_cntr = 1; + } + } + + if (res == VPX_CODEC_OK) { + unsigned int lib_flags = 0; + YV12_BUFFER_CONFIG sd; + int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); + size_t size, cx_data_sz; + unsigned char *cx_data; + + cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + + // Set up internal flags + if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; + + if (img != NULL) { + const int64_t dst_end_time_stamp = + timebase_units_to_ticks(timestamp_ratio, pts + duration); + res = image2yuvconfig(img, &sd); + + if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { + /* from vpx_encoder.h for g_w/g_h: + "Note that the frames passed as input to the encoder must have this + resolution" + */ + ctx->base.err_detail = "Invalid input frame resolution"; + res = VPX_CODEC_INVALID_PARAM; + } else { + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. + if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { + res = update_error_state(ctx, &cpi->common.error); + } + ctx->next_frame_flags = 0; + } + } + + cx_data = ctx->cx_data; + cx_data_sz = ctx->cx_data_sz; + + /* Any pending invisible frames? */ + if (ctx->pending_cx_data) { + memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz); + ctx->pending_cx_data = cx_data; + cx_data += ctx->pending_cx_data_sz; + cx_data_sz -= ctx->pending_cx_data_sz; + + /* TODO: this is a minimal check, the underlying codec doesn't respect + * the buffer size anyway. + */ + if (cx_data_sz < ctx->cx_data_sz / 2) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, + "Compressed data buffer too small"); + return VPX_CODEC_ERROR; + } + } + + if (cpi->oxcf.pass == 1 && !cpi->use_svc) { +#if !CONFIG_REALTIME_ONLY + // compute first pass stats + if (img) { + int ret; + int64_t dst_end_time_stamp; + vpx_codec_cx_pkt_t fps_pkt; + ENCODE_FRAME_RESULT encode_frame_result; + vp9_init_encode_frame_result(&encode_frame_result); + // TODO(angiebird): Call vp9_first_pass directly + ret = vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, + &dst_time_stamp, &dst_end_time_stamp, + !img, &encode_frame_result); + assert(size == 0); // There is no compressed data in the first pass + (void)ret; + assert(ret == 0); + fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.this_frame_stats); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt); + } else { + if (!cpi->twopass.first_pass_done) { + vpx_codec_cx_pkt_t fps_pkt; + vp9_end_first_pass(cpi); + fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.total_stats); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt); + } + } +#else // !CONFIG_REALTIME_ONLY + assert(0); +#endif // !CONFIG_REALTIME_ONLY + } else { + ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_end_time_stamp; + vp9_init_encode_frame_result(&encode_frame_result); + while (cx_data_sz >= ctx->cx_data_sz / 2 && + -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, + &dst_time_stamp, &dst_end_time_stamp, + !img, &encode_frame_result)) { + // Pack psnr pkt + if (size > 0 && !cpi->use_svc) { + // TODO(angiebird): Figure out while we don't need psnr pkt when + // use_svc is on + PSNR_STATS psnr; + if (vp9_get_psnr(cpi, &psnr)) { + vpx_codec_cx_pkt_t psnr_pkt = get_psnr_pkt(&psnr); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &psnr_pkt); + } + } + + if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { + // Pack invisible frames with the next visible frame + if (!cpi->common.show_frame || + (cpi->use_svc && cpi->svc.spatial_layer_id < + cpi->svc.number_spatial_layers - 1)) { + if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; + ctx->pending_cx_data_sz += size; + if (size) + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + cx_data += size; + cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = + cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; + + if (ctx->output_cx_pkt_cb.output_cx_pkt) { + pkt.kind = VPX_CODEC_CX_FRAME_PKT; + pkt.data.frame.pts = + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; + pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.buf = ctx->pending_cx_data; + pkt.data.frame.sz = size; + ctx->pending_cx_data = NULL; + ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + ctx->pending_frame_magnitude = 0; + ctx->output_cx_pkt_cb.output_cx_pkt( + &pkt, ctx->output_cx_pkt_cb.user_priv); + } + continue; + } + + // Add the frame packet to the list of returned packets. + pkt.kind = VPX_CODEC_CX_FRAME_PKT; + pkt.data.frame.pts = + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; + pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; + + if (ctx->pending_cx_data) { + if (size) + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + ctx->pending_cx_data_sz += size; + // write the superframe only for the case when + if (!ctx->output_cx_pkt_cb.output_cx_pkt) + size += write_superframe_index(ctx); + pkt.data.frame.buf = ctx->pending_cx_data; + pkt.data.frame.sz = ctx->pending_cx_data_sz; + ctx->pending_cx_data = NULL; + ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + ctx->pending_frame_magnitude = 0; + } else { + pkt.data.frame.buf = cx_data; + pkt.data.frame.sz = size; + } + pkt.data.frame.partition_id = -1; + + if (ctx->output_cx_pkt_cb.output_cx_pkt) + ctx->output_cx_pkt_cb.output_cx_pkt( + &pkt, ctx->output_cx_pkt_cb.user_priv); + else + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + + cx_data += size; + cx_data_sz -= size; + if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id == + cpi->svc.number_spatial_layers - 1)) { + // Encoded all spatial layers; exit loop. + break; + } + } + } + } + } + + cpi->common.error.setjmp = 0; + return res; +} + +static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { + return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter); +} + +static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); + + if (frame != NULL) { + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), + &sd); + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); + + if (frame != NULL) { + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + vp9_copy_reference_enc(ctx->cpi, + ref_frame_to_vp9_reframe(frame->frame_type), &sd); + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); + + if (frame != NULL) { + const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx); + if (fb == NULL) return VPX_CODEC_ERROR; + yuvconfig2image(&frame->img, fb, NULL); + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_VP9_POSTPROC + vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); + if (config != NULL) { + ctx->preview_ppcfg = *config; + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags; + vp9_zero(flags); + + if (ctx->preview_ppcfg.post_proc_flag) { + flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; + flags.deblocking_level = ctx->preview_ppcfg.deblocking_level; + flags.noise_level = ctx->preview_ppcfg.noise_level; + } + + if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { + yuvconfig2image(&ctx->preview_img, &sd, NULL); + return &ctx->preview_img; + } + return NULL; +} + +static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); + + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *); + + if (map) { + if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, + (int)map->cols)) + return VPX_CODEC_OK; + + return VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *); + + if (map) { + if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, + (int)map->cols)) + return VPX_CODEC_OK; + + return VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); + + if (mode) { + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); + return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { + int data = va_arg(args, int); + const vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + // Both one-pass and two-pass RC are supported now. + // User setting this has to make sure of the following. + // In two-pass setting: either (but not both) + // cfg->ss_number_layers > 1, or cfg->ts_number_layers > 1 + // In one-pass setting: + // either or both cfg->ss_number_layers > 1, or cfg->ts_number_layers > 1 + + vp9_set_svc(ctx->cpi, data); + + if (data == 1 && + (cfg->g_pass == VPX_RC_FIRST_PASS || cfg->g_pass == VPX_RC_LAST_PASS) && + cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { + return VPX_CODEC_INVALID_PARAM; + } + + vp9_set_row_mt(ctx->cpi); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); + VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + SVC *const svc = &cpi->svc; + int sl; + + svc->spatial_layer_to_encode = data->spatial_layer_id; + svc->first_spatial_layer_to_encode = data->spatial_layer_id; + // TODO(jianj): Deprecated to be removed. + svc->temporal_layer_id = data->temporal_layer_id; + // Allow for setting temporal layer per spatial layer for superframe. + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + svc->temporal_layer_id_per_spatial[sl] = + data->temporal_layer_id_per_spatial[sl]; + } + // Checks on valid layer_id input. + if (svc->temporal_layer_id < 0 || + svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *); + VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + SVC *const svc = &cpi->svc; + + data->spatial_layer_id = svc->spatial_layer_id; + data->temporal_layer_id = svc->temporal_layer_id; + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_extra_cfg_t *const params = va_arg(args, vpx_svc_extra_cfg_t *); + int sl, tl; + + // Number of temporal layers and number of spatial layers have to be set + // properly before calling this control function. + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + for (tl = 0; tl < cpi->svc.number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + lc->max_q = params->max_quantizers[layer]; + lc->min_q = params->min_quantizers[layer]; + lc->scaling_factor_num = params->scaling_factor_num[sl]; + lc->scaling_factor_den = params->scaling_factor_den[sl]; + lc->speed = params->speed_per_layer[sl]; + lc->loopfilter_ctrl = params->loopfilter_ctrl[sl]; + } + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + // TODO(jianj): Remove these 3, deprecated. + data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + cpi->svc.use_set_ref_frame_config = 1; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl]; + cpi->svc.reference_last[sl] = data->reference_last[sl]; + cpi->svc.reference_golden[sl] = data->reference_golden[sl]; + cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl]; + cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.duration[sl] = data->duration[sl]; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int data = va_arg(args, int); + VP9_COMP *const cpi = ctx->cpi; + cpi->svc.disable_inter_layer_pred = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *); + int sl; + cpi->svc.framedrop_mode = data->framedrop_mode; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. + cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->svc.use_gf_temporal_ref = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_spatial_layer_sync_t *data = + va_arg(args, vpx_svc_spatial_layer_sync_t *); + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl]; + cpi->svc.set_intra_only_frame = data->base_layer_intra_only; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_delta_q_uv(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + int data = va_arg(args, int); + data = VPXMIN(VPXMAX(data, -15), 15); + extra_cfg.delta_q_uv = data; + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp = + (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *); + ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt; + ctx->output_cx_pkt_cb.user_priv = cbp->user_priv; + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(VP9E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_color_range(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_range = CAST(VP9E_SET_COLOR_RANGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + int *const render_size = va_arg(args, int *); + extra_cfg.render_width = render_size[0]; + extra_cfg.render_height = render_size[1]; + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.ext_use_post_encode_drop = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.disable_overshoot_maxq_cbr = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->loopfilter_ctrl = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); + VP9_COMP *cpi = ctx->cpi; + EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + // TODO(angiebird): Check the possibility of this flag being set at pass == 1 + if (oxcf->pass == 2) { + const FRAME_INFO *frame_info = &cpi->frame_info; + vpx_rc_config_t ratectrl_config; + vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); + + ratectrl_config.frame_width = frame_info->frame_width; + ratectrl_config.frame_height = frame_info->frame_height; + ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; + // TODO(angiebird): Double check whether this is the proper way to set up + // target_bitrate and frame_rate. + ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); + ratectrl_config.frame_rate_num = oxcf->g_timebase.den; + ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; + } + + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + return codec_status; + } + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int qp = va_arg(args, int); + vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + vpx_codec_err_t res; + + if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM; + + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + cpi->fixed_qp_onepass = 1; + + res = update_extra_cfg(ctx, &extra_cfg); + return res; +} + +static vpx_codec_err_t ctrl_enable_external_rc_tpl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int enable_flag = va_arg(args, int); + if (enable_flag != 0 && enable_flag != 1) return VPX_CODEC_INVALID_PARAM; + cpi->tpl_with_external_rc = enable_flag; + return VPX_CODEC_OK; +} + +static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { + { VP8_COPY_REFERENCE, ctrl_copy_reference }, + + // Setters + { VP8_SET_REFERENCE, ctrl_set_reference }, + { VP8_SET_POSTPROC, ctrl_set_previewpp }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, + { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, + { VP8E_SET_CPUUSED, ctrl_set_cpuused }, + { VP8E_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref }, + { VP8E_SET_SHARPNESS, ctrl_set_sharpness }, + { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, + { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, + { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { VP9E_SET_TPL, ctrl_set_tpl_model }, + { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, + { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, + { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type }, + { VP8E_SET_TUNING, ctrl_set_tuning }, + { VP8E_SET_CQ_LEVEL, ctrl_set_cq_level }, + { VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct }, + { VP9E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct }, + { VP9E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { VP9E_SET_LOSSLESS, ctrl_set_lossless }, + { VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode }, + { VP9E_SET_AQ_MODE, ctrl_set_aq_mode }, + { VP9E_SET_ALT_REF_AQ, ctrl_set_alt_ref_aq }, + { VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost }, + { VP9E_SET_SVC, ctrl_set_svc }, + { VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters }, + { VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback }, + { VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id }, + { VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content }, + { VP9E_SET_COLOR_SPACE, ctrl_set_color_space }, + { VP9E_SET_COLOR_RANGE, ctrl_set_color_range }, + { VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity }, + { VP9E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval }, + { VP9E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval }, + { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, + { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, + { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, + { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, + { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr }, + { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, + { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, + { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, + { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, + { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, + { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, + { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, + { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, + { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, + { VP9E_ENABLE_EXTERNAL_RC_TPL, ctrl_enable_external_rc_tpl }, + + // Getters + { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, + { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers }, + { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, + { VP9_GET_REFERENCE, ctrl_get_reference }, + { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, + { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, + { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, + + { -1, NULL }, +}; + +static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { + { 0, + { + // NOLINT + 0, // g_usage (unused) + 8, // g_threads + 0, // g_profile + + 320, // g_width + 240, // g_height + VPX_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + + { 1, 30 }, // g_timebase + + 0, // g_error_resilient + + VPX_RC_ONE_PASS, // g_pass + + 25, // g_lag_in_frames + + 0, // rc_dropframe_thresh + 0, // rc_resize_allowed + 0, // rc_scaled_width + 0, // rc_scaled_height + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh + + VPX_VBR, // rc_end_usage + { NULL, 0 }, // rc_twopass_stats_in + { NULL, 0 }, // rc_firstpass_mb_stats_in + 256, // rc_target_bitrate + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 25, // rc_undershoot_pct + 25, // rc_overshoot_pct + + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size + + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) + + // keyframing settings (kf) + VPX_KF_AUTO, // g_kfmode + 0, // kf_min_dist + 128, // kf_max_dist + + VPX_SS_DEFAULT_LAYERS, // ss_number_layers + { 0 }, + { 0 }, // ss_target_bitrate + 1, // ts_number_layers + { 0 }, // ts_target_bitrate + { 0 }, // ts_rate_decimator + 0, // ts_periodicity + { 0 }, // ts_layer_id + { 0 }, // layer_target_bitrate + 0, // temporal_layering_mode + 0, // use_vizier_rc_params + { 1, 1 }, // active_wq_factor + { 1, 1 }, // err_per_mb_factor + { 1, 1 }, // sr_default_decay_limit + { 1, 1 }, // sr_diff_factor + { 1, 1 }, // kf_err_per_mb_factor + { 1, 1 }, // kf_frame_min_boost_factor + { 1, 1 }, // kf_frame_max_boost_first_factor + { 1, 1 }, // kf_frame_max_boost_subs_factor + { 1, 1 }, // kf_max_total_boost_factor + { 1, 1 }, // gf_max_total_boost_factor + { 1, 1 }, // gf_frame_max_boost_factor + { 1, 1 }, // zm_factor + { 1, 1 }, // rd_mult_inter_qp_fac + { 1, 1 }, // rd_mult_arf_qp_fac + { 1, 1 }, // rd_mult_key_qp_fac + } }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp9_cx) = { + "WebM Project VP9 Encoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, +#if CONFIG_VP9_HIGHBITDEPTH + VPX_CODEC_CAP_HIGHBITDEPTH | +#endif + VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR, // vpx_codec_caps_t + encoder_init, // vpx_codec_init_fn_t + encoder_destroy, // vpx_codec_destroy_fn_t + encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t + { + // NOLINT + NULL, // vpx_codec_peek_si_fn_t + NULL, // vpx_codec_get_si_fn_t + NULL, // vpx_codec_decode_fn_t + NULL, // vpx_codec_frame_get_fn_t + NULL // vpx_codec_set_fb_fn_t + }, + { + // NOLINT + 1, // 1 cfg map + encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t + encoder_encode, // vpx_codec_encode_fn_t + encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t + encoder_set_config, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + encoder_get_preview, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + } +}; + +static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, + vpx_enc_pass enc_pass) { + vpx_codec_enc_cfg_t enc_cfg = encoder_usage_cfg_map[0].cfg; + enc_cfg.g_w = frame_width; + enc_cfg.g_h = frame_height; + enc_cfg.rc_target_bitrate = target_bitrate; + enc_cfg.g_pass = enc_pass; + // g_timebase is the inverse of frame_rate + enc_cfg.g_timebase.num = frame_rate.den; + enc_cfg.g_timebase.den = frame_rate.num; + return enc_cfg; +} + +static vp9_extracfg get_extra_cfg() { + vp9_extracfg extra_cfg = default_extra_cfg; + return extra_cfg; +} + +VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, + int target_level, + vpx_enc_pass enc_pass) { + /* This function will generate the same VP9EncoderConfig used by the + * vpxenc command given below. + * The configs in the vpxenc command corresponds to parameters of + * vp9_get_encoder_config() as follows. + * + * WIDTH: frame_width + * HEIGHT: frame_height + * FPS: frame_rate + * BITRATE: target_bitrate + * CPU_USED:encode_speed + * TARGET_LEVEL: target_level + * + * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig + * + * vpxenc command: + * INPUT=bus_cif.y4m + * OUTPUT=output.webm + * WIDTH=352 + * HEIGHT=288 + * BITRATE=600 + * FPS=30/1 + * LIMIT=150 + * CPU_USED=0 + * TARGET_LEVEL=0 + * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS + * --lag-in-frames=25 \ + * --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \ + * --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \ + * --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \ + * --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \ + * --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \ + * --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \ + * --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT + */ + + VP9EncoderConfig oxcf; + vp9_extracfg extra_cfg = get_extra_cfg(); + vpx_codec_enc_cfg_t enc_cfg = get_enc_cfg( + frame_width, frame_height, frame_rate, target_bitrate, enc_pass); + set_encoder_config(&oxcf, &enc_cfg, &extra_cfg); + + // These settings are made to match the settings of the vpxenc command. + oxcf.key_freq = 150; + oxcf.under_shoot_pct = 100; + oxcf.over_shoot_pct = 100; + oxcf.max_threads = 0; + oxcf.tile_columns = 0; + oxcf.frame_parallel_decoding_mode = 0; + oxcf.two_pass_vbrmax_section = 150; + oxcf.speed = abs(encode_speed); + oxcf.target_level = target_level; + return oxcf; +} + +#define DUMP_STRUCT_VALUE(fp, structure, value) \ + fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value) + +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { + DUMP_STRUCT_VALUE(fp, oxcf, profile); + DUMP_STRUCT_VALUE(fp, oxcf, bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, width); + DUMP_STRUCT_VALUE(fp, oxcf, height); + DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, init_framerate); + // TODO(angiebird): dump g_timebase + // TODO(angiebird): dump g_timebase_in_ts + + DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth); + + DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity); + DUMP_STRUCT_VALUE(fp, oxcf, sharpness); + DUMP_STRUCT_VALUE(fp, oxcf, speed); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct); + + DUMP_STRUCT_VALUE(fp, oxcf, mode); + DUMP_STRUCT_VALUE(fp, oxcf, pass); + + // Key Framing Operations + DUMP_STRUCT_VALUE(fp, oxcf, auto_key); + DUMP_STRUCT_VALUE(fp, oxcf, key_freq); + + DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames); + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + DUMP_STRUCT_VALUE(fp, oxcf, rc_mode); + + // buffer targeting aggressiveness + DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct); + DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct); + + // buffering parameters + // TODO(angiebird): dump tarting_buffer_level_ms + // TODO(angiebird): dump ptimal_buffer_level_ms + // TODO(angiebird): dump maximum_buffer_size_ms + + // Frame drop threshold. + DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark); + + // controlling quality + DUMP_STRUCT_VALUE(fp, oxcf, fixed_q); + DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, cq_level); + DUMP_STRUCT_VALUE(fp, oxcf, aq_mode); + + // Special handling of Adaptive Quantization for AltRef frames + DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq); + + // Internal frame size scaling. + DUMP_STRUCT_VALUE(fp, oxcf, resize_mode); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height); + + // Enable feature to reduce the frame quantization every x frames. + DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost); + + // two pass datarate control + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section); + DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity); + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + // Spatial and temporal scalability. + DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers); + DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers); + + // Bitrate allocation for spatial layers. + // TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS] + // TODO(angiebird): dump ss_target_bitrate[VPX_SS_MAX_LAYERS] + // TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS] + // TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS] + + DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf); + DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout); + DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode); + DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode); + + DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames); + DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength); + + DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval); + DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval); + + DUMP_STRUCT_VALUE(fp, oxcf, tile_columns); + DUMP_STRUCT_VALUE(fp, oxcf, tile_rows); + + DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model); + + DUMP_STRUCT_VALUE(fp, oxcf, max_threads); + + DUMP_STRUCT_VALUE(fp, oxcf, target_level); + + // TODO(angiebird): dump two_pass_stats_in + DUMP_STRUCT_VALUE(fp, oxcf, tuning); + DUMP_STRUCT_VALUE(fp, oxcf, content); +#if CONFIG_VP9_HIGHBITDEPTH + DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth); +#endif + DUMP_STRUCT_VALUE(fp, oxcf, color_space); + DUMP_STRUCT_VALUE(fp, oxcf, color_range); + DUMP_STRUCT_VALUE(fp, oxcf, render_width); + DUMP_STRUCT_VALUE(fp, oxcf, render_height); + DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode); + + DUMP_STRUCT_VALUE(fp, oxcf, row_mt); + DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test); + DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv); + DUMP_STRUCT_VALUE(fp, oxcf, use_simple_encode_api); +} + +FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) { + FRAME_INFO frame_info; + int dummy; + frame_info.frame_width = oxcf->width; + frame_info.frame_height = oxcf->height; + frame_info.render_frame_width = oxcf->width; + frame_info.render_frame_height = oxcf->height; + frame_info.bit_depth = oxcf->bit_depth; + vp9_set_mi_size(&frame_info.mi_rows, &frame_info.mi_cols, &dummy, + frame_info.frame_width, frame_info.frame_height); + vp9_set_mb_size(&frame_info.mb_rows, &frame_info.mb_cols, &frame_info.num_mbs, + frame_info.mi_rows, frame_info.mi_cols); + // TODO(angiebird): Figure out how to get subsampling_x/y here + return frame_info; +} + +void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf, + const vpx_fixed_buf_t *stats) { + oxcf->two_pass_stats_in = *stats; +} diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.h b/media/libvpx/libvpx/vp9/vp9_cx_iface.h new file mode 100644 index 0000000000..f2de8507ff --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_VP9_CX_IFACE_H_ +#define VPX_VP9_VP9_CX_IFACE_H_ +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, + int target_level, + vpx_enc_pass enc_pass); + +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp); + +FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf); + +static INLINE int64_t +timebase_units_to_ticks(const vpx_rational64_t *timestamp_ratio, int64_t n) { + return n * timestamp_ratio->num / timestamp_ratio->den; +} + +static INLINE int64_t +ticks_to_timebase_units(const vpx_rational64_t *timestamp_ratio, int64_t n) { + int64_t round = timestamp_ratio->num / 2; + if (round > 0) --round; + return (n * timestamp_ratio->den + round) / timestamp_ratio->num; +} + +void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf, + const vpx_fixed_buf_t *stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_VP9_CX_IFACE_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c new file mode 100644 index 0000000000..860f721dc5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_version.h" + +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_dsp/bitreader_buffer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_frame_buffers.h" + +#include "vp9/decoder/vp9_decodeframe.h" + +#include "vp9/vp9_dx_iface.h" +#include "vp9/vp9_iface_common.h" + +#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) + +static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + // This function only allocates space for the vpx_codec_alg_priv_t + // structure. More memory may be required at the time the stream + // information becomes known. + (void)data; + + if (!ctx->priv) { + vpx_codec_alg_priv_t *const priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) return VPX_CODEC_MEM_ERROR; + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + priv->si.sz = sizeof(priv->si); + priv->flushed = 0; + if (ctx->config.dec) { + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; + } + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); + } + + if (ctx->buffer_pool) { + vp9_free_ref_frame_buffers(ctx->buffer_pool); + vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); + } + + vpx_free(ctx->buffer_pool); + vpx_free(ctx); + return VPX_CODEC_OK; +} + +static int parse_bitdepth_colorspace_sampling(BITSTREAM_PROFILE profile, + struct vpx_read_bit_buffer *rb) { + vpx_color_space_t color_space; + if (profile >= PROFILE_2) rb->bit_offset += 1; // Bit-depth 10 or 12. + color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3); + if (color_space != VPX_CS_SRGB) { + rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 2; // subsampling x/y. + rb->bit_offset += 1; // unused. + } + } else { + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 1; // unused + } else { + // RGB is only available in version 1. + return 0; + } + } + return 1; +} + +static vpx_codec_err_t decoder_peek_si_internal( + const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, + int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { + int intra_only_flag = 0; + uint8_t clear_buffer[11]; + + if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; + + si->is_kf = 0; + si->w = si->h = 0; + + if (decrypt_cb) { + data_sz = VPXMIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, data_sz); + data = clear_buffer; + } + + // A maximum of 6 bits are needed to read the frame marker, profile and + // show_existing_frame. + if (data_sz < 1) return VPX_CODEC_UNSUP_BITSTREAM; + + { + int show_frame; + int error_resilient; + struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int frame_marker = vpx_rb_read_literal(&rb, 2); + const BITSTREAM_PROFILE profile = vp9_read_profile(&rb); + + if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; + + if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM; + + if (vpx_rb_read_bit(&rb)) { // show an existing frame + // If profile is > 2 and show_existing_frame is true, then at least 1 more + // byte (6+3=9 bits) is needed. + if (profile > 2 && data_sz < 2) return VPX_CODEC_UNSUP_BITSTREAM; + vpx_rb_read_literal(&rb, 3); // Frame buffer to show. + return VPX_CODEC_OK; + } + + // For the rest of the function, a maximum of 9 more bytes are needed + // (computed by taking the maximum possible bits needed in each case). Note + // that this has to be updated if we read any more bits in this function. + if (data_sz < 10) return VPX_CODEC_UNSUP_BITSTREAM; + + si->is_kf = !vpx_rb_read_bit(&rb); + show_frame = vpx_rb_read_bit(&rb); + error_resilient = vpx_rb_read_bit(&rb); + + if (si->is_kf) { + if (!vp9_read_sync_code(&rb)) return VPX_CODEC_UNSUP_BITSTREAM; + + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } else { + intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb); + + rb.bit_offset += error_resilient ? 0 : 2; // reset_frame_context + + if (intra_only_flag) { + if (!vp9_read_sync_code(&rb)) return VPX_CODEC_UNSUP_BITSTREAM; + if (profile > PROFILE_0) { + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + // The colorspace info may cause vp9_read_frame_size() to need 11 + // bytes. + if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM; + } + rb.bit_offset += REF_FRAMES; // refresh_frame_flags + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } + } + } + if (is_intra_only != NULL) *is_intra_only = intra_only_flag; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t decoder_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL); +} + +static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si) { + const size_t sz = (si->sz >= sizeof(vp9_stream_info_t)) + ? sizeof(vp9_stream_info_t) + : sizeof(vpx_codec_stream_info_t); + memcpy(si, &ctx->si, sz); + si->sz = (unsigned int)sz; + + return VPX_CODEC_OK; +} + +static void set_error_detail(vpx_codec_alg_priv_t *ctx, + const char *const error) { + ctx->base.err_detail = error; +} + +static vpx_codec_err_t update_error_state( + vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { + if (error->error_code) + set_error_detail(ctx, error->has_detail ? error->detail : NULL); + + return error->error_code; +} + +static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + return VPX_CODEC_MEM_ERROR; + } + + pool->cb_priv = &pool->int_frame_buffers; + } + + return VPX_CODEC_OK; +} + +static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { + cfg->post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK; + cfg->deblocking_level = 4; + cfg->noise_level = 0; +} + +static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { + flags->post_proc_flag = ctx->postproc_cfg.post_proc_flag; + + flags->deblocking_level = ctx->postproc_cfg.deblocking_level; + flags->noise_level = ctx->postproc_cfg.noise_level; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + vpx_codec_err_t res; + ctx->last_show_frame = -1; + ctx->need_resync = 1; + ctx->flushed = 0; + + ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); + if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; + + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + set_error_detail(ctx, "Failed to allocate decoder"); + return VPX_CODEC_MEM_ERROR; + } + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; + + RANGE_CHECK(ctx, row_mt, 0, 1); + ctx->pbi->row_mt = ctx->row_mt; + + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + + // If postprocessing was enabled by the application and a + // configuration has not been provided, default it. + if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) + set_default_ppflags(&ctx->postproc_cfg); + + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; +} + +static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, + const VP9Decoder *const pbi) { + // Clear resync flag if the decoder got a key frame or intra only frame. + if (ctx->need_resync == 1 && pbi->need_resync == 0 && + (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) + ctx->need_resync = 0; +} + +static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, + const uint8_t **data, unsigned int data_sz, + void *user_priv) { + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. + if (!ctx->si.h) { + int is_intra_only = 0; + const vpx_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) return res; + + if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; + } + + ctx->user_priv = user_priv; + + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; + + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); + } + + check_resync(ctx, ctx->pbi); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, unsigned int data_sz, + void *user_priv) { + const uint8_t *data_start = data; + vpx_codec_err_t res; + uint32_t frame_sizes[8]; + int frame_count; + + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { + res = init_decoder(ctx); + if (res != VPX_CODEC_OK) return res; + } + + res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) return res; + + if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) + frame_count = ctx->svc_spatial_layer + 1; + + // Decode in serial mode. + if (frame_count > 0) { + const uint8_t *const data_end = data + data_sz; + int i; + + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; + } + + res = decode_one(ctx, &data_start_copy, frame_size, user_priv); + if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; + } + } else { + const uint8_t *const data_end = data + data_sz; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + res = decode_one(ctx, &data_start, frame_size, user_priv); + if (res != VPX_CODEC_OK) return res; + + // Account for suboptimal termination by the encoder. + while (data_start < data_end) { + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; + } + } + } + + return res; +} + +static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { + vpx_image_t *img = NULL; + + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; + + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } + } + return NULL; +} + +static vpx_codec_err_t decoder_set_fb_fn( + vpx_codec_alg_priv_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return VPX_CODEC_INVALID_PARAM; + } else if (ctx->pbi == NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return VPX_CODEC_OK; + } + + return VPX_CODEC_ERROR; +} + +static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + image2yuvconfig(&frame->img, &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + image2yuvconfig(&frame->img, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + + if (data) { + if (ctx->pbi) { + const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx); + if (fb == NULL) return VPX_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_VP9_POSTPROC + vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + + if (data) { + ctx->postproc_cfg_set = 1; + ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); + + if (update_info) { + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted) { + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; + if (ctx->last_show_frame >= 0) + *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const frame_size = va_arg(args, int *); + + if (frame_size) { + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; + frame_size[0] = cm->width; + frame_size[1] = cm->height; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); + + if (render_size) { + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; + render_size[0] = cm->render_width; + render_size[1] = cm->render_height; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + + if (bit_depth) { + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; + *bit_depth = cm->bit_depth; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + ctx->decrypt_cb = init ? init->decrypt_cb : NULL; + ctx->decrypt_state = init ? init->decrypt_state : NULL; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 0)) + return VPX_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->svc_decoding = 1; + ctx->svc_spatial_layer = va_arg(args, int); + if (ctx->svc_spatial_layer < 0) + return VPX_CODEC_INVALID_PARAM; + else + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + { VP8_COPY_REFERENCE, ctrl_copy_reference }, + + // Setters + { VP8_SET_REFERENCE, ctrl_set_reference }, + { VP8_SET_POSTPROC, ctrl_set_postproc }, + { VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, + { VPXD_SET_DECRYPTOR, ctrl_set_decryptor }, + { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, + { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, + { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, + + // Getters + { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, + { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, + { VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, + { VP9_GET_REFERENCE, ctrl_get_reference }, + { VP9D_GET_DISPLAY_SIZE, ctrl_get_render_size }, + { VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth }, + { VP9D_GET_FRAME_SIZE, ctrl_get_frame_size }, + + { -1, NULL }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp9_dx) = { + "WebM Project VP9 Decoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, +#if CONFIG_VP9_HIGHBITDEPTH + VPX_CODEC_CAP_HIGHBITDEPTH | +#endif + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | + VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t + decoder_init, // vpx_codec_init_fn_t + decoder_destroy, // vpx_codec_destroy_fn_t + decoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t + { + // NOLINT + decoder_peek_si, // vpx_codec_peek_si_fn_t + decoder_get_si, // vpx_codec_get_si_fn_t + decoder_decode, // vpx_codec_decode_fn_t + decoder_get_frame, // vpx_codec_frame_get_fn_t + decoder_set_fb_fn, // vpx_codec_set_fb_fn_t + }, + { + // NOLINT + 0, + NULL, // vpx_codec_enc_cfg_map_t + NULL, // vpx_codec_encode_fn_t + NULL, // vpx_codec_get_cx_data_fn_t + NULL, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + NULL, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + } +}; diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.h b/media/libvpx/libvpx/vp9/vp9_dx_iface.h new file mode 100644 index 0000000000..f60688c4db --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_VP9_DX_IFACE_H_ +#define VPX_VP9_VP9_DX_IFACE_H_ + +#include "vp9/decoder/vp9_decoder.h" + +typedef vpx_codec_stream_info_t vp9_stream_info_t; + +struct vpx_codec_alg_priv { + vpx_codec_priv_t base; + vpx_codec_dec_cfg_t cfg; + vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; + int postproc_cfg_set; + vp8_postproc_cfg_t postproc_cfg; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + vpx_image_t img; + int img_avail; + int flushed; + int invert_tile_order; + int last_show_frame; // Index of last output frame. + int byte_alignment; + int skip_loop_filter; + + int need_resync; // wait for key/intra-only frame + // BufferPool that holds all reference frames. + BufferPool *buffer_pool; + + // External frame buffer info to save for VP9 common. + void *ext_priv; // Private data associated with the external frame buffers. + vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; + + // Allow for decoding up to a given spatial layer for SVC stream. + int svc_decoding; + int svc_spatial_layer; + int row_mt; + int lpf_opt; +}; + +#endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.c b/media/libvpx/libvpx/vp9/vp9_iface_common.c new file mode 100644 index 0000000000..8d031694d8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_iface_common.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file in the root of the source tree. An additional + * intellectual property rights grant can be found in the file PATENTS. + * All contributing project authors may be found in the AUTHORS file in + * the root of the source tree. + */ + +#include "vp9/vp9_iface_common.h" +void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + int bps; + if (!yv12->subsampling_y) { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I444; + bps = 24; + } else { + img->fmt = VPX_IMG_FMT_I422; + bps = 16; + } + } else { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I440; + bps = 16; + } else { + img->fmt = VPX_IMG_FMT_I420; + bps = 12; + } + } + img->cs = yv12->color_space; + img->range = yv12->color_range; + img->bit_depth = 8; + img->w = yv12->y_stride; + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); + img->d_w = yv12->y_crop_width; + img->d_h = yv12->y_crop_height; + img->r_w = yv12->render_width; + img->r_h = yv12->render_height; + img->x_chroma_shift = yv12->subsampling_x; + img->y_chroma_shift = yv12->subsampling_y; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { + // vpx_image_t uses byte strides and a pointer to the first byte + // of the image. + img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH); + img->bit_depth = yv12->bit_depth; + img->planes[VPX_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer); + img->planes[VPX_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer); + img->planes[VPX_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer); + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride; + img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + img->bps = bps; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; + yv12->render_width = img->r_w; + yv12->render_height = img->r_h; + yv12->y_width = img->d_w; + yv12->y_height = img->d_h; + + yv12->uv_width = img->x_chroma_shift == 1 || img->fmt == VPX_IMG_FMT_NV12 + ? (1 + yv12->y_width) / 2 + : yv12->y_width; + yv12->uv_height = + img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height; + yv12->uv_crop_width = yv12->uv_width; + yv12->uv_crop_height = yv12->uv_height; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + yv12->color_space = img->cs; + yv12->color_range = img->range; + +#if CONFIG_VP9_HIGHBITDEPTH + if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + // In vpx_image_t + // planes point to uint8 address of start of data + // stride counts uint8s to reach next row + // In YV12_BUFFER_CONFIG + // y_buffer, u_buffer, v_buffer point to uint16 address of data + // stride and border counts in uint16s + // This means that all the address calculations in the main body of code + // should work correctly. + // However, before we do any pixel operations we need to cast the address + // to a uint16 ponter and double its value. + yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); + yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); + yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); + yv12->y_stride >>= 1; + yv12->uv_stride >>= 1; + yv12->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + yv12->flags = 0; + } + yv12->border = (yv12->y_stride - img->w) / 2; +#else + yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; +#endif // CONFIG_VP9_HIGHBITDEPTH + yv12->subsampling_x = img->x_chroma_shift; + yv12->subsampling_y = img->y_chroma_shift; + // When reading the data, UV are in one plane for NV12 format, thus + // x_chroma_shift is 0. After converting, UV are in separate planes, and + // subsampling_x should be set to 1. + if (img->fmt == VPX_IMG_FMT_NV12) yv12->subsampling_x = 1; + return VPX_CODEC_OK; +} diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.h b/media/libvpx/libvpx/vp9/vp9_iface_common.h new file mode 100644 index 0000000000..e646917c69 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_iface_common.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VP9_VP9_IFACE_COMMON_H_ +#define VPX_VP9_VP9_IFACE_COMMON_H_ + +#include +#include "vpx_ports/mem.h" +#include "vpx/vp8.h" +#include "vpx_scale/yv12config.h" +#include "common/vp9_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv); + +vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12); + +static INLINE VP9_REFFRAME +ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { + switch (frame) { + case VP8_LAST_FRAME: return VP9_LAST_FLAG; + case VP8_GOLD_FRAME: return VP9_GOLD_FLAG; + case VP8_ALTR_FRAME: return VP9_ALT_FLAG; + } + assert(0 && "Invalid Reference Frame"); + return VP9_LAST_FLAG; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_VP9_IFACE_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk new file mode 100644 index 0000000000..44790ef6a4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -0,0 +1,178 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +VP9_CX_EXPORTS += exports_enc + +VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes) +VP9_CX_SRCS-no += $(VP9_COMMON_SRCS-no) +VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) +VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) + +VP9_CX_SRCS-yes += vp9_cx_iface.c +VP9_CX_SRCS-yes += vp9_cx_iface.h + +VP9_CX_SRCS-yes += encoder/vp9_bitstream.c +VP9_CX_SRCS-yes += encoder/vp9_context_tree.c +VP9_CX_SRCS-yes += encoder/vp9_context_tree.h +VP9_CX_SRCS-yes += encoder/vp9_cost.h +VP9_CX_SRCS-yes += encoder/vp9_cost.c +VP9_CX_SRCS-yes += encoder/vp9_dct.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h +VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c +VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h +VP9_CX_SRCS-yes += encoder/vp9_encodemb.c +VP9_CX_SRCS-yes += encoder/vp9_encodemv.c +VP9_CX_SRCS-yes += encoder/vp9_ethread.h +VP9_CX_SRCS-yes += encoder/vp9_ethread.c +VP9_CX_SRCS-yes += encoder/vp9_extend.c +VP9_CX_SRCS-yes += encoder/vp9_firstpass.c +VP9_CX_SRCS-yes += encoder/vp9_block.h +VP9_CX_SRCS-yes += encoder/vp9_bitstream.h +VP9_CX_SRCS-yes += encoder/vp9_encodemb.h +VP9_CX_SRCS-yes += encoder/vp9_encodemv.h +VP9_CX_SRCS-yes += encoder/vp9_extend.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h +VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c +VP9_CX_SRCS-yes += encoder/vp9_job_queue.h +VP9_CX_SRCS-yes += encoder/vp9_lookahead.c +VP9_CX_SRCS-yes += encoder/vp9_lookahead.h +VP9_CX_SRCS-yes += encoder/vp9_mcomp.h +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h +VP9_CX_SRCS-yes += encoder/vp9_encoder.h +VP9_CX_SRCS-yes += encoder/vp9_quantize.h +VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h +VP9_CX_SRCS-yes += encoder/vp9_rd.h +VP9_CX_SRCS-yes += encoder/vp9_rdopt.h +VP9_CX_SRCS-yes += encoder/vp9_pickmode.h +VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h +VP9_CX_SRCS-yes += encoder/vp9_tokenize.h +VP9_CX_SRCS-yes += encoder/vp9_treewriter.h +VP9_CX_SRCS-yes += encoder/vp9_mcomp.c +VP9_CX_SRCS-yes += encoder/vp9_encoder.c +VP9_CX_SRCS-yes += encoder/vp9_picklpf.c +VP9_CX_SRCS-yes += encoder/vp9_picklpf.h +VP9_CX_SRCS-yes += encoder/vp9_quantize.c +VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c +VP9_CX_SRCS-yes += encoder/vp9_rd.c +VP9_CX_SRCS-yes += encoder/vp9_rdopt.c +VP9_CX_SRCS-yes += encoder/vp9_pickmode.c +VP9_CX_SRCS-yes += encoder/vp9_partition_models.h +VP9_CX_SRCS-yes += encoder/vp9_segmentation.c +VP9_CX_SRCS-yes += encoder/vp9_segmentation.h +VP9_CX_SRCS-yes += encoder/vp9_speed_features.c +VP9_CX_SRCS-yes += encoder/vp9_speed_features.h +VP9_CX_SRCS-yes += encoder/vp9_subexp.c +VP9_CX_SRCS-yes += encoder/vp9_subexp.h +VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c +VP9_CX_SRCS-yes += encoder/vp9_resize.c +VP9_CX_SRCS-yes += encoder/vp9_resize.h +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h +VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.c +VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.h + +VP9_CX_SRCS-yes += encoder/vp9_tokenize.c +VP9_CX_SRCS-yes += encoder/vp9_treewriter.c +VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c +VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h +VP9_CX_SRCS-yes += encoder/vp9_aq_360.c +VP9_CX_SRCS-yes += encoder/vp9_aq_360.h +VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c +VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h +VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c +VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h +VP9_CX_SRCS-yes += encoder/vp9_alt_ref_aq.h +VP9_CX_SRCS-yes += encoder/vp9_alt_ref_aq.c +VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c +VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h +VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c +VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h +ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c +endif +VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c +VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h +VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c +VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h + +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h + +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +endif + +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm + +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c + +ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c +endif + +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c + +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif + +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c + +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.h + +VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vp9/vp9dx.mk b/media/libvpx/libvpx/vp9/vp9dx.mk new file mode 100644 index 0000000000..93a5f368bd --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9dx.mk @@ -0,0 +1,34 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +VP9_DX_EXPORTS += exports_dec + +VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes) +VP9_DX_SRCS-no += $(VP9_COMMON_SRCS-no) +VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) +VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) + +VP9_DX_SRCS-yes += vp9_dx_iface.c +VP9_DX_SRCS-yes += vp9_dx_iface.h + +VP9_DX_SRCS-yes += decoder/vp9_decodemv.c +VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c +VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h +VP9_DX_SRCS-yes += decoder/vp9_detokenize.c +VP9_DX_SRCS-yes += decoder/vp9_decodemv.h +VP9_DX_SRCS-yes += decoder/vp9_detokenize.h +VP9_DX_SRCS-yes += decoder/vp9_decoder.c +VP9_DX_SRCS-yes += decoder/vp9_decoder.h +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h +VP9_DX_SRCS-yes += decoder/vp9_job_queue.c +VP9_DX_SRCS-yes += decoder/vp9_job_queue.h + +VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vpx/exports_com b/media/libvpx/libvpx/vpx/exports_com new file mode 100644 index 0000000000..f0b46aa175 --- /dev/null +++ b/media/libvpx/libvpx/vpx/exports_com @@ -0,0 +1,19 @@ +text vpx_codec_build_config +text vpx_codec_control_ +text vpx_codec_destroy +text vpx_codec_err_to_string +text vpx_codec_error +text vpx_codec_error_detail +text vpx_codec_get_caps +text vpx_codec_iface_name +text vpx_codec_version +text vpx_codec_version_extra_str +text vpx_codec_version_str +text vpx_img_alloc +text vpx_img_flip +text vpx_img_free +text vpx_img_set_rect +text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/media/libvpx/libvpx/vpx/exports_dec b/media/libvpx/libvpx/vpx/exports_dec new file mode 100644 index 0000000000..c694ebae12 --- /dev/null +++ b/media/libvpx/libvpx/vpx/exports_dec @@ -0,0 +1,8 @@ +text vpx_codec_dec_init_ver +text vpx_codec_decode +text vpx_codec_get_frame +text vpx_codec_get_stream_info +text vpx_codec_peek_stream_info +text vpx_codec_register_put_frame_cb +text vpx_codec_register_put_slice_cb +text vpx_codec_set_frame_buffer_functions diff --git a/media/libvpx/libvpx/vpx/exports_enc b/media/libvpx/libvpx/vpx/exports_enc new file mode 100644 index 0000000000..914e36cd4c --- /dev/null +++ b/media/libvpx/libvpx/vpx/exports_enc @@ -0,0 +1,9 @@ +text vpx_codec_enc_config_default +text vpx_codec_enc_config_set +text vpx_codec_enc_init_multi_ver +text vpx_codec_enc_init_ver +text vpx_codec_encode +text vpx_codec_get_cx_data +text vpx_codec_get_global_headers +text vpx_codec_get_preview_frame +text vpx_codec_set_cx_data_buf diff --git a/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h new file mode 100644 index 0000000000..275b6a436c --- /dev/null +++ b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the decoder algorithm interface for algorithm + * implementations. + * + * This file defines the private structures and data types that are only + * relevant to implementing an algorithm, as opposed to using it. + * + * To create a decoder algorithm class, an interface structure is put + * into the global namespace: + *

+ *     my_codec.c:
+ *       vpx_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           VPX_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     
+ * + * An application instantiates a specific decoder instance by using + * vpx_codec_dec_init() and a pointer to the algorithm's interface structure: + *
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ *       }
+ *     
+ * + * Once initialized, the instance is manged using other functions from + * the vpx_codec_* family. + */ +#ifndef VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#define VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#include "../vpx_decoder.h" +#include "../vpx_encoder.h" +#include + +#include "vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/ + +typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; + +/*!\brief init function pointer prototype + * + * Performs algorithm-specific initialization of the decoder context. This + * function is called by vpx_codec_dec_init() and vpx_codec_enc_init(), so + * plugins implementing this interface may trust the input parameters to be + * properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \retval #VPX_CODEC_OK + * The input stream was recognized and decoder initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory operation failed. + */ +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)( + vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data); + +/*!\brief destroy function pointer prototype + * + * Performs algorithm-specific destruction of the decoder context. This + * function is called by the generic vpx_codec_destroy() wrapper function, + * so plugins implementing this interface may trust the input parameters + * to be properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \retval #VPX_CODEC_OK + * The input stream was recognized and decoder initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory operation failed. + */ +typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx); + +/*!\brief parse stream info function pointer prototype + * + * Performs high level parsing of the bitstream. This function is called by the + * generic vpx_codec_peek_stream_info() wrapper function, so plugins + * implementing this interface may trust the input parameters to be properly + * initialized. + * + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si); + +/*!\brief control function pointer prototype + * + * This function is used to exchange algorithm specific data with the decoder + * instance. This can be used to implement features specific to a particular + * algorithm. + * + * This function is called by the generic vpx_codec_control() wrapper + * function, so plugins implementing this interface may trust the input + * parameters to be properly initialized. However, this interface does not + * provide type safety for the exchanged data or assign meanings to the + * control codes. Those details should be specified in the algorithm's + * header file. In particular, the ctrl_id parameter is guaranteed to exist + * in the algorithm's control mapping table, and the data parameter may be NULL. + * + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * \param[in,out] data Data to exchange with algorithm instance. + * + * \retval #VPX_CODEC_OK + * The internal state data was deserialized. + */ +typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx, + va_list ap); + +/*!\brief control function pointer mapping + * + * This structure stores the mapping between control identifiers and + * implementing functions. Each algorithm provides a list of these + * mappings. This list is searched by the vpx_codec_control() wrapper + * function to determine which function to invoke. The special + * value {0, NULL} is used to indicate end-of-list, and must be + * present. The special value {0, } can be used as a catch-all + * mapping. This implies that ctrl_id values chosen by the algorithm + * \ref MUST be non-zero. + */ +typedef const struct vpx_codec_ctrl_fn_map { + int ctrl_id; + vpx_codec_control_fn_t fn; +} vpx_codec_ctrl_fn_map_t; + +/*!\brief decode data function pointer prototype + * + * Processes a buffer of coded data. If the processing results in a new + * decoded frame becoming available, put_slice and put_frame callbacks + * are invoked as appropriate. This function is called by the generic + * vpx_codec_decode() wrapper function, so plugins implementing this + * interface may trust the input parameters to be properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. If + * NULL, the put_frame callback is invoked for + * the previously decoded frame. + * \param[in] data_sz Size of the coded data, in bytes. + * + * \return Returns #VPX_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::vpx_codec_err_t + * for recoverability capabilities. + */ +typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter); + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( + vpx_codec_alg_priv_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t flags, + vpx_enc_deadline_t deadline); +typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( + vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); + +typedef vpx_codec_err_t (*vpx_codec_enc_config_set_fn_t)( + vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg); +typedef vpx_fixed_buf_t *(*vpx_codec_get_global_headers_fn_t)( + vpx_codec_alg_priv_t *ctx); + +typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)( + vpx_codec_alg_priv_t *ctx); + +typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)( + const vpx_codec_enc_cfg_t *cfg, void **mem_loc); + +/*!\brief usage configuration mapping + * + * This structure stores the mapping between usage identifiers and + * configuration structures. Each algorithm provides a list of these + * mappings. This list is searched by the vpx_codec_enc_config_default() + * wrapper function to determine which config to return. The special value + * {-1, {0}} is used to indicate end-of-list, and must be present. At least + * one mapping must be present, in addition to the end-of-list. + * + */ +typedef const struct vpx_codec_enc_cfg_map { + int usage; + vpx_codec_enc_cfg_t cfg; +} vpx_codec_enc_cfg_map_t; + +/*!\brief Decoder algorithm interface + * + * All decoders \ref MUST expose a variable of this type. + */ +struct vpx_codec_iface { + const char *name; /**< Identification String */ + int abi_version; /**< Implemented ABI version */ + vpx_codec_caps_t caps; /**< Decoder capabilities */ + vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ + vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ + vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ + struct vpx_codec_dec_iface { + vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ + vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ + vpx_codec_get_frame_fn_t + get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ + vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ + } dec; + struct vpx_codec_enc_iface { + int cfg_map_count; + vpx_codec_enc_cfg_map_t + *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ + vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ + vpx_codec_get_cx_data_fn_t + get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ + vpx_codec_enc_config_set_fn_t + cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t + get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ + vpx_codec_get_preview_frame_fn_t + get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t + mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ + } enc; +}; + +/*!\brief Callback function pointer / user data pair storage */ +typedef struct vpx_codec_priv_cb_pair { + union { + vpx_codec_put_frame_cb_fn_t put_frame; + vpx_codec_put_slice_cb_fn_t put_slice; + } u; + void *user_priv; +} vpx_codec_priv_cb_pair_t; + +/*!\brief Instance private storage + * + * This structure is allocated by the algorithm's init function. It can be + * extended in one of two ways. First, a second, algorithm specific structure + * can be allocated and the priv member pointed to it. Alternatively, this + * structure can be made the first member of the algorithm specific structure, + * and the pointer cast to the proper type. + */ +struct vpx_codec_priv { + const char *err_detail; + vpx_codec_flags_t init_flags; + struct { + vpx_codec_priv_cb_pair_t put_frame_cb; + vpx_codec_priv_cb_pair_t put_slice_cb; + } dec; + struct { + vpx_fixed_buf_t cx_data_dst_buf; + unsigned int cx_data_pad_before; + unsigned int cx_data_pad_after; + vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; + } enc; +}; + +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg { + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void *mr_low_res_mode_info; +}; + +#undef VPX_CTRL_USE_TYPE +#define VPX_CTRL_USE_TYPE(id, typ) \ + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } + +#undef VPX_CTRL_USE_TYPE_DEPRECATED +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } + +#define CAST(id, arg) id##__value(arg) + +/* CODEC_INTERFACE convenience macro + * + * By convention, each codec interface is a struct with extern linkage, where + * the symbol is suffixed with _algo. A getter function is also defined to + * return a pointer to the struct, since in some cases it's easier to work + * with text symbols than data symbols (see issue #169). This function has + * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE + * macro is provided to define this getter function automatically. + */ +#define CODEC_INTERFACE(id) \ + vpx_codec_iface_t *id(void) { return &id##_algo; } \ + vpx_codec_iface_t id##_algo + +/* Internal Utility Functions + * + * The following functions are intended to be used inside algorithms as + * utilities for manipulating vpx_codec_* data structures. + */ +struct vpx_codec_pkt_list { + unsigned int cnt; + unsigned int max; + struct vpx_codec_cx_pkt pkts[1]; +}; + +#define vpx_codec_pkt_list_decl(n) \ + union { \ + struct vpx_codec_pkt_list head; \ + struct { \ + struct vpx_codec_pkt_list head; \ + struct vpx_codec_cx_pkt pkts[n]; \ + } alloc; \ + } + +#define vpx_codec_pkt_list_init(m) \ + (m)->alloc.head.cnt = 0, \ + (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) + +int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *, + const struct vpx_codec_cx_pkt *); + +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter); + +#include +#include + +struct vpx_internal_error_info { + vpx_codec_err_t error_code; + int has_detail; + char detail[80]; + int setjmp; + jmp_buf jmp; +}; + +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + +#define CLANG_ANALYZER_NORETURN +#if defined(__has_feature) +#if __has_feature(attribute_analyzer_noreturn) +#undef CLANG_ANALYZER_NORETURN +#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn)) +#endif +#endif + +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef LIBVPX_FORMAT_PRINTF +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, const char *fmt, ...) + LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h new file mode 100644 index 0000000000..01d64b14b7 --- /dev/null +++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ +#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ + +#include "vpx/vpx_encoder.h" + +namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + +struct VpxRateControlRtcConfig { + public: + VpxRateControlRtcConfig() { + width = 1280; + height = 720; + max_quantizer = 63; + min_quantizer = 2; + target_bandwidth = 1000; + buf_initial_sz = 600; + buf_optimal_sz = 600; + buf_sz = 1000; + undershoot_pct = overshoot_pct = 50; + max_intra_bitrate_pct = 50; + max_inter_bitrate_pct = 0; + framerate = 30.0; + ts_number_layers = 1; + rc_mode = VPX_CBR; + aq_mode = 0; + layer_target_bitrate[0] = static_cast(target_bandwidth); + ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; + is_screen = false; + } + + int width; + int height; + // 0-63 + int max_quantizer; + int min_quantizer; + int64_t target_bandwidth; + int64_t buf_initial_sz; + int64_t buf_optimal_sz; + int64_t buf_sz; + int undershoot_pct; + int overshoot_pct; + int max_intra_bitrate_pct; + int max_inter_bitrate_pct; + double framerate; + // Number of temporal layers + int ts_number_layers; + int layer_target_bitrate[VPX_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + // vbr, cbr + enum vpx_rc_mode rc_mode; + int aq_mode; + int frame_drop_thresh; + bool is_screen; +}; +} // namespace libvpx +#endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vpx/src/vpx_codec.c b/media/libvpx/libvpx/vpx/src/vpx_codec.c new file mode 100644 index 0000000000..24528d860a --- /dev/null +++ b/media/libvpx/libvpx/vpx/src/vpx_codec.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Provides the high level interface to wrap decoder algorithms. + * + */ +#include +#include +#include "vpx/vpx_integer.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_version.h" + +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) + +int vpx_codec_version(void) { return VERSION_PACKED; } + +const char *vpx_codec_version_str(void) { return VERSION_STRING_NOSP; } + +const char *vpx_codec_version_extra_str(void) { return VERSION_EXTRA; } + +const char *vpx_codec_iface_name(vpx_codec_iface_t *iface) { + return iface ? iface->name : ""; +} + +const char *vpx_codec_err_to_string(vpx_codec_err_t err) { + switch (err) { + case VPX_CODEC_OK: return "Success"; + case VPX_CODEC_ERROR: return "Unspecified internal error"; + case VPX_CODEC_MEM_ERROR: return "Memory allocation error"; + case VPX_CODEC_ABI_MISMATCH: return "ABI version mismatch"; + case VPX_CODEC_INCAPABLE: + return "Codec does not implement requested capability"; + case VPX_CODEC_UNSUP_BITSTREAM: + return "Bitstream not supported by this decoder"; + case VPX_CODEC_UNSUP_FEATURE: + return "Bitstream required feature not supported by this decoder"; + case VPX_CODEC_CORRUPT_FRAME: return "Corrupt frame detected"; + case VPX_CODEC_INVALID_PARAM: return "Invalid parameter"; + case VPX_CODEC_LIST_END: return "End of iterated list"; + } + + return "Unrecognized error code"; +} + +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { + return (ctx) ? vpx_codec_err_to_string(ctx->err) + : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); +} + +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { + if (ctx && ctx->err) + return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; + + return NULL; +} + +vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { + vpx_codec_err_t res; + + if (!ctx) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else { + ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv); + + ctx->iface = NULL; + ctx->name = NULL; + ctx->priv = NULL; + res = VPX_CODEC_OK; + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { + return iface ? iface->caps : 0; +} + +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { + vpx_codec_err_t res; + + if (!ctx || !ctrl_id) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) + res = VPX_CODEC_ERROR; + else { + vpx_codec_ctrl_fn_map_t *entry; + + res = VPX_CODEC_INCAPABLE; + + for (entry = ctx->iface->ctrl_maps; entry->fn; entry++) { + if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) { + va_list ap; + + va_start(ap, ctrl_id); + res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap); + va_end(ap); + break; + } + } + } + + return SAVE_STATUS(ctx, res); +} + +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, const char *fmt, ...) { + va_list ap; + + info->error_code = error; + info->has_detail = 0; + + if (fmt) { + size_t sz = sizeof(info->detail); + + info->has_detail = 1; + va_start(ap, fmt); + vsnprintf(info->detail, sz - 1, fmt, ap); + va_end(ap); + info->detail[sz - 1] = '\0'; + } + + if (info->setjmp) longjmp(info->jmp, info->error_code); +} diff --git a/media/libvpx/libvpx/vpx/src/vpx_decoder.c b/media/libvpx/libvpx/vpx/src/vpx_decoder.c new file mode 100644 index 0000000000..c79cc708cd --- /dev/null +++ b/media/libvpx/libvpx/vpx/src/vpx_decoder.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Provides the high level interface to wrap decoder algorithms. + * + */ +#include +#include "vpx/internal/vpx_codec_internal.h" + +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) + +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_dec_cfg_t *cfg, + vpx_codec_flags_t flags, int ver) { + vpx_codec_err_t res; + + if (ver != VPX_DECODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if ((flags & VPX_CODEC_USE_POSTPROC) && + !(iface->caps & VPX_CODEC_CAP_POSTPROC)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && + !(iface->caps & VPX_CODEC_CAP_ERROR_CONCEALMENT)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_INPUT_FRAGMENTS) && + !(iface->caps & VPX_CODEC_CAP_INPUT_FRAGMENTS)) + res = VPX_CODEC_INCAPABLE; + else if (!(iface->caps & VPX_CODEC_CAP_DECODER)) + res = VPX_CODEC_INCAPABLE; + else { + memset(ctx, 0, sizeof(*ctx)); + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.dec = cfg; + + res = ctx->iface->init(ctx, NULL); + if (res) { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + vpx_codec_err_t res; + + if (!iface || !data || !data_sz || !si || + si->sz < sizeof(vpx_codec_stream_info_t)) + res = VPX_CODEC_INVALID_PARAM; + else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = iface->dec.peek_si(data, data_sz, si); + } + + return res; +} + +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si) { + vpx_codec_err_t res; + + if (!ctx || !si || si->sz < sizeof(vpx_codec_stream_info_t)) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline) { + vpx_codec_err_t res; + (void)deadline; + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (!ctx || (!data && data_sz) || (data && !data_sz)) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); + + return SAVE_STATUS(ctx, res); +} + +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter) { + vpx_image_t *img; + + if (!ctx || !iter || !ctx->iface || !ctx->priv) + img = NULL; + else + img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); + + return img; +} + +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + res = VPX_CODEC_INCAPABLE; + else { + ctx->priv->dec.put_frame_cb.u.put_frame = cb; + ctx->priv->dec.put_frame_cb.user_priv = user_priv; + res = VPX_CODEC_OK; + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + res = VPX_CODEC_INCAPABLE; + else { + ctx->priv->dec.put_slice_cb.u.put_slice = cb; + ctx->priv->dec.put_slice_cb.user_priv = user_priv; + res = VPX_CODEC_OK; + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb_get || !cb_release) { + res = VPX_CODEC_INVALID_PARAM; + } else if (!ctx->iface || !ctx->priv) { + res = VPX_CODEC_ERROR; + } else if (!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { + res = VPX_CODEC_INCAPABLE; + } else { + res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, + cb_priv); + } + + return SAVE_STATUS(ctx, res); +} diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c new file mode 100644 index 0000000000..017525aeee --- /dev/null +++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Provides the high level interface to wrap encoder algorithms. + * + */ +#include +#include +#include +#include +#include "vp8/common/blockd.h" +#include "vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" + +#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var)) + +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, + vpx_codec_flags_t flags, int ver) { + vpx_codec_err_t res; + + if (ver != VPX_ENCODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + res = VPX_CODEC_INCAPABLE; + else { + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, NULL); + + if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) { + vpx_codec_err_t res = VPX_CODEC_OK; + + if (ver != VPX_ENCODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1)) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + res = VPX_CODEC_INCAPABLE; + else { + int i; +#if CONFIG_MULTI_RES_ENCODING + int mem_loc_owned = 0; +#endif + void *mem_loc = NULL; + + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { + for (i = 0; i < num_enc; i++) { + vpx_codec_priv_enc_mr_cfg_t mr_cfg; + + /* Validate down-sampling factor. */ + if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || + dsf->den > dsf->num) { + res = VPX_CODEC_INVALID_PARAM; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); + } + + if (res) { + const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; + /* Destroy current ctx */ + ctx->err_detail = error_detail; + vpx_codec_destroy(ctx); + + /* Destroy already allocated high-level ctx */ + while (i) { + ctx--; + ctx->err_detail = error_detail; + vpx_codec_destroy(ctx); + i--; + } +#if CONFIG_MULTI_RES_ENCODING + if (!mem_loc_owned) { + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); + } +#endif + return SAVE_STATUS(ctx, res); + } +#if CONFIG_MULTI_RES_ENCODING + mem_loc_owned = 1; +#endif + ctx++; + cfg++; + dsf++; + } + ctx--; + } + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int usage) { + vpx_codec_err_t res; + + if (!iface || !cfg || usage != 0) + res = VPX_CODEC_INVALID_PARAM; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else { + assert(iface->enc.cfg_map_count == 1); + *cfg = iface->enc.cfg_maps->cfg; + res = VPX_CODEC_OK; + } + + return res; +} + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +/* On X86, disable the x87 unit's internal 80 bit precision for better + * consistency with the SSE unit's 64 bit precision. + */ +#include "vpx_ports/x86.h" +#define FLOATING_POINT_INIT() \ + do { \ + unsigned short x87_orig_mode = x87_set_double_precision() +#define FLOATING_POINT_RESTORE() \ + x87_set_control_word(x87_orig_mode); \ + } \ + while (0) + +#else +static void FLOATING_POINT_INIT() {} +static void FLOATING_POINT_RESTORE() {} +#endif + +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + vpx_enc_deadline_t deadline) { + vpx_codec_err_t res = VPX_CODEC_OK; + + if (!ctx || (img && !duration)) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else { + unsigned int num_enc = ctx->priv->enc.total_encoders; + + /* Execute in a normalized floating point environment, if the platform + * requires it. + */ + FLOATING_POINT_INIT(); + + if (num_enc == 1) + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, + deadline); + else { + /* Multi-resolution encoding: + * Encode multi-levels in reverse order. For example, + * if mr_total_resolutions = 3, first encode level 2, + * then encode level 1, and finally encode level 0. + */ + int i; + + ctx += num_enc - 1; + if (img) img += num_enc - 1; + + for (i = num_enc - 1; i >= 0; i--) { + if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, + flags, deadline))) + break; + + ctx--; + if (img) img--; + } + ctx++; + } + + FLOATING_POINT_RESTORE(); + } + + return SAVE_STATUS(ctx, res); +} + +const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter) { + const vpx_codec_cx_pkt_t *pkt = NULL; + + if (ctx) { + if (!iter) + ctx->err = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + ctx->err = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) + ctx->err = VPX_CODEC_INCAPABLE; + else + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); + } + + if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + // If the application has specified a destination area for the + // compressed data, and the codec has not placed the data there, + // and it fits, copy it. + vpx_codec_priv_t *const priv = ctx->priv; + char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; + + if (dst_buf && pkt->data.raw.buf != dst_buf && + pkt->data.raw.sz + priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { + vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; + + memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, + pkt->data.raw.sz); + *modified_pkt = *pkt; + modified_pkt->data.raw.buf = dst_buf; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; + pkt = modified_pkt; + } + + if (dst_buf == pkt->data.raw.buf) { + priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; + } + } + + return pkt; +} + +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, + const vpx_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return VPX_CODEC_INVALID_PARAM; + + if (buf) { + ctx->priv->enc.cx_data_dst_buf = *buf; + ctx->priv->enc.cx_data_pad_before = pad_before; + ctx->priv->enc.cx_data_pad_after = pad_after; + } else { + ctx->priv->enc.cx_data_dst_buf.buf = NULL; + ctx->priv->enc.cx_data_dst_buf.sz = 0; + ctx->priv->enc.cx_data_pad_before = 0; + ctx->priv->enc.cx_data_pad_after = 0; + } + + return VPX_CODEC_OK; +} + +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { + vpx_image_t *img = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) + ctx->err = VPX_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_preview) + ctx->err = VPX_CODEC_INCAPABLE; + else + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); + } + + return img; +} + +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { + vpx_fixed_buf_t *buf = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) + ctx->err = VPX_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_glob_hdrs) + ctx->err = VPX_CODEC_INCAPABLE; + else + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); + } + + return buf; +} + +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { + vpx_codec_err_t res; + + if (!ctx || !ctx->iface || !ctx->priv || !cfg) + res = VPX_CODEC_INVALID_PARAM; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); + + return SAVE_STATUS(ctx, res); +} + +int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list, + const struct vpx_codec_cx_pkt *pkt) { + if (list->cnt < list->max) { + list->pkts[list->cnt++] = *pkt; + return 0; + } + + return 1; +} + +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter) { + const vpx_codec_cx_pkt_t *pkt; + + if (!(*iter)) { + *iter = list->pkts; + } + + pkt = (const vpx_codec_cx_pkt_t *)*iter; + + if ((size_t)(pkt - list->pkts) < list->cnt) + *iter = pkt + 1; + else + pkt = NULL; + + return pkt; +} diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c new file mode 100644 index 0000000000..f9f0dd6025 --- /dev/null +++ b/media/libvpx/libvpx/vpx/src/vpx_image.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx/vpx_image.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int buf_align, + unsigned int stride_align, + unsigned char *img_data) { + unsigned int h, w, s, xcs, ycs, bps; + unsigned int stride_in_bytes; + unsigned int align; + + if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); + + /* Treat align==0 like align==1 */ + if (!buf_align) buf_align = 1; + + /* Validate alignment (must be power of 2) */ + if (buf_align & (buf_align - 1)) goto fail; + + /* Treat align==0 like align==1 */ + if (!stride_align) stride_align = 1; + + /* Validate alignment (must be power of 2) */ + if (stride_align & (stride_align - 1)) goto fail; + + /* Get sample size for this format */ + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: bps = 12; break; + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I440: bps = 16; break; + case VPX_IMG_FMT_I444: bps = 24; break; + case VPX_IMG_FMT_I42016: bps = 24; break; + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44016: bps = 32; break; + case VPX_IMG_FMT_I44416: bps = 48; break; + default: bps = 16; break; + } + + /* Get chroma shift values for this format */ + // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at + // one time. + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: xcs = 1; break; + default: xcs = 0; break; + } + + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_NV12: + case VPX_IMG_FMT_I440: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I44016: ycs = 1; break; + default: ycs = 0; break; + } + + /* Calculate storage sizes. If the buffer was allocated externally, the width + * and height shouldn't be adjusted. */ + w = d_w; + h = d_h; + s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8; + s = (s + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + + /* Allocate the new image */ + if (!img) { + img = (vpx_image_t *)calloc(1, sizeof(vpx_image_t)); + + if (!img) goto fail; + + img->self_allocd = 1; + } + + img->img_data = img_data; + + if (!img_data) { + uint64_t alloc_size; + /* Calculate storage sizes given the chroma subsampling */ + align = (1 << xcs) - 1; + w = (d_w + align) & ~align; + align = (1 << ycs) - 1; + h = (d_h + align) & ~align; + + s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8; + s = (s + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? (uint64_t)h * s * bps / 8 + : (uint64_t)h * s; + + if (alloc_size != (size_t)alloc_size) goto fail; + + img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size); + img->img_data_owner = 1; + } + + if (!img->img_data) goto fail; + + img->fmt = fmt; + img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; + img->w = w; + img->h = h; + img->x_chroma_shift = xcs; + img->y_chroma_shift = ycs; + img->bps = bps; + + /* Calculate strides */ + img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes; + img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs; + + /* Default viewport to entire image */ + if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) return img; + +fail: + vpx_img_free(img); + return NULL; +} + +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); +} + +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { + /* By setting buf_align = 1, we don't change buffer alignment in this + * function. */ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); +} + +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h) { + if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && + y + h <= img->h) { + img->d_w = w; + img->d_h = h; + + /* Calculate plane pointers */ + if (!(img->fmt & VPX_IMG_FMT_PLANAR)) { + img->planes[VPX_PLANE_PACKED] = + img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; + } else { + const int bytes_per_sample = + (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + unsigned char *data = img->img_data; + + if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) { + img->planes[VPX_PLANE_ALPHA] = + data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA]; + data += img->h * img->stride[VPX_PLANE_ALPHA]; + } + + img->planes[VPX_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y]; + data += img->h * img->stride[VPX_PLANE_Y]; + + if (img->fmt == VPX_IMG_FMT_NV12) { + img->planes[VPX_PLANE_U] = + data + (x >> img->x_chroma_shift) + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_V] = img->planes[VPX_PLANE_U] + 1; + } else if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { + img->planes[VPX_PLANE_U] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_V] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + } else { + img->planes[VPX_PLANE_V] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + img->planes[VPX_PLANE_U] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + } + } + return 0; + } + return -1; +} + +void vpx_img_flip(vpx_image_t *img) { + /* Note: In the calculation pointer adjustment calculation, we want the + * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 + * standard indicates that if the adjustment parameter is unsigned, the + * stride parameter will be promoted to unsigned, causing errors when + * the lhs is a larger type than the rhs. + */ + img->planes[VPX_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_Y]; + img->stride[VPX_PLANE_Y] = -img->stride[VPX_PLANE_Y]; + + img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_U]; + img->stride[VPX_PLANE_U] = -img->stride[VPX_PLANE_U]; + + img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_V]; + img->stride[VPX_PLANE_V] = -img->stride[VPX_PLANE_V]; + + img->planes[VPX_PLANE_ALPHA] += + (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; + img->stride[VPX_PLANE_ALPHA] = -img->stride[VPX_PLANE_ALPHA]; +} + +void vpx_img_free(vpx_image_t *img) { + if (img) { + if (img->img_data && img->img_data_owner) vpx_free(img->img_data); + + if (img->self_allocd) free(img); + } +} diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c new file mode 100644 index 0000000000..62c2a9c857 --- /dev/null +++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) { + int frame; + if (tpl_gop_stats == NULL) return; + for (frame = 0; frame < tpl_gop_stats->size; frame++) { + vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} diff --git a/media/libvpx/libvpx/vpx/vp8.h b/media/libvpx/libvpx/vpx/vp8.h new file mode 100644 index 0000000000..f30dafed58 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vp8.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\defgroup vp8 VP8 + * \ingroup codecs + * VP8 is a video compression algorithm that uses motion + * compensated prediction, Discrete Cosine Transform (DCT) coding of the + * prediction error signal and context dependent entropy coding techniques + * based on arithmetic principles. It features: + * - YUV 4:2:0 image format + * - Macro-block based coding (16x16 luma plus two 8x8 chroma) + * - 1/4 (1/8) pixel accuracy motion compensated prediction + * - 4x4 DCT transform + * - 128 level linear quantizer + * - In loop deblocking filter + * - Context-based entropy coding + * + * @{ + */ +/*!\file + * \brief Provides controls common to both the VP8 encoder and decoder. + */ +#ifndef VPX_VPX_VP8_H_ +#define VPX_VPX_VP8_H_ + +#include "./vpx_codec.h" +#include "./vpx_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Control functions + * + * The set of macros define the control functions of VP8 interface + */ +enum vp8_com_control_id { + /*!\brief pass in an external frame into decoder to be used as reference frame + */ + VP8_SET_REFERENCE = 1, + VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ + VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ + + /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) + * for its control ids. These should be migrated to something like the + * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI. + */ + VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ + VP8_COMMON_CTRL_ID_MAX, + VP8_DECODER_CTRL_ID_START = 256 +}; + +/*!\brief post process flags + * + * The set of macros define VP8 decoder post processing flags + */ +enum vp8_postproc_level { + VP8_NOFILTERING = 0, + VP8_DEBLOCK = 1 << 0, + VP8_DEMACROBLOCK = 1 << 1, + VP8_ADDNOISE = 1 << 2, + VP8_MFQE = 1 << 3 +}; + +/*!\brief post process flags + * + * This define a structure that describe the post processing settings. For + * the best objective measure (using the PSNR metric) set post_proc_flag + * to VP8_DEBLOCK and deblocking_level to 1. + */ + +typedef struct vp8_postproc_cfg { + /*!\brief the types of post processing to be done, should be combination of + * "vp8_postproc_level" */ + int post_proc_flag; + int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ + int noise_level; /**< the strength of additive noise, valid range [0, 16] */ +} vp8_postproc_cfg_t; + +/*!\brief reference frame type + * + * The set of macros define the type of VP8 reference frames + */ +typedef enum vpx_ref_frame_type { + VP8_LAST_FRAME = 1, + VP8_GOLD_FRAME = 2, + VP8_ALTR_FRAME = 4 +} vpx_ref_frame_type_t; + +/*!\brief reference frame data struct + * + * Define the data struct to access vp8 reference frames. + */ +typedef struct vpx_ref_frame { + vpx_ref_frame_type_t frame_type; /**< which reference frame */ + vpx_image_t img; /**< reference frame data in image format */ +} vpx_ref_frame_t; + +/*!\brief VP9 specific reference frame data struct + * + * Define the data struct to access vp9 reference frames. + */ +typedef struct vp9_ref_frame { + int idx; /**< frame index to get (input) */ + vpx_image_t img; /**< img structure to populate (output) */ +} vp9_ref_frame_t; + +/*!\cond */ +/*!\brief vp8 decoder control function parameter type + * + * defines the data type for each of VP8 decoder control function requires + */ +VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) +#define VPX_CTRL_VP8_SET_REFERENCE +VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) +#define VPX_CTRL_VP8_COPY_REFERENCE +VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) +#define VPX_CTRL_VP8_SET_POSTPROC +VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) +#define VPX_CTRL_VP9_GET_REFERENCE + +/*!\endcond */ +/*! @} - end defgroup vp8 */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VP8_H_ diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h new file mode 100644 index 0000000000..b12938d3d8 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vp8cx.h @@ -0,0 +1,1118 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_VP8CX_H_ +#define VPX_VPX_VP8CX_H_ + +/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder + * \ingroup vp8 + * + * @{ + */ +#include "./vp8.h" +#include "./vpx_encoder.h" +#include "./vpx_ext_ratectrl.h" + +/*!\file + * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the + * vpx Codec Interface. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\name Algorithm interface for VP8 + * + * This interface provides the capability to encode raw VP8 streams. + * @{ + */ + +/*!\brief A single instance of the VP8 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_cx(). + */ +extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; + +/*!\brief The interface to the VP8 encoder. + */ +extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); +/*!@} - end algorithm interface member group*/ + +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to encode raw VP9 streams. + * @{ + */ + +/*!\brief A single instance of the VP9 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_cx(). + */ +extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; + +/*!\brief The interface to the VP9 encoder. + */ +extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); +/*!@} - end algorithm interface member group*/ + +/* + * Algorithm Flags + */ + +/*!\brief Don't reference the last frame + * + * When this flag is set, the encoder will not use the last frame as a + * predictor. When not set, the encoder will choose whether to use the + * last frame or not automatically. + */ +#define VP8_EFLAG_NO_REF_LAST (1 << 16) + +/*!\brief Don't reference the golden frame + * + * When this flag is set, the encoder will not use the golden frame as a + * predictor. When not set, the encoder will choose whether to use the + * golden frame or not automatically. + */ +#define VP8_EFLAG_NO_REF_GF (1 << 17) + +/*!\brief Don't reference the alternate reference frame + * + * When this flag is set, the encoder will not use the alt ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt ref frame or not automatically. + */ +#define VP8_EFLAG_NO_REF_ARF (1 << 21) + +/*!\brief Don't update the last frame + * + * When this flag is set, the encoder will not update the last frame with + * the contents of the current frame. + */ +#define VP8_EFLAG_NO_UPD_LAST (1 << 18) + +/*!\brief Don't update the golden frame + * + * When this flag is set, the encoder will not update the golden frame with + * the contents of the current frame. + */ +#define VP8_EFLAG_NO_UPD_GF (1 << 22) + +/*!\brief Don't update the alternate reference frame + * + * When this flag is set, the encoder will not update the alt ref frame with + * the contents of the current frame. + */ +#define VP8_EFLAG_NO_UPD_ARF (1 << 23) + +/*!\brief Force golden frame update + * + * When this flag is set, the encoder copy the contents of the current frame + * to the golden frame buffer. + */ +#define VP8_EFLAG_FORCE_GF (1 << 19) + +/*!\brief Force alternate reference frame update + * + * When this flag is set, the encoder copy the contents of the current frame + * to the alternate reference frame buffer. + */ +#define VP8_EFLAG_FORCE_ARF (1 << 24) + +/*!\brief Disable entropy update + * + * When this flag is set, the encoder will not update its internal entropy + * model based on the entropy of this frame. + */ +#define VP8_EFLAG_NO_UPD_ENTROPY (1 << 20) + +/*!\brief VPx encoder control functions + * + * This set of macros define the control functions available for VPx + * encoder interface. + * + * \sa #vpx_codec_control + */ +enum vp8e_enc_control_id { + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP8 + */ + VP8E_SET_ROI_MAP = 8, + + /*!\brief Codec control function to pass an Active map to encoder. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_ACTIVEMAP, + + /*!\brief Codec control function to set encoder scaling mode. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_SCALEMODE = 11, + + /*!\brief Codec control function to set encoder internal speed settings. + * + * Changes in this value influences, among others, the encoder's selection + * of motion estimation methods. Values greater than 0 will increase encoder + * speed at the expense of quality. + * + * \note Valid range for VP8: -16..16 + * \note Valid range for VP9: -9..9 + * \note A negative value (-n) is treated as its absolute value (n) in VP9. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_CPUUSED = 13, + + /*!\brief Codec control function to enable automatic use of arf frames. + * + * \note Valid range for VP8: 0..1 + * \note Valid range for VP9: 0..6 + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_ENABLEAUTOALTREF, + + /*!\brief control function to set noise sensitivity + * + * 0: off, 1: OnYOnly, 2: OnYUV, + * 3: OnYUVAggressive, 4: Adaptive + * + * Supported in codecs: VP8 + */ + VP8E_SET_NOISE_SENSITIVITY, + + /*!\brief Codec control function to set higher sharpness at the expense + * of a lower PSNR. + * + * \note Valid range: 0..7 + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_SHARPNESS, + + /*!\brief Codec control function to set the threshold for MBs treated static. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_STATIC_THRESHOLD, + + /*!\brief Codec control function to set the number of token partitions. + * + * Supported in codecs: VP8 + */ + VP8E_SET_TOKEN_PARTITIONS, + + /*!\brief Codec control function to get last quantizer chosen by the encoder. + * + * Return value uses internal quantizer scale defined by the codec. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_GET_LAST_QUANTIZER, + + /*!\brief Codec control function to get last quantizer chosen by the encoder. + * + * Return value uses the 0..63 scale as used by the rc_*_quantizer config + * parameters. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_GET_LAST_QUANTIZER_64, + + /*!\brief Codec control function to set the max no of frames to create arf. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_ARNR_MAXFRAMES, + + /*!\brief Codec control function to set the filter strength for the arf. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_ARNR_STRENGTH, + + /*!\deprecated control function to set the filter type to use for the arf. */ + VP8E_SET_ARNR_TYPE, + + /*!\brief Codec control function to set visual tuning. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_TUNING, + + /*!\brief Codec control function to set constrained / constant quality level. + * + * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage must + * be set to #VPX_CQ or #VPX_Q + * \note Valid range: 0..63 + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_CQ_LEVEL, + + /*!\brief Codec control function to set Max data rate for Intra frames. + * + * This value controls additional clamping on the maximum size of a + * keyframe. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allocate no more than 4.5 frames worth of bitrate + * to a keyframe, set this to 450. + * + * Supported in codecs: VP8, VP9 + */ + VP8E_SET_MAX_INTRA_BITRATE_PCT, + + /*!\brief Codec control function to set reference and update frame flags. + * + * Supported in codecs: VP8 + */ + VP8E_SET_FRAME_FLAGS, + + /*!\brief Codec control function to set max data rate for Inter frames. + * + * This value controls additional clamping on the maximum size of an + * inter frame. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allow no more than 4.5 frames worth of bitrate + * to an inter frame, set this to 450. + * + * Supported in codecs: VP9 + */ + VP9E_SET_MAX_INTER_BITRATE_PCT, + + /*!\brief Boost percentage for Golden Frame in CBR mode. + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e., 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP9 + */ + VP9E_SET_GF_CBR_BOOST_PCT, + + /*!\brief Codec control function to set the temporal layer id. + * + * For temporal scalability: this control allows the application to set the + * layer id for each frame to be encoded. Note that this control must be set + * for every frame prior to encoding. The usage of this control function + * supersedes the internal temporal pattern counter, which is now deprecated. + * + * Supported in codecs: VP8 + */ + VP8E_SET_TEMPORAL_LAYER_ID, + + /*!\brief Codec control function to set encoder screen content mode. + * + * 0: off, 1: On, 2: On with more aggressive rate control. + * + * Supported in codecs: VP8 + */ + VP8E_SET_SCREEN_CONTENT_MODE, + + /*!\brief Codec control function to set lossless encoding mode. + * + * VP9 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. This control function provides a mean to switch encoder + * into lossless coding mode(1) or normal coding mode(0) that may be lossy. + * 0 = lossy coding mode + * 1 = lossless coding mode + * + * By default, encoder operates in normal coding mode (maybe lossy). + * + * Supported in codecs: VP9 + */ + VP9E_SET_LOSSLESS, + + /*!\brief Codec control function to set number of tile columns. + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. This control requests the encoder to use column tiles in + * encoding an input frame, with number of tile columns (in Log2 unit) as + * the parameter: + * 0 = 1 tile column + * 1 = 2 tile columns + * 2 = 4 tile columns + * ..... + * n = 2**n tile columns + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). + * + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. + * + * Supported in codecs: VP9 + */ + VP9E_SET_TILE_COLUMNS, + + /*!\brief Codec control function to set number of tile rows. + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated horizontal tile rows. Tile rows are encoded or decoded + * sequentially. Even though encoding/decoding of later tile rows depends on + * earlier ones, this allows the encoder to output data packets for tile rows + * prior to completely processing all tile rows in a frame, thereby reducing + * the latency in processing between input and output. The parameter + * for this control describes the number of tile rows, which has a valid + * range [0, 2]: + * 0 = 1 tile row + * 1 = 2 tile rows + * 2 = 4 tile rows + * + * By default, the value is 0, i.e. one single row tile for entire image. + * + * Supported in codecs: VP9 + */ + VP9E_SET_TILE_ROWS, + + /*!\brief Codec control function to enable frame parallel decoding feature. + * + * VP9 has a bitstream feature to reduce decoding dependency between frames + * by turning off backward update of probability context used in encoding + * and decoding. This allows staged parallel processing of more than one + * video frame in the decoder. This control function provides a means to + * turn this feature on or off for bitstreams produced by encoder. + * + * By default, this feature is on. + * + * Supported in codecs: VP9 + */ + VP9E_SET_FRAME_PARALLEL_DECODING, + + /*!\brief Codec control function to set adaptive quantization mode. + * + * VP9 has a segment based feature that allows encoder to adaptively change + * quantization parameter for each segment within a frame to improve the + * subjective quality. This control makes encoder operate in one of the + * several AQ_modes supported. + * + * By default, encoder operates with AQ_Mode 0(adaptive quantization off). + * + * Supported in codecs: VP9 + */ + VP9E_SET_AQ_MODE, + + /*!\brief Codec control function to enable/disable periodic Q boost. + * + * One VP9 encoder speed feature is to enable quality boost by lowering + * frame level Q periodically. This control function provides a mean to + * turn on/off this feature. + * 0 = off + * 1 = on + * + * By default, the encoder is allowed to use this feature for appropriate + * encoding modes. + * + * Supported in codecs: VP9 + */ + VP9E_SET_FRAME_PERIODIC_BOOST, + + /*!\brief Codec control function to set noise sensitivity. + * + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) + * + * Supported in codecs: VP9 + */ + VP9E_SET_NOISE_SENSITIVITY, + + /*!\brief Codec control function to turn on/off SVC in encoder. + * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not + * support SVC in its current encoding mode + * 0: off, 1: on + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC, + + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + + /*!\brief Codec control function to set parameters for SVC. + * \note Parameters contain min_q, max_q, scaling factor for each of the + * SVC layers. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_PARAMETERS, + + /*!\brief Codec control function to set svc layer for spatial and temporal. + * \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial + * layer and 0..#vpx_codec_enc_cfg::ts_number_layers for + * temporal layer. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_LAYER_ID, + + /*!\brief Codec control function to set content type. + * \note Valid parameter range: + * VP9E_CONTENT_DEFAULT = Regular video content (Default) + * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention + * + * Supported in codecs: VP9 + */ + VP9E_SET_TUNE_CONTENT, + + /*!\brief Codec control function to get svc layer ID. + * \note The layer ID returned is for the data packet from the registered + * callback function. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_LAYER_ID, + + /*!\brief Codec control function to register callback to get per layer packet. + * \note Parameter for this control function is a structure with a callback + * function and a pointer to private data used by the callback. + * + * Supported in codecs: VP9 + */ + VP9E_REGISTER_CX_CALLBACK, + + /*!\brief Codec control function to set color space info. + * \note Valid ranges: 0..7, default is "UNKNOWN". + * 0 = UNKNOWN, + * 1 = BT_601 + * 2 = BT_709 + * 3 = SMPTE_170 + * 4 = SMPTE_240 + * 5 = BT_2020 + * 6 = RESERVED + * 7 = SRGB + * + * Supported in codecs: VP9 + */ + VP9E_SET_COLOR_SPACE, + + /*!\brief Codec control function to set minimum interval between GF/ARF frames + * + * By default the value is set as 4. + * + * Supported in codecs: VP9 + */ + VP9E_SET_MIN_GF_INTERVAL = 48, + + /*!\brief Codec control function to set minimum interval between GF/ARF frames + * + * By default the value is set as 16. + * + * Supported in codecs: VP9 + */ + VP9E_SET_MAX_GF_INTERVAL, + + /*!\brief Codec control function to get an Active map back from the encoder. + * + * Supported in codecs: VP9 + */ + VP9E_GET_ACTIVEMAP, + + /*!\brief Codec control function to set color range bit. + * \note Valid ranges: 0..1, default is 0 + * 0 = Limited range (16..235 or HBD equivalent) + * 1 = Full range (0..255 or HBD equivalent) + * + * Supported in codecs: VP9 + */ + VP9E_SET_COLOR_RANGE, + + /*!\brief Codec control function to set the frame flags and buffer indices + * for spatial layers. The frame flags and buffer indices are set using the + * struct #vpx_svc_ref_frame_config defined below. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_REF_FRAME_CONFIG, + + /*!\brief Codec control function to set intended rendering image size. + * + * By default, this is identical to the image size in pixels. + * + * Supported in codecs: VP9 + */ + VP9E_SET_RENDER_SIZE, + + /*!\brief Codec control function to set target level. + * + * 255: off (default); 0: only keep level stats; 10: target for level 1.0; + * 11: target for level 1.1; ... 62: target for level 6.2 + * + * Supported in codecs: VP9 + */ + VP9E_SET_TARGET_LEVEL, + + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROW_MT, + + /*!\brief Codec control function to get bitstream level. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LEVEL, + + /*!\brief Codec control function to enable/disable special mode for altref + * adaptive quantization. You can use it with --aq-mode concurrently. + * + * Enable special adaptive quantization for altref frames based on their + * expected prediction quality for the future frames. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ALT_REF_AQ, + + /*!\brief Boost percentage for Golden Frame in CBR mode. + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e., 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ + VP8E_SET_GF_CBR_BOOST_PCT, + + /*!\brief Codec control function to enable the extreme motion vector unit test + * in VP9. Please note that this is only used in motion vector unit test. + * + * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV + * + * Supported in codecs: VP9 + */ + VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, + + /*!\brief Codec control function to constrain the inter-layer prediction + * (prediction of lower spatial resolution) in VP9 SVC. + * + * 0 : inter-layer prediction on, 1 : off, 2 : off only on non-key frames + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_INTER_LAYER_PRED, + + /*!\brief Codec control function to set mode and thresholds for frame + * dropping in SVC. Drop frame thresholds are set per-layer. Mode is set as: + * 0 : layer-dependent dropping, 1 : constrained dropping, current layer drop + * forces drop on all upper layers. Default mode is 0. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, + + /*!\brief Codec control function to enable/disable use of golden reference as + * a second temporal reference for SVC. Only used when inter-layer prediction + * is disabled on INTER frames. + * + * 0: Off, 1: Enabled (default) + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_GF_TEMPORAL_REF, + + /*!\brief Codec control function to enable spatial layer sync frame, for any + * spatial layer. Enabling it for layer k means spatial layer k will disable + * all temporal prediction, but keep the inter-layer prediction. It will + * refresh any temporal reference buffer for that layer, and reset the + * temporal layer for the superframe to 0. Setting the layer sync for base + * spatial layer forces a key frame. Default is off (0) for all spatial + * layers. Spatial layer sync flag is reset to 0 after each encoded layer, + * so when control is invoked it is only used for the current superframe. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + + /*!\brief Codec control function to enable temporal dependency model. + * + * Vp9 allows the encoder to run temporal dependency model and use it to + * improve the compression performance. To enable, set this parameter to be + * 1. The default value is set to be 1. + */ + VP9E_SET_TPL, + + /*!\brief Codec control function to enable postencode frame drop. + * + * This will allow encoder to drop frame after it's encoded. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_POSTENCODE_DROP, + + /*!\brief Codec control function to set delta q for uv. + * + * Cap it at +/-15. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DELTA_Q_UV, + + /*!\brief Codec control function to disable increase Q on overshoot in CBR. + * + * 0: On (default), 1: Disable. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, + + /*!\brief Codec control function to disable loopfilter. + * + * 0: Loopfilter on all frames, 1: Disable on non reference frames. + * 2: Disable on all frames. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_LOOPFILTER, + + /*!\brief Codec control function to enable external rate control library. + * + * args[0]: path of the rate control library + * + * args[1]: private config of the rate control library + * + * Supported in codecs: VP9 + */ + VP9E_SET_EXTERNAL_RATE_CONTROL, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will do 3 things, only for 1 pass: + * - Turn off low motion computation + * - Turn off gf update constraint on key frame frequency + * - Turn off content mode for cyclic refresh + * + * With those, the rate control is expected to work exactly the same as the + * interface provided in ratectrl_rtc.cc/h + * + * Supported in codecs: VP9 + */ + VP9E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control function to get loopfilter level in the encoder. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LOOPFILTER_LEVEL, + + /*!\brief Codec control to get last quantizers for all spatial layers. + * + * Return value uses an array of internal quantizers scale defined by the + * codec, for all spatial layers. + * The size of the array passed in should be #VPX_SS_MAX_LAYERS. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will turn off cyclic refresh for vp8. + * + * With this, the rate control is expected to work exactly the same as the + * interface provided in vp8_ratectrl_rtc.cc/h + * + * Supported in codecs: VP8 + */ + VP8E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control to set quantizer for the next frame. + * + * This will turn off cyclic refresh. Only applicable to 1-pass without + * spatial layers. + * + * Supported in codecs: VP9 + * + */ + VP9E_SET_QUANTIZER_ONE_PASS, + + /*!\brief Codec control to use external RC to control TPL. + * + * This will use external RC to control the QP and GOP structure for TPL. + * + * Supported in codecs: VP9 + */ + VP9E_ENABLE_EXTERNAL_RC_TPL, +}; + +/*!\brief vpx 1-D scaling mode + * + * This set of constants define 1-D vpx scaling modes + */ +typedef enum vpx_scaling_mode_1d { + VP8E_NORMAL = 0, + VP8E_FOURFIVE = 1, + VP8E_THREEFIVE = 2, + VP8E_ONETWO = 3 +} VPX_SCALING_MODE; + +/*!\brief Temporal layering mode enum for VP9 SVC. + * + * This set of macros define the different temporal layering modes. + * Supported codecs: VP9 (in SVC mode) + * + */ +typedef enum vp9e_temporal_layering_mode { + /*!\brief No temporal layering. + * Used when only spatial layering is used. + */ + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING = 0, + + /*!\brief Bypass mode. + * Used when application needs to control temporal layering. + * This will only work when the number of spatial layers equals 1. + */ + VP9E_TEMPORAL_LAYERING_MODE_BYPASS = 1, + + /*!\brief 0-1-0-1... temporal layering scheme with two temporal layers. + */ + VP9E_TEMPORAL_LAYERING_MODE_0101 = 2, + + /*!\brief 0-2-1-2... temporal layering scheme with three temporal layers. + */ + VP9E_TEMPORAL_LAYERING_MODE_0212 = 3 +} VP9E_TEMPORAL_LAYERING_MODE; + +/*!\brief vpx region of interest map + * + * These defines the data structures for the region of interest map + * + */ + +typedef struct vpx_roi_map { + /*! If ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9) + * region within a frame. */ + unsigned char *roi_map; + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. Valid range: [-63, 63].*/ + int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/ + /*! skip and ref frame segment is only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ + unsigned int static_threshold[4]; +} vpx_roi_map_t; + +/*!\brief vpx active region map + * + * These defines the data structures for active region map + * + */ + +typedef struct vpx_active_map { + /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */ + unsigned char *active_map; + unsigned int rows; /**< number of rows */ + unsigned int cols; /**< number of cols */ +} vpx_active_map_t; + +/*!\brief vpx image scaling mode + * + * This defines the data structure for image scaling mode + * + */ +typedef struct vpx_scaling_mode { + VPX_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ + VPX_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ +} vpx_scaling_mode_t; + +/*!\brief VP8 token partition mode + * + * This defines VP8 partitioning mode for compressed data, i.e., the number of + * sub-streams in the bitstream. Used for parallelized decoding. + * + */ + +typedef enum { + VP8_ONE_TOKENPARTITION = 0, + VP8_TWO_TOKENPARTITION = 1, + VP8_FOUR_TOKENPARTITION = 2, + VP8_EIGHT_TOKENPARTITION = 3 +} vp8e_token_partitions; + +/*!brief VP9 encoder content type */ +typedef enum { + VP9E_CONTENT_DEFAULT, + VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, + VP9E_CONTENT_INVALID +} vp9e_tune_content; + +/*!\brief VP8 model tuning parameters + * + * Changes the encoder to tune for certain types of input material. + * + */ +typedef enum { VP8_TUNE_PSNR, VP8_TUNE_SSIM } vp8e_tuning; + +/*!\brief vp9 svc layer parameters + * + * This defines the spatial and temporal layer id numbers for svc encoding. + * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and + * temporal layer id for the current frame. + * + */ +typedef struct vpx_svc_layer_id { + int spatial_layer_id; /**< First spatial layer to start encoding. */ + // TODO(jianj): Deprecated, to be removed. + int temporal_layer_id; /**< Temporal layer id number. */ + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; /**< Temp layer id. */ +} vpx_svc_layer_id_t; + +/*!\brief vp9 svc frame flag parameters. + * + * This defines the frame flags and buffer indices for each spatial layer for + * svc encoding. + * This is used with the #VP9E_SET_SVC_REF_FRAME_CONFIG control to set frame + * flags and buffer indices for each spatial layer for the current (super)frame. + * + */ +typedef struct vpx_svc_ref_frame_config { + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ + int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ + // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ +} vpx_svc_ref_frame_config_t; + +/*!\brief VP9 svc frame dropping mode. + * + * This defines the frame drop mode for SVC. + * + */ +typedef enum { + CONSTRAINED_LAYER_DROP, + /**< Upper layers are constrained to drop if current layer drops. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ + CONSTRAINED_FROM_ABOVE_DROP, + /**< Lower layers are constrained to drop if current layer drops. */ +} SVC_LAYER_DROP_MODE; + +/*!\brief vp9 svc frame dropping parameters. + * + * This defines the frame drop thresholds for each spatial layer, and + * the frame dropping mode: 0 = layer based frame dropping (default), + * 1 = constrained dropping where current layer drop forces all upper + * spatial layers to drop. + */ +typedef struct vpx_svc_frame_drop { + int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ + SVC_LAYER_DROP_MODE + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. */ +} vpx_svc_frame_drop_t; + +/*!\brief vp9 svc spatial layer sync parameters. + * + * This defines the spatial layer sync flag, defined per spatial layer. + * + */ +typedef struct vpx_svc_spatial_layer_sync { + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; /**< Sync layer flags */ + int base_layer_intra_only; /**< Flag for setting Intra-only frame on base */ +} vpx_svc_spatial_layer_sync_t; + +/*!\cond */ +/*!\brief VP8 encoder control function parameter type + * + * Defines the data types that VP8E control functions take. Note that + * additional common controls are defined in vp8.h + * + */ + +VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP8E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) +#define VPX_CTRL_VP8E_SET_ACTIVEMAP +VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) +#define VPX_CTRL_VP8E_SET_SCALEMODE +VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) +#define VPX_CTRL_VP8E_SET_CPUUSED +VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) +#define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF +VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int) +#define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY +VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS, unsigned int) +#define VPX_CTRL_VP8E_SET_SHARPNESS +VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) +#define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD +VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ +#define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 +VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) +#define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES +VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) +#define VPX_CTRL_VP8E_SET_ARNR_STRENGTH +VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) +#define VPX_CTRL_VP8E_SET_ARNR_TYPE +VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ +#define VPX_CTRL_VP8E_SET_TUNING +VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) +#define VPX_CTRL_VP8E_SET_CQ_LEVEL +VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +#define VPX_CTRL_VP8E_SET_FRAME_FLAGS +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) +#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID +VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) +#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE +VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) +#define VPX_CTRL_VP9E_SET_LOSSLESS +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +#define VPX_CTRL_VP9E_SET_TILE_COLUMNS +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) +#define VPX_CTRL_VP9E_SET_TILE_ROWS +VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) +#define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING +VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) +#define VPX_CTRL_VP9E_SET_AQ_MODE +VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) +#define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST +VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) +#define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY +VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) +#define VPX_CTRL_VP9E_SET_SVC +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) +#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID +VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ +#define VPX_CTRL_VP9E_SET_TUNE_CONTENT +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID +VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) +#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK +VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) +#define VPX_CTRL_VP9E_SET_COLOR_SPACE +VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) +#define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) +#define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL +VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) +#define VPX_CTRL_VP9E_GET_ACTIVEMAP +VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int) +#define VPX_CTRL_VP9E_SET_COLOR_RANGE +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG +VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) +#define VPX_CTRL_VP9E_SET_RENDER_SIZE +VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) +#define VPX_CTRL_VP9E_SET_TARGET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) +#define VPX_CTRL_VP9E_SET_ROW_MT +VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) +#define VPX_CTRL_VP9E_SET_ALT_REF_AQ +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT +VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) +#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + vpx_svc_spatial_layer_sync_t *) +#define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL +VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) +#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP +VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) +#define VPX_CTRL_VP9E_SET_DELTA_Q_UV +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) +#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) +#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) +#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL +VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) +#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS +VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) +#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_ENABLE_EXTERNAL_RC_TPL, int) +#define VPX_CTRL_VP9E_ENABLE_EXTERNAL_RC_TPL + +/*!\endcond */ +/*! @} - end defgroup vp8_encoder */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VP8CX_H_ diff --git a/media/libvpx/libvpx/vpx/vp8dx.h b/media/libvpx/libvpx/vpx/vp8dx.h new file mode 100644 index 0000000000..8c13649f4a --- /dev/null +++ b/media/libvpx/libvpx/vpx/vp8dx.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder + * \ingroup vp8 + * + * @{ + */ +/*!\file + * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder + * interface. + */ +#ifndef VPX_VPX_VP8DX_H_ +#define VPX_VPX_VP8DX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Include controls common to both the encoder and decoder */ +#include "./vp8.h" + +/*!\name Algorithm interface for VP8 + * + * This interface provides the capability to decode VP8 streams. + * @{ + */ + +/*!\brief A single instance of the VP8 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_dx(). + */ +extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; + +/*!\brief The interface to the VP8 decoder. + */ +extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); +/*!@} - end algorithm interface member group*/ + +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to decode VP9 streams. + * @{ + */ + +/*!\brief A single instance of the VP9 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_dx(). + */ +extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; + +/*!\brief The interface to the VP9 decoder. + */ +extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); +/*!@} - end algorithm interface member group*/ + +/*!\enum vp8_dec_control_id + * \brief VP8 decoder control functions + * + * This set of macros define the control functions available for the VP8 + * decoder interface. + * + * \sa #vpx_codec_control + */ +enum vp8_dec_control_id { + /** control function to get info on which reference frames were updated + * by the last decode + */ + VP8D_GET_LAST_REF_UPDATES = VP8_DECODER_CTRL_ID_START, + + /** check if the indicated frame is corrupted */ + VP8D_GET_FRAME_CORRUPTED, + + /** control function to get info on which reference frames were used + * by the last decode + */ + VP8D_GET_LAST_REF_USED, + + /** decryption function to decrypt encoded buffer data immediately + * before decoding. Takes a vpx_decrypt_init, which contains + * a callback function and opaque context pointer. + */ + VPXD_SET_DECRYPTOR, + VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR, + + /** control function to get the dimensions that the current frame is decoded + * at. This may be different to the intended display size for the frame as + * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */ + VP9D_GET_FRAME_SIZE, + + /** control function to get the current frame's intended display dimensions + * (as specified in the wrapper or frame header). This may be different to + * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */ + VP9D_GET_DISPLAY_SIZE, + + /** control function to get the bit depth of the stream. */ + VP9D_GET_BIT_DEPTH, + + /** control function to set the byte alignment of the planes in the reference + * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets + * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly + * follows Y plane, and V plane directly follows U plane. Default value is 0. + */ + VP9_SET_BYTE_ALIGNMENT, + + /** control function to invert the decoding order to from right to left. The + * function is used in a test to confirm the decoding independence of tile + * columns. The function may be used in application where this order + * of decoding is desired. + * + * TODO(yaowu): Rework the unit test that uses this control, and in a future + * release, this test-only control shall be removed. + */ + VP9_INVERT_TILE_DECODE_ORDER, + + /** control function to set the skip loop filter flag. Valid values are + * integers. The decoder will skip the loop filter when its value is set to + * nonzero. If the loop filter is skipped the decoder may accumulate decode + * artifacts. The default value is 0. + */ + VP9_SET_SKIP_LOOP_FILTER, + + /** control function to decode SVC stream up to the x spatial layers, + * where x is passed in through the control, and is 0 for base layer. + */ + VP9_DECODE_SVC_SPATIAL_LAYER, + + /*!\brief Codec control function to get last decoded frame quantizer. + * + * Return value uses internal quantizer scale defined by the codec. + * + * Supported in codecs: VP8, VP9 + */ + VPXD_GET_LAST_QUANTIZER, + + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9D_SET_ROW_MT, + + /*!\brief Codec control function to set loopfilter optimization. + * + * 0 : off, Loop filter is done after all tiles have been decoded + * 1 : on, Loop filter is done immediately after decode without + * waiting for all threads to sync. + * + * Supported in codecs: VP9 + */ + VP9D_SET_LOOP_FILTER_OPT, + + VP8_DECODER_CTRL_ID_MAX +}; + +/** Decrypt n bytes of data from input -> output, using the decrypt_state + * passed in VPXD_SET_DECRYPTOR. + */ +typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input, + unsigned char *output, int count); + +/*!\brief Structure to hold decryption state + * + * Defines a structure to hold the decryption state and access function. + */ +typedef struct vpx_decrypt_init { + /*! Decrypt callback. */ + vpx_decrypt_cb decrypt_cb; + + /*! Decryption state. */ + void *decrypt_state; +} vpx_decrypt_init; + +/*!\cond */ +/*!\brief VP8 decoder control function parameter type + * + * Defines the data types that VP8D control functions take. Note that + * additional common controls are defined in vp8.h + * + */ + +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) +#define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES +VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) +#define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +#define VPX_CTRL_VP8D_GET_LAST_REF_USED +VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) +#define VPX_CTRL_VPXD_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) +#define VPX_CTRL_VP8D_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +#define VPX_CTRL_VP9D_GET_FRAME_SIZE +VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +#define VPX_CTRL_VP9D_GET_DISPLAY_SIZE +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) +#define VPX_CTRL_VP9D_GET_BIT_DEPTH +VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) +#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT +VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) +#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) +#define VPX_CTRL_VP9_DECODE_SET_ROW_MT +VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT + +/*!\endcond */ +/*! @} - end defgroup vp8_decoder */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VP8DX_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_codec.h b/media/libvpx/libvpx/vpx/vpx_codec.h new file mode 100644 index 0000000000..0d61b07381 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_codec.h @@ -0,0 +1,475 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\defgroup codec Common Algorithm Interface + * This abstraction allows applications to easily support multiple video + * formats with minimal code duplication. This section describes the interface + * common to all codecs (both encoders and decoders). + * @{ + */ + +/*!\file + * \brief Describes the codec algorithm interface to applications. + * + * This file describes the interface between an application and a + * video codec algorithm. + * + * An application instantiates a specific codec instance by using + * vpx_codec_dec_init() or vpx_codec_enc_init() and a pointer to the + * algorithm's interface structure: + *
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ *       }
+ *     
+ * + * Once initialized, the instance is manged using other functions from + * the vpx_codec_* family. + */ +#ifndef VPX_VPX_VPX_CODEC_H_ +#define VPX_VPX_VPX_CODEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_image.h" +#include "./vpx_integer.h" + +/*!\brief Decorator indicating a function is deprecated */ +#ifndef VPX_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define VPX_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define VPX_DEPRECATED +#else +#define VPX_DEPRECATED +#endif +#endif /* VPX_DEPRECATED */ + +#ifndef VPX_DECLSPEC_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ +#elif defined(_MSC_VER) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) +#else +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ +#endif +#endif /* VPX_DECLSPEC_DEPRECATED */ + +/*!\brief Decorator indicating a function is potentially unused */ +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) +#else +#define VPX_UNUSED +#endif +#endif /* VPX_UNUSED */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ + +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + VPX_CODEC_OK, + + /*!\brief Unspecified error */ + VPX_CODEC_ERROR, + + /*!\brief Memory operation failed */ + VPX_CODEC_MEM_ERROR, + + /*!\brief ABI version mismatch */ + VPX_CODEC_ABI_MISMATCH, + + /*!\brief Algorithm does not have required capability */ + VPX_CODEC_INCAPABLE, + + /*!\brief The given bitstream is not supported. + * + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + VPX_CODEC_UNSUP_BITSTREAM, + + /*!\brief Encoded bitstream uses an unsupported feature + * + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. + */ + VPX_CODEC_UNSUP_FEATURE, + + /*!\brief The coded data for this stream is corrupt or incomplete + * + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. + */ + VPX_CODEC_CORRUPT_FRAME, + + /*!\brief An application-supplied parameter is not valid. + * + */ + VPX_CODEC_INVALID_PARAM, + + /*!\brief An iterator reached the end of list. + * + */ + VPX_CODEC_LIST_END + +} vpx_codec_err_t; + +/*! \brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +typedef long vpx_codec_caps_t; +#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ + +/*! Can support images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +typedef long vpx_codec_flags_t; + +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. + */ +typedef const struct vpx_codec_iface vpx_codec_iface_t; + +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct vpx_codec_priv vpx_codec_priv_t; + +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *vpx_codec_iter_t; + +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ +typedef struct vpx_codec_ctx { + const char *name; /**< Printable interface name */ + vpx_codec_iface_t *iface; /**< Interface pointers */ + vpx_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + vpx_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + vpx_codec_priv_t *priv; /**< Algorithm private storage */ +} vpx_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum vpx_bit_depth { + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ +} vpx_bit_depth_t; + +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * vpx_codec_version() (1<<16 | 2<<8 | 3) + * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" + */ + +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may + * change + * in the future. + * + */ +int vpx_codec_version(void); +#define VPX_VERSION_MAJOR(v) \ + (((v) >> 16) & 0xff) /**< extract major from packed version */ +#define VPX_VERSION_MINOR(v) \ + (((v) >> 8) & 0xff) /**< extract minor from packed version */ +#define VPX_VERSION_PATCH(v) \ + (((v) >> 0) & 0xff) /**< extract patch from packed version */ + +/*!\brief Return the version major number */ +#define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) + +/*!\brief Return the version minor number */ +#define vpx_codec_version_minor() ((vpx_codec_version() >> 8) & 0xff) + +/*!\brief Return the version patch number */ +#define vpx_codec_version_patch() ((vpx_codec_version() >> 0) & 0xff) + +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may + * contain additional text following the three digit version number, as to + * indicate + * release candidates, prerelease versions, etc. + * + */ +const char *vpx_codec_version_str(void); + +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned + * by vpx_codec_version_str() following the three digit version number. + * + */ +const char *vpx_codec_version_extra_str(void); + +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to vpx support. + * + */ +const char *vpx_codec_build_config(void); + +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ +const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); + +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *vpx_codec_err_to_string(vpx_codec_err_t err); + +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); + +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. + */ + +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #VPX_CODEC_OK + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. + */ +vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); + +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); + +/*!\brief Control algorithm + * + * This function is used to exchange algorithm specific data with the codec + * instance. This can be used to implement features specific to a particular + * algorithm. + * + * This wrapper function dispatches the request to the helper function + * associated with the given ctrl_id. It tries to call this function + * transparently, but will return #VPX_CODEC_ERROR if the request could not + * be dispatched. + * + * Note that this function should not be used directly. Call the + * #vpx_codec_control wrapper macro instead. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #VPX_CODEC_OK + * The control request was processed. + * \retval #VPX_CODEC_ERROR + * The control request was not processed. + * \retval #VPX_CODEC_INVALID_PARAM + * The data was not valid. + */ +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); +#if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS +#define vpx_codec_control(ctx, id, data) vpx_codec_control_(ctx, id, data) +#define VPX_CTRL_USE_TYPE(id, typ) +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) +#define VPX_CTRL_VOID(id, typ) + +#else +/*!\brief vpx_codec_control wrapper macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). + * + * \internal + * It works by dispatching the call to the control function through a wrapper + * function named with the id parameter. + */ +#define vpx_codec_control(ctx, id, data) \ + vpx_codec_control_##id(ctx, id, data) /**<\hideinitializer*/ + +/*!\brief vpx_codec_control type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It defines the type of the argument for a given + * control identifier. + * + * \internal + * It defines a static function with + * the correctly typed arguments as a wrapper to the type-unsafe internal + * function. + */ +#define VPX_CTRL_USE_TYPE(id, typ) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ + VPX_UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ + } /**<\hideinitializer*/ + +/*!\brief vpx_codec_control deprecated type definition macro + * + * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is + * deprecated and should not be used. Consult the documentation for your + * codec for more information. + * + * \internal + * It defines a static function with the correctly typed arguments as a + * wrapper to the type-unsafe internal function. + */ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ + } /**<\hideinitializer*/ + +/*!\brief vpx_codec_control void type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It indicates that a given control identifier takes + * no argument. + * + * \internal + * It defines a static function without a data argument as a wrapper to the + * type-unsafe internal function. + */ +#define VPX_CTRL_VOID(id) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ + VPX_UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id) { \ + return vpx_codec_control_(ctx, ctrl_id); \ + } /**<\hideinitializer*/ + +#endif + +/*!@} - end defgroup codec*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_VPX_CODEC_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_codec.mk b/media/libvpx/libvpx/vpx/vpx_codec.mk new file mode 100644 index 0000000000..25c815ef51 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_codec.mk @@ -0,0 +1,47 @@ +## +## Copyright (c) 2010 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +API_EXPORTS += exports + +API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h +API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h +API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h +API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h + +API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h +API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h +API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8.h +API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h + +API_DOC_SRCS-yes += vpx_codec.h +API_DOC_SRCS-yes += vpx_decoder.h +API_DOC_SRCS-yes += vpx_encoder.h +API_DOC_SRCS-yes += vpx_ext_ratectrl.h +API_DOC_SRCS-yes += vpx_frame_buffer.h +API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h + +API_SRCS-yes += src/vpx_decoder.c +API_SRCS-yes += vpx_decoder.h +API_SRCS-yes += src/vpx_encoder.c +API_SRCS-yes += vpx_encoder.h +API_SRCS-yes += internal/vpx_codec_internal.h +API_SRCS-yes += internal/vpx_ratectrl_rtc.h +API_SRCS-yes += src/vpx_codec.c +API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c +API_SRCS-yes += vpx_codec.h +API_SRCS-yes += vpx_codec.mk +API_SRCS-yes += vpx_frame_buffer.h +API_SRCS-yes += vpx_image.h +API_SRCS-yes += vpx_integer.h +API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/media/libvpx/libvpx/vpx/vpx_decoder.h b/media/libvpx/libvpx/vpx/vpx_decoder.h new file mode 100644 index 0000000000..0e9611e31f --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_decoder.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_VPX_DECODER_H_ +#define VPX_VPX_VPX_DECODER_H_ + +/*!\defgroup decoder Decoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this decoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all decoders. + * @{ + */ + +/*!\file + * \brief Describes the decoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video decoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_codec.h" // IWYU pragma: export +#include "./vpx_frame_buffer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_DECODER_ABI_VERSION \ + (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ +#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ +#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ +/*!\brief Can conceal errors due to packet loss */ +#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 +/*!\brief Can receive encoded frames one fragment at a time */ +#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 +/*!\brief Can support frame-based multi-threading */ +#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 +/*!brief Can support external frame buffers */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ +/*!\brief Conceal errors in decoded frames */ +#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 +/*!\brief The input frame should be passed to the decoder one fragment at a + * time */ +#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 +/*!\brief Enable frame-based multi-threading */ +#define VPX_CODEC_USE_FRAME_THREADING 0x80000 + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. Algorithms may extend this structure with data specific + * to their bitstream by setting the sz member appropriately. + */ +typedef struct vpx_codec_stream_info { + unsigned int sz; /**< Size of this structure */ + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ +} vpx_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ + +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. + */ +typedef struct vpx_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ +} vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ + +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the vpx_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_DECODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm has been initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_dec_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for vpx_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_dec_init(ctx, iface, cfg, flags) \ + vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION) + +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si); + +/*!\brief Decode data + * + * Processes a buffer of coded data. If the processing results in a new + * decoded frame becoming available, put_slice and put_frame callbacks may be + * invoked, as appropriate. Encoded data \ref MUST be passed in DTS (decode + * time stamp) order. Frames produced will always be in PTS (presentation + * time stamp) order. + * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, + * data and data_sz can contain a fragment of the encoded frame. Fragment + * \#n must contain at least partition \#n, but can also contain subsequent + * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must + * be empty. When no more data is available, this function should be called + * with NULL as data and 0 as data_sz. The memory passed to this function + * must be available until the frame has been decoded. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. If + * NULL, the put_frame callback is invoked for + * the previously decoded frame. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * \param[in] deadline Soft deadline the decoder should attempt to meet, + * in us. Set to zero for unlimited. + * NOTE: The deadline parameter is ignored. Always + * pass 0. + * + * \return Returns #VPX_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::vpx_codec_err_t + * for recoverability capabilities. + */ +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter); + +/*!\defgroup cap_put_frame Frame-Based Decoding Functions + * + * The following function is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling this + * function for codecs that don't advertise this capability will result in + * an error code being returned, usually VPX_CODEC_INCAPABLE. + * @{ + */ + +/*!\brief put frame callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of decoded image data. + */ +typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, + const vpx_image_t *img); + +/*!\brief Register for notification of frame completion. + * + * Registers a given function to be called when a decoded frame is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of posting frame completion. + */ +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv); + +/*!@} - end defgroup cap_put_frame */ + +/*!\defgroup cap_put_slice Slice-Based Decoding Functions + * + * The following function is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling this + * function for codecs that don't advertise this capability will result in + * an error code being returned, usually VPX_CODEC_INCAPABLE. + * @{ + */ + +/*!\brief put slice callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of partially decoded image data. + */ +typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, + const vpx_image_t *img, + const vpx_image_rect_t *valid, + const vpx_image_rect_t *update); + +/*!\brief Register for notification of slice completion. + * + * Registers a given function to be called when a decoded slice is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of posting slice completion. + */ +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv); + +/*!@} - end defgroup cap_put_slice*/ + +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following function is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually VPX_CODEC_INCAPABLE. + * + * \note + * Currently this only works with VP9. + * @{ + */ + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +/*!@} - end defgroup cap_external_frame_buffer */ + +/*!@} - end defgroup decoder*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_VPX_DECODER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h new file mode 100644 index 0000000000..18e3862bd7 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_encoder.h @@ -0,0 +1,1127 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_VPX_ENCODER_H_ +#define VPX_VPX_VPX_ENCODER_H_ + +/*!\defgroup encoder Encoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this encoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all encoders. + * @{ + */ + +/*!\file + * \brief Describes the encoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video encoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_codec.h" // IWYU pragma: export +#include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" + +/*! Temporal Scalability: Maximum length of the sequence defining frame + * layer membership + */ +#define VPX_TS_MAX_PERIODICITY 16 + +/*! Temporal Scalability: Maximum number of coding layers */ +#define VPX_TS_MAX_LAYERS 5 + +/*! Temporal+Spatial Scalability: Maximum number of coding layers */ +#define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. + +/*! Spatial Scalability: Maximum number of coding layers */ +#define VPX_SS_MAX_LAYERS 5 + +/*! Spatial Scalability: Default number of coding layers */ +#define VPX_SS_DEFAULT_LAYERS 1 + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ + +/*! Can output one partition at a time. Each partition is returned in its + * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for + * every partition but the last. In this mode all frames are always + * returned partition by partition. + */ +#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +/*!\brief Make the encoder output one partition at a time. */ +#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 +#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ + +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct vpx_fixed_buf { + void *buf; /**< Pointer to the data */ + size_t sz; /**< Length of the buffer, in chars */ +} vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ + +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t vpx_codec_pts_t; + +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t vpx_codec_frame_flags_t; +#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define VPX_FRAME_IS_DROPPABLE 0x2u +/*!\brief frame should be decoded but will not be shown */ +#define VPX_FRAME_IS_INVISIBLE 0x4u +/*!\brief this is a fragment of the encoded frame */ +#define VPX_FRAME_IS_FRAGMENT 0x8u + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * vpx_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t vpx_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define VPX_ERROR_RESILIENT_DEFAULT 0x1u +/*!\brief The frame partitions are independently decodable by the bool decoder, + * meaning that partitions can be decoded even though earlier partitions have + * been lost. Note that intra prediction is still done over the partition + * boundary. + * \note This is only supported by VP8.*/ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ +enum vpx_codec_cx_pkt_kind { + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct vpx_codec_cx_pkt { + enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + vpx_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. First + * partition has id 0.*/ + int partition_id; + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first one.*/ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ + /*!\brief Flag to indicate if spatial layer frame in this packet is + * encoded or dropped. VP8 will always be set to 1.*/ + uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS]; + } frame; /**< data for compressed frame packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct vpx_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ +} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ + +/*!\brief Encoder return output buffer callback + * + * This callback function, when registered, returns with packets when each + * spatial layer is encoded. + */ +typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt, + void *user_data); + +/*!\brief Callback function pointer / user data pair storage */ +typedef struct vpx_codec_enc_output_cx_cb_pair { + vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */ + void *user_priv; /**< Pointer to private data */ +} vpx_codec_priv_output_cx_pkt_cb_pair_t; + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct vpx_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} vpx_rational_t; /**< alias for struct vpx_rational */ + +/*!\brief Multi-pass Encoding Pass */ +typedef enum vpx_enc_pass { + VPX_RC_ONE_PASS, /**< Single pass mode */ + VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ +} vpx_enc_pass; + +/*!\brief Rate control mode */ +enum vpx_rc_mode { + VPX_VBR, /**< Variable Bit Rate (VBR) mode */ + VPX_CBR, /**< Constant Bit Rate (CBR) mode */ + VPX_CQ, /**< Constrained Quality (CQ) mode */ + VPX_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are VPX_KF_AUTO and VPX_KF_DISABLED. + */ +enum vpx_kf_mode { + VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ + VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ + VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; + +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to vpx_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named VPX_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. + */ +typedef long vpx_enc_frame_flags_t; +#define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ + +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct vpx_codec_enc_cfg { + /* + * generic settings (g) + */ + + /*!\brief Deprecated: Algorithm specific "usage" value + * + * This value must be zero. + */ + unsigned int g_usage; + + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ + + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; + + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * vpx_bit_depth_t enum. + */ + vpx_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct vpx_rational g_timebase; + + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + vpx_codec_er_flags_t g_error_resilient; + + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #VPX_RC_ONE_PASS. + */ + enum vpx_enc_pass g_pass; + + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; + + /* + * rate control settings (rc) + */ + + /*!\brief Temporal resampling configuration, if supported by the codec. + * + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. + */ + unsigned int rc_dropframe_thresh; + + /*!\brief Enable/disable spatial resampling, if supported by the codec. + * + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the encoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. + */ + unsigned int rc_resize_allowed; + + /*!\brief Internal coded frame width. + * + * If spatial resampling is enabled this specifies the width of the + * encoded frame. + */ + unsigned int rc_scaled_width; + + /*!\brief Internal coded frame height. + * + * If spatial resampling is enabled this specifies the height of the + * encoded frame. + */ + unsigned int rc_scaled_height; + + /*!\brief Spatial resampling up watermark. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer rises above this percentage of fullness, the + * encoder will step up to a higher resolution version of the frame. + */ + unsigned int rc_resize_up_thresh; + + /*!\brief Spatial resampling down watermark. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, the + * encoder will step down to a lower resolution version of the frame. + */ + unsigned int rc_resize_down_thresh; + + /*!\brief Rate control algorithm to use. + * + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum vpx_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. + * + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + vpx_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. + * + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. + */ + vpx_fixed_buf_t rc_firstpass_mb_stats_in; + + /*!\brief Target data rate + * + * Target bitrate to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control + * + * VP8: Expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be subtracted from the target bitrate in order to compensate + * for prior overshoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-100 VP9: 0-100. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control + * + * VP8: Expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be added to the target bitrate in order to compensate for + * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. + * + * Valid values in the range VP8:0-100 VP9: 0-100. + */ + unsigned int rc_overshoot_pct; + + /* + * decoder buffer model parameters + */ + + /*!\brief Decoder Buffer Size + * + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size + * + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size + * + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias + * + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." + */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_maxsection_pct; + + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ + unsigned int rc_2pass_vbr_corpus_complexity; + + /* + * keyframing settings (kf) + */ + + /*!\brief Keyframe placement mode + * + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum vpx_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval + * + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval + * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_max_dist; + + /* + * Spatial scalability settings (ss) + */ + + /*!\brief Number of spatial coding layers. + * + * This value specifies the number of spatial coding layers to be used. + */ + unsigned int ss_number_layers; + + /*!\brief Enable auto alt reference flags for each spatial layer. + * + * These values specify if auto alt reference frame is enabled for each + * spatial layer. + */ + int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; + + /*!\brief Target bitrate for each spatial layer. + * + * These values specify the target coding bitrate to be used for each + * spatial layer. (in kbps) + */ + unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + + /*!\brief Number of temporal coding layers. + * + * This value specifies the number of temporal layers to be used. + */ + unsigned int ts_number_layers; + + /*!\brief Target bitrate for each temporal layer. + * + * These values specify the target coding bitrate to be used for each + * temporal layer. (in kbps) + */ + unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + + /*!\brief Frame rate decimation factor for each temporal layer. + * + * These values specify the frame rate decimation factors to apply + * to each temporal layer. + */ + unsigned int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + + /*!\brief Length of the sequence defining frame temporal layer membership. + * + * This value specifies the length of the sequence that defines the + * membership of frames to temporal layers. For example, if the + * ts_periodicity = 8, then the frames are assigned to coding layers with a + * repeated sequence of length 8. + */ + unsigned int ts_periodicity; + + /*!\brief Template defining the membership of frames to temporal layers. + * + * This array defines the membership of frames to temporal coding layers. + * For a 2-layer encoding that assigns even numbered frames to one temporal + * layer (0) and odd numbered frames to a second temporal layer (1) with + * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). + */ + unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; + + /*!\brief Target bitrate for each spatial/temporal layer. + * + * These values specify the target coding bitrate to be used for each + * spatial/temporal layer. (in kbps) + * + */ + unsigned int layer_target_bitrate[VPX_MAX_LAYERS]; + + /*!\brief Temporal layering mode indicating which temporal layering scheme to + * use. + * + * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the + * temporal layering mode to use. + * + */ + int temporal_layering_mode; + + /*!\brief A flag indicating whether to use external rate control parameters. + * By default is 0. If set to 1, the following parameters will be used in the + * rate control system. + */ + int use_vizier_rc_params; + + /*!\brief Active worst quality factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t active_wq_factor; + + /*!\brief Error per macroblock adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t err_per_mb_factor; + + /*!\brief Second reference default decay limit. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t sr_default_decay_limit; + + /*!\brief Second reference difference factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t sr_diff_factor; + + /*!\brief Keyframe error per macroblock adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_err_per_mb_factor; + + /*!\brief Keyframe minimum boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_min_boost_factor; + + /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe + * in a chunk. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_max_boost_first_factor; + + /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_max_boost_subs_factor; + + /*!\brief Keyframe maximum total boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_max_total_boost_factor; + + /*!\brief Golden frame maximum total boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t gf_max_total_boost_factor; + + /*!\brief Golden frame maximum boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t gf_frame_max_boost_factor; + + /*!\brief Zero motion power factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t zm_factor; + + /*!\brief Rate-distortion multiplier for inter frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_inter_qp_fac; + + /*!\brief Rate-distortion multiplier for alt-ref frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_arf_qp_fac; + + /*!\brief Rate-distortion multiplier for key frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_key_qp_fac; +} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ + +/*!\brief vp9 svc extra configure parameters + * + * This defines max/min quantizers and scale factors for each layer + * + */ +typedef struct vpx_svc_parameters { + int max_quantizers[VPX_MAX_LAYERS]; /**< Max Q for each layer */ + int min_quantizers[VPX_MAX_LAYERS]; /**< Min Q for each layer */ + int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */ + int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */ + int speed_per_layer[VPX_MAX_LAYERS]; /**< Speed setting for each sl */ + int temporal_layering_mode; /**< Temporal layering mode */ + int loopfilter_ctrl[VPX_MAX_LAYERS]; /**< Loopfilter ctrl for each sl */ +} vpx_svc_extra_cfg_t; + +/*!\brief Initialize an encoder instance + * + * Initializes an encoder context using the given interface. Applications + * should call the vpx_codec_enc_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init(ctx, iface, cfg, flags) \ + vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) + +/*!\brief Initialize multi-encoder instance + * + * Initializes multi-encoder context using the given interface. + * Applications should call the vpx_codec_enc_init_multi convenience macro + * instead of this function directly, to ensure that the ABI version number + * parameter is properly initialized. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] num_enc Total number of encoders. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] dsf Pointer to down-sampling factors. + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The encoder algorithm has been initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ + vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ + VPX_ENCODER_ABI_VERSION) + +/*!\brief Get a default configuration + * + * Initializes a encoder configuration structure with default values. Supports + * the notion of "usages" so that an algorithm may offer different default + * settings depending on the user's intended goal. This function \ref SHOULD + * be called by all applications to initialize the configuration structure + * before specializing the configuration with application specific values. + * + * \param[in] iface Pointer to the algorithm interface to use. + * \param[out] cfg Configuration buffer to populate. + * \param[in] usage Must be set to 0. + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int usage); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header + * \retval Non-NULL + * Pointer to buffer containing global header packet + */ +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); + +/*!\brief Encode Deadline + * + * This type indicates a deadline, in microseconds, to be passed to + * vpx_codec_encode(). + */ +typedef unsigned long vpx_enc_deadline_t; +/*!\brief deadline parameter analogous to VPx REALTIME mode. */ +#define VPX_DL_REALTIME 1ul +/*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ +#define VPX_DL_GOOD_QUALITY 1000000ul +/*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ +#define VPX_DL_BEST_QUALITY 0ul +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * The encoder supports the notion of a soft real-time deadline. Given a + * non-zero value to the deadline parameter, the encoder will make a "best + * effort" guarantee to return before the given time slice expires. It is + * implicit that limiting the available time to encode will degrade the + * output quality. The encoder can be given an unlimited time to produce the + * best possible frame by specifying a deadline of '0'. This deadline + * supersedes the VPx notion of "best quality, good quality, realtime". + * Applications that wish to map these former settings to the new deadline + * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, + * and #VPX_DL_BEST_QUALITY. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called, with the img parameter set to NULL. This will + * signal the end-of-stream condition to the encoder and allow it to encode + * any held buffers. Encoding is complete when vpx_codec_encode() is called + * and vpx_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * \param[in] pts Presentation time stamp, in timebase units. + * \param[in] duration Duration to show frame, in timebase units. + * \param[in] flags Flags to use for encoding this frame. + * \param[in] deadline Time to spend encoding, in microseconds. (0=infinite) + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + vpx_enc_deadline_t deadline); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * vpx_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #VPX_CODEC_OK + * The buffer was set successfully. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, + const vpx_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); + +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #vpx_codec_cx_pkt_kind. + * + * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any vpx_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. + * + */ +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); + +/*!@} - end defgroup encoder*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_VPX_ENCODER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h new file mode 100644 index 0000000000..46d290dff4 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_ +#define VPX_VPX_VPX_EXT_RATECTRL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" +#include "./vpx_tpl.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define VPX_EXT_RATECTRL_ABI_VERSION (7) + +/*!\brief The control type of the inference API. + * In VPX_RC_QP mode, the external rate control model determines the + * quantization parameter (QP) for each frame. + * In VPX_RC_GOP mode, the external rate control model determines the + * group of picture (GOP) of the video sequence. + * In VPX_RC_RDMULT mode, the external rate control model determines the + * rate-distortion multiplier (rdmult) for the current frame. + * In VPX_RC_GOP_QP mode, the external rate control model determines + * both the QP and the GOP. + * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines + * the QP, GOP and the rdmult. + */ +typedef enum vpx_rc_type { + VPX_RC_QP = 1 << 0, + VPX_RC_GOP = 1 << 1, + VPX_RC_RDMULT = 1 << 2, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP, + VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT +} vpx_rc_type_t; + +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, + VPX_RC_CQ = 2, +} vpx_ext_rc_mode_t; + +/*!\brief Abstract rate control model handler + * + * The encoder will receive the model handler from create_model() defined in + * vpx_rc_funcs_t. + */ +typedef void *vpx_rc_model_t; + +/*!\brief A reserved value for the q index. + * If the external rate control model returns this value, + * the encoder will use the default q selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_Q -1 + +/*!\brief A reserved value for the rdmult. + * If the external rate control model returns this value, + * the encoder will use the default rdmult selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_RDMULT -1 + +/*!\brief Encode frame decision made by the external rate control model + * + * The encoder will receive the decision from the external rate control model + * through get_encodeframe_decision() defined in vpx_rc_funcs_t. + * + * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. + * + * If max_frame_size = 0, the encoding ignores max frame size limit. + * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. + * If the encoded frame size is larger than max_frame_size, the frame is + * recoded to meet the size limit, following VP9's recoding principles. + */ +typedef struct vpx_rc_encodeframe_decision { + int q_index; /**< Quantizer step index [0..255]*/ + int max_frame_size; /**< Maximal frame size allowed to encode a frame*/ +} vpx_rc_encodeframe_decision_t; + +/*!\brief Information for the frame to be encoded. + * + * The encoder will send the information to external rate control model through + * get_encodeframe_decision() defined in vpx_rc_funcs_t. + * + */ +typedef struct vpx_rc_encodeframe_info { + /*! + * 0: Key frame + * 1: Inter frame + * 2: Alternate reference frame + * 3: Overlay frame + * 4: Golden frame + */ + int frame_type; + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + /*! + * index of the current frame in this group of picture, starts from zero. + */ + int gop_index; + int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ + /*! + * The validity of the three reference frames. + * 0: Invalid + * 1: Valid + */ + int ref_frame_valid_list[3]; + /*! + * The length of the current GOP. + */ + int gop_size; + /*! + * Whether the current GOP uses an alt ref. + */ + int use_alt_ref; +} vpx_rc_encodeframe_info_t; + +/*!\brief Frame coding result + * + * The encoder will send the result to the external rate control model through + * update_encodeframe_result() defined in vpx_rc_funcs_t. + */ +typedef struct vpx_rc_encodeframe_result { + int64_t sse; /**< sum of squared error of the reconstructed frame */ + int64_t bit_count; /**< number of bits spent on coding the frame*/ + int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/ + int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/ +} vpx_rc_encodeframe_result_t; + +/*!\brief Status returned by rate control callback functions. + */ +typedef enum vpx_rc_status { + VPX_RC_OK = 0, + VPX_RC_ERROR = 1, +} vpx_rc_status_t; + +/*!\brief First pass frame stats + * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is + * omitted + */ +typedef struct vpx_rc_frame_stats { + /*! + * Frame number in display order, if stats are for a single frame. + * No real meaning for a collection of frames. + */ + double frame; + /*! + * Weight assigned to this frame (or total weight for the collection of + * frames) currently based on intra factor and brightness factor. This is used + * to distribute bits between easier and harder frames. + */ + double weight; + /*! + * Intra prediction error. + */ + double intra_error; + /*! + * Best of intra pred error and inter pred error using last frame as ref. + */ + double coded_error; + /*! + * Best of intra pred error and inter pred error using golden frame as ref. + */ + double sr_coded_error; + /*! + * Estimate the noise energy of the current frame. + */ + double frame_noise_energy; + /*! + * Percentage of blocks with inter pred error < intra pred error. + */ + double pcnt_inter; + /*! + * Percentage of blocks using (inter prediction and) non-zero motion vectors. + */ + double pcnt_motion; + /*! + * Percentage of blocks where golden frame was better than last or intra: + * inter pred error using golden frame < inter pred error using last frame and + * inter pred error using golden frame < intra pred error + */ + double pcnt_second_ref; + /*! + * Percentage of blocks where intra and inter prediction errors were very + * close. + */ + double pcnt_neutral; + /*! + * Percentage of blocks that have intra error < inter error and inter error < + * LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 + */ + double pcnt_intra_low; + /*! + * Percentage of blocks that have intra error < inter error and intra error < + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 + */ + double pcnt_intra_high; + /*! + * Percentage of blocks that have almost no intra error residual + * (i.e. are in effect completely flat and untextured in the intra + * domain). In natural videos this is uncommon, but it is much more + * common in animations, graphics and screen content, so may be used + * as a signal to detect these types of content. + */ + double intra_skip_pct; + /*! + * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH + * - bit_depth 8: SMOOTH_INTRA_THRESH = 4000 + * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4 + * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8 + */ + double intra_smooth_pct; + /*! + * Image mask rows top and bottom. + */ + double inactive_zone_rows; + /*! + * Image mask columns at left and right edges. + */ + double inactive_zone_cols; + /*! + * Mean of row motion vectors. + */ + double MVr; + /*! + * Mean of absolute value of row motion vectors. + */ + double mvr_abs; + /*! + * Mean of column motion vectors. + */ + double MVc; + /*! + * Mean of absolute value of column motion vectors. + */ + double mvc_abs; + /*! + * Variance of row motion vectors. + */ + double MVrv; + /*! + * Variance of column motion vectors. + */ + double MVcv; + /*! + * Value in range [-1,1] indicating fraction of row and column motion vectors + * that point inwards (negative MV value) or outwards (positive MV value). + * For example, value of 1 indicates, all row/column MVs are inwards. + */ + double mv_in_out_count; + /*! + * Duration of the frame / collection of frames. + */ + double duration; + /*! + * 1.0 if stats are for a single frame, or + * number of frames whose stats are accumulated. + */ + double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; +} vpx_rc_frame_stats_t; + +/*!\brief Collection of first pass frame stats + */ +typedef struct vpx_rc_firstpass_stats { + /*! + * Pointer to first pass frame stats. + * The pointed array of vpx_rc_frame_stats_t should have length equal to + * number of show frames in the video. + */ + vpx_rc_frame_stats_t *frame_stats; + /*! + * Number of show frames in the video. + */ + int num_frames; +} vpx_rc_firstpass_stats_t; + +/*!\brief Encode config sent to external rate control model + */ +typedef struct vpx_rc_config { + int frame_width; /**< frame width */ + int frame_height; /**< frame height */ + int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ + /*! + * Target bitrate in kilobytes per second + */ + int target_bitrate_kbps; + int frame_rate_num; /**< numerator of frame rate */ + int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ +} vpx_rc_config_t; + +/*!\brief Information passed to the external rate control model to + * help make GOP decisions. + */ +typedef struct vpx_rc_gop_info { + /*! + * Minimum allowed gf interval, fixed for the whole clip. + * Note that it will be modified to match vp9's level constraints + * in the encoder. + * The level constraint is defined in vp9_encoder.c: + * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]. + */ + int min_gf_interval; + /*! + * Maximum allowed gf interval, fixed for the whole clip. + */ + int max_gf_interval; + /*! + * Minimum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_min_gf_interval; + /*! + * Maximum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_max_gf_interval; + /*! + * Whether to allow the use of alt ref, determined by the encoder. + * It is fixed for the entire encode. + * See function "is_altref_enabled" in vp9_encoder.h. + */ + int allow_alt_ref; + /*! + * Is the current frame a key frame. + */ + int is_key_frame; + /*! + * Does the previous gop use alt ref or not. + */ + int last_gop_use_alt_ref; + /*! + * Current frame distance to the last keyframe, e.g., if Nth frame is a key, + * then the value of the N+1 th frame is 1. + */ + int frames_since_key; + /*! + * Current frame distance to the next keyframe, e.g. if Nth frame is a key, + * then the value of frame N - 1 is 1. + */ + int frames_to_key; + /*! + * Number of lookahead source frames. + */ + int lag_in_frames; + /*! + * Display index (temporal stamp) of this frame in the whole clip, + * starts from zero. + */ + int show_index; + /*! + * Coding index of this frame in the whole clip, starts from zero. + */ + int coding_index; + /*! + * The index of the current gop, starts from zero, resets to zero + * when a keyframe is set. + */ + int gop_global_index; +} vpx_rc_gop_info_t; + +/*!\brief The decision made by the external rate control model to set the + * group of picture. + */ +typedef struct vpx_rc_gop_decision { + int gop_coding_frames; /**< The number of frames of this GOP */ + int use_alt_ref; /**< Whether to use alt ref for this GOP */ +} vpx_rc_gop_decision_t; + +/*!\brief Create an external rate control model callback prototype + * + * This callback is invoked by the encoder to create an external rate control + * model. + * + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t + */ +typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr); + +/*!\brief Send first pass stats to the external rate control model callback + * prototype + * + * This callback is invoked by the encoder to send first pass stats to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] first_pass_stats first pass stats + */ +typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats); + +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Receive encode frame decision callback prototype + * + * This callback is invoked by the encoder to receive encode frame decision from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] encode_frame_info information of the coding frame + * \param[out] frame_decision encode decision of the coding frame + */ +typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision); + +/*!\brief Update encode frame result callback prototype + * + * This callback is invoked by the encoder to update encode frame result to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[out] encode_frame_result encode result of the coding frame + */ +typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result); + +/*!\brief Get the GOP structure from the external rate control model. + * + * This callback is invoked by the encoder to get GOP decisions from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] gop_info information collected from the encoder + * \param[out] gop_decision GOP decision from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision); + +/*!\brief Get the frame rdmult from the external rate control model. + * + * This callback is invoked by the encoder to get rdmult from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] frame_info information collected from the encoder + * \param[out] rdmult frame rate-distortion multiplier from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info, + int *rdmult); + +/*!\brief Delete the external rate control model callback prototype + * + * This callback is invoked by the encoder to delete the external rate control + * model. + * + * \param[in] rate_ctrl_model rate control model + */ +typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model); + +/*!\brief Callback function set for external rate control. + * + * The user can enable external rate control by registering + * a set of callback functions with the codec control flag + * VP9E_SET_EXTERNAL_RATE_CONTROL. + */ +typedef struct vpx_rc_funcs { + /*! + * The rate control type of this API. + */ + vpx_rc_type_t rc_type; + /*! + * Create an external rate control model. + */ + vpx_rc_create_model_cb_fn_t create_model; + /*! + * Send first pass stats to the external rate control model. + */ + vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; + /*! + * Get encodeframe decision from the external rate control model. + */ + vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; + /*! + * Update encodeframe result to the external rate control model. + */ + vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + /*! + * Get GOP decisions from the external rate control model. + */ + vpx_rc_get_gop_decision_cb_fn_t get_gop_decision; + /*! + * Get rdmult for the frame from the external rate control model. + */ + vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult; + /*! + * Delete the external rate control model. + */ + vpx_rc_delete_model_cb_fn_t delete_model; + /*! + * Private data for the external rate control model. + */ + void *priv; +} vpx_rc_funcs_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_EXT_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_frame_buffer.h b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h new file mode 100644 index 0000000000..fc8320017b --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_VPX_FRAME_BUFFER_H_ + +/*!\file + * \brief Describes the decoder external frame buffer interface. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" + +/*!\brief The maximum number of work buffers used by libvpx. + * Support maximum 4 threads to decode video in parallel. + * Each thread will use one work buffer. + * TODO(hkuang): Add support to set number of worker threads dynamically. + */ +#define VPX_MAXIMUM_WORK_BUFFERS 8 + +/*!\brief The maximum number of reference buffers that a VP9 encoder may use. + */ +#define VP9_MAXIMUM_REF_BUFFERS 8 + +/*!\brief External frame buffer + * + * This structure holds allocated frame buffers used by the decoder. + */ +typedef struct vpx_codec_frame_buffer { + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ +} vpx_codec_frame_buffer_t; + +/*!\brief get frame buffer callback prototype + * + * This callback is invoked by the decoder to retrieve data for the frame + * buffer in order for the decode call to complete. The callback must + * allocate at least min_size in bytes and assign it to fb->data. The callback + * must zero out all the data allocated. Then the callback must set fb->size + * to the allocated size. The application does not need to align the allocated + * data. The callback is triggered when the decoder needs a frame buffer to + * decode a compressed image into. This function may be called more than once + * for every call to vpx_codec_decode. The application may set fb->priv to + * some data which will be passed back in the vpx_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. + * + * \param[in] priv Callback's private data + * \param[in] min_size Size in bytes needed by the buffer + * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t + */ +typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +/*!\brief release frame buffer callback prototype + * + * This callback is invoked by the decoder when the frame buffer is not + * referenced by any other buffers. |fb| is guaranteed to not be NULL. On + * success the callback must return 0. Any failure the callback must return + * a value less than 0. + * + * \param[in] priv Callback's private data + * \param[in] fb Pointer to vpx_codec_frame_buffer_t + */ +typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, + vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_FRAME_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_image.h b/media/libvpx/libvpx/vpx/vpx_image.h new file mode 100644 index 0000000000..1adc9b9d9e --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_image.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the vpx image descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_IMAGE_H_ +#define VPX_VPX_VPX_IMAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/ + +#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ +#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */ + +/*!\brief List of supported image formats */ +typedef enum vpx_img_fmt { + VPX_IMG_FMT_NONE, + VPX_IMG_FMT_YV12 = + VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ + VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, + VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, + VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, + VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, + VPX_IMG_FMT_NV12 = VPX_IMG_FMT_PLANAR | 9, + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH +} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + +/*!\brief List of supported color spaces */ +typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ +} vpx_color_space_t; /**< alias for enum vpx_color_space */ + +/*!\brief List of supported color range */ +typedef enum vpx_color_range { + VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ + VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ +} vpx_color_range_t; /**< alias for enum vpx_color_range */ + +/**\brief Image Descriptor */ +typedef struct vpx_image { + vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ + vpx_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. */ +#define VPX_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define VPX_PLANE_Y 0 /**< Y (Luminance) plane */ +#define VPX_PLANE_U 1 /**< U (Chroma) plane */ +#define VPX_PLANE_V 2 /**< V (Chroma) plane */ +#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */ + unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */ + int stride[4]; /**< stride between rows for each plane */ + + int bps; /**< bits per sample (for packed formats) */ + + /*!\brief The following member may be set by the application to associate + * data with this image. + */ + void *user_priv; + + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + void *fb_priv; /**< Frame buffer data associated with the image. */ +} vpx_image_t; /**< alias for struct vpx_image */ + +/**\brief Representation of a rectangle on a surface */ +typedef struct vpx_image_rect { + unsigned int x; /**< leftmost column */ + unsigned int y; /**< topmost row */ + unsigned int w; /**< width */ + unsigned int h; /**< height */ +} vpx_image_rect_t; /**< alias for struct vpx_image_rect */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the descriptor is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image(stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for descriptor has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this + * parameter is NULL, the storage for the descriptor + * will be allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] stride_align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. Specifically, sets img->d_w, + * img->d_h, and elements of the img->planes[] array. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * + * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise. + */ +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void vpx_img_flip(vpx_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void vpx_img_free(vpx_image_t *img); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_IMAGE_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_integer.h b/media/libvpx/libvpx/vpx/vpx_integer.h new file mode 100644 index 0000000000..34e3796411 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_integer.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_VPX_INTEGER_H_ +#define VPX_VPX_VPX_INTEGER_H_ + +/* get ptrdiff_t, size_t, wchar_t, NULL */ +#include // IWYU pragma: export + +#if defined(_MSC_VER) +#define VPX_FORCE_INLINE __forceinline +#define VPX_INLINE __inline +#else +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) +// TODO(jbb): Allow a way to force inline off for older compilers. +#define VPX_INLINE inline +#endif + +/* Assume platforms have the C99 standard integer types. */ + +#if defined(__cplusplus) +#if !defined(__STDC_FORMAT_MACROS) +#define __STDC_FORMAT_MACROS +#endif +#if !defined(__STDC_LIMIT_MACROS) +#define __STDC_LIMIT_MACROS +#endif +#endif // __cplusplus + +#include // IWYU pragma: export +#include // IWYU pragma: export + +#endif // VPX_VPX_VPX_INTEGER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h new file mode 100644 index 0000000000..a250aada60 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_tpl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include + +#include "./vpx_integer.h" +#include "./vpx_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/add_noise.c b/media/libvpx/libvpx/vpx_dsp/add_noise.c new file mode 100644 index 0000000000..6839e97928 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/add_noise.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" +#include "vpx_ports/mem.h" + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, + int whiteclamp, int width, int height, int pitch) { + int i, j; + int bothclamp = blackclamp + whiteclamp; + for (i = 0; i < height; ++i) { + uint8_t *pos = start + i * pitch; + const int8_t *ref = (const int8_t *)(noise + (rand() & 0xff)); // NOLINT + + for (j = 0; j < width; ++j) { + int v = pos[j]; + + v = clamp(v - blackclamp, 0, 255); + v = clamp(v + bothclamp, 0, 255); + v = clamp(v - whiteclamp, 0, 255); + + pos[j] = v + ref[j]; + } + } +} + +static double gaussian(double sigma, double mu, double x) { + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +int vpx_setup_noise(double sigma, int8_t *noise, int size) { + int8_t char_dist[256]; + int next = 0, i, j; + + // set up a 256 entry lookup that matches gaussian distribution + for (i = -32; i < 32; ++i) { + const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); + if (a_i) { + for (j = 0; j < a_i; ++j) { + if (next + j >= 256) goto set_noise; + char_dist[next + j] = (int8_t)i; + } + next = next + j; + } + } + + // Rounding error - might mean we have less than 256. + for (; next < 256; ++next) { + char_dist[next] = 0; + } + +set_noise: + for (i = 0; i < size; ++i) { + noise[i] = char_dist[rand() & 0xff]; // NOLINT + } + + // Returns the highest non 0 value used in distribution. + return -char_dist[0]; +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c new file mode 100644 index 0000000000..1b17a326b4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4; +} + +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; + uint16x8_t sum; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); + + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] +int vpx_satd_neon(const tran_low_t *coeff, int length) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + + length -= 16; + coeff += 16; + } while (length != 0); + + return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); +} + +void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height) { + int i; + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; + + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + + assert(height >= 4 && height % 4 == 0); + + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + ref += 4 * ref_stride; + + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); + + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); + + ref += 4 * ref_stride; + } + + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); + + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); +} + +int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; + int i; + + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); + } + + return (int16_t)horizontal_add_uint16x8(sum); +} + +// ref, src = [0, 510] - max diff = 16-bits +// bwl = {2, 3, 4}, width = {16, 32, 64} +int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { + int width = 4 << bwl; + int32x4_t sse = vdupq_n_s32(0); + int16x8_t total = vdupq_n_s16(0); + + assert(width >= 8); + assert((width % 8) == 0); + + do { + const int16x8_t r = vld1q_s16(ref); + const int16x8_t s = vld1q_s16(src); + const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. + const int16x4_t diff_lo = vget_low_s16(diff); + const int16x4_t diff_hi = vget_high_s16(diff); + sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. + sse = vmlal_s16(sse, diff_hi, diff_hi); + total = vaddq_s16(total, diff); // dynamic range 16 bits. + + ref += 8; + src += 8; + width -= 8; + } while (width != 0); + + { + // Note: 'total''s pairwise addition could be implemented similarly to + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired + // with the summation of 'sse' performed better on a Cortex-A15. + const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' + const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); + const int32x2_t t2 = vpadd_s32(t1, t1); + const int t = vget_lane_s32(t2, 0); + const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. + const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), + vreinterpret_s32_s64(vget_high_s64(s0))); + const int s = vget_lane_s32(s1, 0); + const int shift_factor = bwl + 2; + return s - ((t * t) >> shift_factor); + } +} + +void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max) { + // Load and concatenate. + const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); + const uint8x16_t a23 = + vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); + const uint8x16_t a45 = + vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); + const uint8x16_t a67 = + vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); + + const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); + const uint8x16_t b23 = + vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); + const uint8x16_t b45 = + vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); + const uint8x16_t b67 = + vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); + + // Absolute difference. + const uint8x16_t ab01_diff = vabdq_u8(a01, b01); + const uint8x16_t ab23_diff = vabdq_u8(a23, b23); + const uint8x16_t ab45_diff = vabdq_u8(a45, b45); + const uint8x16_t ab67_diff = vabdq_u8(a67, b67); + + // Max values between the Q vectors. + const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); + const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); + + const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); + const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); + +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); + uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u8((uint8_t *)max, ab_max, 0); + vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 0000000000..5afdece0ab --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y = height; + do { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); + } else { + int i = width * height; + assert(width == 4); + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 0000000000..7efce32735 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +extern const int16_t vpx_rv[]; + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // While columns of length 16 can be processed, load them. + for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. + if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). + const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). + b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} + +// sum += x; +// sumsq += x * y; +static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy, + int16x4_t *const sum, int32x4_t *const sumsq) { + const int16x4_t zero = vdup_n_s16(0); + const int32x4_t zeroq = vdupq_n_s32(0); + + // Add in the first set because vext doesn't work with '0'. + *sum = vadd_s16(*sum, x); + *sumsq = vaddq_s32(*sumsq, xy); + + // Shift x and xy to the right and sum. vext requires an immediate. + *sum = vadd_s16(*sum, vext_s16(zero, x, 1)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 2)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 3)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3)); +} + +// Generate mask based on (sumsq * 15 - sum * sum < flimit) +static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq, + const int32x4_t f, const int32x4_t fifteen) { + const int32x4_t a = vmulq_s32(sumsq, fifteen); + const int32x4_t b = vmlsl_s16(a, sum, sum); + const uint32x4_t mask32 = vcltq_s32(b, f); + return vmovn_u32(mask32); +} + +static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high, + const int32x4_t sumsq_low, + const int32x4_t sumsq_high, const int32x4_t f) { + const int32x4_t fifteen = vdupq_n_s32(15); + const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen); + const uint16x4_t mask16_high = + calculate_mask(sum_high, sumsq_high, f, fifteen); + return vmovn_u16(vcombine_u16(mask16_low, mask16_high)); +} + +// Apply filter of (8 + sum + s[c]) >> 4. +static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + + return vqrshrun_n_s16(sum_s, 4); +} + +void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, + int flimit) { + int row, col; + const int32x4_t f = vdupq_n_s32(flimit); + + assert(cols % 8 == 0); + + for (row = 0; row < rows; ++row) { + // Sum the first 8 elements, which are extended from s[0]. + // sumsq gets primed with +16. + int sumsq = src[0] * src[0] * 9 + 16; + int sum = src[0] * 9; + + uint8x8_t left_context, s, right_context; + int16x4_t sum_low, sum_high; + int32x4_t sumsq_low, sumsq_high; + + // Sum (+square) the next 6 elements. + // Skip [0] because it's included above. + for (col = 1; col <= 6; ++col) { + sumsq += src[col] * src[col]; + sum += src[col]; + } + + // Prime the sums. Later the loop uses the _high values to prime the new + // vectors. + sumsq_high = vdupq_n_s32(sumsq); + sum_high = vdup_n_s16(sum); + + // Manually extend the left border. + left_context = vdup_n_u8(src[0]); + + for (col = 0; col < cols; col += 8) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(src + col); + + if (col + 8 == cols) { + // Last row. Extend border. + right_context = vdup_n_u8(src[col + 7]); + } else { + right_context = vld1_u8(src + col + 7); + } + + x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context)); + y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context)); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + // Catch up to the last sum'd value. + sum_low = vdup_lane_s16(sum_high, 3); + sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1); + + accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low); + + // Need to do this sequentially because we need the max value from + // sum_low. + sum_high = vdup_lane_s16(sum_low, 3); + sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1); + + accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high); + + mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f); + + output = filter_pixels(vcombine_s16(sum_low, sum_high), s); + output = vbsl_u8(mask, output, s); + + vst1_u8(src + col, output); + + left_context = s; + } + + src += pitch; + } +} + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. + if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 0000000000..fde71ff30d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +// Main body of fdct16x16. +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load_cross(input, stride, temp0); + scale_input(temp0, temp1); + vpx_fdct8x16_body(temp1, temp0); + + // Right half. + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); + vpx_fdct8x16_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3); + vpx_fdct8x16_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_s16_8x8q(&temp0[8], &temp1[0]); + + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0); + vpx_fdct8x16_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // right half. + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h new file mode 100644 index 0000000000..cd58675ca4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ + +#include + +#include "fdct_neon.h" + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + +static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} + +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +// Store 16 32x4 vectors, assuming stride == 16. +static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 0000000000..a91730ce8b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); + + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); + + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); + + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); + + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); + + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); + + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 0000000000..3b9e64c6df --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,2919 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. +static INLINE void store32x32_s32( + tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/, + const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/, + const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/, + const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) { + int i; + for (i = 0; i < 32; i++) { + vst1q_s32(a, l1[i]); + vst1q_s32(a + 4, r1[i]); + vst1q_s32(a + 8, l2[i]); + vst1q_s32(a + 12, r2[i]); + vst1q_s32(a + 16, l3[i]); + vst1q_s32(a + 20, r3[i]); + vst1q_s32(a + 24, l4[i]); + vst1q_s32(a + 28, r4[i]); + a += 32; + } +} + +static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/, + int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + left[16] = vshll_n_s16(vget_low_s16(a[16]), 2); + left[17] = vshll_n_s16(vget_low_s16(a[17]), 2); + left[18] = vshll_n_s16(vget_low_s16(a[18]), 2); + left[19] = vshll_n_s16(vget_low_s16(a[19]), 2); + left[20] = vshll_n_s16(vget_low_s16(a[20]), 2); + left[21] = vshll_n_s16(vget_low_s16(a[21]), 2); + left[22] = vshll_n_s16(vget_low_s16(a[22]), 2); + left[23] = vshll_n_s16(vget_low_s16(a[23]), 2); + left[24] = vshll_n_s16(vget_low_s16(a[24]), 2); + left[25] = vshll_n_s16(vget_low_s16(a[25]), 2); + left[26] = vshll_n_s16(vget_low_s16(a[26]), 2); + left[27] = vshll_n_s16(vget_low_s16(a[27]), 2); + left[28] = vshll_n_s16(vget_low_s16(a[28]), 2); + left[29] = vshll_n_s16(vget_low_s16(a[29]), 2); + left[30] = vshll_n_s16(vget_low_s16(a[30]), 2); + left[31] = vshll_n_s16(vget_low_s16(a[31]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); + right[16] = vshll_n_s16(vget_high_s16(a[16]), 2); + right[17] = vshll_n_s16(vget_high_s16(a[17]), 2); + right[18] = vshll_n_s16(vget_high_s16(a[18]), 2); + right[19] = vshll_n_s16(vget_high_s16(a[19]), 2); + right[20] = vshll_n_s16(vget_high_s16(a[20]), 2); + right[21] = vshll_n_s16(vget_high_s16(a[21]), 2); + right[22] = vshll_n_s16(vget_high_s16(a[22]), 2); + right[23] = vshll_n_s16(vget_high_s16(a[23]), 2); + right[24] = vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], a_right[18]); + b_right[19] = vsubq_s32(a_right[12], a_right[19]); + b_right[20] = vsubq_s32(a_right[11], a_right[20]); + b_right[21] = vsubq_s32(a_right[10], a_right[21]); + b_right[22] = vsubq_s32(a_right[9], a_right[22]); + b_right[23] = vsubq_s32(a_right[8], a_right[23]); + b_right[24] = vsubq_s32(a_right[7], a_right[24]); + b_right[25] = vsubq_s32(a_right[6], a_right[25]); + b_right[26] = vsubq_s32(a_right[5], a_right[26]); + b_right[27] = vsubq_s32(a_right[4], a_right[27]); + b_right[28] = vsubq_s32(a_right[3], a_right[28]); + b_right[29] = vsubq_s32(a_right[2], a_right[29]); + b_right[30] = vsubq_s32(a_right[1], a_right[30]); + b_right[31] = vsubq_s32(a_right[0], a_right[31]); +} + +static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = add_round_shift_s32(left[0]); + left[1] = add_round_shift_s32(left[1]); + left[2] = add_round_shift_s32(left[2]); + left[3] = add_round_shift_s32(left[3]); + left[4] = add_round_shift_s32(left[4]); + left[5] = add_round_shift_s32(left[5]); + left[6] = add_round_shift_s32(left[6]); + left[7] = add_round_shift_s32(left[7]); + left[8] = add_round_shift_s32(left[8]); + left[9] = add_round_shift_s32(left[9]); + left[10] = add_round_shift_s32(left[10]); + left[11] = add_round_shift_s32(left[11]); + left[12] = add_round_shift_s32(left[12]); + left[13] = add_round_shift_s32(left[13]); + left[14] = add_round_shift_s32(left[14]); + left[15] = add_round_shift_s32(left[15]); + left[16] = add_round_shift_s32(left[16]); + left[17] = add_round_shift_s32(left[17]); + left[18] = add_round_shift_s32(left[18]); + left[19] = add_round_shift_s32(left[19]); + left[20] = add_round_shift_s32(left[20]); + left[21] = add_round_shift_s32(left[21]); + left[22] = add_round_shift_s32(left[22]); + left[23] = add_round_shift_s32(left[23]); + left[24] = add_round_shift_s32(left[24]); + left[25] = add_round_shift_s32(left[25]); + left[26] = add_round_shift_s32(left[26]); + left[27] = add_round_shift_s32(left[27]); + left[28] = add_round_shift_s32(left[28]); + left[29] = add_round_shift_s32(left[29]); + left[30] = add_round_shift_s32(left[30]); + left[31] = add_round_shift_s32(left[31]); + + right[0] = add_round_shift_s32(right[0]); + right[1] = add_round_shift_s32(right[1]); + right[2] = add_round_shift_s32(right[2]); + right[3] = add_round_shift_s32(right[3]); + right[4] = add_round_shift_s32(right[4]); + right[5] = add_round_shift_s32(right[5]); + right[6] = add_round_shift_s32(right[6]); + right[7] = add_round_shift_s32(right[7]); + right[8] = add_round_shift_s32(right[8]); + right[9] = add_round_shift_s32(right[9]); + right[10] = add_round_shift_s32(right[10]); + right[11] = add_round_shift_s32(right[11]); + right[12] = add_round_shift_s32(right[12]); + right[13] = add_round_shift_s32(right[13]); + right[14] = add_round_shift_s32(right[14]); + right[15] = add_round_shift_s32(right[15]); + right[16] = add_round_shift_s32(right[16]); + right[17] = add_round_shift_s32(right[17]); + right[18] = add_round_shift_s32(right[18]); + right[19] = add_round_shift_s32(right[19]); + right[20] = add_round_shift_s32(right[20]); + right[21] = add_round_shift_s32(right[21]); + right[22] = add_round_shift_s32(right[22]); + right[23] = add_round_shift_s32(right[23]); + right[24] = add_round_shift_s32(right[24]); + right[25] = add_round_shift_s32(right[25]); + right[26] = add_round_shift_s32(right[26]); + right[27] = add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c new file mode 100644 index 0000000000..4bc968ecba --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" + +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t in[4]; + in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); + in[0] = vadd_s16(in[0], one); + } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(in[0], in[1]); + int16x8_t out_23 = vcombine_s16(in[2], in[3]); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); + } + + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 0000000000..de3db9774c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c new file mode 100644 index 0000000000..75ee6f2230 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" + +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // stage 1 + int16x8_t in[8]; + in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); + { + // from vpx_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); + const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); + const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); + const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); + const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); + const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); + const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); + const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); + in[0] = vhsubq_s16(in[0], sign_in0); + in[1] = vhsubq_s16(in[1], sign_in1); + in[2] = vhsubq_s16(in[2], sign_in2); + in[3] = vhsubq_s16(in[3], sign_in3); + in[4] = vhsubq_s16(in[4], sign_in4); + in[5] = vhsubq_s16(in[5], sign_in5); + in[6] = vhsubq_s16(in[6], sign_in6); + in[7] = vhsubq_s16(in[7], sign_in7); + // store results + store_s16q_to_tran_low(final_output + 0 * 8, in[0]); + store_s16q_to_tran_low(final_output + 1 * 8, in[1]); + store_s16q_to_tran_low(final_output + 2 * 8, in[2]); + store_s16q_to_tran_low(final_output + 3 * 8, in[3]); + store_s16q_to_tran_low(final_output + 4 * 8, in[4]); + store_s16q_to_tran_low(final_output + 5 * 8, in[5]); + store_s16q_to_tran_low(final_output + 6 * 8, in[6]); + store_s16q_to_tran_low(final_output + 7 * 8, in[7]); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); + { + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 0000000000..cc65157430 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + transpose_s32_8x8_2(left, right, left, right); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + transpose_s32_8x8_2(left, right, left, right); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h new file mode 100644 index 0000000000..16f5c5fc0e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT_NEON_H_ + +#include + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on full vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a, + const int16x8_t b, + const tran_coef_t constant, + int16x8_t *add, + int16x8_t *sub) { + int16x8_t c = vdupq_n_s16(2 * constant); + *add = vqrdmulhq_s16(vaddq_s16(a, b), c); + *sub = vqrdmulhq_s16(vsubq_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + int32x4_t c = vdupq_n_s32(constant << 17); + const int16x4_t a_lo = vget_low_s16(a); + const int16x4_t a_hi = vget_high_s16(a); + const int16x4_t b_lo = vget_low_s16(b); + const int16x4_t b_hi = vget_high_s16(b); + *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add_lo, add_hi, sub_lo, sub_hi; + butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo, + &sub_hi); + *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi)); + *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int32x4_t *add, int32x4_t *sub) { + int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddl_s16(a, b), c); + *sub = vqrdmulhq_s32(vsubl_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on half vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int16x4_t *add, int16x4_t *sub) { + int32x4_t add32, sub32; + butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32); + *add = vmovn_s32(add32); + *sub = vmovn_s32(sub32); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi; + butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo, + &sub32_hi); + *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi)); + *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant); + *add_lo = vmlaq_n_s32(a1, b_lo, constant); + *add_hi = vmlaq_n_s32(a2, b_hi, constant); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a, + const int32x4_t b, + const tran_coef_t constant, + int32x4_t *add, + int32x4_t *sub) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddq_s32(a, b), c); + *sub = vqrdmulhq_s32(vsubq_s32(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow_half( + const int32x4_t a, const int32x4_t b, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s16_s32_noround( + const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo, + const int16x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmull_n_s16(a_lo, constant1); + const int32x4_t a2 = vmull_n_s16(a_hi, constant1); + const int32x4_t a3 = vmull_n_s16(a_lo, constant2); + const int32x4_t a4 = vmull_n_s16(a_hi, constant2); + *add_lo = vmlal_n_s16(a1, b_lo, constant2); + *add_hi = vmlal_n_s16(a2, b_hi, constant2); + *sub_lo = vmlsl_n_s16(a3, b_lo, constant1); + *sub_hi = vmlsl_n_s16(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + *add_lo = vmlaq_n_s32(a1, b_lo, constant2); + *add_hi = vmlaq_n_s32(a2, b_hi, constant2); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant1); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x4_t *add, int16x4_t *sub) { + const int32x4_t a1 = vmull_n_s16(a, constant1); + const int32x4_t a2 = vmull_n_s16(a, constant2); + const int32x4_t sum = vmlal_n_s16(a1, b, constant2); + const int32x4_t diff = vmlsl_n_s16(a2, b, constant1); + *add = vqrshrn_n_s32(sum, DCT_CONST_BITS); + *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2); + const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2); + const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2); + const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2); + const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2); + const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2); + const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift and round, +// return narrowed results +static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +// Add 1 if negative, and shift by 1. +// In practice, add the sign bit, then shift and round +static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c new file mode 100644 index 0000000000..df0da543ce --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x4_t a0, a1, a2, a3; + int16x8_t b0, b1; + int16x8_t c; + + a0 = vld1_s16(input); + input += stride; + a1 = vld1_s16(input); + input += stride; + a2 = vld1_s16(input); + input += stride; + a3 = vld1_s16(input); + + b0 = vcombine_s16(a0, a1); + b1 = vcombine_s16(a2, a3); + + c = vaddq_s16(b0, b1); + + output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1); + output[1] = 0; +} + +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + + output[0] = (tran_low_t)horizontal_add_int16x8(sum); + output[1] = 0; +} +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif + +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t left = vld1q_s16(input); + int16x8_t right = vld1q_s16(input + 8); + int32_t sum; + input += stride; + + for (r = 1; r < 16; ++r) { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + left = vaddq_s16(left, a); + right = vaddq_s16(right, b); + } + + sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t a0 = vld1q_s16(input); + int16x8_t a1 = vld1q_s16(input + 8); + int16x8_t a2 = vld1q_s16(input + 16); + int16x8_t a3 = vld1q_s16(input + 24); + int32_t sum; + input += stride; + + for (r = 1; r < 32; ++r) { + const int16x8_t b0 = vld1q_s16(input); + const int16x8_t b1 = vld1q_s16(input + 8); + const int16x8_t b2 = vld1q_s16(input + 16); + const int16x8_t b3 = vld1q_s16(input + 24); + input += stride; + a0 = vaddq_s16(a0, b0); + a1 = vaddq_s16(a1, b1); + a2 = vaddq_s16(a2, b2); + a3 = vaddq_s16(a3, b3); + } + + sum = horizontal_add_int16x8(a0); + sum += horizontal_add_int16x8(a1); + sum += horizontal_add_int16x8(a2); + sum += horizontal_add_int16x8(a3); + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c new file mode 100644 index 0000000000..f5a044be4d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); +} + +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); + + coeff += 8; + } +} + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (i = 0; i < 256; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); + + const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1); + const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1); + const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1); + const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 256, c1); + store_s16q_to_tran_low(coeff + 512, c2); + store_s16q_to_tran_low(coeff + 768, c3); + + coeff += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c new file mode 100644 index 0000000000..4265596c8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + +// coeff: 32 bits, dynamic range [-2147483648, 2147483647]. +// length: value range {16, 64, 256, 1024}. +// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} + +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 0000000000..3063acbb3e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 0000000000..7be88f6bcb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. + + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c new file mode 100644 index 0000000000..654ab42ca4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -0,0 +1,1361 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); +} + +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; +} + +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); +} + +static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_q_kernel( + const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_d_kernel( + const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = + vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = + vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]); + t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); + t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); + t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); + t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[6]; + + t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[3]; + + t[2].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct16x16_add_stage7_dual( + const int32x4x2_t *const step2, int32x4x2_t *const out) { + out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]); + out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]); + out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]); + out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]); + out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]); + out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]); + out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]); + out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]); + out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]); + out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]); + out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]); + out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]); + out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]); + out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]); + out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]); + out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]); + out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]); + out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]); + out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]); + out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]); + out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]); + out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]); + out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]); + out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]); + out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]); + out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]); + out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]); + out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]); + out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]); + out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]); + out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]); + out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, + int32x4_t *const out) { + out[0] = vaddq_s32(step2[0], step2[15]); + out[1] = vaddq_s32(step2[1], step2[14]); + out[2] = vaddq_s32(step2[2], step2[13]); + out[3] = vaddq_s32(step2[3], step2[12]); + out[4] = vaddq_s32(step2[4], step2[11]); + out[5] = vaddq_s32(step2[5], step2[10]); + out[6] = vaddq_s32(step2[6], step2[9]); + out[7] = vaddq_s32(step2[7], step2[8]); + out[8] = vsubq_s32(step2[7], step2[8]); + out[9] = vsubq_s32(step2[6], step2[9]); + out[10] = vsubq_s32(step2[5], step2[10]); + out[11] = vsubq_s32(step2[4], step2[11]); + out[12] = vsubq_s32(step2[3], step2[12]); + out[13] = vsubq_s32(step2[2], step2[13]); + out[14] = vsubq_s32(step2[1], step2[14]); + out[15] = vsubq_s32(step2[0], step2[15]); +} + +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], + &step2[15]); + highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], + &step1[7]); + highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], + &step1[6]); + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]); + step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]); + step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]); + step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]); + step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]); + step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]); + step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]); + step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]); + step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]); + step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]); + step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]); + + // stage 4 + highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], + &step2[0]); + highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], + &step2[3]); + step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]); + step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]); + step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]); + step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]); + step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]); + step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]); + step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]); + step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]); + step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]); + step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]); + step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]); + step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]); + step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]); + step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]); + step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]); + step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]); + step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]); + step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]); + step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]); + step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]); + step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]); + step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]); + + // stage 6 + step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]); + step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]); + step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]); + step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]); + step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]); + step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]); + step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]); + step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]); + step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]); + step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]); + step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]); + step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]); + step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); + return dct_const_round_shift_high_4x2_int64x2x2(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); + return dct_const_round_shift_high_4(t); +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); + return dct_const_round_shift_high_4x2_int64x2x2(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); + return dct_const_round_shift_high_4(t); +} + +static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 16; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 16; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 16; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 16; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 16; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 16; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 16; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[9] = highbd_idct_cospi_lane1_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[10] = + highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[13] = + highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[14] = highbd_idct_cospi_lane0_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[5] = + highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28)); + step1[6] = + highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = highbd_idct_add_dual(step2[8], step2[9]); + step1[9] = highbd_idct_sub_dual(step2[8], step2[9]); + step1[10] = highbd_idct_sub_dual(step2[11], step2[10]); + step1[11] = highbd_idct_add_dual(step2[11], step2[10]); + step1[12] = highbd_idct_add_dual(step2[12], step2[13]); + step1[13] = highbd_idct_sub_dual(step2[12], step2[13]); + step1[14] = highbd_idct_sub_dual(step2[15], step2[14]); + step1[15] = highbd_idct_add_dual(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[2] = + highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24)); + step2[3] = + highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24)); + step2[4] = highbd_idct_add_dual(step1[4], step1[5]); + step2[5] = highbd_idct_sub_dual(step1[4], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[7], step1[6]); + step2[7] = highbd_idct_add_dual(step1[7], step1[6]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = highbd_idct_add_dual(step2[0], step2[3]); + step1[1] = highbd_idct_add_dual(step2[1], step2[2]); + step1[2] = highbd_idct_sub_dual(step2[1], step2[2]); + step1[3] = highbd_idct_sub_dual(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = vld1q_s32(input); + input += 16; + in[1] = vld1q_s32(input); + input += 16; + in[2] = vld1q_s32(input); + input += 16; + in[3] = vld1q_s32(input); + + // Transpose + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s32(step2[8], step2[11]); + step1[9] = vaddq_s32(step2[9], step2[10]); + step1[10] = vsubq_s32(step2[9], step2[10]); + step1[11] = vsubq_s32(step2[8], step2[11]); + step1[12] = vsubq_s32(step2[15], step2[12]); + step1[13] = vsubq_s32(step2[14], step2[13]); + step1[14] = vaddq_s32(step2[14], step2[13]); + step1[15] = vaddq_s32(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s32(step1[0], step1[7]); + step2[1] = vaddq_s32(step1[1], step1[6]); + step2[2] = vaddq_s32(step1[2], step1[5]); + step2[3] = vaddq_s32(step1[3], step1[4]); + step2[4] = vsubq_s32(step1[3], step1[4]); + step2[5] = vsubq_s32(step1[2], step1[5]); + step2[6] = vsubq_s32(step1[1], step1[6]); + step2[7] = vsubq_s32(step1[0], step1[7]); + highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7(step2, out); + + // pass 1: save the result into output + vst1q_s32(output, out[0]); + output += 4; + vst1q_s32(output, out[1]); + output += 4; + vst1q_s32(output, out[2]); + output += 4; + vst1q_s32(output, out[3]); + output += 4; + vst1q_s32(output, out[4]); + output += 4; + vst1q_s32(output, out[5]); + output += 4; + vst1q_s32(output, out[6]); + output += 4; + vst1q_s32(output, out[7]); + output += 4; + vst1q_s32(output, out[8]); + output += 4; + vst1q_s32(output, out[9]); + output += 4; + vst1q_s32(output, out[10]); + output += 4; + vst1q_s32(output, out[11]); + output += 4; + vst1q_s32(output, out[12]); + output += 4; + vst1q_s32(output, out[13]); + output += 4; + vst1q_s32(output, out[14]); + output += 4; + vst1q_s32(output, out[15]); +} + +static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + in[0].val[0] = vld1q_s32(input); + input += 4; + in[0].val[1] = vld1q_s32(input); + input += 4; + in[1].val[0] = vld1q_s32(input); + input += 4; + in[1].val[1] = vld1q_s32(input); + input += 4; + in[2].val[0] = vld1q_s32(input); + input += 4; + in[2].val[1] = vld1q_s32(input); + input += 4; + in[3].val[0] = vld1q_s32(input); + input += 4; + in[3].val[1] = vld1q_s32(input); + + // Transpose + transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1], + &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, + bd); + + // Parallel idct on the lower 8 rows + vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, + dest, stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, + dest + 8, stride, bd); + } +} + +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, + bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); + } +} + +static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + *dest += stride; +} + +static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + vst1q_u16(*dest + 0, c0); + vst1q_u16(*dest + 8, c1); + *dest += stride; +} + +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c new file mode 100644 index 0000000000..5b36f73367 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -0,0 +1,640 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_from_transformed(const int32_t *const trans_buf, + const int first, const int second, + int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(trans_buf + first * 8); + q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4); + q1->val[0] = vld1q_s32(trans_buf + second * 8); + q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4); +} + +static INLINE void load_from_output(const int32_t *const out, const int first, + const int second, int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(out + first * 32); + q0->val[1] = vld1q_s32(out + first * 32 + 4); + q1->val[0] = vld1q_s32(out + second * 32); + q1->val[1] = vld1q_s32(out + second * 32 + 4); +} + +static INLINE void store_in_output(int32_t *const out, const int first, + const int second, const int32x4x2_t q0, + const int32x4x2_t q1) { + vst1q_s32(out + first * 32, q0.val[0]); + vst1q_s32(out + first * 32 + 4, q0.val[1]); + vst1q_s32(out + second * 32, q1.val[0]); + vst1q_s32(out + second * 32 + 4, q1.val[1]); +} + +static INLINE void highbd_store_combine_results( + uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0, + const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3, + const int16x8_t max) { + int16x8_t o[4]; + uint16x8_t d[4]; + + d[0] = vld1q_u16(p1); + p1 += stride; + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); + p2 -= stride; + d[2] = vld1q_u16(p2); + + o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6)); + + o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0])); + o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1])); + o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2])); + o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3])); + o[0] = vminq_s16(o[0], max); + o[1] = vminq_s16(o[1], max); + o[2] = vminq_s16(o[2], max); + o[3] = vminq_s16(o[3], max); + d[0] = vqshluq_n_s16(o[0], 0); + d[1] = vqshluq_n_s16(o[1], 0); + d[2] = vqshluq_n_s16(o[2], 0); + d[3] = vqshluq_n_s16(o[3], 0); + + vst1q_u16(p1, d[1]); + p1 -= stride; + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); + p2 += stride; + vst1q_u16(p2, d[3]); +} + +static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, + const int32_t first_const, + const int32_t second_const, + int32x4x2_t *const qOut0, + int32x4x2_t *const qOut1) { + int64x2x2_t q[4]; + int32x2_t d[6]; + + // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9. + d[4] = vdup_n_s32(first_const); + d[5] = vdup_n_s32(second_const); + + q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]); + q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]); + q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]); + q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]); + q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]); + q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]); + q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]); + q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]); + + q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]); + q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]); + q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]); + q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]); + q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]); + q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]); + q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]); + q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]); + + qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS)); + qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS)); + qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS)); + qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); +} + +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); + in += 32; + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); + in += 32; + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); + in += 32; + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); + in += 32; + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); + in += 32; + s[5].val[0] = vld1q_s32(in); + s[5].val[1] = vld1q_s32(in + 4); + in += 32; + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); + in += 32; + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); +} + +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, + int32_t **out) { + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + + vst1q_s32(*out, a[0].val[0]); + *out += 4; + vst1q_s32(*out, a[0].val[1]); + *out += 4; + vst1q_s32(*out, a[1].val[0]); + *out += 4; + vst1q_s32(*out, a[1].val[1]); + *out += 4; + vst1q_s32(*out, a[2].val[0]); + *out += 4; + vst1q_s32(*out, a[2].val[1]); + *out += 4; + vst1q_s32(*out, a[3].val[0]); + *out += 4; + vst1q_s32(*out, a[3].val[1]); + *out += 4; + vst1q_s32(*out, a[4].val[0]); + *out += 4; + vst1q_s32(*out, a[4].val[1]); + *out += 4; + vst1q_s32(*out, a[5].val[0]); + *out += 4; + vst1q_s32(*out, a[5].val[1]); + *out += 4; + vst1q_s32(*out, a[6].val[0]); + *out += 4; + vst1q_s32(*out, a[6].val[1]); + *out += 4; + vst1q_s32(*out, a[7].val[0]); + *out += 4; + vst1q_s32(*out, a[7].val[1]); + *out += 4; +} + +static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { + int i; + int32x4x2_t s[8]; + + for (i = 0; i < 4; i++, input += 8) { + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); + } +} + +static INLINE void idct32_bands_end_1st_pass(int32_t *const out, + int32x4x2_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); +} + +static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, + uint16_t *const dest, + const int stride, + const int16x8_t max, + int32x4x2_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); +} + +static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, + uint16_t *dst, const int stride, + const int bd) { + int i, idct32_pass_loop; + int32_t trans_buf[32 * 8]; + int32_t pass1[32 * 32]; + int32_t pass2[32 * 32]; + int32_t *out; + int32x4x2_t q[16]; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, input = pass1, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + input += 32 * 8; + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); + // part of stage 2 + q[4] = highbd_idct_add_dual(q[0], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[6] = highbd_idct_add_dual(q[2], q[3]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); + + // generate 18,19,28,29 + // part of stage 1 + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[3], q[2]); + q[3] = highbd_idct_add_dual(q[3], q[2]); + q[14] = highbd_idct_sub_dual(q[1], q[0]); + q[2] = highbd_idct_add_dual(q[1], q[0]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); + // part of stage 4 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[15] = highbd_idct_add_dual(q[6], q[3]); + q[13] = highbd_idct_sub_dual(q[5], q[0]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[4], q[2]); + q[14] = highbd_idct_sub_dual(q[6], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + + // generate 22,23,24,25 + // part of stage 1 + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); + // part of stage 2 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); + // part of stage 4 + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[11] = highbd_idct_add_dual(q[5], q[0]); + q[12] = highbd_idct_add_dual(q[6], q[2]); + q[15] = highbd_idct_add_dual(q[4], q[3]); + // part of stage 6 + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[11]); + q[9] = highbd_idct_add_dual(q[13], q[10]); + q[13] = highbd_idct_sub_dual(q[13], q[10]); + q[11] = highbd_idct_sub_dual(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = highbd_idct_sub_dual(q[9], q[12]); + q[10] = highbd_idct_add_dual(q[14], q[15]); + q[14] = highbd_idct_sub_dual(q[14], q[15]); + q[12] = highbd_idct_add_dual(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); + // part of stage 4 + q[14] = highbd_idct_sub_dual(q[5], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + q[13] = highbd_idct_sub_dual(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); + // part of stage 6 + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[1]); + q[9] = highbd_idct_add_dual(q[13], q[6]); + q[13] = highbd_idct_sub_dual(q[13], q[6]); + q[1] = highbd_idct_sub_dual(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = highbd_idct_sub_dual(q[8], q[5]); + q[10] = highbd_idct_add_dual(q[8], q[5]); + q[11] = highbd_idct_add_dual(q[9], q[0]); + q[0] = highbd_idct_sub_dual(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); + // part of stage 3 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 4 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); + + // generate 10,11,12,13 + // part of stage 2 + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); + // part of stage 3 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 4 + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); + // part of stage 5 + q[8] = highbd_idct_add_dual(q[0], q[5]); + q[9] = highbd_idct_add_dual(q[1], q[7]); + q[13] = highbd_idct_sub_dual(q[1], q[7]); + q[14] = highbd_idct_sub_dual(q[3], q[4]); + q[10] = highbd_idct_add_dual(q[3], q[4]); + q[15] = highbd_idct_add_dual(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); + // part of stage 6 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[5]); + q[14] = highbd_idct_sub_dual(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + + // generate 0,1,2,3 + // part of stage 4 + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); + // part of stage 5 + q[4] = highbd_idct_add_dual(q[7], q[6]); + q[7] = highbd_idct_sub_dual(q[7], q[6]); + q[6] = highbd_idct_sub_dual(q[5], q[14]); + q[5] = highbd_idct_add_dual(q[5], q[14]); + // part of stage 6 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[3]); + q[10] = highbd_idct_add_dual(q[6], q[1]); + q[11] = highbd_idct_add_dual(q[7], q[0]); + q[12] = highbd_idct_sub_dual(q[7], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[1]); + q[14] = highbd_idct_sub_dual(q[5], q[3]); + q[15] = highbd_idct_sub_dual(q[4], q[2]); + // part of stage 7 + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[8], q[1]); + q[3] = highbd_idct_add_dual(q[9], q[0]); + q[4] = highbd_idct_sub_dual(q[9], q[0]); + q[5] = highbd_idct_sub_dual(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q); + } else { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + idct32_bands_end_2nd_pass(out, dst, stride, max, q); + dst += 8; + } + } + } +} + +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); + } else { + vpx_highbd_idct32_32_neon(input, dest, stride, bd); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c new file mode 100644 index 0000000000..6750c1a426 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s32_dual( + const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, + int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, + int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { + in0->val[0] = vld1q_s32(input); + in0->val[1] = vld1q_s32(input + 4); + input += 32; + in1->val[0] = vld1q_s32(input); + in1->val[1] = vld1q_s32(input + 4); + input += 32; + in2->val[0] = vld1q_s32(input); + in2->val[1] = vld1q_s32(input + 4); + input += 32; + in3->val[0] = vld1q_s32(input); + in3->val[1] = vld1q_s32(input + 4); + input += 32; + in4->val[0] = vld1q_s32(input); + in4->val[1] = vld1q_s32(input + 4); + input += 32; + in5->val[0] = vld1q_s32(input); + in5->val[1] = vld1q_s32(input + 4); + input += 32; + in6->val[0] = vld1q_s32(input); + in6->val[1] = vld1q_s32(input + 4); + input += 32; + in7->val[0] = vld1q_s32(input); + in7->val[1] = vld1q_s32(input + 4); +} + +static INLINE void load_4x8_s32_dual(const tran_low_t *input, + int32x4_t *const in0, int32x4_t *const in1, + int32x4_t *const in2, int32x4_t *const in3, + int32x4_t *const in4, int32x4_t *const in5, + int32x4_t *const in6, + int32x4_t *const in7) { + *in0 = vld1q_s32(input); + input += 32; + *in1 = vld1q_s32(input); + input += 32; + *in2 = vld1q_s32(input); + input += 32; + *in3 = vld1q_s32(input); + input += 32; + *in4 = vld1q_s32(input); + input += 32; + *in5 = vld1q_s32(input); + input += 32; + *in6 = vld1q_s32(input); + input += 32; + *in7 = vld1q_s32(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, + int32_t *output) { + int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + s8[32]; + + load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], + &in[6], &in[7]); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], + &in[9].val[1], &in[10].val[0], &in[10].val[1], + &in[11].val[0], &in[11].val[1]); + transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], + &in[10].val[0], &in[10].val[1], &in[11].val[0], + &in[11].val[1]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s1[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s1[23]); + s4[24] = highbd_idct_add_dual(s1[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s1[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s2[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s3[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s3[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + s8[0] = highbd_idct_add_dual(s7[0], s6[31]); + s8[1] = highbd_idct_add_dual(s7[1], s6[30]); + s8[2] = highbd_idct_add_dual(s7[2], s6[29]); + s8[3] = highbd_idct_add_dual(s7[3], s6[28]); + s8[4] = highbd_idct_add_dual(s7[4], s7[27]); + s8[5] = highbd_idct_add_dual(s7[5], s7[26]); + s8[6] = highbd_idct_add_dual(s7[6], s7[25]); + s8[7] = highbd_idct_add_dual(s7[7], s7[24]); + s8[8] = highbd_idct_add_dual(s7[8], s7[23]); + s8[9] = highbd_idct_add_dual(s7[9], s7[22]); + s8[10] = highbd_idct_add_dual(s7[10], s7[21]); + s8[11] = highbd_idct_add_dual(s7[11], s7[20]); + s8[12] = highbd_idct_add_dual(s7[12], s6[19]); + s8[13] = highbd_idct_add_dual(s7[13], s6[18]); + s8[14] = highbd_idct_add_dual(s7[14], s6[17]); + s8[15] = highbd_idct_add_dual(s7[15], s6[16]); + s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); + s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); + s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); + s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); + s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); + s8[21] = highbd_idct_sub_dual(s7[10], s7[21]); + s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); + s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); + s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); + s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); + s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); + s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); + s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); + s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); + s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); + s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + vst1q_s32(output + 0, s8[0].val[0]); + vst1q_s32(output + 4, s8[0].val[1]); + output += 16; + vst1q_s32(output + 0, s8[1].val[0]); + vst1q_s32(output + 4, s8[1].val[1]); + output += 16; + vst1q_s32(output + 0, s8[2].val[0]); + vst1q_s32(output + 4, s8[2].val[1]); + output += 16; + vst1q_s32(output + 0, s8[3].val[0]); + vst1q_s32(output + 4, s8[3].val[1]); + output += 16; + vst1q_s32(output + 0, s8[4].val[0]); + vst1q_s32(output + 4, s8[4].val[1]); + output += 16; + vst1q_s32(output + 0, s8[5].val[0]); + vst1q_s32(output + 4, s8[5].val[1]); + output += 16; + vst1q_s32(output + 0, s8[6].val[0]); + vst1q_s32(output + 4, s8[6].val[1]); + output += 16; + vst1q_s32(output + 0, s8[7].val[0]); + vst1q_s32(output + 4, s8[7].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[8].val[0]); + vst1q_s32(output + 4, s8[8].val[1]); + output += 16; + vst1q_s32(output + 0, s8[9].val[0]); + vst1q_s32(output + 4, s8[9].val[1]); + output += 16; + vst1q_s32(output + 0, s8[10].val[0]); + vst1q_s32(output + 4, s8[10].val[1]); + output += 16; + vst1q_s32(output + 0, s8[11].val[0]); + vst1q_s32(output + 4, s8[11].val[1]); + output += 16; + vst1q_s32(output + 0, s8[12].val[0]); + vst1q_s32(output + 4, s8[12].val[1]); + output += 16; + vst1q_s32(output + 0, s8[13].val[0]); + vst1q_s32(output + 4, s8[13].val[1]); + output += 16; + vst1q_s32(output + 0, s8[14].val[0]); + vst1q_s32(output + 4, s8[14].val[1]); + output += 16; + vst1q_s32(output + 0, s8[15].val[0]); + vst1q_s32(output + 4, s8[15].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[16].val[0]); + vst1q_s32(output + 4, s8[16].val[1]); + output += 16; + vst1q_s32(output + 0, s8[17].val[0]); + vst1q_s32(output + 4, s8[17].val[1]); + output += 16; + vst1q_s32(output + 0, s8[18].val[0]); + vst1q_s32(output + 4, s8[18].val[1]); + output += 16; + vst1q_s32(output + 0, s8[19].val[0]); + vst1q_s32(output + 4, s8[19].val[1]); + output += 16; + vst1q_s32(output + 0, s8[20].val[0]); + vst1q_s32(output + 4, s8[20].val[1]); + output += 16; + vst1q_s32(output + 0, s8[21].val[0]); + vst1q_s32(output + 4, s8[21].val[1]); + output += 16; + vst1q_s32(output + 0, s8[22].val[0]); + vst1q_s32(output + 4, s8[22].val[1]); + output += 16; + vst1q_s32(output + 0, s8[23].val[0]); + vst1q_s32(output + 4, s8[23].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[24].val[0]); + vst1q_s32(output + 4, s8[24].val[1]); + output += 16; + vst1q_s32(output + 0, s8[25].val[0]); + vst1q_s32(output + 4, s8[25].val[1]); + output += 16; + vst1q_s32(output + 0, s8[26].val[0]); + vst1q_s32(output + 4, s8[26].val[1]); + output += 16; + vst1q_s32(output + 0, s8[27].val[0]); + vst1q_s32(output + 4, s8[27].val[1]); + output += 16; + vst1q_s32(output + 0, s8[28].val[0]); + vst1q_s32(output + 4, s8[28].val[1]); + output += 16; + vst1q_s32(output + 0, s8[29].val[0]); + vst1q_s32(output + 4, s8[29].val[1]); + output += 16; + vst1q_s32(output + 0, s8[30].val[0]); + vst1q_s32(output + 4, s8[30].val[1]); + output += 16; + vst1q_s32(output + 0, s8[31].val[0]); + vst1q_s32(output + 4, s8[31].val[1]); +} + +static void vpx_highbd_idct32_16_neon(const int32_t *const input, + uint16_t *const output, const int stride, + const int bd) { + int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[17]); + s2[17] = highbd_idct_sub_dual(s1[16], s1[17]); + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[23], s1[22]); + s2[23] = highbd_idct_add_dual(s1[22], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[25]); + s2[25] = highbd_idct_sub_dual(s1[24], s1[25]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + s2[30] = highbd_idct_sub_dual(s1[31], s1[30]); + s2[31] = highbd_idct_add_dual(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64); + + s3[8] = highbd_idct_add_dual(s2[8], s2[9]); + s3[9] = highbd_idct_sub_dual(s2[8], s2[9]); + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + s3[14] = highbd_idct_sub_dual(s2[15], s2[14]); + s3[15] = highbd_idct_add_dual(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64, + s2[30], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64, + s2[30], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[4] = highbd_idct_add_dual(s3[4], s3[5]); + s4[5] = highbd_idct_sub_dual(s3[4], s3[5]); + s4[6] = highbd_idct_sub_dual(s3[7], s3[6]); + s4[7] = highbd_idct_add_dual(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64, + s3[14], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64, + s3[14], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s2[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s2[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s2[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s2[23]); + s4[24] = highbd_idct_add_dual(s2[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s2[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s2[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s2[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s3[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s3[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s3[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s4[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s4[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s4[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s7[0], s6[31]); + out[1] = highbd_idct_add_dual(s7[1], s6[30]); + out[2] = highbd_idct_add_dual(s7[2], s6[29]); + out[3] = highbd_idct_add_dual(s7[3], s6[28]); + out[4] = highbd_idct_add_dual(s7[4], s7[27]); + out[5] = highbd_idct_add_dual(s7[5], s7[26]); + out[6] = highbd_idct_add_dual(s7[6], s7[25]); + out[7] = highbd_idct_add_dual(s7[7], s7[24]); + out[8] = highbd_idct_add_dual(s7[8], s7[23]); + out[9] = highbd_idct_add_dual(s7[9], s7[22]); + out[10] = highbd_idct_add_dual(s7[10], s7[21]); + out[11] = highbd_idct_add_dual(s7[11], s7[20]); + out[12] = highbd_idct_add_dual(s7[12], s6[19]); + out[13] = highbd_idct_add_dual(s7[13], s6[18]); + out[14] = highbd_idct_add_dual(s7[14], s6[17]); + out[15] = highbd_idct_add_dual(s7[15], s6[16]); + out[16] = highbd_idct_sub_dual(s7[15], s6[16]); + out[17] = highbd_idct_sub_dual(s7[14], s6[17]); + out[18] = highbd_idct_sub_dual(s7[13], s6[18]); + out[19] = highbd_idct_sub_dual(s7[12], s6[19]); + out[20] = highbd_idct_sub_dual(s7[11], s7[20]); + out[21] = highbd_idct_sub_dual(s7[10], s7[21]); + out[22] = highbd_idct_sub_dual(s7[9], s7[22]); + out[23] = highbd_idct_sub_dual(s7[8], s7[23]); + out[24] = highbd_idct_sub_dual(s7[7], s7[24]); + out[25] = highbd_idct_sub_dual(s7[6], s7[25]); + out[26] = highbd_idct_sub_dual(s7[5], s7[26]); + out[27] = highbd_idct_sub_dual(s7[4], s7[27]); + out[28] = highbd_idct_sub_dual(s7[3], s6[28]); + out[29] = highbd_idct_sub_dual(s7[2], s6[29]); + out[30] = highbd_idct_sub_dual(s7[1], s6[30]); + out[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 16]; + int16_t *t = temp; + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 1); + t += (16 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 16]; + int32_t *t = temp; + vpx_highbd_idct32_12_neon(input, temp); + vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_16_neon(t, dest, stride, bd); + t += (16 * 8); + dest += 8; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c new file mode 100644 index 0000000000..f05932cec3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) { + int32x4x2_t in[8], s1[32], s2[32], s3[32]; + + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 32; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 32; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 32; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 32; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 32; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 32; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 32; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64, + s1[30], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64, + s1[30], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64, + s1[31], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64, + s1[31], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64); + + s2[16] = highbd_idct_add_dual(s1[16], s2[23]); + s2[17] = highbd_idct_add_dual(s1[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[17], s2[22]); + s2[23] = highbd_idct_sub_dual(s1[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s1[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s1[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s1[30]); + s2[31] = highbd_idct_add_dual(s2[24], s1[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s2[15]); + s1[1] = highbd_idct_add_dual(s2[1], s2[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s2[9]); + s1[7] = highbd_idct_add_dual(s2[7], s2[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s2[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s2[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s2[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64); + + // final stage + s3[0] = highbd_idct_add_dual(s1[0], s2[31]); + s3[1] = highbd_idct_add_dual(s1[1], s2[30]); + s3[2] = highbd_idct_add_dual(s1[2], s2[29]); + s3[3] = highbd_idct_add_dual(s1[3], s2[28]); + s3[4] = highbd_idct_add_dual(s1[4], s1[27]); + s3[5] = highbd_idct_add_dual(s1[5], s1[26]); + s3[6] = highbd_idct_add_dual(s1[6], s1[25]); + s3[7] = highbd_idct_add_dual(s1[7], s1[24]); + s3[8] = highbd_idct_add_dual(s1[8], s1[23]); + s3[9] = highbd_idct_add_dual(s1[9], s1[22]); + s3[10] = highbd_idct_add_dual(s1[10], s1[21]); + s3[11] = highbd_idct_add_dual(s1[11], s1[20]); + s3[12] = highbd_idct_add_dual(s1[12], s2[19]); + s3[13] = highbd_idct_add_dual(s1[13], s2[18]); + s3[14] = highbd_idct_add_dual(s1[14], s2[17]); + s3[15] = highbd_idct_add_dual(s1[15], s2[16]); + s3[16] = highbd_idct_sub_dual(s1[15], s2[16]); + s3[17] = highbd_idct_sub_dual(s1[14], s2[17]); + s3[18] = highbd_idct_sub_dual(s1[13], s2[18]); + s3[19] = highbd_idct_sub_dual(s1[12], s2[19]); + s3[20] = highbd_idct_sub_dual(s1[11], s1[20]); + s3[21] = highbd_idct_sub_dual(s1[10], s1[21]); + s3[22] = highbd_idct_sub_dual(s1[9], s1[22]); + s3[23] = highbd_idct_sub_dual(s1[8], s1[23]); + s3[24] = highbd_idct_sub_dual(s1[7], s1[24]); + s3[25] = highbd_idct_sub_dual(s1[6], s1[25]); + s3[26] = highbd_idct_sub_dual(s1[5], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[4], s1[27]); + s3[28] = highbd_idct_sub_dual(s1[3], s2[28]); + s3[29] = highbd_idct_sub_dual(s1[2], s2[29]); + s3[30] = highbd_idct_sub_dual(s1[1], s2[30]); + s3[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + vst1q_s32(output, s3[0].val[0]); + output += 4; + vst1q_s32(output, s3[0].val[1]); + output += 4; + vst1q_s32(output, s3[1].val[0]); + output += 4; + vst1q_s32(output, s3[1].val[1]); + output += 4; + vst1q_s32(output, s3[2].val[0]); + output += 4; + vst1q_s32(output, s3[2].val[1]); + output += 4; + vst1q_s32(output, s3[3].val[0]); + output += 4; + vst1q_s32(output, s3[3].val[1]); + output += 4; + vst1q_s32(output, s3[4].val[0]); + output += 4; + vst1q_s32(output, s3[4].val[1]); + output += 4; + vst1q_s32(output, s3[5].val[0]); + output += 4; + vst1q_s32(output, s3[5].val[1]); + output += 4; + vst1q_s32(output, s3[6].val[0]); + output += 4; + vst1q_s32(output, s3[6].val[1]); + output += 4; + vst1q_s32(output, s3[7].val[0]); + output += 4; + vst1q_s32(output, s3[7].val[1]); + output += 4; + + vst1q_s32(output, s3[8].val[0]); + output += 4; + vst1q_s32(output, s3[8].val[1]); + output += 4; + vst1q_s32(output, s3[9].val[0]); + output += 4; + vst1q_s32(output, s3[9].val[1]); + output += 4; + vst1q_s32(output, s3[10].val[0]); + output += 4; + vst1q_s32(output, s3[10].val[1]); + output += 4; + vst1q_s32(output, s3[11].val[0]); + output += 4; + vst1q_s32(output, s3[11].val[1]); + output += 4; + vst1q_s32(output, s3[12].val[0]); + output += 4; + vst1q_s32(output, s3[12].val[1]); + output += 4; + vst1q_s32(output, s3[13].val[0]); + output += 4; + vst1q_s32(output, s3[13].val[1]); + output += 4; + vst1q_s32(output, s3[14].val[0]); + output += 4; + vst1q_s32(output, s3[14].val[1]); + output += 4; + vst1q_s32(output, s3[15].val[0]); + output += 4; + vst1q_s32(output, s3[15].val[1]); + output += 4; + + vst1q_s32(output, s3[16].val[0]); + output += 4; + vst1q_s32(output, s3[16].val[1]); + output += 4; + vst1q_s32(output, s3[17].val[0]); + output += 4; + vst1q_s32(output, s3[17].val[1]); + output += 4; + vst1q_s32(output, s3[18].val[0]); + output += 4; + vst1q_s32(output, s3[18].val[1]); + output += 4; + vst1q_s32(output, s3[19].val[0]); + output += 4; + vst1q_s32(output, s3[19].val[1]); + output += 4; + vst1q_s32(output, s3[20].val[0]); + output += 4; + vst1q_s32(output, s3[20].val[1]); + output += 4; + vst1q_s32(output, s3[21].val[0]); + output += 4; + vst1q_s32(output, s3[21].val[1]); + output += 4; + vst1q_s32(output, s3[22].val[0]); + output += 4; + vst1q_s32(output, s3[22].val[1]); + output += 4; + vst1q_s32(output, s3[23].val[0]); + output += 4; + vst1q_s32(output, s3[23].val[1]); + output += 4; + + vst1q_s32(output, s3[24].val[0]); + output += 4; + vst1q_s32(output, s3[24].val[1]); + output += 4; + vst1q_s32(output, s3[25].val[0]); + output += 4; + vst1q_s32(output, s3[25].val[1]); + output += 4; + vst1q_s32(output, s3[26].val[0]); + output += 4; + vst1q_s32(output, s3[26].val[1]); + output += 4; + vst1q_s32(output, s3[27].val[0]); + output += 4; + vst1q_s32(output, s3[27].val[1]); + output += 4; + vst1q_s32(output, s3[28].val[0]); + output += 4; + vst1q_s32(output, s3[28].val[1]); + output += 4; + vst1q_s32(output, s3[29].val[0]); + output += 4; + vst1q_s32(output, s3[29].val[1]); + output += 4; + vst1q_s32(output, s3[30].val[0]); + output += 4; + vst1q_s32(output, s3[30].val[1]); + output += 4; + vst1q_s32(output, s3[31].val[0]); + output += 4; + vst1q_s32(output, s3[31].val[1]); +} + +static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, + int stride, const int bd) { + int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64, + s1[28], cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64, + s2[12], cospi_24_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[19]); + + s2[17] = highbd_idct_add_dual(s1[17], s1[18]); + s2[18] = highbd_idct_sub_dual(s1[17], s1[18]); + + s2[19] = highbd_idct_sub_dual(s1[16], s1[19]); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + s2[28] = highbd_idct_sub_dual(s1[31], s1[28]); + s2[29] = highbd_idct_sub_dual(s1[30], s1[29]); + s2[30] = highbd_idct_add_dual(s1[29], s1[30]); + s2[31] = highbd_idct_add_dual(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[8] = highbd_idct_add_dual(s2[8], s2[11]); + s1[9] = highbd_idct_add_dual(s2[9], s2[10]); + s1[10] = highbd_idct_sub_dual(s2[9], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[8], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[15], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[14], s2[13]); + s1[14] = highbd_idct_add_dual(s2[13], s2[14]); + s1[15] = highbd_idct_add_dual(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64, + s2[29], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64, + s2[29], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64, + s2[28], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64, + s2[28], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64); + + s1[16] = highbd_idct_add_dual(s2[16], s2[23]); + s1[17] = highbd_idct_add_dual(s2[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s1[22] = highbd_idct_sub_dual(s2[17], s2[22]); + s1[23] = highbd_idct_sub_dual(s2[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s2[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s2[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s2[30]); + s2[31] = highbd_idct_add_dual(s2[24], s2[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s1[15]); + s1[1] = highbd_idct_add_dual(s2[1], s1[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s1[9]); + s1[7] = highbd_idct_add_dual(s2[7], s1[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s1[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s1[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s1[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s1[0], s2[31]); + out[1] = highbd_idct_add_dual(s1[1], s2[30]); + out[2] = highbd_idct_add_dual(s1[2], s2[29]); + out[3] = highbd_idct_add_dual(s1[3], s2[28]); + out[4] = highbd_idct_add_dual(s1[4], s1[27]); + out[5] = highbd_idct_add_dual(s1[5], s1[26]); + out[6] = highbd_idct_add_dual(s1[6], s1[25]); + out[7] = highbd_idct_add_dual(s1[7], s1[24]); + out[8] = highbd_idct_add_dual(s1[8], s2[23]); + out[9] = highbd_idct_add_dual(s1[9], s2[22]); + out[10] = highbd_idct_add_dual(s1[10], s1[21]); + out[11] = highbd_idct_add_dual(s1[11], s1[20]); + out[12] = highbd_idct_add_dual(s1[12], s2[19]); + out[13] = highbd_idct_add_dual(s1[13], s2[18]); + out[14] = highbd_idct_add_dual(s1[14], s1[17]); + out[15] = highbd_idct_add_dual(s1[15], s1[16]); + out[16] = highbd_idct_sub_dual(s1[15], s1[16]); + out[17] = highbd_idct_sub_dual(s1[14], s1[17]); + out[18] = highbd_idct_sub_dual(s1[13], s2[18]); + out[19] = highbd_idct_sub_dual(s1[12], s2[19]); + out[20] = highbd_idct_sub_dual(s1[11], s1[20]); + out[21] = highbd_idct_sub_dual(s1[10], s1[21]); + out[22] = highbd_idct_sub_dual(s1[9], s2[22]); + out[23] = highbd_idct_sub_dual(s1[8], s2[23]); + out[24] = highbd_idct_sub_dual(s1[7], s1[24]); + out[25] = highbd_idct_sub_dual(s1[6], s1[25]); + out[26] = highbd_idct_sub_dual(s1[5], s1[26]); + out[27] = highbd_idct_sub_dual(s1[4], s1[27]); + out[28] = highbd_idct_sub_dual(s1[3], s2[28]); + out[29] = highbd_idct_sub_dual(s1[2], s2[29]); + out[30] = highbd_idct_sub_dual(s1[1], s2[30]); + out[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 1); + t += (8 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 8]; + int32_t *t = temp; + + vpx_highbd_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_8_neon(t, dest, stride, bd); + t += (8 * 8); + dest += 8; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c new file mode 100644 index 0000000000..c1354c0c1a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + const int16x8_t c2 = vminq_s16(b2, max); + const int16x8_t c3 = vminq_s16(b3, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); + vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); + *dest += stride; +} + +static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + const uint16x8_t c2 = vqshluq_n_s16(b2, 0); + const uint16x8_t c3 = vqshluq_n_s16(b3, 0); + vst1q_u16(*dest, c0); + vst1q_u16(*dest + 8, c1); + vst1q_u16(*dest + 16, c2); + vst1q_u16(*dest + 24, c3); + *dest += stride; +} + +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 0000000000..7be1dad1d3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +// res is in reverse row order +static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; +} + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + // Rows + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); + + // Columns + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c new file mode 100644 index 0000000000..bed3227ca7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const int16x8_t c = vminq_s16(b, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c)); + *dest += stride; +} + +static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const uint16x8_t c = vqshluq_n_s16(b, 0); + vst1q_u16(*dest, c); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + const int16x8_t dc = vdupq_n_s16(a1); + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + } else { + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + } +} + +static INLINE void idct8x8_12_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input1l, input1h, input3l, input3h; + int32x2_t step1l[2], step1h[2]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t b[8]; + + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); + } else { + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); + } + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} + +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t b[8]; + + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 0000000000..518ef4336e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd); + +#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 0000000000..235cb5b996 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,2514 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { + const uint16x4_t ref_u16 = vld1_u16(ref); + return horizontal_add_uint16x4(ref_u16); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_u16(dst, dc); + } +} + +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t a = vld1_u16(above); + const uint16x4_t l = vld1_u16(left); + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { + const uint16x8_t ref_u16 = vld1q_u16(ref); + return horizontal_add_uint16x8(ref_u16); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const uint16x8_t dc) { + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1q_u16(dst, dc); + } +} + +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t above_u16 = vld1q_u16(above); + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); + return horizontal_add_uint16x8(p0); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const uint16x8_t dc) { + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + } +} + +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); + const uint16x8_t p2 = vaddq_u16(p0, p1); + return horizontal_add_uint16x8(p2); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const uint16x8_t dc) { + int i; + for (i = 0; i < 32; ++i) { + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); + dst += stride; + } +} + +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, d0; + uint16_t a7; + (void)left; + (void)bd; + + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t ax0, a0, a1, a7, d0; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][1] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15]. + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; + int i; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, d2); + vst1_u16(dst + 3 * stride, d3); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want to store: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ] + // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ] + // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ] + // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ] + // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ] + // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ] + // Note in particular that d0[7] and d1[7] are only ever referenced in the + // stride=0 and stride=1 cases respectively, and in later strides are + // replaced by a copy of above[7]. These are equivalent if for i>7, + // above[i]==above[7], however that is not always the case. + + // Strip out d0[7] and d1[7] so that we can replace it with an additional + // copy of above[7], the first vector here doesn't matter so just reuse + // d0/d1. + d0_ext = vextq_u16(d0, d0, 7); + d1_ext = vextq_u16(d1, d1, 7); + + // Shuffle in duplicates of above[7] and store. + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[1], d0[1], 7); + d1_ext = vextq_u16(d1[1], d1[1], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, a15); +} + +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[3], d0[3], 7); + d1_ext = vextq_u16(d1[3], d1[3], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, a31); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, a31); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements. + col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + l25 = vld1q_u16(left + 25); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32-elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +} + +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 0000000000..8d6e8acc4c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, uint16x8_t *blimit_vec, + uint16x8_t *limit_vec, uint16x8_t *thresh_vec, + const int bd) { + const int16x8_t shift = vdupq_n_s16(bd - 8); + *blimit_vec = vmovl_u8(vld1_dup_u8(blimit)); + *limit_vec = vmovl_u8(vld1_dup_u8(limit)); + *thresh_vec = vmovl_u8(vld1_dup_u8(thresh)); + *blimit_vec = vshlq_u16(*blimit_vec, shift); + *limit_vec = vshlq_u16(*limit_vec, shift); + *thresh_vec = vshlq_u16(*thresh_vec, shift); +} + +// Here flat is 128-bit long, with each 16-bit chunk being a mask of +// a pixel. When used to control filter branches, we only detect whether it is +// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status(const uint16x8_t flat) { + const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)), + vreinterpret_u64_u16(vget_high_u16(flat))); + const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0)); + return vget_lane_u32(vreinterpret_u32_u64(t1), 0); +} + +static INLINE uint16x8_t +filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit, + const uint16x8_t thresh, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) { + uint16x8_t max, t0, t1; + + max = vabdq_u16(p1, p0); + max = vmaxq_u16(max, vabdq_u16(q1, q0)); + *hev = vcgtq_u16(max, thresh); + *mask = vmaxq_u16(max, vabdq_u16(p3, p2)); + *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2)); + t0 = vabdq_u16(p0, q0); + t1 = vabdq_u16(p1, q1); + t0 = vaddq_u16(t0, t0); + t1 = vshrq_n_u16(t1, 1); + t0 = vaddq_u16(t0, t1); + *mask = vcleq_u16(*mask, limit); + t0 = vcleq_u16(t0, blimit); + *mask = vandq_u16(*mask, t0); + + return max; +} + +static INLINE uint16x8_t filter_flat_hev_mask( + const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh, + const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat, + uint32_t *flat_status, uint16x8_t *hev, const int bd) { + uint16x8_t mask; + const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0, + q0, q1, q2, q3, hev, &mask); + *flat = vmaxq_u16(max, vabdq_u16(p2, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0)); + *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0)); + *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */ + *flat = vandq_u16(*flat, mask); + *flat_status = calc_flat_status(*flat); + + return mask; +} + +static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, + const uint16x8_t flat, + uint32_t *flat2_status, const int bd) { + uint16x8_t flat2 = vabdq_u16(p4, p0); + flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0)); + flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8))); + flat2 = vandq_u16(flat2, flat); + *flat2_status = calc_flat_status(flat2); + + return flat2; +} + +static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) { + const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8)); + return vreinterpretq_s16_u16(vsubq_u16(v, offset)); +} + +static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) { + const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8)); + return vreinterpretq_u16_s16(vaddq_s16(v, offset)); +} + +static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, + uint16x8_t *sum) { + *sum = vsubq_u16(*sum, sub0); + *sum = vsubq_u16(*sum, sub1); + *sum = vaddq_u16(*sum, add0); + *sum = vaddq_u16(*sum, add1); +} + +static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0, + const uint16x8_t sub1, + const uint16x8_t add0, + const uint16x8_t add1, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vrshrq_n_u16(*sum, 3); +} + +static INLINE uint16x8_t apply_15_tap_filter_kernel( + const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t sum; + sum = vaddq_u16(p3, p3); // 2*p3 + sum = vaddq_u16(sum, p3); // 3*p3 + sum = vaddq_u16(sum, p2); // 3*p3+p2 + sum = vaddq_u16(sum, p2); // 3*p3+2*p2 + sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1 + sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vrshrq_n_u16(sum, 3); + *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void apply_7_tap_filter(const uint16x8_t flat, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t tp1, tp0, tq0, tq1; + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1, + oq2); + *op2 = vbslq_u16(flat, *op2, p2); + *op1 = vbslq_u16(flat, tp1, *op1); + *op0 = vbslq_u16(flat, tp0, *op0); + *oq0 = vbslq_u16(flat, tq0, *oq0); + *oq1 = vbslq_u16(flat, tq1, *oq1); + *oq2 = vbslq_u16(flat, *oq2, q2); +} + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter( + const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, + uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) { + uint16x8_t sum; + sum = vshlq_n_u16(p7, 3); // 8*p7 + sum = vsubq_u16(sum, p7); // 7*p7 + sum = vaddq_u16(sum, p6); // 7*p7+p6 + sum = vaddq_u16(sum, p6); // 7*p7+2*p6 + sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5 + sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1); + const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1))); + int16x8_t filter, filter1, filter2, t; + int16x8_t ps1 = flip_sign(p1, bd); + int16x8_t ps0 = flip_sign(p0, bd); + int16x8_t qs0 = flip_sign(q0, bd); + int16x8_t qs1 = flip_sign(q1, bd); + + /* add outer taps if we have high edge variance */ + filter = vsubq_s16(ps1, qs1); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(hev)); + t = vsubq_s16(qs0, ps0); + + /* inner taps */ + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ + /* we'd round it by 3 the other way */ + t = vaddq_s16(filter, vdupq_n_s16(4)); + t = vminq_s16(t, max); + filter1 = vshrq_n_s16(t, 3); + t = vaddq_s16(filter, vdupq_n_s16(3)); + t = vminq_s16(t, max); + filter2 = vshrq_n_s16(t, 3); + + qs0 = vsubq_s16(qs0, filter1); + qs0 = vmaxq_s16(qs0, min); + qs0 = vminq_s16(qs0, max); + ps0 = vaddq_s16(ps0, filter2); + ps0 = vmaxq_s16(ps0, min); + ps0 = vminq_s16(ps0, max); + *oq0 = flip_sign_back(qs0, bd); + *op0 = flip_sign_back(ps0, bd); + + /* outer tap adjustments */ + filter = vrshrq_n_s16(filter1, 1); + filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev)); + + qs1 = vsubq_s16(qs1, filter); + qs1 = vmaxq_s16(qs1, min); + qs1 = vminq_s16(qs1, max); + ps1 = vaddq_s16(ps1, filter); + ps1 = vmaxq_s16(ps1, min); + ps1 = vminq_s16(ps1, max); + *oq1 = flip_sign_back(qs1, bd); + *op1 = flip_sign_back(ps1, bd); +} + +static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat, + const uint32_t flat_status, const uint16x8_t hev, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + *op2 = p2; + *oq2 = q2; + if (flat_status) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + } else { + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1, + oq2); + } +} + +static INLINE void filter16( + const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status, + const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev, + const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5, + const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6, + const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4, + uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3, + uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + } + + if (flat_status) { + *op2 = p2; + *oq2 = q2; + if (flat2_status != (uint32_t)-4) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + if (flat2_status) { + apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, + q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, + oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } + } +} + +static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3, + uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0, + uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2, + uint16x8_t *q3) { + *p3 = vld1q_u16(s); + s += p; + *p2 = vld1q_u16(s); + s += p; + *p1 = vld1q_u16(s); + s += p; + *p0 = vld1q_u16(s); + s += p; + *q0 = vld1q_u16(s); + s += p; + *q1 = vld1q_u16(s); + s += p; + *q2 = vld1q_u16(s); + s += p; + *q3 = vld1q_u16(s); +} + +static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0, + uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, + uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, + uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9, + uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12, + uint16x8_t *s13, uint16x8_t *s14, + uint16x8_t *s15) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); + s += p; + *s8 = vld1q_u16(s); + s += p; + *s9 = vld1q_u16(s); + s += p; + *s10 = vld1q_u16(s); + s += p; + *s11 = vld1q_u16(s); + s += p; + *s12 = vld1q_u16(s); + s += p; + *s13 = vld1q_u16(s); + s += p; + *s14 = vld1q_u16(s); + s += p; + *s15 = vld1q_u16(s); +} + +static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); +} + +static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1) { + uint16x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4q_lane_u16(s, o, 0); + s += p; + vst4q_lane_u16(s, o, 1); + s += p; + vst4q_lane_u16(s, o, 2); + s += p; + vst4q_lane_u16(s, o, 3); + s += p; + vst4q_lane_u16(s, o, 4); + s += p; + vst4q_lane_u16(s, o, 5); + s += p; + vst4q_lane_u16(s, o, 6); + s += p; + vst4q_lane_u16(s, o, 7); +} + +static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + uint16x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3q_lane_u16(s - 3, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst3q_lane_u16(s - 3, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst3q_lane_u16(s - 3, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst3q_lane_u16(s - 3, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst3q_lane_u16(s - 3, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst3q_lane_u16(s - 3, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst3q_lane_u16(s - 3, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst3q_lane_u16(s - 3, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5, const uint16x8_t s6) { + uint16x8x4_t o0; + uint16x8x3_t o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o0.val[3] = s3; + o1.val[0] = s4; + o1.val[1] = s5; + o1.val[2] = s6; + vst4q_lane_u16(s - 4, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst4q_lane_u16(s - 4, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst4q_lane_u16(s - 4, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst4q_lane_u16(s - 4, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst4q_lane_u16(s - 4, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst4q_lane_u16(s - 4, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst4q_lane_u16(s - 4, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst4q_lane_u16(s - 4, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint32_t flat_status, + const uint32_t flat2_status) { + if (flat_status) { + if (flat2_status) { + vst1q_u16(s - 7 * p, p6); + vst1q_u16(s - 6 * p, p5); + vst1q_u16(s - 5 * p, p4); + vst1q_u16(s - 4 * p, p3); + vst1q_u16(s + 3 * p, q3); + vst1q_u16(s + 4 * p, q4); + vst1q_u16(s + 5 * p, q5); + vst1q_u16(s + 6 * p, q6); + } + vst1q_u16(s - 3 * p, p2); + vst1q_u16(s + 2 * p, q2); + } + vst1q_u16(s - 2 * p, p1); + vst1q_u16(s - 1 * p, p0); + vst1q_u16(s + 0 * p, q0); + vst1q_u16(s + 1 * p, q1); +} + +void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_8x4(s - 2 * p, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_vertical_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_horizontal_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_vertical_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + +static void lpf_horizontal_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, + const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +static void lpf_vertical_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); + transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5, + (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2, + (int16x8_t *)&p1, (int16x8_t *)&p0); + load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2, + (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5, + (int16x8_t *)&q6, (int16x8_t *)&q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + if (flat_status) { + if (flat2_status) { + store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0); + store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } else { + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); + } + } else { + store_4x8(s - 2, p, op1, op0, oq0, oq1); + } +} + +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 0000000000..c2ad34a695 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( + const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, + tran_low_t *dqcoeff_ptr) { + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE void highbd_quantize_8_neon( + const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift, + int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) { + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31); + const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31); + const int32x4_t coeff_0_abs = vabsq_s32(coeff_0); + const int32x4_t coeff_1_abs = vabsq_s32(coeff_1); + + // Calculate 2 masks of elements outside the bin + const int32x4_t zbin_mask_0 = + vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin)); + const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32( + vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1))); + + // Get the rounded values + const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round); + const int32x4_t rounded_1 = + vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1)); + + // (round * (quant << 15) * 2) >> 16 == (round * quant) + int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant); + int32x4_t qcoeff_tmp_1 = + vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1)); + + // Add rounded values + qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0); + qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1); + + // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift) + qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift); + qcoeff_tmp_1 = + vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1)); + + // Restore the sign bit. + qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign); + qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign); + + // Only keep the relevant coeffs + *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0); + *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1); +} + +static VPX_FORCE_INLINE int16x8_t +highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, + const int32x4_t quant_shift, const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + do { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 0000000000..a6684b0534 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); + + } while (++i < h); + + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 0000000000..b99bac66cd --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + sum = vabaq_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 0000000000..91dfebf900 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. + s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 0000000000..683df5797a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and any height. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, blend); + + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. +static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 index 0000000000..309ae7fd35 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); +} + +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). +static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); +} + +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); +} + +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); +} + +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. + +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. +static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); +} + +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); +} + +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); +} + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c new file mode 100644 index 0000000000..1a88720172 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = \ + highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) + +#undef HIGHBD_MSE_WXH_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c new file mode 100644 index 0000000000..47684473ca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -0,0 +1,931 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); + s += p; + vst1q_u16(s, s6); + s += p; + vst1q_u16(s, s7); +} + +static INLINE int32x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum; + + sum = vmull_lane_s16(s0, filters_lo, 0); + sum = vmlal_lane_s16(sum, s1, filters_lo, 1); + sum = vmlal_lane_s16(sum, s2, filters_lo, 2); + sum = vmlal_lane_s16(sum, s3, filters_lo, 3); + sum = vmlal_lane_s16(sum, s4, filters_hi, 0); + sum = vmlal_lane_s16(sum, s5, filters_hi, 1); + sum = vmlal_lane_s16(sum, s6, filters_hi, 2); + sum = vmlal_lane_s16(sum, s7, filters_hi, 3); + return sum; +} + +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum0, sum1; + uint16x8_t d; + + sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); + sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); + d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); + d = vminq_u16(d, max); + return d; +} + +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + transpose_u16_4x4q(&d01, &d23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + transpose_u16_8x4(&d0, &d1, &d2, &d3); + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t t0, t1, t2, t3; + uint16x8_t d01, d23, t01, t23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + transpose_u16_4x4q(&t01, &t23); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 2 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + + d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 4 * dst_stride)); + d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 5 * dst_stride)); + d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 6 * dst_stride)); + d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), + vld1_u16(dst + 7 * dst_stride)); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); + d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); + d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); + d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); + + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} + +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 1 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vld1q_u16(d + 0 * dst_stride); + d1 = vld1q_u16(d + 1 * dst_stride); + d2 = vld1q_u16(d + 2 * dst_stride); + d3 = vld1q_u16(d + 3 * dst_stride); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c new file mode 100644 index 0000000000..765a054f8d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // avg4 + uint16x4_t s0, s1, d0, d1; + uint16x8_t s01, d01; + do { + s0 = vld1_u16(src); + d0 = vld1_u16(dst); + src += src_stride; + s1 = vld1_u16(src); + d1 = vld1_u16(dst + dst_stride); + src += src_stride; + s01 = vcombine_u16(s0, s1); + d01 = vcombine_u16(d0, d1); + d01 = vrhaddq_u16(s01, d01); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // avg8 + uint16x8_t s0, s1, d0, d1; + do { + s0 = vld1q_u16(src); + d0 = vld1q_u16(dst); + src += src_stride; + s1 = vld1q_u16(src); + d1 = vld1q_u16(dst + dst_stride); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + + vst1q_u16(dst, d0); + dst += dst_stride; + vst1q_u16(dst, d1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h; + do { + s0l = vld1q_u16(src); + s0h = vld1q_u16(src + 8); + d0l = vld1q_u16(dst); + d0h = vld1q_u16(dst + 8); + src += src_stride; + s1l = vld1q_u16(src); + s1h = vld1q_u16(src + 8); + d1l = vld1q_u16(dst + dst_stride); + d1h = vld1q_u16(dst + dst_stride + 8); + src += src_stride; + + d0l = vrhaddq_u16(s0l, d0l); + d0h = vrhaddq_u16(s0h, d0h); + d1l = vrhaddq_u16(s1l, d1l); + d1h = vrhaddq_u16(s1h, d1h); + + vst1q_u16(dst, d0l); + vst1q_u16(dst + 8, d0h); + dst += dst_stride; + vst1q_u16(dst, d1l); + vst1q_u16(dst + 8, d1h); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + + s0 = vld1q_u16(src + 32); + s1 = vld1q_u16(src + 40); + s2 = vld1q_u16(src + 48); + s3 = vld1q_u16(src + 56); + d0 = vld1q_u16(dst + 32); + d1 = vld1q_u16(dst + 40); + d2 = vld1q_u16(dst + 48); + d3 = vld1q_u16(dst + 56); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst + 32, d0); + vst1q_u16(dst + 40, d1); + vst1q_u16(dst + 48, d2); + vst1q_u16(dst + 56, d3); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c new file mode 100644 index 0000000000..7751082083 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // copy4 + uint16x4_t s0, s1; + do { + s0 = vld1_u16(src); + src += src_stride; + s1 = vld1_u16(src); + src += src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { // copy8 + uint16x8_t s0, s1; + do { + s0 = vld1q_u16(src); + src += src_stride; + s1 = vld1q_u16(src); + src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + src += src_stride; + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); + src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + dst += dst_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + dst += dst_stride; + } while (--h != 0); + } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); + src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); + dst += dst_stride; + } while (--h != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c new file mode 100644 index 0000000000..414ade3530 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* Filter starting 3 lines back. The neon implementation will ignore the given + * height and filter a multiple of 4 lines. Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c new file mode 100644 index 0000000000..bf5192a683 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c new file mode 100644 index 0000000000..fc7f4a7747 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -0,0 +1,764 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS); +} + +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; + + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); + + // stage 4 + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x8_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[7] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 16; + in[1] = vld1q_s16(inputT); + inputT += 16; + in[2] = vld1q_s16(inputT); + inputT += 16; + in[3] = vld1q_s16(inputT); + inputT += 16; + in[4] = vld1q_s16(inputT); + inputT += 16; + in[5] = vld1q_s16(inputT); + inputT += 16; + in[6] = vld1q_s16(inputT); + inputT += 16; + in[7] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3); + step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2); + step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2); + step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3); + step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] = load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); +} + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c new file mode 100644 index 0000000000..057731ad92 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) { + int16x4_t tmp[8]; + int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32]; + + load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], + &tmp[6], &tmp[7]); + transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], + tmp[7], &in[8], &in[9], &in[10], &in[11]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s1[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s1[16], s2[19]); + s4[20] = vsubq_s16(s1[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s1[23]); + s4[24] = vaddq_s16(s1[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s1[24], s2[27]); + s4[28] = vsubq_s16(s1[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s1[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64); + + s5[8] = vaddq_s16(s2[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s2[8], s3[11]); + s5[12] = vsubq_s16(s2[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s3[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s3[4]); + s6[4] = vsubq_s16(s5[3], s3[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s7[0], s6[31])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[7], s7[24])); + output += 16; + + vst1q_s16(output, vaddq_s16(s7[8], s7[23])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[15], s6[16])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[15], s6[16])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[8], s7[23])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[7], s7[24])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[0], s6[31])); +} + +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag) { + int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[16] = vaddq_s16(s1[16], s1[17]); + s2[17] = vsubq_s16(s1[16], s1[17]); + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[22] = vsubq_s16(s1[23], s1[22]); + s2[23] = vaddq_s16(s1[22], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[25]); + s2[25] = vsubq_s16(s1[24], s1[25]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + s2[30] = vsubq_s16(s1[31], s1[30]); + s2[31] = vaddq_s16(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64); + + s3[8] = vaddq_s16(s2[8], s2[9]); + s3[9] = vsubq_s16(s2[8], s2[9]); + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + s3[14] = vsubq_s16(s2[15], s2[14]); + s3[15] = vaddq_s16(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30], + cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[4] = vaddq_s16(s3[4], s3[5]); + s4[5] = vsubq_s16(s3[4], s3[5]); + s4[6] = vsubq_s16(s3[7], s3[6]); + s4[7] = vaddq_s16(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s2[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s2[16], s2[19]); + s4[20] = vsubq_s16(s2[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s2[23]); + s4[24] = vaddq_s16(s2[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s2[24], s2[27]); + s4[28] = vsubq_s16(s2[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s2[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64); + + s5[8] = vaddq_s16(s3[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s3[8], s3[11]); + s5[12] = vsubq_s16(s3[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s4[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s4[4]); + s6[4] = vsubq_s16(s5[3], s4[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = final_add(s7[0], s6[31]); + out[1] = final_add(s7[1], s6[30]); + out[2] = final_add(s7[2], s6[29]); + out[3] = final_add(s7[3], s6[28]); + out[4] = final_add(s7[4], s7[27]); + out[5] = final_add(s7[5], s7[26]); + out[6] = final_add(s7[6], s7[25]); + out[7] = final_add(s7[7], s7[24]); + out[8] = final_add(s7[8], s7[23]); + out[9] = final_add(s7[9], s7[22]); + out[10] = final_add(s7[10], s7[21]); + out[11] = final_add(s7[11], s7[20]); + out[12] = final_add(s7[12], s6[19]); + out[13] = final_add(s7[13], s6[18]); + out[14] = final_add(s7[14], s6[17]); + out[15] = final_add(s7[15], s6[16]); + out[16] = final_sub(s7[15], s6[16]); + out[17] = final_sub(s7[14], s6[17]); + out[18] = final_sub(s7[13], s6[18]); + out[19] = final_sub(s7[12], s6[19]); + out[20] = final_sub(s7[11], s7[20]); + out[21] = final_sub(s7[10], s7[21]); + out[22] = final_sub(s7[9], s7[22]); + out[23] = final_sub(s7[8], s7[23]); + out[24] = final_sub(s7[7], s7[24]); + out[25] = final_sub(s7[6], s7[25]); + out[26] = final_sub(s7[5], s7[26]); + out[27] = final_sub(s7[4], s7[27]); + out[28] = final_sub(s7[3], s6[28]); + out[29] = final_sub(s7[2], s6[29]); + out[30] = final_sub(s7[1], s6[30]); + out[31] = final_sub(s7[0], s6[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 16]; + int16_t *t = temp; + + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 0); + t += (16 * 8); + dest += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c new file mode 100644 index 0000000000..8920b93363 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; +} + +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; +} + +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); + } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 0000000000..f570547e44 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in[8], s1[32], s2[32], s3[32]; + + in[0] = load_tran_low_to_s16q(input); + input += 32; + in[1] = load_tran_low_to_s16q(input); + input += 32; + in[2] = load_tran_low_to_s16q(input); + input += 32; + in[3] = load_tran_low_to_s16q(input); + input += 32; + in[4] = load_tran_low_to_s16q(input); + input += 32; + in[5] = load_tran_low_to_s16q(input); + input += 32; + in[6] = load_tran_low_to_s16q(input); + input += 32; + in[7] = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64); + + s2[16] = vaddq_s16(s1[16], s2[23]); + s2[17] = vaddq_s16(s1[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s2[22] = vsubq_s16(s1[17], s2[22]); + s2[23] = vsubq_s16(s1[16], s2[23]); + + s3[24] = vsubq_s16(s1[31], s2[24]); + s3[25] = vsubq_s16(s1[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s1[30]); + s2[31] = vaddq_s16(s2[24], s1[31]); + + // stage 7 + s1[0] = vaddq_s16(s2[0], s2[15]); + s1[1] = vaddq_s16(s2[1], s2[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s2[9]); + s1[7] = vaddq_s16(s2[7], s2[8]); + s1[8] = vsubq_s16(s2[7], s2[8]); + s1[9] = vsubq_s16(s2[6], s2[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s2[14]); + s1[15] = vsubq_s16(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1[0], s2[31])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[7], s1[24])); + output += 8; + + vst1q_s16(output, vaddq_s16(s1[8], s1[23])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[15], s2[16])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[15], s2[16])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[8], s1[23])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[7], s1[24])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[0], s2[31])); +} + +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag) { + int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28], + cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12], + cospi_24_64); + + s2[16] = vaddq_s16(s1[16], s1[19]); + + s2[17] = vaddq_s16(s1[17], s1[18]); + s2[18] = vsubq_s16(s1[17], s1[18]); + + s2[19] = vsubq_s16(s1[16], s1[19]); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + s2[28] = vsubq_s16(s1[31], s1[28]); + s2[29] = vsubq_s16(s1[30], s1[29]); + s2[30] = vaddq_s16(s1[29], s1[30]); + s2[31] = vaddq_s16(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[8] = vaddq_s16(s2[8], s2[11]); + s1[9] = vaddq_s16(s2[9], s2[10]); + s1[10] = vsubq_s16(s2[9], s2[10]); + s1[11] = vsubq_s16(s2[8], s2[11]); + s1[12] = vsubq_s16(s2[15], s2[12]); + s1[13] = vsubq_s16(s2[14], s2[13]); + s1[14] = vaddq_s16(s2[13], s2[14]); + s1[15] = vaddq_s16(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64); + + s1[16] = vaddq_s16(s2[16], s2[23]); + s1[17] = vaddq_s16(s2[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s1[22] = vsubq_s16(s2[17], s2[22]); + s1[23] = vsubq_s16(s2[16], s2[23]); + + s3[24] = vsubq_s16(s2[31], s2[24]); + s3[25] = vsubq_s16(s2[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s2[30]); + s2[31] = vaddq_s16(s2[24], s2[31]); + + // stage 7 + s1[0] = vaddq_s16(s2[0], s1[15]); + s1[1] = vaddq_s16(s2[1], s1[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s1[9]); + s1[7] = vaddq_s16(s2[7], s1[8]); + s1[8] = vsubq_s16(s2[7], s1[8]); + s1[9] = vsubq_s16(s2[6], s1[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s1[14]); + s1[15] = vsubq_s16(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = final_add(s1[0], s2[31]); + out[1] = final_add(s1[1], s2[30]); + out[2] = final_add(s1[2], s2[29]); + out[3] = final_add(s1[3], s2[28]); + out[4] = final_add(s1[4], s1[27]); + out[5] = final_add(s1[5], s1[26]); + out[6] = final_add(s1[6], s1[25]); + out[7] = final_add(s1[7], s1[24]); + out[8] = final_add(s1[8], s2[23]); + out[9] = final_add(s1[9], s2[22]); + out[10] = final_add(s1[10], s1[21]); + out[11] = final_add(s1[11], s1[20]); + out[12] = final_add(s1[12], s2[19]); + out[13] = final_add(s1[13], s2[18]); + out[14] = final_add(s1[14], s1[17]); + out[15] = final_add(s1[15], s1[16]); + out[16] = final_sub(s1[15], s1[16]); + out[17] = final_sub(s1[14], s1[17]); + out[18] = final_sub(s1[13], s2[18]); + out[19] = final_sub(s1[12], s2[19]); + out[20] = final_sub(s1[11], s1[20]); + out[21] = final_sub(s1[10], s1[21]); + out[22] = final_sub(s1[9], s2[22]); + out[23] = final_sub(s1[8], s2[23]); + out[24] = final_sub(s1[7], s1[24]); + out[25] = final_sub(s1[6], s1[25]); + out[26] = final_sub(s1[5], s1[26]); + out[27] = final_sub(s1[4], s1[27]); + out[28] = final_sub(s1[3], s2[28]); + out[29] = final_sub(s1[2], s2[29]); + out[30] = final_sub(s1[1], s2[30]); + out[31] = final_sub(s1[0], s2[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 0); + t += (8 * 8); + dest += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c new file mode 100644 index 0000000000..9f4589ea96 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_from_transformed(const int16_t *const trans_buf, + const int first, const int second, + int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(trans_buf + first * 8); + *q1 = vld1q_s16(trans_buf + second * 8); +} + +static INLINE void load_from_output(const int16_t *const out, const int first, + const int second, int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(out + first * 32); + *q1 = vld1q_s16(out + second * 32); +} + +static INLINE void store_in_output(int16_t *const out, const int first, + const int second, const int16x8_t q0, + const int16x8_t q1) { + vst1q_s16(out + first * 32, q0); + vst1q_s16(out + second * 32, q1); +} + +static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2, + const int stride, int16x8_t q0, + int16x8_t q1, int16x8_t q2, + int16x8_t q3) { + uint8x8_t d[4]; + + d[0] = vld1_u8(p1); + p1 += stride; + d[1] = vld1_u8(p1); + d[3] = vld1_u8(p2); + p2 -= stride; + d[2] = vld1_u8(p2); + + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); + + q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0])); + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1])); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2])); + q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3])); + + d[0] = vqmovun_s16(q0); + d[1] = vqmovun_s16(q1); + d[2] = vqmovun_s16(q2); + d[3] = vqmovun_s16(q3); + + vst1_u8(p1, d[1]); + p1 -= stride; + vst1_u8(p1, d[0]); + vst1_u8(p2, d[2]); + p2 += stride; + vst1_u8(p2, d[3]); +} + +static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2, + const int stride, + int16x8_t q0, int16x8_t q1, + int16x8_t q2, + int16x8_t q3) { + uint16x8_t d[4]; + + d[0] = vld1q_u16(p1); + p1 += stride; + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); + p2 -= stride; + d[2] = vld1q_u16(p2); + + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); + + q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0])); + q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1])); + q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2])); + q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3])); + + d[0] = vmovl_u8(vqmovun_s16(q0)); + d[1] = vmovl_u8(vqmovun_s16(q1)); + d[2] = vmovl_u8(vqmovun_s16(q2)); + d[3] = vmovl_u8(vqmovun_s16(q3)); + + vst1q_u16(p1, d[1]); + p1 -= stride; + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); + p2 += stride; + vst1q_u16(p2, d[3]); +} + +static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1, + const int16_t first_const, + const int16_t second_const, + int16x8_t *const qOut0, + int16x8_t *const qOut1) { + int32x4_t q[4]; + int16x4_t d[6]; + + d[0] = vget_low_s16(qIn0); + d[1] = vget_high_s16(qIn0); + d[2] = vget_low_s16(qIn1); + d[3] = vget_high_s16(qIn1); + + // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9. + d[4] = vdup_n_s16(first_const); + d[5] = vdup_n_s16(second_const); + + q[0] = vmull_s16(d[0], d[4]); + q[1] = vmull_s16(d[1], d[4]); + q[0] = vmlsl_s16(q[0], d[2], d[5]); + q[1] = vmlsl_s16(q[1], d[3], d[5]); + + q[2] = vmull_s16(d[0], d[5]); + q[3] = vmull_s16(d[1], d[5]); + q[2] = vmlal_s16(q[2], d[2], d[4]); + q[3] = vmlal_s16(q[3], d[3], d[4]); + + *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS), + vrshrn_n_s32(q[1], DCT_CONST_BITS)); + *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS), + vrshrn_n_s32(q[3], DCT_CONST_BITS)); +} + +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0, + int16x8_t *const s1, int16x8_t *const s2, + int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, + int16x8_t *const s7) { + *s0 = vld1q_s16(in); + in += 32; + *s1 = vld1q_s16(in); + in += 32; + *s2 = vld1q_s16(in); + in += 32; + *s3 = vld1q_s16(in); + in += 32; + *s4 = vld1q_s16(in); + in += 32; + *s5 = vld1q_s16(in); + in += 32; + *s6 = vld1q_s16(in); + in += 32; + *s7 = vld1q_s16(in); +} + +static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1, + int16x8_t a2, int16x8_t a3, + int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, + int16_t **out) { + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s16(*out, a0); + *out += 8; + vst1q_s16(*out, a1); + *out += 8; + vst1q_s16(*out, a2); + *out += 8; + vst1q_s16(*out, a3); + *out += 8; + vst1q_s16(*out, a4); + *out += 8; + vst1q_s16(*out, a5); + *out += 8; + vst1q_s16(*out, a6); + *out += 8; + vst1q_s16(*out, a7); + *out += 8; +} + +static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void load_s16x8q_tran_low( + const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) { + *s0 = load_tran_low_to_s16q(in); + in += 32; + *s1 = load_tran_low_to_s16q(in); + in += 32; + *s2 = load_tran_low_to_s16q(in); + in += 32; + *s3 = load_tran_low_to_s16q(in); + in += 32; + *s4 = load_tran_low_to_s16q(in); + in += 32; + *s5 = load_tran_low_to_s16q(in); + in += 32; + *s6 = load_tran_low_to_s16q(in); + in += 32; + *s7 = load_tran_low_to_s16q(in); +} + +static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, + int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} +#else // !CONFIG_VP9_HIGHBITDEPTH +#define idct32_transpose_pair_tran_low idct32_transpose_pair +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE void idct32_bands_end_1st_pass(int16_t *const out, + int16x8_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); +} + +static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out, + uint8_t *const dest, + const int stride, + int16x8_t *const q) { + uint8_t *dest0 = dest + 0 * stride; + uint8_t *dest1 = dest + 31 * stride; + uint8_t *dest2 = dest + 16 * stride; + uint8_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); +} + +static INLINE void highbd_idct32_bands_end_2nd_pass_bd8( + const int16_t *const out, uint16_t *const dest, const int stride, + int16x8_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); +} + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 + int16_t *out; + int16x8_t q[16]; + uint16_t *dst = CAST_TO_SHORTPTR(dest); + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + if (idct32_pass_loop == 0) { + idct32_transpose_pair_tran_low(input, trans_buf); + input += 32 * 8; + } else { + idct32_transpose_pair(input_pass2, trans_buf); + input_pass2 += 32 * 8; + } + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); + // part of stage 2 + q[4] = vaddq_s16(q[0], q[1]); + q[13] = vsubq_s16(q[0], q[1]); + q[6] = vaddq_s16(q[2], q[3]); + q[14] = vsubq_s16(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); + + // generate 18,19,28,29 + // part of stage 1 + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); + // part of stage 2 + q[13] = vsubq_s16(q[3], q[2]); + q[3] = vaddq_s16(q[3], q[2]); + q[14] = vsubq_s16(q[1], q[0]); + q[2] = vaddq_s16(q[1], q[0]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); + // part of stage 4 + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[0]); + q[10] = vaddq_s16(q[7], q[1]); + q[15] = vaddq_s16(q[6], q[3]); + q[13] = vsubq_s16(q[5], q[0]); + q[14] = vsubq_s16(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); + // part of stage 4 + q[13] = vsubq_s16(q[4], q[2]); + q[14] = vsubq_s16(q[6], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); + // part of stage 2 + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + + // generate 22,23,24,25 + // part of stage 1 + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); + // part of stage 2 + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); + // part of stage 4 + q[10] = vaddq_s16(q[7], q[1]); + q[11] = vaddq_s16(q[5], q[0]); + q[12] = vaddq_s16(q[6], q[2]); + q[15] = vaddq_s16(q[4], q[3]); + // part of stage 6 + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[11]); + q[9] = vaddq_s16(q[13], q[10]); + q[13] = vsubq_s16(q[13], q[10]); + q[11] = vsubq_s16(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = vsubq_s16(q[9], q[12]); + q[10] = vaddq_s16(q[14], q[15]); + q[14] = vsubq_s16(q[14], q[15]); + q[12] = vaddq_s16(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); + // part of stage 4 + q[14] = vsubq_s16(q[5], q[0]); + q[13] = vsubq_s16(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = vsubq_s16(q[7], q[1]); + q[13] = vsubq_s16(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); + // part of stage 6 + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[1]); + q[9] = vaddq_s16(q[13], q[6]); + q[13] = vsubq_s16(q[13], q[6]); + q[1] = vsubq_s16(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = vsubq_s16(q[8], q[5]); + q[10] = vaddq_s16(q[8], q[5]); + q[11] = vaddq_s16(q[9], q[0]); + q[0] = vsubq_s16(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); + // part of stage 3 + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); + // part of stage 4 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); + + // generate 10,11,12,13 + // part of stage 2 + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); + // part of stage 3 + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); + // part of stage 4 + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); + // part of stage 5 + q[8] = vaddq_s16(q[0], q[5]); + q[9] = vaddq_s16(q[1], q[7]); + q[13] = vsubq_s16(q[1], q[7]); + q[14] = vsubq_s16(q[3], q[4]); + q[10] = vaddq_s16(q[3], q[4]); + q[15] = vaddq_s16(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); + // part of stage 6 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = vsubq_s16(q[0], q[5]); + q[14] = vsubq_s16(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + // part of stage 4 + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + + // generate 0,1,2,3 + // part of stage 4 + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); + // part of stage 5 + q[4] = vaddq_s16(q[7], q[6]); + q[7] = vsubq_s16(q[7], q[6]); + q[6] = vsubq_s16(q[5], q[14]); + q[5] = vaddq_s16(q[5], q[14]); + // part of stage 6 + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[3]); + q[10] = vaddq_s16(q[6], q[1]); + q[11] = vaddq_s16(q[7], q[0]); + q[12] = vsubq_s16(q[7], q[0]); + q[13] = vsubq_s16(q[6], q[1]); + q[14] = vsubq_s16(q[5], q[3]); + q[15] = vsubq_s16(q[4], q[2]); + // part of stage 7 + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = vaddq_s16(q[8], q[1]); + q[3] = vaddq_s16(q[9], q[0]); + q[4] = vsubq_s16(q[9], q[0]); + q[5] = vsubq_s16(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q); + } else { + if (highbd_flag) { + highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q); + dst += 8; + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q); + dest += 8; + } + } + } + } +} + +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + vpx_idct32_32_neon(input, dest, stride, 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm new file mode 100644 index 0000000000..d83421e9e6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -0,0 +1,66 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vpx_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int stride) + +|vpx_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; cospi_16_64 = 11585 + movw r12, #0x2d41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |vpx_idct4x4_1_add_neon| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c new file mode 100644 index 0000000000..a14b895431 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} + +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint32x2_t d = vdup_n_u32(0); + + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm new file mode 100644 index 0000000000..175ba7fbc2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_idct4x4_16_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + INCLUDE vpx_dsp/arm/idct_neon.asm.S + + AREA Block, CODE, READONLY +;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int stride) + +|vpx_idct4x4_16_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + + ; generate scalar constants + ; cospi_8_64 = 15137 + movw r0, #0x3b21 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 + ; cospi_24_64 = 6270 + movw r12, #0x187e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q8, d16, d21 + vmull.s16 q14, d18, d21 + vadd.s32 q13, q8, q14 + vsub.s32 q14, q8, q14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. + ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vpx_idct4x4_16_add_neon| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c new file mode 100644 index 0000000000..8192ee4cf8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const uint8_t *dst = dest; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; + + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + // Rows + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); + + // Columns + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + + s = load_u8(dst, stride); + dst += 2 * stride; + // The elements are loaded in reverse order. + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); + dst += stride; + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); + + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + + store_u8(dest, stride, d[0]); + dest += 2 * stride; + // The elements are stored in reverse order. + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c new file mode 100644 index 0000000000..ce9b459589 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} + +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c new file mode 100644 index 0000000000..7471387e47 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); +} + +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t a[8]; + int16x8_t b[8]; + + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 0000000000..5dd9bdc788 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,46 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper functions used to load tran_low_t into int16, narrowing if + ; necessary. + + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! + ENDIF + MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! + ENDIF + MEND + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h new file mode 100644 index 0000000000..c02311326b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_IDCT_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +static const int16_t kCospi[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +static const int32_t kCospi32[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +//------------------------------------------------------------------------------ +// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth +static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqaddq_s16(a, b); +#else + return vaddq_s16(a, b); +#endif +} + +static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqsubq_s16(a, b); +#else + return vsubq_s16(a, b); +#endif +} + +//------------------------------------------------------------------------------ + +static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vaddq_s32(s0.val[0], s1.val[0]); + t.val[1] = vaddq_s32(s0.val[1], s1.val[1]); + return t; +} + +static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vsubq_s32(s0.val[0], s1.val[0]); + t.val[1] = vsubq_s32(s0.val[1], s1.val[1]); + return t; +} + +//------------------------------------------------------------------------------ + +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, + const int16_t a_const) { + // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed + // streams. See WRAPLOW and dct_const_round_shift for details. + // This instruction doubles the result and returns the high half, essentially + // resulting in a right shift by 15. By multiplying the constant first that + // becomes a right shift by DCT_CONST_BITS. + // The largest possible value used here is + // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just* + // within the range of int16_t (+32767 / -32768) even when negated. + return vqrdmulhq_n_s16(a, a_const * 2); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t add_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + // In both add_ and it's pair, sub_, the input for well-formed streams will be + // well within 16 bits (input to the idct is the difference between two frames + // and will be within -255 to 255, or 9 bits) + // However, for inputs over about 25,000 (valid for int16_t, but not for idct + // input) this function can not use vaddq_s16. + // In order to match existing behavior and intentionally out of range tests, + // expand the addition up to 32 bits to prevent truncation. + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); +} + +//------------------------------------------------------------------------------ + +// Note: The following 4 functions could use 32-bit operations for bit-depth 10. +// However, although it's 20% faster with gcc, it's 20% slower with clang. +// Use 64-bit operations for now. + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t +multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { + int64x2_t b[4]; + + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + return dct_const_round_shift_high_4x2(b); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + int32x4_t t[2]; + int64x2_t c[4]; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + int32x4_t t[2]; + int64x2_t c[4]; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, + const int32_t b_const) { + int64x2_t c[4]; + c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const); + c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); + c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); + c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); + return dct_const_round_shift_high_4x2(c); +} + +// Shift the output down by 6 and add it to the destination buffer. +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); + + // c = b + (a >> 6) + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); + + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); + + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); +} + +static INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip both sides and gcc may compile to assembly 'usat'. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc); + return vdupq_n_u8((uint8_t)t); +} + +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; + + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); +} + +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { + int16x4_t step1[8], step2[8]; + int32x4_t t32[2]; + + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); + + // stage 1 + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); + + step2[4] = vadd_s16(step1[4], step1[5]); + step2[5] = vsub_s16(step1[4], step1[5]); + step2[6] = vsub_s16(step1[7], step1[6]); + step2[7] = vadd_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vadd_s16(step2[1], step2[3]); + step1[1] = vadd_s16(step2[1], step2[2]); + step1[2] = vsub_s16(step2[1], step2[2]); + step1[3] = vsub_s16(step2[1], step2[3]); + + t32[1] = vmull_lane_s16(step2[6], cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); + step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + + // stage 4 + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { + int16x8_t in[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); + step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2); + step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1); + step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2); + step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3); + step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[1], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[1], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); + + // stage 4 + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; + int16x4_t step1l[4], step1h[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + + // stage 1 + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); + + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); + + // stage 2 + t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); + t32[3] = vmull_lane_s16(step1h[0], cospis0, 2); + t32[4] = vmull_lane_s16(step1l[1], cospis0, 3); + t32[5] = vmull_lane_s16(step1h[1], cospis0, 3); + t32[6] = vmull_lane_s16(step1l[1], cospis0, 1); + t32[7] = vmull_lane_s16(step1h[1], cospis0, 1); + t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1); + t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); + t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); + t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); + + // stage 4 + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, + int16x8_t *const out) { +#if CONFIG_VP9_HIGHBITDEPTH + // Use saturating add/sub to avoid overflow in 2nd pass + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +#else + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); +#endif +} + +static INLINE void idct16x16_store_pass1(const int16x8_t *const out, + int16_t *output) { + // Save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); +} + +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} + +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, &dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output); + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, const int highbd_flag); + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag); + +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output); +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag); + +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag); + +#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c new file mode 100644 index 0000000000..4f909e4935 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -0,0 +1,1942 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" +#include "sum_neon.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); +} + +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); + } +} + +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); + dc_store_4x4(dst, stride, dc); +} + +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); + (void)above; + dc_store_4x4(dst, stride, dc); +} + +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); + (void)left; + dc_store_4x4(dst, stride, dc); +} + +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); +} + +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc); + } +} + +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); + dc_store_8x8(dst, stride, dc); +} + +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); + (void)above; + dc_store_8x8(dst, stride, dc); +} + +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); + (void)left; + dc_store_8x8(dst, stride, dc); +} + +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); +} + +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x16_t dc) { + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst + 0, dc); + } +} + +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); + dc_store_16x16(dst, stride, dc); +} + +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); + (void)above; + dc_store_16x16(dst, stride, dc); +} + +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); + (void)left; + dc_store_16x16(dst, stride, dc); +} + +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint8x16_t dc = vdupq_n_u8(0x80); + (void)above; + (void)left; + dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); +} + +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x16_t dc) { + int i; + for (i = 0; i < 32; ++i, dst += stride) { + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); + } +} + +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); + dc_store_32x32(dst, stride, dc); +} + +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); + (void)above; + dc_store_32x32(dst, stride, dc); +} + +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); + (void)left; + dc_store_32x32(dst, stride, dc); +} + +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint8x16_t dc = vdupq_n_u8(0x80); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, d0; + uint8_t a7; + (void)left; + + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; +} + +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t ax0, a0, a1, a7, d0; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); +} + +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t ax0, a0, a1, a15, d0; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); +} + +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); + + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. + col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. + col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123 = vld1_u8(above - 1); + const uint8x8_t L0123 = vld1_u8(left); + const uint8x8_t L3210 = vrev64_u8(L0123); + const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); + const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); + const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); + const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); +} + +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123456 = vld1_u8(above - 1); + const uint8x8_t A01234567 = vld1_u8(above); + const uint8x8_t A1234567_ = vld1_u8(above + 1); + const uint8x8_t L01234567 = vld1_u8(left); + const uint8x8_t L76543210 = vrev64_u8(L01234567); + const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1); + const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2); + const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456); + const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567); + const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); + const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); + const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); + + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); +} + +static INLINE void d135_store_16x8( + uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0, + const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3, + const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6, + const uint8x16_t row_7) { + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t A0123456789abcdef = vld1q_u8(above); + const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1); + const uint8x16_t L0123456789abcdef = vld1q_u8(left); + const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef)); + const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef)); + const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210); + const uint8x16_t Ledcba9876543210X = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1); + const uint8x16_t Ldcba9876543210XA0 = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2); + const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0); + const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_b = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_c = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_d = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_e = vextq_u8(row_0, row_1, 1); + + d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7); + d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0); +} + +static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t row_0, + const uint8x16_t row_1, + const uint8x16_t row_2) { + uint8_t *dst2 = *dst; + vst1q_u8(dst2, row_1); + dst2 += 16; + vst1q_u8(dst2, row_2); + dst2 += 16 * stride - 16; + vst1q_u8(dst2, row_0); + dst2 += 16; + vst1q_u8(dst2, row_1); + *dst += stride; +} + +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16); + const uint8x16_t LU0123456789abcdef = vld1q_u8(left); + const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef)); + const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef)); + const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef)); + const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef)); + const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210); + const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210); + const uint8x16_t LLedcba9876543210Uf = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1); + const uint8x16_t LLdcba9876543210Ufe = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2); + const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf); + + const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t LUedcba9876543210X = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1); + const uint8x16_t LUdcba9876543210XA0 = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2); + const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X); + + const uint8x16_t AL0123456789abcdef = vld1q_u8(above); + const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1); + const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15); + const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16); + const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17); + const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg); + const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef); + const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_); + const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef); + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + d135_store_32x2(&dst, stride, row_0, row_1, row_2); +} + +// ----------------------------------------------------------------------------- + +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. + d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. + d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. + d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + +#if !HAVE_NEON_ASM + +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; + int i; + (void)left; + + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } +} + +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); + int i; + (void)left; + + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } +} + +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } +} + +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. + vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; + (void)above; + + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + dst += stride; + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + dst += stride; + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + dst += stride; + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); +} + +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; + (void)above; + + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); +} + +static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t left_u8q = vld1q_u8(left); + (void)above; + + h_store_16x8(&dst, stride, vget_low_u8(left_u8q)); + h_store_16x8(&dst, stride, vget_high_u8(left_u8q)); +} + +static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8 + *dst += 16; + vst1q_u8(*dst, row_0); + *dst += stride - 16; + vst1q_u8(*dst, row_1); + *dst += 16; + vst1q_u8(*dst, row_1); + *dst += stride - 16; + vst1q_u8(*dst, row_2); + *dst += 16; + vst1q_u8(*dst, row_2); + *dst += stride - 16; + vst1q_u8(*dst, row_3); + *dst += 16; + vst1q_u8(*dst, row_3); + *dst += stride - 16; + vst1q_u8(*dst, row_4); + *dst += 16; + vst1q_u8(*dst, row_4); + *dst += stride - 16; + vst1q_u8(*dst, row_5); + *dst += 16; + vst1q_u8(*dst, row_5); + *dst += stride - 16; + vst1q_u8(*dst, row_6); + *dst += 16; + vst1q_u8(*dst, row_6); + *dst += stride - 16; + vst1q_u8(*dst, row_7); + *dst += 16; + vst1q_u8(*dst, row_7); + *dst += stride - 16; +} + +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + (void)above; + + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + h_store_32x8(&dst, stride, vget_low_u8(left_u8)); + h_store_32x8(&dst, stride, vget_high_u8(left_u8)); + } +} + +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; + + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. + sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; +} + +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); + } +} + +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; + + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); + + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + } +} + +static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const int16x8_t sum2 = vaddq_s16(left_dup, sub2); + const int16x8_t sum3 = vaddq_s16(left_dup, sub3); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + const uint8x8_t d2 = vqmovun_s16(sum2); + const uint8x8_t d3 = vqmovun_s16(sum3); + + vst1q_u8(*dst, vcombine_u8(d0, d1)); + *dst += 16; + vst1q_u8(*dst, vcombine_u8(d2, d3)); + *dst += stride - 16; +} + +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_low = vld1q_u8(above); + const uint8x16_t above_high = vld1q_u8(above + 16); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left))); + const int16x8_t sub2 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left))); + const int16x8_t sub3 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left))); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 4; j++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + } + } +} +#endif // !HAVE_NEON_ASM diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm new file mode 100644 index 0000000000..115790d480 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm @@ -0,0 +1,630 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_v_predictor_4x4_neon| + EXPORT |vpx_v_predictor_8x8_neon| + EXPORT |vpx_v_predictor_16x16_neon| + EXPORT |vpx_v_predictor_32x32_neon| + EXPORT |vpx_h_predictor_4x4_neon| + EXPORT |vpx_h_predictor_8x8_neon| + EXPORT |vpx_h_predictor_16x16_neon| + EXPORT |vpx_h_predictor_32x32_neon| + EXPORT |vpx_tm_predictor_4x4_neon| + EXPORT |vpx_tm_predictor_8x8_neon| + EXPORT |vpx_tm_predictor_16x16_neon| + EXPORT |vpx_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_4x4_neon| + +;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_8x8_neon| + +;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_16x16_neon| + +;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vpx_v_predictor_32x32_neon| + +;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_4x4_neon| + +;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_8x8_neon| + +;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_16x16_neon| + +;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vpx_h_predictor_32x32_neon| + +;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |vpx_tm_predictor_4x4_neon| + +;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |vpx_tm_predictor_8x8_neon| + +;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 8 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left into r12 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |vpx_tm_predictor_16x16_neon| + +;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |vpx_tm_predictor_32x32_neon| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm new file mode 100644 index 0000000000..730c40de0e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm @@ -0,0 +1,666 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_lpf_horizontal_16_neon| + EXPORT |vpx_lpf_horizontal_16_dual_neon| + EXPORT |vpx_lpf_vertical_16_neon| + EXPORT |vpx_lpf_vertical_16_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + +; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_neon| + +; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_dual_neon| + +; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh, +; int count) { +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_vertical_edge_w| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +v_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 + add r0, #2 + + b v_next + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_next + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_next + subs r12, #1 + bne v_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_vertical_edge_w| + +; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_neon| PROC + mov r12, #1 + b mb_lpf_vertical_edge_w + ENDP ; |vpx_lpf_vertical_16_neon| + +; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_vertical_edge_w + ENDP ; |vpx_lpf_vertical_16_dual_neon| + +; void vpx_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. +; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vpx_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |vpx_wide_mbfilter_neon| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm new file mode 100644 index 0000000000..907e918380 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm @@ -0,0 +1,549 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_lpf_horizontal_4_neon| + EXPORT |vpx_lpf_vertical_4_neon| + EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_vertical_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vpx only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; +; void vpx_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vpx_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl filter4_8 + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + pop {pc} + ENDP ; |vpx_lpf_horizontal_4_neon| + +; Currently vpx only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; +; void vpx_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vpx_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl filter4_8 + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_neon| + +; void filter4_8(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|filter4_8| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |filter4_8| + +;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl filter4_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_horizontal_4_dual_neon| + +;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_vertical_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, #4 ; s[-4] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07 + vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17 + vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27 + vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37 + vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47 + vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57 + vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67 + vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77 + vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87 + vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97 + vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7 + vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7 + vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7 + vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7 + vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7 + vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7 + + vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + + vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + + vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + + bl filter4_16 + + sub r0, #2 + + vmov d0, d11 + vmov d1, d13 + vmov d2, d15 + vmov d3, d17 + vmov d11, d12 + vmov d12, d14 + vmov d13, d16 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0] + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_dual_neon| + +; void filter4_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|filter4_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |filter4_16| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm new file mode 100644 index 0000000000..a81a9d1013 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm @@ -0,0 +1,491 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_lpf_horizontal_8_neon| + EXPORT |vpx_lpf_horizontal_8_dual_neon| + EXPORT |vpx_lpf_vertical_8_neon| + EXPORT |vpx_lpf_vertical_8_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vpx only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; +; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vpx_lpf_horizontal_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, [r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl vpx_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + pop {r4-r5, pc} + + ENDP ; |vpx_lpf_horizontal_8_neon| + +;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, +; int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp + 4 const uint8_t *blimit1, +; sp + 8 const uint8_t *limit1, +; sp + 12 const uint8_t *thresh1, +|vpx_lpf_horizontal_8_dual_neon| PROC + push {r0-r1, lr} + ldr lr, [sp, #12] + push {lr} ; thresh0 + bl vpx_lpf_horizontal_8_neon + + ldr r2, [sp, #20] ; blimit1 + ldr r3, [sp, #24] ; limit1 + ldr lr, [sp, #28] + str lr, [sp, #16] ; thresh1 + add sp, #4 + pop {r0-r1, lr} + add r0, #8 ; s + 8 + b vpx_lpf_horizontal_8_neon + ENDP ; |vpx_lpf_horizontal_8_dual_neon| + +; void vpx_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vpx_lpf_vertical_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl vpx_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + pop {r4-r5, pc} + ENDP ; |vpx_lpf_vertical_8_neon| + +;void vpx_lpf_vertical_8_dual_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int pitch +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp + 4 const uint8_t *blimit1, +; sp + 8 const uint8_t *limit1, +; sp + 12 const uint8_t *thresh1, +|vpx_lpf_vertical_8_dual_neon| PROC + push {r0-r1, lr} + ldr lr, [sp, #12] + push {lr} ; thresh0 + bl vpx_lpf_vertical_8_neon + + ldr r2, [sp, #20] ; blimit1 + ldr r3, [sp, #24] ; limit1 + ldr lr, [sp, #28] + str lr, [sp, #16] ; thresh1 + add sp, #4 + pop {r0-r1, lr} + add r0, r0, r1, lsl #3 ; s + 8 * pitch + b vpx_lpf_vertical_8_neon + ENDP ; |vpx_lpf_vertical_8_dual_neon| + +; void vpx_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|vpx_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. + vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. + vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |vpx_mbloop_filter_neon| + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c new file mode 100644 index 0000000000..c54e588239 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -0,0 +1,1107 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// For all the static inline functions, the functions ending with '_8' process +// 8 samples in a bunch, and the functions ending with '_16' process 16 samples +// in a bunch. + +#define FUN_LOAD_THRESH(w, r) \ + static INLINE void load_thresh_##w( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \ + uint8x##w##_t *thresh_vec) { \ + *blimit_vec = vld1##r##dup_u8(blimit); \ + *limit_vec = vld1##r##dup_u8(limit); \ + *thresh_vec = vld1##r##dup_u8(thresh); \ + } + +FUN_LOAD_THRESH(8, _) // load_thresh_8 +FUN_LOAD_THRESH(16, q_) // load_thresh_16 +#undef FUN_LOAD_THRESH + +static INLINE void load_thresh_8_dual( + const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, + uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { + *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); + *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); + *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); +} + +// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a +// pixel. When used to control filter branches, we only detect whether it is all +// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) { + return vget_lane_u32( + vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0); +} + +// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel. +// When used to control filter branches, we only detect whether it is all 0s or +// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so +// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel. +// Then we pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { + const uint8x8_t flat_4bit = + vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4)); + return calc_flat_status_8(flat_4bit); +} + +#define FUN_FILTER_HEV_MASK4(w, r) \ + static INLINE uint8x##w##_t filter_hev_mask4_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ + uint8x##w##_t max, t0, t1; \ + \ + max = vabd##r##u8(p1, p0); \ + max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ + *hev = vcgt##r##u8(max, thresh); \ + *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ + t0 = vabd##r##u8(p0, q0); \ + t1 = vabd##r##u8(p1, q1); \ + t0 = vqadd##r##u8(t0, t0); \ + t1 = vshr##r##n_u8(t1, 1); \ + t0 = vqadd##r##u8(t0, t1); \ + *mask = vcle##r##u8(*mask, limit); \ + t0 = vcle##r##u8(t0, blimit); \ + *mask = vand##r##u8(*mask, t0); \ + \ + return max; \ + } + +FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 +FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 +#undef FUN_FILTER_HEV_MASK4 + +#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ + static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ + uint8x##w##_t *hev) { \ + uint8x##w##_t max, mask; \ + \ + max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ + q2, q3, hev, &mask); \ + *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ + *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \ + *flat = vand##r##u8(*flat, mask); \ + *flat_status = calc_flat_status_##w(*flat); \ + \ + return mask; \ + } + +FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8 +FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16 +#undef FUN_FILTER_FLAT_HEV_MASK + +#define FUN_FLAT_MASK5(w, r) \ + static INLINE uint8x##w##_t flat_mask5_##w( \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t flat, \ + uint32_t *flat2_status) { \ + uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \ + flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \ + flat2 = vand##r##u8(flat2, flat); \ + *flat2_status = calc_flat_status_##w(flat2); \ + \ + return flat2; \ + } + +FUN_FLAT_MASK5(8, _) // flat_mask5_8 +FUN_FLAT_MASK5(16, q_) // flat_mask5_16 +#undef FUN_FLAT_MASK5 + +#define FUN_FLIP_SIGN(w, r) \ + static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \ + const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \ + return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN(8, _) // flip_sign_8 +FUN_FLIP_SIGN(16, q_) // flip_sign_16 +#undef FUN_FLIP_SIGN + +#define FUN_FLIP_SIGN_BACK(w, r) \ + static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ + const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8 +FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16 +#undef FUN_FLIP_SIGN_BACK + +static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, + uint16x8_t *sum) { + *sum = vsubw_u8(*sum, sub0); + *sum = vsubw_u8(*sum, sub1); + *sum = vaddw_u8(*sum, add0); + *sum = vaddw_u8(*sum, add1); +} + +static INLINE void filter_update_16(const uint8x16_t sub0, + const uint8x16_t sub1, + const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, + uint16x8_t *sum1) { + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0)); + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add0)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add0)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add1)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add1)); +} + +static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0, + const uint8x8_t sub1, + const uint8x8_t add0, + const uint8x8_t add1, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vrshrn_n_u16(*sum, 3); +} + +static INLINE uint8x16_t calc_7_tap_filter_16_kernel( + const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) { + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3)); +} + +static INLINE uint8x8_t apply_15_tap_filter_8_kernel( + const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); +} + +static INLINE uint8x16_t apply_15_tap_filter_16_kernel( + const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1, + const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in, + uint16x8_t *sum0, uint16x8_t *sum1) { + uint8x16_t t; + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4)); + return vbslq_u8(flat, t, in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2, + const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, + const uint8x8_t q2, const uint8x8_t q3, + uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, + uint8x8_t *oq1, uint8x8_t *oq2) { + uint16x8_t sum; + sum = vaddl_u8(p3, p3); // 2*p3 + sum = vaddw_u8(sum, p3); // 3*p3 + sum = vaddw_u8(sum, p2); // 3*p3+p2 + sum = vaddw_u8(sum, p2); // 3*p3+2*p2 + sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 + sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vrshrn_n_u16(sum, 3); + *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void calc_7_tap_filter_16( + const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1, + const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) { + uint16x8_t sum0, sum1; + sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3 + sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3)); + *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1); + *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1); + *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1); + *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1); + *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1); +} + +#define FUN_APPLY_7_TAP_FILTER(w, r) \ + static INLINE void apply_7_tap_filter_##w( \ + const uint8x##w##_t flat, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \ + uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \ + uint8x##w##_t *oq2) { \ + uint8x##w##_t tp1, tp0, tq0, tq1; \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \ + &tq0, &tq1, oq2); \ + *op2 = vbsl##r##u8(flat, *op2, p2); \ + *op1 = vbsl##r##u8(flat, tp1, *op1); \ + *op0 = vbsl##r##u8(flat, tp0, *op0); \ + *oq0 = vbsl##r##u8(flat, tq0, *oq0); \ + *oq1 = vbsl##r##u8(flat, tq1, *oq1); \ + *oq2 = vbsl##r##u8(flat, *oq2, q2); \ + } + +FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8 +FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16 +#undef FUN_APPLY_7_TAP_FILTER + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter_8( + const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, + const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, + const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, + uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, + uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { + uint16x8_t sum; + sum = vshll_n_u8(p7, 3); // 8*p7 + sum = vsubw_u8(sum, p7); // 7*p7 + sum = vaddw_u8(sum, p6); // 7*p7+p6 + sum = vaddw_u8(sum, p6); // 7*p7+2*p6 + sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 + sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void apply_15_tap_filter_16( + const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6, + const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3, + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5, + const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5, + uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2, + uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) { + uint16x8_t sum0, sum1; + uint8x16_t t; + sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7 + sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7 + sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7 + sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5 + sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 7*p7+2*p6+p5 + sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4 + sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4)); + *op6 = vbslq_u8(flat2, t, p6); + *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1); + *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1); + *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1); + *op2 = + apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1); + *op1 = + apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1); + *op0 = + apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1); + *oq0 = + apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1); + *oq1 = + apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1); + *oq2 = + apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1); + *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1); + *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1); + *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1); + *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1); +} + +#define FUN_FILTER4(w, r) \ + static INLINE void filter4_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t hev, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \ + uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \ + int8x##w##_t filter, filter1, filter2, t; \ + int8x##w##_t ps1 = flip_sign_##w(p1); \ + int8x##w##_t ps0 = flip_sign_##w(p0); \ + int8x##w##_t qs0 = flip_sign_##w(q0); \ + int8x##w##_t qs1 = flip_sign_##w(q1); \ + \ + /* add outer taps if we have high edge variance */ \ + filter = vqsub##r##s8(ps1, qs1); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + t = vqsub##r##s8(qs0, ps0); \ + \ + /* inner taps */ \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ + \ + /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \ + /* we'd round it by 3 the other way */ \ + filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ + filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ + \ + qs0 = vqsub##r##s8(qs0, filter1); \ + ps0 = vqadd##r##s8(ps0, filter2); \ + *oq0 = flip_sign_back_##w(qs0); \ + *op0 = flip_sign_back_##w(ps0); \ + \ + /* outer tap adjustments */ \ + filter = vrshr##r##n_s8(filter1, 1); \ + filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + \ + qs1 = vqsub##r##s8(qs1, filter); \ + ps1 = vqadd##r##s8(ps1, filter); \ + *oq1 = flip_sign_back_##w(qs1); \ + *op1 = flip_sign_back_##w(ps1); \ + } + +FUN_FILTER4(8, _) // filter4_8 +FUN_FILTER4(16, q_) // filter4_16 +#undef FUN_FILTER4 + +#define FUN_FILTER8(w) \ + static INLINE void filter8_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ + const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ + const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat_status) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + } else { \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ + oq0, oq1, oq2); \ + } \ + } + +FUN_FILTER8(8) // filter8_8 +FUN_FILTER8(16) // filter8_16 +#undef FUN_FILTER8 + +#define FUN_FILTER16(w) \ + static INLINE void filter16_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t flat2, \ + const uint32_t flat2_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + } \ + \ + if (flat_status) { \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat2_status != (uint32_t)-2) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + if (flat2_status) { \ + apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \ + q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \ + op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } \ + } \ + } + +FUN_FILTER16(8) // filter16_8 +FUN_FILTER16(16) // filter16_16 +#undef FUN_FILTER16 + +#define FUN_LOAD8(w, r) \ + static INLINE void load_##w##x8( \ + const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \ + uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \ + uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \ + *p3 = vld1##r##u8(s); \ + s += p; \ + *p2 = vld1##r##u8(s); \ + s += p; \ + *p1 = vld1##r##u8(s); \ + s += p; \ + *p0 = vld1##r##u8(s); \ + s += p; \ + *q0 = vld1##r##u8(s); \ + s += p; \ + *q1 = vld1##r##u8(s); \ + s += p; \ + *q2 = vld1##r##u8(s); \ + s += p; \ + *q3 = vld1##r##u8(s); \ + } + +FUN_LOAD8(8, _) // load_8x8 +FUN_LOAD8(16, q_) // load_16x8 +#undef FUN_LOAD8 + +#define FUN_LOAD16(w, r) \ + static INLINE void load_##w##x16( \ + const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \ + uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \ + uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \ + uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \ + uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \ + uint8x##w##_t *s14, uint8x##w##_t *s15) { \ + *s0 = vld1##r##u8(s); \ + s += p; \ + *s1 = vld1##r##u8(s); \ + s += p; \ + *s2 = vld1##r##u8(s); \ + s += p; \ + *s3 = vld1##r##u8(s); \ + s += p; \ + *s4 = vld1##r##u8(s); \ + s += p; \ + *s5 = vld1##r##u8(s); \ + s += p; \ + *s6 = vld1##r##u8(s); \ + s += p; \ + *s7 = vld1##r##u8(s); \ + s += p; \ + *s8 = vld1##r##u8(s); \ + s += p; \ + *s9 = vld1##r##u8(s); \ + s += p; \ + *s10 = vld1##r##u8(s); \ + s += p; \ + *s11 = vld1##r##u8(s); \ + s += p; \ + *s12 = vld1##r##u8(s); \ + s += p; \ + *s13 = vld1##r##u8(s); \ + s += p; \ + *s14 = vld1##r##u8(s); \ + s += p; \ + *s15 = vld1##r##u8(s); \ + } + +FUN_LOAD16(8, _) // load_8x16 +FUN_LOAD16(16, q_) // load_16x16 +#undef FUN_LOAD16 + +#define FUN_STORE4(w, r) \ + static INLINE void store_##w##x4( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + } + +FUN_STORE4(8, _) // store_8x4 +FUN_STORE4(16, q_) // store_16x4 +#undef FUN_STORE4 + +#define FUN_STORE6(w, r) \ + static INLINE void store_##w##x6( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + } + +FUN_STORE6(8, _) // store_8x6 +FUN_STORE6(16, q_) // store_16x6 +#undef FUN_STORE6 + +static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1) { + uint8x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4_lane_u8(s, o, 0); + s += p; + vst4_lane_u8(s, o, 1); + s += p; + vst4_lane_u8(s, o, 2); + s += p; + vst4_lane_u8(s, o, 3); + s += p; + vst4_lane_u8(s, o, 4); + s += p; + vst4_lane_u8(s, o, 5); + s += p; + vst4_lane_u8(s, o, 6); + s += p; + vst4_lane_u8(s, o, 7); +} + +static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5) { + uint8x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3_lane_u8(s - 3, o0, 0); + vst3_lane_u8(s + 0, o1, 0); + s += p; + vst3_lane_u8(s - 3, o0, 1); + vst3_lane_u8(s + 0, o1, 1); + s += p; + vst3_lane_u8(s - 3, o0, 2); + vst3_lane_u8(s + 0, o1, 2); + s += p; + vst3_lane_u8(s - 3, o0, 3); + vst3_lane_u8(s + 0, o1, 3); + s += p; + vst3_lane_u8(s - 3, o0, 4); + vst3_lane_u8(s + 0, o1, 4); + s += p; + vst3_lane_u8(s - 3, o0, 5); + vst3_lane_u8(s + 0, o1, 5); + s += p; + vst3_lane_u8(s - 3, o0, 6); + vst3_lane_u8(s + 0, o1, 6); + s += p; + vst3_lane_u8(s - 3, o0, 7); + vst3_lane_u8(s + 0, o1, 7); +} + +#define FUN_STORE8(w, r) \ + static INLINE void store_##w##x8( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5, const uint8x##w##_t s6, \ + const uint8x##w##_t s7) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + s += p; \ + vst1##r##u8(s, s6); \ + s += p; \ + vst1##r##u8(s, s7); \ + } + +FUN_STORE8(8, _) // store_8x8 +FUN_STORE8(16, q_) // store_16x8 +#undef FUN_STORE8 + +#define FUN_STORE14(w, r) \ + static INLINE void store_##w##x14( \ + uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint32_t flat_status, const uint32_t flat2_status) { \ + if (flat_status) { \ + if (flat2_status) { \ + vst1##r##u8(s - 7 * p, p6); \ + vst1##r##u8(s - 6 * p, p5); \ + vst1##r##u8(s - 5 * p, p4); \ + vst1##r##u8(s - 4 * p, p3); \ + vst1##r##u8(s + 3 * p, q3); \ + vst1##r##u8(s + 4 * p, q4); \ + vst1##r##u8(s + 5 * p, q5); \ + vst1##r##u8(s + 6 * p, q6); \ + } \ + vst1##r##u8(s - 3 * p, p2); \ + vst1##r##u8(s + 2 * p, q2); \ + } \ + vst1##r##u8(s - 2 * p, p1); \ + vst1##r##u8(s - 1 * p, p0); \ + vst1##r##u8(s + 0 * p, q0); \ + vst1##r##u8(s + 1 * p, q1); \ + } + +FUN_STORE14(8, _) // store_8x14 +FUN_STORE14(16, q_) // store_16x14 +#undef FUN_STORE14 + +static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3, const uint8x16_t s4, + const uint8x16_t s5, const uint8x16_t s6, + const uint8x16_t s7, const uint8x16_t s8, + const uint8x16_t s9, const uint8x16_t s10, + const uint8x16_t s11, const uint8x16_t s12, + const uint8x16_t s13, const uint8x16_t s14, + const uint8x16_t s15) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); + s += p; + vst1q_u8(s, s8); + s += p; + vst1q_u8(s, s9); + s += p; + vst1q_u8(s, s10); + s += p; + vst1q_u8(s, s11); + s += p; + vst1q_u8(s, s12); + s += p; + vst1q_u8(s, s13); + s += p; + vst1q_u8(s, s14); + s += p; + vst1q_u8(s, s15); +} + +#define FUN_HOR_4_KERNEL(name, w) \ + static INLINE void lpf_horizontal_4##name##kernel( \ + uint8_t *s, const int p, const uint8x##w##_t blimit, \ + const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ + uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ + \ + load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ + filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ + q3, &hev, &mask); \ + filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ + store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ + } + +FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel +FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel +#undef FUN_HOR_4_KERNEL + +void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + s -= 2; + store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), + vget_low_u8(q1)); + store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), + vget_high_u8(q1)); +} + +void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: transpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); +} + +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: store_6x8() twice is faster than transpose + store_8x16(). + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); +} + +#define FUN_LPF_16_KERNEL(name, w) \ + static INLINE void lpf_16##name##kernel( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ + uint32_t *flat_status, uint32_t *flat2_status) { \ + uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ + \ + load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ + &thresh_vec); \ + mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ + p1, p0, q0, q1, q2, q3, &flat, \ + flat_status, &hev); \ + flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ + flat2_status); \ + filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ + p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \ + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } + +FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel +FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel +#undef FUN_LPF_16_KERNEL + +// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + +void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + p7 = vld1q_u8(s - 8 * p); + p6 = vld1q_u8(s - 7 * p); + p5 = vld1q_u8(s - 6 * p); + p4 = vld1q_u8(s - 5 * p); + q4 = vld1q_u8(s + 4 * p); + q5 = vld1q_u8(s + 5 * p); + q6 = vld1q_u8(s + 6 * p); + q7 = vld1q_u8(s + 7 * p); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, + &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7); + store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + } else { + // Note: transpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); + } + } else { + store_4x8(s + 6, p, op1, op0, oq0, oq1); + } +} + +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14, &s15); + transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, + &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, + &s15); + store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, s15); + } else { + // Note: store_6x8() twice is faster than transpose + store_8x16(). + s += 8; + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); + } + } else { + s += 6; + store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), + vget_low_u8(oq1)); + store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), + vget_high_u8(oq0), vget_high_u8(oq1)); + } +} + +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h new file mode 100644 index 0000000000..38b0b6c1a9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_VPX_DSP_ARM_MEM_NEON_H_ + +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Support for these xN intrinsics is lacking in older versions of GCC. +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ < 8 || defined(__arm__) +static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} +#endif + +#if __GNUC__ < 9 || defined(__arm__) +static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} +#endif +#endif + +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | + ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48)); +} + +static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) { + return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32)); +} + +static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1, + const int32_t c2, const int32_t c3) { + return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3)); +} + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} +#endif + +// Propagate type information to the compiler. Without this the compiler may +// assume the required alignment of uint32_t (4 bytes) and add alignment hints +// to the memory access. +// +// This is used for functions operating on uint8_t which wish to load or store 4 +// values at a time but which may not be on 4 byte boundaries. +static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { + memcpy(buf, &a, 4); +} + +// Load 4 contiguous bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, + ptrdiff_t stride) { + uint32_t a; + uint32x2_t a_u32; + if (stride == 4) return vld1_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vdup_n_u32(a); + memcpy(&a, buf, 4); + a_u32 = vset_lane_u32(a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + if (stride == 4) { + vst1_u8(buf, a); + return; + } + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, + ptrdiff_t stride) { + uint32_t a; + uint32x4_t a_u32; + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vdupq_n_u32(a); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +// Store 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, + const uint8x16_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); + if (stride == 4) { + vst1q_u8(buf, a); + return; + } + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3)); +} + +// Load 2 sets of 4 bytes when alignment is guaranteed. +static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) { + uint32x2_t a = vdup_n_u32(0); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a = vld1_lane_u32((const uint32_t *)buf, a, 0); + buf += stride; + a = vld1_lane_u32((const uint32_t *)buf, a, 1); + return vreinterpret_u8_u32(a); +} + +// Store 2 sets of 4 bytes when alignment is guaranteed. +static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { + uint32x2_t a_u32 = vreinterpret_u32_u8(a); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + vst1_lane_u32((uint32_t *)buf, a_u32, 0); + buf += stride; + vst1_lane_u32((uint32_t *)buf, a_u32, 1); +} + +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + +#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 0000000000..e2351fa2cc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t dequant = vld1q_s16(dequant_ptr); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); + + do { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff_ptr) { + int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff_ptr, + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t dequant = vld1q_s16(dequant_ptr); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c new file mode 100644 index 0000000000..713eec7a92 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); +} + +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); +} + +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); + + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); + + i += 2; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ + } + +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) + +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) + +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) + +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) + +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) + +#undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) + +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) + +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) + +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) + +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c new file mode 100644 index 0000000000..933fc48b8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c new file mode 100644 index 0000000000..4dd87ddc0f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) + +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 0000000000..fbc0b8d75f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) +SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm new file mode 100644 index 0000000000..9811cd5a5a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm @@ -0,0 +1,34 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_push_neon| + EXPORT |vpx_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_push_neon| PROC + vstm r0!, {d8-d15} + bx lr + + ENDP + +|vpx_pop_neon| PROC + vldm r0!, {d8-d15} + bx lr + + ENDP + + END + diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c new file mode 100644 index 0000000000..2dd57e596c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 0000000000..8777773918 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c new file mode 100644 index 0000000000..d92f1615d7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +#include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" + +// Process a block exactly 4 wide and a multiple of 2 high. +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Process a block exactly 8 wide and any height. +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); +} + +// Process a block which is a mutiple of 16 wide and any height. +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. +static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. +static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c new file mode 100644 index 0000000000..2c008e48ab --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + if (cols > 16) { + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols > 8) { + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols > 4) { + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else { + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 0000000000..11821dc10e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_VPX_DSP_ARM_SUM_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_s16(a); +#else + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); + return vget_lane_s32(d, 0); +#endif +} + +static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(a); +#else + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + return vget_lane_u32(d, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if VPX_ARCH_AARCH64 + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if VPX_ARCH_AARCH64 + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + +static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_s32(a); +#else + return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif +} + +static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if VPX_ARCH_AARCH64 + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t +horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c new file mode 100644 index 0000000000..074afe3258 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { + if (size == 4) { + int16x4_t s[4]; + int32x4_t sum_s32; + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); + } else { + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; + + do { + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; + + do { + int16x8_t s[8]; + + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); + + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); + src += 8 * stride; + rows -= 8; + } while (rows); + + return horizontal_add_uint64x2(sum_u64); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h new file mode 100644 index 0000000000..74f85a6bb6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -0,0 +1,1546 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ + +#include + +#include "./vpx_config.h" + +// Transpose 64 bit elements as follows: +// a0: 00 01 02 03 04 05 06 07 +// a1: 16 17 18 19 20 21 22 23 +// +// b0.val[0]: 00 01 02 03 16 17 18 19 +// b0.val[1]: 04 05 06 07 20 21 22 23 +static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); +#endif + return b0; +} + +static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif + return b0; +} + +static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { + int64x2x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else + b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), + vreinterpret_s64_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), + vreinterpret_s64_s32(vget_high_s32(a1))); +#endif + return b0; +} + +static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { + uint8x16x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else + b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), + vreinterpret_u8_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), + vreinterpret_u8_u32(vget_high_u32(a1))); +#endif + return b0; +} + +static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { + uint16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else + b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), + vreinterpret_u16_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), + vreinterpret_u16_u32(vget_high_u32(a1))); +#endif + return b0; +} + +static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); + + // Swap 64 bit elements resulting in: + // c0: 00 01 20 21 02 03 22 23 + // c1: 10 11 30 31 12 13 32 33 + + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint32x4x2_t b0 = + vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1)); + + // Swap 64 bit elements resulting in: + // c0: 00 01 20 21 02 03 22 23 + // c1: 10 11 30 31 12 13 32 33 + + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + +static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, + const int16x4_t a2, const int16x4_t a3, + const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, + int16x8_t *const o0, int16x8_t *const o1, + int16x8_t *const o2, int16x8_t *const o3) { + // Combine rows. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); +} + +static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a5); + const int32x4x2_t b3 = vtrnq_s32(*a6, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c2.val[0]); + *a2 = vreinterpretq_s32_s64(c1.val[0]); + *a3 = vreinterpretq_s32_s64(c3.val[0]); + *a4 = vreinterpretq_s32_s64(c0.val[1]); + *a5 = vreinterpretq_s32_s64(c2.val[1]); + *a6 = vreinterpretq_s32_s64(c1.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + + *a0 = vreinterpretq_u16_u32(c0.val[0]); + *a1 = vreinterpretq_u16_u32(c1.val[0]); + *a2 = vreinterpretq_u16_u32(c0.val[1]); + *a3 = vreinterpretq_u16_u32(c1.val[1]); +} + +static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 04 05 06 07 + // a2: 10 11 12 13 + // a3: 14 15 16 17 + // a4: 20 21 22 23 + // a5: 24 25 26 27 + // a6: 30 31 32 33 + // a7: 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 04 14 06 16 + // b1.val[1]: 05 15 07 17 + // b2.val[0]: 20 30 22 32 + // b2.val[1]: 21 31 23 33 + // b3.val[0]: 24 34 26 36 + // b3.val[1]: 25 35 27 37 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a2); + const int32x4x2_t b1 = vtrnq_s32(*a1, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a6); + const int32x4x2_t b3 = vtrnq_s32(*a5, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 04 14 24 34 + // c2.val[1]: 06 16 26 36 + // c3.val[0]: 05 15 25 35 + // c3.val[1]: 07 17 27 37 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c1.val[0]); + *a2 = vreinterpretq_s32_s64(c0.val[1]); + *a3 = vreinterpretq_s32_s64(c1.val[1]); + *a4 = vreinterpretq_s32_s64(c2.val[0]); + *a5 = vreinterpretq_s32_s64(c3.val[0]); + *a6 = vreinterpretq_s32_s64(c2.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + +// Note: Using 'd' registers or 'q' registers has almost identical speed. We use +// 'q' registers here to save some instructions. +static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, uint8x8_t *a7) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = + vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); + const uint8x16x2_t b1 = + vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; +} + +static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, + int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, + int32x4x2_t *a6, int32x4x2_t *a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0: 00 10 02 12 01 11 03 13 + // b1: 20 30 22 32 21 31 23 33 + // b2: 40 50 42 52 41 51 43 53 + // b3: 60 70 62 72 61 71 63 73 + // b4: 04 14 06 16 05 15 07 17 + // b5: 24 34 26 36 25 35 27 37 + // b6: 44 54 46 56 45 55 47 57 + // b7: 64 74 66 76 65 75 67 77 + + const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]); + const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]); + const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]); + const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]); + const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]); + const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]); + const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]); + const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]); + + // Swap 64 bit elements resulting in: + // c0: 00 10 20 30 02 12 22 32 + // c1: 01 11 21 31 03 13 23 33 + // c2: 40 50 60 70 42 52 62 72 + // c3: 41 51 61 71 43 53 63 73 + // c4: 04 14 24 34 06 16 26 36 + // c5: 05 15 25 35 07 17 27 37 + // c6: 44 54 64 74 46 56 66 76 + // c7: 45 55 65 75 47 57 67 77 + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]); + const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]); + const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]); + const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]); + const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]); + const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]); + + // Swap 128 bit elements resulting in: + // a0: 00 10 20 30 40 50 60 70 + // a1: 01 11 21 31 41 51 61 71 + // a2: 02 12 22 32 42 52 62 72 + // a3: 03 13 23 33 43 53 63 73 + // a4: 04 14 24 34 44 54 64 74 + // a5: 05 15 25 35 45 55 65 75 + // a6: 06 16 26 36 46 56 66 76 + // a7: 07 17 27 37 47 57 67 77 + a0->val[0] = c0.val[0]; + a0->val[1] = c2.val[0]; + a1->val[0] = c1.val[0]; + a1->val[1] = c3.val[0]; + a2->val[0] = c0.val[1]; + a2->val[1] = c2.val[1]; + a3->val[0] = c1.val[1]; + a3->val[1] = c3.val[1]; + a4->val[0] = c4.val[0]; + a4->val[1] = c6.val[0]; + a5->val[0] = c5.val[0]; + a5->val[1] = c7.val[0]; + a6->val[0] = c4.val[1]; + a6->val[1] = c6.val[1]; + a7->val[0] = c5.val[1]; + a7->val[1] = c7.val[1]; +} + +// Helper transpose function for highbd FDCT variants +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + +static INLINE void transpose_u8_16x8( + const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, + const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, + const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, + uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11, + uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) { + // Swap 8 bit elements. Goes from: + // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F + // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F + // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F + // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F + // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F + // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F + // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E + // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F + // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E + // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F + const uint8x16x2_t b0 = vtrnq_u8(i0, i1); + const uint8x16x2_t b1 = vtrnq_u8(i2, i3); + const uint8x16x2_t b2 = vtrnq_u8(i4, i5); + const uint8x16x2_t b3 = vtrnq_u8(i6, i7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C + // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E + // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D + // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F + // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C + // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E + // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D + // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]), + vreinterpretq_u16_u8(b3.val[0])); + const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]), + vreinterpretq_u16_u8(b3.val[1])); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C + // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A + // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E + // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D + // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B + // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F + const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c2.val[0])); + const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c2.val[1])); + const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]), + vreinterpretq_u32_u16(c3.val[0])); + const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]), + vreinterpretq_u32_u16(c3.val[1])); + + // Output: + // o0 : 00 10 20 30 40 50 60 70 + // o1 : 01 11 21 31 41 51 61 71 + // o2 : 02 12 22 32 42 52 62 72 + // o3 : 03 13 23 33 43 53 63 73 + // o4 : 04 14 24 34 44 54 64 74 + // o5 : 05 15 25 35 45 55 65 75 + // o6 : 06 16 26 36 46 56 66 76 + // o7 : 07 17 27 37 47 57 67 77 + // o8 : 08 18 28 38 48 58 68 78 + // o9 : 09 19 29 39 49 59 69 79 + // o10: 0A 1A 2A 3A 4A 5A 6A 7A + // o11: 0B 1B 2B 3B 4B 5B 6B 7B + // o12: 0C 1C 2C 3C 4C 5C 6C 7C + // o13: 0D 1D 2D 3D 4D 5D 6D 7D + // o14: 0E 1E 2E 3E 4E 5E 6E 7E + // o15: 0F 1F 2F 3F 4F 5F 6F 7F + *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0])); + *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0])); + *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0])); + *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0])); + *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1])); + *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1])); + *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1])); + *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1])); + *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0])); + *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0])); + *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0])); + *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0])); + *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1])); + *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1])); + *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1])); + *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1])); +} + +static INLINE void transpose_u8_8x16( + const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2, + const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5, + const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8, + const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11, + const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14, + const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2, + uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6, + uint8x16_t *o7) { + // Combine 8 bit elements. Goes from: + // i0 : 00 01 02 03 04 05 06 07 + // i1 : 10 11 12 13 14 15 16 17 + // i2 : 20 21 22 23 24 25 26 27 + // i3 : 30 31 32 33 34 35 36 37 + // i4 : 40 41 42 43 44 45 46 47 + // i5 : 50 51 52 53 54 55 56 57 + // i6 : 60 61 62 63 64 65 66 67 + // i7 : 70 71 72 73 74 75 76 77 + // i8 : 80 81 82 83 84 85 86 87 + // i9 : 90 91 92 93 94 95 96 97 + // i10: A0 A1 A2 A3 A4 A5 A6 A7 + // i11: B0 B1 B2 B3 B4 B5 B6 B7 + // i12: C0 C1 C2 C3 C4 C5 C6 C7 + // i13: D0 D1 D2 D3 D4 D5 D6 D7 + // i14: E0 E1 E2 E3 E4 E5 E6 E7 + // i15: F0 F1 F2 F3 F4 F5 F6 F7 + // to: + // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87 + // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97 + // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7 + // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7 + // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7 + // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7 + // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7 + // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7 + const uint8x16_t a0 = vcombine_u8(i0, i8); + const uint8x16_t a1 = vcombine_u8(i1, i9); + const uint8x16_t a2 = vcombine_u8(i2, i10); + const uint8x16_t a3 = vcombine_u8(i3, i11); + const uint8x16_t a4 = vcombine_u8(i4, i12); + const uint8x16_t a5 = vcombine_u8(i5, i13); + const uint8x16_t a6 = vcombine_u8(i6, i14); + const uint8x16_t a7 = vcombine_u8(i7, i15); + + // Swap 8 bit elements resulting in: + // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + const uint8x16x2_t b0 = vtrnq_u8(a0, a1); + const uint8x16x2_t b1 = vtrnq_u8(a2, a3); + const uint8x16x2_t b2 = vtrnq_u8(a4, a5); + const uint8x16x2_t b3 = vtrnq_u8(a6, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]), + vreinterpretq_u16_u8(b3.val[0])); + const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]), + vreinterpretq_u16_u8(b3.val[1])); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c2.val[0])); + const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c2.val[1])); + const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]), + vreinterpretq_u32_u16(c3.val[0])); + const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]), + vreinterpretq_u32_u16(c3.val[1])); + + // Output: + // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + *o0 = vreinterpretq_u8_u32(d0.val[0]); + *o1 = vreinterpretq_u8_u32(d2.val[0]); + *o2 = vreinterpretq_u8_u32(d1.val[0]); + *o3 = vreinterpretq_u8_u32(d3.val[0]); + *o4 = vreinterpretq_u8_u32(d0.val[1]); + *o5 = vreinterpretq_u8_u32(d2.val[1]); + *o6 = vreinterpretq_u8_u32(d1.val[1]); + *o7 = vreinterpretq_u8_u32(d3.val[1]); +} + +static INLINE void transpose_u8_16x16( + const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, + const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, + const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8, + const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11, + const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14, + const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2, + uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6, + uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10, + uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14, + uint8x16_t *o15) { + // Swap 8 bit elements. Goes from: + // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F + // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F + // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F + // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F + // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F + // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F + // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F + // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F + // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F + // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF + // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF + // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF + // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF + // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF + // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E + // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F + // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E + // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F + // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E + // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F + // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E + // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F + // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E + // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F + // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE + // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF + // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE + // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF + // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE + // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF + const uint8x16x2_t b0 = vtrnq_u8(i0, i1); + const uint8x16x2_t b1 = vtrnq_u8(i2, i3); + const uint8x16x2_t b2 = vtrnq_u8(i4, i5); + const uint8x16x2_t b3 = vtrnq_u8(i6, i7); + const uint8x16x2_t b4 = vtrnq_u8(i8, i9); + const uint8x16x2_t b5 = vtrnq_u8(i10, i11); + const uint8x16x2_t b6 = vtrnq_u8(i12, i13); + const uint8x16x2_t b7 = vtrnq_u8(i14, i15); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C + // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E + // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D + // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F + // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C + // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E + // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D + // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F + // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC + // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE + // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD + // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF + // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC + // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE + // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD + // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]), + vreinterpretq_u16_u8(b3.val[0])); + const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]), + vreinterpretq_u16_u8(b3.val[1])); + const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]), + vreinterpretq_u16_u8(b5.val[0])); + const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]), + vreinterpretq_u16_u8(b5.val[1])); + const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]), + vreinterpretq_u16_u8(b7.val[0])); + const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]), + vreinterpretq_u16_u8(b7.val[1])); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C + // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A + // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E + // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D + // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B + // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F + // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8 + // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC + // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA + // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE + // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9 + // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD + // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB + // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF + const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c2.val[0])); + const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c2.val[1])); + const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]), + vreinterpretq_u32_u16(c3.val[0])); + const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]), + vreinterpretq_u32_u16(c3.val[1])); + const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]), + vreinterpretq_u32_u16(c6.val[0])); + const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]), + vreinterpretq_u32_u16(c6.val[1])); + const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]), + vreinterpretq_u32_u16(c7.val[0])); + const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]), + vreinterpretq_u32_u16(c7.val[1])); + + // Swap 64 bit elements resulting in: + // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8 + // e1.val[0]: 01 11 21 31 41 51 61 71 84 94 A4 B4 C4 D4 E4 F4 + // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9 + // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA + // e3.val[0]: 03 13 23 33 43 53 63 73 86 96 A6 B6 C6 D6 E6 F6 + // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB + // e4.val[0]: 04 14 24 34 44 54 64 74 81 91 A1 B1 C1 D1 E1 F1 + // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC + // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD + // e6.val[0]: 06 16 26 36 46 56 66 76 83 93 A3 B3 C3 D3 E3 F3 + // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE + // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF + const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]); + const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]); + const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]); + const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]); + const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]); + const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]); + const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]); + const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]); + + // Output: + // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + // o1 : 01 11 21 31 41 51 61 71 84 94 A4 B4 C4 D4 E4 F4 + // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + // o3 : 03 13 23 33 43 53 63 73 86 96 A6 B6 C6 D6 E6 F6 + // o4 : 04 14 24 34 44 54 64 74 81 91 A1 B1 C1 D1 E1 F1 + // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + // o6 : 06 16 26 36 46 56 66 76 83 93 A3 B3 C3 D3 E3 F3 + // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8 + // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9 + // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA + // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB + // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC + // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD + // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE + // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF + *o0 = e0.val[0]; + *o1 = e1.val[0]; + *o2 = e2.val[0]; + *o3 = e3.val[0]; + *o4 = e4.val[0]; + *o5 = e5.val[0]; + *o6 = e6.val[0]; + *o7 = e7.val[0]; + *o8 = e0.val[1]; + *o9 = e1.val[1]; + *o10 = e2.val[1]; + *o11 = e3.val[1]; + *o12 = e4.val[1]; + *o13 = e5.val[1]; + *o14 = e6.val[1]; + *o15 = e7.val[1]; +} + +static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) { + int16x8_t t[8]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + t[0] = in0[8]; + t[1] = in0[9]; + t[2] = in0[10]; + t[3] = in0[11]; + t[4] = in0[12]; + t[5] = in0[13]; + t[6] = in0[14]; + t[7] = in0[15]; + in0[8] = in1[0]; + in0[9] = in1[1]; + in0[10] = in1[2]; + in0[11] = in1[3]; + in0[12] = in1[4]; + in0[13] = in1[5]; + in0[14] = in1[6]; + in0[15] = in1[7]; + in1[0] = t[0]; + in1[1] = t[1]; + in1[2] = t[2]; + in1[3] = t[3]; + in1[4] = t[4]; + in1[5] = t[5]; + in1[6] = t[6]; + in1[7] = t[7]; + + transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5], + &in0[6], &in0[7]); + transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13], + &in0[14], &in0[15]); + transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5], + &in1[6], &in1[7]); + transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13], + &in1[14], &in1[15]); +} + +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t *a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_s32_8x8( + const int32_t *a, const int a_stride, int32x4x2_t *const a0, + int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3, + int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6, + int32x4x2_t *const a7) { + a0->val[0] = vld1q_s32(a); + a0->val[1] = vld1q_s32(a + 4); + a += a_stride; + a1->val[0] = vld1q_s32(a); + a1->val[1] = vld1q_s32(a + 4); + a += a_stride; + a2->val[0] = vld1q_s32(a); + a2->val[1] = vld1q_s32(a + 4); + a += a_stride; + a3->val[0] = vld1q_s32(a); + a3->val[1] = vld1q_s32(a + 4); + a += a_stride; + a4->val[0] = vld1q_s32(a); + a4->val[1] = vld1q_s32(a + 4); + a += a_stride; + a5->val[0] = vld1q_s32(a); + a5->val[1] = vld1q_s32(a + 4); + a += a_stride; + a6->val[0] = vld1q_s32(a); + a6->val[1] = vld1q_s32(a + 4); + a += a_stride; + a7->val[0] = vld1q_s32(a); + a7->val[1] = vld1q_s32(a + 4); + + transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} +#endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c new file mode 100644 index 0000000000..efb2c1d8da --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. + assert(h <= 256); + + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); +} + +// Process a block of width 8 one row at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. + assert(h <= 128); + + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); +} + +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); +} + +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) + +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) + +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) + +#undef VARIANCE_WXH_NEON + +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; + + s = vld1q_u8(src_ptr); + src_ptr += src_stride; + r = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff = vabdq_u8(s, r); + + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; + + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); + + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); + + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) + +#undef VPX_MSE_WXH_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c new file mode 100644 index 0000000000..ab843e9fca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 four rows at a time. +static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..d8e4bcc3a7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers***************************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..7a77747fec --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm @@ -0,0 +1,439 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..d310a83dad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm @@ -0,0 +1,486 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + add r14, r1, r6 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..c5695fbda8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm @@ -0,0 +1,487 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlal.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + add r14, r1, r6 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..fa1b732466 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..90b2c8fef7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c new file mode 100644 index 0000000000..65fb67c984 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +// Note: +// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). +// 2. After refactoring the shared code in kernel loops with inline functions, +// the decoder speed dropped a lot when using gcc compiler. Therefore there is +// no refactoring for those parts by now. +// 3. For horizontal convolve, there is an alternative optimization that +// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 +// samples in each are read from memory: src, (src+1), (src+2), (src+3), +// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract +// instructions. This optimization is much faster in speed unit test, but slowed +// down the whole decoder by 5%. + +static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + do { + int16x4_t s0[4], s1[4]; + + int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src))); + s0[0] = vget_low_s16(vextq_s16(t0, t0, 0)); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride))); + s1[0] = vget_low_s16(vextq_s16(t1, t1, 0)); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8(dst, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + int16x8_t t0[2], t1[2]; + int16x8_t s0[4], s1[4]; + + t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8))); + s0[0] = vextq_s16(t0[0], t0[1], 0); + s0[1] = vextq_s16(t0[0], t0[1], 1); + s0[2] = vextq_s16(t0[0], t0[1], 2); + s0[3] = vextq_s16(t0[0], t0[1], 3); + + t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride))); + t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8))); + s1[0] = vextq_s16(t1[0], t1[1], 0); + s1[1] = vextq_s16(t1[0], t1[1], 1); + s1[2] = vextq_s16(t1[0], t1[1], 2); + s1[3] = vextq_s16(t1[0], t1[1], 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + vst1_u8(d, d0); + vst1_u8(d + dst_stride, d1); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { + uint8x8_t t0, t1, t2, t3; + + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + transpose_u8_4x4(&d01, &d23); + + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w != 0); + } else { + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + if (w == 4) { + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; + + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } +} + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1); + vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + uint8x8_t t0, t1, t2, t3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (h == 4) { + uint8x8_t d01, d23, dd01, dd23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + transpose_u8_4x4(&d01, &d23); + + dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w != 0); + } else { + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + if (w == 4) { + uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; + + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; + h -= 8; + } while (h != 0); + } else { + uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h != 0); + } + } +} + +static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x4_t filter) { + if (w == 4) { + uint8x8_t t0, t1, t2, t3, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3; + + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + + src += 3 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + d0 = convolve4_4(s0, s1, s2, s3, filter); + d1 = convolve4_4(s1, s2, s3, s4, filter); + d2 = convolve4_4(s2, s3, s4, s5, filter); + d3 = convolve4_4(s3, s4, s5, s6, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6; + + do { + load_u8_8x3(src, src_stride, &t0, &t1, &t2); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + s = src + 3 * src_stride; + d = dst; + height = h; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + d0 = convolve4_8(s0, s1, s2, s3, filter); + d1 = convolve4_8(s1, s2, s3, s4, filter); + d2 = convolve4_8(s2, s3, s4, s5, filter); + d3 = convolve4_8(s3, s4, s5, s6, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int16x8_t filter) { + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; + d = dst; + height = h; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. + */ + const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1); + vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, + w, h, y_filter_4tap); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; + d = dst; + height = h; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + s += 4 * src_stride; + d += 4 * dst_stride; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 0000000000..4ecaee0f99 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" + +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) + +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h); + +static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, + const int32x4_t correction, + const int8x8_t filters) { + /* Accumulate dot product into 'correction' to account for range clamp. */ + int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + int32x4_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) + +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, + const int8x8_t filters) { + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + /* Further narrowing and packing is performed by the caller. */ + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + /* First 4 output values. */ + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* First 4 output values. */ + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + /* Second 4 output values. */ + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + /* Narrow and re-pack. */ + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) + +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filters) { + int16x4_t sum = vmul_lane_s16(s0, filters, 0); + sum = vmla_lane_s16(sum, s1, filters, 1); + sum = vmla_lane_s16(sum, s2, filters, 2); + sum = vmla_lane_s16(sum, s3, filters, 3); + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filters) { + int16x8_t sum = vmulq_lane_s16(s0, filters, 0); + sum = vmlaq_lane_s16(sum, s1, filters, 1); + sum = vmlaq_lane_s16(sum, s2, filters, 2); + sum = vmlaq_lane_s16(sum, s3, filters, 3); + /* We halved the filter values so -1 from right shift. */ + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters); +} + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c new file mode 100644 index 0000000000..c4177c5385 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h" + +/* Type1 and Type2 functions are called depending on the position of the + * negative and positive coefficients in the filter. In type1, the filter kernel + * used is sub_pel_filters_8lp, in which only the first two and the last two + * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 & + * 7. + */ + +#define DEFINE_FILTER(dir) \ + void vpx_convolve8_##dir##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + if (filter == vp9_filter_kernels[1]) { \ + vpx_convolve8_##dir##_filter_type1_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + vpx_convolve8_##dir##_filter_type2_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } \ + } + +DEFINE_FILTER(horiz) +DEFINE_FILTER(avg_horiz) +DEFINE_FILTER(vert) +DEFINE_FILTER(avg_vert) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h new file mode 100644 index 0000000000..f1c7d62ed0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ + +#define DECLARE_FILTER(dir, type) \ + void vpx_convolve8_##dir##_filter_##type##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +DECLARE_FILTER(horiz, type1) +DECLARE_FILTER(avg_horiz, type1) +DECLARE_FILTER(horiz, type2) +DECLARE_FILTER(avg_horiz, type2) +DECLARE_FILTER(vert, type1) +DECLARE_FILTER(avg_vert, type1) +DECLARE_FILTER(vert, type2) +DECLARE_FILTER(avg_vert, type2) + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..00bac3b9cf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,1117 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst, + dst_stride, w, h, x_filter_4tap, + correction_4tap, range_limit); + + } else { + vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst, + dst_stride, w, h, x_filter_8tap, + correction_8tap, range_limit); + } +} + +static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x16_t range_limit) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, + w, h, x_filter_4tap, correction_4tap, + range_limit); + + } else { + vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, + w, h, x_filter_8tap, correction_8tap, + range_limit); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +static INLINE void vpx_convolve_4tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_sdot_partial(s0123, correction, filter); + d1 = convolve4_4_sdot_partial(s1234, correction, filter); + d2 = convolve4_4_sdot_partial(s2345, correction, filter); + d3 = convolve4_4_sdot_partial(s3456, correction, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter); + d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter); + d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter); + d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, + const int32x4_t correction, const uint8x8_t range_limit) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filter); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filter); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filter); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int32x4_t correction_8tap = + vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS))); + const uint8x8_t range_limit = vdup_n_u8(128); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t y_filter_4tap = + vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); + const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); + vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap, + correction_4tap, range_limit); + } else { + vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, + correction_8tap, range_limit); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c new file mode 100644 index 0000000000..bcad1dd121 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -0,0 +1,989 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + d3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve4_4_usdot(s0, filter, perm_tbl); + d1 = convolve4_4_usdot(s1, filter, perm_tbl); + d2 = convolve4_4_usdot(s2, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); + d3 = convolve8_4_usdot(s3, filter, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filter, perm_tbl); + d1 = convolve8_4_usdot(s1, filter, perm_tbl); + d2 = convolve8_4_usdot(s2, filter, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, + w, h, x_filter_4tap); + + } else { + vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, + w, h, x_filter_8tap); + } +} + +static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve4_4_usdot(s0, filter, perm_tbl); + t1 = convolve4_4_usdot(s1, filter, perm_tbl); + t2 = convolve4_4_usdot(s2, filter, perm_tbl); + t3 = convolve4_4_usdot(s3, filter, perm_tbl); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve4_8_usdot(s0, filter, perm_tbl); + d1 = convolve4_8_usdot(s1, filter, perm_tbl); + d2 = convolve4_8_usdot(s2, filter, perm_tbl); + d3 = convolve4_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + uint8x16_t s0, s1, s2, s3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filter, perm_tbl); + t1 = convolve8_4_usdot(s1, filter, perm_tbl); + t2 = convolve8_4_usdot(s2, filter, perm_tbl); + t3 = convolve8_4_usdot(s3, filter, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t x_filter_4tap = + vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, + h, x_filter_4tap); + + } else { + vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, + h, x_filter_8tap); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +static INLINE void vpx_convolve_4tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + d0 = convolve4_4_usdot_partial(s0123, filter); + d1 = convolve4_4_usdot_partial(s1234, filter); + d2 = convolve4_4_usdot_partial(s2345, filter); + d3 = convolve4_4_usdot_partial(s3456, filter); + /* We halved the filter values so -1 from right shift. */ + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter); + d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter); + d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter); + d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_lo = s78910_lo; + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void vpx_convolve_8tap_vert_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filter); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filter); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filter); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + /* All 4-tap and bilinear filter values are even, so halve them to reduce + * intermediate precision requirements. Also slide the filter values so the + * the 4 taps exist in the first 4 elements of the vector. + */ + const int8x8_t y_filter_4tap = + vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); + vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap); + } else { + vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..2666d4253e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm @@ -0,0 +1,457 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlsl.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from + ; sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..cb5d6d3fe5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm @@ -0,0 +1,455 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlal.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c new file mode 100644 index 0000000000..8e3ee599f4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w < 8) { // avg4 + uint8x8_t s0, s1; + uint8x8_t dd0 = vdup_n_u8(0); + uint32x2x2_t s01; + do { + s0 = vld1_u8(src); + src += src_stride; + s1 = vld1_u8(src); + src += src_stride; + s01 = vzip_u32(vreinterpret_u32_u8(s0), vreinterpret_u32_u8(s1)); + dd0 = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)dst, vreinterpret_u32_u8(dd0), 0)); + dd0 = vreinterpret_u8_u32(vld1_lane_u32( + (const uint32_t *)(dst + dst_stride), vreinterpret_u32_u8(dd0), 1)); + dd0 = vrhadd_u8(vreinterpret_u8_u32(s01.val[0]), dd0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { // avg8 + uint8x8_t s0, s1, d0, d1; + uint8x16_t s01, d01; + do { + s0 = vld1_u8(src); + src += src_stride; + s1 = vld1_u8(src); + src += src_stride; + d0 = vld1_u8(dst); + d1 = vld1_u8(dst + dst_stride); + + s01 = vcombine_u8(s0, s1); + d01 = vcombine_u8(d0, d1); + d01 = vrhaddq_u8(s01, d01); + + vst1_u8(dst, vget_low_u8(d01)); + dst += dst_stride; + vst1_u8(dst, vget_high_u8(d01)); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w < 32) { // avg16 + uint8x16_t s0, s1, d0, d1; + do { + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + dst_stride); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + + vst1q_u8(dst, d0); + dst += dst_stride; + vst1q_u8(dst, d1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 32) { // avg32 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + dst_stride); + d3 = vld1q_u8(dst + dst_stride + 16); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + dst += dst_stride; + vst1q_u8(dst, d2); + vst1q_u8(dst + 16, d3); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else { // avg64 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + 32); + d3 = vld1q_u8(dst + 48); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += dst_stride; + } while (--h); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm new file mode 100644 index 0000000000..efd6574f1f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_convolve_avg_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve_avg_neon| PROC + push {r4-r6, lr} + ldrd r4, r5, [sp, #36] + mov r6, r2 + + cmp r4, #32 + bgt avg64 + beq avg32 + cmp r4, #8 + bgt avg16 + beq avg8 + b avg4 + +avg64 + sub lr, r1, #32 + sub r4, r3, #32 +avg64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + pld [r2, r3] + vld1.8 {q8-q9}, [r6@128]! + vld1.8 {q10-q11}, [r6@128], r4 + vrhadd.u8 q0, q0, q8 + vrhadd.u8 q1, q1, q9 + vrhadd.u8 q2, q2, q10 + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r4 + subs r5, r5, #1 + bgt avg64_h + pop {r4-r6, pc} + +avg32 + vld1.8 {q0-q1}, [r0], r1 + vld1.8 {q2-q3}, [r0], r1 + vld1.8 {q8-q9}, [r6@128], r3 + vld1.8 {q10-q11}, [r6@128], r3 + pld [r0] + vrhadd.u8 q0, q0, q8 + pld [r0, r1] + vrhadd.u8 q1, q1, q9 + pld [r6] + vrhadd.u8 q2, q2, q10 + pld [r6, r3] + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt avg32 + pop {r4-r6, pc} + +avg16 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 + vld1.8 {q2}, [r6@128], r3 + vld1.8 {q3}, [r6@128], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q2 + pld [r6] + pld [r6, r3] + vrhadd.u8 q1, q1, q3 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt avg16 + pop {r4-r6, pc} + +avg8 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r6@64], r3 + vld1.8 {d3}, [r6@64], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q1 + pld [r6] + pld [r6, r3] + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d1}, [r2@64], r3 + subs r5, r5, #2 + bgt avg8 + pop {r4-r6, pc} + +avg4 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[0]}, [r6@32], r3 + vld1.32 {d2[1]}, [r6@32], r3 + vrhadd.u8 d0, d0, d2 + vst1.32 {d0[0]}, [r2@32], r3 + vst1.32 {d0[1]}, [r2@32], r3 + subs r5, r5, #2 + bgt avg4 + pop {r4-r6, pc} + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c new file mode 100644 index 0000000000..bea7c98437 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w < 8) { // copy4 + do { + memcpy(dst, src, 4); + src += src_stride; + dst += dst_stride; + memcpy(dst, src, 4); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { // copy8 + uint8x8_t s0, s1; + do { + s0 = vld1_u8(src); + src += src_stride; + s1 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, s0); + dst += dst_stride; + vst1_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w < 32) { // copy16 + uint8x16_t s0, s1; + do { + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, s0); + dst += dst_stride; + vst1q_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 32) { // copy32 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + dst += dst_stride; + vst1q_u8(dst, s2); + vst1q_u8(dst + 16, s3); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else { // copy64 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + vst1q_u8(dst + 32, s2); + vst1q_u8(dst + 48, s3); + dst += dst_stride; + } while (--h); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm new file mode 100644 index 0000000000..7a66e3ce2f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -0,0 +1,84 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_convolve_copy_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve_copy_neon| PROC + push {r4-r5, lr} + ldrd r4, r5, [sp, #32] + + cmp r4, #32 + bgt copy64 + beq copy32 + cmp r4, #8 + bgt copy16 + beq copy8 + b copy4 + +copy64 + sub lr, r1, #32 + sub r3, r3, #32 +copy64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #1 + bgt copy64_h + pop {r4-r5, pc} + +copy32 + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q2-q3}, [r0], r1 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt copy32 + pop {r4-r5, pc} + +copy16 + pld [r0, r1, lsl #1] + vld1.8 {q0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q1}, [r0], r1 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt copy16 + pop {r4-r5, pc} + +copy8 + pld [r0, r1, lsl #1] + vld1.8 {d0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {d2}, [r0], r1 + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d2}, [r2@64], r3 + subs r5, r5, #2 + bgt copy8 + pop {r4-r5, pc} + +copy4 + ldr r12, [r0], r1 + str r12, [r2], r3 + subs r5, r5, #1 + bgt copy4 + pop {r4-r5, pc} + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c new file mode 100644 index 0000000000..57772ea668 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). + */ + uint8_t temp[64 * 72]; + + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */ + const int intermediate_height = h + vert_filter_taps; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting border_offset lines back. The Neon implementation will + * ignore the given height and filter a multiple of 4 lines. Since this goes + * in to the temp buffer which has lots of extra room and is subsequently + * discarded this is safe if somewhat less than ideal. */ + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp, + w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + + /* Step into the temp buffer border_offset lines to get actual frame data. */ + vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 72]; + const int intermediate_height = h + 8; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c new file mode 100644 index 0000000000..9d754fde17 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. */ + const int intermediate_height = h + vert_filter_taps - 1; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_dotprod( + src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, intermediate_height); + + vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 71]; + + /* Averaging convolution always uses an 8-tap filter. */ + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c new file mode 100644 index 0000000000..d7cbb09ea6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + * and vert_filter_taps / 2 lines post. */ + const int intermediate_height = h + vert_filter_taps - 1; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride, + temp, w, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + + /* Averaging convolution always uses an 8-tap filter. */ + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 0000000000..b8e3c5e540 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], + vreinterpret_u32_u8(d4.val[0]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], + vreinterpret_u32_u8(d4.val[1]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], + vreinterpret_u32_u8(d4.val[2]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], + vreinterpret_u32_u8(d4.val[3]), 0); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + vst1_u8(&dst[x + 0 * dst_stride], d[0]); + vst1_u8(&dst[x + 1 * dst_stride], d[1]); + vst1_u8(&dst[x + 2 * dst_stride], d[2]); + vst1_u8(&dst[x + 3 * dst_stride], d[3]); + vst1_u8(&dst[x + 4 * dst_stride], d[4]); + vst1_u8(&dst[x + 5 * dst_stride], d[5]); + vst1_u8(&dst[x + 6 * dst_stride], d[6]); + vst1_u8(&dst[x + 7 * dst_stride], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/avg.c b/media/libvpx/libvpx/vpx_dsp/avg.c new file mode 100644 index 0000000000..a8dcab7dae --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/avg.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +#if CONFIG_VP9_HIGHBITDEPTH +// src_diff: 13 bit, dynamic range [-4095, 4095] +// coeff: 16 bit +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 16 bit, dynamic range [-32768, 32767] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +// coeff: dynamic range 20 bit. +// length: value range {16, 64, 256, 1024}. +int vpx_highbd_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits + return satd; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int vpx_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +// Integer projection onto row vectors. +// height: value range {16, 32, 64}. +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int idx; + const int norm_factor = height >> 1; + assert(height >= 2); + for (idx = 0; idx < 16; ++idx) { + int i; + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 16320]. + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 510]. + hbuf[idx] /= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64}. +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) { + int idx; + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 16320] + for (idx = 0; idx < width; ++idx) sum += ref[idx]; + return sum; +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4} +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. + var = sse - ((mean * mean) >> (bwl + 2)); + return var; +} + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + int i, j; + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + int i, j; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); + *min = 65535; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.c b/media/libvpx/libvpx/vpx_dsp/bitreader.c new file mode 100644 index 0000000000..90cbbba53f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitreader.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "./vpx_config.h" + +#include "vpx_dsp/bitreader.h" +#include "vpx_dsp/prob.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/endian_inl.h" + +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state) { + if (size && !buffer) { + return 1; + } else { + r->buffer_end = buffer + size; + r->buffer = buffer; + r->value = 0; + r->count = -8; + r->range = 255; + r->decrypt_cb = decrypt_cb; + r->decrypt_state = decrypt_state; + vpx_reader_fill(r); + return vpx_read_bit(r) != 0; // marker bit + } +} + +void vpx_reader_fill(vpx_reader *r) { + const uint8_t *const buffer_end = r->buffer_end; + const uint8_t *buffer = r->buffer; + const uint8_t *buffer_start = buffer; + BD_VALUE value = r->value; + int count = r->count; + const size_t bytes_left = buffer_end - buffer; + const size_t bits_left = bytes_left * CHAR_BIT; + int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); + + if (r->decrypt_cb) { + size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left); + r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); + buffer = r->clear_buffer; + buffer_start = r->clear_buffer; + } + if (bits_left > BD_VALUE_SIZE) { + const int bits = (shift & 0xfffffff8) + CHAR_BIT; + BD_VALUE nv; + BD_VALUE big_endian_values; + memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); +#if SIZE_MAX == 0xffffffffffffffffULL + big_endian_values = HToBE64(big_endian_values); +#else + big_endian_values = HToBE32(big_endian_values); +#endif + nv = big_endian_values >> (BD_VALUE_SIZE - bits); + count += bits; + buffer += (bits >> 3); + value = r->value | (nv << (shift & 0x7)); + } else { + const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); + int loop_end = 0; + if (bits_over >= 0) { + count += LOTS_OF_BITS; + loop_end = bits_over; + } + + if (bits_over < 0 || bits_left) { + while (shift >= loop_end) { + count += CHAR_BIT; + value |= (BD_VALUE)*buffer++ << shift; + shift -= CHAR_BIT; + } + } + } + + // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, + // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than + // assign 'buffer' to 'r->buffer'. + r->buffer += buffer - buffer_start; + r->value = value; + r->count = count; +} + +const uint8_t *vpx_reader_find_end(vpx_reader *r) { + // Find the end of the coded buffer + while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { + r->count -= CHAR_BIT; + r->buffer--; + } + return r->buffer; +} diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.h b/media/libvpx/libvpx/vpx_dsp/bitreader.h new file mode 100644 index 0000000000..a5927ea2ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitreader.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_BITREADER_H_ +#define VPX_VPX_DSP_BITREADER_H_ + +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/mem.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +typedef size_t BD_VALUE; + +#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) + +// This is meant to be a large, positive constant that can still be efficiently +// loaded as an immediate (on platforms like ARM, for example). +// Even relatively modest values like 100 would work fine. +#define LOTS_OF_BITS 0x40000000 + +typedef struct { + // Be careful when reordering this struct, it may impact the cache negatively. + BD_VALUE value; + unsigned int range; + int count; + const uint8_t *buffer_end; + const uint8_t *buffer; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; +} vpx_reader; + +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state); + +void vpx_reader_fill(vpx_reader *r); + +const uint8_t *vpx_reader_find_end(vpx_reader *r); + +static INLINE int vpx_reader_has_error(vpx_reader *r) { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with LOTS_OF_BITS. So when + // count == LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. + return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; +} + +static INLINE int vpx_read(vpx_reader *r, int prob) { + unsigned int bit = 0; + BD_VALUE value; + BD_VALUE bigsplit; + int count; + unsigned int range; + unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; + + if (r->count < 0) vpx_reader_fill(r); + + value = r->value; + count = r->count; + + bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + range = split; + + if (value >= bigsplit) { + range = r->range - split; + value = value - bigsplit; + bit = 1; + } + + { + const unsigned char shift = vpx_norm[(unsigned char)range]; + range <<= shift; + value <<= shift; + count -= shift; + } + r->value = value; + r->count = count; + r->range = range; + +#if CONFIG_BITSTREAM_DEBUG + { + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if ((int)bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } + } +#endif + + return bit; +} + +static INLINE int vpx_read_bit(vpx_reader *r) { + return vpx_read(r, 128); // vpx_prob_half +} + +static INLINE int vpx_read_literal(vpx_reader *r, int bits) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit; + + return literal; +} + +static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, + const vpx_prob *probs) { + vpx_tree_index i = 0; + + while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue; + + return -i; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_BITREADER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c new file mode 100644 index 0000000000..f59f1f7cb9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "./bitreader_buffer.h" + +size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) { + return (rb->bit_offset + 7) >> 3; +} + +int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { + const size_t off = rb->bit_offset; + const size_t p = off >> 3; + const int q = 7 - (int)(off & 0x7); + if (rb->bit_buffer + p < rb->bit_buffer_end) { + const int bit = (rb->bit_buffer[p] >> q) & 1; + rb->bit_offset = off + 1; + return bit; + } else { + if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data); + return 0; + } +} + +int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit; + return value; +} + +int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { + const int value = vpx_rb_read_literal(rb, bits); + return vpx_rb_read_bit(rb) ? -value : value; +} + +int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { + return vpx_rb_read_signed_literal(rb, bits); +} diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h new file mode 100644 index 0000000000..b27703a4db --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_ +#define VPX_VPX_DSP_BITREADER_BUFFER_H_ + +#include + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*vpx_rb_error_handler)(void *data); + +struct vpx_read_bit_buffer { + const uint8_t *bit_buffer; + const uint8_t *bit_buffer_end; + size_t bit_offset; + + void *error_handler_data; + vpx_rb_error_handler error_handler; +}; + +size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb); + +int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb); + +int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits); + +int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits); + +int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_BITREADER_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.c b/media/libvpx/libvpx/vpx_dsp/bitwriter.c new file mode 100644 index 0000000000..5b41aa54dd --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./bitwriter.h" + +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + +void vpx_start_encode(vpx_writer *br, uint8_t *source) { + br->lowvalue = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; + vpx_write_bit(br, 0); +} + +void vpx_stop_encode(vpx_writer *br) { + int i; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif + for (i = 0; i < 32; i++) vpx_write_bit(br, 0); + + // Ensure there's no ambigous collision with any index marker bytes + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif +} diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.h b/media/libvpx/libvpx/vpx_dsp/bitwriter.h new file mode 100644 index 0000000000..5f1ee69ec2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_BITWRITER_H_ +#define VPX_VPX_DSP_BITWRITER_H_ + +#include + +#include "vpx_ports/compiler_attributes.h" +#include "vpx_ports/mem.h" + +#include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vpx_writer { + unsigned int lowvalue; + unsigned int range; + int count; + unsigned int pos; + uint8_t *buffer; +} vpx_writer; + +void vpx_start_encode(vpx_writer *br, uint8_t *source); +void vpx_stop_encode(vpx_writer *br); + +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { + unsigned int split; + int count = br->count; + unsigned int range = br->range; + unsigned int lowvalue = br->lowvalue; + int shift; + +#if CONFIG_BITSTREAM_DEBUG + /* + int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + assert(0); + } + */ + bitstream_queue_push(bit, probability); +#endif + + split = 1 + (((range - 1) * probability) >> 8); + + range = split; + + if (bit) { + lowvalue += split; + range = br->range - split; + } + + shift = vpx_norm[range]; + + range <<= shift; + count += shift; + + if (count >= 0) { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = br->pos - 1; + + while (x >= 0 && br->buffer[x] == 0xff) { + br->buffer[x] = 0; + x--; + } + + br->buffer[x] += 1; + } + + br->buffer[br->pos++] = (lowvalue >> (24 - offset)) & 0xff; + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8; + } + + lowvalue <<= shift; + br->count = count; + br->lowvalue = lowvalue; + br->range = range; +} + +static INLINE void vpx_write_bit(vpx_writer *w, int bit) { + vpx_write(w, bit, 128); // vpx_prob_half +} + +static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { + int bit; + + for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit)); +} + +#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_BITWRITER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c new file mode 100644 index 0000000000..7a7e96f02e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./bitwriter_buffer.h" + +size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) { + return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); +} + +void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if (q == CHAR_BIT - 1) { + wb->bit_buffer[p] = bit << q; + } else { + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + } + wb->bit_offset = off + 1; +} + +void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1); +} + +void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, + int bits) { + vpx_wb_write_literal(wb, abs(data), bits); + vpx_wb_write_bit(wb, data < 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h new file mode 100644 index 0000000000..3662cb64df --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_ +#define VPX_VPX_DSP_BITWRITER_BUFFER_H_ + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct vpx_write_bit_buffer { + uint8_t *bit_buffer; + size_t bit_offset; +}; + +size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb); + +void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit); + +void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits); + +void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, + int bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_BITWRITER_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/deblock.c b/media/libvpx/libvpx/vpx_dsp/deblock.c new file mode 100644 index 0000000000..455b73bbce --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/deblock.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +const int16_t vpx_rv[] = { + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14, + 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, + 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, + 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, + 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0, + 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5, + 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7, + 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, + 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, + 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, + 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6, + 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9, + 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, + 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, + 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, + 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, + 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, + 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, + 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, + 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, + 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13, + 9, 10, 13, +}; + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, + unsigned char *dst, int src_pitch, + int dst_pitch, int cols, + unsigned char *flimits, int size) { + unsigned char *p_src, *p_dst; + int row; + int col; + unsigned char v; + unsigned char d[4]; + + assert(size >= 8); + assert(cols >= 8); + + for (row = 0; row < size; row++) { + /* post_proc_down for one row */ + p_src = src; + p_dst = dst; + + for (col = 0; col < cols; col++) { + unsigned char p_above2 = p_src[col - 2 * src_pitch]; + unsigned char p_above1 = p_src[col - src_pitch]; + unsigned char p_below1 = p_src[col + src_pitch]; + unsigned char p_below2 = p_src[col + 2 * src_pitch]; + + v = p_src[col]; + + if ((abs(v - p_above2) < flimits[col]) && + (abs(v - p_above1) < flimits[col]) && + (abs(v - p_below1) < flimits[col]) && + (abs(v - p_below2) < flimits[col])) { + unsigned char k1, k2, k3; + k1 = (p_above2 + p_above1 + 1) >> 1; + k2 = (p_below2 + p_below1 + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; + } + + p_dst[col] = v; + } + + /* now post_proc_across */ + p_src = dst; + p_dst = dst; + + p_src[-2] = p_src[-1] = p_src[0]; + p_src[cols] = p_src[cols + 1] = p_src[cols - 1]; + + for (col = 0; col < cols; col++) { + v = p_src[col]; + + if ((abs(v - p_src[col - 2]) < flimits[col]) && + (abs(v - p_src[col - 1]) < flimits[col]) && + (abs(v - p_src[col + 1]) < flimits[col]) && + (abs(v - p_src[col + 2]) < flimits[col])) { + unsigned char k1, k2, k3; + k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; + k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; + k3 = (k1 + k2 + 1) >> 1; + v = (k3 + v + 1) >> 1; + } + + d[col & 3] = v; + + if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3]; + } + + /* handle the last two pixels */ + p_dst[col - 2] = d[(col - 2) & 3]; + p_dst[col - 1] = d[(col - 1) & 3]; + + /* next row */ + src += src_pitch; + dst += dst_pitch; + } +} + +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int r, c, i; + + unsigned char *s = src; + unsigned char d[16]; + + for (r = 0; r < rows; r++) { + int sumsq = 16; + int sum = 0; + + for (i = -8; i < 0; i++) s[i] = s[0]; + + /* 17 avoids valgrind warning - we buffer values in c in d + * and only write them when we've read 8 ahead... + */ + for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1]; + + for (i = -8; i <= 6; i++) { + sumsq += s[i] * s[i]; + sum += s[i]; + d[i + 8] = 0; + } + + for (c = 0; c < cols + 8; c++) { + int x = s[c + 7] - s[c - 8]; + int y = s[c + 7] + s[c - 8]; + + sum += x; + sumsq += x * y; + + d[c & 15] = s[c]; + + if (sumsq * 15 - sum * sum < flimit) { + d[c & 15] = (8 + sum + s[c]) >> 4; + } + + s[c - 8] = d[(c - 8) & 15]; + } + + s += pitch; + } +} + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, + int flimit) { + int r, c, i; + + for (c = 0; c < cols; c++) { + unsigned char *s = &dst[c]; + int sumsq = 0; + int sum = 0; + unsigned char d[16]; + + for (i = -8; i < 0; i++) s[i * pitch] = s[0]; + + /* 17 avoids valgrind warning - we buffer values in c in d + * and only write them when we've read 8 ahead... + */ + for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch]; + + for (i = -8; i <= 6; i++) { + sumsq += s[i * pitch] * s[i * pitch]; + sum += s[i * pitch]; + } + + for (r = 0; r < rows + 8; r++) { + sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch]; + sum += s[7 * pitch] - s[-8 * pitch]; + d[r & 15] = s[0]; + + if (sumsq * 15 - sum * sum < flimit) { + d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4; + } + if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; + s += pitch; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/fastssim.c b/media/libvpx/libvpx/vpx_dsp/fastssim.c new file mode 100644 index 0000000000..4d32a02a55 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/fastssim.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include +#include +#include +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/system_state.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#if CONFIG_VP9_HIGHBITDEPTH +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#endif +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + if (!data) return -1; + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; + return 0; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = + (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]); + dst2[j * w + i] = + (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]); + } + } +} + +static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t bd, + uint32_t shift) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (bd == 8 && shift == 0) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + int64_t mux; + int64_t muy; + int i0; + int i1; + mux = (int64_t)5 * col_sums_x[0]; + muy = (int64_t)5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += (int)col_sums_x[i1] - (int)col_sums_x[i0]; + muy += (int)col_sums_x[i1] - (int)col_sums_x[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) + col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]); + for (i = 0; i < w; i++) + col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]); + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + int64_t g1; + int64_t g2; + int64_t gx; + int64_t gy; + g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]); + g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]); + g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]); + gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2)); + gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx; + gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. + We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, + _shift); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + vpx_clear_system_state(); + assert(bd >= in_bd); + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c new file mode 100644 index 0000000000..ef66de0247 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/fwd_txfm.h" + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + tran_low_t intermediate[4 * 4]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t in_high[4]; // canbe16 + tran_high_t step[4]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + out[0] = (tran_low_t)fdct_round_shift(temp1); + out[2] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[3] = (tran_low_t)fdct_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + ++input; + out += 4; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + { + int i, j; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } + } +} + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) sum += input[r * stride + c]; + + output[0] = sum * 2; +} + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *out = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + out[0] = (tran_low_t)fdct_round_shift(t0); + out[2] = (tran_low_t)fdct_round_shift(t2); + out[4] = (tran_low_t)fdct_round_shift(t1); + out[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[1] = (tran_low_t)fdct_round_shift(t0); + out[3] = (tran_low_t)fdct_round_shift(t2); + out[5] = (tran_low_t)fdct_round_shift(t1); + out[7] = (tran_low_t)fdct_round_shift(t3); + out += 8; + } + in = intermediate; + out = output; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) output[j + i * 8] /= 2; + } +} + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) sum += input[r * stride + c]; + + output[0] = sum; +} + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + tran_low_t intermediate[256]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 16; i++) { + if (0 == pass) { + // Calculate input for the first 8 results. + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; + // Calculate input for the next 8 results. + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; + } else { + // Calculate input for the first 8 results. + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); + // Calculate input for the next 8 results. + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; + } + // Work on the first eight values; fdct8(input, even_results); + { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + // stage 1 + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = (tran_low_t)fdct_round_shift(t0); + out[4] = (tran_low_t)fdct_round_shift(t2); + out[8] = (tran_low_t)fdct_round_shift(t1); + out[12] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = (tran_low_t)fdct_round_shift(t0); + out[6] = (tran_low_t)fdct_round_shift(t2); + out[10] = (tran_low_t)fdct_round_shift(t1); + out[14] = (tran_low_t)fdct_round_shift(t3); + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = fdct_round_shift(temp1); + step2[3] = fdct_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = fdct_round_shift(temp1); + step2[5] = fdct_round_shift(temp2); + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + step2[1] = fdct_round_shift(temp1); + step2[2] = fdct_round_shift(temp2); + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = fdct_round_shift(temp1); + step2[6] = fdct_round_shift(temp2); + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[9] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = (tran_low_t)fdct_round_shift(temp1); + out[13] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = (tran_low_t)fdct_round_shift(temp1); + out[11] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = (tran_low_t)fdct_round_shift(temp1); + out[15] = (tran_low_t)fdct_round_shift(temp2); + } + // Do next column (which is a transposed row in second/horizontal pass) + input++; + out += 16; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } +} + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 1); +} + +static INLINE tran_high_t dct_32_round(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert, + // and make the bounds consts. + // assert(-131072 <= rv && rv <= 131071); + return rv; +} + +static INLINE tran_high_t half_round_shift(tran_high_t input) { + tran_high_t rv = (input + 1 + (input < 0)) >> 2; + return rv; +} + +void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { + tran_high_t step[32]; + // Stage 1 + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; + + // Stage 2 + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. + if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + + // Stage 3 + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = -output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; + + // Stage 4 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; + + // Stage 5 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; + + // Stage 6 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; + + // Stage 7 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); + step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); + + step[16] = output[16] + output[17]; + step[17] = -output[17] + output[16]; + step[18] = -output[18] + output[19]; + step[19] = output[19] + output[18]; + step[20] = output[20] + output[21]; + step[21] = -output[21] + output[20]; + step[22] = -output[22] + output[23]; + step[23] = output[23] + output[22]; + step[24] = output[24] + output[25]; + step[25] = -output[25] + output[24]; + step[26] = -output[26] + output[27]; + step[27] = output[27] + output[26]; + step[28] = output[28] + output[29]; + step[29] = -output[29] + output[28]; + step[30] = -output[30] + output[31]; + step[31] = output[31] + output[30]; + + // Final stage --- outputs indices are bit-reversed. + output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); +} + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) { + int i, j; + tran_high_t out[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + output[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} + +// Note that although we use dct_32_round in dct32 computation flow, +// this 2d fdct32x32 for rate-distortion optimization loop is operating +// within 16 bits precision. +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { + int i, j; + tran_high_t out[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; + vpx_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j]; + } +} + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 3); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct4x4_c(input, output, stride); +} + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct8x8_c(input, output, stride); +} + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct8x8_1_c(input, output, stride); +} + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct16x16_1_c(input, output, stride); +} + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_1_c(input, output, stride); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h new file mode 100644 index 0000000000..a43c8ea7f7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_FWD_TXFM_H_ +#define VPX_VPX_DSP_FWD_TXFM_H_ + +#include "vpx_dsp/txfm_common.h" + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert + // and make the bounds consts. + // assert(INT16_MIN <= rv && rv <= INT16_MAX); + return rv; +} + +void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round); +#endif // VPX_VPX_DSP_FWD_TXFM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/intrapred.c b/media/libvpx/libvpx/vpx_dsp/intrapred.c new file mode 100644 index 0000000000..400e632e98 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/intrapred.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" + +#define DST(x, y) dst[(x) + (y)*stride] +#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) +#define AVG2(a, b) (((a) + (b) + 1) >> 1) + +static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)above; + // first column + for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // second column + for (r = 0; r < bs - 2; ++r) + dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); + dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // rest of last row + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; + + for (r = bs - 2; r >= 0; --r) + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; +} + +static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int size; + (void)left; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size); + memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); + memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + } +} + +static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + const uint8_t above_right = above[bs - 1]; + const uint8_t *const dst_row0 = dst; + int x, size; + (void)left; + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size); + memset(dst + size, above_right, x + 1); + dst += stride; + } +} + +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[69]; +#else + uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif + + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs); + } +} + +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)left; + + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs); + dst += stride; + } +} + +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + + for (r = 0; r < bs; r++) { + memset(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; + } +} + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + memset(dst, 128, bs); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int H = above[-1]; + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + + memset(dst + stride * 0, AVG3(H, I, J), 4); + memset(dst + stride * 1, AVG3(I, J, K), 4); + memset(dst + stride * 2, AVG3(J, K, L), 4); + memset(dst + stride * 3, AVG3(K, L, L), 4); +} + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int H = above[-1]; + const int I = above[0]; + const int J = above[1]; + const int K = above[2]; + const int L = above[3]; + const int M = above[4]; + (void)left; + + dst[0] = AVG3(H, I, J); + dst[1] = AVG3(I, J, K); + dst[2] = AVG3(J, K, L); + dst[3] = AVG3(K, L, M); + memcpy(dst + stride * 1, dst, 4); + memcpy(dst + stride * 2, dst, 4); + memcpy(dst + stride * 3, dst, 4); +} + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG3(E, F, G); + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(F, G, H); +} + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); +} + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)above; + (void)bd; + + // First column. + for (r = 0; r < bs - 1; ++r) { + dst[r * stride] = AVG2(left[r], left[r + 1]); + } + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Second column. + for (r = 0; r < bs - 2; ++r) { + dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); + } + dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Rest of last row. + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; + + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + } +} + +static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int size; + (void)left; + (void)bd; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + } +} + +static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; + (void)left; + (void)bd; + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); + dst += stride; + } +} + +static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint16_t border[69]; +#else + uint16_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif + (void)bd; + + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0])); + } +} + +static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bs; r++) { + vpx_memset16(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + (void)bd; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + (void)bd; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. +#define intra_pred_sized(type, size) \ + void vpx_##type##_predictor_##size##x##size##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, size, above, left); \ + } + +#if CONFIG_VP9_HIGHBITDEPTH +#define intra_pred_highbd_sized(type, size) \ + void vpx_highbd_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, size, above, left, bd); \ + } + +/* clang-format off */ +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) + +#define intra_pred_no_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) + +#else +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) + +#define intra_pred_no_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) +#endif // CONFIG_VP9_HIGHBITDEPTH + +intra_pred_no_4x4(d207) +intra_pred_no_4x4(d63) +intra_pred_no_4x4(d45) +intra_pred_no_4x4(d117) +intra_pred_no_4x4(d135) +intra_pred_no_4x4(d153) +intra_pred_allsizes(v) +intra_pred_allsizes(h) +intra_pred_allsizes(tm) +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_allsizes(dc) +/* clang-format on */ +#undef intra_pred_allsizes diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c new file mode 100644 index 0000000000..97655b3a9e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c @@ -0,0 +1,2701 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); + + ip++; + dest++; + } +} + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = input; + tran_low_t *op = tmp; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); + ip++; + dest++; + } +} + +void iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + // 32-bit result is enough for the following multiplications. + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + +void idct4_c(const tran_low_t *input, tran_low_t *output) { + int16_t step[4]; + tran_high_t temp1, temp2; + + // stage 1 + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += stride; + } +} + +void iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + + // stage 3 + s2 = (int)(cospi_16_64 * (x2 + x3)); + s3 = (int)(cospi_16_64 * (x2 - x3)); + s6 = (int)(cospi_16_64 * (x6 + x7)); + s7 = (int)(cospi_16_64 * (x6 - x7)); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); +} + +void idct8_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + +void idct16_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]); + output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]); + output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]); + output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]); + output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]); + output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]); + output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]); + output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]); + output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]); + output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]); + output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]); + output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]); + output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]); + output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]); + output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]); + output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]); +} + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void idct32_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[32], step2[32]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; + + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); +} + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) + idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse +// transform amplify bits + 1 bit for contingency in rounding and quantizing +#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25) + +static INLINE int detect_invalid_highbd_input(const tran_low_t *input, + int size) { + int i; + for (i = 0; i < size; ++i) + if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1; + return 0; +} + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = + highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = + highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = + highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = + highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); + + ip++; + dest++; + } +} + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = input; + tran_low_t *op = tmp; + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); + ip++; + dest++; + } +} + +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); +} + +void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + // stage 1 + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); +} + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += stride; + } +} + +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void)bd; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + +void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = + input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 & stage 3 - even half + vpx_highbd_idct4_c(step1, step1, bd); + + // stage 2 - odd half + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + // stage 4 + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); +} + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; + (void)bd; + + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); + + // stage 4 + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); +} + +void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + (void)bd; + + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); + return; + } + + // stage 1 + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + // stage 4 + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); +} + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, + int bd) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + (void)bd; + + if (detect_invalid_highbd_input(input, 32)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 32); + return; + } + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); + step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); + step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); + step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); + step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); + step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); + step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); + step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); + step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); + step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); + step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); + step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); + step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); + step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); + step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); + step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); + + // stage 7 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); + output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); + output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); + output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); + output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); + output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); + output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); + output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); + output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); + output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); + output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); + output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); + output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); + output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); + output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); + output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); + output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); + output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); + output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); + output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); + output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); + output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); + output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); + output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); + output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); + output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); + output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); + output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); + output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); +} + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) + highbd_idct32_c(input, outptr, bd); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + int a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h new file mode 100644 index 0000000000..6eedbeac35 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_INV_TXFM_H_ +#define VPX_VPX_DSP_INV_TXFM_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE tran_high_t check_range(tran_high_t input) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. However, strictly checking + // this range for every intermediate coefficient can burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking. + assert(INT16_MIN <= input); + assert(input <= INT16_MAX); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return input; +} + +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return (tran_high_t)rv; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void)int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void)bd; + return input; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_EMULATE_HARDWARE +// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a +// non-normative method to handle overflows. A stream that causes +// overflows in the inverse transform is considered invalid in VP9, +// and a hardware implementer is free to choose any reasonable +// method to handle overflows. However to aid in hardware +// verification they can use a specific implementation of the +// WRAPLOW() macro below that is identical to their intended +// hardware implementation (and also use configure options to trigger +// the C-implementation of the transform). +// +// The particular WRAPLOW implementation below performs strict +// overflow wrapping to match common hardware implementations. +// bd of 8 uses trans_low with 16bits, need to remove 16bits +// bd of 10 uses trans_low with 18bits, need to remove 14bits +// bd of 12 uses trans_low with 20bits, need to remove 12bits +// bd of x uses trans_low with 8+x bits, need to remove 24-x bits +#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#else // CONFIG_EMULATE_HARDWARE + +#define WRAPLOW(x) ((int32_t)check_range(x)) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_EMULATE_HARDWARE + +void idct4_c(const tran_low_t *input, tran_low_t *output); +void idct8_c(const tran_low_t *input, tran_low_t *output); +void idct16_c(const tran_low_t *input, tran_low_t *output); +void idct32_c(const tran_low_t *input, tran_low_t *output); +void iadst4_c(const tran_low_t *input, tran_low_t *output); +void iadst8_c(const tran_low_t *input, tran_low_t *output); +void iadst16_c(const tran_low_t *input, tran_low_t *output); + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); + +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + (int)trans, bd); +} +#endif + +static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { + trans = WRAPLOW(trans); + return clip_pixel(dest + (int)trans); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_INV_TXFM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c new file mode 100644 index 0000000000..750c9de29f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h" + +void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ptrdiff_t src_stride2 = src_stride << 1; + ptrdiff_t src_stride3 = src_stride2 + src_stride; + ptrdiff_t src_stride4 = src_stride2 << 1; + ptrdiff_t src_stride6 = src_stride3 << 1; + + int16_t *src_tmp = (int16_t *)src; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2); + src3 = __lsx_vldx(src_tmp, src_stride6); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6); + src7 = __lsx_vldx(src_tmp, src_stride6); + + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + store_tran_low(tmp0, dst, 0); + store_tran_low(tmp1, dst, 8); + store_tran_low(tmp2, dst, 16); + store_tran_low(tmp3, dst, 24); + store_tran_low(tmp4, dst, 32); + store_tran_low(tmp5, dst, 40); + store_tran_low(tmp6, dst, 48); + store_tran_low(tmp7, dst, 56); +} + +void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + int i; + __m128i a0, a1, a2, a3, b0, b1, b2, b3; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0); + /* Top right. */ + vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64); + /* Bottom left. */ + vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128); + /* Bottom right. */ + vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192); + + for (i = 0; i < 64; i += 8) { + a0 = load_tran_low(dst); + a1 = load_tran_low(dst + 64); + a2 = load_tran_low(dst + 128); + a3 = load_tran_low(dst + 192); + + LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1); + DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3); + LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2); + + store_tran_low(a0, dst, 0); + store_tran_low(a1, dst, 64); + store_tran_low(a2, dst, 128); + store_tran_low(a3, dst, 192); + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c new file mode 100644 index 0000000000..482626080a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_util/loongson_intrinsics.h" + +void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + // width > 8 || width == 8 || width == 4 + if (width > 8) { + int i, j; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + __m128i p, r, avg; + + p = __lsx_vld(pred + j, 0); + r = __lsx_vld(ref + j, 0); + avg = __lsx_vavgr_bu(p, r); + __lsx_vst(avg, comp_pred + j, 0); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + int i = height * width; + do { + __m128i p, r, r_0, r_1; + + p = __lsx_vld(pred, 0); + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r = __lsx_vilvl_d(r_1, r_0); + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + + pred += 16; + comp_pred += 16; + i -= 16; + } while (i); + } else { // width = 4 + int i = height * width; + assert(width == 4); + do { + __m128i p, r, r_0, r_1, r_2, r_3; + p = __lsx_vld(pred, 0); + + if (width == ref_stride) { + r = __lsx_vld(ref, 0); + ref += 16; + } else { + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r_2 = __lsx_vld(ref, 0); + ref += ref_stride; + r_3 = __lsx_vld(ref, 0); + ref += ref_stride; + DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2); + r = __lsx_vilvl_d(r_2, r_0); + } + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + comp_pred += 16; + pred += 16; + i -= 16; + } while (i); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h new file mode 100644 index 0000000000..b0db1e99c5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i load_tran_low(const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); +#else + return __lsx_vld(s, 0); +#endif +} + +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} + +#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c new file mode 100644 index 0000000000..9bb3877212 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -0,0 +1,1176 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#include "vpx_dsp/fwd_txfm.h" + +#define UNPCK_SH_SW(in, out0, out1) \ + do { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } while (0) + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i step0, step1, step2, step3; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + __m128i step0_1, step1_1, step2_1, step3_1; + + int32_t stride = src_stride << 1; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + const int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp = input + (src_stride * 24); + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 0); + __lsx_vst(step1, temp_buff, 16); + __lsx_vst(step2, temp_buff, 32); + __lsx_vst(step3, temp_buff, 48); + + __lsx_vst(in4, temp_buff, 448); + __lsx_vst(in5, temp_buff, 464); + __lsx_vst(in6, temp_buff, 480); + __lsx_vst(in7, temp_buff, 496); + + __lsx_vst(step0_1, temp_buff, 64); + __lsx_vst(step1_1, temp_buff, 80); + __lsx_vst(step2_1, temp_buff, 96); + __lsx_vst(step3_1, temp_buff, 112); + + __lsx_vst(in4_1, temp_buff, 384); + __lsx_vst(in5_1, temp_buff, 400); + __lsx_vst(in6_1, temp_buff, 416); + __lsx_vst(in7_1, temp_buff, 432); + + /* 3rd and 4th set */ + input_tmp = input + (src_stride * 8); + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 128); + __lsx_vst(step1, temp_buff, 144); + __lsx_vst(step2, temp_buff, 160); + __lsx_vst(step3, temp_buff, 176); + + __lsx_vst(in4, temp_buff, 320); + __lsx_vst(in5, temp_buff, 336); + __lsx_vst(in6, temp_buff, 352); + __lsx_vst(in7, temp_buff, 368); + + __lsx_vst(step0_1, temp_buff, 192); + __lsx_vst(step1_1, temp_buff, 208); + __lsx_vst(step2_1, temp_buff, 224); + __lsx_vst(step3_1, temp_buff, 240); + + __lsx_vst(in4_1, temp_buff, 256); + __lsx_vst(in5_1, temp_buff, 272); + __lsx_vst(in6_1, temp_buff, 288); + __lsx_vst(in7_1, temp_buff, 304); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i temp0, temp1; + + /* fdct even */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, + vec2, vec3, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, + vec7, in8, in9, in10, in11); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 0); + __lsx_vst(temp1, temp, 1024); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 512); + __lsx_vst(temp1, temp, 1536); + + DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, + vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 256); + __lsx_vst(temp1, temp, 1792); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1280); + __lsx_vst(temp1, temp, 768); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 128); + __lsx_vst(temp1, temp, 1920); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1152); + __lsx_vst(temp1, temp, 896); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 640); + __lsx_vst(temp1, temp, 1408); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 384); + __lsx_vst(temp1, temp, 1664); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21, + in26, in27); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19, + in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, input, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, input, 80); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, input, 160); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, input, 176); + + in21 = __lsx_vadd_h(in18, in21); + in20 = __lsx_vadd_h(in19, in20); + in27 = __lsx_vadd_h(in28, in27); + in26 = __lsx_vadd_h(in29, in26); + + DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22, + in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17, + in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, input, 32); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, input, 48); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, input, 192); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, input, 208); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 0); + __lsx_vst(vec4, temp_ptr, 1920); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 896); + __lsx_vst(vec4, temp_ptr, 1024); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 1408); + __lsx_vst(vec5, temp_ptr, 512); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 384); + __lsx_vst(vec5, temp_ptr, 1536); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23, + in20, in21); + DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26, + in27, in24, in25); + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1664); + __lsx_vst(vec4, temp_ptr, 256); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 640); + __lsx_vst(vec4, temp_ptr, 1280); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1152); + __lsx_vst(vec4, temp_ptr, 768); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 128); + __lsx_vst(vec4, temp_ptr, 1792); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i step0, step1, step2, step3, step4, step5, step6, step7; + + DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff, + 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384, + temp_buff, 448, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff, + 240, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432, + temp_buff, 496, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 0); + __lsx_vst(step1, output, 16); + __lsx_vst(step2, output, 32); + __lsx_vst(step3, output, 48); + __lsx_vst(step4, output, 64); + __lsx_vst(step5, output, 80); + __lsx_vst(step6, output, 96); + __lsx_vst(step7, output, 112); + + __lsx_vst(in8, output, 384); + __lsx_vst(in9, output, 400); + __lsx_vst(in10, output, 416); + __lsx_vst(in11, output, 432); + __lsx_vst(in12, output, 448); + __lsx_vst(in13, output, 464); + __lsx_vst(in14, output, 480); + __lsx_vst(in15, output, 496); + + /* 2nd set */ + DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff, + 208, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400, + temp_buff, 464, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff, + 224, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416, + temp_buff, 480, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 128); + __lsx_vst(step1, output, 144); + __lsx_vst(step2, output, 160); + __lsx_vst(step3, output, 176); + __lsx_vst(step4, output, 192); + __lsx_vst(step5, output, 208); + __lsx_vst(step6, output, 224); + __lsx_vst(step7, output, 240); + + __lsx_vst(in8, output, 256); + __lsx_vst(in9, output, 272); + __lsx_vst(in10, output, 288); + __lsx_vst(in11, output, 304); + __lsx_vst(in12, output, 320); + __lsx_vst(in13, output, 336); + __lsx_vst(in14, output, 352); + __lsx_vst(in15, output, 368); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + __lsx_vst(vec0, interm_ptr, 0); + __lsx_vst(vec1, interm_ptr, 16); + __lsx_vst(vec2, interm_ptr, 32); + __lsx_vst(vec3, interm_ptr, 48); + __lsx_vst(vec4, interm_ptr, 64); + __lsx_vst(vec5, interm_ptr, 80); + __lsx_vst(vec6, interm_ptr, 96); + __lsx_vst(vec7, interm_ptr, 112); + + __lsx_vst(in8, interm_ptr, 128); + __lsx_vst(in9, interm_ptr, 144); + __lsx_vst(in10, interm_ptr, 160); + __lsx_vst(in11, interm_ptr, 176); + __lsx_vst(in12, interm_ptr, 192); + __lsx_vst(in13, interm_ptr, 208); + __lsx_vst(in14, interm_ptr, 224); + __lsx_vst(in15, interm_ptr, 240); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, + vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); + LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, + vec5_r); + DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, + vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = __lsx_vadd_w(vec0_r, vec3_r); + vec0_r = __lsx_vsub_w(vec0_r, vec3_r); + vec3_r = __lsx_vadd_w(vec1_r, vec2_r); + vec1_r = __lsx_vsub_w(vec1_r, vec2_r); + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 16); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 32); + __lsx_vst(vec4, out, 48); + + DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32, + interm_ptr, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96, + interm_ptr, 112, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 64); + __lsx_vst(in5, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 80); + __lsx_vst(in5, out, 96); + + DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160, + interm_ptr, 176, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224, + interm_ptr, 240, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 128); + __lsx_vst(in5, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 144); + __lsx_vst(in5, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + tmp0_w = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 160); + __lsx_vst(in5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 192); + __lsx_vst(in5, out, 176); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = __lsx_vld(temp, 0); + in4 = __lsx_vld(temp, 64); + in2 = __lsx_vld(temp, 128); + in6 = __lsx_vld(temp, 192); + in1 = __lsx_vld(temp, 256); + in7 = __lsx_vld(temp, 304); + in3 = __lsx_vld(temp, 384); + in5 = __lsx_vld(temp, 432); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = __lsx_vld(temp, 32); + in1_1 = __lsx_vld(temp, 464); + in2_1 = __lsx_vld(temp, 160); + in3_1 = __lsx_vld(temp, 336); + in4_1 = __lsx_vld(temp, 96); + in5_1 = __lsx_vld(temp, 352); + in6_1 = __lsx_vld(temp, 224); + in7_1 = __lsx_vld(temp, 480); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 64); + __lsx_vst(in2, output, 128); + __lsx_vst(in3, output, 192); + __lsx_vst(in4, output, 256); + __lsx_vst(in5, output, 320); + __lsx_vst(in6, output, 384); + __lsx_vst(in7, output, 448); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = __lsx_vld(temp, 16); + in1 = __lsx_vld(temp, 272); + in2 = __lsx_vld(temp, 144); + in3 = __lsx_vld(temp, 400); + in4 = __lsx_vld(temp, 80); + in5 = __lsx_vld(temp, 416); + in6 = __lsx_vld(temp, 208); + in7 = __lsx_vld(temp, 288); + + __lsx_vst(in0_1, output, 16); + __lsx_vst(in1_1, output, 80); + __lsx_vst(in2_1, output, 144); + __lsx_vst(in3_1, output, 208); + __lsx_vst(in4_1, output, 272); + __lsx_vst(in5_1, output, 336); + __lsx_vst(in6_1, output, 400); + __lsx_vst(in7_1, output, 464); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + __lsx_vst(in0, output, 32); + __lsx_vst(in1, output, 96); + __lsx_vst(in2, output, 160); + __lsx_vst(in3, output, 224); + __lsx_vst(in4, output, 288); + __lsx_vst(in5, output, 352); + __lsx_vst(in6, output, 416); + __lsx_vst(in7, output, 480); + + /* 4th set */ + in0_1 = __lsx_vld(temp, 48); + in1_1 = __lsx_vld(temp, 448); + in2_1 = __lsx_vld(temp, 176); + in3_1 = __lsx_vld(temp, 320); + in4_1 = __lsx_vld(temp, 112); + in5_1 = __lsx_vld(temp, 368); + in6_1 = __lsx_vld(temp, 240); + in7_1 = __lsx_vld(temp, 496); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + __lsx_vst(in0_1, output, 48); + __lsx_vst(in1_1, output, 112); + __lsx_vst(in2_1, output, 176); + __lsx_vst(in3_1, output, 240); + __lsx_vst(in4_1, output, 304); + __lsx_vst(in5_1, output, 368); + __lsx_vst(in6_1, output, 432); + __lsx_vst(in7_1, output, 496); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + + temp0 = __lsx_vadd_h(in0, in3); + in0 = __lsx_vsub_h(in0, in3); + in3 = __lsx_vadd_h(in1, in2); + in1 = __lsx_vsub_h(in1, in2); + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31; + __m128i vec4, vec5, tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = __lsx_vadd_h(in28, in29); + in19 = __lsx_vadd_h(in31, in30); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c new file mode 100644 index 0000000000..508532b9d8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + do { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } while (0) + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; + __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; + __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; + __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int32_t src_stride8 = src_stride4 << 1; + int16_t *input_tmp = (int16_t *)input; + in0 = __lsx_vld(input_tmp, 0); + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, + in12); + input_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, + in14); + input_tmp += src_stride2; + in15 = __lsx_vldx(input_tmp, src_stride2); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, + tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __lsx_vst(tmp0, tmp_ptr, 0); + __lsx_vst(tmp1, tmp_ptr, 64); + __lsx_vst(tmp2, tmp_ptr, 128); + __lsx_vst(tmp3, tmp_ptr, 192); + __lsx_vst(tmp4, tmp_ptr, 256); + __lsx_vst(tmp5, tmp_ptr, 320); + __lsx_vst(tmp6, tmp_ptr, 384); + __lsx_vst(tmp7, tmp_ptr, 448); + DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, + in14, in13, in12); + DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, + in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); + + cnst4 = __lsx_vreplvei_h(coeff, 0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25); + + cnst5 = __lsx_vreplvei_h(coeff, 1); + cnst5 = __lsx_vpackev_h(cnst5, cnst4); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); + + /* stp2 */ + LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); + + cnst0 = __lsx_vreplvei_h(coeff, 4); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); + + LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + vec1 = __lsx_vilvl_h(in15, in8); + vec0 = __lsx_vilvh_h(in15, in8); + + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 0); + + cnst0 = __lsx_vreplvei_h(coeff2, 0); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 448); + + vec1 = __lsx_vilvl_h(in14, in9); + vec0 = __lsx_vilvh_h(in14, in9); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 256); + + cnst1 = __lsx_vreplvei_h(coeff2, 2); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 192); + + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); + + cnst1 = __lsx_vreplvei_h(coeff, 3); + cnst1 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); + + /* stp4 */ + DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); + + vec1 = __lsx_vilvl_h(in13, in10); + vec0 = __lsx_vilvh_h(in13, in10); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 128); + + cnst0 = __lsx_vreplvei_h(coeff2, 1); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 320); + + DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); + vec1 = __lsx_vilvl_h(in12, in11); + vec0 = __lsx_vilvh_h(in12, in11); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 384); + + cnst1 = __lsx_vreplvei_h(coeff2, 3); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 64); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + int16_t *input_tmp = input; + + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp, + 112, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, + input_tmp, 240, in12, in13, in14, in15); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, + in14, in15); + + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, + in15); + __lsx_vst(in8, input, 0); + __lsx_vst(in9, input, 32); + __lsx_vst(in10, input, 64); + __lsx_vst(in11, input, 96); + __lsx_vst(in12, input, 128); + __lsx_vst(in13, input, 160); + __lsx_vst(in14, input, 192); + __lsx_vst(in15, input, 224); + + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, + in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + __lsx_vst(tmp0, output, 0); + __lsx_vst(in0, output, 32); + __lsx_vst(tmp1, output, 64); + __lsx_vst(in1, output, 96); + __lsx_vst(tmp2, output, 128); + __lsx_vst(in2, output, 160); + __lsx_vst(tmp3, output, 192); + __lsx_vst(in3, output, 224); + + LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + __lsx_vst(tmp4, output, 16); + __lsx_vst(in4, output, 48); + __lsx_vst(tmp5, output, 80); + __lsx_vst(in5, output, 112); + __lsx_vst(tmp6, output, 144); + __lsx_vst(in6, output, 176); + __lsx_vst(tmp7, output, 208); + __lsx_vst(in7, output, 240); +} + +void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); + in3 = __lsx_vldx(input, src_stride6); + + /* fdct4 pre-process */ + { + __m128i vec, mask; + __m128i zero = __lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + +void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h new file mode 100644 index 0000000000..4a9fce9a3d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ + +#include "vpx_dsp/loongarch/txfm_macros_lsx.h" +#include "vpx_dsp/txfm_common.h" + +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } while (0) + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } while (0) + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one = __lsx_vreplgr2vr_h(1); \ + \ + tp0_m = __lsx_vslei_h(vec0, 0); \ + tp1_m = __lsx_vslei_h(vec1, 0); \ + tp0_m = __lsx_vxori_b(tp0_m, 255); \ + tp1_m = __lsx_vxori_b(tp1_m, 255); \ + vec0 = __lsx_vadd_h(vec0, one); \ + vec1 = __lsx_vadd_h(vec1, one); \ + tp0_m = __lsx_vand_v(one, tp0_m); \ + tp1_m = __lsx_vand_v(one, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + \ + tp0_m = __lsx_vslti_h(vec0, 0); \ + tp1_m = __lsx_vslti_h(vec1, 0); \ + vec0 = __lsx_vadd_h(vec0, one_m); \ + vec1 = __lsx_vadd_h(vec1, one_m); \ + tp0_m = __lsx_vand_v(one_m, tp0_m); \ + tp1_m = __lsx_vand_v(one_m, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT32_POSTPROC_NEG_W(vec) \ + do { \ + __m128i temp_m; \ + __m128i one_m = __lsx_vreplgr2vr_w(1); \ + \ + temp_m = __lsx_vslti_w(vec, 0); \ + vec = __lsx_vadd_w(vec, one_m); \ + temp_m = __lsx_vand_v(one_m, temp_m); \ + vec = __lsx_vadd_w(vec, temp_m); \ + vec = __lsx_vsrai_w(vec, 2); \ + } while (0) + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ + __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ + \ + s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \ + k0_m = __lsx_vpackev_w(s0_m, k0_m); \ + \ + DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \ + s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \ + s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \ + s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \ + s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \ + s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \ + s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \ + s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \ + s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \ + DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out0, out1); \ + DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out2, out3); \ + } while (0) + +#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ + in3) \ + do { \ + __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ + __m128i tmp0_m, tmp1_m; \ + __m128i res0_m, res1_m, res2_m, res3_m; \ + \ + dst0_m = __lsx_vld(dst, 0); \ + DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \ + dst3_m = __lsx_vldx(dst, _stride3); \ + DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \ + res0_m, res1_m, res2_m, res3_m); \ + DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \ + in3, res0_m, res1_m, res2_m, res3_m); \ + DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \ + tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, dst, 0, 0); \ + __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ + __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ + __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ + } while (0) + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + do { \ + __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ + __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \ + __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \ + __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \ + \ + /* stp 1 */ \ + DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \ + \ + cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \ + cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \ + \ + /* stp2 */ \ + LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \ + stp32_m, stp33_m); \ + LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \ + stp35_m, stp34_m); \ + \ + DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \ + vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \ + vec5_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \ + \ + /* stp4 */ \ + LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \ + vec4_m, vec5_m); \ + LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \ + stp24_m, stp31_m); \ + \ + vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \ + vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \ + \ + vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \ + vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \ + \ + vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \ + vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \ + \ + vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \ + vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ + } while (0) + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c new file mode 100644 index 0000000000..ec07f57d90 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define UNPCK_UB_SH(_in, _out0, _out1) \ + do { \ + _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ + _out1 = __lsx_vexth_hu_bu(_in); \ + } while (0) + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5, + n5); + DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 0); + __lsx_vst(n0, tmp_buf, 16); + __lsx_vst(m1, tmp_buf, 32); + __lsx_vst(n1, tmp_buf, 48); + __lsx_vst(m2, tmp_buf, 64); + __lsx_vst(n2, tmp_buf, 80); + __lsx_vst(m3, tmp_buf, 96); + __lsx_vst(n3, tmp_buf, 112); + __lsx_vst(m4, tmp_buf, 128); + __lsx_vst(n4, tmp_buf, 144); + __lsx_vst(m5, tmp_buf, 160); + __lsx_vst(n5, tmp_buf, 176); + __lsx_vst(m6, tmp_buf, 192); + __lsx_vst(n6, tmp_buf, 208); + __lsx_vst(m7, tmp_buf, 224); + __lsx_vst(n7, tmp_buf, 240); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4, + m5, n5); + DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 256); + __lsx_vst(n0, tmp_buf, 272); + __lsx_vst(m1, tmp_buf, 288); + __lsx_vst(n1, tmp_buf, 304); + __lsx_vst(m2, tmp_buf, 320); + __lsx_vst(n2, tmp_buf, 336); + __lsx_vst(m3, tmp_buf, 352); + __lsx_vst(n3, tmp_buf, 368); + __lsx_vst(m4, tmp_buf, 384); + __lsx_vst(n4, tmp_buf, 400); + __lsx_vst(m5, tmp_buf, 416); + __lsx_vst(n5, tmp_buf, 432); + __lsx_vst(m6, tmp_buf, 448); + __lsx_vst(n6, tmp_buf, 464); + __lsx_vst(m7, tmp_buf, 480); + __lsx_vst(n7, tmp_buf, 496); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480, + reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 240); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc3, tmp_eve_buf, 16); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 208); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc3, tmp_eve_buf, 48); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 176); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc3, tmp_eve_buf, 80); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 144); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc3, tmp_eve_buf, 112); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, + vec2, vec0, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + __m128i reg0, reg1, reg2, reg3; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 496); + __lsx_vst(reg1, tmp_buf, 368); + __lsx_vst(reg2, tmp_buf, 432); + __lsx_vst(reg3, tmp_buf, 304); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 464); + __lsx_vst(reg1, tmp_buf, 336); + __lsx_vst(reg2, tmp_buf, 400); + __lsx_vst(reg3, tmp_buf, 272); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 480); + __lsx_vst(reg1, tmp_buf, 352); + __lsx_vst(reg2, tmp_buf, 416); + __lsx_vst(reg3, tmp_buf, 288); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 448); + __lsx_vst(reg1, tmp_buf, 320); + __lsx_vst(reg2, tmp_buf, 384); + __lsx_vst(reg3, tmp_buf, 256); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + __lsx_vst(m0, dst, 0); + __lsx_vst(n0, dst, 64); + __lsx_vst(m1, dst, 128); + __lsx_vst(n1, dst, 192); + __lsx_vst(m2, dst, 256); + __lsx_vst(n2, dst, 320); + __lsx_vst(m3, dst, 384); + __lsx_vst(n3, dst, 448); + + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m4, dst, 16); + __lsx_vst(n4, dst, 80); + __lsx_vst(m5, dst, 144); + __lsx_vst(n5, dst, 208); + __lsx_vst(m6, dst, 272); + __lsx_vst(n6, dst, 336); + __lsx_vst(m7, dst, 400); + __lsx_vst(n7, dst, 464); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304, + m0, n0, m1, n1); + DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368, + m2, n2, m3, n3); + DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432, + m4, n4, m5, n5); + DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496, + m6, n6, m7, n7); + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m0, dst, 32); + __lsx_vst(n0, dst, 96); + __lsx_vst(m1, dst, 160); + __lsx_vst(n1, dst, 224); + __lsx_vst(m2, dst, 288); + __lsx_vst(n2, dst, 352); + __lsx_vst(m3, dst, 416); + __lsx_vst(n3, dst, 480); + __lsx_vst(m4, dst, 48); + __lsx_vst(n4, dst, 112); + __lsx_vst(m5, dst, 176); + __lsx_vst(n5, dst, 240); + __lsx_vst(m6, dst, 304); + __lsx_vst(n6, dst, 368); + __lsx_vst(m7, dst, 432); + __lsx_vst(n7, dst, 496); +} + +static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + tmp_buf += 64; + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc3, tmp_eve_buf, 16); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc0, tmp_eve_buf, 240); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc3, tmp_eve_buf, 48); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc0, tmp_eve_buf, 208); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc3, tmp_eve_buf, 80); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc0, tmp_eve_buf, 176); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc3, tmp_eve_buf, 112); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc0, tmp_eve_buf, 144); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf, + 1984, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf, + 1856, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, + vec1, vec2, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + int32_t stride = dst_stride << 2; + int32_t stride2 = stride << 1; + int32_t stride3 = stride + stride2; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, + m2, m4, m0); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2, + m4, m6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, + m3, m5, m1); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4, + n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, + n2, n4, n0); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2, + n4, n6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, + n3, n5, n1); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); +} + +static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8))); + } + + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + __m128i zero = __lsx_vldi(0); + + for (i = 32; i--;) { + __lsx_vst(zero, out_ptr, 0); + __lsx_vst(zero, out_ptr, 16); + __lsx_vst(zero, out_ptr, 32); + __lsx_vst(zero, out_ptr, 48); + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_lsx(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __lsx_vreplgr2vr_h(out); + + for (i = 16; i--;) { + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + dst2 = __lsx_vldx(dst, dst_stride); + dst3 = __lsx_vldx(dst + 16, dst_stride); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4, + res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0, + res7, res3, 0, tmp0, tmp1, tmp2, tmp3); + __lsx_vst(tmp0, dst, 0); + __lsx_vst(tmp1, dst, 16); + dst += dst_stride; + __lsx_vst(tmp2, dst, 0); + __lsx_vst(tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c new file mode 100644 index 0000000000..f990211791 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i store, sum_h, sum_w, sum_d; + __m128i src = { 0 }; + + val0 = *(const uint64_t *)src_top; + val1 = *(const uint64_t *)src_left; + DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); + sum_h = __lsx_vhaddw_hu_bu(src, src); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 4); + store = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); + dst += dst_stride_x4; + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); +} + +static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t dst_stride) { + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i top, left, out; + __m128i sum_h, sum_top, sum_left; + __m128i sum_w; + __m128i sum_d; + + DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); + DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); + sum_h = __lsx_vadd_h(sum_top, sum_left); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 5); + out = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); +} + +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_lsx(above, left, dst, y_stride); +} + +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_lsx(above, left, dst, y_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c new file mode 100644 index 0000000000..0503df9966 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -0,0 +1,1320 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" +#include "vpx_ports/mem.h" + +#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ + _in2, _in3, _in4, _in5, _in6, _in7) \ + do { \ + _in0 = __lsx_vld(_src, 0); \ + _in1 = __lsx_vldx(_src, _stride); \ + _in2 = __lsx_vldx(_src, _stride2); \ + _in3 = __lsx_vldx(_src, _stride3); \ + _src += _stride4; \ + _in4 = __lsx_vld(_src, 0); \ + _in5 = __lsx_vldx(_src, _stride); \ + _in6 = __lsx_vldx(_src, _stride2); \ + _in7 = __lsx_vldx(_src, _stride3); \ + } while (0) + +#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ + _stride, _stride2, _stride3, _stride4) \ + do { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + __lsx_vst(_dst4, _dst, 0); \ + __lsx_vstx(_dst5, _dst, _stride); \ + __lsx_vstx(_dst6, _dst, _stride2); \ + __lsx_vstx(_dst7, _dst, _stride3); \ + } while (0) + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vstx(p1_out, dst, -stride2); + __lsx_vstx(p0_out, dst, -stride); + __lsx_vst(q0_out, dst, 0); + __lsx_vstx(q1_out, dst, stride); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat, + p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out, + p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat, + q1_out, q2_out); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_h, out_l; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + + p3 = __lsx_vld(dst_tmp0, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp0, stride3); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + __lsx_vstx(p2, dst, -stride3); + __lsx_vstx(p1, dst, -stride2); + __lsx_vstx(p0, dst, -stride); + __lsx_vst(q0, dst, 0); + __lsx_vstx(q1, dst, stride); + __lsx_vstx(q2, dst, stride2); + } else { + dst = dst_tmp0 - stride3; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + dst += stride; + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 0); + dst += stride; + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 0); + dst += stride; + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 0); + dst += stride; + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p0 */ + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 0); + dst += stride; + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 0); + dst += stride; + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 0); + dst += stride; + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 0); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (early_exit == 0) { + hz_lpf_t16_16w(dst, stride, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (count == 1) { + __m128i flat2, mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p0_filter16, p1_filter16; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + __m128i tmp0, tmp1, tmp2; + + int32_t stride2 = stride << 1; + int32_t stride3 = 2 + stride; + int32_t stride4 = stride << 2; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* filter_mask* */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + flat = __lsx_vilvl_d(zero, flat); + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + /* convert 8 bit input data into 16 bit */ + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, + p2_l, p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, + q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, + p0_filter8, q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat); + + /* load 16 vector elements */ + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + dst -= stride3; + __lsx_vstelm_d(p2_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } else { + /* LSB(right) 8 pixel operation */ + DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l, + p6_l, p5_l, p4_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l, + q5_l, q6_l, q7_l); + + tmp0 = __lsx_vslli_h(p7_l, 3); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp0 = __lsx_vadd_h(tmp0, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q0_l); + + dst = dst_tmp0 - stride3; + + /* calculation of p6 and p5 */ + tmp1 = __lsx_vadd_h(p6_l, p5_l); + tmp1 = __lsx_vadd_h(tmp1, p4_l); + tmp1 = __lsx_vadd_h(tmp1, p3_l); + tmp1 = __lsx_vadd_h(tmp1, p2_l); + tmp1 = __lsx_vadd_h(tmp1, p1_l); + tmp1 = __lsx_vadd_h(tmp1, p0_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp0 = __lsx_vsub_h(p5_l, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p4 and p3 */ + tmp0 = __lsx_vsub_h(p4_l, p5_l); + tmp0 = __lsx_vadd_h(tmp0, q2_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p3_l, p4_l); + tmp2 = __lsx_vadd_h(tmp2, q3_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p2 and p1 */ + tmp0 = __lsx_vsub_h(p2_l, p3_l); + tmp0 = __lsx_vadd_h(tmp0, q4_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p1_l, p2_l); + tmp2 = __lsx_vadd_h(tmp2, q5_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p0 and q0 */ + tmp0 = __lsx_vsub_h(p0_l, p1_l); + tmp0 = __lsx_vadd_h(tmp0, q6_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(q7_l, p0_l); + tmp2 = __lsx_vadd_h(tmp2, q0_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q1 and q2 */ + tmp0 = __lsx_vsub_h(q7_l, q0_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p6_l); + tmp2 = __lsx_vsub_h(q7_l, q1_l); + tmp2 = __lsx_vadd_h(tmp2, q2_l); + tmp2 = __lsx_vsub_h(tmp2, p5_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q3 and q4 */ + tmp0 = __lsx_vsub_h(q7_l, q2_l); + tmp0 = __lsx_vadd_h(tmp0, q3_l); + tmp0 = __lsx_vsub_h(tmp0, p4_l); + tmp2 = __lsx_vsub_h(q7_l, q3_l); + tmp2 = __lsx_vadd_h(tmp2, q4_l); + tmp2 = __lsx_vsub_h(tmp2, p3_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q5 and q6 */ + tmp0 = __lsx_vsub_h(q7_l, q4_l); + tmp0 = __lsx_vadd_h(tmp0, q5_l); + tmp0 = __lsx_vsub_h(tmp0, p2_l); + tmp2 = __lsx_vsub_h(q7_l, q5_l); + tmp2 = __lsx_vadd_h(tmp2, q6_l); + tmp2 = __lsx_vsub_h(tmp2, p1_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + } + } + } else { + mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr, + thresh_ptr); + } +} + +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output, + int32_t out_stride) { + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + __m128i tmp2, tmp3; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + int32_t in_stride2 = in_stride << 1; + int32_t in_stride3 = in_stride2 + in_stride; + int32_t in_stride4 = in_stride2 << 1; + int32_t out_stride2 = out_stride << 1; + int32_t out_stride3 = out_stride2 + out_stride; + int32_t out_stride4 = out_stride2 << 1; + + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1, + row2, row3, row4, row5, row6, row7); + input += in_stride4; + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9, + row10, row11, row12, row13, row14, row15); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = __lsx_vpackod_d(row8, row0); + q6 = __lsx_vpackod_d(row9, row1); + q5 = __lsx_vpackod_d(row10, row2); + q4 = __lsx_vpackod_d(row11, row3); + q3 = __lsx_vpackod_d(row12, row4); + q2 = __lsx_vpackod_d(row13, row5); + q1 = __lsx_vpackod_d(row14, row6); + q0 = __lsx_vpackod_d(row15, row7); + + DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); + + DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); + DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); + + DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); + q0 = __lsx_vpackev_w(tmp3, tmp2); + q4 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp1, tmp0); + tmp3 = __lsx_vpackod_h(q7, q5); + q2 = __lsx_vpackev_w(tmp3, tmp2); + q6 = __lsx_vpackod_w(tmp3, tmp2); + + DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); + q1 = __lsx_vpackev_w(tmp3, tmp2); + q5 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp5, tmp4); + tmp3 = __lsx_vpackod_h(tmp7, tmp6); + q3 = __lsx_vpackev_w(tmp3, tmp2); + q7 = __lsx_vpackod_w(tmp3, tmp2); + + LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2, + out_stride3, out_stride4); + output += out_stride4; + LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2, + out_stride3, out_stride4); +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, + uint8_t *dst_org, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst_org -= 2; + __lsx_vstelm_w(vec2, dst_org, 0, 0); + __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec5, dst_org, 0, 0); + __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, + uint8_t *filter48) { + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_l, out_h; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + uint8_t *dst_tmp = dst - 128; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7, + p6, p5, p4); + DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat2)) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst_org -= 3; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 7); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 7); + + return 1; + } + + dst -= 7 * 16; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 16); + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 16 * 2); + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 16 * 3); + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 4); + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 5); + + /* p0 */ + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 6); + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 7); + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 8); + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 9); + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 16 * 10); + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 16 * 11); + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 16 * 12); + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 16 * 13); + + return 0; +} + +void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (early_exit == 0) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (early_exit == 0) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c new file mode 100644 index 0000000000..9300b5c5ae --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); + __lsx_vstelm_d(p0_out, src - pitch, 0, 0); + __lsx_vstelm_d(q0_out, src, 0, 0); + __lsx_vstelm_d(q1_out, src + pitch, 0, 0); +} + +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + __lsx_vstx(p1, src, -pitch2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, limit, thresh, b_limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i vec0, vec1, vec2, vec3; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + p3 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1); + p0 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + q0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); + q3 = __lsx_vldx(src_tmp, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + src -= 2; + __lsx_vstelm_w(vec2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(vec3, src, 0, 0); + __lsx_vstelm_w(vec3, src + pitch, 0, 1); + __lsx_vstelm_w(vec3, src + pitch2, 0, 2); + __lsx_vstelm_w(vec3, src + pitch3, 0, 3); +} + +void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + row0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2); + row3 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6); + row7 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row8 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10); + row11 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row12 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14); + row15 = __lsx_vldx(src_tmp, pitch3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + __lsx_vstelm_w(tmp2, src + pitch, 0, 1); + __lsx_vstelm_w(tmp2, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp2, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp3, src, 0, 0); + __lsx_vstelm_w(tmp3, src + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp3, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp4, src, 0, 0); + __lsx_vstelm_w(tmp4, src + pitch, 0, 1); + __lsx_vstelm_w(tmp4, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp4, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp5, src, 0, 0); + __lsx_vstelm_w(tmp5, src + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp5, src + pitch3, 0, 3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c new file mode 100644 index 0000000000..00219ba71d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(flat, flat); + + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, + p1_filter8, q0_filter8); + q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); + + p2 = __lsx_vilvl_d(p1_out, p2); + p0_out = __lsx_vilvl_d(q0_out, p0_out); + q1_out = __lsx_vilvl_d(q2, q1_out); + + DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, + p2_out, p1_out); + p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); + dst -= stride3; + + __lsx_vstelm_d(p2_out, dst, 0, 0); + __lsx_vstelm_d(p2_out, dst + stride, 0, 1); + __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); + + dst += stride4; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 1); + } +} + +void vpx_lpf_horizontal_8_dual_lsx( + uint8_t *dst, int32_t stride, const uint8_t *b_limit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, + const uint8_t *limit1, const uint8_t *thresh1) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh0, 0); + p2_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p2_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p2_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p2_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p2_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p2_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, dst - stride3, 0); + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + __lsx_vst(q2_out, dst + stride2, 0); + } +} + +void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp = dst - 4; + + /* load vector elements */ + p3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + q0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); + q3 = __lsx_vldx(dst_tmp, stride3); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + /* Store 6 pixels p2-_q2 */ + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p1 = __lsx_vilvl_h(q3, p3); + p2 = __lsx_vilvh_h(q3, p3); + p3 = __lsx_vilvl_b(q2, q1); + dst -= 3; + __lsx_vstelm_w(p1, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 7); + } +} + +void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *dst_tmp = dst - 4; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i row4, row5, row6, row7, row12, row13, row14, row15; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + p0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); + p3 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row4 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); + row7 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + + q3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); + q0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row12 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); + row15 = __lsx_vldx(dst_tmp, stride3); + + /* transpose 16x8 matrix into 8x16 */ + LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh0, 0); + p1_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p1_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p1_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p1_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p1_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p1_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + q2 = __lsx_vilvl_h(p1, p0); + q3 = __lsx_vilvh_h(p1, p0); + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q2, dst, 0, 0); + __lsx_vstelm_w(q2, dst + stride, 0, 1); + __lsx_vstelm_w(q2, dst + stride2, 0, 2); + __lsx_vstelm_w(q2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q3, dst, 0, 0); + __lsx_vstelm_w(q3, dst + stride, 0, 1); + __lsx_vstelm_w(q3, dst + stride2, 0, 2); + __lsx_vstelm_w(q3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + + /* filter8 */ + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p2_filt8_l = __lsx_vilvl_h(q3, p3); + p2_filt8_h = __lsx_vilvh_h(q3, p3); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); + p0_filt8_l = __lsx_vilvl_h(q3, p3); + p0_filt8_h = __lsx_vilvh_h(q3, p3); + q1_filt8_l = __lsx_vilvl_b(q2, q1); + q1_filt8_h = __lsx_vilvh_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 7); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 7); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h new file mode 100644 index 0000000000..1c43836503 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + do { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } while (0) + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + do { \ + __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ + __m128i flat4_tmp = __lsx_vldi(1); \ + \ + DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \ + q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \ + p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \ + flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \ + p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \ + flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \ + \ + flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ + flat_out = __lsx_vxori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } while (0) + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + do { \ + __m128i flat5_tmp = __lsx_vldi(1); \ + __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ + __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ + DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \ + q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \ + DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \ + q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \ + \ + DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \ + p4_asub_p0, flat2_out); \ + flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \ + p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \ + flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \ + p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \ + flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \ + flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ + flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } while (0) + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + do { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \ + 0x80, p1_m, p0_m, q0_m, q1_m); \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt &= hev; \ + \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt &= mask; \ + DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \ + DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \ + \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \ + \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ + } while (0) + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + do { \ + __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \ + p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \ + p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \ + tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \ + q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \ + tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c new file mode 100644 index 0000000000..9bb1691e2e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} + +#if !CONFIG_VP9_HIGHBITDEPTH + +void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m128i zero = __lsx_vldi(0); + int index = 16; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); + // AC only loop. + while (index < n_coeffs) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); + eob = __lsx_vmax_h(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m128i zero = __lsx_vldi(0); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0; + + zbin = __lsx_vld(mb_plane->zbin, 0); + zbin = __lsx_vsrari_h(zbin, 1); + round = __lsx_vld(mb_plane->round, 0); + round = __lsx_vsrari_h(round, 1); + + quant = __lsx_vld(mb_plane->quant, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); + quant_shift = __lsx_vslli_h(quant_shift, 1); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + // remove DC from zbin + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + // remove DC in quant_shift, quant, quant_shift + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, + dqcoeff_ptr + 8 + index); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); + eob = __lsx_vmax_h(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c new file mode 100644 index 0000000000..b6fbedb0d0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad = __lsx_vldi(0); + int32_t src_stride2 = src_stride << 1; + int32_t ref_stride2 = ref_stride << 1; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1; + __m128i sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 1); + uint32_t sad = 0; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); + + return sad; +} + +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 1); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + + DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad, sad_tmp; + + __m128i sad0_0 = __lsx_vldi(0); + __m128i sad0_1 = sad0_0; + __m128i sad1_0 = sad0_0; + __m128i sad1_1 = sad0_0; + __m128i sad2_0 = sad0_0; + __m128i sad2_1 = sad0_0; + __m128i sad3_0 = sad0_0; + __m128i sad3_1 = sad0_0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + + DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, + ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, + ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, + ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, + ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[0] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[1] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[2] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[3] = hadd_uw_u32(sad); +} + +static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i comp0, comp1, sad_tmp; + __m128i sad = __lsx_vldi(0); + uint8_t *src_tmp, *ref_tmp; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + src_tmp = (uint8_t *)src + 16; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + ref_tmp = (uint8_t *)ref + 16; + ref0 = __lsx_vld(ref, 0); + DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4); + ref6 = __lsx_vldx(ref, ref_stride3); + ref1 = __lsx_vld(ref_tmp, 0); + DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3, + ref5); + ref7 = __lsx_vldx(ref_tmp, ref_stride3); + ref += ref_stride4; + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96, + pred0, pred2, pred4, pred6); + DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred, + 112, pred1, pred3, pred5, pred7); + sec_pred += 128; + + DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; + __m128i sad, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0, sad0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); + sad = __lsx_vadd_w(sad, sad_tmp); + + res = hadd_sw_s32(sad); + return res; +} + +#define VPX_SAD_8xHT_LSX(height) \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHT_LSX(height) \ + uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_16xHTx4D_LSX(height) \ + void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHTx4D_LSX(height) \ + void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHTx4D_LSX(height) \ + void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_AVGSAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) + +SAD64 + +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) + +SAD32 + +#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) + +SAD16 + +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + +#undef SAD64 +#undef SAD32 +#undef SAD16 +#undef SAD8 diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c new file mode 100644 index 0000000000..700793531c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/loongarch/variance_lsx.h" +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters_lsx[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; + __m128i avg0, avg1, avg2, avg3; + __m128i var = __lsx_vldi(0); + + avg0 = var; + avg1 = var; + avg2 = var; + avg3 = var; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec = __lsx_vhaddw_w_h(avg0, avg0); + vec_tmp = __lsx_vhaddw_w_h(avg1, avg1); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg2, avg2); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg3, avg3); + vec = __lsx_vadd_w(vec, vec_tmp); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_8width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i vec0, vec1, vec2, vec3, filt0, out, vec; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1, + FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS, + src0, src1, src2, src3); + out = __lsx_vpackev_d(src1, src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = __lsx_vpackev_d(src3, src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec, var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3); + src += src_stride; + dst += dst_stride; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t subpel_avg_ssediff_16w_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i pred0, pred1, pred2, pred3, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + dst += dst_stride; + dst1 = __lsx_vld(dst, 0); + dst += dst_stride; + dst2 = __lsx_vld(dst, 0); + dst += dst_stride; + dst3 = __lsx_vld(dst, 0); + dst += dst_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, + pred3, tmp0, tmp1, tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t subpel_avg_ssediff_16w_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1, vec, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + src += src_stride; + src2 = __lsx_vld(src, 0); + src += src_stride; + src3 = __lsx_vld(src, 0); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t subpel_avg_ssediff_16w_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c new file mode 100644 index 0000000000..943a5c5a9b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3; + __m128i pred0, pred1, pred2, pred3; + __m128i diff0, diff1; + __m128i reg0, reg1; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t diff_stride2 = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t diff_stride3 = diff_stride2 + diff_stride; + + DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2, + src0, src2, pred0, pred2); + DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); + reg0 = __lsx_vilvl_b(src0, pred0); + reg1 = __lsx_vilvh_b(src0, pred0); + DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1); + __lsx_vstelm_d(diff0, diff_ptr, 0, 0); + __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1); +} + +static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + src_ptr += src_stride4; + pred_ptr += pred_stride4; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4, + pred5, pred6, pred7); + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + __lsx_vst(src0, diff_ptr, 0); + __lsx_vstx(src1, diff_ptr, dst_stride); + __lsx_vstx(src2, diff_ptr, dst_stride2); + __lsx_vstx(src3, diff_ptr, dst_stride3); + diff_ptr += dst_stride2; + __lsx_vst(src4, diff_ptr, 0); + __lsx_vstx(src5, diff_ptr, dst_stride); + __lsx_vstx(src6, diff_ptr, dst_stride2); + __lsx_vstx(src7, diff_ptr, dst_stride3); +} + +static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + int16_t *diff_tmp = diff + 8; + + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); +} + +static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + + for (loop_cnt = 8; loop_cnt--;) { + const uint8_t *src_tmp = src + 16; + const uint8_t *pred_tmp = pred + 16; + DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1, + pred0, pred1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred, + pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3); + DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred, + pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + diff += diff_stride; + __lsx_vst(src4, diff, 0); + __lsx_vst(src5, diff, 16); + __lsx_vst(src6, diff, 32); + __lsx_vst(src7, diff, 48); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + diff += diff_stride; + __lsx_vst(pred4, diff, 0); + __lsx_vst(pred5, diff, 16); + __lsx_vst(pred6, diff, 32); + __lsx_vst(pred7, diff, 48); + diff += diff_stride; + } +} + +static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + + for (loop_cnt = 32; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1, + pred2, pred3); + src += src_stride; + pred += pred_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5, + pred6, pred7); + src += src_stride; + pred += pred_stride; + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + __lsx_vst(src4, diff, 64); + __lsx_vst(src5, diff, 80); + __lsx_vst(src6, diff, 96); + __lsx_vst(src7, diff, 112); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + __lsx_vst(pred4, diff, 64); + __lsx_vst(pred5, diff, 80); + __lsx_vst(pred6, diff, 96); + __lsx_vst(pred7, diff, 112); + diff += diff_stride; + } +} + +void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h new file mode 100644 index 0000000000..bd514831bf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + \ + DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \ + DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \ + \ + DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \ + k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ + do { \ + __m128i tp0_m, tp1_m; \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \ + in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c new file mode 100644 index 0000000000..8fad342c71 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/variance_lsx.h" + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + src_ptr += src_stride4; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0, + ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1, + ref2, ref3); + ref_ptr += ref_stride4; + + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i avg = __lsx_vldi(0); + __m128i src0, src1, ref0, ref1; + __m128i vec; + __m128i var = avg; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i avg0 = __lsx_vldi(0); + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i vec0, vec1; + __m128i avg1 = avg0; + __m128i avg2 = avg0; + __m128i avg3 = avg0; + __m128i var = avg0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec0 = __lsx_vhaddw_w_h(avg0, avg0); + vec1 = __lsx_vhaddw_w_h(avg1, avg1); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg2, avg2); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg3, avg3); + vec0 = __lsx_vadd_w(vec0, vec1); + HADD_SW_S32(vec0, *diff); + HADD_SW_S32(var, res); + return res; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref; + __m128i var = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + HADD_SW_S32(var, res); + return res; +} + +VPX_VARIANCE_WDXHT_LSX(8, 8) +VPX_VARIANCE_WDXHT_LSX(16, 16) +VPX_VARIANCE_WDXHT_LSX(32, 32) + +uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h new file mode 100644 index 0000000000..cf9e9890ff --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_B(src, ref, var) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c new file mode 100644 index 0000000000..1c59228813 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1; + __m128i dst0, dst1, dst2, dst3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + dst0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst0 = __lsx_vilvl_d(tmp1, tmp0); + + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst1 = __lsx_vilvl_d(tmp1, tmp0); + + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp2, tmp3); + DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7, + tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height >> 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + dst0 = __lsx_vld(dst_tmp, 0); + dst1 = __lsx_vldx(dst_tmp, dst_stride); + dst_tmp += dst_stride2; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3); + DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3); + DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + dst += dst_stride2; + } +} + +static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3, dst0, dst1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1); + vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS); + vec0 = __lsx_vavgr_bu(vec0, dst0); + __lsx_vstelm_w(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 3); +} + +static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + src_stride4; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src4 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 3); + dst += dst_stride; +} + +static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 1); +} + +static void common_hz_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + 8; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0, + res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + res2 = __lsx_vavgr_bu(res2, dst1); + __lsx_vst(res2, dst, 16); + dst += dst_stride; + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + res6 = __lsx_vavgr_bu(res6, dst1); + __lsx_vst(res6, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out2, out4, out6); + + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2, + dst3); + out0 = __lsx_vavgr_bu(out0, dst0); + __lsx_vst(out0, dst, 0); + out2 = __lsx_vavgr_bu(out2, dst1); + __lsx_vst(out2, dst, 16); + out4 = __lsx_vavgr_bu(out4, dst2); + __lsx_vst(out4, dst, 32); + out6 = __lsx_vavgr_bu(out6, dst3); + __lsx_vst(out6, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + + case 32: + common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c new file mode 100644 index 0000000000..d1abf622ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src5 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3); + src2 = __lsx_vilvl_d(src3, src2); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src2); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, + FILTER_BITS, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + src5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src7 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src8 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src9 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); + DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst1 = __lsx_vldrepl_w(dst + dst_stride, 0); + dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0); + dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + + /* rearranging filter */ + mask = __lsx_vld(mc_filt_mask_arr, 0); + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint8_t *src_tmp1; + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride << 2; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src_tmp1 = (uint8_t *)(src + 8); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst0); + __lsx_vst(tmp3, dst, 0); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst1); + __lsx_vstx(tmp3, dst, dst_stride); + + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst2); + __lsx_vstx(tmp3, dst, dst_stride2); + + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst3); + __lsx_vstx(tmp3, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c new file mode 100644 index 0000000000..5c6413df44 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1); + src0 = __lsx_vilvl_d(src1, src0); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src0); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, int32_t width) { + uint8_t *src_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_reg = dst; + + src_tmp = src_tmp0; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vld(dst_reg, 0); + tmp3 = __lsx_vldx(dst_reg, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vst(tmp0, dst_reg, 0); + __lsx_vstx(tmp1, dst_reg, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vldx(dst_reg, dst_stride2); + tmp3 = __lsx_vldx(dst_reg, dst_stride3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vstx(tmp0, dst_reg, dst_stride2); + __lsx_vstx(tmp1, dst_reg, dst_stride3); + dst_reg += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src_tmp0 += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + __m128i src10_r, src32_r, src21_r, src43_r; + __m128i tmp0, tmp1; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + out = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); + dst += dst_stride; +} + +static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, src6554, src8776, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src8 = __lsx_vld(src, 0); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 3); +} + +static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); +} + +static void common_vt_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7); + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + + src_tmp1 = src + 16; + src6 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7, + src8); + src9 = __lsx_vldx(src_tmp1, src_stride3); + + dst_tmp1 = dst + 16; + dst4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5, + dst6); + dst7 = __lsx_vldx(dst_tmp1, dst_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vstx(tmp0, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src6, src7, src8, src9, src10, src11, filt0; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + src2 = __lsx_vldx(src, src_stride); + dst1 = __lsx_vldx(dst, dst_stride); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4, + dst6); + src_tmp1 = (uint8_t *)src + 16; + src5 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src8 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src11 = __lsx_vldx(src_tmp1, src_stride); + + dst_tmp1 = dst + 16; + dst3 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 32; + dst5 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 48; + dst7 = __lsx_vldx(dst_tmp1, dst_stride); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 16); + + dst_tmp1 = dst + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 32); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + __lsx_vst(tmp0, dst, 48); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + dst += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c new file mode 100644 index 0000000000..2c6459a978 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out, out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + out = __lsx_vssrarni_b_h(out1, out0, 7); + out = __lsx_vxori_b(out, 128); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); +} + +static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); +} + +static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + int32_t stride = src_stride << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + const uint8_t *_src = src + src_stride; + DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2); + DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + src += stride; + } +} + +static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + + dst += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i res0, res1, res2, res3, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + uint8_t *src_tmp1 = src + src_stride4; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, + src7, src6, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3, out0, out1; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + uint8_t *dst_tmp1 = dst + dst_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst_tmp1, 0, 0); + __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1); + } +} + +static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *src_tmp1 = src + 8; + mask = __lsx_vld(mc_filt_mask_arr, 0); + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask, + src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src_tmp1 += src_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6); + src7 = __lsx_vld(src, 24); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 16: + common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 32: + common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 64: + common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c new file mode 100644 index 0000000000..9f5cd6cfe9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -0,0 +1,697 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_vt, filt_hz, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); + dst += dst_stride4; + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + 8; + + DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src, + src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); +} + +static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c new file mode 100644 index 0000000000..6022e43c83 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + uint8_t *_src = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + src = src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + // uint8_t *_src = (uint8_t *)src - src_stride3; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6, + reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + + src_tmp = src; + dst_tmp = dst; + + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst_tmp, 0); + __lsx_vstx(tmp1, dst_tmp, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstx(tmp0, dst_tmp, dst_stride2); + __lsx_vstx(tmp1, dst_tmp, dst_stride3); + dst_tmp += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i filt0, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt0; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *dst_tmp1 = dst + dst_stride4; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); +} + +static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7) + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + src0 = src8; + } +} + +static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + uint8_t *src_tmp; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + src_tmp = src + 16; + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src7, src3, src8); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + src += src_stride4; + src_tmp += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp1 = dst + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + src_stride; + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48, + src2, src5, src8, src11); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 0); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 16); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 32); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 32); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 48); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 48); + dst += dst_stride2; + dst_tmp1 += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c new file mode 100644 index 0000000000..1dad29eeed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1; + __m128i dst0, dst1; + + int32_t src_stride2 = src_stride << 1; + + if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + dst0 = __lsx_vld(dst, 0); + dst1 = __lsx_vldx(dst, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1); + + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst3, dst, 0, 0); + dst += dst_stride; + } +} + +static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + dst4 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6); + dst7 = __lsx_vldx(dst, dst_stride3); + dst -= dst_stride4; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride2); + __lsx_vstx(dst3, dst, dst_stride3); + dst += dst_stride4; + __lsx_vst(dst4, dst, 0); + __lsx_vstx(dst5, dst, dst_stride); + __lsx_vstx(dst6, dst, dst_stride2); + __lsx_vstx(dst7, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + uint8_t *dst_tmp = dst; + uint8_t *dst_tmp1 = dst_tmp + 16; + uint8_t *src_tmp = src + 16; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6, + dst7); + dst_tmp += dst_stride4; + dst_tmp1 += dst_stride4; + + src_tmp = src + 16; + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src10, src11, src12, src13); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14, + dst15); + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + dst_tmp = dst + 16; + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst2, dst, dst_stride); + __lsx_vstx(dst4, dst, dst_stride2); + __lsx_vstx(dst6, dst, dst_stride3); + __lsx_vst(dst1, dst_tmp, 0); + __lsx_vstx(dst3, dst_tmp, dst_stride); + __lsx_vstx(dst5, dst_tmp, dst_stride2); + __lsx_vstx(dst7, dst_tmp, dst_stride3); + dst += dst_stride4; + + __lsx_vst(dst8, dst, 0); + __lsx_vstx(dst10, dst, dst_stride); + __lsx_vstx(dst12, dst, dst_stride2); + __lsx_vstx(dst14, dst, dst_stride3); + __lsx_vst(dst9, dst_tmp1, 0); + __lsx_vstx(dst11, dst_tmp1, dst_stride); + __lsx_vstx(dst13, dst_tmp1, dst_stride2); + __lsx_vstx(dst15, dst_tmp1, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + uint8_t *dst_tmp = dst; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (; cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10, + src11); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14, + src15); + src += src_stride; + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst0, dst1, dst2, dst3); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst4, dst5, dst6, dst7); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst8, dst9, dst10, dst11); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst12, dst13, dst14, dst15); + dst_tmp += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + dst += dst_stride; + __lsx_vst(dst4, dst, 0); + __lsx_vst(dst5, dst, 16); + __lsx_vst(dst6, dst, 32); + __lsx_vst(dst7, dst, 48); + dst += dst_stride; + __lsx_vst(dst8, dst, 0); + __lsx_vst(dst9, dst, 16); + __lsx_vst(dst10, dst, 32); + __lsx_vst(dst11, dst, 48); + dst += dst_stride; + __lsx_vst(dst12, dst, 0); + __lsx_vst(dst13, dst, 16); + __lsx_vst(dst14, dst, 32); + __lsx_vst(dst15, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + switch (w) { + case 4: { + avg_width4_lsx(src, src_stride, dst, dst_stride, h); + break; + } + + case 8: { + avg_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c new file mode 100644 index 0000000000..53dc7097ed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + for (cnt = height >> 3; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 4) == 0) { + for (cnt = (height / 4); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = (uint8_t *)src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + src0 = __lsx_vld(src_tmp, 0); + DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp, + src_stride3, src_tmp, src_stride4, src1, src2, src3, src4); + src_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride2; + src7 = __lsx_vldx(src_tmp, src_stride); + src_tmp += src_stride2; + + __lsx_vst(src0, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src1, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src2, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src3, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + src += 16; + dst += 16; + } +} + +static void copy_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + __lsx_vst(src4, dst, 0); + dst += dst_stride; + __lsx_vst(src5, dst, 0); + dst += dst_stride; + __lsx_vst(src6, dst, 0); + dst += dst_stride; + __lsx_vst(src7, dst, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } +} + +static void copy_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } +} + +static void copy_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64); +} + +void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + uint32_t cnt; + __m128i tmp; + for (cnt = h; cnt--;) { + tmp = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tmp, dst, 0, 0); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h new file mode 100644 index 0000000000..d886b00198 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1, + __m128i _reg2, __m128i _reg3, + __m128i _filter0, __m128i _filter1, + __m128i _filter2, __m128i _filter3) { + __m128i _vec0, _vec1; + + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); + return __lsx_vsadd_h(_vec0, _vec1); +} + +static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1, + __m128i _mask0, __m128i _mask1, + __m128i _mask2, __m128i _mask3, + __m128i _filt_h0, __m128i _filt_h1, + __m128i _filt_h2, __m128i _filt_h3) { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + __m128i _out; + + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1, + _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3); + _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, + _filt_h2, _filt_h3); + _out = __lsx_vsrari_h(_out, FILTER_BITS); + return __lsx_vsat_h(_out, 7); +} + +static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask, + __m128i coeff) { + __m128i tmp0_m, tmp1_m; + + tmp0_m = __lsx_vshuf_b(in1, in0, mask); + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); + return __lsx_vsrari_h(tmp1_m, FILTER_BITS); +} + +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + do { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } while (0) + +#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ + _mask2, _mask3, _filter0, _filter1, \ + _filter2, _filter3, _out0, _out1) \ + do { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3; \ + \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \ + _tmp0, _tmp1); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \ + _tmp2, _tmp3); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \ + _filter1, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \ + _tmp4, _tmp5); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \ + _tmp6, _tmp7); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ + _filter3, _reg2, _reg3); \ + DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ + } while (0) + +#define HORIZ_8TAP_8WID_4VECS_FILT( \ + _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ + _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ + do { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \ + _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \ + _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \ + _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \ + _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \ + _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \ + _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \ + _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \ + _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \ + _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \ + _reg5, _reg6, _reg7); \ + DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ + _reg7, _out0, _out1, _out2, _out3); \ + } while (0) + +#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loopfilter.c b/media/libvpx/libvpx/vpx_dsp/loopfilter.c new file mode 100644 index 0000000000..d6504aab1f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loopfilter.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int16_t signed_char_clamp_high(int t, int bd) { + switch (bd) { + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); + case 8: + default: return (int16_t)clamp(t, -128, 128 - 1); + } +} +#endif + +// Should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3, + uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t)(*op1 ^ 0x80); + const int8_t ps0 = (int8_t)(*op0 ^ 0x80); + const int8_t qs0 = (int8_t)(*oq0 ^ 0x80); + const int8_t qs1 = (int8_t)(*oq1 ^ 0x80); + const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); + *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); + *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); +} + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch); + ++s; + } +} + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch, + s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch); + ++s; + } +} + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { + if (flat2 && flat && mask) { + const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, + p2 = *op2, p1 = *op1, p0 = *op0; + + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6, q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]); + + filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, + s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch, + s + 7 * pitch); + ++s; + } +} + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], + s[5], s[6], s[7]); + + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, + s + 7); + s += pitch; + } +} + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8); +} + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16); +} + +#if CONFIG_VP9_HIGHBITDEPTH +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, uint16_t q2, + uint16_t q3, uint16_t q4, int bd) { + int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p4 - p0) > thresh16) * -1; + mask |= (abs(q4 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. + filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, + s + 1 * pitch, bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void vpx_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void vpx_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, uint16_t *op3, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, + uint16_t *oq6, uint16_t *oq7, int bd) { + if (flat2 && flat && mask) { + const uint16_t p7 = *op7; + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + const uint16_t q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, + s + 6 * pitch, s + 7 * pitch, bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); +} + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, + s + 5, s + 6, s + 7, bd); + s += pitch; + } +} + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c new file mode 100644 index 0000000000..97541411e4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, + int blackclamp, int whiteclamp, int width, + int height, int32_t pitch) { + int i, j; + v16u8 pos0, pos1, ref0, ref1; + v16i8 black_clamp, white_clamp, both_clamp; + + black_clamp = __msa_fill_b(blackclamp); + white_clamp = __msa_fill_b(whiteclamp); + both_clamp = black_clamp + white_clamp; + both_clamp = -both_clamp; + + for (i = 0; i < height / 2; ++i) { + uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; + const int8_t *ref0_ptr = noise + (rand() & 0xff); + uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; + const int8_t *ref1_ptr = noise + (rand() & 0xff); + for (j = width / 16; j--;) { + pos0 = LD_UB(pos0_ptr); + ref0 = LD_UB(ref0_ptr); + pos1 = LD_UB(pos1_ptr); + ref1 = LD_UB(ref1_ptr); + pos0 = __msa_subsus_u_b(pos0, black_clamp); + pos1 = __msa_subsus_u_b(pos1, black_clamp); + pos0 = __msa_subsus_u_b(pos0, both_clamp); + pos1 = __msa_subsus_u_b(pos1, both_clamp); + pos0 = __msa_subsus_u_b(pos0, white_clamp); + pos1 = __msa_subsus_u_b(pos1, white_clamp); + pos0 += ref0; + ST_UB(pos0, pos0_ptr); + pos1 += ref1; + ST_UB(pos1, pos1_ptr); + pos0_ptr += 16; + pos1_ptr += 16; + ref0_ptr += 16; + ref1_ptr += 16; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c new file mode 100644 index 0000000000..3fd18dec56 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} + +uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} + +#if !CONFIG_VP9_HIGHBITDEPTH +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); +} + +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src11, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); + + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); +} + +int vpx_satd_msa(const int16_t *data, int length) { + int i, satd; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 src8, src9, src10, src11, src12, src13, src14, src15; + v8i16 zero = { 0 }; + v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; + v4u32 tmp0_w = { 0 }; + + if (16 == length) { + LD_SH2(data, 8, src0, src1); + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + satd = HADD_UW_U32(tmp0_w); + } else if (64 == length) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + satd = HADD_UW_U32(tmp0_w); + } else if (256 == length) { + for (i = 0; i < 2; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else if (1024 == length) { + for (i = 0; i < 8; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else { + satd = 0; + + for (i = 0; i < length; ++i) { + satd += abs(data[i]); + } + } + + return satd; +} +#endif // !CONFIG_VP9_HIGHBITDEPTH + +void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int i; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 hbuf_r = { 0 }; + v8i16 hbuf_l = { 0 }; + v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; + v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; + + if (16 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 3); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (32 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 4); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (64 == height) { + for (i = 4; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 5); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else { + const int norm_factor = height >> 1; + int cnt; + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] = 0; + } + + for (i = 0; i < height; ++i) { + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] += ref[cnt]; + } + + ref += ref_stride; + } + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] /= norm_factor; + } + } +} + +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { + int16_t sum; + v16u8 ref0, ref1, ref2, ref3; + v8u16 ref0_h; + + if (16 == width) { + ref0 = LD_UB(ref); + ref0_h = __msa_hadd_u_h(ref0, ref0); + sum = HADD_UH_U32(ref0_h); + } else if (32 == width) { + LD_UB2(ref, 16, ref0, ref1); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + sum = HADD_UH_U32(ref0_h); + } else if (64 == width) { + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + ref0_h += __msa_hadd_u_h(ref2, ref2); + ref0_h += __msa_hadd_u_h(ref3, ref3); + sum = HADD_UH_U32(ref0_h); + } else { + int idx; + + sum = 0; + for (idx = 0; idx < width; ++idx) { + sum += ref[idx]; + } + } + + return sum; +} + +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { + int sse, mean, var; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; + v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; + v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; + v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; + v4i32 res_l7_m, mean_v; + v2i64 sse_v; + + if (2 == bwl) { + LD_SH2(src, 8, src0, src1); + LD_SH2(ref, 8, ref0, ref1); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (3 == bwl) { + LD_SH4(src, 8, src0, src1, src2, src3); + LD_SH4(ref, 8, ref0, ref1, ref2, ref3); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (4 == bwl) { + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); + ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); + ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); + ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v += res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else { + int i; + const int width = 4 << bwl; + + sse = 0; + mean = 0; + + for (i = 0; i < width; ++i) { + const int diff = ref[i] - src[i]; + + mean += diff; + sse += diff * diff; + } + } + + var = sse - ((mean * mean) >> (bwl + 2)); + + return var; +} + +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; + v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; + + LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); + PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); + PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); + + diff0 = __msa_asub_u_b(s0, d0); + diff1 = __msa_asub_u_b(s1, d1); + diff2 = __msa_asub_u_b(s2, d2); + diff3 = __msa_asub_u_b(s3, d3); + + min0 = __msa_min_u_b(diff0, diff1); + min1 = __msa_min_u_b(diff2, diff3); + min0 = __msa_min_u_b(min0, min1); + + max0 = __msa_max_u_b(diff0, diff1); + max1 = __msa_max_u_b(diff2, diff3); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); + max0 = __msa_max_u_b(max0, max1); + + *min = min0[0]; + *max = max0[0]; +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c new file mode 100644 index 0000000000..b22f084a02 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *vpx_ff_cropTbl; + +void vpx_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + vpx_ff_cropTbl_a[i] = 0; + vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH]; +} + +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h new file mode 100644 index 0000000000..87a5bbab56 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ + +#include +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if HAVE_DSPR2 +#define CROP_WIDTH 512 + +extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c" + +static INLINE void prefetch_load(const unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} + +static INLINE void prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c new file mode 100644 index 0000000000..18e7d5375d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c new file mode 100644 index 0000000000..7dcb662d7f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3; + uint32_t tn1, tn2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3, tp4; + uint32_t p1, p2, p3, p4, n1; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tp4], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tp4], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tp3], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp4], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tp1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" + + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" + + /* store bytes */ + "sb %[tp3], 3(%[dst]) \n\t" + "sb %[tp4], 5(%[dst]) \n\t" + "sb %[tp1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c new file mode 100644 index 0000000000..e355ba3a06 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_dspr2.c @@ -0,0 +1,1029 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp2](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + sum += src[x] * filter[3]; + sum += src[x + 1] * filter[4]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 8: + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 16: + case 32: + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + default: + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c new file mode 100644 index 0000000000..9e65a8f50f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 0000000000..a3e967b405 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c new file mode 100644 index 0000000000..cc458c8618 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + + assert(w <= 64); + assert(h <= 64); + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + if (intermediate_height < h) intermediate_height = h; + + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + int x, y; + uint32_t tp1, tp2, tn1, tp3, tp4, tn2; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 16: + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c new file mode 100644 index 0000000000..7a9aa49d8a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -0,0 +1,998 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (vpx_get_filter_taps(filter_x) == 2) { + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c new file mode 100644 index 0000000000..1e7052f6c5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,1602 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x * dst_stride] = src[x]; + } + + src += src_stride; + dst += 1; + } +} + +void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + (void)x_step_q4; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (intermediate_height < h) intermediate_height = h; + + /* copy the src to dst */ + if (filter_x[3] == 0x80) { + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); + } else if (vpx_get_filter_taps(filter_x) == 2) { + vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* copy the src to dst */ + if (filter_y[3] == 0x80) { + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); + } else if (vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, + filter_y, h, w); + break; + } + } +} + +void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + int x, y; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + } + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + } + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + } + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + } + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 0000000000..09d6f36e56 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (vpx_get_filter_taps(filter_x) == 2) { + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 0000000000..fd977b5336 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/convolve_common_dspr2.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 0000000000..14b65bc650 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c new file mode 100644 index 0000000000..4e93ff594d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c @@ -0,0 +1,742 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +extern const int16_t vpx_rv[]; + +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ + } + +#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ + ref, out) \ + { \ + v16u8 temp0, temp1; \ + \ + temp1 = __msa_aver_u_b(above2_in, above1_in); \ + temp0 = __msa_aver_u_b(below2_in, below1_in); \ + temp1 = __msa_aver_u_b(temp1, temp0); \ + out = __msa_aver_u_b(src_in, temp1); \ + temp0 = __msa_asub_u_b(src_in, above2_in); \ + temp1 = __msa_asub_u_b(src_in, above1_in); \ + temp0 = (temp0 < ref); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below1_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below2_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + out = __msa_bmz_v(out, src_in, temp0); \ + } + +#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ + ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ + ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ + ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ + ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ + ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ + ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ + ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \ + temp4, temp5); \ + ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \ + temp7, temp8, temp9); \ + ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ + in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ + ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ + in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ + } + +#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \ + in9, in10, in11) \ + { \ + v8i16 temp0, temp1, temp2, temp3; \ + v8i16 temp4, temp5, temp6, temp7; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ + temp4 = __msa_ilvr_h(temp5, temp4); \ + ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ + temp5 = __msa_ilvr_h(temp7, temp6); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + in0 = (v16u8)temp0; \ + in2 = (v16u8)temp1; \ + in4 = (v16u8)temp2; \ + in6 = (v16u8)temp3; \ + in8 = (v16u8)temp6; \ + in10 = (v16u8)temp7; \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ + } + +static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, + int32_t src_stride, + int32_t dst_stride, int32_t cols, + uint8_t *f) { + uint8_t *p_src = src_ptr; + uint8_t *p_dst = dst_ptr; + uint8_t *f_orig = f; + uint8_t *p_dst_st = dst_ptr; + uint16_t col; + uint64_t out0, out1, out2, out3; + v16u8 above2, above1, below2, below1, src, ref, ref_temp; + v16u8 inter0, inter1, inter2, inter3, inter4, inter5; + v16u8 inter6, inter7, inter8, inter9, inter10, inter11; + + for (col = (cols / 16); col--;) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7, + p_dst, dst_stride); + + p_dst += 16; + p_src += 16; + f += 16; + } + + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + } + + f = f_orig; + p_dst = dst_ptr - 2; + LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, + inter6, inter7); + + for (col = 0; col < (cols / 8); ++col) { + ref = LD_UB(f); + f += 8; + VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5, + inter6, inter7, inter8, inter9, inter10, inter11); + if (0 == col) { + above2 = inter2; + above1 = inter2; + } else { + above2 = inter0; + above1 = inter1; + } + src = inter2; + below1 = inter3; + below2 = inter4; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); + above2 = inter5; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); + above1 = inter6; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); + src = inter7; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); + below1 = inter8; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); + below2 = inter9; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); + if (col == (cols / 8 - 1)) { + above2 = inter9; + } else { + above2 = inter10; + } + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); + if (col == (cols / 8 - 1)) { + above1 = inter9; + } else { + above1 = inter11; + } + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); + TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, + inter9, inter2, inter3, inter4, inter5, inter6, inter7, + inter8, inter9); + p_dst += 8; + LD_UB2(p_dst, dst_stride, inter0, inter1); + ST8x1_UB(inter2, p_dst_st); + ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride)); + LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3); + ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride)); + ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride)); + LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5); + ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride)); + ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride)); + LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7); + ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride)); + ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride)); + p_dst_st += 8; + } +} + +static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, + int32_t src_stride, + int32_t dst_stride, int32_t cols, + uint8_t *f) { + uint8_t *p_src = src_ptr; + uint8_t *p_dst = dst_ptr; + uint8_t *p_dst_st = dst_ptr; + uint8_t *f_orig = f; + uint16_t col; + uint64_t out0, out1, out2, out3; + v16u8 above2, above1, below2, below1; + v16u8 src, ref, ref_temp; + v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; + v16u8 inter7, inter8, inter9, inter10, inter11; + v16u8 inter12, inter13, inter14, inter15; + + for (col = (cols / 16); col--;) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7, + p_dst, dst_stride); + ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15, + p_dst + 8 * dst_stride, dst_stride); + p_src += 16; + p_dst += 16; + f += 16; + } + + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + + f = f_orig; + p_dst = dst_ptr - 2; + LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, + inter6, inter7); + LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11, + inter12, inter13, inter14, inter15); + + for (col = 0; col < cols / 8; ++col) { + ref = LD_UB(f); + f += 8; + TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6, + inter7, inter8, inter9, inter10, inter11, inter12, inter13, + inter14, inter15); + if (0 == col) { + above2 = inter2; + above1 = inter2; + } else { + above2 = inter0; + above1 = inter1; + } + + src = inter2; + below1 = inter3; + below2 = inter4; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); + above2 = inter5; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); + above1 = inter6; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); + src = inter7; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); + below1 = inter8; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); + below2 = inter9; + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); + if (col == (cols / 8 - 1)) { + above2 = inter9; + } else { + above2 = inter10; + } + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); + if (col == (cols / 8 - 1)) { + above1 = inter9; + } else { + above1 = inter11; + } + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); + VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, + inter8, inter9, inter2, inter3, inter4, inter5, + inter6, inter7, inter8, inter9, inter10, inter11, + inter12, inter13, inter14, inter15, above2, above1); + + p_dst += 8; + LD_UB2(p_dst, dst_stride, inter0, inter1); + ST8x1_UB(inter2, p_dst_st); + ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride)); + LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3); + ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride)); + ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride)); + LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5); + ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride)); + ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride)); + LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7); + ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride)); + ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride)); + LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9); + ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride)); + ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride)); + LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11); + ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride)); + ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride)); + LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13); + ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride)); + ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride)); + LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15); + ST8x1_UB(above2, (p_dst_st + 14 * dst_stride)); + ST8x1_UB(above1, (p_dst_st + 15 * dst_stride)); + p_dst_st += 8; + } +} + +void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, + int32_t src_stride, + int32_t dst_stride, int32_t cols, + uint8_t *f, int32_t size) { + if (8 == size) { + postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f); + } else if (16 == size) { + postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f); + } +} + +void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { + int32_t row, col, cnt; + uint8_t *src_dup = src; + v16u8 src0, src1, tmp_orig; + v16u8 tmp = { 0 }; + v16i8 zero = { 0 }; + v8u16 sum_h, src_r_h, src_l_h; + v4u32 src_r_w; + v4i32 flimit_vec; + + flimit_vec = __msa_fill_w(flimit); + for (row = rows; row--;) { + int32_t sum_sq; + int32_t sum = 0; + src0 = (v16u8)__msa_fill_b(src_dup[0]); + ST8x1_UB(src0, (src_dup - 8)); + + src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]); + ST_UB(src0, src_dup + cols); + src_dup[cols + 16] = src_dup[cols - 1]; + tmp_orig = (v16u8)__msa_ldi_b(0); + tmp_orig[15] = tmp[15]; + src1 = LD_UB(src_dup - 8); + src1[15] = 0; + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); + src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); + src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); + sum_sq = HADD_SW_S32(src_r_w) + 16; + sum_h = __msa_hadd_u_h(src1, src1); + sum = HADD_UH_U32(sum_h); + { + v16u8 src7, src8, src_r, src_l; + v16i8 mask; + v8u16 add_r, add_l; + v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1; + v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3; + v4i32 sub0, sub1, sub2, sub3; + v4i32 sum0_w, sum1_w, sum2_w, sum3_w; + v4i32 mul0, mul1, mul2, mul3; + v4i32 total0, total1, total2, total3; + v8i16 const8 = __msa_fill_h(8); + + src7 = LD_UB(src_dup + 7); + src8 = LD_UB(src_dup - 8); + for (col = 0; col < (cols >> 4); ++col) { + ILVRL_B2_UB(src7, src8, src_r, src_l); + HSUB_UB2_SH(src_r, src_l, sub_r, sub_l); + + sum_r[0] = sum + sub_r[0]; + for (cnt = 0; cnt < 7; ++cnt) { + sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1]; + } + sum_l[0] = sum_r[7] + sub_l[0]; + for (cnt = 0; cnt < 7; ++cnt) { + sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; + } + sum = sum_l[7]; + src1 = LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); + src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); + src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); + tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); + + HADD_UB2_UH(src_r, src_l, add_r, add_l); + UNPCK_SH_SW(sub_r, sub0, sub1); + UNPCK_SH_SW(sub_l, sub2, sub3); + ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w); + ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w); + MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1, + mul2, mul3); + sum_sq0[0] = sum_sq + mul0[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1]; + } + sum_sq1[0] = sum_sq0[3] + mul1[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1]; + } + sum_sq2[0] = sum_sq1[3] + mul2[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1]; + } + sum_sq3[0] = sum_sq2[3] + mul3[0]; + for (cnt = 0; cnt < 3; ++cnt) { + sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1]; + } + sum_sq = sum_sq3[3]; + + UNPCK_SH_SW(sum_r, sum0_w, sum1_w); + UNPCK_SH_SW(sum_l, sum2_w, sum3_w); + total0 = sum_sq0 * __msa_ldi_w(15); + total0 -= sum0_w * sum0_w; + total1 = sum_sq1 * __msa_ldi_w(15); + total1 -= sum1_w * sum1_w; + total2 = sum_sq2 * __msa_ldi_w(15); + total2 -= sum2_w * sum2_w; + total3 = sum_sq3 * __msa_ldi_w(15); + total3 -= sum3_w * sum3_w; + total0 = (total0 < flimit_vec); + total1 = (total1 < flimit_vec); + total2 = (total2 < flimit_vec); + total3 = (total3 < flimit_vec); + PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp = __msa_bmz_v(tmp, src1, (v16u8)mask); + + if (col == 0) { + uint64_t src_d; + + src_d = __msa_copy_u_d((v2i64)tmp_orig, 1); + SD(src_d, (src_dup - 8)); + } + + src7 = LD_UB(src_dup + 16 * (col + 1) + 7); + src8 = LD_UB(src_dup + 16 * (col + 1) - 8); + ST_UB(tmp, (src_dup + (16 * col))); + } + + src_dup += pitch; + } + } +} + +void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { + int32_t row, col, cnt, i; + v4i32 flimit_vec; + v16u8 dst7, dst8, dst_r_b, dst_l_b; + v16i8 mask; + v8u16 add_r, add_l; + v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1; + v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3; + + flimit_vec = __msa_fill_w(flimit); + + for (col = 0; col < (cols >> 4); ++col) { + uint8_t *dst_tmp = &dst_ptr[col << 4]; + v16u8 dst; + v16i8 zero = { 0 }; + v16u8 tmp[16]; + v8i16 mult0, mult1, rv2_0, rv2_1; + v8i16 sum0_h = { 0 }; + v8i16 sum1_h = { 0 }; + v4i32 mul0 = { 0 }; + v4i32 mul1 = { 0 }; + v4i32 mul2 = { 0 }; + v4i32 mul3 = { 0 }; + v4i32 sum0_w, sum1_w, sum2_w, sum3_w; + v4i32 add0, add1, add2, add3; + const int16_t *rv2[16]; + + dst = LD_UB(dst_tmp); + for (cnt = (col << 4), i = 0; i < 16; ++cnt) { + rv2[i] = vpx_rv + (i & 7); + ++i; + } + for (cnt = -8; cnt < 0; ++cnt) { + ST_UB(dst, dst_tmp + cnt * pitch); + } + + dst = LD_UB((dst_tmp + (rows - 1) * pitch)); + for (cnt = rows; cnt < rows + 17; ++cnt) { + ST_UB(dst, dst_tmp + cnt * pitch); + } + for (cnt = -8; cnt <= 6; ++cnt) { + dst = LD_UB(dst_tmp + (cnt * pitch)); + UNPCK_UB_SH(dst, dst_r_h, dst_l_h); + MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); + mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0); + mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0); + mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1); + mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1); + ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); + } + + for (row = 0; row < (rows + 8); ++row) { + for (i = 0; i < 8; ++i) { + rv2_0[i] = *(rv2[i] + (row & 127)); + rv2_1[i] = *(rv2[i + 8] + (row & 127)); + } + dst7 = LD_UB(dst_tmp + (7 * pitch)); + dst8 = LD_UB(dst_tmp - (8 * pitch)); + ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b); + + HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l); + UNPCK_SH_SW(sub_r, sub0, sub1); + UNPCK_SH_SW(sub_l, sub2, sub3); + sum0_h += sub_r; + sum1_h += sub_l; + + HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l); + + ILVRL_H2_SW(zero, add_r, add0, add1); + ILVRL_H2_SW(zero, add_l, add2, add3); + mul0 += add0 * sub0; + mul1 += add1 * sub1; + mul2 += add2 * sub2; + mul3 += add3 * sub3; + dst = LD_UB(dst_tmp); + ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); + dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); + dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); + tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7); + + UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); + UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); + total0 = mul0 * __msa_ldi_w(15); + total0 -= sum0_w * sum0_w; + total1 = mul1 * __msa_ldi_w(15); + total1 -= sum1_w * sum1_w; + total2 = mul2 * __msa_ldi_w(15); + total2 -= sum2_w * sum2_w; + total3 = mul3 * __msa_ldi_w(15); + total3 -= sum3_w * sum3_w; + total0 = (total0 < flimit_vec); + total1 = (total1 < flimit_vec); + total2 = (total2 < flimit_vec); + total3 = (total3 < flimit_vec); + PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask); + + if (row >= 8) { + ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); + } + + dst_tmp += pitch; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c new file mode 100644 index 0000000000..36583e2d24 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/fwd_txfm_msa.h" + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, + 32); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} + +void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 16; ++i) { + LD_SH4(input, 8, in0, in1, in2, in3); + input += stride; + LD_SH4(input, 8, in4, in5, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); + out[0] = (int16_t)(sum >> 3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c new file mode 100644 index 0000000000..5a6dfcef2f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/fwd_txfm_msa.h" + +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; + v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in8, in9, in10, in11, 2); + SLLI_4V(in12, in13, in14, in15, 2); + ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); + ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); + SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); + SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); + ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); + + cnst4 = __msa_splati_h(coeff, 0); + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); + + cnst5 = __msa_splati_h(coeff, 1); + cnst5 = __msa_ilvev_h(cnst5, cnst4); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); + stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); + stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); + + /* stp2 */ + BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); + ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); + SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); + + cnst0 = __msa_splati_h(coeff, 4); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); + + BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + ILVRL_H2_SH(in15, in8, vec1, vec0); + SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr); + + cnst0 = __msa_splati_h(coeff2, 0); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 224); + + ILVRL_H2_SH(in14, in9, vec1, vec0); + SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 128); + + cnst1 = __msa_splati_h(coeff2, 2); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 96); + + SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + cnst1 = __msa_splati_h(coeff, 3); + cnst1 = __msa_ilvev_h(cnst0, cnst1); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + /* stp4 */ + ADD2(stp34, stp25, stp33, stp22, in13, in10); + + ILVRL_H2_SH(in13, in10, vec1, vec0); + SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 64); + + cnst0 = __msa_splati_h(coeff2, 1); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 160); + + SUB2(stp34, stp25, stp33, stp22, in12, in11); + ILVRL_H2_SH(in12, in11, vec1, vec0); + SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 192); + + cnst1 = __msa_splati_h(coeff2, 3); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 32); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); + ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); + SRA_4V(in0, in1, in2, in3, 2); + SRA_4V(in4, in5, in6, in7, 2); + SRA_4V(in8, in9, in10, in11, 2); + SRA_4V(in12, in13, in14, in15, 2); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); +} + +void vpx_fdct4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 vec, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + vec = __msa_ceqi_h(in0, 0); + vec = vec ^ 255; + vec = mask & vec; + in0 += vec; + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} + +void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} + +void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} + +void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 4; ++i) { + LD_SH2(input, 8, in0, in1); + input += stride; + LD_SH2(input, 8, in2, in3); + input += stride; + LD_SH2(input, 8, in4, in5); + input += stride; + LD_SH2(input, 8, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); + out[0] = (int16_t)(sum >> 1); +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h new file mode 100644 index 0000000000..c0be56b819 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ + +#include "vpx_dsp/mips/txfm_macros_msa.h" +#include "vpx_dsp/txfm_common.h" + +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c new file mode 100644 index 0000000000..7ca61a28ec --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8; + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, + reg8); + ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, + reg10); + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); +} + +void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); +} + +void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* process 16 * 8 block */ + vpx_idct16_1d_rows_msa(input, out); + + /* short case just considers top 4 rows as valid output */ + out += 4 * 16; + for (i = 12; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out]) \n\t" + "sw $zero, 4(%[out]) \n\t" + "sw $zero, 8(%[out]) \n\t" + "sw $zero, 12(%[out]) \n\t" + "sw $zero, 16(%[out]) \n\t" + "sw $zero, 20(%[out]) \n\t" + "sw $zero, 24(%[out]) \n\t" + "sw $zero, 28(%[out]) \n\t" + + : + : [out] "r"(out)); + + out += 16; + } + + out = out_arr; + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + int16_t out; + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); + + /* ADST in horizontal */ + VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); + + l1 = -r8; + l3 = -r4; + l13 = -r13; + l15 = -r1; + + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); +} + +void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 v0, v2, v4, v6, k0, k1, k2, k3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 res8, res9, res10, res11, res12, res13, res14, res15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + v16i8 zero = { 0 }; + + r0 = LD_SH(input + 0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); + + /* stage 1 */ + k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); + k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); + k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); + BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); + BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); + out1 = -out1; + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + ST8x1_UB(res5, dst + 12 * dst_stride); + + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + MADD_SHORT(out6, out7, k0, k3, out6, out7); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c new file mode 100644 index 0000000000..053948183a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); + + /* 3rd & 4th 8x8 */ + LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); + ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 15 * 8)); + ST_SH(loc1, (tmp_eve_buf)); + ST_SH(loc2, (tmp_eve_buf + 14 * 8)); + ST_SH(loc3, (tmp_eve_buf + 8)); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 13 * 8)); + ST_SH(loc1, (tmp_eve_buf + 2 * 8)); + ST_SH(loc2, (tmp_eve_buf + 12 * 8)); + ST_SH(loc3, (tmp_eve_buf + 3 * 8)); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 11 * 8)); + ST_SH(loc1, (tmp_eve_buf + 4 * 8)); + ST_SH(loc2, (tmp_eve_buf + 10 * 8)); + ST_SH(loc3, (tmp_eve_buf + 5 * 8)); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 9 * 8)); + ST_SH(loc1, (tmp_eve_buf + 6 * 8)); + ST_SH(loc2, (tmp_eve_buf + 8 * 8)); + ST_SH(loc3, (tmp_eve_buf + 7 * 8)); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 8); + reg1 = LD_SH(tmp_buf + 7 * 8); + reg2 = LD_SH(tmp_buf + 9 * 8); + reg3 = LD_SH(tmp_buf + 15 * 8); + reg4 = LD_SH(tmp_buf + 17 * 8); + reg5 = LD_SH(tmp_buf + 23 * 8); + reg6 = LD_SH(tmp_buf + 25 * 8); + reg7 = LD_SH(tmp_buf + 31 * 8); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf), 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 8); + reg1 = LD_SH(tmp_buf + 5 * 8); + reg2 = LD_SH(tmp_buf + 11 * 8); + reg3 = LD_SH(tmp_buf + 13 * 8); + reg4 = LD_SH(tmp_buf + 19 * 8); + reg5 = LD_SH(tmp_buf + 21 * 8); + reg6 = LD_SH(tmp_buf + 27 * 8); + reg7 = LD_SH(tmp_buf + 29 * 8); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH(reg0, (tmp_odd_buf + 13 * 8)); + ST_SH(reg1, (tmp_odd_buf + 14 * 8)); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); + + /* 3rd & 4th 8x8 */ + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); +} + +static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); +} + +static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); + } + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + for (i = 32; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out_ptr]) \n\t" + "sw $zero, 4(%[out_ptr]) \n\t" + "sw $zero, 8(%[out_ptr]) \n\t" + "sw $zero, 12(%[out_ptr]) \n\t" + "sw $zero, 16(%[out_ptr]) \n\t" + "sw $zero, 20(%[out_ptr]) \n\t" + "sw $zero, 24(%[out_ptr]) \n\t" + "sw $zero, 28(%[out_ptr]) \n\t" + "sw $zero, 32(%[out_ptr]) \n\t" + "sw $zero, 36(%[out_ptr]) \n\t" + "sw $zero, 40(%[out_ptr]) \n\t" + "sw $zero, 44(%[out_ptr]) \n\t" + "sw $zero, 48(%[out_ptr]) \n\t" + "sw $zero, 52(%[out_ptr]) \n\t" + "sw $zero, 56(%[out_ptr]) \n\t" + "sw $zero, 60(%[out_ptr]) \n\t" + + : + : [out_ptr] "r"(out_ptr)); + + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_msa(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c new file mode 100644 index 0000000000..56ffec3cba --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in2, in3, in1); + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + UNPCK_R_SH_SW(in0, in0_r); + UNPCK_R_SH_SW(in2, in2_r); + UNPCK_R_SH_SW(in3, in3_r); + UNPCK_R_SH_SW(in1, in1_r); + SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); + + in0_r += in2_r; + in3_r -= in1_r; + in4_r = (in0_r - in3_r) >> 1; + in1_r = in4_r - in1_r; + in2_r = in4_r - in2_r; + in0_r -= in1_r; + in3_r += in2_r; + + TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); + + in0_r += in1_r; + in2_r -= in3_r; + in4_r = (in0_r - in2_r) >> 1; + in3_r = in4_r - in3_r; + in1_r = in4_r - in1_r; + in0_r -= in3_r; + in2_r += in1_r; + + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); + ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); +} + +void vpx_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t a1, e1; + v8i16 in1, in0 = { 0 }; + + a1 = input[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + + in0 = __msa_insert_h(in0, 0, a1); + in0 = __msa_insert_h(in0, 1, e1); + in0 = __msa_insert_h(in0, 2, e1); + in0 = __msa_insert_h(in0, 3, e1); + + in1 = in0 >> 1; + in0 -= in1; + + ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); +} + +void vpx_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +void vpx_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c new file mode 100644 index 0000000000..a383ff2066 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_msa.h" + +void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* rows transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void vpx_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c new file mode 100644 index 0000000000..835e10e125 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred16_dspr2.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c new file mode 100644 index 0000000000..dce03a2b2a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred4_dspr2.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[resl], (%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + "lbu %[left1], 1(%[left]) \n\t" + "lbu %[left2], 2(%[left]) \n\t" + "lbu %[left3], 3(%[left]) \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + + "preceu.ph.qbl %[abovel], %[resl] \n\t" + "preceu.ph.qbr %[abover], %[resl] \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "replv.ph %[left1], %[left1] \n\t" + "replv.ph %[left2], %[left2] \n\t" + "replv.ph %[left3], %[left3] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[resl], %[abovel], %[left0] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left0] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left1] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left1] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left2] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left2] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left3] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left3] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), + [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), + [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), + [resr] "=&r"(resr), [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c new file mode 100644 index 0000000000..16e7fc5507 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred8_dspr2.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[reshw], (%[above]) \n\t" + "ulw %[top_left], 4(%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + + "preceu.ph.qbl %[abovel], %[reshw] \n\t" + "preceu.ph.qbr %[abover], %[reshw] \n\t" + "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" + "preceu.ph.qbr %[abover_1], %[top_left] \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + "replv.ph %[left0], %[left0] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 1(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 2(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 3(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 4(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 5(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 6(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 7(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), + [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), + [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), + [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), + [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c new file mode 100644 index 0000000000..b5ee943031 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/intrapred_msa.c @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h new file mode 100644 index 0000000000..cbea22f20f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ + \ + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) + +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); +void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output); +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); +void iadst4_dspr2(const int16_t *input, int16_t *output); +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); +void iadst8_dspr2(const int16_t *input, int16_t *output); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); +void iadst16_dspr2(const int16_t *input, int16_t *output); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h new file mode 100644 index 0000000000..3b66249ef2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ + +#include "vpx_dsp/mips/macros_msa.h" +#include "vpx_dsp/mips/txfm_macros_msa.h" +#include "vpx_dsp/txfm_common.h" + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define VP9_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) + +#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } + +#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } + +#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } + +#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) + +/* multiply and add macro */ +#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ + } + +/* idct 8x8 macro */ +#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ + VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } + +#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } + +#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } + +void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output); +void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c new file mode 100644 index 0000000000..44ba65c7ac --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans16_dspr2.c @@ -0,0 +1,1230 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--;) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); + + input += 16; + output += 1; + } +} + +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vpx_ff_cropTbl; + + /* prefetch vpx_ff_cropTbl */ + prefetch_load(vpx_ff_cropTbl); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 128); + prefetch_load(vpx_ff_cropTbl + 160); + prefetch_load(vpx_ff_cropTbl + 192); + prefetch_load(vpx_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); + + input += 16; + } +} + +void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct16_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_cols_add_blk_dspr2(out, dest, stride); +} + +void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + outptr += 2; + } + + // Then transform columns + idct16_cols_add_blk_dspr2(out, dest, stride); +} + +void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst16_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +#endif // HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c new file mode 100644 index 0000000000..3f043b48ba --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -0,0 +1,1119 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vpx_ff_cropTbl; + + /* prefetch vpx_ff_cropTbl */ + prefetch_load(vpx_ff_cropTbl); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 128); + prefetch_load(vpx_ff_cropTbl + 160); + prefetch_load(vpx_ff_cropTbl + 192); + prefetch_load(vpx_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * stride; + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + [step1_23] "r"(step1_23)); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), + [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), + [step2_19] "r"(step2_19)); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c new file mode 100644 index 0000000000..3c0468c00f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans32_dspr2.c @@ -0,0 +1,1218 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" + +#if HAVE_DSPR2 +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = no_rows; i--;) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r"(output)); + + output += 1; + + continue; + } + + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 32)); + prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); + + // final stage + __asm__ __volatile__( + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "add %[temp2], %[step1_2], %[step2_29] \n\t" + "add %[temp3], %[step1_3], %[step2_28] \n\t" + "sub %[load1], %[step1_3], %[step2_28] \n\t" + "sub %[load2], %[step1_2], %[step2_29] \n\t" + "sub %[load3], %[step1_1], %[step2_30] \n\t" + "sub %[load4], %[step1_0], %[step2_31] \n\t" + "sh %[temp0], 0(%[output]) \n\t" + "sh %[temp1], 64(%[output]) \n\t" + "sh %[temp2], 128(%[output]) \n\t" + "sh %[temp3], 192(%[output]) \n\t" + "sh %[load1], 1792(%[output]) \n\t" + "sh %[load2], 1856(%[output]) \n\t" + "sh %[load3], 1920(%[output]) \n\t" + "sh %[load4], 1984(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), + [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), + [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "add %[temp2], %[step1_6], %[step1_25] \n\t" + "add %[temp3], %[step1_7], %[step1_24] \n\t" + "sub %[load1], %[step1_7], %[step1_24] \n\t" + "sub %[load2], %[step1_6], %[step1_25] \n\t" + "sub %[load3], %[step1_5], %[step1_26] \n\t" + "sub %[load4], %[step1_4], %[step1_27] \n\t" + "sh %[temp0], 256(%[output]) \n\t" + "sh %[temp1], 320(%[output]) \n\t" + "sh %[temp2], 384(%[output]) \n\t" + "sh %[temp3], 448(%[output]) \n\t" + "sh %[load1], 1536(%[output]) \n\t" + "sh %[load2], 1600(%[output]) \n\t" + "sh %[load3], 1664(%[output]) \n\t" + "sh %[load4], 1728(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), + [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), + [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "add %[temp2], %[step1_10], %[step1_21] \n\t" + "add %[temp3], %[step1_11], %[step1_20] \n\t" + "sub %[load1], %[step1_11], %[step1_20] \n\t" + "sub %[load2], %[step1_10], %[step1_21] \n\t" + "sub %[load3], %[step1_9], %[step1_22] \n\t" + "sub %[load4], %[step1_8], %[step1_23] \n\t" + "sh %[temp0], 512(%[output]) \n\t" + "sh %[temp1], 576(%[output]) \n\t" + "sh %[temp2], 640(%[output]) \n\t" + "sh %[temp3], 704(%[output]) \n\t" + "sh %[load1], 1280(%[output]) \n\t" + "sh %[load2], 1344(%[output]) \n\t" + "sh %[load3], 1408(%[output]) \n\t" + "sh %[load4], 1472(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), + [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), + [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "add %[temp2], %[step1_14], %[step2_17] \n\t" + "add %[temp3], %[step1_15], %[step2_16] \n\t" + "sub %[load1], %[step1_15], %[step2_16] \n\t" + "sub %[load2], %[step1_14], %[step2_17] \n\t" + "sub %[load3], %[step1_13], %[step2_18] \n\t" + "sub %[load4], %[step1_12], %[step2_19] \n\t" + "sh %[temp0], 768(%[output]) \n\t" + "sh %[temp1], 832(%[output]) \n\t" + "sh %[temp2], 896(%[output]) \n\t" + "sh %[temp3], 960(%[output]) \n\t" + "sh %[load1], 1024(%[output]) \n\t" + "sh %[load2], 1088(%[output]) \n\t" + "sh %[load3], 1152(%[output]) \n\t" + "sh %[load4], 1216(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), + [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), + [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [output] "r"(output)); + + input += 32; + output += 1; + } +} + +void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 32); + + // Columns + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); +} + +void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + } + + // Columns + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); +} + +void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=&r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c new file mode 100644 index 0000000000..e214b538d4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { + int step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); + + input += 4; + output += 1; + } +} + +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int stride) { + int step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + const int const_255 = 255; + int i; + uint8_t *dest_pix; + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "add %[Temp0], %[step_1], %[step_2] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); + + input += 4; + } +} + +void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + vpx_idct4_rows_dspr2(input, outptr); + + // Columns + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); +} + +void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 3; + a12 = a1 - (a11 * 7); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst4_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // 32-bit result is enough for the following multiplications. + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c new file mode 100644 index 0000000000..d4d246965c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans8_dspr2.c @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/inv_txfm_dspr2.h" +#include "vpx_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); + + input += 8; + output += 1; + } +} + +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + const int const_255 = 255; + uint8_t *dest_pix; + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); + + input += 8; + } +} + +void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, stride); +} + +void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, stride); +} + +void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 2; + a12 = a1 - (a11 * 3); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst8_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} +#endif // HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 0000000000..b1731f2345 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1489 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + hz_lpf_t16_16w(src, pitch, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); + } +} + +void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); + q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); + q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); + q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); + q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); + q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); + q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); + q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); + tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); + + ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); + tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); + + ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (0 == early_exit) { + transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 0000000000..0eff2b6ca9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/loopfilter_msa.h" + +void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 0000000000..703fcce8a7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/loopfilter_msa.h" + +void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void vpx_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + tmp = (v16u8)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + tmp = (v16u8)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + +void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + vec0 = (v8i16)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + vec0 = (v8i16)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + vec0 = (v8i16)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 0000000000..f1743679a7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/mips/common_dspr2.h" +#include "vpx_dsp/mips/loopfilter_filters_dspr2.h" +#include "vpx_dsp/mips/loopfilter_macros_dspr2.h" +#include "vpx_dsp/mips/loopfilter_masks_dspr2.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_DSPR2 +void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + /* loop filter designed to work using chars so that we can make maximum use + of 8 bit simd instructions. */ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + + __asm__ __volatile__( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); + + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + } + } + + s = s + 4; + } +} + +void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + +void vpx_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 0000000000..ec339be868 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vpx_filter &= hev; */ + "and %[vpx_filter_l], %[vpx_filter_l], %[hev_l] \n\t" + "and %[vpx_filter_r], %[vpx_filter_r], %[hev_r] \n\t" + + /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + + /* vpx_filter &= mask; */ + "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" + "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" + + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vpx_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vpx_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); + + __asm__ __volatile__( + /* (vpx_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vpx_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vpx_filter &= hev; */ + "and %[vpx_filter_l], %[vpx_filter_l], %[hev_l] \n\t" + "and %[vpx_filter_r], %[vpx_filter_r], %[hev_r] \n\t" + + /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vpx_filter_l], %[vpx_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vpx_filter_r], %[vpx_filter_r], %[subr_r] \n\t" + + /* vpx_filter &= mask; */ + "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" + "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" + + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vpx_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vpx_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); + + __asm__ __volatile__( + /* (vpx_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vpx_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph %[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 0000000000..9af0b42360 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } + +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h new file mode 100644 index 0000000000..24c492bea0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c new file mode 100644 index 0000000000..e42479257c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/mips/common_dspr2.h" +#include "vpx_dsp/mips/loopfilter_filters_dspr2.h" +#include "vpx_dsp/mips/loopfilter_macros_dspr2.h" +#include "vpx_dsp/mips/loopfilter_masks_dspr2.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_DSPR2 +void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 0000000000..9c1f5143f2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/mips/common_dspr2.h" +#include "vpx_dsp/mips/loopfilter_filters_dspr2.h" +#include "vpx_dsp/mips/loopfilter_macros_dspr2.h" +#include "vpx_dsp/mips/loopfilter_masks_dspr2.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 0000000000..96e8d8858a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/mips/common_dspr2.h" +#include "vpx_dsp/mips/loopfilter_filters_dspr2.h" +#include "vpx_dsp/mips/loopfilter_macros_dspr2.h" +#include "vpx_dsp/mips/loopfilter_masks_dspr2.h" +#include "vpx_mem/vpx_mem.h" + +#if HAVE_DSPR2 +void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q26_3 q7_3 + q6 q4_2 q5_2 q26_2 q7_2 + q5 q4_1 q5_1 q26_1 q7_1 + q4 q4_0 q5_0 q26_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h new file mode 100644 index 0000000000..1ea05e0b0b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ + +#include "vpx_dsp/mips/macros_msa.h" + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h new file mode 100644 index 0000000000..53462b59f4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h @@ -0,0 +1,1971 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) + +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ + }) + +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] "m"(*psrc_lh_m)); \ + \ + val_lh_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vector elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) + +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) + +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) + +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) + +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) + +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) + +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Store vectors with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) + +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) + +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ + }) + +/* Description : Horizontal addition of 4 unsigned word elements + Arguments : Input - in (unsigned word vector) + Output - sum_m (u32 sum) + Return Type - unsigned word (GP) + Details : 4 unsigned word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Input - in (unsigned halfword vector) + Output - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of 'in' vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. +*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) +#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } +#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } +#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_2V(in0, in1, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + } + +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } + +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } +#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. +*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 0000000000..7f5882bca3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,807 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c new file mode 100644 index 0000000000..b0f8ff1fd9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c @@ -0,0 +1,804 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + v4u32 sad; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_UW_U32(sad); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + return HADD_SW_S32(sad); +} + +#define VPX_SAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ + void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ + void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ + void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ + void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ + void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +// 64x64 +VPX_SAD_64xHEIGHT_MSA(64); +VPX_SAD_64xHEIGHTx4D_MSA(64); +VPX_AVGSAD_64xHEIGHT_MSA(64); + +// 64x32 +VPX_SAD_64xHEIGHT_MSA(32); +VPX_SAD_64xHEIGHTx4D_MSA(32); +VPX_AVGSAD_64xHEIGHT_MSA(32); + +// 32x64 +VPX_SAD_32xHEIGHT_MSA(64); +VPX_SAD_32xHEIGHTx4D_MSA(64); +VPX_AVGSAD_32xHEIGHT_MSA(64); + +// 32x32 +VPX_SAD_32xHEIGHT_MSA(32); +VPX_SAD_32xHEIGHTx4D_MSA(32); +VPX_AVGSAD_32xHEIGHT_MSA(32); + +// 32x16 +VPX_SAD_32xHEIGHT_MSA(16); +VPX_SAD_32xHEIGHTx4D_MSA(16); +VPX_AVGSAD_32xHEIGHT_MSA(16); + +// 16x32 +VPX_SAD_16xHEIGHT_MSA(32); +VPX_SAD_16xHEIGHTx4D_MSA(32); +VPX_AVGSAD_16xHEIGHT_MSA(32); + +// 16x16 +VPX_SAD_16xHEIGHT_MSA(16); +VPX_SAD_16xHEIGHTx4D_MSA(16); +VPX_AVGSAD_16xHEIGHT_MSA(16); + +// 16x8 +VPX_SAD_16xHEIGHT_MSA(8); +VPX_SAD_16xHEIGHTx4D_MSA(8); +VPX_AVGSAD_16xHEIGHT_MSA(8); + +// 8x16 +VPX_SAD_8xHEIGHT_MSA(16); +VPX_SAD_8xHEIGHTx4D_MSA(16); +VPX_AVGSAD_8xHEIGHT_MSA(16); + +// 8x8 +VPX_SAD_8xHEIGHT_MSA(8); +VPX_SAD_8xHEIGHTx4D_MSA(8); +VPX_AVGSAD_8xHEIGHT_MSA(8); + +// 8x4 +VPX_SAD_8xHEIGHT_MSA(4); +VPX_SAD_8xHEIGHTx4D_MSA(4); +VPX_AVGSAD_8xHEIGHT_MSA(4); + +// 4x8 +VPX_SAD_4xHEIGHT_MSA(8); +VPX_SAD_4xHEIGHTx4D_MSA(8); +VPX_AVGSAD_4xHEIGHT_MSA(8); + +// 4x4 +VPX_SAD_4xHEIGHT_MSA(4); +VPX_SAD_4xHEIGHTx4D_MSA(4); +VPX_AVGSAD_4xHEIGHT_MSA(4); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 0000000000..572fcabfc0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1789 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/mips/macros_msa.h" +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters_msa[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + (sub) += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); + +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16); + +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32); + +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64); + +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); + +#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8); + +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16); + +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32); + +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16); +VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); + +uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t x_offset, int32_t y_offset, + const uint8_t *ref_ptr, + int32_t ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; + + if (y_offset) { + if (x_offset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (x_offset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 0000000000..8bd7e6977c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : [src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c new file mode 100644 index 0000000000..391a7ebf66 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_msa.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c new file mode 100644 index 0000000000..d4563dc410 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "./macros_msa.h" + +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride, + int size) { + int row, col; + uint64_t ss_res = 0; + v4i32 mul0, mul1; + v2i64 res0 = { 0 }; + + if (4 == size) { + uint64_t src0, src1, src2, src3; + v8i16 diff0 = { 0 }; + v8i16 diff1 = { 0 }; + + LD4(src, src_stride, src0, src1, src2, src3); + INSERT_D2_SH(src0, src1, diff0); + INSERT_D2_SH(src2, src3, diff1); + DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (8 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (16 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (0 == (size % 16)) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + for (row = 0; row < (size >> 4); row++) { + for (col = 0; col < size; col += 16) { + const int16_t *src_ptr = src + col; + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + src_ptr += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + } + + src += 16 * src_stride; + } + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else { + int16_t val; + + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + val = src[col]; + ss_res += val * val; + } + + src += src_stride; + } + } + + return ss_res; +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h new file mode 100644 index 0000000000..f27504a207 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ + +#include "vpx_dsp/mips/macros_msa.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \ + \ + k0_m = __msa_fill_h(cnst0); \ + k1_m = __msa_fill_h(cnst1); \ + k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \ + k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \ + k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \ + \ + ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \ + s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \ + s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } + +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) + +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } + +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } +#endif // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 0000000000..c2adcfa018 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1357 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. */ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), + [sum]"=&r"(sum) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), + [sum]"=&r"(sum) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + double ff_ph_40, mask; + double filter_x0, filter_x1, filter_y0, filter_y1; + mips_reg tmp[2]; + uint64_t x0, x1, y0, y1, all; + + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[16 * (H)]; \ + var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[8 * (H)]; \ + var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp6]) + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter), + [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[4 * (H)]; \ + var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c new file mode 100644 index 0000000000..444b086a6e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v4i32 err0 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, err0); + + return HADD_SW_S32(err0); +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +VPX_VARIANCE_WDXHT_MSA(4, 4); +VPX_VARIANCE_WDXHT_MSA(4, 8); + +VPX_VARIANCE_WDXHT_MSA(8, 4) +VPX_VARIANCE_WDXHT_MSA(8, 8) +VPX_VARIANCE_WDXHT_MSA(8, 16) + +VPX_VARIANCE_WDXHT_MSA(16, 8) +VPX_VARIANCE_WDXHT_MSA(16, 16) +VPX_VARIANCE_WDXHT_MSA(16, 32) + +VPX_VARIANCE_WDXHT_MSA(32, 16) +VPX_VARIANCE_WDXHT_MSA(32, 32) + +uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c new file mode 100644 index 0000000000..5b5a1cbc3a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0 = { 0 }, res; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, res0, res1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); + } +} + +static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 32: + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c new file mode 100644 index 0000000000..ba816192a1 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -0,0 +1,611 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +static void common_hv_8ht_8vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, + dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 dst0 = { 0 }, out; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, + tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c new file mode 100644 index 0000000000..e6a790dfc6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0 = { 0 }, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0 = { 0 }, dst1 = { 0 }; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, + dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, + dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); + src += src_stride; + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x8_UB(src2110, src4332, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB2(src, 16, src0, src5); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c new file mode 100644 index 0000000000..792c0f709c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + SRARI_H2_SH(out0, out1, FILTER_BITS); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c new file mode 100644 index 0000000000..cb7bca5589 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx_ports/mem.h" + +#define GET_DATA_H_MMI \ + "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ + "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \ + "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \ + "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ + "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srch], %[srch], %[filter10] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" + +/* clang-format off */ +#define ROUND_POWER_OF_TWO_MMI \ + /* Add para[0] */ \ + "lw %[tmp0], 0x00(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp6]) \ + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \ + "paddw %[srch], %[srch], %[ftmp6] \n\t" \ + /* Arithmetic right shift para[1] bits */ \ + "lw %[tmp0], 0x04(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp5]) \ + "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \ + "psraw %[srch], %[srch], %[ftmp5] \n\t" +/* clang-format on */ + +#define CLIP_PIXEL_MMI \ + /* Staturated operation */ \ + "packsswh %[srcl], %[srcl], %[srch] \n\t" \ + "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t" + +static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[5]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + /* clang-format off */ + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w & 0x03) { + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + 64, filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } else { + convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } +} + +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); + else + convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); + else + convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + else + convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c new file mode 100644 index 0000000000..c942167587 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -0,0 +1,1227 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + out0 = out2; + out1 = out3; + out2 = out4; + } +} + +static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 8 here. + y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c new file mode 100644 index 0000000000..195228689e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" + +static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 48); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c new file mode 100644 index 0000000000..ce649935da --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + out2 = __msa_copy_u_w((v4i32)dst2, 0); + out3 = __msa_copy_u_w((v4i32)dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_d((v2i64)dst0, 0); + out1 = __msa_copy_u_d((v2i64)dst1, 0); + out2 = __msa_copy_u_d((v2i64)dst2, 0); + out3 = __msa_copy_u_d((v2i64)dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void avg_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 8); cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); + LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); + dst_dup += (4 * dst_stride); + LD_UB4(src, src_stride, src8, src10, src12, src14); + LD_UB4(src + 16, src_stride, src9, src11, src13, src15); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); + LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); + dst_dup += (4 * dst_stride); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); + dst += (4 * dst_stride); + ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); + ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(src, 16, src4, src5, src6, src7); + src += src_stride; + LD_UB4(src, 16, src8, src9, src10, src11); + src += src_stride; + LD_UB4(src, 16, src12, src13, src14, src15); + src += src_stride; + + LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); + dst_dup += dst_stride; + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += dst_stride; + ST_UB4(dst4, dst5, dst6, dst7, dst, 16); + dst += dst_stride; + ST_UB4(dst8, dst9, dst10, dst11, dst, 16); + dst += dst_stride; + ST_UB4(dst12, dst13, dst14, dst15, dst, 16); + dst += dst_stride; + } +} + +void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + avg_width4_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 8: { + avg_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c new file mode 100644 index 0000000000..c2ab33a2f4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, + dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + uint32_t cnt, tmp; + /* 1 word storage */ + for (cnt = h; cnt--;) { + tmp = LW(src); + SW(tmp, dst); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h new file mode 100644 index 0000000000..a0280c5434 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ + +#include "vpx_dsp/mips/macros_msa.h" +#include "vpx_dsp/vpx_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) + +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } + +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } +#endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/postproc.h b/media/libvpx/libvpx/vpx_dsp/postproc.h new file mode 100644 index 0000000000..37f993f814 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/postproc.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_POSTPROC_H_ +#define VPX_VPX_DSP_POSTPROC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Fills a noise buffer with gaussian noise strength determined by sigma. +int vpx_setup_noise(double sigma, int8_t *noise, int size); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VPX_DSP_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h new file mode 100644 index 0000000000..7ac873f9fc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + int32x4_t u = vec_vsx_ld(c, s); + int32x4_t v = vec_vsx_ld(c, s + 4); + return vec_packs(u, v); +#else + return vec_vsx_ld(c, s); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + const int16x8_t one = vec_splat_s16(1); + const int32x4_t even = vec_mule(v, one); + const int32x4_t odd = vec_mulo(v, one); + const int32x4_t high = vec_mergeh(even, odd); + const int32x4_t low = vec_mergel(even, odd); + vec_vsx_st(high, c, s); + vec_vsx_st(low, c, s + 4); +#else + vec_vsx_st(v, c, s); +#endif +} + +#endif // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 0000000000..2129911696 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. +static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. + for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. + d15_odd = vec_mulo(window[15], window[15]); + d15_even = vec_mule(window[15], window[15]); + d0_odd = vec_mulo(window[0], window[0]); + d0_even = vec_mule(window[0], window[0]); + sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd)); + sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even)); + + // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4 + filtered = filter_s16(rv, sum, window[8]); + + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(window[8], filtered, mask); + + // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per + // iteration + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch), + load_merge); + vec_vsx_st(out, 0, dst + row * pitch); + + // Optimization Note: Turns out that the following loop is faster than + // using pointers to manage the sliding window. + for (i = 1; i < 16; i++) { + window[i - 1] = window[i]; + } + } + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c new file mode 100644 index 0000000000..328b0e3130 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/txfm_common_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14. +static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add, + int16x8_t *sub) { + // Since a + b can overflow 16 bits, the multiplication is distributed + // (a * c +/- b * c). + const int32x4_t ac_e = vec_mule(a, cospi16_v); + const int32x4_t ac_o = vec_mulo(a, cospi16_v); + const int32x4_t bc_e = vec_mule(b, cospi16_v); + const int32x4_t bc_o = vec_mulo(b, cospi16_v); + + // Reuse the same multiplies for sum and difference. + const int32x4_t sum_e = vec_add(ac_e, bc_e); + const int32x4_t sum_o = vec_add(ac_o, bc_o); + const int32x4_t diff_e = vec_sub(ac_e, bc_e); + const int32x4_t diff_o = vec_sub(ac_o, bc_o); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute. + *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14 +static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b, + int16x8_t c2, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t ac1_o = vec_mulo(a, c1); + const int32x4_t ac1_e = vec_mule(a, c1); + const int32x4_t ac2_o = vec_mulo(a, c2); + const int32x4_t ac2_e = vec_mule(a, c2); + + const int32x4_t bc1_o = vec_mulo(b, c1); + const int32x4_t bc1_e = vec_mule(b, c1); + const int32x4_t bc2_o = vec_mulo(b, c2); + const int32x4_t bc2_e = vec_mule(b, c2); + + const int32x4_t sum_o = vec_add(ac1_o, bc2_o); + const int32x4_t sum_e = vec_add(ac1_e, bc2_e); + const int32x4_t diff_o = vec_sub(ac2_o, bc1_o); + const int32x4_t diff_e = vec_sub(ac2_e, bc1_e); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute. + *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// While other architecture combine the load and the stage 1 operations, Power9 +// benchmarking show no benefit in such an approach. +static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + // Tried out different combinations of load and shift instructions, this is + // the fastest one. + { + const int16x8_t l0 = vec_vsx_ld(0, a); + const int16x8_t l1 = vec_vsx_ld(0, a + stride); + const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride); + const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride); + const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride); + const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride); + const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride); + const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride); + + const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride); + const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride); + const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride); + const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride); + const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride); + const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride); + const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride); + const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride); + + b[0] = vec_sl(l0, vec_dct_scale_log2); + b[1] = vec_sl(l1, vec_dct_scale_log2); + b[2] = vec_sl(l2, vec_dct_scale_log2); + b[3] = vec_sl(l3, vec_dct_scale_log2); + b[4] = vec_sl(l4, vec_dct_scale_log2); + b[5] = vec_sl(l5, vec_dct_scale_log2); + b[6] = vec_sl(l6, vec_dct_scale_log2); + b[7] = vec_sl(l7, vec_dct_scale_log2); + + b[8] = vec_sl(l8, vec_dct_scale_log2); + b[9] = vec_sl(l9, vec_dct_scale_log2); + b[10] = vec_sl(l10, vec_dct_scale_log2); + b[11] = vec_sl(l11, vec_dct_scale_log2); + b[12] = vec_sl(l12, vec_dct_scale_log2); + b[13] = vec_sl(l13, vec_dct_scale_log2); + b[14] = vec_sl(l14, vec_dct_scale_log2); + b[15] = vec_sl(l15, vec_dct_scale_log2); + } + { + const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride); + const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride); + const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride); + const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride); + const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride); + const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride); + const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride); + const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride); + + const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride); + const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride); + const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride); + const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride); + const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride); + const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride); + const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride); + const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride); + + b[16] = vec_sl(l16, vec_dct_scale_log2); + b[17] = vec_sl(l17, vec_dct_scale_log2); + b[18] = vec_sl(l18, vec_dct_scale_log2); + b[19] = vec_sl(l19, vec_dct_scale_log2); + b[20] = vec_sl(l20, vec_dct_scale_log2); + b[21] = vec_sl(l21, vec_dct_scale_log2); + b[22] = vec_sl(l22, vec_dct_scale_log2); + b[23] = vec_sl(l23, vec_dct_scale_log2); + + b[24] = vec_sl(l24, vec_dct_scale_log2); + b[25] = vec_sl(l25, vec_dct_scale_log2); + b[26] = vec_sl(l26, vec_dct_scale_log2); + b[27] = vec_sl(l27, vec_dct_scale_log2); + b[28] = vec_sl(l28, vec_dct_scale_log2); + b[29] = vec_sl(l29, vec_dct_scale_log2); + b[30] = vec_sl(l30, vec_dct_scale_log2); + b[31] = vec_sl(l31, vec_dct_scale_log2); + } +} + +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + vec_vsx_st(b[0], 0, a); + vec_vsx_st(b[8], 0, a + 8); + vec_vsx_st(b[16], 0, a + 16); + vec_vsx_st(b[24], 0, a + 24); + + vec_vsx_st(b[1], 0, a + 32); + vec_vsx_st(b[9], 0, a + 40); + vec_vsx_st(b[17], 0, a + 48); + vec_vsx_st(b[25], 0, a + 56); + + vec_vsx_st(b[2], 0, a + 64); + vec_vsx_st(b[10], 0, a + 72); + vec_vsx_st(b[18], 0, a + 80); + vec_vsx_st(b[26], 0, a + 88); + + vec_vsx_st(b[3], 0, a + 96); + vec_vsx_st(b[11], 0, a + 104); + vec_vsx_st(b[19], 0, a + 112); + vec_vsx_st(b[27], 0, a + 120); + + vec_vsx_st(b[4], 0, a + 128); + vec_vsx_st(b[12], 0, a + 136); + vec_vsx_st(b[20], 0, a + 144); + vec_vsx_st(b[28], 0, a + 152); + + vec_vsx_st(b[5], 0, a + 160); + vec_vsx_st(b[13], 0, a + 168); + vec_vsx_st(b[21], 0, a + 176); + vec_vsx_st(b[29], 0, a + 184); + + vec_vsx_st(b[6], 0, a + 192); + vec_vsx_st(b[14], 0, a + 200); + vec_vsx_st(b[22], 0, a + 208); + vec_vsx_st(b[30], 0, a + 216); + + vec_vsx_st(b[7], 0, a + 224); + vec_vsx_st(b[15], 0, a + 232); + vec_vsx_st(b[23], 0, a + 240); + vec_vsx_st(b[31], 0, a + 248); +} + +// Returns 1 if negative 0 if positive +static INLINE int16x8_t vec_sign_s16(int16x8_t a) { + return vec_sr(a, vec_shift_sign_s16); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2); +} + +static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) { + int16x8_t temp0[32]; // Hold stages: 1, 4, 7 + int16x8_t temp1[32]; // Hold stages: 2, 5 + int16x8_t temp2[32]; // Hold stages: 3, 6 + int i; + + // Stage 1 + // Unrolling this loops actually slows down Power9 benchmarks + for (i = 0; i < 16; i++) { + temp0[i] = vec_add(in[i], in[31 - i]); + // pass through to stage 3. + temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]); + } + + // Stage 2 + // Unrolling this loops actually slows down Power9 benchmarks + for (i = 0; i < 8; i++) { + temp1[i] = vec_add(temp0[i], temp0[15 - i]); + temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]); + } + + // Apply butterflies (in place) on pass through to stage 3. + single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]); + single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]); + single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]); + single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]); + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. + if (pass) { + temp1[0] = add_round_shift_s16(temp1[0]); + temp1[1] = add_round_shift_s16(temp1[1]); + temp1[2] = add_round_shift_s16(temp1[2]); + temp1[3] = add_round_shift_s16(temp1[3]); + temp1[4] = add_round_shift_s16(temp1[4]); + temp1[5] = add_round_shift_s16(temp1[5]); + temp1[6] = add_round_shift_s16(temp1[6]); + temp1[7] = add_round_shift_s16(temp1[7]); + temp1[8] = add_round_shift_s16(temp1[8]); + temp1[9] = add_round_shift_s16(temp1[9]); + temp1[10] = add_round_shift_s16(temp1[10]); + temp1[11] = add_round_shift_s16(temp1[11]); + temp1[12] = add_round_shift_s16(temp1[12]); + temp1[13] = add_round_shift_s16(temp1[13]); + temp1[14] = add_round_shift_s16(temp1[14]); + temp1[15] = add_round_shift_s16(temp1[15]); + + temp1[16] = add_round_shift_s16(temp1[16]); + temp1[17] = add_round_shift_s16(temp1[17]); + temp1[18] = add_round_shift_s16(temp1[18]); + temp1[19] = add_round_shift_s16(temp1[19]); + temp1[20] = add_round_shift_s16(temp1[20]); + temp1[21] = add_round_shift_s16(temp1[21]); + temp1[22] = add_round_shift_s16(temp1[22]); + temp1[23] = add_round_shift_s16(temp1[23]); + temp1[24] = add_round_shift_s16(temp1[24]); + temp1[25] = add_round_shift_s16(temp1[25]); + temp1[26] = add_round_shift_s16(temp1[26]); + temp1[27] = add_round_shift_s16(temp1[27]); + temp1[28] = add_round_shift_s16(temp1[28]); + temp1[29] = add_round_shift_s16(temp1[29]); + temp1[30] = add_round_shift_s16(temp1[30]); + temp1[31] = add_round_shift_s16(temp1[31]); + } + + // Stage 3 + temp2[0] = vec_add(temp1[0], temp1[7]); + temp2[1] = vec_add(temp1[1], temp1[6]); + temp2[2] = vec_add(temp1[2], temp1[5]); + temp2[3] = vec_add(temp1[3], temp1[4]); + temp2[5] = vec_sub(temp1[2], temp1[5]); + temp2[6] = vec_sub(temp1[1], temp1[6]); + temp2[8] = temp1[8]; + temp2[9] = temp1[9]; + + single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]); + single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]); + temp2[14] = temp1[14]; + temp2[15] = temp1[15]; + + temp2[18] = vec_add(temp1[18], temp1[21]); + temp2[19] = vec_add(temp1[19], temp1[20]); + + temp2[20] = vec_sub(temp1[19], temp1[20]); + temp2[21] = vec_sub(temp1[18], temp1[21]); + + temp2[26] = vec_sub(temp1[29], temp1[26]); + temp2[27] = vec_sub(temp1[28], temp1[27]); + + temp2[28] = vec_add(temp1[28], temp1[27]); + temp2[29] = vec_add(temp1[29], temp1[26]); + + // Pass through Stage 4 + temp0[7] = vec_sub(temp1[0], temp1[7]); + temp0[4] = vec_sub(temp1[3], temp1[4]); + temp0[16] = vec_add(temp1[16], temp1[23]); + temp0[17] = vec_add(temp1[17], temp1[22]); + temp0[22] = vec_sub(temp1[17], temp1[22]); + temp0[23] = vec_sub(temp1[16], temp1[23]); + temp0[24] = vec_sub(temp1[31], temp1[24]); + temp0[25] = vec_sub(temp1[30], temp1[25]); + temp0[30] = vec_add(temp1[30], temp1[25]); + temp0[31] = vec_add(temp1[31], temp1[24]); + + // Stage 4 + temp0[0] = vec_add(temp2[0], temp2[3]); + temp0[1] = vec_add(temp2[1], temp2[2]); + temp0[2] = vec_sub(temp2[1], temp2[2]); + temp0[3] = vec_sub(temp2[0], temp2[3]); + single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]); + + temp0[9] = vec_add(temp2[9], temp2[10]); + temp0[10] = vec_sub(temp2[9], temp2[10]); + temp0[13] = vec_sub(temp2[14], temp2[13]); + temp0[14] = vec_add(temp2[14], temp2[13]); + + double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29], + &temp0[18]); + double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28], + &temp0[19]); + double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27], + &temp0[20]); + double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26], + &temp0[21]); + + // Pass through Stage 5 + temp1[8] = vec_add(temp2[8], temp2[11]); + temp1[11] = vec_sub(temp2[8], temp2[11]); + temp1[12] = vec_sub(temp2[15], temp2[12]); + temp1[15] = vec_add(temp2[15], temp2[12]); + + // Stage 5 + // 0 and 1 pass through to 0 and 16 at the end + single_butterfly(temp0[0], temp0[1], &out[0], &out[16]); + + // 2 and 3 pass through to 8 and 24 at the end + double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]); + + temp1[4] = vec_add(temp0[4], temp0[5]); + temp1[5] = vec_sub(temp0[4], temp0[5]); + temp1[6] = vec_sub(temp0[7], temp0[6]); + temp1[7] = vec_add(temp0[7], temp0[6]); + + double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14], + &temp1[9]); + double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13], + &temp1[10]); + + temp1[17] = vec_add(temp0[17], temp0[18]); + temp1[18] = vec_sub(temp0[17], temp0[18]); + + temp1[21] = vec_sub(temp0[22], temp0[21]); + temp1[22] = vec_add(temp0[22], temp0[21]); + + temp1[25] = vec_add(temp0[25], temp0[26]); + temp1[26] = vec_sub(temp0[25], temp0[26]); + + temp1[29] = vec_sub(temp0[30], temp0[29]); + temp1[30] = vec_add(temp0[30], temp0[29]); + + // Pass through Stage 6 + temp2[16] = vec_add(temp0[16], temp0[19]); + temp2[19] = vec_sub(temp0[16], temp0[19]); + temp2[20] = vec_sub(temp0[23], temp0[20]); + temp2[23] = vec_add(temp0[23], temp0[20]); + temp2[24] = vec_add(temp0[24], temp0[27]); + temp2[27] = vec_sub(temp0[24], temp0[27]); + temp2[28] = vec_sub(temp0[31], temp0[28]); + temp2[31] = vec_add(temp0[31], temp0[28]); + + // Stage 6 + // 4 and 7 pass through to 4 and 28 at the end + double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]); + // 5 and 6 pass through to 20 and 12 at the end + double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20], + &out[12]); + temp2[8] = vec_add(temp1[8], temp1[9]); + temp2[9] = vec_sub(temp1[8], temp1[9]); + temp2[10] = vec_sub(temp1[11], temp1[10]); + temp2[11] = vec_add(temp1[11], temp1[10]); + temp2[12] = vec_add(temp1[12], temp1[13]); + temp2[13] = vec_sub(temp1[12], temp1[13]); + temp2[14] = vec_sub(temp1[15], temp1[14]); + temp2[15] = vec_add(temp1[15], temp1[14]); + + double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30], + &temp2[17]); + double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29], + &temp2[18]); + double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26], + &temp2[21]); + double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25], + &temp2[22]); + + // Stage 7 + double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]); + double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18], + &out[14]); + double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10], + &out[22]); + double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26], + &out[6]); + + temp0[16] = vec_add(temp2[16], temp2[17]); + temp0[17] = vec_sub(temp2[16], temp2[17]); + temp0[18] = vec_sub(temp2[19], temp2[18]); + temp0[19] = vec_add(temp2[19], temp2[18]); + temp0[20] = vec_add(temp2[20], temp2[21]); + temp0[21] = vec_sub(temp2[20], temp2[21]); + temp0[22] = vec_sub(temp2[23], temp2[22]); + temp0[23] = vec_add(temp2[23], temp2[22]); + temp0[24] = vec_add(temp2[24], temp2[25]); + temp0[25] = vec_sub(temp2[24], temp2[25]); + temp0[26] = vec_sub(temp2[27], temp2[26]); + temp0[27] = vec_add(temp2[27], temp2[26]); + temp0[28] = vec_add(temp2[28], temp2[29]); + temp0[29] = vec_sub(temp2[28], temp2[29]); + temp0[30] = vec_sub(temp2[31], temp2[30]); + temp0[31] = vec_add(temp2[31], temp2[30]); + + // Final stage --- outputs indices are bit-reversed. + double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1], + &out[31]); + double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17], + &out[15]); + double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9], + &out[23]); + double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25], + &out[7]); + double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5], + &out[27]); + double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21], + &out[11]); + double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13], + &out[19]); + double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29], + &out[3]); + + if (pass == 0) { + for (i = 0; i < 32; i++) { + out[i] = sub_round_shift(out[i]); + } + } +} + +void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + int16x8_t temp6[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + fdct32_vsx(temp0, temp1, 0); + + load(input + 8, stride, temp0); + fdct32_vsx(temp0, temp2, 0); + + load(input + 16, stride, temp0); + fdct32_vsx(temp0, temp3, 0); + + load(input + 24, stride, temp0); + fdct32_vsx(temp0, temp4, 0); + + // Generate the top row by munging the first set of 8 from each one + // together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out, temp6); + + // Second row of 8x32. + transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 8 * 32, temp6); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 16 * 32, temp6); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 24 * 32, temp6); +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c new file mode 100644 index 0000000000..e279b30478 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { + const int16x8_t b0 = vec_add(v[0], v[1]); + const int16x8_t b1 = vec_sub(v[0], v[1]); + const int16x8_t b2 = vec_add(v[2], v[3]); + const int16x8_t b3 = vec_sub(v[2], v[3]); + const int16x8_t b4 = vec_add(v[4], v[5]); + const int16x8_t b5 = vec_sub(v[4], v[5]); + const int16x8_t b6 = vec_add(v[6], v[7]); + const int16x8_t b7 = vec_sub(v[6], v[7]); + + const int16x8_t c0 = vec_add(b0, b2); + const int16x8_t c1 = vec_add(b1, b3); + const int16x8_t c2 = vec_sub(b0, b2); + const int16x8_t c3 = vec_sub(b1, b3); + const int16x8_t c4 = vec_add(b4, b6); + const int16x8_t c5 = vec_add(b5, b7); + const int16x8_t c6 = vec_sub(b4, b6); + const int16x8_t c7 = vec_sub(b5, b7); + + v[0] = vec_add(c0, c4); + v[1] = vec_sub(c2, c6); + v[2] = vec_sub(c0, c4); + v[3] = vec_add(c2, c6); + v[4] = vec_add(c3, c7); + v[5] = vec_sub(c3, c7); + v[6] = vec_sub(c1, c5); + v[7] = vec_add(c1, c5); +} + +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t v[8]; + + v[0] = vec_vsx_ld(0, src_diff); + v[1] = vec_vsx_ld(0, src_diff + src_stride); + v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride)); + v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride)); + v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride)); + v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride)); + v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride)); + v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride)); + + vpx_hadamard_s16_8x8_one_pass(v); + + vpx_transpose_s16_8x8(v); + + vpx_hadamard_s16_8x8_one_pass(v); + + store_tran_low(v[0], 0, coeff); + store_tran_low(v[1], 0, coeff + 8); + store_tran_low(v[2], 0, coeff + 16); + store_tran_low(v[3], 0, coeff + 24); + store_tran_low(v[4], 0, coeff + 32); + store_tran_low(v[5], 0, coeff + 40); + store_tran_low(v[6], 0, coeff + 48); + store_tran_low(v[7], 0, coeff + 56); +} + +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + const uint16x8_t ones = vec_splat_u16(1); + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); + /* Top right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + /* Overlay the 8x8 blocks and combine. */ + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low(0, coeff); + const int16x8_t a1 = load_tran_low(0, coeff + 64); + const int16x8_t a2 = load_tran_low(0, coeff + 128); + const int16x8_t a3 = load_tran_low(0, coeff + 192); + + /* Prevent the result from escaping int16_t. */ + const int16x8_t b0 = vec_sra(a0, ones); + const int16x8_t b1 = vec_sra(a1, ones); + const int16x8_t b2 = vec_sra(a2, ones); + const int16x8_t b3 = vec_sra(a3, ones); + + const int16x8_t c0 = vec_add(b0, b1); + const int16x8_t c2 = vec_add(b2, b3); + const int16x8_t c1 = vec_sub(b0, b1); + const int16x8_t c3 = vec_sub(b2, b3); + + const int16x8_t d0 = vec_add(c0, c2); + const int16x8_t d1 = vec_add(c1, c3); + const int16x8_t d2 = vec_sub(c0, c2); + const int16x8_t d3 = vec_sub(c1, c3); + + store_tran_low(d0, 0, coeff); + store_tran_low(d1, 0, coeff + 64); + store_tran_low(d2, 0, coeff + 128); + store_tran_low(d3, 0, coeff + 192); + + coeff += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c new file mode 100644 index 0000000000..a4c8322ff2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(d, 0, dst); + } +} + +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, above); + const uint8x16_t d1 = vec_vsx_ld(16, above); + int i; + (void)left; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(d0, 0, dst); + vec_vsx_st(d1, 16, dst); + } +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + +void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + (void)above; + + vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); +} + +void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + (void)above; + + vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); +} +#endif + +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + const uint8x16_t v8 = vec_splat(d, 8); + const uint8x16_t v9 = vec_splat(d, 9); + const uint8x16_t v10 = vec_splat(d, 10); + const uint8x16_t v11 = vec_splat(d, 11); + + const uint8x16_t v12 = vec_splat(d, 12); + const uint8x16_t v13 = vec_splat(d, 13); + const uint8x16_t v14 = vec_splat(d, 14); + const uint8x16_t v15 = vec_splat(d, 15); + + (void)above; + + vec_vsx_st(v0, 0, dst); + dst += stride; + vec_vsx_st(v1, 0, dst); + dst += stride; + vec_vsx_st(v2, 0, dst); + dst += stride; + vec_vsx_st(v3, 0, dst); + dst += stride; + vec_vsx_st(v4, 0, dst); + dst += stride; + vec_vsx_st(v5, 0, dst); + dst += stride; + vec_vsx_st(v6, 0, dst); + dst += stride; + vec_vsx_st(v7, 0, dst); + dst += stride; + vec_vsx_st(v8, 0, dst); + dst += stride; + vec_vsx_st(v9, 0, dst); + dst += stride; + vec_vsx_st(v10, 0, dst); + dst += stride; + vec_vsx_st(v11, 0, dst); + dst += stride; + vec_vsx_st(v12, 0, dst); + dst += stride; + vec_vsx_st(v13, 0, dst); + dst += stride; + vec_vsx_st(v14, 0, dst); + dst += stride; + vec_vsx_st(v15, 0, dst); +} + +#define H_PREDICTOR_32(v) \ + vec_vsx_st(v, 0, dst); \ + vec_vsx_st(v, 16, dst); \ + dst += stride + +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, left); + const uint8x16_t d1 = vec_vsx_ld(16, left); + + const uint8x16_t v0_0 = vec_splat(d0, 0); + const uint8x16_t v1_0 = vec_splat(d0, 1); + const uint8x16_t v2_0 = vec_splat(d0, 2); + const uint8x16_t v3_0 = vec_splat(d0, 3); + const uint8x16_t v4_0 = vec_splat(d0, 4); + const uint8x16_t v5_0 = vec_splat(d0, 5); + const uint8x16_t v6_0 = vec_splat(d0, 6); + const uint8x16_t v7_0 = vec_splat(d0, 7); + const uint8x16_t v8_0 = vec_splat(d0, 8); + const uint8x16_t v9_0 = vec_splat(d0, 9); + const uint8x16_t v10_0 = vec_splat(d0, 10); + const uint8x16_t v11_0 = vec_splat(d0, 11); + const uint8x16_t v12_0 = vec_splat(d0, 12); + const uint8x16_t v13_0 = vec_splat(d0, 13); + const uint8x16_t v14_0 = vec_splat(d0, 14); + const uint8x16_t v15_0 = vec_splat(d0, 15); + + const uint8x16_t v0_1 = vec_splat(d1, 0); + const uint8x16_t v1_1 = vec_splat(d1, 1); + const uint8x16_t v2_1 = vec_splat(d1, 2); + const uint8x16_t v3_1 = vec_splat(d1, 3); + const uint8x16_t v4_1 = vec_splat(d1, 4); + const uint8x16_t v5_1 = vec_splat(d1, 5); + const uint8x16_t v6_1 = vec_splat(d1, 6); + const uint8x16_t v7_1 = vec_splat(d1, 7); + const uint8x16_t v8_1 = vec_splat(d1, 8); + const uint8x16_t v9_1 = vec_splat(d1, 9); + const uint8x16_t v10_1 = vec_splat(d1, 10); + const uint8x16_t v11_1 = vec_splat(d1, 11); + const uint8x16_t v12_1 = vec_splat(d1, 12); + const uint8x16_t v13_1 = vec_splat(d1, 13); + const uint8x16_t v14_1 = vec_splat(d1, 14); + const uint8x16_t v15_1 = vec_splat(d1, 15); + + (void)above; + + H_PREDICTOR_32(v0_0); + H_PREDICTOR_32(v1_0); + H_PREDICTOR_32(v2_0); + H_PREDICTOR_32(v3_0); + + H_PREDICTOR_32(v4_0); + H_PREDICTOR_32(v5_0); + H_PREDICTOR_32(v6_0); + H_PREDICTOR_32(v7_0); + + H_PREDICTOR_32(v8_0); + H_PREDICTOR_32(v9_0); + H_PREDICTOR_32(v10_0); + H_PREDICTOR_32(v11_0); + + H_PREDICTOR_32(v12_0); + H_PREDICTOR_32(v13_0); + H_PREDICTOR_32(v14_0); + H_PREDICTOR_32(v15_0); + + H_PREDICTOR_32(v0_1); + H_PREDICTOR_32(v1_1); + H_PREDICTOR_32(v2_1); + H_PREDICTOR_32(v3_1); + + H_PREDICTOR_32(v4_1); + H_PREDICTOR_32(v5_1); + H_PREDICTOR_32(v6_1); + H_PREDICTOR_32(v7_1); + + H_PREDICTOR_32(v8_1); + H_PREDICTOR_32(v9_1); + H_PREDICTOR_32(v10_1); + H_PREDICTOR_32(v11_1); + + H_PREDICTOR_32(v12_1); + H_PREDICTOR_32(v13_1); + H_PREDICTOR_32(v14_1); + H_PREDICTOR_32(v15_1); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + uint8x16_t d; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); +} + +void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 4), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 5), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 6), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 7), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); +} +#endif + +static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, + int16x8_t ah, int16x8_t al, int16x8_t tl) { + int16x8_t vh, vl, ls; + + ls = vec_splat(l, 0); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 1); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 2); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 3); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 4); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 5); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 6); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 7); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); +} + +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l = vec_vsx_ld(0, left); + const int16x8_t lh = unpack_to_s16_h(l); + const int16x8_t ll = unpack_to_s16_l(l); + const uint8x16_t a = vec_vsx_ld(0, above); + const int16x8_t ah = unpack_to_s16_h(a); + const int16x8_t al = unpack_to_s16_l(a); + + tm_predictor_16x8(dst, stride, lh, ah, al, tl); + + dst += stride * 8; + + tm_predictor_16x8(dst, stride, ll, ah, al, tl); +} + +static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls, + const int16x8_t a0h, const int16x8_t a0l, + const int16x8_t a1h, const int16x8_t a1l, + const int16x8_t tl) { + int16x8_t vh, vl; + + vh = vec_sub(vec_add(ls, a0h), tl); + vl = vec_sub(vec_add(ls, a0l), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + vh = vec_sub(vec_add(ls, a1h), tl); + vl = vec_sub(vec_add(ls, a1l), tl); + vec_vsx_st(vec_packsu(vh, vl), 16, dst); +} + +static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride, + const int16x8_t l, const uint8x16_t a0, + const uint8x16_t a1, const int16x8_t tl) { + const int16x8_t a0h = unpack_to_s16_h(a0); + const int16x8_t a0l = unpack_to_s16_l(a0); + const int16x8_t a1h = unpack_to_s16_h(a1); + const int16x8_t a1l = unpack_to_s16_l(a1); + + tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl); +} + +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl); +} + +static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 8; i++, dst += stride) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(val, d, 1), 0, dst); + } +} + +static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + } +} + +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_16x16(dst, stride, v128); +} + +static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + vec_vsx_st(val, 16, dst); + } +} + +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_32x32(dst, stride, v128); +} + +static uint8x16_t avg16(const uint8_t *values) { + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_16x16(dst, stride, avg16(left)); +} + +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_16x16(dst, stride, avg16(above)); +} + +static uint8x16_t avg32(const uint8_t *values) { + const uint8x16_t v0 = vec_vsx_ld(0, values); + const uint8x16_t v1 = vec_vsx_ld(16, values); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_32x32(dst, stride, avg32(left)); +} + +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_32x32(dst, stride, avg32(above)); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} +#endif + +static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); +} +#endif + +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left)); +} + +static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5)); + const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0))); + const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left)); +} + +static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, + const uint8x16_t c) { + const uint8x16_t ac = + vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1))); + + return vec_avg(ac, b); +} + +// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken. +static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 7); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(row, d, 1), 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} +#endif + +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(a, 15); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row, 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 15); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0 = avg3(a0, b0, c0); + uint8x16_t row1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 16, dst); + dst += stride; + row0 = vec_perm(row0, row1, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 9); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a, b); + uint8x16_t row1 = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 4; i++) { + const uint8x16_t d0 = vec_vsx_ld(0, dst); + const uint8x16_t d1 = vec_vsx_ld(0, dst + stride); + vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst); + vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} +#endif + +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 0); + const uint8x16_t b = vec_perm(a0, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a0, b); + uint8x16_t row1 = avg3(a0, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t a2 = vec_vsx_ld(32, above); + const uint8x16_t above_right = vec_splat(a2, 0); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0_0 = vec_avg(a0, b0); + uint8x16_t row0_1 = vec_avg(a1, b1); + uint8x16_t row1_0 = avg3(a0, b0, c0); + uint8x16_t row1_1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row0_0, 0, dst); + vec_vsx_st(row0_1, 16, dst); + vec_vsx_st(row1_0, 0, dst + stride); + vec_vsx_st(row1_1, 16, dst + stride); + dst += stride * 2; + row0_0 = vec_perm(row0_0, row0_1, sl1); + row0_1 = vec_perm(row0_1, above_right, sl1); + row1_0 = vec_perm(row1_0, row1_1, sl1); + row1_1 = vec_perm(row1_1, above_right, sl1); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 0000000000..e99412ecab --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1828 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364, + -16364, -16364, -16364, -16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305, + -16305, -16305, -16305, -16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893, + -15893, -15893, -15893, -15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811, + -14811, -14811, -14811, -14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449, + -14449, -14449, -14449, -14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623, + -13623, -13623, -13623, -13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160, + -13160, -13160, -13160, -13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585, + -11585, -11585, -11585, -11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003, + -11003, -11003, -11003, -11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394, + -10394, -10394, -10394, -10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423, + -8423, -8423, -8423, -8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520, + -5520, -5520, -5520, -5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756, + -4756, -4756, -4756, -4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196, + -3196, -3196, -3196, -3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404, + -2404, -2404, -2404, -2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283, + 5283, 5283, 5283, 5283 }; +static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929, + 9929, 9929, 9929, 9929 }; +static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377, + 13377, 13377, 13377, 13377 }; +static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212, + 15212, 15212, 15212, 15212 }; + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; + +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride) { + int i, j; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t tmp16_0, tmp16_1; + uint8x16_t output_v; + uint8_t tmp_dest[16]; + PIXEL_ADD_INIT; + + PIXEL_ADD4(out[0], in[0]); + PIXEL_ADD4(out[1], in[1]); + + PACK_STORE(out[0], out[1]); +} + +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + int16x8_t t0 = vec_mergeh(in[0], in[1]); + int16x8_t t1 = vec_mergel(in[0], in[1]); + ROUND_SHIFT_INIT + + in[0] = vec_mergeh(t0, t1); + in[1] = vec_mergel(t0, t1); + + IDCT4(in[0], in[1], out[0], out[1]); +} + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + // Rows + vpx_idct4_vsx(in, out); + + // Columns + vpx_idct4_vsx(out, in); + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ + out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3; + int32x4_t temp10, temp11; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); +} + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) { + uint8x16_t zerov = vec_splat_u8(0); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + + PIXEL_ADD(in[0], d_u0, add, shift5); + PIXEL_ADD(in[1], d_u1, add, shift5); + PIXEL_ADD(in[2], d_u2, add, shift5); + PIXEL_ADD(in[3], d_u3, add, shift5); + PIXEL_ADD(in[4], d_u4, add, shift5); + PIXEL_ADD(in[5], d_u5, add, shift5); + PIXEL_ADD(in[6], d_u6, add, shift5); + PIXEL_ADD(in[7], d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src[8], tmp[8]; + + src[0] = load_tran_low(0, input); + src[1] = load_tran_low(8 * sizeof(*input), input); + src[2] = load_tran_low(16 * sizeof(*input), input); + src[3] = load_tran_low(24 * sizeof(*input), input); + src[4] = load_tran_low(32 * sizeof(*input), input); + src[5] = load_tran_low(40 * sizeof(*input), input); + src[6] = load_tran_low(48 * sizeof(*input), input); + src[7] = load_tran_low(56 * sizeof(*input), input); + + vpx_idct8_vsx(src, tmp); + vpx_idct8_vsx(tmp, src); + + vpx_round_store8x8_vsx(src, dest, stride); +} + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ + STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 = vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +static void half_idct16x8_vsx(int16x8_t *src) { + int16x8_t tmp0[8], tmp1[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12], + src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13], + src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14], + src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]); +} + +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + + IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); +} + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride) { + uint8x16_t destv[16]; + int16x8_t d_uh, d_ul; + uint8x16_t zerov = vec_splat_u8(0); + uint16x8_t shift6 = vec_splat_u16(6); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv); + + PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0); + PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride); + PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride); + PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride); + PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride); + PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride); + PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride); + PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride); + + PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride); + PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride); + PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride); + PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride); + PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride); + PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride); + PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride); + PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride); +} +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[16], src1[16]; + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src1); + + // transform rows + // transform the upper half of 16x16 matrix + half_idct16x8_vsx(src0); + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + + // transform the lower half of 16x16 matrix + half_idct16x8_vsx(src1); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + // transform columns + // left half first + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + // right half + IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); + + vpx_round_store16x16_vsx(src0, src1, dest, stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step)*stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \ + dst = vec_vsx_ld((step)*stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} + +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v; + int32x4_t v_v[5], u_v[4]; + int32x4_t zerov = vec_splat_s32(0); + int16x8_t tmp0, tmp1; + int16x8_t zero16v = vec_splat_s16(0); + uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1)); + ROUND_SHIFT_INIT; + + sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v); + sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v); + sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v); + sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v); + sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v), + vec_sub(zero16v, sinpi_3_9_v)); + + tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]); + tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]); + in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1); + in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1); + + v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov); + v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov); + v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov); + v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov); + v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov); + + in[0] = vec_sub(in[0], in[1]); + in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16); + in[0] = vec_add(in[0], in[1]); + in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16); + + u_v[0] = vec_add(v_v[0], v_v[1]); + u_v[1] = vec_sub(v_v[2], v_v[3]); + u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov); + u_v[3] = vec_sub(v_v[1], v_v[3]); + u_v[3] = vec_add(u_v[3], v_v[4]); + + DCT_CONST_ROUND_SHIFT(u_v[0]); + DCT_CONST_ROUND_SHIFT(u_v[1]); + DCT_CONST_ROUND_SHIFT(u_v[2]); + DCT_CONST_ROUND_SHIFT(u_v[3]); + + out[0] = vec_packs(u_v[0], u_v[1]); + out[1] = vec_packs(u_v[2], u_v[3]); +} + +#define MSUM_ROUND_SHIFT(a, b, cospi) \ + b = vec_msums(a, cospi, zerov); \ + DCT_CONST_ROUND_SHIFT(b); + +#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \ + MSUM_ROUND_SHIFT(in0, tmp0, cospi); \ + MSUM_ROUND_SHIFT(in1, tmp1, cospi); \ + out = vec_packs(tmp0, tmp1); + +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[16], tmp1[16]; + + int32x4_t zerov = vec_splat_s32(0); + int16x8_t zero16v = vec_splat_s16(0); + int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v); + int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v); + int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v); + int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v); + int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v); + int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v); + int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v); + int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v); + int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v); + int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + // stage 1 + // interleave and multiply/add into 32-bit integer + in[0] = vec_mergeh(out[7], out[0]); + in[1] = vec_mergel(out[7], out[0]); + in[2] = vec_mergeh(out[5], out[2]); + in[3] = vec_mergel(out[5], out[2]); + in[4] = vec_mergeh(out[3], out[4]); + in[5] = vec_mergel(out[3], out[4]); + in[6] = vec_mergeh(out[1], out[6]); + in[7] = vec_mergel(out[1], out[6]); + + tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov); + tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov); + tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov); + tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov); + tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov); + tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov); + tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov); + tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov); + tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov); + tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov); + tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov); + tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov); + tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov); + tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov); + tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov); + tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[8]); + tmp0[1] = vec_add(tmp1[1], tmp1[9]); + tmp0[2] = vec_add(tmp1[2], tmp1[10]); + tmp0[3] = vec_add(tmp1[3], tmp1[11]); + tmp0[4] = vec_add(tmp1[4], tmp1[12]); + tmp0[5] = vec_add(tmp1[5], tmp1[13]); + tmp0[6] = vec_add(tmp1[6], tmp1[14]); + tmp0[7] = vec_add(tmp1[7], tmp1[15]); + tmp0[8] = vec_sub(tmp1[0], tmp1[8]); + tmp0[9] = vec_sub(tmp1[1], tmp1[9]); + tmp0[10] = vec_sub(tmp1[2], tmp1[10]); + tmp0[11] = vec_sub(tmp1[3], tmp1[11]); + tmp0[12] = vec_sub(tmp1[4], tmp1[12]); + tmp0[13] = vec_sub(tmp1[5], tmp1[13]); + tmp0[14] = vec_sub(tmp1[6], tmp1[14]); + tmp0[15] = vec_sub(tmp1[7], tmp1[15]); + + // shift and rounding + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + DCT_CONST_ROUND_SHIFT(tmp0[8]); + DCT_CONST_ROUND_SHIFT(tmp0[9]); + DCT_CONST_ROUND_SHIFT(tmp0[10]); + DCT_CONST_ROUND_SHIFT(tmp0[11]); + DCT_CONST_ROUND_SHIFT(tmp0[12]); + DCT_CONST_ROUND_SHIFT(tmp0[13]); + DCT_CONST_ROUND_SHIFT(tmp0[14]); + DCT_CONST_ROUND_SHIFT(tmp0[15]); + + // back to 16-bit + out[0] = vec_packs(tmp0[0], tmp0[1]); + out[1] = vec_packs(tmp0[2], tmp0[3]); + out[2] = vec_packs(tmp0[4], tmp0[5]); + out[3] = vec_packs(tmp0[6], tmp0[7]); + out[4] = vec_packs(tmp0[8], tmp0[9]); + out[5] = vec_packs(tmp0[10], tmp0[11]); + out[6] = vec_packs(tmp0[12], tmp0[13]); + out[7] = vec_packs(tmp0[14], tmp0[15]); + + // stage 2 + in[0] = vec_add(out[0], out[2]); + in[1] = vec_add(out[1], out[3]); + in[2] = vec_sub(out[0], out[2]); + in[3] = vec_sub(out[1], out[3]); + in[4] = vec_mergeh(out[4], out[5]); + in[5] = vec_mergel(out[4], out[5]); + in[6] = vec_mergeh(out[6], out[7]); + in[7] = vec_mergel(out[6], out[7]); + + tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov); + tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov); + tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov); + tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov); + tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov); + tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov); + tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov); + tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[4]); + tmp0[1] = vec_add(tmp1[1], tmp1[5]); + tmp0[2] = vec_add(tmp1[2], tmp1[6]); + tmp0[3] = vec_add(tmp1[3], tmp1[7]); + tmp0[4] = vec_sub(tmp1[0], tmp1[4]); + tmp0[5] = vec_sub(tmp1[1], tmp1[5]); + tmp0[6] = vec_sub(tmp1[2], tmp1[6]); + tmp0[7] = vec_sub(tmp1[3], tmp1[7]); + + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + + in[4] = vec_packs(tmp0[0], tmp0[1]); + in[5] = vec_packs(tmp0[2], tmp0[3]); + in[6] = vec_packs(tmp0[4], tmp0[5]); + in[7] = vec_packs(tmp0[6], tmp0[7]); + + // stage 3 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v); + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v); + + out[0] = in[0]; + out[2] = in[6]; + out[4] = in[3]; + out[6] = in[5]; + + out[1] = vec_sub(zero16v, in[4]); + out[3] = vec_sub(zero16v, in[2]); + out[5] = vec_sub(zero16v, in[7]); + out[7] = vec_sub(zero16v, in[1]); +} + +static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[32], tmp1[32]; + int16x8_t tmp16_0[8]; + int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v); + int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v); + int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v); + int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v); + int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v); + int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v); + int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v); + int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v); + int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v); + int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v); + int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v); + int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v); + int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v); + int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v); + int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v); + int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v); + int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v); + int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v); + int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v); + int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v); + int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v); + int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v); + int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v); + int32x4_t zerov = vec_splat_s32(0); + ROUND_SHIFT_INIT; + + tmp16_0[0] = vec_mergeh(in[15], in[0]); + tmp16_0[1] = vec_mergel(in[15], in[0]); + tmp16_0[2] = vec_mergeh(in[13], in[2]); + tmp16_0[3] = vec_mergel(in[13], in[2]); + tmp16_0[4] = vec_mergeh(in[11], in[4]); + tmp16_0[5] = vec_mergel(in[11], in[4]); + tmp16_0[6] = vec_mergeh(in[9], in[6]); + tmp16_0[7] = vec_mergel(in[9], in[6]); + tmp16_0[8] = vec_mergeh(in[7], in[8]); + tmp16_0[9] = vec_mergel(in[7], in[8]); + tmp16_0[10] = vec_mergeh(in[5], in[10]); + tmp16_0[11] = vec_mergel(in[5], in[10]); + tmp16_0[12] = vec_mergeh(in[3], in[12]); + tmp16_0[13] = vec_mergel(in[3], in[12]); + tmp16_0[14] = vec_mergeh(in[1], in[14]); + tmp16_0[15] = vec_mergel(in[1], in[14]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov); + tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov); + tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov); + tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov); + tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov); + tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov); + tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov); + tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov); + tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov); + tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov); + tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov); + tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov); + tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov); + tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov); + tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov); + tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov); + tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[16]); + tmp1[1] = vec_add(tmp0[1], tmp0[17]); + tmp1[2] = vec_add(tmp0[2], tmp0[18]); + tmp1[3] = vec_add(tmp0[3], tmp0[19]); + tmp1[4] = vec_add(tmp0[4], tmp0[20]); + tmp1[5] = vec_add(tmp0[5], tmp0[21]); + tmp1[6] = vec_add(tmp0[6], tmp0[22]); + tmp1[7] = vec_add(tmp0[7], tmp0[23]); + tmp1[8] = vec_add(tmp0[8], tmp0[24]); + tmp1[9] = vec_add(tmp0[9], tmp0[25]); + tmp1[10] = vec_add(tmp0[10], tmp0[26]); + tmp1[11] = vec_add(tmp0[11], tmp0[27]); + tmp1[12] = vec_add(tmp0[12], tmp0[28]); + tmp1[13] = vec_add(tmp0[13], tmp0[29]); + tmp1[14] = vec_add(tmp0[14], tmp0[30]); + tmp1[15] = vec_add(tmp0[15], tmp0[31]); + tmp1[16] = vec_sub(tmp0[0], tmp0[16]); + tmp1[17] = vec_sub(tmp0[1], tmp0[17]); + tmp1[18] = vec_sub(tmp0[2], tmp0[18]); + tmp1[19] = vec_sub(tmp0[3], tmp0[19]); + tmp1[20] = vec_sub(tmp0[4], tmp0[20]); + tmp1[21] = vec_sub(tmp0[5], tmp0[21]); + tmp1[22] = vec_sub(tmp0[6], tmp0[22]); + tmp1[23] = vec_sub(tmp0[7], tmp0[23]); + tmp1[24] = vec_sub(tmp0[8], tmp0[24]); + tmp1[25] = vec_sub(tmp0[9], tmp0[25]); + tmp1[26] = vec_sub(tmp0[10], tmp0[26]); + tmp1[27] = vec_sub(tmp0[11], tmp0[27]); + tmp1[28] = vec_sub(tmp0[12], tmp0[28]); + tmp1[29] = vec_sub(tmp0[13], tmp0[29]); + tmp1[30] = vec_sub(tmp0[14], tmp0[30]); + tmp1[31] = vec_sub(tmp0[15], tmp0[31]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + DCT_CONST_ROUND_SHIFT(tmp1[16]); + DCT_CONST_ROUND_SHIFT(tmp1[17]); + DCT_CONST_ROUND_SHIFT(tmp1[18]); + DCT_CONST_ROUND_SHIFT(tmp1[19]); + DCT_CONST_ROUND_SHIFT(tmp1[20]); + DCT_CONST_ROUND_SHIFT(tmp1[21]); + DCT_CONST_ROUND_SHIFT(tmp1[22]); + DCT_CONST_ROUND_SHIFT(tmp1[23]); + DCT_CONST_ROUND_SHIFT(tmp1[24]); + DCT_CONST_ROUND_SHIFT(tmp1[25]); + DCT_CONST_ROUND_SHIFT(tmp1[26]); + DCT_CONST_ROUND_SHIFT(tmp1[27]); + DCT_CONST_ROUND_SHIFT(tmp1[28]); + DCT_CONST_ROUND_SHIFT(tmp1[29]); + DCT_CONST_ROUND_SHIFT(tmp1[30]); + DCT_CONST_ROUND_SHIFT(tmp1[31]); + + in[0] = vec_packs(tmp1[0], tmp1[1]); + in[1] = vec_packs(tmp1[2], tmp1[3]); + in[2] = vec_packs(tmp1[4], tmp1[5]); + in[3] = vec_packs(tmp1[6], tmp1[7]); + in[4] = vec_packs(tmp1[8], tmp1[9]); + in[5] = vec_packs(tmp1[10], tmp1[11]); + in[6] = vec_packs(tmp1[12], tmp1[13]); + in[7] = vec_packs(tmp1[14], tmp1[15]); + in[8] = vec_packs(tmp1[16], tmp1[17]); + in[9] = vec_packs(tmp1[18], tmp1[19]); + in[10] = vec_packs(tmp1[20], tmp1[21]); + in[11] = vec_packs(tmp1[22], tmp1[23]); + in[12] = vec_packs(tmp1[24], tmp1[25]); + in[13] = vec_packs(tmp1[26], tmp1[27]); + in[14] = vec_packs(tmp1[28], tmp1[29]); + in[15] = vec_packs(tmp1[30], tmp1[31]); + + // stage 2 + tmp16_0[0] = vec_mergeh(in[8], in[9]); + tmp16_0[1] = vec_mergel(in[8], in[9]); + tmp16_0[2] = vec_mergeh(in[10], in[11]); + tmp16_0[3] = vec_mergel(in[10], in[11]); + tmp16_0[4] = vec_mergeh(in[12], in[13]); + tmp16_0[5] = vec_mergel(in[12], in[13]); + tmp16_0[6] = vec_mergeh(in[14], in[15]); + tmp16_0[7] = vec_mergel(in[14], in[15]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[8]); + tmp1[1] = vec_add(tmp0[1], tmp0[9]); + tmp1[2] = vec_add(tmp0[2], tmp0[10]); + tmp1[3] = vec_add(tmp0[3], tmp0[11]); + tmp1[4] = vec_add(tmp0[4], tmp0[12]); + tmp1[5] = vec_add(tmp0[5], tmp0[13]); + tmp1[6] = vec_add(tmp0[6], tmp0[14]); + tmp1[7] = vec_add(tmp0[7], tmp0[15]); + tmp1[8] = vec_sub(tmp0[0], tmp0[8]); + tmp1[9] = vec_sub(tmp0[1], tmp0[9]); + tmp1[10] = vec_sub(tmp0[2], tmp0[10]); + tmp1[11] = vec_sub(tmp0[3], tmp0[11]); + tmp1[12] = vec_sub(tmp0[4], tmp0[12]); + tmp1[13] = vec_sub(tmp0[5], tmp0[13]); + tmp1[14] = vec_sub(tmp0[6], tmp0[14]); + tmp1[15] = vec_sub(tmp0[7], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + tmp16_0[0] = vec_add(in[0], in[4]); + tmp16_0[1] = vec_add(in[1], in[5]); + tmp16_0[2] = vec_add(in[2], in[6]); + tmp16_0[3] = vec_add(in[3], in[7]); + tmp16_0[4] = vec_sub(in[0], in[4]); + tmp16_0[5] = vec_sub(in[1], in[5]); + tmp16_0[6] = vec_sub(in[2], in[6]); + tmp16_0[7] = vec_sub(in[3], in[7]); + tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]); + tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]); + tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]); + tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]); + tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]); + tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]); + tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]); + tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 3 + in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]); + in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]); + in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]); + in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]); + in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]); + in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]); + in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]); + in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]); + + tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov); + tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov); + tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov); + tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov); + tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov); + tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov); + tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov); + tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov); + tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov); + tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov); + tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov); + tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov); + tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov); + tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov); + tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov); + tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[4]); + tmp1[1] = vec_add(tmp0[1], tmp0[5]); + tmp1[2] = vec_add(tmp0[2], tmp0[6]); + tmp1[3] = vec_add(tmp0[3], tmp0[7]); + tmp1[4] = vec_sub(tmp0[0], tmp0[4]); + tmp1[5] = vec_sub(tmp0[1], tmp0[5]); + tmp1[6] = vec_sub(tmp0[2], tmp0[6]); + tmp1[7] = vec_sub(tmp0[3], tmp0[7]); + tmp1[8] = vec_add(tmp0[8], tmp0[12]); + tmp1[9] = vec_add(tmp0[9], tmp0[13]); + tmp1[10] = vec_add(tmp0[10], tmp0[14]); + tmp1[11] = vec_add(tmp0[11], tmp0[15]); + tmp1[12] = vec_sub(tmp0[8], tmp0[12]); + tmp1[13] = vec_sub(tmp0[9], tmp0[13]); + tmp1[14] = vec_sub(tmp0[10], tmp0[14]); + tmp1[15] = vec_sub(tmp0[11], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + in[0] = vec_add(tmp16_0[0], tmp16_0[2]); + in[1] = vec_add(tmp16_0[1], tmp16_0[3]); + in[2] = vec_sub(tmp16_0[0], tmp16_0[2]); + in[3] = vec_sub(tmp16_0[1], tmp16_0[3]); + in[4] = vec_packs(tmp1[0], tmp1[1]); + in[5] = vec_packs(tmp1[2], tmp1[3]); + in[6] = vec_packs(tmp1[4], tmp1[5]); + in[7] = vec_packs(tmp1[6], tmp1[7]); + in[8] = vec_add(tmp16_0[8], tmp16_0[10]); + in[9] = vec_add(tmp16_0[9], tmp16_0[11]); + in[10] = vec_sub(tmp16_0[8], tmp16_0[10]); + in[11] = vec_sub(tmp16_0[9], tmp16_0[11]); + in[12] = vec_packs(tmp1[8], tmp1[9]); + in[13] = vec_packs(tmp1[10], tmp1[11]); + in[14] = vec_packs(tmp1[12], tmp1[13]); + in[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 4 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + out[4] = vec_mergeh(in[10], in[11]); + out[5] = vec_mergel(in[10], in[11]); + out[6] = vec_mergeh(in[14], in[15]); + out[7] = vec_mergel(in[14], in[15]); +} + +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[16], tmp1[16], tmp2[8]; + int32x4_t tmp3, tmp4; + int16x8_t zero16v = vec_splat_s16(0); + int32x4_t zerov = vec_splat_s32(0); + int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v); + int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12], + tmp0[13], tmp0[14], tmp0[15]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12], + tmp1[13], tmp1[14], tmp1[15]); + + iadst16x8_vsx(tmp0, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16); + + src0[0] = tmp0[0]; + src0[2] = vec_sub(zero16v, tmp0[8]); + src0[4] = tmp0[12]; + src0[6] = vec_sub(zero16v, tmp0[4]); + src1[8] = tmp0[5]; + src1[10] = vec_sub(zero16v, tmp0[13]); + src1[12] = tmp0[9]; + src1[14] = vec_sub(zero16v, tmp0[1]); + + iadst16x8_vsx(tmp1, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16); + + src0[1] = tmp1[0]; + src0[3] = vec_sub(zero16v, tmp1[8]); + src0[5] = tmp1[12]; + src0[7] = vec_sub(zero16v, tmp1[4]); + src1[9] = tmp1[5]; + src1[11] = vec_sub(zero16v, tmp1[13]); + src1[13] = tmp1[9]; + src1[15] = vec_sub(zero16v, tmp1[1]); +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h new file mode 100644 index 0000000000..7031742c1c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ +#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out); + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride); +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out); + +#define LOAD_INPUT16(load, source, offset, step, in) \ + in[0] = load(offset, source); \ + in[1] = load((step) + (offset), source); \ + in[2] = load(2 * (step) + (offset), source); \ + in[3] = load(3 * (step) + (offset), source); \ + in[4] = load(4 * (step) + (offset), source); \ + in[5] = load(5 * (step) + (offset), source); \ + in[6] = load(6 * (step) + (offset), source); \ + in[7] = load(7 * (step) + (offset), source); \ + in[8] = load(8 * (step) + (offset), source); \ + in[9] = load(9 * (step) + (offset), source); \ + in[10] = load(10 * (step) + (offset), source); \ + in[11] = load(11 * (step) + (offset), source); \ + in[12] = load(12 * (step) + (offset), source); \ + in[13] = load(13 * (step) + (offset), source); \ + in[14] = load(14 * (step) + (offset), source); \ + in[15] = load(15 * (step) + (offset), source); + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride); +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1); +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1); + +#endif // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 0000000000..ab71f6e235 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Quantization function used for 4x4, 8x8 and 16x16 blocks. +static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + qcoeff = vec_mulhi(qcoeff, quant_shift); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// Quantization function used for 32x32 blocks. +static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, + bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + // 32x32 blocks require an extra multiplication by 2, this compensates for the + // extra right shift added in vec_mulhi, as such vec_madds can be used + // directly instead of vec_mulhi (((a * b) >> 15) >> 1) << 1 == (a * b >> 15) + qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, + const int16_t *iscan_ptr, int index) { + int16x8_t scan = vec_vsx_ld(index, iscan_ptr); + bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + return vec_andc(scan, zero_coeff); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + (void)scan_ptr; + + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)n_coeffs; + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); + + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c new file mode 100644 index 0000000000..a08ae12413 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/types_vsx.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_abs = vec_absd(v_a, v_b); \ + v_sad = vec_sum4s(v_abs, v_sad); + +#define SAD8(height) \ + unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0) \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[1] + v_sad[0]; \ + } + +#define SAD16(height) \ + unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +#define SAD32(height) \ + unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + PROCESS16(16); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +#define SAD64(height) \ + unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + PROCESS16(16); \ + PROCESS16(32); \ + PROCESS16(48); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +SAD8(4); +SAD8(8); +SAD8(16); +SAD16(8); +SAD16(16); +SAD16(32); +SAD32(16); +SAD32(32); +SAD32(64); +SAD64(32); +SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); +SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 0000000000..76ad302da6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static VPX_FORCE_INLINE void subtract_block4x4( + int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { + int16_t *diff1 = diff + 2 * diff_stride; + const uint8_t *src1 = src + 2 * src_stride; + const uint8_t *pred1 = pred + 2 * pred_stride; + + const int16x8_t d0 = vec_vsx_ld(0, diff); + const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride); + const int16x8_t d2 = vec_vsx_ld(0, diff1); + const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride); + + const uint8x16_t s0 = read4x2(src, (int)src_stride); + const uint8x16_t p0 = read4x2(pred, (int)pred_stride); + const uint8x16_t s1 = read4x2(src1, (int)src_stride); + const uint8x16_t p1 = read4x2(pred1, (int)pred_stride); + + const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + + vec_vsx_st(xxpermdi(da, d0, 1), 0, diff); + vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride); + vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1); + vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride); +} + +void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + switch (cols) { + case 64: + case 32: + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vec_vsx_ld(0, src + c); + const uint8x16_t s1 = vec_vsx_ld(16, src + c); + const uint8x16_t p0 = vec_vsx_ld(0, pred + c); + const uint8x16_t p1 = vec_vsx_ld(16, pred + c); + const int16x8_t d0l = + vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = + vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t d1l = + vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1)); + const int16x8_t d1h = + vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + vec_vsx_st(d0h, 0, diff + c); + vec_vsx_st(d0l, 16, diff + c); + vec_vsx_st(d1h, 0, diff + c + 16); + vec_vsx_st(d1l, 16, diff + c + 16); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 16: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + vec_vsx_st(d0l, 16, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 8: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 4: + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + if (r > 4) { + diff += 4 * diff_stride; + pred += 4 * pred_stride; + src += 4 * src_stride; + + subtract_block4x4(diff, diff_stride, + + src, src_stride, + + pred, pred_stride); + } + break; + default: assert(0); // unreachable + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h new file mode 100644 index 0000000000..4883b734ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { + // d = vec_mergeh(a,b): + // The even elements of the result are obtained left-to-right, + // from the high elements of a. + // The odd elements of the result are obtained left-to-right, + // from the high elements of b. + // + // d = vec_mergel(a,b): + // The even elements of the result are obtained left-to-right, + // from the low elements of a. + // The odd elements of the result are obtained left-to-right, + // from the low elements of b. + + // Example, starting with: + // v[0]: 00 01 02 03 04 05 06 07 + // v[1]: 10 11 12 13 14 15 16 17 + // v[2]: 20 21 22 23 24 25 26 27 + // v[3]: 30 31 32 33 34 35 36 37 + // v[4]: 40 41 42 43 44 45 46 47 + // v[5]: 50 51 52 53 54 55 56 57 + // v[6]: 60 61 62 63 64 65 66 67 + // v[7]: 70 71 72 73 74 75 76 77 + + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + b0 = vec_mergeh(v[0], v[4]); + b1 = vec_mergel(v[0], v[4]); + b2 = vec_mergeh(v[1], v[5]); + b3 = vec_mergel(v[1], v[5]); + b4 = vec_mergeh(v[2], v[6]); + b5 = vec_mergel(v[2], v[6]); + b6 = vec_mergeh(v[3], v[7]); + b7 = vec_mergel(v[3], v[7]); + + // After first merge operation + // b0: 00 40 01 41 02 42 03 43 + // b1: 04 44 05 45 06 46 07 47 + // b2: 10 50 11 51 12 52 13 53 + // b3: 14 54 15 55 16 56 17 57 + // b4: 20 60 21 61 22 62 23 63 + // b5: 24 64 25 65 26 66 27 67 + // b6: 30 70 31 71 32 62 33 73 + // b7: 34 74 35 75 36 76 37 77 + + c0 = vec_mergeh(b0, b4); + c1 = vec_mergel(b0, b4); + c2 = vec_mergeh(b1, b5); + c3 = vec_mergel(b1, b5); + c4 = vec_mergeh(b2, b6); + c5 = vec_mergel(b2, b6); + c6 = vec_mergeh(b3, b7); + c7 = vec_mergel(b3, b7); + + // After second merge operation + // c0: 00 20 40 60 01 21 41 61 + // c1: 02 22 42 62 03 23 43 63 + // c2: 04 24 44 64 05 25 45 65 + // c3: 06 26 46 66 07 27 47 67 + // c4: 10 30 50 70 11 31 51 71 + // c5: 12 32 52 72 13 33 53 73 + // c6: 14 34 54 74 15 35 55 75 + // c7: 16 36 56 76 17 37 57 77 + + v[0] = vec_mergeh(c0, c4); + v[1] = vec_mergel(c0, c4); + v[2] = vec_mergeh(c1, c5); + v[3] = vec_mergel(c1, c5); + v[4] = vec_mergeh(c2, c6); + v[5] = vec_mergel(c2, c6); + v[6] = vec_mergeh(c3, c7); + v[7] = vec_mergel(c3, c7); + + // After last merge operation + // v[0]: 00 10 20 30 40 50 60 70 + // v[1]: 01 11 21 31 41 51 61 71 + // v[2]: 02 12 22 32 42 52 62 72 + // v[3]: 03 13 23 33 43 53 63 73 + // v[4]: 04 14 24 34 44 54 64 74 + // v[5]: 05 15 25 35 45 55 65 75 + // v[6]: 06 16 26 36 46 56 66 76 + // v[7]: 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Stage 1 + const int16x8_t s1_0 = vec_mergeh(a[0], a[4]); + const int16x8_t s1_1 = vec_mergel(a[0], a[4]); + const int16x8_t s1_2 = vec_mergeh(a[1], a[5]); + const int16x8_t s1_3 = vec_mergel(a[1], a[5]); + const int16x8_t s1_4 = vec_mergeh(a[2], a[6]); + const int16x8_t s1_5 = vec_mergel(a[2], a[6]); + const int16x8_t s1_6 = vec_mergeh(a[3], a[7]); + const int16x8_t s1_7 = vec_mergel(a[3], a[7]); + + // Stage 2 + const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4); + const int16x8_t s2_1 = vec_mergel(s1_0, s1_4); + const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5); + const int16x8_t s2_3 = vec_mergel(s1_1, s1_5); + const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6); + const int16x8_t s2_5 = vec_mergel(s1_2, s1_6); + const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7); + const int16x8_t s2_7 = vec_mergel(s1_3, s1_7); + + // Stage 2 + b[0] = vec_mergeh(s2_0, s2_4); + b[1] = vec_mergel(s2_0, s2_4); + b[2] = vec_mergeh(s2_1, s2_5); + b[3] = vec_mergel(s2_1, s2_5); + b[4] = vec_mergeh(s2_2, s2_6); + b[5] = vec_mergel(s2_2, s2_6); + b[6] = vec_mergeh(s2_3, s2_7); + b[7] = vec_mergel(s2_3, s2_7); +} + +#endif // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h new file mode 100644 index 0000000000..2907a1fe40 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ +#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 }; + +static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 }; + +static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#endif // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h new file mode 100644 index 0000000000..b891169245 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_ +#define VPX_VPX_DSP_PPC_TYPES_VSX_H_ + +#include + +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; +typedef vector signed short int16x8_t; +typedef vector unsigned short uint16x8_t; +typedef vector signed int int32x4_t; +typedef vector unsigned int uint32x4_t; +typedef vector bool char bool8x16_t; +typedef vector bool short bool16x8_t; +typedef vector bool int bool32x4_t; + +#if defined(__clang__) && __clang_major__ < 6 +static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) +#elif defined(__GNUC__) && \ + (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3)) +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif + +#ifdef WORDS_BIGENDIAN +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif +#else +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c)&1) << 1) ^ 3) +#endif +#endif + +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +#ifndef __POWER9_VECTOR__ +#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b)) +#endif + +static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; +static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; +static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 }; +static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; +static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07 }; +static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03 }; +static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, + 0x0E, 0x0F, 0x00, 0x01 }; + +static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11, + 0x04, 0x05, 0x14, 0x15, + 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + +#endif // VPX_VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c new file mode 100644 index 0000000000..6c6bc9a301 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + int distortion; + + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t a1 = + unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t b1 = + unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride)); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); + const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3); + + vec_ste(d, 0, &distortion); + + return distortion; +} + +// TODO(lu_zero): Unroll +uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) { + unsigned int i, sum = 0; + int32x4_t s = vec_splat_s32(0); + + for (i = 0; i < 256; i += 8) { + const int16x8_t v = vec_vsx_ld(0, src_ptr + i); + s = vec_msum(v, v, s); + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)s, 0, &sum); + + return sum; +} + +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width >= 16) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref)); + vec_vsx_st(v, j, comp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + // Process 2 lines at time + for (i = 0; i < height / 2; ++i) { + const uint8x16_t r0 = vec_vsx_ld(0, ref); + const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride); + const uint8x16_t r = xxpermdi(r0, r1, 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 2; + pred += 16; // width * 2; + ref += ref_stride * 2; + } + } else { + assert(width == 4); + // process 4 lines at time + for (i = 0; i < height / 4; ++i) { + const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref); + const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride); + const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2); + const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3); + const uint8x16_t r = + (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 4; + pred += 16; // width * 4; + ref += ref_stride * 4; + } + } +} + +static INLINE void variance_inner_32(const uint8_t *src_ptr, + const uint8_t *ref_ptr, + int32x4_t *sum_squared, int32x4_t *sum) { + int32x4_t s = *sum; + int32x4_t ss = *sum_squared; + + const uint8x16_t va0 = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr); + const uint8x16_t va1 = vec_vsx_ld(16, src_ptr); + const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr); + + const int16x8_t a0 = unpack_to_s16_h(va0); + const int16x8_t b0 = unpack_to_s16_h(vb0); + const int16x8_t a1 = unpack_to_s16_l(va0); + const int16x8_t b1 = unpack_to_s16_l(vb0); + const int16x8_t a2 = unpack_to_s16_h(va1); + const int16x8_t b2 = unpack_to_s16_h(vb1); + const int16x8_t a3 = unpack_to_s16_l(va1); + const int16x8_t b3 = unpack_to_s16_l(vb1); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int16x8_t d2 = vec_sub(a2, b2); + const int16x8_t d3 = vec_sub(a3, b3); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + s = vec_sum4s(d2, s); + ss = vec_msum(d2, d2, ss); + s = vec_sum4s(d3, s); + ss = vec_msum(d3, d3, ss); + *sum = s; + *sum_squared = ss; +} + +static INLINE void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i; + + int32x4_t s = vec_splat_s32(0); + int32x4_t ss = vec_splat_s32(0); + + switch (w) { + case 4: + for (i = 0; i < h / 2; ++i) { + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t d = vec_sub(a0, b0); + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride * 2; + ref_ptr += ref_stride * 2; + } + break; + case 8: + for (i = 0; i < h; ++i) { + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr)); + const int16x8_t d = vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb = vec_vsx_ld(0, ref_ptr); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H))); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c new file mode 100644 index 0000000000..2dc66055cc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/vpx_filter.h" + +// TODO(lu_zero): unroll +static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 16: { + copy_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int i; + for (i = h; i--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} + +static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + vec_vsx_st(v, 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst)); + const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + vec_vsx_st(v2, 32, dst); + vec_vsx_st(v3, 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + switch (w) { + case 16: { + avg_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { + const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); + const int32x4_t bias = + vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); + const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS)); + const uint8x16_t v = vec_splat( + vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3); + vec_ste(v, 0, dst); +} + +static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst, + const uint8_t *const src_x, + const int16_t *const x_filter) { + const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); + const int16x8_t f = vec_vsx_ld(0, x_filter); + + convolve_line(dst, s, f); +} + +// TODO(lu_zero): Implement 8x8 and bigger block special cases +static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void convolve_avg_horiz( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + uint8_t v; + convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d, + uint8x16_t e, uint8x16_t f, + uint8x16_t g, uint8x16_t h) { + uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b); + uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d); + uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f); + uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h); + + uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd); + uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh); + + return (uint8x16_t)vec_mergeh(abcd, efgh); +} + +static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst, + const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { + uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); + uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); + uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); + uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride); + uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride); + uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride); + uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride); + uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride); + const int16x8_t f = vec_vsx_ld(0, y_filter); + uint8_t buf[16]; + const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7); + + vec_vsx_st(s, 0, buf); + + convolve_line(dst, unpack_to_s16_h(s), f); +} + +static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, + int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + convolve_line_v(dst + y * dst_stride, + &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static VPX_FORCE_INLINE void convolve_avg_vert( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + uint8_t v; + convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, + int x0_q4, int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/prob.c b/media/libvpx/libvpx/vpx_dsp/prob.c new file mode 100644 index 0000000000..819e95062e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/prob.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./prob.h" + +const uint8_t vpx_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vpx_tree_index *tree, + const vpx_prob *pre_probs, + const unsigned int *counts, + vpx_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = + (l <= 0) ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const int r = tree[i + 1]; + const unsigned int right_count = + (r <= 0) ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); + return left_count + right_count; +} + +void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, + const unsigned int *counts, vpx_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, probs); +} diff --git a/media/libvpx/libvpx/vpx_dsp/prob.h b/media/libvpx/libvpx/vpx_dsp/prob.h new file mode 100644 index 0000000000..7a71c0041f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/prob.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PROB_H_ +#define VPX_VPX_DSP_PROB_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_common.h" + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t vpx_prob; + +#define MAX_PROB 255 + +#define vpx_prob_half ((vpx_prob)128) + +typedef int8_t vpx_tree_index; + +#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) + +#define vpx_complement(x) (255 - (x)) + +#define MODE_MV_COUNT_SAT 20 + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vpx_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vpx_tree_index vpx_tree[]; + +static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (vpx_prob)clipped_prob; + } +} + +static INLINE vpx_prob get_binary_prob(unsigned int n0, unsigned int n1) { + const unsigned int den = n0 + n1; + if (den == 0) return 128u; + return get_prob(n0, den); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. */ +static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const vpx_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; +static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 +}; + +static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob, + const unsigned int ct[2]) { + const unsigned int den = ct[0] + ct[1]; + if (den == 0) { + return pre_prob; + } else { + const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT); + const unsigned int factor = count_to_update_factor[count]; + const vpx_prob prob = get_prob(ct[0], den); + return weighted_prob(pre_prob, prob, factor); + } +} + +void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, + const unsigned int *counts, vpx_prob *probs); + +DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_PROB_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.c b/media/libvpx/libvpx/vpx_dsp/psnr.c new file mode 100644 index 0000000000..4ee4130a21 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/psnr.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/psnr.h" +#include "vpx_scale/yv12config.h" + +double vpx_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +/* TODO(yaowu): The block_variance calls the unoptimized versions of variance() + * and highbd_8_variance(). It should not. + */ +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { + int i, j; + int64_t sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { + int i, j; + int64_t sse = 0; + + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + int x, y; + + if (dw > 0) { + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); + } + + if (dh > 0) { + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + if (dw > 0) { + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); + } + if (dh > 0) { + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); + } + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return total_sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + const double peak = (double)((1 << in_bit_depth) - 1); + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h); + } + } else { + sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} + +#endif // !CONFIG_VP9_HIGHBITDEPTH + +void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.h b/media/libvpx/libvpx/vpx_dsp/psnr.h new file mode 100644 index 0000000000..7c57aa429f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/psnr.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PSNR_H_ +#define VPX_VPX_DSP_PSNR_H_ + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_encoder.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vpx_psnr_pkt PSNR_STATS; + +// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t + +/*!\brief Converts SSE to PSNR + * + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ +double vpx_sse_to_psnr(double samples, double peak, double sse); +int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // VPX_VPX_DSP_PSNR_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c new file mode 100644 index 0000000000..d7ec1a429a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. + */ +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/system_state.h" +#include "vpx_dsp/psnr.h" + +#if !defined(M_PI) +#define M_PI (3.141592653589793238462643) +#endif +#include + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + vpx_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} +#if CONFIG_VP9_HIGHBITDEPTH +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + vpx_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} +#endif + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency. This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int bit_depth) { + int16_t pix_max = 255; + assert(_score * _weight >= 0.0); + if (bit_depth == 10) + pix_max = 1023; + else if (bit_depth == 12) + pix_max = 4095; + + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t bit_depth, uint32_t _shift) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); + double mask[8][8]; + int pixels; + int x; + int y; + (void)_par; + ret = pixels = 0; + + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.3885746225901003 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. + + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue #2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper. Since you are not using that, + divide by actual maximum coefficient. */ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + double s_means[4]; + double d_means[4]; + double s_vars[4]; + double d_vars[4]; + double s_gmean = 0; + double d_gmean = 0; + double s_gvar = 0; + double d_gvar = 0; + double s_mask = 0; + double d_mask = 0; + for (i = 0; i < 4; i++) + s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + if (bit_depth == 8 && _shift == 0) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else if (bit_depth == 10 || bit_depth == 12) { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + s_gmean += dct_s[i * 8 + j]; + d_gmean += dct_d[i * 8 + j]; + s_means[sub] += dct_s[i * 8 + j]; + d_means[sub] += dct_d[i * 8 + j]; + } + } + s_gmean /= 64.f; + d_gmean /= 64.f; + for (i = 0; i < 4; i++) s_means[i] /= 16.f; + for (i = 0; i < 4; i++) d_means[i] /= 16.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); + d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); + s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * + (dct_s[i * 8 + j] - s_means[sub]); + d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) * + (dct_d[i * 8 + j] - d_means[sub]); + } + } + s_gvar *= 1 / 63.f * 64; + d_gvar *= 1 / 63.f * 64; + for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; + if (s_gvar > 0) + s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; + if (d_gvar > 0) + d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth == 10 || bit_depth == 12) { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } +#endif + if (bit_depth == 8) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 32.f; + d_mask = sqrt(d_mask * d_gvar) / 32.f; + if (d_mask > s_mask) s_mask = d_mask; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + return ret; +} + +double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs, + double *u_psnrhvs, double *v_psnrhvs, uint32_t bd, + uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + vpx_clear_system_state(); + + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + + bd_shift = bd - in_bd; + + *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer, + dest->y_stride, par, src->y_crop_width, + src->y_crop_height, step, csf_y, bd, bd_shift); + *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer, + dest->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cb420, bd, bd_shift); + *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer, + dest->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cr420, bd, bd_shift); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, in_bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.c b/media/libvpx/libvpx/vpx_dsp/quantize.c new file mode 100644 index 0000000000..fac9136f8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/quantize.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/quantize.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp) eob = 0; + + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[0]; + const int abs_qcoeff = (int)((tmp * quant) >> 16); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; + if (abs_qcoeff) eob = 0; + } + + *eob_ptr = eob + 1; +} +#endif + +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, + INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; + if (tmp) eob = 0; + + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { + const int n_coeffs = 1024; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); + const int abs_qcoeff = (int)((tmp * quant) >> 15); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; + if (abs_qcoeff) eob = 0; + } + + *eob_ptr = eob + 1; +} +#endif + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + + if (tmp) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; + } + } + *eob_ptr = eob + 1; +} +#endif + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; + + int idx = 0; + int idx_arr[32 * 32 /* n_coeffs */]; + int i, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; +#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH + // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than + // truncating with a cast, saturate the value. This is easier to implement + // on x86 and preserves the sign of the value. + dqcoeff_ptr[rc] = + clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX); +#else + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; +#endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH + + if (tmp) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.h b/media/libvpx/libvpx/vpx_dsp/quantize.h new file mode 100644 index 0000000000..8e138445e2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/quantize.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_QUANTIZE_H_ +#define VPX_VPX_DSP_QUANTIZE_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/sad.c b/media/libvpx/libvpx/vpx_dsp/sad.c new file mode 100644 index 0000000000..2a4c81d588 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/sad.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +/* Sum the difference between every corresponding element of the buffers. */ +static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + return sad; +} + +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ + } + +// Compare |src_ptr| to 4 distinct references in |ref_array[4]| +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ + } + +/* clang-format off */ +// 64x64 +sadMxN(64, 64) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNx4D(4, 4) +/* clang-format on */ + +#if CONFIG_VP9_HIGHBITDEPTH + static INLINE + unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); + + src += src_stride; + ref_ptr += ref_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); + + src += src_stride; + ref_ptr += ref_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ + vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ + n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ + return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxNx4D(64, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxNx4D(64, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxNx4D(32, 64) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxNx4D(32, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxNx4D(32, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxNx4D(16, 32) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxNx4D(16, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxNx4D(16, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxNx4D(8, 16) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxNx4D(8, 4) + +// 4x8 +highbd_sadMxN(4, 8) +highbd_sadMxNx4D(4, 8) + +// 4x4 +highbd_sadMxN(4, 4) +highbd_sadMxNx4D(4, 4) +/* clang-format on */ + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.c b/media/libvpx/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 0000000000..bbbb6c3a17 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.h b/media/libvpx/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 0000000000..91640c33d5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_ +#define VPX_VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/sse.c b/media/libvpx/libvpx/vpx_dsp/sse.c new file mode 100644 index 0000000000..c9d751859d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/sse.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.c b/media/libvpx/libvpx/vpx_dsp/ssim.c new file mode 100644 index 0000000000..7c3c31bad8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ssim.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/system_state.h" + +void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + double ssim_n, ssim_d; + int64_t c1, c2; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + c1 = c2 = 0; + assert(0); + } + + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); + + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); + + return ssim_n / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double vpx_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { + double a, b, c; + double ssimv; + + a = vpx_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height); + + b = vpx_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + c = vpx_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + vpx_clear_system_state(); + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. / (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +#if CONFIG_VP9_HIGHBITDEPTH +double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, shift); + + b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.h b/media/libvpx/libvpx/vpx_dsp/ssim.h new file mode 100644 index 0000000000..c382237fc6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ssim.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_SSIM_H_ +#define VPX_VPX_DSP_SSIM_H_ + +#define MAX_SSIM_DB 100.0; + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" + +// metrics used for calculating ssim, ssim2, dssim, and ssimc +typedef struct { + // source sum ( over 8x8 region ) + uint32_t sum_s; + + // reference sum (over 8x8 region ) + uint32_t sum_r; + + // source sum squared ( over 8x8 region ) + uint32_t sum_sq_s; + + // reference sum squared (over 8x8 region ) + uint32_t sum_sq_r; + + // sum of source times reference (over 8x8 region) + uint32_t sum_sxr; + + // calculated ssim score between source and reference + double ssim; +} Ssimv; + +// metrics collected on a frame basis +typedef struct { + // ssim consistency error metric ( see code for explanation ) + double ssimc; + + // standard ssim + double ssim; + + // revised ssim ( see code for explanation) + double ssim2; + + // ssim restated as an error metric like sse + double dssim; + + // dssim converted to decibels + double dssimd; + + // ssimc converted to decibels + double ssimcd; +} Metrics; + +double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); + +double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight); + +double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd); + +#if CONFIG_VP9_HIGHBITDEPTH +double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_SSIM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/subtract.c b/media/libvpx/libvpx/vpx_dsp/subtract.c new file mode 100644 index 0000000000..45c819e67a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/subtract.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + int r, c; + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; + + diff_ptr += diff_stride; + pred_ptr += pred_stride; + src_ptr += src_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src8_ptr, + ptrdiff_t src_stride, const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) { + diff_ptr[c] = src[c] - pred[c]; + } + + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/sum_squares.c b/media/libvpx/libvpx/vpx_dsp/sum_squares.c new file mode 100644 index 0000000000..b80cd588e4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/sum_squares.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { + int r, c; + uint64_t ss = 0; + + for (r = 0; r < size; r++) { + for (c = 0; c < size; c++) { + const int16_t v = src[c]; + ss += v * v; + } + src += stride; + } + + return ss; +} diff --git a/media/libvpx/libvpx/vpx_dsp/txfm_common.h b/media/libvpx/libvpx/vpx_dsp/txfm_common.h new file mode 100644 index 0000000000..25f4fdb327 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/txfm_common.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_TXFM_COMMON_H_ +#define VPX_VPX_DSP_TXFM_COMMON_H_ + +#include "vpx_dsp/vpx_dsp_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; + +#endif // VPX_VPX_DSP_TXFM_COMMON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/variance.c b/media/libvpx/libvpx/vpx_dsp/variance.c new file mode 100644 index 0000000000..1c476542fa --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/variance.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + int distortion = 0; + int r, c; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + int diff = src_ptr[c] - ref_ptr[c]; + distortion += diff * diff; + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + return distortion; +} + +uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += src_ptr[i] * src_ptr[i]; + } + + return sum; +} + +static void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \ + } + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \ + } + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint64_t *sse, int64_t *sum) { + int i, j; + + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } + +static void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/variance.h b/media/libvpx/libvpx/vpx_dsp/variance.h new file mode 100644 index 0000000000..ccdb2f90ba --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/variance.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_VARIANCE_H_ +#define VPX_VPX_DSP_VARIANCE_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); + +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred); + +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride, + uint8_t *ref_ptr, int ref_stride, int n); + +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sad_array); + +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const b_array[], + int ref_stride, unsigned int *sad_array); + +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, unsigned int *sse); + +typedef unsigned int (*vpx_subpixvariance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +typedef unsigned int (*vpx_subp_avg_variance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred); + +#if CONFIG_VP8 +typedef struct variance_vtable { + vpx_sad_fn_t sdf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_sad_multi_d_fn_t sdx4df; +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + vp8_copy32xn_fn_t copymem; +#endif +} vp8_variance_fn_ptr_t; +#endif // CONFIG_VP8 + +#if CONFIG_VP9 +typedef struct vp9_variance_vtable { + vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; + vpx_sad_avg_fn_t sdaf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_subp_avg_variance_fn_t svaf; + vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; +} vp9_variance_fn_ptr_t; +#endif // CONFIG_VP9 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_VARIANCE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c new file mode 100644 index 0000000000..e55a963f9d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + int r; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; + (void)y_step_q4; + + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + (void)x0_q4; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)x0_q4; + (void)x_step_q4; + + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, + bd); +} + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + int r; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h new file mode 100644 index 0000000000..d5793e17ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_VPX_DSP_VPX_CONVOLVE_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_VPX_CONVOLVE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk new file mode 100644 index 0000000000..2bee91f449 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk @@ -0,0 +1,485 @@ +## +## Copyright (c) 2015 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +DSP_SRCS-yes += vpx_dsp.mk +DSP_SRCS-yes += vpx_dsp_common.h + +DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h + +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h +# This file is included in libs.mk. Including it here would cause it to be +# compiled into an object. Even as an empty file, this would create an +# executable section on the stack. +#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM) + +# bit reader +DSP_SRCS-yes += prob.h +DSP_SRCS-yes += prob.c + +ifeq ($(CONFIG_ENCODERS),yes) +DSP_SRCS-yes += bitwriter.h +DSP_SRCS-yes += bitwriter.c +DSP_SRCS-yes += bitwriter_buffer.c +DSP_SRCS-yes += bitwriter_buffer.h +DSP_SRCS-yes += psnr.c +DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c +endif + +ifeq ($(CONFIG_DECODERS),yes) +DSP_SRCS-yes += bitreader.h +DSP_SRCS-yes += bitreader.c +DSP_SRCS-yes += bitreader_buffer.c +DSP_SRCS-yes += bitreader_buffer.h +endif + +# intra predictions +DSP_SRCS-yes += intrapred.c + +DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm +DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c +endif # CONFIG_VP9_HIGHBITDEPTH + +ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += add_noise.c +DSP_SRCS-yes += deblock.c +DSP_SRCS-yes += postproc.h +DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c +endif # CONFIG_POSTPROC + +DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c + +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c + +DSP_SRCS-yes += vpx_filter.h +ifeq ($(CONFIG_VP9),yes) +# interpolation filters +DSP_SRCS-yes += vpx_convolve.c +DSP_SRCS-yes += vpx_convolve.h + +DSP_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += x86/convolve.h + +DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_4t_intrin_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm +DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c +endif + +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h +DSP_SRCS-yes += arm/vpx_convolve_neon.c +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c +DSP_SRCS-yes += arm/vpx_convolve8_neon.c +DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c +DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +# common (msa) +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h +DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c + +# common (dspr2) +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c + +DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c + +# common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h + +# loop filters +DSP_SRCS-yes += loopfilter.c + +DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c +endif # HAVE_NEON_ASM + +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_4_lsx.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c +endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_VP9 + +DSP_SRCS-yes += txfm_common.h +DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h +DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h +DSP_SRCS-$(HAVE_LSX) += loongarch/txfm_macros_lsx.h +# forward transform +ifeq ($(CONFIG_VP9_ENCODER),yes) +DSP_SRCS-yes += fwd_txfm.c +DSP_SRCS-yes += fwd_txfm.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h +ifeq ($(VPX_ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm +endif +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c +endif # CONFIG_VP9_ENCODER + +# inverse transform +ifeq ($(CONFIG_VP9),yes) +DSP_SRCS-yes += inv_txfm.h +DSP_SRCS-yes += inv_txfm.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c + +DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) + +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c + +DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/idct32x32_lsx.c +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c + +endif # CONFIG_VP9 + +# quantization +ifeq ($(CONFIG_VP9_ENCODER),yes) +DSP_SRCS-yes += quantize.c +DSP_SRCS-yes += quantize.h + +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_intrin_lsx.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c +endif + +# avg +DSP_SRCS-yes += avg.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c +endif +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c +ifeq ($(VPX_ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm +endif +DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c + +endif # CONFIG_VP9_ENCODER + +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + +ifeq ($(CONFIG_ENCODERS),yes) +DSP_SRCS-yes += sad.c +DSP_SRCS-yes += subtract.c +DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c + +DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/sad_lsx.c + +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + +DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c + +DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm + +DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c +DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c +endif # CONFIG_VP9_HIGHBITDEPTH + +endif # CONFIG_ENCODERS + +ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += variance.c +DSP_SRCS-yes += variance.h + +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c + +DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c + +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c + +ifeq ($(VPX_ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm +endif # VPX_ARCH_X86_64 + +DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c +endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + +# Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h + +# PPC VSX utilities +DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/txfm_common_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h + +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h + +# LSX utilities +DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h + +DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) + +DSP_SRCS-yes += vpx_dsp_rtcd.c +DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl + +$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl)) diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h new file mode 100644 index 0000000000..4b946d7560 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_ +#define VPX_VPX_DSP_VPX_DSP_COMMON_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y)) + +#define VPX_SWAP(type, a, b) \ + do { \ + type c = (b); \ + (b) = a; \ + (a) = c; \ + } while (0) + +#if CONFIG_VP9_HIGHBITDEPTH +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; +#else +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int32_t tran_high_t; +typedef int16_t tran_low_t; +#endif // CONFIG_VP9_HIGHBITDEPTH + +typedef int16_t tran_coef_t; + +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#endif + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { + switch (bd) { + case 8: + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c new file mode 100644 index 0000000000..030c456d39 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vpx_dsp_rtcd() { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl new file mode 100644 index 0000000000..18087e25d9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -0,0 +1,1828 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +sub vpx_dsp_forward_decls() { +print < +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 + +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 + +typedef int16_t InterpKernel[SUBPEL_TAPS]; + +static INLINE int vpx_get_filter_taps(const int16_t *const filter) { + assert(filter[3] != 128); + if (filter[0] | filter[7]) { + return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_VPX_FILTER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm new file mode 100644 index 0000000000..f51718cf99 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, +; int blackclamp, int whiteclamp, +; int width, int height, int pitch) +globalsym(vpx_plane_add_noise_sse2) +sym(vpx_plane_add_noise_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + + mov rdx, 0x01010101 + mov rax, arg(2) + mul rdx + movq xmm3, rax + pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp + + mov rdx, 0x01010101 + mov rax, arg(3) + mul rdx + movq xmm4, rax + pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp + + movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp + paddusb xmm5, xmm4 + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + mov rdi, rcx + movsxd rcx, dword arg(4) ;[Width] + mov rsi, arg(0) ;Pos + xor rax, rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, xmm3 ; subtract black clamp + paddusb xmm1, xmm5 ; add both clamp + psubusb xmm1, xmm4 ; subtract whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(6) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(5), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +rd42: + times 8 dw 0x04 +four8s: + times 4 dd 8 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 0000000000..61e4e73c5b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = _mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); + + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 0000000000..4447dfab7c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + +void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = _mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const __m128i zero = _mm_setzero_si128(); + s0 = _mm_loadu_si128((const __m128i *)(s)); + s1 = _mm_loadu_si128((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpackhi_epi16(s0, zero); + s0 = _mm_unpacklo_epi16(s0, zero); + s0 = _mm_add_epi32(s0, s1); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); + avg = (unsigned int)_mm_cvtsi128_si32(s0); + + return (avg + 32) >> 6; +} + +unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + s0 = _mm_loadl_epi64((const __m128i *)(s)); + s1 = _mm_loadl_epi64((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2)); + avg = _mm_extract_epi16(s0, 0); + + return (avg + 8) >> 4; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); + + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int vpx_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = load_tran_low(coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int idx; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + + for (idx = 1; idx < height_1; idx += 2) { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + } + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + + if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + _mm_storeu_si128((__m128i *)hbuf, s1); +} + +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) { + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_sad_epu8(src_line, zero); + __m128i s1; + int i; + + for (i = 16; i < width; i += 16) { + ref += 16; + src_line = _mm_loadu_si128((const __m128i *)ref); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_adds_epu16(s0, s1); + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_adds_epu16(s0, s1); + + return _mm_extract_epi16(s0, 0); +} + +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) { + int idx; + int width = 4 << bwl; + int16_t mean; + __m128i v0 = _mm_loadu_si128((const __m128i *)ref); + __m128i v1 = _mm_load_si128((const __m128i *)src); + __m128i diff = _mm_subs_epi16(v0, v1); + __m128i sum = diff; + __m128i sse = _mm_madd_epi16(diff, diff); + + ref += 8; + src += 8; + + for (idx = 8; idx < width; idx += 8) { + v0 = _mm_loadu_si128((const __m128i *)ref); + v1 = _mm_load_si128((const __m128i *)src); + diff = _mm_subs_epi16(v0, v1); + + sum = _mm_add_epi16(sum, diff); + v0 = _mm_madd_epi16(diff, diff); + sse = _mm_add_epi32(sse, v0); + + ref += 8; + src += 8; + } + + v0 = _mm_srli_si128(sum, 8); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi64(sum, 32); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi32(sum, 16); + sum = _mm_add_epi16(sum, v0); + + v1 = _mm_srli_si128(sse, 8); + sse = _mm_add_epi32(sse, v1); + v1 = _mm_srli_epi64(sse, 32); + sse = _mm_add_epi32(sse, v1); + + mean = (int16_t)_mm_extract_epi16(sum, 0); + + return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 0000000000..f4357998c9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. + assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 0000000000..c6e70f744e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp_pred + x), avg); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp_pred and pred have width == stride + // and therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are + // all divisible by 16 so just ref needs to be massaged when loading. + for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride), + loadu_int32(ref + 2 * ref_stride), + loadu_int32(ref + ref_stride), loadu_int32(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp_pred, avg); + + pred += 16; + comp_pred += 16; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm new file mode 100644 index 0000000000..9122b5a401 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm @@ -0,0 +1,130 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +%if VPX_ARCH_X86_64 +; matrix transpose +%macro TRANSPOSE8X8 10 + ; stage 1 + punpcklwd m%9, m%1, m%2 + punpcklwd m%10, m%3, m%4 + punpckhwd m%1, m%2 + punpckhwd m%3, m%4 + + punpcklwd m%2, m%5, m%6 + punpcklwd m%4, m%7, m%8 + punpckhwd m%5, m%6 + punpckhwd m%7, m%8 + + ; stage 2 + punpckldq m%6, m%9, m%10 + punpckldq m%8, m%1, m%3 + punpckhdq m%9, m%10 + punpckhdq m%1, m%3 + + punpckldq m%10, m%2, m%4 + punpckldq m%3, m%5, m%7 + punpckhdq m%2, m%4 + punpckhdq m%5, m%7 + + ; stage 3 + punpckhqdq m%4, m%9, m%2 ; out3 + punpcklqdq m%9, m%2 ; out2 + punpcklqdq m%7, m%1, m%5 ; out6 + punpckhqdq m%1, m%5 ; out7 + + punpckhqdq m%2, m%6, m%10 ; out1 + punpcklqdq m%6, m%10 ; out0 + punpcklqdq m%5, m%8, m%3 ; out4 + punpckhqdq m%8, m%3 ; out5 + + SWAP %6, %1 + SWAP %3, %9 + SWAP %8, %6 +%endmacro + +%macro HMD8_1D 0 + psubw m8, m0, m1 + psubw m9, m2, m3 + paddw m0, m1 + paddw m2, m3 + SWAP 1, 8 + SWAP 3, 9 + psubw m8, m4, m5 + psubw m9, m6, m7 + paddw m4, m5 + paddw m6, m7 + SWAP 5, 8 + SWAP 7, 9 + + psubw m8, m0, m2 + psubw m9, m1, m3 + paddw m0, m2 + paddw m1, m3 + SWAP 2, 8 + SWAP 3, 9 + psubw m8, m4, m6 + psubw m9, m5, m7 + paddw m4, m6 + paddw m5, m7 + SWAP 6, 8 + SWAP 7, 9 + + psubw m8, m0, m4 + psubw m9, m1, m5 + paddw m0, m4 + paddw m1, m5 + SWAP 4, 8 + SWAP 5, 9 + psubw m8, m2, m6 + psubw m9, m3, m7 + paddw m2, m6 + paddw m3, m7 + SWAP 6, 8 + SWAP 7, 9 +%endmacro + + +INIT_XMM ssse3 +cglobal hadamard_8x8, 3, 5, 11, input, stride, output + lea r3, [2 * strideq] + lea r4, [4 * strideq] + + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + HMD8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 + HMD8_1D + + STORE_TRAN_LOW 0, outputq, 0, 8, 9 + STORE_TRAN_LOW 1, outputq, 8, 8, 9 + STORE_TRAN_LOW 2, outputq, 16, 8, 9 + STORE_TRAN_LOW 3, outputq, 24, 8, 9 + STORE_TRAN_LOW 4, outputq, 32, 8, 9 + STORE_TRAN_LOW 5, outputq, 40, 8, 9 + STORE_TRAN_LOW 6, outputq, 48, 8, 9 + STORE_TRAN_LOW 7, outputq, 56, 8, 9 + + RET +%endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..c02b47a3eb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 16 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m256i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +#else + return _mm256_loadu_si256((const __m256i *)a); +#endif +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm new file mode 100644 index 0000000000..aacf71f7ac --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -0,0 +1,90 @@ +; +; Copyright (c) 2017 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm. +; vpx_config.asm is not guarded so can not be included twice. Because this will +; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be +; included after those files. + +; Increment register by sizeof() tran_low_t * 8. +%macro INCREMENT_TRAN_LOW 1 +%if CONFIG_VP9_HIGHBITDEPTH + add %1, 32 +%else + add %1, 16 +%endif +%endmacro + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 +%if CONFIG_VP9_HIGHBITDEPTH + lea %1, [%1 + %2 * 4] +%else + lea %1, [%1 + %2 * 2] +%endif +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. +%macro LOAD_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%else + mova m%1, [%2 + (%3) * 2] +%endif +%endmacro + +; Store m%1 to %2 + %3. +; %3 is the offset in elements, not bytes. +; If 5 arguments are provided then m%1 is corrupted. +; If 6 arguments are provided then m%1 is preserved. +; If tran_low_t is 16 bits (low bit depth configuration) then store the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign +; extend the values first. +; Uses m%4-m%6 as scratch registers for high bit depth. +%macro STORE_TRAN_LOW 5-6 +%if CONFIG_VP9_HIGHBITDEPTH + pxor m%4, m%4 + mova m%5, m%1 + %if %0 == 6 + mova m%6, m%1 + %endif + pcmpgtw m%4, m%1 + punpcklwd m%5, m%4 + %if %0 == 5 + punpckhwd m%1, m%4 + %else + punpckhwd m%6, m%4 + %endif + mova [%2 + (%3) * 4 + 0], m%5 + %if %0 == 5 + mova [%2 + (%3) * 4 + 16], m%1 + %else + mova [%2 + (%3) * 4 + 16], m%6 + %endif +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro + +; Store zeros (in m%1) to %2 + %3. +; %3 is the offset in elements, not bytes. +%macro STORE_ZERO_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova [%2 + (%3) * 4 + 0], m%1 + mova [%2 + (%3) * 4 + 16], m%1 +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h new file mode 100644 index 0000000000..74dde656b1 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} + +// Zero fill 8 positions in the output buffer. +static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h new file mode 100644 index 0000000000..c339600556 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/compiler_attributes.h" + +// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty +// hacky and awful to read. Note that there is a filter_x[3] == 128 check in +// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function +// assumes the filter is always 8 tap. +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); + +// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we +// have 4-tap vert avg filter. +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ + void vpx_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_row = filter[offset]; \ + (void)x0_q4; \ + (void)x_step_q4; \ + (void)y0_q4; \ + (void)y_step_q4; \ + assert(filter_row[3] != 128); \ + assert(step_q4 == 16); \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ + } else { \ + const int num_taps = 2; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ + } \ + } + +#define FUN_CONV_2D(avg, opt, is_avg) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ + dst, dst_stride, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ + } + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); + +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ + is_avg) \ + void vpx_highbd_convolve8_##name##_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_row = filter_kernel[offset]; \ + if (step_q4 == 16 && filter_row[3] != 128) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ + } else { \ + const int num_taps = 2; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ + filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ + bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ + } + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_X86_CONVOLVE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 0000000000..ebee964b18 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1); +} + +static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1); +} + +static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); +} + +static INLINE __m256i mm256_round_epi32(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth); + return _mm256_srai_epi32(nearest_src, depth); +} + +static INLINE __m256i mm256_round_epi16(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); + return _mm256_srai_epi16(nearest_src, depth); +} + +static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0, + const __m256i *const src_1, + const __m256i *const ker_0, + const __m256i *const ker_1) { + const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0); + const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1); + return _mm256_add_epi32(tmp_0, tmp_1); +} + +#undef MM256_BROADCASTSI128_SI256 + +#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h new file mode 100644 index 0000000000..8443546394 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 2 and 3 to return 3 2 3 2 3 2 3 2 as 16-bit words +static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpacklo_epi32(*reg, *reg); + return _mm_unpackhi_epi64(tmp, tmp); +} + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 2 and 3 to return 5 4 5 4 5 4 5 4 as 16-bit words. +static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpackhi_epi32(*reg, *reg); + return _mm_unpacklo_epi64(tmp, tmp); +} + +// Interprets src as 8-bit words, zero extends to form 16-bit words, then +// multiplies with ker and add the adjacent results to form 32-bit words. +// Finally adds the result from 1 and 2 together. +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); + const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); + const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +// Interprets src as 16-bit words, then multiplies with ker and add the +// adjacent results to form 32-bit words. Finally adds the result from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { + const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); + const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); + return _mm_packs_epi32(madd_1, madd_2); +} + +// Interleaves src_1 and src_2 +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { + const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); + const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); + return _mm_packs_epi32(tmp_1, tmp_2); +} + +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); + return _mm_srai_epi16(nearest_src, depth); +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h new file mode 100644 index 0000000000..8a4b165133 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ + +#include +#include // SSSE3 + +#include "./vpx_config.h" + +static INLINE void shuffle_filter_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. + const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm new file mode 100644 index 0000000000..b3af677d2e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -0,0 +1,432 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;macro in deblock functions +%macro FIRST_2_ROWS 0 + movdqa xmm4, xmm0 + movdqa xmm6, xmm0 + movdqa xmm5, xmm1 + pavgb xmm5, xmm3 + + ;calculate absolute value + psubusb xmm4, xmm1 + psubusb xmm1, xmm0 + psubusb xmm6, xmm3 + psubusb xmm3, xmm0 + paddusb xmm4, xmm1 + paddusb xmm6, xmm3 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm7, xmm2 + + ;get mask + psubusb xmm2, xmm4 + psubusb xmm7, xmm6 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm7, xmm1 + por xmm7, xmm2 +%endmacro + +%macro SECOND_2_ROWS 0 + movdqa xmm6, xmm0 + movdqa xmm4, xmm0 + movdqa xmm2, xmm1 + pavgb xmm1, xmm3 + + ;calculate absolute value + psubusb xmm6, xmm2 + psubusb xmm2, xmm0 + psubusb xmm4, xmm3 + psubusb xmm3, xmm0 + paddusb xmm6, xmm2 + paddusb xmm4, xmm3 + + pavgb xmm5, xmm1 + + ;get threshold + movdqa xmm2, flimit + pxor xmm1, xmm1 + movdqa xmm3, xmm2 + + ;get mask + psubusb xmm2, xmm6 + psubusb xmm3, xmm4 + pcmpeqb xmm2, xmm1 + pcmpeqb xmm3, xmm1 + + por xmm7, xmm2 + por xmm7, xmm3 + + pavgb xmm5, xmm0 + + ;decide if or not to use filtered value + pand xmm0, xmm7 + pandn xmm7, xmm5 + paddusb xmm0, xmm7 +%endmacro + +%macro UPDATE_FLIMIT 0 + movdqu xmm2, XMMWORD PTR [rbx] + movdqu [rsp], xmm2 + add rbx, 16 +%endmacro + +SECTION .text + +;void vpx_post_proc_down_and_across_mb_row_sse2 +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int cols, +; int *flimits, +; int size +;) +globalsym(vpx_post_proc_down_and_across_mb_row_sse2) +sym(vpx_post_proc_down_and_across_mb_row_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 + + ; put flimit on stack + mov rbx, arg(5) ;flimits ptr + UPDATE_FLIMIT + +%define flimit [rsp] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line + movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock +.nextrow: + xor rdx, rdx ;col +.nextcol: + ;load current and next 2 rows + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + 2*rax] + + FIRST_2_ROWS + + ;load above 2 rows + neg rax + movdqu xmm1, XMMWORD PTR [rsi + 2*rax] + movdqu xmm3, XMMWORD PTR [rsi + rax] + + SECOND_2_ROWS + + movdqu XMMWORD PTR [rdi], xmm0 + + neg rax ; positive stride + add rsi, 16 + add rdi, 16 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .downdone + UPDATE_FLIMIT + jmp .nextcol + +.downdone: + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + mov rbx, arg(5) ; flimits + UPDATE_FLIMIT + + ; dup the first byte into the left border 8 times + movq mm1, [rdi] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + mov rdx, -8 + movq [rdi+rdx], mm1 + + ; dup the last byte into the right border + movsxd rdx, dword arg(4) + movq mm1, [rdi + rdx + -1] + punpcklbw mm1, mm1 + punpcklwd mm1, mm1 + punpckldq mm1, mm1 + movq [rdi+rdx], mm1 + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-16]; + movq mm1, QWORD PTR [rdi-8]; + +.acrossnextcol: + movdqu xmm0, XMMWORD PTR [rdi + rdx] + movdqu xmm1, XMMWORD PTR [rdi + rdx -2] + movdqu xmm3, XMMWORD PTR [rdi + rdx -1] + + FIRST_2_ROWS + + movdqu xmm1, XMMWORD PTR [rdi + rdx +1] + movdqu xmm3, XMMWORD PTR [rdi + rdx +2] + + SECOND_2_ROWS + + movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes + movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes + movdq2q mm0, xmm0 + psrldq xmm0, 8 + movdq2q mm1, xmm0 + + add rdx, 16 + cmp edx, dword arg(4) ;cols + jge .acrossdone + UPDATE_FLIMIT + jmp .acrossnextcol + +.acrossdone: + ; last 16 pixels + movq QWORD PTR [rdi+rdx-16], mm0 + + cmp edx, dword arg(4) + jne .throw_last_8 + movq QWORD PTR [rdi+rdx-8], mm1 +.throw_last_8: + ; done with this rwo + add rsi,rax ;next src line + mov eax, dword arg(3) ;dst_pixels_per_line + add rdi,rax ;next destination + mov eax, dword arg(2) ;src_pixels_per_line + + mov rbx, arg(5) ;flimits + UPDATE_FLIMIT + + dec rcx ;decrement count + jnz .nextrow ;next row + + add rsp, 16 + pop rsp + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit + + +;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +globalsym(vpx_mbpost_proc_across_ip_sse2) +sym(vpx_mbpost_proc_across_ip_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define pair256_set_epi16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ + (int)(b), (int)(a)) + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { + __m256i buf0, buf1; + buf0 = _mm256_mul_epu32(a, b); + a = _mm256_srli_epi64(a, 32); + b = _mm256_srli_epi64(b, 32); + buf1 = _mm256_mul_epu32(a, b); + return _mm256_add_epi64(buf0, buf1); +} + +static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { + __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm256_unpacklo_epi64(buf0, buf1); +} +#endif + +void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = + pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i k__cospi_m12_m20 = + pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + const __m256i kZero = _mm256_setzero_si256(); + const __m256i kOne = _mm256_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process sixteen columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 16) { + __m256i step1[32]; + __m256i step2[32]; + __m256i step3[32]; + __m256i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[0]; + __m256i *step1b = &step1[31]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[4]; + __m256i *step1b = &step1[27]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[8]; + __m256i *step1b = &step1[23]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m256i *step1a = &step1[12]; + __m256i *step1b = &step1[19]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[0] = _mm256_add_epi16(in00, in31); + step1[1] = _mm256_add_epi16(in01, in30); + step1[2] = _mm256_add_epi16(in02, in29); + step1[3] = _mm256_add_epi16(in03, in28); + step1[28] = _mm256_sub_epi16(in03, in28); + step1[29] = _mm256_sub_epi16(in02, in29); + step1[30] = _mm256_sub_epi16(in01, in30); + step1[31] = _mm256_sub_epi16(in00, in31); + } + { + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[4] = _mm256_add_epi16(in04, in27); + step1[5] = _mm256_add_epi16(in05, in26); + step1[6] = _mm256_add_epi16(in06, in25); + step1[7] = _mm256_add_epi16(in07, in24); + step1[24] = _mm256_sub_epi16(in07, in24); + step1[25] = _mm256_sub_epi16(in06, in25); + step1[26] = _mm256_sub_epi16(in05, in26); + step1[27] = _mm256_sub_epi16(in04, in27); + } + { + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[8] = _mm256_add_epi16(in08, in23); + step1[9] = _mm256_add_epi16(in09, in22); + step1[10] = _mm256_add_epi16(in10, in21); + step1[11] = _mm256_add_epi16(in11, in20); + step1[20] = _mm256_sub_epi16(in11, in20); + step1[21] = _mm256_sub_epi16(in10, in21); + step1[22] = _mm256_sub_epi16(in09, in22); + step1[23] = _mm256_sub_epi16(in08, in23); + } + { + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + step1[12] = _mm256_add_epi16(in12, in19); + step1[13] = _mm256_add_epi16(in13, in18); + step1[14] = _mm256_add_epi16(in14, in17); + step1[15] = _mm256_add_epi16(in15, in16); + step1[16] = _mm256_sub_epi16(in15, in16); + step1[17] = _mm256_sub_epi16(in14, in17); + step1[18] = _mm256_sub_epi16(in13, in18); + step1[19] = _mm256_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[0] = _mm256_add_epi16(step1[0], step1[15]); + step2[1] = _mm256_add_epi16(step1[1], step1[14]); + step2[2] = _mm256_add_epi16(step1[2], step1[13]); + step2[3] = _mm256_add_epi16(step1[3], step1[12]); + step2[4] = _mm256_add_epi16(step1[4], step1[11]); + step2[5] = _mm256_add_epi16(step1[5], step1[10]); + step2[6] = _mm256_add_epi16(step1[6], step1[9]); + step2[7] = _mm256_add_epi16(step1[7], step1[8]); + step2[8] = _mm256_sub_epi16(step1[7], step1[8]); + step2[9] = _mm256_sub_epi16(step1[6], step1[9]); + step2[10] = _mm256_sub_epi16(step1[5], step1[10]); + step2[11] = _mm256_sub_epi16(step1[4], step1[11]); + step2[12] = _mm256_sub_epi16(step1[3], step1[12]); + step2[13] = _mm256_sub_epi16(step1[2], step1[13]); + step2[14] = _mm256_sub_epi16(step1[1], step1[14]); + step2[15] = _mm256_sub_epi16(step1[0], step1[15]); + } + { + const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); + const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); + const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); + const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); + const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); + const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); + const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); + const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); + const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s2_20_4 = + _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m256i s2_20_5 = + _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = + _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = + _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = + _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = + _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = + _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = + _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = + _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = + _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = + _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = + _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = + _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = + _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = + _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = + _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); + __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); + + step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); + step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); + step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); + step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); + step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); + step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); + step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); + step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); + step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); + step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); + step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); + + step2[0] = _mm256_add_epi16(step2[0], kOne); + step2[1] = _mm256_add_epi16(step2[1], kOne); + step2[2] = _mm256_add_epi16(step2[2], kOne); + step2[3] = _mm256_add_epi16(step2[3], kOne); + step2[4] = _mm256_add_epi16(step2[4], kOne); + step2[5] = _mm256_add_epi16(step2[5], kOne); + step2[6] = _mm256_add_epi16(step2[6], kOne); + step2[7] = _mm256_add_epi16(step2[7], kOne); + step2[8] = _mm256_add_epi16(step2[8], kOne); + step2[9] = _mm256_add_epi16(step2[9], kOne); + step2[10] = _mm256_add_epi16(step2[10], kOne); + step2[11] = _mm256_add_epi16(step2[11], kOne); + step2[12] = _mm256_add_epi16(step2[12], kOne); + step2[13] = _mm256_add_epi16(step2[13], kOne); + step2[14] = _mm256_add_epi16(step2[14], kOne); + step2[15] = _mm256_add_epi16(step2[15], kOne); + step1[16] = _mm256_add_epi16(step1[16], kOne); + step1[17] = _mm256_add_epi16(step1[17], kOne); + step1[18] = _mm256_add_epi16(step1[18], kOne); + step1[19] = _mm256_add_epi16(step1[19], kOne); + step2[20] = _mm256_add_epi16(step2[20], kOne); + step2[21] = _mm256_add_epi16(step2[21], kOne); + step2[22] = _mm256_add_epi16(step2[22], kOne); + step2[23] = _mm256_add_epi16(step2[23], kOne); + step2[24] = _mm256_add_epi16(step2[24], kOne); + step2[25] = _mm256_add_epi16(step2[25], kOne); + step2[26] = _mm256_add_epi16(step2[26], kOne); + step2[27] = _mm256_add_epi16(step2[27], kOne); + step1[28] = _mm256_add_epi16(step1[28], kOne); + step1[29] = _mm256_add_epi16(step1[29], kOne); + step1[30] = _mm256_add_epi16(step1[30], kOne); + step1[31] = _mm256_add_epi16(step1[31], kOne); + + step2[0] = _mm256_srai_epi16(step2[0], 2); + step2[1] = _mm256_srai_epi16(step2[1], 2); + step2[2] = _mm256_srai_epi16(step2[2], 2); + step2[3] = _mm256_srai_epi16(step2[3], 2); + step2[4] = _mm256_srai_epi16(step2[4], 2); + step2[5] = _mm256_srai_epi16(step2[5], 2); + step2[6] = _mm256_srai_epi16(step2[6], 2); + step2[7] = _mm256_srai_epi16(step2[7], 2); + step2[8] = _mm256_srai_epi16(step2[8], 2); + step2[9] = _mm256_srai_epi16(step2[9], 2); + step2[10] = _mm256_srai_epi16(step2[10], 2); + step2[11] = _mm256_srai_epi16(step2[11], 2); + step2[12] = _mm256_srai_epi16(step2[12], 2); + step2[13] = _mm256_srai_epi16(step2[13], 2); + step2[14] = _mm256_srai_epi16(step2[14], 2); + step2[15] = _mm256_srai_epi16(step2[15], 2); + step1[16] = _mm256_srai_epi16(step1[16], 2); + step1[17] = _mm256_srai_epi16(step1[17], 2); + step1[18] = _mm256_srai_epi16(step1[18], 2); + step1[19] = _mm256_srai_epi16(step1[19], 2); + step2[20] = _mm256_srai_epi16(step2[20], 2); + step2[21] = _mm256_srai_epi16(step2[21], 2); + step2[22] = _mm256_srai_epi16(step2[22], 2); + step2[23] = _mm256_srai_epi16(step2[23], 2); + step2[24] = _mm256_srai_epi16(step2[24], 2); + step2[25] = _mm256_srai_epi16(step2[25], 2); + step2[26] = _mm256_srai_epi16(step2[26], 2); + step2[27] = _mm256_srai_epi16(step2[27], 2); + step1[28] = _mm256_srai_epi16(step1[28], 2); + step1[29] = _mm256_srai_epi16(step1[29], 2); + step1[30] = _mm256_srai_epi16(step1[30], 2); + step1[31] = _mm256_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = _mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); + step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[0] = _mm256_add_epi16(step3[3], step3[0]); + step1[1] = _mm256_add_epi16(step3[2], step3[1]); + step1[2] = _mm256_sub_epi16(step3[1], step3[2]); + step1[3] = _mm256_sub_epi16(step3[0], step3[3]); + step1[8] = _mm256_add_epi16(step3[11], step2[8]); + step1[9] = _mm256_add_epi16(step3[10], step2[9]); + step1[10] = _mm256_sub_epi16(step2[9], step3[10]); + step1[11] = _mm256_sub_epi16(step2[8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = + _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = + _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = + _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = + _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = + _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m256i s1_18_5 = + _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = + _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = + _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m256i s1_20_4 = + _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = + _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = + _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = + _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = + _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = + _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = + _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = + _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = + _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = + _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = + _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = + _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = + _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = + _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = + _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = + _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m256i out_00_4 = + _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = + _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = + _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = + _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = + _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = + _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = + _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = + _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = + _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = + _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = + _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = + _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = + _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = + _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = + _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = + _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = + _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = + _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = + _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = + _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = + _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = + _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = + _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = + _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = + _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = + _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = + _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = + _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = + _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = + _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = + _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = + _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm256_packs_epi32(out_04_6, out_04_7); + out[20] = _mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm256_add_epi16(step2[9], step1[8]); + step3[9] = _mm256_sub_epi16(step1[8], step2[9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = + _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = + _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = + _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = + _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = + _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = + _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = + _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = + _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m256i s3_25_4 = + _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = + _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = + _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = + _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = + _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = + _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = + _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = + _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = + _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = + _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = + _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = + _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = + _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = + _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = + _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = + _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = + _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = + _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = + _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = + _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = + _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = + _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = + _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = + _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = + _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = + _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m256i out_18_4 = + _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = + _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = + _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = + _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = + _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = + _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = + _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = + _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = + _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = + _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = + _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = + _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = + _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = + _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = + _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = + _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = + _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = + _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = + _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = + _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = + _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = + _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = + _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = + _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = + _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = + _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = + _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = + _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = + _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = + _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = + _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = + _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i out_17_4 = + _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = + _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = + _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = + _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = + _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = + _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = + _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = + _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = + _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = + _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = + _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = + _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = + _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = + _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = + _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = + _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = + _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = + _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = + _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = + _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = + _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = + _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = + _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = + _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = + _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = + _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = + _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = + _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = + _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = + _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = + _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = + _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = + _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = + _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = + _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i out_13_5 = + _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = + _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = + _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m256i out_03_4 = + _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = + _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = + _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = + _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = + _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = + _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = + _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = + _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m256i lstep1[64], lstep2[64], lstep3[64]; + __m256i u[32], v[32], sign[16]; + const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); + const __m256i k__pOne_mOne = pair256_set_epi16(1, -1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length while adding and subtracting + lstep2[0] = _mm256_unpacklo_epi16(step2[0], step2[7]); + lstep2[1] = _mm256_unpackhi_epi16(step2[0], step2[7]); + lstep2[2] = _mm256_unpacklo_epi16(step2[1], step2[6]); + lstep2[3] = _mm256_unpackhi_epi16(step2[1], step2[6]); + lstep2[4] = _mm256_unpacklo_epi16(step2[2], step2[5]); + lstep2[5] = _mm256_unpackhi_epi16(step2[2], step2[5]); + lstep2[6] = _mm256_unpacklo_epi16(step2[3], step2[4]); + lstep2[7] = _mm256_unpackhi_epi16(step2[3], step2[4]); + + lstep3[0] = _mm256_madd_epi16(lstep2[0], kOne); + lstep3[1] = _mm256_madd_epi16(lstep2[1], kOne); + lstep3[2] = _mm256_madd_epi16(lstep2[2], kOne); + lstep3[3] = _mm256_madd_epi16(lstep2[3], kOne); + lstep3[4] = _mm256_madd_epi16(lstep2[4], kOne); + lstep3[5] = _mm256_madd_epi16(lstep2[5], kOne); + lstep3[6] = _mm256_madd_epi16(lstep2[6], kOne); + lstep3[7] = _mm256_madd_epi16(lstep2[7], kOne); + + lstep3[8] = _mm256_madd_epi16(lstep2[6], k__pOne_mOne); + lstep3[9] = _mm256_madd_epi16(lstep2[7], k__pOne_mOne); + lstep3[10] = _mm256_madd_epi16(lstep2[4], k__pOne_mOne); + lstep3[11] = _mm256_madd_epi16(lstep2[5], k__pOne_mOne); + lstep3[12] = _mm256_madd_epi16(lstep2[2], k__pOne_mOne); + lstep3[13] = _mm256_madd_epi16(lstep2[3], k__pOne_mOne); + lstep3[14] = _mm256_madd_epi16(lstep2[0], k__pOne_mOne); + lstep3[15] = _mm256_madd_epi16(lstep2[1], k__pOne_mOne); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep1[32] = _mm256_unpacklo_epi16(step1[16], step2[23]); + lstep1[33] = _mm256_unpackhi_epi16(step1[16], step2[23]); + lstep1[34] = _mm256_unpacklo_epi16(step1[17], step2[22]); + lstep1[35] = _mm256_unpackhi_epi16(step1[17], step2[22]); + lstep1[36] = _mm256_unpacklo_epi16(step1[18], step2[21]); + lstep1[37] = _mm256_unpackhi_epi16(step1[18], step2[21]); + lstep1[38] = _mm256_unpacklo_epi16(step1[19], step2[20]); + lstep1[39] = _mm256_unpackhi_epi16(step1[19], step2[20]); + + lstep1[56] = _mm256_unpacklo_epi16(step1[28], step2[27]); + lstep1[57] = _mm256_unpackhi_epi16(step1[28], step2[27]); + lstep1[58] = _mm256_unpacklo_epi16(step1[29], step2[26]); + lstep1[59] = _mm256_unpackhi_epi16(step1[29], step2[26]); + lstep1[60] = _mm256_unpacklo_epi16(step1[30], step2[25]); + lstep1[61] = _mm256_unpackhi_epi16(step1[30], step2[25]); + lstep1[62] = _mm256_unpacklo_epi16(step1[31], step2[24]); + lstep1[63] = _mm256_unpackhi_epi16(step1[31], step2[24]); + + lstep3[32] = _mm256_madd_epi16(lstep1[32], kOne); + lstep3[33] = _mm256_madd_epi16(lstep1[33], kOne); + lstep3[34] = _mm256_madd_epi16(lstep1[34], kOne); + lstep3[35] = _mm256_madd_epi16(lstep1[35], kOne); + lstep3[36] = _mm256_madd_epi16(lstep1[36], kOne); + lstep3[37] = _mm256_madd_epi16(lstep1[37], kOne); + lstep3[38] = _mm256_madd_epi16(lstep1[38], kOne); + lstep3[39] = _mm256_madd_epi16(lstep1[39], kOne); + + lstep3[40] = _mm256_madd_epi16(lstep1[38], k__pOne_mOne); + lstep3[41] = _mm256_madd_epi16(lstep1[39], k__pOne_mOne); + lstep3[42] = _mm256_madd_epi16(lstep1[36], k__pOne_mOne); + lstep3[43] = _mm256_madd_epi16(lstep1[37], k__pOne_mOne); + lstep3[44] = _mm256_madd_epi16(lstep1[34], k__pOne_mOne); + lstep3[45] = _mm256_madd_epi16(lstep1[35], k__pOne_mOne); + lstep3[46] = _mm256_madd_epi16(lstep1[32], k__pOne_mOne); + lstep3[47] = _mm256_madd_epi16(lstep1[33], k__pOne_mOne); + + lstep3[48] = _mm256_madd_epi16(lstep1[62], k__pOne_mOne); + lstep3[49] = _mm256_madd_epi16(lstep1[63], k__pOne_mOne); + lstep3[50] = _mm256_madd_epi16(lstep1[60], k__pOne_mOne); + lstep3[51] = _mm256_madd_epi16(lstep1[61], k__pOne_mOne); + lstep3[52] = _mm256_madd_epi16(lstep1[58], k__pOne_mOne); + lstep3[53] = _mm256_madd_epi16(lstep1[59], k__pOne_mOne); + lstep3[54] = _mm256_madd_epi16(lstep1[56], k__pOne_mOne); + lstep3[55] = _mm256_madd_epi16(lstep1[57], k__pOne_mOne); + + lstep3[56] = _mm256_madd_epi16(lstep1[56], kOne); + lstep3[57] = _mm256_madd_epi16(lstep1[57], kOne); + lstep3[58] = _mm256_madd_epi16(lstep1[58], kOne); + lstep3[59] = _mm256_madd_epi16(lstep1[59], kOne); + lstep3[60] = _mm256_madd_epi16(lstep1[60], kOne); + lstep3[61] = _mm256_madd_epi16(lstep1[61], kOne); + lstep3[62] = _mm256_madd_epi16(lstep1[62], kOne); + lstep3[63] = _mm256_madd_epi16(lstep1[63], kOne); + } + + // stage 4 + { + // expanding to 32-bit length prior to addition operations + sign[0] = _mm256_cmpgt_epi16(kZero, step2[8]); + sign[1] = _mm256_cmpgt_epi16(kZero, step2[9]); + sign[2] = _mm256_cmpgt_epi16(kZero, step2[14]); + sign[3] = _mm256_cmpgt_epi16(kZero, step2[15]); + lstep2[16] = _mm256_unpacklo_epi16(step2[8], sign[0]); + lstep2[17] = _mm256_unpackhi_epi16(step2[8], sign[0]); + lstep2[18] = _mm256_unpacklo_epi16(step2[9], sign[1]); + lstep2[19] = _mm256_unpackhi_epi16(step2[9], sign[1]); + lstep2[28] = _mm256_unpacklo_epi16(step2[14], sign[2]); + lstep2[29] = _mm256_unpackhi_epi16(step2[14], sign[2]); + lstep2[30] = _mm256_unpacklo_epi16(step2[15], sign[3]); + lstep2[31] = _mm256_unpackhi_epi16(step2[15], sign[3]); + + lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); + v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); + v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); + v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); + v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); + v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); + v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); + v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); + v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); + v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); + v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); + v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); + v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); + v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); + v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm256_packs_epi32(u[0], u[1]); + out[16] = _mm256_packs_epi32(u[2], u[3]); + out[8] = _mm256_packs_epi32(u[4], u[5]); + out[24] = _mm256_packs_epi32(u[6], u[7]); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); + v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); + v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); + v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); + v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); + v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); + v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); + v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); + v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); + v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); + v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); + v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + out[4] = _mm256_packs_epi32(u[0], u[1]); + out[20] = _mm256_packs_epi32(u[2], u[3]); + out[12] = _mm256_packs_epi32(u[4], u[5]); + out[28] = _mm256_packs_epi32(u[6], u[7]); + } + { + lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = + pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = + pair256_set_epi32(-cospi_12_64, -cospi_20_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); + v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); + v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); + v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); + v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); + v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); + v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); + v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); + v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); + v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); + v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); + v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); + v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); + v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); + v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); + v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); + v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); + v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); + v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); + v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m256i k32_p30_p02 = + pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = + pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = + pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = + pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = + pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = + pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = + pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = + pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); + v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); + v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); + v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); + v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); + v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); + v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); + v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); + v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); + v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); + v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); + v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); + v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); + v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); + v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); + v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); + v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); + v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); + v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); + v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); + v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); + v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); + v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); + v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); + v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[2] = _mm256_packs_epi32(u[0], u[1]); + out[18] = _mm256_packs_epi32(u[2], u[3]); + out[10] = _mm256_packs_epi32(u[4], u[5]); + out[26] = _mm256_packs_epi32(u[6], u[7]); + out[6] = _mm256_packs_epi32(u[8], u[9]); + out[22] = _mm256_packs_epi32(u[10], u[11]); + out[14] = _mm256_packs_epi32(u[12], u[13]); + out[30] = _mm256_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m256i k32_p31_p01 = + pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = + pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = + pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = + pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = + pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = + pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = + pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = + pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); + v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); + v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); + v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); + v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); + v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); + v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); + v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); + v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); + v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); + v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); + v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); + v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); + v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); + v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); + v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); + v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); + v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); + v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); + v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); + v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); + v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); + v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); + v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[1] = _mm256_packs_epi32(u[0], u[1]); + out[17] = _mm256_packs_epi32(u[2], u[3]); + out[9] = _mm256_packs_epi32(u[4], u[5]); + out[25] = _mm256_packs_epi32(u[6], u[7]); + out[7] = _mm256_packs_epi32(u[8], u[9]); + out[23] = _mm256_packs_epi32(u[10], u[11]); + out[15] = _mm256_packs_epi32(u[12], u[13]); + out[31] = _mm256_packs_epi32(u[14], u[15]); + } + { + const __m256i k32_p27_p05 = + pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = + pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = + pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = + pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = + pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = + pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = + pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = + pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); + v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); + v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); + v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); + v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); + v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); + v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); + v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); + v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); + v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); + v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); + v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); + v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); + v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); + v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); + v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); + v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); + v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); + v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); + v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); + v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); + v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); + v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); + v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[5] = _mm256_packs_epi32(u[0], u[1]); + out[21] = _mm256_packs_epi32(u[2], u[3]); + out[13] = _mm256_packs_epi32(u[4], u[5]); + out[29] = _mm256_packs_epi32(u[6], u[7]); + out[3] = _mm256_packs_epi32(u[8], u[9]); + out[19] = _mm256_packs_epi32(u[10], u[11]); + out[11] = _mm256_packs_epi32(u[12], u[13]); + out[27] = _mm256_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. + { + int transpose_block; + int16_t *output_currStep, *output_nextStep; + if (0 == pass) { + output_currStep = &intermediate[column_start * 32]; + output_nextStep = &intermediate[(column_start + 8) * 32]; + } else { + output_currStep = &output_org[column_start * 32]; + output_nextStep = &output_org[(column_start + 8) * 32]; + } + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m256i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 + // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 + // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 + // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 + // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 + // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 + const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]); + const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]); + const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]); + const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]); + const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]); + const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]); + const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]); + const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]); + // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31 + // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71 + // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35 + // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75 + // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 101 + // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151 + // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115 + // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155 + + const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); + const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); + const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); + const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); + const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); + const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); + const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); + const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); + // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69 + // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73 + // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71 + // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75 + // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149 + // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153 + // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151 + // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155 + __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148 + // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149 + // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150 + // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151 + // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152 + // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153 + // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154 + // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero); + __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero); + __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero); + __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero); + __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero); + __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero); + __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero); + __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm256_add_epi16(tr2_0, kOne); + tr2_1 = _mm256_add_epi16(tr2_1, kOne); + tr2_2 = _mm256_add_epi16(tr2_2, kOne); + tr2_3 = _mm256_add_epi16(tr2_3, kOne); + tr2_4 = _mm256_add_epi16(tr2_4, kOne); + tr2_5 = _mm256_add_epi16(tr2_5, kOne); + tr2_6 = _mm256_add_epi16(tr2_6, kOne); + tr2_7 = _mm256_add_epi16(tr2_7, kOne); + tr2_0 = _mm256_srai_epi16(tr2_0, 2); + tr2_1 = _mm256_srai_epi16(tr2_1, 2); + tr2_2 = _mm256_srai_epi16(tr2_2, 2); + tr2_3 = _mm256_srai_epi16(tr2_3, 2); + tr2_4 = _mm256_srai_epi16(tr2_4, 2); + tr2_5 = _mm256_srai_epi16(tr2_5, 2); + tr2_6 = _mm256_srai_epi16(tr2_6, 2); + tr2_7 = _mm256_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), + _mm256_castsi256_si128(tr2_0)); + _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), + _mm256_castsi256_si128(tr2_1)); + _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), + _mm256_castsi256_si128(tr2_2)); + _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), + _mm256_castsi256_si128(tr2_3)); + _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), + _mm256_castsi256_si128(tr2_4)); + _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), + _mm256_castsi256_si128(tr2_5)); + _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), + _mm256_castsi256_si128(tr2_6)); + _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), + _mm256_castsi256_si128(tr2_7)); + + _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), + _mm256_extractf128_si256(tr2_0, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), + _mm256_extractf128_si256(tr2_1, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), + _mm256_extractf128_si256(tr2_2, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), + _mm256_extractf128_si256(tr2_3, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), + _mm256_extractf128_si256(tr2_4, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), + _mm256_extractf128_si256(tr2_5, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), + _mm256_extractf128_si256(tr2_6, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), + _mm256_extractf128_si256(tr2_7, 1)); + // Process next 8x8 + output_currStep += 8; + output_nextStep += 8; + } + } + } + } +} // NOLINT diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h new file mode 100644 index 0000000000..bf350b6da0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -0,0 +1,3130 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "vpx_dsp/fwd_txfm.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +// TODO(jingning) The high bit-depth version needs re-work for performance. +// The current SSE2 implementation also causes cross reference to the static +// functions in the C implementation file. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 +#if FDCT32x32_HIGH_PRECISION +static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c +#else +static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, + tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c +#endif // FDCT32x32_HIGH_PRECISION +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif // DCT_HIGH_BIT_DEPTH + +void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_setzero_si128(); + const __m128i kOne = _mm_set1_epi16(1); + + // Do the two transform/transpose passes + int pass; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[0] = ADD_EPI16(in00, in31); + step1[1] = ADD_EPI16(in01, in30); + step1[2] = ADD_EPI16(in02, in29); + step1[3] = ADD_EPI16(in03, in28); + step1[28] = SUB_EPI16(in03, in28); + step1[29] = SUB_EPI16(in02, in29); + step1[30] = SUB_EPI16(in01, in30); + step1[31] = SUB_EPI16(in00, in31); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], + &step1[3], &step1[28], &step1[29], + &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[4] = ADD_EPI16(in04, in27); + step1[5] = ADD_EPI16(in05, in26); + step1[6] = ADD_EPI16(in06, in25); + step1[7] = ADD_EPI16(in07, in24); + step1[24] = SUB_EPI16(in07, in24); + step1[25] = SUB_EPI16(in06, in25); + step1[26] = SUB_EPI16(in05, in26); + step1[27] = SUB_EPI16(in04, in27); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], + &step1[7], &step1[24], &step1[25], + &step1[26], &step1[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[8] = ADD_EPI16(in08, in23); + step1[9] = ADD_EPI16(in09, in22); + step1[10] = ADD_EPI16(in10, in21); + step1[11] = ADD_EPI16(in11, in20); + step1[20] = SUB_EPI16(in11, in20); + step1[21] = SUB_EPI16(in10, in21); + step1[22] = SUB_EPI16(in09, in22); + step1[23] = SUB_EPI16(in08, in23); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], + &step1[11], &step1[20], &step1[21], + &step1[22], &step1[23]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = ADD_EPI16(in12, in19); + step1[13] = ADD_EPI16(in13, in18); + step1[14] = ADD_EPI16(in14, in17); + step1[15] = ADD_EPI16(in15, in16); + step1[16] = SUB_EPI16(in15, in16); + step1[17] = SUB_EPI16(in14, in17); + step1[18] = SUB_EPI16(in13, in18); + step1[19] = SUB_EPI16(in12, in19); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], + &step1[15], &step1[16], &step1[17], + &step1[18], &step1[19]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Stage 2 + { + step2[0] = ADD_EPI16(step1[0], step1[15]); + step2[1] = ADD_EPI16(step1[1], step1[14]); + step2[2] = ADD_EPI16(step1[2], step1[13]); + step2[3] = ADD_EPI16(step1[3], step1[12]); + step2[4] = ADD_EPI16(step1[4], step1[11]); + step2[5] = ADD_EPI16(step1[5], step1[10]); + step2[6] = ADD_EPI16(step1[6], step1[9]); + step2[7] = ADD_EPI16(step1[7], step1[8]); + step2[8] = SUB_EPI16(step1[7], step1[8]); + step2[9] = SUB_EPI16(step1[6], step1[9]); + step2[10] = SUB_EPI16(step1[5], step1[10]); + step2[11] = SUB_EPI16(step1[4], step1[11]); + step2[12] = SUB_EPI16(step1[3], step1[12]); + step2[13] = SUB_EPI16(step1[2], step1[13]); + step2[14] = SUB_EPI16(step1[1], step1[14]); + step2[15] = SUB_EPI16(step1[0], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], + &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[0] = SUB_EPI16(step2[0], s3_00_0); + step2[1] = SUB_EPI16(step2[1], s3_01_0); + step2[2] = SUB_EPI16(step2[2], s3_02_0); + step2[3] = SUB_EPI16(step2[3], s3_03_0); + step2[4] = SUB_EPI16(step2[4], s3_04_0); + step2[5] = SUB_EPI16(step2[5], s3_05_0); + step2[6] = SUB_EPI16(step2[6], s3_06_0); + step2[7] = SUB_EPI16(step2[7], s3_07_0); + step2[8] = SUB_EPI16(step2[8], s2_08_0); + step2[9] = SUB_EPI16(step2[9], s2_09_0); + step2[10] = SUB_EPI16(step2[10], s3_10_0); + step2[11] = SUB_EPI16(step2[11], s3_11_0); + step2[12] = SUB_EPI16(step2[12], s3_12_0); + step2[13] = SUB_EPI16(step2[13], s3_13_0); + step2[14] = SUB_EPI16(step2[14], s2_14_0); + step2[15] = SUB_EPI16(step2[15], s2_15_0); + step1[16] = SUB_EPI16(step1[16], s3_16_0); + step1[17] = SUB_EPI16(step1[17], s3_17_0); + step1[18] = SUB_EPI16(step1[18], s3_18_0); + step1[19] = SUB_EPI16(step1[19], s3_19_0); + step2[20] = SUB_EPI16(step2[20], s3_20_0); + step2[21] = SUB_EPI16(step2[21], s3_21_0); + step2[22] = SUB_EPI16(step2[22], s3_22_0); + step2[23] = SUB_EPI16(step2[23], s3_23_0); + step2[24] = SUB_EPI16(step2[24], s3_24_0); + step2[25] = SUB_EPI16(step2[25], s3_25_0); + step2[26] = SUB_EPI16(step2[26], s3_26_0); + step2[27] = SUB_EPI16(step2[27], s3_27_0); + step1[28] = SUB_EPI16(step1[28], s3_28_0); + step1[29] = SUB_EPI16(step1[29], s3_29_0); + step1[30] = SUB_EPI16(step1[30], s3_30_0); + step1[31] = SUB_EPI16(step1[31], s3_31_0); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x32( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], + &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], + &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], + &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + step2[0] = _mm_add_epi16(step2[0], kOne); + step2[1] = _mm_add_epi16(step2[1], kOne); + step2[2] = _mm_add_epi16(step2[2], kOne); + step2[3] = _mm_add_epi16(step2[3], kOne); + step2[4] = _mm_add_epi16(step2[4], kOne); + step2[5] = _mm_add_epi16(step2[5], kOne); + step2[6] = _mm_add_epi16(step2[6], kOne); + step2[7] = _mm_add_epi16(step2[7], kOne); + step2[8] = _mm_add_epi16(step2[8], kOne); + step2[9] = _mm_add_epi16(step2[9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[0] = _mm_srai_epi16(step2[0], 2); + step2[1] = _mm_srai_epi16(step2[1], 2); + step2[2] = _mm_srai_epi16(step2[2], 2); + step2[3] = _mm_srai_epi16(step2[3], 2); + step2[4] = _mm_srai_epi16(step2[4], 2); + step2[5] = _mm_srai_epi16(step2[5], 2); + step2[6] = _mm_srai_epi16(step2[6], 2); + step2[7] = _mm_srai_epi16(step2[7], 2); + step2[8] = _mm_srai_epi16(step2[8], 2); + step2[9] = _mm_srai_epi16(step2[9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif // !FDCT32x32_HIGH_PRECISION + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); + step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); + step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); + step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); + step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); + step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); + step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); + step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], + &step3[3], &step3[4], &step3[5], + &step3[6], &step3[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], + &step3[13]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[16] = ADD_EPI16(step2[23], step1[16]); + step3[17] = ADD_EPI16(step2[22], step1[17]); + step3[18] = ADD_EPI16(step2[21], step1[18]); + step3[19] = ADD_EPI16(step2[20], step1[19]); + step3[20] = SUB_EPI16(step1[19], step2[20]); + step3[21] = SUB_EPI16(step1[18], step2[21]); + step3[22] = SUB_EPI16(step1[17], step2[22]); + step3[23] = SUB_EPI16(step1[16], step2[23]); + step3[24] = SUB_EPI16(step1[31], step2[24]); + step3[25] = SUB_EPI16(step1[30], step2[25]); + step3[26] = SUB_EPI16(step1[29], step2[26]); + step3[27] = SUB_EPI16(step1[28], step2[27]); + step3[28] = ADD_EPI16(step2[27], step1[28]); + step3[29] = ADD_EPI16(step2[26], step1[29]); + step3[30] = ADD_EPI16(step2[25], step1[30]); + step3[31] = ADD_EPI16(step2[24], step1[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], + &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], + &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], + &step3[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + + // Stage 4 + { + step1[0] = ADD_EPI16(step3[3], step3[0]); + step1[1] = ADD_EPI16(step3[2], step3[1]); + step1[2] = SUB_EPI16(step3[1], step3[2]); + step1[3] = SUB_EPI16(step3[0], step3[3]); + step1[8] = ADD_EPI16(step3[11], step2[8]); + step1[9] = ADD_EPI16(step3[10], step2[9]); + step1[10] = SUB_EPI16(step2[9], step3[10]); + step1[11] = SUB_EPI16(step2[8], step3[11]); + step1[12] = SUB_EPI16(step2[15], step3[12]); + step1[13] = SUB_EPI16(step2[14], step3[13]); + step1[14] = ADD_EPI16(step3[13], step2[14]); + step1[15] = ADD_EPI16(step3[12], step2[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], + &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], + &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], + &step1[21], &step1[26], &step1[27], + &step1[28], &step1[29]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 5 + { + step2[4] = ADD_EPI16(step1[5], step3[4]); + step2[5] = SUB_EPI16(step3[4], step1[5]); + step2[6] = SUB_EPI16(step3[7], step1[6]); + step2[7] = ADD_EPI16(step1[6], step3[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], + &step2[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = + _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = + _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = + _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = + _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = + _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = + _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = + _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = + _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], + &step2[14]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step2[16] = ADD_EPI16(step1[19], step3[16]); + step2[17] = ADD_EPI16(step1[18], step3[17]); + step2[18] = SUB_EPI16(step3[17], step1[18]); + step2[19] = SUB_EPI16(step3[16], step1[19]); + step2[20] = SUB_EPI16(step3[23], step1[20]); + step2[21] = SUB_EPI16(step3[22], step1[21]); + step2[22] = ADD_EPI16(step1[21], step3[22]); + step2[23] = ADD_EPI16(step1[20], step3[23]); + step2[24] = ADD_EPI16(step1[27], step3[24]); + step2[25] = ADD_EPI16(step1[26], step3[25]); + step2[26] = SUB_EPI16(step3[25], step1[26]); + step2[27] = SUB_EPI16(step3[24], step1[27]); + step2[28] = SUB_EPI16(step3[31], step1[28]); + step2[29] = SUB_EPI16(step3[30], step1[29]); + step2[30] = ADD_EPI16(step1[29], step3[30]); + step2[31] = ADD_EPI16(step1[28], step3[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], + &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], + &step2[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = + _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = + _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = + _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = + _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = + _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = + _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = + _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = + _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[8] = ADD_EPI16(step2[9], step1[8]); + step3[9] = SUB_EPI16(step1[8], step2[9]); + step3[10] = SUB_EPI16(step1[11], step2[10]); + step3[11] = ADD_EPI16(step2[10], step1[11]); + step3[12] = ADD_EPI16(step2[13], step1[12]); + step3[13] = SUB_EPI16(step1[12], step2[13]); + step3[14] = SUB_EPI16(step1[15], step2[14]); + step3[15] = ADD_EPI16(step2[14], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], + &step3[11], &step3[12], &step3[13], + &step3[14], &step3[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], + &step3[22], &step3[25], &step3[26], + &step3[29], &step3[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = + _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = + _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = + _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = + _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = + _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = + _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = + _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = + _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = + _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = + _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = + _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = + _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = + _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = + _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = + _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = + _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step1[16] = ADD_EPI16(step3[17], step2[16]); + step1[17] = SUB_EPI16(step2[16], step3[17]); + step1[18] = SUB_EPI16(step2[19], step3[18]); + step1[19] = ADD_EPI16(step3[18], step2[19]); + step1[20] = ADD_EPI16(step3[21], step2[20]); + step1[21] = SUB_EPI16(step2[20], step3[21]); + step1[22] = SUB_EPI16(step2[23], step3[22]); + step1[23] = ADD_EPI16(step3[22], step2[23]); + step1[24] = ADD_EPI16(step3[25], step2[24]); + step1[25] = SUB_EPI16(step2[24], step3[25]); + step1[26] = SUB_EPI16(step2[27], step3[26]); + step1[27] = ADD_EPI16(step3[26], step2[27]); + step1[28] = ADD_EPI16(step3[29], step2[28]); + step1[29] = SUB_EPI16(step2[28], step3[29]); + step1[30] = SUB_EPI16(step2[31], step3[30]); + step1[31] = ADD_EPI16(step3[30], step2[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], + &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], + &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], + &step1[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = + _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = + _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = + _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = + _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = + _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = + _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = + _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = + _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = + _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = + _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = + _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = + _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = + _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = + _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = + _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = + _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = + _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = + _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = + _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = + _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = + _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = + _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = + _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = + _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = + _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = + _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = + _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = + _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = + _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = + _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = + _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = + _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m128i lstep1[64], lstep2[64], lstep3[64]; + __m128i u[32], v[32], sign[16]; + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); + const __m128i k__pOne_mOne = pair_set_epi16(1, -1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length while adding and subtracting + lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]); + lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]); + lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]); + lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]); + lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]); + lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]); + lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]); + lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]); + + lstep3[0] = _mm_madd_epi16(lstep2[0], kOne); + lstep3[1] = _mm_madd_epi16(lstep2[1], kOne); + lstep3[2] = _mm_madd_epi16(lstep2[2], kOne); + lstep3[3] = _mm_madd_epi16(lstep2[3], kOne); + lstep3[4] = _mm_madd_epi16(lstep2[4], kOne); + lstep3[5] = _mm_madd_epi16(lstep2[5], kOne); + lstep3[6] = _mm_madd_epi16(lstep2[6], kOne); + lstep3[7] = _mm_madd_epi16(lstep2[7], kOne); + + lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne); + lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne); + lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne); + lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne); + lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne); + lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne); + lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne); + lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]); + lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]); + lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]); + lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]); + lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]); + lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]); + lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]); + lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]); + + lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]); + lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]); + lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]); + lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]); + lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]); + lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]); + lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]); + lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]); + + lstep3[32] = _mm_madd_epi16(lstep1[32], kOne); + lstep3[33] = _mm_madd_epi16(lstep1[33], kOne); + lstep3[34] = _mm_madd_epi16(lstep1[34], kOne); + lstep3[35] = _mm_madd_epi16(lstep1[35], kOne); + lstep3[36] = _mm_madd_epi16(lstep1[36], kOne); + lstep3[37] = _mm_madd_epi16(lstep1[37], kOne); + lstep3[38] = _mm_madd_epi16(lstep1[38], kOne); + lstep3[39] = _mm_madd_epi16(lstep1[39], kOne); + + lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne); + lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne); + lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne); + lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne); + lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne); + lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne); + lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne); + lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne); + + lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne); + lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne); + lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne); + lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne); + lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne); + lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne); + lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne); + lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne); + + lstep3[56] = _mm_madd_epi16(lstep1[56], kOne); + lstep3[57] = _mm_madd_epi16(lstep1[57], kOne); + lstep3[58] = _mm_madd_epi16(lstep1[58], kOne); + lstep3[59] = _mm_madd_epi16(lstep1[59], kOne); + lstep3[60] = _mm_madd_epi16(lstep1[60], kOne); + lstep3[61] = _mm_madd_epi16(lstep1[61], kOne); + lstep3[62] = _mm_madd_epi16(lstep1[62], kOne); + lstep3[63] = _mm_madd_epi16(lstep1[63], kOne); + } + + // stage 4 + { + // expanding to 32-bit length prior to addition operations + sign[0] = _mm_cmpgt_epi16(kZero, step2[8]); + sign[1] = _mm_cmpgt_epi16(kZero, step2[9]); + sign[2] = _mm_cmpgt_epi16(kZero, step2[14]); + sign[3] = _mm_cmpgt_epi16(kZero, step2[15]); + lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]); + lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]); + lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]); + lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]); + lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]); + lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]); + lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]); + lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]); + + lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_m16); + v[1] = k_madd_epi32(u[1], k32_p16_m16); + v[2] = k_madd_epi32(u[2], k32_p16_m16); + v[3] = k_madd_epi32(u[3], k32_p16_m16); + v[4] = k_madd_epi32(u[0], k32_p16_p16); + v[5] = k_madd_epi32(u[1], k32_p16_p16); + v[6] = k_madd_epi32(u[2], k32_p16_p16); + v[7] = k_madd_epi32(u[3], k32_p16_p16); +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], + &v[5], &v[6], &v[7], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m08_p24); + v[5] = k_madd_epi32(u[5], k32_m08_p24); + v[6] = k_madd_epi32(u[6], k32_m08_p24); + v[7] = k_madd_epi32(u[7], k32_m08_p24); + v[8] = k_madd_epi32(u[8], k32_m24_m08); + v[9] = k_madd_epi32(u[9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[8], k32_m08_p24); + v[21] = k_madd_epi32(u[9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[4], k32_p24_p08); + v[25] = k_madd_epi32(u[5], k32_p24_p08); + v[26] = k_madd_epi32(u[6], k32_p24_p08); + v[27] = k_madd_epi32(u[7], k32_p24_p08); + v[28] = k_madd_epi32(u[0], k32_p24_p08); + v[29] = k_madd_epi32(u[1], k32_p24_p08); + v[30] = k_madd_epi32(u[2], k32_p24_p08); + v[31] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_p16); + v[1] = k_madd_epi32(u[1], k32_p16_p16); + v[2] = k_madd_epi32(u[2], k32_p16_p16); + v[3] = k_madd_epi32(u[3], k32_p16_p16); + v[4] = k_madd_epi32(u[0], k32_p16_m16); + v[5] = k_madd_epi32(u[1], k32_p16_m16); + v[6] = k_madd_epi32(u[2], k32_p16_m16); + v[7] = k_madd_epi32(u[3], k32_p16_m16); + v[8] = k_madd_epi32(u[4], k32_p24_p08); + v[9] = k_madd_epi32(u[5], k32_p24_p08); + v[10] = k_madd_epi32(u[6], k32_p24_p08); + v[11] = k_madd_epi32(u[7], k32_p24_p08); + v[12] = k_madd_epi32(u[4], k32_m08_p24); + v[13] = k_madd_epi32(u[5], k32_m08_p24); + v[14] = k_madd_epi32(u[6], k32_m08_p24); + v[15] = k_madd_epi32(u[7], k32_m08_p24); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm_packs_epi32(u[0], u[1]); + out[16] = _mm_packs_epi32(u[2], u[3]); + out[8] = _mm_packs_epi32(u[4], u[5]); + out[24] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m24_m08); + v[5] = k_madd_epi32(u[5], k32_m24_m08); + v[6] = k_madd_epi32(u[6], k32_m24_m08); + v[7] = k_madd_epi32(u[7], k32_m24_m08); + v[8] = k_madd_epi32(u[4], k32_m08_p24); + v[9] = k_madd_epi32(u[5], k32_m08_p24); + v[10] = k_madd_epi32(u[6], k32_m08_p24); + v[11] = k_madd_epi32(u[7], k32_m08_p24); + v[12] = k_madd_epi32(u[0], k32_p24_p08); + v[13] = k_madd_epi32(u[1], k32_p24_p08); + v[14] = k_madd_epi32(u[2], k32_p24_p08); + v[15] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32(u[0], k32_p28_p04); + v[1] = k_madd_epi32(u[1], k32_p28_p04); + v[2] = k_madd_epi32(u[2], k32_p28_p04); + v[3] = k_madd_epi32(u[3], k32_p28_p04); + v[4] = k_madd_epi32(u[4], k32_p12_p20); + v[5] = k_madd_epi32(u[5], k32_p12_p20); + v[6] = k_madd_epi32(u[6], k32_p12_p20); + v[7] = k_madd_epi32(u[7], k32_p12_p20); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m04_p28); + v[13] = k_madd_epi32(u[13], k32_m04_p28); + v[14] = k_madd_epi32(u[14], k32_m04_p28); + v[15] = k_madd_epi32(u[15], k32_m04_p28); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + out[4] = _mm_packs_epi32(u[0], u[1]); + out[20] = _mm_packs_epi32(u[2], u[3]); + out[12] = _mm_packs_epi32(u[4], u[5]); + out[28] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m12_m20 = + pair_set_epi32(-cospi_12_64, -cospi_20_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32(u[0], k32_m04_p28); + v[1] = k_madd_epi32(u[1], k32_m04_p28); + v[2] = k_madd_epi32(u[2], k32_m04_p28); + v[3] = k_madd_epi32(u[3], k32_m04_p28); + v[4] = k_madd_epi32(u[4], k32_m28_m04); + v[5] = k_madd_epi32(u[5], k32_m28_m04); + v[6] = k_madd_epi32(u[6], k32_m28_m04); + v[7] = k_madd_epi32(u[7], k32_m28_m04); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m12_m20); + v[13] = k_madd_epi32(u[13], k32_m12_m20); + v[14] = k_madd_epi32(u[14], k32_m12_m20); + v[15] = k_madd_epi32(u[15], k32_m12_m20); + v[16] = k_madd_epi32(u[12], k32_m20_p12); + v[17] = k_madd_epi32(u[13], k32_m20_p12); + v[18] = k_madd_epi32(u[14], k32_m20_p12); + v[19] = k_madd_epi32(u[15], k32_m20_p12); + v[20] = k_madd_epi32(u[8], k32_p12_p20); + v[21] = k_madd_epi32(u[9], k32_p12_p20); + v[22] = k_madd_epi32(u[10], k32_p12_p20); + v[23] = k_madd_epi32(u[11], k32_p12_p20); + v[24] = k_madd_epi32(u[4], k32_m04_p28); + v[25] = k_madd_epi32(u[5], k32_m04_p28); + v[26] = k_madd_epi32(u[6], k32_m04_p28); + v[27] = k_madd_epi32(u[7], k32_m04_p28); + v[28] = k_madd_epi32(u[0], k32_p28_p04); + v[29] = k_madd_epi32(u[1], k32_p28_p04); + v[30] = k_madd_epi32(u[2], k32_p28_p04); + v[31] = k_madd_epi32(u[3], k32_p28_p04); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); + const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32(u[0], k32_p30_p02); + v[1] = k_madd_epi32(u[1], k32_p30_p02); + v[2] = k_madd_epi32(u[2], k32_p30_p02); + v[3] = k_madd_epi32(u[3], k32_p30_p02); + v[4] = k_madd_epi32(u[4], k32_p14_p18); + v[5] = k_madd_epi32(u[5], k32_p14_p18); + v[6] = k_madd_epi32(u[6], k32_p14_p18); + v[7] = k_madd_epi32(u[7], k32_p14_p18); + v[8] = k_madd_epi32(u[8], k32_p22_p10); + v[9] = k_madd_epi32(u[9], k32_p22_p10); + v[10] = k_madd_epi32(u[10], k32_p22_p10); + v[11] = k_madd_epi32(u[11], k32_p22_p10); + v[12] = k_madd_epi32(u[12], k32_p06_p26); + v[13] = k_madd_epi32(u[13], k32_p06_p26); + v[14] = k_madd_epi32(u[14], k32_p06_p26); + v[15] = k_madd_epi32(u[15], k32_p06_p26); + v[16] = k_madd_epi32(u[12], k32_m26_p06); + v[17] = k_madd_epi32(u[13], k32_m26_p06); + v[18] = k_madd_epi32(u[14], k32_m26_p06); + v[19] = k_madd_epi32(u[15], k32_m26_p06); + v[20] = k_madd_epi32(u[8], k32_m10_p22); + v[21] = k_madd_epi32(u[9], k32_m10_p22); + v[22] = k_madd_epi32(u[10], k32_m10_p22); + v[23] = k_madd_epi32(u[11], k32_m10_p22); + v[24] = k_madd_epi32(u[4], k32_m18_p14); + v[25] = k_madd_epi32(u[5], k32_m18_p14); + v[26] = k_madd_epi32(u[6], k32_m18_p14); + v[27] = k_madd_epi32(u[7], k32_m18_p14); + v[28] = k_madd_epi32(u[0], k32_m02_p30); + v[29] = k_madd_epi32(u[1], k32_m02_p30); + v[30] = k_madd_epi32(u[2], k32_m02_p30); + v[31] = k_madd_epi32(u[3], k32_m02_p30); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[2] = _mm_packs_epi32(u[0], u[1]); + out[18] = _mm_packs_epi32(u[2], u[3]); + out[10] = _mm_packs_epi32(u[4], u[5]); + out[26] = _mm_packs_epi32(u[6], u[7]); + out[6] = _mm_packs_epi32(u[8], u[9]); + out[22] = _mm_packs_epi32(u[10], u[11]); + out[14] = _mm_packs_epi32(u[12], u[13]); + out[30] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); + const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32(u[0], k32_p31_p01); + v[1] = k_madd_epi32(u[1], k32_p31_p01); + v[2] = k_madd_epi32(u[2], k32_p31_p01); + v[3] = k_madd_epi32(u[3], k32_p31_p01); + v[4] = k_madd_epi32(u[4], k32_p15_p17); + v[5] = k_madd_epi32(u[5], k32_p15_p17); + v[6] = k_madd_epi32(u[6], k32_p15_p17); + v[7] = k_madd_epi32(u[7], k32_p15_p17); + v[8] = k_madd_epi32(u[8], k32_p23_p09); + v[9] = k_madd_epi32(u[9], k32_p23_p09); + v[10] = k_madd_epi32(u[10], k32_p23_p09); + v[11] = k_madd_epi32(u[11], k32_p23_p09); + v[12] = k_madd_epi32(u[12], k32_p07_p25); + v[13] = k_madd_epi32(u[13], k32_p07_p25); + v[14] = k_madd_epi32(u[14], k32_p07_p25); + v[15] = k_madd_epi32(u[15], k32_p07_p25); + v[16] = k_madd_epi32(u[12], k32_m25_p07); + v[17] = k_madd_epi32(u[13], k32_m25_p07); + v[18] = k_madd_epi32(u[14], k32_m25_p07); + v[19] = k_madd_epi32(u[15], k32_m25_p07); + v[20] = k_madd_epi32(u[8], k32_m09_p23); + v[21] = k_madd_epi32(u[9], k32_m09_p23); + v[22] = k_madd_epi32(u[10], k32_m09_p23); + v[23] = k_madd_epi32(u[11], k32_m09_p23); + v[24] = k_madd_epi32(u[4], k32_m17_p15); + v[25] = k_madd_epi32(u[5], k32_m17_p15); + v[26] = k_madd_epi32(u[6], k32_m17_p15); + v[27] = k_madd_epi32(u[7], k32_m17_p15); + v[28] = k_madd_epi32(u[0], k32_m01_p31); + v[29] = k_madd_epi32(u[1], k32_m01_p31); + v[30] = k_madd_epi32(u[2], k32_m01_p31); + v[31] = k_madd_epi32(u[3], k32_m01_p31); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[1] = _mm_packs_epi32(u[0], u[1]); + out[17] = _mm_packs_epi32(u[2], u[3]); + out[9] = _mm_packs_epi32(u[4], u[5]); + out[25] = _mm_packs_epi32(u[6], u[7]); + out[7] = _mm_packs_epi32(u[8], u[9]); + out[23] = _mm_packs_epi32(u[10], u[11]); + out[15] = _mm_packs_epi32(u[12], u[13]); + out[31] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32(u[0], k32_p27_p05); + v[1] = k_madd_epi32(u[1], k32_p27_p05); + v[2] = k_madd_epi32(u[2], k32_p27_p05); + v[3] = k_madd_epi32(u[3], k32_p27_p05); + v[4] = k_madd_epi32(u[4], k32_p11_p21); + v[5] = k_madd_epi32(u[5], k32_p11_p21); + v[6] = k_madd_epi32(u[6], k32_p11_p21); + v[7] = k_madd_epi32(u[7], k32_p11_p21); + v[8] = k_madd_epi32(u[8], k32_p19_p13); + v[9] = k_madd_epi32(u[9], k32_p19_p13); + v[10] = k_madd_epi32(u[10], k32_p19_p13); + v[11] = k_madd_epi32(u[11], k32_p19_p13); + v[12] = k_madd_epi32(u[12], k32_p03_p29); + v[13] = k_madd_epi32(u[13], k32_p03_p29); + v[14] = k_madd_epi32(u[14], k32_p03_p29); + v[15] = k_madd_epi32(u[15], k32_p03_p29); + v[16] = k_madd_epi32(u[12], k32_m29_p03); + v[17] = k_madd_epi32(u[13], k32_m29_p03); + v[18] = k_madd_epi32(u[14], k32_m29_p03); + v[19] = k_madd_epi32(u[15], k32_m29_p03); + v[20] = k_madd_epi32(u[8], k32_m13_p19); + v[21] = k_madd_epi32(u[9], k32_m13_p19); + v[22] = k_madd_epi32(u[10], k32_m13_p19); + v[23] = k_madd_epi32(u[11], k32_m13_p19); + v[24] = k_madd_epi32(u[4], k32_m21_p11); + v[25] = k_madd_epi32(u[5], k32_m21_p11); + v[26] = k_madd_epi32(u[6], k32_m21_p11); + v[27] = k_madd_epi32(u[7], k32_m21_p11); + v[28] = k_madd_epi32(u[0], k32_m05_p27); + v[29] = k_madd_epi32(u[1], k32_m05_p27); + v[30] = k_madd_epi32(u[2], k32_m05_p27); + v[31] = k_madd_epi32(u[3], k32_m05_p27); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[5] = _mm_packs_epi32(u[0], u[1]); + out[21] = _mm_packs_epi32(u[2], u[3]); + out[13] = _mm_packs_epi32(u[4], u[5]); + out[29] = _mm_packs_epi32(u[6], u[7]); + out[3] = _mm_packs_epi32(u[8], u[9]); + out[19] = _mm_packs_epi32(u[10], u[11]); + out[11] = _mm_packs_epi32(u[12], u[13]); + out[27] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } +#endif // FDCT32x32_HIGH_PRECISION + // Transpose the results, do it as four 8x8 transposes. + { + int transpose_block; + int16_t *output0 = &intermediate[column_start * 32]; + tran_low_t *output1 = &output_org[column_start * 32]; + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + if (pass == 0) { + _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7); + // Process next 8x8 + output0 += 8; + } else { + storeu_output(&tr2_0, (output1 + 0 * 32)); + storeu_output(&tr2_1, (output1 + 1 * 32)); + storeu_output(&tr2_2, (output1 + 2 * 32)); + storeu_output(&tr2_3, (output1 + 3 * 32)); + storeu_output(&tr2_4, (output1 + 4 * 32)); + storeu_output(&tr2_5, (output1 + 5 * 32)); + storeu_output(&tr2_6, (output1 + 6 * 32)); + storeu_output(&tr2_7, (output1 + 7 * 32)); + // Process next 8x8 + output1 += 8; + } + } + } + } + } +} // NOLINT + +#undef ADD_EPI16 +#undef SUB_EPI16 +#undef HIGH_FDCT32x32_2D_C +#undef HIGH_FDCT32x32_2D_ROWS_C diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c new file mode 100644 index 0000000000..c8f54a49cb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // AVX2 +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. +static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. + for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + +#if !CONFIG_VP9_HIGHBITDEPTH +#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 0000000000..d546f02a14 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" +#include "vpx_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i in0, in1; +#if DCT_HIGH_BIT_DEPTH + __m128i cmp0, cmp1; + int test, overflow; +#endif + + // Load inputs. + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); +// in0 = [i0 i1 i2 i3 iC iD iE iF] +// in1 = [i4 i5 i6 i7 i8 i9 iA iB] +#if DCT_HIGH_BIT_DEPTH + // Check inputs small enough to use optimised code + cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00))); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00))); + test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); + if (test) { + vpx_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // multiply by 16 to give some extra precision + in0 = _mm_slli_epi16(in0, 4); + in1 = _mm_slli_epi16(in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); + in0 = _mm_add_epi16(in0, mask); + in0 = _mm_add_epi16(in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(in0, in1); + const __m128i r1 = _mm_unpackhi_epi16(in0, in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + vpx_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + in0 = _mm_shuffle_epi32(x0, 0xD8); + in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. + // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(in0, in1); + const __m128i t1 = SUB_EPI16(in0, in1); +// t0 = [c0 c1 c8 c9 c4 c5 cC cD] +// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&t0, &t1); + if (overflow) { + vpx_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + vpx_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + vpx_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[256]); + const int16_t *in = input; + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (column_start = 0; column_start < 16; column_start += 8) { + __m128i in00, in01, in02, in03, in04, in05, in06, in07; + __m128i in08, in09, in10, in11, in12, in13, in14, in15; + __m128i input0, input1, input2, input3, input4, input5, input6, input7; + __m128i step1_0, step1_1, step1_2, step1_3; + __m128i step1_4, step1_5, step1_6, step1_7; + __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + __m128i step3_0, step3_1, step3_2, step3_3; + __m128i step3_4, step3_5, step3_6, step3_7; + __m128i res00, res01, res02, res03, res04, res05, res06, res07; + __m128i res08, res09, res10, res11, res12, res13, res14, res15; + // Load and pre-condition input. + if (0 == pass) { + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); + // x = x << 2 + in00 = _mm_slli_epi16(in00, 2); + in01 = _mm_slli_epi16(in01, 2); + in02 = _mm_slli_epi16(in02, 2); + in03 = _mm_slli_epi16(in03, 2); + in04 = _mm_slli_epi16(in04, 2); + in05 = _mm_slli_epi16(in05, 2); + in06 = _mm_slli_epi16(in06, 2); + in07 = _mm_slli_epi16(in07, 2); + in08 = _mm_slli_epi16(in08, 2); + in09 = _mm_slli_epi16(in09, 2); + in10 = _mm_slli_epi16(in10, 2); + in11 = _mm_slli_epi16(in11, 2); + in12 = _mm_slli_epi16(in12, 2); + in13 = _mm_slli_epi16(in13, 2); + in14 = _mm_slli_epi16(in14, 2); + in15 = _mm_slli_epi16(in15, 2); + } else { + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); + // x = (x + 1) >> 2 + in00 = _mm_add_epi16(in00, kOne); + in01 = _mm_add_epi16(in01, kOne); + in02 = _mm_add_epi16(in02, kOne); + in03 = _mm_add_epi16(in03, kOne); + in04 = _mm_add_epi16(in04, kOne); + in05 = _mm_add_epi16(in05, kOne); + in06 = _mm_add_epi16(in06, kOne); + in07 = _mm_add_epi16(in07, kOne); + in08 = _mm_add_epi16(in08, kOne); + in09 = _mm_add_epi16(in09, kOne); + in10 = _mm_add_epi16(in10, kOne); + in11 = _mm_add_epi16(in11, kOne); + in12 = _mm_add_epi16(in12, kOne); + in13 = _mm_add_epi16(in13, kOne); + in14 = _mm_add_epi16(in14, kOne); + in15 = _mm_add_epi16(in15, kOne); + in00 = _mm_srai_epi16(in00, 2); + in01 = _mm_srai_epi16(in01, 2); + in02 = _mm_srai_epi16(in02, 2); + in03 = _mm_srai_epi16(in03, 2); + in04 = _mm_srai_epi16(in04, 2); + in05 = _mm_srai_epi16(in05, 2); + in06 = _mm_srai_epi16(in06, 2); + in07 = _mm_srai_epi16(in07, 2); + in08 = _mm_srai_epi16(in08, 2); + in09 = _mm_srai_epi16(in09, 2); + in10 = _mm_srai_epi16(in10, 2); + in11 = _mm_srai_epi16(in11, 2); + in12 = _mm_srai_epi16(in12, 2); + in13 = _mm_srai_epi16(in13, 2); + in14 = _mm_srai_epi16(in14, 2); + in15 = _mm_srai_epi16(in15, 2); + } + in += 8; + // Calculate input for the first 8 results. + { + input0 = ADD_EPI16(in00, in15); + input1 = ADD_EPI16(in01, in14); + input2 = ADD_EPI16(in02, in13); + input3 = ADD_EPI16(in03, in12); + input4 = ADD_EPI16(in04, in11); + input5 = ADD_EPI16(in05, in10); + input6 = ADD_EPI16(in06, in09); + input7 = ADD_EPI16(in07, in08); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, + &input4, &input5, &input6, &input7); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Calculate input for the next 8 results. + { + step1_0 = SUB_EPI16(in07, in08); + step1_1 = SUB_EPI16(in06, in09); + step1_2 = SUB_EPI16(in05, in10); + step1_3 = SUB_EPI16(in04, in11); + step1_4 = SUB_EPI16(in03, in12); + step1_5 = SUB_EPI16(in02, in13); + step1_6 = SUB_EPI16(in01, in14); + step1_7 = SUB_EPI16(in00, in15); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m128i q0 = ADD_EPI16(input0, input7); + const __m128i q1 = ADD_EPI16(input1, input6); + const __m128i q2 = ADD_EPI16(input2, input5); + const __m128i q3 = ADD_EPI16(input3, input4); + const __m128i q4 = SUB_EPI16(input3, input4); + const __m128i q5 = SUB_EPI16(input2, input5); + const __m128i q6 = SUB_EPI16(input1, input6); + const __m128i q7 = SUB_EPI16(input0, input7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i r0 = + mult_round_shift(&d0, &d1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m128i r1 = + mult_round_shift(&d0, &d1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&res02, &res14, &res10, &res06); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 3 + { + step3_0 = ADD_EPI16(step1_0, step2_3); + step3_1 = ADD_EPI16(step1_1, step2_2); + step3_2 = SUB_EPI16(step1_1, step2_2); + step3_3 = SUB_EPI16(step1_0, step2_3); + step3_4 = SUB_EPI16(step1_7, step2_4); + step3_5 = SUB_EPI16(step1_6, step2_5); + step3_6 = ADD_EPI16(step1_6, step2_5); + step3_7 = ADD_EPI16(step1_7, step2_4); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, + &step3_4, &step3_5, &step3_6, &step3_7); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 4 + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 5 + { + step1_0 = ADD_EPI16(step3_0, step2_1); + step1_1 = SUB_EPI16(step3_0, step2_1); + step1_2 = ADD_EPI16(step3_3, step2_2); + step1_3 = SUB_EPI16(step3_3, step2_2); + step1_4 = SUB_EPI16(step3_4, step2_5); + step1_5 = ADD_EPI16(step3_4, step2_5); + step1_6 = SUB_EPI16(step3_7, step2_6); + step1_7 = ADD_EPI16(step3_7, step2_6); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 6 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); + if (overflow) { + vpx_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Transpose the results, do it as two 8x8 transposes. + transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, + &res06, &res07, pass, out0, out1); + transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, + &res14, &res15, pass, out0 + 8, out1 + 8); + if (pass == 0) { + out0 += 8 * 16; + } else { + out1 += 8 * 16; + } + } + // Setup in/out for next pass. + in = intermediate; + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 0000000000..e14b99197f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/fwd_txfm_sse2.h" + +void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); +} + +void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + input += 8 * stride; + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +#define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D vpx_fdct4x4_sse2 +#define FDCT8x8_2D vpx_fdct8x8_sse2 +#define FDCT16x16_2D vpx_fdct16x16_sse2 +#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D + +#define FDCT32x32_2D vpx_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vpx_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +#define DCT_HIGH_BIT_DEPTH 1 +#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2 +#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2 +#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2 +#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D + +#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h new file mode 100644 index 0000000000..5aa2779706 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3, + const __m128i *zero) { + __m128i minus_one = _mm_set1_epi32(-1); + // Check for overflows + __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); + __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); + __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); + __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); + __m128i reg0_top_dwords = + _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg1_top_dwords = + _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg2_top_dwords = + _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg3_top_dwords = + _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); + __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); + __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); + __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); + __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); + __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); + int overflow_01 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); + int overflow_23 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); + return (overflow_01 + overflow_23); +} + +static INLINE int k_check_epi32_overflow_8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + } + return overflow; +} + +static INLINE int k_check_epi32_overflow_16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + } + } + } + return overflow; +} + +static INLINE int k_check_epi32_overflow_32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, + preg27, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, + preg31, zero); + } + } + } + } + } + } + } + return overflow; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_store_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, + const __m128i *pmultiplier, + const __m128i *prounding, + const int shift) { + const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); + const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); + const __m128i v0 = _mm_add_epi32(u0, *prounding); + const __m128i v1 = _mm_add_epi32(u1, *prounding); + const __m128i w0 = _mm_srai_epi32(v0, shift); + const __m128i w1 = _mm_srai_epi32(v1, shift); + return _mm_packs_epi32(w0, w1); +} + +static INLINE void transpose_and_output8x8( + const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, + const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, + const __m128i *pin06, const __m128i *pin07, const int pass, + int16_t *out0_ptr, tran_low_t *out1_ptr) { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); + const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); + const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); + const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); + const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); + const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); + const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); + const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (pass == 0) { + _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); + _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); + _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); + _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); + _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); + _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); + _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); + _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); + } else { + storeu_output(&tr2_0, (out1_ptr + 0 * 16)); + storeu_output(&tr2_1, (out1_ptr + 1 * 16)); + storeu_output(&tr2_2, (out1_ptr + 2 * 16)); + storeu_output(&tr2_3, (out1_ptr + 3 * 16)); + storeu_output(&tr2_4, (out1_ptr + 4 * 16)); + storeu_output(&tr2_5, (out1_ptr + 5 * 16)); + storeu_output(&tr2_6, (out1_ptr + 6 * 16)); + storeu_output(&tr2_7, (out1_ptr + 7 * 16)); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 0000000000..2c338fb5dd --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,361 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +SECTION .text + +%if VPX_ARCH_X86_64 +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + mova [outputq + 0], m4 + mova [outputq + 16], m5 + mova [outputq + 32], m3 + mova [outputq + 48], m0 + mova [outputq + 64], m10 + mova [outputq + 80], m9 + mova [outputq + 96], m7 + mova [outputq + 112], m2 + + RET +%endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 0000000000..01a52ec8bf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(w % 4 == 0); + if (w > 32) { // w = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // w = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // w = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w > 4) { // w = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // w = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(w % 4 == 0); + if (w > 32) { // w = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // w = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // w = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (w > 4) { // w = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // w = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void vpx_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void vpx_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void vpx_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void vpx_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void vpx_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +// Calculation with averaging the input pixels + +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +static void vpx_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will extract the middle four elements of the kernel into two registers + // in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum of the first half. + // Calling add gives us first half of the output. Repat again to get the whole + // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows + // at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg, res_first, res_last; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for first half + res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for second half + res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round each result + res_first = mm256_round_epi32(&res_first, ®_round, CONV8_ROUNDING_BITS); + res_last = mm256_round_epi32(&res_last, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_first, res_last); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); + } +} + +static void vpx_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +static void vpx_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + // Result after multiply and add + __m256i res_reg; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + + // Output + res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel + + // Result after multiply and add + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); + + // Output from first half + res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, + &kernel_reg_23, &kernel_reg_45); + + // Output from second half + res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg_lo = + mm256_round_epi32(&res_reg_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_hi = + mm256_round_epi32(&res_reg_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; + +#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 +#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 +#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 +#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_avx2 \ + vpx_highbd_filter_block1d16_v8_avg_avx2 +#define vpx_highbd_filter_block1d16_h4_avg_avx2 \ + vpx_highbd_filter_block1d16_h8_avg_avx2 +#define vpx_highbd_filter_block1d8_v4_avg_avx2 \ + vpx_highbd_filter_block1d8_v8_avg_avx2 +#define vpx_highbd_filter_block1d8_h4_avg_avx2 \ + vpx_highbd_filter_block1d8_h8_avg_avx2 +#define vpx_highbd_filter_block1d4_v4_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_avx2 +#define vpx_highbd_filter_block1d4_h4_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_avx2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , avx2, 0) +HIGH_FUN_CONV_2D(, avx2, 0) + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; + +#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ + vpx_highbd_filter_block1d4_h2_avg_sse2 +#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ + vpx_highbd_filter_block1d4_v2_avg_sse2 + +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) +HIGH_FUN_CONV_2D(avg_, avx2, 1) + +#undef HIGHBD_FUNC diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 0000000000..f4f7235d13 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 16); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 0000000000..7898ee12c8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + vpx_highbd_idct16_4col_sse4_1(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + vpx_highbd_idct16_4col_sse4_1(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 0000000000..c710e89954 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 32); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 0000000000..2d0a53ac0a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] = _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 0000000000..b9c8884f99 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function. + + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = (int16_t)_mm_extract_epi16(max_input, 0); + min = (int16_t)_mm_extract_epi16(min_input, 0); + } + + if (bd == 8 || (max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} + +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int a1, i; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < 4; ++i) { + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, dc, bd); + _mm_storel_epi64((__m128i *)dest, d); + dest += stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 0000000000..fe74d272ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4_sse4_1(io); + highbd_idct4_sse4_1(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 0000000000..bb7a510e15 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 8); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 0000000000..8b2e3d2415 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + vpx_highbd_idct8x8_half1d_sse4_1(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + vpx_highbd_idct8x8_half1d_sse4_1(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 0000000000..43634aea3a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_castps_si128( + _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1))); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 0000000000..d673fac493 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, + rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1 }; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm new file mode 100644 index 0000000000..caf506ac07 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -0,0 +1,453 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd + movd m1, [aboveq-2] + movq m0, [aboveq] + pshuflw m1, m1, 0x0 + movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 + movlhps m1, m1 ; tl tl tl tl tl tl tl tl + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bdd + psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl + psllw m3, m4 + pcmpeqw m2, m2 + pxor m4, m4 ; min possible value + pxor m3, m2 ; max possible value + mova m1, [leftq] + pshuflw m2, m1, 0x0 + pshuflw m5, m1, 0x55 + movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + pshuflw m2, m1, 0xaa + pshuflw m5, m1, 0xff + movlhps m2, m5 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bdd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bdd + punpcklqdq m2, m2 + psllw m3, m4 + pcmpeqw m5, m5 + pxor m4, m4 ; min possible value + pxor m3, m5 ; max possible value + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + psubw m0, m2 + psubw m1, m2 +.loop: + movd m7, [leftq] + pshuflw m5, m7, 0x0 + pshuflw m2, m7, 0x55 + punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 + punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 + paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 + pminsw m6, m3 + pminsw m5, m3 + pmaxsw m6, m4 ; Clamp to the bit-depth + pmaxsw m5, m4 + mova [dstq ], m6 + mova [dstq +16], m5 + paddw m6, m2, m0 + paddw m2, m1 + pminsw m6, m3 + pminsw m2, m3 + pmaxsw m6, m4 + pmaxsw m2, m4 + mova [dstq+strideq*2 ], m6 + mova [dstq+strideq*2+16], m2 + lea dstq, [dstq+strideq*4] + inc lineq + lea leftq, [leftq+4] + + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m5, m5 + movd m6, bdd + psllw m5, m6 + pcmpeqw m7, m7 + pxor m6, m6 ; min possible value + pxor m5, m7 ; max possible value + punpcklqdq m0, m0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m7, [leftq] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +48], m0 + movd m7, [leftq+2] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2 ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+48], m0 + lea dstq, [dstq+strideq*4] + lea leftq, [leftq+4] + inc lineq + jnz .loop + REP_RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 0000000000..1d07391b02 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +// Note: There is no 64-bit bit-level shifting SIMD instruction. All +// coefficients are left shifted by 2, so that dct_const_round_shift() can be +// done by right shifting 2 bytes. + +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 +} + +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative. +static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_setzero_si128(); + // Faster than _mm_set1_epi16((1 << bd) - 1). + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { + int a1, i, j; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < size; ++i) { + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); + } + dest += stride; + } +} + +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); +} + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 0000000000..f446bb13f3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +static INLINE void highbd_idct4_sse4_1(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 0000000000..9f45623dee --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1140 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { + __m128i ubounded; + __m128i lbounded; + __m128i retval; + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + __m128i t80, max, min; + + if (bd == 8) { + t80 = _mm_set1_epi16(0x80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + } else if (bd == 10) { + t80 = _mm_set1_epi16(0x200); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + } else { // bd == 12 + t80 = _mm_set1_epi16(0x800); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + } + + min = _mm_subs_epi16(zero, t80); + + ubounded = _mm_cmpgt_epi16(value, max); + lbounded = _mm_cmplt_epi16(value, min); + retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); + ubounded = _mm_and_si128(ubounded, max); + lbounded = _mm_and_si128(lbounded, min); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_or_si128(retval, lbounded); + return retval; +} + +// TODO(debargha, peter): Break up large functions into smaller ones +// in this file. +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + __m128i blimit_v, limit_v, thresh_v; + __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; + __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; + __m128i ps1, qs1, ps0, qs0; + __m128i abs_p0q0, abs_p1q1, ffff, work; + __m128i filt, work_a, filter1, filter2; + __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; + __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; + __m128i flat2_q0, flat2_p0; + __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3; + __m128i t4, t3, t80, t1; + __m128i eight, four; + + if (bd == 8) { + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); + } else if (bd == 10) { + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); + } else { // bd == 12 + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); + } + + q4 = _mm_load_si128((__m128i *)(s + 4 * pitch)); + p4 = _mm_load_si128((__m128i *)(s - 5 * pitch)); + q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + + // highbd_filter_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + + // highbd_hev_mask (in C code this is actually called from highbd_filter4) + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + + mask = _mm_subs_epu16(mask, limit_v); + mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + + // lp filter + // highbd_filter4 + t4 = _mm_set1_epi16(4); + t3 = _mm_set1_epi16(3); + if (bd == 8) + t80 = _mm_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm_set1_epi16(0x200); + else // bd == 12 + t80 = _mm_set1_epi16(0x800); + + t1 = _mm_set1_epi16(0x1); + + ps1 = _mm_subs_epi16(p1, t80); + qs1 = _mm_subs_epi16(q1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + + filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), + hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_and_si128(filt, mask); + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + filter1 = _mm_srai_epi16(filter1, 0x3); + filter2 = _mm_srai_epi16(filter2, 0x3); + + qs0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + ps0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(hev, filt); + qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + // end highbd_filter4 + // loopfilter done + + // highbd_flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + flat = _mm_max_epi16(work, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + // end flat_mask4 + + // flat & mask = flat && mask (as used in filter8) + // (because, in both vars, each block of 16 either all 1s or all 0s) + flat = _mm_and_si128(flat, mask); + + p5 = _mm_load_si128((__m128i *)(s - 6 * pitch)); + q5 = _mm_load_si128((__m128i *)(s + 5 * pitch)); + p6 = _mm_load_si128((__m128i *)(s - 7 * pitch)); + q6 = _mm_load_si128((__m128i *)(s + 6 * pitch)); + p7 = _mm_load_si128((__m128i *)(s - 8 * pitch)); + q7 = _mm_load_si128((__m128i *)(s + 7 * pitch)); + + // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 + // but referred to as p0-p4 & q0-q4 in fn) + flat2 = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); + flat2 = _mm_max_epi16(work, flat2); + + if (bd == 8) + flat2 = _mm_subs_epu16(flat2, one); + else if (bd == 10) + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + + flat2 = _mm_cmpeq_epi16(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // end highbd_flat_mask5 + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + eight = _mm_set1_epi16(8); + four = _mm_set1_epi16(4); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + flat2_p0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); + flat2_q0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); + + sum_p7 = _mm_add_epi16(p7, p7); + sum_q7 = _mm_add_epi16(q7, q7); + sum_p3 = _mm_add_epi16(p3, p3); + sum_q3 = _mm_add_epi16(q3, q3); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); + flat2_p1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); + flat2_q1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); + flat_p1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + sum_p3 = _mm_add_epi16(sum_p3, p3); + sum_q3 = _mm_add_epi16(sum_q3, q3); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); + flat2_p2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); + flat_p2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); + flat2_p3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); + flat2_p4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); + flat2_p5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); + flat2_p6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // highbd_filter8 + p2 = _mm_andnot_si128(flat, p2); + // p2 remains unchanged if !(flat && mask) + flat_p2 = _mm_and_si128(flat, flat_p2); + // when (flat && mask) + p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values + + ps1 = _mm_andnot_si128(flat, ps1); + // p1 takes the value assigned to in in filter4 if !(flat && mask) + flat_p1 = _mm_and_si128(flat, flat_p1); + // when (flat && mask) + p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values + qs1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values + + ps0 = _mm_andnot_si128(flat, ps0); + // p0 takes the value assigned to in in filter4 if !(flat && mask) + flat_p0 = _mm_and_si128(flat, flat_p0); + // when (flat && mask) + p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values + qs0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values + // end highbd_filter8 + + // highbd_filter16 + p6 = _mm_andnot_si128(flat2, p6); + // p6 remains unchanged if !(flat2 && flat && mask) + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + // get values for when (flat2 && flat && mask) + p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values + q6 = _mm_andnot_si128(flat2, q6); + // q6 remains unchanged if !(flat2 && flat && mask) + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + // get values for when (flat2 && flat && mask) + q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values + _mm_store_si128((__m128i *)(s - 7 * pitch), p6); + _mm_store_si128((__m128i *)(s + 6 * pitch), q6); + + p5 = _mm_andnot_si128(flat2, p5); + // p5 remains unchanged if !(flat2 && flat && mask) + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + // get values for when (flat2 && flat && mask) + p5 = _mm_or_si128(p5, flat2_p5); + // full list of p5 values + q5 = _mm_andnot_si128(flat2, q5); + // q5 remains unchanged if !(flat2 && flat && mask) + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + // get values for when (flat2 && flat && mask) + q5 = _mm_or_si128(q5, flat2_q5); + // full list of q5 values + _mm_store_si128((__m128i *)(s - 6 * pitch), p5); + _mm_store_si128((__m128i *)(s + 5 * pitch), q5); + + p4 = _mm_andnot_si128(flat2, p4); + // p4 remains unchanged if !(flat2 && flat && mask) + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + // get values for when (flat2 && flat && mask) + p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values + q4 = _mm_andnot_si128(flat2, q4); + // q4 remains unchanged if !(flat2 && flat && mask) + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + // get values for when (flat2 && flat && mask) + q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values + _mm_store_si128((__m128i *)(s - 5 * pitch), p4); + _mm_store_si128((__m128i *)(s + 4 * pitch), q4); + + p3 = _mm_andnot_si128(flat2, p3); + // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + // get values for when (flat2 && flat && mask) + p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values + q3 = _mm_andnot_si128(flat2, q3); + // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + // get values for when (flat2 && flat && mask) + q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values + _mm_store_si128((__m128i *)(s - 4 * pitch), p3); + _mm_store_si128((__m128i *)(s + 3 * pitch), q3); + + p2 = _mm_andnot_si128(flat2, p2); + // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + // get values for when (flat2 && flat && mask) + p2 = _mm_or_si128(p2, flat2_p2); + // full list of p2 values + q2 = _mm_andnot_si128(flat2, q2); + // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + // get values for when (flat2 && flat && mask) + q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); + + p1 = _mm_andnot_si128(flat2, p1); + // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + // get values for when (flat2 && flat && mask) + p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values + q1 = _mm_andnot_si128(flat2, q1); + // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + // get values for when (flat2 && flat && mask) + q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + + p0 = _mm_andnot_si128(flat2, p0); + // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + // get values for when (flat2 && flat && mask) + p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values + q0 = _mm_andnot_si128(flat2, q0); + // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + // get values for when (flat2 && flat && mask) + q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s - 0 * pitch), q0); +} + +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd); +} + +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + const __m128i zero = _mm_setzero_si128(); + __m128i blimit_v, limit_v, thresh_v; + __m128i mask, hev, flat; + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft; + + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); + t80 = _mm_set1_epi16(0x80); + } else if (bd == 10) { + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); + t80 = _mm_set1_epi16(0x200); + } else { // bd == 12 + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); + t80 = _mm_set1_epi16(0x800); + } + + ps1 = _mm_subs_epi16(p1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + qs1 = _mm_subs_epi16(q1, t80); + + // filter_mask and hev_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); + mask = _mm_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit_v); + mask = _mm_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op2[0], workp_shft); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + + // lp filter + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = signed_char_clamp_bd_sse2(filt, bd); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi16(filt, t4); + filter2 = _mm_adds_epi16(filt, t3); + + // Filter1 >> 3 + filter1 = signed_char_clamp_bd_sse2(filter1, bd); + filter1 = _mm_srai_epi16(filter1, 3); + + // Filter2 >> 3 + filter2 = signed_char_clamp_bd_sse2(filter2, bd); + filter2 = _mm_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm_andnot_si128(hev, filt); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_adds_epi16(work_a, t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(work_a, t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s + 0 * pitch), q0); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); +} + +void vpx_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit_v, limit_v, thresh_v; + __m128i mask, hev, flat; + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + __m128i work; + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + __m128i tff80; + __m128i tffe0; + __m128i t1f; + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i t7f; + // equivalent to shifting 0x7f left by bitdepth - 8 + // and setting new bits to 1 + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); + t80 = _mm_set1_epi16(0x80); + tff80 = _mm_set1_epi16((int16_t)0xff80); + tffe0 = _mm_set1_epi16((int16_t)0xffe0); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); + } else { // bd == 12 + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); + } + + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); + mask = _mm_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit_v); + mask = _mm_cmpeq_epi16(mask, zero); + + // filter4 + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm_and_si128(filter1, t1f); // clamp the range + filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, tffe0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + work_a = _mm_cmpgt_epi16(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, tff80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + p0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); +} + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} + +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, pitch, dst, 8, 1); + + // Loop filtering + vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, pitch, 1); +} + +void vpx_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + pitch * 8; + + // Transpose back + highbd_transpose(src, 16, dst, pitch, 2); +} + +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, pitch, dst, 8, 1); + + // Loop filtering + vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, pitch, 1); +} + +void vpx_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + pitch * 8; + + // Transpose back + highbd_transpose(src, 16, dst, pitch, 2); +} + +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); + uint16_t *src[2]; + uint16_t *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + highbd_transpose(src, pitch, dst, 8, 2); + + // Loop filtering + vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + highbd_transpose(src, 8, dst, pitch, 2); +} + +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); + + // Loop filtering + vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, + pitch); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 0000000000..35ca554049 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + const int16_t *iscan = scan_order->iscan; + + init_qp(mb_plane, dequant_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + + init_qp(mb_plane, dequant_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 0000000000..adae60756d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; + __m128i zbins[2]; + __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i; +} + +void vpx_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = 0; + const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob; +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 0000000000..e483fdce73 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); + r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); + sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); + sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); + sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < num_iters; ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 0000000000..a07892d811 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,326 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep +%undef rep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 0000000000..78f8eb8bfa --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; + int i; + + for (i = 0; i < num_iters; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + +// AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) + +static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 0000000000..62ad2237ff --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,416 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%elif %4 == 1 ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if VPX_ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 + + +; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 + +; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else + mov n_rowsd, %1/2 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 + +; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 0000000000..5a3a2818de --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1021 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *ref, ptrdiff_t ref_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd eax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%if VPX_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define block_height dword heightm + %define second_str second_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define block_height dword heightm + %define second_str second_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse + %define block_height heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl second_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [refq] + mova m3, [refq + 16] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m2, [second_predq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [refq] + mova m3, [refq + ref_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [refq] + mova m3, [refq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [refq] + mova m3, [refq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [refq] + mova m3, [refq+ref_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [refq] + mova m3, [refq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [refq] + mova m3, [refq + ref_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [refq] + mova m5, [refq + 16] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [refq] + mova m5, [refq + ref_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [refq] + psrlw m0, 4 + mova m3, [refq+16] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea refq, [refq + ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [refq] + psrlw m0, 4 + mova m3, [refq+ref_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea refq, [refq + ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [refq] + mova m5, [refq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea refq, [refq+ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [refq] + mova m5, [refq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [refq] + mova m5, [refq+ref_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea refq, [refq+ref_strideq*4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [refq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [refq+16] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea refq, [refq + ref_strideq * 2] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [refq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [refq+ref_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea refq, [refq + ref_strideq * 4] +%if %2 == 1 ; avg + add second_predq, second_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 0000000000..5bee51fa0c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,315 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;unsigned int vpx_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int src_stride, +; unsigned char * ref_ptr, +; int ref_stride, +; unsigned int * SSE, +; int * Sum +;) +globalsym(vpx_highbd_calc16x16var_sse2) +sym(vpx_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vpx_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int src_stride, +; unsigned char * ref_ptr, +; int ref_stride, +; unsigned int * SSE, +; int * Sum +;) +globalsym(vpx_highbd_calc8x8var_sse2) +sym(vpx_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 0000000000..381e0ad193 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); +} + +#define HIGH_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } + +HIGH_GET_VAR(16) +HIGH_GET_VAR(8) + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) + +#undef VAR_FN + +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *ref, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt) \ + DECL(16, opt) + +DECLS(sse2) + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) + +FNS(sse2) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \ + ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt1) \ + DECL(16, opt1) \ + DECL(8, opt1) + +DECLS(sse2) +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)) \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)) + +FNS(sse2) + +#undef FNS +#undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm new file mode 100644 index 0000000000..61af6236ed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -0,0 +1,860 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + DEFINE_ARGS dst, stride, temp + psrldq m1, m0, 1 + psrldq m2, m0, 2 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + + ; store 4 lines + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + lea dstq, [dstq+strideq*2] + psrlq m3, 8 + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + psrlq m0, 56 + movd tempd, m0 + mov [dstq+strideq+3], tempb + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movu m1, [aboveq] + pslldq m0, m1, 1 + psrldq m2, m1, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + punpckhbw m0, m0 ; 7 7 + punpcklwd m0, m0 ; 7 7 7 7 + punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 + punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 + + ; store 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset + GET_GOT goffsetq + + movd m0, [leftq] ; abcd [byte] + punpcklbw m4, m0, m0 ; aabb ccdd + punpcklwd m4, m4 ; aaaa bbbb cccc dddd + psrldq m4, 12 ; dddd + punpckldq m0, m4 ; abcd dddd + psrldq m1, m0, 1 ; bcdd + psrldq m2, m0, 2 ; cddd + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d + pavgb m1, m0 ; ab, bc, cd, d [byte] + + punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d + movd [dstq ], m1 + psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d + movd [dstq+strideq], m1 + + lea dstq, [dstq+strideq*2] + psrlq m1, 16 ; cd, c3d, d, d + movd [dstq ], m1 + movd [dstq+strideq], m4 ; d, d, d, d + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x + punpcklbw m0, m1 + pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] + psrldq m0, 2 + psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] + movd m2, [leftq] + punpcklbw m2, m1 + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + pshuflw m4, m2, 0xaa + pshuflw m3, m2, 0xff + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] + pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] + psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] + movq m2, [leftq] + punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] +.loop: + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] + punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m3 + movq [dstq ], m4 + movhps [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m2, 4 + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left + pxor m1, m1 + mova m2, [aboveq-16]; + mova m0, [aboveq] ; t1 t2 ... t16 [byte] + punpckhbw m2, m1 ; [127:112] tl [word] + punpckhbw m4, m0, m1 + punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] + DEFINE_ARGS dst, stride, line, left, stride8 + mov lineq, -8 + pshufhw m2, m2, 0xff + mova m3, [leftq] ; l1 l2 ... l16 [byte] + punpckhqdq m2, m2 ; tl repeated 8 times [word] + psubw m0, m2 + psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] + punpckhbw m5, m3, m1 + punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] + lea stride8q, [strideq*8] +.loop: + pshuflw m6, m3, 0x0 + pshuflw m7, m5, 0x0 + punpcklqdq m6, m6 ; l1 repeated 8 times [word] + punpcklqdq m7, m7 ; l8 repeated 8 times [word] + paddw m1, m6, m0 + paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] + psrldq m5, 2 + packuswb m1, m6 + mova [dstq ], m1 + paddw m1, m7, m0 + paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] + psrldq m3, 2 + packuswb m1, m7 + mova [dstq+stride8q], m1 + inc lineq + lea dstq, [dstq+strideq] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + pxor m1, m1 + punpcklbw m2, m1 + pshuflw m7, m2, 0x55 + pshuflw m2, m2, 0x0 + punpcklqdq m2, m2 + punpcklqdq m7, m7 + paddw m6, m2, m3 + paddw m1, m2, m0 + packuswb m1, m6 + mova [dstq ], m1 + paddw m6, m2, m5 + paddw m1, m2, m4 + packuswb m1, m6 + mova [dstq+16 ], m1 + paddw m6, m7, m3 + paddw m1, m7, m0 + packuswb m1, m6 + mova [dstq+strideq ], m1 + paddw m6, m7, m5 + paddw m1, m7, m4 + packuswb m1, m6 + mova [dstq+strideq+16], m1 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm new file mode 100644 index 0000000000..5e0139fa8d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_ssse3.asm @@ -0,0 +1,871 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +INIT_XMM ssse3 +cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, dst8, line + lea stride3q, [strideq*3] + lea dst8q, [dstq+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m0, m3 + + ; first 4 lines and first half of 3rd 4 lines + mov lined, 2 +.loop: + mova [dstq ], m0 + movhps [dst8q ], m0 + pshufb m0, m1 + mova [dstq +strideq ], m0 + movhps [dst8q+strideq ], m0 + pshufb m0, m1 + mova [dstq +strideq*2 ], m0 + movhps [dst8q+strideq*2 ], m0 + pshufb m0, m1 + mova [dstq +stride3q ], m0 + movhps [dst8q+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + dec lined + jnz .loop + + ; bottom-right 8x8 block + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + lea dstq, [dstq+strideq*4] + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m4, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, dst16, line + lea stride3q, [strideq*3] + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m4 + pxor m2, m4 + palignr m5, m4, m0, 1 + palignr m6, m4, m0, 2 + pshufb m4, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m4, m3 + pavgb m3, m0, m6 + pxor m0, m6 + pand m0, [GLOBAL(pb_1)] + psubb m3, m0 + pavgb m5, m3 + + ; write 4x4 lines (and the first half of the second 4x4 lines) + mov lined, 4 +.loop: + mova [dstq ], m5 + mova [dstq +16], m4 + mova [dst16q ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +strideq ], m3 + mova [dstq +strideq +16], m4 + mova [dst16q+strideq ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + mova [dstq +strideq*2 ], m5 + mova [dstq +strideq*2+16], m4 + mova [dst16q+strideq*2 ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +stride3q ], m3 + mova [dstq +stride3q +16], m4 + mova [dst16q+stride3q ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + lea dstq, [dstq +strideq*4] + lea dst16q, [dst16q+strideq*4] + dec lined + jnz .loop + + ; write second half of second 4x4 lines + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + + RESTORE_GOT + RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. + movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset + GET_GOT goffsetq + movq m3, [leftq] ; abcdefgh [byte] + lea stride3q, [strideq*3] + + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 + pavgb m0, m2 + punpcklbw m0, m3 ; interleaved output + + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh + psrldq m0, 2 + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m0, [leftq] ; abcdefghijklmnop [byte] + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte] + + punpckhbw m4, m1, m3 ; interleaved input + punpcklbw m1, m3 ; interleaved output + mova [dstq ], m1 + palignr m3, m4, m1, 2 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 4 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 6 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + palignr m3, m4, m1, 8 + mova [dstq ], m3 + palignr m3, m4, m1, 10 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 12 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 14 + mova [dstq+stride3q ], m3 + DEFINE_ARGS dst, stride, stride3, line + mov lined, 2 + mova m0, [GLOBAL(sh_b23456789abcdefff)] +.loop: + lea dstq, [dstq+strideq*4] + mova [dstq ], m4 + pshufb m4, m0 + mova [dstq+strideq ], m4 + pshufb m4, m0 + mova [dstq+strideq*2], m4 + pshufb m4, m0 + mova [dstq+stride3q ], m4 + pshufb m4, m0 + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m1, [leftq] ; 0-15 [byte] + mova m2, [leftq+16] ; 16-31 [byte] + pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] + pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 + palignr m6, m2, m1, 1 + palignr m5, m2, m1, 2 + pavgb m2, m4 ; high 16px even lines + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 + pavgb m1, m6 ; low 16px even lines + + punpckhbw m6, m1, m0 ; interleaved output 2 + punpcklbw m1, m0 ; interleaved output 1 + + punpckhbw m7, m2, m3 ; interleaved output 4 + punpcklbw m2, m3 ; interleaved output 3 + + ; output 1st 8 lines (and half of 2nd 8 lines) + DEFINE_ARGS dst, stride, stride3, dst8 + lea dst8q, [dstq+strideq*8] + mova [dstq ], m1 + mova [dstq +16], m6 + mova [dst8q ], m6 + palignr m0, m6, m1, 2 + palignr m4, m2, m6, 2 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 4 + palignr m4, m2, m6, 4 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 6 + palignr m4, m2, m6, 6 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m0, m6, m1, 8 + palignr m4, m2, m6, 8 + mova [dstq ], m0 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m0, m6, m1, 10 + palignr m4, m2, m6, 10 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 12 + palignr m4, m2, m6, 12 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 14 + palignr m4, m2, m6, 14 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines + mova [dstq +16], m2 + mova [dst8q ], m2 + palignr m4, m7, m2, 2 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 4 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 6 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m4, m7, m2, 8 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m4, m7, m2, 10 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 12 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 14 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 3rd 8 lines and half of 4th 8 lines + mova m0, [GLOBAL(sh_b23456789abcdefff)] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + + ; output last half of 4th 8 lines + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + lea dstq, [dstq+strideq*4] + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + + ; done! + RESTORE_GOT + RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 0000000000..752435d240 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. +static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = _mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c new file mode 100644 index 0000000000..f42b3df849 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -0,0 +1,1235 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void transpose_16bit_4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i eight = _mm_set1_epi16(8); + __m128i in[2]; + + // Rows + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); + idct4_sse2(in); + + // Columns + idct4_sse2(in); + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + recon_and_store4x4_sse2(in, dest, stride); +} + +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + int a; + __m128i dc_value, d[2]; + + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +void idct4_sse2(__m128i *const in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + __m128i u[2]; + + transpose_16bit_4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +void iadst4_sse2(__m128i *const in) { + const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9); + const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9); + const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9); + const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_12_n3 = + pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9); + __m128i u[4], v[5]; + + // 00 01 20 21 02 03 22 23 + // 10 11 30 31 12 13 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]); + + // 00 01 10 11 20 21 30 31 + // 02 03 12 13 22 23 32 33 + in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1 + v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3 + v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1 + v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3 + v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1 + in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2 + in[1] = _mm_srli_epi32(in[1], 16); + in[0] = _mm_add_epi16(in[0], in[1]); + in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[3]); + u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[3] = _mm_add_epi32(u[3], v[4]); + + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} + +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[8]; + int i; + + // Load input data. + load_buffer_8x8(input, in); + + // 2-D + for (i = 0; i < 2; i++) { + vpx_idct8_sse2(in); + } + + write_buffer_8x8(in, dest, stride); +} + +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; + + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); +} + +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); +} + +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); + + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); +} + +void vpx_idct8_sse2(__m128i *const in) { + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + transpose_16bit_8x8(in, in); + + // 4-stage 1D idct8x8 + idct8(in, in); +} + +void iadst8_sse2(__m128i *const in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i kZero = _mm_setzero_si128(); + __m128i s[8], u[16], v[8], w[16]; + + // transpose + transpose_16bit_8x8(in, in); + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s[0] = _mm_unpacklo_epi16(in[7], in[0]); + s[1] = _mm_unpackhi_epi16(in[7], in[0]); + s[2] = _mm_unpacklo_epi16(in[5], in[2]); + s[3] = _mm_unpackhi_epi16(in[5], in[2]); + s[4] = _mm_unpacklo_epi16(in[3], in[4]); + s[5] = _mm_unpackhi_epi16(in[3], in[4]); + s[6] = _mm_unpacklo_epi16(in[1], in[6]); + s[7] = _mm_unpackhi_epi16(in[1], in[6]); + + u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30); + u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30); + u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02); + u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02); + u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22); + u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22); + u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10); + u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10); + u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14); + u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14); + u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18); + u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18); + u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06); + u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06); + u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26); + u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26); + + // addition + w[0] = _mm_add_epi32(u[0], u[8]); + w[1] = _mm_add_epi32(u[1], u[9]); + w[2] = _mm_add_epi32(u[2], u[10]); + w[3] = _mm_add_epi32(u[3], u[11]); + w[4] = _mm_add_epi32(u[4], u[12]); + w[5] = _mm_add_epi32(u[5], u[13]); + w[6] = _mm_add_epi32(u[6], u[14]); + w[7] = _mm_add_epi32(u[7], u[15]); + w[8] = _mm_sub_epi32(u[0], u[8]); + w[9] = _mm_sub_epi32(u[1], u[9]); + w[10] = _mm_sub_epi32(u[2], u[10]); + w[11] = _mm_sub_epi32(u[3], u[11]); + w[12] = _mm_sub_epi32(u[4], u[12]); + w[13] = _mm_sub_epi32(u[5], u[13]); + w[14] = _mm_sub_epi32(u[6], u[14]); + w[15] = _mm_sub_epi32(u[7], u[15]); + + // shift and rounding + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); + u[8] = dct_const_round_shift_sse2(w[8]); + u[9] = dct_const_round_shift_sse2(w[9]); + u[10] = dct_const_round_shift_sse2(w[10]); + u[11] = dct_const_round_shift_sse2(w[11]); + u[12] = dct_const_round_shift_sse2(w[12]); + u[13] = dct_const_round_shift_sse2(w[13]); + u[14] = dct_const_round_shift_sse2(w[14]); + u[15] = dct_const_round_shift_sse2(w[15]); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + in[2] = _mm_packs_epi32(u[4], u[5]); + in[3] = _mm_packs_epi32(u[6], u[7]); + in[4] = _mm_packs_epi32(u[8], u[9]); + in[5] = _mm_packs_epi32(u[10], u[11]); + in[6] = _mm_packs_epi32(u[12], u[13]); + in[7] = _mm_packs_epi32(u[14], u[15]); + + // stage 2 + s[0] = _mm_add_epi16(in[0], in[2]); + s[1] = _mm_add_epi16(in[1], in[3]); + s[2] = _mm_sub_epi16(in[0], in[2]); + s[3] = _mm_sub_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[4], in[5]); + u[1] = _mm_unpackhi_epi16(in[4], in[5]); + u[2] = _mm_unpacklo_epi16(in[6], in[7]); + u[3] = _mm_unpackhi_epi16(in[6], in[7]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + + w[0] = _mm_add_epi32(v[0], v[4]); + w[1] = _mm_add_epi32(v[1], v[5]); + w[2] = _mm_add_epi32(v[2], v[6]); + w[3] = _mm_add_epi32(v[3], v[7]); + w[4] = _mm_sub_epi32(v[0], v[4]); + w[5] = _mm_sub_epi32(v[1], v[5]); + w[6] = _mm_sub_epi32(v[2], v[6]); + w[7] = _mm_sub_epi32(v[3], v[7]); + + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); + + // back to 16-bit intergers + s[4] = _mm_packs_epi32(u[0], u[1]); + s[5] = _mm_packs_epi32(u[2], u[3]); + s[6] = _mm_packs_epi32(u[4], u[5]); + s[7] = _mm_packs_epi32(u[6], u[7]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + + s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); + s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[4]); + in[2] = s[6]; + in[3] = _mm_sub_epi16(kZero, s[2]); + in[4] = s[3]; + in[5] = _mm_sub_epi16(kZero, s[7]); + in[6] = s[5]; + in[7] = _mm_sub_epi16(kZero, s[1]); +} + +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); +} + +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i l[16], r[16], out[16], *in; + int i; + + in = l; + for (i = 0; i < 2; i++) { + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; + + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; + int i; + + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); + + for (i = 0; i < 16; ++i) { + recon_and_store_16(dest, dc_value); + dest += stride; + } +} + +void vpx_iadst16_8col_sse2(__m128i *const in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i kZero = _mm_setzero_si128(); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); + u[16] = dct_const_round_shift_sse2(u[16]); + u[17] = dct_const_round_shift_sse2(u[17]); + u[18] = dct_const_round_shift_sse2(u[18]); + u[19] = dct_const_round_shift_sse2(u[19]); + u[20] = dct_const_round_shift_sse2(u[20]); + u[21] = dct_const_round_shift_sse2(u[21]); + u[22] = dct_const_round_shift_sse2(u[22]); + u[23] = dct_const_round_shift_sse2(u[23]); + u[24] = dct_const_round_shift_sse2(u[24]); + u[25] = dct_const_round_shift_sse2(u[25]); + u[26] = dct_const_round_shift_sse2(u[26]); + u[27] = dct_const_round_shift_sse2(u[27]); + u[28] = dct_const_round_shift_sse2(u[28]); + u[29] = dct_const_round_shift_sse2(u[29]); + u[30] = dct_const_round_shift_sse2(u[30]); + u[31] = dct_const_round_shift_sse2(u[31]); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + v[0] = dct_const_round_shift_sse2(u[0]); + v[1] = dct_const_round_shift_sse2(u[1]); + v[2] = dct_const_round_shift_sse2(u[2]); + v[3] = dct_const_round_shift_sse2(u[3]); + v[4] = dct_const_round_shift_sse2(u[4]); + v[5] = dct_const_round_shift_sse2(u[5]); + v[6] = dct_const_round_shift_sse2(u[6]); + v[7] = dct_const_round_shift_sse2(u[7]); + v[8] = dct_const_round_shift_sse2(u[8]); + v[9] = dct_const_round_shift_sse2(u[9]); + v[10] = dct_const_round_shift_sse2(u[10]); + v[11] = dct_const_round_shift_sse2(u[11]); + v[12] = dct_const_round_shift_sse2(u[12]); + v[13] = dct_const_round_shift_sse2(u[13]); + v[14] = dct_const_round_shift_sse2(u[14]); + v[15] = dct_const_round_shift_sse2(u[15]); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); +} + +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; + + // stage 3 + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; + + // stage 1 + butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); + + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); + + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } + + dest += 8; + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[4][32], io[32]; + int i; + + // rows + for (i = 0; i < 4; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} + +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); + dest += 8; + } +} + +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); + + for (j = 0; j < 32; ++j) { + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h new file mode 100644 index 0000000000..b4bbd186d2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); +} + +static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { + const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); + return _mm_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in, + const __m128i cospi) { + const __m128i t = _mm_madd_epi16(in, cospi); + return dct_const_round_shift_sse2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, + const __m128i in1, + const __m128i x) { + const __m128i t0 = idct_madd_round_shift_sse2(in0, x); + const __m128i t1 = idct_madd_round_shift_sse2(in1, x); + return _mm_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. +static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); +} + +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); +} + +// Functions to allow 8 bit optimisations to be used when profile 0 is used with +// highbitdepth enabled +static INLINE __m128i load_input_data4(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); +#else + return _mm_loadl_epi64((const __m128i *)data); +#endif +} + +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); +#else + return _mm_load_si128((const __m128i *)data); +#endif +} + +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); +} + +static INLINE void round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); + + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} + +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } + + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); +} + +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. + output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 0000000000..6e99469b63 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} + +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); +} + +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; + + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); + + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); + + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } + + dest += 8; + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; + int i; + + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 0000000000..e9f0f69033 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 0000000000..bcf1a6ef98 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,103 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride + LOAD_TRAN_LOW 0, inputq, 0 + LOAD_TRAN_LOW 1, inputq, 8 + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c new file mode 100644 index 0000000000..a58fb65539 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -0,0 +1,913 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include /* AVX2 */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8((int8_t)0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); + } +} + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, + p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); + + p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch))); + p256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch))); + p256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch))); + p256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch))); + p256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch))); + q256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch))); + q256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch))); + q256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch))); + q256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch))); + q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, + flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch))); + q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch))); + q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch))); + q256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16( + four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), + 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), + 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), + 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), + 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), + 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), + 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c new file mode 100644 index 0000000000..6ea34cdd16 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c @@ -0,0 +1,1779 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +// filter_mask and hev_mask +#define FILTER_HEV_MASK \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh_v); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit_v); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) + +#define FILTER4 \ + do { \ + const __m128i t3t4 = \ + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ + } while (0) + +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 4 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 0 * pitch))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1 +} + +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4))); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4))); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4))); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4))); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1p0 = _mm_unpacklo_epi16(q1q0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3p2 = _mm_unpacklo_epi32(p1p0, x0); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1p0 = _mm_unpackhi_epi32(p1p0, x0); + p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high + p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1q0 = _mm_unpackhi_epi16(q1q0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = _mm_unpackhi_epi16(x2, x3); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3q2 = _mm_unpackhi_epi32(q1q0, x2); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1q0 = _mm_unpacklo_epi32(q1q0, x2); + + q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); + q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); + + storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); + + storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); +} + +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); + __m128i mask, hev, flat, flat2; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8((int8_t)0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + // Filter1 >> 3 + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); + flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); + work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); + } +} + +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} + +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); + __m128i mask, hev, flat, flat2; + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); + const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); + + const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 + f_lo = + _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 + f_hi = + _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); + + p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); + q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + } +} + +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 2 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 0 * pitch))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0); + _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0); + _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1); + _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2); + } +} + +void vpx_lpf_horizontal_8_dual_sse2( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); + } +} + +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit0, + const unsigned char *limit0, + const unsigned char *thresh0, + const unsigned char *blimit1, + const unsigned char *limit1, + const unsigned char *thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); + const __m128i zero = _mm_setzero_si128(); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + } +} + +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + // 2-way interleave w/hoisting of unpacks + x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 + + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 + + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 + + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 + + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 + + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 + + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 + + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 + + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 + x14 = _mm_unpacklo_epi32(x12, x13); // 15 + x15 = _mm_unpackhi_epi32(x12, x13); // 16 + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + pitch * 8; + + // Transpose back + transpose(src, 16, dst, pitch, 2); +} + +void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); + unsigned char *src[1]; + unsigned char *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, pitch, dst, 8, 1); + + // Loop filtering + vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, pitch, 1); +} + +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + pitch * 8; + + // Transpose back + transpose(src, 16, dst, pitch, 2); +} + +void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, pitch, dst, 8, 2); + + // Loop filtering + vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, pitch, 2); +} + +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); + + // Loop filtering + vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 0000000000..031f361a41 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_VPX_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 +#include + +#include "./vpx_config.h" + +static INLINE void storeu_int32(void *dst, int32_t v) { + memcpy(dst, &v, sizeof(v)); +} + +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE __m128i load_unaligned_u32(const void *a) { + int val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE void store_unaligned_u32(void *const a, const __m128i v) { + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c new file mode 100644 index 0000000000..119fa7cd1a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +extern const int16_t vpx_rv[]; + +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, + int cols, int flimit) { + int col; + const __m128i zero = _mm_setzero_si128(); + const __m128i f = _mm_set1_epi32(flimit); + DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + int row, i; + __m128i s = _mm_loadl_epi64((__m128i *)dst); + __m128i sum, sumsq_0, sumsq_1; + __m128i tmp_0, tmp_1; + __m128i below_context = _mm_setzero_si128(); + + s = _mm_unpacklo_epi8(s, zero); + + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)above_context + i, s); + } + + // sum *= 9 + sum = _mm_slli_epi16(s, 3); + sum = _mm_add_epi16(s, sum); + + // sum^2 * 9 == (sum * 9) * sum + tmp_0 = _mm_mullo_epi16(sum, s); + tmp_1 = _mm_mulhi_epi16(sum, s); + + sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1); + sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1); + + // Prime sum/sumsq + for (i = 1; i <= 6; ++i) { + __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch)); + a = _mm_unpacklo_epi8(a, zero); + sum = _mm_add_epi16(sum, a); + a = _mm_mullo_epi16(a, a); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero)); + } + + for (row = 0; row < rows + 8; row++) { + const __m128i above = + _mm_load_si128((__m128i *)above_context + (row & 7)); + __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch)); + __m128i above_sq, below_sq; + __m128i mask_0, mask_1; + __m128i multmp_0, multmp_1; + __m128i rv; + __m128i out; + + this_row = _mm_unpacklo_epi8(this_row, zero); + + if (row + 7 < rows) { + // Instead of copying the end context we just stop loading when we get + // to the last one. + below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch)); + below_context = _mm_unpacklo_epi8(below_context, zero); + } + + sum = _mm_sub_epi16(sum, above); + sum = _mm_add_epi16(sum, below_context); + + // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero + // extend. Unfortunately we can't do below_sq - above_sq in 16 bits + // because x86 does not have unpack with sign extension. + above_sq = _mm_mullo_epi16(above, above); + sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero)); + sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero)); + + below_sq = _mm_mullo_epi16(below_context, below_context); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero)); + + // sumsq * 16 - sumsq == sumsq * 15 + mask_0 = _mm_slli_epi32(sumsq_0, 4); + mask_0 = _mm_sub_epi32(mask_0, sumsq_0); + mask_1 = _mm_slli_epi32(sumsq_1, 4); + mask_1 = _mm_sub_epi32(mask_1, sumsq_1); + + multmp_0 = _mm_mullo_epi16(sum, sum); + multmp_1 = _mm_mulhi_epi16(sum, sum); + + mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1)); + mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1)); + + // mask - f gives a negative value when mask < f + mask_0 = _mm_sub_epi32(mask_0, f); + mask_1 = _mm_sub_epi32(mask_1, f); + + // Shift the sign bit down to create a mask + mask_0 = _mm_srai_epi32(mask_0, 31); + mask_1 = _mm_srai_epi32(mask_1, 31); + + mask_0 = _mm_packs_epi32(mask_0, mask_1); + + rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127))); + + mask_1 = _mm_add_epi16(rv, sum); + mask_1 = _mm_add_epi16(mask_1, this_row); + mask_1 = _mm_srai_epi16(mask_1, 4); + + mask_1 = _mm_and_si128(mask_0, mask_1); + mask_0 = _mm_andnot_si128(mask_0, this_row); + out = _mm_or_si128(mask_1, mask_0); + + _mm_storel_epi64((__m128i *)(dst + row * pitch), + _mm_packus_epi16(out, zero)); + + _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row); + } + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 0000000000..5ff5abc110 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#if defined(_MSC_VER) +#include +#endif +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + *eob_ptr = 0; + + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 0000000000..d4872f6bca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, + __m256i *shift, int log_scale) { + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } + + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + const int16_t *iscan = scan_order->iscan; + + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); + // Do DC and first 15 AC. + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + const int16_t *iscan = scan_order->iscan; + + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c new file mode 100644 index 0000000000..64838eaa7d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" + +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + // Setup global values. + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h new file mode 100644 index 0000000000..82c755a0cf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); +} + +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// Scan 16 values for eob reference in scan. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const int16_t *scan, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); + __m128i eob0, eob1; + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..2c6d851a16 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 0000000000..e8d2a05771 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + _mm_store_si128((__m128i *)(dqcoeff), + _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c new file mode 100644 index 0000000000..cf7111983b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// Note with sums[4] some versions of Visual Studio may fail due to parameter +// alignment, though the functions should be equivalent: +// error C2719: 'sums': formal parameter with requested alignment of 32 won't be +// aligned +static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { + int i; + const uint8_t *refs[4]; + __m256i sums[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // sum of the absolute differences between every ref[] to src + r[0] = _mm256_sad_epu8(r[0], s); + r[1] = _mm256_sad_epu8(r[1], s); + r[2] = _mm256_sad_epu8(r[2], s); + r[3] = _mm256_sad_epu8(r[3], s); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r[0]); + sums[1] = _mm256_add_epi32(sums[1], r[1]); + sums[2] = _mm256_add_epi32(sums[2], r[2]); + sums[3] = _mm256_add_epi32(sums[3], r[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } + + calc_final_4(sums, sad_array); +} + +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { + __m256i sums[4]; + int i; + const uint8_t *refs[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + __m256i r_lo[4], r_hi[4]; + // load 64 bytes from src and all ref[] + const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); + const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); + r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); + r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32)); + r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32)); + r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); + + // sum of the absolute differences between every ref[] to src + r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); + r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); + r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); + r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo); + r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi); + r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi); + r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); + r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); + sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); + sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); + sums[3] = _mm256_add_epi32(sums[3], r_lo[3]); + sums[0] = _mm256_add_epi32(sums[0], r_hi[0]); + sums[1] = _mm256_add_epi32(sums[1], r_hi[1]); + sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); + sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } + + calc_final_4(sums, sad_array); +} + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 0000000000..cfd23fedd9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref_array[0]; + ref1 = ref_array[1]; + ref2 = ref_array[2]; + ref3 = ref_array[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load src and all ref[] + src_reg = _mm512_loadu_si512((const __m512i *)src_ptr); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref[] to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref[] + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src_ptr += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref[] the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref[] + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(sad_array), sum128); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm new file mode 100644 index 0000000000..ed4ea3ef9b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,278 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + movd m1, [srcq +%4] + movd m2, [ref1q+%5] + punpckldq m0, m1 + punpckldq m6, m2 + movd m1, [ref2q+%5] + movd m2, [ref3q+%5] + movd m3, [ref4q+%5] + punpckldq m4, m1 + punpckldq m7, m2 + punpckldq m5, m3 + movlhps m0, m0 + movlhps m6, m4 + movlhps m7, m5 + psadbw m6, m0 + psadbw m7, m0 +%else + movd m1, [ref1q+%3] + movd m5, [ref1q+%5] + movd m2, [ref2q+%3] + movd m4, [ref2q+%5] + punpckldq m1, m5 + punpckldq m2, m4 + movd m3, [ref3q+%3] + movd m5, [ref3q+%5] + punpckldq m3, m5 + movd m4, [ref4q+%3] + movd m5, [ref4q+%5] + punpckldq m4, m5 + movd m5, [srcq +%4] + punpckldq m0, m5 + movlhps m0, m0 + movlhps m1, m2 + movlhps m3, m4 + psadbw m1, m0 + psadbw m3, m0 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep +%undef num_rep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c new file mode 100644 index 0000000000..e00494d766 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +#define FSAD64_H(h) \ + unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ + } + +#define FSAD32_H(h) \ + unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ + } + +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) + +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) + +FSAD64 +FSAD32 + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H + +#define FSADAVG64_H(h) \ + unsigned int vpx_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + } + +#define FSADAVG32_H(h) \ + unsigned int vpx_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) + +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) + +FSADAVG64 +FSADAVG32 + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm new file mode 100644 index 0000000000..627e463bf8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm @@ -0,0 +1,332 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows +%macro SAD_FN 4 +%if %4 == 0 ; normal sad +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if VPX_ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 + +; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else + mov n_rowsd, %1/2 +%endif + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 + +; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 + +; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 + +; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 + movlhps m1, m3 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + lea second_predq, [second_predq+mmsize*1] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m3, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m3 + movlhps m2, m4 + psadbw m1, m2 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 0000000000..dfe45b6115 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 0000000000..4a7585c57e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm new file mode 100644 index 0000000000..41ffbb07e6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -0,0 +1,219 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro + +SECTION .text + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +globalsym(vpx_ssim_parms_16x16_sse2) +sym(vpx_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +globalsym(vpx_ssim_parms_8x8_sse2) +sym(vpx_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm new file mode 100644 index 0000000000..d1d8d3460e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm @@ -0,0 +1,1467 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *ref, ptrdiff_t ref_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if VPX_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define block_height dword heightm + %define second_str second_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, second_pred, second_stride, \ + height, sse + %define block_height dword heightm + %define second_str second_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl second_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [refq] +%if %2 == 1 ; avg + pavgb m0, [second_predq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [refq] + movx m3, [refq+ref_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [second_predq] +%else + movh m2, [second_predq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [refq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [second_predq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [refq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [refq+ref_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [second_predq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [second_predq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [refq] + pavgb m0, m2 + movx m3, [refq+ref_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [refq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [refq+ref_strideq] +%if cpuflag(ssse3) + movx m1, [refq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [refq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [refq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [second_predq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [refq] + movx m3, [refq+ref_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [second_predq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] + movx m1, [refq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [refq+ref_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [refq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [refq] + pavgb m0, m2 + movx m3, [refq+ref_strideq] +%if %1 > 4 + pavgb m0, [second_predq] +%else + movh m2, [second_predq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [refq] + movx m3, [refq+ref_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [refq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [refq+ref_strideq] +%if cpuflag(ssse3) + movx m1, [refq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [refq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [refq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [refq+ref_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [refq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [refq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [refq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [refq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [second_predq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [refq] + movx m3, [refq+ref_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [refq] + paddw m4, m3 + movx m3, [refq+ref_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if VPX_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [refq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [refq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add refq, ref_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [refq+ref_strideq] + movx m1, [refq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [refq+ref_strideq] + paddw m2, m1 + movx m1, [refq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [second_predq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [second_predq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea refq, [refq+ref_strideq*2] +%endif +%if %2 == 1 ; avg + add second_predq, second_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 0000000000..4849581ed4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, + const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static VPX_FORCE_INLINE void subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(s); + const __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 32: + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 64: + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + default: + vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm new file mode 100644 index 0000000000..e3055ab292 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vpx_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + emms + RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c new file mode 100644 index 0000000000..df6514b2c4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/mem_sse2.h" + +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { + // Over 75% of all calls are with size == 4. + if (size == 4) { + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); + } else { + // Generic case + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); + __m128i v_acc_q = _mm_setzero_si128(); + + assert(size % 8 == 0); + + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if VPX_ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 0000000000..b4f1190d74 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 16 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h new file mode 100644 index 0000000000..de5ce43b00 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include +#include "vpx/vpx_integer.h" + +#define pair_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + +#define dual_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ + (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 0000000000..8305b9f20f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // AVX2 + +#include "./vpx_dsp_rtcd.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse, + __m128i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse), + _mm256_extractf128_si256(vsse, 1)); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); +} + +static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse, + __m256i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. +static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. +static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + second_pred += second_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + second_pred, second_stride, 1, height, sse); +} + +typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); +} + +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + int i = 0; + + for (i = 0; i < 2; i++) { + __m256i vsum16; + variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); + } + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_sub_pixel_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + unsigned int sse1; + const int se1 = sub_pixel_variance32xh_avx2( + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); + unsigned int sse2; + const int se2 = + sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, + ref_ptr + 32, ref_stride, 64, &sse2); + const int se = se1 + se2; + *sse = sse1 + sse2; + return *sse - (uint32_t)(((int64_t)se * se) >> 12); +} + +unsigned int vpx_sub_pixel_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + const int se = sub_pixel_variance32xh_avx2( + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); + return *sse - (uint32_t)(((int64_t)se * se) >> 10); +} + +unsigned int vpx_sub_pixel_avg_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { + unsigned int sse1; + const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 64, 64, &sse1); + unsigned int sse2; + const int se2 = sub_pixel_avg_variance32xh_avx2( + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, + second_pred + 32, 64, 64, &sse2); + const int se = se1 + se2; + + *sse = sse1 + sse2; + + return *sse - (uint32_t)(((int64_t)se * se) >> 12); +} + +unsigned int vpx_sub_pixel_avg_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { + // Process 32 elements in parallel. + const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 32, 32, sse); + return *sse - (uint32_t)(((int64_t)se * se) >> 10); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c new file mode 100644 index 0000000000..d6eb12da1a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return (unsigned int)_mm_cvtsi128_si32(val); +} + +unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src_ptr += 8; + } + + return add32x4_sse2(vsum); +} + +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); + const __m128i p01 = _mm_unpacklo_epi32(p0, p1); + return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); +} + +static INLINE void variance_kernel_sse2(const __m128i src_ptr, + const __m128i ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} + +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = (int)add32x4_sse2(vsum); +} + +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE int sum_final_sse2(const __m128i sum) { + const __m128i t = sum_to_32bit_sse2(sum); + return (int)add32x4_sse2(t); +} + +static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 256); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src_ptr, src_stride); + const __m128i r = load4x2_sse2(ref_ptr, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } +} + +static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + int i; + + assert(h <= 128); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; i++) { + const __m128i s = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); + const __m128i r = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, + const uint8_t *const ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 64); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 16); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); + variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, sum); +} + +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 4); +} + +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); +} + +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse, vsum; + int sum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + *sse = add32x4_sse2(vsse); + sum = sum_final_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + int sum; + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + int sum; + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + int sum; + int i = 0; + + for (i = 0; i < 4; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) +#define DECLS(opt1, opt2) \ + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse2); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ + &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) + +FNS(sse2, sse2) +FNS(ssse3, ssse3) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ + const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) +#define DECLS(opt1, opt2) \ + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse2); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \ + second_pred, w, h, &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) + +FNS(sse2, sse) +FNS(ssse3, ssse3) + +#undef FNS +#undef FN diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm new file mode 100644 index 0000000000..3f444e2e6a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -0,0 +1,226 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + f, fxo, fxs, fyo, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + f, fxo, fxs, fyo, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl src_strideq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 +%ifidn %2, highbd + cmp r4d, 64 + je .w64 + + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + dec r4d + jnz .loop128 + RET +%endif + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +%if CONFIG_VP9_HIGHBITDEPTH +convolve_fn copy, highbd +convolve_fn avg, highbd +%endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm new file mode 100644 index 0000000000..fc301fb39e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -0,0 +1,964 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bd + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bd + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +SECTION .text + +;void vpx_highbd_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d4_v8_sse2) +sym(vpx_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_highbd_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d8_v8_sse2) +sym(vpx_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_highbd_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d16_v8_sse2) +sym(vpx_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2) +sym(vpx_highbd_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2) +sym(vpx_highbd_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2) +sym(vpx_highbd_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 1, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_highbd_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d4_h8_sse2) +sym(vpx_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_highbd_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d8_h8_sse2) +sym(vpx_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_highbd_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_highbd_filter_block1d16_h8_sse2) +sym(vpx_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2) +sym(vpx_highbd_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2) +sym(vpx_highbd_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2) +sym(vpx_highbd_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 1, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000000..bd51c75bcb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,496 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bd + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if VPX_ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bd + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +SECTION .text + +globalsym(vpx_highbd_filter_block1d4_v2_sse2) +sym(vpx_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_v2_sse2) +sym(vpx_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_v2_sse2) +sym(vpx_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2) +sym(vpx_highbd_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2) +sym(vpx_highbd_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2) +sym(vpx_highbd_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +globalsym(vpx_highbd_filter_block1d4_h2_sse2) +sym(vpx_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_h2_sse2) +sym(vpx_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_h2_sse2) +sym(vpx_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2) +sym(vpx_highbd_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2) +sym(vpx_highbd_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2) +sym(vpx_highbd_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c new file mode 100644 index 0000000000..21a35ae3c3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -0,0 +1,1161 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_ports/mem.h" + +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + +static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first, dst_second; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + + // Do again to get the second half of dst + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 14 12 10 8 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 15 13 11 9 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the second half of the dst + dst_second = mm_zip_epi32_sse2(&even, &odd); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* The macro used to generate functions shifts the src_ptr up by 3 rows already + * */ + +static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1, + src_reg_m10_hi_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128()); + src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128()); + src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Now repeat everything again for the second half + // Partial output for second half + res_reg_m10_hi = mm_madd_packs_epi16_sse2( + &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); + + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); + + src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); + src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); + + src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); + src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2, + &kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_m10_hi_1 = src_reg_12_hi_1; + src_reg_m10_hi_2 = src_reg_12_hi_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_01_hi_1 = src_reg_23_hi_1; + src_reg_01_hi_2 = src_reg_23_hi_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128()); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i tmp_0, tmp_1; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Then we call multiply and add to get partial results + // s[1]k[3]+s[0]k[2] s[0]k[3]s[-1]k[2] + // s[3]k[5]+s[2]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Convert to 16-bit words + src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128()); + src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128()); + src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128()); + src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128()); + + // Shuffle into the right format + tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1); + tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3); + + // Partial output + tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23); + tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45); + + // Output + dst_first = _mm_add_epi32(tmp_0, tmp_1); + dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128()); + + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1; + __m128i src_reg_01_lo_1; + __m128i src_reg_12_lo_1; + __m128i src_reg_23_lo_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, &kernel_reg_23); + + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); + + // Save only half of the register (8 words) + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_1 = src_reg_3; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 2); + src_reg_shift_2 = _mm_srli_si128(src_reg, 4); + src_reg_shift_3 = _mm_srli_si128(src_reg, 6); + + // Output 2 0 + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 3 1 + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + res_reg = _mm_unpacklo_epi32(even, odd); + res_reg = mm_round_epi32_sse2(&res_reg, ®_round, CONV8_ROUNDING_BITS); + res_reg = _mm_packs_epi32(res_reg, reg_zero); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + _mm_storel_epi64((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23); + res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45); + + // Add to get results + res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = + mm_round_epi32_sse2(&res_reg_m1012, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123 = + mm_round_epi32_sse2(&res_reg_0123, ®_round, CONV8_ROUNDING_BITS); + + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + + __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2, + src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + __m128i tmp_0, tmp_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will put first half in the first half of the reg, and second half in + // second half + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // Output 6 4 2 0 + tmp_0 = _mm_srli_si128(src_reg, 4); + tmp_1 = _mm_srli_si128(src_reg_next, 2); + src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1); + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + tmp_0 = _mm_srli_si128(src_reg, 2); + tmp_1 = src_reg_next; + src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + tmp_0 = _mm_srli_si128(src_reg, 6); + tmp_1 = _mm_srli_si128(src_reg_next, 4); + src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + even = mm_round_epi32_sse2(&even, ®_round, CONV8_ROUNDING_BITS); + odd = mm_round_epi32_sse2(&odd, ®_round, CONV8_ROUNDING_BITS); + res_reg = mm_zip_epi32_sse2(&even, &odd); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + + _mm_store_si128((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi; + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + __m128i res_reg_m1012_hi, res_reg_0123_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3); + + // Partial output for first half + res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23); + res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = + mm_round_epi32_sse2(&res_reg_m1012_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_lo = + mm_round_epi32_sse2(&res_reg_0123_lo, ®_round, CONV8_ROUNDING_BITS); + + // Partial output for first half + res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23); + res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45); + + // Add to get results + res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_hi = + mm_round_epi32_sse2(&res_reg_m1012_hi, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_hi = + mm_round_epi32_sse2(&res_reg_0123_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine the two halfs + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +static void vpx_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 + +// From vpx_subpixel_8t_sse2.asm. +filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 +#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 +#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 +#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 +#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 +#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm. +filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , + sse2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1) + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2, 0) +FUN_CONV_2D(avg_, sse2, 1) + +#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , sse2, 0) +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1) + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2, 0) +HIGH_FUN_CONV_2D(avg_, sse2, 1) +#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c new file mode 100644 index 0000000000..526c283823 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -0,0 +1,1374 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_ports/mem.h" + +// filters for 16_h8 +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + +static INLINE void vpx_filter_block1d16_h8_x_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; + + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + + // load the 2 strides of source + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); + + // filter the source buffer + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); + + // filter the source buffer + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); + + src_ptr += src_stride; + + if (avg) { + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); + } + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); + + // filter the source buffer + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); + + // shrink to 8 bit each 16 bits + outReg1 = _mm_packus_epi16(outReg1, outReg2); + + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, outReg1); + } +} + +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. + if (y > 0) { + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m256i srcRegHead1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; + + shuffle_filter_avx2(filter, f); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + { + __m128i s[6]; + __m256i s32b[6]; + + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); + + // have each consecutive loads on the same 256 register + s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); + + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } + + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + + for (i = output_height; i > 1; i -= 2) { + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); + + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) + s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); + + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); + + src_ptr += src_stride; + + // average if necessary + if (avg) { + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); + } + + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); + + output_ptr += dst_stride; + + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; + } +} + +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} + +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} + +static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_first, dst_second; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for second half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + dst_second = mm256_round_epi16(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm256_packus_epi16(dst_first, dst_second); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_first); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + + dst_first = _mm256_packus_epi16(dst_first, dst_first); + dst_first = _mm256_permute4x64_epi64(dst_first, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first)); + } +} + +static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi; + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23); + + // Output from first half + res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23); + res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45); + res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo); + + // Output from second half + res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23); + res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45); + res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi); + + // Round the words + res_reg_lo = mm256_round_epi16(&res_reg_lo, ®_32, 6); + res_reg_hi = mm256_round_epi16(&res_reg_hi, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_reg = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_reg = mm256_round_epi16(&dst_reg, ®_32, 6); + + // Finally combine to get the final dst + dst_reg = _mm256_packus_epi16(dst_reg, dst_reg); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + __m128i dst_reg; + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding + __m128i tmp_0, tmp_1; + + __m128i src_reg_shift_0 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0)); + __m128i src_reg_shift_2 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2)); + + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, + _mm256_castsi256_si128(kernel_reg_23)); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, + _mm256_castsi256_si128(kernel_reg_45)); + dst_reg = _mm_adds_epi16(tmp_0, tmp_1); + + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32_128, 6); + + dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_reg); + } +} + +static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001, res_reg_1223; + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Output + res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23); + res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45); + res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + // Since avx2 has 256-bit register, we can do 2 rows at a time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + int h; + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + + __m256i shuf_idx = + _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, + 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + for (h = height; h > 1; h -= 2) { + // Load the source + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); + + // Round result + dst = mm256_round_epi16(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm256_packus_epi16(dst, _mm256_setzero_si256()); + + // Save + mm256_storeu2_epi32((__m128i *const)dst_ptr, + (__m128i *const)(dst_ptr + dst_stride), &dst); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + if (h > 0) { + // Load the source + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding + __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); + __m128i src_reg_shuf = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); + + // Get the result + __m128i dst = + _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg)); + dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); + + // Round result + dst = mm_round_epi16_sse2(&dst, ®_32_128, 6); + + // Pack to 8-bits + dst = _mm_packus_epi16(dst, _mm_setzero_si128()); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + } +} + +static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. + // Calling horizontal add then gives us the completely output + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + + // Result after multiply and add + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Combine all the rows + src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223); + + // Output + res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg); + res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256()); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; + unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... + // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[9], rr[2]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. + assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + s[5] = s[9]; + + r[4] = r[8]; + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4], r1[4]; + + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +#if VPX_ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +#else // VPX_ARCH_X86 +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +#endif // VPX_ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 +#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 +#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 +#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 +#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 +#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 + +#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2 +#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2 +#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2 +#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2 +#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2 +#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2 +// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + avx2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) + +// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2, 0) +FUN_CONV_2D(avg_, avx2, 1) +#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c new file mode 100644 index 0000000000..4ea2752d38 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,1087 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSSE3 + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} + +// Used by the avx2 implementation. +#if VPX_ARCH_X86_64 +// Use the intrinsics below +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 +#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 +#else // VPX_ARCH_X86 +// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm. +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +#endif + +#if VPX_ARCH_X86_64 +void vpx_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); + + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); + + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr += src_pitch; + + // save only 4 bytes + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr += output_pitch; + } +} + +void vpx_filter_block1d8_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + unsigned int i; + __m128i f[4], filt[4], s[4]; + + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); + + for (i = 0; i < output_height; i++) { + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); + + // shrink to 8 bit each 16 bits + s[0] = _mm_packus_epi16(s[0], s[0]); + + src_ptr += src_pitch; + + // save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + + output_ptr += output_pitch; + } +} + +void vpx_filter_block1d8_v8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + unsigned int i; + __m128i f[4], s[8], ss[4]; + + shuffle_filter_ssse3(filter, f); + + // load the first 7 rows of 8 bytes + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + for (i = 0; i < output_height; i++) { + // load the last 8 bytes + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + + // merge the result together + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + + // merge the result together + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + ss[0] = convolve8_8_ssse3(ss, f); + // shrink to 8 bit each 16 bits + ss[0] = _mm_packus_epi16(ss[0], ss[0]); + + src_ptr += src_pitch; + + // shift down a row + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); + + output_ptr += out_pitch; + } +} +#endif // VPX_ARCH_X86_64 + +static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first, dst_second; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23); + + res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Partial output for second half + res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23); + + res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Get the result + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Round round result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23); + + res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45); + + // Add to get entire output + res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, ®_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, ®_32, 6); + + // Pack from 16-bit to 8-bit + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + + __m128i kernel_reg; // Kernel + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shuf; + __m128i dst_first; + __m128i shuf_idx = + _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg); + dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); + + // Round result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call horizontal add to get the output. + // Finally, we can add multiple rows together to get the desired output. + // This is done two rows at a time + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + __m128i src_reg_m1001, src_reg_1223; + __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi; + + __m128i kernel_reg; // Kernel + + // Result after multiply and add + __m128i reg_0, reg_1; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1); + + // Put three rows next to each other + src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3); + + // Put three rows next to each other + src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23); + + // Put all four rows next to each other + src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223); + src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223); + + // Get the results + reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg); + reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg); + reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128()); + reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128()); + + // Round the words + reg_0 = mm_round_epi16_sse2(®_0, ®_32, 6); + reg_1 = mm_round_epi16_sse2(®_1, ®_32, 6); + + // Pack from 16-bit to 8-bit and put them in the right order + reg_0 = _mm_packus_epi16(reg_0, reg_0); + reg_1 = _mm_packus_epi16(reg_1, reg_1); + + // Save the result + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +filter8_1dfunction vpx_filter_block1d16_v8_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; +filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3 +#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3 +#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; + +// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + ssse3, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1) + +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; + + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; + + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 8) { + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + transpose8x8_to_dst(temp, 8, dst + x, dst_stride); + } + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; + // 02 03 12 13 22 23 32 33 + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; + // 06 07 16 17 26 27 36 37 + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; + + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 4) { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + transpose4x4_to_dst(temp, 4, dst + x, dst_stride); + } + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static __m128i filter_vert_kernel(const __m128i *const s, + const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); + // shrink to 8 bit each 16 bits + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + + y_q4 += y_step_q4; + } +} + +static void filter_vert_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { + int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + + for (i = 0; i < w; i += 16) { + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; + + loadu_8bit_16x8(src, src_stride, s); + + // merge the result together + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result + temp_hi = _mm_packus_epi16(temp_lo, temp_hi); + src += 16; + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); + } +} + +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} + +// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3, 0) +FUN_CONV_2D(avg_, ssse3, 1) diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm new file mode 100644 index 0000000000..c8455e13a2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -0,0 +1,989 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +SECTION .text + +;void vpx_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d4_v8_sse2) +sym(vpx_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d8_v8_sse2) +sym(vpx_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d16_v8_sse2) +sym(vpx_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_v8_avg_sse2) +sym(vpx_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_v8_avg_sse2) +sym(vpx_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_v8_avg_sse2) +sym(vpx_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d4_h8_sse2) +sym(vpx_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d8_h8_sse2) +sym(vpx_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(vpx_filter_block1d16_h8_sse2) +sym(vpx_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_h8_avg_sse2) +sym(vpx_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_h8_avg_sse2) +sym(vpx_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_h8_avg_sse2) +sym(vpx_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm new file mode 100644 index 0000000000..fe617f1207 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -0,0 +1,803 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffvp9) must be followed to prevent outranges. +; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%define LOCAL_VARS_SIZE 16*6 + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on VPX_ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. + pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if VPX_ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if VPX_ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if VPX_ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 +SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 + +;------------------------------------------------------------------------------- + +; TODO(Linfeng): Detect cpu type and choose the code with better performance. +%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if VPX_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if VPX_ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; VPX_ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 + pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; VPX_ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if VPX_ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; VPX_ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; VPX_ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000000..65790b1c21 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -0,0 +1,450 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +globalsym(vpx_filter_block1d4_v2_sse2) +sym(vpx_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_v2_sse2) +sym(vpx_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_v2_sse2) +sym(vpx_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_v2_avg_sse2) +sym(vpx_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_v2_avg_sse2) +sym(vpx_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_v2_avg_sse2) +sym(vpx_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_h2_sse2) +sym(vpx_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_h2_sse2) +sym(vpx_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_h2_sse2) +sym(vpx_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_h2_avg_sse2) +sym(vpx_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_h2_avg_sse2) +sym(vpx_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_h2_avg_sse2) +sym(vpx_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm new file mode 100644 index 0000000000..32e3cd3d9f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -0,0 +1,420 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +globalsym(vpx_filter_block1d4_v2_ssse3) +sym(vpx_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_v2_ssse3) +sym(vpx_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_v2_ssse3) +sym(vpx_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_v2_avg_ssse3) +sym(vpx_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_v2_avg_ssse3) +sym(vpx_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_v2_avg_ssse3) +sym(vpx_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_h2_ssse3) +sym(vpx_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_h2_ssse3) +sym(vpx_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_h2_ssse3) +sym(vpx_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d4_h2_avg_ssse3) +sym(vpx_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d8_h2_avg_ssse3) +sym(vpx_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(vpx_filter_block1d16_h2_avg_ssse3) +sym(vpx_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h new file mode 100644 index 0000000000..5631130243 --- /dev/null +++ b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#include "./vpx_config.h" + +#define ADDRESS_STORAGE_SIZE sizeof(size_t) + +#ifndef DEFAULT_ALIGNMENT +#if defined(VXWORKS) +/*default addr alignment to use in calls to vpx_* functions other than + * vpx_memalign*/ +#define DEFAULT_ALIGNMENT 32 +#else +#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */ +#endif +#endif + +/*returns an addr aligned to the byte boundary specified by align*/ +#define align_addr(addr, align) \ + (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1)) + +#endif // VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.c b/media/libvpx/libvpx/vpx_mem/vpx_mem.c new file mode 100644 index 0000000000..18abf1158b --- /dev/null +++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem.h" +#include +#include +#include +#include +#include "include/vpx_mem_intrnl.h" +#include "vpx/vpx_integer.h" + +#if !defined(VPX_MAX_ALLOCABLE_MEMORY) +#if SIZE_MAX > (1ULL << 40) +#define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40) +#else +// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. +#define VPX_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) +#endif +#endif + +// Returns 0 in case of overflow of nmemb * size. +static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) { + const uint64_t total_size = nmemb * size; + if (nmemb == 0) return 1; + if (size > VPX_MAX_ALLOCABLE_MEMORY / nmemb) return 0; + if (total_size != (size_t)total_size) return 0; + + return 1; +} + +static size_t *get_malloc_address_location(void *const mem) { + return ((size_t *)mem) - 1; +} + +static uint64_t get_aligned_malloc_size(size_t size, size_t align) { + return (uint64_t)size + align - 1 + ADDRESS_STORAGE_SIZE; +} + +static void set_actual_malloc_address(void *const mem, + const void *const malloc_addr) { + size_t *const malloc_addr_location = get_malloc_address_location(mem); + *malloc_addr_location = (size_t)malloc_addr; +} + +static void *get_actual_malloc_address(void *const mem) { + size_t *const malloc_addr_location = get_malloc_address_location(mem); + return (void *)(*malloc_addr_location); +} + +void *vpx_memalign(size_t align, size_t size) { + void *x = NULL, *addr; + const uint64_t aligned_size = get_aligned_malloc_size(size, align); + if (!check_size_argument_overflow(1, aligned_size)) return NULL; + + addr = malloc((size_t)aligned_size); + if (addr) { + x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align); + set_actual_malloc_address(x, addr); + } + return x; +} + +void *vpx_malloc(size_t size) { return vpx_memalign(DEFAULT_ALIGNMENT, size); } + +void *vpx_calloc(size_t num, size_t size) { + void *x; + if (!check_size_argument_overflow(num, size)) return NULL; + + x = vpx_malloc(num * size); + if (x) memset(x, 0, num * size); + return x; +} + +void vpx_free(void *memblk) { + if (memblk) { + void *addr = get_actual_malloc_address(memblk); + free(addr); + } +} diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.h b/media/libvpx/libvpx/vpx_mem/vpx_mem.h new file mode 100644 index 0000000000..7689a05e6e --- /dev/null +++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_MEM_VPX_MEM_H_ +#define VPX_VPX_MEM_VPX_MEM_H_ + +#include "vpx_config.h" +#if defined(__uClinux__) +#include +#endif + +#include +#include + +#include "vpx/vpx_integer.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +void *vpx_memalign(size_t align, size_t size); +void *vpx_malloc(size_t size); +void *vpx_calloc(size_t num, size_t size); +void vpx_free(void *memblk); + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} +#endif + +#include + +#ifdef VPX_MEM_PLTFRM +#include VPX_MEM_PLTFRM +#endif + +#if defined(__cplusplus) +} +#endif + +#endif // VPX_VPX_MEM_VPX_MEM_H_ diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.mk b/media/libvpx/libvpx/vpx_mem/vpx_mem.mk new file mode 100644 index 0000000000..7f275eabf9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.mk @@ -0,0 +1,4 @@ +MEM_SRCS-yes += vpx_mem.mk +MEM_SRCS-yes += vpx_mem.c +MEM_SRCS-yes += vpx_mem.h +MEM_SRCS-yes += include/vpx_mem_intrnl.h diff --git a/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 0000000000..639f4ff8ea --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap & VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c new file mode 100644 index 0000000000..539d09bb39 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_WIN32) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD + // No I8MM or SVE feature detection available on Windows at time of writing. + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON_I8MM + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif // HAVE_NEON_I8MM +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & VPX_AARCH64_HWCAP_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include +#include + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. +#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_NEON_I8MM; + } + + // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_SVE; + } + if (!(flags & HAS_NEON_I8MM)) { + flags &= ~HAS_SVE; + } + + return flags; +} diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h new file mode 100644 index 0000000000..39365d18ee --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/arm.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_ARM_H_ +#define VPX_VPX_PORTS_ARM_H_ +#include +#include "vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. +#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) +// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. +#define HAS_SVE (1 << 3) + +int arm_cpu_caps(void); + +// Earlier gcc compilers have issues with some neon intrinsics +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \ + __GNUC_MINOR__ <= 6 +#define VPX_INCOMPATIBLE_GCC +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_ARM_H_ diff --git a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h new file mode 100644 index 0000000000..881397abc2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_WIN32) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include +#endif + +#ifdef WINAPI_FAMILY +#include +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define ANDROID_USE_CPU_FEATURES_LIB 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 0000000000..400a51cc32 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "ssrld " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "ssrlw " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif // VPX_VPX_PORTS_ASMDEFS_MMI_H_ diff --git a/media/libvpx/libvpx/vpx_ports/bitops.h b/media/libvpx/libvpx/vpx_ports/bitops.h new file mode 100644 index 0000000000..1b5cdaa6dd --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/bitops.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_BITOPS_H_ +#define VPX_VPX_PORTS_BITOPS_H_ + +#include + +#include "vpx_ports/msvc.h" + +#ifdef _MSC_VER +#if defined(_M_X64) || defined(_M_IX86) +#include +#define USE_MSC_INTRINSICS +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// These versions of get_lsb() and get_msb() are only valid when n != 0 +// because all of the optimized versions are undefined when n == 0: +// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + +// use GNU builtins where available. +#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_lsb(unsigned int n) { + assert(n != 0); + return __builtin_ctz(n); +} + +static INLINE int get_msb(unsigned int n) { + assert(n != 0); + return 31 ^ __builtin_clz(n); +} +#elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) + +static INLINE int get_lsb(unsigned int n) { + unsigned long first_set_bit; // NOLINT(runtime/int) + _BitScanForward(&first_set_bit, n); + return first_set_bit; +} + +static INLINE int get_msb(unsigned int n) { + unsigned long first_set_bit; + assert(n != 0); + _BitScanReverse(&first_set_bit, n); + return first_set_bit; +} +#undef USE_MSC_INTRINSICS +#else +static INLINE int get_lsb(unsigned int n) { + int i; + assert(n != 0); + for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1; + return i; +} + +// Returns (int)floor(log2(n)). n must be > 0. +static INLINE int get_msb(unsigned int n) { + int log = 0; + unsigned int value = n; + int i; + + assert(n != 0); + + for (i = 4; i >= 0; --i) { + const int shift = (1 << i); + const unsigned int x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_BITOPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/compiler_attributes.h b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h new file mode 100644 index 0000000000..4b468749b8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ +#define VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif // !defined(__has_feature) + +#if !defined(__has_attribute) +#define __has_attribute(x) 0 +#endif // !defined(__has_attribute) + +//------------------------------------------------------------------------------ +// Sanitizer attributes. + +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define VPX_WITH_ASAN 1 +#else +#define VPX_WITH_ASAN 0 +#endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) + +#if defined(__clang__) && __has_attribute(no_sanitize) +// Both of these have defined behavior and are used in certain operations or +// optimizations thereof. There are cases where an overflow may be unintended, +// however, so use of these attributes should be done with care. +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ + +#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK +#endif +#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK +#define VPX_NO_UNSIGNED_SHIFT_CHECK +#endif + +//------------------------------------------------------------------------------ +// Variable attributes. + +#if __has_attribute(uninitialized) +// Attribute "uninitialized" disables -ftrivial-auto-var-init=pattern for +// the specified variable. +// +// -ftrivial-auto-var-init is security risk mitigation feature, so attribute +// should not be used "just in case", but only to fix real performance +// bottlenecks when other approaches do not work. In general the compiler is +// quite effective at eliminating unneeded initializations introduced by the +// flag, e.g. when they are followed by actual initialization by a program. +// However if compiler optimization fails and code refactoring is hard, the +// attribute can be used as a workaround. +#define VPX_UNINITIALIZED __attribute__((uninitialized)) +#else +#define VPX_UNINITIALIZED +#endif // __has_attribute(uninitialized) + +#endif // VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ diff --git a/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h new file mode 100644 index 0000000000..d6cc68ee4d --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ +#define VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ + +#if defined(__GNUC__) && __GNUC__ < 4 +/* From emmintrin.h (gcc 4.5.3) */ +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_si128(__m128 __A) { + return (__m128i)__A; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_ps(__m128i __A) { + return (__m128)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_pd(__m128i __A) { + return (__m128d)__A; +} +#endif + +#endif // VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.asm b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm new file mode 100644 index 0000000000..b31b25ebde --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm @@ -0,0 +1,18 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text +globalsym(vpx_clear_system_state) +sym(vpx_clear_system_state): + emms + ret diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c new file mode 100644 index 0000000000..f1036b98ed --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_ports/system_state.h" + +void vpx_clear_system_state() { _mm_empty(); } diff --git a/media/libvpx/libvpx/vpx_ports/float_control_word.asm b/media/libvpx/libvpx/vpx_ports/float_control_word.asm new file mode 100644 index 0000000000..bb75b7a31f --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/float_control_word.asm @@ -0,0 +1,33 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text + +%if LIBVPX_YASM_WIN64 +globalsym(vpx_winx64_fldcw) +sym(vpx_winx64_fldcw): + sub rsp, 8 + mov [rsp], rcx ; win x64 specific + fldcw [rsp] + add rsp, 8 + ret + + +globalsym(vpx_winx64_fstcw) +sym(vpx_winx64_fstcw): + sub rsp, 8 + fstcw [rsp] + mov rax, [rsp] + add rsp, 8 + ret +%endif diff --git a/media/libvpx/libvpx/vpx_ports/loongarch.h b/media/libvpx/libvpx/vpx_ports/loongarch.h new file mode 100644 index 0000000000..d93ff9f5f0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/loongarch.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_LOONGARCH_H_ +#define VPX_VPX_PORTS_LOONGARCH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_LSX 0x01 +#define HAS_LASX 0x02 + +int loongarch_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_LOONGARCH_H_ diff --git a/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c new file mode 100644 index 0000000000..7b4322d35e --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_ports/loongarch.h" + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__loongarch__) && defined(__linux__) +int loongarch_cpu_caps(void) { + int reg = 0; + int flag = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2)); + if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX; + + if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX; + + return flag; +} +#else /* end __loongarch__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int loongarch_cpu_caps(void) { return 0; } +#endif diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h new file mode 100644 index 0000000000..5eccfe8f50 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mem.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MEM_H_ +#define VPX_VPX_PORTS_MEM_H_ + +#include "vpx_config.h" +#include "vpx/vpx_integer.h" + +#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) +#elif defined(_MSC_VER) +#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val +#else +#warning No alignment directives known for this compiler. +#define DECLARE_ALIGNED(n, typ, val) typ val +#endif + +#if HAVE_NEON && defined(_MSC_VER) +#define __builtin_prefetch(x) +#endif + +/* Shift down with rounding */ +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n)) +#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n)-1))) >> (n)) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x))) +#if CONFIG_VP9_HIGHBITDEPTH +#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x))) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_PORTS_MEM_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops.h b/media/libvpx/libvpx/vpx_ports/mem_ops.h new file mode 100644 index 0000000000..b17015e7ec --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mem_ops.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MEM_OPS_H_ +#define VPX_VPX_PORTS_MEM_OPS_H_ + +/* \file + * \brief Provides portable memory access primitives + * + * This function provides portable primitives for getting and setting of + * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations + * can be performed on unaligned data regardless of hardware support for + * unaligned accesses. + * + * The type used to pass the integral values may be changed by defining + * MEM_VALUE_T with the appropriate type. The type given must be an integral + * numeric type. + * + * The actual functions instantiated have the MEM_VALUE_T type name pasted + * on to the symbol name. This allows the developer to instantiate these + * operations for multiple types within the same translation unit. This is + * of somewhat questionable utility, but the capability exists nonetheless. + * Users not making use of this functionality should call the functions + * without the type name appended, and the preprocessor will take care of + * it. + * + * NOTE: This code is not supported on platforms where char > 1 octet ATM. + */ + +#ifndef MAU_T +/* Minimum Access Unit for this target */ +#define MAU_T unsigned char +#endif + +#ifndef MEM_VALUE_T +#define MEM_VALUE_T int +#endif + +#undef MEM_VALUE_T_SZ_BITS +#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) + +#undef mem_ops_wrap_symbol +#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) +#undef mem_ops_wrap_symbol2 +#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ) +#undef mem_ops_wrap_symbol3 +#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ + +/* + * Include aligned access routines + */ +#define INCLUDED_BY_MEM_OPS_H +#include "mem_ops_aligned.h" +#undef INCLUDED_BY_MEM_OPS_H + +#undef mem_get_be16 +#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) +static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 8; + val |= mem[1]; + return val; +} + +#undef mem_get_be24 +#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) +static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 16; + val |= mem[1] << 8; + val |= mem[2]; + return val; +} + +#undef mem_get_be32 +#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) +static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; + val |= mem[1] << 16; + val |= mem[2] << 8; + val |= mem[3]; + return val; +} + +#undef mem_get_le16 +#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) +static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le24 +#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) +static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le32 +#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) +static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#define mem_get_s_generic(end, sz) \ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ + const MAU_T *mem = (const MAU_T *)vmem; \ + signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ + return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \ + } + +/* clang-format off */ +#undef mem_get_sbe16 +#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) +mem_get_s_generic(be, 16) + +#undef mem_get_sbe24 +#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) +mem_get_s_generic(be, 24) + +#undef mem_get_sbe32 +#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) +mem_get_s_generic(be, 32) + +#undef mem_get_sle16 +#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) +mem_get_s_generic(le, 16) + +#undef mem_get_sle24 +#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) +mem_get_s_generic(le, 24) + +#undef mem_get_sle32 +#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) +mem_get_s_generic(le, 32) + +#undef mem_put_be16 +#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) +static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 8) & 0xff); + mem[1] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be24 +#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) +static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 16) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be32 +#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) +static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 24) & 0xff); + mem[1] = (MAU_T)((val >> 16) & 0xff); + mem[2] = (MAU_T)((val >> 8) & 0xff); + mem[3] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_le16 +#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) +static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); +} + +#undef mem_put_le24 +#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) +static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); +} + +#undef mem_put_le32 +#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) +static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); + mem[3] = (MAU_T)((val >> 24) & 0xff); +} +/* clang-format on */ +#endif // VPX_VPX_PORTS_MEM_OPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h new file mode 100644 index 0000000000..8649b87623 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ + +#include "vpx/vpx_integer.h" + +/* \file + * \brief Provides portable memory access primitives for operating on aligned + * data + * + * This file is split from mem_ops.h for easier maintenance. See mem_ops.h + * for a more detailed description of these primitives. + */ +#ifndef INCLUDED_BY_MEM_OPS_H +#error Include mem_ops.h, not mem_ops_aligned.h directly. +#endif + +/* Architectures that provide instructions for doing this byte swapping + * could redefine these macros. + */ +#define swap_endian_16(val, raw) \ + do { \ + val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \ + } while (0) +#define swap_endian_32(val, raw) \ + do { \ + val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \ + ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \ + } while (0) +#define swap_endian_16_se(val, raw) \ + do { \ + swap_endian_16(val, raw); \ + val = ((val << 16) >> 16); \ + } while (0) +#define swap_endian_32_se(val, raw) swap_endian_32(val, raw) + +#define mem_get_ne_aligned_generic(end, sz) \ + static VPX_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_sne_aligned_generic(end, sz) \ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_se_aligned_generic(end, sz) \ + static VPX_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz(val, raw); \ + return val; \ + } + +#define mem_get_sse_aligned_generic(end, sz) \ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz##_se(val, raw); \ + return val; \ + } + +#define mem_put_ne_aligned_generic(end, sz) \ + static VPX_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem; \ + *mem = (uint##sz##_t)val; \ + } + +#define mem_put_se_aligned_generic(end, sz) \ + static VPX_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ + swap_endian_##sz(raw, val); \ + *mem = (uint##sz##_t)raw; \ + } + +#include "vpx_config.h" +#if CONFIG_BIG_ENDIAN +#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz) +#else +#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz) +#endif + +/* clang-format off */ +#undef mem_get_be16_aligned +#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) +mem_get_be_aligned_generic(16) + +#undef mem_get_be32_aligned +#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) +mem_get_be_aligned_generic(32) + +#undef mem_get_le16_aligned +#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) +mem_get_le_aligned_generic(16) + +#undef mem_get_le32_aligned +#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) +mem_get_le_aligned_generic(32) + +#undef mem_get_sbe16_aligned +#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned) +mem_get_sbe_aligned_generic(16) + +#undef mem_get_sbe32_aligned +#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned) +mem_get_sbe_aligned_generic(32) + +#undef mem_get_sle16_aligned +#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned) +mem_get_sle_aligned_generic(16) + +#undef mem_get_sle32_aligned +#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned) +mem_get_sle_aligned_generic(32) + +#undef mem_put_be16_aligned +#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned) +mem_put_be_aligned_generic(16) + +#undef mem_put_be32_aligned +#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned) +mem_put_be_aligned_generic(32) + +#undef mem_put_le16_aligned +#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned) +mem_put_le_aligned_generic(16) + +#undef mem_put_le32_aligned +#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned) +mem_put_le_aligned_generic(32) + +#undef mem_get_ne_aligned_generic +#undef mem_get_se_aligned_generic +#undef mem_get_sne_aligned_generic +#undef mem_get_sse_aligned_generic +#undef mem_put_ne_aligned_generic +#undef mem_put_se_aligned_generic +#undef swap_endian_16 +#undef swap_endian_32 +#undef swap_endian_16_se +#undef swap_endian_32_se +/* clang-format on */ + +#endif // VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mips.h b/media/libvpx/libvpx/vpx_ports/mips.h new file mode 100644 index 0000000000..439de754fd --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mips.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MIPS_H_ +#define VPX_VPX_PORTS_MIPS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_MMI 0x01 +#define HAS_MSA 0x02 + +int mips_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_MIPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c new file mode 100644 index 0000000000..e0eca2d48d --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_config.h" +#include "vpx_ports/mips.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__mips__) && defined(__linux__) +int mips_cpu_caps(void) { + char cpuinfo_line[512]; + int flag = 0x0; + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) { + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without mmi in ASEs line. + if (strstr(cpuinfo_line, "Loongson-3")) { + flag |= HAS_MMI; + } else if (strstr(cpuinfo_line, "Loongson-2K")) { + flag |= HAS_MMI | HAS_MSA; + } + } + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + if (strstr(cpuinfo_line, "loongson-mmi") && + strstr(cpuinfo_line, "loongson-ext")) { + flag |= HAS_MMI; + } + if (strstr(cpuinfo_line, "msa")) { + flag |= HAS_MSA; + } + // ASEs is the last line, so we can break here. + break; + } + } + fclose(f); + return flag; +} +#else /* end __mips__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int mips_cpu_caps(void) { return 0; } +#endif diff --git a/media/libvpx/libvpx/vpx_ports/msvc.h b/media/libvpx/libvpx/vpx_ports/msvc.h new file mode 100644 index 0000000000..d58de3535a --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/msvc.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MSVC_H_ +#define VPX_VPX_PORTS_MSVC_H_ +#ifdef _MSC_VER + +#include "./vpx_config.h" + +#if _MSC_VER < 1900 // VS2015 provides snprintf +#define snprintf _snprintf +#endif // _MSC_VER < 1900 + +#if _MSC_VER < 1800 // VS2013 provides round +#include +static INLINE double round(double x) { + if (x < 0) + return ceil(x - 0.5); + else + return floor(x + 0.5); +} +#endif // _MSC_VER < 1800 + +#endif // _MSC_VER +#endif // VPX_VPX_PORTS_MSVC_H_ diff --git a/media/libvpx/libvpx/vpx_ports/ppc.h b/media/libvpx/libvpx/vpx_ports/ppc.h new file mode 100644 index 0000000000..a11f4e8732 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/ppc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_PPC_H_ +#define VPX_VPX_PORTS_PPC_H_ +#include + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_PPC_H_ diff --git a/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c new file mode 100644 index 0000000000..374a0271c9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If VPX_SIMD_CAPS is set then allow only those capabilities. + if (!cpu_env_flags(&flags)) { + return flags; + } + + mask = cpu_env_mask(); + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return 0; + } + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < (count / sizeof(*buf)); i += 2) { + if (buf[i] == AT_HWCAP) { +#if HAVE_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { + flags |= HAS_VSX; + } +#endif // HAVE_VSX + goto out_close; + } else if (buf[i] == AT_NULL) { + goto out_close; + } + } + } +out_close: + close(fd); + return flags & mask; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. +int ppc_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/media/libvpx/libvpx/vpx_ports/static_assert.h b/media/libvpx/libvpx/vpx_ports/static_assert.h new file mode 100644 index 0000000000..f632d9f1e8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/static_assert.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_STATIC_ASSERT_H_ +#define VPX_VPX_PORTS_STATIC_ASSERT_H_ + +#if defined(_MSC_VER) +#define VPX_STATIC_ASSERT(boolexp) \ + do { \ + char vpx_static_assert[(boolexp) ? 1 : -1]; \ + (void)vpx_static_assert; \ + } while (0) +#else // !_MSC_VER +#define VPX_STATIC_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int vpx_static_assert : (boolexp) ? 1 : -1; \ + } vpx_static_assert; \ + (void)vpx_static_assert; \ + } while (0) +#endif // _MSC_VER + +#endif // VPX_VPX_PORTS_STATIC_ASSERT_H_ diff --git a/media/libvpx/libvpx/vpx_ports/system_state.h b/media/libvpx/libvpx/vpx_ports/system_state.h new file mode 100644 index 0000000000..32ebd0ed8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/system_state.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_SYSTEM_STATE_H_ +#define VPX_VPX_PORTS_SYSTEM_STATE_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX +extern void vpx_clear_system_state(void); +#else +#define vpx_clear_system_state() +#endif // (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_SYSTEM_STATE_H_ diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h new file mode 100644 index 0000000000..d8a8ed89fe --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_VPX_ONCE_H_ +#define VPX_VPX_PORTS_VPX_ONCE_H_ + +#include "vpx_config.h" + +/* Implement a function wrapper to guarantee initialization + * thread-safety for library singletons. + * + * NOTE: These functions use static locks, and can only be + * used with one common argument per compilation unit. So + * + * file1.c: + * vpx_once(foo); + * ... + * vpx_once(foo); + * + * file2.c: + * vpx_once(bar); + * + * will ensure foo() and bar() are each called only once, but in + * + * file1.c: + * vpx_once(foo); + * vpx_once(bar): + * + * bar() will never be called because the lock is used up + * by the call to foo(). + */ + +#if CONFIG_MULTITHREAD && defined(_WIN32) +#include +#include +/* Declare a per-compilation-unit state variable to track the progress + * of calling func() only once. This must be at global scope because + * local initializers are not thread-safe in MSVC prior to Visual + * Studio 2015. + * + * As a static, once_state will be zero-initialized as program start. + */ +static LONG once_state; +static void once(void (*func)(void)) { + /* Try to advance once_state from its initial value of 0 to 1. + * Only one thread can succeed in doing so. + */ + if (InterlockedCompareExchange(&once_state, 1, 0) == 0) { + /* We're the winning thread, having set once_state to 1. + * Call our function. */ + func(); + /* Now advance once_state to 2, unblocking any other threads. */ + InterlockedIncrement(&once_state); + return; + } + + /* We weren't the winning thread, but we want to block on + * the state variable so we don't return before func() + * has finished executing elsewhere. + * + * Try to advance once_state from 2 to 2, which is only possible + * after the winning thead advances it from 1 to 2. + */ + while (InterlockedCompareExchange(&once_state, 2, 2) != 2) { + /* State isn't yet 2. Try again. + * + * We are used for singleton initialization functions, + * which should complete quickly. Contention will likewise + * be rare, so it's worthwhile to use a simple but cpu- + * intensive busy-wait instead of successive backoff, + * waiting on a kernel object, or another heavier-weight scheme. + * + * We can at least yield our timeslice. + */ + Sleep(0); + } + + /* We've seen once_state advance to 2, so we know func() + * has been called. And we've left once_state as we found it, + * so other threads will have the same experience. + * + * It's safe to return now. + */ + return; +} + +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include +static void once(void (*func)(void)) { + static volatile int done; + + /* If the initialization is complete, return early. */ + if (done) return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + +#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H +#include +static void once(void (*func)(void)) { + static pthread_once_t lock = PTHREAD_ONCE_INIT; + pthread_once(&lock, func); +} + +#else +/* No-op version that performs no synchronization. *_rtcd() is idempotent, + * so as long as your platform provides atomic loads/stores of pointers + * no synchronization is strictly necessary. + */ + +static void once(void (*func)(void)) { + static volatile int done; + + if (!done) { + func(); + done = 1; + } +} +#endif + +#endif // VPX_VPX_PORTS_VPX_ONCE_H_ diff --git a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk new file mode 100644 index 0000000000..6c6737c9bd --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk @@ -0,0 +1,58 @@ +## +## Copyright (c) 2012 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + + +PORTS_SRCS-yes += vpx_ports.mk + +PORTS_SRCS-yes += bitops.h +PORTS_SRCS-yes += compiler_attributes.h +PORTS_SRCS-yes += mem.h +PORTS_SRCS-yes += msvc.h +PORTS_SRCS-yes += static_assert.h +PORTS_SRCS-yes += system_state.h +PORTS_SRCS-yes += vpx_timer.h + +ifeq ($(VPX_ARCH_X86),yes) +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c +endif +ifeq ($(VPX_ARCH_X86_64),yes) +# Visual Studio x64 does not support the _mm_empty() intrinsic. +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm +endif + +ifeq ($(VPX_ARCH_X86_64),yes) +PORTS_SRCS-yes += float_control_word.asm +endif + +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) +PORTS_SRCS-yes += x86.h +PORTS_SRCS-yes += x86_abi_support.asm +endif + +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h +PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h + +PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h + +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h + +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h + +ifeq ($(VPX_ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/media/libvpx/libvpx/vpx_ports/vpx_timer.h b/media/libvpx/libvpx/vpx_ports/vpx_timer.h new file mode 100644 index 0000000000..4934d5296a --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/vpx_timer.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_VPX_TIMER_H_ +#define VPX_VPX_PORTS_VPX_TIMER_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +#if CONFIG_OS_SUPPORT + +#if defined(_WIN32) +/* + * Win32 specific includes + */ +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#else +/* + * POSIX specific includes + */ +#include + +/* timersub is not provided by msys at this time. */ +#ifndef timersub +#define timersub(a, b, result) \ + do { \ + (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((result)->tv_usec < 0) { \ + --(result)->tv_sec; \ + (result)->tv_usec += 1000000; \ + } \ + } while (0) +#endif +#endif + +struct vpx_usec_timer { +#if defined(_WIN32) + LARGE_INTEGER begin, end; +#else + struct timeval begin, end; +#endif +}; + +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->begin); +#else + gettimeofday(&t->begin, NULL); +#endif +} + +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->end); +#else + gettimeofday(&t->end, NULL); +#endif +} + +static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { +#if defined(_WIN32) + LARGE_INTEGER freq, diff; + + diff.QuadPart = t->end.QuadPart - t->begin.QuadPart; + + QueryPerformanceFrequency(&freq); + return diff.QuadPart * 1000000 / freq.QuadPart; +#else + struct timeval diff; + + timersub(&t->end, &t->begin, &diff); + return (int64_t)diff.tv_sec * 1000000 + diff.tv_usec; +#endif +} + +#else /* CONFIG_OS_SUPPORT = 0*/ + +/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */ +#ifndef timersub +#define timersub(a, b, result) +#endif + +struct vpx_usec_timer { + void *dummy; +}; + +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) {} + +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) {} + +static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } + +#endif /* CONFIG_OS_SUPPORT */ + +#endif // VPX_VPX_PORTS_VPX_TIMER_H_ diff --git a/media/libvpx/libvpx/vpx_ports/x86.h b/media/libvpx/libvpx/vpx_ports/x86.h new file mode 100644 index 0000000000..795fb2923f --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/x86.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_X86_H_ +#define VPX_VPX_PORTS_X86_H_ +#include + +#if defined(_MSC_VER) +#include /* For __cpuidex, __rdtsc */ +#endif + +#include "vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + VPX_CPU_UNKNOWN = -1, + VPX_CPU_AMD, + VPX_CPU_AMD_OLD, + VPX_CPU_CENTAUR, + VPX_CPU_CYRIX, + VPX_CPU_INTEL, + VPX_CPU_NEXGEN, + VPX_CPU_NSC, + VPX_CPU_RISE, + VPX_CPU_SIS, + VPX_CPU_TRANSMETA, + VPX_CPU_TRANSMETA_OLD, + VPX_CPU_UMC, + VPX_CPU_VIA, + + VPX_CPU_LAST +} vpx_cpu_t; + +#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__) +#if VPX_ARCH_X86_64 +#define cpuid(func, func2, ax, bx, cx, dx) \ + __asm__ __volatile__("cpuid \n\t" \ + : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#else +#define cpuid(func, func2, ax, bx, cx, dx) \ + __asm__ __volatile__( \ + "mov %%ebx, %%edi \n\t" \ + "cpuid \n\t" \ + "xchg %%edi, %%ebx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#endif +#elif defined(__SUNPRO_C) || \ + defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ +#if VPX_ARCH_X86_64 +#define cpuid(func, func2, ax, bx, cx, dx) \ + asm volatile( \ + "xchg %rsi, %rbx \n\t" \ + "cpuid \n\t" \ + "movl %ebx, %edi \n\t" \ + "xchg %rsi, %rbx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#else +#define cpuid(func, func2, ax, bx, cx, dx) \ + asm volatile( \ + "pushl %ebx \n\t" \ + "cpuid \n\t" \ + "movl %ebx, %edi \n\t" \ + "popl %ebx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#endif +#else /* end __SUNPRO__ */ +#if VPX_ARCH_X86_64 +#if defined(_MSC_VER) && _MSC_VER > 1500 +#define cpuid(func, func2, a, b, c, d) \ + do { \ + int regs[4]; \ + __cpuidex(regs, func, func2); \ + a = regs[0]; \ + b = regs[1]; \ + c = regs[2]; \ + d = regs[3]; \ + } while (0) +#else +#define cpuid(func, func2, a, b, c, d) \ + do { \ + int regs[4]; \ + __cpuid(regs, func); \ + a = regs[0]; \ + b = regs[1]; \ + c = regs[2]; \ + d = regs[3]; \ + } while (0) +#endif +#else +#define cpuid(func, func2, a, b, c, d) \ + __asm mov eax, func __asm mov ecx, func2 __asm cpuid __asm mov a, \ + eax __asm mov b, ebx __asm mov c, ecx __asm mov d, edx +#endif +#endif /* end others */ + +// NaCl has no support for xgetbv or the raw opcode. +#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__)) +static INLINE uint64_t xgetbv(void) { + const uint32_t ecx = 0; + uint32_t eax, edx; + // Use the raw opcode for xgetbv for compatibility with older toolchains. + __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n" + : "=a"(eax), "=d"(edx) + : "c"(ecx)); + return ((uint64_t)edx << 32) | eax; +} +#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \ + _MSC_FULL_VER >= 160040219 // >= VS2010 SP1 +#include +#define xgetbv() _xgetbv(0) +#elif defined(_MSC_VER) && defined(_M_IX86) +static INLINE uint64_t xgetbv(void) { + uint32_t eax_, edx_; + __asm { + xor ecx, ecx // ecx = 0 + // Use the raw opcode for xgetbv for compatibility with older toolchains. + __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 + mov eax_, eax + mov edx_, edx + } + return ((uint64_t)edx_ << 32) | eax_; +} +#else +#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) +#define getenv(x) NULL +#endif +#endif + +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 +#ifndef BIT +#define BIT(n) (1u << (n)) +#endif + +static INLINE int x86_simd_caps(void) { + unsigned int flags = 0; + unsigned int mask = ~0u; + unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; + char *env; + (void)reg_ebx; + + /* See if the CPU capabilities are being overridden by the environment */ + env = getenv("VPX_SIMD_CAPS"); + + if (env && *env) return (int)strtol(env, NULL, 0); + + env = getenv("VPX_SIMD_CAPS_MASK"); + + if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); + + /* Ensure that the CPUID instruction supports extended features */ + cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); + + if (max_cpuid_val < 1) return 0; + + /* Get the standard feature flags */ + cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + + if (reg_edx & BIT(23)) flags |= HAS_MMX; + + if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */ + + if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */ + + if (reg_ecx & BIT(0)) flags |= HAS_SSE3; + + if (reg_ecx & BIT(9)) flags |= HAS_SSSE3; + + if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1; + + // bits 27 (OSXSAVE) & 28 (256-bit AVX) + if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) { + // Check for OS-support of YMM state. Necessary for AVX and AVX2. + if ((xgetbv() & 0x6) == 0x6) { + flags |= HAS_AVX; + + if (max_cpuid_val >= 7) { + /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ + cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + + if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + + // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & + // 30 (AVX-512BW) & 32 (AVX-512VL) + if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) == + (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) { + // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. + if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; + } + } + } + } + + (void)reg_eax; // Avoid compiler warning on unused-but-set variable. + + return flags & mask; +} + +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. +static INLINE unsigned int x86_readtsc(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tsc; + __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tsc; + asm volatile("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#else +#if VPX_ARCH_X86_64 + return (unsigned int)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} +// 64-bit CPU cycle counter +static INLINE uint64_t x86_readtsc64(void) { +#if defined(__GNUC__) && __GNUC__ + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + uint_t hi, lo; + asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#if VPX_ARCH_X86_64 + return (uint64_t)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} + +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. +static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if VPX_ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return v; +} + +#if defined(__GNUC__) && __GNUC__ +#define x86_pause_hint() __asm__ __volatile__("pause \n\t") +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#define x86_pause_hint() asm volatile("pause \n\t") +#else +#if VPX_ARCH_X86_64 +#define x86_pause_hint() _mm_pause(); +#else +#define x86_pause_hint() __asm pause +#endif +#endif + +#if defined(__GNUC__) && __GNUC__ +static void x87_set_control_word(unsigned short mode) { + __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +static void x87_set_control_word(unsigned short mode) { + asm volatile("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif VPX_ARCH_X86_64 +/* No fldcw intrinsics on Windows x64, punt to external asm */ +extern void vpx_winx64_fldcw(unsigned short mode); +extern unsigned short vpx_winx64_fstcw(void); +#define x87_set_control_word vpx_winx64_fldcw +#define x87_get_control_word vpx_winx64_fstcw +#else +static void x87_set_control_word(unsigned short mode) { + __asm { fldcw mode } +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm { fstcw mode } + return mode; +} +#endif + +static INLINE unsigned int x87_set_double_precision(void) { + unsigned int mode = x87_get_control_word(); + // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 + // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf + // 8.1.5.2 Precision Control Field + // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") + // determine the number of bits used in floating point calculations. To match + // later SSE instructions restrict x87 operations to Double Precision (0x200). + // Precision PC Field + // Single Precision (24-Bits) 00B + // Reserved 01B + // Double Precision (53-Bits) 10B + // Extended Precision (64-Bits) 11B + x87_set_control_word((mode & ~0x300u) | 0x200u); + return mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_X86_H_ diff --git a/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm new file mode 100644 index 0000000000..6b2d6b9684 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm @@ -0,0 +1,425 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_config.asm" + +; 32/64 bit compatibility macros +; +; In general, we make the source use 64 bit syntax, then twiddle with it using +; the preprocessor to get the 32 bit syntax on 32 bit platforms. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT +%define rax eax +%define rbx ebx +%define rcx ecx +%define rdx edx +%define rsi esi +%define rdi edi +%define rsp esp +%define rbp ebp +%define movsxd mov +%macro movq 2 + %ifidn %1,eax + movd %1,%2 + %elifidn %2,eax + movd %1,%2 + %elifidn %1,ebx + movd %1,%2 + %elifidn %2,ebx + movd %1,%2 + %elifidn %1,ecx + movd %1,%2 + %elifidn %2,ecx + movd %1,%2 + %elifidn %1,edx + movd %1,%2 + %elifidn %2,edx + movd %1,%2 + %elifidn %1,esi + movd %1,%2 + %elifidn %2,esi + movd %1,%2 + %elifidn %1,edi + movd %1,%2 + %elifidn %2,edi + movd %1,%2 + %elifidn %1,esp + movd %1,%2 + %elifidn %2,esp + movd %1,%2 + %elifidn %1,ebp + movd %1,%2 + %elifidn %2,ebp + movd %1,%2 + %else + movq %1,%2 + %endif +%endmacro +%endif + + +; LIBVPX_YASM_WIN64 +; Set LIBVPX_YASM_WIN64 if output is Windows 64bit so the code will work if x64 +; or win64 is defined on the Yasm command line. +%ifidn __OUTPUT_FORMAT__,win64 +%define LIBVPX_YASM_WIN64 1 +%elifidn __OUTPUT_FORMAT__,x64 +%define LIBVPX_YASM_WIN64 1 +%else +%define LIBVPX_YASM_WIN64 0 +%endif + +; Declare groups of platforms +%ifidn __OUTPUT_FORMAT__,elf32 + %define LIBVPX_ELF 1 +%elifidn __OUTPUT_FORMAT__,elfx32 + %define LIBVPX_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define LIBVPX_ELF 1 +%else + %define LIBVPX_ELF 0 +%endif + +%ifidn __OUTPUT_FORMAT__,macho32 + %define LIBVPX_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define LIBVPX_MACHO 1 +%else + %define LIBVPX_MACHO 0 +%endif + +; sym() +; Return the proper symbol name for the target ABI. +; +; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols +; with C linkage be prefixed with an underscore. +; +%if LIBVPX_ELF || LIBVPX_YASM_WIN64 + %define sym(x) x +%else + ; Mach-O / COFF + %define sym(x) _ %+ x +%endif + +; globalsym() +; Return a global declaration with the proper decoration for the target ABI. +; +; When CHROMIUM is defined, include attributes to hide the symbol from the +; global namespace. +; +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. +; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; or nasm > 2.14. +; +%ifdef CHROMIUM + %ifdef __NASM_VER__ + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + ; nasm < 2.14 does not support :private_extern directive + %fatal Must use nasm 2.14 or newer + %endif + %endif + + %if LIBVPX_ELF + %define globalsym(x) global sym(x) %+ :function hidden + %elif LIBVPX_MACHO + %define globalsym(x) global sym(x) %+ :private_extern + %else + ; COFF / PE32+ + %define globalsym(x) global sym(x) + %endif +%else + %define globalsym(x) global sym(x) +%endif + +; arg() +; Return the address specification of the given argument +; +%if ABI_IS_32BIT + %define arg(x) [ebp+8+4*x] +%else + ; 64 bit ABI passes arguments in registers. This is a workaround to get up + ; and running quickly. Relies on SHADOW_ARGS_TO_STACK + %if LIBVPX_YASM_WIN64 + %define arg(x) [rbp+16+8*x] + %else + %define arg(x) [rbp-8-8*x] + %endif +%endif + +; REG_SZ_BYTES, REG_SZ_BITS +; Size of a register +%if ABI_IS_32BIT +%define REG_SZ_BYTES 4 +%define REG_SZ_BITS 32 +%else +%define REG_SZ_BYTES 8 +%define REG_SZ_BITS 64 +%endif + + +; ALIGN_STACK +; This macro aligns the stack to the given alignment (in bytes). The stack +; is left such that the previous value of the stack pointer is the first +; argument on the stack (ie, the inverse of this macro is 'pop rsp.') +; This macro uses one temporary register, which is not preserved, and thus +; must be specified as an argument. +%macro ALIGN_STACK 2 + mov %2, rsp + and rsp, -%1 + lea rsp, [rsp - (%1 - REG_SZ_BYTES)] + push %2 +%endmacro + + +; +; The Microsoft assembler tries to impose a certain amount of type safety in +; its register usage. YASM doesn't recognize these directives, so we just +; %define them away to maintain as much compatibility as possible with the +; original inline assembler we're porting from. +; +%idefine PTR +%idefine XMMWORD +%idefine MMWORD + +; PIC macros +; +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %endif + %endif + + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,macho32 + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %ifidn __OUTPUT_FORMAT__,elf64 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,macho64 + %ifdef CHROMIUM + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%endif +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT +%define RESTORE_GOT +%endif +%ifndef WRT_PLT +%define WRT_PLT +%endif + +%if ABI_IS_32BIT + %macro SHADOW_ARGS_TO_STACK 1 + %endm + %define UNSHADOW_ARGS +%else +%if LIBVPX_YASM_WIN64 + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + mov arg(0),rcx + %endif + %if %1 > 1 + mov arg(1),rdx + %endif + %if %1 > 2 + mov arg(2),r8 + %endif + %if %1 > 3 + mov arg(3),r9 + %endif + %endm +%else + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + push rdi + %endif + %if %1 > 1 + push rsi + %endif + %if %1 > 2 + push rdx + %endif + %if %1 > 3 + push rcx + %endif + %if %1 > 4 + push r8 + %endif + %if %1 > 5 + push r9 + %endif + %if %1 > 6 + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] + push rax + %assign off off+8 + %endrep + %endif + %endm +%endif + %define UNSHADOW_ARGS mov rsp, rbp +%endif + +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. +; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; value. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. +%if LIBVPX_YASM_WIN64 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif +%endmacro +%macro RESTORE_XMM 0 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places. + ; otherwise, we could uncomment these: + ; %undef last_xmm + ; %undef xmm_stack_space + ; %undef movxmm + %endif +%endmacro +%else +%macro SAVE_XMM 1-2 +%endmacro +%macro RESTORE_XMM 0 +%endmacro +%endif + +; Name of the rodata section +; +; .rodata seems to be an elf-ism, as it doesn't work on OSX. +; +%ifidn __OUTPUT_FORMAT__,macho64 +%define SECTION_RODATA section .text +%elifidn __OUTPUT_FORMAT__,macho32 +%macro SECTION_RODATA 0 +section .text +%endmacro +%elifidn __OUTPUT_FORMAT__,aout +%define SECTION_RODATA section .data +%else +%define SECTION_RODATA section .rodata +%endif + + +; Tell GNU ld that we don't require an executable stack. +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elfx32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif + +; On Android platforms use lrand48 when building postproc routines. Prior to L +; rand() was not available. +%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1 +%ifdef __ANDROID__ +extern sym(lrand48) +%define LIBVPX_RAND lrand48 +%else +extern sym(rand) +%define LIBVPX_RAND rand +%endif +%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c new file mode 100644 index 0000000000..d8db4b3547 --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_scale_rtcd.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_mem/vpx_mem.h" +/**************************************************************************** + * Imports + ****************************************************************************/ + +/**************************************************************************** + * + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of dest (UNUSED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + unsigned i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + const unsigned char *src = source; + + (void)dest_width; + + for (i = 0; i < source_width; i += 5) { + a = src[0]; + b = src[1]; + c = src[2]; + d = src[3]; + e = src[4]; + + des[0] = (unsigned char)a; + des[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); + des[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); + des[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); + + src += 5; + des += 4; + } +} + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, + unsigned int src_pitch, unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width) { + unsigned int i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + unsigned char *src = source; + + for (i = 0; i < dest_width; i++) { + a = src[0 * src_pitch]; + b = src[1 * src_pitch]; + c = src[2 * src_pitch]; + d = src[3 * src_pitch]; + e = src[4 * src_pitch]; + + des[0 * dest_pitch] = (unsigned char)a; + des[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); + des[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); + des[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); + + src++; + des++; + } +} + +/*7*************************************************************************** + * + * ROUTINE : vp8_horizontal_line_3_5_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of dest (UNUSED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 3 to 5. + * + * SPECIAL NOTES : None. + * + * + ****************************************************************************/ +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + unsigned int i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + const unsigned char *src = source; + + (void)dest_width; + + for (i = 0; i < source_width; i += 5) { + a = src[0]; + b = src[1]; + c = src[2]; + d = src[3]; + e = src[4]; + + des[0] = (unsigned char)a; + des[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); + des[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); + + src += 5; + des += 3; + } +} + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, + unsigned int src_pitch, unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width) { + unsigned int i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + unsigned char *src = source; + + for (i = 0; i < dest_width; i++) { + a = src[0 * src_pitch]; + b = src[1 * src_pitch]; + c = src[2 * src_pitch]; + d = src[3 * src_pitch]; + e = src[4 * src_pitch]; + + des[0 * dest_pitch] = (unsigned char)a; + des[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); + des[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); + + src++; + des++; + } +} + +/**************************************************************************** + * + * ROUTINE : vp8_horizontal_line_1_2_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of dest (UNUSED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 1 to 2. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + unsigned int i; + unsigned int a; + unsigned char *des = dest; + const unsigned char *src = source; + + (void)dest_width; + + for (i = 0; i < source_width; i += 2) { + a = src[0]; + des[0] = (unsigned char)(a); + src += 2; + des += 1; + } +} + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, + unsigned int src_pitch, unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width) { + (void)dest_pitch; + (void)src_pitch; + memcpy(dest, source, dest_width); +} + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, + unsigned int src_pitch, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width) { + int i; + int temp; + int width = dest_width; + + (void)dest_pitch; + + for (i = 0; i < width; i++) { + temp = 8; + temp += source[i - (int)src_pitch] * 3; + temp += source[i] * 10; + temp += source[i + src_pitch] * 3; + temp >>= 4; + dest[i] = (unsigned char)(temp); + } +} diff --git a/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c new file mode 100644 index 0000000000..958bb320fc --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/**************************************************************************** + * + * Module Title : scale.c + * + * Description : Image scaling functions. + * + ***************************************************************************/ + +/**************************************************************************** + * Header Files + ****************************************************************************/ +#include "./vpx_scale_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" + +typedef struct { + int expanded_frame_width; + int expanded_frame_height; + + int HScale; + int HRatio; + int VScale; + int VRatio; + + YV12_BUFFER_CONFIG *src_yuv_config; + YV12_BUFFER_CONFIG *dst_yuv_config; + +} SCALE_VARS; + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_i + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in + * source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on in + * destination. + * unsigned int dest_scale : Scale for destination + * (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 interpolated scaling. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_2t1_i(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length) { + unsigned int i, j; + unsigned int temp; + int source_pitch = source_step; + (void)source_length; + (void)source_scale; + (void)dest_scale; + + source_step *= 2; + dest[0] = source[0]; + + for (i = dest_step, j = source_step; i < dest_length * dest_step; + i += dest_step, j += source_step) { + temp = 8; + temp += 3 * source[j - source_pitch]; + temp += 10 * source[j]; + temp += 3 * source[j + source_pitch]; + temp >>= 4; + dest[i] = (char)(temp); + } +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_ps + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in + * source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on in + * destination. + * unsigned int dest_scale : Scale for destination + * (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 point subsampled scaling. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_2t1_ps(const unsigned char *source, int source_step, + unsigned int source_scale, + unsigned int source_length, unsigned char *dest, + int dest_step, unsigned int dest_scale, + unsigned int dest_length) { + unsigned int i, j; + + (void)source_length; + (void)source_scale; + (void)dest_scale; + + source_step *= 2; + j = 0; + + for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step) + dest[i] = source[j]; +} +/**************************************************************************** + * + * ROUTINE : scale1d_c + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in + * source. + * unsigned int source_scale : Scale for source. + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on in + * destination. + * unsigned int dest_scale : Scale for destination. + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs linear interpolation in one dimension. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_c(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length) { + unsigned int i; + unsigned int round_value = dest_scale / 2; + unsigned int left_modifier = dest_scale; + unsigned int right_modifier = 0; + unsigned char left_pixel = *source; + unsigned char right_pixel = *(source + source_step); + + (void)source_length; + + /* These asserts are needed if there are boundary issues... */ + /*assert ( dest_scale > source_scale );*/ + /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale + * );*/ + + for (i = 0; i < dest_length * dest_step; i += dest_step) { + dest[i] = (char)((left_modifier * left_pixel + + right_modifier * right_pixel + round_value) / + dest_scale); + + right_modifier += source_scale; + + while (right_modifier > dest_scale) { + right_modifier -= dest_scale; + source += source_step; + left_pixel = *source; + right_pixel = *(source + source_step); + } + + left_modifier = dest_scale - right_modifier; + } +} + +/**************************************************************************** + * + * ROUTINE : Scale2D + * + * INPUTS : const unsigned char *source : Pointer to data to be + * scaled. + * int source_pitch : Stride of source image. + * unsigned int source_width : Width of input image. + * unsigned int source_height : Height of input image. + * unsigned char *dest : Pointer to output data + * array. + * int dest_pitch : Stride of destination + * image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination + * image. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. + * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +static void Scale2D( + /*const*/ + unsigned char *source, int source_pitch, unsigned int source_width, + unsigned int source_height, unsigned char *dest, int dest_pitch, + unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area, + unsigned char temp_area_height, unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, unsigned int interlaced) { + /*unsigned*/ + int i, j, k; + int bands; + int dest_band_height; + int source_band_height; + + typedef void (*Scale1D)(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, + unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, + unsigned int, unsigned int) = NULL; + + int ratio_scalable = 1; + int interpolation = 0; + + unsigned char *source_base; + unsigned char *line_src; + + source_base = (unsigned char *)source; + + if (source_pitch < 0) { + int offset; + + offset = (source_height - 1); + offset *= source_pitch; + + source_base += offset; + } + + /* find out the ratio for each direction */ + switch (hratio * 10 / hscale) { + case 8: + /* 4-5 Scale in Width direction */ + horiz_line_scale = vp8_horizontal_line_5_4_scale; + break; + case 6: + /* 3-5 Scale in Width direction */ + horiz_line_scale = vp8_horizontal_line_5_3_scale; + break; + case 5: + /* 1-2 Scale in Width direction */ + horiz_line_scale = vp8_horizontal_line_2_1_scale; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + switch (vratio * 10 / vscale) { + case 8: + /* 4-5 Scale in vertical direction */ + vert_band_scale = vp8_vertical_band_5_4_scale; + source_band_height = 5; + dest_band_height = 4; + break; + case 6: + /* 3-5 Scale in vertical direction */ + vert_band_scale = vp8_vertical_band_5_3_scale; + source_band_height = 5; + dest_band_height = 3; + break; + case 5: + /* 1-2 Scale in vertical direction */ + + if (interlaced) { + /* if the content is interlaced, point sampling is used */ + vert_band_scale = vp8_vertical_band_2_1_scale; + } else { + interpolation = 1; + /* if the content is progressive, interplo */ + vert_band_scale = vp8_vertical_band_2_1_scale_i; + } + + source_band_height = 2; + dest_band_height = 1; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + if (ratio_scalable) { + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < (int)dest_height; k++) { + horiz_line_scale(source, source_width, dest, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (interpolation) { + if (source < source_base) source = source_base; + + horiz_line_scale(source, source_width, temp_area, dest_width); + } + + for (k = 0; + k < (int)(dest_height + dest_band_height - 1) / dest_band_height; + k++) { + /* scale one band horizontally */ + for (i = 0; i < source_band_height; i++) { + /* Trap case where we could read off the base of the source buffer */ + + line_src = (unsigned char *)source + i * source_pitch; + + if (line_src < source_base) line_src = source_base; + + horiz_line_scale(line_src, source_width, + temp_area + (i + 1) * dest_pitch, dest_width); + } + + /* Vertical scaling is in place */ + vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, + dest_width); + + if (interpolation) + memcpy(temp_area, temp_area + source_band_height * dest_pitch, + dest_width); + + /* Next band... */ + source += (unsigned long)source_band_height * source_pitch; + dest += (unsigned long)dest_band_height * dest_pitch; + } + + return; + } + + if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps; + + if (vscale == 2 && vratio == 1) { + if (interlaced) + Scale1Dv = scale1d_2t1_ps; + else + Scale1Dv = scale1d_2t1_i; + } + + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < (int)dest_height; k++) { + Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, + dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (dest_height > source_height) { + dest_band_height = temp_area_height - 1; + source_band_height = dest_band_height * source_height / dest_height; + } else { + source_band_height = temp_area_height - 1; + dest_band_height = source_band_height * vratio / vscale; + } + + /* first row needs to be done so that we can stay one row ahead for vertical + * zoom */ + Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, + dest_width); + + /* for each band of the image */ + bands = (dest_height + dest_band_height - 1) / dest_band_height; + + for (k = 0; k < bands; k++) { + /* scale one band horizontally */ + for (i = 1; i < source_band_height + 1; i++) { + if (k * source_band_height + i < (int)source_height) { + Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1, + temp_area + i * dest_pitch, 1, hratio, dest_width); + } else { /* Duplicate the last row */ + /* copy temp_area row 0 over from last row in the past */ + memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch, + dest_pitch); + } + } + + /* scale one band vertically */ + for (j = 0; j < (int)dest_width; j++) { + Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1, + &dest[j], dest_pitch, vratio, dest_band_height); + } + + /* copy temp_area row 0 over from last row in the past */ + memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch); + + /* move to the next band */ + source += source_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : vpx_scale_frame + * + * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be + * scaled. + * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold + * scaled frame. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. + * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced) { + int i; + int dw = (hscale - 1 + src->y_width * hratio) / hscale; + int dh = (vscale - 1 + src->y_height * vratio) / vscale; + + /* call our internal scaling routines!! */ + Scale2D((unsigned char *)src->y_buffer, src->y_stride, src->y_width, + src->y_height, (unsigned char *)dst->y_buffer, dst->y_stride, dw, dh, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw < (int)dst->y_width) + for (i = 0; i < dh; i++) + memset(dst->y_buffer + i * dst->y_stride + dw - 1, + dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1); + + if (dh < (int)dst->y_height) + for (i = dh - 1; i < (int)dst->y_height; i++) + memcpy(dst->y_buffer + i * dst->y_stride, + dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1); + + Scale2D((unsigned char *)src->u_buffer, src->uv_stride, src->uv_width, + src->uv_height, (unsigned char *)dst->u_buffer, dst->uv_stride, + dw / 2, dh / 2, temp_area, temp_height, hscale, hratio, vscale, + vratio, interlaced); + + if (dw / 2 < (int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, + dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], + dst->uv_width - dw / 2 + 1); + + if (dh / 2 < (int)dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) + memcpy(dst->u_buffer + i * dst->uv_stride, + dst->u_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width); + + Scale2D((unsigned char *)src->v_buffer, src->uv_stride, src->uv_width, + src->uv_height, (unsigned char *)dst->v_buffer, dst->uv_stride, + dw / 2, dh / 2, temp_area, temp_height, hscale, hratio, vscale, + vratio, interlaced); + + if (dw / 2 < (int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, + dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], + dst->uv_width - dw / 2 + 1); + + if (dh / 2 < (int)dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) + memcpy(dst->v_buffer + i * dst->uv_stride, + dst->v_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width); +} diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12config.c b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c new file mode 100644 index 0000000000..c52dab0588 --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#if defined(VPX_MAX_ALLOCABLE_MEMORY) +#include "vp9/common/vp9_onyxc_int.h" +#endif // VPX_MAX_ALLOCABLE_MEMORY +/**************************************************************************** + * Exports + ****************************************************************************/ + +/**************************************************************************** + * + ****************************************************************************/ +#define yv12_align_addr(addr, align) \ + (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align)) + +int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf) { + // If libvpx is using frame buffer callbacks then buffer_alloc_sz must + // not be set. + if (ybf->buffer_alloc_sz > 0) { + vpx_free(ybf->buffer_alloc); + } + + /* buffer_alloc isn't accessed by most functions. Rather y_buffer, + u_buffer and v_buffer point to buffer_alloc and are used. Clear out + all of this so that a freed pointer isn't inadvertently used */ + memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG)); + } else { + return -1; + } + + return 0; +} + +int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, + int height, int border) { + if (ybf) { + int aligned_width = (width + 15) & ~15; + int aligned_height = (height + 15) & ~15; + int y_stride = ((aligned_width + 2 * border) + 31) & ~31; + int yplane_size = (aligned_height + 2 * border) * y_stride; + int uv_width = aligned_width >> 1; + int uv_height = aligned_height >> 1; + /** There is currently a bunch of code which assumes + * uv_stride == y_stride/2, so enforce this here. */ + int uv_stride = y_stride >> 1; + int uvplane_size = (uv_height + border) * uv_stride; + const size_t frame_size = yplane_size + 2 * uvplane_size; + + if (!ybf->buffer_alloc) { + ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); + if (!ybf->buffer_alloc) { + ybf->buffer_alloc_sz = 0; + return -1; + } +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, frame_size); +#endif +#endif + ybf->buffer_alloc_sz = frame_size; + } + + if (ybf->buffer_alloc_sz < frame_size) return -1; + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + 1) / 2; + ybf->uv_crop_height = (height + 1) / 2; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->alpha_width = 0; + ybf->alpha_height = 0; + ybf->alpha_stride = 0; + + ybf->border = border; + ybf->frame_size = frame_size; + + ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; + ybf->u_buffer = + ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + + (border / 2 * uv_stride) + border / 2; + ybf->alpha_buffer = NULL; + + ybf->corrupted = 0; /* assume not currupted by errors */ + return 0; + } + return -2; +} + +int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int border) { + if (ybf) { + vp8_yv12_de_alloc_frame_buffer(ybf); + return vp8_yv12_realloc_frame_buffer(ybf, width, height, border); + } + return -2; +} + +#if CONFIG_VP9 +// TODO(jkoleszar): Maybe replace this with struct vpx_image + +int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf) { + if (ybf->buffer_alloc_sz > 0) { + vpx_free(ybf->buffer_alloc); + } + + /* buffer_alloc isn't accessed by most functions. Rather y_buffer, + u_buffer and v_buffer point to buffer_alloc and are used. Clear out + all of this so that a freed pointer isn't inadvertently used */ + memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG)); + } else { + return -1; + } + + return 0; +} + +int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, int byte_alignment, + vpx_codec_frame_buffer_t *fb, + vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; + + if (ybf) { + const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int y_stride = ((aligned_width + 2 * border) + 31) & ~31; + const uint64_t yplane_size = + (aligned_height + 2 * border) * (uint64_t)y_stride + byte_alignment; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_stride = y_stride >> ss_x; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + const uint64_t uvplane_size = + (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment; + +#if CONFIG_VP9_HIGHBITDEPTH + const uint64_t frame_size = + (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); +#else + const uint64_t frame_size = yplane_size + 2 * uvplane_size; +#endif // CONFIG_VP9_HIGHBITDEPTH + + uint8_t *buf = NULL; + +#if defined(VPX_MAX_ALLOCABLE_MEMORY) + // The decoder may allocate REF_FRAMES frame buffers in the frame buffer + // pool. Bound the total amount of allocated memory as if these REF_FRAMES + // frame buffers were allocated in a single allocation. + if (frame_size > VPX_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1; +#endif // VPX_MAX_ALLOCABLE_MEMORY + + // frame_size is stored in buffer_alloc_sz, which is a size_t. If it won't + // fit, fail early. + if (frame_size > SIZE_MAX) { + return -1; + } + + if (cb != NULL) { + const int align_addr_extra_size = 31; + const uint64_t external_frame_size = frame_size + align_addr_extra_size; + + assert(fb != NULL); + + if (external_frame_size != (size_t)external_frame_size) return -1; + + // Allocation to hold larger frame, or first allocation. + if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) return -1; + + if (fb->data == NULL || fb->size < external_frame_size) return -1; + + ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32); + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, (size_t)frame_size); +#endif +#endif + } else if (frame_size > ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. + vpx_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; + + ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) return -1; + + ybf->buffer_alloc_sz = (size_t)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->border = border; + ybf->frame_size = (size_t)frame_size; + ybf->subsampling_x = ss_x; + ybf->subsampling_y = ss_y; + + buf = ybf->buffer_alloc; +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->flags = 0; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + ybf->y_buffer = (uint8_t *)yv12_align_addr( + buf + (border * y_stride) + border, vp9_byte_align); + ybf->u_buffer = (uint8_t *)yv12_align_addr( + buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, + vp9_byte_align); + ybf->v_buffer = + (uint8_t *)yv12_align_addr(buf + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w, + vp9_byte_align); + + ybf->corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + return -2; +} + +int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, int byte_alignment) { + if (ybf) { + vpx_free_frame_buffer(ybf); + return vpx_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, byte_alignment, NULL, NULL, NULL); + } + return -2; +} +#endif diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c new file mode 100644 index 0000000000..e231806505 --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" +#if CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif + +static void extend_plane(uint8_t *const src, int src_stride, int width, + int height, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i; + const int linesize = extend_left + extend_right + width; + + /* copy the left and right most columns out */ + uint8_t *src_ptr1 = src; + uint8_t *src_ptr2 = src + width - 1; + uint8_t *dst_ptr1 = src - extend_left; + uint8_t *dst_ptr2 = src + width; + + for (i = 0; i < height; ++i) { + memset(dst_ptr1, src_ptr1[0], extend_left); + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += src_stride; + } + + for (i = 0; i < extend_bottom; ++i) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += src_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void extend_plane_high(uint8_t *const src8, int src_stride, int width, + int height, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i; + const int linesize = extend_left + extend_right + width; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + /* copy the left and right most columns out */ + uint16_t *src_ptr1 = src; + uint16_t *src_ptr2 = src + width - 1; + uint16_t *dst_ptr1 = src - extend_left; + uint16_t *dst_ptr2 = src + width; + + for (i = 0; i < height; ++i) { + vpx_memset16(dst_ptr1, src_ptr1[0], extend_left); + vpx_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); + dst_ptr1 += src_stride; + } + + for (i = 0; i < extend_bottom; ++i) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); + dst_ptr2 += src_stride; + } +} +#endif + +void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int uv_border = ybf->border / 2; + + assert(ybf->border % 2 == 0); + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ybf->border, ybf->border, + ybf->border + ybf->y_height - ybf->y_crop_height, + ybf->border + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, + ybf->uv_crop_height, uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); + + extend_plane(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, + ybf->uv_crop_height, uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); +} + +#if CONFIG_VP9 +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) { + const int c_w = ybf->uv_crop_width; + const int c_h = ybf->uv_crop_height; + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + const int c_et = ext_size >> ss_y; + const int c_el = ext_size >> ss_x; + const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height; + const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + extend_plane_high(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, + c_er); + extend_plane_high(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, + c_er); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); + + extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); +} + +void vpx_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + extend_frame(ybf, ybf->border); +} + +void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) + ? VP9INNERBORDERINPIXELS + : ybf->border; + extend_frame(ybf, inner_bw); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9 + +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. + +void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vpx_extend_frame_borders_c(dst_ybc); + return; + } else { + assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vpx_extend_frame_borders_c(dst_ybc); +} +#endif // CONFIG_VP9 + +void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} diff --git a/media/libvpx/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/media/libvpx/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c new file mode 100644 index 0000000000..d3d1b07f45 --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" + +#if HAVE_DSPR2 +static void extend_plane(uint8_t *const src, int src_stride, int width, + int height, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, j; + uint8_t *left_src, *right_src; + uint8_t *left_dst_start, *right_dst_start; + uint8_t *left_dst, *right_dst; + uint8_t *top_src, *bot_src; + uint8_t *top_dst, *bot_dst; + uint32_t left_pix; + uint32_t right_pix; + uint32_t linesize; + + /* copy the left and right most columns out */ + left_src = src; + right_src = src + width - 1; + left_dst_start = src - extend_left; + right_dst_start = src + width; + + for (i = height; i--;) { + left_dst = left_dst_start; + right_dst = right_dst_start; + + __asm__ __volatile__( + "lb %[left_pix], 0(%[left_src]) \n\t" + "lb %[right_pix], 0(%[right_src]) \n\t" + "replv.qb %[left_pix], %[left_pix] \n\t" + "replv.qb %[right_pix], %[right_pix] \n\t" + + : [left_pix] "=&r"(left_pix), [right_pix] "=&r"(right_pix) + : [left_src] "r"(left_src), [right_src] "r"(right_src)); + + for (j = extend_left / 4; j--;) { + __asm__ __volatile__( + "sw %[left_pix], 0(%[left_dst]) \n\t" + "sw %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix), + [right_dst] "r"(right_dst), [right_pix] "r"(right_pix)); + + left_dst += 4; + right_dst += 4; + } + + for (j = extend_left % 4; j--;) { + __asm__ __volatile__( + "sb %[left_pix], 0(%[left_dst]) \n\t" + "sb %[right_pix], 0(%[right_dst]) \n\t" + + : + : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix), + [right_dst] "r"(right_dst), [right_pix] "r"(right_pix)); + + left_dst += 1; + right_dst += 1; + } + + left_src += src_stride; + right_src += src_stride; + left_dst_start += src_stride; + right_dst_start += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + top_src = src - extend_left; + bot_src = src + src_stride * (height - 1) - extend_left; + top_dst = src + src_stride * (-extend_top) - extend_left; + bot_dst = src + src_stride * (height)-extend_left; + linesize = extend_left + extend_right + width; + + for (i = 0; i < extend_top; i++) { + memcpy(top_dst, top_src, linesize); + top_dst += src_stride; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(bot_dst, bot_src, linesize); + bot_dst += src_stride; + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) { + const int c_w = ybf->uv_crop_width; + const int c_h = ybf->uv_crop_height; + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + const int c_et = ext_size >> ss_y; + const int c_el = ext_size >> ss_x; + const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height; + const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); + + extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); +} + +void vpx_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf) { + extend_frame(ybf, ybf->border); +} + +void vpx_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf) { + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) + ? VP9INNERBORDERINPIXELS + : ybf->border; + extend_frame(ybf, inner_bw); +} +#endif diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale.h b/media/libvpx/libvpx/vpx_scale/vpx_scale.h new file mode 100644 index 0000000000..fd5ba7ccdc --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_SCALE_VPX_SCALE_H_ +#define VPX_VPX_SCALE_VPX_SCALE_H_ + +#include "vpx_scale/yv12config.h" + +extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced); + +#endif // VPX_VPX_SCALE_VPX_SCALE_H_ diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale.mk b/media/libvpx/libvpx/vpx_scale/vpx_scale.mk new file mode 100644 index 0000000000..a49abf3b4b --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale.mk @@ -0,0 +1,16 @@ +SCALE_SRCS-yes += vpx_scale.mk +SCALE_SRCS-yes += yv12config.h +SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += vpx_scale.h +SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/vpx_scale.c +SCALE_SRCS-yes += generic/yv12config.c +SCALE_SRCS-yes += generic/yv12extend.c +SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c +SCALE_SRCS-yes += vpx_scale_rtcd.c +SCALE_SRCS-yes += vpx_scale_rtcd.pl + +#mips(dspr2) +SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c + +SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes) + +$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl)) diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c new file mode 100644 index 0000000000..dc4d9593a8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vpx_scale_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vpx_scale_rtcd() { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl new file mode 100644 index 0000000000..1281071a7d --- /dev/null +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -0,0 +1,44 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +sub vpx_scale_forward_decls() { +print < +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if defined(__GNUC__) +#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) +#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_GCC_VERSION 0 +#define LOCAL_GCC_PREREQ(maj, min) 0 +#endif + +// handle clang compatibility +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + +#if defined(WORDS_BIGENDIAN) +#define HToLE32 BSwap32 +#define HToLE16 BSwap16 +#define HToBE64(x) (x) +#define HToBE32(x) (x) +#else +#define HToLE32(x) (x) +#define HToLE16(x) (x) +#define HToBE64(X) BSwap64(X) +#define HToBE32(X) BSwap32(X) +#endif + +#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#define HAVE_BUILTIN_BSWAP16 +#endif + +#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) +#define HAVE_BUILTIN_BSWAP32 +#endif + +#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) +#define HAVE_BUILTIN_BSWAP64 +#endif + +#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \ + defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6) +#define VPX_USE_MIPS32_R2 +#endif + +static INLINE uint16_t BSwap16(uint16_t x) { +#if defined(HAVE_BUILTIN_BSWAP16) + return __builtin_bswap16(x); +#elif defined(_MSC_VER) + return _byteswap_ushort(x); +#else + // gcc will recognize a 'rorw $8, ...' here: + return (x >> 8) | ((x & 0xff) << 8); +#endif // HAVE_BUILTIN_BSWAP16 +} + +static INLINE uint32_t BSwap32(uint32_t x) { +#if defined(VPX_USE_MIPS32_R2) + uint32_t ret; + __asm__ volatile( + "wsbh %[ret], %[x] \n\t" + "rotr %[ret], %[ret], 16 \n\t" + : [ret] "=r"(ret) + : [x] "r"(x)); + return ret; +#elif defined(HAVE_BUILTIN_BSWAP32) + return __builtin_bswap32(x); +#elif defined(__i386__) || defined(__x86_64__) + uint32_t swapped_bytes; + __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint32_t)_byteswap_ulong(x); +#else + return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24); +#endif // HAVE_BUILTIN_BSWAP32 +} + +static INLINE uint64_t BSwap64(uint64_t x) { +#if defined(HAVE_BUILTIN_BSWAP64) + return __builtin_bswap64(x); +#elif defined(__x86_64__) + uint64_t swapped_bytes; + __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint64_t)_byteswap_uint64(x); +#else // generic code for swapping 64-bit values (suggested by bdb@) + x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32); + x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16); + x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8); + return x; +#endif // HAVE_BUILTIN_BSWAP64 +} + +#endif // VPX_VPX_UTIL_ENDIAN_INL_H_ diff --git a/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h new file mode 100644 index 0000000000..b8b9e6db02 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h @@ -0,0 +1,2090 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ +#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ + +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Contributed by Shiyou Yin + * Xiwei Gu + * Lu Wang + * + * This file is a header file for loongarch builtin extension. + * + */ + +#ifndef LOONGSON_INTRINSICS_H +#define LOONGSON_INTRINSICS_H + +/** + * MAJOR version: Macro usage changes. + * MINOR version: Add new functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ +#define LSOM_VERSION_MAJOR 1 +#define LSOM_VERSION_MINOR 2 +#define LSOM_VERSION_MICRO 1 + +#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0); \ + _OUT1 = _INS(_IN1); \ + } + +#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1); \ + _OUT1 = _INS(_IN2, _IN3); \ + } + +#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1, _IN2); \ + _OUT1 = _INS(_IN3, _IN4, _IN5); \ + } + +#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ + DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ + } + +#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ + _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ + DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ + } + +#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ + _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ + DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ + } + +#ifdef __loongarch_sx +#include +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. Then + * the results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) + * in_c : 1,1,1,1, 1,1,1,1 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 + * out : -4,-24,-60,-112, 6,26,62,114 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of half-word vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - __m128i + * Details : Signed half-word elements from in_h are multiplied by + * signed half-word elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Then the results are added to signed word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_b(in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_bu(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu(in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 + * out : 22,38,38,22, 22,38,38,6 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu_b(in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_w_h(in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - double + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_d_w(in_h, in_l) + * in_h : 1,2,3,4 + * in_l : 8,7,6,5 + * out : 22,38 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_d_w(in_h, in_l); + out = __lsx_vmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : + * (_in)) + * Arguments : Inputs - _in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lsx_vclip_h(_in) + * _in : -8,2,280,249, -8,255,280,249 + * min : 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9 + * ============================================================================= + */ +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { + __m128i out; + + out = __lsx_vmax_h(min, _in); + out = __lsx_vmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_h(_in) + * _in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_h(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_h(_in, 0); + out = __lsx_vsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - word + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_w(_in) + * _in : -8,255,280,249 + * out : 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_w(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_w(_in, 0); + out = __lsx_vsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Swap two variables + * Arguments : Inputs - _in0, _in1 + * Outputs - _in0, _in1 (in-place) + * Details : Swapping of two input variables using xor + * Example : LSX_SWAP(_in0, _in1) + * _in0 : 1,2,3,4 + * _in1 : 5,6,7,8 + * _in0(out) : 5,6,7,8 + * _in1(out) : 1,2,3,4 + * ============================================================================= + */ +#define LSX_SWAP(_in0, _in1) \ + { \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + _in1 = __lsx_vxor_v(_in0, _in1); \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + * ============================================================================= + */ +#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _t0, _t1, _t2, _t3; \ + \ + _t0 = __lsx_vilvl_w(_in1, _in0); \ + _t1 = __lsx_vilvh_w(_in1, _in0); \ + _t2 = __lsx_vilvl_w(_in3, _in2); \ + _t3 = __lsx_vilvh_w(_in3, _in2); \ + _out0 = __lsx_vilvl_d(_t2, _t0); \ + _out1 = __lsx_vilvh_d(_t2, _t0); \ + _out2 = __lsx_vilvl_d(_t3, _t1); \ + _out3 = __lsx_vilvh_d(_t3, _t1); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with byte elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Details : The rows of the matrix become columns, and the columns + * become rows. + * Example : LSX_TRANSPOSE8x8_B + * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 + * + * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 + * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 + * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 + * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i zero = { 0 }; \ + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _t0 = __lsx_vilvl_b(_in2, _in0); \ + _t1 = __lsx_vilvl_b(_in3, _in1); \ + _t2 = __lsx_vilvl_b(_in6, _in4); \ + _t3 = __lsx_vilvl_b(_in7, _in5); \ + _t4 = __lsx_vilvl_b(_t1, _t0); \ + _t5 = __lsx_vilvh_b(_t1, _t0); \ + _t6 = __lsx_vilvl_b(_t3, _t2); \ + _t7 = __lsx_vilvh_b(_t3, _t2); \ + _out0 = __lsx_vilvl_w(_t6, _t4); \ + _out2 = __lsx_vilvh_w(_t6, _t4); \ + _out4 = __lsx_vilvl_w(_t7, _t5); \ + _out6 = __lsx_vilvh_w(_t7, _t5); \ + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 + * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 + * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 + * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 + * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 + * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 + * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 + * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _s0 = __lsx_vilvl_h(_in6, _in4); \ + _s1 = __lsx_vilvl_h(_in7, _in5); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in6, _in4); \ + _s1 = __lsx_vilvh_h(_in7, _in5); \ + _t2 = __lsx_vilvl_h(_s1, _s0); \ + _t3 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvl_h(_in2, _in0); \ + _s1 = __lsx_vilvl_h(_in3, _in1); \ + _t4 = __lsx_vilvl_h(_s1, _s0); \ + _t5 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in2, _in0); \ + _s1 = __lsx_vilvh_h(_in3, _in1); \ + _t6 = __lsx_vilvl_h(_s1, _s0); \ + _t7 = __lsx_vilvh_h(_s1, _s0); \ + \ + _out0 = __lsx_vpickev_d(_t0, _t4); \ + _out2 = __lsx_vpickev_d(_t1, _t5); \ + _out4 = __lsx_vpickev_d(_t2, _t6); \ + _out6 = __lsx_vpickev_d(_t3, _t7); \ + _out1 = __lsx_vpickod_d(_t0, _t4); \ + _out3 = __lsx_vpickod_d(_t1, _t5); \ + _out5 = __lsx_vpickod_d(_t2, _t6); \ + _out7 = __lsx_vpickod_d(_t3, _t7); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x4 byte block into 4x8 + * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) + * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) + * Return Type - as per RTYPE + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LSX_TRANSPOSE8x4_B + * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 + * + * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + \ + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ + \ + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ + \ + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ + _out1 = __lsx_vilvh_d(_out2, _out0); \ + _out3 = __lsx_vilvh_d(_out0, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 + * in9, in10, in11, in12, in13, in14, in15 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 000,001,002,003,004,005,006,007 + * 008,009,010,011,012,013,014,015 + * 016,017,018,019,020,021,022,023 + * 024,025,026,027,028,029,030,031 + * 032,033,034,035,036,037,038,039 + * 040,041,042,043,044,045,046,047 000,008,...,112,120 + * 048,049,050,051,052,053,054,055 001,009,...,113,121 + * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 + * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 + * 072,073,074,075,076,077,078,079 004,012,...,116,124 + * 080,081,082,083,084,085,086,087 005,013,...,117,125 + * 088,089,090,091,092,093,094,095 006,014,...,118,126 + * 096,097,098,099,100,101,102,103 007,015,...,119,127 + * 104,105,106,107,108,109,110,111 + * 112,113,114,115,116,117,118,119 + * 120,121,122,123,124,125,126,127 + * ============================================================================= + */ +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ + _tmp0, _tmp1, _tmp2, _tmp3); \ + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + * Example : + * out0 = in0 + in3; + * out1 = in1 + in2; + * out2 = in1 - in2; + * out3 = in0 - in3; + * ============================================================================= + */ +#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in3); \ + _out1 = __lsx_vadd_b(_in1, _in2); \ + _out2 = __lsx_vsub_b(_in1, _in2); \ + _out3 = __lsx_vsub_b(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in3); \ + _out1 = __lsx_vadd_h(_in1, _in2); \ + _out2 = __lsx_vsub_h(_in1, _in2); \ + _out3 = __lsx_vsub_h(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in3); \ + _out1 = __lsx_vadd_w(_in1, _in2); \ + _out2 = __lsx_vsub_w(_in1, _in2); \ + _out3 = __lsx_vsub_w(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in3); \ + _out1 = __lsx_vadd_d(_in1, _in2); \ + _out2 = __lsx_vsub_d(_in1, _in2); \ + _out3 = __lsx_vsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in7); \ + _out1 = __lsx_vadd_b(_in1, _in6); \ + _out2 = __lsx_vadd_b(_in2, _in5); \ + _out3 = __lsx_vadd_b(_in3, _in4); \ + _out4 = __lsx_vsub_b(_in3, _in4); \ + _out5 = __lsx_vsub_b(_in2, _in5); \ + _out6 = __lsx_vsub_b(_in1, _in6); \ + _out7 = __lsx_vsub_b(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in7); \ + _out1 = __lsx_vadd_h(_in1, _in6); \ + _out2 = __lsx_vadd_h(_in2, _in5); \ + _out3 = __lsx_vadd_h(_in3, _in4); \ + _out4 = __lsx_vsub_h(_in3, _in4); \ + _out5 = __lsx_vsub_h(_in2, _in5); \ + _out6 = __lsx_vsub_h(_in1, _in6); \ + _out7 = __lsx_vsub_h(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in7); \ + _out1 = __lsx_vadd_w(_in1, _in6); \ + _out2 = __lsx_vadd_w(_in2, _in5); \ + _out3 = __lsx_vadd_w(_in3, _in4); \ + _out4 = __lsx_vsub_w(_in3, _in4); \ + _out5 = __lsx_vsub_w(_in2, _in5); \ + _out6 = __lsx_vsub_w(_in1, _in6); \ + _out7 = __lsx_vsub_w(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in7); \ + _out1 = __lsx_vadd_d(_in1, _in6); \ + _out2 = __lsx_vadd_d(_in2, _in5); \ + _out3 = __lsx_vadd_d(_in3, _in4); \ + _out4 = __lsx_vsub_d(_in3, _in4); \ + _out5 = __lsx_vsub_d(_in2, _in5); \ + _out6 = __lsx_vsub_d(_in1, _in6); \ + _out7 = __lsx_vsub_d(_in0, _in7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 16 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in15; + * _out1 = _in1 + _in14; + * _out2 = _in2 + _in13; + * _out3 = _in3 + _in12; + * _out4 = _in4 + _in11; + * _out5 = _in5 + _in10; + * _out6 = _in6 + _in9; + * _out7 = _in7 + _in8; + * _out8 = _in7 - _in8; + * _out9 = _in6 - _in9; + * _out10 = _in5 - _in10; + * _out11 = _in4 - _in11; + * _out12 = _in3 - _in12; + * _out13 = _in2 - _in13; + * _out14 = _in1 - _in14; + * _out15 = _in0 - _in15; + * ============================================================================= + */ + +#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in15); \ + _out1 = __lsx_vadd_b(_in1, _in14); \ + _out2 = __lsx_vadd_b(_in2, _in13); \ + _out3 = __lsx_vadd_b(_in3, _in12); \ + _out4 = __lsx_vadd_b(_in4, _in11); \ + _out5 = __lsx_vadd_b(_in5, _in10); \ + _out6 = __lsx_vadd_b(_in6, _in9); \ + _out7 = __lsx_vadd_b(_in7, _in8); \ + \ + _out8 = __lsx_vsub_b(_in7, _in8); \ + _out9 = __lsx_vsub_b(_in6, _in9); \ + _out10 = __lsx_vsub_b(_in5, _in10); \ + _out11 = __lsx_vsub_b(_in4, _in11); \ + _out12 = __lsx_vsub_b(_in3, _in12); \ + _out13 = __lsx_vsub_b(_in2, _in13); \ + _out14 = __lsx_vsub_b(_in1, _in14); \ + _out15 = __lsx_vsub_b(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in15); \ + _out1 = __lsx_vadd_h(_in1, _in14); \ + _out2 = __lsx_vadd_h(_in2, _in13); \ + _out3 = __lsx_vadd_h(_in3, _in12); \ + _out4 = __lsx_vadd_h(_in4, _in11); \ + _out5 = __lsx_vadd_h(_in5, _in10); \ + _out6 = __lsx_vadd_h(_in6, _in9); \ + _out7 = __lsx_vadd_h(_in7, _in8); \ + \ + _out8 = __lsx_vsub_h(_in7, _in8); \ + _out9 = __lsx_vsub_h(_in6, _in9); \ + _out10 = __lsx_vsub_h(_in5, _in10); \ + _out11 = __lsx_vsub_h(_in4, _in11); \ + _out12 = __lsx_vsub_h(_in3, _in12); \ + _out13 = __lsx_vsub_h(_in2, _in13); \ + _out14 = __lsx_vsub_h(_in1, _in14); \ + _out15 = __lsx_vsub_h(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in15); \ + _out1 = __lsx_vadd_w(_in1, _in14); \ + _out2 = __lsx_vadd_w(_in2, _in13); \ + _out3 = __lsx_vadd_w(_in3, _in12); \ + _out4 = __lsx_vadd_w(_in4, _in11); \ + _out5 = __lsx_vadd_w(_in5, _in10); \ + _out6 = __lsx_vadd_w(_in6, _in9); \ + _out7 = __lsx_vadd_w(_in7, _in8); \ + \ + _out8 = __lsx_vsub_w(_in7, _in8); \ + _out9 = __lsx_vsub_w(_in6, _in9); \ + _out10 = __lsx_vsub_w(_in5, _in10); \ + _out11 = __lsx_vsub_w(_in4, _in11); \ + _out12 = __lsx_vsub_w(_in3, _in12); \ + _out13 = __lsx_vsub_w(_in2, _in13); \ + _out14 = __lsx_vsub_w(_in1, _in14); \ + _out15 = __lsx_vsub_w(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in15); \ + _out1 = __lsx_vadd_d(_in1, _in14); \ + _out2 = __lsx_vadd_d(_in2, _in13); \ + _out3 = __lsx_vadd_d(_in3, _in12); \ + _out4 = __lsx_vadd_d(_in4, _in11); \ + _out5 = __lsx_vadd_d(_in5, _in10); \ + _out6 = __lsx_vadd_d(_in6, _in9); \ + _out7 = __lsx_vadd_d(_in7, _in8); \ + \ + _out8 = __lsx_vsub_d(_in7, _in8); \ + _out9 = __lsx_vsub_d(_in6, _in9); \ + _out10 = __lsx_vsub_d(_in5, _in10); \ + _out11 = __lsx_vsub_d(_in4, _in11); \ + _out12 = __lsx_vsub_d(_in3, _in12); \ + _out13 = __lsx_vsub_d(_in2, _in13); \ + _out14 = __lsx_vsub_d(_in1, _in14); \ + _out15 = __lsx_vsub_d(_in0, _in15); \ + } + +#endif // LSX + +#ifdef __loongarch_asx +#include +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplication results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_b(in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : out = __lasx_xvdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of word vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed double + * Details : Signed word elements from in_h are multiplied with + * signed word elements from in_l producing a result + * twice the size of input i.e. signed double-word. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_d_w(in_h, in_l); + out = __lasx_xvmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. unsigned word. + * Multiplication result of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_hu_h(in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - per RTYPE + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * unsigned halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Vector Unsigned Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + out = __lasx_xvsub_h(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Vector Signed Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * Signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * in_c : 0,0,0,0, 0,0,0,0 + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 + * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * out : -7,-3,0,0, 0,-1,0,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvsub_w(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * four times the size of input i.e. signed doubleword. + * Then these multiplication results of four adjacent elements + * are added together and stored to the out vector. + * Example : out = __lasx_xvdp4_d_h(in_h, in_l) + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 + * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 + * out : -2,0,1,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvhaddw_d_w(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 1,0,0,-1, 1,0,0, 2 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 5,-1,4,2, 1,0,2,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The out vector and the out vector are added after the + * lower half of the two-fold zero extension (unsigned byte + * to unsigned halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_hu_bu(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double zero extension (unsigned byte to + * signed halfword),added to the in_h vector. + * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_hu_bu(in_l, 0); + out = __lasx_xvadd_h(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double sign extension (signed halfword to + * signed word), added to the in_h vector. + * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) + * in_h : 0, 1,0,0, -1,0,0,1, + * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, + * out : 2, 0,1,2, -1,0,1,1, + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvadd_w(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the lower half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed halfword + * to signed word), and the result is added to the vector in_c, + * then stored to the out vector. + * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 5,6,7,8 + * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 + * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, + * -200,-300,-400,-500, -2000,-3000,-4000,-5000 + * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + tmp0 = __lasx_xvmul_w(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the higher half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the higher half of the two-fold sign extension (signed + * halfword to signed word), and the result is added to + * the vector in_c, then stored to the out vector. + * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 6,1,3,0, 0,0,1,0 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvmul_w(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 0,0,0,0, 0,0,0,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + out = __lasx_xvmulwev_w_h(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are added to the high half + * after being doubled, then saturated. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector adds the in_l vector after the lower half of + * the two-fold zero extension (unsigned byte to unsigned + * halfword) and then saturated. The results are stored to the out + * vector. + * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) + * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, + * 0,0,0,1 + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, + * ============================================================================= + */ +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { + __m256i tmp1, out; + __m256i zero = { 0 }; + + tmp1 = __lasx_xvilvl_b(zero, in_l); + out = __lasx_xvsadd_hu(in_h, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) + * Arguments : Inputs - in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - in (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lasx_xvclip_h(in, min, max) + * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 + * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { + __m256i out; + + out = __lasx_xvmax_h(min, in); + out = __lasx_xvmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : See out = __lasx_xvclip255_w(in) + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_h(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_h(in, 0); + out = __lasx_xvsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed word elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Output - out (output vector with clipped elements) + * Return Type - signed word + * Example : out = __lasx_xvclip255_w(in) + * in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_w(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_w(in, 0); + out = __lasx_xvsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_l_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 + * idx : 0x02 + * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x02); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_h_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 + * idx : 0x09 + * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x13); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Example : LASX_TRANSPOSE4x4_D + * _in0 : 1,2,3,4 + * _in1 : 1,2,3,4 + * _in2 : 1,2,3,4 + * _in3 : 1,2,3,4 + * + * _out0 : 1,1,1,1 + * _out1 : 2,2,2,2 + * _out2 : 3,3,3,3 + * _out3 : 4,4,4,4 + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * _in0 : 1,2,3,4,5,6,7,8 + * _in1 : 2,2,3,4,5,6,7,8 + * _in2 : 3,2,3,4,5,6,7,8 + * _in3 : 4,2,3,4,5,6,7,8 + * _in4 : 5,2,3,4,5,6,7,8 + * _in5 : 6,2,3,4,5,6,7,8 + * _in6 : 7,2,3,4,5,6,7,8 + * _in7 : 8,2,3,4,5,6,7,8 + * + * _out0 : 1,2,3,4,5,6,7,8 + * _out1 : 2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_w(_in2, _in0); \ + _s1_m = __lasx_xvilvl_w(_in3, _in1); \ + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in2, _in0); \ + _s1_m = __lasx_xvilvh_w(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvl_w(_in6, _in4); \ + _s1_m = __lasx_xvilvl_w(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in6, _in4); \ + _s1_m = __lasx_xvilvh_w(_in7, _in5); \ + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \ + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE16x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE16x8_H + * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * + * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 + * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + \ + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with halfword elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Return Type - signed halfword + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _s0_m, _s1_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in1, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in2); \ + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out1 = __lasx_xvilvh_d(_out0, _out0); \ + _out3 = __lasx_xvilvh_d(_out2, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * (input 8x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x8 byte block) + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ + _out1 = __lasx_xvbsrl_v(_out0, 8); \ + _out3 = __lasx_xvbsrl_v(_out2, 8); \ + _out5 = __lasx_xvbsrl_v(_out4, 8); \ + _out7 = __lasx_xvbsrl_v(_out6, 8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with halfword elements in vectors. + * Arguments : Inputs - _in0, _in1, ~ + * Outputs - _out0, _out1, ~ + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE8x8_H + * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * + * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 + * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 + * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 + * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 + * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 + * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 + * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in6, _in4); \ + _s1_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in6, _in4); \ + _s1_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _s0_m = __lasx_xvilvl_h(_in2, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in2, _in0); \ + _s1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_4 + * _out0 = _in0 + _in3; + * _out1 = _in1 + _in2; + * _out2 = _in1 - _in2; + * _out3 = _in0 - _in3; + * ============================================================================= + */ +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in3); \ + _out1 = __lasx_xvadd_b(_in1, _in2); \ + _out2 = __lasx_xvsub_b(_in1, _in2); \ + _out3 = __lasx_xvsub_b(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in3); \ + _out1 = __lasx_xvadd_h(_in1, _in2); \ + _out2 = __lasx_xvsub_h(_in1, _in2); \ + _out3 = __lasx_xvsub_h(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in3); \ + _out1 = __lasx_xvadd_w(_in1, _in2); \ + _out2 = __lasx_xvsub_w(_in1, _in2); \ + _out3 = __lasx_xvsub_w(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in3); \ + _out1 = __lasx_xvadd_d(_in1, _in2); \ + _out2 = __lasx_xvsub_d(_in1, _in2); \ + _out3 = __lasx_xvsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_8 + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in7); \ + _out1 = __lasx_xvadd_b(_in1, _in6); \ + _out2 = __lasx_xvadd_b(_in2, _in5); \ + _out3 = __lasx_xvadd_b(_in3, _in4); \ + _out4 = __lasx_xvsub_b(_in3, _in4); \ + _out5 = __lasx_xvsub_b(_in2, _in5); \ + _out6 = __lasx_xvsub_b(_in1, _in6); \ + _out7 = __lasx_xvsub_b(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in7); \ + _out1 = __lasx_xvadd_h(_in1, _in6); \ + _out2 = __lasx_xvadd_h(_in2, _in5); \ + _out3 = __lasx_xvadd_h(_in3, _in4); \ + _out4 = __lasx_xvsub_h(_in3, _in4); \ + _out5 = __lasx_xvsub_h(_in2, _in5); \ + _out6 = __lasx_xvsub_h(_in1, _in6); \ + _out7 = __lasx_xvsub_h(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in7); \ + _out1 = __lasx_xvadd_w(_in1, _in6); \ + _out2 = __lasx_xvadd_w(_in2, _in5); \ + _out3 = __lasx_xvadd_w(_in3, _in4); \ + _out4 = __lasx_xvsub_w(_in3, _in4); \ + _out5 = __lasx_xvsub_w(_in2, _in5); \ + _out6 = __lasx_xvsub_w(_in1, _in6); \ + _out7 = __lasx_xvsub_w(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in7); \ + _out1 = __lasx_xvadd_d(_in1, _in6); \ + _out2 = __lasx_xvadd_d(_in2, _in5); \ + _out3 = __lasx_xvadd_d(_in3, _in4); \ + _out4 = __lasx_xvsub_d(_in3, _in4); \ + _out5 = __lasx_xvsub_d(_in2, _in5); \ + _out6 = __lasx_xvsub_d(_in1, _in6); \ + _out7 = __lasx_xvsub_d(_in0, _in7); \ + } + +#endif // LASX + +/* + * ============================================================================= + * Description : Print out elements in vector. + * Arguments : Inputs - RTYPE, _element_num, _in0, _enter + * Outputs - + * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if + * '_enter' is TRUE, prefix "\nVP:" will be added first. + * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 + * VP:1,2,3,4, + * ============================================================================= + */ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ + } + +#endif /* LOONGSON_INTRINSICS_H */ +#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */ diff --git a/media/libvpx/libvpx/vpx_util/vpx_atomics.h b/media/libvpx/libvpx/vpx_util/vpx_atomics.h new file mode 100644 index 0000000000..23ad566851 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_atomics.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_VPX_UTIL_VPX_ATOMICS_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +// Look for built-in atomic support. We cannot use or +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both and existed it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true. Consider adding +// support for using instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?), +// there're no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC or the default /volatile:ms volatile int are +// acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif VPX_ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif VPX_ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly). +// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) \ + { num } + +// Initialization of an atomic int, not thread safe. +static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_ATOMICS_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.c b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c new file mode 100644 index 0000000000..3ce4065ba5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "vpx_util/vpx_debug_util.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +static int frame_idx_w = 0; +static int frame_idx_r = 0; + +void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; } + +int bitstream_queue_get_frame_write(void) { return frame_idx_w; } + +void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; } + +int bitstream_queue_get_frame_read(void) { return frame_idx_r; } +#endif + +#if CONFIG_BITSTREAM_DEBUG +#define QUEUE_MAX_SIZE 2000000 +static int result_queue[QUEUE_MAX_SIZE]; +static int prob_queue[QUEUE_MAX_SIZE]; + +static int queue_r = 0; +static int queue_w = 0; +static int queue_prev_w = -1; +static int skip_r = 0; +static int skip_w = 0; +void bitstream_queue_set_skip_write(int skip) { skip_w = skip; } + +void bitstream_queue_set_skip_read(int skip) { skip_r = skip; } + +void bitstream_queue_record_write(void) { queue_prev_w = queue_w; } + +void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; } + +int bitstream_queue_get_write(void) { return queue_w; } + +int bitstream_queue_get_read(void) { return queue_r; } + +void bitstream_queue_pop(int *result, int *prob) { + if (!skip_r) { + if (queue_w == queue_r) { + printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + *result = result_queue[queue_r]; + *prob = prob_queue[queue_r]; + queue_r = (queue_r + 1) % QUEUE_MAX_SIZE; + } +} + +void bitstream_queue_push(int result, const int prob) { + if (!skip_w) { + result_queue[queue_w] = result; + prob_queue[queue_w] = prob; + queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; + if (queue_w == queue_r) { + printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + } +} +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +static int frame_buf_idx_r = 0; +static int frame_buf_idx_w = 0; +#define MAX_FRAME_BUF_NUM 20 +#define MAX_FRAME_STRIDE 1920 +#define MAX_FRAME_HEIGHT 1080 +static uint16_t + frame_pre[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w(void) { + frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + int plane; + for (plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r(void) { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w" + " %d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_pre failed frame_idx %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_tx failed frame_idx %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.h b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h new file mode 100644 index 0000000000..df1a1aab2c --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ +#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +void bitstream_queue_set_frame_write(int frame_idx); +int bitstream_queue_get_frame_write(void); +void bitstream_queue_set_frame_read(int frame_idx); +int bitstream_queue_get_frame_read(void); +#endif + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. */ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, int *prob); +void bitstream_queue_push(int result, const int prob); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(void); +void mismatch_move_frame_idx_r(void); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c new file mode 100644 index 0000000000..04c5fb6f26 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c @@ -0,0 +1,181 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Original source: +// https://chromium.googlesource.com/webm/libwebp + +#include +#include // for memset() +#include "./vpx_thread.h" +#include "vpx_mem/vpx_mem.h" + +#if CONFIG_MULTITHREAD + +struct VPxWorkerImpl { + pthread_mutex_t mutex_; + pthread_cond_t condition_; + pthread_t thread_; +}; + +//------------------------------------------------------------------------------ + +static void execute(VPxWorker *const worker); // Forward declaration. + +static THREADFN thread_loop(void *ptr) { + VPxWorker *const worker = (VPxWorker *)ptr; + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->impl_->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + if (worker->status_ == WORK) { + execute(worker); + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + pthread_mutex_unlock(&worker->impl_->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { + // No-op when attempting to change state on a thread that didn't come up. + // Checking status_ without acquiring the lock first would result in a data + // race. + if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(VPxWorker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(VPxWorker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + if (!ok) { + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + vpx_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(VPxWorker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void launch(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, WORK); +#else + execute(worker); +#endif +} + +static void end(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + if (worker->impl_ != NULL) { + change_state(worker, NOT_OK); + pthread_join(worker->impl_->thread_, NULL); + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + vpx_free(worker->impl_); + worker->impl_ = NULL; + } +#else + worker->status_ = NOT_OK; + assert(worker->impl_ == NULL); +#endif + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +static VPxWorkerInterface g_worker_interface = { init, reset, sync, + launch, execute, end }; + +int vpx_set_worker_interface(const VPxWorkerInterface *const winterface) { + if (winterface == NULL || winterface->init == NULL || + winterface->reset == NULL || winterface->sync == NULL || + winterface->launch == NULL || winterface->execute == NULL || + winterface->end == NULL) { + return 0; + } + g_worker_interface = *winterface; + return 1; +} + +const VPxWorkerInterface *vpx_get_worker_interface(void) { + return &g_worker_interface; +} + +//------------------------------------------------------------------------------ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h new file mode 100644 index 0000000000..6d308e949b --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h @@ -0,0 +1,438 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Original source: +// https://chromium.googlesource.com/webm/libwebp + +#ifndef VPX_VPX_UTIL_VPX_THREAD_H_ +#define VPX_VPX_UTIL_VPX_THREAD_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Set maximum decode threads to be 8 due to the limit of frame buffers +// and not enough semaphores in the emulation layer on windows. +#define MAX_DECODE_THREADS 8 + +#if CONFIG_MULTITHREAD + +#if defined(_WIN32) && !HAVE_PTHREAD_H +#include // NOLINT +#include // NOLINT +#include // NOLINT +typedef HANDLE pthread_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater +#define USE_WINDOWS_CONDITION_VARIABLE +typedef CONDITION_VARIABLE pthread_cond_t; +#else +typedef struct { + HANDLE waiting_sem_; + HANDLE received_sem_; + HANDLE signal_event_; +} pthread_cond_t; +#endif // _WIN32_WINNT >= 0x600 + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) + +#if _WIN32_WINNT >= 0x0501 // Windows XP or greater +#define WaitForSingleObject(obj, timeout) \ + WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/) +#endif + +static INLINE int pthread_create(pthread_t *const thread, const void *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; +#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); +#else + InitializeCriticalSection(mutex); +#endif + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + (void)condition; +#else + ok &= (CloseHandle(condition->waiting_sem_) != 0); + ok &= (CloseHandle(condition->received_sem_) != 0); + ok &= (CloseHandle(condition->signal_event_) != 0); +#endif + return !ok; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + InitializeConditionVariable(condition); +#else + condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); + condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); + condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); + if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL || + condition->signal_event_ == NULL) { + pthread_cond_destroy(condition); + return 1; + } +#endif + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + WakeAllConditionVariable(condition); +#else + while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok &= SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. + ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } +#endif + return !ok; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + WakeConditionVariable(condition); +#else + if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok = SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. + ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } +#endif + return !ok; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + ok = SleepConditionVariableCS(condition, mutex, INFINITE); +#else + // note that there is a consumer available so the signal isn't dropped in + // pthread_cond_signal + if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1; + // now unlock the mutex so pthread_cond_signal may be issued + pthread_mutex_unlock(mutex); + ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == + WAIT_OBJECT_0); + ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); + pthread_mutex_lock(mutex); +#endif + return !ok; +} + +#elif defined(__OS2__) +#define INCL_DOS +#include // NOLINT + +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#if defined(__STRICT_ANSI__) +// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here. +int _beginthread(void (*)(void *), void *, unsigned, void *); +#endif + +#define pthread_t TID +#define pthread_mutex_t HMTX + +typedef struct { + HEV event_sem_; + HEV ack_sem_; + volatile unsigned wait_count_; +} pthread_cond_t; + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +#define THREADFN void * +#define THREAD_RETURN(val) (val) + +typedef struct { + void *(*start_)(void *); + void *arg_; +} thread_arg; + +static void thread_start(void *arg) { + thread_arg targ = *(thread_arg *)arg; + free(arg); + + targ.start_(targ.arg_); +} + +static INLINE int pthread_create(pthread_t *const thread, const void *attr, + void *(*start)(void *), void *arg) { + int tid; + thread_arg *targ = (thread_arg *)malloc(sizeof(*targ)); + if (targ == NULL) return 1; + + (void)attr; + + targ->start_ = start; + targ->arg_ = arg; + tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ); + if (tid == -1) { + free(targ); + return 1; + } + + *thread = tid; + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return DosWaitThread(&thread, DCWW_WAIT) != 0; +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + return DosReleaseMutexSem(*mutex) != 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + return DosCloseMutexSem(*mutex) != 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + int ok = 1; + ok &= DosCloseEventSem(condition->event_sem_) == 0; + ok &= DosCloseEventSem(condition->ack_sem_) == 0; + return !ok; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + int ok = 1; + (void)cond_attr; + + ok &= + DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0; + ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0; + if (!ok) { + pthread_cond_destroy(condition); + return 1; + } + condition->wait_count_ = 0; + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + int ok = 1; + + if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) { + ok &= DosPostEventSem(condition->event_sem_) == 0; + ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0; + } + + return !ok; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + int ok = 1; + + while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) + ok &= pthread_cond_signal(condition) == 0; + + return !ok; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok = 1; + + __atomic_increment(&condition->wait_count_); + + ok &= pthread_mutex_unlock(mutex) == 0; + + ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0; + + __atomic_decrement(&condition->wait_count_); + + ok &= DosPostEventSem(condition->ack_sem_) == 0; + + pthread_mutex_lock(mutex); + + return !ok; +} +#else // _WIN32 +#include // NOLINT +#define THREADFN void * +#define THREAD_RETURN(val) val +#endif + +#endif // CONFIG_MULTITHREAD + +// State of the worker thread object +typedef enum { + NOT_OK = 0, // object is unusable + OK, // ready to work + WORK // busy finishing the current task +} VPxWorkerStatus; + +// Function to be called by the worker thread. Takes two opaque pointers as +// arguments (data1 and data2), and should return false in case of error. +typedef int (*VPxWorkerHook)(void *, void *); + +// Platform-dependent implementation details for the worker. +typedef struct VPxWorkerImpl VPxWorkerImpl; + +// Synchronization object used to launch job in the worker thread +typedef struct { + VPxWorkerImpl *impl_; + VPxWorkerStatus status_; + VPxWorkerHook hook; // hook to call + void *data1; // first argument passed to 'hook' + void *data2; // second argument passed to 'hook' + int had_error; // return value of the last call to 'hook' +} VPxWorker; + +// The interface for all thread-worker related functions. All these functions +// must be implemented. +typedef struct { + // Must be called first, before any other method. + void (*init)(VPxWorker *const worker); + // Must be called to initialize the object and spawn the thread. Re-entrant. + // Will potentially launch the thread. Returns false in case of error. + int (*reset)(VPxWorker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(VPxWorker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). + void (*launch)(VPxWorker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the VPxWorker structs. sync() must + // still be called afterward (for error reporting). + void (*execute)(VPxWorker *const worker); + // Kill the thread and terminate the object. To use the object again, one + // must call reset() again. + void (*end)(VPxWorker *const worker); +} VPxWorkerInterface; + +// Install a new set of threading functions, overriding the defaults. This +// should be done before any workers are started, i.e., before any encoding or +// decoding takes place. The contents of the interface struct are copied, it +// is safe to free the corresponding memory after this call. This function is +// not thread-safe. Return false in case of invalid pointer or methods. +int vpx_set_worker_interface(const VPxWorkerInterface *const winterface); + +// Retrieve the currently set thread worker interface. +const VPxWorkerInterface *vpx_get_worker_interface(void); + +//------------------------------------------------------------------------------ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_THREAD_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_timestamp.h b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h new file mode 100644 index 0000000000..5296458fad --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_ +#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Rational Number with an int64 numerator +typedef struct vpx_rational64 { + int64_t num; // fraction numerator + int den; // fraction denominator +} vpx_rational64_t; // alias for struct vpx_rational64_t + +static INLINE int gcd(int64_t a, int b) { + int r; // remainder + assert(a >= 0); + assert(b > 0); + while (b != 0) { + r = (int)(a % b); + a = b; + b = r; + } + + return (int)a; +} + +static INLINE void reduce_ratio(vpx_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_TIMESTAMP_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk new file mode 100644 index 0000000000..1162714956 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk @@ -0,0 +1,20 @@ +## +## Copyright (c) 2015 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +UTIL_SRCS-yes += vpx_atomics.h +UTIL_SRCS-yes += vpx_util.mk +UTIL_SRCS-yes += vpx_thread.c +UTIL_SRCS-yes += vpx_thread.h +UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c +UTIL_SRCS-yes += vpx_timestamp.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 0000000000..4ef57a2fee --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 0000000000..ce1102458e --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/media/libvpx/libvpx/vpxdec.c b/media/libvpx/libvpx/vpxdec.c new file mode 100644 index 0000000000..bfe6c1d6ba --- /dev/null +++ b/media/libvpx/libvpx/vpxdec.c @@ -0,0 +1,1146 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include + +#include "./vpx_config.h" + +#if CONFIG_LIBYUV +#include "third_party/libyuv/include/libyuv/scale.h" +#endif + +#include "./args.h" +#include "./ivfdec.h" + +#include "vpx/vpx_decoder.h" +#include "vpx_ports/mem_ops.h" +#include "vpx_ports/vpx_timer.h" + +#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif + +#include "./md5_utils.h" + +#include "./tools_common.h" +#if CONFIG_WEBM_IO +#include "./webmdec.h" +#endif +#include "./y4menc.h" + +static const char *exec_name; + +struct VpxDecInputContext { + struct VpxInputContext *vpx_input_ctx; + struct WebmInputContext *webm_ctx; +}; + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t looparg = + ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames"); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Output raw I420 frames"); +static const arg_def_t flipuvarg = + ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output"); +static const arg_def_t rawvideo = + ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames"); +static const arg_def_t noblitarg = + ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames"); +static const arg_def_t progressarg = + ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes"); +static const arg_def_t limitarg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t skiparg = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t postprocarg = + ARG_DEF(NULL, "postproc", 0, "Postprocess decoded frames"); +static const arg_def_t summaryarg = + ARG_DEF(NULL, "summary", 0, "Show timing summary"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); +static const arg_def_t threadsarg = + ARG_DEF("t", "threads", 1, "Max threads to use"); +static const arg_def_t frameparallelarg = + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode (ignored)"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show version string"); +static const arg_def_t error_concealment = + ARG_DEF(NULL, "error-concealment", 0, "Enable decoder error-concealment"); +static const arg_def_t scalearg = + ARG_DEF("S", "scale", 0, "Scale output frames uniformly"); +static const arg_def_t continuearg = + ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error"); +static const arg_def_t fb_arg = + ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use"); +static const arg_def_t md5arg = + ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); +#if CONFIG_VP9_HIGHBITDEPTH +static const arg_def_t outbitdeptharg = + ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); +#endif +static const arg_def_t svcdecodingarg = ARG_DEF( + NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); +static const arg_def_t framestatsarg = + ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9"); +static const arg_def_t lpfoptarg = + ARG_DEF(NULL, "lpf-opt", 1, + "Do loopfilter without waiting for all threads to sync."); + +static const arg_def_t *all_args[] = { &help, + &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, +#if CONFIG_VP9_HIGHBITDEPTH + &outbitdeptharg, +#endif + &svcdecodingarg, + &framestatsarg, + &rowmtarg, + &lpfoptarg, + NULL }; + +#if CONFIG_VP8_DECODER +static const arg_def_t addnoise_level = + ARG_DEF(NULL, "noise-level", 1, "Enable VP8 postproc add noise"); +static const arg_def_t deblock = + ARG_DEF(NULL, "deblock", 0, "Enable VP8 deblocking"); +static const arg_def_t demacroblock_level = ARG_DEF( + NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level"); +static const arg_def_t mfqe = + ARG_DEF(NULL, "mfqe", 0, "Enable multiframe quality enhancement"); + +static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock, + &demacroblock_level, &mfqe, NULL }; +#endif + +#if CONFIG_LIBYUV +static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, + FilterModeEnum mode) { +#if CONFIG_VP9_HIGHBITDEPTH + if (src->fmt == VPX_IMG_FMT_I42016) { + assert(dst->fmt == VPX_IMG_FMT_I42016); + return I420Scale_16( + (uint16_t *)src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y] / 2, + (uint16_t *)src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U] / 2, + (uint16_t *)src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V] / 2, + src->d_w, src->d_h, (uint16_t *)dst->planes[VPX_PLANE_Y], + dst->stride[VPX_PLANE_Y] / 2, (uint16_t *)dst->planes[VPX_PLANE_U], + dst->stride[VPX_PLANE_U] / 2, (uint16_t *)dst->planes[VPX_PLANE_V], + dst->stride[VPX_PLANE_V] / 2, dst->d_w, dst->d_h, mode); + } +#endif + assert(src->fmt == VPX_IMG_FMT_I420); + assert(dst->fmt == VPX_IMG_FMT_I420); + return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y], + src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U], + src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], src->d_w, + src->d_h, dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], + dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U], + dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], dst->d_w, + dst->d_h, mode); +} +#endif +static void show_help(FILE *fout, int shorthelp) { + int i; + + fprintf(fout, "Usage: %s filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); +#if CONFIG_VP8_DECODER + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); +#endif + fprintf(fout, + "\nOutput File Patterns:\n\n" + " The -o argument specifies the name of the file(s) to " + "write to. If the\n argument does not include any escape " + "characters, the output will be\n written to a single file. " + "Otherwise, the filename will be calculated by\n expanding " + "the following escape characters:\n"); + fprintf(fout, + "\n\t%%w - Frame width" + "\n\t%%h - Frame height" + "\n\t%% - Frame number, zero padded to places (1..9)" + "\n\n Pattern arguments are only supported in conjunction " + "with the --yv12 and\n --i420 options. If the -o option is " + "not specified, the output will be\n directed to stdout.\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); + + for (i = 0; i < get_vpx_decoder_count(); ++i) { + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + fprintf(fout, " %-6s - %s\n", decoder->name, + vpx_codec_iface_name(decoder->codec_interface())); + } +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_hdr[RAW_FRAME_HDR_SZ]; + size_t frame_size = 0; + + if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) { + if (!feof(infile)) warn("Failed to read RAW frame size\n"); + } else { + const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; + const size_t kFrameTooSmallThreshold = 256 * 1024; + frame_size = mem_get_le32(raw_hdr); + + if (frame_size > kCorruptFrameThreshold) { + warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size < kFrameTooSmallThreshold) { + warn("Warning: Read invalid frame size (%u) - not a raw file?\n", + (unsigned int)frame_size); + } + + if (frame_size > *buffer_size) { + uint8_t *new_buf = realloc(*buffer, 2 * frame_size); + if (new_buf) { + *buffer = new_buf; + *buffer_size = 2 * frame_size; + } else { + warn("Failed to allocate compressed data buffer\n"); + frame_size = 0; + } + } + } + + if (!feof(infile)) { + if (fread(*buffer, 1, frame_size, infile) != frame_size) { + warn("Failed to read full frame\n"); + return 1; + } + *bytes_read = frame_size; + return 0; + } + + return 1; +} + +static int dec_read_frame(struct VpxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { + switch (input->vpx_input_ctx->file_type) { +#if CONFIG_WEBM_IO + case FILE_TYPE_WEBM: + return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer); +#endif + case FILE_TYPE_RAW: + return raw_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + case FILE_TYPE_IVF: + return ivf_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + default: return 1; + } +} + +static void update_image_md5(const vpx_image_t *img, const int planes[3], + MD5Context *md5) { + int i, y; + + for (i = 0; i < 3; ++i) { + const int plane = planes[i]; + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = vpx_img_plane_width(img, plane) * + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = vpx_img_plane_height(img, plane); + + for (y = 0; y < h; ++y) { + MD5Update(md5, buf, w); + buf += stride; + } + } +} + +static void write_image_file(const vpx_image_t *img, const int planes[3], + FILE *file) { + int i, y; +#if CONFIG_VP9_HIGHBITDEPTH + const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); +#else + const int bytes_per_sample = 1; +#endif + + for (i = 0; i < 3; ++i) { + const int plane = planes[i]; + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = vpx_img_plane_width(img, plane); + const int h = vpx_img_plane_height(img, plane); + + for (y = 0; y < h; ++y) { + fwrite(buf, bytes_per_sample, w, file); + buf += stride; + } + } +} + +static int file_is_raw(struct VpxInputContext *input) { + uint8_t buf[32]; + int is_raw = 0; + vpx_codec_stream_info_t si; + + si.sz = sizeof(si); + + if (fread(buf, 1, 32, input->file) == 32) { + int i; + + if (mem_get_le32(buf) < 256 * 1024 * 1024) { + for (i = 0; i < get_vpx_decoder_count(); ++i) { + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + if (!vpx_codec_peek_stream_info(decoder->codec_interface(), buf + 4, + 32 - 4, &si)) { + is_raw = 1; + input->fourcc = decoder->fourcc; + input->width = si.w; + input->height = si.h; + input->framerate.numerator = 30; + input->framerate.denominator = 1; + break; + } + } + } + } + + rewind(input->file); + return is_raw; +} + +static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { + fprintf(stderr, + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", + frame_in, frame_out, dx_time, + (double)frame_out * 1000000.0 / (double)dx_time); +} + +struct ExternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +}; + +struct ExternalFrameBufferList { + int num_external_frame_buffers; + struct ExternalFrameBuffer *ext_fb; +}; + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Application private data passed into the set function. |min_size| is the +// minimum size in bytes needed to decode the next frame. |fb| pointer to the +// frame buffer. +static int get_vp9_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + struct ExternalFrameBufferList *const ext_fb_list = + (struct ExternalFrameBufferList *)cb_priv; + if (ext_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { + if (!ext_fb_list->ext_fb[i].in_use) break; + } + + if (i == ext_fb_list->num_external_frame_buffers) return -1; + + if (ext_fb_list->ext_fb[i].size < min_size) { + free(ext_fb_list->ext_fb[i].data); + ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); + if (!ext_fb_list->ext_fb[i].data) return -1; + + ext_fb_list->ext_fb[i].size = min_size; + } + + fb->data = ext_fb_list->ext_fb[i].data; + fb->size = ext_fb_list->ext_fb[i].size; + ext_fb_list->ext_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the external frame buffer. + fb->priv = &ext_fb_list->ext_fb[i]; + return 0; +} + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| user private data passed into the set function. |fb| pointer +// to the frame buffer. +static int release_vp9_frame_buffer(void *cb_priv, + vpx_codec_frame_buffer_t *fb) { + struct ExternalFrameBuffer *const ext_fb = + (struct ExternalFrameBuffer *)fb->priv; + (void)cb_priv; + ext_fb->in_use = 0; + return 0; +} + +static void generate_filename(const char *pattern, char *out, size_t q_len, + unsigned int d_w, unsigned int d_h, + unsigned int frame_in) { + const char *p = pattern; + char *q = out; + + do { + char *next_pat = strchr(p, '%'); + + if (p == next_pat) { + size_t pat_len; + + /* parse the pattern */ + q[q_len - 1] = '\0'; + switch (p[1]) { + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); + } + + pat_len = strlen(q); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); + q += pat_len; + p += 2; + q_len -= pat_len; + } else { + size_t copy_len; + + /* copy the next segment */ + if (!next_pat) + copy_len = strlen(p); + else + copy_len = next_pat - p; + + if (copy_len >= q_len - 1) die("Output filename too long.\n"); + + memcpy(q, p, copy_len); + q[copy_len] = '\0'; + q += copy_len; + p += copy_len; + q_len -= copy_len; + } + } while (*p); +} + +static int is_single_file(const char *outfile_pattern) { + const char *p = outfile_pattern; + + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') + return 0; // pattern contains sequence number, so it's not unique + if (p) p++; + } while (p); + + return 1; +} + +static void print_md5(unsigned char digest[16], const char *filename) { + int i; + + for (i = 0; i < 16; ++i) printf("%02x", digest[i]); + printf(" %s\n", filename); +} + +static FILE *open_outfile(const char *name) { + if (strcmp("-", name) == 0) { + set_binary_mode(stdout); + return stdout; + } else { + FILE *file = fopen(name, "wb"); + if (!file) fatal("Failed to open output file '%s'", name); + return file; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static int img_shifted_realloc_required(const vpx_image_t *img, + const vpx_image_t *shifted, + vpx_img_fmt_t required_fmt) { + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || + required_fmt != shifted->fmt; +} +#endif + +static int main_loop(int argc, const char **argv_) { + vpx_codec_ctx_t decoder; + char *fn = NULL; + int i; + int ret = EXIT_FAILURE; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0; + int stop_after = 0, postproc = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int ec_enabled = 0; + int keep_going = 0; + int enable_row_mt = 0; + int enable_lpf_opt = 0; + const VpxInterface *interface = NULL; + const VpxInterface *fourcc_interface = NULL; + uint64_t dx_time = 0; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + vpx_codec_dec_cfg_t cfg = { 0, 0, 0 }; +#if CONFIG_VP9_HIGHBITDEPTH + unsigned int output_bit_depth = 0; +#endif + int svc_decoding = 0; + int svc_spatial_layer = 0; +#if CONFIG_VP8_DECODER + vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; +#endif + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + vpx_image_t *scaled_img = NULL; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_image_t *img_shifted = NULL; +#endif + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; + + const char *outfile_pattern = NULL; + char outfile_name[PATH_MAX] = { 0 }; + FILE *outfile = NULL; + + FILE *framestats_file = NULL; + + MD5Context md5_ctx; + unsigned char md5_digest[16]; + + struct VpxDecInputContext input = { NULL, NULL }; + struct VpxInputContext vpx_input_ctx; +#if CONFIG_WEBM_IO + struct WebmInputContext webm_ctx; + memset(&(webm_ctx), 0, sizeof(webm_ctx)); + input.webm_ctx = &webm_ctx; +#endif + input.vpx_input_ctx = &vpx_input_ctx; + + /* Parse command line */ + exec_name = argv_[0]; + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + interface = get_vpx_decoder_by_name(arg.val); + if (!interface) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &looparg, argi)) { + // no-op + } else if (arg_match(&arg, &outputfile, argi)) + outfile_pattern = arg.val; + else if (arg_match(&arg, &use_yv12, argi)) { + use_y4m = 0; + flipuv = 1; + opt_yv12 = 1; + } else if (arg_match(&arg, &use_i420, argi)) { + use_y4m = 0; + flipuv = 0; + opt_i420 = 1; + } else if (arg_match(&arg, &rawvideo, argi)) { + use_y4m = 0; + } else if (arg_match(&arg, &flipuvarg, argi)) + flipuv = 1; + else if (arg_match(&arg, &noblitarg, argi)) + noblit = 1; + else if (arg_match(&arg, &progressarg, argi)) + progress = 1; + else if (arg_match(&arg, &limitarg, argi)) + stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skiparg, argi)) + arg_skip = arg_parse_uint(&arg); + else if (arg_match(&arg, &postprocarg, argi)) + postproc = 1; + else if (arg_match(&arg, &md5arg, argi)) + do_md5 = 1; + else if (arg_match(&arg, &summaryarg, argi)) + summary = 1; + else if (arg_match(&arg, &threadsarg, argi)) + cfg.threads = arg_parse_uint(&arg); +#if CONFIG_VP9_DECODER + else if (arg_match(&arg, &frameparallelarg, argi)) { + /* ignored for compatibility */ + } +#endif + else if (arg_match(&arg, &verbosearg, argi)) + quiet = 0; + else if (arg_match(&arg, &scalearg, argi)) + do_scale = 1; + else if (arg_match(&arg, &fb_arg, argi)) + num_external_frame_buffers = arg_parse_uint(&arg); + else if (arg_match(&arg, &continuearg, argi)) + keep_going = 1; +#if CONFIG_VP9_HIGHBITDEPTH + else if (arg_match(&arg, &outbitdeptharg, argi)) { + output_bit_depth = arg_parse_uint(&arg); + } +#endif + else if (arg_match(&arg, &svcdecodingarg, argi)) { + svc_decoding = 1; + svc_spatial_layer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &framestatsarg, argi)) { + framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } else if (arg_match(&arg, &rowmtarg, argi)) { + enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lpfoptarg, argi)) { + enable_lpf_opt = arg_parse_uint(&arg); + } +#if CONFIG_VP8_DECODER + else if (arg_match(&arg, &addnoise_level, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE; + vp8_pp_cfg.noise_level = arg_parse_uint(&arg); + } else if (arg_match(&arg, &demacroblock_level, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK; + vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg); + } else if (arg_match(&arg, &deblock, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK; + } else if (arg_match(&arg, &mfqe, argi)) { + postproc = 1; + vp8_pp_cfg.post_proc_flag |= VP8_MFQE; + } else if (arg_match(&arg, &error_concealment, argi)) { + ec_enabled = 1; + } +#endif // CONFIG_VP8_DECODER + else + argj++; + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + /* Handle non-option arguments */ + fn = argv[0]; + + if (!fn) { + free(argv); + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + /* Open file */ + infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); + + if (!infile) { + fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin"); + } +#if CONFIG_OS_SUPPORT + /* Make sure we don't dump to the terminal, unless forced to with -o - */ + if (!outfile_pattern && isatty(fileno(stdout)) && !do_md5 && !noblit) { + fprintf(stderr, + "Not dumping raw video to your terminal. Use '-o -' to " + "override.\n"); + return EXIT_FAILURE; + } +#endif + input.vpx_input_ctx->file = infile; + if (file_is_ivf(input.vpx_input_ctx)) + input.vpx_input_ctx->file_type = FILE_TYPE_IVF; +#if CONFIG_WEBM_IO + else if (file_is_webm(input.webm_ctx, input.vpx_input_ctx)) + input.vpx_input_ctx->file_type = FILE_TYPE_WEBM; +#endif + else if (file_is_raw(input.vpx_input_ctx)) + input.vpx_input_ctx->file_type = FILE_TYPE_RAW; + else { + fprintf(stderr, "Unrecognized input file type.\n"); +#if !CONFIG_WEBM_IO + fprintf(stderr, "vpxdec was built without WebM container support.\n"); +#endif + free(argv); + return EXIT_FAILURE; + } + + outfile_pattern = outfile_pattern ? outfile_pattern : "-"; + single_file = is_single_file(outfile_pattern); + + if (!noblit && single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, + vpx_input_ctx.width, vpx_input_ctx.height, 0); + if (do_md5) + MD5Init(&md5_ctx); + else + outfile = open_outfile(outfile_name); + } + + if (use_y4m && !noblit) { + if (!single_file) { + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," + " try --i420 or --yv12 or --rawvideo.\n"); + return EXIT_FAILURE; + } + +#if CONFIG_WEBM_IO + if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) { + if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) { + fprintf(stderr, + "Failed to guess framerate -- error parsing " + "webm file?\n"); + return EXIT_FAILURE; + } + } +#endif + } + + fourcc_interface = get_vpx_decoder_by_fourcc(vpx_input_ctx.fourcc); + if (interface && fourcc_interface && interface != fourcc_interface) + warn("Header indicates codec: %s\n", fourcc_interface->name); + else + interface = fourcc_interface; + + if (!interface) interface = get_vpx_decoder_by_index(0); + + dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | + (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); + if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, + dec_flags)) { + fprintf(stderr, "Failed to initialize decoder: %s\n", + vpx_codec_error(&decoder)); + goto fail2; + } + if (svc_decoding) { + if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, + svc_spatial_layer)) { + fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) { + fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) { + fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + if (!quiet) fprintf(stderr, "%s\n", decoder.name); + +#if CONFIG_VP8_DECODER + if (vp8_pp_cfg.post_proc_flag && + vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) { + fprintf(stderr, "Failed to configure postproc: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } +#endif + + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + while (arg_skip) { + if (dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + arg_skip--; + } + + if (num_external_frame_buffers > 0) { + ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; + ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( + num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (!ext_fb_list.ext_fb) { + fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); + goto fail; + } + if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, + release_vp9_frame_buffer, + &ext_fb_list)) { + fprintf(stderr, "Failed to configure external frame buffers: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + } + + frame_avail = 1; + got_data = 0; + + if (framestats_file) fprintf(framestats_file, "bytes,qp\n"); + + /* Decode file */ + while (frame_avail || got_data) { + vpx_codec_iter_t iter = NULL; + vpx_image_t *img; + struct vpx_usec_timer timer; + int corrupted = 0; + + frame_avail = 0; + if (!stop_after || frame_in < stop_after) { + if (!dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + frame_avail = 1; + frame_in++; + + vpx_usec_timer_start(&timer); + + if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, NULL, + 0)) { + const char *detail = vpx_codec_error_detail(&decoder); + warn("Failed to decode frame %d: %s", frame_in, + vpx_codec_error(&decoder)); + if (detail) warn("Additional information: %s", detail); + corrupted = 1; + if (!keep_going) goto fail; + } + + if (framestats_file) { + int qp; + if (vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &qp)) { + warn("Failed VPXD_GET_LAST_QUANTIZER: %s", + vpx_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\n", (int)bytes_in_buffer, qp); + } + + vpx_usec_timer_mark(&timer); + dx_time += vpx_usec_timer_elapsed(&timer); + } else { + flush_decoder = 1; + } + } else { + flush_decoder = 1; + } + + vpx_usec_timer_start(&timer); + + if (flush_decoder) { + // Flush the decoder in frame parallel decode. + if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) { + warn("Failed to flush decoder: %s", vpx_codec_error(&decoder)); + corrupted = 1; + if (!keep_going) goto fail; + } + } + + got_data = 0; + if ((img = vpx_codec_get_frame(&decoder, &iter))) { + ++frame_out; + got_data = 1; + } + + vpx_usec_timer_mark(&timer); + dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); + + if (!corrupted && + vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { + warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); + if (!keep_going) goto fail; + } + frames_corrupted += corrupted; + + if (progress) show_progress(frame_in, frame_out, dx_time); + + if (!noblit && img) { + const int PLANES_YUV[] = { VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V }; + const int PLANES_YVU[] = { VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U }; + const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; + + if (do_scale) { + if (frame_out == 1) { + // If the output frames are to be scaled to a fixed display size then + // use the width and height specified in the container. If either of + // these is set to 0, use the display size set in the first frame + // header. If that is unavailable, use the raw decoded size of the + // first decoded frame. + int render_width = vpx_input_ctx.width; + int render_height = vpx_input_ctx.height; + if (!render_width || !render_height) { + int render_size[2]; + if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE, + render_size)) { + // As last resort use size of first frame as display size. + render_width = img->d_w; + render_height = img->d_h; + } else { + render_width = render_size[0]; + render_height = render_size[1]; + } + } + scaled_img = + vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); + if (!scaled_img) { + fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", + render_width, render_height); + goto fail; + } + scaled_img->bit_depth = img->bit_depth; + } + + if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { +#if CONFIG_LIBYUV + libyuv_scale(img, scaled_img, kFilterBox); + img = scaled_img; +#else + fprintf(stderr, + "Failed to scale output frame: %s.\n" + "Scaling is disabled in this configuration. " + "To enable scaling, configure with --enable-libyuv\n", + vpx_codec_error(&decoder)); + goto fail; +#endif + } + } +#if CONFIG_VP9_HIGHBITDEPTH + // Default to codec bit depth if output bit depth not set + if (!output_bit_depth && single_file && !do_md5) { + output_bit_depth = img->bit_depth; + } + // Shift up or down if necessary + if (output_bit_depth != 0 && output_bit_depth != img->bit_depth) { + const vpx_img_fmt_t shifted_fmt = + output_bit_depth == 8 + ? img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) + : img->fmt | VPX_IMG_FMT_HIGHBITDEPTH; + if (img_shifted && + img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { + vpx_img_free(img_shifted); + img_shifted = NULL; + } + if (!img_shifted) { + img_shifted = + vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + if (!img_shifted) { + fprintf(stderr, "Failed to allocate image\n"); + goto fail; + } + img_shifted->bit_depth = output_bit_depth; + } + if (output_bit_depth > img->bit_depth) { + vpx_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); + } else { + vpx_img_downshift(img_shifted, img, + img->bit_depth - output_bit_depth); + } + img = img_shifted; + } +#endif + + if (single_file) { + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + size_t len = 0; + if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { + fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); + goto fail; + } + if (frame_out == 1) { + // Y4M file header + len = y4m_write_file_header( + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + } else { + fputs(y4m_buf, outfile); + } + } + + // Y4M frame header + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + } else { + fputs(y4m_buf, outfile); + } + } else { + if (frame_out == 1) { + // Check if --yv12 or --i420 options are consistent with the + // bit-stream decoded + if (opt_i420) { + if (img->fmt != VPX_IMG_FMT_I420 && + img->fmt != VPX_IMG_FMT_I42016) { + fprintf(stderr, "Cannot produce i420 output for bit-stream.\n"); + goto fail; + } + } + if (opt_yv12) { + if ((img->fmt != VPX_IMG_FMT_I420 && + img->fmt != VPX_IMG_FMT_YV12) || + img->bit_depth != 8) { + fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n"); + goto fail; + } + } + } + } + + if (do_md5) { + update_image_md5(img, planes, &md5_ctx); + } else { + if (!corrupted) write_image_file(img, planes, outfile); + } + } else { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); + if (do_md5) { + MD5Init(&md5_ctx); + update_image_md5(img, planes, &md5_ctx); + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + outfile = open_outfile(outfile_name); + write_image_file(img, planes, outfile); + fclose(outfile); + } + } + } + } + + if (summary || progress) { + show_progress(frame_in, frame_out, dx_time); + fprintf(stderr, "\n"); + } + + if (frames_corrupted) { + fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } + +fail: + + if (vpx_codec_destroy(&decoder)) { + fprintf(stderr, "Failed to destroy decoder: %s\n", + vpx_codec_error(&decoder)); + } + +fail2: + + if (!noblit && single_file) { + if (do_md5) { + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + fclose(outfile); + } + } + +#if CONFIG_WEBM_IO + if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM) + webm_free(input.webm_ctx); +#endif + + if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); + + if (scaled_img) vpx_img_free(scaled_img); +#if CONFIG_VP9_HIGHBITDEPTH + if (img_shifted) vpx_img_free(img_shifted); +#endif + + for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { + free(ext_fb_list.ext_fb[i].data); + } + free(ext_fb_list.ext_fb); + + fclose(infile); + if (framestats_file) fclose(framestats_file); + + free(argv); + + return ret; +} + +int main(int argc, const char **argv_) { + unsigned int loops = 1, i; + char **argv, **argi, **argj; + struct arg arg; + int error = 0; + + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &looparg, argi)) { + loops = arg_parse_uint(&arg); + break; + } + } + free(argv); + for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_); + return error; +} diff --git a/media/libvpx/libvpx/vpxenc.c b/media/libvpx/libvpx/vpxenc.c new file mode 100644 index 0000000000..d20bd3f967 --- /dev/null +++ b/media/libvpx/libvpx/vpxenc.c @@ -0,0 +1,2070 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpxenc.h" +#include "./vpx_config.h" + +#include +#include +#include +#include +#include +#include +#include + +#if CONFIG_LIBYUV +#include "third_party/libyuv/include/libyuv/scale.h" +#endif + +#include "vpx/vpx_encoder.h" +#if CONFIG_DECODERS +#include "vpx/vpx_decoder.h" +#endif + +#include "./args.h" +#include "./ivfenc.h" +#include "./tools_common.h" + +#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER +#include "vpx/vp8cx.h" +#endif +#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem_ops.h" +#include "vpx_ports/vpx_timer.h" +#include "./rate_hist.h" +#include "./vpxstats.h" +#include "./warnings.h" +#if CONFIG_WEBM_IO +#include "./webmenc.h" +#endif +#include "./y4minput.h" + +static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, + FILE *stream) { + return fwrite(ptr, size, nmemb, stream); +} +#define fwrite wrap_fwrite + +static const char *exec_name; + +static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( + vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { + if (ctx->err) { + const char *detail = vpx_codec_error_detail(ctx); + + vfprintf(stderr, s, ap); + fprintf(stderr, ": %s\n", vpx_codec_error(ctx)); + + if (detail) fprintf(stderr, " %s\n", detail); + + if (fatal) exit(EXIT_FAILURE); + } +} + +static VPX_TOOLS_FORMAT_PRINTF(2, + 3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx, + const char *s, ...) { + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, 1, s, ap); + va_end(ap); +} + +static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( + vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) { + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, fatal, s, ap); + va_end(ap); +} + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_nv12 = + ARG_DEF(NULL, "nv12", 0, "Input file is NV12 "); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file is I444"); +static const arg_def_t use_i440 = + ARG_DEF(NULL, "i440", 0, "Input file is I440"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t passes = + ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); +static const arg_def_t pass_arg = + ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); +static const arg_def_t fpf_name = + ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t limit = + ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); +static const arg_def_t skip = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t deadline = + ARG_DEF("d", "deadline", 1, "Deadline per frame (usec)"); +static const arg_def_t best_dl = + ARG_DEF(NULL, "best", 0, "Use Best Quality Deadline"); +static const arg_def_t good_dl = + ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); +static const arg_def_t quietarg = + ARG_DEF("q", "quiet", 0, "Do not print encode progress"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show encoder parameters"); +static const arg_def_t psnrarg = + ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line"); + +static const struct arg_enum_list test_decode_enum[] = { + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } +}; +static const arg_def_t recontest = ARG_DEF_ENUM( + NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum); +static const arg_def_t framerate = + ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"); +static const arg_def_t use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); +static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"); +static const arg_def_t out_part = + ARG_DEF("P", "output-partitions", 0, + "Makes encoder output partitions. Requires IVF output!"); +static const arg_def_t q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); +static const arg_def_t rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); +static const arg_def_t disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings."); +static const arg_def_t disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but do not prompt user to continue."); + +#if CONFIG_VP9_HIGHBITDEPTH +static const arg_def_t test16bitinternalarg = ARG_DEF( + NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); +#endif + +static const arg_def_t *main_args[] = { &help, + &debugmode, + &outputfile, + &codecarg, + &passes, + &pass_arg, + &fpf_name, + &limit, + &skip, + &deadline, + &best_dl, + &good_dl, + &rt_dl, + &quietarg, + &verbosearg, + &psnrarg, + &use_webm, + &use_ivf, + &out_part, + &q_hist_n, + &rate_hist_n, + &disable_warnings, + &disable_warning_prompt, + &recontest, + NULL }; + +static const arg_def_t usage = + ARG_DEF("u", "usage", 1, "Usage profile number to use"); +static const arg_def_t threads = + ARG_DEF("t", "threads", 1, "Max number of threads to use"); +static const arg_def_t profile = + ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"); +static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width"); +static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height"); +#if CONFIG_WEBM_IO +static const struct arg_enum_list stereo_mode_enum[] = { + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } +}; +static const arg_def_t stereo_mode = ARG_DEF_ENUM( + NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum); +#endif +static const arg_def_t timebase = ARG_DEF( + NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); +static const arg_def_t error_resilient = + ARG_DEF(NULL, "error-resilient", 1, "Enable error resiliency features"); +static const arg_def_t lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); + +static const arg_def_t *global_args[] = { &use_nv12, + &use_yv12, + &use_i420, + &use_i422, + &use_i444, + &use_i440, + &usage, + &threads, + &profile, + &width, + &height, +#if CONFIG_WEBM_IO + &stereo_mode, +#endif + &timebase, + &framerate, + &error_resilient, +#if CONFIG_VP9_HIGHBITDEPTH + &test16bitinternalarg, +#endif + &lag_in_frames, + NULL }; + +static const arg_def_t dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t resize_allowed = + ARG_DEF(NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)"); +static const arg_def_t resize_width = + ARG_DEF(NULL, "resize-width", 1, "Width of encoded frame"); +static const arg_def_t resize_height = + ARG_DEF(NULL, "resize-height", 1, "Height of encoded frame"); +static const arg_def_t resize_up_thresh = + ARG_DEF(NULL, "resize-up", 1, "Upscale threshold (buf %)"); +static const arg_def_t resize_down_thresh = + ARG_DEF(NULL, "resize-down", 1, "Downscale threshold (buf %)"); +static const struct arg_enum_list end_usage_enum[] = { { "vbr", VPX_VBR }, + { "cbr", VPX_CBR }, + { "cq", VPX_CQ }, + { "q", VPX_Q }, + { NULL, 0 } }; +static const arg_def_t end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum); +static const arg_def_t target_bitrate = + ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"); +static const arg_def_t min_quantizer = + ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"); +static const arg_def_t max_quantizer = + ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"); +static const arg_def_t undershoot_pct = + ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); +static const arg_def_t overshoot_pct = + ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); +static const arg_def_t buf_sz = + ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"); +static const arg_def_t buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); +static const arg_def_t buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); +static const arg_def_t *rc_args[] = { + &dropframe_thresh, &resize_allowed, &resize_width, &resize_height, + &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate, + &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, + &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL +}; + +#if CONFIG_VP9_ENCODER +static const arg_def_t use_vizier_rc_params = + ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params"); +static const arg_def_t active_wq_factor = + ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); +static const arg_def_t err_per_mb_factor = + ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor"); +static const arg_def_t sr_default_decay_limit = ARG_DEF( + NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); +static const arg_def_t sr_diff_factor = + ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); +static const arg_def_t kf_err_per_mb_factor = ARG_DEF( + NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor"); +static const arg_def_t kf_frame_min_boost_factor = + ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first_factor = + ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1, + "Max keyframe boost adjustment factor for first frame"); +static const arg_def_t kf_frame_max_boost_subs_factor = + ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1, + "Max boost adjustment factor for subsequent KFs"); +static const arg_def_t kf_max_total_boost_factor = ARG_DEF( + NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor"); +static const arg_def_t gf_max_total_boost_factor = + ARG_DEF(NULL, "gf-max-total-boost-factor", 1, + "Golden frame max total boost factor"); +static const arg_def_t gf_frame_max_boost_factor = + ARG_DEF(NULL, "gf-frame-max-boost-factor", 1, + "Golden frame max per frame boost factor"); +static const arg_def_t zm_factor = + ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); +static const arg_def_t rd_mult_inter_qp_fac = + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, + "RD multiplier adjustment for inter frames"); +static const arg_def_t rd_mult_arf_qp_fac = + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, + "RD multiplier adjustment for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = ARG_DEF( + NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames"); +static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, + &active_wq_factor, + &err_per_mb_factor, + &sr_default_decay_limit, + &sr_diff_factor, + &kf_err_per_mb_factor, + &kf_frame_min_boost_factor, + &kf_frame_max_boost_first_factor, + &kf_frame_max_boost_subs_factor, + &kf_max_total_boost_factor, + &gf_max_total_boost_factor, + &gf_frame_max_boost_factor, + &zm_factor, + &rd_mult_inter_qp_fac, + &rd_mult_arf_qp_fac, + &rd_mult_key_qp_fac, + NULL }; +#endif + +static const arg_def_t bias_pct = + ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); +static const arg_def_t minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); +static const arg_def_t maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, + &corpus_complexity, NULL }; + +static const arg_def_t kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); +static const arg_def_t kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); +static const arg_def_t kf_disabled = + ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"); +static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, + NULL }; + +static const arg_def_t noise_sens = + ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); +static const arg_def_t sharpness = + ARG_DEF(NULL, "sharpness", 1, + "Increase sharpness at the expense of lower PSNR. (0..7)"); +static const arg_def_t static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); +static const arg_def_t arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); +static const arg_def_t arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); +static const arg_def_t arnr_type = + ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)"); +static const struct arg_enum_list tuning_enum[] = { { "psnr", VP8_TUNE_PSNR }, + { "ssim", VP8_TUNE_SSIM }, + { NULL, 0 } }; +static const arg_def_t tune_ssim = + ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); +static const arg_def_t cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); +static const arg_def_t max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); + +#if CONFIG_VP8_ENCODER +static const arg_def_t cpu_used_vp8 = + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)"); +static const arg_def_t auto_altref_vp8 = ARG_DEF( + NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames. (0..1)"); +static const arg_def_t token_parts = + ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); +static const arg_def_t screen_content_mode = + ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); +static const arg_def_t *vp8_args[] = { &cpu_used_vp8, + &auto_altref_vp8, + &noise_sens, + &sharpness, + &static_thresh, + &token_parts, + &arnr_maxframes, + &arnr_strength, + &arnr_type, + &tune_ssim, + &cq_level, + &max_intra_rate_pct, + &gf_cbr_boost_pct, + &screen_content_mode, + NULL }; +static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, + VP8E_SET_ENABLEAUTOALTREF, + VP8E_SET_NOISE_SENSITIVITY, + VP8E_SET_SHARPNESS, + VP8E_SET_STATIC_THRESHOLD, + VP8E_SET_TOKEN_PARTITIONS, + VP8E_SET_ARNR_MAXFRAMES, + VP8E_SET_ARNR_STRENGTH, + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, + VP8E_SET_CQ_LEVEL, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + VP8E_SET_GF_CBR_BOOST_PCT, + VP8E_SET_SCREEN_CONTENT_MODE, + 0 }; +#endif + +#if CONFIG_VP9_ENCODER +static const arg_def_t cpu_used_vp9 = + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-9..9)"); +static const arg_def_t auto_altref_vp9 = ARG_DEF( + NULL, "auto-alt-ref", 1, + "Enable automatic alt reference frames, 2+ enables multi-layer. (0..6)"); +static const arg_def_t tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = + ARG_DEF(NULL, "tile-rows", 1, + "Number of tile rows to use, log2 (set to 0 while threads > 1)"); + +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl", 1, "Enable temporal dependency model"); + +static const arg_def_t lossless = + ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); +static const arg_def_t frame_parallel_decoding = ARG_DEF( + NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); +static const arg_def_t aq_mode = ARG_DEF( + NULL, "aq-mode", 1, + "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " + "3: cyclic refresh, 4: equator360)"); +static const arg_def_t alt_ref_aq = ARG_DEF(NULL, "alt-ref-aq", 1, + "Special adaptive quantization for " + "the alternate reference frames."); +static const arg_def_t frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); +static const arg_def_t min_gf_interval = ARG_DEF( + NULL, "min-gf-interval", 1, + "min gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t max_gf_interval = ARG_DEF( + NULL, "max-gf-interval", 1, + "max gf/arf frame interval (default 0, indicating in-built behavior)"); + +static const struct arg_enum_list color_space_enum[] = { + { "unknown", VPX_CS_UNKNOWN }, + { "bt601", VPX_CS_BT_601 }, + { "bt709", VPX_CS_BT_709 }, + { "smpte170", VPX_CS_SMPTE_170 }, + { "smpte240", VPX_CS_SMPTE_240 }, + { "bt2020", VPX_CS_BT_2020 }, + { "reserved", VPX_CS_RESERVED }, + { "sRGB", VPX_CS_SRGB }, + { NULL, 0 } +}; + +static const arg_def_t input_color_space = + ARG_DEF_ENUM(NULL, "color-space", 1, + "The color space of input content:", color_space_enum); + +#if CONFIG_VP9_HIGHBITDEPTH +static const struct arg_enum_list bitdepth_enum[] = { + { "8", VPX_BITS_8 }, { "10", VPX_BITS_10 }, { "12", VPX_BITS_12 }, { NULL, 0 } +}; + +static const arg_def_t bitdeptharg = ARG_DEF_ENUM( + "b", "bit-depth", 1, + "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)", + bitdepth_enum); +static const arg_def_t inbitdeptharg = + ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); +#endif + +static const struct arg_enum_list tune_content_enum[] = { + { "default", VP9E_CONTENT_DEFAULT }, + { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, + { NULL, 0 } +}; + +static const arg_def_t tune_content = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + +static const arg_def_t target_level = ARG_DEF( + NULL, "target-level", 1, + "Target level\n" + " 255: off (default)\n" + " 0: only keep level stats\n" + " 1: adaptively set alt-ref " + "distance and column tile limit based on picture size, and keep" + " level stats\n" + " 10: level 1.0 11: level 1.1 " + "... 62: level 6.2"); + +static const arg_def_t row_mt = + ARG_DEF(NULL, "row-mt", 1, + "Enable row based non-deterministic multi-threading in VP9"); + +static const arg_def_t disable_loopfilter = + ARG_DEF(NULL, "disable-loopfilter", 1, + "Control Loopfilter in VP9:\n" + " " + "0: Loopfilter on for all frames (default)\n" + " " + "1: Loopfilter off for non reference frames\n" + " " + "2: Loopfilter off for all frames"); +#endif + +#if CONFIG_VP9_ENCODER +static const arg_def_t *vp9_args[] = { &cpu_used_vp9, + &auto_altref_vp9, + &sharpness, + &static_thresh, + &tile_cols, + &tile_rows, + &enable_tpl_model, + &arnr_maxframes, + &arnr_strength, + &arnr_type, + &tune_ssim, + &cq_level, + &max_intra_rate_pct, + &max_inter_rate_pct, + &gf_cbr_boost_pct, + &lossless, + &frame_parallel_decoding, + &aq_mode, + &alt_ref_aq, + &frame_periodic_boost, + &noise_sens, + &tune_content, + &input_color_space, + &min_gf_interval, + &max_gf_interval, + &target_level, + &row_mt, + &disable_loopfilter, +// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The +// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They +// must be listed at the end of vp9_args. +#if CONFIG_VP9_HIGHBITDEPTH + &bitdeptharg, + &inbitdeptharg, +#endif // CONFIG_VP9_HIGHBITDEPTH + NULL }; +static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, + VP8E_SET_ENABLEAUTOALTREF, + VP8E_SET_SHARPNESS, + VP8E_SET_STATIC_THRESHOLD, + VP9E_SET_TILE_COLUMNS, + VP9E_SET_TILE_ROWS, + VP9E_SET_TPL, + VP8E_SET_ARNR_MAXFRAMES, + VP8E_SET_ARNR_STRENGTH, + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, + VP8E_SET_CQ_LEVEL, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + VP9E_SET_MAX_INTER_BITRATE_PCT, + VP9E_SET_GF_CBR_BOOST_PCT, + VP9E_SET_LOSSLESS, + VP9E_SET_FRAME_PARALLEL_DECODING, + VP9E_SET_AQ_MODE, + VP9E_SET_ALT_REF_AQ, + VP9E_SET_FRAME_PERIODIC_BOOST, + VP9E_SET_NOISE_SENSITIVITY, + VP9E_SET_TUNE_CONTENT, + VP9E_SET_COLOR_SPACE, + VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MAX_GF_INTERVAL, + VP9E_SET_TARGET_LEVEL, + VP9E_SET_ROW_MT, + VP9E_SET_DISABLE_LOOPFILTER, + 0 }; +#endif + +static const arg_def_t *no_args[] = { NULL }; + +static void show_help(FILE *fout, int shorthelp) { + int i; + const int num_encoder = get_vpx_encoder_count(); + + fprintf(fout, "Usage: %s -o dst_filename src_filename \n", + exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); + arg_show_usage(fout, kf_args); +#if CONFIG_VP8_ENCODER + fprintf(fout, "\nVP8 Specific Options:\n"); + arg_show_usage(fout, vp8_args); +#endif +#if CONFIG_VP9_ENCODER + fprintf(fout, "\nVP9 Specific Options:\n"); + arg_show_usage(fout, vp9_args); + fprintf(fout, "\nVizier Rate Control Options:\n"); + arg_show_usage(fout, vizier_rc_args); +#endif + fprintf(fout, + "\nStream timebase (--timebase):\n" + " The desired precision of timestamps in the output, expressed\n" + " in fractional seconds. Default is 1/1000.\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); + + for (i = 0; i < num_encoder; ++i) { + const VpxInterface *const encoder = get_vpx_encoder_by_index(i); + const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(fout, " %-6s - %s %s\n", encoder->name, + vpx_codec_iface_name(encoder->codec_interface()), defstr); + } + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +#define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) +#if CONFIG_VP9_ENCODER +#define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map) +#else +#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map) +#endif + +#if !CONFIG_WEBM_IO +typedef int stereo_format_t; +struct WebmOutputContext { + int debug; +}; +#endif + +/* Per-stream configuration */ +struct stream_config { + struct vpx_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + int write_webm; +#if CONFIG_VP9_HIGHBITDEPTH + // whether to use 16bit internal buffers + int use_16bit_internal; +#endif +}; + +struct stream_state { + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total; + uint64_t psnr_samples_total; + double psnr_totals[4]; + int psnr_count; + int counts[64]; + vpx_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; + struct vpx_image *img; + vpx_codec_ctx_t decoder; + int mismatch_seen; +}; + +static void validate_positive_rational(const char *msg, + struct vpx_rational *rat) { + if (rat->den < 0) { + rat->num *= -1; + rat->den *= -1; + } + + if (rat->num < 0) die("Error: %s must be positive\n", msg); + + if (!rat->den) die("Error: %s has zero denominator\n", msg); +} + +static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { + char **argi, **argj; + struct arg arg; + const int num_encoder = get_vpx_encoder_count(); + + if (num_encoder < 1) die("Error: no valid encoder available\n"); + + /* Initialize default parameters */ + memset(global, 0, sizeof(*global)); + global->codec = get_vpx_encoder_by_index(num_encoder - 1); + global->passes = 0; + global->color_type = I420; + /* Assign default deadline to good quality */ + global->deadline = VPX_DL_GOOD_QUALITY; + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + global->codec = get_vpx_encoder_by_name(arg.val); + if (!global->codec) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &passes, argi)) { + global->passes = arg_parse_uint(&arg); + + if (global->passes < 1 || global->passes > 2) + die("Error: Invalid number of passes (%d)\n", global->passes); + } else if (arg_match(&arg, &pass_arg, argi)) { + global->pass = arg_parse_uint(&arg); + + if (global->pass < 1 || global->pass > 2) + die("Error: Invalid pass selected (%d)\n", global->pass); + } else if (arg_match(&arg, &usage, argi)) + global->usage = arg_parse_uint(&arg); + else if (arg_match(&arg, &deadline, argi)) + global->deadline = arg_parse_uint(&arg); + else if (arg_match(&arg, &best_dl, argi)) + global->deadline = VPX_DL_BEST_QUALITY; + else if (arg_match(&arg, &good_dl, argi)) + global->deadline = VPX_DL_GOOD_QUALITY; + else if (arg_match(&arg, &rt_dl, argi)) + global->deadline = VPX_DL_REALTIME; + else if (arg_match(&arg, &use_yv12, argi)) + global->color_type = YV12; + else if (arg_match(&arg, &use_nv12, argi)) + global->color_type = NV12; + else if (arg_match(&arg, &use_i420, argi)) + global->color_type = I420; + else if (arg_match(&arg, &use_i422, argi)) + global->color_type = I422; + else if (arg_match(&arg, &use_i444, argi)) + global->color_type = I444; + else if (arg_match(&arg, &use_i440, argi)) + global->color_type = I440; + else if (arg_match(&arg, &quietarg, argi)) + global->quiet = 1; + else if (arg_match(&arg, &verbosearg, argi)) + global->verbose = 1; + else if (arg_match(&arg, &limit, argi)) + global->limit = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip, argi)) + global->skip_frames = arg_parse_uint(&arg); + else if (arg_match(&arg, &psnrarg, argi)) + global->show_psnr = 1; + else if (arg_match(&arg, &recontest, argi)) + global->test_decode = arg_parse_enum_or_int(&arg); + else if (arg_match(&arg, &framerate, argi)) { + global->framerate = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &global->framerate); + global->have_framerate = 1; + } else if (arg_match(&arg, &out_part, argi)) + global->out_part = 1; + else if (arg_match(&arg, &debugmode, argi)) + global->debug = 1; + else if (arg_match(&arg, &q_hist_n, argi)) + global->show_q_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &rate_hist_n, argi)) + global->show_rate_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &disable_warnings, argi)) + global->disable_warnings = 1; + else if (arg_match(&arg, &disable_warning_prompt, argi)) + global->disable_warning_prompt = 1; + else + argj++; + } + + if (global->pass) { + /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ + if (global->pass > global->passes) { + warn("Assuming --pass=%d implies --passes=%d\n", global->pass, + global->pass); + global->passes = global->pass; + } + } + /* Validate global config */ + if (global->passes == 0) { +#if CONFIG_VP9_ENCODER + // Make default VP9 passes = 2 until there is a better quality 1-pass + // encoder + if (global->codec != NULL && global->codec->name != NULL) + global->passes = (strcmp(global->codec->name, "vp9") == 0 && + global->deadline != VPX_DL_REALTIME) + ? 2 + : 1; +#else + global->passes = 1; +#endif + } + + if (global->deadline == VPX_DL_REALTIME && global->passes > 1) { + warn("Enforcing one-pass encoding in realtime mode\n"); + global->passes = 1; + } +} + +static struct stream_state *new_stream(struct VpxEncoderConfig *global, + struct stream_state *prev) { + struct stream_state *stream; + + stream = calloc(1, sizeof(*stream)); + if (stream == NULL) { + fatal("Failed to allocate new stream."); + } + + if (prev) { + memcpy(stream, prev, sizeof(*stream)); + stream->index++; + prev->next = stream; + } else { + vpx_codec_err_t res; + + /* Populate encoder configuration */ + res = vpx_codec_enc_config_default(global->codec->codec_interface(), + &stream->config.cfg, global->usage); + if (res) fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + + /* Change the default timebase to a high enough value so that the + * encoder will always create strictly increasing timestamps. + */ + stream->config.cfg.g_timebase.den = 1000; + + /* Never use the library's default resolution, require it be parsed + * from the file or set on the command line. + */ + stream->config.cfg.g_w = 0; + stream->config.cfg.g_h = 0; + + /* Initialize remaining stream parameters */ + stream->config.write_webm = 1; +#if CONFIG_WEBM_IO + stream->config.stereo_fmt = STEREO_FORMAT_MONO; + stream->webm_ctx.last_pts_ns = -1; + stream->webm_ctx.writer = NULL; + stream->webm_ctx.segment = NULL; +#endif + + /* Allows removal of the application version from the EBML tags */ + stream->webm_ctx.debug = global->debug; + + /* Default lag_in_frames is 0 in realtime mode CBR mode*/ + if (global->deadline == VPX_DL_REALTIME && + stream->config.cfg.rc_end_usage == VPX_CBR) + stream->config.cfg.g_lag_in_frames = 0; + } + + /* Output files must be specified for each stream */ + stream->config.out_fn = NULL; + + stream->next = NULL; + return stream; +} + +static int parse_stream_params(struct VpxEncoderConfig *global, + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; + static const arg_def_t **ctrl_args = no_args; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; +#if CONFIG_VP9_HIGHBITDEPTH + int test_16bit_internal = 0; +#endif + + // Handle codec specific options + if (0) { +#if CONFIG_VP8_ENCODER + } else if (strcmp(global->codec->name, "vp8") == 0) { + ctrl_args = vp8_args; + ctrl_args_map = vp8_arg_ctrl_map; +#endif +#if CONFIG_VP9_ENCODER + } else if (strcmp(global->codec->name, "vp9") == 0) { + ctrl_args = vp9_args; + ctrl_args_map = vp9_arg_ctrl_map; +#endif + } + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + /* Once we've found an end-of-stream marker (--) we want to continue + * shifting arguments but not consuming them. + */ + if (eos_mark_found) { + argj++; + continue; + } else if (!strcmp(*argj, "--")) { + eos_mark_found = 1; + continue; + } + + if (arg_match(&arg, &outputfile, argi)) { + config->out_fn = arg.val; + } else if (arg_match(&arg, &fpf_name, argi)) { + config->stats_fn = arg.val; + } else if (arg_match(&arg, &use_webm, argi)) { +#if CONFIG_WEBM_IO + config->write_webm = 1; +#else + die("Error: --webm specified but webm is disabled."); +#endif + } else if (arg_match(&arg, &use_ivf, argi)) { + config->write_webm = 0; + } else if (arg_match(&arg, &threads, argi)) { + config->cfg.g_threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &profile, argi)) { + config->cfg.g_profile = arg_parse_uint(&arg); + } else if (arg_match(&arg, &width, argi)) { + config->cfg.g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &height, argi)) { + config->cfg.g_h = arg_parse_uint(&arg); +#if CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &bitdeptharg, argi)) { + config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &inbitdeptharg, argi)) { + config->cfg.g_input_bit_depth = arg_parse_uint(&arg); +#endif +#if CONFIG_WEBM_IO + } else if (arg_match(&arg, &stereo_mode, argi)) { + config->stereo_fmt = arg_parse_enum_or_int(&arg); +#endif + } else if (arg_match(&arg, &timebase, argi)) { + config->cfg.g_timebase = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &config->cfg.g_timebase); + } else if (arg_match(&arg, &error_resilient, argi)) { + config->cfg.g_error_resilient = arg_parse_uint(&arg); + } else if (arg_match(&arg, &end_usage, argi)) { + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &lag_in_frames, argi)) { + config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + if (global->deadline == VPX_DL_REALTIME && + config->cfg.rc_end_usage == VPX_CBR && + config->cfg.g_lag_in_frames != 0) { + warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name); + config->cfg.g_lag_in_frames = 0; + } + } else if (arg_match(&arg, &dropframe_thresh, argi)) { + config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_allowed, argi)) { + config->cfg.rc_resize_allowed = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_width, argi)) { + config->cfg.rc_scaled_width = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_height, argi)) { + config->cfg.rc_scaled_height = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_up_thresh, argi)) { + config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_down_thresh, argi)) { + config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &end_usage, argi)) { + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &target_bitrate, argi)) { + config->cfg.rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &min_quantizer, argi)) { + config->cfg.rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_quantizer, argi)) { + config->cfg.rc_max_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &undershoot_pct, argi)) { + config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &overshoot_pct, argi)) { + config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_sz, argi)) { + config->cfg.rc_buf_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_initial_sz, argi)) { + config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_optimal_sz, argi)) { + config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &bias_pct, argi)) { + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &minsection_pct, argi)) { + config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &maxsection_pct, argi)) { + config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &corpus_complexity, argi)) { + config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &kf_min_dist, argi)) { + config->cfg.kf_min_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_max_dist, argi)) { + config->cfg.kf_max_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_disabled, argi)) { + config->cfg.kf_mode = VPX_KF_DISABLED; +#if CONFIG_VP9_ENCODER + } else if (arg_match(&arg, &use_vizier_rc_params, argi)) { + config->cfg.use_vizier_rc_params = arg_parse_int(&arg); + } else if (arg_match(&arg, &active_wq_factor, argi)) { + config->cfg.active_wq_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &err_per_mb_factor, argi)) { + config->cfg.err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { + config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_diff_factor, argi)) { + config->cfg.sr_diff_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) { + config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) { + config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) { + config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) { + config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) { + config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) { + config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) { + config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &zm_factor, argi)) { + config->cfg.zm_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { + config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) { + config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) { + config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg); +#endif +#if CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &test16bitinternalarg, argi)) { + if (strcmp(global->codec->name, "vp9") == 0) { + test_16bit_internal = 1; + } +#endif + } else { + int i, match = 0; + for (i = 0; ctrl_args[i]; i++) { + if (arg_match(&arg, ctrl_args[i], argi)) { + int j; + match = 1; + + /* Point either to the next free element or the first + * instance of this control. + */ + for (j = 0; j < config->arg_ctrl_cnt; j++) + if (ctrl_args_map != NULL && + config->arg_ctrls[j][0] == ctrl_args_map[i]) + break; + + /* Update/insert */ + assert(j < (int)ARG_CTRL_CNT_MAX); + if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) { + config->arg_ctrls[j][0] = ctrl_args_map[i]; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; + } + } + } + if (!match) argj++; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + if (strcmp(global->codec->name, "vp9") == 0) { + config->use_16bit_internal = + test_16bit_internal | (config->cfg.g_profile > 1); + } +#endif + return eos_mark_found; +} + +#define FOREACH_STREAM(func) \ + do { \ + struct stream_state *stream; \ + for (stream = streams; stream; stream = stream->next) { \ + func; \ + } \ + } while (0) + +static void validate_stream_config(const struct stream_state *stream, + const struct VpxEncoderConfig *global) { + const struct stream_state *streami; + (void)global; + + if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + " and --height (-h)", + stream->index); + + // Check that the codec bit depth is greater than the input bit depth. + if (stream->config.cfg.g_input_bit_depth > + (unsigned int)stream->config.cfg.g_bit_depth) { + fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)", + stream->index, (int)stream->config.cfg.g_bit_depth, + stream->config.cfg.g_input_bit_depth); + } + + for (streami = stream; streami; streami = streami->next) { + /* All streams require output files */ + if (!streami->config.out_fn) + fatal("Stream %d: Output file is required (specify with -o)", + streami->index); + + /* Check for two streams outputting to the same file */ + if (streami != stream) { + const char *a = stream->config.out_fn; + const char *b = streami->config.out_fn; + if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul")) + fatal("Stream %d: duplicate output file (from stream %d)", + streami->index, stream->index); + } + + /* Check for two streams sharing a stats file. */ + if (streami != stream) { + const char *a = stream->config.stats_fn; + const char *b = streami->config.stats_fn; + if (a && b && !strcmp(a, b)) + fatal("Stream %d: duplicate stats file (from stream %d)", + streami->index, stream->index); + } + } +} + +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, + unsigned int h) { + if (!stream->config.cfg.g_w) { + if (!stream->config.cfg.g_h) + stream->config.cfg.g_w = w; + else + stream->config.cfg.g_w = w * stream->config.cfg.g_h / h; + } + if (!stream->config.cfg.g_h) { + stream->config.cfg.g_h = h * stream->config.cfg.g_w / w; + } +} + +static const char *file_type_to_string(enum VideoFileType t) { + switch (t) { + case FILE_TYPE_RAW: return "RAW"; + case FILE_TYPE_Y4M: return "Y4M"; + default: return "Other"; + } +} + +static const char *image_format_to_string(vpx_img_fmt_t f) { + switch (f) { + case VPX_IMG_FMT_I420: return "I420"; + case VPX_IMG_FMT_I422: return "I422"; + case VPX_IMG_FMT_I444: return "I444"; + case VPX_IMG_FMT_I440: return "I440"; + case VPX_IMG_FMT_YV12: return "YV12"; + case VPX_IMG_FMT_I42016: return "I42016"; + case VPX_IMG_FMT_I42216: return "I42216"; + case VPX_IMG_FMT_I44416: return "I44416"; + case VPX_IMG_FMT_I44016: return "I44016"; + default: return "Other"; + } +} + +static void show_stream_config(struct stream_state *stream, + struct VpxEncoderConfig *global, + struct VpxInputContext *input) { +#define SHOW(field) \ + fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) + + if (stream->index == 0) { + fprintf(stderr, "Codec: %s\n", + vpx_codec_iface_name(global->codec->codec_interface())); + fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", + input->filename, file_type_to_string(input->file_type), + image_format_to_string(input->fmt)); + } + if (stream->next || stream->index) + fprintf(stderr, "\nStream Index: %d\n", stream->index); + fprintf(stderr, "Destination file: %s\n", stream->config.out_fn); + fprintf(stderr, "Encoder parameters:\n"); + + SHOW(g_usage); + SHOW(g_threads); + SHOW(g_profile); + SHOW(g_w); + SHOW(g_h); + SHOW(g_bit_depth); + SHOW(g_input_bit_depth); + SHOW(g_timebase.num); + SHOW(g_timebase.den); + SHOW(g_error_resilient); + SHOW(g_pass); + SHOW(g_lag_in_frames); + SHOW(rc_dropframe_thresh); + SHOW(rc_resize_allowed); + SHOW(rc_scaled_width); + SHOW(rc_scaled_height); + SHOW(rc_resize_up_thresh); + SHOW(rc_resize_down_thresh); + SHOW(rc_end_usage); + SHOW(rc_target_bitrate); + SHOW(rc_min_quantizer); + SHOW(rc_max_quantizer); + SHOW(rc_undershoot_pct); + SHOW(rc_overshoot_pct); + SHOW(rc_buf_sz); + SHOW(rc_buf_initial_sz); + SHOW(rc_buf_optimal_sz); + SHOW(rc_2pass_vbr_bias_pct); + SHOW(rc_2pass_vbr_minsection_pct); + SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(rc_2pass_vbr_corpus_complexity); + SHOW(kf_mode); + SHOW(kf_min_dist); + SHOW(kf_max_dist); + // Temporary use for debug + SHOW(use_vizier_rc_params); + SHOW(active_wq_factor.num); + SHOW(active_wq_factor.den); +} + +static void open_output_file(struct stream_state *stream, + struct VpxEncoderConfig *global, + const struct VpxRational *pixel_aspect_ratio) { + const char *fn = stream->config.out_fn; + const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; + + stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout); + + if (!stream->file) fatal("Failed to open output file"); + + if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) + fatal("WebM output to pipes not supported."); + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + stream->webm_ctx.stream = stream->file; + write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt, + global->codec->fourcc, pixel_aspect_ratio); + } +#else + (void)pixel_aspect_ratio; +#endif + + if (!stream->config.write_webm) { + ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0); + } +} + +static void close_output_file(struct stream_state *stream, + unsigned int fourcc) { + const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + write_webm_file_footer(&stream->webm_ctx); + } +#endif + + if (!stream->config.write_webm) { + if (!fseek(stream->file, 0, SEEK_SET)) + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, + stream->frames_out); + } + + fclose(stream->file); +} + +static void setup_pass(struct stream_state *stream, + struct VpxEncoderConfig *global, int pass) { + if (stream->config.stats_fn) { + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) + fatal("Failed to open statistics store"); + } else { + if (!stats_open_mem(&stream->stats, pass)) + fatal("Failed to open statistics store"); + } + + stream->config.cfg.g_pass = global->passes == 2 + ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS + : VPX_RC_ONE_PASS; + if (pass) { + stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); + } + + stream->cx_time = 0; + stream->nbytes = 0; + stream->frames_out = 0; +} + +static void initialize_encoder(struct stream_state *stream, + struct VpxEncoderConfig *global) { + int i; + int flags = 0; + + flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0; + flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0; +#if CONFIG_VP9_HIGHBITDEPTH + flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0; +#endif + + /* Construct Encoder Context */ + vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(), + &stream->config.cfg, flags); + ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder"); + + /* Note that we bypass the vpx_codec_control wrapper macro because + * we're being clever to store the control IDs in an array. Real + * applications will want to make use of the enumerations directly + */ + for (i = 0; i < stream->config.arg_ctrl_cnt; i++) { + int ctrl = stream->config.arg_ctrls[i][0]; + int value = stream->config.arg_ctrls[i][1]; + if (vpx_codec_control_(&stream->encoder, ctrl, value)) + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); + + ctx_exit_on_error(&stream->encoder, "Failed to control codec"); + } + +#if CONFIG_DECODERS + if (global->test_decode != TEST_DECODE_OFF) { + const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name); + vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), NULL, 0); + } +#endif +} + +static void encode_frame(struct stream_state *stream, + struct VpxEncoderConfig *global, struct vpx_image *img, + unsigned int frames_in) { + vpx_codec_pts_t frame_start, next_frame_start; + struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; + struct vpx_usec_timer timer; + + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + +/* Scale if necessary */ +#if CONFIG_VP9_HIGHBITDEPTH + if (img) { + if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) && + (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != VPX_IMG_FMT_I42016) { + fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) { + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); + } + I420Scale_16( + (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y], + stream->img->stride[VPX_PLANE_Y] / 2, + (uint16_t *)stream->img->planes[VPX_PLANE_U], + stream->img->stride[VPX_PLANE_U] / 2, + (uint16_t *)stream->img->planes[VPX_PLANE_V], + stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); +#endif + } + } +#endif + if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) { + fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); + I420Scale( + img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], + img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], + img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], img->d_w, img->d_h, + stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y], + stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U], + stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V], + stream->img->d_w, stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); +#endif + } + + vpx_usec_timer_start(&timer); + vpx_codec_encode(&stream->encoder, img, frame_start, + (unsigned long)(next_frame_start - frame_start), 0, + global->deadline); + vpx_usec_timer_mark(&timer); + stream->cx_time += vpx_usec_timer_elapsed(&timer); + ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame", + stream->index); +} + +static void update_quantizer_histogram(struct stream_state *stream) { + if (stream->config.cfg.g_pass != VPX_RC_FIRST_PASS) { + int q; + + vpx_codec_control(&stream->encoder, VP8E_GET_LAST_QUANTIZER_64, &q); + ctx_exit_on_error(&stream->encoder, "Failed to read quantizer"); + stream->counts[q]++; + } +} + +static void get_cx_data(struct stream_state *stream, + struct VpxEncoderConfig *global, int *got_data) { + const vpx_codec_cx_pkt_t *pkt; + const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; + vpx_codec_iter_t iter = NULL; + + *got_data = 0; + while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) { + static size_t fsize = 0; + static FileOffset ivf_header_pos = 0; + + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: + if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { + stream->frames_out++; + } + if (!global->quiet) + fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz); + + update_rate_histogram(stream->rate_hist, cfg, pkt); +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + write_webm_block(&stream->webm_ctx, cfg, pkt); + } +#endif + if (!stream->config.write_webm) { + if (pkt->data.frame.partition_id <= 0) { + ivf_header_pos = ftello(stream->file); + fsize = pkt->data.frame.sz; + + ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize); + } else { + fsize += pkt->data.frame.sz; + + if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { + const FileOffset currpos = ftello(stream->file); + fseeko(stream->file, ivf_header_pos, SEEK_SET); + ivf_write_frame_size(stream->file, fsize); + fseeko(stream->file, currpos, SEEK_SET); + } + } + + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, + stream->file); + } + stream->nbytes += pkt->data.raw.sz; + + *got_data = 1; +#if CONFIG_DECODERS + if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) { + vpx_codec_decode(&stream->decoder, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL, 0); + if (stream->decoder.err) { + warn_or_exit_on_error(&stream->decoder, + global->test_decode == TEST_DECODE_FATAL, + "Failed to decode frame %d in stream %d", + stream->frames_out + 1, stream->index); + stream->mismatch_seen = stream->frames_out + 1; + } + } +#endif + break; + case VPX_CODEC_STATS_PKT: + stream->frames_out++; + stats_write(&stream->stats, pkt->data.twopass_stats.buf, + pkt->data.twopass_stats.sz); + stream->nbytes += pkt->data.raw.sz; + break; + case VPX_CODEC_PSNR_PKT: + + if (global->show_psnr) { + int i; + + stream->psnr_sse_total += pkt->data.psnr.sse[0]; + stream->psnr_samples_total += pkt->data.psnr.samples[0]; + for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); + stream->psnr_totals[i] += pkt->data.psnr.psnr[i]; + } + stream->psnr_count++; + } + + break; + default: break; + } + } +} + +static void show_psnr(struct stream_state *stream, double peak) { + int i; + double ovpsnr; + + if (!stream->psnr_count) return; + + fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); + ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak, + (double)stream->psnr_sse_total); + fprintf(stderr, " %.3f", ovpsnr); + + for (i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count); + } + fprintf(stderr, "\n"); +} + +static float usec_to_fps(uint64_t usec, unsigned int frames) { + return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); +} + +static void test_decode(struct stream_state *stream, + enum TestDecodeFatality fatal, + const VpxInterface *codec) { + vpx_image_t enc_img, dec_img; + + if (stream->mismatch_seen) return; + + /* Get the internal reference frame */ + if (strcmp(codec->name, "vp8") == 0) { + struct vpx_ref_frame ref_enc, ref_dec; + int aligned_width = (stream->config.cfg.g_w + 15) & ~15; + int aligned_height = (stream->config.cfg.g_h + 15) & ~15; + + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); + enc_img = ref_enc.img; + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); + dec_img = ref_dec.img; + + ref_enc.frame_type = VP8_LAST_FRAME; + ref_dec.frame_type = VP8_LAST_FRAME; + vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc); + vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec); + } else { + struct vp9_ref_frame ref_enc, ref_dec; + + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif + } + ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); + ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + stream->decoder.err = 1; + warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, + "Stream %d: Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); + stream->mismatch_seen = stream->frames_out; + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} + +static void print_time(const char *label, int64_t etl) { + int64_t hours; + int64_t mins; + int64_t secs; + + if (etl >= 0) { + hours = etl / 3600; + etl -= hours * 3600; + mins = etl / 60; + etl -= mins * 60; + secs = etl; + + fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label, + hours, mins, secs); + } else { + fprintf(stderr, "[%3s unknown] ", label); + } +} + +int main(int argc, const char **argv_) { + int pass; + vpx_image_t raw; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_image_t raw_shift; + int allocated_raw_shift = 0; + int use_16bit_internal = 0; + int input_shift = 0; +#endif + int frame_avail, got_data; + + struct VpxInputContext input; + struct VpxEncoderConfig global; + struct stream_state *streams = NULL; + char **argv, **argi; + uint64_t cx_time = 0; + int stream_cnt = 0; + int res = 0; + + memset(&input, 0, sizeof(input)); + memset(&raw, 0, sizeof(raw)); + exec_name = argv_[0]; + + /* Setup default input stream settings */ + input.framerate.numerator = 30; + input.framerate.denominator = 1; + input.only_i420 = 1; + input.bit_depth = 0; + + /* First parse the global configuration values, because we want to apply + * other parameters on top of the default configuration provided by the + * codec. + */ + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + parse_global_config(&global, argv); + + if (argc < 3) usage_exit(); + + switch (global.color_type) { + case I420: input.fmt = VPX_IMG_FMT_I420; break; + case I422: input.fmt = VPX_IMG_FMT_I422; break; + case I444: input.fmt = VPX_IMG_FMT_I444; break; + case I440: input.fmt = VPX_IMG_FMT_I440; break; + case YV12: input.fmt = VPX_IMG_FMT_YV12; break; + case NV12: input.fmt = VPX_IMG_FMT_NV12; break; + } + + { + /* Now parse each stream's parameters. Using a local scope here + * due to the use of 'stream' as loop variable in FOREACH_STREAM + * loops + */ + struct stream_state *stream = NULL; + + do { + stream = new_stream(&global, stream); + stream_cnt++; + if (!streams) streams = stream; + } while (parse_stream_params(&global, stream, argv)); + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && argi[0][1]) + die("Error: Unrecognized option %s\n", *argi); + + FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt, &global, + &stream->config.cfg);); + + /* Handle non-option arguments */ + input.filename = argv[0]; + + if (!input.filename) { + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + + /* Decide if other chroma subsamplings than 4:2:0 are supported */ + if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0; + + for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) { + int frames_in = 0, seen_frames = 0; + int64_t estimated_time_left = -1; + int64_t average_rate = -1; + int64_t lagged_count = 0; + + open_input_file(&input); + + /* If the input file doesn't specify its w/h (raw files), try to get + * the data from the first stream's configuration. + */ + if (!input.width || !input.height) { + FOREACH_STREAM({ + if (stream->config.cfg.g_w && stream->config.cfg.g_h) { + input.width = stream->config.cfg.g_w; + input.height = stream->config.cfg.g_h; + break; + } + }); + } + + /* Update stream configurations from the input file's parameters */ + if (!input.width || !input.height) + fatal( + "Specify stream dimensions with --width (-w) " + " and --height (-h)"); + + /* If input file does not specify bit-depth but input-bit-depth parameter + * exists, assume that to be the input bit-depth. However, if the + * input-bit-depth paramter does not exist, assume the input bit-depth + * to be the same as the codec bit-depth. + */ + if (!input.bit_depth) { + FOREACH_STREAM({ + if (stream->config.cfg.g_input_bit_depth) + input.bit_depth = stream->config.cfg.g_input_bit_depth; + else + input.bit_depth = stream->config.cfg.g_input_bit_depth = + (int)stream->config.cfg.g_bit_depth; + }); + if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH; + } else { + FOREACH_STREAM( + { stream->config.cfg.g_input_bit_depth = input.bit_depth; }); + } + + FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height)); + FOREACH_STREAM(validate_stream_config(stream, &global)); + + /* Ensure that --passes and --pass are consistent. If --pass is set and + * --passes=2, ensure --fpf was set. + */ + if (global.pass && global.passes == 2) + FOREACH_STREAM({ + if (!stream->config.stats_fn) + die("Stream %d: Must specify --fpf when --pass=%d" + " and --passes=2\n", + stream->index, global.pass); + }); + +#if !CONFIG_WEBM_IO + FOREACH_STREAM({ + if (stream->config.write_webm) { + stream->config.write_webm = 0; + warn( + "vpxenc was compiled without WebM container support." + "Producing IVF output"); + } + }); +#endif + + /* Use the frame rate from the file only if none was specified + * on the command-line. + */ + if (!global.have_framerate) { + global.framerate.num = input.framerate.numerator; + global.framerate.den = input.framerate.denominator; + FOREACH_STREAM(stream->config.cfg.g_timebase.den = global.framerate.num; + stream->config.cfg.g_timebase.num = global.framerate.den); + } + + /* Show configuration */ + if (global.verbose && pass == 0) + FOREACH_STREAM(show_stream_config(stream, &global, &input)); + + if (pass == (global.pass ? global.pass - 1 : 0)) { + // The Y4M reader does its own allocation. + if (input.file_type != FILE_TYPE_Y4M) { + vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); + } + FOREACH_STREAM(stream->rate_hist = init_rate_histogram( + &stream->config.cfg, &global.framerate)); + } + + FOREACH_STREAM(setup_pass(stream, &global, pass)); + FOREACH_STREAM( + open_output_file(stream, &global, &input.pixel_aspect_ratio)); + FOREACH_STREAM(initialize_encoder(stream, &global)); + +#if CONFIG_VP9_HIGHBITDEPTH + if (strcmp(global.codec->name, "vp9") == 0) { + // Check to see if at least one stream uses 16 bit internal. + // Currently assume that the bit_depths for all streams using + // highbitdepth are the same. + FOREACH_STREAM({ + if (stream->config.use_16bit_internal) { + use_16bit_internal = 1; + } + if (stream->config.cfg.g_profile == 0) { + input_shift = 0; + } else { + input_shift = (int)stream->config.cfg.g_bit_depth - + stream->config.cfg.g_input_bit_depth; + } + }); + } +#endif + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + struct vpx_usec_timer timer; + + if (!global.limit || frames_in < global.limit) { + frame_avail = read_frame(&input, &raw); + + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? frames_in - global.skip_frames : 0; + + if (!global.quiet) { + float fps = usec_to_fps(cx_time, seen_frames); + fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); + + if (stream_cnt == 1) + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); + else + fprintf(stderr, "frame %4d ", frames_in); + + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", + cx_time > 9999999 ? cx_time / 1000 : cx_time, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, + fps >= 1.0 ? "fps" : "fpm"); + print_time("ETA", estimated_time_left); + } + + } else + frame_avail = 0; + + if (frames_in > global.skip_frames) { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_image_t *frame_to_encode; + if (input_shift || (use_16bit_internal && input.bit_depth == 8)) { + assert(use_16bit_internal); + // Input bit depth and stream bit depth do not match, so up + // shift frame to stream bit depth + if (!allocated_raw_shift) { + vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH, + input.width, input.height, 32); + allocated_raw_shift = 1; + } + vpx_img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + vpx_usec_timer_start(&timer); + if (use_16bit_internal) { + assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH); + FOREACH_STREAM({ + if (stream->config.use_16bit_internal) + encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, frames_in); + else + assert(0); + }); + } else { + assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0); + FOREACH_STREAM(encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, + frames_in)); + } +#else + vpx_usec_timer_start(&timer); + FOREACH_STREAM(encode_frame(stream, &global, frame_avail ? &raw : NULL, + frames_in)); +#endif + vpx_usec_timer_mark(&timer); + cx_time += vpx_usec_timer_elapsed(&timer); + + FOREACH_STREAM(update_quantizer_histogram(stream)); + + got_data = 0; + FOREACH_STREAM(get_cx_data(stream, &global, &got_data)); + + if (!got_data && input.length && streams != NULL && + !streams->frames_out) { + lagged_count = global.limit ? seen_frames : ftello(input.file); + } else if (input.length) { + int64_t remaining; + int64_t rate; + + if (global.limit) { + const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; + + rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); + } else { + const int64_t input_pos = ftello(input.file); + const int64_t input_pos_lagged = input_pos - lagged_count; + + rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; + remaining = input.length - input_pos + lagged_count; + } + + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; + estimated_time_left = average_rate ? remaining / average_rate : -1; + } + + if (got_data && global.test_decode != TEST_DECODE_OFF) + FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); + } + + fflush(stdout); + if (!global.quiet) fprintf(stderr, "\033[K"); + } + + if (stream_cnt > 1) fprintf(stderr, "\n"); + + if (!global.quiet) { + FOREACH_STREAM(fprintf( + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, + seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, + seen_frames + ? (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / + global.framerate.den / seen_frames + : 0, + stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time, + stream->cx_time > 9999999 ? "ms" : "us", + usec_to_fps(stream->cx_time, seen_frames))); + } + + if (global.show_psnr) { + if (global.codec->fourcc == VP9_FOURCC) { + FOREACH_STREAM( + show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1)); + } else { + FOREACH_STREAM(show_psnr(stream, 255.0)); + } + } + + FOREACH_STREAM(vpx_codec_destroy(&stream->encoder)); + + if (global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(vpx_codec_destroy(&stream->decoder)); + } + + close_input_file(&input); + + if (global.test_decode == TEST_DECODE_FATAL) { + FOREACH_STREAM(res |= stream->mismatch_seen); + } + FOREACH_STREAM(close_output_file(stream, global.codec->fourcc)); + + FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1)); + + if (global.pass) break; + } + + if (global.show_q_hist_buckets) + FOREACH_STREAM( + show_q_histogram(stream->counts, global.show_q_hist_buckets)); + + if (global.show_rate_hist_buckets) + FOREACH_STREAM(show_rate_histogram(stream->rate_hist, &stream->config.cfg, + global.show_rate_hist_buckets)); + FOREACH_STREAM(destroy_rate_histogram(stream->rate_hist)); + +#if CONFIG_INTERNAL_STATS + /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now, + * to match some existing utilities. + */ + if (!(global.pass == 1 && global.passes == 2)) + FOREACH_STREAM({ + FILE *f = fopen("opsnr.stt", "a"); + if (stream->mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", + stream->mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); + }); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (allocated_raw_shift) vpx_img_free(&raw_shift); +#endif + vpx_img_free(&raw); + free(argv); + free(streams); + return res ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/media/libvpx/libvpx/vpxenc.h b/media/libvpx/libvpx/vpxenc.h new file mode 100644 index 0000000000..be54840f7d --- /dev/null +++ b/media/libvpx/libvpx/vpxenc.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPXENC_H_ +#define VPX_VPXENC_H_ + +#include "vpx/vpx_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum TestDecodeFatality { + TEST_DECODE_OFF, + TEST_DECODE_FATAL, + TEST_DECODE_WARN, +}; + +typedef enum { + I420, // 4:2:0 8+ bit-depth + I422, // 4:2:2 8+ bit-depth + I444, // 4:4:4 8+ bit-depth + I440, // 4:4:0 8+ bit-depth + YV12, // 4:2:0 with uv flipped, only 8-bit depth + NV12, // 4:2:0 with uv interleaved +} ColorInputType; + +struct VpxInterface; + +/* Configuration elements common to all streams. */ +struct VpxEncoderConfig { + const struct VpxInterface *codec; + int passes; + int pass; + int usage; + int deadline; + ColorInputType color_type; + int quiet; + int verbose; + int limit; + int skip_frames; + int show_psnr; + enum TestDecodeFatality test_decode; + int have_framerate; + struct vpx_rational framerate; + int out_part; + int debug; + int show_q_hist_buckets; + int show_rate_hist_buckets; + int disable_warnings; + int disable_warning_prompt; + int experimental_bitstream; +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPXENC_H_ diff --git a/media/libvpx/libvpx/vpxstats.c b/media/libvpx/libvpx/vpxstats.c new file mode 100644 index 0000000000..c0dd14e450 --- /dev/null +++ b/media/libvpx/libvpx/vpxstats.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpxstats.h" + +#include +#include +#include + +#include "./tools_common.h" + +int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { + int res; + stats->pass = pass; + + if (pass == 0) { + stats->file = fopen(fpf, "wb"); + stats->buf.sz = 0; + stats->buf.buf = NULL; + res = (stats->file != NULL); + } else { + size_t nbytes; + + stats->file = fopen(fpf, "rb"); + + if (stats->file == NULL) fatal("First-pass stats file does not exist!"); + + if (fseek(stats->file, 0, SEEK_END)) + fatal("First-pass stats file must be seekable!"); + + stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file); + rewind(stats->file); + + stats->buf.buf = malloc(stats->buf_alloc_sz); + + if (!stats->buf.buf) + fatal("Failed to allocate first-pass stats buffer (%u bytes)", + (unsigned int)stats->buf_alloc_sz); + + nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file); + res = (nbytes == stats->buf.sz); + } + + return res; +} + +int stats_open_mem(stats_io_t *stats, int pass) { + int res; + stats->pass = pass; + + if (!pass) { + stats->buf.sz = 0; + stats->buf_alloc_sz = 64 * 1024; + stats->buf.buf = malloc(stats->buf_alloc_sz); + } + + stats->buf_ptr = stats->buf.buf; + res = (stats->buf.buf != NULL); + return res; +} + +void stats_close(stats_io_t *stats, int last_pass) { + if (stats->file) { + if (stats->pass == last_pass) { + free(stats->buf.buf); + } + + fclose(stats->file); + stats->file = NULL; + } else { + if (stats->pass == last_pass) free(stats->buf.buf); + } +} + +void stats_write(stats_io_t *stats, const void *pkt, size_t len) { + if (stats->file) { + (void)fwrite(pkt, 1, len, stats->file); + } else { + if (stats->buf.sz + len > stats->buf_alloc_sz) { + size_t new_sz = stats->buf_alloc_sz + 64 * 1024; + char *new_ptr = realloc(stats->buf.buf, new_sz); + + if (new_ptr) { + stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf); + stats->buf.buf = new_ptr; + stats->buf_alloc_sz = new_sz; + } else { + fatal("Failed to realloc firstpass stats buffer."); + } + } + + memcpy(stats->buf_ptr, pkt, len); + stats->buf.sz += len; + stats->buf_ptr += len; + } +} + +vpx_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; } diff --git a/media/libvpx/libvpx/vpxstats.h b/media/libvpx/libvpx/vpxstats.h new file mode 100644 index 0000000000..3625ee3291 --- /dev/null +++ b/media/libvpx/libvpx/vpxstats.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPXSTATS_H_ +#define VPX_VPXSTATS_H_ + +#include + +#include "vpx/vpx_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* This structure is used to abstract the different ways of handling + * first pass statistics + */ +typedef struct { + vpx_fixed_buf_t buf; + int pass; + FILE *file; + char *buf_ptr; + size_t buf_alloc_sz; +} stats_io_t; + +int stats_open_file(stats_io_t *stats, const char *fpf, int pass); +int stats_open_mem(stats_io_t *stats, int pass); +void stats_close(stats_io_t *stats, int last_pass); +void stats_write(stats_io_t *stats, const void *pkt, size_t len); +vpx_fixed_buf_t stats_get(stats_io_t *stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPXSTATS_H_ diff --git a/media/libvpx/libvpx/warnings.c b/media/libvpx/libvpx/warnings.c new file mode 100644 index 0000000000..3e6e702536 --- /dev/null +++ b/media/libvpx/libvpx/warnings.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./warnings.h" + +#include +#include +#include +#include + +#include "vpx/vpx_encoder.h" + +#include "./tools_common.h" +#include "./vpxenc.h" + +static const char quantizer_warning_string[] = + "Bad quantizer values. Quantizer values should not be equal, and should " + "differ by at least 8."; +static const char lag_in_frames_with_realtime[] = + "Lag in frames is ignored when deadline is set to realtime for cbr mode."; + +struct WarningListNode { + const char *warning_string; + struct WarningListNode *next_warning; +}; + +struct WarningList { + struct WarningListNode *warning_node; +}; + +static void add_warning(const char *warning_string, + struct WarningList *warning_list) { + struct WarningListNode **node = &warning_list->warning_node; + + struct WarningListNode *new_node = malloc(sizeof(*new_node)); + if (new_node == NULL) { + fatal("Unable to allocate warning node."); + } + + new_node->warning_string = warning_string; + new_node->next_warning = NULL; + + while (*node != NULL) node = &(*node)->next_warning; + + *node = new_node; +} + +static void free_warning_list(struct WarningList *warning_list) { + while (warning_list->warning_node != NULL) { + struct WarningListNode *const node = warning_list->warning_node; + warning_list->warning_node = node->next_warning; + free(node); + } +} + +static int continue_prompt(int num_warnings) { + int c; + fprintf(stderr, + "%d encoder configuration warning(s). Continue? (y to continue) ", + num_warnings); + c = getchar(); + return c == 'y'; +} + +static void check_quantizer(int min_q, int max_q, + struct WarningList *warning_list) { + const int lossless = min_q == 0 && max_q == 0; + if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8)) + add_warning(quantizer_warning_string, warning_list); +} + +static void check_lag_in_frames_realtime_deadline( + int lag_in_frames, int deadline, int rc_end_usage, + struct WarningList *warning_list) { + if (deadline == VPX_DL_REALTIME && lag_in_frames != 0 && rc_end_usage == 1) + add_warning(lag_in_frames_with_realtime, warning_list); +} + +void check_encoder_config(int disable_prompt, + const struct VpxEncoderConfig *global_config, + const struct vpx_codec_enc_cfg *stream_config) { + int num_warnings = 0; + struct WarningListNode *warning = NULL; + struct WarningList warning_list = { 0 }; + + check_quantizer(stream_config->rc_min_quantizer, + stream_config->rc_max_quantizer, &warning_list); + check_lag_in_frames_realtime_deadline( + stream_config->g_lag_in_frames, global_config->deadline, + stream_config->rc_end_usage, &warning_list); + /* Count and print warnings. */ + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { + warn("%s", warning->warning_string); + } + + free_warning_list(&warning_list); + + if (num_warnings) { + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); + } +} diff --git a/media/libvpx/libvpx/warnings.h b/media/libvpx/libvpx/warnings.h new file mode 100644 index 0000000000..15558c6437 --- /dev/null +++ b/media/libvpx/libvpx/warnings.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_WARNINGS_H_ +#define VPX_WARNINGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct vpx_codec_enc_cfg; +struct VpxEncoderConfig; + +/* + * Checks config for improperly used settings. Warns user upon encountering + * settings that will lead to poor output quality. Prompts user to continue + * when warnings are issued. + */ +void check_encoder_config(int disable_prompt, + const struct VpxEncoderConfig *global_config, + const struct vpx_codec_enc_cfg *stream_config); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_WARNINGS_H_ diff --git a/media/libvpx/libvpx/webmdec.cc b/media/libvpx/libvpx/webmdec.cc new file mode 100644 index 0000000000..f7671bb641 --- /dev/null +++ b/media/libvpx/libvpx/webmdec.cc @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./webmdec.h" + +#include +#include + +#include "third_party/libwebm/mkvparser/mkvparser.h" +#include "third_party/libwebm/mkvparser/mkvreader.h" + +namespace { + +void reset(struct WebmInputContext *const webm_ctx) { + if (webm_ctx->reader != nullptr) { + mkvparser::MkvReader *const reader = + reinterpret_cast(webm_ctx->reader); + delete reader; + } + if (webm_ctx->segment != nullptr) { + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + delete segment; + } + if (webm_ctx->buffer != nullptr) { + delete[] webm_ctx->buffer; + } + webm_ctx->reader = nullptr; + webm_ctx->segment = nullptr; + webm_ctx->buffer = nullptr; + webm_ctx->cluster = nullptr; + webm_ctx->block_entry = nullptr; + webm_ctx->block = nullptr; + webm_ctx->block_frame_index = 0; + webm_ctx->video_track_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->is_key_frame = false; +} + +void get_first_cluster(struct WebmInputContext *const webm_ctx) { + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + const mkvparser::Cluster *const cluster = segment->GetFirst(); + webm_ctx->cluster = cluster; +} + +void rewind_and_reset(struct WebmInputContext *const webm_ctx, + struct VpxInputContext *const vpx_ctx) { + rewind(vpx_ctx->file); + reset(webm_ctx); +} + +} // namespace + +int file_is_webm(struct WebmInputContext *webm_ctx, + struct VpxInputContext *vpx_ctx) { + mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file); + webm_ctx->reader = reader; + webm_ctx->reached_eos = 0; + + mkvparser::EBMLHeader header; + long long pos = 0; + if (header.Parse(reader, pos) < 0) { + rewind_and_reset(webm_ctx, vpx_ctx); + return 0; + } + + mkvparser::Segment *segment; + if (mkvparser::Segment::CreateInstance(reader, pos, segment)) { + rewind_and_reset(webm_ctx, vpx_ctx); + return 0; + } + webm_ctx->segment = segment; + if (segment->Load() < 0) { + rewind_and_reset(webm_ctx, vpx_ctx); + return 0; + } + + const mkvparser::Tracks *const tracks = segment->GetTracks(); + const mkvparser::VideoTrack *video_track = nullptr; + for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) { + const mkvparser::Track *const track = tracks->GetTrackByIndex(i); + if (track->GetType() == mkvparser::Track::kVideo) { + video_track = static_cast(track); + webm_ctx->video_track_index = static_cast(track->GetNumber()); + break; + } + } + + if (video_track == nullptr || video_track->GetCodecId() == nullptr) { + rewind_and_reset(webm_ctx, vpx_ctx); + return 0; + } + + if (!strncmp(video_track->GetCodecId(), "V_VP8", 5)) { + vpx_ctx->fourcc = VP8_FOURCC; + } else if (!strncmp(video_track->GetCodecId(), "V_VP9", 5)) { + vpx_ctx->fourcc = VP9_FOURCC; + } else { + rewind_and_reset(webm_ctx, vpx_ctx); + return 0; + } + + vpx_ctx->framerate.denominator = 0; + vpx_ctx->framerate.numerator = 0; + vpx_ctx->width = static_cast(video_track->GetWidth()); + vpx_ctx->height = static_cast(video_track->GetHeight()); + + get_first_cluster(webm_ctx); + + return 1; +} + +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *buffer_size) { + // This check is needed for frame parallel decoding, in which case this + // function could be called even after it has reached end of input stream. + if (webm_ctx->reached_eos) { + return 1; + } + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + const mkvparser::Cluster *cluster = + reinterpret_cast(webm_ctx->cluster); + const mkvparser::Block *block = + reinterpret_cast(webm_ctx->block); + const mkvparser::BlockEntry *block_entry = + reinterpret_cast(webm_ctx->block_entry); + bool block_entry_eos = false; + do { + long status = 0; + bool get_new_block = false; + if (block_entry == nullptr && !block_entry_eos) { + status = cluster->GetFirst(block_entry); + get_new_block = true; + } else if (block_entry_eos || block_entry->EOS()) { + cluster = segment->GetNext(cluster); + if (cluster == nullptr || cluster->EOS()) { + *buffer_size = 0; + webm_ctx->reached_eos = 1; + return 1; + } + status = cluster->GetFirst(block_entry); + block_entry_eos = false; + get_new_block = true; + } else if (block == nullptr || + webm_ctx->block_frame_index == block->GetFrameCount() || + block->GetTrackNumber() != webm_ctx->video_track_index) { + status = cluster->GetNext(block_entry, block_entry); + if (block_entry == nullptr || block_entry->EOS()) { + block_entry_eos = true; + continue; + } + get_new_block = true; + } + if (status || block_entry == nullptr) { + return -1; + } + if (get_new_block) { + block = block_entry->GetBlock(); + if (block == nullptr) return -1; + webm_ctx->block_frame_index = 0; + } + } while (block_entry_eos || + block->GetTrackNumber() != webm_ctx->video_track_index); + + webm_ctx->cluster = cluster; + webm_ctx->block_entry = block_entry; + webm_ctx->block = block; + + const mkvparser::Block::Frame &frame = + block->GetFrame(webm_ctx->block_frame_index); + ++webm_ctx->block_frame_index; + if (frame.len > static_cast(*buffer_size)) { + delete[] * buffer; + *buffer = new uint8_t[frame.len]; + if (*buffer == nullptr) { + return -1; + } + webm_ctx->buffer = *buffer; + } + *buffer_size = frame.len; + webm_ctx->timestamp_ns = block->GetTime(cluster); + webm_ctx->is_key_frame = block->IsKey(); + + mkvparser::MkvReader *const reader = + reinterpret_cast(webm_ctx->reader); + return frame.Read(reader, *buffer) ? -1 : 0; +} + +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct VpxInputContext *vpx_ctx) { + uint32_t i = 0; + uint8_t *buffer = nullptr; + size_t buffer_size = 0; + while (webm_ctx->timestamp_ns < 1000000000 && i < 50) { + if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) { + break; + } + ++i; + } + vpx_ctx->framerate.numerator = (i - 1) * 1000000; + vpx_ctx->framerate.denominator = + static_cast(webm_ctx->timestamp_ns / 1000); + delete[] buffer; + // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame(). + webm_ctx->buffer = nullptr; + + get_first_cluster(webm_ctx); + webm_ctx->block = nullptr; + webm_ctx->block_entry = nullptr; + webm_ctx->block_frame_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->reached_eos = 0; + + return 0; +} + +void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); } diff --git a/media/libvpx/libvpx/webmdec.h b/media/libvpx/libvpx/webmdec.h new file mode 100644 index 0000000000..6ae7ee16d0 --- /dev/null +++ b/media/libvpx/libvpx/webmdec.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_WEBMDEC_H_ +#define VPX_WEBMDEC_H_ + +#include "./tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VpxInputContext; + +struct WebmInputContext { + void *reader; + void *segment; + uint8_t *buffer; + const void *cluster; + const void *block_entry; + const void *block; + int block_frame_index; + int video_track_index; + int64_t timestamp_ns; + int is_key_frame; + int reached_eos; +}; + +// Checks if the input is a WebM file. If so, initializes WebMInputContext so +// that webm_read_frame can be called to retrieve a video frame. +// Returns 1 on success and 0 on failure or input is not WebM file. +// TODO(vigneshv): Refactor this function into two smaller functions specific +// to their task. +int file_is_webm(struct WebmInputContext *webm_ctx, + struct VpxInputContext *vpx_ctx); + +// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed +// by this function. For the first call, |buffer| should be NULL and +// |*buffer_size| should be 0. Once all the frames are read and used, +// webm_free() should be called, otherwise there will be a leak. +// Parameters: +// webm_ctx - WebmInputContext object +// buffer - pointer where the frame data will be filled. +// buffer_size - pointer to buffer size. +// Return values: +// 0 - Success +// 1 - End of Stream +// -1 - Error +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *buffer_size); + +// Guesses the frame rate of the input file based on the container timestamps. +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct VpxInputContext *vpx_ctx); + +// Resets the WebMInputContext. +void webm_free(struct WebmInputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_WEBMDEC_H_ diff --git a/media/libvpx/libvpx/webmenc.cc b/media/libvpx/libvpx/webmenc.cc new file mode 100644 index 0000000000..c718ab5a9f --- /dev/null +++ b/media/libvpx/libvpx/webmenc.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./webmenc.h" + +#include + +#include "third_party/libwebm/mkvmuxer/mkvmuxer.h" +#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h" +#include "third_party/libwebm/mkvmuxer/mkvwriter.h" + +namespace { +const uint64_t kDebugTrackUid = 0xDEADBEEF; +const int kVideoTrackNumber = 1; +} // namespace + +void write_webm_file_header(struct WebmOutputContext *webm_ctx, + const vpx_codec_enc_cfg_t *cfg, + stereo_format_t stereo_fmt, unsigned int fourcc, + const struct VpxRational *par) { + mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream); + mkvmuxer::Segment *const segment = new mkvmuxer::Segment(); + segment->Init(writer); + segment->set_mode(mkvmuxer::Segment::kFile); + segment->OutputCues(true); + + mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo(); + const uint64_t kTimecodeScale = 1000000; + info->set_timecode_scale(kTimecodeScale); + std::string version = "vpxenc"; + if (!webm_ctx->debug) { + version.append(std::string(" ") + vpx_codec_version_str()); + } + info->set_writing_app(version.c_str()); + + const uint64_t video_track_id = + segment->AddVideoTrack(static_cast(cfg->g_w), + static_cast(cfg->g_h), kVideoTrackNumber); + mkvmuxer::VideoTrack *const video_track = static_cast( + segment->GetTrackByNumber(video_track_id)); + video_track->SetStereoMode(stereo_fmt); + const char *codec_id; + switch (fourcc) { + case VP8_FOURCC: codec_id = "V_VP8"; break; + case VP9_FOURCC: + default: codec_id = "V_VP9"; break; + } + video_track->set_codec_id(codec_id); + if (par->numerator > 1 || par->denominator > 1) { + // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type + // to WebM format. + const uint64_t display_width = static_cast( + ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5); + video_track->set_display_width(display_width); + video_track->set_display_height(cfg->g_h); + } + if (webm_ctx->debug) { + video_track->set_uid(kDebugTrackUid); + } + webm_ctx->writer = writer; + webm_ctx->segment = segment; +} + +void write_webm_block(struct WebmOutputContext *webm_ctx, + const vpx_codec_enc_cfg_t *cfg, + const vpx_codec_cx_pkt_t *pkt) { + mkvmuxer::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num / + cfg->g_timebase.den; + if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000; + webm_ctx->last_pts_ns = pts_ns; + + segment->AddFrame(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, kVideoTrackNumber, pts_ns, + pkt->data.frame.flags & VPX_FRAME_IS_KEY); +} + +void write_webm_file_footer(struct WebmOutputContext *webm_ctx) { + mkvmuxer::MkvWriter *const writer = + reinterpret_cast(webm_ctx->writer); + mkvmuxer::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + segment->Finalize(); + delete segment; + delete writer; + webm_ctx->writer = nullptr; + webm_ctx->segment = nullptr; +} diff --git a/media/libvpx/libvpx/webmenc.h b/media/libvpx/libvpx/webmenc.h new file mode 100644 index 0000000000..4176e82081 --- /dev/null +++ b/media/libvpx/libvpx/webmenc.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_WEBMENC_H_ +#define VPX_WEBMENC_H_ + +#include +#include + +#include "tools_common.h" +#include "vpx/vpx_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct WebmOutputContext { + int debug; + FILE *stream; + int64_t last_pts_ns; + void *writer; + void *segment; +}; + +/* Stereo 3D packed frame format */ +typedef enum stereo_format { + STEREO_FORMAT_MONO = 0, + STEREO_FORMAT_LEFT_RIGHT = 1, + STEREO_FORMAT_BOTTOM_TOP = 2, + STEREO_FORMAT_TOP_BOTTOM = 3, + STEREO_FORMAT_RIGHT_LEFT = 11 +} stereo_format_t; + +void write_webm_file_header(struct WebmOutputContext *webm_ctx, + const vpx_codec_enc_cfg_t *cfg, + stereo_format_t stereo_fmt, unsigned int fourcc, + const struct VpxRational *par); + +void write_webm_block(struct WebmOutputContext *webm_ctx, + const vpx_codec_enc_cfg_t *cfg, + const vpx_codec_cx_pkt_t *pkt); + +void write_webm_file_footer(struct WebmOutputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_WEBMENC_H_ diff --git a/media/libvpx/libvpx/y4menc.c b/media/libvpx/libvpx/y4menc.c new file mode 100644 index 0000000000..1877981279 --- /dev/null +++ b/media/libvpx/libvpx/y4menc.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./y4menc.h" + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct VpxRational *framerate, + vpx_img_fmt_t fmt, unsigned int bit_depth) { + const char *color; + switch (bit_depth) { + case 8: + color = fmt == VPX_IMG_FMT_I444 ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" + : "C420jpeg\n"; + break; + case 9: + color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; + break; + case 10: + color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; + break; + case 12: + color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; + break; + case 14: + color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; + break; + case 16: + color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; + break; + default: color = NULL; assert(0); + } + return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height, + framerate->numerator, framerate->denominator, 'p', color); +} + +int y4m_write_frame_header(char *buf, size_t len) { + return snprintf(buf, len, "FRAME\n"); +} diff --git a/media/libvpx/libvpx/y4menc.h b/media/libvpx/libvpx/y4menc.h new file mode 100644 index 0000000000..9a367e34c6 --- /dev/null +++ b/media/libvpx/libvpx/y4menc.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_Y4MENC_H_ +#define VPX_Y4MENC_H_ + +#include "./tools_common.h" + +#include "vpx/vpx_decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define Y4M_BUFFER_SIZE 128 + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct VpxRational *framerate, + vpx_img_fmt_t fmt, unsigned int bit_depth); +int y4m_write_frame_header(char *buf, size_t len); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_Y4MENC_H_ diff --git a/media/libvpx/libvpx/y4minput.c b/media/libvpx/libvpx/y4minput.c new file mode 100644 index 0000000000..210ce52fce --- /dev/null +++ b/media/libvpx/libvpx/y4minput.c @@ -0,0 +1,1170 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Based on code from the OggTheora software codec source code, + * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. + */ +#include +#include +#include +#include + +#include "vpx/vpx_integer.h" +#include "y4minput.h" + +// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. +// Returns true on success. +static int file_read(void *buf, size_t size, FILE *file) { + const int kMaxTries = 5; + int try_count = 0; + int file_error = 0; + size_t len = 0; + while (!feof(file) && len < size && try_count < kMaxTries) { + const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); + ++try_count; + len += n; + file_error = ferror(file); + if (file_error) { + if (errno == EINTR || errno == EAGAIN) { + clearerr(file); + continue; + } else { + fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n", + (uint32_t)len, (uint32_t)size, errno, strerror(errno)); + return 0; + } + } + } + + if (!feof(file) && len != size) { + fprintf(stderr, + "Error reading file: %u of %u bytes read," + " error: %d, tries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, try_count, errno, + strerror(errno)); + } + return len == size; +} + +static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { + char *p; + char *q; + for (p = _tags;; p = q) { + /*Skip any leading spaces.*/ + while (*p == ' ') p++; + /*If that's all we have, stop.*/ + if (p[0] == '\0') break; + /*Find the end of this tag.*/ + for (q = p + 1; *q != '\0' && *q != ' '; q++) { + } + /*Process the tag.*/ + switch (p[0]) { + case 'W': { + if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; + break; + } + case 'H': { + if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; + break; + } + case 'F': { + if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { + return -1; + } + break; + } + case 'I': { + _y4m->interlace = p[1]; + break; + } + case 'A': { + if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { + return -1; + } + break; + } + case 'C': { + if (q - p > 16) return -1; + memcpy(_y4m->chroma_type, p + 1, q - p - 1); + _y4m->chroma_type[q - p - 1] = '\0'; + break; + } + /*Ignore unknown tags.*/ + } + } + return 0; +} + +// Copy a single tag into the buffer, along with a null character. +// Returns 0 if any file IO errors occur. +static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { + size_t i; + assert(buf_len >= 1); + // Skip leading space characters. + do { + if (!file_read(buf, 1, file)) { + return 0; + } + } while (buf[0] == ' '); + + // If we hit the newline, treat this as the "empty" tag. + if (buf[0] == '\n') { + buf[0] = '\0'; + *end_tag = '\n'; + return 1; + } + + // Copy over characters until a space is hit, or the buffer is exhausted. + for (i = 1; i < buf_len; ++i) { + if (!file_read(buf + i, 1, file)) { + return 0; + } + if (buf[i] == ' ' || buf[i] == '\n') { + break; + } + } + if (i == buf_len) { + fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", + (unsigned long)i); + return 0; + } + *end_tag = buf[i]; + buf[i] = '\0'; + return 1; +} + +/* Returns 1 if tags were parsed successfully, 0 otherwise. */ +static int parse_tags(y4m_input *y4m_ctx, FILE *file) { + char tag[256]; + char end; /* Character denoting the end of the tag, ' ' or '\n'. */ + /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory + fields are marked with -1 and will be checked after the tags are parsed. */ + y4m_ctx->pic_w = -1; + y4m_ctx->pic_h = -1; + y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ + y4m_ctx->par_n = 0; + y4m_ctx->par_d = 0; + y4m_ctx->interlace = '?'; + snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); + + /* Find one tag at a time. */ + do { + if (!copy_tag(tag, sizeof(tag), &end, file)) { + return 0; + } + /* y4m_parse_tags returns 0 on success. */ + if (y4m_parse_tags(y4m_ctx, tag)) { + return 0; + } + } while (end != '\n'); + + /* Check the mandatory fields. */ + if (y4m_ctx->pic_w == -1) { + fprintf(stderr, "Width field missing\n"); + return 0; + } + if (y4m_ctx->pic_h == -1) { + fprintf(stderr, "Height field missing\n"); + return 0; + } + if (y4m_ctx->fps_n == -1) { + fprintf(stderr, "FPS field missing\n"); + return 0; + } + return 1; +} + +/*All anti-aliasing filters in the following conversion functions are based on + one of two window functions: + The 6-tap Lanczos window (for down-sampling and shifts): + sinc(\pi*t)*sinc(\pi*t/3), |t|<3 (sinc(t)==sin(t)/t) + 0, |t|>=3 + The 4-tap Mitchell window (for up-sampling): + 7|t|^3-12|t|^2+16/3, |t|<1 + -(7/3)|x|^3+12|x|^2-20|x|+32/3, |t|<2 + 0, |t|>=2 + The number of taps is intentionally kept small to reduce computational + overhead and limit ringing. + + The taps from these filters are scaled so that their sum is 1, and the + result is scaled by 128 and rounded to integers to create a filter whose + intermediate values fit inside 16 bits. + Coefficients are rounded in such a way as to ensure their sum is still 128, + which is usually equivalent to normal rounding. + + Conversions which require both horizontal and vertical filtering could + have these steps pipelined, for less memory consumption and better cache + performance, but we do them separately for simplicity.*/ +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420mpeg2 chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + The 4:2:2 modes look exactly the same, except there are twice as many chroma + lines, and they are vertically co-sited with the luma samples in both the + mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ +static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + for (y = 0; y < _c_h; y++) { + /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos + window.*/ + for (x = 0; x < OC_MINI(_c_w, 2); x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, + 255); + } + for (; x < _c_w - 3; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); + } + for (; x < _c_w; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); + } + _dst += _c_w; + _src += _c_w; + } +} + +/*This format is only used for interlaced content, but is included for + completeness. + + 420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420paldv chroma samples are sited like: + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + Then we use another filter to move the C_r location down one quarter pixel, + and the C_b location up one quarter pixel.*/ +static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + 1) / 2; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_sz = c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*First do the horizontal re-sampling. + This is the same as the mpeg2 case, except that after the horizontal + case, we need to apply a second vertical filter.*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + _aux += c_sz; + switch (pli) { + case 1: { + /*Slide C_b up a quarter-pel. + This is the same filter used above, but in the other order.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 3); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 2; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + _dst += c_sz - c_w; + tmp -= c_w; + break; + } + case 2: { + /*Slide C_r down a quarter-pel. + This is the same as the horizontal filter.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 2); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 3; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + break; + } + } + /*For actual interlaced material, this would have to be done separately on + each field, and the shift amounts would be different. + C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8, + C_b up 1/8 in the bottom field. + The corresponding filters would be: + Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128 + Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/ + } +} + +/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. + This is used as a helper by several converation routines.*/ +static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (x = 0; x < _c_w; x++) { + for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); + } + for (; y < _c_h - 3; y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); + } + for (; y < _c_h; y += 2) { + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); + } + _src++; + _dst++; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422jpeg chroma samples are sited like: + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to decimate the chroma planes by two in the + vertical direction.*/ +static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + for (pli = 1; pli < 3; pli++) { + y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422 chroma samples are sited like: + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the original site locations one quarter + pixel (at the original chroma resolution) to the right. + Then we use a second resampling filter to decimate the chroma planes by two + in the vertical direction.*/ +static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = c_w * dst_c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 411 chroma samples are sited like: + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a filter to resample at site locations one eighth pixel (at the source + chroma plane's horizontal resolution) and five eighths of a pixel to the + right. + Then we use another filter to decimate the planes by 2 in the vertical + direction.*/ +static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + for (y = 0; y < c_h; y++) { + /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a + 4-tap Mitchell window.*/ + for (x = 0; x < OC_MINI(c_w, 1); x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 2; x++) { + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); + } + for (; x < c_w; x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, + 255); + if ((x << 1 | 1) < dst_c_w) { + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); + } + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*Convert 444 to 420jpeg.*/ +static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (y = 0; y < c_h; y++) { + for (x = 0; x < OC_MINI(c_w, 2); x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 3; x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); + } + for (; x < c_w; x += 2) { + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*The image is padded with empty chroma components at 4:2:0.*/ +static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_sz; + (void)_aux; + _dst += _y4m->pic_w * _y4m->pic_h; + c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + memset(_dst, 128, c_sz * 2); +} + +/*No conversion function needed.*/ +static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + (void)_y4m; + (void)_dst; + (void)_aux; +} + +static const char TAG[] = "YUV4MPEG2"; + +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420) { + // File must start with |TAG|. + char tag_buffer[9]; // 9 == strlen(TAG) + // Read as much as possible from |skip_buffer|, which were characters + // that were previously read from the file to do input-type detection. + assert(num_skip >= 0 && num_skip <= 8); + if (num_skip > 0) { + memcpy(tag_buffer, skip_buffer, num_skip); + } + // Start reading from the file now that the |skip_buffer| is depleted. + if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { + return -1; + } + if (memcmp(TAG, tag_buffer, 9) != 0) { + fprintf(stderr, "Error parsing header: must start with %s\n", TAG); + return -1; + } + // Next character must be a space. + if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { + fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); + return -1; + } + if (!parse_tags(y4m_ctx, file)) { + fprintf(stderr, "Error parsing %s header.\n", TAG); + } + if (y4m_ctx->interlace == '?') { + fprintf(stderr, + "Warning: Input video interlacing format unknown; " + "assuming progressive scan.\n"); + } else if (y4m_ctx->interlace != 'p') { + fprintf(stderr, + "Input video is interlaced; " + "Only progressive scan handled.\n"); + return -1; + } + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420; + y4m_ctx->bps = 12; + y4m_ctx->bit_depth = 8; + y4m_ctx->aux_buf = NULL; + y4m_ctx->dst_buf = NULL; + if (strcmp(y4m_ctx->chroma_type, "420") == 0 || + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || + strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 10; + y4m_ctx->bps = 15; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 12; + y4m_ctx->bps = 18; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_sz = + 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + if (only_420) { + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422_420jpeg; + } else { + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422; + y4m_ctx->bps = 16; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + /*Natively supported: no conversion required.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } + } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 20; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 24; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { + y4m_ctx->src_c_dec_h = 4; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = + y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_411_420jpeg; + fprintf(stderr, "Unsupported conversion from yuv 411\n"); + return -1; + } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + if (only_420) { + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; + } else { + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444; + y4m_ctx->bps = 24; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Natively supported: no conversion required.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } + } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 30; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 36; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; + y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*No extra space required, but we need to clear the chroma planes.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_mono_420jpeg; + } else { + fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); + return -1; + } + /*The size of the final frame buffers is always computed from the + destination chroma decimation type.*/ + y4m_ctx->dst_buf_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * + ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); + if (y4m_ctx->bit_depth == 8) + y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); + else + y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); + if (!y4m_ctx->dst_buf) return -1; + + if (y4m_ctx->aux_buf_sz > 0) { + y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); + if (!y4m_ctx->aux_buf) { + free(y4m_ctx->dst_buf); + return -1; + } + } + return 0; +} + +void y4m_input_close(y4m_input *_y4m) { + free(_y4m->dst_buf); + free(_y4m->aux_buf); +} + +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { + char frame[6]; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + /*Read and skip the frame header.*/ + if (!file_read(frame, 6, _fin)) return 0; + if (memcmp(frame, "FRAME", 5)) { + fprintf(stderr, "Loss of framing in Y4M input data\n"); + return -1; + } + if (frame[5] != '\n') { + char c; + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } + if (j == 79) { + fprintf(stderr, "Error parsing Y4M frame header\n"); + return -1; + } + } + /*Read the frame data that needs no conversion.*/ + if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Read the frame data that does need conversion.*/ + if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Now convert the just read frame.*/ + (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf); + /*Fill in the frame buffer pointers. + We don't use vpx_img_wrap() because it forces padding for odd picture + sizes, which would require a separate fread call for every row.*/ + memset(_img, 0, sizeof(*_img)); + /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/ + _img->fmt = _y4m->vpx_fmt; + _img->w = _img->d_w = _y4m->pic_w; + _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; + _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; + _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; + _img->bps = _y4m->bps; + + /*Set up the buffer pointers.*/ + pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample; + c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_w *= bytes_per_sample; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + _img->stride[VPX_PLANE_Y] = _img->stride[VPX_PLANE_ALPHA] = + _y4m->pic_w * bytes_per_sample; + _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = c_w; + _img->planes[VPX_PLANE_Y] = _y4m->dst_buf; + _img->planes[VPX_PLANE_U] = _y4m->dst_buf + pic_sz; + _img->planes[VPX_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz; + _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz; + return 1; +} diff --git a/media/libvpx/libvpx/y4minput.h b/media/libvpx/libvpx/y4minput.h new file mode 100644 index 0000000000..573750d749 --- /dev/null +++ b/media/libvpx/libvpx/y4minput.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Based on code from the OggTheora software codec source code, + * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. + */ + +#ifndef VPX_Y4MINPUT_H_ +#define VPX_Y4MINPUT_H_ + +#include +#include "vpx/vpx_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct y4m_input y4m_input; + +/*The function used to perform chroma conversion.*/ +typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_src); + +struct y4m_input { + int pic_w; + int pic_h; + int fps_n; + int fps_d; + int par_n; + int par_d; + char interlace; + int src_c_dec_h; + int src_c_dec_v; + int dst_c_dec_h; + int dst_c_dec_v; + char chroma_type[16]; + /*The size of each converted frame buffer.*/ + size_t dst_buf_sz; + /*The amount to read directly into the converted frame buffer.*/ + size_t dst_buf_read_sz; + /*The size of the auxilliary buffer.*/ + size_t aux_buf_sz; + /*The amount to read into the auxilliary buffer.*/ + size_t aux_buf_read_sz; + y4m_convert_func convert; + unsigned char *dst_buf; + unsigned char *aux_buf; + enum vpx_img_fmt vpx_fmt; + int bps; + unsigned int bit_depth; +}; + +/** + * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after + * reading it. The |skip_buffer| indicates bytes that were previously read + * from |file|, to do input-type detection; this buffer will be read before + * the |file| is read. It is of size |num_skip|, which *must* be 8 or less. + * + * Returns 0 on success, -1 on failure. + */ +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420); +void y4m_input_close(y4m_input *_y4m); +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_Y4MINPUT_H_ diff --git a/media/libvpx/lint_config.sh b/media/libvpx/lint_config.sh new file mode 100755 index 0000000000..1a6c96dfbb --- /dev/null +++ b/media/libvpx/lint_config.sh @@ -0,0 +1,112 @@ +#!/bin/bash -e +# +# Copyright (c) 2012 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# This script is used to compare vpx_config.h and vpx_config.asm to +# verify the two files match. +# +# Arguments: +# +# -h - C Header file. +# -a - ASM file. +# -p - Print the options if correct. +# -o - Output file. +# +# Usage: +# +# # Compare the two configuration files and output the final results. +# ./lint_config.sh -h vpx_config.h -a vpx_config.asm -o libvpx.config -p + +export LC_ALL=C +print_final="no" + +while getopts "h:a:o:p" flag +do + if [ "$flag" = "h" ]; then + header_file=$OPTARG + elif [ "$flag" = "a" ]; then + asm_file=$OPTARG + elif [ "$flag" = "o" ]; then + out_file=$OPTARG + elif [ "$flag" = "p" ]; then + print_final="yes" + fi +done + +if [ -z "$header_file" ]; then + echo "Header file not specified." + false + exit +fi + +if [ -z "$asm_file" ]; then + echo "ASM file not specified." + false + exit +fi + +# Concat header file and assembly file and select those ended with 0 or 1. +combined_config="$(cat $header_file $asm_file | grep -E ' +[01] *$')" + +# Extra filtering for known exceptions. +combined_config="$(echo "$combined_config" | grep -v WIDE_REFERENCE)" +combined_config="$(echo "$combined_config" | grep -v ARCHITECTURE)" +combined_config="$(echo "$combined_config" | grep -v DO1STROUNDING)" + +# Remove all spaces. +combined_config="$(echo "$combined_config" | sed 's/[ \t]//g')" + +# Remove #define in the header file. +combined_config="$(echo "$combined_config" | sed 's/.*define//')" + +# Remove equ in the ASM file. +combined_config="$(echo "$combined_config" | sed 's/\.equ//')" # gas style +combined_config="$(echo "$combined_config" | sed 's/equ//')" # rvds style +combined_config="$(echo "$combined_config" | sed 's/\.set//')" # apple style + +# Remove %define in YASM ASM files. +combined_config="$(echo "$combined_config" | sed 's/%define\s *//')" # yasm style + +# Remove useless comma in gas style assembly file. +combined_config="$(echo "$combined_config" | sed 's/,//')" + +# Substitute 0 with =no. +combined_config="$(echo "$combined_config" | sed 's/0$/=no/')" + +# Substitute 1 with =yes. +combined_config="$(echo "$combined_config" | sed 's/1$/=yes/')" + +# Find the mismatch variables. +odd_config="$(echo "$combined_config" | sort | uniq -u)" +odd_vars="$(echo "$odd_config" | sed 's/=.*//' | uniq)" + +for var in $odd_vars; do + echo "Error: Configuration mismatch for $var." + echo "Header file: $header_file" + echo "$(cat -n $header_file | grep "$var[ \t]")" + echo "Assembly file: $asm_file" + echo "$(cat -n $asm_file | grep "$var[ \t]")" + echo "" +done + +if [ -n "$odd_vars" ]; then + false + exit +fi + +if [ "$print_final" = "no" ]; then + exit +fi + +# Do some additional filter to make libvpx happy. +combined_config="$(echo "$combined_config" | grep -v ARCH_X86=no)" +combined_config="$(echo "$combined_config" | grep -v ARCH_X86_64=no)" + +# Print out the unique configurations. +if [ -n "$out_file" ]; then + echo "$combined_config" | sort | uniq > $out_file +else + echo "$combined_config" | sort | uniq +fi diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build new file mode 100644 index 0000000000..582bc6fd5d --- /dev/null +++ b/media/libvpx/moz.build @@ -0,0 +1,155 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +with Files('*'): + BUG_COMPONENT = ('Core', 'Audio/Video') + +include('sources.mozbuild') + +if CONFIG['VPX_USE_NASM']: + USE_NASM = True + +# Linux, Mac and Win share file lists for x86* but not configurations. +if CONFIG['TARGET_CPU'] == 'x86_64': + EXPORTS.vpx += files['X64_EXPORTS'] + SOURCES += files['X64_SOURCES'] + if CONFIG['OS_TARGET'] == 'WINNT': + ASFLAGS += [ '-I%s/media/libvpx/config/win/x64/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/win/x64/' ] + SOURCES += [ '/media/libvpx/config/win/x64/vpx_config.c' ] + elif CONFIG['OS_TARGET'] == 'Darwin': + ASFLAGS += [ '-I%s/media/libvpx/config/mac/x64/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/mac/x64/' ] + SOURCES += [ '/media/libvpx/config/mac/x64/vpx_config.c' ] + else: # Android, Linux, BSDs, etc. + ASFLAGS += [ '-I%s/media/libvpx/config/linux/x64/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/linux/x64/' ] + SOURCES += [ '/media/libvpx/config/linux/x64/vpx_config.c' ] +elif CONFIG['TARGET_CPU'] == 'x86': + EXPORTS.vpx += files['IA32_EXPORTS'] + SOURCES += files['IA32_SOURCES'] + if CONFIG['OS_TARGET'] == 'WINNT': + ASFLAGS += [ '-I%s/media/libvpx/config/win/ia32/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/win/ia32/' ] + SOURCES += [ '/media/libvpx/config/win/ia32/vpx_config.c' ] + elif CONFIG['OS_TARGET'] == 'Darwin': + ASFLAGS += [ '-I%s/media/libvpx/config/mac/ia32/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/mac/ia32/' ] + SOURCES += [ '/media/libvpx/config/mac/ia32/vpx_config.c' ] + else: # Android, Linux, BSDs, etc. + ASFLAGS += [ '-I%s/media/libvpx/config/linux/ia32/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/linux/ia32/' ] + SOURCES += [ '/media/libvpx/config/linux/ia32/vpx_config.c' ] +elif CONFIG['TARGET_CPU'] == 'arm': + EXPORTS.vpx += files['ARM_EXPORTS'] + ASFLAGS += [ + '-I%s/media/libvpx/config/linux/arm/' % TOPSRCDIR, + '-I%s/libvpx' % OBJDIR, + ] + LOCAL_INCLUDES += [ '/media/libvpx/config/linux/arm/' ] + SOURCES += [ '/media/libvpx/config/linux/arm/vpx_config.c' ] + + arm_asm_files = files['ARM_SOURCES'] + + if CONFIG['GNU_AS']: + SOURCES += sorted([ + "!%s.S" % f if f.endswith('.asm') else f for f in arm_asm_files + ]) + else: + SOURCES += sorted(arm_asm_files) + + for f in SOURCES: + if f.endswith('.c') and 'neon' in f: + SOURCES[f].flags += CONFIG['VPX_ASFLAGS'] + + if CONFIG['OS_TARGET'] == 'Android': + # For cpu-features.h + LOCAL_INCLUDES += [ + '%%%s/sources/android/cpufeatures' % CONFIG['ANDROID_NDK'], + ] +elif CONFIG['TARGET_CPU'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT': + EXPORTS.vpx += files['ARM64_EXPORTS'] + SOURCES += files['ARM64_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ] + SOURCES += [ '/media/libvpx/config/win/aarch64/vpx_config.c' ] +elif CONFIG['TARGET_CPU'] == 'aarch64': + EXPORTS.vpx += files['ARM64_EXPORTS'] + SOURCES += files['ARM64_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/linux/arm64/' ] + SOURCES += [ '/media/libvpx/config/linux/arm64/vpx_config.c' ] +else: + # Generic C-only configuration + EXPORTS.vpx += files['GENERIC_EXPORTS'] + SOURCES += files['GENERIC_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/generic/' % TOPSRCDIR ] + LOCAL_INCLUDES += [ '/media/libvpx/config/generic/' ] + SOURCES += [ '/media/libvpx/config/generic/vpx_config.c' ] + +# We allow warnings for third-party code that can be updated from upstream. +AllowCompilerWarnings() + +FINAL_LIBRARY = 'gkcodecs' +NoVisibilityFlags() + +DEFINES['HAVE_CONFIG_H'] = 'vpx_config.h' + +if CONFIG['OS_TARGET'] == 'Android': + # Older versions of the Android NDK don't pre-define anything to indicate + # the OS they're on, so do it for them. + DEFINES['__linux__'] = True + + SOURCES += [ + '%%%s/sources/android/cpufeatures/cpu-features.c' % CONFIG['ANDROID_NDK'], + ] + +for f in SOURCES: + if f.endswith('.c'): + if 'mmx.c' in f: + SOURCES[f].flags += ['-mmmx'] + if 'sse2.c' in f: + SOURCES[f].flags += CONFIG['SSE2_FLAGS'] + if 'ssse3.c' in f: + SOURCES[f].flags += ['-mssse3'] + if 'sse4.c' in f: + SOURCES[f].flags += ['-msse4.1'] + if 'avx.c' in f: + SOURCES[f].flags += ['-mavx'] + if 'avx2.c' in f: + SOURCES[f].flags += ['-mavx2'] + if 'neon_dotprod.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod'] + if 'neon_i8mm.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm'] + +# Suppress warnings in third-party code. +CFLAGS += [ + '-Wno-sign-compare', + '-Wno-unused-function', # so many of these warnings; just ignore them +] +if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'): + CFLAGS += [ + '-Wno-unreachable-code', + '-Wno-unneeded-internal-declaration', + ] + +ASFLAGS += CONFIG['VPX_ASFLAGS'] +ASFLAGS += [ + '-I./', + '-I%s/media/libvpx/libvpx/' % TOPSRCDIR, +] + +LOCAL_INCLUDES += [ + '/media/libvpx/config', # vpx_version.h + '/media/libvpx/libvpx', +] + +if CONFIG['OS_TARGET'] == 'Android': + # For LIBVPX_RAND + ASFLAGS += [ + '-D__ANDROID__' + ] diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml new file mode 100644 index 0000000000..17704a1905 --- /dev/null +++ b/media/libvpx/moz.yaml @@ -0,0 +1,76 @@ +# Version of this schema +schema: 1 + +bugzilla: + # Bugzilla product and component for this directory and subdirectories + product: Core + component: "Audio/Video" + +# Document the source of externally hosted code +origin: + + # Short name of the package/library + name: libvpx + + description: VP8/VP9 Codec SDK + + # Full URL for the package's homepage/etc + # Usually different from repository url + url: https://chromium.googlesource.com/webm/libvpx + + # Human-readable identifier for this version/release + # Generally "version NNN", "tag SSS", "bookmark SSS" + release: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d (Tue Jan 02 20:08:06 2024). + + # Revision to pull in + # Must be a long or short commit SHA (long preferred) + revision: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d + + # The package's license, where possible using the mnemonic from + # https://spdx.org/licenses/ + # Multiple licenses can be specified (as a YAML list) + # A "LICENSE" file must exist containing the full license text + license: BSD-3-Clause + + license-file: LICENSE + +updatebot: + maintainer-phab: chunmin + maintainer-bz: cchang@mozilla.com + tasks: + - type: vendoring + enabled: true + frequency: release + +vendoring: + url: https://chromium.googlesource.com/webm/libvpx + source-hosting: googlesource + vendor-directory: media/libvpx/libvpx/ + skip-vendoring-steps: ['update-moz-build'] + + exclude: + - third_party/libwebm + - tools/ + + patches: + - input_frame_validation.patch + - input_frame_validation_vp9.patch + + update-actions: + - action: move-file + from: '{vendor_dir}/vpx_dsp/x86/loopfilter_sse2.c' + to: '{vendor_dir}/vpx_dsp/x86/loopfilter_intrin_sse2.c' + - action: move-file + from: '{vendor_dir}/vpx_dsp/loongarch/quantize_lsx.c' + to: '{vendor_dir}/vpx_dsp/loongarch/quantize_intrin_lsx.c' + - action: run-command + command: patch + args: ['-p1', '-i', '{yaml_dir}/rename_duplicate_files.patch', '-d', '{yaml_dir}/libvpx'] + cwd: '{yaml_dir}' + - action: run-command + command: patch + args: ['-p1', '-i', '{yaml_dir}/win64_build_fix.patch', '-d', '{yaml_dir}/libvpx'] + cwd: '{yaml_dir}' + - action: run-script + script: '{yaml_dir}/generate_sources_mozbuild.sh' + cwd: '{yaml_dir}' diff --git a/media/libvpx/rename_duplicate_files.patch b/media/libvpx/rename_duplicate_files.patch new file mode 100644 index 0000000000..0215b2d2c0 --- /dev/null +++ b/media/libvpx/rename_duplicate_files.patch @@ -0,0 +1,22 @@ +diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk +index 13999af04..6519d828c 100644 +--- a/vpx_dsp/vpx_dsp.mk ++++ b/vpx_dsp/vpx_dsp.mk +@@ -177,7 +177,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h + # loop filters + DSP_SRCS-yes += loopfilter.c + +-DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_sse2.c ++DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_intrin_sse2.c + DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c + + ifeq ($(HAVE_NEON_ASM),yes) +@@ -328,7 +328,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h + DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c + DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c + DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +-DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ++DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_intrin_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) + DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c + endif diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild new file mode 100644 index 0000000000..2960dee255 --- /dev/null +++ b/media/libvpx/sources.mozbuild @@ -0,0 +1,1235 @@ +# This file is generated. Do not edit. + +files = { + 'X64_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_ports/x86.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'X64_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/mfqe.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/postproc.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/common/x86/bilinear_filter_sse2.c', + 'libvpx/vp8/common/x86/dequantize_mmx.asm', + 'libvpx/vp8/common/x86/idct_blk_mmx.c', + 'libvpx/vp8/common/x86/idct_blk_sse2.c', + 'libvpx/vp8/common/x86/idctllm_mmx.asm', + 'libvpx/vp8/common/x86/idctllm_sse2.asm', + 'libvpx/vp8/common/x86/iwalsh_sse2.asm', + 'libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm', + 'libvpx/vp8/common/x86/loopfilter_sse2.asm', + 'libvpx/vp8/common/x86/loopfilter_x86.c', + 'libvpx/vp8/common/x86/mfqe_sse2.asm', + 'libvpx/vp8/common/x86/recon_mmx.asm', + 'libvpx/vp8/common/x86/recon_sse2.asm', + 'libvpx/vp8/common/x86/subpixel_mmx.asm', + 'libvpx/vp8/common/x86/subpixel_sse2.asm', + 'libvpx/vp8/common/x86/subpixel_ssse3.asm', + 'libvpx/vp8/common/x86/vp8_asm_stubs.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/encoder/x86/block_error_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse3.asm', + 'libvpx/vp8/encoder/x86/dct_sse2.asm', + 'libvpx/vp8/encoder/x86/denoising_sse2.c', + 'libvpx/vp8/encoder/x86/fwalsh_sse2.asm', + 'libvpx/vp8/encoder/x86/quantize_sse4.c', + 'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm', + 'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mfqe.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_postproc.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c', + 'libvpx/vp9/common/x86/vp9_mfqe_sse2.asm', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/encoder/x86/temporal_filter_sse4.c', + 'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c', + 'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_error_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_error_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx/src/vpx_tpl.c', + 'libvpx/vpx_dsp/add_noise.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/deblock.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_dsp/x86/add_noise_sse2.asm', + 'libvpx/vpx_dsp/x86/avg_intrin_avx2.c', + 'libvpx/vpx_dsp/x86/avg_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/avg_pred_avx2.c', + 'libvpx/vpx_dsp/x86/avg_pred_sse2.c', + 'libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm', + 'libvpx/vpx_dsp/x86/deblock_sse2.asm', + 'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c', + 'libvpx/vpx_dsp/x86/fwd_txfm_sse2.c', + 'libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm', + 'libvpx/vpx_dsp/x86/intrapred_sse2.asm', + 'libvpx/vpx_dsp/x86/intrapred_ssse3.asm', + 'libvpx/vpx_dsp/x86/inv_txfm_avx2.c', + 'libvpx/vpx_dsp/x86/inv_txfm_sse2.c', + 'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c', + 'libvpx/vpx_dsp/x86/inv_wht_sse2.asm', + 'libvpx/vpx_dsp/x86/loopfilter_avx2.c', + 'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/post_proc_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_avx.c', + 'libvpx/vpx_dsp/x86/quantize_avx2.c', + 'libvpx/vpx_dsp/x86/quantize_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_ssse3.c', + 'libvpx/vpx_dsp/x86/sad4d_avx2.c', + 'libvpx/vpx_dsp/x86/sad4d_sse2.asm', + 'libvpx/vpx_dsp/x86/sad_avx2.c', + 'libvpx/vpx_dsp/x86/sad_sse2.asm', + 'libvpx/vpx_dsp/x86/sse_avx2.c', + 'libvpx/vpx_dsp/x86/sse_sse4.c', + 'libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm', + 'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm', + 'libvpx/vpx_dsp/x86/subtract_avx2.c', + 'libvpx/vpx_dsp/x86/subtract_sse2.asm', + 'libvpx/vpx_dsp/x86/sum_squares_sse2.c', + 'libvpx/vpx_dsp/x86/variance_avx2.c', + 'libvpx/vpx_dsp/x86/variance_sse2.c', + 'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/emms_mmx.asm', + 'libvpx/vpx_ports/float_control_word.asm', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'IA32_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_ports/x86.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'IA32_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/mfqe.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/postproc.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/common/x86/bilinear_filter_sse2.c', + 'libvpx/vp8/common/x86/dequantize_mmx.asm', + 'libvpx/vp8/common/x86/idct_blk_mmx.c', + 'libvpx/vp8/common/x86/idct_blk_sse2.c', + 'libvpx/vp8/common/x86/idctllm_mmx.asm', + 'libvpx/vp8/common/x86/idctllm_sse2.asm', + 'libvpx/vp8/common/x86/iwalsh_sse2.asm', + 'libvpx/vp8/common/x86/loopfilter_sse2.asm', + 'libvpx/vp8/common/x86/loopfilter_x86.c', + 'libvpx/vp8/common/x86/mfqe_sse2.asm', + 'libvpx/vp8/common/x86/recon_mmx.asm', + 'libvpx/vp8/common/x86/recon_sse2.asm', + 'libvpx/vp8/common/x86/subpixel_mmx.asm', + 'libvpx/vp8/common/x86/subpixel_sse2.asm', + 'libvpx/vp8/common/x86/subpixel_ssse3.asm', + 'libvpx/vp8/common/x86/vp8_asm_stubs.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/encoder/x86/block_error_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse3.asm', + 'libvpx/vp8/encoder/x86/dct_sse2.asm', + 'libvpx/vp8/encoder/x86/denoising_sse2.c', + 'libvpx/vp8/encoder/x86/fwalsh_sse2.asm', + 'libvpx/vp8/encoder/x86/quantize_sse4.c', + 'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm', + 'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mfqe.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_postproc.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c', + 'libvpx/vp9/common/x86/vp9_mfqe_sse2.asm', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/encoder/x86/temporal_filter_sse4.c', + 'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c', + 'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_error_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_error_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx/src/vpx_tpl.c', + 'libvpx/vpx_dsp/add_noise.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/deblock.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_dsp/x86/add_noise_sse2.asm', + 'libvpx/vpx_dsp/x86/avg_intrin_avx2.c', + 'libvpx/vpx_dsp/x86/avg_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/avg_pred_avx2.c', + 'libvpx/vpx_dsp/x86/avg_pred_sse2.c', + 'libvpx/vpx_dsp/x86/deblock_sse2.asm', + 'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c', + 'libvpx/vpx_dsp/x86/fwd_txfm_sse2.c', + 'libvpx/vpx_dsp/x86/intrapred_sse2.asm', + 'libvpx/vpx_dsp/x86/intrapred_ssse3.asm', + 'libvpx/vpx_dsp/x86/inv_txfm_avx2.c', + 'libvpx/vpx_dsp/x86/inv_txfm_sse2.c', + 'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c', + 'libvpx/vpx_dsp/x86/inv_wht_sse2.asm', + 'libvpx/vpx_dsp/x86/loopfilter_avx2.c', + 'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/post_proc_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_avx.c', + 'libvpx/vpx_dsp/x86/quantize_avx2.c', + 'libvpx/vpx_dsp/x86/quantize_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_ssse3.c', + 'libvpx/vpx_dsp/x86/sad4d_avx2.c', + 'libvpx/vpx_dsp/x86/sad4d_sse2.asm', + 'libvpx/vpx_dsp/x86/sad_avx2.c', + 'libvpx/vpx_dsp/x86/sad_sse2.asm', + 'libvpx/vpx_dsp/x86/sse_avx2.c', + 'libvpx/vpx_dsp/x86/sse_sse4.c', + 'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm', + 'libvpx/vpx_dsp/x86/subtract_avx2.c', + 'libvpx/vpx_dsp/x86/subtract_sse2.asm', + 'libvpx/vpx_dsp/x86/sum_squares_sse2.c', + 'libvpx/vpx_dsp/x86/variance_avx2.c', + 'libvpx/vpx_dsp/x86/variance_sse2.c', + 'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/emms_mmx.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'ARM_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/arm.h', + 'libvpx/vpx_ports/arm_cpudetect.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'ARM_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/arm/loopfilter_arm.c', + 'libvpx/vp8/common/arm/neon/bilinearpredict_neon.c', + 'libvpx/vp8/common/arm/neon/copymem_neon.c', + 'libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c', + 'libvpx/vp8/common/arm/neon/dequant_idct_neon.c', + 'libvpx/vp8/common/arm/neon/dequantizeb_neon.c', + 'libvpx/vp8/common/arm/neon/idct_blk_neon.c', + 'libvpx/vp8/common/arm/neon/iwalsh_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', + 'libvpx/vp8/common/arm/neon/mbloopfilter_neon.c', + 'libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c', + 'libvpx/vp8/common/arm/neon/sixtappredict_neon.c', + 'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/arm/neon/denoising_neon.c', + 'libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c', + 'libvpx/vp8/encoder/arm/neon/shortfdct_neon.c', + 'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx/src/vpx_tpl.c', + 'libvpx/vpx_dsp/arm/avg_neon.c', + 'libvpx/vpx_dsp/arm/avg_pred_neon.c', + 'libvpx/vpx_dsp/arm/fdct16x16_neon.c', + 'libvpx/vpx_dsp/arm/fdct32x32_neon.c', + 'libvpx/vpx_dsp/arm/fdct4x4_neon.c', + 'libvpx/vpx_dsp/arm/fdct8x8_neon.c', + 'libvpx/vpx_dsp/arm/fdct_partial_neon.c', + 'libvpx/vpx_dsp/arm/hadamard_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_add_neon.c', + 'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm', + 'libvpx/vpx_dsp/arm/idct4x4_add_neon.asm', + 'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_add_neon.c', + 'libvpx/vpx_dsp/arm/idct_neon.asm', + 'libvpx/vpx_dsp/arm/intrapred_neon.c', + 'libvpx/vpx_dsp/arm/intrapred_neon_asm.asm', + 'libvpx/vpx_dsp/arm/loopfilter_16_neon.asm', + 'libvpx/vpx_dsp/arm/loopfilter_4_neon.asm', + 'libvpx/vpx_dsp/arm/loopfilter_8_neon.asm', + 'libvpx/vpx_dsp/arm/quantize_neon.c', + 'libvpx/vpx_dsp/arm/sad4d_neon.c', + 'libvpx/vpx_dsp/arm/sad_neon.c', + 'libvpx/vpx_dsp/arm/save_reg_neon.asm', + 'libvpx/vpx_dsp/arm/sse_neon.c', + 'libvpx/vpx_dsp/arm/subpel_variance_neon.c', + 'libvpx/vpx_dsp/arm/subtract_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_neon.c', + 'libvpx/vpx_dsp/arm/variance_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve_neon.c', + 'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/aarch32_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'ARM64_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/arm.h', + 'libvpx/vpx_ports/arm_cpudetect.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'ARM64_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/arm/loopfilter_arm.c', + 'libvpx/vp8/common/arm/neon/bilinearpredict_neon.c', + 'libvpx/vp8/common/arm/neon/copymem_neon.c', + 'libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c', + 'libvpx/vp8/common/arm/neon/dequant_idct_neon.c', + 'libvpx/vp8/common/arm/neon/dequantizeb_neon.c', + 'libvpx/vp8/common/arm/neon/idct_blk_neon.c', + 'libvpx/vp8/common/arm/neon/iwalsh_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', + 'libvpx/vp8/common/arm/neon/mbloopfilter_neon.c', + 'libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c', + 'libvpx/vp8/common/arm/neon/sixtappredict_neon.c', + 'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/arm/neon/denoising_neon.c', + 'libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c', + 'libvpx/vp8/encoder/arm/neon/shortfdct_neon.c', + 'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx/src/vpx_tpl.c', + 'libvpx/vpx_dsp/arm/avg_neon.c', + 'libvpx/vpx_dsp/arm/avg_pred_neon.c', + 'libvpx/vpx_dsp/arm/fdct16x16_neon.c', + 'libvpx/vpx_dsp/arm/fdct32x32_neon.c', + 'libvpx/vpx_dsp/arm/fdct4x4_neon.c', + 'libvpx/vpx_dsp/arm/fdct8x8_neon.c', + 'libvpx/vpx_dsp/arm/fdct_partial_neon.c', + 'libvpx/vpx_dsp/arm/hadamard_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_add_neon.c', + 'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct4x4_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_add_neon.c', + 'libvpx/vpx_dsp/arm/intrapred_neon.c', + 'libvpx/vpx_dsp/arm/loopfilter_neon.c', + 'libvpx/vpx_dsp/arm/quantize_neon.c', + 'libvpx/vpx_dsp/arm/sad4d_neon.c', + 'libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/sad_neon.c', + 'libvpx/vpx_dsp/arm/sad_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/sse_neon.c', + 'libvpx/vpx_dsp/arm/sse_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/subpel_variance_neon.c', + 'libvpx/vpx_dsp/arm/subtract_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_neon.c', + 'libvpx/vpx_dsp/arm/variance_neon.c', + 'libvpx/vpx_dsp/arm/variance_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c', + 'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/aarch64_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'GENERIC_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'GENERIC_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx/src/vpx_tpl.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], +} diff --git a/media/libvpx/win64_build_fix.patch b/media/libvpx/win64_build_fix.patch new file mode 100644 index 0000000000..bfe0ed131a --- /dev/null +++ b/media/libvpx/win64_build_fix.patch @@ -0,0 +1,22 @@ +diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk +--- a/vpx_ports/vpx_ports.mk ++++ b/vpx_ports/vpx_ports.mk +@@ -23,17 +23,17 @@ ifeq ($(VPX_ARCH_X86),yes) + PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c + endif + ifeq ($(VPX_ARCH_X86_64),yes) + # Visual Studio x64 does not support the _mm_empty() intrinsic. + PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm + endif + + ifeq ($(VPX_ARCH_X86_64),yes) +-PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm ++PORTS_SRCS-yes += float_control_word.asm + endif + + ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) + PORTS_SRCS-yes += x86.h + PORTS_SRCS-yes += x86_abi_support.asm + endif + + PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c -- cgit v1.2.3